1 /*
2  * libxml2_htmlparser.c : an HTML 4.0 non-verifying parser
3  *
4  * See Copyright for the status of this software.
5  *
6  * daniel@veillard.com
7  * Portion Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved.
8  */
9 
10 #define IN_LIBXML
11 #include "xmlenglibxml.h"
12 
13 #include <string.h>
14 #if defined(HAVE_CTYPE_H)
15 #include <ctype.h>
16 #endif
17 #ifdef HAVE_STDLIB_H
18 #include <stdlib.h>
19 #endif
20 #ifdef HAVE_SYS_STAT_H
21 #include <sys/stat.h>
22 #endif
23 #ifdef HAVE_FCNTL_H
24 #include <fcntl.h>
25 #endif
26 #ifdef HAVE_UNISTD_H
27 #include <unistd.h>
28 #endif
29 #ifdef HAVE_ZLIB_H
30 #include <zlib.h>
31 #endif
32 
33 
34 #include <libxml2_globals.h>
35 #include <libxml2_xmlmemory.h>
36 #include <libxml2_tree.h>
37 #include <libxml2_parser.h>
38 #include <libxml2_parserinternals.h>
39 #include <libxml2_xmlerror.h>
40 #include "libxml2_xmlerror2.h"
41 #include "libxml2_htmlparser.h"
42 #include "libxml2_htmltree.h"
43 #include "libxml2_entities.h"
44 #include <libxml2_encoding.h>
45 #include <libxml2_valid.h>
46 #include <libxml2_xmlio.h>
47 #include <libxml2_uri.h>
48 
49 #define HTML_MAX_NAMELEN 1000
50 #define HTML_PARSER_BIG_BUFFER_SIZE 1000
51 #define HTML_PARSER_BUFFER_SIZE 100
52 
53 #ifdef LIBXML_HTML_ENABLED
54 
55 /* #define DEBUG */
56 /* #define DEBUG_PUSH */
57 
58 static const int htmlOmittedDefaultValue = 1;
59 
60 xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
61                              xmlChar end, xmlChar  end2, xmlChar end3);
62 static void htmlParseComment(htmlParserCtxtPtr ctxt);
63 
64 /************************************************************************
65  *                                                                      *
66  *              Some factorized error routines                          *
67  *                                                                      *
68  ************************************************************************/
69 
70 /**
71  * htmlErrMemory:
72  * @param ctxt an HTML parser context
73  * @param extra extra information
74  *
75  * Handle an out-of-memory error
76  */
77 void
78 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra); // moved to XSLT-enabled part of this file
79 
80 /**
81  * htmlParseErr:
82  * @param ctxt an HTML parser context
83  * @param error the error number
84  * @param msg the error message
85  * @param str1 string infor
86  * @param str2 string infor
87  *
88  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
89  */
90 static void
htmlParseErr(xmlParserCtxtPtr ctxt,xmlParserErrors error,const char * msg,const xmlChar * str1,const xmlChar * str2)91 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
92              const char *msg, const xmlChar *str1, const xmlChar *str2)
93 {
94     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
95         (ctxt->instate == XML_PARSER_EOF))
96         return;
97     ctxt->errNo = error;
98     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
99                     XML_ERR_ERROR, NULL, 0,
100                     (const char *) str1, (const char *) str2,
101                     NULL, 0, 0,
102                     msg, str1, str2);
103     ctxt->wellFormed = 0;
104 }
105 
106 /**
107  * htmlParseErrInt:
108  * @param ctxt an HTML parser context
109  * @param error the error number
110  * @param msg the error message
111  * @param val integer info
112  *
113  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
114  */
115 static void
htmlParseErrInt(xmlParserCtxtPtr ctxt,xmlParserErrors error,const char * msg,int val)116 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
117              const char *msg, int val)
118 {
119     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
120         (ctxt->instate == XML_PARSER_EOF))
121         return;
122     ctxt->errNo = error;
123     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
124                     XML_ERR_ERROR, NULL, 0, NULL, NULL,
125                     NULL, val, 0, msg, val);
126     ctxt->wellFormed = 0;
127 }
128 
129 /************************************************************************
130  *                                                                      *
131  *              Parser stacks related functions and macros              *
132  *                                                                      *
133  ************************************************************************/
134 
135 /**
136  * htmlnamePush:
137  * @param ctxt an HTML parser context
138  * @param value the element name
139  *
140  * Pushes a new element name on top of the name stack
141  *
142  * Returns 0 in case of error, the index in the stack otherwise
143  */
144 static int
htmlnamePush(htmlParserCtxtPtr ctxt,const xmlChar * value)145 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
146 {
147     if (ctxt->nameNr >= ctxt->nameMax) {
148         void* allocTmp; // DONE: Fix xmlRealloc
149         allocTmp = xmlRealloc((xmlChar**)ctxt->nameTab,
150                                ctxt->nameMax * 2 * sizeof(ctxt->nameTab[0]));
151         if (!allocTmp) {
152             htmlErrMemory(ctxt, NULL);
153             return (0);
154         }
155         ctxt->nameMax *= 2;
156         ctxt->nameTab = (const xmlChar**) allocTmp;
157     }
158     ctxt->nameTab[ctxt->nameNr] = value;
159     ctxt->name = value;
160     return (ctxt->nameNr++);
161 }
162 /**
163  * htmlnamePop:
164  * @param ctxt an HTML parser context
165  *
166  * Pops the top element name from the name stack
167  *
168  * Returns the name just removed
169  */
170 static const xmlChar *
htmlnamePop(htmlParserCtxtPtr ctxt)171 htmlnamePop(htmlParserCtxtPtr ctxt)
172 {
173     const xmlChar *ret;
174 
175     if (ctxt->nameNr <= 0)
176         return (0);
177     ctxt->nameNr--;
178     if (ctxt->nameNr < 0)
179         return (0);
180     if (ctxt->nameNr > 0)
181         ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
182     else
183         ctxt->name = NULL;
184     ret = ctxt->nameTab[ctxt->nameNr];
185     ctxt->nameTab[ctxt->nameNr] = 0;
186     return (ret);
187 }
188 
189 /*
190  * Macros for accessing the content. Those should be used only by the parser,
191  * and not exported.
192  *
193  * Dirty macros, i.e. one need to make assumption on the context to use them
194  *
195  *   CUR_PTR return the current pointer to the xmlChar to be parsed.
196  *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
197  *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
198  *           in UNICODE mode. This should be used internally by the parser
199  *           only to compare to ASCII values otherwise it would break when
200  *           running with UTF-8 encoding.
201  *   NXT(n)  returns the n'th next xmlChar. Same as CUR it should be used only
202  *           to compare on ASCII based substring.
203  *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR
204  *           it should be used only to compare on ASCII based substring.
205  *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
206  *           strings without newlines within the parser.
207  *
208  * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
209  *
210  *   CURRENT Returns the current char value, with the full decoding of
211  *           UTF-8 if we are using this mode. It returns an int.
212  *   NEXT    Skip to the next character, this does the proper decoding
213  *           in UTF-8 mode.
214  *   NEXTL(l) Skip the current unicode character of l xmlChars long.
215  *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
216  */
217 
/*
 * All macros below expect a variable named ctxt (the parser context) to be
 * in scope at the point of use.
 */

/* Current input byte converted to uppercase. */
#define UPPER (toupper(*ctxt->input->cur))

/*
 * Advance the input by val xmlChars, keeping nbChars and the column in
 * step. Expands to a single parenthesized expression; val is evaluated
 * three times, so it must not have side effects.
 */
#define SKIP(val) (ctxt->nbChars += (val),                              \
                   ctxt->input->cur += (val),                           \
                   ctxt->input->col += (val))

/* The val'th xmlChar after the current position. */
#define NXT(val) (ctxt->input->cur[(val)])

/* Same as NXT(val), converted to uppercase. */
#define UPP(val) (toupper(ctxt->input->cur[(val)]))

/* Pointer to the current position in the input (usable as an lvalue). */
#define CUR_PTR (ctxt->input->cur)

/*
 * Shrink the input buffer when a lot has been consumed and little is left.
 * Wrapped in do/while so it behaves as one statement next to if/else.
 */
#define SHRINK do {                                                     \
    if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) &&     \
        (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK))        \
        xmlParserInputShrink(ctxt->input);                              \
  } while (0)

/*
 * Ask for more input when not in progressive mode and fewer than
 * INPUT_CHUNK bytes remain. Wrapped in do/while for the same reason.
 */
#define GROW do {                                                       \
    if ((ctxt->progressive == 0) &&                                     \
        (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))            \
        xmlParserInputGrow(ctxt->input, INPUT_CHUNK);                   \
  } while (0)

/* Current input byte as an int (no multi-byte decoding). */
#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

/* Current input byte, or -1 while a token is pending. */
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

/*
 * Step over the current character, which is l xmlChars long, updating
 * line/column bookkeeping and clearing any pending token.
 */
#define NEXTL(l) do {                                                   \
    if (*(ctxt->input->cur) == '\n') {                                  \
        ctxt->input->line++; ctxt->input->col = 1;                      \
    } else ctxt->input->col++;                                          \
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;          \
  } while (0)

/************
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);     \
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

/*
 * Append character v, which is l xmlChars long in the input, to buffer b
 * at index i and advance i. Wrapped in do/while so it behaves as one
 * statement next to if/else.
 */
#define COPY_BUF(l,b,i,v)                                               \
    do {                                                                \
        if ((l) == 1) (b)[(i)++] = (xmlChar) (v);                       \
        else (i) += xmlCopyChar((l), &(b)[(i)], (v));                   \
    } while (0)
270 
271 /**
272  * htmlCurrentChar:
273  * @param ctxt the HTML parser context
274  * @param len pointer to the length of the char read
275  *
276  * The current char value, if using UTF-8 this may actually span multiple
277  * bytes in the input buffer. Implement the end of line normalization:
278  * 2.11 End-of-Line Handling
279  * If the encoding is unspecified, in the case we find an ISO-Latin-1
280  * char, then the encoding converter is plugged in automatically.
281  *
282  * Returns the current char value and its length
283  */
284 
285 static int
htmlCurrentChar(xmlParserCtxtPtr ctxt,int * len)286 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
287     if (ctxt->instate == XML_PARSER_EOF)
288         return(0);
289 
290     if (ctxt->token != 0) {
291         *len = 0;
292         return(ctxt->token);
293     }
294     if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
295         /*
296          * We are supposed to handle UTF8, check it's valid
297          * From rfc2044: encoding of the Unicode values on UTF-8:
298          *
299          * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
300          * 0000 0000-0000 007F   0xxxxxxx
301          * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
302          * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
303          *
304          * Check for the 0x110000 limit too
305          */
306         const unsigned char *cur = ctxt->input->cur;
307         unsigned char c;
308         unsigned int val;
309 
310         c = *cur;
311         if (c & 0x80) {
312             if (cur[1] == 0)
313                 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
314             if ((cur[1] & 0xc0) != 0x80)
315                 goto encoding_error;
316             if ((c & 0xe0) == 0xe0) {
317 
318                 if (cur[2] == 0)
319                     xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
320                 if ((cur[2] & 0xc0) != 0x80)
321                     goto encoding_error;
322                 if ((c & 0xf0) == 0xf0) {
323                     if (cur[3] == 0)
324                         xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
325                     if (((c & 0xf8) != 0xf0) ||
326                         ((cur[3] & 0xc0) != 0x80))
327                         goto encoding_error;
328                     /* 4-byte code */
329                     *len = 4;
330                     val = (cur[0] & 0x7) << 18;
331                     val |= (cur[1] & 0x3f) << 12;
332                     val |= (cur[2] & 0x3f) << 6;
333                     val |= cur[3] & 0x3f;
334                 } else {
335                   /* 3-byte code */
336                     *len = 3;
337                     val = (cur[0] & 0xf) << 12;
338                     val |= (cur[1] & 0x3f) << 6;
339                     val |= cur[2] & 0x3f;
340                 }
341             } else {
342               /* 2-byte code */
343                 *len = 2;
344                 val = (cur[0] & 0x1f) << 6;
345                 val |= cur[1] & 0x3f;
346             }
347             if (!IS_CHAR(val)) {
348                 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
349                                 "Char 0x%X out of allowed range\n", val);
350         }
351             return(val);
352         } else {
353             /* 1-byte code */
354             *len = 1;
355             return((int) *ctxt->input->cur);
356         }
357     }
358     /*
359      * Assume it's a fixed length encoding (1) with
360      * a compatible encoding for the ASCII set, since
361      * XML constructs only use < 128 chars
362      */
363     *len = 1;
364     if ((int) *ctxt->input->cur < 0x80)
365         return((int) *ctxt->input->cur);
366 
367     /*
368      * Humm this is bad, do an automatic flow conversion
369      */
370     xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
371     ctxt->charset = XML_CHAR_ENCODING_UTF8;
372     return(xmlCurrentChar(ctxt, len));
373 
374 encoding_error:
375     /*
376      * If we detect an UTF8 error that probably mean that the
377      * input encoding didn't get properly advertized in the
378      * declaration header. Report the error and switch the encoding
379      * to ISO-Latin-1 (if you don't like this policy, just declare the
380      * encoding !)
381      */
382     htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
383                  "Input is not proper UTF-8, indicate encoding !\n",
384                  NULL, NULL);
385     if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
386         ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
387                         ctxt->input->cur[0], ctxt->input->cur[1],
388                         ctxt->input->cur[2], ctxt->input->cur[3]);
389     }
390 
391     ctxt->charset = XML_CHAR_ENCODING_8859_1;
392     *len = 1;
393     return((int) *ctxt->input->cur);
394 }
395 
396 /**
397  * htmlSkipBlankChars:
398  * @param ctxt the HTML parser context
399  *
400  * skip all blanks character found at that point in the input streams.
401  *
402  * Returns the number of space chars skipped
403  */
404 
405 static int
htmlSkipBlankChars(xmlParserCtxtPtr ctxt)406 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
407     int res = 0;
408 
409     while (IS_BLANK_CH(*(ctxt->input->cur))) {
410         if ((*ctxt->input->cur == 0) &&
411             (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
412                 xmlPopInput(ctxt);
413         } else {
414             if (*(ctxt->input->cur) == '\n') {
415                 ctxt->input->line++; ctxt->input->col = 1;
416             } else ctxt->input->col++;
417             ctxt->input->cur++;
418             ctxt->nbChars++;
419             if (*ctxt->input->cur == 0)
420                 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
421         }
422         res++;
423     }
424     return(res);
425 }
426 
427 
428 #endif  /* defined(LIBXML_HTML_ENABLED */
429 
430 #if defined(LIBXML_HTML_ENABLED) || defined(XMLENGINE_XSLT)
431 
432 /************************************************************************
433  *                                                                      *
434  *      The list of HTML elements and their properties                  *
435  *                                                                      *
436  ************************************************************************/
437 
438 /*
439  *  Start Tag: 1 means the start tag can be omitted
440  *  End Tag:   1 means the end tag can be omitted
441  *             2 means it's forbidden (empty elements)
442  *             3 means the tag is stylistic and should be closed easily
443  *  Depr:      this element is deprecated
444  *  DTD:       1 means that this element is valid only in the Loose DTD
445  *             2 means that this element is valid only in the Frameset DTD
446  *
447  * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
448         , subElements , impliedsubelt , Attributes, userdata
449  */
450 
451 /* Definitions and a couple of vars for HTML Elements */
452 
/*
 * Name lists used to build the element and attribute tables below.
 *
 * Each list macro expands to comma-separated string literals. Lists that
 * are built from other lists MUST separate them with commas: adjacent
 * string literals are concatenated by the compiler, which would silently
 * fuse the last name of one list with the first name of the next.
 * PCDATA and MODIFIER expand to nothing and are kept as markers only.
 */
#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define SPECIAL "a", "img", "applet", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define PCDATA
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define LIST "ul", "ol", "dir", "menu"
#define MODIFIER
#define FLOW BLOCK,INLINE
#define EMPTY NULL

/* Every array below is a NULL-terminated list of names. */
// TO DO libxslt added 2nd const in between
static const char* const html_flow [] = { FLOW, NULL } ;
static const char* const html_inline [] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata


/* ... and for HTML Attributes */

#define COREATTRS "id", "class", "style", "title"
#define I18N "lang", "dir"
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define ATTRS COREATTRS,I18N,EVENTS
#define CELLHALIGN "align", "char", "charoff"
#define CELLVALIGN "valign"

static const char* const html_attrs [] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs [] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs [] = { COREATTRS, NULL } ;
static const char* const i18n_attrs [] = { I18N, NULL } ;

/* Other declarations that should go inline ... */
static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
        "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
        "tabindex", "onfocus", "onblur", NULL } ;
static const char* const target_attr[] = { "target", NULL } ;
static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
static const char* const alt_attr[] = { "alt", NULL } ;
static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
static const char* const href_attrs[] = { "href", NULL } ;
static const char* const clear_attrs[] = { "clear", NULL } ;
static const char* const inline_p[] = { INLINE, "p", NULL } ;
static const char* const flow_param[] = { FLOW, "param", NULL } ;
static const char* const applet_attrs[] = { COREATTRS , "codebase",
                "archive", "alt", "name", "height", "width", "align",
                "hspace", "vspace", NULL } ;
static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
        "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
static const char* const basefont_attrs[] =
        { "id", "size", "color", "face", NULL } ;
static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
static const char* const body_depr[] = { "background", "bgcolor", "text",
        "link", "vlink", "alink", NULL } ;
static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
        "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;


static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* const col_elt[] = { "col", NULL } ;
static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
static const char* const dl_contents[] = { "dt", "dd", NULL } ;
static const char* const compact_attr[] = { "compact", NULL } ;
static const char* const label_attr[] = { "label", NULL } ;
/* NULL terminator is required like for every other list. */
static const char* const fieldset_contents[] = { FLOW, "legend", NULL } ;
static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
static const char* const head_attrs[] = { I18N, "profile", NULL } ;
static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
static const char* const version_attr[] = { "version", NULL } ;
static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
static const char* const align_attr[] = { "align", NULL } ;
static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
static const char* const map_contents[] = { BLOCK, "area", NULL } ;
static const char* const name_attr[] = { "name", NULL } ;
static const char* const action_attr[] = { "action", NULL } ;
static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
static const char* const content_attr[] = { "content", NULL } ;
static const char* const type_attr[] = { "type", NULL } ;
static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
static const char* const object_contents[] = { FLOW, "param", NULL } ;
static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
static const char* const option_elt[] = { "option", NULL } ;
static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
static const char* const width_attr[] = { "width", NULL } ;
static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
static const char* const language_attr[] = { "language", NULL } ;
static const char* const select_content[] = { "optgroup", "option", NULL } ;
static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
/* The comma after ATTRS keeps "onkeyup" and "summary" as separate names. */
static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
static const char* const tr_elt[] = { "tr", NULL } ;
static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
static const char* const tr_contents[] = { "th", "td", NULL } ;
static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
static const char* const li_elt[] = { "li", NULL } ;
static const char* const ul_depr[] = { "type", "compact", NULL} ;
static const char* const dir_attr[] = { "dir", NULL} ;

/* Cast applied to the lists above when they are stored in the element table. */
#define DECL (const char**)
582 
583 static const htmlElemDesc  html40ElementTable [] = {
584 { "a",          0, 0, 0, 0, 0, 0, 1, "anchor ",
585         DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
586 },
587 { "abbr",       0, 0, 0, 0, 0, 0, 1, "abbreviated form",
588         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
589 },
590 { "acronym",    0, 0, 0, 0, 0, 0, 1, "",
591         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
592 },
593 { "address",    0, 0, 0, 0, 0, 0, 0, "information on author ",
594         DECL inline_p  , NULL , DECL html_attrs, NULL, NULL
595 },
596 { "applet",     0, 0, 0, 0, 1, 1, 2, "java applet ",
597         DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
598 },
599 { "area",       0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
600         EMPTY ,  NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
601 },
602 { "b",          0, 3, 0, 0, 0, 0, 1, "bold text style",
603         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
604 },
605 { "base",       0, 2, 2, 1, 0, 0, 0, "document base uri ",
606         EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
607 },
608 { "basefont",   0, 2, 2, 1, 1, 1, 1, "base font size " ,
609         EMPTY , NULL , NULL, DECL basefont_attrs, NULL
610 },
611 { "bdo",        0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
612         DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
613 },
614 { "big",        0, 3, 0, 0, 0, 0, 1, "large text style",
615         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
616 },
617 { "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
618         DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
619 },
620 { "body",       1, 1, 0, 0, 0, 0, 0, "document body ",
621         DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
622 },
623 { "br",         0, 2, 2, 1, 0, 0, 1, "forced line break ",
624         EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
625 },
626 { "button",     0, 0, 0, 0, 0, 0, 2, "push button ",
627         DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
628 },
629 { "caption",    0, 0, 0, 0, 0, 0, 0, "table caption ",
630         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
631 },
632 { "center",     0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
633         DECL html_flow , NULL , NULL, DECL html_attrs, NULL
634 },
635 { "cite",       0, 0, 0, 0, 0, 0, 1, "citation",
636         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
637 },
638 { "code",       0, 0, 0, 0, 0, 0, 1, "computer code fragment",
639         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
640 },
641 { "col",        0, 2, 2, 1, 0, 0, 0, "table column ",
642         EMPTY , NULL , DECL col_attrs , NULL, NULL
643 },
644 { "colgroup",   0, 1, 0, 0, 0, 0, 0, "table column group ",
645         DECL col_elt , "col" , DECL col_attrs , NULL, NULL
646 },
647 { "dd",         0, 1, 0, 0, 0, 0, 0, "definition description ",
648         DECL html_flow , NULL , DECL html_attrs, NULL, NULL
649 },
650 { "del",        0, 0, 0, 0, 0, 0, 2, "deleted text ",
651         DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
652 },
653 { "dfn",        0, 0, 0, 0, 0, 0, 1, "instance definition",
654         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
655 },
656 { "dir",        0, 0, 0, 0, 1, 1, 0, "directory list",
657         DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
658 },
659 { "div",        0, 0, 0, 0, 0, 0, 0, "generic language/style container",
660         DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
661 },
662 { "dl",         0, 0, 0, 0, 0, 0, 0, "definition list ",
663         DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
664 },
665 { "dt",         0, 1, 0, 0, 0, 0, 0, "definition term ",
666         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
667 },
668 { "em",         0, 3, 0, 0, 0, 0, 1, "emphasis",
669         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
670 },
671 { "fieldset",   0, 0, 0, 0, 0, 0, 0, "form control group ",
672         DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
673 },
674 { "font",       0, 3, 0, 0, 1, 1, 1, "local change to font ",
675         DECL html_inline, NULL, NULL, DECL font_attrs, NULL
676 },
677 { "form",       0, 0, 0, 0, 0, 0, 0, "interactive form ",
678         DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
679 },
680 { "frame",      0, 2, 2, 1, 0, 2, 0, "subwindow " ,
681         EMPTY, NULL, NULL, DECL frame_attrs, NULL
682 },
683 { "frameset",   0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
684         DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
685 },
686 { "h1",         0, 0, 0, 0, 0, 0, 0, "heading ",
687         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
688 },
689 { "h2",         0, 0, 0, 0, 0, 0, 0, "heading ",
690         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
691 },
692 { "h3",         0, 0, 0, 0, 0, 0, 0, "heading ",
693         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
694 },
695 { "h4",         0, 0, 0, 0, 0, 0, 0, "heading ",
696         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
697 },
698 { "h5",         0, 0, 0, 0, 0, 0, 0, "heading ",
699         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
700 },
701 { "h6",         0, 0, 0, 0, 0, 0, 0, "heading ",
702         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
703 },
704 { "head",       1, 1, 0, 0, 0, 0, 0, "document head ",
705         DECL head_contents, NULL, DECL head_attrs, NULL, NULL
706 },
707 { "hr",         0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
708         EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
709 },
710 { "html",       1, 1, 0, 0, 0, 0, 0, "document root element ",
711         DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
712 },
713 { "i",          0, 3, 0, 0, 0, 0, 1, "italic text style",
714         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
715 },
716 { "iframe",     0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
717         DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
718 },
719 { "img",        0, 2, 2, 1, 0, 0, 1, "embedded image ",
720         EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
721 },
722 { "input",      0, 2, 2, 1, 0, 0, 1, "form control ",
723         EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
724 },
725 { "ins",        0, 0, 0, 0, 0, 0, 2, "inserted text",
726         DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
727 },
728 { "isindex",    0, 2, 2, 1, 1, 1, 0, "single line prompt ",
729         EMPTY, NULL, NULL, DECL prompt_attrs, NULL
730 },
731 { "kbd",        0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
732         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
733 },
734 { "label",      0, 0, 0, 0, 0, 0, 1, "form field label text ",
735         DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
736 },
737 { "legend",     0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
738         DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
739 },
740 { "li",         0, 1, 1, 0, 0, 0, 0, "list item ",
741         DECL html_flow, NULL, DECL html_attrs, NULL, NULL
742 },
743 { "link",       0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
744         EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
745 },
746 { "map",        0, 0, 0, 0, 0, 0, 2, "client-side image map ",
747         DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
748 },
749 { "menu",       0, 0, 0, 0, 1, 1, 0, "menu list ",
750         DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
751 },
752 { "meta",       0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
753         EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
754 },
755 { "noframes",   0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
756         DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
757 },
758 { "noscript",   0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
759         DECL html_flow, "div", DECL html_attrs, NULL, NULL
760 },
761 { "object",     0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
762         DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
763 },
764 { "ol",         0, 0, 0, 0, 0, 0, 0, "ordered list ",
765         DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
766 },
767 { "optgroup",   0, 0, 0, 0, 0, 0, 0, "option group ",
768         DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
769 },
770 { "option",     0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
771         DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
772 },
773 { "p",          0, 1, 0, 0, 0, 0, 0, "paragraph ",
774         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
775 },
776 { "param",      0, 2, 2, 1, 0, 0, 0, "named property value ",
777         EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
778 },
779 { "pre",        0, 0, 0, 0, 0, 0, 0, "preformatted text ",
780         DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
781 },
782 { "q",          0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
783         DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
784 },
785 { "s",          0, 3, 0, 0, 1, 1, 1, "strike-through text style",
786         DECL html_inline, NULL, NULL, DECL html_attrs, NULL
787 },
788 { "samp",       0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
789         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
790 },
791 { "script",     0, 0, 0, 0, 0, 0, 2, "script statements ",
792         DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
793 },
794 { "select",     0, 0, 0, 0, 0, 0, 1, "option selector ",
795         DECL select_content, NULL, DECL select_attrs, NULL, NULL
796 },
797 { "small",      0, 3, 0, 0, 0, 0, 1, "small text style",
798         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
799 },
800 { "span",       0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
801         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
802 },
803 { "strike",     0, 3, 0, 0, 1, 1, 1, "strike-through text",
804         DECL html_inline, NULL, NULL, DECL html_attrs, NULL
805 },
806 { "strong",     0, 3, 0, 0, 0, 0, 1, "strong emphasis",
807         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
808 },
809 { "style",      0, 0, 0, 0, 0, 0, 0, "style info ",
810         DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
811 },
812 { "sub",        0, 3, 0, 0, 0, 0, 1, "subscript",
813         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
814 },
815 { "sup",        0, 3, 0, 0, 0, 0, 1, "superscript ",
816         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
817 },
818 { "table",      0, 0, 0, 0, 0, 0, 0, "",
819         DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
820 },
821 { "tbody",      1, 0, 0, 0, 0, 0, 0, "table body ",
822         DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
823 },
824 { "td",         0, 0, 0, 0, 0, 0, 0, "table data cell",
825         DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
826 },
827 { "textarea",   0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
828         DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
829 },
830 { "tfoot",      0, 1, 0, 0, 0, 0, 0, "table footer ",
831         DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
832 },
833 { "th",         0, 1, 0, 0, 0, 0, 0, "table header cell",
834         DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
835 },
836 { "thead",      0, 1, 0, 0, 0, 0, 0, "table header ",
837         DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
838 },
839 { "title",      0, 0, 0, 0, 0, 0, 0, "document title ",
840         DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
841 },
842 { "tr",         0, 0, 0, 0, 0, 0, 0, "table row ",
843         DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
844 },
845 { "tt",         0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
846         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
847 },
848 { "u",          0, 3, 0, 0, 1, 1, 1, "underlined text style",
849         DECL html_inline, NULL, NULL, DECL html_attrs, NULL
850 },
851 { "ul",         0, 0, 0, 0, 0, 0, 0, "unordered list ",
852         DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
853 },
854 { "var",        0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
855         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
856 }
857 };
858 
859 #endif /* defined(LIBXML_HTML_ENABLED) || defined(XMLENGINE_XSLT) */
860 
861 #ifdef  LIBXML_HTML_ENABLED
862 
/*
 * Start tags that imply the end of the current element.
 *
 * Layout: the table is a flat sequence of NULL-terminated groups followed
 * by one final NULL. The first string of a group is the name of a start
 * tag; the remaining strings of that group are the names of open elements
 * that this start tag closes. htmlInitAutoClose() records the address of
 * the first string of each group and htmlCheckAutoClose() scans from there
 * up to the group's NULL, so the position of every string and every NULL
 * matters.
 */
static const char * const htmlStartClose [] = {
"form",         "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
                "dl", "ul", "ol", "menu", "dir", "address", "pre",
                "listing", "xmp", "head", NULL,
"head",         "p", NULL,
"title",        "p", NULL,
"body",         "head", "style", "link", "title", "p", NULL,
"frameset",     "head", "style", "link", "title", "p", NULL,
"li",           "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
                "pre", "listing", "xmp", "head", "li", NULL,
"hr",           "p", "head", NULL,
"h1",           "p", "head", NULL,
"h2",           "p", "head", NULL,
"h3",           "p", "head", NULL,
"h4",           "p", "head", NULL,
"h5",           "p", "head", NULL,
"h6",           "p", "head", NULL,
"dir",          "p", "head", NULL,
"address",      "p", "head", "ul", NULL,
"pre",          "p", "head", "ul", NULL,
"listing",      "p", "head", NULL,
"xmp",          "p", "head", NULL,
"blockquote",   "p", "head", NULL,
"dl",           "p", "dt", "menu", "dir", "address", "pre", "listing",
                "xmp", "head", NULL,
"dt",           "p", "menu", "dir", "address", "pre", "listing", "xmp",
                "head", "dd", NULL,
"dd",           "p", "menu", "dir", "address", "pre", "listing", "xmp",
                "head", "dt", NULL,
"ul",           "p", "head", "ol", "menu", "dir", "address", "pre",
                "listing", "xmp", NULL,
"ol",           "p", "head", "ul", NULL,
"menu",         "p", "head", "ul", NULL,
"p",            "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
"div",          "p", "head", NULL,
"noscript",     "p", "head", NULL,
"center",       "font", "b", "i", "p", "head", NULL,
"a",            "a", NULL,
"caption",      "p", NULL,
"colgroup",     "caption", "colgroup", "col", "p", NULL,
"col",          "caption", "col", "p", NULL,
"table",        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
                "listing", "xmp", "a", NULL,
"th",           "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"td",           "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"tr",           "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
"thead",        "caption", "col", "colgroup", NULL,
"tfoot",        "th", "td", "tr", "caption", "col", "colgroup", "thead",
                "tbody", "p", NULL,
"tbody",        "th", "td", "tr", "caption", "col", "colgroup", "thead",
                "tfoot", "tbody", "p", NULL,
"optgroup",     "option", NULL,
"option",       "option", NULL,
"fieldset",     "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
                "pre", "listing", "xmp", "a", NULL,
NULL
};
923 
/*
 * Elements that are not supposed to carry character data directly.
 * htmlCheckParagraph() implies a "p" element when one of these is the
 * current element. The list is terminated by NULL.
 */
static const char * const htmlNoContentElements[] = {
    "html", "head", "body",
    NULL
};
937 
/*
 * The list of HTML attributes which are of content %Script;
 * (the HTML 4 intrinsic event attributes).
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 *
 * The table has no NULL terminator; its length is taken with sizeof.
 */
static const char * const htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",      /* was misspelled "onrest", so onreset never matched */
    "onchange",
    "onselect"
};
963 
/*
 * End-tag priorities, used to recover from broken HTML pages.
 *
 * Each element name is given a priority; names that are not listed get
 * the priority stored in the terminating entry. An end tag may only
 * close open elements whose priority is lower than or equal to its own,
 * which lets the parser decide what to do with stray end tags.
 */

typedef struct {
    const char *name;       /* element name, NULL in the last entry */
    int priority;           /* priority of the end tag for that element */
} elementPriority;

static const elementPriority htmlEndPriority [] = {
    { "div",   150 },

    { "td",    160 },
    { "th",    160 },

    { "tr",    170 },

    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },

    { "table", 190 },

    { "head",  200 },
    { "body",  200 },

    { "html",  220 },

    { NULL,    100 }    /* Default priority, also ends the table */
};
991 
992 /************************************************************************
993  *                                                                      *
994  *      functions to handle HTML specific data                          *
995  *                                                                      *
996  ************************************************************************/
997 
998 /**
999  * htmlInitAutoClose:
1000  *
1001  * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1002  * This is not reentrant. Call xmlInitParser() once before processing in
1003  * case of use in multithreaded programs.
1004  */
1005 void
htmlInitAutoClose(void)1006 htmlInitAutoClose(void) {
1007     int indx, i = 0;
1008 
1009     if (htmlStartCloseIndexinitialized) return;
1010 
1011     for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1012     indx = 0;
1013     while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
1014         // libxslt port: (const char**) cast was added
1015         htmlStartCloseIndex[indx++] = (const char**)&htmlStartClose[i];
1016         while(htmlStartClose[i++]) {};
1017             i++;
1018     }
1019     htmlStartCloseIndexinitialized = 1;
1020 }
1021 
1022 /**
1023  * htmlGetEndPriority:
1024  * @param name The name of the element to look up the priority for.
1025  *
1026  * Return value: The "endtag" priority.
1027  **/
1028 static int
htmlGetEndPriority(const xmlChar * name)1029 htmlGetEndPriority (const xmlChar *name) {
1030     int i = 0;
1031 
1032     while ((htmlEndPriority[i].name != NULL) &&
1033            (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1034         i++;
1035 
1036     return(htmlEndPriority[i].priority);
1037 }
1038 
1039 
1040 /**
1041  * htmlCheckAutoClose:
1042  * @param newtag The new tag name
1043  * @param oldtag The old tag name
1044  *
1045  * Checks whether the new tag is one of the registered valid tags for
1046  * closing old.
1047  * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1048  *
1049  * Returns 0 if no, 1 if yes.
1050  */
1051 static int
htmlCheckAutoClose(const xmlChar * newtag,const xmlChar * oldtag)1052 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1053 {
1054     int i, indx;
1055     const char **closed = NULL;
1056 
1057     if (htmlStartCloseIndexinitialized == 0)
1058         htmlInitAutoClose();
1059 
1060     /* inefficient, but not a big deal */
1061     for (indx = 0; indx < 100; indx++) {
1062         closed = htmlStartCloseIndex[indx];
1063         if (closed == NULL)
1064             return (0);
1065         if (xmlStrEqual(BAD_CAST * closed, newtag))
1066             break;
1067     }
1068 
1069     i = closed - htmlStartClose;
1070     i++;
1071     while (htmlStartClose[i] != NULL) {
1072         if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
1073             return (1);
1074         }
1075         i++;
1076     }
1077     return (0);
1078 }
1079 
1080 /**
1081  * htmlAutoCloseOnClose:
1082  * @param ctxt an HTML parser context
1083  * @param newtag The new tag name
1084  * @param force force the tag closure
1085  *
1086  * The HTML DTD allows an ending tag to implicitly close other tags.
1087  */
1088 static void
htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1089 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1090 {
1091     const htmlElemDesc *info;
1092     int i, priority;
1093 
1094     priority = htmlGetEndPriority(newtag);
1095 
1096     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1097 
1098         if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1099             break;
1100         /*
1101          * A missplaced endtag can only close elements with lower
1102          * or equal priority, so if we find an element with higher
1103          * priority before we find an element with
1104          * matching name, we just ignore this endtag
1105          */
1106         if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1107             return;
1108     }
1109     if (i < 0)
1110         return;
1111 
1112     while (!xmlStrEqual(newtag, ctxt->name)) {
1113         info = htmlTagLookup(ctxt->name);
1114         if ((info != NULL) && (info->endTag == 3)) {
1115             htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1116                          "Opening and ending tag mismatch: %s and %s\n",
1117                          newtag, ctxt->name);
1118         }
1119         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1120             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1121         htmlnamePop(ctxt);
1122     }
1123 }
1124 
1125 /**
1126  * htmlAutoCloseOnEnd:
1127  * @param ctxt an HTML parser context
1128  *
1129  * Close all remaining tags at the end of the stream
1130  */
1131 static void
htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)1132 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1133 {
1134     int i;
1135 
1136     if (ctxt->nameNr == 0)
1137         return;
1138     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1139         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1140             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1141         htmlnamePop(ctxt);
1142     }
1143 }
1144 
1145 /**
1146  * htmlAutoClose:
1147  * @param ctxt an HTML parser context
1148  * @param newtag The new tag name or NULL
1149  *
1150  * The HTML DTD allows a tag to implicitly close other tags.
1151  * The list is kept in htmlStartClose array. This function is
1152  * called when a new tag has been detected and generates the
1153  * appropriates closes if possible/needed.
1154  * If newtag is NULL this mean we are at the end of the resource
1155  * and we should check
1156  */
1157 static void
htmlAutoClose(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1158 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1159 {
1160     while ((newtag != NULL) && (ctxt->name != NULL) &&
1161            (htmlCheckAutoClose(newtag, ctxt->name))) {
1162         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1163             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1164         htmlnamePop(ctxt);
1165     }
1166     if (newtag == NULL) {
1167         htmlAutoCloseOnEnd(ctxt);
1168         return;
1169     }
1170     while ((newtag == NULL) && (ctxt->name != NULL) &&
1171            ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1172             (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1173             (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1174         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1175             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1176         htmlnamePop(ctxt);
1177     }
1178 }
1179 
1180 /**
1181  * htmlAutoCloseTag:
1182  * @param doc the HTML document
1183  * @param name The tag name
1184  * @param elem the HTML element
1185  *
1186  * The HTML DTD allows a tag to implicitly close other tags.
1187  * The list is kept in htmlStartClose array. This function checks
1188  * if the element or one of it's children would autoclose the
1189  * given tag.
1190  *
1191  * Returns 1 if autoclose, 0 otherwise
1192  */
1193 int
htmlAutoCloseTag(htmlDocPtr doc,const xmlChar * name,htmlNodePtr elem)1194 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1195     htmlNodePtr child;
1196 
1197     if (elem == NULL) return(1);
1198     if (xmlStrEqual(name, elem->name)) return(0);
1199     if (htmlCheckAutoClose(elem->name, name)) return(1);
1200     child = elem->children;
1201     while (child != NULL) {
1202         if (htmlAutoCloseTag(doc, name, child)) return(1);
1203         child = child->next;
1204     }
1205     return(0);
1206 }
1207 
1208 /**
1209  * htmlIsAutoClosed:
1210  * @param doc the HTML document
1211  * @param elem the HTML element
1212  *
1213  * The HTML DTD allows a tag to implicitly close other tags.
1214  * The list is kept in htmlStartClose array. This function checks
1215  * if a tag is autoclosed by one of it's child
1216  *
1217  * Returns 1 if autoclosed, 0 otherwise
1218  */
1219 int
htmlIsAutoClosed(htmlDocPtr doc,htmlNodePtr elem)1220 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1221     htmlNodePtr child;
1222 
1223     if (elem == NULL) return(1);
1224     child = elem->children;
1225     while (child != NULL) {
1226         if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1227         child = child->next;
1228     }
1229     return(0);
1230 }
1231 
1232 /**
1233  * htmlCheckImplied:
1234  * @param ctxt an HTML parser context
1235  * @param newtag The new tag name
1236  *
1237  * The HTML DTD allows a tag to exists only implicitly
1238  * called when a new tag has been detected and generates the
1239  * appropriates implicit tags if missing
1240  */
1241 static void
htmlCheckImplied(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1242 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1243     if (!htmlOmittedDefaultValue)
1244         return;
1245     if (xmlStrEqual(newtag, BAD_CAST"html"))
1246         return;
1247     if (ctxt->nameNr <= 0) {
1248         htmlnamePush(ctxt, BAD_CAST"html");
1249         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1250             ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1251     }
1252     if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1253         return;
1254     if ((ctxt->nameNr <= 1) &&
1255         ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1256          (xmlStrEqual(newtag, BAD_CAST"style")) ||
1257          (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1258          (xmlStrEqual(newtag, BAD_CAST"link")) ||
1259          (xmlStrEqual(newtag, BAD_CAST"title")) ||
1260          (xmlStrEqual(newtag, BAD_CAST"base")))) {
1261             /*
1262              * dropped OBJECT ... i you put it first BODY will be
1263              * assumed !
1264              */
1265             htmlnamePush(ctxt, BAD_CAST"head");
1266             if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1267                 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1268     } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1269                (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1270                (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1271         int i;
1272         for (i = 0;i < ctxt->nameNr;i++) {
1273             if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1274                 return;
1275             }
1276             if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1277                 return;
1278             }
1279         }
1280 
1281         htmlnamePush(ctxt, BAD_CAST"body");
1282         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1283             ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1284     }
1285 }
1286 
1287 /**
1288  * htmlCheckParagraph
1289  * @param ctxt an HTML parser context
1290  *
1291  * Check whether a p element need to be implied before inserting
1292  * characters in the current element.
1293  *
1294  * Returns 1 if a paragraph has been inserted, 0 if not and -1
1295  *         in case of error.
1296  */
1297 
1298 static int
htmlCheckParagraph(htmlParserCtxtPtr ctxt)1299 htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1300     const xmlChar *tag;
1301     int i;
1302 
1303     if (ctxt == NULL)
1304         return(-1);
1305     tag = ctxt->name;
1306     if (tag == NULL) {
1307         htmlAutoClose(ctxt, BAD_CAST"p");
1308         htmlCheckImplied(ctxt, BAD_CAST"p");
1309         htmlnamePush(ctxt, BAD_CAST"p");
1310         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1311             ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1312         return(1);
1313     }
1314     if (!htmlOmittedDefaultValue)
1315         return(0);
1316     for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1317         if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1318             htmlAutoClose(ctxt, BAD_CAST"p");
1319             htmlCheckImplied(ctxt, BAD_CAST"p");
1320             htmlnamePush(ctxt, BAD_CAST"p");
1321             if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1322                 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1323             return(1);
1324         }
1325     }
1326     return(0);
1327 }
1328 
1329 /**
1330  * htmlIsScriptAttribute:
1331  * @param name an attribute name
1332  *
1333  * Check if an attribute is of content type Script
1334  *
1335  * Returns 1 is the attribute is a script 0 otherwise
1336  */
1337 int
htmlIsScriptAttribute(const xmlChar * name)1338 htmlIsScriptAttribute(const xmlChar *name) {
1339     unsigned int i;
1340 
1341     if (name == NULL)
1342         return(0);
1343     /*
1344      * all script attributes start with 'on'
1345      */
1346     if ((name[0] != 'o') || (name[1] != 'n'))
1347         return(0);
1348     for (i = 0;
1349          i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1350          i++) {
1351         if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1352             return(1);
1353     }
1354     return(0);
1355 }
1356 
1357 /************************************************************************
1358  *                                                                      *
1359  *              The list of HTML predefined entities                    *
1360  *                                                                      *
1361  ************************************************************************/
1362 
1363 
1364 static const htmlEntityDesc html40EntitiesTable[] = {
1365 /*
1366  * the 4 absolute ones, plus apostrophe.
1367  */
1368 { 34,   "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1369 { 38,   "amp",  "ampersand, U+0026 ISOnum" },
1370 { 39,   "apos", "single quote" },
1371 { 60,   "lt",   "less-than sign, U+003C ISOnum" },
1372 { 62,   "gt",   "greater-than sign, U+003E ISOnum" },
1373 
1374 /*
1375  * A bunch still in the 128-255 range
1376  * Replacing them depend really on the charset used.
1377  */
1378 { 160,  "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1379 { 161,  "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1380 { 162,  "cent", "cent sign, U+00A2 ISOnum" },
1381 { 163,  "pound","pound sign, U+00A3 ISOnum" },
1382 { 164,  "curren","currency sign, U+00A4 ISOnum" },
1383 { 165,  "yen",  "yen sign = yuan sign, U+00A5 ISOnum" },
1384 { 166,  "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1385 { 167,  "sect", "section sign, U+00A7 ISOnum" },
1386 { 168,  "uml",  "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1387 { 169,  "copy", "copyright sign, U+00A9 ISOnum" },
1388 { 170,  "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1389 { 171,  "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1390 { 172,  "not",  "not sign, U+00AC ISOnum" },
1391 { 173,  "shy",  "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1392 { 174,  "reg",  "registered sign = registered trade mark sign, U+00AE ISOnum" },
1393 { 175,  "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1394 { 176,  "deg",  "degree sign, U+00B0 ISOnum" },
1395 { 177,  "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1396 { 178,  "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1397 { 179,  "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1398 { 180,  "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1399 { 181,  "micro","micro sign, U+00B5 ISOnum" },
1400 { 182,  "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1401 { 183,  "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1402 { 184,  "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1403 { 185,  "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1404 { 186,  "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1405 { 187,  "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1406 { 188,  "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1407 { 189,  "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1408 { 190,  "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1409 { 191,  "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1410 { 192,  "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1411 { 193,  "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1412 { 194,  "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1413 { 195,  "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1414 { 196,  "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1415 { 197,  "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1416 { 198,  "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1417 { 199,  "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1418 { 200,  "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1419 { 201,  "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1420 { 202,  "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1421 { 203,  "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1422 { 204,  "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1423 { 205,  "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1424 { 206,  "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1425 { 207,  "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1426 { 208,  "ETH",  "latin capital letter ETH, U+00D0 ISOlat1" },
1427 { 209,  "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1428 { 210,  "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1429 { 211,  "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1430 { 212,  "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1431 { 213,  "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1432 { 214,  "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1433 { 215,  "times","multiplication sign, U+00D7 ISOnum" },
1434 { 216,  "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1435 { 217,  "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1436 { 218,  "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1437 { 219,  "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1438 { 220,  "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1439 { 221,  "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1440 { 222,  "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1441 { 223,  "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1442 { 224,  "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1443 { 225,  "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1444 { 226,  "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1445 { 227,  "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1446 { 228,  "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1447 { 229,  "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1448 { 230,  "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1449 { 231,  "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1450 { 232,  "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1451 { 233,  "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1452 { 234,  "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1453 { 235,  "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1454 { 236,  "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1455 { 237,  "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1456 { 238,  "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1457 { 239,  "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1458 { 240,  "eth",  "latin small letter eth, U+00F0 ISOlat1" },
1459 { 241,  "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1460 { 242,  "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1461 { 243,  "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1462 { 244,  "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1463 { 245,  "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1464 { 246,  "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1465 { 247,  "divide","division sign, U+00F7 ISOnum" },
1466 { 248,  "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1467 { 249,  "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1468 { 250,  "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1469 { 251,  "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1470 { 252,  "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1471 { 253,  "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1472 { 254,  "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1473 { 255,  "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1474 
1475 { 338,  "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1476 { 339,  "oelig","latin small ligature oe, U+0153 ISOlat2" },
1477 { 352,  "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1478 { 353,  "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1479 { 376,  "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1480 
1481 /*
1482  * Anything below should really be kept as entities references
1483  */
1484 { 402,  "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1485 
1486 { 710,  "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1487 { 732,  "tilde","small tilde, U+02DC ISOdia" },
1488 
1489 { 913,  "Alpha","greek capital letter alpha, U+0391" },
1490 { 914,  "Beta", "greek capital letter beta, U+0392" },
1491 { 915,  "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1492 { 916,  "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1493 { 917,  "Epsilon","greek capital letter epsilon, U+0395" },
1494 { 918,  "Zeta", "greek capital letter zeta, U+0396" },
1495 { 919,  "Eta",  "greek capital letter eta, U+0397" },
1496 { 920,  "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1497 { 921,  "Iota", "greek capital letter iota, U+0399" },
1498 { 922,  "Kappa","greek capital letter kappa, U+039A" },
1499 { 923,  "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1500 { 924,  "Mu",   "greek capital letter mu, U+039C" },
1501 { 925,  "Nu",   "greek capital letter nu, U+039D" },
1502 { 926,  "Xi",   "greek capital letter xi, U+039E ISOgrk3" },
1503 { 927,  "Omicron","greek capital letter omicron, U+039F" },
1504 { 928,  "Pi",   "greek capital letter pi, U+03A0 ISOgrk3" },
1505 { 929,  "Rho",  "greek capital letter rho, U+03A1" },
1506 { 931,  "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1507 { 932,  "Tau",  "greek capital letter tau, U+03A4" },
1508 { 933,  "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1509 { 934,  "Phi",  "greek capital letter phi, U+03A6 ISOgrk3" },
1510 { 935,  "Chi",  "greek capital letter chi, U+03A7" },
1511 { 936,  "Psi",  "greek capital letter psi, U+03A8 ISOgrk3" },
1512 { 937,  "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1513 
1514 { 945,  "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1515 { 946,  "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1516 { 947,  "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1517 { 948,  "delta","greek small letter delta, U+03B4 ISOgrk3" },
1518 { 949,  "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1519 { 950,  "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1520 { 951,  "eta",  "greek small letter eta, U+03B7 ISOgrk3" },
1521 { 952,  "theta","greek small letter theta, U+03B8 ISOgrk3" },
1522 { 953,  "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1523 { 954,  "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1524 { 955,  "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1525 { 956,  "mu",   "greek small letter mu, U+03BC ISOgrk3" },
1526 { 957,  "nu",   "greek small letter nu, U+03BD ISOgrk3" },
1527 { 958,  "xi",   "greek small letter xi, U+03BE ISOgrk3" },
1528 { 959,  "omicron","greek small letter omicron, U+03BF NEW" },
1529 { 960,  "pi",   "greek small letter pi, U+03C0 ISOgrk3" },
1530 { 961,  "rho",  "greek small letter rho, U+03C1 ISOgrk3" },
1531 { 962,  "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1532 { 963,  "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1533 { 964,  "tau",  "greek small letter tau, U+03C4 ISOgrk3" },
1534 { 965,  "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1535 { 966,  "phi",  "greek small letter phi, U+03C6 ISOgrk3" },
1536 { 967,  "chi",  "greek small letter chi, U+03C7 ISOgrk3" },
1537 { 968,  "psi",  "greek small letter psi, U+03C8 ISOgrk3" },
1538 { 969,  "omega","greek small letter omega, U+03C9 ISOgrk3" },
1539 { 977,  "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1540 { 978,  "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1541 { 982,  "piv",  "greek pi symbol, U+03D6 ISOgrk3" },
1542 
1543 { 8194, "ensp", "en space, U+2002 ISOpub" },
1544 { 8195, "emsp", "em space, U+2003 ISOpub" },
1545 { 8201, "thinsp","thin space, U+2009 ISOpub" },
1546 { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1547 { 8205, "zwj",  "zero width joiner, U+200D NEW RFC 2070" },
1548 { 8206, "lrm",  "left-to-right mark, U+200E NEW RFC 2070" },
1549 { 8207, "rlm",  "right-to-left mark, U+200F NEW RFC 2070" },
1550 { 8211, "ndash","en dash, U+2013 ISOpub" },
1551 { 8212, "mdash","em dash, U+2014 ISOpub" },
1552 { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1553 { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1554 { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1555 { 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1556 { 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1557 { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1558 { 8224, "dagger","dagger, U+2020 ISOpub" },
1559 { 8225, "Dagger","double dagger, U+2021 ISOpub" },
1560 
1561 { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1562 { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1563 
1564 { 8240, "permil","per mille sign, U+2030 ISOtech" },
1565 
1566 { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1567 { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1568 
1569 { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1570 { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1571 
1572 { 8254, "oline","overline = spacing overscore, U+203E NEW" },
1573 { 8260, "frasl","fraction slash, U+2044 NEW" },
1574 
1575 { 8364, "euro", "euro sign, U+20AC NEW" },
1576 
1577 { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1578 { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1579 { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1580 { 8482, "trade","trade mark sign, U+2122 ISOnum" },
1581 { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1582 { 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1583 { 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1584 { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1585 { 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1586 { 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1587 { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1588 { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1589 { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1590 { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1591 { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1592 { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1593 
1594 { 8704, "forall","for all, U+2200 ISOtech" },
1595 { 8706, "part", "partial differential, U+2202 ISOtech" },
1596 { 8707, "exist","there exists, U+2203 ISOtech" },
1597 { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1598 { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1599 { 8712, "isin", "element of, U+2208 ISOtech" },
1600 { 8713, "notin","not an element of, U+2209 ISOtech" },
1601 { 8715, "ni",   "contains as member, U+220B ISOtech" },
1602 { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1603 { 8721, "sum",  "n-ary summation, U+2211 ISOamsb" },
1604 { 8722, "minus","minus sign, U+2212 ISOtech" },
1605 { 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1606 { 8730, "radic","square root = radical sign, U+221A ISOtech" },
1607 { 8733, "prop", "proportional to, U+221D ISOtech" },
1608 { 8734, "infin","infinity, U+221E ISOtech" },
1609 { 8736, "ang",  "angle, U+2220 ISOamso" },
1610 { 8743, "and",  "logical and = wedge, U+2227 ISOtech" },
1611 { 8744, "or",   "logical or = vee, U+2228 ISOtech" },
1612 { 8745, "cap",  "intersection = cap, U+2229 ISOtech" },
1613 { 8746, "cup",  "union = cup, U+222A ISOtech" },
1614 { 8747, "int",  "integral, U+222B ISOtech" },
1615 { 8756, "there4","therefore, U+2234 ISOtech" },
1616 { 8764, "sim",  "tilde operator = varies with = similar to, U+223C ISOtech" },
1617 { 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1618 { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1619 { 8800, "ne",   "not equal to, U+2260 ISOtech" },
1620 { 8801, "equiv","identical to, U+2261 ISOtech" },
1621 { 8804, "le",   "less-than or equal to, U+2264 ISOtech" },
1622 { 8805, "ge",   "greater-than or equal to, U+2265 ISOtech" },
1623 { 8834, "sub",  "subset of, U+2282 ISOtech" },
1624 { 8835, "sup",  "superset of, U+2283 ISOtech" },
1625 { 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1626 { 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1627 { 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1628 { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1629 { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1630 { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1631 { 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1632 { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1633 { 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1634 { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1635 { 8971, "rfloor","right floor, U+230B ISOamsc" },
1636 { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1637 { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1638 { 9674, "loz",  "lozenge, U+25CA ISOpub" },
1639 
1640 { 9824, "spades","black spade suit, U+2660 ISOpub" },
1641 { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1642 { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1643 { 9830, "diams","black diamond suit, U+2666 ISOpub" },
1644 
1645 };
1646 
1647 /************************************************************************
1648  *                                                                      *
1649  *              Commodity functions to handle entities                  *
1650  *                                                                      *
1651  ************************************************************************/
1652 
/*
 * growBuffer:
 *
 * Macro used to grow the current buffer. It expects two variables in the
 * calling scope: the buffer pointer itself and an integer named
 * <buffer>_size holding its capacity in xmlChar units, plus a variable
 * named ctxt that is handed to htmlErrMemory().
 *
 * The capacity is doubled and the buffer reallocated with xmlRealloc().
 * The result is first stored in a temporary so that, when the
 * reallocation fails, the old buffer can still be released: in that case
 * the buffer is freed, the error is reported and the *enclosing function*
 * returns NULL. The macro can therefore only be used inside functions
 * returning a pointer.
 */
#define growBuffer(buffer) {                                            \
    void* allocTmp;                                                     \
    buffer##_size *= 2;                                                 \
    allocTmp = xmlRealloc(buffer, buffer##_size * sizeof(xmlChar));     \
    if (!allocTmp) {                                                    \
        xmlFree(buffer);                                                \
        htmlErrMemory(ctxt, "growing buffer\n");                        \
        return(NULL);                                                   \
    }                                                                   \
    buffer = (xmlChar*) allocTmp;                                       \
}
1667 
1668 /**
1669  * htmlEntityLookup:
1670  * @param name the entity name
1671  *
1672  * Lookup the given entity in EntitiesTable
1673  *
1674 
1675  *
1676  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1677  */
1678 const htmlEntityDesc *
htmlEntityLookup(const xmlChar * name)1679 htmlEntityLookup(const xmlChar *name) {
1680     unsigned int i;
1681 
1682     for (i = 0;i < (sizeof(html40EntitiesTable)/
1683                     sizeof(html40EntitiesTable[0]));i++) {
1684         if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1685             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1686         }
1687     }
1688     return(NULL);
1689 }
1690 
1691 /**
1692  * htmlEntityValueLookup:
1693  * @param value the entity's unicode value
1694  *
1695  * Lookup the given entity in EntitiesTable
1696  *
1697 
1698  *
1699  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1700  */
1701 const htmlEntityDesc *
htmlEntityValueLookup(unsigned int value)1702 htmlEntityValueLookup(unsigned int value) {
1703     unsigned int i;
1704 
1705     for (i = 0;i < (sizeof(html40EntitiesTable)/
1706                     sizeof(html40EntitiesTable[0]));i++) {
1707         if (html40EntitiesTable[i].value >= value) {
1708             if (html40EntitiesTable[i].value > value)
1709                 break;
1710             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1711         }
1712     }
1713     return(NULL);
1714 }
1715 
1716 /**
1717  * UTF8ToHtml:
1718  * @param out a pointer to an array of bytes to store the result
1719  * @param outlen the length of out
1720  * @param in a pointer to an array of UTF-8 chars
1721  * @param inlen the length of in
1722  *
1723  * Take a block of UTF-8 chars in and try to convert it to an ASCII
1724  * plus HTML entities block of chars out.
1725  *
1726  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1727  * The value of inlen after return is the number of octets consumed
1728  *     as the return value is positive, else unpredictable.
1729  * The value of outlen after return is the number of octets consumed.
1730  */
1731 int
UTF8ToHtml(unsigned char * out,int * outlen,const unsigned char * in,int * inlen)1732 UTF8ToHtml(unsigned char* out, int *outlen,
1733               const unsigned char* in, int *inlen) {
1734     const unsigned char* processed = in;
1735     const unsigned char* outend;
1736     const unsigned char* outstart = out;
1737     const unsigned char* instart = in;
1738     const unsigned char* inend;
1739     unsigned int c, d;
1740     int trailing;
1741 
1742     if (in == NULL) {
1743         /*
1744          * initialization nothing to do
1745          */
1746         *outlen = 0;
1747         *inlen = 0;
1748         return(0);
1749     }
1750     inend = in + (*inlen);
1751     outend = out + (*outlen);
1752     while (in < inend) {
1753         d = *in++;
1754         if      (d < 0x80)  { c= d; trailing= 0; }
1755         else if (d < 0xC0) {
1756             /* trailing byte in leading position */
1757             *outlen = out - outstart;
1758             *inlen = processed - instart;
1759             return(-2);
1760         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
1761         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
1762         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
1763         else {
1764             /* no chance for this in Ascii */
1765             *outlen = out - outstart;
1766             *inlen = processed - instart;
1767             return(-2);
1768         }
1769 
1770         if (inend - in < trailing) {
1771             break;
1772         }
1773 
1774         for ( ; trailing; trailing--) {
1775             if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1776                 break;
1777             c <<= 6;
1778             c |= d & 0x3F;
1779         }
1780 
1781         /* assertion: c is a single UTF-4 value */
1782         if (c < 0x80) {
1783             if (out + 1 >= outend)
1784                 break;
1785             *out++ = c;
1786         } else {
1787             int len;
1788             const htmlEntityDesc * ent;
1789 
1790             /*
1791              * Try to lookup a predefined HTML entity for it
1792              */
1793 
1794             ent = htmlEntityValueLookup(c);
1795             if (ent == NULL) {
1796                 /* no chance for this in Ascii */
1797                 *outlen = out - outstart;
1798                 *inlen = processed - instart;
1799                 return(-2);
1800             }
1801             len = strlen(ent->name);
1802             if (out + 2 + len >= outend)
1803                 break;
1804             *out++ = '&';
1805             memcpy(out, ent->name, len);
1806             out += len;
1807             *out++ = ';';
1808         }
1809         processed = in;
1810     }
1811     *outlen = out - outstart;
1812     *inlen = processed - instart;
1813     return(0);
1814 }
1815 
1816 /**
1817  * htmlEncodeEntities:
1818  * @param out a pointer to an array of bytes to store the result
1819  * @param outlen the length of out
1820  * @param in a pointer to an array of UTF-8 chars
1821  * @param inlen the length of in
1822  * @param quoteChar the quote character to escape (' or ") or zero.
1823  *
1824  * Take a block of UTF-8 chars in and try to convert it to an ASCII
1825  * plus HTML entities block of chars out.
1826  *
1827  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1828  * The value of inlen after return is the number of octets consumed
1829  *     as the return value is positive, else unpredictable.
1830  * The value of outlen after return is the number of octets consumed.
1831  */
1832 int
htmlEncodeEntities(unsigned char * out,int * outlen,const unsigned char * in,int * inlen,int quoteChar)1833 htmlEncodeEntities(unsigned char* out, int *outlen,
1834                    const unsigned char* in, int *inlen, int quoteChar) {
1835     const unsigned char* processed = in;
1836     const unsigned char* outend = out + (*outlen);
1837     const unsigned char* outstart = out;
1838     const unsigned char* instart = in;
1839     const unsigned char* inend = in + (*inlen);
1840     unsigned int c, d;
1841     int trailing;
1842 
1843     while (in < inend) {
1844         d = *in++;
1845         if      (d < 0x80)  { c= d; trailing= 0; }
1846         else if (d < 0xC0) {
1847             /* trailing byte in leading position */
1848             *outlen = out - outstart;
1849             *inlen = processed - instart;
1850             return(-2);
1851         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
1852         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
1853         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
1854         else {
1855             /* no chance for this in Ascii */
1856             *outlen = out - outstart;
1857             *inlen = processed - instart;
1858             return(-2);
1859         }
1860 
1861         if (inend - in < trailing)
1862             break;
1863 
1864         while (trailing--) {
1865             if (((d= *in++) & 0xC0) != 0x80) {
1866                 *outlen = out - outstart;
1867                 *inlen = processed - instart;
1868                 return(-2);
1869             }
1870             c <<= 6;
1871             c |= d & 0x3F;
1872         }
1873 
1874         /* assertion: c is a single UTF-4 value */
1875         if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1876             (c != '&') && (c != '<') && (c != '>')) {
1877             if (out >= outend)
1878                 break;
1879             *out++ = c;
1880         } else {
1881             const htmlEntityDesc * ent;
1882             const char *cp;
1883             char nbuf[16];
1884             int len;
1885 
1886             /*
1887              * Try to lookup a predefined HTML entity for it
1888              */
1889             ent = htmlEntityValueLookup(c);
1890             if (ent == NULL) {
1891                 snprintf(nbuf, sizeof(nbuf), "#%u", c);
1892                 cp = nbuf;
1893             }
1894             else
1895                 cp = ent->name;
1896             len = strlen(cp);
1897             if (out + 2 + len > outend)
1898                 break;
1899             *out++ = '&';
1900             memcpy(out, cp, len);
1901             out += len;
1902             *out++ = ';';
1903         }
1904         processed = in;
1905     }
1906     *outlen = out - outstart;
1907     *inlen = processed - instart;
1908     return(0);
1909 }
1910 
1911 /************************************************************************
1912  *                                                                      *
1913  *              Commodity functions to handle streams                   *
1914  *                                                                      *
1915  ************************************************************************/
1916 
1917 /**
1918  * htmlNewInputStream:
1919  * @param ctxt an HTML parser context
1920  *
1921  * Create a new input stream structure
1922  * Returns the new input stream or NULL
1923  */
1924 static htmlParserInputPtr
htmlNewInputStream(htmlParserCtxtPtr ctxt)1925 htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1926     htmlParserInputPtr input;
1927 
1928     input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1929     if (input == NULL) {
1930         htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
1931         return(NULL);
1932     }
1933     memset(input, 0, sizeof(htmlParserInput));
1934     input->filename = NULL;
1935     input->directory = NULL;
1936     input->base = NULL;
1937     input->cur = NULL;
1938     input->buf = NULL;
1939     input->line = 1;
1940     input->col = 1;
1941     input->buf = NULL;
1942     input->free = NULL;
1943     input->version = NULL;
1944     input->consumed = 0;
1945     input->length = 0;
1946     return(input);
1947 }
1948 
1949 
1950 /************************************************************************
1951  *                                                                      *
1952  *              Commodity functions, cleanup needed ?                   *
1953  *                                                                      *
1954  ************************************************************************/
/*
 * All tags allowing PCDATA, taken from the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array, but I don't want to risk any
 * binary incompatibility.
 */
static const char * const allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
    "blockquote", "body", "button", "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
    "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
    "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
    "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
};
1969 
1970 /**
1971  * areBlanks:
1972  * @param ctxt an HTML parser context
1973  * @param str a xmlChar *
1974  * @param len the size of str
1975  *
1976  * Is this a sequence of blank chars that one can ignore ?
1977  *
1978  * Returns 1 if ignorable 0 otherwise.
1979  */
1980 
areBlanks(htmlParserCtxtPtr ctxt,const xmlChar * str,int len)1981 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
1982     unsigned int i;
1983     int j;
1984     xmlNodePtr lastChild;
1985 
1986     for (j = 0;j < len;j++)
1987         if (!(IS_BLANK_CH(str[j]))) return(0);
1988 
1989     if (CUR == 0) return(1);
1990     if (CUR != '<') return(0);
1991     if (ctxt->name == NULL)
1992         return(1);
1993     if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
1994         return(1);
1995     if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
1996         return(1);
1997     if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
1998         return(1);
1999     if (ctxt->node == NULL) return(0);
2000     lastChild = xmlGetLastChild(ctxt->node);
2001     while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2002         lastChild = lastChild->prev;
2003     if (lastChild == NULL) {
2004         if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2005             (ctxt->node->content != NULL)) return(0);
2006         /* keep ws in constructs like ...<b> </b>...
2007            for all tags "b" allowing PCDATA */
2008         for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2009             if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2010                 return(0);
2011             }
2012         }
2013     } else if (xmlNodeIsText(lastChild)) {
2014         return(0);
2015     } else {
2016         /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2017            for all tags "p" allowing PCDATA */
2018         for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2019             if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2020                 return(0);
2021             }
2022         }
2023     }
2024     return(1);
2025 }
2026 #endif  /* defined(LIBXML_HTML_ENABLED */
2027 
2028 #if defined(LIBXML_HTML_ENABLED) || defined(XMLENGINE_XSLT)
2029 
2030 /**
2031  * htmlErrMemory:
2032  * @param ctxt an HTML parser context
2033  * @param extra extra informations
2034  *
2035  * Handle a redefinition of attribute error
2036  */
2037 static void
htmlErrMemory(xmlParserCtxtPtr ctxt,const char * extra)2038 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
2039 {
2040     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
2041         (ctxt->instate == XML_PARSER_EOF))
2042     return;
2043     if (ctxt != NULL) {
2044         ctxt->errNo = XML_ERR_NO_MEMORY;
2045         ctxt->instate = XML_PARSER_EOF;
2046         ctxt->disableSAX = 1;
2047     }
2048     if (extra)
2049         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
2050                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
2051                         NULL, NULL, 0, 0,
2052                         "Memory allocation failed : %s\n", extra);
2053     else
2054         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
2055                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
2056                         NULL, NULL, 0, 0, "Memory allocation failed\n");
2057 }
2058 
2059 /**
2060  * htmlNewDocNoDtD:
2061  * @param URI URI for the dtd, or NULL
2062  * @param ExternalID the external ID of the DTD, or NULL
2063  *
2064  * Creates a new HTML document without a DTD node if URI and ExternalID
2065  * are NULL
2066  *
2067  * Returns a new document, do not initialize the DTD if not provided
2068  */
2069 XMLPUBFUNEXPORT htmlDocPtr
htmlNewDocNoDtD(const xmlChar * URI,const xmlChar * ExternalID)2070 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2071     xmlDocPtr cur;
2072 
2073     /*
2074      * Allocate a new document and fill the fields.
2075      */
2076     cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2077     if (cur == NULL) {
2078         htmlErrMemory(NULL, "HTML document creation failed\n");
2079         return(NULL);
2080     }
2081     memset(cur, 0, sizeof(xmlDoc));
2082 
2083     cur->type = XML_HTML_DOCUMENT_NODE;
2084 #ifdef XE_ENABLE_GS_CACHING
2085     cur->cachedGs = xmlGetGlobalState();
2086 #endif
2087 
2088     //cur->version = NULL;
2089     //cur->intSubset = NULL;
2090     cur->doc = cur;
2091     //cur->name = NULL;
2092     //cur->children = NULL;
2093     //cur->extSubset = NULL;
2094     //cur->oldNs = NULL;
2095     //cur->encoding = NULL;
2096     cur->standalone = 1;
2097     //cur->compression = 0;
2098     //cur->ids = NULL;
2099     //cur->refs = NULL;
2100     //cur->_private = NULL;
2101 
2102     if (ExternalID || URI)
2103         xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2104 
2105     return(cur);
2106 }
2107 
2108 /**
2109  * htmlNewDoc:
2110  * @param URI URI for the dtd, or NULL
2111  * @param ExternalID the external ID of the DTD, or NULL
2112  *
2113  * Creates a new HTML document
2114  *
2115  * Returns a new document
2116  */
2117 XMLPUBFUNEXPORT htmlDocPtr
htmlNewDoc(const xmlChar * URI,const xmlChar * ExternalID)2118 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2119     if ((URI == NULL) && (ExternalID == NULL))
2120         return(htmlNewDocNoDtD(
2121                     BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2122                     BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2123 
2124     return(htmlNewDocNoDtD(URI, ExternalID));
2125 }
2126 
2127 /**
2128  * htmlTagLookup:
2129  * @param tag The tag name in lowercase
2130  *
2131  * Lookup the HTML tag in the ElementTable
2132  *
2133  * Returns the related htmlElemDescPtr or NULL if not found.
2134  */
2135 XMLPUBFUNEXPORT const htmlElemDesc *
htmlTagLookup(const xmlChar * tag)2136 htmlTagLookup(const xmlChar *tag) {
2137     unsigned int i;
2138 
2139     for (i = 0; i < (sizeof(html40ElementTable) /
2140                      sizeof(html40ElementTable[0]));i++) {
2141         if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
2142         return((htmlElemDescPtr) &html40ElementTable[i]);
2143     }
2144     return(NULL);
2145 }
2146 
2147 #endif /* defined(LIBXML_HTML_ENABLED) || defined(XMLENGINE_XSLT) */
2148 
2149 #if defined(LIBXML_HTML_ENABLED)
2150 
2151 /************************************************************************
2152  *                                                                      *
2153  *                      The parser itself                               *
2154  *      Relates to http://www.w3.org/TR/html40                          *
2155  *                                                                      *
2156  ************************************************************************/
2157 
2158 /************************************************************************
2159  *                                                                      *
2160  *                      The parser itself                               *
2161  *                                                                      *
2162  ************************************************************************/
2163 
/*
 * Forward declaration, so that htmlParseNameComplex() can be called
 * before the point where it is defined.
 */
static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2165 
2166 /**
2167  * htmlParseHTMLName:
2168  * @param ctxt an HTML parser context
2169  *
2170  * parse an HTML tag or attribute name, note that we convert it to lowercase
2171  * since HTML names are not case-sensitive.
2172  *
2173  * Returns the Tag Name parsed or NULL
2174  */
2175 
2176 static const xmlChar *
htmlParseHTMLName(htmlParserCtxtPtr ctxt)2177 htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2178     int i = 0;
2179     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2180 
2181     if (!IS_LETTER_CH(CUR) && (CUR != '_') &&
2182         (CUR != ':')) return(NULL);
2183 
2184     while ((i < HTML_PARSER_BUFFER_SIZE) &&
2185            ((IS_LETTER_CH(CUR)) || (IS_DIGIT_CH(CUR)) ||
2186            (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
2187         if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2188         else loc[i] = CUR;
2189         i++;
2190 
2191         NEXT;
2192     }
2193 
2194     return(xmlDictLookup(ctxt->dict, loc, i));
2195 }
2196 
2197 /**
2198  * htmlParseName:
2199  * @param ctxt an HTML parser context
2200  *
2201  * parse an HTML name, this routine is case sensitive.
2202  *
2203  * Returns the Name parsed or NULL
2204  */
2205 
2206 static const xmlChar *
htmlParseName(htmlParserCtxtPtr ctxt)2207 htmlParseName(htmlParserCtxtPtr ctxt) {
2208     const xmlChar *in;
2209     const xmlChar *ret;
2210     int count = 0;
2211 
2212     GROW;
2213 
2214     /*
2215      * Accelerator for simple ASCII names
2216      */
2217     in = ctxt->input->cur;
2218     if (((*in >= 0x61) && (*in <= 0x7A)) ||
2219         ((*in >= 0x41) && (*in <= 0x5A)) ||
2220         (*in == '_') || (*in == ':')) {
2221         in++;
2222         while (((*in >= 0x61) && (*in <= 0x7A)) ||
2223                ((*in >= 0x41) && (*in <= 0x5A)) ||
2224                ((*in >= 0x30) && (*in <= 0x39)) ||
2225                (*in == '_') || (*in == '-') ||
2226                (*in == ':') || (*in == '.'))
2227             in++;
2228         if ((*in > 0) && (*in < 0x80)) {
2229             count = in - ctxt->input->cur;
2230             ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2231             ctxt->input->cur = in;
2232             ctxt->nbChars += count;
2233             ctxt->input->col += count;
2234             return(ret);
2235         }
2236     }
2237     return(htmlParseNameComplex(ctxt));
2238 }
2239 
2240 static const xmlChar *
htmlParseNameComplex(xmlParserCtxtPtr ctxt)2241 htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2242     int len = 0, l;
2243     int c;
2244     int count = 0;
2245 
2246     /*
2247      * Handler for more complex cases
2248      */
2249     GROW;
2250     c = CUR_CHAR(l);
2251     if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2252         (!IS_LETTER(c) && (c != '_') &&
2253          (c != ':'))) {
2254         return(NULL);
2255     }
2256 
2257     while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2258            ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2259             (c == '.') || (c == '-') ||
2260             (c == '_') || (c == ':') ||
2261             (IS_COMBINING(c)) ||
2262             (IS_EXTENDER(c)))) {
2263         if (count++ > 100) {
2264             count = 0;
2265             GROW;
2266         }
2267         len += l;
2268         NEXTL(l);
2269         c = CUR_CHAR(l);
2270     }
2271     return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2272 }
2273 
2274 
2275 /**
2276  * htmlParseHTMLAttribute:
2277  * @param ctxt an HTML parser context
2278  * @param stop a char stop value
2279  *
2280  * parse an HTML attribute value till the stop (quote), if
2281  * stop is 0 then it stops at the first space
2282  *
2283  * Returns the attribute parsed or NULL
2284  */
2285 
2286 static xmlChar *
htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt,const xmlChar stop)2287 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2288     xmlChar *buffer = NULL;
2289     int buffer_size = 0;
2290     xmlChar *out = NULL;
2291     const xmlChar *name = NULL;
2292     const xmlChar *cur = NULL;
2293     const htmlEntityDesc * ent;
2294 
2295     /*
2296      * allocate a translation buffer.
2297      */
2298     buffer_size = HTML_PARSER_BUFFER_SIZE;
2299     buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
2300     if (buffer == NULL) {
2301         htmlErrMemory(ctxt, "buffer allocation failed\n");
2302         return(NULL);
2303     }
2304     out = buffer;
2305 
2306     /*
2307      * Ok loop until we reach one of the ending chars
2308      */
2309     while ((CUR != 0) && (CUR != stop)) {
2310         if ((stop == 0) && (CUR == '>')) break;
2311         if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2312         if (CUR == '&') {
2313             if (NXT(1) == '#') {
2314                 unsigned int c;
2315                 int bits;
2316 
2317                 c = htmlParseCharRef(ctxt);
2318                 if      (c <    0x80)
2319                         { *out++  = c;                bits= -6; }
2320                 else if (c <   0x800)
2321                         { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2322                 else if (c < 0x10000)
2323                         { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2324                 else
2325                         { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2326 
2327                 for ( ; bits >= 0; bits-= 6) {
2328                     *out++  = ((c >> bits) & 0x3F) | 0x80;
2329                 }
2330 
2331                 if (out - buffer > buffer_size - 100) {
2332                         int indx = out - buffer;
2333 
2334                         growBuffer(buffer);
2335                         out = &buffer[indx];
2336                 }
2337             } else {
2338                 ent = htmlParseEntityRef(ctxt, &name);
2339                 if (name == NULL) {
2340                     *out++ = '&';
2341                     if (out - buffer > buffer_size - 100) {
2342                         int indx = out - buffer;
2343 
2344                         growBuffer(buffer);
2345                         out = &buffer[indx];
2346                     }
2347                 } else if (ent == NULL) {
2348                     *out++ = '&';
2349                     cur = name;
2350                     while (*cur != 0) {
2351                         if (out - buffer > buffer_size - 100) {
2352                             int indx = out - buffer;
2353 
2354                             growBuffer(buffer);
2355                             out = &buffer[indx];
2356                         }
2357                         *out++ = *cur++;
2358                     }
2359                 } else {
2360                     unsigned int c;
2361                     int bits;
2362 
2363                     if (out - buffer > buffer_size - 100) {
2364                         int indx = out - buffer;
2365 
2366                         growBuffer(buffer);
2367                         out = &buffer[indx];
2368                     }
2369                     c = (xmlChar)ent->value;
2370                     if      (c <    0x80)
2371                         { *out++  = c;                bits= -6; }
2372                     else if (c <   0x800)
2373                         { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2374                     else if (c < 0x10000)
2375                         { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2376                     else
2377                         { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2378 
2379                     for ( ; bits >= 0; bits-= 6) {
2380                         *out++  = ((c >> bits) & 0x3F) | 0x80;
2381                     }
2382                 }
2383             }
2384         } else {
2385             unsigned int c;
2386             int bits, l;
2387 
2388             if (out - buffer > buffer_size - 100) {
2389                 int indx = out - buffer;
2390 
2391                 growBuffer(buffer);
2392                 out = &buffer[indx];
2393             }
2394             c = CUR_CHAR(l);
2395             if      (c <    0x80)
2396                     { *out++  = c;                bits= -6; }
2397             else if (c <   0x800)
2398                     { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2399             else if (c < 0x10000)
2400                     { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2401             else
2402                     { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2403 
2404             for ( ; bits >= 0; bits-= 6) {
2405                 *out++  = ((c >> bits) & 0x3F) | 0x80;
2406             }
2407             NEXT;
2408         }
2409     }
2410     *out++ = 0;
2411     return(buffer);
2412 }
2413 
2414 /**
2415  * htmlParseEntityRef:
2416  * @param ctxt an HTML parser context
2417  * @param str location to store the entity name
2418  *
2419  * parse an HTML ENTITY references
2420  *
2421  * [68] EntityRef ::= '&' Name ';'
2422  *
2423  * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2424  *         if non-NULL *str will have to be freed by the caller.
2425  */
2426 const htmlEntityDesc *
htmlParseEntityRef(htmlParserCtxtPtr ctxt,const xmlChar ** str)2427 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2428     const xmlChar *name;
2429     const htmlEntityDesc * ent = NULL;
2430     *str = NULL;
2431 
2432     if (CUR == '&') {
2433         NEXT;
2434         name = htmlParseName(ctxt);
2435         if (name == NULL) {
2436             htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2437                          "htmlParseEntityRef: no name\n", NULL, NULL);
2438         } else {
2439             GROW;
2440             if (CUR == ';') {
2441                 *str = name;
2442 
2443                 /*
2444                  * Lookup the entity in the table.
2445                  */
2446                 ent = htmlEntityLookup(name);
2447                 if (ent != NULL) /* OK that's ugly !!! */
2448                     NEXT;
2449             } else {
2450                 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2451                              "htmlParseEntityRef: expecting ';'\n",
2452                              NULL, NULL);
2453                 *str = name;
2454             }
2455         }
2456     }
2457     return(ent);
2458 }
2459 
2460 /**
2461  * htmlParseAttValue:
2462  * @param ctxt an HTML parser context
2463  *
2464  * parse a value for an attribute
2465  * Note: the parser won't do substitution of entities here, this
2466  * will be handled later in xmlStringGetNodeList, unless it was
2467  * asked for ctxt->replaceEntities != 0
2468  *
2469  * Returns the AttValue parsed or NULL.
2470  */
2471 
2472 static xmlChar *
htmlParseAttValue(htmlParserCtxtPtr ctxt)2473 htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2474     xmlChar *ret = NULL;
2475 
2476     if (CUR == '"') {
2477         NEXT;
2478         ret = htmlParseHTMLAttribute(ctxt, '"');
2479         if (CUR != '"') {
2480             htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2481                          "AttValue: \" expected\n", NULL, NULL);
2482         } else
2483             NEXT;
2484     } else if (CUR == '\'') {
2485         NEXT;
2486         ret = htmlParseHTMLAttribute(ctxt, '\'');
2487         if (CUR != '\'') {
2488             htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2489                          "AttValue: ' expected\n", NULL, NULL);
2490         } else
2491             NEXT;
2492     } else {
2493         /*
2494          * That's an HTMLism, the attribute value may not be quoted
2495          */
2496         ret = htmlParseHTMLAttribute(ctxt, 0);
2497         if (ret == NULL) {
2498             htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2499                          "AttValue: no value found\n", NULL, NULL);
2500         }
2501     }
2502     return(ret);
2503 }
2504 
2505 /**
2506  * htmlParseSystemLiteral:
2507  * @param ctxt an HTML parser context
2508  *
2509  * parse an HTML Literal
2510  *
2511  * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2512  *
2513  * Returns the SystemLiteral parsed or NULL
2514  */
2515 
2516 static xmlChar *
htmlParseSystemLiteral(htmlParserCtxtPtr ctxt)2517 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2518     const xmlChar *q;
2519     xmlChar *ret = NULL;
2520 
2521     if (CUR == '"') {
2522         NEXT;
2523         q = CUR_PTR;
2524         while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
2525             NEXT;
2526         if (!IS_CHAR_CH(CUR)) {
2527             htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2528                          "Unfinished SystemLiteral\n", NULL, NULL);
2529         } else {
2530             ret = xmlStrndup(q, CUR_PTR - q);
2531             NEXT;
2532         }
2533     } else if (CUR == '\'') {
2534         NEXT;
2535         q = CUR_PTR;
2536         while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
2537             NEXT;
2538         if (!IS_CHAR_CH(CUR)) {
2539             htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2540                          "Unfinished SystemLiteral\n", NULL, NULL);
2541         } else {
2542             ret = xmlStrndup(q, CUR_PTR - q);
2543             NEXT;
2544         }
2545     } else {
2546         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2547                      " or ' expected\n", NULL, NULL);
2548     }
2549 
2550     return(ret);
2551 }
2552 
2553 /**
2554  * htmlParsePubidLiteral:
2555  * @param ctxt an HTML parser context
2556  *
2557  * parse an HTML public literal
2558  *
2559  * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2560  *
2561  * Returns the PubidLiteral parsed or NULL.
2562  */
2563 
2564 static xmlChar *
htmlParsePubidLiteral(htmlParserCtxtPtr ctxt)2565 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2566     const xmlChar *q;
2567     xmlChar *ret = NULL;
2568     /*
2569      * Name ::= (Letter | '_') (NameChar)*
2570      */
2571     if (CUR == '"') {
2572         NEXT;
2573         q = CUR_PTR;
2574         while (IS_PUBIDCHAR_CH(CUR)) NEXT;
2575         if (CUR != '"') {
2576             htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2577                          "Unfinished PubidLiteral\n", NULL, NULL);
2578         } else {
2579             ret = xmlStrndup(q, CUR_PTR - q);
2580             NEXT;
2581         }
2582     } else if (CUR == '\'') {
2583         NEXT;
2584         q = CUR_PTR;
2585         while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
2586             NEXT;
2587         if (CUR != '\'') {
2588             htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2589                          "Unfinished PubidLiteral\n", NULL, NULL);
2590         } else {
2591             ret = xmlStrndup(q, CUR_PTR - q);
2592             NEXT;
2593         }
2594     } else {
2595         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2596                      "PubidLiteral \" or ' expected\n", NULL, NULL);
2597     }
2598 
2599     return(ret);
2600 }
2601 
2602 /**
2603  * htmlParseScript:
2604  * @param ctxt an HTML parser context
2605  *
2606  * parse the content of an HTML SCRIPT or STYLE element
2607  * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2608  * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2609  * http://www.w3.org/TR/html4/types.html#type-script
2610  * http://www.w3.org/TR/html4/types.html#h-6.15
2611  * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2612  *
2613  * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2614  * element and the value of intrinsic event attributes. User agents must
2615  * not evaluate script data as HTML markup but instead must pass it on as
2616  * data to a script engine.
2617  * NOTES:
2618  * - The content is passed like CDATA
2619  * - the attributes for style and scripting "onXXX" are also described
2620  *   as CDATA but SGML allows entities references in attributes so their
2621  *   processing is identical as other attributes
2622  */
2623 static void
htmlParseScript(htmlParserCtxtPtr ctxt)2624 htmlParseScript(htmlParserCtxtPtr ctxt) {
2625     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
2626     int nbchar = 0;
2627     xmlChar cur;
2628 
2629     SHRINK;
2630     cur = CUR;
2631     while (IS_CHAR_CH(cur)) {
2632         if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') &&
2633             (NXT(3) == '-')) {
2634             if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2635                 if (ctxt->sax->cdataBlock!= NULL) {
2636                     /*
2637                      * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2638                      */
2639                     ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2640                 } else if (ctxt->sax->characters != NULL) {
2641                     ctxt->sax->characters(ctxt->userData, buf, nbchar);
2642                 }
2643             }
2644             nbchar = 0;
2645             htmlParseComment(ctxt);
2646             cur = CUR;
2647             continue;
2648         } else if ((cur == '<') && (NXT(1) == '/')) {
2649             /*
2650              * One should break here, the specification is clear:
2651              * Authors should therefore escape "</" within the content.
2652              * Escape mechanisms are specific to each scripting or
2653              * style sheet language.
2654              */
2655             if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2656                 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2657                 break; /* while */
2658         }
2659         buf[nbchar++] = cur;
2660         if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2661             if (ctxt->sax->cdataBlock!= NULL) {
2662                 /*
2663                  * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2664                  */
2665                 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2666             } else if (ctxt->sax->characters != NULL) {
2667                 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2668             }
2669             nbchar = 0;
2670         }
2671         NEXT;
2672         cur = CUR;
2673     }
2674     if (!(IS_CHAR_CH(cur))) {
2675         htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2676                         "Invalid char in CDATA 0x%X\n", cur);
2677         NEXT;
2678     }
2679 
2680     if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2681         if (ctxt->sax->cdataBlock!= NULL) {
2682             /*
2683              * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2684              */
2685             ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2686         } else if (ctxt->sax->characters != NULL) {
2687             ctxt->sax->characters(ctxt->userData, buf, nbchar);
2688         }
2689     }
2690 }
2691 
2692 
2693 /**
2694  * htmlParseCharData:
2695  * @param ctxt an HTML parser context
2696  *
2697  * parse a CharData section.
2698  * if we are within a CDATA section ']]>' marks an end of section.
2699  *
2700  * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2701  */
2702 
2703 static void
htmlParseCharData(htmlParserCtxtPtr ctxt)2704 htmlParseCharData(htmlParserCtxtPtr ctxt) {
2705     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2706     int nbchar = 0;
2707     int cur, l;
2708 
2709     SHRINK;
2710     cur = CUR_CHAR(l);
2711     while (((cur != '<') || (ctxt->token == '<')) &&
2712            ((cur != '&') || (ctxt->token == '&')) &&
2713            (IS_CHAR(cur))) {
2714         COPY_BUF(l,buf,nbchar,cur);
2715         if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2716             /*
2717              * Ok the segment is to be consumed as chars.
2718              */
2719             if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2720                 if (areBlanks(ctxt, buf, nbchar)) {
2721                     if (ctxt->sax->ignorableWhitespace != NULL)
2722                         ctxt->sax->ignorableWhitespace(ctxt->userData,
2723                                                        buf, nbchar);
2724                 } else {
2725                     htmlCheckParagraph(ctxt);
2726                     if (ctxt->sax->characters != NULL)
2727                         ctxt->sax->characters(ctxt->userData, buf, nbchar);
2728                 }
2729             }
2730             nbchar = 0;
2731         }
2732         NEXTL(l);
2733         cur = CUR_CHAR(l);
2734         if (cur == 0) {
2735             SHRINK;
2736             GROW;
2737             cur = CUR_CHAR(l);
2738         }
2739     }
2740     if (nbchar != 0) {
2741         /*
2742          * Ok the segment is to be consumed as chars.
2743          */
2744         if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2745             if (areBlanks(ctxt, buf, nbchar)) {
2746                 if (ctxt->sax->ignorableWhitespace != NULL)
2747                     ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2748             } else {
2749                 htmlCheckParagraph(ctxt);
2750                 if (ctxt->sax->characters != NULL)
2751                     ctxt->sax->characters(ctxt->userData, buf, nbchar);
2752             }
2753         }
2754     } else {
2755         /*
2756          * Loop detection
2757          */
2758         if (cur == 0)
2759             ctxt->instate = XML_PARSER_EOF;
2760     }
2761 }
2762 
2763 /**
2764  * htmlParseExternalID:
2765  * @param ctxt an HTML parser context
2766  * @param publicID a xmlChar** receiving PubidLiteral
2767  *
2768  * Parse an External ID or a Public ID
2769  *
2770  * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2771  *                   | 'PUBLIC' S PubidLiteral S SystemLiteral
2772  *
2773  * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2774  *
2775  * Returns the function returns SystemLiteral and in the second
2776  *                case publicID receives PubidLiteral, is strict is off
2777  *                it is possible to return NULL and have publicID set.
2778  */
2779 
2780 static xmlChar *
htmlParseExternalID(htmlParserCtxtPtr ctxt,xmlChar ** publicID)2781 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
2782     xmlChar *URI = NULL;
2783 
2784     if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2785          (UPP(2) == 'S') && (UPP(3) == 'T') &&
2786          (UPP(4) == 'E') && (UPP(5) == 'M')) {
2787         SKIP(6);
2788         if (!IS_BLANK_CH(CUR)) {
2789             htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2790                          "Space required after 'SYSTEM'\n", NULL, NULL);
2791         }
2792         SKIP_BLANKS;
2793         URI = htmlParseSystemLiteral(ctxt);
2794         if (URI == NULL) {
2795             htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
2796                          "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
2797         }
2798     } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2799                (UPP(2) == 'B') && (UPP(3) == 'L') &&
2800                (UPP(4) == 'I') && (UPP(5) == 'C')) {
2801         SKIP(6);
2802         if (!IS_BLANK_CH(CUR)) {
2803             htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2804                          "Space required after 'PUBLIC'\n", NULL, NULL);
2805         }
2806         SKIP_BLANKS;
2807         *publicID = htmlParsePubidLiteral(ctxt);
2808         if (*publicID == NULL) {
2809             htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
2810                          "htmlParseExternalID: PUBLIC, no Public Identifier\n",
2811                          NULL, NULL);
2812         }
2813         SKIP_BLANKS;
2814         if ((CUR == '"') || (CUR == '\'')) {
2815             URI = htmlParseSystemLiteral(ctxt);
2816         }
2817     }
2818     return(URI);
2819 }
2820 
2821 /**
2822  * htmlParseComment:
2823  * @param ctxt an HTML parser context
2824  *
2825  * Parse an XML (SGML) comment <!-- .... -->
2826  *
2827  * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
2828  */
2829 static void
htmlParseComment(htmlParserCtxtPtr ctxt)2830 htmlParseComment(htmlParserCtxtPtr ctxt)
2831 {
2832     xmlChar* buf = NULL;
2833     int len;
2834     int size = HTML_PARSER_BUFFER_SIZE;
2835     int q, ql;
2836     int r, rl;
2837     int cur, l;
2838     xmlParserInputState state;
2839 
2840     /*
2841      * Check that there is a comment right here.
2842      */
2843     if ((RAW != '<') || (NXT(1) != '!') ||
2844         (NXT(2) != '-') || (NXT(3) != '-')) return;
2845 
2846     state = ctxt->instate;
2847     ctxt->instate = XML_PARSER_COMMENT;
2848     SHRINK;
2849     SKIP(4);
2850     buf = (xmlChar*) xmlMallocAtomic(size * sizeof(xmlChar));
2851     if (!buf)
2852         goto OOM_exit;
2853     // Now we must free 'buf' before returning
2854     q = CUR_CHAR(ql);
2855     NEXTL(ql);
2856     r = CUR_CHAR(rl);
2857     NEXTL(rl);
2858     cur = CUR_CHAR(l);
2859     len = 0;
2860     while (IS_CHAR(cur) &&
2861           ((cur != '>') || (r != '-') || (q != '-')))
2862     {
2863         if (len + 5 >= size)
2864         {   // DONE: Fix xmlRealloc
2865             void* tmp;
2866             size *= 2;
2867             tmp = xmlRealloc(buf, size * sizeof(xmlChar));
2868             if (!tmp)
2869             {
2870 OOM:
2871                 xmlFree(buf);
2872 OOM_exit:
2873                 htmlErrMemory(ctxt, "buffer allocation failed\n");
2874                 ctxt->instate = state;
2875                 return;
2876             }
2877             buf = (xmlChar*) tmp;
2878         }
2879         COPY_BUF(ql,buf,len,q);
2880         q = r;
2881         ql = rl;
2882         r = cur;
2883         rl = l;
2884         NEXTL(l);
2885         cur = CUR_CHAR(l);
2886         if (cur == 0) {
2887             SHRINK;
2888             GROW;
2889             cur = CUR_CHAR(l);
2890         }
2891     } // end of "while good character and not the end of comment (-->)"
2892 
2893     buf[len] = 0;
2894     if (!IS_CHAR(cur)) {
2895         htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
2896                  "Comment not terminated \n<!--%.50s\n", buf, NULL);
2897         xmlFree(buf);
2898     } else {
2899         NEXT;
2900         if (ctxt->sax           &&
2901             ctxt->sax->comment  &&
2902             !ctxt->disableSAX)
2903         {
2904             ctxt->sax->comment(ctxt->userData, buf);
2905         }
2906     }
2907     xmlFree(buf);
2908     ctxt->instate = state;
2909 }
2910 
2911 /**
2912  * htmlParseCharRef:
2913  * @param ctxt an HTML parser context
2914  *
2915  * parse Reference declarations
2916  *
2917  * [66] CharRef ::= '&#' [0-9]+ ';' |
2918  *                  '&#x' [0-9a-fA-F]+ ';'
2919  *
2920  * Returns the value parsed (as an int)
2921  */
2922 int
htmlParseCharRef(htmlParserCtxtPtr ctxt)2923 htmlParseCharRef(htmlParserCtxtPtr ctxt) {
2924     int val = 0;
2925 
2926     if ((CUR == '&') && (NXT(1) == '#') &&
2927         ((NXT(2) == 'x') || NXT(2) == 'X')) {
2928         SKIP(3);
2929         while (CUR != ';') {
2930             if ((CUR >= '0') && (CUR <= '9'))
2931                 val = val * 16 + (CUR - '0');
2932             else if ((CUR >= 'a') && (CUR <= 'f'))
2933                 val = val * 16 + (CUR - 'a') + 10;
2934             else if ((CUR >= 'A') && (CUR <= 'F'))
2935                 val = val * 16 + (CUR - 'A') + 10;
2936             else {
2937                 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
2938                              "htmlParseCharRef: invalid hexadecimal value\n",
2939                              NULL, NULL);
2940                 return(0);
2941             }
2942             NEXT;
2943         }
2944         if (CUR == ';')
2945             NEXT;
2946     } else if  ((CUR == '&') && (NXT(1) == '#')) {
2947         SKIP(2);
2948         while (CUR != ';') {
2949             if ((CUR >= '0') && (CUR <= '9'))
2950                 val = val * 10 + (CUR - '0');
2951             else {
2952                 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
2953                              "htmlParseCharRef: invalid decimal value\n",
2954                              NULL, NULL);
2955                 return(0);
2956             }
2957             NEXT;
2958         }
2959         if (CUR == ';')
2960             NEXT;
2961     } else {
2962         htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
2963                      "htmlParseCharRef: invalid value\n", NULL, NULL);
2964     }
2965     /*
2966      * Check the value IS_CHAR ...
2967      */
2968     if (IS_CHAR(val)) {
2969         return(val);
2970     } else {
2971         htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2972                         "htmlParseCharRef: invalid xmlChar value %d\n",
2973                         val);
2974     }
2975     return(0);
2976 }
2977 
2978 
2979 /**
2980  * htmlParseDocTypeDecl:
2981  * @param ctxt an HTML parser context
2982  *
2983  * parse a DOCTYPE declaration
2984  *
2985  * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
2986  *                      ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
2987  */
2988 
2989 static void
htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt)2990 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
2991     const xmlChar *name;
2992     xmlChar *ExternalID = NULL;
2993     xmlChar *URI = NULL;
2994 
2995     /*
2996      * We know that '<!DOCTYPE' has been detected.
2997      */
2998     SKIP(9);
2999 
3000     SKIP_BLANKS;
3001 
3002     /*
3003      * Parse the DOCTYPE name.
3004      */
3005     name = htmlParseName(ctxt);
3006     if (name == NULL) {
3007         htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3008                      "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3009                      NULL, NULL);
3010     }
3011     /*
3012      * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3013      */
3014 
3015     SKIP_BLANKS;
3016 
3017     /*
3018      * Check for SystemID and ExternalID
3019      */
3020     URI = htmlParseExternalID(ctxt, &ExternalID);
3021     SKIP_BLANKS;
3022 
3023     /*
3024      * We should be at the end of the DOCTYPE declaration.
3025      */
3026     if (CUR != '>') {
3027         htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3028                      "DOCTYPE improperly terminated\n", NULL, NULL);
3029         /* We shouldn't try to resynchronize ... */
3030     }
3031     NEXT;
3032 
3033     /*
3034      * Create or update the document accordingly to the DOCTYPE
3035      */
3036     if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3037         (!ctxt->disableSAX))
3038         ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3039 
3040     /*
3041      * Cleanup, since we don't use all those identifiers
3042      */
3043     if (URI != NULL) xmlFree(URI);
3044     if (ExternalID != NULL) xmlFree(ExternalID);
3045 }
3046 
3047 /**
3048  * htmlParseAttribute:
3049  * @param ctxt an HTML parser context
3050  * @param value a xmlChar ** used to store the value of the attribute
3051  *
3052  * parse an attribute
3053  *
3054  * [41] Attribute ::= Name Eq AttValue
3055  *
3056  * [25] Eq ::= S? '=' S?
3057  *
3058  * With namespace:
3059  *
3060  * [NS 11] Attribute ::= QName Eq AttValue
3061  *
3062  * Also the case QName == xmlns:??? is handled independently as a namespace
3063  * definition.
3064  *
3065  * Returns the attribute name, and the value in *value.
3066  */
3067 
3068 static const xmlChar *
htmlParseAttribute(htmlParserCtxtPtr ctxt,xmlChar ** value)3069 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3070     const xmlChar *name;
3071     xmlChar *val = NULL;
3072 
3073     *value = NULL;
3074     name = htmlParseHTMLName(ctxt);
3075     if (name == NULL) {
3076         htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3077                      "error parsing attribute name\n", NULL, NULL);
3078         return(NULL);
3079     }
3080 
3081     /*
3082      * read the value
3083      */
3084     SKIP_BLANKS;
3085     if (CUR == '=') {
3086         NEXT;
3087         SKIP_BLANKS;
3088         val = htmlParseAttValue(ctxt);
3089         /******
3090     } else {
3091 
3092         if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
3093             ctxt->sax->warning(ctxt->userData,
3094                "No value for attribute %s\n", name); */
3095     }
3096 
3097     *value = val;
3098     return(name);
3099 }
3100 
3101 /**
3102  * htmlCheckEncoding:
3103  * @param ctxt an HTML parser context
3104  * @param attvalue the attribute value
3105  *
3106  * Checks an http-equiv attribute from a Meta tag to detect
3107  * the encoding
3108  * If a new encoding is detected the parser is switched to decode
3109  * it and pass UTF8
3110  */
3111 static void
htmlCheckEncoding(htmlParserCtxtPtr ctxt,const xmlChar * attvalue)3112 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3113     const xmlChar *encoding;
3114 
3115     if ((ctxt == NULL) || (attvalue == NULL))
3116         return;
3117 
3118     /* do not change encoding */
3119     if (ctxt->input->encoding != NULL)
3120         return;
3121 
3122     encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3123     if (encoding != NULL) {
3124         encoding += 8;
3125     } else {
3126         encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3127         if (encoding != NULL)
3128             encoding += 9;
3129     }
3130     if (encoding != NULL) {
3131         xmlCharEncoding enc;
3132         xmlCharEncodingHandlerPtr handler;
3133 
3134         while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3135 
3136         if (ctxt->input->encoding != NULL)
3137             xmlFree((xmlChar *) ctxt->input->encoding);
3138         ctxt->input->encoding = xmlStrdup(encoding);
3139 
3140         enc = xmlParseCharEncoding((const char *) encoding);
3141         /*
3142          * registered set of known encodings
3143          */
3144         if (enc != XML_CHAR_ENCODING_ERROR) {
3145             xmlSwitchEncoding(ctxt, enc);
3146             ctxt->charset = XML_CHAR_ENCODING_UTF8;
3147         } else {
3148             /*
3149              * fallback for unknown encodings
3150              */
3151             handler = xmlFindCharEncodingHandler((const char *) encoding);
3152             if (handler != NULL) {
3153                 xmlSwitchToEncoding(ctxt, handler);
3154                 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3155             } else {
3156                 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3157             }
3158         }
3159 
3160         if ((ctxt->input->buf != NULL) &&
3161             (ctxt->input->buf->encoder != NULL) &&
3162             (ctxt->input->buf->raw != NULL) &&
3163             (ctxt->input->buf->buffer != NULL)) {
3164             int nbchars;
3165             int processed;
3166 
3167             /*
3168              * convert as much as possible to the parser reading buffer.
3169              */
3170             processed = ctxt->input->cur - ctxt->input->base;
3171             xmlBufferShrink(ctxt->input->buf->buffer, processed);
3172             nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3173                                        ctxt->input->buf->buffer,
3174                                        ctxt->input->buf->raw);
3175             if (nbchars < 0) {
3176                 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3177                              "htmlCheckEncoding: encoder error\n",
3178                              NULL, NULL);
3179             }
3180             ctxt->input->base =
3181             ctxt->input->cur = ctxt->input->buf->buffer->content;
3182         }
3183     }
3184 }
3185 
3186 /**
3187  * htmlCheckMeta:
3188  * @param ctxt an HTML parser context
3189  * @param atts the attributes values
3190  *
3191  * Checks an attributes from a Meta tag
3192  */
3193 static void
htmlCheckMeta(htmlParserCtxtPtr ctxt,const xmlChar ** atts)3194 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3195     int i;
3196     const xmlChar *att, *value;
3197     int http = 0;
3198     const xmlChar *content = NULL;
3199 
3200     if ((ctxt == NULL) || (atts == NULL))
3201         return;
3202 
3203     i = 0;
3204     att = atts[i++];
3205     while (att != NULL) {
3206         value = atts[i++];
3207         if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3208          && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3209             http = 1;
3210         else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3211             content = value;
3212         att = atts[i++];
3213     }
3214     if ((http) && (content != NULL))
3215         htmlCheckEncoding(ctxt, content);
3216 
3217 }
3218 
3219 /**
3220  * htmlParseStartTag:
3221  * @param ctxt an HTML parser context
3222  *
3223  * parse a start of tag either for rule element or
3224  * EmptyElement. In both case we don't parse the tag closing chars.
3225  *
3226  * [40] STag ::= '<' Name (S Attribute)* S? '>'
3227  *
3228  * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3229  *
3230  * With namespace:
3231  *
3232  * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3233  *
3234  * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3235  *
3236  */
3237 
3238 static void
htmlParseStartTag(htmlParserCtxtPtr ctxt)3239 htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3240     const xmlChar *name;
3241     const xmlChar *attname;
3242     xmlChar *attvalue;
3243     const xmlChar **atts = ctxt->atts;
3244     int nbatts = 0;
3245     int maxatts = ctxt->maxatts;
3246     int meta = 0;
3247     int i;
3248 
3249     if (CUR != '<') return;
3250     NEXT;
3251 
3252     GROW;
3253     name = htmlParseHTMLName(ctxt);
3254     if (name == NULL) {
3255         htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3256                      "htmlParseStartTag: invalid element name\n",
3257                      NULL, NULL);
3258         /* Dump the bogus tag like browsers do */
3259         while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
3260             NEXT;
3261         return;
3262     }
3263     if (xmlStrEqual(name, BAD_CAST"meta"))
3264         meta = 1;
3265 
3266     /*
3267      * Check for auto-closure of HTML elements.
3268      */
3269     htmlAutoClose(ctxt, name);
3270 
3271     /*
3272      * Check for implied HTML elements.
3273      */
3274     htmlCheckImplied(ctxt, name);
3275 
3276     /*
3277      * Avoid html at any level > 0, head at any level != 1
3278      * or any attempt to recurse body
3279      */
3280     if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3281         htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3282                      "htmlParseStartTag: misplaced <html> tag\n",
3283                      name, NULL);
3284         return;
3285     }
3286     if ((ctxt->nameNr != 1) &&
3287         (xmlStrEqual(name, BAD_CAST"head"))) {
3288         htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3289                      "htmlParseStartTag: misplaced <head> tag\n",
3290                      name, NULL);
3291         return;
3292     }
3293     if (xmlStrEqual(name, BAD_CAST"body")) {
3294         int indx;
3295         for (indx = 0;indx < ctxt->nameNr;indx++) {
3296             if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
3297                 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3298                              "htmlParseStartTag: misplaced <body> tag\n",
3299                              name, NULL);
3300                 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
3301                     NEXT;
3302                 return;
3303             }
3304         }
3305     }
3306 
3307     /*
3308      * Now parse the attributes, it ends up with the ending
3309      *
3310      * (S Attribute)* S?
3311      */
3312     SKIP_BLANKS;
3313     while ((IS_CHAR_CH(CUR)) &&
3314            (CUR != '>') &&
3315            ((CUR != '/') || (NXT(1) != '>'))) {
3316         long cons = ctxt->nbChars;
3317 
3318         GROW;
3319         attname = htmlParseAttribute(ctxt, &attvalue);
3320         if (attname != NULL) {
3321 
3322             /*
3323              * Well formedness requires at most one declaration of an attribute
3324              */
3325             for (i = 0; i < nbatts;i += 2) {
3326                 if (xmlStrEqual(atts[i], attname)) {
3327                     htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3328                                  "Attribute %s redefined\n", attname, NULL);
3329                     if (attvalue != NULL)
3330                         xmlFree(attvalue);
3331                     goto failed;
3332                 }
3333             }
3334 
3335             /*
3336              * Add the pair to atts
3337              */
3338             if (atts == NULL) {
3339                 maxatts = 22; /* allow for 10 attrs by default */
3340                 atts = (const xmlChar **)
3341                        xmlMalloc(maxatts * sizeof(xmlChar *));
3342                 if (atts == NULL) {
3343                     htmlErrMemory(ctxt, NULL);
3344                     if (attvalue != NULL)
3345                         xmlFree(attvalue);
3346                     goto failed;
3347                 }
3348                 ctxt->atts = atts;
3349                 ctxt->maxatts = maxatts;
3350             } else if (nbatts + 4 > maxatts) {
3351                 const xmlChar **n;
3352 
3353                 maxatts *= 2;
3354                 n = (const xmlChar **) xmlRealloc((void *) atts,
3355                                              maxatts * sizeof(const xmlChar *));
3356                 if (n == NULL) {
3357                     htmlErrMemory(ctxt, NULL);
3358                     if (attvalue != NULL)
3359                         xmlFree(attvalue);
3360                     goto failed;
3361                 }
3362                 atts = n;
3363                 ctxt->atts = atts;
3364                 ctxt->maxatts = maxatts;
3365             }
3366             atts[nbatts++] = attname;
3367             atts[nbatts++] = attvalue;
3368             atts[nbatts] = NULL;
3369             atts[nbatts + 1] = NULL;
3370         }
3371         else {
3372             if (attvalue != NULL)
3373                 xmlFree(attvalue);
3374             /* Dump the bogus attribute string up to the next blank or
3375              * the end of the tag. */
3376             while ((IS_CHAR_CH(CUR)) &&
3377                    !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
3378                    ((CUR != '/') || (NXT(1) != '>')))
3379                 NEXT;
3380         }
3381 
3382 failed:
3383         SKIP_BLANKS;
3384         if (cons == ctxt->nbChars) {
3385             htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3386                          "htmlParseStartTag: problem parsing attributes\n",
3387                          NULL, NULL);
3388             break;
3389         }
3390     }
3391 
3392     /*
3393      * Handle specific association to the META tag
3394      */
3395     if (meta)
3396         htmlCheckMeta(ctxt, atts);
3397 
3398     /*
3399      * SAX: Start of Element !
3400      */
3401     htmlnamePush(ctxt, name);
3402     if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3403         if (nbatts != 0)
3404             ctxt->sax->startElement(ctxt->userData, name, atts);
3405         else
3406             ctxt->sax->startElement(ctxt->userData, name, NULL);
3407     }
3408 
3409     if (atts != NULL) {
3410         for (i = 1;i < nbatts;i += 2) {
3411             if (atts[i] != NULL)
3412                 xmlFree((xmlChar *) atts[i]);
3413         }
3414     }
3415 }
3416 
3417 /**
3418  * htmlParseEndTag:
3419  * @param ctxt an HTML parser context
3420  *
3421  * parse an end of tag
3422  *
3423  * [42] ETag ::= '</' Name S? '>'
3424  *
3425  * With namespace
3426  *
3427  * [NS 9] ETag ::= '</' QName S? '>'
3428  *
3429  * Returns 1 if the current level should be closed.
3430  */
3431 
3432 static int
htmlParseEndTag(htmlParserCtxtPtr ctxt)3433 htmlParseEndTag(htmlParserCtxtPtr ctxt)
3434 {
3435     const xmlChar *name;
3436     const xmlChar *oldname;
3437     int i, ret;
3438 
3439     if ((CUR != '<') || (NXT(1) != '/')) {
3440         htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3441                      "htmlParseEndTag: '</' not found\n", NULL, NULL);
3442         return (0);
3443     }
3444     SKIP(2);
3445 
3446     name = htmlParseHTMLName(ctxt);
3447     if (name == NULL)
3448         return (0);
3449 
3450     /*
3451      * We should definitely be at the ending "S? '>'" part
3452      */
3453     SKIP_BLANKS;
3454     if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
3455         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3456                      "End tag : expected '>'\n", NULL, NULL);
3457     } else
3458         NEXT;
3459 
3460     /*
3461      * If the name read is not one of the element in the parsing stack
3462      * then return, it's just an error.
3463      */
3464     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3465         if (xmlStrEqual(name, ctxt->nameTab[i]))
3466             break;
3467     }
3468     if (i < 0) {
3469         htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3470                      "Unexpected end tag : %s\n", name, NULL);
3471         return (0);
3472     }
3473 
3474 
3475     /*
3476      * Check for auto-closure of HTML elements.
3477      */
3478 
3479     htmlAutoCloseOnClose(ctxt, name);
3480 
3481     /*
3482      * Well formedness constraints, opening and closing must match.
3483      * With the exception that the autoclose may have popped stuff out
3484      * of the stack.
3485      */
3486     if (!xmlStrEqual(name, ctxt->name)) {
3487         if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
3488             htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3489                          "Opening and ending tag mismatch: %s and %s\n",
3490                          name, ctxt->name);
3491         }
3492     }
3493 
3494     /*
3495      * SAX: End of Tag
3496      */
3497     oldname = ctxt->name;
3498     if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3499         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3500             ctxt->sax->endElement(ctxt->userData, name);
3501         htmlnamePop(ctxt);
3502         ret = 1;
3503     } else {
3504         ret = 0;
3505     }
3506 
3507     return (ret);
3508 }
3509 
3510 
3511 /**
3512  * htmlParseReference:
3513  * @param ctxt an HTML parser context
3514  *
3515  * parse and handle entity references in content,
3516  * this will end-up in a call to character() since this is either a
3517  * CharRef, or a predefined entity.
3518  */
3519 static void
htmlParseReference(htmlParserCtxtPtr ctxt)3520 htmlParseReference(htmlParserCtxtPtr ctxt) {
3521     const htmlEntityDesc * ent;
3522     xmlChar out[6];
3523     const xmlChar *name;
3524     if (CUR != '&') return;
3525 
3526     if (NXT(1) == '#') {
3527         unsigned int c;
3528         int bits, i = 0;
3529 
3530         c = htmlParseCharRef(ctxt);
3531         if (c == 0)
3532             return;
3533 
3534         if      (c <    0x80) { out[i++]= c;                bits= -6; }
3535         else if (c <   0x800) { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
3536         else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
3537         else                  { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
3538 
3539         for ( ; bits >= 0; bits-= 6) {
3540             out[i++]= ((c >> bits) & 0x3F) | 0x80;
3541         }
3542         out[i] = 0;
3543 
3544         htmlCheckParagraph(ctxt);
3545         if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3546             ctxt->sax->characters(ctxt->userData, out, i);
3547     } else {
3548         ent = htmlParseEntityRef(ctxt, &name);
3549         if (name == NULL) {
3550             htmlCheckParagraph(ctxt);
3551             if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3552                 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3553             return;
3554         }
3555         if ((ent == NULL) || !(ent->value > 0)) {
3556             htmlCheckParagraph(ctxt);
3557             if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3558                 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3559                 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3560                 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3561             }
3562         } else {
3563             unsigned int c;
3564             int bits, i = 0;
3565 
3566             c = ent->value;
3567             if      (c <    0x80)
3568                     { out[i++]= c;                bits= -6; }
3569             else if (c <   0x800)
3570                     { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
3571             else if (c < 0x10000)
3572                     { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
3573             else
3574                     { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
3575 
3576             for ( ; bits >= 0; bits-= 6) {
3577                 out[i++]= ((c >> bits) & 0x3F) | 0x80;
3578             }
3579             out[i] = 0;
3580 
3581             htmlCheckParagraph(ctxt);
3582             if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3583                 ctxt->sax->characters(ctxt->userData, out, i);
3584         }
3585     }
3586 }
3587 
3588 /**
3589  * htmlParseContent:
3590  * @param ctxt an HTML parser context
3591  * @param name the node name
3592  *
3593  * Parse a content: comment, sub-element, reference or text.
3594  *
3595  */
3596 
3597 static void
htmlParseContent(htmlParserCtxtPtr ctxt)3598 htmlParseContent(htmlParserCtxtPtr ctxt) {
3599     xmlChar *currentNode;
3600     int depth;
3601 
3602     currentNode = xmlStrdup(ctxt->name);
3603     depth = ctxt->nameNr;
3604     while (1) {
3605         long cons = ctxt->nbChars;
3606 
3607         GROW;
3608         /*
3609          * Our tag or one of it's parent or children is ending.
3610          */
3611         if ((CUR == '<') && (NXT(1) == '/')) {
3612             if (htmlParseEndTag(ctxt) &&
3613                 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
3614                 if (currentNode != NULL)
3615                     xmlFree(currentNode);
3616                 return;
3617             }
3618             continue; /* while */
3619         }
3620 
3621         /*
3622          * Has this node been popped out during parsing of
3623          * the next element
3624          */
3625         if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3626             (!xmlStrEqual(currentNode, ctxt->name)))
3627              {
3628             if (currentNode != NULL) xmlFree(currentNode);
3629             return;
3630         }
3631 
3632         if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3633             (xmlStrEqual(currentNode, BAD_CAST"style")))) {
3634             /*
3635              * Handle SCRIPT/STYLE separately
3636              */
3637             htmlParseScript(ctxt);
3638         } else {
3639             /*
3640              * Sometimes DOCTYPE arrives in the middle of the document
3641              */
3642             if ((CUR == '<') && (NXT(1) == '!') &&
3643                 (UPP(2) == 'D') && (UPP(3) == 'O') &&
3644                 (UPP(4) == 'C') && (UPP(5) == 'T') &&
3645                 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3646                 (UPP(8) == 'E')) {
3647                 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3648                              "Misplaced DOCTYPE declaration\n",
3649                              BAD_CAST "DOCTYPE" , NULL);
3650                 htmlParseDocTypeDecl(ctxt);
3651             }
3652 
3653             /*
3654              * First case :  a comment
3655              */
3656             if ((CUR == '<') && (NXT(1) == '!') &&
3657                 (NXT(2) == '-') && (NXT(3) == '-')) {
3658                 htmlParseComment(ctxt);
3659             }
3660 
3661             /*
3662              * Second case :  a sub-element.
3663              */
3664             else if (CUR == '<') {
3665                 htmlParseElement(ctxt);
3666             }
3667 
3668             /*
3669              * Third case : a reference. If if has not been resolved,
3670              *    parsing returns it's Name, create the node
3671              */
3672             else if (CUR == '&') {
3673                 htmlParseReference(ctxt);
3674             }
3675 
3676             /*
3677              * Fourth : end of the resource
3678              */
3679             else if (CUR == 0) {
3680                 htmlAutoCloseOnEnd(ctxt);
3681                 break;
3682             }
3683 
3684             /*
3685              * Last case, text. Note that References are handled directly.
3686              */
3687             else {
3688                 htmlParseCharData(ctxt);
3689             }
3690 
3691             if (cons == ctxt->nbChars) {
3692                 if (ctxt->node != NULL) {
3693                     htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3694                                  "detected an error in element content\n",
3695                                  NULL, NULL);
3696                 }
3697                 break;
3698             }
3699         }
3700         GROW;
3701     }
3702     if (currentNode != NULL) xmlFree(currentNode);
3703 }
3704 
3705 /**
3706  * htmlParseElement:
3707  * @param ctxt an HTML parser context
3708  *
3709  * parse an HTML element, this is highly recursive
3710  *
3711  * [39] element ::= EmptyElemTag | STag content ETag
3712  *
3713  * [41] Attribute ::= Name Eq AttValue
3714  */
3715 
3716 void
htmlParseElement(htmlParserCtxtPtr ctxt)3717 htmlParseElement(htmlParserCtxtPtr ctxt) {
3718     const xmlChar *name;
3719     xmlChar *currentNode = NULL;
3720     const htmlElemDesc * info;
3721     htmlParserNodeInfo node_info;
3722     const xmlChar *oldname;
3723     int depth = ctxt->nameNr;
3724     const xmlChar *oldptr;
3725 
3726     /* Capture start position */
3727     if (ctxt->record_info) {
3728         node_info.begin_pos = ctxt->input->consumed +
3729                           (CUR_PTR - ctxt->input->base);
3730         node_info.begin_line = ctxt->input->line;
3731     }
3732 
3733     oldname = ctxt->name;
3734     htmlParseStartTag(ctxt);
3735     name = ctxt->name;
3736     if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
3737         (name == NULL)) {
3738         if (CUR == '>')
3739             NEXT;
3740         return;
3741     }
3742 
3743     /*
3744      * Lookup the info for that element.
3745      */
3746     info = htmlTagLookup(name);
3747     if (info == NULL) {
3748         htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
3749                      "Tag %s invalid\n", name, NULL);
3750     }
3751 
3752     /*
3753      * Check for an Empty Element labeled the XML/SGML way
3754      */
3755     if ((CUR == '/') && (NXT(1) == '>')) {
3756         SKIP(2);
3757         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3758             ctxt->sax->endElement(ctxt->userData, name);
3759         htmlnamePop(ctxt);
3760         return;
3761     }
3762 
3763     if (CUR == '>') {
3764         NEXT;
3765     } else {
3766         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3767                      "Couldn't find end of Start Tag %s\n", name, NULL);
3768 
3769         /*
3770          * end of parsing of this node.
3771          */
3772         if (xmlStrEqual(name, ctxt->name)) {
3773             nodePop(ctxt);
3774             htmlnamePop(ctxt);
3775         }
3776 
3777         /*
3778          * Capture end position and add node
3779          */
3780         if ( currentNode != NULL && ctxt->record_info ) {
3781            node_info.end_pos = ctxt->input->consumed +
3782                               (CUR_PTR - ctxt->input->base);
3783            node_info.end_line = ctxt->input->line;
3784            node_info.node = ctxt->node;
3785            xmlParserAddNodeInfo(ctxt, &node_info);
3786         }
3787         return;
3788     }
3789 
3790     /*
3791      * Check for an Empty Element from DTD definition
3792      */
3793     if ((info != NULL) && (info->empty)) {
3794         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3795             ctxt->sax->endElement(ctxt->userData, name);
3796         htmlnamePop(ctxt);
3797         return;
3798     }
3799 
3800     /*
3801      * Parse the content of the element:
3802      */
3803     currentNode = xmlStrdup(ctxt->name);
3804     depth = ctxt->nameNr;
3805     while (IS_CHAR_CH(CUR)) {
3806         oldptr = ctxt->input->cur;
3807         htmlParseContent(ctxt);
3808         if (oldptr==ctxt->input->cur) break;
3809         if (ctxt->nameNr < depth) break;
3810     }
3811 
3812     /*
3813      * Capture end position and add node
3814      */
3815     if ( currentNode != NULL && ctxt->record_info ) {
3816        node_info.end_pos = ctxt->input->consumed +
3817                           (CUR_PTR - ctxt->input->base);
3818        node_info.end_line = ctxt->input->line;
3819        node_info.node = ctxt->node;
3820        xmlParserAddNodeInfo(ctxt, &node_info);
3821     }
3822     if (!IS_CHAR_CH(CUR)) {
3823         htmlAutoCloseOnEnd(ctxt);
3824     }
3825 
3826     if (currentNode != NULL)
3827         xmlFree(currentNode);
3828 }
3829 
3830 /**
3831  * htmlParseDocument:
3832  * @param ctxt an HTML parser context
3833  *
3834  * parse an HTML document (and build a tree if using the standard SAX
3835  * interface).
3836  *
3837  * Returns 0, -1 in case of error. the parser context is augmented
3838  *                as a result of the parsing.
3839  */
3840 
3841 int
htmlParseDocument(htmlParserCtxtPtr ctxt)3842 htmlParseDocument(htmlParserCtxtPtr ctxt) {
3843     xmlDtdPtr dtd;
3844 
3845     xmlInitParser();
3846 
3847     htmlDefaultSAXHandlerInit();
3848     ctxt->html = 1;
3849 
3850     GROW;
3851     /*
3852      * SAX: beginning of the document processing.
3853      */
3854     if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
3855         ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
3856 
3857     /*
3858      * Wipe out everything which is before the first '<'
3859      */
3860     SKIP_BLANKS;
3861     if (CUR == 0) {
3862         htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
3863                      "Document is empty\n", NULL, NULL);
3864     }
3865 
3866     if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
3867         ctxt->sax->startDocument(ctxt->userData);
3868 
3869 
3870     /*
3871      * Parse possible comments before any content
3872      */
3873     while ((CUR == '<') && (NXT(1) == '!') &&
3874            (NXT(2) == '-') && (NXT(3) == '-')) {
3875         htmlParseComment(ctxt);
3876         SKIP_BLANKS;
3877     }
3878 
3879 
3880     /*
3881      * Then possibly doc type declaration(s) and more Misc
3882      * (doctypedecl Misc*)?
3883      */
3884     if ((CUR == '<') && (NXT(1) == '!') &&
3885         (UPP(2) == 'D') && (UPP(3) == 'O') &&
3886         (UPP(4) == 'C') && (UPP(5) == 'T') &&
3887         (UPP(6) == 'Y') && (UPP(7) == 'P') &&
3888         (UPP(8) == 'E')) {
3889         htmlParseDocTypeDecl(ctxt);
3890     }
3891     SKIP_BLANKS;
3892 
3893     /*
3894      * Parse possible comments before any content
3895      */
3896     while ((CUR == '<') && (NXT(1) == '!') &&
3897            (NXT(2) == '-') && (NXT(3) == '-')) {
3898         htmlParseComment(ctxt);
3899         SKIP_BLANKS;
3900     }
3901 
3902     /*
3903      * Time to start parsing the tree itself
3904      */
3905     htmlParseContent(ctxt);
3906 
3907     /*
3908      * autoclose
3909      */
3910     if (CUR == 0)
3911         htmlAutoCloseOnEnd(ctxt);
3912 
3913 
3914     /*
3915      * SAX: end of the document processing.
3916      */
3917     if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
3918         ctxt->sax->endDocument(ctxt->userData);
3919 
3920     if (ctxt->myDoc != NULL) {
3921         dtd = xmlGetIntSubset(ctxt->myDoc);
3922         if (dtd == NULL)
3923             ctxt->myDoc->intSubset =
3924                 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
3925                     BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
3926                     BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
3927     }
3928     if (! ctxt->wellFormed) return(-1);
3929     return(0);
3930 }
3931 
3932 
3933 /************************************************************************
3934  *                                                                      *
3935  *                      Parser contexts handling                        *
3936  *                                                                      *
3937  ************************************************************************/
3938 
3939 /**
3940  * htmlInitParserCtxt:
3941  * @param ctxt an HTML parser context
3942  *
3943  * Initialize a parser context
3944  *
3945  * Returns 0 in case of success and -1 in case of error
3946  */
3947 
3948 static int
htmlInitParserCtxt(htmlParserCtxtPtr ctxt)3949 htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
3950 {
3951     htmlSAXHandler *sax;
3952 
3953     if (ctxt == NULL) return(-1);
3954     memset(ctxt, 0, sizeof(htmlParserCtxt));
3955     // NOTE: All assignments  ctxt->XX = 0; were commented as unnecessary
3956     ctxt->dict = xmlDictCreate();
3957     if (ctxt->dict == NULL) {
3958         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
3959         return(-1);
3960     }
3961     sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
3962     if (sax == NULL) {
3963         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
3964         return(-1);
3965     }
3966     else
3967         memset(sax, 0, sizeof(htmlSAXHandler));
3968 
3969     /* Allocate the Input stack */
3970     ctxt->inputTab = (htmlParserInputPtr *)
3971                       xmlMalloc(5 * sizeof(htmlParserInputPtr));
3972     if (ctxt->inputTab == NULL) {
3973         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
3974     //ctxt->inputNr = 0;
3975     //ctxt->inputMax = 0;
3976     //ctxt->input = NULL;
3977         return(-1);
3978     }
3979     //ctxt->inputNr = 0;
3980     ctxt->inputMax = 5;
3981     //ctxt->input = NULL;
3982     //ctxt->version = NULL;
3983     //ctxt->encoding = NULL;
3984     ctxt->standalone = -1;
3985     ctxt->instate = XML_PARSER_START;
3986 
3987     /* Allocate the Node stack */
3988     ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
3989     if (ctxt->nodeTab == NULL) {
3990         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
3991     //ctxt->nodeNr = 0;
3992     //ctxt->nodeMax = 0;
3993     //ctxt->node = NULL;
3994     //ctxt->inputNr = 0;
3995     //ctxt->inputMax = 0;
3996     //ctxt->input = NULL;
3997         return(-1);
3998     }
3999     //ctxt->nodeNr = 0;
4000     ctxt->nodeMax = 10;
4001     //ctxt->node = NULL;
4002 
4003     /* Allocate the Name stack */
4004     ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
4005     if (ctxt->nameTab == NULL) {
4006         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4007     //ctxt->nameNr = 0;
4008     ctxt->nameMax = 10;
4009     //ctxt->name = NULL;
4010     //ctxt->nodeNr = 0;
4011     //ctxt->nodeMax = 0;
4012     //ctxt->node = NULL;
4013     //ctxt->inputNr = 0;
4014     //ctxt->inputMax = 0;
4015     ctxt->input = NULL;
4016     return(-1);
4017     }
4018     //ctxt->nameNr = 0;
4019     ctxt->nameMax = 10;
4020     //ctxt->name = NULL;
4021 
4022     if (sax == NULL)
4023         ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
4024     else {
4025         ctxt->sax = sax;
4026         memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
4027     }
4028     ctxt->userData = ctxt;
4029     //ctxt->myDoc = NULL;
4030     ctxt->wellFormed = 1;
4031     //ctxt->replaceEntities = 0;
4032     #ifdef LIBXML_ENABLE_NODE_LINEINFO
4033     ctxt->linenumbers = xmlLineNumbersDefaultValue;
4034     #endif
4035     ctxt->html = 1;
4036     ctxt->vctxt.userData = ctxt;
4037     ctxt->vctxt.error = xmlParserValidityError;
4038     ctxt->vctxt.warning = xmlParserValidityWarning;
4039     //ctxt->record_info = 0;
4040     //ctxt->validate = 0;
4041     //ctxt->nbChars = 0;
4042     //ctxt->checkIndex = 0;
4043     //ctxt->catalogs = NULL;
4044     xmlInitNodeInfoSeq(&ctxt->node_seq);
4045     return(0);
4046 }
4047 
/**
 * htmlFreeParserCtxt:
 * @param ctxt an HTML parser context
 *
 * Free all the memory used by a parser context. However the parsed
 * document in ctxt->myDoc is not freed.
 *
 * This is a thin wrapper: the context is handed unchanged to
 * xmlFreeParserCtxt(), which performs the actual release.
 * NOTE(review): the statement about ctxt->myDoc comes from the original
 * documentation and depends on xmlFreeParserCtxt(), defined elsewhere.
 */

void
htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
{
    xmlFreeParserCtxt(ctxt);
}
4061 
4062 /**
4063  * htmlNewParserCtxt:
4064  *
4065  * Allocate and initialize a new parser context.
4066  *
4067  * Returns the xmlParserCtxtPtr or NULL
4068  */
4069 
4070 static htmlParserCtxtPtr
htmlNewParserCtxt(void)4071 htmlNewParserCtxt(void)
4072 {
4073     xmlParserCtxtPtr ctxt;
4074 
4075     ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4076     if (ctxt == NULL) {
4077         htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
4078         return(NULL);
4079     }
4080     memset(ctxt, 0, sizeof(xmlParserCtxt));
4081 #ifdef XE_ENABLE_GS_CACHING
4082     ctxt->cachedGs = xmlGetGlobalState();
4083 #endif
4084 
4085     if (htmlInitParserCtxt(ctxt) < 0) {
4086         htmlFreeParserCtxt(ctxt);
4087         return(NULL);
4088     }
4089     return(ctxt);
4090 }
4091 
4092 /**
4093  * htmlCreateMemoryParserCtxt:
4094  * @param buffer a pointer to a char array
4095  * @param size the size of the array
4096  *
4097  * Create a parser context for an HTML in-memory document.
4098  *
4099  * Returns the new parser context or NULL
4100  */
4101 htmlParserCtxtPtr
htmlCreateMemoryParserCtxt(const char * buffer,int size)4102 htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4103 
4104     xmlParserCtxtPtr ctxt;
4105     xmlParserInputPtr input;
4106     xmlParserInputBufferPtr buf;
4107 
4108     if (buffer == NULL)
4109         return(NULL);
4110     if (size <= 0)
4111         return(NULL);
4112 
4113     ctxt = htmlNewParserCtxt();
4114     if (ctxt == NULL)
4115         return(NULL);
4116 
4117     buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4118     if (buf == NULL) return(NULL);
4119 
4120     input = xmlNewInputStream(ctxt);
4121     if (input == NULL) {
4122         xmlFreeParserCtxt(ctxt);
4123         return(NULL);
4124     }
4125 
4126     input->filename = NULL;
4127     input->buf = buf;
4128     input->base = input->buf->buffer->content;
4129     input->cur = input->buf->buffer->content;
4130     input->end = &input->buf->buffer->content[input->buf->buffer->use];
4131 
4132     inputPush(ctxt, input);
4133     return(ctxt);
4134 }
4135 
4136 /**
4137  * htmlCreateDocParserCtxt:
4138  * @param cur a pointer to an array of xmlChar
4139  * @param encoding a free form C string describing the HTML document encoding, or NULL
4140  *
4141  * Create a parser context for an HTML document.
4142  *
4143 
4144  *
4145  * Returns the new parser context or NULL
4146  */
4147 static htmlParserCtxtPtr
htmlCreateDocParserCtxt(xmlChar * cur,const char * encoding ATTRIBUTE_UNUSED)4148 htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) {
4149     int len;
4150     htmlParserCtxtPtr ctxt;
4151 
4152     if (cur == NULL)
4153         return(NULL);
4154     len = xmlStrlen(cur);
4155     ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
4156 
4157     if (encoding != NULL) {
4158         xmlCharEncoding enc;
4159         xmlCharEncodingHandlerPtr handler;
4160 
4161         if (ctxt->input->encoding != NULL)
4162             xmlFree((xmlChar *) ctxt->input->encoding);
4163         ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
4164 
4165         enc = xmlParseCharEncoding(encoding);
4166         /*
4167          * registered set of known encodings
4168          */
4169         if (enc != XML_CHAR_ENCODING_ERROR) {
4170             xmlSwitchEncoding(ctxt, enc);
4171             if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
4172                 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4173                              "Unsupported encoding %s\n",
4174                              (const xmlChar *) encoding, NULL);
4175             }
4176         } else {
4177             /*
4178              * fallback for unknown encodings
4179              */
4180             handler = xmlFindCharEncodingHandler((const char *) encoding);
4181             if (handler != NULL) {
4182                 xmlSwitchToEncoding(ctxt, handler);
4183             } else {
4184                 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4185                              "Unsupported encoding %s\n",
4186                              (const xmlChar *) encoding, NULL);
4187             }
4188         }
4189     }
4190     return(ctxt);
4191 }
4192 
4193 #ifdef LIBXML_PUSH_ENABLED
4194 /************************************************************************
4195  *                                                                      *
4196  *              Progressive parsing interfaces                          *
4197  *                                                                      *
4198  ************************************************************************/
4199 
4200 /**
4201  * htmlParseLookupSequence:
4202  * @param ctxt an HTML parser context
4203  * @param first the first char to lookup
4204  * @param next the next char to lookup or zero
4205  * @param third the next char to lookup or zero
4206  * @param comment flag to force checking inside comments
4207  *
4208  * Try to find if a sequence (first, next, third) or  just (first next) or
4209  * (first) is available in the input stream.
4210  * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4211  * to avoid rescanning sequences of bytes, it DOES change the state of the
4212  * parser, do not use liberally.
4213  * This is basically similar to xmlParseLookupSequence()
4214  *
4215  * Returns the index to the current parsing point if the full sequence
4216  *      is available, -1 otherwise.
4217  */
4218 static int
htmlParseLookupSequence(htmlParserCtxtPtr ctxt,xmlChar first,xmlChar next,xmlChar third,int iscomment)4219 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
4220                         xmlChar next, xmlChar third, int iscomment) {
4221     int base, len;
4222     htmlParserInputPtr in;
4223     const xmlChar *buf;
4224     int incomment = 0;
4225 
4226     in = ctxt->input;
4227     if (in == NULL) return(-1);
4228     base = in->cur - in->base;
4229     if (base < 0) return(-1);
4230     if (ctxt->checkIndex > base)
4231         base = ctxt->checkIndex;
4232     if (in->buf == NULL) {
4233         buf = in->base;
4234         len = in->length;
4235     } else {
4236         buf = in->buf->buffer->content;
4237         len = in->buf->buffer->use;
4238     }
4239     /* take into account the sequence length */
4240     if (third) len -= 2;
4241     else if (next) len --;
4242     for (;base < len;base++) {
4243         if (!incomment && (base + 4 < len) && !iscomment) {
4244             if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4245                 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4246                 incomment = 1;
4247                 /* do not increment past <! - some people use <!--> */
4248                 base += 2;
4249             }
4250         }
4251         if (incomment) {
4252             if (base + 3 > len)
4253                 return(-1);
4254             if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4255                 (buf[base + 2] == '>')) {
4256                 incomment = 0;
4257                 base += 2;
4258             }
4259             continue;
4260         }
4261         if (buf[base] == first) {
4262             if (third != 0) {
4263                 if ((buf[base + 1] != next) ||
4264                     (buf[base + 2] != third)) continue;
4265             } else if (next != 0) {
4266                 if (buf[base + 1] != next) continue;
4267             }
4268             ctxt->checkIndex = 0;
4269 #ifdef DEBUG_PUSH
4270             if (next == 0)
4271                 xmlGenericError(xmlGenericErrorContext,
4272                         "HPP: lookup '%c' found at %d\n",
4273                         first, base);
4274             else if (third == 0)
4275                 xmlGenericError(xmlGenericErrorContext,
4276                         "HPP: lookup '%c%c' found at %d\n",
4277                         first, next, base);
4278             else
4279                 xmlGenericError(xmlGenericErrorContext,
4280                         "HPP: lookup '%c%c%c' found at %d\n",
4281                         first, next, third, base);
4282 #endif
4283             return(base - (in->cur - in->base));
4284         }
4285     }
4286     ctxt->checkIndex = base;
4287 #ifdef DEBUG_PUSH
4288     if (next == 0)
4289         xmlGenericError(xmlGenericErrorContext,
4290                 "HPP: lookup '%c' failed\n", first);
4291     else if (third == 0)
4292         xmlGenericError(xmlGenericErrorContext,
4293                 "HPP: lookup '%c%c' failed\n", first, next);
4294     else
4295         xmlGenericError(xmlGenericErrorContext,
4296                 "HPP: lookup '%c%c%c' failed\n", first, next, third);
4297 #endif
4298     return(-1);
4299 }
4300 
4301 /**
4302  * htmlParseTryOrFinish:
4303  * @param ctxt an HTML parser context
4304  * @param terminate last chunk indicator
4305  *
4306  * Try to progress on parsing
4307  *
4308  * Returns zero if no parsing was possible
4309  */
4310 static int
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt,int terminate)4311 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4312     int ret = 0;
4313     htmlParserInputPtr in;
4314     int avail = 0;
4315     xmlChar cur, next;
4316 
4317 #ifdef DEBUG_PUSH
4318     switch (ctxt->instate) {
4319         case XML_PARSER_EOF:
4320             xmlGenericError(xmlGenericErrorContext,
4321                     "HPP: try EOF\n"); break;
4322         case XML_PARSER_START:
4323             xmlGenericError(xmlGenericErrorContext,
4324                     "HPP: try START\n"); break;
4325         case XML_PARSER_MISC:
4326             xmlGenericError(xmlGenericErrorContext,
4327                     "HPP: try MISC\n");break;
4328         case XML_PARSER_COMMENT:
4329             xmlGenericError(xmlGenericErrorContext,
4330                     "HPP: try COMMENT\n");break;
4331         case XML_PARSER_PROLOG:
4332             xmlGenericError(xmlGenericErrorContext,
4333                     "HPP: try PROLOG\n");break;
4334         case XML_PARSER_START_TAG:
4335             xmlGenericError(xmlGenericErrorContext,
4336                     "HPP: try START_TAG\n");break;
4337         case XML_PARSER_CONTENT:
4338             xmlGenericError(xmlGenericErrorContext,
4339                     "HPP: try CONTENT\n");break;
4340         case XML_PARSER_CDATA_SECTION:
4341             xmlGenericError(xmlGenericErrorContext,
4342                     "HPP: try CDATA_SECTION\n");break;
4343         case XML_PARSER_END_TAG:
4344             xmlGenericError(xmlGenericErrorContext,
4345                     "HPP: try END_TAG\n");break;
4346         case XML_PARSER_ENTITY_DECL:
4347             xmlGenericError(xmlGenericErrorContext,
4348                     "HPP: try ENTITY_DECL\n");break;
4349         case XML_PARSER_ENTITY_VALUE:
4350             xmlGenericError(xmlGenericErrorContext,
4351                     "HPP: try ENTITY_VALUE\n");break;
4352         case XML_PARSER_ATTRIBUTE_VALUE:
4353             xmlGenericError(xmlGenericErrorContext,
4354                     "HPP: try ATTRIBUTE_VALUE\n");break;
4355         case XML_PARSER_DTD:
4356             xmlGenericError(xmlGenericErrorContext,
4357                     "HPP: try DTD\n");break;
4358         case XML_PARSER_EPILOG:
4359             xmlGenericError(xmlGenericErrorContext,
4360                     "HPP: try EPILOG\n");break;
4361         case XML_PARSER_PI:
4362             xmlGenericError(xmlGenericErrorContext,
4363                     "HPP: try PI\n");break;
4364         case XML_PARSER_SYSTEM_LITERAL:
4365             xmlGenericError(xmlGenericErrorContext,
4366                     "HPP: try SYSTEM_LITERAL\n");break;
4367     }
4368 #endif
4369 
4370     while (1) {
4371 
4372         in = ctxt->input;
4373         if (in == NULL) break;
4374         if (in->buf == NULL)
4375             avail = in->length - (in->cur - in->base);
4376         else
4377             avail = in->buf->buffer->use - (in->cur - in->base);
4378         if ((avail == 0) && (terminate)) {
4379             htmlAutoCloseOnEnd(ctxt);
4380             if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4381                 /*
4382                  * SAX: end of the document processing.
4383                  */
4384                 ctxt->instate = XML_PARSER_EOF;
4385                 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4386                     ctxt->sax->endDocument(ctxt->userData);
4387             }
4388         }
4389         if (avail < 1)
4390             goto done;
4391         cur = in->cur[0];
4392         if (cur == 0) {
4393             SKIP(1);
4394             continue;
4395         }
4396 
4397         switch (ctxt->instate) {
4398             case XML_PARSER_EOF:
4399                 /*
4400                  * Document parsing is done !
4401                  */
4402                 goto done;
4403             case XML_PARSER_START:
4404                 /*
4405                  * Very first chars read from the document flow.
4406                  */
4407                 cur = in->cur[0];
4408                 if (IS_BLANK_CH(cur)) {
4409                     SKIP_BLANKS;
4410                     if (in->buf == NULL)
4411                         avail = in->length - (in->cur - in->base);
4412                     else
4413                         avail = in->buf->buffer->use - (in->cur - in->base);
4414                 }
4415                 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4416                     ctxt->sax->setDocumentLocator(ctxt->userData,
4417                                                   &xmlDefaultSAXLocator);
4418                 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4419                     (!ctxt->disableSAX))
4420                     ctxt->sax->startDocument(ctxt->userData);
4421 
4422                 cur = in->cur[0];
4423                 next = in->cur[1];
4424                 if ((cur == '<') && (next == '!') &&
4425                     (UPP(2) == 'D') && (UPP(3) == 'O') &&
4426                     (UPP(4) == 'C') && (UPP(5) == 'T') &&
4427                     (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4428                     (UPP(8) == 'E')) {
4429                     if ((!terminate) &&
4430                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4431                         goto done;
4432 #ifdef DEBUG_PUSH
4433                     xmlGenericError(xmlGenericErrorContext,
4434                             "HPP: Parsing internal subset\n");
4435 #endif
4436                     htmlParseDocTypeDecl(ctxt);
4437                     ctxt->instate = XML_PARSER_PROLOG;
4438 #ifdef DEBUG_PUSH
4439                     xmlGenericError(xmlGenericErrorContext,
4440                             "HPP: entering PROLOG\n");
4441 #endif
4442                 } else {
4443                     ctxt->instate = XML_PARSER_MISC;
4444                 }
4445 #ifdef DEBUG_PUSH
4446                 xmlGenericError(xmlGenericErrorContext,
4447                         "HPP: entering MISC\n");
4448 #endif
4449                 break;
4450             case XML_PARSER_MISC:
4451                 SKIP_BLANKS;
4452                 if (in->buf == NULL)
4453                     avail = in->length - (in->cur - in->base);
4454                 else
4455                     avail = in->buf->buffer->use - (in->cur - in->base);
4456                 if (avail < 2)
4457                     goto done;
4458                 cur = in->cur[0];
4459                 next = in->cur[1];
4460                 if ((cur == '<') && (next == '!') &&
4461                     (in->cur[2] == '-') && (in->cur[3] == '-')) {
4462                     if ((!terminate) &&
4463                         (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
4464                         goto done;
4465 #ifdef DEBUG_PUSH
4466                     xmlGenericError(xmlGenericErrorContext,
4467                             "HPP: Parsing Comment\n");
4468 #endif
4469                     htmlParseComment(ctxt);
4470                     ctxt->instate = XML_PARSER_MISC;
4471                 } else if ((cur == '<') && (next == '!') &&
4472                     (UPP(2) == 'D') && (UPP(3) == 'O') &&
4473                     (UPP(4) == 'C') && (UPP(5) == 'T') &&
4474                     (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4475                     (UPP(8) == 'E')) {
4476                     if ((!terminate) &&
4477                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4478                         goto done;
4479 #ifdef DEBUG_PUSH
4480                     xmlGenericError(xmlGenericErrorContext,
4481                             "HPP: Parsing internal subset\n");
4482 #endif
4483                     htmlParseDocTypeDecl(ctxt);
4484                     ctxt->instate = XML_PARSER_PROLOG;
4485 #ifdef DEBUG_PUSH
4486                     xmlGenericError(xmlGenericErrorContext,
4487                             "HPP: entering PROLOG\n");
4488 #endif
4489                 } else if ((cur == '<') && (next == '!') &&
4490                            (avail < 9)) {
4491                     goto done;
4492                 } else {
4493                     ctxt->instate = XML_PARSER_START_TAG;
4494 #ifdef DEBUG_PUSH
4495                     xmlGenericError(xmlGenericErrorContext,
4496                             "HPP: entering START_TAG\n");
4497 #endif
4498                 }
4499                 break;
4500             case XML_PARSER_PROLOG:
4501                 SKIP_BLANKS;
4502                 if (in->buf == NULL)
4503                     avail = in->length - (in->cur - in->base);
4504                 else
4505                     avail = in->buf->buffer->use - (in->cur - in->base);
4506                 if (avail < 2)
4507                     goto done;
4508                 cur = in->cur[0];
4509                 next = in->cur[1];
4510                 if ((cur == '<') && (next == '!') &&
4511                     (in->cur[2] == '-') && (in->cur[3] == '-')) {
4512                     if ((!terminate) &&
4513                         (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
4514                         goto done;
4515 #ifdef DEBUG_PUSH
4516                     xmlGenericError(xmlGenericErrorContext,
4517                             "HPP: Parsing Comment\n");
4518 #endif
4519                     htmlParseComment(ctxt);
4520                     ctxt->instate = XML_PARSER_PROLOG;
4521                 } else if ((cur == '<') && (next == '!') &&
4522                            (avail < 4)) {
4523                     goto done;
4524                 } else {
4525                     ctxt->instate = XML_PARSER_START_TAG;
4526 #ifdef DEBUG_PUSH
4527                     xmlGenericError(xmlGenericErrorContext,
4528                             "HPP: entering START_TAG\n");
4529 #endif
4530                 }
4531                 break;
4532             case XML_PARSER_EPILOG:
4533                 if (in->buf == NULL)
4534                     avail = in->length - (in->cur - in->base);
4535                 else
4536                     avail = in->buf->buffer->use - (in->cur - in->base);
4537                 if (avail < 1)
4538                     goto done;
4539                 cur = in->cur[0];
4540                 if (IS_BLANK_CH(cur)) {
4541                     htmlParseCharData(ctxt);
4542                     goto done;
4543                 }
4544                 if (avail < 2)
4545                     goto done;
4546                 next = in->cur[1];
4547                 if ((cur == '<') && (next == '!') &&
4548                     (in->cur[2] == '-') && (in->cur[3] == '-')) {
4549                     if ((!terminate) &&
4550                         (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
4551                         goto done;
4552 #ifdef DEBUG_PUSH
4553                     xmlGenericError(xmlGenericErrorContext,
4554                             "HPP: Parsing Comment\n");
4555 #endif
4556                     htmlParseComment(ctxt);
4557                     ctxt->instate = XML_PARSER_EPILOG;
4558                 } else if ((cur == '<') && (next == '!') &&
4559                            (avail < 4)) {
4560                     goto done;
4561                 } else {
4562                     ctxt->errNo = XML_ERR_DOCUMENT_END;
4563                     ctxt->wellFormed = 0;
4564                     ctxt->instate = XML_PARSER_EOF;
4565 #ifdef DEBUG_PUSH
4566                     xmlGenericError(xmlGenericErrorContext,
4567                             "HPP: entering EOF\n");
4568 #endif
4569                     if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4570                         ctxt->sax->endDocument(ctxt->userData);
4571                     goto done;
4572                 }
4573                 break;
4574             case XML_PARSER_START_TAG: {
4575                 const xmlChar *name, *oldname;
4576                 int depth = ctxt->nameNr;
4577                 const htmlElemDesc * info;
4578 
4579                 if (avail < 2)
4580                     goto done;
4581                 cur = in->cur[0];
4582                 if (cur != '<') {
4583                     ctxt->instate = XML_PARSER_CONTENT;
4584 #ifdef DEBUG_PUSH
4585                     xmlGenericError(xmlGenericErrorContext,
4586                             "HPP: entering CONTENT\n");
4587 #endif
4588                     break;
4589                 }
4590                 if (in->cur[1] == '/') {
4591                     ctxt->instate = XML_PARSER_END_TAG;
4592                     ctxt->checkIndex = 0;
4593 #ifdef DEBUG_PUSH
4594                     xmlGenericError(xmlGenericErrorContext,
4595                             "HPP: entering END_TAG\n");
4596 #endif
4597                     break;
4598                 }
4599                 if ((!terminate) &&
4600                     (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4601                     goto done;
4602 
4603                 oldname = ctxt->name;
4604                 htmlParseStartTag(ctxt);
4605                 name = ctxt->name;
4606                 if (((depth == ctxt->nameNr) &&
4607                      (xmlStrEqual(oldname, ctxt->name))) ||
4608                     (name == NULL)) {
4609                     if (CUR == '>')
4610                         NEXT;
4611                     break;
4612                 }
4613 
4614                 /*
4615                  * Lookup the info for that element.
4616                  */
4617                 info = htmlTagLookup(name);
4618                 if (info == NULL) {
4619                     htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4620                                  "Tag %s invalid\n", name, NULL);
4621                 }
4622 
4623                 /*
4624                  * Check for an Empty Element labeled the XML/SGML way
4625                  */
4626                 if ((CUR == '/') && (NXT(1) == '>')) {
4627                     SKIP(2);
4628                     if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4629                         ctxt->sax->endElement(ctxt->userData, name);
4630                     oldname = htmlnamePop(ctxt);
4631                     ctxt->instate = XML_PARSER_CONTENT;
4632 #ifdef DEBUG_PUSH
4633                     xmlGenericError(xmlGenericErrorContext,
4634                             "HPP: entering CONTENT\n");
4635 #endif
4636                     break;
4637                 }
4638 
4639                 if (CUR == '>') {
4640                     NEXT;
4641                 } else {
4642                     htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4643                                  "Couldn't find end of Start Tag %s\n",
4644                                  name, NULL);
4645 
4646                     /*
4647                      * end of parsing of this node.
4648                      */
4649                     if (xmlStrEqual(name, ctxt->name)) {
4650                         nodePop(ctxt);
4651                         oldname = htmlnamePop(ctxt);
4652                     }
4653 
4654                     ctxt->instate = XML_PARSER_CONTENT;
4655 #ifdef DEBUG_PUSH
4656                     xmlGenericError(xmlGenericErrorContext,
4657                             "HPP: entering CONTENT\n");
4658 #endif
4659                     break;
4660                 }
4661 
4662                 /*
4663                  * Check for an Empty Element from DTD definition
4664                  */
4665                 if ((info != NULL) && (info->empty)) {
4666                     if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4667                         ctxt->sax->endElement(ctxt->userData, name);
4668                     oldname = htmlnamePop(ctxt);
4669                 }
4670                 ctxt->instate = XML_PARSER_CONTENT;
4671 #ifdef DEBUG_PUSH
4672                 xmlGenericError(xmlGenericErrorContext,
4673                         "HPP: entering CONTENT\n");
4674 #endif
4675                 break;
4676             }
4677             case XML_PARSER_CONTENT: {
4678                 long cons;
4679                 /*
4680                  * Handle preparsed entities and charRef
4681                  */
4682                 if (ctxt->token != 0) {
4683                     xmlChar chr[2] = { 0 , 0 } ;
4684 
4685                     chr[0] = (xmlChar) ctxt->token;
4686                     htmlCheckParagraph(ctxt);
4687                     if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4688                         ctxt->sax->characters(ctxt->userData, chr, 1);
4689                     ctxt->token = 0;
4690                     ctxt->checkIndex = 0;
4691                 }
4692                 if ((avail == 1) && (terminate)) {
4693                     cur = in->cur[0];
4694                     if ((cur != '<') && (cur != '&')) {
4695                         if (ctxt->sax != NULL) {
4696                             if (IS_BLANK_CH(cur)) {
4697                                 if (ctxt->sax->ignorableWhitespace != NULL)
4698                                     ctxt->sax->ignorableWhitespace(
4699                                             ctxt->userData, &cur, 1);
4700                             } else {
4701                                 htmlCheckParagraph(ctxt);
4702                                 if (ctxt->sax->characters != NULL)
4703                                     ctxt->sax->characters(
4704                                             ctxt->userData, &cur, 1);
4705                             }
4706                         }
4707                         ctxt->token = 0;
4708                         ctxt->checkIndex = 0;
4709                         in->cur++;
4710                         break;
4711                     }
4712                 }
4713                 if (avail < 2)
4714                     goto done;
4715                 cur = in->cur[0];
4716                 next = in->cur[1];
4717                 cons = ctxt->nbChars;
4718                 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
4719                     (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
4720                     /*
4721                      * Handle SCRIPT/STYLE separately
4722                      */
4723                     if ((!terminate) &&
4724                         (htmlParseLookupSequence(ctxt, '<', '/', 0, 0) < 0))
4725                         goto done;
4726                     htmlParseScript(ctxt);
4727                     if ((cur == '<') && (next == '/')) {
4728                         ctxt->instate = XML_PARSER_END_TAG;
4729                         ctxt->checkIndex = 0;
4730 #ifdef DEBUG_PUSH
4731                         xmlGenericError(xmlGenericErrorContext,
4732                                 "HPP: entering END_TAG\n");
4733 #endif
4734                         break;
4735                     }
4736                 } else {
4737                     /*
4738                      * Sometimes DOCTYPE arrives in the middle of the document
4739                      */
4740                     if ((cur == '<') && (next == '!') &&
4741                         (UPP(2) == 'D') && (UPP(3) == 'O') &&
4742                         (UPP(4) == 'C') && (UPP(5) == 'T') &&
4743                         (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4744                         (UPP(8) == 'E')) {
4745                         if ((!terminate) &&
4746                             (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4747                             goto done;
4748                         htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4749                                      "Misplaced DOCTYPE declaration\n",
4750                                      BAD_CAST "DOCTYPE" , NULL);
4751                         htmlParseDocTypeDecl(ctxt);
4752                     } else if ((cur == '<') && (next == '!') &&
4753                         (in->cur[2] == '-') && (in->cur[3] == '-')) {
4754                         if ((!terminate) &&
4755                             (htmlParseLookupSequence(
4756                                         ctxt, '-', '-', '>', 1) < 0))
4757                             goto done;
4758 #ifdef DEBUG_PUSH
4759                         xmlGenericError(xmlGenericErrorContext,
4760                                 "HPP: Parsing Comment\n");
4761 #endif
4762                         htmlParseComment(ctxt);
4763                         ctxt->instate = XML_PARSER_CONTENT;
4764                     } else if ((cur == '<') && (next == '!') && (avail < 4)) {
4765                         goto done;
4766                     } else if ((cur == '<') && (next == '/')) {
4767                         ctxt->instate = XML_PARSER_END_TAG;
4768                         ctxt->checkIndex = 0;
4769 #ifdef DEBUG_PUSH
4770                         xmlGenericError(xmlGenericErrorContext,
4771                                 "HPP: entering END_TAG\n");
4772 #endif
4773                         break;
4774                     } else if (cur == '<') {
4775                         ctxt->instate = XML_PARSER_START_TAG;
4776                         ctxt->checkIndex = 0;
4777 #ifdef DEBUG_PUSH
4778                         xmlGenericError(xmlGenericErrorContext,
4779                                 "HPP: entering START_TAG\n");
4780 #endif
4781                         break;
4782                     } else if (cur == '&') {
4783                         if ((!terminate) &&
4784                             (htmlParseLookupSequence(ctxt, ';', 0, 0, 0) < 0))
4785                             goto done;
4786 #ifdef DEBUG_PUSH
4787                         xmlGenericError(xmlGenericErrorContext,
4788                                 "HPP: Parsing Reference\n");
4789 #endif
4790 
4791                         htmlParseReference(ctxt);
4792                     } else {
4793                         /*
4794                          * check that the text sequence is complete
4795                          * before handing out the data to the parser
4796                          * to avoid problems with erroneous end of
4797                          * data detection.
4798                          */
4799                         if ((!terminate) &&
4800                             (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
4801                             goto done;
4802                         ctxt->checkIndex = 0;
4803 #ifdef DEBUG_PUSH
4804                         xmlGenericError(xmlGenericErrorContext,
4805                                 "HPP: Parsing char data\n");
4806 #endif
4807                         htmlParseCharData(ctxt);
4808                     }
4809                 }
4810                 if (cons == ctxt->nbChars) {
4811                     if (ctxt->node != NULL) {
4812                         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4813                                      "detected an error in element content\n",
4814                                      NULL, NULL);
4815                     }
4816                     NEXT;
4817                     break;
4818                 }
4819 
4820                 break;
4821             }
4822             case XML_PARSER_END_TAG:
4823                 if (avail < 2)
4824                     goto done;
4825                 if ((!terminate) &&
4826                     (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4827                     goto done;
4828                 htmlParseEndTag(ctxt);
4829                 if (ctxt->nameNr == 0) {
4830                     ctxt->instate = XML_PARSER_EPILOG;
4831                 } else {
4832                     ctxt->instate = XML_PARSER_CONTENT;
4833                 }
4834                 ctxt->checkIndex = 0;
4835 #ifdef DEBUG_PUSH
4836                 xmlGenericError(xmlGenericErrorContext,
4837                         "HPP: entering CONTENT\n");
4838 #endif
4839                 break;
4840             case XML_PARSER_CDATA_SECTION:
4841                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4842                         "HPP: internal error, state == CDATA\n",
4843                              NULL, NULL);
4844                 ctxt->instate = XML_PARSER_CONTENT;
4845                 ctxt->checkIndex = 0;
4846 #ifdef DEBUG_PUSH
4847                 xmlGenericError(xmlGenericErrorContext,
4848                         "HPP: entering CONTENT\n");
4849 #endif
4850                 break;
4851             case XML_PARSER_DTD:
4852                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4853                         "HPP: internal error, state == DTD\n",
4854                              NULL, NULL);
4855                 ctxt->instate = XML_PARSER_CONTENT;
4856                 ctxt->checkIndex = 0;
4857 #ifdef DEBUG_PUSH
4858                 xmlGenericError(xmlGenericErrorContext,
4859                         "HPP: entering CONTENT\n");
4860 #endif
4861                 break;
4862             case XML_PARSER_COMMENT:
4863                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4864                         "HPP: internal error, state == COMMENT\n",
4865                              NULL, NULL);
4866                 ctxt->instate = XML_PARSER_CONTENT;
4867                 ctxt->checkIndex = 0;
4868 #ifdef DEBUG_PUSH
4869                 xmlGenericError(xmlGenericErrorContext,
4870                         "HPP: entering CONTENT\n");
4871 #endif
4872                 break;
4873             case XML_PARSER_PI:
4874                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4875                         "HPP: internal error, state == PI\n",
4876                              NULL, NULL);
4877                 ctxt->instate = XML_PARSER_CONTENT;
4878                 ctxt->checkIndex = 0;
4879 #ifdef DEBUG_PUSH
4880                 xmlGenericError(xmlGenericErrorContext,
4881                         "HPP: entering CONTENT\n");
4882 #endif
4883                 break;
4884             case XML_PARSER_ENTITY_DECL:
4885                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4886                         "HPP: internal error, state == ENTITY_DECL\n",
4887                              NULL, NULL);
4888                 ctxt->instate = XML_PARSER_CONTENT;
4889                 ctxt->checkIndex = 0;
4890 #ifdef DEBUG_PUSH
4891                 xmlGenericError(xmlGenericErrorContext,
4892                         "HPP: entering CONTENT\n");
4893 #endif
4894                 break;
4895             case XML_PARSER_ENTITY_VALUE:
4896                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4897                         "HPP: internal error, state == ENTITY_VALUE\n",
4898                              NULL, NULL);
4899                 ctxt->instate = XML_PARSER_CONTENT;
4900                 ctxt->checkIndex = 0;
4901 #ifdef DEBUG_PUSH
4902                 xmlGenericError(xmlGenericErrorContext,
4903                         "HPP: entering DTD\n");
4904 #endif
4905                 break;
4906             case XML_PARSER_ATTRIBUTE_VALUE:
4907                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4908                         "HPP: internal error, state == ATTRIBUTE_VALUE\n",
4909                              NULL, NULL);
4910                 ctxt->instate = XML_PARSER_START_TAG;
4911                 ctxt->checkIndex = 0;
4912 #ifdef DEBUG_PUSH
4913                 xmlGenericError(xmlGenericErrorContext,
4914                         "HPP: entering START_TAG\n");
4915 #endif
4916                 break;
4917             case XML_PARSER_SYSTEM_LITERAL:
4918                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4919                     "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
4920                              NULL, NULL);
4921                 ctxt->instate = XML_PARSER_CONTENT;
4922                 ctxt->checkIndex = 0;
4923 #ifdef DEBUG_PUSH
4924                 xmlGenericError(xmlGenericErrorContext,
4925                         "HPP: entering CONTENT\n");
4926 #endif
4927                 break;
4928             case XML_PARSER_IGNORE:
4929                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4930                         "HPP: internal error, state == XML_PARSER_IGNORE\n",
4931                              NULL, NULL);
4932                 ctxt->instate = XML_PARSER_CONTENT;
4933                 ctxt->checkIndex = 0;
4934 #ifdef DEBUG_PUSH
4935                 xmlGenericError(xmlGenericErrorContext,
4936                         "HPP: entering CONTENT\n");
4937 #endif
4938                 break;
4939             case XML_PARSER_PUBLIC_LITERAL:
4940                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4941                         "HPP: internal error, state == XML_PARSER_LITERAL\n",
4942                              NULL, NULL);
4943                 ctxt->instate = XML_PARSER_CONTENT;
4944                 ctxt->checkIndex = 0;
4945 #ifdef DEBUG_PUSH
4946                 xmlGenericError(xmlGenericErrorContext,
4947                         "HPP: entering CONTENT\n");
4948 #endif
4949                 break;
4950 
4951         }
4952     }
4953 done:
4954     if ((avail == 0) && (terminate)) {
4955         htmlAutoCloseOnEnd(ctxt);
4956         if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4957             /*
4958              * SAX: end of the document processing.
4959              */
4960             ctxt->instate = XML_PARSER_EOF;
4961             if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4962                 ctxt->sax->endDocument(ctxt->userData);
4963         }
4964     }
4965     if ((ctxt->myDoc != NULL) &&
4966         ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
4967          (ctxt->instate == XML_PARSER_EPILOG))) {
4968         xmlDtdPtr dtd;
4969         dtd = xmlGetIntSubset(ctxt->myDoc);
4970         if (dtd == NULL)
4971             ctxt->myDoc->intSubset =
4972                 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4973                     BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4974                     BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4975     }
4976 #ifdef DEBUG_PUSH
4977     xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
4978 #endif
4979     return(ret);
4980 }
4981 
4982 /**
4983  * htmlParseChunk:
4984  * @param ctxt an HTML parser context
4985  * @param chunk an char array
4986  * @param size the size in byte of the chunk
4987  * @param terminate last chunk indicator
4988  *
4989  * Parse a Chunk of memory
4990  *
4991  * Returns zero if no error, the xmlParserErrors otherwise.
4992  */
4993 int
htmlParseChunk(htmlParserCtxtPtr ctxt,const char * chunk,int size,int terminate)4994 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
4995               int terminate) {
4996     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
4997         (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF))  {
4998         int base = ctxt->input->base - ctxt->input->buf->buffer->content;
4999         int cur = ctxt->input->cur - ctxt->input->base;
5000 
5001         xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5002         ctxt->input->base = ctxt->input->buf->buffer->content + base;
5003         ctxt->input->cur = ctxt->input->base + cur;
5004 #ifdef DEBUG_PUSH
5005         xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5006 #endif
5007 
5008 #if 0
5009         if ((terminate) || (ctxt->input->buf->buffer->use > 80))
5010             htmlParseTryOrFinish(ctxt, terminate);
5011 #endif
5012     } else if (ctxt->instate != XML_PARSER_EOF) {
5013         if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
5014             xmlParserInputBufferPtr in = ctxt->input->buf;
5015             if ((in->encoder != NULL) && (in->buffer != NULL) &&
5016                     (in->raw != NULL)) {
5017                 int nbchars;
5018 
5019                 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
5020                 if (nbchars < 0) {
5021                     htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
5022                                  "encoder error\n", NULL, NULL);
5023                     return(XML_ERR_INVALID_ENCODING);
5024                 }
5025             }
5026         }
5027     }
5028     htmlParseTryOrFinish(ctxt, terminate);
5029     if (terminate) {
5030         if ((ctxt->instate != XML_PARSER_EOF) &&
5031             (ctxt->instate != XML_PARSER_EPILOG) &&
5032             (ctxt->instate != XML_PARSER_MISC)) {
5033             ctxt->errNo = XML_ERR_DOCUMENT_END;
5034             ctxt->wellFormed = 0;
5035         }
5036         if (ctxt->instate != XML_PARSER_EOF) {
5037             if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5038                 ctxt->sax->endDocument(ctxt->userData);
5039         }
5040         ctxt->instate = XML_PARSER_EOF;
5041     }
5042     return((xmlParserErrors) ctxt->errNo);
5043 }
5044 #endif /* LIBXML_PUSH_ENABLED */
5045 
5046 /************************************************************************
5047  *                                                                      *
5048  *                      User entry points                               *
5049  *                                                                      *
5050  ************************************************************************/
5051 
5052 /**
5053  * htmlCreatePushParserCtxt:
5054  * @param sax a SAX handler
5055  * @param user_data The user data returned on SAX callbacks
5056  * @param chunk a pointer to an array of chars
5057  * @param size number of chars in the array
5058  * @param filename an optional file name or URI
5059  * @param enc an optional encoding
5060  *
5061  * Create a parser context for using the HTML parser in push mode
5062  * The value of filename is used for fetching external entities
5063  * and error/warning reports.
5064  *
5065  * Returns the new parser context or NULL
5066  */
5067 htmlParserCtxtPtr
htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax,void * user_data,const char * chunk,int size,const char * filename,xmlCharEncoding enc)5068 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5069                          const char *chunk, int size, const char *filename,
5070                          xmlCharEncoding enc) {
5071     htmlParserCtxtPtr ctxt;
5072     htmlParserInputPtr inputStream;
5073     xmlParserInputBufferPtr buf;
5074 
5075     xmlInitParser();
5076 
5077     buf = xmlAllocParserInputBuffer(enc);
5078     if (buf == NULL) return(NULL);
5079 
5080     ctxt = htmlNewParserCtxt();
5081     if (ctxt == NULL) {
5082         xmlFreeParserInputBuffer(buf);
5083         return(NULL);
5084     }
5085     if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
5086         ctxt->charset=XML_CHAR_ENCODING_UTF8;
5087     if (sax != NULL) {
5088         if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
5089             xmlFree(ctxt->sax);
5090         ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
5091         if (ctxt->sax == NULL) {
5092             xmlFree(buf);
5093             xmlFree(ctxt);
5094             return(NULL);
5095         }
5096         memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5097         if (user_data != NULL)
5098             ctxt->userData = user_data;
5099     }
5100     if (filename == NULL) {
5101         ctxt->directory = NULL;
5102     } else {
5103         ctxt->directory = xmlParserGetDirectory(filename);
5104     }
5105 
5106     inputStream = htmlNewInputStream(ctxt);
5107     if (inputStream == NULL) {
5108         xmlFreeParserCtxt(ctxt);
5109         xmlFree(buf);
5110         return(NULL);
5111     }
5112 
5113     if (filename == NULL)
5114         inputStream->filename = NULL;
5115     else
5116         inputStream->filename = (char *)
5117             xmlCanonicPath((const xmlChar *) filename);
5118     inputStream->buf = buf;
5119     inputStream->base = inputStream->buf->buffer->content;
5120     inputStream->cur = inputStream->buf->buffer->content;
5121     inputStream->end =
5122         &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
5123 
5124     inputPush(ctxt, inputStream);
5125 
5126     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5127         (ctxt->input->buf != NULL))  {
5128         int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5129         int cur = ctxt->input->cur - ctxt->input->base;
5130 
5131         xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5132 
5133         ctxt->input->base = ctxt->input->buf->buffer->content + base;
5134         ctxt->input->cur = ctxt->input->base + cur;
5135         ctxt->input->end =
5136             &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
5137 #ifdef DEBUG_PUSH
5138         xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5139 #endif
5140     }
5141 
5142     return(ctxt);
5143 }
5144 
5145 /**
5146  * htmlSAXParseDoc:
5147  * @param cur a pointer to an array of xmlChar
5148  * @param encoding a free form C string describing the HTML document encoding, or NULL
5149  * @param sax the SAX handler block
5150  * @param userData if using SAX, this pointer will be provided on callbacks.
5151  *
5152  * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5153  * to handle parse events. If sax is NULL, fallback to the default DOM
5154  * behavior and return a tree.
5155  *
5156  * Returns the resulting document tree unless SAX is NULL or the document is
5157  *     not well formed.
5158  */
5159 
5160 htmlDocPtr
htmlSAXParseDoc(xmlChar * cur,const char * encoding,htmlSAXHandlerPtr sax,void * userData)5161 htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
5162     htmlDocPtr ret;
5163     htmlParserCtxtPtr ctxt;
5164 
5165     xmlInitParser();
5166 
5167     if (cur == NULL) return(NULL);
5168 
5169 
5170     ctxt = htmlCreateDocParserCtxt(cur, encoding);
5171     if (ctxt == NULL) return(NULL);
5172     if (sax != NULL) {
5173         if (ctxt->sax != NULL) xmlFree (ctxt->sax);
5174         ctxt->sax = sax;
5175         ctxt->userData = userData;
5176     }
5177 
5178     htmlParseDocument(ctxt);
5179     ret = ctxt->myDoc;
5180     if (sax != NULL) {
5181         ctxt->sax = NULL;
5182         ctxt->userData = NULL;
5183     }
5184     htmlFreeParserCtxt(ctxt);
5185 
5186     return(ret);
5187 }
5188 
5189 /**
5190  * htmlParseDoc:
5191  * @param cur a pointer to an array of xmlChar
5192  * @param encoding a free form C string describing the HTML document encoding, or NULL
5193  *
5194  * parse an HTML in-memory document and build a tree.
5195  *
5196  * Returns the resulting document tree
5197  */
5198 
5199 htmlDocPtr
htmlParseDoc(xmlChar * cur,const char * encoding)5200 htmlParseDoc(xmlChar *cur, const char *encoding) {
5201     return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5202 }
5203 
5204 
5205 /**
5206  * htmlCreateFileParserCtxt:
5207  * @param filename the filename
5208  * @param encoding a free form C string describing the HTML document encoding, or NULL
5209  *
5210  * Create a parser context for a file content.
5211  * Automatic support for ZLIB/Compress compressed document is provided
5212  * by default if found at compile-time.
5213  *
5214  * Returns the new parser context or NULL
5215  */
5216 htmlParserCtxtPtr
htmlCreateFileParserCtxt(const char * filename,const char * encoding)5217 htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5218 {
5219     htmlParserCtxtPtr ctxt;
5220     htmlParserInputPtr inputStream;
5221     char *canonicFilename;
5222     /* htmlCharEncoding enc; */
5223     xmlChar *content, *content_line = (xmlChar *) "charset=";
5224 
5225     ctxt = htmlNewParserCtxt();
5226     if (ctxt == NULL) {
5227         return(NULL);
5228     }
5229     canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
5230     if (canonicFilename == NULL) {
5231 #ifdef LIBXML_SAX1_ENABLED
5232         if (xmlDefaultSAXHandler.error != NULL) {
5233             xmlDefaultSAXHandler.error(NULL, "out of memory\n");
5234         }
5235 #endif
5236         xmlFreeParserCtxt(ctxt);
5237         return(NULL);
5238     }
5239 
5240     inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
5241     xmlFree(canonicFilename);
5242     if (inputStream == NULL) {
5243         xmlFreeParserCtxt(ctxt);
5244         return(NULL);
5245     }
5246 
5247     inputPush(ctxt, inputStream);
5248 
5249     /* set encoding */
5250     if (encoding) {
5251         content = (xmlChar*)xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
5252         if (content) {
5253             strcpy ((char *)content, (char *)content_line);
5254             strcat ((char *)content, (char *)encoding);
5255             htmlCheckEncoding (ctxt, content);
5256             xmlFree (content);
5257         }
5258     }
5259 
5260     return(ctxt);
5261 }
5262 
5263 /**
5264  * htmlSAXParseFile:
5265  * @param filename the filename
5266  * @param encoding a free form C string describing the HTML document encoding, or NULL
5267  * @param sax the SAX handler block
5268  * @param userData if using SAX, this pointer will be provided on callbacks.
5269  *
5270  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5271  * compressed document is provided by default if found at compile-time.
5272  * It use the given SAX function block to handle the parsing callback.
5273  * If sax is NULL, fallback to the default DOM tree building routines.
5274  *
5275  * Returns the resulting document tree unless SAX is NULL or the document is
5276  *     not well formed.
5277  */
5278 
5279 htmlDocPtr
htmlSAXParseFile(const char * filename,const char * encoding,htmlSAXHandlerPtr sax,void * userData)5280 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
5281                  void *userData) {
5282     htmlDocPtr ret;
5283     htmlParserCtxtPtr ctxt;
5284     htmlSAXHandlerPtr oldsax = NULL;
5285 
5286     xmlInitParser();
5287 
5288     ctxt = htmlCreateFileParserCtxt(filename, encoding);
5289     if (ctxt == NULL) return(NULL);
5290     if (sax != NULL) {
5291         oldsax = ctxt->sax;
5292         ctxt->sax = sax;
5293         ctxt->userData = userData;
5294     }
5295 
5296     htmlParseDocument(ctxt);
5297 
5298     ret = ctxt->myDoc;
5299     if (sax != NULL) {
5300         ctxt->sax = oldsax;
5301         ctxt->userData = NULL;
5302     }
5303     htmlFreeParserCtxt(ctxt);
5304 
5305     return(ret);
5306 }
5307 
5308 /**
5309  * htmlParseFile:
5310  * @param filename the filename
5311  * @param encoding a free form C string describing the HTML document encoding, or NULL
5312  *
5313  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5314  * compressed document is provided by default if found at compile-time.
5315  *
5316  * Returns the resulting document tree
5317  */
5318 
5319 htmlDocPtr
htmlParseFile(const char * filename,const char * encoding)5320 htmlParseFile(const char *filename, const char *encoding) {
5321     return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5322 }
5323 
5324 /**
5325  * htmlHandleOmittedElem:
5326  * @param val int 0 or 1
5327  *
5328  * Set and return the previous value for handling HTML omitted tags.
5329  *
5330  * Returns the last value for 0 for no handling, 1 for auto insertion.
5331  */
5332 
5333 int
htmlHandleOmittedElem(int val)5334 htmlHandleOmittedElem(int val) {
5335     int old = htmlOmittedDefaultValue;
5336 
5337 
5338     return(old);
5339 }
5340 
5341 /**
5342  * htmlElementAllowedHere:
5343  * @param parent HTML parent element
5344  * @param elt HTML element
5345  *
5346  * Checks whether an HTML element may be a direct child of a parent element.
5347  * Note - doesn't check for deprecated elements
5348  *
5349  * Returns 1 if allowed; 0 otherwise.
5350  */
5351 int
htmlElementAllowedHere(const htmlElemDesc * parent,const xmlChar * elt)5352 htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
5353   const char** p ;
5354 
5355   if ( ! elt || ! parent || ! parent->subelts )
5356         return 0 ;
5357 
5358   for ( p = parent->subelts; *p; ++p )
5359     if ( !xmlStrcmp((const xmlChar *)*p, elt) )
5360       return 1 ;
5361 
5362   return 0 ;
5363 }
5364 /**
5365  * htmlElementStatusHere:
5366  * @param parent HTML parent element
5367  * @param elt HTML element
5368  *
5369  * Checks whether an HTML element may be a direct child of a parent element.
5370  * and if so whether it is valid or deprecated.
5371  *
5372  * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5373  */
5374 htmlStatus
htmlElementStatusHere(const htmlElemDesc * parent,const htmlElemDesc * elt)5375 htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
5376   if ( ! parent || ! elt )
5377     return HTML_INVALID ;
5378   if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
5379     return HTML_INVALID ;
5380 
5381   return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
5382 }
5383 /**
5384  * htmlAttrAllowed:
5385  * @param elt HTML element
5386  * @param attr HTML attribute
5387  * @param legacy whether to allow deprecated attributes
5388  *
5389  * Checks whether an attribute is valid for an element
5390  * Has full knowledge of Required and Deprecated attributes
5391  *
5392  * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5393  */
5394 htmlStatus
htmlAttrAllowed(const htmlElemDesc * elt,const xmlChar * attr,int legacy)5395 htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
5396   const char** p ;
5397 
5398   if ( !elt || ! attr )
5399         return HTML_INVALID ;
5400 
5401   if ( elt->attrs_req )
5402     for ( p = elt->attrs_req; *p; ++p)
5403       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5404         return HTML_REQUIRED ;
5405 
5406   if ( elt->attrs_opt )
5407     for ( p = elt->attrs_opt; *p; ++p)
5408       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5409         return HTML_VALID ;
5410 
5411   if ( legacy && elt->attrs_depr )
5412     for ( p = elt->attrs_depr; *p; ++p)
5413       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5414         return HTML_DEPRECATED ;
5415 
5416   return HTML_INVALID ;
5417 }
5418 /**
5419  * htmlNodeStatus:
5420  * @param node an htmlNodePtr in a tree
5421  * @param legacy whether to allow deprecated elements (YES is faster here
5422  *      for Element nodes)
5423  *
5424  * Checks whether the tree node is valid.  Experimental (the author
5425  *     only uses the HTML enhancements in a SAX parser)
5426  *
5427  * Return: for Element nodes, a return from htmlElementAllowedHere (if
5428  *      legacy allowed) or htmlElementStatusHere (otherwise).
5429  *      for Attribute nodes, a return from htmlAttrAllowed
5430  *      for other nodes, HTML_NA (no checks performed)
5431  */
5432 htmlStatus
htmlNodeStatus(const htmlNodePtr node,int legacy)5433 htmlNodeStatus(const htmlNodePtr node, int legacy) {
5434   if ( ! node )
5435     return HTML_INVALID ;
5436 
5437   switch ( node->type ) {
5438     case XML_ELEMENT_NODE:
5439       return legacy
5440         ? ( htmlElementAllowedHere (
5441                 htmlTagLookup(node->parent->name) , node->name
5442                 ) ? HTML_VALID : HTML_INVALID )
5443         : htmlElementStatusHere(
5444                 htmlTagLookup(node->parent->name) ,
5445                 htmlTagLookup(node->name) )
5446         ;
5447     case XML_ATTRIBUTE_NODE:
5448       return htmlAttrAllowed(
5449         htmlTagLookup(node->parent->name) , node->name, legacy) ;
5450     default: return HTML_NA ;
5451   }
5452 }
5453 /************************************************************************
5454  *                                                                      *
5455  *      New set (2.6.0) of simpler and more flexible APIs               *
5456  *                                                                      *
5457  ************************************************************************/
/**
 * DICT_FREE:
 * @param str a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope.  A variable named dict must be visible where the macro
 * is used.  The expansion is a single statement and must be followed by
 * a semicolon at the point of use.
 */
#define DICT_FREE(str)                                              \
        do {                                                        \
            if ((str) && ((!dict) ||                                \
                (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))  \
                xmlFree((char *)(str));                             \
        } while (0)
5469 
5470 /**
5471  * htmlCtxtReset:
5472  * @param ctxt an HTML parser context
5473  *
5474  * Reset a parser context
5475  */
5476 void
htmlCtxtReset(htmlParserCtxtPtr ctxt)5477 htmlCtxtReset(htmlParserCtxtPtr ctxt)
5478 {
5479     xmlParserInputPtr input;
5480     xmlDictPtr dict = ctxt->dict;
5481 
5482     while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
5483         xmlFreeInputStream(input);
5484     }
5485     ctxt->inputNr = 0;
5486     ctxt->input = NULL;
5487 
5488     ctxt->spaceNr = 0;
5489     ctxt->spaceTab[0] = -1;
5490     ctxt->space = &ctxt->spaceTab[0];
5491 
5492 
5493     ctxt->nodeNr = 0;
5494     ctxt->node = NULL;
5495 
5496     ctxt->nameNr = 0;
5497     ctxt->name = NULL;
5498 
5499     DICT_FREE(ctxt->version);
5500     ctxt->version = NULL;
5501     DICT_FREE(ctxt->encoding);
5502     ctxt->encoding = NULL;
5503     DICT_FREE(ctxt->directory);
5504     ctxt->directory = NULL;
5505     DICT_FREE(ctxt->extSubURI);
5506     ctxt->extSubURI = NULL;
5507     DICT_FREE(ctxt->extSubSystem);
5508     ctxt->extSubSystem = NULL;
5509     if (ctxt->myDoc != NULL)
5510         xmlFreeDoc(ctxt->myDoc);
5511     ctxt->myDoc = NULL;
5512 
5513     ctxt->standalone = -1;
5514     ctxt->hasExternalSubset = 0;
5515     ctxt->hasPErefs = 0;
5516     ctxt->html = 1;
5517     ctxt->external = 0;
5518     ctxt->instate = XML_PARSER_START;
5519     ctxt->token = 0;
5520 
5521     ctxt->wellFormed = 1;
5522     ctxt->nsWellFormed = 1;
5523     ctxt->valid = 1;
5524     ctxt->vctxt.userData = ctxt;
5525     ctxt->vctxt.error = xmlParserValidityError;
5526     ctxt->vctxt.warning = xmlParserValidityWarning;
5527     ctxt->record_info = 0;
5528     ctxt->nbChars = 0;
5529     ctxt->checkIndex = 0;
5530     ctxt->inSubset = 0;
5531     ctxt->errNo = XML_ERR_OK;
5532     ctxt->depth = 0;
5533     ctxt->charset = XML_CHAR_ENCODING_UTF8;
5534     ctxt->catalogs = NULL;
5535     xmlInitNodeInfoSeq(&ctxt->node_seq);
5536 
5537     if (ctxt->attsDefault != NULL) {
5538         xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
5539         ctxt->attsDefault = NULL;
5540     }
5541     if (ctxt->attsSpecial != NULL) {
5542         xmlHashFree(ctxt->attsSpecial, NULL);
5543         ctxt->attsSpecial = NULL;
5544     }
5545 }
5546 
5547 /**
5548  * htmlCtxtUseOptions:
5549  * @param ctxt an HTML parser context
5550  * @param options a combination of htmlParserOption(s)
5551  *
5552  * Applies the options to the parser context
5553  *
5554  * Returns 0 in case of success, the set of unknown or unimplemented options
5555  *         in case of error.
5556  */
5557 int
htmlCtxtUseOptions(htmlParserCtxtPtr ctxt,int options)5558 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
5559 {
5560     if (options & HTML_PARSE_NOWARNING) {
5561         ctxt->sax->warning = NULL;
5562         ctxt->vctxt.warning = NULL;
5563         options -= XML_PARSE_NOWARNING;
5564         ctxt->options |= XML_PARSE_NOWARNING;
5565     }
5566     if (options & HTML_PARSE_NOERROR) {
5567         ctxt->sax->error = NULL;
5568         ctxt->vctxt.error = NULL;
5569         ctxt->sax->fatalError = NULL;
5570         options -= XML_PARSE_NOERROR;
5571         ctxt->options |= XML_PARSE_NOERROR;
5572     }
5573     if (options & HTML_PARSE_PEDANTIC) {
5574         ctxt->pedantic = 1;
5575         options -= XML_PARSE_PEDANTIC;
5576         ctxt->options |= XML_PARSE_PEDANTIC;
5577     } else
5578         ctxt->pedantic = 0;
5579     if (options & XML_PARSE_NOBLANKS) {
5580         ctxt->keepBlanks = 0;
5581         ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
5582         options -= XML_PARSE_NOBLANKS;
5583         ctxt->options |= XML_PARSE_NOBLANKS;
5584     } else
5585         ctxt->keepBlanks = 1;
5586     ctxt->dictNames = 0;
5587     return (options);
5588 }
5589 
5590 /**
5591  * htmlDoRead:
5592  * @param ctxt an HTML parser context
5593  * @param URL the base URL to use for the document
5594  * @param encoding the document encoding, or NULL
5595  * @param options a combination of htmlParserOption(s)
5596  * @param reuse keep the context for reuse
5597  *
5598  * Common front-end for the htmlRead functions
5599  *
5600  * Returns the resulting document tree or NULL
5601  */
5602 static htmlDocPtr
htmlDoRead(htmlParserCtxtPtr ctxt,const char * URL,const char * encoding,int options,int reuse)5603 htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
5604           int options, int reuse)
5605 {
5606     htmlDocPtr ret;
5607 
5608     htmlCtxtUseOptions(ctxt, options);
5609     ctxt->html = 1;
5610     if (encoding != NULL) {
5611         xmlCharEncodingHandlerPtr hdlr;
5612 
5613         hdlr = xmlFindCharEncodingHandler(encoding);
5614         if (hdlr != NULL)
5615             xmlSwitchToEncoding(ctxt, hdlr);
5616     }
5617     if ((URL != NULL) && (ctxt->input != NULL) &&
5618         (ctxt->input->filename == NULL))
5619         ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
5620     htmlParseDocument(ctxt);
5621     ret = ctxt->myDoc;
5622     ctxt->myDoc = NULL;
5623     if (!reuse) {
5624         if ((ctxt->dictNames) &&
5625             (ret != NULL) &&
5626             (ret->dict == ctxt->dict))
5627             ctxt->dict = NULL;
5628         xmlFreeParserCtxt(ctxt);
5629     }
5630     return (ret);
5631 }
5632 
5633 /**
5634  * htmlReadDoc:
5635  * @param cur a pointer to a zero terminated string
5636  * @param URL the base URL to use for the document
5637  * @param encoding the document encoding, or NULL
5638  * @param options a combination of htmlParserOption(s)
5639  *
5640  * parse an XML in-memory document and build a tree.
5641  *
5642  * Returns the resulting document tree
5643  */
5644 htmlDocPtr
htmlReadDoc(const xmlChar * cur,const char * URL,const char * encoding,int options)5645 htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
5646 {
5647     htmlParserCtxtPtr ctxt;
5648 
5649     if (cur == NULL)
5650         return (NULL);
5651 
5652     ctxt = xmlCreateDocParserCtxt(cur, sizeof(cur));
5653     if (ctxt == NULL)
5654         return (NULL);
5655     return (htmlDoRead(ctxt, URL, encoding, options, 0));
5656 }
5657 
5658 /**
5659  * htmlReadFile:
5660  * @param filename a file or URL
5661  * @param encoding the document encoding, or NULL
5662  * @param options a combination of htmlParserOption(s)
5663  *
5664  * parse an XML file from the filesystem or the network.
5665  *
5666  * Returns the resulting document tree
5667  */
5668 htmlDocPtr
htmlReadFile(const char * filename,const char * encoding,int options)5669 htmlReadFile(const char *filename, const char *encoding, int options)
5670 {
5671     htmlParserCtxtPtr ctxt;
5672 
5673     ctxt = htmlCreateFileParserCtxt(filename, encoding);
5674     if (ctxt == NULL)
5675         return (NULL);
5676     return (htmlDoRead(ctxt, NULL, NULL, options, 0));
5677 }
5678 
5679 /**
5680  * htmlReadMemory:
5681  * @param buffer a pointer to a char array
5682  * @param size the size of the array
5683  * @param URL the base URL to use for the document
5684  * @param encoding the document encoding, or NULL
5685  * @param options a combination of htmlParserOption(s)
5686  *
5687  * parse an XML in-memory document and build a tree.
5688  *
5689  * Returns the resulting document tree
5690  */
5691 htmlDocPtr
htmlReadMemory(const char * buffer,int size,const char * URL,const char * encoding,int options)5692 htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
5693 {
5694     htmlParserCtxtPtr ctxt;
5695 
5696     ctxt = xmlCreateMemoryParserCtxt(buffer, size);
5697     if (ctxt == NULL)
5698         return (NULL);
5699     return (htmlDoRead(ctxt, URL, encoding, options, 0));
5700 }
5701 
5702 /**
5703  * htmlReadFd:
5704  * @param fd an open file descriptor
5705  * @param URL the base URL to use for the document
5706  * @param encoding the document encoding, or NULL
5707  * @param options a combination of htmlParserOption(s)
5708  *
5709  * parse an XML from a file descriptor and build a tree.
5710  *
5711  * Returns the resulting document tree
5712  */
5713 htmlDocPtr
htmlReadFd(int fd,const char * URL,const char * encoding,int options)5714 htmlReadFd(int fd, const char *URL, const char *encoding, int options)
5715 {
5716     htmlParserCtxtPtr ctxt;
5717     xmlParserInputBufferPtr input;
5718     xmlParserInputPtr stream;
5719 
5720     if (fd < 0)
5721         return (NULL);
5722 
5723     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
5724     if (input == NULL)
5725         return (NULL);
5726     ctxt = xmlNewParserCtxt();
5727     if (ctxt == NULL) {
5728         xmlFreeParserInputBuffer(input);
5729         return (NULL);
5730     }
5731     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5732     if (stream == NULL) {
5733         xmlFreeParserInputBuffer(input);
5734         xmlFreeParserCtxt(ctxt);
5735         return (NULL);
5736     }
5737     inputPush(ctxt, stream);
5738     return (htmlDoRead(ctxt, URL, encoding, options, 0));
5739 }
5740 
5741 /**
5742  * htmlReadIO:
5743  * @param ioread an I/O read function
5744  * @param ioclose an I/O close function
5745  * @param ioctx an I/O handler
5746  * @param URL the base URL to use for the document
5747  * @param encoding the document encoding, or NULL
5748  * @param options a combination of htmlParserOption(s)
5749  *
5750  * parse an HTML document from I/O functions and source and build a tree.
5751  *
5752  * Returns the resulting document tree
5753  */
5754 htmlDocPtr
htmlReadIO(xmlInputReadCallback ioread,xmlInputCloseCallback ioclose,void * ioctx,const char * URL,const char * encoding,int options)5755 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
5756           void *ioctx, const char *URL, const char *encoding, int options)
5757 {
5758     htmlParserCtxtPtr ctxt;
5759     xmlParserInputBufferPtr input;
5760     xmlParserInputPtr stream;
5761 
5762     if (ioread == NULL)
5763         return (NULL);
5764 
5765     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
5766                                          XML_CHAR_ENCODING_NONE);
5767     if (input == NULL)
5768         return (NULL);
5769     ctxt = xmlNewParserCtxt();
5770     if (ctxt == NULL) {
5771         xmlFreeParserInputBuffer(input);
5772         return (NULL);
5773     }
5774     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5775     if (stream == NULL) {
5776         xmlFreeParserInputBuffer(input);
5777         xmlFreeParserCtxt(ctxt);
5778         return (NULL);
5779     }
5780     inputPush(ctxt, stream);
5781     return (htmlDoRead(ctxt, URL, encoding, options, 0));
5782 }
5783 
5784 /**
5785  * htmlCtxtReadDoc:
5786  * @param ctxt an HTML parser context
5787  * @param cur a pointer to a zero terminated string
5788  * @param URL the base URL to use for the document
5789  * @param encoding the document encoding, or NULL
5790  * @param options a combination of htmlParserOption(s)
5791  *
5792  * parse an XML in-memory document and build a tree.
5793  * This reuses the existing ctxt parser context
5794  *
5795  * Returns the resulting document tree
5796  */
5797 htmlDocPtr
htmlCtxtReadDoc(htmlParserCtxtPtr ctxt,const xmlChar * cur,const char * URL,const char * encoding,int options)5798 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
5799                const char *URL, const char *encoding, int options)
5800 {
5801     xmlParserInputPtr stream;
5802 
5803     if (cur == NULL)
5804         return (NULL);
5805     if (ctxt == NULL)
5806         return (NULL);
5807 
5808     htmlCtxtReset(ctxt);
5809 
5810     stream = xmlNewStringInputStream(ctxt, cur);
5811     if (stream == NULL) {
5812         return (NULL);
5813     }
5814     inputPush(ctxt, stream);
5815     return (htmlDoRead(ctxt, URL, encoding, options, 1));
5816 }
5817 
5818 /**
5819  * htmlCtxtReadFile:
5820  * @param ctxt an HTML parser context
5821  * @param filename a file or URL
5822  * @param encoding the document encoding, or NULL
5823  * @param options a combination of htmlParserOption(s)
5824  *
5825  * parse an XML file from the filesystem or the network.
5826  * This reuses the existing ctxt parser context
5827  *
5828  * Returns the resulting document tree
5829  */
5830 htmlDocPtr
htmlCtxtReadFile(htmlParserCtxtPtr ctxt,const char * filename,const char * encoding,int options)5831 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
5832                 const char *encoding, int options)
5833 {
5834     xmlParserInputPtr stream;
5835 
5836     if (filename == NULL)
5837         return (NULL);
5838     if (ctxt == NULL)
5839         return (NULL);
5840 
5841     htmlCtxtReset(ctxt);
5842 
5843     stream = xmlNewInputFromFile(ctxt, filename);
5844     if (stream == NULL) {
5845         return (NULL);
5846     }
5847     inputPush(ctxt, stream);
5848     return (htmlDoRead(ctxt, NULL, encoding, options, 1));
5849 }
5850 
5851 /**
5852  * htmlCtxtReadMemory:
5853  * @param ctxt an HTML parser context
5854  * @param buffer a pointer to a char array
5855  * @param size the size of the array
5856  * @param URL the base URL to use for the document
5857  * @param encoding the document encoding, or NULL
5858  * @param options a combination of htmlParserOption(s)
5859  *
5860  * parse an XML in-memory document and build a tree.
5861  * This reuses the existing ctxt parser context
5862  *
5863  * Returns the resulting document tree
5864  */
5865 htmlDocPtr
htmlCtxtReadMemory(htmlParserCtxtPtr ctxt,const char * buffer,int size,const char * URL,const char * encoding,int options)5866 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
5867                   const char *URL, const char *encoding, int options)
5868 {
5869     xmlParserInputBufferPtr input;
5870     xmlParserInputPtr stream;
5871 
5872     if (ctxt == NULL)
5873         return (NULL);
5874     if (buffer == NULL)
5875         return (NULL);
5876 
5877     htmlCtxtReset(ctxt);
5878 
5879     input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
5880     if (input == NULL) {
5881         return(NULL);
5882     }
5883 
5884     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5885     if (stream == NULL) {
5886         xmlFreeParserInputBuffer(input);
5887         return(NULL);
5888     }
5889 
5890     inputPush(ctxt, stream);
5891     return (htmlDoRead(ctxt, URL, encoding, options, 1));
5892 }
5893 
5894 /**
5895  * htmlCtxtReadFd:
5896  * @param ctxt an HTML parser context
5897  * @param fd an open file descriptor
5898  * @param URL the base URL to use for the document
5899  * @param encoding the document encoding, or NULL
5900  * @param options a combination of htmlParserOption(s)
5901  *
5902  * parse an XML from a file descriptor and build a tree.
5903  * This reuses the existing ctxt parser context
5904  *
5905  * Returns the resulting document tree
5906  */
5907 htmlDocPtr
htmlCtxtReadFd(htmlParserCtxtPtr ctxt,int fd,const char * URL,const char * encoding,int options)5908 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
5909               const char *URL, const char *encoding, int options)
5910 {
5911     xmlParserInputBufferPtr input;
5912     xmlParserInputPtr stream;
5913 
5914     if (fd < 0)
5915         return (NULL);
5916     if (ctxt == NULL)
5917         return (NULL);
5918 
5919     htmlCtxtReset(ctxt);
5920 
5921 
5922     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
5923     if (input == NULL)
5924         return (NULL);
5925     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5926     if (stream == NULL) {
5927         xmlFreeParserInputBuffer(input);
5928         return (NULL);
5929     }
5930     inputPush(ctxt, stream);
5931     return (htmlDoRead(ctxt, URL, encoding, options, 1));
5932 }
5933 
5934 /**
5935  * htmlCtxtReadIO:
5936  * @param ctxt an HTML parser context
5937  * @param ioread an I/O read function
5938  * @param ioclose an I/O close function
5939  * @param ioctx an I/O handler
5940  * @param URL the base URL to use for the document
5941  * @param encoding the document encoding, or NULL
5942  * @param options a combination of htmlParserOption(s)
5943  *
5944  * parse an HTML document from I/O functions and source and build a tree.
5945  * This reuses the existing ctxt parser context
5946  *
5947  * Returns the resulting document tree
5948  */
5949 htmlDocPtr
htmlCtxtReadIO(htmlParserCtxtPtr ctxt,xmlInputReadCallback ioread,xmlInputCloseCallback ioclose,void * ioctx,const char * URL,const char * encoding,int options)5950 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
5951               xmlInputCloseCallback ioclose, void *ioctx,
5952               const char *URL,
5953               const char *encoding, int options)
5954 {
5955     xmlParserInputBufferPtr input;
5956     xmlParserInputPtr stream;
5957 
5958     if (ioread == NULL)
5959         return (NULL);
5960     if (ctxt == NULL)
5961         return (NULL);
5962 
5963     htmlCtxtReset(ctxt);
5964 
5965     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
5966                                          XML_CHAR_ENCODING_NONE);
5967     if (input == NULL)
5968         return (NULL);
5969     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
5970     if (stream == NULL) {
5971         xmlFreeParserInputBuffer(input);
5972         return (NULL);
5973     }
5974     inputPush(ctxt, stream);
5975     return (htmlDoRead(ctxt, URL, encoding, options, 1));
5976 }
5977 
5978 #endif /* LIBXML_HTML_ENABLED */
5979