xref: /reactos/sdk/lib/3rdparty/libxml2/HTMLparser.c (revision 0d5a4166)
1 /*
2  * HTMLparser.c : an HTML 4.0 non-verifying parser
3  *
4  * See Copyright for the status of this software.
5  *
6  * daniel@veillard.com
7  */
8 
9 #define IN_LIBXML
10 #include "libxml.h"
11 #ifdef LIBXML_HTML_ENABLED
12 
13 #include <string.h>
14 #include <ctype.h>
15 #include <stdlib.h>
16 
17 #include <libxml/xmlmemory.h>
18 #include <libxml/tree.h>
19 #include <libxml/parser.h>
20 #include <libxml/parserInternals.h>
21 #include <libxml/xmlerror.h>
22 #include <libxml/HTMLparser.h>
23 #include <libxml/HTMLtree.h>
24 #include <libxml/entities.h>
25 #include <libxml/encoding.h>
26 #include <libxml/valid.h>
27 #include <libxml/xmlIO.h>
28 #include <libxml/globals.h>
29 #include <libxml/uri.h>
30 
31 #include "buf.h"
32 #include "enc.h"
33 
/*
 * Size constants for the HTML parser.
 * NOTE(review): the code that consumes these limits is outside this part of
 * the file; confirm their exact use (name length / text buffers) there.
 */
34 #define HTML_MAX_NAMELEN 1000
35 #define HTML_PARSER_BIG_BUFFER_SIZE 1000
36 #define HTML_PARSER_BUFFER_SIZE 100
37 
38 /* #define DEBUG */
39 /* #define DEBUG_PUSH */
40 
/*
 * File-local flag, initialised to 1.
 * NOTE(review): its readers are not visible in this part of the file.
 */
41 static int htmlOmittedDefaultValue = 1;
42 
/*
 * Prototypes for functions used before their definition; the definitions
 * are presumably further down in this file (not visible here).
 */
43 xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
44 			     xmlChar end, xmlChar  end2, xmlChar end3);
45 static void htmlParseComment(htmlParserCtxtPtr ctxt);
46 
47 /************************************************************************
48  *									*
49  *		Some factorized error routines				*
50  *									*
51  ************************************************************************/
52 
53 /**
54  * htmlErrMemory:
55  * @ctxt:  an HTML parser context
56  * @extra:  extra information
57  *
58  * Handle a redefinition of attribute error
59  */
60 static void
htmlErrMemory(xmlParserCtxtPtr ctxt,const char * extra)61 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
62 {
63     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
64         (ctxt->instate == XML_PARSER_EOF))
65 	return;
66     if (ctxt != NULL) {
67         ctxt->errNo = XML_ERR_NO_MEMORY;
68         ctxt->instate = XML_PARSER_EOF;
69         ctxt->disableSAX = 1;
70     }
71     if (extra)
72         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
73                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
74                         NULL, NULL, 0, 0,
75                         "Memory allocation failed : %s\n", extra);
76     else
77         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
78                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
79                         NULL, NULL, 0, 0, "Memory allocation failed\n");
80 }
81 
82 /**
83  * htmlParseErr:
84  * @ctxt:  an HTML parser context
85  * @error:  the error number
86  * @msg:  the error message
87  * @str1:  string infor
88  * @str2:  string infor
89  *
90  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
91  */
92 static void LIBXML_ATTR_FORMAT(3,0)
htmlParseErr(xmlParserCtxtPtr ctxt,xmlParserErrors error,const char * msg,const xmlChar * str1,const xmlChar * str2)93 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
94              const char *msg, const xmlChar *str1, const xmlChar *str2)
95 {
96     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
97         (ctxt->instate == XML_PARSER_EOF))
98 	return;
99     if (ctxt != NULL)
100 	ctxt->errNo = error;
101     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
102                     XML_ERR_ERROR, NULL, 0,
103 		    (const char *) str1, (const char *) str2,
104 		    NULL, 0, 0,
105 		    msg, str1, str2);
106     if (ctxt != NULL)
107 	ctxt->wellFormed = 0;
108 }
109 
110 /**
111  * htmlParseErrInt:
112  * @ctxt:  an HTML parser context
113  * @error:  the error number
114  * @msg:  the error message
115  * @val:  integer info
116  *
117  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
118  */
119 static void LIBXML_ATTR_FORMAT(3,0)
htmlParseErrInt(xmlParserCtxtPtr ctxt,xmlParserErrors error,const char * msg,int val)120 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
121              const char *msg, int val)
122 {
123     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
124         (ctxt->instate == XML_PARSER_EOF))
125 	return;
126     if (ctxt != NULL)
127 	ctxt->errNo = error;
128     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
129                     XML_ERR_ERROR, NULL, 0, NULL, NULL,
130 		    NULL, val, 0, msg, val);
131     if (ctxt != NULL)
132 	ctxt->wellFormed = 0;
133 }
134 
135 /************************************************************************
136  *									*
137  *	Parser stacks related functions and macros		*
138  *									*
139  ************************************************************************/
140 
141 /**
142  * htmlnamePush:
143  * @ctxt:  an HTML parser context
144  * @value:  the element name
145  *
146  * Pushes a new element name on top of the name stack
147  *
148  * Returns 0 in case of error, the index in the stack otherwise
149  */
150 static int
htmlnamePush(htmlParserCtxtPtr ctxt,const xmlChar * value)151 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
152 {
153     if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
154         ctxt->html = 3;
155     if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
156         ctxt->html = 10;
157     if (ctxt->nameNr >= ctxt->nameMax) {
158         ctxt->nameMax *= 2;
159         ctxt->nameTab = (const xmlChar * *)
160                          xmlRealloc((xmlChar * *)ctxt->nameTab,
161                                     ctxt->nameMax *
162                                     sizeof(ctxt->nameTab[0]));
163         if (ctxt->nameTab == NULL) {
164             htmlErrMemory(ctxt, NULL);
165             return (0);
166         }
167     }
168     ctxt->nameTab[ctxt->nameNr] = value;
169     ctxt->name = value;
170     return (ctxt->nameNr++);
171 }
172 /**
173  * htmlnamePop:
174  * @ctxt: an HTML parser context
175  *
176  * Pops the top element name from the name stack
177  *
178  * Returns the name just removed
179  */
180 static const xmlChar *
htmlnamePop(htmlParserCtxtPtr ctxt)181 htmlnamePop(htmlParserCtxtPtr ctxt)
182 {
183     const xmlChar *ret;
184 
185     if (ctxt->nameNr <= 0)
186         return (NULL);
187     ctxt->nameNr--;
188     if (ctxt->nameNr < 0)
189         return (NULL);
190     if (ctxt->nameNr > 0)
191         ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
192     else
193         ctxt->name = NULL;
194     ret = ctxt->nameTab[ctxt->nameNr];
195     ctxt->nameTab[ctxt->nameNr] = NULL;
196     return (ret);
197 }
198 
199 /**
200  * htmlNodeInfoPush:
201  * @ctxt:  an HTML parser context
202  * @value:  the node info
203  *
204  * Pushes a new element name on top of the node info stack
205  *
206  * Returns 0 in case of error, the index in the stack otherwise
207  */
208 static int
htmlNodeInfoPush(htmlParserCtxtPtr ctxt,htmlParserNodeInfo * value)209 htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
210 {
211     if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
212         if (ctxt->nodeInfoMax == 0)
213                 ctxt->nodeInfoMax = 5;
214         ctxt->nodeInfoMax *= 2;
215         ctxt->nodeInfoTab = (htmlParserNodeInfo *)
216                          xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
217                                     ctxt->nodeInfoMax *
218                                     sizeof(ctxt->nodeInfoTab[0]));
219         if (ctxt->nodeInfoTab == NULL) {
220             htmlErrMemory(ctxt, NULL);
221             return (0);
222         }
223     }
224     ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
225     ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
226     return (ctxt->nodeInfoNr++);
227 }
228 
229 /**
230  * htmlNodeInfoPop:
231  * @ctxt:  an HTML parser context
232  *
233  * Pops the top element name from the node info stack
234  *
235  * Returns 0 in case of error, the pointer to NodeInfo otherwise
236  */
237 static htmlParserNodeInfo *
htmlNodeInfoPop(htmlParserCtxtPtr ctxt)238 htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
239 {
240     if (ctxt->nodeInfoNr <= 0)
241         return (NULL);
242     ctxt->nodeInfoNr--;
243     if (ctxt->nodeInfoNr < 0)
244         return (NULL);
245     if (ctxt->nodeInfoNr > 0)
246         ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
247     else
248         ctxt->nodeInfo = NULL;
249     return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
250 }
251 
/*
 * Macros for accessing the input. They are meant for the parser only and
 * are not exported. Every one of them expects a variable named `ctxt`
 * (the parser context) to be in scope at the point of use.
 *
 * Byte-level macros: they look at raw bytes of the input buffer and should
 * only be used to compare against ASCII values.
 *
 *   CUR_PTR   the current input pointer (ctxt->input->cur).
 *   BASE_PTR  the base of the input buffer (ctxt->input->base).
 *   CUR       the byte at the current position, as an int.
 *   CURRENT   same value as CUR.
 *   RAW       -1 when ctxt->token is non-zero, else the current byte.
 *   NXT(n)    the byte n positions after the current one.
 *   UPPER     the current byte passed through toupper().
 *   UPP(n)    the byte n positions ahead passed through toupper().
 *   SKIP(n)   advance the input pointer and the column counter by n; the
 *             line counter is not touched, so the skipped bytes must not
 *             contain a newline.
 *
 * Buffer management (statement macros, use them as `NAME;`):
 *
 *   SHRINK    call xmlParserInputShrink() when more than 2 * INPUT_CHUNK
 *             bytes lie before the current position and fewer than
 *             2 * INPUT_CHUNK bytes lie after it.
 *   GROW      when ctxt->progressive is 0 and fewer than INPUT_CHUNK bytes
 *             remain, call xmlParserInputGrow().
 *
 * Character-level macros:
 *
 *   NEXT         advance with xmlNextChar().
 *   NEXTL(l)     advance the input pointer by l bytes and clear
 *                ctxt->token; a '\n' at the current position bumps the
 *                line counter and resets the column to 1, anything else
 *                bumps the column by one.
 *   CUR_CHAR(l)  htmlCurrentChar(): current character, its byte length is
 *                stored in l.
 *   CUR_SCHAR(s, l)  xmlStringCurrentChar() on string s, length in l.
 *   COPY_BUF(l,b,i,v)  append character v of byte length l to buffer b at
 *                index i and advance i; single bytes are stored directly,
 *                longer characters go through xmlCopyChar().
 *   SKIP_BLANKS  htmlSkipBlankChars(): skip blanks at the current position.
 *
 * SHRINK, GROW and COPY_BUF are wrapped in do { } while (0) so that they
 * behave as a single statement (no dangling-else surprises), and macro
 * arguments are parenthesized so that expression arguments are safe.
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) ctxt->input->cur += (val),ctxt->input->col+=(val)

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur
#define BASE_PTR ctxt->input->base

#define SHRINK do {							\
    if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) &&	\
        (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK))	\
	xmlParserInputShrink(ctxt->input);				\
  } while (0)

#define GROW do {							\
    if ((ctxt->progressive == 0) &&					\
        (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))		\
	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);			\
  } while (0)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l);				\
  } while (0)

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

#define COPY_BUF(l,b,i,v) do {						\
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);				\
    else (i) += xmlCopyChar((l),&(b)[(i)],(v));				\
  } while (0)
332 
333 /**
334  * htmlFindEncoding:
335  * @the HTML parser context
336  *
337  * Ty to find and encoding in the current data available in the input
338  * buffer this is needed to try to switch to the proper encoding when
339  * one face a character error.
340  * That's an heuristic, since it's operating outside of parsing it could
341  * try to use a meta which had been commented out, that's the reason it
342  * should only be used in case of error, not as a default.
343  *
344  * Returns an encoding string or NULL if not found, the string need to
345  *   be freed
346  */
347 static xmlChar *
htmlFindEncoding(xmlParserCtxtPtr ctxt)348 htmlFindEncoding(xmlParserCtxtPtr ctxt) {
349     const xmlChar *start, *cur, *end;
350 
351     if ((ctxt == NULL) || (ctxt->input == NULL) ||
352         (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
353         (ctxt->input->buf->encoder != NULL))
354         return(NULL);
355     if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
356         return(NULL);
357 
358     start = ctxt->input->cur;
359     end = ctxt->input->end;
360     /* we also expect the input buffer to be zero terminated */
361     if (*end != 0)
362         return(NULL);
363 
364     cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
365     if (cur == NULL)
366         return(NULL);
367     cur = xmlStrcasestr(cur, BAD_CAST  "CONTENT");
368     if (cur == NULL)
369         return(NULL);
370     cur = xmlStrcasestr(cur, BAD_CAST  "CHARSET=");
371     if (cur == NULL)
372         return(NULL);
373     cur += 8;
374     start = cur;
375     while (((*cur >= 'A') && (*cur <= 'Z')) ||
376            ((*cur >= 'a') && (*cur <= 'z')) ||
377            ((*cur >= '0') && (*cur <= '9')) ||
378            (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
379            cur++;
380     if (cur == start)
381         return(NULL);
382     return(xmlStrndup(start, cur - start));
383 }
384 
385 /**
386  * htmlCurrentChar:
387  * @ctxt:  the HTML parser context
388  * @len:  pointer to the length of the char read
389  *
390  * The current char value, if using UTF-8 this may actually span multiple
391  * bytes in the input buffer. Implement the end of line normalization:
392  * 2.11 End-of-Line Handling
393  * If the encoding is unspecified, in the case we find an ISO-Latin-1
394  * char, then the encoding converter is plugged in automatically.
395  *
396  * Returns the current char value and its length
397  */
398 
399 static int
htmlCurrentChar(xmlParserCtxtPtr ctxt,int * len)400 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
401     const unsigned char *cur;
402     unsigned char c;
403     unsigned int val;
404 
405     if (ctxt->instate == XML_PARSER_EOF)
406 	return(0);
407 
408     if (ctxt->token != 0) {
409 	*len = 0;
410 	return(ctxt->token);
411     }
412     if (ctxt->charset != XML_CHAR_ENCODING_UTF8) {
413         xmlChar * guess;
414         xmlCharEncodingHandlerPtr handler;
415 
416         /*
417          * Assume it's a fixed length encoding (1) with
418          * a compatible encoding for the ASCII set, since
419          * HTML constructs only use < 128 chars
420          */
421         if ((int) *ctxt->input->cur < 0x80) {
422             *len = 1;
423             if ((*ctxt->input->cur == 0) &&
424                 (ctxt->input->cur < ctxt->input->end)) {
425                 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
426                                 "Char 0x%X out of allowed range\n", 0);
427                 return(' ');
428             }
429             return((int) *ctxt->input->cur);
430         }
431 
432         /*
433          * Humm this is bad, do an automatic flow conversion
434          */
435         guess = htmlFindEncoding(ctxt);
436         if (guess == NULL) {
437             xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
438         } else {
439             if (ctxt->input->encoding != NULL)
440                 xmlFree((xmlChar *) ctxt->input->encoding);
441             ctxt->input->encoding = guess;
442             handler = xmlFindCharEncodingHandler((const char *) guess);
443             if (handler != NULL) {
444                 /*
445                  * Don't use UTF-8 encoder which isn't required and
446                  * can produce invalid UTF-8.
447                  */
448                 if (!xmlStrEqual(BAD_CAST handler->name, BAD_CAST "UTF-8"))
449                     xmlSwitchToEncoding(ctxt, handler);
450             } else {
451                 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
452                              "Unsupported encoding %s", guess, NULL);
453             }
454         }
455         ctxt->charset = XML_CHAR_ENCODING_UTF8;
456     }
457 
458     /*
459      * We are supposed to handle UTF8, check it's valid
460      * From rfc2044: encoding of the Unicode values on UTF-8:
461      *
462      * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
463      * 0000 0000-0000 007F   0xxxxxxx
464      * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
465      * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
466      *
467      * Check for the 0x110000 limit too
468      */
469     cur = ctxt->input->cur;
470     c = *cur;
471     if (c & 0x80) {
472         if ((c & 0x40) == 0)
473             goto encoding_error;
474         if (cur[1] == 0) {
475             xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
476             cur = ctxt->input->cur;
477         }
478         if ((cur[1] & 0xc0) != 0x80)
479             goto encoding_error;
480         if ((c & 0xe0) == 0xe0) {
481 
482             if (cur[2] == 0) {
483                 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
484                 cur = ctxt->input->cur;
485             }
486             if ((cur[2] & 0xc0) != 0x80)
487                 goto encoding_error;
488             if ((c & 0xf0) == 0xf0) {
489                 if (cur[3] == 0) {
490                     xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
491                     cur = ctxt->input->cur;
492                 }
493                 if (((c & 0xf8) != 0xf0) ||
494                     ((cur[3] & 0xc0) != 0x80))
495                     goto encoding_error;
496                 /* 4-byte code */
497                 *len = 4;
498                 val = (cur[0] & 0x7) << 18;
499                 val |= (cur[1] & 0x3f) << 12;
500                 val |= (cur[2] & 0x3f) << 6;
501                 val |= cur[3] & 0x3f;
502                 if (val < 0x10000)
503                     goto encoding_error;
504             } else {
505               /* 3-byte code */
506                 *len = 3;
507                 val = (cur[0] & 0xf) << 12;
508                 val |= (cur[1] & 0x3f) << 6;
509                 val |= cur[2] & 0x3f;
510                 if (val < 0x800)
511                     goto encoding_error;
512             }
513         } else {
514           /* 2-byte code */
515             *len = 2;
516             val = (cur[0] & 0x1f) << 6;
517             val |= cur[1] & 0x3f;
518             if (val < 0x80)
519                 goto encoding_error;
520         }
521         if (!IS_CHAR(val)) {
522             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
523                             "Char 0x%X out of allowed range\n", val);
524         }
525         return(val);
526     } else {
527         if ((*ctxt->input->cur == 0) &&
528             (ctxt->input->cur < ctxt->input->end)) {
529             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
530                             "Char 0x%X out of allowed range\n", 0);
531             *len = 1;
532             return(' ');
533         }
534         /* 1-byte code */
535         *len = 1;
536         return((int) *ctxt->input->cur);
537     }
538 
539 encoding_error:
540     /*
541      * If we detect an UTF8 error that probably mean that the
542      * input encoding didn't get properly advertised in the
543      * declaration header. Report the error and switch the encoding
544      * to ISO-Latin-1 (if you don't like this policy, just declare the
545      * encoding !)
546      */
547     {
548         char buffer[150];
549 
550 	if (ctxt->input->end - ctxt->input->cur >= 4) {
551 	    snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
552 			    ctxt->input->cur[0], ctxt->input->cur[1],
553 			    ctxt->input->cur[2], ctxt->input->cur[3]);
554 	} else {
555 	    snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
556 	}
557 	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
558 		     "Input is not proper UTF-8, indicate encoding !\n",
559 		     BAD_CAST buffer, NULL);
560     }
561 
562     /*
563      * Don't switch encodings twice. Note that if there's an encoder, we
564      * shouldn't receive invalid UTF-8 anyway.
565      *
566      * Note that if ctxt->input->buf == NULL, switching encodings is
567      * impossible, see Gitlab issue #34.
568      */
569     if ((ctxt->input->buf != NULL) &&
570         (ctxt->input->buf->encoder == NULL))
571         xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
572     *len = 1;
573     return((int) *ctxt->input->cur);
574 }
575 
576 /**
577  * htmlSkipBlankChars:
578  * @ctxt:  the HTML parser context
579  *
580  * skip all blanks character found at that point in the input streams.
581  *
582  * Returns the number of space chars skipped
583  */
584 
585 static int
htmlSkipBlankChars(xmlParserCtxtPtr ctxt)586 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
587     int res = 0;
588 
589     while (IS_BLANK_CH(*(ctxt->input->cur))) {
590 	if ((*ctxt->input->cur == 0) &&
591 	    (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
592 		xmlPopInput(ctxt);
593 	} else {
594 	    if (*(ctxt->input->cur) == '\n') {
595 		ctxt->input->line++; ctxt->input->col = 1;
596 	    } else ctxt->input->col++;
597 	    ctxt->input->cur++;
598 	    if (*ctxt->input->cur == 0)
599 		xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
600 	}
601 	if (res < INT_MAX)
602 	    res++;
603     }
604     return(res);
605 }
606 
607 
608 
609 /************************************************************************
610  *									*
611  *	The list of HTML elements and their properties		*
612  *									*
613  ************************************************************************/
614 
615 /*
616  *  Start Tag: 1 means the start tag can be omitted
617  *  End Tag:   1 means the end tag can be omitted
618  *             2 means it's forbidden (empty elements)
619  *             3 means the tag is stylistic and should be closed easily
620  *  Depr:      this element is deprecated
621  *  DTD:       1 means that this element is valid only in the Loose DTD
622  *             2 means that this element is valid only in the Frameset DTD
623  *
624  * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
625 	, subElements , impliedsubelt , Attributes, userdata
626  */
627 
628 /* Definitions and a couple of vars for HTML Elements */
629 
630 #define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
631 #define NB_FONTSTYLE 8
632 #define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
633 #define NB_PHRASE 10
634 #define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
635 #define NB_SPECIAL 16
636 #define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
637 #define NB_INLINE NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL
638 #define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
639 #define NB_BLOCK NB_HEADING + NB_LIST + 14
640 #define FORMCTRL "input", "select", "textarea", "label", "button"
641 #define NB_FORMCTRL 5
642 #define PCDATA
643 #define NB_PCDATA 0
644 #define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
645 #define NB_HEADING 6
646 #define LIST "ul", "ol", "dir", "menu"
647 #define NB_LIST 4
648 #define MODIFIER
649 #define NB_MODIFIER 0
650 #define FLOW BLOCK,INLINE
651 #define NB_FLOW NB_BLOCK + NB_INLINE
652 #define EMPTY NULL
653 
654 
655 static const char* const html_flow[] = { FLOW, NULL } ;
656 static const char* const html_inline[] = { INLINE, NULL } ;
657 
658 /* placeholders: elts with content but no subelements */
659 static const char* const html_pcdata[] = { NULL } ;
660 #define html_cdata html_pcdata
661 
662 
663 /* ... and for HTML Attributes */
664 
665 #define COREATTRS "id", "class", "style", "title"
666 #define NB_COREATTRS 4
667 #define I18N "lang", "dir"
668 #define NB_I18N 2
669 #define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
670 #define NB_EVENTS 9
671 #define ATTRS COREATTRS,I18N,EVENTS
672 #define NB_ATTRS NB_NB_COREATTRS + NB_I18N + NB_EVENTS
673 #define CELLHALIGN "align", "char", "charoff"
674 #define NB_CELLHALIGN 3
675 #define CELLVALIGN "valign"
676 #define NB_CELLVALIGN 1
677 
678 static const char* const html_attrs[] = { ATTRS, NULL } ;
679 static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
680 static const char* const core_attrs[] = { COREATTRS, NULL } ;
681 static const char* const i18n_attrs[] = { I18N, NULL } ;
682 
683 
684 /* Other declarations that should go inline ... */
685 static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
686 	"href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
687 	"tabindex", "onfocus", "onblur", NULL } ;
688 static const char* const target_attr[] = { "target", NULL } ;
689 static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
690 static const char* const alt_attr[] = { "alt", NULL } ;
691 static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
692 static const char* const href_attrs[] = { "href", NULL } ;
693 static const char* const clear_attrs[] = { "clear", NULL } ;
694 static const char* const inline_p[] = { INLINE, "p", NULL } ;
695 
696 static const char* const flow_param[] = { FLOW, "param", NULL } ;
697 static const char* const applet_attrs[] = { COREATTRS , "codebase",
698 		"archive", "alt", "name", "height", "width", "align",
699 		"hspace", "vspace", NULL } ;
700 static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
701 	"tabindex", "accesskey", "onfocus", "onblur", NULL } ;
702 static const char* const basefont_attrs[] =
703 	{ "id", "size", "color", "face", NULL } ;
704 static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
705 static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
706 static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
707 static const char* const body_depr[] = { "background", "bgcolor", "text",
708 	"link", "vlink", "alink", NULL } ;
709 static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
710 	"disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
711 
712 
713 static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
714 static const char* const col_elt[] = { "col", NULL } ;
715 static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
716 static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
717 static const char* const dl_contents[] = { "dt", "dd", NULL } ;
718 static const char* const compact_attr[] = { "compact", NULL } ;
719 static const char* const label_attr[] = { "label", NULL } ;
720 static const char* const fieldset_contents[] = { FLOW, "legend" } ;
721 static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
722 static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
723 static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
724 static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
725 static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
726 static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
727 static const char* const head_attrs[] = { I18N, "profile", NULL } ;
728 static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
729 static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
730 static const char* const version_attr[] = { "version", NULL } ;
731 static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
732 static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
733 static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
734 static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
735 static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
736 static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
737 static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
738 static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
739 static const char* const align_attr[] = { "align", NULL } ;
740 static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
741 static const char* const map_contents[] = { BLOCK, "area", NULL } ;
/*
 * NULL-terminated lists of HTML attribute and element names. They are
 * referenced (through DECL) by the html40ElementTable initializers below.
 * BLOCK, I18N, FLOW, MODIFIER, ATTRS, PHRASE, CELLHALIGN and CELLVALIGN are
 * macros defined outside this part of the file.
 */
742 static const char* const name_attr[] = { "name", NULL } ;
743 static const char* const action_attr[] = { "action", NULL } ;
744 static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
745 static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
746 static const char* const content_attr[] = { "content", NULL } ;
747 static const char* const type_attr[] = { "type", NULL } ;
748 static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
749 static const char* const object_contents[] = { FLOW, "param", NULL } ;
750 static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
751 static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
752 static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
753 static const char* const option_elt[] = { "option", NULL } ;
754 static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
755 static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
756 static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
757 static const char* const width_attr[] = { "width", NULL } ;
758 static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
759 static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
760 static const char* const language_attr[] = { "language", NULL } ;
761 static const char* const select_content[] = { "optgroup", "option", NULL } ;
762 static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
763 static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
764 static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
765 static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
766 static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
767 static const char* const tr_elt[] = { "tr", NULL } ;
768 static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
769 static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
770 static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
771 static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
772 static const char* const tr_contents[] = { "th", "td", NULL } ;
773 static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
774 static const char* const li_elt[] = { "li", NULL } ;
775 static const char* const ul_depr[] = { "type", "compact", NULL} ;
776 static const char* const dir_attr[] = { "dir", NULL} ;
777 
/*
 * DECL casts one of the "const char* const []" name lists above to
 * "const char **" (dropping the const on the array elements); it prefixes
 * every list reference in the html40ElementTable initializers below.
 */
778 #define DECL (const char**)
779 
/*
 * Descriptors for the known HTML 4.0 elements, one htmlElemDesc per tag.
 *
 * htmlTagLookup() searches this table with bsearch(), comparing the
 * requested tag against the name member case-insensitively (see
 * htmlCompareTags()), so the entries MUST stay sorted by name.
 *
 * The meaning of the numeric members and of the list pointers is defined
 * by the htmlElemDesc declaration, which is not part of this file section
 * (presumably in <libxml/HTMLparser.h> -- confirm there before editing).
 */
780 static const htmlElemDesc
781 html40ElementTable[] = {
782 { "a",		0, 0, 0, 0, 0, 0, 1, "anchor ",
783 	DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
784 },
785 { "abbr",	0, 0, 0, 0, 0, 0, 1, "abbreviated form",
786 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
787 },
788 { "acronym",	0, 0, 0, 0, 0, 0, 1, "",
789 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
790 },
791 { "address",	0, 0, 0, 0, 0, 0, 0, "information on author ",
792 	DECL inline_p  , NULL , DECL html_attrs, NULL, NULL
793 },
794 { "applet",	0, 0, 0, 0, 1, 1, 2, "java applet ",
795 	DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
796 },
797 { "area",	0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
798 	EMPTY ,  NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
799 },
800 { "b",		0, 3, 0, 0, 0, 0, 1, "bold text style",
801 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
802 },
803 { "base",	0, 2, 2, 1, 0, 0, 0, "document base uri ",
804 	EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
805 },
806 { "basefont",	0, 2, 2, 1, 1, 1, 1, "base font size " ,
807 	EMPTY , NULL , NULL, DECL basefont_attrs, NULL
808 },
809 { "bdo",	0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
810 	DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
811 },
812 { "big",	0, 3, 0, 0, 0, 0, 1, "large text style",
813 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
814 },
815 { "blockquote",	0, 0, 0, 0, 0, 0, 0, "long quotation ",
816 	DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
817 },
818 { "body",	1, 1, 0, 0, 0, 0, 0, "document body ",
819 	DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
820 },
821 { "br",		0, 2, 2, 1, 0, 0, 1, "forced line break ",
822 	EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
823 },
824 { "button",	0, 0, 0, 0, 0, 0, 2, "push button ",
825 	DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
826 },
827 { "caption",	0, 0, 0, 0, 0, 0, 0, "table caption ",
828 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
829 },
830 { "center",	0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
831 	DECL html_flow , NULL , NULL, DECL html_attrs, NULL
832 },
833 { "cite",	0, 0, 0, 0, 0, 0, 1, "citation",
834 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
835 },
836 { "code",	0, 0, 0, 0, 0, 0, 1, "computer code fragment",
837 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
838 },
839 { "col",	0, 2, 2, 1, 0, 0, 0, "table column ",
840 	EMPTY , NULL , DECL col_attrs , NULL, NULL
841 },
842 { "colgroup",	0, 1, 0, 0, 0, 0, 0, "table column group ",
843 	DECL col_elt , "col" , DECL col_attrs , NULL, NULL
844 },
845 { "dd",		0, 1, 0, 0, 0, 0, 0, "definition description ",
846 	DECL html_flow , NULL , DECL html_attrs, NULL, NULL
847 },
848 { "del",	0, 0, 0, 0, 0, 0, 2, "deleted text ",
849 	DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
850 },
851 { "dfn",	0, 0, 0, 0, 0, 0, 1, "instance definition",
852 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
853 },
854 { "dir",	0, 0, 0, 0, 1, 1, 0, "directory list",
855 	DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
856 },
857 { "div",	0, 0, 0, 0, 0, 0, 0, "generic language/style container",
858 	DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
859 },
860 { "dl",		0, 0, 0, 0, 0, 0, 0, "definition list ",
861 	DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
862 },
863 { "dt",		0, 1, 0, 0, 0, 0, 0, "definition term ",
864 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
865 },
866 { "em",		0, 3, 0, 0, 0, 0, 1, "emphasis",
867 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
868 },
869 { "embed",	0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
870 	EMPTY, NULL, DECL embed_attrs, NULL, NULL
871 },
872 { "fieldset",	0, 0, 0, 0, 0, 0, 0, "form control group ",
873 	DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
874 },
875 { "font",	0, 3, 0, 0, 1, 1, 1, "local change to font ",
876 	DECL html_inline, NULL, NULL, DECL font_attrs, NULL
877 },
878 { "form",	0, 0, 0, 0, 0, 0, 0, "interactive form ",
879 	DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
880 },
881 { "frame",	0, 2, 2, 1, 0, 2, 0, "subwindow " ,
882 	EMPTY, NULL, NULL, DECL frame_attrs, NULL
883 },
884 { "frameset",	0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
885 	DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
886 },
887 { "h1",		0, 0, 0, 0, 0, 0, 0, "heading ",
888 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
889 },
890 { "h2",		0, 0, 0, 0, 0, 0, 0, "heading ",
891 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
892 },
893 { "h3",		0, 0, 0, 0, 0, 0, 0, "heading ",
894 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
895 },
896 { "h4",		0, 0, 0, 0, 0, 0, 0, "heading ",
897 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
898 },
899 { "h5",		0, 0, 0, 0, 0, 0, 0, "heading ",
900 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
901 },
902 { "h6",		0, 0, 0, 0, 0, 0, 0, "heading ",
903 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
904 },
905 { "head",	1, 1, 0, 0, 0, 0, 0, "document head ",
906 	DECL head_contents, NULL, DECL head_attrs, NULL, NULL
907 },
908 { "hr",		0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
909 	EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
910 },
911 { "html",	1, 1, 0, 0, 0, 0, 0, "document root element ",
912 	DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
913 },
914 { "i",		0, 3, 0, 0, 0, 0, 1, "italic text style",
915 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
916 },
917 { "iframe",	0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
918 	DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
919 },
920 { "img",	0, 2, 2, 1, 0, 0, 1, "embedded image ",
921 	EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
922 },
923 { "input",	0, 2, 2, 1, 0, 0, 1, "form control ",
924 	EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
925 },
926 { "ins",	0, 0, 0, 0, 0, 0, 2, "inserted text",
927 	DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
928 },
929 { "isindex",	0, 2, 2, 1, 1, 1, 0, "single line prompt ",
930 	EMPTY, NULL, NULL, DECL prompt_attrs, NULL
931 },
932 { "kbd",	0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
933 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
934 },
935 { "label",	0, 0, 0, 0, 0, 0, 1, "form field label text ",
936 	DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
937 },
938 { "legend",	0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
939 	DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
940 },
941 { "li",		0, 1, 1, 0, 0, 0, 0, "list item ",
942 	DECL html_flow, NULL, DECL html_attrs, NULL, NULL
943 },
944 { "link",	0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
945 	EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
946 },
947 { "map",	0, 0, 0, 0, 0, 0, 2, "client-side image map ",
948 	DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
949 },
950 { "menu",	0, 0, 0, 0, 1, 1, 0, "menu list ",
951 	DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
952 },
953 { "meta",	0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
954 	EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
955 },
956 { "noframes",	0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
957 	DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
958 },
959 { "noscript",	0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
960 	DECL html_flow, "div", DECL html_attrs, NULL, NULL
961 },
962 { "object",	0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
963 	DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
964 },
965 { "ol",		0, 0, 0, 0, 0, 0, 0, "ordered list ",
966 	DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
967 },
968 { "optgroup",	0, 0, 0, 0, 0, 0, 0, "option group ",
969 	DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
970 },
971 { "option",	0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
972 	DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
973 },
974 { "p",		0, 1, 0, 0, 0, 0, 0, "paragraph ",
975 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
976 },
977 { "param",	0, 2, 2, 1, 0, 0, 0, "named property value ",
978 	EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
979 },
980 { "pre",	0, 0, 0, 0, 0, 0, 0, "preformatted text ",
981 	DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
982 },
983 { "q",		0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
984 	DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
985 },
986 { "s",		0, 3, 0, 0, 1, 1, 1, "strike-through text style",
987 	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
988 },
989 { "samp",	0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
990 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
991 },
992 { "script",	0, 0, 0, 0, 0, 0, 2, "script statements ",
993 	DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
994 },
995 { "select",	0, 0, 0, 0, 0, 0, 1, "option selector ",
996 	DECL select_content, NULL, DECL select_attrs, NULL, NULL
997 },
998 { "small",	0, 3, 0, 0, 0, 0, 1, "small text style",
999 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1000 },
1001 { "span",	0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
1002 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1003 },
1004 { "strike",	0, 3, 0, 0, 1, 1, 1, "strike-through text",
1005 	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1006 },
1007 { "strong",	0, 3, 0, 0, 0, 0, 1, "strong emphasis",
1008 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1009 },
1010 { "style",	0, 0, 0, 0, 0, 0, 0, "style info ",
1011 	DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
1012 },
1013 { "sub",	0, 3, 0, 0, 0, 0, 1, "subscript",
1014 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1015 },
1016 { "sup",	0, 3, 0, 0, 0, 0, 1, "superscript ",
1017 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1018 },
1019 { "table",	0, 0, 0, 0, 0, 0, 0, "",
1020 	DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
1021 },
1022 { "tbody",	1, 0, 0, 0, 0, 0, 0, "table body ",
1023 	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1024 },
1025 { "td",		0, 0, 0, 0, 0, 0, 0, "table data cell",
1026 	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1027 },
1028 { "textarea",	0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
1029 	DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
1030 },
1031 { "tfoot",	0, 1, 0, 0, 0, 0, 0, "table footer ",
1032 	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1033 },
1034 { "th",		0, 1, 0, 0, 0, 0, 0, "table header cell",
1035 	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1036 },
1037 { "thead",	0, 1, 0, 0, 0, 0, 0, "table header ",
1038 	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1039 },
1040 { "title",	0, 0, 0, 0, 0, 0, 0, "document title ",
1041 	DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
1042 },
1043 { "tr",		0, 0, 0, 0, 0, 0, 0, "table row ",
1044 	DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
1045 },
1046 { "tt",		0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
1047 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1048 },
1049 { "u",		0, 3, 0, 0, 1, 1, 1, "underlined text style",
1050 	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1051 },
1052 { "ul",		0, 0, 0, 0, 0, 0, 0, "unordered list ",
1053 	DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
1054 },
1055 { "var",	0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
1056 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1057 }
1058 };
1059 
/*
 * One auto-close rule. htmlCheckAutoClose() fills oldTag with the tag that
 * is currently open and newTag with the tag being started; a matching
 * entry in htmlStartClose means the new tag closes the old one.
 */
1060 typedef struct {
1061     const char *oldTag;
1062     const char *newTag;
1063 } htmlStartCloseEntry;
1064 
/*
 * NOTE: htmlCheckAutoClose() searches this table with bsearch() and
 * htmlCompareStartClose(), i.e. strcmp() on oldTag and then on newTag.
 * Entries MUST therefore stay sorted by (oldTag, newTag) in strcmp()
 * order, and lookups are case-sensitive.
 */
1065 /*
1066  * start tags that imply the end of current element
1067  */
1068 static const htmlStartCloseEntry htmlStartClose[] = {
1069     { "a", "a" },
1070     { "a", "fieldset" },
1071     { "a", "table" },
1072     { "a", "td" },
1073     { "a", "th" },
1074     { "address", "dd" },
1075     { "address", "dl" },
1076     { "address", "dt" },
1077     { "address", "form" },
1078     { "address", "li" },
1079     { "address", "ul" },
1080     { "b", "center" },
1081     { "b", "p" },
1082     { "b", "td" },
1083     { "b", "th" },
1084     { "big", "p" },
1085     { "caption", "col" },
1086     { "caption", "colgroup" },
1087     { "caption", "tbody" },
1088     { "caption", "tfoot" },
1089     { "caption", "thead" },
1090     { "caption", "tr" },
1091     { "col", "col" },
1092     { "col", "colgroup" },
1093     { "col", "tbody" },
1094     { "col", "tfoot" },
1095     { "col", "thead" },
1096     { "col", "tr" },
1097     { "colgroup", "colgroup" },
1098     { "colgroup", "tbody" },
1099     { "colgroup", "tfoot" },
1100     { "colgroup", "thead" },
1101     { "colgroup", "tr" },
1102     { "dd", "dt" },
1103     { "dir", "dd" },
1104     { "dir", "dl" },
1105     { "dir", "dt" },
1106     { "dir", "form" },
1107     { "dir", "ul" },
1108     { "dl", "form" },
1109     { "dl", "li" },
1110     { "dt", "dd" },
1111     { "dt", "dl" },
1112     { "font", "center" },
1113     { "font", "td" },
1114     { "font", "th" },
1115     { "form", "form" },
1116     { "h1", "fieldset" },
1117     { "h1", "form" },
1118     { "h1", "li" },
1119     { "h1", "p" },
1120     { "h1", "table" },
1121     { "h2", "fieldset" },
1122     { "h2", "form" },
1123     { "h2", "li" },
1124     { "h2", "p" },
1125     { "h2", "table" },
1126     { "h3", "fieldset" },
1127     { "h3", "form" },
1128     { "h3", "li" },
1129     { "h3", "p" },
1130     { "h3", "table" },
1131     { "h4", "fieldset" },
1132     { "h4", "form" },
1133     { "h4", "li" },
1134     { "h4", "p" },
1135     { "h4", "table" },
1136     { "h5", "fieldset" },
1137     { "h5", "form" },
1138     { "h5", "li" },
1139     { "h5", "p" },
1140     { "h5", "table" },
1141     { "h6", "fieldset" },
1142     { "h6", "form" },
1143     { "h6", "li" },
1144     { "h6", "p" },
1145     { "h6", "table" },
1146     { "head", "a" },
1147     { "head", "abbr" },
1148     { "head", "acronym" },
1149     { "head", "address" },
1150     { "head", "b" },
1151     { "head", "bdo" },
1152     { "head", "big" },
1153     { "head", "blockquote" },
1154     { "head", "body" },
1155     { "head", "br" },
1156     { "head", "center" },
1157     { "head", "cite" },
1158     { "head", "code" },
1159     { "head", "dd" },
1160     { "head", "dfn" },
1161     { "head", "dir" },
1162     { "head", "div" },
1163     { "head", "dl" },
1164     { "head", "dt" },
1165     { "head", "em" },
1166     { "head", "fieldset" },
1167     { "head", "font" },
1168     { "head", "form" },
1169     { "head", "frameset" },
1170     { "head", "h1" },
1171     { "head", "h2" },
1172     { "head", "h3" },
1173     { "head", "h4" },
1174     { "head", "h5" },
1175     { "head", "h6" },
1176     { "head", "hr" },
1177     { "head", "i" },
1178     { "head", "iframe" },
1179     { "head", "img" },
1180     { "head", "kbd" },
1181     { "head", "li" },
1182     { "head", "listing" },
1183     { "head", "map" },
1184     { "head", "menu" },
1185     { "head", "ol" },
1186     { "head", "p" },
1187     { "head", "pre" },
1188     { "head", "q" },
1189     { "head", "s" },
1190     { "head", "samp" },
1191     { "head", "small" },
1192     { "head", "span" },
1193     { "head", "strike" },
1194     { "head", "strong" },
1195     { "head", "sub" },
1196     { "head", "sup" },
1197     { "head", "table" },
1198     { "head", "tt" },
1199     { "head", "u" },
1200     { "head", "ul" },
1201     { "head", "var" },
1202     { "head", "xmp" },
1203     { "hr", "form" },
1204     { "i", "center" },
1205     { "i", "p" },
1206     { "i", "td" },
1207     { "i", "th" },
1208     { "legend", "fieldset" },
1209     { "li", "li" },
1210     { "link", "body" },
1211     { "link", "frameset" },
1212     { "listing", "dd" },
1213     { "listing", "dl" },
1214     { "listing", "dt" },
1215     { "listing", "fieldset" },
1216     { "listing", "form" },
1217     { "listing", "li" },
1218     { "listing", "table" },
1219     { "listing", "ul" },
1220     { "menu", "dd" },
1221     { "menu", "dl" },
1222     { "menu", "dt" },
1223     { "menu", "form" },
1224     { "menu", "ul" },
1225     { "ol", "form" },
1226     { "ol", "ul" },
1227     { "option", "optgroup" },
1228     { "option", "option" },
1229     { "p", "address" },
1230     { "p", "blockquote" },
1231     { "p", "body" },
1232     { "p", "caption" },
1233     { "p", "center" },
1234     { "p", "col" },
1235     { "p", "colgroup" },
1236     { "p", "dd" },
1237     { "p", "dir" },
1238     { "p", "div" },
1239     { "p", "dl" },
1240     { "p", "dt" },
1241     { "p", "fieldset" },
1242     { "p", "form" },
1243     { "p", "frameset" },
1244     { "p", "h1" },
1245     { "p", "h2" },
1246     { "p", "h3" },
1247     { "p", "h4" },
1248     { "p", "h5" },
1249     { "p", "h6" },
1250     { "p", "head" },
1251     { "p", "hr" },
1252     { "p", "li" },
1253     { "p", "listing" },
1254     { "p", "menu" },
1255     { "p", "ol" },
1256     { "p", "p" },
1257     { "p", "pre" },
1258     { "p", "table" },
1259     { "p", "tbody" },
1260     { "p", "td" },
1261     { "p", "tfoot" },
1262     { "p", "th" },
1263     { "p", "title" },
1264     { "p", "tr" },
1265     { "p", "ul" },
1266     { "p", "xmp" },
1267     { "pre", "dd" },
1268     { "pre", "dl" },
1269     { "pre", "dt" },
1270     { "pre", "fieldset" },
1271     { "pre", "form" },
1272     { "pre", "li" },
1273     { "pre", "table" },
1274     { "pre", "ul" },
1275     { "s", "p" },
1276     { "script", "noscript" },
1277     { "small", "p" },
1278     { "span", "td" },
1279     { "span", "th" },
1280     { "strike", "p" },
1281     { "style", "body" },
1282     { "style", "frameset" },
1283     { "tbody", "tbody" },
1284     { "tbody", "tfoot" },
1285     { "td", "tbody" },
1286     { "td", "td" },
1287     { "td", "tfoot" },
1288     { "td", "th" },
1289     { "td", "tr" },
1290     { "tfoot", "tbody" },
1291     { "th", "tbody" },
1292     { "th", "td" },
1293     { "th", "tfoot" },
1294     { "th", "th" },
1295     { "th", "tr" },
1296     { "thead", "tbody" },
1297     { "thead", "tfoot" },
1298     { "title", "body" },
1299     { "title", "frameset" },
1300     { "tr", "tbody" },
1301     { "tr", "tfoot" },
1302     { "tr", "tr" },
1303     { "tt", "p" },
1304     { "u", "p" },
1305     { "u", "td" },
1306     { "u", "th" },
1307     { "ul", "address" },
1308     { "ul", "form" },
1309     { "ul", "menu" },
1310     { "ul", "ol" },
1311     { "ul", "pre" },
1312     { "xmp", "dd" },
1313     { "xmp", "dl" },
1314     { "xmp", "dt" },
1315     { "xmp", "fieldset" },
1316     { "xmp", "form" },
1317     { "xmp", "li" },
1318     { "xmp", "table" },
1319     { "xmp", "ul" }
1320 };
1321 
1322 /*
1323  * The list of HTML elements which are supposed not to have
1324  * CDATA content and where a p element will be implied
1325  *
1326  * TODO: extend that list by reading the HTML SGML DTD on
1327  *       implied paragraph
1328  */
/* The list is terminated by a NULL entry. */
1329 static const char *const htmlNoContentElements[] = {
1330     "html",
1331     "head",
1332     NULL
1333 };
1334 
1335 /*
1336  * The list of HTML attributes which are of content %Script;
1337  * NOTE: when adding ones, check htmlIsScriptAttribute() since
1338  *       it assumes the name starts with 'on'
1339  */
/*
 * Unlike the neighbouring name lists, this array has no NULL terminator,
 * so code walking it has to derive the element count itself (presumably
 * via sizeof in htmlIsScriptAttribute(), which is not shown here).
 */
1340 static const char *const htmlScriptAttributes[] = {
1341     "onclick",
1342     "ondblclick",
1343     "onmousedown",
1344     "onmouseup",
1345     "onmouseover",
1346     "onmousemove",
1347     "onmouseout",
1348     "onkeypress",
1349     "onkeydown",
1350     "onkeyup",
1351     "onload",
1352     "onunload",
1353     "onfocus",
1354     "onblur",
1355     "onsubmit",
1356     "onreset",
1357     "onchange",
1358     "onselect"
1359 };
1360 
1361 /*
1362  * This table is used by the htmlparser to know what to do with
1363  * broken html pages. By assigning different priorities to different
1364  * elements the parser can decide how to handle extra endtags.
1365  * Endtags are only allowed to close elements with lower or equal
1366  * priority.
1367  */
1368 
/* Pairs an element name with its end-tag priority. */
1369 typedef struct {
1370     const char *name;
1371     int priority;
1372 } elementPriority;
1373 
/*
 * Scanned linearly by htmlGetEndPriority(). The final entry with a NULL
 * name is a sentinel: the scan stops there and its priority (100) is what
 * every element not listed above receives.
 */
1374 static const elementPriority htmlEndPriority[] = {
1375     {"div",   150},
1376     {"td",    160},
1377     {"th",    160},
1378     {"tr",    170},
1379     {"thead", 180},
1380     {"tbody", 180},
1381     {"tfoot", 180},
1382     {"table", 190},
1383     {"head",  200},
1384     {"body",  200},
1385     {"html",  220},
1386     {NULL,    100} /* Default priority */
1387 };
1388 
1389 /************************************************************************
1390  *									*
1391  *	functions to handle HTML specific data			*
1392  *									*
1393  ************************************************************************/
1394 
/**
 * htmlInitAutoClose:
 *
 * DEPRECATED: This function will be made private. Call xmlInitParser to
 * initialize the library.
 *
 * Kept only so existing callers still link; it performs no work.
 */
void
htmlInitAutoClose(void)
{
    /* Intentionally empty. */
}
1406 
1407 static int
htmlCompareTags(const void * key,const void * member)1408 htmlCompareTags(const void *key, const void *member) {
1409     const xmlChar *tag = (const xmlChar *) key;
1410     const htmlElemDesc *desc = (const htmlElemDesc *) member;
1411 
1412     return(xmlStrcasecmp(tag, BAD_CAST desc->name));
1413 }
1414 
1415 /**
1416  * htmlTagLookup:
1417  * @tag:  The tag name in lowercase
1418  *
1419  * Lookup the HTML tag in the ElementTable
1420  *
1421  * Returns the related htmlElemDescPtr or NULL if not found.
1422  */
1423 const htmlElemDesc *
htmlTagLookup(const xmlChar * tag)1424 htmlTagLookup(const xmlChar *tag) {
1425     if (tag == NULL)
1426         return(NULL);
1427 
1428     return((const htmlElemDesc *) bsearch(tag, html40ElementTable,
1429                 sizeof(html40ElementTable) / sizeof(htmlElemDesc),
1430                 sizeof(htmlElemDesc), htmlCompareTags));
1431 }
1432 
1433 /**
1434  * htmlGetEndPriority:
1435  * @name: The name of the element to look up the priority for.
1436  *
1437  * Return value: The "endtag" priority.
1438  **/
1439 static int
htmlGetEndPriority(const xmlChar * name)1440 htmlGetEndPriority (const xmlChar *name) {
1441     int i = 0;
1442 
1443     while ((htmlEndPriority[i].name != NULL) &&
1444 	   (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1445 	i++;
1446 
1447     return(htmlEndPriority[i].priority);
1448 }
1449 
1450 
1451 static int
htmlCompareStartClose(const void * vkey,const void * member)1452 htmlCompareStartClose(const void *vkey, const void *member) {
1453     const htmlStartCloseEntry *key = (const htmlStartCloseEntry *) vkey;
1454     const htmlStartCloseEntry *entry = (const htmlStartCloseEntry *) member;
1455     int ret;
1456 
1457     ret = strcmp(key->oldTag, entry->oldTag);
1458     if (ret == 0)
1459         ret = strcmp(key->newTag, entry->newTag);
1460 
1461     return(ret);
1462 }
1463 
1464 /**
1465  * htmlCheckAutoClose:
1466  * @newtag:  The new tag name
1467  * @oldtag:  The old tag name
1468  *
1469  * Checks whether the new tag is one of the registered valid tags for
1470  * closing old.
1471  *
1472  * Returns 0 if no, 1 if yes.
1473  */
1474 static int
htmlCheckAutoClose(const xmlChar * newtag,const xmlChar * oldtag)1475 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1476 {
1477     htmlStartCloseEntry key;
1478     void *res;
1479 
1480     key.oldTag = (const char *) oldtag;
1481     key.newTag = (const char *) newtag;
1482     res = bsearch(&key, htmlStartClose,
1483             sizeof(htmlStartClose) / sizeof(htmlStartCloseEntry),
1484             sizeof(htmlStartCloseEntry), htmlCompareStartClose);
1485     return(res != NULL);
1486 }
1487 
1488 /**
1489  * htmlAutoCloseOnClose:
1490  * @ctxt:  an HTML parser context
1491  * @newtag:  The new tag name
1492  * @force:  force the tag closure
1493  *
1494  * The HTML DTD allows an ending tag to implicitly close other tags.
1495  */
1496 static void
htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1497 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1498 {
1499     const htmlElemDesc *info;
1500     int i, priority;
1501 
1502     priority = htmlGetEndPriority(newtag);
1503 
1504     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1505 
1506         if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1507             break;
1508         /*
1509          * A misplaced endtag can only close elements with lower
1510          * or equal priority, so if we find an element with higher
1511          * priority before we find an element with
1512          * matching name, we just ignore this endtag
1513          */
1514         if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1515             return;
1516     }
1517     if (i < 0)
1518         return;
1519 
1520     while (!xmlStrEqual(newtag, ctxt->name)) {
1521         info = htmlTagLookup(ctxt->name);
1522         if ((info != NULL) && (info->endTag == 3)) {
1523             htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1524 	                 "Opening and ending tag mismatch: %s and %s\n",
1525 			 newtag, ctxt->name);
1526         }
1527         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1528             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1529 	htmlnamePop(ctxt);
1530     }
1531 }
1532 
1533 /**
1534  * htmlAutoCloseOnEnd:
1535  * @ctxt:  an HTML parser context
1536  *
1537  * Close all remaining tags at the end of the stream
1538  */
1539 static void
htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)1540 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1541 {
1542     int i;
1543 
1544     if (ctxt->nameNr == 0)
1545         return;
1546     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1547         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1548             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1549 	htmlnamePop(ctxt);
1550     }
1551 }
1552 
1553 /**
1554  * htmlAutoClose:
1555  * @ctxt:  an HTML parser context
1556  * @newtag:  The new tag name or NULL
1557  *
1558  * The HTML DTD allows a tag to implicitly close other tags.
1559  * The list is kept in htmlStartClose array. This function is
1560  * called when a new tag has been detected and generates the
1561  * appropriates closes if possible/needed.
1562  * If newtag is NULL this mean we are at the end of the resource
1563  * and we should check
1564  */
1565 static void
htmlAutoClose(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1566 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1567 {
1568     while ((newtag != NULL) && (ctxt->name != NULL) &&
1569            (htmlCheckAutoClose(newtag, ctxt->name))) {
1570         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1571             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1572 	htmlnamePop(ctxt);
1573     }
1574     if (newtag == NULL) {
1575         htmlAutoCloseOnEnd(ctxt);
1576         return;
1577     }
1578     while ((newtag == NULL) && (ctxt->name != NULL) &&
1579            ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1580             (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1581             (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1582         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1583             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1584 	htmlnamePop(ctxt);
1585     }
1586 }
1587 
1588 /**
1589  * htmlAutoCloseTag:
1590  * @doc:  the HTML document
1591  * @name:  The tag name
1592  * @elem:  the HTML element
1593  *
1594  * The HTML DTD allows a tag to implicitly close other tags.
1595  * The list is kept in htmlStartClose array. This function checks
1596  * if the element or one of it's children would autoclose the
1597  * given tag.
1598  *
1599  * Returns 1 if autoclose, 0 otherwise
1600  */
1601 int
htmlAutoCloseTag(htmlDocPtr doc,const xmlChar * name,htmlNodePtr elem)1602 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1603     htmlNodePtr child;
1604 
1605     if (elem == NULL) return(1);
1606     if (xmlStrEqual(name, elem->name)) return(0);
1607     if (htmlCheckAutoClose(elem->name, name)) return(1);
1608     child = elem->children;
1609     while (child != NULL) {
1610         if (htmlAutoCloseTag(doc, name, child)) return(1);
1611 	child = child->next;
1612     }
1613     return(0);
1614 }
1615 
1616 /**
1617  * htmlIsAutoClosed:
1618  * @doc:  the HTML document
1619  * @elem:  the HTML element
1620  *
1621  * The HTML DTD allows a tag to implicitly close other tags.
1622  * The list is kept in htmlStartClose array. This function checks
1623  * if a tag is autoclosed by one of it's child
1624  *
1625  * Returns 1 if autoclosed, 0 otherwise
1626  */
1627 int
htmlIsAutoClosed(htmlDocPtr doc,htmlNodePtr elem)1628 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1629     htmlNodePtr child;
1630 
1631     if (elem == NULL) return(1);
1632     child = elem->children;
1633     while (child != NULL) {
1634 	if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1635 	child = child->next;
1636     }
1637     return(0);
1638 }
1639 
1640 /**
1641  * htmlCheckImplied:
1642  * @ctxt:  an HTML parser context
1643  * @newtag:  The new tag name
1644  *
1645  * The HTML DTD allows a tag to exists only implicitly
1646  * called when a new tag has been detected and generates the
1647  * appropriates implicit tags if missing
1648  */
1649 static void
htmlCheckImplied(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1650 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1651     int i;
1652 
1653     if (ctxt->options & HTML_PARSE_NOIMPLIED)
1654         return;
1655     if (!htmlOmittedDefaultValue)
1656 	return;
1657     if (xmlStrEqual(newtag, BAD_CAST"html"))
1658 	return;
1659     if (ctxt->nameNr <= 0) {
1660 	htmlnamePush(ctxt, BAD_CAST"html");
1661 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1662 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1663     }
1664     if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1665         return;
1666     if ((ctxt->nameNr <= 1) &&
1667         ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1668 	 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1669 	 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1670 	 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1671 	 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1672 	 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1673         if (ctxt->html >= 3) {
1674             /* we already saw or generated an <head> before */
1675             return;
1676         }
1677         /*
1678          * dropped OBJECT ... i you put it first BODY will be
1679          * assumed !
1680          */
1681         htmlnamePush(ctxt, BAD_CAST"head");
1682         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1683             ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1684     } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1685 	       (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1686 	       (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1687         if (ctxt->html >= 10) {
1688             /* we already saw or generated a <body> before */
1689             return;
1690         }
1691 	for (i = 0;i < ctxt->nameNr;i++) {
1692 	    if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1693 		return;
1694 	    }
1695 	    if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1696 		return;
1697 	    }
1698 	}
1699 
1700 	htmlnamePush(ctxt, BAD_CAST"body");
1701 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1702 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1703     }
1704 }
1705 
1706 /**
1707  * htmlCheckParagraph
1708  * @ctxt:  an HTML parser context
1709  *
1710  * Check whether a p element need to be implied before inserting
1711  * characters in the current element.
1712  *
1713  * Returns 1 if a paragraph has been inserted, 0 if not and -1
1714  *         in case of error.
1715  */
1716 
1717 static int
htmlCheckParagraph(htmlParserCtxtPtr ctxt)1718 htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1719     const xmlChar *tag;
1720     int i;
1721 
1722     if (ctxt == NULL)
1723 	return(-1);
1724     tag = ctxt->name;
1725     if (tag == NULL) {
1726 	htmlAutoClose(ctxt, BAD_CAST"p");
1727 	htmlCheckImplied(ctxt, BAD_CAST"p");
1728 	htmlnamePush(ctxt, BAD_CAST"p");
1729 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1730 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1731 	return(1);
1732     }
1733     if (!htmlOmittedDefaultValue)
1734 	return(0);
1735     for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1736 	if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1737 	    htmlAutoClose(ctxt, BAD_CAST"p");
1738 	    htmlCheckImplied(ctxt, BAD_CAST"p");
1739 	    htmlnamePush(ctxt, BAD_CAST"p");
1740 	    if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1741 		ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1742 	    return(1);
1743 	}
1744     }
1745     return(0);
1746 }
1747 
1748 /**
1749  * htmlIsScriptAttribute:
1750  * @name:  an attribute name
1751  *
1752  * Check if an attribute is of content type Script
1753  *
1754  * Returns 1 is the attribute is a script 0 otherwise
1755  */
1756 int
htmlIsScriptAttribute(const xmlChar * name)1757 htmlIsScriptAttribute(const xmlChar *name) {
1758     unsigned int i;
1759 
1760     if (name == NULL)
1761       return(0);
1762     /*
1763      * all script attributes start with 'on'
1764      */
1765     if ((name[0] != 'o') || (name[1] != 'n'))
1766       return(0);
1767     for (i = 0;
1768 	 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1769 	 i++) {
1770 	if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1771 	    return(1);
1772     }
1773     return(0);
1774 }
1775 
1776 /************************************************************************
1777  *									*
1778  *	The list of HTML predefined entities			*
1779  *									*
1780  ************************************************************************/
1781 
1782 
/*
 * Table of the predefined HTML 4.0 character entities. Each row holds the
 * character value, the entity name and a human readable description.
 *
 * Rows must stay sorted by ascending value: htmlEntityValueLookup()
 * depends on that ordering. htmlEntityLookup() searches the rows by name.
 */
1783 static const htmlEntityDesc  html40EntitiesTable[] = {
1784 /*
1785  * the 4 absolute ones, plus apostrophe.
1786  */
1787 { 34,	"quot",	"quotation mark = APL quote, U+0022 ISOnum" },
1788 { 38,	"amp",	"ampersand, U+0026 ISOnum" },
1789 { 39,	"apos",	"single quote" },
1790 { 60,	"lt",	"less-than sign, U+003C ISOnum" },
1791 { 62,	"gt",	"greater-than sign, U+003E ISOnum" },
1792 
1793 /*
1794  * A bunch still in the 128-255 range
1795  * Replacing them depend really on the charset used.
1796  */
1797 { 160,	"nbsp",	"no-break space = non-breaking space, U+00A0 ISOnum" },
1798 { 161,	"iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1799 { 162,	"cent",	"cent sign, U+00A2 ISOnum" },
1800 { 163,	"pound","pound sign, U+00A3 ISOnum" },
1801 { 164,	"curren","currency sign, U+00A4 ISOnum" },
1802 { 165,	"yen",	"yen sign = yuan sign, U+00A5 ISOnum" },
1803 { 166,	"brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1804 { 167,	"sect",	"section sign, U+00A7 ISOnum" },
1805 { 168,	"uml",	"diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1806 { 169,	"copy",	"copyright sign, U+00A9 ISOnum" },
1807 { 170,	"ordf",	"feminine ordinal indicator, U+00AA ISOnum" },
1808 { 171,	"laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1809 { 172,	"not",	"not sign, U+00AC ISOnum" },
1810 { 173,	"shy",	"soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1811 { 174,	"reg",	"registered sign = registered trade mark sign, U+00AE ISOnum" },
1812 { 175,	"macr",	"macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1813 { 176,	"deg",	"degree sign, U+00B0 ISOnum" },
1814 { 177,	"plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1815 { 178,	"sup2",	"superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1816 { 179,	"sup3",	"superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1817 { 180,	"acute","acute accent = spacing acute, U+00B4 ISOdia" },
1818 { 181,	"micro","micro sign, U+00B5 ISOnum" },
1819 { 182,	"para",	"pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1820 { 183,	"middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1821 { 184,	"cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1822 { 185,	"sup1",	"superscript one = superscript digit one, U+00B9 ISOnum" },
1823 { 186,	"ordm",	"masculine ordinal indicator, U+00BA ISOnum" },
1824 { 187,	"raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1825 { 188,	"frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1826 { 189,	"frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1827 { 190,	"frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1828 { 191,	"iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1829 { 192,	"Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1830 { 193,	"Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1831 { 194,	"Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1832 { 195,	"Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1833 { 196,	"Auml",	"latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1834 { 197,	"Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1835 { 198,	"AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1836 { 199,	"Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1837 { 200,	"Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1838 { 201,	"Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1839 { 202,	"Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1840 { 203,	"Euml",	"latin capital letter E with diaeresis, U+00CB ISOlat1" },
1841 { 204,	"Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1842 { 205,	"Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1843 { 206,	"Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1844 { 207,	"Iuml",	"latin capital letter I with diaeresis, U+00CF ISOlat1" },
1845 { 208,	"ETH",	"latin capital letter ETH, U+00D0 ISOlat1" },
1846 { 209,	"Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1847 { 210,	"Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1848 { 211,	"Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1849 { 212,	"Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1850 { 213,	"Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1851 { 214,	"Ouml",	"latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1852 { 215,	"times","multiplication sign, U+00D7 ISOnum" },
1853 { 216,	"Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1854 { 217,	"Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1855 { 218,	"Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1856 { 219,	"Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1857 { 220,	"Uuml",	"latin capital letter U with diaeresis, U+00DC ISOlat1" },
1858 { 221,	"Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1859 { 222,	"THORN","latin capital letter THORN, U+00DE ISOlat1" },
1860 { 223,	"szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1861 { 224,	"agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1862 { 225,	"aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1863 { 226,	"acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1864 { 227,	"atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1865 { 228,	"auml",	"latin small letter a with diaeresis, U+00E4 ISOlat1" },
1866 { 229,	"aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1867 { 230,	"aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1868 { 231,	"ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1869 { 232,	"egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1870 { 233,	"eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1871 { 234,	"ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1872 { 235,	"euml",	"latin small letter e with diaeresis, U+00EB ISOlat1" },
1873 { 236,	"igrave","latin small letter i with grave, U+00EC ISOlat1" },
1874 { 237,	"iacute","latin small letter i with acute, U+00ED ISOlat1" },
1875 { 238,	"icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1876 { 239,	"iuml",	"latin small letter i with diaeresis, U+00EF ISOlat1" },
1877 { 240,	"eth",	"latin small letter eth, U+00F0 ISOlat1" },
1878 { 241,	"ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1879 { 242,	"ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1880 { 243,	"oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1881 { 244,	"ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1882 { 245,	"otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1883 { 246,	"ouml",	"latin small letter o with diaeresis, U+00F6 ISOlat1" },
1884 { 247,	"divide","division sign, U+00F7 ISOnum" },
1885 { 248,	"oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1886 { 249,	"ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1887 { 250,	"uacute","latin small letter u with acute, U+00FA ISOlat1" },
1888 { 251,	"ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1889 { 252,	"uuml",	"latin small letter u with diaeresis, U+00FC ISOlat1" },
1890 { 253,	"yacute","latin small letter y with acute, U+00FD ISOlat1" },
1891 { 254,	"thorn","latin small letter thorn with, U+00FE ISOlat1" },
1892 { 255,	"yuml",	"latin small letter y with diaeresis, U+00FF ISOlat1" },
1893 
1894 { 338,	"OElig","latin capital ligature OE, U+0152 ISOlat2" },
1895 { 339,	"oelig","latin small ligature oe, U+0153 ISOlat2" },
1896 { 352,	"Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1897 { 353,	"scaron","latin small letter s with caron, U+0161 ISOlat2" },
1898 { 376,	"Yuml",	"latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1899 
1900 /*
1901  * Anything below should really be kept as entities references
1902  */
1903 { 402,	"fnof",	"latin small f with hook = function = florin, U+0192 ISOtech" },
1904 
1905 { 710,	"circ",	"modifier letter circumflex accent, U+02C6 ISOpub" },
1906 { 732,	"tilde","small tilde, U+02DC ISOdia" },
1907 
1908 { 913,	"Alpha","greek capital letter alpha, U+0391" },
1909 { 914,	"Beta",	"greek capital letter beta, U+0392" },
1910 { 915,	"Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1911 { 916,	"Delta","greek capital letter delta, U+0394 ISOgrk3" },
1912 { 917,	"Epsilon","greek capital letter epsilon, U+0395" },
1913 { 918,	"Zeta",	"greek capital letter zeta, U+0396" },
1914 { 919,	"Eta",	"greek capital letter eta, U+0397" },
1915 { 920,	"Theta","greek capital letter theta, U+0398 ISOgrk3" },
1916 { 921,	"Iota",	"greek capital letter iota, U+0399" },
1917 { 922,	"Kappa","greek capital letter kappa, U+039A" },
1918 { 923,	"Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1919 { 924,	"Mu",	"greek capital letter mu, U+039C" },
1920 { 925,	"Nu",	"greek capital letter nu, U+039D" },
1921 { 926,	"Xi",	"greek capital letter xi, U+039E ISOgrk3" },
1922 { 927,	"Omicron","greek capital letter omicron, U+039F" },
1923 { 928,	"Pi",	"greek capital letter pi, U+03A0 ISOgrk3" },
1924 { 929,	"Rho",	"greek capital letter rho, U+03A1" },
1925 { 931,	"Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1926 { 932,	"Tau",	"greek capital letter tau, U+03A4" },
1927 { 933,	"Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1928 { 934,	"Phi",	"greek capital letter phi, U+03A6 ISOgrk3" },
1929 { 935,	"Chi",	"greek capital letter chi, U+03A7" },
1930 { 936,	"Psi",	"greek capital letter psi, U+03A8 ISOgrk3" },
1931 { 937,	"Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1932 
1933 { 945,	"alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1934 { 946,	"beta",	"greek small letter beta, U+03B2 ISOgrk3" },
1935 { 947,	"gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1936 { 948,	"delta","greek small letter delta, U+03B4 ISOgrk3" },
1937 { 949,	"epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1938 { 950,	"zeta",	"greek small letter zeta, U+03B6 ISOgrk3" },
1939 { 951,	"eta",	"greek small letter eta, U+03B7 ISOgrk3" },
1940 { 952,	"theta","greek small letter theta, U+03B8 ISOgrk3" },
1941 { 953,	"iota",	"greek small letter iota, U+03B9 ISOgrk3" },
1942 { 954,	"kappa","greek small letter kappa, U+03BA ISOgrk3" },
1943 { 955,	"lambda","greek small letter lambda, U+03BB ISOgrk3" },
1944 { 956,	"mu",	"greek small letter mu, U+03BC ISOgrk3" },
1945 { 957,	"nu",	"greek small letter nu, U+03BD ISOgrk3" },
1946 { 958,	"xi",	"greek small letter xi, U+03BE ISOgrk3" },
1947 { 959,	"omicron","greek small letter omicron, U+03BF NEW" },
1948 { 960,	"pi",	"greek small letter pi, U+03C0 ISOgrk3" },
1949 { 961,	"rho",	"greek small letter rho, U+03C1 ISOgrk3" },
1950 { 962,	"sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1951 { 963,	"sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1952 { 964,	"tau",	"greek small letter tau, U+03C4 ISOgrk3" },
1953 { 965,	"upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1954 { 966,	"phi",	"greek small letter phi, U+03C6 ISOgrk3" },
1955 { 967,	"chi",	"greek small letter chi, U+03C7 ISOgrk3" },
1956 { 968,	"psi",	"greek small letter psi, U+03C8 ISOgrk3" },
1957 { 969,	"omega","greek small letter omega, U+03C9 ISOgrk3" },
1958 { 977,	"thetasym","greek small letter theta symbol, U+03D1 NEW" },
1959 { 978,	"upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1960 { 982,	"piv",	"greek pi symbol, U+03D6 ISOgrk3" },
1961 
1962 { 8194,	"ensp",	"en space, U+2002 ISOpub" },
1963 { 8195,	"emsp",	"em space, U+2003 ISOpub" },
1964 { 8201,	"thinsp","thin space, U+2009 ISOpub" },
1965 { 8204,	"zwnj",	"zero width non-joiner, U+200C NEW RFC 2070" },
1966 { 8205,	"zwj",	"zero width joiner, U+200D NEW RFC 2070" },
1967 { 8206,	"lrm",	"left-to-right mark, U+200E NEW RFC 2070" },
1968 { 8207,	"rlm",	"right-to-left mark, U+200F NEW RFC 2070" },
1969 { 8211,	"ndash","en dash, U+2013 ISOpub" },
1970 { 8212,	"mdash","em dash, U+2014 ISOpub" },
1971 { 8216,	"lsquo","left single quotation mark, U+2018 ISOnum" },
1972 { 8217,	"rsquo","right single quotation mark, U+2019 ISOnum" },
1973 { 8218,	"sbquo","single low-9 quotation mark, U+201A NEW" },
1974 { 8220,	"ldquo","left double quotation mark, U+201C ISOnum" },
1975 { 8221,	"rdquo","right double quotation mark, U+201D ISOnum" },
1976 { 8222,	"bdquo","double low-9 quotation mark, U+201E NEW" },
1977 { 8224,	"dagger","dagger, U+2020 ISOpub" },
1978 { 8225,	"Dagger","double dagger, U+2021 ISOpub" },
1979 
1980 { 8226,	"bull",	"bullet = black small circle, U+2022 ISOpub" },
1981 { 8230,	"hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1982 
1983 { 8240,	"permil","per mille sign, U+2030 ISOtech" },
1984 
1985 { 8242,	"prime","prime = minutes = feet, U+2032 ISOtech" },
1986 { 8243,	"Prime","double prime = seconds = inches, U+2033 ISOtech" },
1987 
1988 { 8249,	"lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1989 { 8250,	"rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1990 
1991 { 8254,	"oline","overline = spacing overscore, U+203E NEW" },
1992 { 8260,	"frasl","fraction slash, U+2044 NEW" },
1993 
1994 { 8364,	"euro",	"euro sign, U+20AC NEW" },
1995 
1996 { 8465,	"image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1997 { 8472,	"weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1998 { 8476,	"real",	"blackletter capital R = real part symbol, U+211C ISOamso" },
1999 { 8482,	"trade","trade mark sign, U+2122 ISOnum" },
2000 { 8501,	"alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
2001 { 8592,	"larr",	"leftwards arrow, U+2190 ISOnum" },
2002 { 8593,	"uarr",	"upwards arrow, U+2191 ISOnum" },
2003 { 8594,	"rarr",	"rightwards arrow, U+2192 ISOnum" },
2004 { 8595,	"darr",	"downwards arrow, U+2193 ISOnum" },
2005 { 8596,	"harr",	"left right arrow, U+2194 ISOamsa" },
2006 { 8629,	"crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
2007 { 8656,	"lArr",	"leftwards double arrow, U+21D0 ISOtech" },
2008 { 8657,	"uArr",	"upwards double arrow, U+21D1 ISOamsa" },
2009 { 8658,	"rArr",	"rightwards double arrow, U+21D2 ISOtech" },
2010 { 8659,	"dArr",	"downwards double arrow, U+21D3 ISOamsa" },
2011 { 8660,	"hArr",	"left right double arrow, U+21D4 ISOamsa" },
2012 
2013 { 8704,	"forall","for all, U+2200 ISOtech" },
2014 { 8706,	"part",	"partial differential, U+2202 ISOtech" },
2015 { 8707,	"exist","there exists, U+2203 ISOtech" },
2016 { 8709,	"empty","empty set = null set = diameter, U+2205 ISOamso" },
2017 { 8711,	"nabla","nabla = backward difference, U+2207 ISOtech" },
2018 { 8712,	"isin",	"element of, U+2208 ISOtech" },
2019 { 8713,	"notin","not an element of, U+2209 ISOtech" },
2020 { 8715,	"ni",	"contains as member, U+220B ISOtech" },
2021 { 8719,	"prod",	"n-ary product = product sign, U+220F ISOamsb" },
2022 { 8721,	"sum",	"n-ary summation, U+2211 ISOamsb" },
2023 { 8722,	"minus","minus sign, U+2212 ISOtech" },
2024 { 8727,	"lowast","asterisk operator, U+2217 ISOtech" },
2025 { 8730,	"radic","square root = radical sign, U+221A ISOtech" },
2026 { 8733,	"prop",	"proportional to, U+221D ISOtech" },
2027 { 8734,	"infin","infinity, U+221E ISOtech" },
2028 { 8736,	"ang",	"angle, U+2220 ISOamso" },
2029 { 8743,	"and",	"logical and = wedge, U+2227 ISOtech" },
2030 { 8744,	"or",	"logical or = vee, U+2228 ISOtech" },
2031 { 8745,	"cap",	"intersection = cap, U+2229 ISOtech" },
2032 { 8746,	"cup",	"union = cup, U+222A ISOtech" },
2033 { 8747,	"int",	"integral, U+222B ISOtech" },
2034 { 8756,	"there4","therefore, U+2234 ISOtech" },
2035 { 8764,	"sim",	"tilde operator = varies with = similar to, U+223C ISOtech" },
2036 { 8773,	"cong",	"approximately equal to, U+2245 ISOtech" },
2037 { 8776,	"asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
2038 { 8800,	"ne",	"not equal to, U+2260 ISOtech" },
2039 { 8801,	"equiv","identical to, U+2261 ISOtech" },
2040 { 8804,	"le",	"less-than or equal to, U+2264 ISOtech" },
2041 { 8805,	"ge",	"greater-than or equal to, U+2265 ISOtech" },
2042 { 8834,	"sub",	"subset of, U+2282 ISOtech" },
2043 { 8835,	"sup",	"superset of, U+2283 ISOtech" },
2044 { 8836,	"nsub",	"not a subset of, U+2284 ISOamsn" },
2045 { 8838,	"sube",	"subset of or equal to, U+2286 ISOtech" },
2046 { 8839,	"supe",	"superset of or equal to, U+2287 ISOtech" },
2047 { 8853,	"oplus","circled plus = direct sum, U+2295 ISOamsb" },
2048 { 8855,	"otimes","circled times = vector product, U+2297 ISOamsb" },
2049 { 8869,	"perp",	"up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
2050 { 8901,	"sdot",	"dot operator, U+22C5 ISOamsb" },
2051 { 8968,	"lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
2052 { 8969,	"rceil","right ceiling, U+2309 ISOamsc" },
2053 { 8970,	"lfloor","left floor = apl downstile, U+230A ISOamsc" },
2054 { 8971,	"rfloor","right floor, U+230B ISOamsc" },
2055 { 9001,	"lang",	"left-pointing angle bracket = bra, U+2329 ISOtech" },
2056 { 9002,	"rang",	"right-pointing angle bracket = ket, U+232A ISOtech" },
2057 { 9674,	"loz",	"lozenge, U+25CA ISOpub" },
2058 
2059 { 9824,	"spades","black spade suit, U+2660 ISOpub" },
2060 { 9827,	"clubs","black club suit = shamrock, U+2663 ISOpub" },
2061 { 9829,	"hearts","black heart suit = valentine, U+2665 ISOpub" },
2062 { 9830,	"diams","black diamond suit, U+2666 ISOpub" },
2063 
2064 };
2065 
2066 /************************************************************************
2067  *									*
2068  *		Commodity functions to handle entities			*
2069  *									*
2070  ************************************************************************/
2071 
2072 /*
 * Macro used to grow the current buffer.
 *
 * Doubles <buffer>_size and then reallocates <buffer> to that many
 * xmlChar. The size is doubled before the reallocation is attempted.
 *
 * The expansion relies on names from the call site: a variable called
 * `ctxt` (passed to htmlErrMemory) and the companion size variable
 * `<buffer>_size` must both be in scope.
 *
 * On reallocation failure the error is reported, the old buffer is freed
 * and the *enclosing function* returns NULL, so the macro hides a return
 * statement.
 *
 * NOTE(review): the body is a bare { } block rather than do { } while (0),
 * so `growBuffer(x);` between an `if` and its `else` would not parse as
 * intended.
 */
2075 #define growBuffer(buffer) {						\
2076     xmlChar *tmp;							\
2077     buffer##_size *= 2;							\
2078     tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
2079     if (tmp == NULL) {						\
2080 	htmlErrMemory(ctxt, "growing buffer\n");			\
2081 	xmlFree(buffer);						\
2082 	return(NULL);							\
2083     }									\
2084     buffer = tmp;							\
2085 }
2086 
2087 /**
2088  * htmlEntityLookup:
2089  * @name: the entity name
2090  *
2091  * Lookup the given entity in EntitiesTable
2092  *
2093  * TODO: the linear scan is really ugly, an hash table is really needed.
2094  *
2095  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2096  */
2097 const htmlEntityDesc *
htmlEntityLookup(const xmlChar * name)2098 htmlEntityLookup(const xmlChar *name) {
2099     unsigned int i;
2100 
2101     for (i = 0;i < (sizeof(html40EntitiesTable)/
2102                     sizeof(html40EntitiesTable[0]));i++) {
2103         if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
2104             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2105 	}
2106     }
2107     return(NULL);
2108 }
2109 
2110 /**
2111  * htmlEntityValueLookup:
2112  * @value: the entity's unicode value
2113  *
2114  * Lookup the given entity in EntitiesTable
2115  *
2116  * TODO: the linear scan is really ugly, an hash table is really needed.
2117  *
2118  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2119  */
2120 const htmlEntityDesc *
htmlEntityValueLookup(unsigned int value)2121 htmlEntityValueLookup(unsigned int value) {
2122     unsigned int i;
2123 
2124     for (i = 0;i < (sizeof(html40EntitiesTable)/
2125                     sizeof(html40EntitiesTable[0]));i++) {
2126         if (html40EntitiesTable[i].value >= value) {
2127 	    if (html40EntitiesTable[i].value > value)
2128 		break;
2129             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2130 	}
2131     }
2132     return(NULL);
2133 }
2134 
2135 /**
2136  * UTF8ToHtml:
2137  * @out:  a pointer to an array of bytes to store the result
2138  * @outlen:  the length of @out
2139  * @in:  a pointer to an array of UTF-8 chars
2140  * @inlen:  the length of @in
2141  *
2142  * Take a block of UTF-8 chars in and try to convert it to an ASCII
2143  * plus HTML entities block of chars out.
2144  *
2145  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2146  * The value of @inlen after return is the number of octets consumed
2147  *     as the return value is positive, else unpredictable.
2148  * The value of @outlen after return is the number of octets consumed.
2149  */
2150 int
UTF8ToHtml(unsigned char * out,int * outlen,const unsigned char * in,int * inlen)2151 UTF8ToHtml(unsigned char* out, int *outlen,
2152               const unsigned char* in, int *inlen) {
2153     const unsigned char* processed = in;
2154     const unsigned char* outend;
2155     const unsigned char* outstart = out;
2156     const unsigned char* instart = in;
2157     const unsigned char* inend;
2158     unsigned int c, d;
2159     int trailing;
2160 
2161     if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
2162     if (in == NULL) {
2163         /*
2164 	 * initialization nothing to do
2165 	 */
2166 	*outlen = 0;
2167 	*inlen = 0;
2168 	return(0);
2169     }
2170     inend = in + (*inlen);
2171     outend = out + (*outlen);
2172     while (in < inend) {
2173 	d = *in++;
2174 	if      (d < 0x80)  { c= d; trailing= 0; }
2175 	else if (d < 0xC0) {
2176 	    /* trailing byte in leading position */
2177 	    *outlen = out - outstart;
2178 	    *inlen = processed - instart;
2179 	    return(-2);
2180         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2181         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2182         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2183 	else {
2184 	    /* no chance for this in Ascii */
2185 	    *outlen = out - outstart;
2186 	    *inlen = processed - instart;
2187 	    return(-2);
2188 	}
2189 
2190 	if (inend - in < trailing) {
2191 	    break;
2192 	}
2193 
2194 	for ( ; trailing; trailing--) {
2195 	    if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
2196 		break;
2197 	    c <<= 6;
2198 	    c |= d & 0x3F;
2199 	}
2200 
2201 	/* assertion: c is a single UTF-4 value */
2202 	if (c < 0x80) {
2203 	    if (out + 1 >= outend)
2204 		break;
2205 	    *out++ = c;
2206 	} else {
2207 	    int len;
2208 	    const htmlEntityDesc * ent;
2209 	    const char *cp;
2210 	    char nbuf[16];
2211 
2212 	    /*
2213 	     * Try to lookup a predefined HTML entity for it
2214 	     */
2215 
2216 	    ent = htmlEntityValueLookup(c);
2217 	    if (ent == NULL) {
2218 	      snprintf(nbuf, sizeof(nbuf), "#%u", c);
2219 	      cp = nbuf;
2220 	    }
2221 	    else
2222 	      cp = ent->name;
2223 	    len = strlen(cp);
2224 	    if (out + 2 + len >= outend)
2225 		break;
2226 	    *out++ = '&';
2227 	    memcpy(out, cp, len);
2228 	    out += len;
2229 	    *out++ = ';';
2230 	}
2231 	processed = in;
2232     }
2233     *outlen = out - outstart;
2234     *inlen = processed - instart;
2235     return(0);
2236 }
2237 
2238 /**
2239  * htmlEncodeEntities:
2240  * @out:  a pointer to an array of bytes to store the result
2241  * @outlen:  the length of @out
2242  * @in:  a pointer to an array of UTF-8 chars
2243  * @inlen:  the length of @in
2244  * @quoteChar: the quote character to escape (' or ") or zero.
2245  *
2246  * Take a block of UTF-8 chars in and try to convert it to an ASCII
2247  * plus HTML entities block of chars out.
2248  *
2249  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2250  * The value of @inlen after return is the number of octets consumed
2251  *     as the return value is positive, else unpredictable.
2252  * The value of @outlen after return is the number of octets consumed.
2253  */
2254 int
htmlEncodeEntities(unsigned char * out,int * outlen,const unsigned char * in,int * inlen,int quoteChar)2255 htmlEncodeEntities(unsigned char* out, int *outlen,
2256 		   const unsigned char* in, int *inlen, int quoteChar) {
2257     const unsigned char* processed = in;
2258     const unsigned char* outend;
2259     const unsigned char* outstart = out;
2260     const unsigned char* instart = in;
2261     const unsigned char* inend;
2262     unsigned int c, d;
2263     int trailing;
2264 
2265     if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2266         return(-1);
2267     outend = out + (*outlen);
2268     inend = in + (*inlen);
2269     while (in < inend) {
2270 	d = *in++;
2271 	if      (d < 0x80)  { c= d; trailing= 0; }
2272 	else if (d < 0xC0) {
2273 	    /* trailing byte in leading position */
2274 	    *outlen = out - outstart;
2275 	    *inlen = processed - instart;
2276 	    return(-2);
2277         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2278         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2279         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2280 	else {
2281 	    /* no chance for this in Ascii */
2282 	    *outlen = out - outstart;
2283 	    *inlen = processed - instart;
2284 	    return(-2);
2285 	}
2286 
2287 	if (inend - in < trailing)
2288 	    break;
2289 
2290 	while (trailing--) {
2291 	    if (((d= *in++) & 0xC0) != 0x80) {
2292 		*outlen = out - outstart;
2293 		*inlen = processed - instart;
2294 		return(-2);
2295 	    }
2296 	    c <<= 6;
2297 	    c |= d & 0x3F;
2298 	}
2299 
2300 	/* assertion: c is a single UTF-4 value */
2301 	if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2302 	    (c != '&') && (c != '<') && (c != '>')) {
2303 	    if (out >= outend)
2304 		break;
2305 	    *out++ = c;
2306 	} else {
2307 	    const htmlEntityDesc * ent;
2308 	    const char *cp;
2309 	    char nbuf[16];
2310 	    int len;
2311 
2312 	    /*
2313 	     * Try to lookup a predefined HTML entity for it
2314 	     */
2315 	    ent = htmlEntityValueLookup(c);
2316 	    if (ent == NULL) {
2317 		snprintf(nbuf, sizeof(nbuf), "#%u", c);
2318 		cp = nbuf;
2319 	    }
2320 	    else
2321 		cp = ent->name;
2322 	    len = strlen(cp);
2323 	    if (out + 2 + len > outend)
2324 		break;
2325 	    *out++ = '&';
2326 	    memcpy(out, cp, len);
2327 	    out += len;
2328 	    *out++ = ';';
2329 	}
2330 	processed = in;
2331     }
2332     *outlen = out - outstart;
2333     *inlen = processed - instart;
2334     return(0);
2335 }
2336 
2337 /************************************************************************
2338  *									*
2339  *		Commodity functions to handle streams			*
2340  *									*
2341  ************************************************************************/
2342 
2343 #ifdef LIBXML_PUSH_ENABLED
2344 /**
2345  * htmlNewInputStream:
2346  * @ctxt:  an HTML parser context
2347  *
2348  * Create a new input stream structure
2349  * Returns the new input stream or NULL
2350  */
2351 static htmlParserInputPtr
htmlNewInputStream(htmlParserCtxtPtr ctxt)2352 htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2353     htmlParserInputPtr input;
2354 
2355     input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2356     if (input == NULL) {
2357         htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
2358 	return(NULL);
2359     }
2360     memset(input, 0, sizeof(htmlParserInput));
2361     input->filename = NULL;
2362     input->directory = NULL;
2363     input->base = NULL;
2364     input->cur = NULL;
2365     input->buf = NULL;
2366     input->line = 1;
2367     input->col = 1;
2368     input->buf = NULL;
2369     input->free = NULL;
2370     input->version = NULL;
2371     input->consumed = 0;
2372     input->length = 0;
2373     return(input);
2374 }
2375 #endif
2376 
2377 
2378 /************************************************************************
2379  *									*
2380  *		Commodity functions, cleanup needed ?			*
2381  *									*
2382  ************************************************************************/
/*
 * Names of all the tags allowing PCDATA content, taken from the HTML 4.01
 * loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array, but that was avoided so as not to
 * risk any binary incompatibility.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
2397 
2398 /**
2399  * areBlanks:
2400  * @ctxt:  an HTML parser context
2401  * @str:  a xmlChar *
2402  * @len:  the size of @str
2403  *
2404  * Is this a sequence of blank chars that one can ignore ?
2405  *
2406  * Returns 1 if ignorable 0 otherwise.
2407  */
2408 
areBlanks(htmlParserCtxtPtr ctxt,const xmlChar * str,int len)2409 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2410     unsigned int i;
2411     int j;
2412     xmlNodePtr lastChild;
2413     xmlDtdPtr dtd;
2414 
2415     for (j = 0;j < len;j++)
2416         if (!(IS_BLANK_CH(str[j]))) return(0);
2417 
2418     if (CUR == 0) return(1);
2419     if (CUR != '<') return(0);
2420     if (ctxt->name == NULL)
2421 	return(1);
2422     if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2423 	return(1);
2424     if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2425 	return(1);
2426 
2427     /* Only strip CDATA children of the body tag for strict HTML DTDs */
2428     if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2429         dtd = xmlGetIntSubset(ctxt->myDoc);
2430         if (dtd != NULL && dtd->ExternalID != NULL) {
2431             if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2432                     !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2433                 return(1);
2434         }
2435     }
2436 
2437     if (ctxt->node == NULL) return(0);
2438     lastChild = xmlGetLastChild(ctxt->node);
2439     while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2440 	lastChild = lastChild->prev;
2441     if (lastChild == NULL) {
2442         if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2443             (ctxt->node->content != NULL)) return(0);
2444 	/* keep ws in constructs like ...<b> </b>...
2445 	   for all tags "b" allowing PCDATA */
2446 	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2447 	    if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2448 		return(0);
2449 	    }
2450 	}
2451     } else if (xmlNodeIsText(lastChild)) {
2452         return(0);
2453     } else {
2454 	/* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2455 	   for all tags "p" allowing PCDATA */
2456 	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2457 	    if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2458 		return(0);
2459 	    }
2460 	}
2461     }
2462     return(1);
2463 }
2464 
2465 /**
2466  * htmlNewDocNoDtD:
2467  * @URI:  URI for the dtd, or NULL
2468  * @ExternalID:  the external ID of the DTD, or NULL
2469  *
2470  * Creates a new HTML document without a DTD node if @URI and @ExternalID
2471  * are NULL
2472  *
2473  * Returns a new document, do not initialize the DTD if not provided
2474  */
2475 htmlDocPtr
htmlNewDocNoDtD(const xmlChar * URI,const xmlChar * ExternalID)2476 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2477     xmlDocPtr cur;
2478 
2479     /*
2480      * Allocate a new document and fill the fields.
2481      */
2482     cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2483     if (cur == NULL) {
2484 	htmlErrMemory(NULL, "HTML document creation failed\n");
2485 	return(NULL);
2486     }
2487     memset(cur, 0, sizeof(xmlDoc));
2488 
2489     cur->type = XML_HTML_DOCUMENT_NODE;
2490     cur->version = NULL;
2491     cur->intSubset = NULL;
2492     cur->doc = cur;
2493     cur->name = NULL;
2494     cur->children = NULL;
2495     cur->extSubset = NULL;
2496     cur->oldNs = NULL;
2497     cur->encoding = NULL;
2498     cur->standalone = 1;
2499     cur->compression = 0;
2500     cur->ids = NULL;
2501     cur->refs = NULL;
2502     cur->_private = NULL;
2503     cur->charset = XML_CHAR_ENCODING_UTF8;
2504     cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2505     if ((ExternalID != NULL) ||
2506 	(URI != NULL))
2507 	xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2508     if ((__xmlRegisterCallbacks) && (xmlRegisterNodeDefaultValue))
2509 	xmlRegisterNodeDefaultValue((xmlNodePtr)cur);
2510     return(cur);
2511 }
2512 
2513 /**
2514  * htmlNewDoc:
2515  * @URI:  URI for the dtd, or NULL
2516  * @ExternalID:  the external ID of the DTD, or NULL
2517  *
2518  * Creates a new HTML document
2519  *
2520  * Returns a new document
2521  */
2522 htmlDocPtr
htmlNewDoc(const xmlChar * URI,const xmlChar * ExternalID)2523 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2524     if ((URI == NULL) && (ExternalID == NULL))
2525 	return(htmlNewDocNoDtD(
2526 		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2527 		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2528 
2529     return(htmlNewDocNoDtD(URI, ExternalID));
2530 }
2531 
2532 
2533 /************************************************************************
2534  *									*
2535  *			The parser itself				*
2536  *	Relates to http://www.w3.org/TR/html40				*
2537  *									*
2538  ************************************************************************/
2539 
2540 /************************************************************************
2541  *									*
2542  *			The parser itself				*
2543  *									*
2544  ************************************************************************/
2545 
2546 static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2547 
2548 static void
htmlSkipBogusComment(htmlParserCtxtPtr ctxt)2549 htmlSkipBogusComment(htmlParserCtxtPtr ctxt) {
2550     int c;
2551 
2552     htmlParseErr(ctxt, XML_HTML_INCORRECTLY_OPENED_COMMENT,
2553                  "Incorrectly opened comment\n", NULL, NULL);
2554 
2555     do {
2556         c = CUR;
2557         if (c == 0)
2558             break;
2559         NEXT;
2560     } while (c != '>');
2561 }
2562 
2563 /**
2564  * htmlParseHTMLName:
2565  * @ctxt:  an HTML parser context
2566  *
2567  * parse an HTML tag or attribute name, note that we convert it to lowercase
2568  * since HTML names are not case-sensitive.
2569  *
2570  * Returns the Tag Name parsed or NULL
2571  */
2572 
2573 static const xmlChar *
htmlParseHTMLName(htmlParserCtxtPtr ctxt)2574 htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2575     int i = 0;
2576     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2577 
2578     if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2579         (CUR != ':') && (CUR != '.')) return(NULL);
2580 
2581     while ((i < HTML_PARSER_BUFFER_SIZE) &&
2582            ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2583 	   (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2584            (CUR == '.'))) {
2585 	if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2586         else loc[i] = CUR;
2587 	i++;
2588 
2589 	NEXT;
2590     }
2591 
2592     return(xmlDictLookup(ctxt->dict, loc, i));
2593 }
2594 
2595 
2596 /**
2597  * htmlParseHTMLName_nonInvasive:
2598  * @ctxt:  an HTML parser context
2599  *
2600  * parse an HTML tag or attribute name, note that we convert it to lowercase
2601  * since HTML names are not case-sensitive, this doesn't consume the data
2602  * from the stream, it's a look-ahead
2603  *
2604  * Returns the Tag Name parsed or NULL
2605  */
2606 
2607 static const xmlChar *
htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt)2608 htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2609     int i = 0;
2610     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2611 
2612     if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2613         (NXT(1) != ':')) return(NULL);
2614 
2615     while ((i < HTML_PARSER_BUFFER_SIZE) &&
2616            ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2617 	   (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2618 	if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2619         else loc[i] = NXT(1+i);
2620 	i++;
2621     }
2622 
2623     return(xmlDictLookup(ctxt->dict, loc, i));
2624 }
2625 
2626 
2627 /**
2628  * htmlParseName:
2629  * @ctxt:  an HTML parser context
2630  *
2631  * parse an HTML name, this routine is case sensitive.
2632  *
2633  * Returns the Name parsed or NULL
2634  */
2635 
2636 static const xmlChar *
htmlParseName(htmlParserCtxtPtr ctxt)2637 htmlParseName(htmlParserCtxtPtr ctxt) {
2638     const xmlChar *in;
2639     const xmlChar *ret;
2640     int count = 0;
2641 
2642     GROW;
2643 
2644     /*
2645      * Accelerator for simple ASCII names
2646      */
2647     in = ctxt->input->cur;
2648     if (((*in >= 0x61) && (*in <= 0x7A)) ||
2649 	((*in >= 0x41) && (*in <= 0x5A)) ||
2650 	(*in == '_') || (*in == ':')) {
2651 	in++;
2652 	while (((*in >= 0x61) && (*in <= 0x7A)) ||
2653 	       ((*in >= 0x41) && (*in <= 0x5A)) ||
2654 	       ((*in >= 0x30) && (*in <= 0x39)) ||
2655 	       (*in == '_') || (*in == '-') ||
2656 	       (*in == ':') || (*in == '.'))
2657 	    in++;
2658 
2659 	if (in == ctxt->input->end)
2660 	    return(NULL);
2661 
2662 	if ((*in > 0) && (*in < 0x80)) {
2663 	    count = in - ctxt->input->cur;
2664 	    ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2665 	    ctxt->input->cur = in;
2666 	    ctxt->input->col += count;
2667 	    return(ret);
2668 	}
2669     }
2670     return(htmlParseNameComplex(ctxt));
2671 }
2672 
2673 static const xmlChar *
htmlParseNameComplex(xmlParserCtxtPtr ctxt)2674 htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2675     int len = 0, l;
2676     int c;
2677     int count = 0;
2678     const xmlChar *base = ctxt->input->base;
2679 
2680     /*
2681      * Handler for more complex cases
2682      */
2683     GROW;
2684     c = CUR_CHAR(l);
2685     if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2686 	(!IS_LETTER(c) && (c != '_') &&
2687          (c != ':'))) {
2688 	return(NULL);
2689     }
2690 
2691     while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2692 	   ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2693             (c == '.') || (c == '-') ||
2694 	    (c == '_') || (c == ':') ||
2695 	    (IS_COMBINING(c)) ||
2696 	    (IS_EXTENDER(c)))) {
2697 	if (count++ > 100) {
2698 	    count = 0;
2699 	    GROW;
2700 	}
2701 	len += l;
2702 	NEXTL(l);
2703 	c = CUR_CHAR(l);
2704 	if (ctxt->input->base != base) {
2705 	    /*
2706 	     * We changed encoding from an unknown encoding
2707 	     * Input buffer changed location, so we better start again
2708 	     */
2709 	    return(htmlParseNameComplex(ctxt));
2710 	}
2711     }
2712 
2713     if (ctxt->input->cur - ctxt->input->base < len) {
2714         /* Sanity check */
2715 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
2716                      "unexpected change of input buffer", NULL, NULL);
2717         return (NULL);
2718     }
2719 
2720     return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2721 }
2722 
2723 
2724 /**
2725  * htmlParseHTMLAttribute:
2726  * @ctxt:  an HTML parser context
2727  * @stop:  a char stop value
2728  *
2729  * parse an HTML attribute value till the stop (quote), if
2730  * stop is 0 then it stops at the first space
2731  *
2732  * Returns the attribute parsed or NULL
2733  */
2734 
2735 static xmlChar *
htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt,const xmlChar stop)2736 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2737     xmlChar *buffer = NULL;
2738     int buffer_size = 0;
2739     xmlChar *out = NULL;
2740     const xmlChar *name = NULL;
2741     const xmlChar *cur = NULL;
2742     const htmlEntityDesc * ent;
2743 
2744     /*
2745      * allocate a translation buffer.
2746      */
2747     buffer_size = HTML_PARSER_BUFFER_SIZE;
2748     buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
2749     if (buffer == NULL) {
2750 	htmlErrMemory(ctxt, "buffer allocation failed\n");
2751 	return(NULL);
2752     }
2753     out = buffer;
2754 
2755     /*
2756      * Ok loop until we reach one of the ending chars
2757      */
2758     while ((CUR != 0) && (CUR != stop)) {
2759 	if ((stop == 0) && (CUR == '>')) break;
2760 	if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2761         if (CUR == '&') {
2762 	    if (NXT(1) == '#') {
2763 		unsigned int c;
2764 		int bits;
2765 
2766 		c = htmlParseCharRef(ctxt);
2767 		if      (c <    0x80)
2768 		        { *out++  = c;                bits= -6; }
2769 		else if (c <   0x800)
2770 		        { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2771 		else if (c < 0x10000)
2772 		        { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2773 		else
2774 		        { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2775 
2776 		for ( ; bits >= 0; bits-= 6) {
2777 		    *out++  = ((c >> bits) & 0x3F) | 0x80;
2778 		}
2779 
2780 		if (out - buffer > buffer_size - 100) {
2781 			int indx = out - buffer;
2782 
2783 			growBuffer(buffer);
2784 			out = &buffer[indx];
2785 		}
2786 	    } else {
2787 		ent = htmlParseEntityRef(ctxt, &name);
2788 		if (name == NULL) {
2789 		    *out++ = '&';
2790 		    if (out - buffer > buffer_size - 100) {
2791 			int indx = out - buffer;
2792 
2793 			growBuffer(buffer);
2794 			out = &buffer[indx];
2795 		    }
2796 		} else if (ent == NULL) {
2797 		    *out++ = '&';
2798 		    cur = name;
2799 		    while (*cur != 0) {
2800 			if (out - buffer > buffer_size - 100) {
2801 			    int indx = out - buffer;
2802 
2803 			    growBuffer(buffer);
2804 			    out = &buffer[indx];
2805 			}
2806 			*out++ = *cur++;
2807 		    }
2808 		} else {
2809 		    unsigned int c;
2810 		    int bits;
2811 
2812 		    if (out - buffer > buffer_size - 100) {
2813 			int indx = out - buffer;
2814 
2815 			growBuffer(buffer);
2816 			out = &buffer[indx];
2817 		    }
2818 		    c = ent->value;
2819 		    if      (c <    0x80)
2820 			{ *out++  = c;                bits= -6; }
2821 		    else if (c <   0x800)
2822 			{ *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2823 		    else if (c < 0x10000)
2824 			{ *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2825 		    else
2826 			{ *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2827 
2828 		    for ( ; bits >= 0; bits-= 6) {
2829 			*out++  = ((c >> bits) & 0x3F) | 0x80;
2830 		    }
2831 		}
2832 	    }
2833 	} else {
2834 	    unsigned int c;
2835 	    int bits, l;
2836 
2837 	    if (out - buffer > buffer_size - 100) {
2838 		int indx = out - buffer;
2839 
2840 		growBuffer(buffer);
2841 		out = &buffer[indx];
2842 	    }
2843 	    c = CUR_CHAR(l);
2844 	    if      (c <    0x80)
2845 		    { *out++  = c;                bits= -6; }
2846 	    else if (c <   0x800)
2847 		    { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2848 	    else if (c < 0x10000)
2849 		    { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2850 	    else
2851 		    { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2852 
2853 	    for ( ; bits >= 0; bits-= 6) {
2854 		*out++  = ((c >> bits) & 0x3F) | 0x80;
2855 	    }
2856 	    NEXT;
2857 	}
2858     }
2859     *out = 0;
2860     return(buffer);
2861 }
2862 
2863 /**
2864  * htmlParseEntityRef:
2865  * @ctxt:  an HTML parser context
2866  * @str:  location to store the entity name
2867  *
2868  * parse an HTML ENTITY references
2869  *
2870  * [68] EntityRef ::= '&' Name ';'
2871  *
2872  * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2873  *         if non-NULL *str will have to be freed by the caller.
2874  */
2875 const htmlEntityDesc *
htmlParseEntityRef(htmlParserCtxtPtr ctxt,const xmlChar ** str)2876 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2877     const xmlChar *name;
2878     const htmlEntityDesc * ent = NULL;
2879 
2880     if (str != NULL) *str = NULL;
2881     if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2882 
2883     if (CUR == '&') {
2884         NEXT;
2885         name = htmlParseName(ctxt);
2886 	if (name == NULL) {
2887 	    htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2888 	                 "htmlParseEntityRef: no name\n", NULL, NULL);
2889 	} else {
2890 	    GROW;
2891 	    if (CUR == ';') {
2892 	        if (str != NULL)
2893 		    *str = name;
2894 
2895 		/*
2896 		 * Lookup the entity in the table.
2897 		 */
2898 		ent = htmlEntityLookup(name);
2899 		if (ent != NULL) /* OK that's ugly !!! */
2900 		    NEXT;
2901 	    } else {
2902 		htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2903 		             "htmlParseEntityRef: expecting ';'\n",
2904 			     NULL, NULL);
2905 	        if (str != NULL)
2906 		    *str = name;
2907 	    }
2908 	}
2909     }
2910     return(ent);
2911 }
2912 
2913 /**
2914  * htmlParseAttValue:
2915  * @ctxt:  an HTML parser context
2916  *
2917  * parse a value for an attribute
2918  * Note: the parser won't do substitution of entities here, this
2919  * will be handled later in xmlStringGetNodeList, unless it was
2920  * asked for ctxt->replaceEntities != 0
2921  *
2922  * Returns the AttValue parsed or NULL.
2923  */
2924 
2925 static xmlChar *
htmlParseAttValue(htmlParserCtxtPtr ctxt)2926 htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2927     xmlChar *ret = NULL;
2928 
2929     if (CUR == '"') {
2930         NEXT;
2931 	ret = htmlParseHTMLAttribute(ctxt, '"');
2932         if (CUR != '"') {
2933 	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2934 	                 "AttValue: \" expected\n", NULL, NULL);
2935 	} else
2936 	    NEXT;
2937     } else if (CUR == '\'') {
2938         NEXT;
2939 	ret = htmlParseHTMLAttribute(ctxt, '\'');
2940         if (CUR != '\'') {
2941 	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2942 	                 "AttValue: ' expected\n", NULL, NULL);
2943 	} else
2944 	    NEXT;
2945     } else {
2946         /*
2947 	 * That's an HTMLism, the attribute value may not be quoted
2948 	 */
2949 	ret = htmlParseHTMLAttribute(ctxt, 0);
2950 	if (ret == NULL) {
2951 	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2952 	                 "AttValue: no value found\n", NULL, NULL);
2953 	}
2954     }
2955     return(ret);
2956 }
2957 
2958 /**
2959  * htmlParseSystemLiteral:
2960  * @ctxt:  an HTML parser context
2961  *
2962  * parse an HTML Literal
2963  *
2964  * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2965  *
2966  * Returns the SystemLiteral parsed or NULL
2967  */
2968 
2969 static xmlChar *
htmlParseSystemLiteral(htmlParserCtxtPtr ctxt)2970 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2971     size_t len = 0, startPosition = 0;
2972     int err = 0;
2973     int quote;
2974     xmlChar *ret = NULL;
2975 
2976     if ((CUR != '"') && (CUR != '\'')) {
2977 	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2978 	             "SystemLiteral \" or ' expected\n", NULL, NULL);
2979         return(NULL);
2980     }
2981     quote = CUR;
2982     NEXT;
2983 
2984     if (CUR_PTR < BASE_PTR)
2985         return(ret);
2986     startPosition = CUR_PTR - BASE_PTR;
2987 
2988     while ((CUR != 0) && (CUR != quote)) {
2989         /* TODO: Handle UTF-8 */
2990         if (!IS_CHAR_CH(CUR)) {
2991             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2992                             "Invalid char in SystemLiteral 0x%X\n", CUR);
2993             err = 1;
2994         }
2995         NEXT;
2996         len++;
2997     }
2998     if (CUR != quote) {
2999         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
3000                      "Unfinished SystemLiteral\n", NULL, NULL);
3001     } else {
3002         NEXT;
3003         if (err == 0)
3004             ret = xmlStrndup((BASE_PTR+startPosition), len);
3005     }
3006 
3007     return(ret);
3008 }
3009 
3010 /**
3011  * htmlParsePubidLiteral:
3012  * @ctxt:  an HTML parser context
3013  *
3014  * parse an HTML public literal
3015  *
3016  * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
3017  *
3018  * Returns the PubidLiteral parsed or NULL.
3019  */
3020 
3021 static xmlChar *
htmlParsePubidLiteral(htmlParserCtxtPtr ctxt)3022 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
3023     size_t len = 0, startPosition = 0;
3024     int err = 0;
3025     int quote;
3026     xmlChar *ret = NULL;
3027 
3028     if ((CUR != '"') && (CUR != '\'')) {
3029 	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
3030 	             "PubidLiteral \" or ' expected\n", NULL, NULL);
3031         return(NULL);
3032     }
3033     quote = CUR;
3034     NEXT;
3035 
3036     /*
3037      * Name ::= (Letter | '_') (NameChar)*
3038      */
3039     if (CUR_PTR < BASE_PTR)
3040         return(ret);
3041     startPosition = CUR_PTR - BASE_PTR;
3042 
3043     while ((CUR != 0) && (CUR != quote)) {
3044         if (!IS_PUBIDCHAR_CH(CUR)) {
3045             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3046                             "Invalid char in PubidLiteral 0x%X\n", CUR);
3047             err = 1;
3048         }
3049         len++;
3050         NEXT;
3051     }
3052 
3053     if (CUR != quote) {
3054         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
3055                      "Unfinished PubidLiteral\n", NULL, NULL);
3056     } else {
3057         NEXT;
3058         if (err == 0)
3059             ret = xmlStrndup((BASE_PTR + startPosition), len);
3060     }
3061 
3062     return(ret);
3063 }
3064 
3065 /**
3066  * htmlParseScript:
3067  * @ctxt:  an HTML parser context
3068  *
3069  * parse the content of an HTML SCRIPT or STYLE element
3070  * http://www.w3.org/TR/html4/sgml/dtd.html#Script
3071  * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
3072  * http://www.w3.org/TR/html4/types.html#type-script
3073  * http://www.w3.org/TR/html4/types.html#h-6.15
3074  * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
3075  *
3076  * Script data ( %Script; in the DTD) can be the content of the SCRIPT
3077  * element and the value of intrinsic event attributes. User agents must
3078  * not evaluate script data as HTML markup but instead must pass it on as
3079  * data to a script engine.
3080  * NOTES:
3081  * - The content is passed like CDATA
3082  * - the attributes for style and scripting "onXXX" are also described
3083  *   as CDATA but SGML allows entities references in attributes so their
3084  *   processing is identical as other attributes
3085  */
3086 static void
htmlParseScript(htmlParserCtxtPtr ctxt)3087 htmlParseScript(htmlParserCtxtPtr ctxt) {
3088     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
3089     int nbchar = 0;
3090     int cur,l;
3091 
3092     SHRINK;
3093     cur = CUR_CHAR(l);
3094     while (cur != 0) {
3095 	if ((cur == '<') && (NXT(1) == '/')) {
3096             /*
3097              * One should break here, the specification is clear:
3098              * Authors should therefore escape "</" within the content.
3099              * Escape mechanisms are specific to each scripting or
3100              * style sheet language.
3101              *
3102              * In recovery mode, only break if end tag match the
3103              * current tag, effectively ignoring all tags inside the
3104              * script/style block and treating the entire block as
3105              * CDATA.
3106              */
3107             if (ctxt->recovery) {
3108                 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
3109 				   xmlStrlen(ctxt->name)) == 0)
3110                 {
3111                     break; /* while */
3112                 } else {
3113 		    htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3114 				 "Element %s embeds close tag\n",
3115 		                 ctxt->name, NULL);
3116 		}
3117             } else {
3118                 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
3119                     ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
3120                 {
3121                     break; /* while */
3122                 }
3123             }
3124 	}
3125         if (IS_CHAR(cur)) {
3126 	    COPY_BUF(l,buf,nbchar,cur);
3127         } else {
3128             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3129                             "Invalid char in CDATA 0x%X\n", cur);
3130         }
3131 	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3132             buf[nbchar] = 0;
3133 	    if (ctxt->sax->cdataBlock!= NULL) {
3134 		/*
3135 		 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3136 		 */
3137 		ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3138 	    } else if (ctxt->sax->characters != NULL) {
3139 		ctxt->sax->characters(ctxt->userData, buf, nbchar);
3140 	    }
3141 	    nbchar = 0;
3142 	}
3143 	GROW;
3144 	NEXTL(l);
3145 	cur = CUR_CHAR(l);
3146     }
3147 
3148     if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3149         buf[nbchar] = 0;
3150 	if (ctxt->sax->cdataBlock!= NULL) {
3151 	    /*
3152 	     * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3153 	     */
3154 	    ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3155 	} else if (ctxt->sax->characters != NULL) {
3156 	    ctxt->sax->characters(ctxt->userData, buf, nbchar);
3157 	}
3158     }
3159 }
3160 
3161 
3162 /**
3163  * htmlParseCharDataInternal:
3164  * @ctxt:  an HTML parser context
3165  * @readahead: optional read ahead character in ascii range
3166  *
3167  * parse a CharData section.
3168  * if we are within a CDATA section ']]>' marks an end of section.
3169  *
3170  * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3171  */
3172 
3173 static void
htmlParseCharDataInternal(htmlParserCtxtPtr ctxt,int readahead)3174 htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
3175     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
3176     int nbchar = 0;
3177     int cur, l;
3178     int chunk = 0;
3179 
3180     if (readahead)
3181         buf[nbchar++] = readahead;
3182 
3183     SHRINK;
3184     cur = CUR_CHAR(l);
3185     while (((cur != '<') || (ctxt->token == '<')) &&
3186            ((cur != '&') || (ctxt->token == '&')) &&
3187 	   (cur != 0)) {
3188 	if (!(IS_CHAR(cur))) {
3189 	    htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3190 	                "Invalid char in CDATA 0x%X\n", cur);
3191 	} else {
3192 	    COPY_BUF(l,buf,nbchar,cur);
3193 	}
3194 	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3195             buf[nbchar] = 0;
3196 
3197 	    /*
3198 	     * Ok the segment is to be consumed as chars.
3199 	     */
3200 	    if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3201 		if (areBlanks(ctxt, buf, nbchar)) {
3202 		    if (ctxt->keepBlanks) {
3203 			if (ctxt->sax->characters != NULL)
3204 			    ctxt->sax->characters(ctxt->userData, buf, nbchar);
3205 		    } else {
3206 			if (ctxt->sax->ignorableWhitespace != NULL)
3207 			    ctxt->sax->ignorableWhitespace(ctxt->userData,
3208 			                                   buf, nbchar);
3209 		    }
3210 		} else {
3211 		    htmlCheckParagraph(ctxt);
3212 		    if (ctxt->sax->characters != NULL)
3213 			ctxt->sax->characters(ctxt->userData, buf, nbchar);
3214 		}
3215 	    }
3216 	    nbchar = 0;
3217 	}
3218 	NEXTL(l);
3219         chunk++;
3220         if (chunk > HTML_PARSER_BUFFER_SIZE) {
3221             chunk = 0;
3222             SHRINK;
3223             GROW;
3224         }
3225 	cur = CUR_CHAR(l);
3226 	if (cur == 0) {
3227 	    SHRINK;
3228 	    GROW;
3229 	    cur = CUR_CHAR(l);
3230 	}
3231     }
3232     if (nbchar != 0) {
3233         buf[nbchar] = 0;
3234 
3235 	/*
3236 	 * Ok the segment is to be consumed as chars.
3237 	 */
3238 	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3239 	    if (areBlanks(ctxt, buf, nbchar)) {
3240 		if (ctxt->keepBlanks) {
3241 		    if (ctxt->sax->characters != NULL)
3242 			ctxt->sax->characters(ctxt->userData, buf, nbchar);
3243 		} else {
3244 		    if (ctxt->sax->ignorableWhitespace != NULL)
3245 			ctxt->sax->ignorableWhitespace(ctxt->userData,
3246 			                               buf, nbchar);
3247 		}
3248 	    } else {
3249 		htmlCheckParagraph(ctxt);
3250 		if (ctxt->sax->characters != NULL)
3251 		    ctxt->sax->characters(ctxt->userData, buf, nbchar);
3252 	    }
3253 	}
3254     } else {
3255 	/*
3256 	 * Loop detection
3257 	 */
3258 	if (cur == 0)
3259 	    ctxt->instate = XML_PARSER_EOF;
3260     }
3261 }
3262 
3263 /**
3264  * htmlParseCharData:
3265  * @ctxt:  an HTML parser context
3266  *
3267  * parse a CharData section.
3268  * if we are within a CDATA section ']]>' marks an end of section.
3269  *
3270  * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3271  */
3272 
3273 static void
htmlParseCharData(htmlParserCtxtPtr ctxt)3274 htmlParseCharData(htmlParserCtxtPtr ctxt) {
3275     htmlParseCharDataInternal(ctxt, 0);
3276 }
3277 
3278 /**
3279  * htmlParseExternalID:
3280  * @ctxt:  an HTML parser context
3281  * @publicID:  a xmlChar** receiving PubidLiteral
3282  *
3283  * Parse an External ID or a Public ID
3284  *
3285  * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3286  *                   | 'PUBLIC' S PubidLiteral S SystemLiteral
3287  *
3288  * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3289  *
3290  * Returns the function returns SystemLiteral and in the second
3291  *                case publicID receives PubidLiteral, is strict is off
3292  *                it is possible to return NULL and have publicID set.
3293  */
3294 
3295 static xmlChar *
htmlParseExternalID(htmlParserCtxtPtr ctxt,xmlChar ** publicID)3296 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
3297     xmlChar *URI = NULL;
3298 
3299     if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3300          (UPP(2) == 'S') && (UPP(3) == 'T') &&
3301 	 (UPP(4) == 'E') && (UPP(5) == 'M')) {
3302         SKIP(6);
3303 	if (!IS_BLANK_CH(CUR)) {
3304 	    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3305 	                 "Space required after 'SYSTEM'\n", NULL, NULL);
3306 	}
3307         SKIP_BLANKS;
3308 	URI = htmlParseSystemLiteral(ctxt);
3309 	if (URI == NULL) {
3310 	    htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3311 	                 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3312         }
3313     } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3314 	       (UPP(2) == 'B') && (UPP(3) == 'L') &&
3315 	       (UPP(4) == 'I') && (UPP(5) == 'C')) {
3316         SKIP(6);
3317 	if (!IS_BLANK_CH(CUR)) {
3318 	    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3319 	                 "Space required after 'PUBLIC'\n", NULL, NULL);
3320 	}
3321         SKIP_BLANKS;
3322 	*publicID = htmlParsePubidLiteral(ctxt);
3323 	if (*publicID == NULL) {
3324 	    htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3325 	                 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3326 			 NULL, NULL);
3327 	}
3328         SKIP_BLANKS;
3329         if ((CUR == '"') || (CUR == '\'')) {
3330 	    URI = htmlParseSystemLiteral(ctxt);
3331 	}
3332     }
3333     return(URI);
3334 }
3335 
3336 /**
3337  * xmlParsePI:
3338  * @ctxt:  an XML parser context
3339  *
3340  * parse an XML Processing Instruction.
3341  *
3342  * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3343  */
3344 static void
htmlParsePI(htmlParserCtxtPtr ctxt)3345 htmlParsePI(htmlParserCtxtPtr ctxt) {
3346     xmlChar *buf = NULL;
3347     int len = 0;
3348     int size = HTML_PARSER_BUFFER_SIZE;
3349     int cur, l;
3350     const xmlChar *target;
3351     xmlParserInputState state;
3352     int count = 0;
3353 
3354     if ((RAW == '<') && (NXT(1) == '?')) {
3355 	state = ctxt->instate;
3356         ctxt->instate = XML_PARSER_PI;
3357 	/*
3358 	 * this is a Processing Instruction.
3359 	 */
3360 	SKIP(2);
3361 	SHRINK;
3362 
3363 	/*
3364 	 * Parse the target name and check for special support like
3365 	 * namespace.
3366 	 */
3367         target = htmlParseName(ctxt);
3368 	if (target != NULL) {
3369 	    if (RAW == '>') {
3370 		SKIP(1);
3371 
3372 		/*
3373 		 * SAX: PI detected.
3374 		 */
3375 		if ((ctxt->sax) && (!ctxt->disableSAX) &&
3376 		    (ctxt->sax->processingInstruction != NULL))
3377 		    ctxt->sax->processingInstruction(ctxt->userData,
3378 		                                     target, NULL);
3379 		ctxt->instate = state;
3380 		return;
3381 	    }
3382 	    buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3383 	    if (buf == NULL) {
3384 		htmlErrMemory(ctxt, NULL);
3385 		ctxt->instate = state;
3386 		return;
3387 	    }
3388 	    cur = CUR;
3389 	    if (!IS_BLANK(cur)) {
3390 		htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3391 			  "ParsePI: PI %s space expected\n", target, NULL);
3392 	    }
3393             SKIP_BLANKS;
3394 	    cur = CUR_CHAR(l);
3395 	    while ((cur != 0) && (cur != '>')) {
3396 		if (len + 5 >= size) {
3397 		    xmlChar *tmp;
3398 
3399 		    size *= 2;
3400 		    tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3401 		    if (tmp == NULL) {
3402 			htmlErrMemory(ctxt, NULL);
3403 			xmlFree(buf);
3404 			ctxt->instate = state;
3405 			return;
3406 		    }
3407 		    buf = tmp;
3408 		}
3409 		count++;
3410 		if (count > 50) {
3411 		    GROW;
3412 		    count = 0;
3413 		}
3414                 if (IS_CHAR(cur)) {
3415 		    COPY_BUF(l,buf,len,cur);
3416                 } else {
3417                     htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3418                                     "Invalid char in processing instruction "
3419                                     "0x%X\n", cur);
3420                 }
3421 		NEXTL(l);
3422 		cur = CUR_CHAR(l);
3423 		if (cur == 0) {
3424 		    SHRINK;
3425 		    GROW;
3426 		    cur = CUR_CHAR(l);
3427 		}
3428 	    }
3429 	    buf[len] = 0;
3430 	    if (cur != '>') {
3431 		htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3432 		      "ParsePI: PI %s never end ...\n", target, NULL);
3433 	    } else {
3434 		SKIP(1);
3435 
3436 		/*
3437 		 * SAX: PI detected.
3438 		 */
3439 		if ((ctxt->sax) && (!ctxt->disableSAX) &&
3440 		    (ctxt->sax->processingInstruction != NULL))
3441 		    ctxt->sax->processingInstruction(ctxt->userData,
3442 		                                     target, buf);
3443 	    }
3444 	    xmlFree(buf);
3445 	} else {
3446 	    htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3447                          "PI is not started correctly", NULL, NULL);
3448 	}
3449 	ctxt->instate = state;
3450     }
3451 }
3452 
3453 /**
3454  * htmlParseComment:
3455  * @ctxt:  an HTML parser context
3456  *
3457  * Parse an XML (SGML) comment <!-- .... -->
3458  *
3459  * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3460  */
3461 static void
htmlParseComment(htmlParserCtxtPtr ctxt)3462 htmlParseComment(htmlParserCtxtPtr ctxt) {
3463     xmlChar *buf = NULL;
3464     int len;
3465     int size = HTML_PARSER_BUFFER_SIZE;
3466     int q, ql;
3467     int r, rl;
3468     int cur, l;
3469     int next, nl;
3470     xmlParserInputState state;
3471 
3472     /*
3473      * Check that there is a comment right here.
3474      */
3475     if ((RAW != '<') || (NXT(1) != '!') ||
3476         (NXT(2) != '-') || (NXT(3) != '-')) return;
3477 
3478     state = ctxt->instate;
3479     ctxt->instate = XML_PARSER_COMMENT;
3480     SHRINK;
3481     SKIP(4);
3482     buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3483     if (buf == NULL) {
3484         htmlErrMemory(ctxt, "buffer allocation failed\n");
3485 	ctxt->instate = state;
3486 	return;
3487     }
3488     len = 0;
3489     buf[len] = 0;
3490     q = CUR_CHAR(ql);
3491     if (q == 0)
3492         goto unfinished;
3493     if (q == '>') {
3494         htmlParseErr(ctxt, XML_ERR_COMMENT_ABRUPTLY_ENDED, "Comment abruptly ended", NULL, NULL);
3495         cur = '>';
3496         goto finished;
3497     }
3498     NEXTL(ql);
3499     r = CUR_CHAR(rl);
3500     if (r == 0)
3501         goto unfinished;
3502     if (q == '-' && r == '>') {
3503         htmlParseErr(ctxt, XML_ERR_COMMENT_ABRUPTLY_ENDED, "Comment abruptly ended", NULL, NULL);
3504         cur = '>';
3505         goto finished;
3506     }
3507     NEXTL(rl);
3508     cur = CUR_CHAR(l);
3509     while ((cur != 0) &&
3510            ((cur != '>') ||
3511 	    (r != '-') || (q != '-'))) {
3512 	NEXTL(l);
3513 	next = CUR_CHAR(nl);
3514 	if (next == 0) {
3515 	    SHRINK;
3516 	    GROW;
3517 	    next = CUR_CHAR(nl);
3518 	}
3519 
3520 	if ((q == '-') && (r == '-') && (cur == '!') && (next == '>')) {
3521 	  htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3522 		       "Comment incorrectly closed by '--!>'", NULL, NULL);
3523 	  cur = '>';
3524 	  break;
3525 	}
3526 
3527 	if (len + 5 >= size) {
3528 	    xmlChar *tmp;
3529 
3530 	    size *= 2;
3531 	    tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3532 	    if (tmp == NULL) {
3533 	        xmlFree(buf);
3534 	        htmlErrMemory(ctxt, "growing buffer failed\n");
3535 		ctxt->instate = state;
3536 		return;
3537 	    }
3538 	    buf = tmp;
3539 	}
3540         if (IS_CHAR(q)) {
3541 	    COPY_BUF(ql,buf,len,q);
3542         } else {
3543             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3544                             "Invalid char in comment 0x%X\n", q);
3545         }
3546 
3547 	q = r;
3548 	ql = rl;
3549 	r = cur;
3550 	rl = l;
3551 	cur = next;
3552 	l = nl;
3553     }
3554 finished:
3555     buf[len] = 0;
3556     if (cur == '>') {
3557         NEXT;
3558 	if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3559 	    (!ctxt->disableSAX))
3560 	    ctxt->sax->comment(ctxt->userData, buf);
3561 	xmlFree(buf);
3562 	ctxt->instate = state;
3563 	return;
3564     }
3565 
3566 unfinished:
3567     htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3568 		 "Comment not terminated \n<!--%.50s\n", buf, NULL);
3569     xmlFree(buf);
3570 }
3571 
3572 /**
3573  * htmlParseCharRef:
3574  * @ctxt:  an HTML parser context
3575  *
3576  * parse Reference declarations
3577  *
3578  * [66] CharRef ::= '&#' [0-9]+ ';' |
3579  *                  '&#x' [0-9a-fA-F]+ ';'
3580  *
3581  * Returns the value parsed (as an int)
3582  */
3583 int
htmlParseCharRef(htmlParserCtxtPtr ctxt)3584 htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3585     int val = 0;
3586 
3587     if ((ctxt == NULL) || (ctxt->input == NULL)) {
3588 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3589 		     "htmlParseCharRef: context error\n",
3590 		     NULL, NULL);
3591         return(0);
3592     }
3593     if ((CUR == '&') && (NXT(1) == '#') &&
3594         ((NXT(2) == 'x') || NXT(2) == 'X')) {
3595 	SKIP(3);
3596 	while (CUR != ';') {
3597 	    if ((CUR >= '0') && (CUR <= '9')) {
3598                 if (val < 0x110000)
3599 	            val = val * 16 + (CUR - '0');
3600             } else if ((CUR >= 'a') && (CUR <= 'f')) {
3601                 if (val < 0x110000)
3602 	            val = val * 16 + (CUR - 'a') + 10;
3603             } else if ((CUR >= 'A') && (CUR <= 'F')) {
3604                 if (val < 0x110000)
3605 	            val = val * 16 + (CUR - 'A') + 10;
3606             } else {
3607 	        htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3608 		             "htmlParseCharRef: missing semicolon\n",
3609 			     NULL, NULL);
3610 		break;
3611 	    }
3612 	    NEXT;
3613 	}
3614 	if (CUR == ';')
3615 	    NEXT;
3616     } else if  ((CUR == '&') && (NXT(1) == '#')) {
3617 	SKIP(2);
3618 	while (CUR != ';') {
3619 	    if ((CUR >= '0') && (CUR <= '9')) {
3620                 if (val < 0x110000)
3621 	            val = val * 10 + (CUR - '0');
3622             } else {
3623 	        htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3624 		             "htmlParseCharRef: missing semicolon\n",
3625 			     NULL, NULL);
3626 		break;
3627 	    }
3628 	    NEXT;
3629 	}
3630 	if (CUR == ';')
3631 	    NEXT;
3632     } else {
3633 	htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3634 	             "htmlParseCharRef: invalid value\n", NULL, NULL);
3635     }
3636     /*
3637      * Check the value IS_CHAR ...
3638      */
3639     if (IS_CHAR(val)) {
3640         return(val);
3641     } else if (val >= 0x110000) {
3642 	htmlParseErr(ctxt, XML_ERR_INVALID_CHAR,
3643 		     "htmlParseCharRef: value too large\n", NULL, NULL);
3644     } else {
3645 	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3646 			"htmlParseCharRef: invalid xmlChar value %d\n",
3647 			val);
3648     }
3649     return(0);
3650 }
3651 
3652 
3653 /**
3654  * htmlParseDocTypeDecl:
3655  * @ctxt:  an HTML parser context
3656  *
3657  * parse a DOCTYPE declaration
3658  *
3659  * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3660  *                      ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3661  */
3662 
3663 static void
htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt)3664 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3665     const xmlChar *name;
3666     xmlChar *ExternalID = NULL;
3667     xmlChar *URI = NULL;
3668 
3669     /*
3670      * We know that '<!DOCTYPE' has been detected.
3671      */
3672     SKIP(9);
3673 
3674     SKIP_BLANKS;
3675 
3676     /*
3677      * Parse the DOCTYPE name.
3678      */
3679     name = htmlParseName(ctxt);
3680     if (name == NULL) {
3681 	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3682 	             "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3683 		     NULL, NULL);
3684     }
3685     /*
3686      * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3687      */
3688 
3689     SKIP_BLANKS;
3690 
3691     /*
3692      * Check for SystemID and ExternalID
3693      */
3694     URI = htmlParseExternalID(ctxt, &ExternalID);
3695     SKIP_BLANKS;
3696 
3697     /*
3698      * We should be at the end of the DOCTYPE declaration.
3699      */
3700     if (CUR != '>') {
3701 	htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3702 	             "DOCTYPE improperly terminated\n", NULL, NULL);
3703         /* Ignore bogus content */
3704         while ((CUR != 0) && (CUR != '>'))
3705             NEXT;
3706     }
3707     if (CUR == '>')
3708         NEXT;
3709 
3710     /*
3711      * Create or update the document accordingly to the DOCTYPE
3712      */
3713     if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3714 	(!ctxt->disableSAX))
3715 	ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3716 
3717     /*
3718      * Cleanup, since we don't use all those identifiers
3719      */
3720     if (URI != NULL) xmlFree(URI);
3721     if (ExternalID != NULL) xmlFree(ExternalID);
3722 }
3723 
3724 /**
3725  * htmlParseAttribute:
3726  * @ctxt:  an HTML parser context
3727  * @value:  a xmlChar ** used to store the value of the attribute
3728  *
3729  * parse an attribute
3730  *
3731  * [41] Attribute ::= Name Eq AttValue
3732  *
3733  * [25] Eq ::= S? '=' S?
3734  *
3735  * With namespace:
3736  *
3737  * [NS 11] Attribute ::= QName Eq AttValue
3738  *
3739  * Also the case QName == xmlns:??? is handled independently as a namespace
3740  * definition.
3741  *
3742  * Returns the attribute name, and the value in *value.
3743  */
3744 
3745 static const xmlChar *
htmlParseAttribute(htmlParserCtxtPtr ctxt,xmlChar ** value)3746 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3747     const xmlChar *name;
3748     xmlChar *val = NULL;
3749 
3750     *value = NULL;
3751     name = htmlParseHTMLName(ctxt);
3752     if (name == NULL) {
3753 	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3754 	             "error parsing attribute name\n", NULL, NULL);
3755         return(NULL);
3756     }
3757 
3758     /*
3759      * read the value
3760      */
3761     SKIP_BLANKS;
3762     if (CUR == '=') {
3763         NEXT;
3764 	SKIP_BLANKS;
3765 	val = htmlParseAttValue(ctxt);
3766     }
3767 
3768     *value = val;
3769     return(name);
3770 }
3771 
3772 /**
3773  * htmlCheckEncodingDirect:
3774  * @ctxt:  an HTML parser context
3775  * @attvalue: the attribute value
3776  *
3777  * Checks an attribute value to detect
3778  * the encoding
3779  * If a new encoding is detected the parser is switched to decode
3780  * it and pass UTF8
3781  */
3782 static void
htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt,const xmlChar * encoding)3783 htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
3784 
3785     if ((ctxt == NULL) || (encoding == NULL) ||
3786         (ctxt->options & HTML_PARSE_IGNORE_ENC))
3787 	return;
3788 
3789     /* do not change encoding */
3790     if (ctxt->input->encoding != NULL)
3791         return;
3792 
3793     if (encoding != NULL) {
3794 	xmlCharEncoding enc;
3795 	xmlCharEncodingHandlerPtr handler;
3796 
3797 	while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3798 
3799 	if (ctxt->input->encoding != NULL)
3800 	    xmlFree((xmlChar *) ctxt->input->encoding);
3801 	ctxt->input->encoding = xmlStrdup(encoding);
3802 
3803 	enc = xmlParseCharEncoding((const char *) encoding);
3804 	/*
3805 	 * registered set of known encodings
3806 	 */
3807 	if (enc != XML_CHAR_ENCODING_ERROR) {
3808 	    if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3809 	         (enc == XML_CHAR_ENCODING_UTF16BE) ||
3810 		 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3811 		 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3812 		(ctxt->input->buf != NULL) &&
3813 		(ctxt->input->buf->encoder == NULL)) {
3814 		htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3815 		             "htmlCheckEncoding: wrong encoding meta\n",
3816 			     NULL, NULL);
3817 	    } else {
3818 		xmlSwitchEncoding(ctxt, enc);
3819 	    }
3820 	    ctxt->charset = XML_CHAR_ENCODING_UTF8;
3821 	} else {
3822 	    /*
3823 	     * fallback for unknown encodings
3824 	     */
3825 	    handler = xmlFindCharEncodingHandler((const char *) encoding);
3826 	    if (handler != NULL) {
3827 		xmlSwitchToEncoding(ctxt, handler);
3828 		ctxt->charset = XML_CHAR_ENCODING_UTF8;
3829 	    } else {
3830 		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
3831 		             "htmlCheckEncoding: unknown encoding %s\n",
3832 			     encoding, NULL);
3833 	    }
3834 	}
3835 
3836 	if ((ctxt->input->buf != NULL) &&
3837 	    (ctxt->input->buf->encoder != NULL) &&
3838 	    (ctxt->input->buf->raw != NULL) &&
3839 	    (ctxt->input->buf->buffer != NULL)) {
3840 	    int nbchars;
3841 	    int processed;
3842 
3843 	    /*
3844 	     * convert as much as possible to the parser reading buffer.
3845 	     */
3846 	    processed = ctxt->input->cur - ctxt->input->base;
3847 	    xmlBufShrink(ctxt->input->buf->buffer, processed);
3848 	    nbchars = xmlCharEncInput(ctxt->input->buf, 1);
3849             xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input);
3850 	    if (nbchars < 0) {
3851 		htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3852 		             "htmlCheckEncoding: encoder error\n",
3853 			     NULL, NULL);
3854 	    }
3855 	}
3856     }
3857 }
3858 
3859 /**
3860  * htmlCheckEncoding:
3861  * @ctxt:  an HTML parser context
3862  * @attvalue: the attribute value
3863  *
3864  * Checks an http-equiv attribute from a Meta tag to detect
3865  * the encoding
3866  * If a new encoding is detected the parser is switched to decode
3867  * it and pass UTF8
3868  */
3869 static void
htmlCheckEncoding(htmlParserCtxtPtr ctxt,const xmlChar * attvalue)3870 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3871     const xmlChar *encoding;
3872 
3873     if (!attvalue)
3874 	return;
3875 
3876     encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3877     if (encoding != NULL) {
3878 	encoding += 7;
3879     }
3880     /*
3881      * skip blank
3882      */
3883     if (encoding && IS_BLANK_CH(*encoding))
3884 	encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3885     if (encoding && *encoding == '=') {
3886 	encoding ++;
3887 	htmlCheckEncodingDirect(ctxt, encoding);
3888     }
3889 }
3890 
3891 /**
3892  * htmlCheckMeta:
3893  * @ctxt:  an HTML parser context
3894  * @atts:  the attributes values
3895  *
3896  * Checks an attributes from a Meta tag
3897  */
3898 static void
htmlCheckMeta(htmlParserCtxtPtr ctxt,const xmlChar ** atts)3899 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3900     int i;
3901     const xmlChar *att, *value;
3902     int http = 0;
3903     const xmlChar *content = NULL;
3904 
3905     if ((ctxt == NULL) || (atts == NULL))
3906 	return;
3907 
3908     i = 0;
3909     att = atts[i++];
3910     while (att != NULL) {
3911 	value = atts[i++];
3912 	if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3913 	 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3914 	    http = 1;
3915 	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
3916 	    htmlCheckEncodingDirect(ctxt, value);
3917 	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3918 	    content = value;
3919 	att = atts[i++];
3920     }
3921     if ((http) && (content != NULL))
3922 	htmlCheckEncoding(ctxt, content);
3923 
3924 }
3925 
3926 /**
3927  * htmlParseStartTag:
3928  * @ctxt:  an HTML parser context
3929  *
3930  * parse a start of tag either for rule element or
3931  * EmptyElement. In both case we don't parse the tag closing chars.
3932  *
3933  * [40] STag ::= '<' Name (S Attribute)* S? '>'
3934  *
3935  * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3936  *
3937  * With namespace:
3938  *
3939  * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3940  *
3941  * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3942  *
3943  * Returns 0 in case of success, -1 in case of error and 1 if discarded
3944  */
3945 
3946 static int
htmlParseStartTag(htmlParserCtxtPtr ctxt)3947 htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3948     const xmlChar *name;
3949     const xmlChar *attname;
3950     xmlChar *attvalue;
3951     const xmlChar **atts;
3952     int nbatts = 0;
3953     int maxatts;
3954     int meta = 0;
3955     int i;
3956     int discardtag = 0;
3957 
3958     if ((ctxt == NULL) || (ctxt->input == NULL)) {
3959 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3960 		     "htmlParseStartTag: context error\n", NULL, NULL);
3961 	return -1;
3962     }
3963     if (ctxt->instate == XML_PARSER_EOF)
3964         return(-1);
3965     if (CUR != '<') return -1;
3966     NEXT;
3967 
3968     atts = ctxt->atts;
3969     maxatts = ctxt->maxatts;
3970 
3971     GROW;
3972     name = htmlParseHTMLName(ctxt);
3973     if (name == NULL) {
3974 	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3975 	             "htmlParseStartTag: invalid element name\n",
3976 		     NULL, NULL);
3977 	/* Dump the bogus tag like browsers do */
3978 	while ((CUR != 0) && (CUR != '>') &&
3979                (ctxt->instate != XML_PARSER_EOF))
3980 	    NEXT;
3981         return -1;
3982     }
3983     if (xmlStrEqual(name, BAD_CAST"meta"))
3984 	meta = 1;
3985 
3986     /*
3987      * Check for auto-closure of HTML elements.
3988      */
3989     htmlAutoClose(ctxt, name);
3990 
3991     /*
3992      * Check for implied HTML elements.
3993      */
3994     htmlCheckImplied(ctxt, name);
3995 
3996     /*
3997      * Avoid html at any level > 0, head at any level != 1
3998      * or any attempt to recurse body
3999      */
4000     if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
4001 	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4002 	             "htmlParseStartTag: misplaced <html> tag\n",
4003 		     name, NULL);
4004 	discardtag = 1;
4005 	ctxt->depth++;
4006     }
4007     if ((ctxt->nameNr != 1) &&
4008 	(xmlStrEqual(name, BAD_CAST"head"))) {
4009 	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4010 	             "htmlParseStartTag: misplaced <head> tag\n",
4011 		     name, NULL);
4012 	discardtag = 1;
4013 	ctxt->depth++;
4014     }
4015     if (xmlStrEqual(name, BAD_CAST"body")) {
4016 	int indx;
4017 	for (indx = 0;indx < ctxt->nameNr;indx++) {
4018 	    if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
4019 		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4020 		             "htmlParseStartTag: misplaced <body> tag\n",
4021 			     name, NULL);
4022 		discardtag = 1;
4023 		ctxt->depth++;
4024 	    }
4025 	}
4026     }
4027 
4028     /*
4029      * Now parse the attributes, it ends up with the ending
4030      *
4031      * (S Attribute)* S?
4032      */
4033     SKIP_BLANKS;
4034     while ((CUR != 0) &&
4035            (CUR != '>') &&
4036 	   ((CUR != '/') || (NXT(1) != '>'))) {
4037 	GROW;
4038 	attname = htmlParseAttribute(ctxt, &attvalue);
4039         if (attname != NULL) {
4040 
4041 	    /*
4042 	     * Well formedness requires at most one declaration of an attribute
4043 	     */
4044 	    for (i = 0; i < nbatts;i += 2) {
4045 	        if (xmlStrEqual(atts[i], attname)) {
4046 		    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
4047 		                 "Attribute %s redefined\n", attname, NULL);
4048 		    if (attvalue != NULL)
4049 			xmlFree(attvalue);
4050 		    goto failed;
4051 		}
4052 	    }
4053 
4054 	    /*
4055 	     * Add the pair to atts
4056 	     */
4057 	    if (atts == NULL) {
4058 	        maxatts = 22; /* allow for 10 attrs by default */
4059 	        atts = (const xmlChar **)
4060 		       xmlMalloc(maxatts * sizeof(xmlChar *));
4061 		if (atts == NULL) {
4062 		    htmlErrMemory(ctxt, NULL);
4063 		    if (attvalue != NULL)
4064 			xmlFree(attvalue);
4065 		    goto failed;
4066 		}
4067 		ctxt->atts = atts;
4068 		ctxt->maxatts = maxatts;
4069 	    } else if (nbatts + 4 > maxatts) {
4070 	        const xmlChar **n;
4071 
4072 	        maxatts *= 2;
4073 	        n = (const xmlChar **) xmlRealloc((void *) atts,
4074 					     maxatts * sizeof(const xmlChar *));
4075 		if (n == NULL) {
4076 		    htmlErrMemory(ctxt, NULL);
4077 		    if (attvalue != NULL)
4078 			xmlFree(attvalue);
4079 		    goto failed;
4080 		}
4081 		atts = n;
4082 		ctxt->atts = atts;
4083 		ctxt->maxatts = maxatts;
4084 	    }
4085 	    atts[nbatts++] = attname;
4086 	    atts[nbatts++] = attvalue;
4087 	    atts[nbatts] = NULL;
4088 	    atts[nbatts + 1] = NULL;
4089 	}
4090 	else {
4091 	    if (attvalue != NULL)
4092 	        xmlFree(attvalue);
4093 	    /* Dump the bogus attribute string up to the next blank or
4094 	     * the end of the tag. */
4095 	    while ((CUR != 0) &&
4096 	           !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
4097 		   ((CUR != '/') || (NXT(1) != '>')))
4098 		NEXT;
4099 	}
4100 
4101 failed:
4102 	SKIP_BLANKS;
4103     }
4104 
4105     /*
4106      * Handle specific association to the META tag
4107      */
4108     if (meta && (nbatts != 0))
4109 	htmlCheckMeta(ctxt, atts);
4110 
4111     /*
4112      * SAX: Start of Element !
4113      */
4114     if (!discardtag) {
4115 	htmlnamePush(ctxt, name);
4116 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
4117 	    if (nbatts != 0)
4118 		ctxt->sax->startElement(ctxt->userData, name, atts);
4119 	    else
4120 		ctxt->sax->startElement(ctxt->userData, name, NULL);
4121 	}
4122     }
4123 
4124     if (atts != NULL) {
4125         for (i = 1;i < nbatts;i += 2) {
4126 	    if (atts[i] != NULL)
4127 		xmlFree((xmlChar *) atts[i]);
4128 	}
4129     }
4130 
4131     return(discardtag);
4132 }
4133 
4134 /**
4135  * htmlParseEndTag:
4136  * @ctxt:  an HTML parser context
4137  *
4138  * parse an end of tag
4139  *
4140  * [42] ETag ::= '</' Name S? '>'
4141  *
4142  * With namespace
4143  *
4144  * [NS 9] ETag ::= '</' QName S? '>'
4145  *
4146  * Returns 1 if the current level should be closed.
4147  */
4148 
4149 static int
htmlParseEndTag(htmlParserCtxtPtr ctxt)4150 htmlParseEndTag(htmlParserCtxtPtr ctxt)
4151 {
4152     const xmlChar *name;
4153     const xmlChar *oldname;
4154     int i, ret;
4155 
4156     if ((CUR != '<') || (NXT(1) != '/')) {
4157         htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
4158 	             "htmlParseEndTag: '</' not found\n", NULL, NULL);
4159         return (0);
4160     }
4161     SKIP(2);
4162 
4163     name = htmlParseHTMLName(ctxt);
4164     if (name == NULL)
4165         return (0);
4166     /*
4167      * We should definitely be at the ending "S? '>'" part
4168      */
4169     SKIP_BLANKS;
4170     if (CUR != '>') {
4171         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4172 	             "End tag : expected '>'\n", NULL, NULL);
4173         /* Skip to next '>' */
4174         while ((CUR != 0) && (CUR != '>'))
4175             NEXT;
4176     }
4177     if (CUR == '>')
4178         NEXT;
4179 
4180     /*
4181      * if we ignored misplaced tags in htmlParseStartTag don't pop them
4182      * out now.
4183      */
4184     if ((ctxt->depth > 0) &&
4185         (xmlStrEqual(name, BAD_CAST "html") ||
4186          xmlStrEqual(name, BAD_CAST "body") ||
4187 	 xmlStrEqual(name, BAD_CAST "head"))) {
4188 	ctxt->depth--;
4189 	return (0);
4190     }
4191 
4192     /*
4193      * If the name read is not one of the element in the parsing stack
4194      * then return, it's just an error.
4195      */
4196     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4197         if (xmlStrEqual(name, ctxt->nameTab[i]))
4198             break;
4199     }
4200     if (i < 0) {
4201         htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4202 	             "Unexpected end tag : %s\n", name, NULL);
4203         return (0);
4204     }
4205 
4206 
4207     /*
4208      * Check for auto-closure of HTML elements.
4209      */
4210 
4211     htmlAutoCloseOnClose(ctxt, name);
4212 
4213     /*
4214      * Well formedness constraints, opening and closing must match.
4215      * With the exception that the autoclose may have popped stuff out
4216      * of the stack.
4217      */
4218     if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4219         htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4220                      "Opening and ending tag mismatch: %s and %s\n",
4221                      name, ctxt->name);
4222     }
4223 
4224     /*
4225      * SAX: End of Tag
4226      */
4227     oldname = ctxt->name;
4228     if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4229         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4230             ctxt->sax->endElement(ctxt->userData, name);
4231 	htmlNodeInfoPop(ctxt);
4232         htmlnamePop(ctxt);
4233         ret = 1;
4234     } else {
4235         ret = 0;
4236     }
4237 
4238     return (ret);
4239 }
4240 
4241 
4242 /**
4243  * htmlParseReference:
4244  * @ctxt:  an HTML parser context
4245  *
4246  * parse and handle entity references in content,
4247  * this will end-up in a call to character() since this is either a
4248  * CharRef, or a predefined entity.
4249  */
4250 static void
htmlParseReference(htmlParserCtxtPtr ctxt)4251 htmlParseReference(htmlParserCtxtPtr ctxt) {
4252     const htmlEntityDesc * ent;
4253     xmlChar out[6];
4254     const xmlChar *name;
4255     if (CUR != '&') return;
4256 
4257     if (NXT(1) == '#') {
4258 	unsigned int c;
4259 	int bits, i = 0;
4260 
4261 	c = htmlParseCharRef(ctxt);
4262 	if (c == 0)
4263 	    return;
4264 
4265         if      (c <    0x80) { out[i++]= c;                bits= -6; }
4266         else if (c <   0x800) { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
4267         else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
4268         else                  { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
4269 
4270         for ( ; bits >= 0; bits-= 6) {
4271             out[i++]= ((c >> bits) & 0x3F) | 0x80;
4272         }
4273 	out[i] = 0;
4274 
4275 	htmlCheckParagraph(ctxt);
4276 	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4277 	    ctxt->sax->characters(ctxt->userData, out, i);
4278     } else {
4279 	ent = htmlParseEntityRef(ctxt, &name);
4280 	if (name == NULL) {
4281 	    htmlCheckParagraph(ctxt);
4282 	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4283 	        ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4284 	    return;
4285 	}
4286 	if ((ent == NULL) || !(ent->value > 0)) {
4287 	    htmlCheckParagraph(ctxt);
4288 	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4289 		ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4290 		ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4291 		/* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
4292 	    }
4293 	} else {
4294 	    unsigned int c;
4295 	    int bits, i = 0;
4296 
4297 	    c = ent->value;
4298 	    if      (c <    0x80)
4299 	            { out[i++]= c;                bits= -6; }
4300 	    else if (c <   0x800)
4301 	            { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
4302 	    else if (c < 0x10000)
4303 	            { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
4304 	    else
4305 	            { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
4306 
4307 	    for ( ; bits >= 0; bits-= 6) {
4308 		out[i++]= ((c >> bits) & 0x3F) | 0x80;
4309 	    }
4310 	    out[i] = 0;
4311 
4312 	    htmlCheckParagraph(ctxt);
4313 	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4314 		ctxt->sax->characters(ctxt->userData, out, i);
4315 	}
4316     }
4317 }
4318 
4319 /**
4320  * htmlParseContent:
4321  * @ctxt:  an HTML parser context
4322  *
4323  * Parse a content: comment, sub-element, reference or text.
4324  * Kept for compatibility with old code
4325  */
4326 
4327 static void
htmlParseContent(htmlParserCtxtPtr ctxt)4328 htmlParseContent(htmlParserCtxtPtr ctxt) {
4329     xmlChar *currentNode;
4330     int depth;
4331     const xmlChar *name;
4332 
4333     currentNode = xmlStrdup(ctxt->name);
4334     depth = ctxt->nameNr;
4335     while (1) {
4336         GROW;
4337 
4338         if (ctxt->instate == XML_PARSER_EOF)
4339             break;
4340 
4341 	/*
4342 	 * Our tag or one of it's parent or children is ending.
4343 	 */
4344         if ((CUR == '<') && (NXT(1) == '/')) {
4345 	    if (htmlParseEndTag(ctxt) &&
4346 		((currentNode != NULL) || (ctxt->nameNr == 0))) {
4347 		if (currentNode != NULL)
4348 		    xmlFree(currentNode);
4349 		return;
4350 	    }
4351 	    continue; /* while */
4352         }
4353 
4354 	else if ((CUR == '<') &&
4355 	         ((IS_ASCII_LETTER(NXT(1))) ||
4356 		  (NXT(1) == '_') || (NXT(1) == ':'))) {
4357 	    name = htmlParseHTMLName_nonInvasive(ctxt);
4358 	    if (name == NULL) {
4359 	        htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4360 			 "htmlParseStartTag: invalid element name\n",
4361 			 NULL, NULL);
4362 	        /* Dump the bogus tag like browsers do */
4363                 while ((CUR != 0) && (CUR != '>'))
4364 	            NEXT;
4365 
4366 	        if (currentNode != NULL)
4367 	            xmlFree(currentNode);
4368 	        return;
4369 	    }
4370 
4371 	    if (ctxt->name != NULL) {
4372 	        if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4373 	            htmlAutoClose(ctxt, name);
4374 	            continue;
4375 	        }
4376 	    }
4377 	}
4378 
4379 	/*
4380 	 * Has this node been popped out during parsing of
4381 	 * the next element
4382 	 */
4383         if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4384 	    (!xmlStrEqual(currentNode, ctxt->name)))
4385 	     {
4386 	    if (currentNode != NULL) xmlFree(currentNode);
4387 	    return;
4388 	}
4389 
4390 	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4391 	    (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4392 	    /*
4393 	     * Handle SCRIPT/STYLE separately
4394 	     */
4395 	    htmlParseScript(ctxt);
4396 	}
4397 
4398         else if ((CUR == '<') && (NXT(1) == '!')) {
4399             /*
4400              * Sometimes DOCTYPE arrives in the middle of the document
4401              */
4402             if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
4403                 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4404                 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4405                 (UPP(8) == 'E')) {
4406                 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4407                              "Misplaced DOCTYPE declaration\n",
4408                              BAD_CAST "DOCTYPE" , NULL);
4409                 htmlParseDocTypeDecl(ctxt);
4410             }
4411             /*
4412              * First case :  a comment
4413              */
4414             else if ((NXT(2) == '-') && (NXT(3) == '-')) {
4415                 htmlParseComment(ctxt);
4416             }
4417             else {
4418                 htmlSkipBogusComment(ctxt);
4419             }
4420         }
4421 
4422         /*
4423          * Second case : a Processing Instruction.
4424          */
4425         else if ((CUR == '<') && (NXT(1) == '?')) {
4426             htmlParsePI(ctxt);
4427         }
4428 
4429         /*
4430          * Third case :  a sub-element.
4431          */
4432         else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
4433             htmlParseElement(ctxt);
4434         }
4435         else if (CUR == '<') {
4436             if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4437                 (ctxt->sax->characters != NULL))
4438                 ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4439             NEXT;
4440         }
4441 
4442         /*
4443          * Fourth case : a reference. If if has not been resolved,
4444          *    parsing returns it's Name, create the node
4445          */
4446         else if (CUR == '&') {
4447             htmlParseReference(ctxt);
4448         }
4449 
4450         /*
4451          * Fifth case : end of the resource
4452          */
4453         else if (CUR == 0) {
4454             htmlAutoCloseOnEnd(ctxt);
4455             break;
4456         }
4457 
4458         /*
4459          * Last case, text. Note that References are handled directly.
4460          */
4461         else {
4462             htmlParseCharData(ctxt);
4463         }
4464         GROW;
4465     }
4466     if (currentNode != NULL) xmlFree(currentNode);
4467 }
4468 
4469 /**
4470  * htmlParseElement:
4471  * @ctxt:  an HTML parser context
4472  *
4473  * parse an HTML element, this is highly recursive
4474  * this is kept for compatibility with previous code versions
4475  *
4476  * [39] element ::= EmptyElemTag | STag content ETag
4477  *
4478  * [41] Attribute ::= Name Eq AttValue
4479  */
4480 
4481 void
htmlParseElement(htmlParserCtxtPtr ctxt)4482 htmlParseElement(htmlParserCtxtPtr ctxt) {
4483     const xmlChar *name;
4484     xmlChar *currentNode = NULL;
4485     const htmlElemDesc * info;
4486     htmlParserNodeInfo node_info;
4487     int failed;
4488     int depth;
4489     const xmlChar *oldptr;
4490 
4491     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4492 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4493 		     "htmlParseElement: context error\n", NULL, NULL);
4494 	return;
4495     }
4496 
4497     if (ctxt->instate == XML_PARSER_EOF)
4498         return;
4499 
4500     /* Capture start position */
4501     if (ctxt->record_info) {
4502         node_info.begin_pos = ctxt->input->consumed +
4503                           (CUR_PTR - ctxt->input->base);
4504 	node_info.begin_line = ctxt->input->line;
4505     }
4506 
4507     failed = htmlParseStartTag(ctxt);
4508     name = ctxt->name;
4509     if ((failed == -1) || (name == NULL)) {
4510 	if (CUR == '>')
4511 	    NEXT;
4512         return;
4513     }
4514 
4515     /*
4516      * Lookup the info for that element.
4517      */
4518     info = htmlTagLookup(name);
4519     if (info == NULL) {
4520 	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4521 	             "Tag %s invalid\n", name, NULL);
4522     }
4523 
4524     /*
4525      * Check for an Empty Element labeled the XML/SGML way
4526      */
4527     if ((CUR == '/') && (NXT(1) == '>')) {
4528         SKIP(2);
4529 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4530 	    ctxt->sax->endElement(ctxt->userData, name);
4531 	htmlnamePop(ctxt);
4532 	return;
4533     }
4534 
4535     if (CUR == '>') {
4536         NEXT;
4537     } else {
4538 	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4539 	             "Couldn't find end of Start Tag %s\n", name, NULL);
4540 
4541 	/*
4542 	 * end of parsing of this node.
4543 	 */
4544 	if (xmlStrEqual(name, ctxt->name)) {
4545 	    nodePop(ctxt);
4546 	    htmlnamePop(ctxt);
4547 	}
4548 
4549 	/*
4550 	 * Capture end position and add node
4551 	 */
4552 	if (ctxt->record_info) {
4553 	   node_info.end_pos = ctxt->input->consumed +
4554 			      (CUR_PTR - ctxt->input->base);
4555 	   node_info.end_line = ctxt->input->line;
4556 	   node_info.node = ctxt->node;
4557 	   xmlParserAddNodeInfo(ctxt, &node_info);
4558 	}
4559 	return;
4560     }
4561 
4562     /*
4563      * Check for an Empty Element from DTD definition
4564      */
4565     if ((info != NULL) && (info->empty)) {
4566 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4567 	    ctxt->sax->endElement(ctxt->userData, name);
4568 	htmlnamePop(ctxt);
4569 	return;
4570     }
4571 
4572     /*
4573      * Parse the content of the element:
4574      */
4575     currentNode = xmlStrdup(ctxt->name);
4576     depth = ctxt->nameNr;
4577     while (CUR != 0) {
4578 	oldptr = ctxt->input->cur;
4579 	htmlParseContent(ctxt);
4580 	if (oldptr==ctxt->input->cur) break;
4581 	if (ctxt->nameNr < depth) break;
4582     }
4583 
4584     /*
4585      * Capture end position and add node
4586      */
4587     if ( currentNode != NULL && ctxt->record_info ) {
4588        node_info.end_pos = ctxt->input->consumed +
4589                           (CUR_PTR - ctxt->input->base);
4590        node_info.end_line = ctxt->input->line;
4591        node_info.node = ctxt->node;
4592        xmlParserAddNodeInfo(ctxt, &node_info);
4593     }
4594     if (CUR == 0) {
4595 	htmlAutoCloseOnEnd(ctxt);
4596     }
4597 
4598     if (currentNode != NULL)
4599 	xmlFree(currentNode);
4600 }
4601 
4602 static void
htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt)4603 htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4604     /*
4605      * Capture end position and add node
4606      */
4607     if ( ctxt->node != NULL && ctxt->record_info ) {
4608        ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4609                                 (CUR_PTR - ctxt->input->base);
4610        ctxt->nodeInfo->end_line = ctxt->input->line;
4611        ctxt->nodeInfo->node = ctxt->node;
4612        xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4613        htmlNodeInfoPop(ctxt);
4614     }
4615     if (CUR == 0) {
4616        htmlAutoCloseOnEnd(ctxt);
4617     }
4618 }
4619 
4620 /**
4621  * htmlParseElementInternal:
4622  * @ctxt:  an HTML parser context
4623  *
4624  * parse an HTML element, new version, non recursive
4625  *
4626  * [39] element ::= EmptyElemTag | STag content ETag
4627  *
4628  * [41] Attribute ::= Name Eq AttValue
4629  */
4630 
4631 static void
htmlParseElementInternal(htmlParserCtxtPtr ctxt)4632 htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4633     const xmlChar *name;
4634     const htmlElemDesc * info;
4635     htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
4636     int failed;
4637 
4638     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4639 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4640 		     "htmlParseElementInternal: context error\n", NULL, NULL);
4641 	return;
4642     }
4643 
4644     if (ctxt->instate == XML_PARSER_EOF)
4645         return;
4646 
4647     /* Capture start position */
4648     if (ctxt->record_info) {
4649         node_info.begin_pos = ctxt->input->consumed +
4650                           (CUR_PTR - ctxt->input->base);
4651 	node_info.begin_line = ctxt->input->line;
4652     }
4653 
4654     failed = htmlParseStartTag(ctxt);
4655     name = ctxt->name;
4656     if ((failed == -1) || (name == NULL)) {
4657 	if (CUR == '>')
4658 	    NEXT;
4659         return;
4660     }
4661 
4662     /*
4663      * Lookup the info for that element.
4664      */
4665     info = htmlTagLookup(name);
4666     if (info == NULL) {
4667 	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4668 	             "Tag %s invalid\n", name, NULL);
4669     }
4670 
4671     /*
4672      * Check for an Empty Element labeled the XML/SGML way
4673      */
4674     if ((CUR == '/') && (NXT(1) == '>')) {
4675         SKIP(2);
4676 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4677 	    ctxt->sax->endElement(ctxt->userData, name);
4678 	htmlnamePop(ctxt);
4679 	return;
4680     }
4681 
4682     if (CUR == '>') {
4683         NEXT;
4684     } else {
4685 	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4686 	             "Couldn't find end of Start Tag %s\n", name, NULL);
4687 
4688 	/*
4689 	 * end of parsing of this node.
4690 	 */
4691 	if (xmlStrEqual(name, ctxt->name)) {
4692 	    nodePop(ctxt);
4693 	    htmlnamePop(ctxt);
4694 	}
4695 
4696         if (ctxt->record_info)
4697             htmlNodeInfoPush(ctxt, &node_info);
4698         htmlParserFinishElementParsing(ctxt);
4699 	return;
4700     }
4701 
4702     /*
4703      * Check for an Empty Element from DTD definition
4704      */
4705     if ((info != NULL) && (info->empty)) {
4706 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4707 	    ctxt->sax->endElement(ctxt->userData, name);
4708 	htmlnamePop(ctxt);
4709 	return;
4710     }
4711 
4712     if (ctxt->record_info)
4713         htmlNodeInfoPush(ctxt, &node_info);
4714 }
4715 
4716 /**
4717  * htmlParseContentInternal:
4718  * @ctxt:  an HTML parser context
4719  *
4720  * Parse a content: comment, sub-element, reference or text.
4721  * New version for non recursive htmlParseElementInternal
4722  */
4723 
4724 static void
htmlParseContentInternal(htmlParserCtxtPtr ctxt)4725 htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4726     xmlChar *currentNode;
4727     int depth;
4728     const xmlChar *name;
4729 
4730     currentNode = xmlStrdup(ctxt->name);
4731     depth = ctxt->nameNr;
4732     while (1) {
4733         GROW;
4734 
4735         if (ctxt->instate == XML_PARSER_EOF)
4736             break;
4737 
4738 	/*
4739 	 * Our tag or one of it's parent or children is ending.
4740 	 */
4741         if ((CUR == '<') && (NXT(1) == '/')) {
4742 	    if (htmlParseEndTag(ctxt) &&
4743 		((currentNode != NULL) || (ctxt->nameNr == 0))) {
4744 		if (currentNode != NULL)
4745 		    xmlFree(currentNode);
4746 
4747 	        currentNode = xmlStrdup(ctxt->name);
4748 	        depth = ctxt->nameNr;
4749 	    }
4750 	    continue; /* while */
4751         }
4752 
4753 	else if ((CUR == '<') &&
4754 	         ((IS_ASCII_LETTER(NXT(1))) ||
4755 		  (NXT(1) == '_') || (NXT(1) == ':'))) {
4756 	    name = htmlParseHTMLName_nonInvasive(ctxt);
4757 	    if (name == NULL) {
4758 	        htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4759 			 "htmlParseStartTag: invalid element name\n",
4760 			 NULL, NULL);
4761 	        /* Dump the bogus tag like browsers do */
4762 	        while ((CUR == 0) && (CUR != '>'))
4763 	            NEXT;
4764 
4765 	        htmlParserFinishElementParsing(ctxt);
4766 	        if (currentNode != NULL)
4767 	            xmlFree(currentNode);
4768 
4769 	        currentNode = xmlStrdup(ctxt->name);
4770 	        depth = ctxt->nameNr;
4771 	        continue;
4772 	    }
4773 
4774 	    if (ctxt->name != NULL) {
4775 	        if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4776 	            htmlAutoClose(ctxt, name);
4777 	            continue;
4778 	        }
4779 	    }
4780 	}
4781 
4782 	/*
4783 	 * Has this node been popped out during parsing of
4784 	 * the next element
4785 	 */
4786         if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4787 	    (!xmlStrEqual(currentNode, ctxt->name)))
4788 	     {
4789 	    htmlParserFinishElementParsing(ctxt);
4790 	    if (currentNode != NULL) xmlFree(currentNode);
4791 
4792 	    currentNode = xmlStrdup(ctxt->name);
4793 	    depth = ctxt->nameNr;
4794 	    continue;
4795 	}
4796 
4797 	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4798 	    (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4799 	    /*
4800 	     * Handle SCRIPT/STYLE separately
4801 	     */
4802 	    htmlParseScript(ctxt);
4803 	}
4804 
4805         else if ((CUR == '<') && (NXT(1) == '!')) {
4806             /*
4807              * Sometimes DOCTYPE arrives in the middle of the document
4808              */
4809             if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
4810                 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4811                 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4812                 (UPP(8) == 'E')) {
4813                 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4814                              "Misplaced DOCTYPE declaration\n",
4815                              BAD_CAST "DOCTYPE" , NULL);
4816                 htmlParseDocTypeDecl(ctxt);
4817             }
4818             /*
4819              * First case :  a comment
4820              */
4821             else if ((NXT(2) == '-') && (NXT(3) == '-')) {
4822                 htmlParseComment(ctxt);
4823             }
4824             else {
4825                 htmlSkipBogusComment(ctxt);
4826             }
4827         }
4828 
4829         /*
4830          * Second case : a Processing Instruction.
4831          */
4832         else if ((CUR == '<') && (NXT(1) == '?')) {
4833             htmlParsePI(ctxt);
4834         }
4835 
4836         /*
4837          * Third case :  a sub-element.
4838          */
4839         else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
4840             htmlParseElementInternal(ctxt);
4841             if (currentNode != NULL) xmlFree(currentNode);
4842 
4843             currentNode = xmlStrdup(ctxt->name);
4844             depth = ctxt->nameNr;
4845         }
4846         else if (CUR == '<') {
4847             if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4848                 (ctxt->sax->characters != NULL))
4849                 ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4850             NEXT;
4851         }
4852 
4853         /*
4854          * Fourth case : a reference. If if has not been resolved,
4855          *    parsing returns it's Name, create the node
4856          */
4857         else if (CUR == '&') {
4858             htmlParseReference(ctxt);
4859         }
4860 
4861         /*
4862          * Fifth case : end of the resource
4863          */
4864         else if (CUR == 0) {
4865             htmlAutoCloseOnEnd(ctxt);
4866             break;
4867         }
4868 
4869         /*
4870          * Last case, text. Note that References are handled directly.
4871          */
4872         else {
4873             htmlParseCharData(ctxt);
4874         }
4875         GROW;
4876     }
4877     if (currentNode != NULL) xmlFree(currentNode);
4878 }
4879 
4880 /**
4881  * htmlParseContent:
4882  * @ctxt:  an HTML parser context
4883  *
4884  * Parse a content: comment, sub-element, reference or text.
4885  * This is the entry point when called from parser.c
4886  */
4887 
4888 void
__htmlParseContent(void * ctxt)4889 __htmlParseContent(void *ctxt) {
4890     if (ctxt != NULL)
4891 	htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4892 }
4893 
4894 /**
4895  * htmlParseDocument:
4896  * @ctxt:  an HTML parser context
4897  *
4898  * parse an HTML document (and build a tree if using the standard SAX
4899  * interface).
4900  *
4901  * Returns 0, -1 in case of error. the parser context is augmented
4902  *                as a result of the parsing.
4903  */
4904 
4905 int
htmlParseDocument(htmlParserCtxtPtr ctxt)4906 htmlParseDocument(htmlParserCtxtPtr ctxt) {
4907     xmlChar start[4];
4908     xmlCharEncoding enc;
4909     xmlDtdPtr dtd;
4910 
4911     xmlInitParser();
4912 
4913     htmlDefaultSAXHandlerInit();
4914 
4915     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4916 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4917 		     "htmlParseDocument: context error\n", NULL, NULL);
4918 	return(XML_ERR_INTERNAL_ERROR);
4919     }
4920     ctxt->html = 1;
4921     ctxt->linenumbers = 1;
4922     GROW;
4923     /*
4924      * SAX: beginning of the document processing.
4925      */
4926     if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4927         ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4928 
4929     if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4930         ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4931 	/*
4932 	 * Get the 4 first bytes and decode the charset
4933 	 * if enc != XML_CHAR_ENCODING_NONE
4934 	 * plug some encoding conversion routines.
4935 	 */
4936 	start[0] = RAW;
4937 	start[1] = NXT(1);
4938 	start[2] = NXT(2);
4939 	start[3] = NXT(3);
4940 	enc = xmlDetectCharEncoding(&start[0], 4);
4941 	if (enc != XML_CHAR_ENCODING_NONE) {
4942 	    xmlSwitchEncoding(ctxt, enc);
4943 	}
4944     }
4945 
4946     /*
4947      * Wipe out everything which is before the first '<'
4948      */
4949     SKIP_BLANKS;
4950     if (CUR == 0) {
4951 	htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4952 	             "Document is empty\n", NULL, NULL);
4953     }
4954 
4955     if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4956 	ctxt->sax->startDocument(ctxt->userData);
4957 
4958 
4959     /*
4960      * Parse possible comments and PIs before any content
4961      */
4962     while (((CUR == '<') && (NXT(1) == '!') &&
4963             (NXT(2) == '-') && (NXT(3) == '-')) ||
4964 	   ((CUR == '<') && (NXT(1) == '?'))) {
4965         htmlParseComment(ctxt);
4966         htmlParsePI(ctxt);
4967 	SKIP_BLANKS;
4968     }
4969 
4970 
4971     /*
4972      * Then possibly doc type declaration(s) and more Misc
4973      * (doctypedecl Misc*)?
4974      */
4975     if ((CUR == '<') && (NXT(1) == '!') &&
4976 	(UPP(2) == 'D') && (UPP(3) == 'O') &&
4977 	(UPP(4) == 'C') && (UPP(5) == 'T') &&
4978 	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4979 	(UPP(8) == 'E')) {
4980 	htmlParseDocTypeDecl(ctxt);
4981     }
4982     SKIP_BLANKS;
4983 
4984     /*
4985      * Parse possible comments and PIs before any content
4986      */
4987     while (((CUR == '<') && (NXT(1) == '!') &&
4988             (NXT(2) == '-') && (NXT(3) == '-')) ||
4989 	   ((CUR == '<') && (NXT(1) == '?'))) {
4990         htmlParseComment(ctxt);
4991         htmlParsePI(ctxt);
4992 	SKIP_BLANKS;
4993     }
4994 
4995     /*
4996      * Time to start parsing the tree itself
4997      */
4998     htmlParseContentInternal(ctxt);
4999 
5000     /*
5001      * autoclose
5002      */
5003     if (CUR == 0)
5004 	htmlAutoCloseOnEnd(ctxt);
5005 
5006 
5007     /*
5008      * SAX: end of the document processing.
5009      */
5010     if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5011         ctxt->sax->endDocument(ctxt->userData);
5012 
5013     if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
5014 	dtd = xmlGetIntSubset(ctxt->myDoc);
5015 	if (dtd == NULL)
5016 	    ctxt->myDoc->intSubset =
5017 		xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
5018 		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5019 		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5020     }
5021     if (! ctxt->wellFormed) return(-1);
5022     return(0);
5023 }
5024 
5025 
5026 /************************************************************************
5027  *									*
5028  *			Parser contexts handling			*
5029  *									*
5030  ************************************************************************/
5031 
5032 /**
5033  * htmlInitParserCtxt:
5034  * @ctxt:  an HTML parser context
5035  *
5036  * Initialize a parser context
5037  *
5038  * Returns 0 in case of success and -1 in case of error
5039  */
5040 
5041 static int
htmlInitParserCtxt(htmlParserCtxtPtr ctxt)5042 htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
5043 {
5044     htmlSAXHandler *sax;
5045 
5046     if (ctxt == NULL) return(-1);
5047     memset(ctxt, 0, sizeof(htmlParserCtxt));
5048 
5049     ctxt->dict = xmlDictCreate();
5050     if (ctxt->dict == NULL) {
5051         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5052 	return(-1);
5053     }
5054     sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
5055     if (sax == NULL) {
5056         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5057 	return(-1);
5058     }
5059     memset(sax, 0, sizeof(htmlSAXHandler));
5060 
5061     /* Allocate the Input stack */
5062     ctxt->inputTab = (htmlParserInputPtr *)
5063                       xmlMalloc(5 * sizeof(htmlParserInputPtr));
5064     if (ctxt->inputTab == NULL) {
5065         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5066 	ctxt->inputNr = 0;
5067 	ctxt->inputMax = 0;
5068 	ctxt->input = NULL;
5069 	return(-1);
5070     }
5071     ctxt->inputNr = 0;
5072     ctxt->inputMax = 5;
5073     ctxt->input = NULL;
5074     ctxt->version = NULL;
5075     ctxt->encoding = NULL;
5076     ctxt->standalone = -1;
5077     ctxt->instate = XML_PARSER_START;
5078 
5079     /* Allocate the Node stack */
5080     ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
5081     if (ctxt->nodeTab == NULL) {
5082         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5083 	ctxt->nodeNr = 0;
5084 	ctxt->nodeMax = 0;
5085 	ctxt->node = NULL;
5086 	ctxt->inputNr = 0;
5087 	ctxt->inputMax = 0;
5088 	ctxt->input = NULL;
5089 	return(-1);
5090     }
5091     ctxt->nodeNr = 0;
5092     ctxt->nodeMax = 10;
5093     ctxt->node = NULL;
5094 
5095     /* Allocate the Name stack */
5096     ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
5097     if (ctxt->nameTab == NULL) {
5098         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5099 	ctxt->nameNr = 0;
5100 	ctxt->nameMax = 0;
5101 	ctxt->name = NULL;
5102 	ctxt->nodeNr = 0;
5103 	ctxt->nodeMax = 0;
5104 	ctxt->node = NULL;
5105 	ctxt->inputNr = 0;
5106 	ctxt->inputMax = 0;
5107 	ctxt->input = NULL;
5108 	return(-1);
5109     }
5110     ctxt->nameNr = 0;
5111     ctxt->nameMax = 10;
5112     ctxt->name = NULL;
5113 
5114     ctxt->nodeInfoTab = NULL;
5115     ctxt->nodeInfoNr  = 0;
5116     ctxt->nodeInfoMax = 0;
5117 
5118     ctxt->sax = sax;
5119     xmlSAX2InitHtmlDefaultSAXHandler(sax);
5120 
5121     ctxt->userData = ctxt;
5122     ctxt->myDoc = NULL;
5123     ctxt->wellFormed = 1;
5124     ctxt->replaceEntities = 0;
5125     ctxt->linenumbers = xmlLineNumbersDefaultValue;
5126     ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
5127     ctxt->html = 1;
5128     ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT;
5129     ctxt->vctxt.userData = ctxt;
5130     ctxt->vctxt.error = xmlParserValidityError;
5131     ctxt->vctxt.warning = xmlParserValidityWarning;
5132     ctxt->record_info = 0;
5133     ctxt->validate = 0;
5134     ctxt->checkIndex = 0;
5135     ctxt->catalogs = NULL;
5136     xmlInitNodeInfoSeq(&ctxt->node_seq);
5137     return(0);
5138 }
5139 
5140 /**
5141  * htmlFreeParserCtxt:
5142  * @ctxt:  an HTML parser context
5143  *
5144  * Free all the memory used by a parser context. However the parsed
5145  * document in ctxt->myDoc is not freed.
5146  */
5147 
5148 void
htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)5149 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
5150 {
5151     xmlFreeParserCtxt(ctxt);
5152 }
5153 
5154 /**
5155  * htmlNewParserCtxt:
5156  *
5157  * Allocate and initialize a new parser context.
5158  *
5159  * Returns the htmlParserCtxtPtr or NULL in case of allocation error
5160  */
5161 
5162 htmlParserCtxtPtr
htmlNewParserCtxt(void)5163 htmlNewParserCtxt(void)
5164 {
5165     xmlParserCtxtPtr ctxt;
5166 
5167     ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
5168     if (ctxt == NULL) {
5169         htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
5170 	return(NULL);
5171     }
5172     memset(ctxt, 0, sizeof(xmlParserCtxt));
5173     if (htmlInitParserCtxt(ctxt) < 0) {
5174         htmlFreeParserCtxt(ctxt);
5175 	return(NULL);
5176     }
5177     return(ctxt);
5178 }
5179 
5180 /**
5181  * htmlCreateMemoryParserCtxt:
5182  * @buffer:  a pointer to a char array
5183  * @size:  the size of the array
5184  *
5185  * Create a parser context for an HTML in-memory document.
5186  *
5187  * Returns the new parser context or NULL
5188  */
5189 htmlParserCtxtPtr
htmlCreateMemoryParserCtxt(const char * buffer,int size)5190 htmlCreateMemoryParserCtxt(const char *buffer, int size) {
5191     xmlParserCtxtPtr ctxt;
5192     xmlParserInputPtr input;
5193     xmlParserInputBufferPtr buf;
5194 
5195     if (buffer == NULL)
5196 	return(NULL);
5197     if (size <= 0)
5198 	return(NULL);
5199 
5200     ctxt = htmlNewParserCtxt();
5201     if (ctxt == NULL)
5202 	return(NULL);
5203 
5204     buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
5205     if (buf == NULL) return(NULL);
5206 
5207     input = xmlNewInputStream(ctxt);
5208     if (input == NULL) {
5209 	xmlFreeParserInputBuffer(buf);
5210 	xmlFreeParserCtxt(ctxt);
5211 	return(NULL);
5212     }
5213 
5214     input->filename = NULL;
5215     input->buf = buf;
5216     xmlBufResetInput(buf->buffer, input);
5217 
5218     inputPush(ctxt, input);
5219     return(ctxt);
5220 }
5221 
5222 /**
5223  * htmlCreateDocParserCtxt:
5224  * @cur:  a pointer to an array of xmlChar
5225  * @encoding:  a free form C string describing the HTML document encoding, or NULL
5226  *
5227  * Create a parser context for an HTML document.
5228  *
5229  * TODO: check the need to add encoding handling there
5230  *
5231  * Returns the new parser context or NULL
5232  */
5233 static htmlParserCtxtPtr
htmlCreateDocParserCtxt(const xmlChar * cur,const char * encoding)5234 htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
5235     int len;
5236     htmlParserCtxtPtr ctxt;
5237 
5238     if (cur == NULL)
5239 	return(NULL);
5240     len = xmlStrlen(cur);
5241     ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
5242     if (ctxt == NULL)
5243 	return(NULL);
5244 
5245     if (encoding != NULL) {
5246 	xmlCharEncoding enc;
5247 	xmlCharEncodingHandlerPtr handler;
5248 
5249 	if (ctxt->input->encoding != NULL)
5250 	    xmlFree((xmlChar *) ctxt->input->encoding);
5251 	ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
5252 
5253 	enc = xmlParseCharEncoding(encoding);
5254 	/*
5255 	 * registered set of known encodings
5256 	 */
5257 	if (enc != XML_CHAR_ENCODING_ERROR) {
5258 	    xmlSwitchEncoding(ctxt, enc);
5259 	    if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
5260 		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5261 		             "Unsupported encoding %s\n",
5262 			     (const xmlChar *) encoding, NULL);
5263 	    }
5264 	} else {
5265 	    /*
5266 	     * fallback for unknown encodings
5267 	     */
5268 	    handler = xmlFindCharEncodingHandler((const char *) encoding);
5269 	    if (handler != NULL) {
5270 		xmlSwitchToEncoding(ctxt, handler);
5271 	    } else {
5272 		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5273 		             "Unsupported encoding %s\n",
5274 			     (const xmlChar *) encoding, NULL);
5275 	    }
5276 	}
5277     }
5278     return(ctxt);
5279 }
5280 
5281 #ifdef LIBXML_PUSH_ENABLED
5282 /************************************************************************
5283  *									*
5284  *	Progressive parsing interfaces				*
5285  *									*
5286  ************************************************************************/
5287 
5288 /**
5289  * htmlParseLookupSequence:
5290  * @ctxt:  an HTML parser context
5291  * @first:  the first char to lookup
5292  * @next:  the next char to lookup or zero
5293  * @third:  the next char to lookup or zero
5294  * @ignoreattrval: skip over attribute values
5295  *
5296  * Try to find if a sequence (first, next, third) or  just (first next) or
5297  * (first) is available in the input stream.
5298  * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5299  * to avoid rescanning sequences of bytes, it DOES change the state of the
5300  * parser, do not use liberally.
5301  * This is basically similar to xmlParseLookupSequence()
5302  *
5303  * Returns the index to the current parsing point if the full sequence
5304  *      is available, -1 otherwise.
5305  */
5306 static int
htmlParseLookupSequence(htmlParserCtxtPtr ctxt,xmlChar first,xmlChar next,xmlChar third,int ignoreattrval)5307 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
5308                         xmlChar next, xmlChar third, int ignoreattrval)
5309 {
5310     int base, len;
5311     htmlParserInputPtr in;
5312     const xmlChar *buf;
5313     int invalue = 0;
5314     char valdellim = 0x0;
5315 
5316     in = ctxt->input;
5317     if (in == NULL)
5318         return (-1);
5319 
5320     base = in->cur - in->base;
5321     if (base < 0)
5322         return (-1);
5323 
5324     if (ctxt->checkIndex > base) {
5325         base = ctxt->checkIndex;
5326         /* Abuse hasPErefs member to restore current state. */
5327         invalue = ctxt->hasPErefs & 1 ? 1 : 0;
5328     }
5329 
5330     if (in->buf == NULL) {
5331         buf = in->base;
5332         len = in->length;
5333     } else {
5334         buf = xmlBufContent(in->buf->buffer);
5335         len = xmlBufUse(in->buf->buffer);
5336     }
5337 
5338     /* take into account the sequence length */
5339     if (third)
5340         len -= 2;
5341     else if (next)
5342         len--;
5343     for (; base < len; base++) {
5344         if (ignoreattrval) {
5345             if (buf[base] == '"' || buf[base] == '\'') {
5346                 if (invalue) {
5347                     if (buf[base] == valdellim) {
5348                         invalue = 0;
5349                         continue;
5350                     }
5351                 } else {
5352                     valdellim = buf[base];
5353                     invalue = 1;
5354                     continue;
5355                 }
5356             } else if (invalue) {
5357                 continue;
5358             }
5359         }
5360         if (buf[base] == first) {
5361             if (third != 0) {
5362                 if ((buf[base + 1] != next) || (buf[base + 2] != third))
5363                     continue;
5364             } else if (next != 0) {
5365                 if (buf[base + 1] != next)
5366                     continue;
5367             }
5368             ctxt->checkIndex = 0;
5369 #ifdef DEBUG_PUSH
5370             if (next == 0)
5371                 xmlGenericError(xmlGenericErrorContext,
5372                                 "HPP: lookup '%c' found at %d\n",
5373                                 first, base);
5374             else if (third == 0)
5375                 xmlGenericError(xmlGenericErrorContext,
5376                                 "HPP: lookup '%c%c' found at %d\n",
5377                                 first, next, base);
5378             else
5379                 xmlGenericError(xmlGenericErrorContext,
5380                                 "HPP: lookup '%c%c%c' found at %d\n",
5381                                 first, next, third, base);
5382 #endif
5383             return (base - (in->cur - in->base));
5384         }
5385     }
5386     ctxt->checkIndex = base;
5387     /* Abuse hasPErefs member to track current state. */
5388     if (invalue)
5389         ctxt->hasPErefs |= 1;
5390     else
5391         ctxt->hasPErefs &= ~1;
5392 #ifdef DEBUG_PUSH
5393     if (next == 0)
5394         xmlGenericError(xmlGenericErrorContext,
5395                         "HPP: lookup '%c' failed\n", first);
5396     else if (third == 0)
5397         xmlGenericError(xmlGenericErrorContext,
5398                         "HPP: lookup '%c%c' failed\n", first, next);
5399     else
5400         xmlGenericError(xmlGenericErrorContext,
5401                         "HPP: lookup '%c%c%c' failed\n", first, next,
5402                         third);
5403 #endif
5404     return (-1);
5405 }
5406 
5407 /**
5408  * htmlParseLookupCommentEnd:
5409  * @ctxt: an HTML parser context
5410  *
5411  * Try to find a comment end tag in the input stream
5412  * The search includes "-->" as well as WHATWG-recommended incorrectly-closed tags.
5413  * (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment)
5414  * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5415  * to avoid rescanning sequences of bytes, it DOES change the state of the
5416  * parser, do not use liberally.
5417  * This wraps to htmlParseLookupSequence()
5418  *
5419  * Returns the index to the current parsing point if the full sequence is available, -1 otherwise.
5420  */
5421 static int
htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)5422 htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
5423 {
5424     int mark = 0;
5425     int cur = CUR_PTR - BASE_PTR;
5426 
5427     while (mark >= 0) {
5428 	mark = htmlParseLookupSequence(ctxt, '-', '-', 0, 0);
5429 	if ((mark < 0) ||
5430 	    (NXT(mark+2) == '>') ||
5431 	    ((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
5432 	    return mark;
5433 	}
5434 	ctxt->checkIndex = cur + mark + 1;
5435     }
5436     return mark;
5437 }
5438 
5439 
5440 /**
5441  * htmlParseTryOrFinish:
5442  * @ctxt:  an HTML parser context
5443  * @terminate:  last chunk indicator
5444  *
5445  * Try to progress on parsing
5446  *
5447  * Returns zero if no parsing was possible
5448  */
5449 static int
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt,int terminate)5450 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5451     int ret = 0;
5452     htmlParserInputPtr in;
5453     ptrdiff_t avail = 0;
5454     xmlChar cur, next;
5455 
5456     htmlParserNodeInfo node_info;
5457 
5458 #ifdef DEBUG_PUSH
5459     switch (ctxt->instate) {
5460 	case XML_PARSER_EOF:
5461 	    xmlGenericError(xmlGenericErrorContext,
5462 		    "HPP: try EOF\n"); break;
5463 	case XML_PARSER_START:
5464 	    xmlGenericError(xmlGenericErrorContext,
5465 		    "HPP: try START\n"); break;
5466 	case XML_PARSER_MISC:
5467 	    xmlGenericError(xmlGenericErrorContext,
5468 		    "HPP: try MISC\n");break;
5469 	case XML_PARSER_COMMENT:
5470 	    xmlGenericError(xmlGenericErrorContext,
5471 		    "HPP: try COMMENT\n");break;
5472 	case XML_PARSER_PROLOG:
5473 	    xmlGenericError(xmlGenericErrorContext,
5474 		    "HPP: try PROLOG\n");break;
5475 	case XML_PARSER_START_TAG:
5476 	    xmlGenericError(xmlGenericErrorContext,
5477 		    "HPP: try START_TAG\n");break;
5478 	case XML_PARSER_CONTENT:
5479 	    xmlGenericError(xmlGenericErrorContext,
5480 		    "HPP: try CONTENT\n");break;
5481 	case XML_PARSER_CDATA_SECTION:
5482 	    xmlGenericError(xmlGenericErrorContext,
5483 		    "HPP: try CDATA_SECTION\n");break;
5484 	case XML_PARSER_END_TAG:
5485 	    xmlGenericError(xmlGenericErrorContext,
5486 		    "HPP: try END_TAG\n");break;
5487 	case XML_PARSER_ENTITY_DECL:
5488 	    xmlGenericError(xmlGenericErrorContext,
5489 		    "HPP: try ENTITY_DECL\n");break;
5490 	case XML_PARSER_ENTITY_VALUE:
5491 	    xmlGenericError(xmlGenericErrorContext,
5492 		    "HPP: try ENTITY_VALUE\n");break;
5493 	case XML_PARSER_ATTRIBUTE_VALUE:
5494 	    xmlGenericError(xmlGenericErrorContext,
5495 		    "HPP: try ATTRIBUTE_VALUE\n");break;
5496 	case XML_PARSER_DTD:
5497 	    xmlGenericError(xmlGenericErrorContext,
5498 		    "HPP: try DTD\n");break;
5499 	case XML_PARSER_EPILOG:
5500 	    xmlGenericError(xmlGenericErrorContext,
5501 		    "HPP: try EPILOG\n");break;
5502 	case XML_PARSER_PI:
5503 	    xmlGenericError(xmlGenericErrorContext,
5504 		    "HPP: try PI\n");break;
5505 	case XML_PARSER_SYSTEM_LITERAL:
5506 	    xmlGenericError(xmlGenericErrorContext,
5507 		    "HPP: try SYSTEM_LITERAL\n");break;
5508     }
5509 #endif
5510 
5511     while (1) {
5512 
5513 	in = ctxt->input;
5514 	if (in == NULL) break;
5515 	if (in->buf == NULL)
5516 	    avail = in->length - (in->cur - in->base);
5517 	else
5518 	    avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5519                     (in->cur - in->base);
5520 	if ((avail == 0) && (terminate)) {
5521 	    htmlAutoCloseOnEnd(ctxt);
5522 	    if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5523 		/*
5524 		 * SAX: end of the document processing.
5525 		 */
5526 		ctxt->instate = XML_PARSER_EOF;
5527 		if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5528 		    ctxt->sax->endDocument(ctxt->userData);
5529 	    }
5530 	}
5531         if (avail < 1)
5532 	    goto done;
5533         /*
5534          * This is done to make progress and avoid an infinite loop
5535          * if a parsing attempt was aborted by hitting a NUL byte. After
5536          * changing htmlCurrentChar, this probably isn't necessary anymore.
5537          * We should consider removing this check.
5538          */
5539 	cur = in->cur[0];
5540 	if (cur == 0) {
5541 	    SKIP(1);
5542 	    continue;
5543 	}
5544 
5545         switch (ctxt->instate) {
5546             case XML_PARSER_EOF:
5547 	        /*
5548 		 * Document parsing is done !
5549 		 */
5550 	        goto done;
5551             case XML_PARSER_START:
5552 	        /*
5553 		 * Very first chars read from the document flow.
5554 		 */
5555 		cur = in->cur[0];
5556 		if (IS_BLANK_CH(cur)) {
5557 		    SKIP_BLANKS;
5558 		    if (in->buf == NULL)
5559 			avail = in->length - (in->cur - in->base);
5560 		    else
5561 			avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5562                                 (in->cur - in->base);
5563 		}
5564 		if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5565 		    ctxt->sax->setDocumentLocator(ctxt->userData,
5566 						  &xmlDefaultSAXLocator);
5567 		if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5568 	            (!ctxt->disableSAX))
5569 		    ctxt->sax->startDocument(ctxt->userData);
5570 
5571 		cur = in->cur[0];
5572 		next = in->cur[1];
5573 		if ((cur == '<') && (next == '!') &&
5574 		    (UPP(2) == 'D') && (UPP(3) == 'O') &&
5575 		    (UPP(4) == 'C') && (UPP(5) == 'T') &&
5576 		    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5577 		    (UPP(8) == 'E')) {
5578 		    if ((!terminate) &&
5579 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5580 			goto done;
5581 #ifdef DEBUG_PUSH
5582 		    xmlGenericError(xmlGenericErrorContext,
5583 			    "HPP: Parsing internal subset\n");
5584 #endif
5585 		    htmlParseDocTypeDecl(ctxt);
5586 		    ctxt->instate = XML_PARSER_PROLOG;
5587 #ifdef DEBUG_PUSH
5588 		    xmlGenericError(xmlGenericErrorContext,
5589 			    "HPP: entering PROLOG\n");
5590 #endif
5591                 } else {
5592 		    ctxt->instate = XML_PARSER_MISC;
5593 #ifdef DEBUG_PUSH
5594 		    xmlGenericError(xmlGenericErrorContext,
5595 			    "HPP: entering MISC\n");
5596 #endif
5597 		}
5598 		break;
5599             case XML_PARSER_MISC:
5600 		SKIP_BLANKS;
5601 		if (in->buf == NULL)
5602 		    avail = in->length - (in->cur - in->base);
5603 		else
5604 		    avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5605                             (in->cur - in->base);
5606 		/*
5607 		 * no chars in buffer
5608 		 */
5609 		if (avail < 1)
5610 		    goto done;
5611 		/*
5612 		 * not enough chars in buffer
5613 		 */
5614 		if (avail < 2) {
5615 		    if (!terminate)
5616 			goto done;
5617 		    else
5618 			next = ' ';
5619 		} else {
5620 		    next = in->cur[1];
5621 		}
5622 		cur = in->cur[0];
5623 	        if ((cur == '<') && (next == '!') &&
5624 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
5625 		    if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5626 			goto done;
5627 #ifdef DEBUG_PUSH
5628 		    xmlGenericError(xmlGenericErrorContext,
5629 			    "HPP: Parsing Comment\n");
5630 #endif
5631 		    htmlParseComment(ctxt);
5632 		    ctxt->instate = XML_PARSER_MISC;
5633 	        } else if ((cur == '<') && (next == '?')) {
5634 		    if ((!terminate) &&
5635 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5636 			goto done;
5637 #ifdef DEBUG_PUSH
5638 		    xmlGenericError(xmlGenericErrorContext,
5639 			    "HPP: Parsing PI\n");
5640 #endif
5641 		    htmlParsePI(ctxt);
5642 		    ctxt->instate = XML_PARSER_MISC;
5643 		} else if ((cur == '<') && (next == '!') &&
5644 		    (UPP(2) == 'D') && (UPP(3) == 'O') &&
5645 		    (UPP(4) == 'C') && (UPP(5) == 'T') &&
5646 		    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5647 		    (UPP(8) == 'E')) {
5648 		    if ((!terminate) &&
5649 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5650 			goto done;
5651 #ifdef DEBUG_PUSH
5652 		    xmlGenericError(xmlGenericErrorContext,
5653 			    "HPP: Parsing internal subset\n");
5654 #endif
5655 		    htmlParseDocTypeDecl(ctxt);
5656 		    ctxt->instate = XML_PARSER_PROLOG;
5657 #ifdef DEBUG_PUSH
5658 		    xmlGenericError(xmlGenericErrorContext,
5659 			    "HPP: entering PROLOG\n");
5660 #endif
5661 		} else if ((cur == '<') && (next == '!') &&
5662 		           (avail < 9)) {
5663 		    goto done;
5664 		} else {
5665 		    ctxt->instate = XML_PARSER_CONTENT;
5666 #ifdef DEBUG_PUSH
5667 		    xmlGenericError(xmlGenericErrorContext,
5668 			    "HPP: entering START_TAG\n");
5669 #endif
5670 		}
5671 		break;
5672             case XML_PARSER_PROLOG:
5673 		SKIP_BLANKS;
5674 		if (in->buf == NULL)
5675 		    avail = in->length - (in->cur - in->base);
5676 		else
5677 		    avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5678                             (in->cur - in->base);
5679 		if (avail < 2)
5680 		    goto done;
5681 		cur = in->cur[0];
5682 		next = in->cur[1];
5683 		if ((cur == '<') && (next == '!') &&
5684 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
5685 		    if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5686 			goto done;
5687 #ifdef DEBUG_PUSH
5688 		    xmlGenericError(xmlGenericErrorContext,
5689 			    "HPP: Parsing Comment\n");
5690 #endif
5691 		    htmlParseComment(ctxt);
5692 		    ctxt->instate = XML_PARSER_PROLOG;
5693 	        } else if ((cur == '<') && (next == '?')) {
5694 		    if ((!terminate) &&
5695 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5696 			goto done;
5697 #ifdef DEBUG_PUSH
5698 		    xmlGenericError(xmlGenericErrorContext,
5699 			    "HPP: Parsing PI\n");
5700 #endif
5701 		    htmlParsePI(ctxt);
5702 		    ctxt->instate = XML_PARSER_PROLOG;
5703 		} else if ((cur == '<') && (next == '!') &&
5704 		           (avail < 4)) {
5705 		    goto done;
5706 		} else {
5707 		    ctxt->instate = XML_PARSER_CONTENT;
5708 #ifdef DEBUG_PUSH
5709 		    xmlGenericError(xmlGenericErrorContext,
5710 			    "HPP: entering START_TAG\n");
5711 #endif
5712 		}
5713 		break;
5714             case XML_PARSER_EPILOG:
5715 		if (in->buf == NULL)
5716 		    avail = in->length - (in->cur - in->base);
5717 		else
5718 		    avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5719                             (in->cur - in->base);
5720 		if (avail < 1)
5721 		    goto done;
5722 		cur = in->cur[0];
5723 		if (IS_BLANK_CH(cur)) {
5724 		    htmlParseCharData(ctxt);
5725 		    goto done;
5726 		}
5727 		if (avail < 2)
5728 		    goto done;
5729 		next = in->cur[1];
5730 	        if ((cur == '<') && (next == '!') &&
5731 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
5732 		    if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5733 			goto done;
5734 #ifdef DEBUG_PUSH
5735 		    xmlGenericError(xmlGenericErrorContext,
5736 			    "HPP: Parsing Comment\n");
5737 #endif
5738 		    htmlParseComment(ctxt);
5739 		    ctxt->instate = XML_PARSER_EPILOG;
5740 	        } else if ((cur == '<') && (next == '?')) {
5741 		    if ((!terminate) &&
5742 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5743 			goto done;
5744 #ifdef DEBUG_PUSH
5745 		    xmlGenericError(xmlGenericErrorContext,
5746 			    "HPP: Parsing PI\n");
5747 #endif
5748 		    htmlParsePI(ctxt);
5749 		    ctxt->instate = XML_PARSER_EPILOG;
5750 		} else if ((cur == '<') && (next == '!') &&
5751 		           (avail < 4)) {
5752 		    goto done;
5753 		} else {
5754 		    ctxt->errNo = XML_ERR_DOCUMENT_END;
5755 		    ctxt->wellFormed = 0;
5756 		    ctxt->instate = XML_PARSER_EOF;
5757 #ifdef DEBUG_PUSH
5758 		    xmlGenericError(xmlGenericErrorContext,
5759 			    "HPP: entering EOF\n");
5760 #endif
5761 		    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5762 			ctxt->sax->endDocument(ctxt->userData);
5763 		    goto done;
5764 		}
5765 		break;
5766             case XML_PARSER_START_TAG: {
5767 	        const xmlChar *name;
5768 		int failed;
5769 		const htmlElemDesc * info;
5770 
5771 		/*
5772 		 * no chars in buffer
5773 		 */
5774 		if (avail < 1)
5775 		    goto done;
5776 		/*
5777 		 * not enough chars in buffer
5778 		 */
5779 		if (avail < 2) {
5780 		    if (!terminate)
5781 			goto done;
5782 		    else
5783 			next = ' ';
5784 		} else {
5785 		    next = in->cur[1];
5786 		}
5787 		cur = in->cur[0];
5788 	        if (cur != '<') {
5789 		    ctxt->instate = XML_PARSER_CONTENT;
5790 #ifdef DEBUG_PUSH
5791 		    xmlGenericError(xmlGenericErrorContext,
5792 			    "HPP: entering CONTENT\n");
5793 #endif
5794 		    break;
5795 		}
5796 		if (next == '/') {
5797 		    ctxt->instate = XML_PARSER_END_TAG;
5798 		    ctxt->checkIndex = 0;
5799 #ifdef DEBUG_PUSH
5800 		    xmlGenericError(xmlGenericErrorContext,
5801 			    "HPP: entering END_TAG\n");
5802 #endif
5803 		    break;
5804 		}
5805 		if ((!terminate) &&
5806 		    (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5807 		    goto done;
5808 
5809                 /* Capture start position */
5810 	        if (ctxt->record_info) {
5811 	             node_info.begin_pos = ctxt->input->consumed +
5812 	                                (CUR_PTR - ctxt->input->base);
5813 	             node_info.begin_line = ctxt->input->line;
5814 	        }
5815 
5816 
5817 		failed = htmlParseStartTag(ctxt);
5818 		name = ctxt->name;
5819 		if ((failed == -1) ||
5820 		    (name == NULL)) {
5821 		    if (CUR == '>')
5822 			NEXT;
5823 		    break;
5824 		}
5825 
5826 		/*
5827 		 * Lookup the info for that element.
5828 		 */
5829 		info = htmlTagLookup(name);
5830 		if (info == NULL) {
5831 		    htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5832 		                 "Tag %s invalid\n", name, NULL);
5833 		}
5834 
5835 		/*
5836 		 * Check for an Empty Element labeled the XML/SGML way
5837 		 */
5838 		if ((CUR == '/') && (NXT(1) == '>')) {
5839 		    SKIP(2);
5840 		    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5841 			ctxt->sax->endElement(ctxt->userData, name);
5842 		    htmlnamePop(ctxt);
5843 		    ctxt->instate = XML_PARSER_CONTENT;
5844 #ifdef DEBUG_PUSH
5845 		    xmlGenericError(xmlGenericErrorContext,
5846 			    "HPP: entering CONTENT\n");
5847 #endif
5848 		    break;
5849 		}
5850 
5851 		if (CUR == '>') {
5852 		    NEXT;
5853 		} else {
5854 		    htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5855 		                 "Couldn't find end of Start Tag %s\n",
5856 				 name, NULL);
5857 
5858 		    /*
5859 		     * end of parsing of this node.
5860 		     */
5861 		    if (xmlStrEqual(name, ctxt->name)) {
5862 			nodePop(ctxt);
5863 			htmlnamePop(ctxt);
5864 		    }
5865 
5866 		    if (ctxt->record_info)
5867 		        htmlNodeInfoPush(ctxt, &node_info);
5868 
5869 		    ctxt->instate = XML_PARSER_CONTENT;
5870 #ifdef DEBUG_PUSH
5871 		    xmlGenericError(xmlGenericErrorContext,
5872 			    "HPP: entering CONTENT\n");
5873 #endif
5874 		    break;
5875 		}
5876 
5877 		/*
5878 		 * Check for an Empty Element from DTD definition
5879 		 */
5880 		if ((info != NULL) && (info->empty)) {
5881 		    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5882 			ctxt->sax->endElement(ctxt->userData, name);
5883 		    htmlnamePop(ctxt);
5884 		}
5885 
5886                 if (ctxt->record_info)
5887 	            htmlNodeInfoPush(ctxt, &node_info);
5888 
5889 		ctxt->instate = XML_PARSER_CONTENT;
5890 #ifdef DEBUG_PUSH
5891 		xmlGenericError(xmlGenericErrorContext,
5892 			"HPP: entering CONTENT\n");
5893 #endif
5894                 break;
5895 	    }
5896             case XML_PARSER_CONTENT: {
5897 		xmlChar chr[2] = { 0, 0 };
5898 
5899                 /*
5900 		 * Handle preparsed entities and charRef
5901 		 */
5902 		if (ctxt->token != 0) {
5903 		    chr[0] = (xmlChar) ctxt->token;
5904 		    htmlCheckParagraph(ctxt);
5905 		    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5906 			ctxt->sax->characters(ctxt->userData, chr, 1);
5907 		    ctxt->token = 0;
5908 		    ctxt->checkIndex = 0;
5909 		}
5910 		if ((avail == 1) && (terminate)) {
5911 		    cur = in->cur[0];
5912 		    if ((cur != '<') && (cur != '&')) {
5913 			if (ctxt->sax != NULL) {
5914                             chr[0] = cur;
5915 			    if (IS_BLANK_CH(cur)) {
5916 				if (ctxt->keepBlanks) {
5917 				    if (ctxt->sax->characters != NULL)
5918 					ctxt->sax->characters(
5919 						ctxt->userData, chr, 1);
5920 				} else {
5921 				    if (ctxt->sax->ignorableWhitespace != NULL)
5922 					ctxt->sax->ignorableWhitespace(
5923 						ctxt->userData, chr, 1);
5924 				}
5925 			    } else {
5926 				htmlCheckParagraph(ctxt);
5927 				if (ctxt->sax->characters != NULL)
5928 				    ctxt->sax->characters(
5929 					    ctxt->userData, chr, 1);
5930 			    }
5931 			}
5932 			ctxt->token = 0;
5933 			ctxt->checkIndex = 0;
5934 			in->cur++;
5935 			break;
5936 		    }
5937 		}
5938 		if (avail < 2)
5939 		    goto done;
5940 		cur = in->cur[0];
5941 		next = in->cur[1];
5942 		if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5943 		    (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5944 		    /*
5945 		     * Handle SCRIPT/STYLE separately
5946 		     */
5947 		    if (!terminate) {
5948 		        int idx;
5949 			xmlChar val;
5950 
5951 			idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
5952 			if (idx < 0)
5953 			    goto done;
5954 		        val = in->cur[idx + 2];
5955 			if (val == 0) /* bad cut of input */
5956 			    goto done;
5957 		    }
5958 		    htmlParseScript(ctxt);
5959 		    if ((cur == '<') && (next == '/')) {
5960 			ctxt->instate = XML_PARSER_END_TAG;
5961 			ctxt->checkIndex = 0;
5962 #ifdef DEBUG_PUSH
5963 			xmlGenericError(xmlGenericErrorContext,
5964 				"HPP: entering END_TAG\n");
5965 #endif
5966 			break;
5967 		    }
5968 		} else if ((cur == '<') && (next == '!')) {
5969                     /*
5970                      * Sometimes DOCTYPE arrives in the middle of the document
5971                      */
5972                     if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
5973                         (UPP(4) == 'C') && (UPP(5) == 'T') &&
5974                         (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5975                         (UPP(8) == 'E')) {
5976                         if ((!terminate) &&
5977                             (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5978                             goto done;
5979                         htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5980                                      "Misplaced DOCTYPE declaration\n",
5981                                      BAD_CAST "DOCTYPE" , NULL);
5982                         htmlParseDocTypeDecl(ctxt);
5983                     } else if ((in->cur[2] == '-') && (in->cur[3] == '-')) {
5984                         if ((!terminate) &&
5985                             (htmlParseLookupCommentEnd(ctxt) < 0))
5986                             goto done;
5987 #ifdef DEBUG_PUSH
5988                         xmlGenericError(xmlGenericErrorContext,
5989                                 "HPP: Parsing Comment\n");
5990 #endif
5991                         htmlParseComment(ctxt);
5992                         ctxt->instate = XML_PARSER_CONTENT;
5993                     } else {
5994                         if ((!terminate) &&
5995                             (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5996                             goto done;
5997                         htmlSkipBogusComment(ctxt);
5998                     }
5999                 } else if ((cur == '<') && (next == '?')) {
6000                     if ((!terminate) &&
6001                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
6002                         goto done;
6003 #ifdef DEBUG_PUSH
6004                     xmlGenericError(xmlGenericErrorContext,
6005                             "HPP: Parsing PI\n");
6006 #endif
6007                     htmlParsePI(ctxt);
6008                     ctxt->instate = XML_PARSER_CONTENT;
6009                 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
6010                     goto done;
6011                 } else if ((cur == '<') && (next == '/')) {
6012                     ctxt->instate = XML_PARSER_END_TAG;
6013                     ctxt->checkIndex = 0;
6014 #ifdef DEBUG_PUSH
6015                     xmlGenericError(xmlGenericErrorContext,
6016                             "HPP: entering END_TAG\n");
6017 #endif
6018                     break;
6019                 } else if ((cur == '<') && IS_ASCII_LETTER(next)) {
6020                     if ((!terminate) && (next == 0))
6021                         goto done;
6022                     ctxt->instate = XML_PARSER_START_TAG;
6023                     ctxt->checkIndex = 0;
6024 #ifdef DEBUG_PUSH
6025                     xmlGenericError(xmlGenericErrorContext,
6026                             "HPP: entering START_TAG\n");
6027 #endif
6028                     break;
6029                 } else if (cur == '<') {
6030                     if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
6031                         (ctxt->sax->characters != NULL))
6032                         ctxt->sax->characters(ctxt->userData,
6033                                               BAD_CAST "<", 1);
6034                     NEXT;
6035                 } else {
6036                     /*
6037                      * check that the text sequence is complete
6038                      * before handing out the data to the parser
6039                      * to avoid problems with erroneous end of
6040                      * data detection.
6041                      */
6042                     if ((!terminate) &&
6043                         (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
6044                         goto done;
6045                     ctxt->checkIndex = 0;
6046 #ifdef DEBUG_PUSH
6047                     xmlGenericError(xmlGenericErrorContext,
6048                             "HPP: Parsing char data\n");
6049 #endif
6050                     while ((ctxt->instate != XML_PARSER_EOF) &&
6051                            (cur != '<') && (in->cur < in->end)) {
6052                         if (cur == '&') {
6053                             htmlParseReference(ctxt);
6054                         } else {
6055                             htmlParseCharData(ctxt);
6056                         }
6057                         cur = in->cur[0];
6058                     }
6059 		}
6060 
6061 		break;
6062 	    }
6063             case XML_PARSER_END_TAG:
6064 		if (avail < 2)
6065 		    goto done;
6066 		if ((!terminate) &&
6067 		    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
6068 		    goto done;
6069 		htmlParseEndTag(ctxt);
6070 		if (ctxt->nameNr == 0) {
6071 		    ctxt->instate = XML_PARSER_EPILOG;
6072 		} else {
6073 		    ctxt->instate = XML_PARSER_CONTENT;
6074 		}
6075 		ctxt->checkIndex = 0;
6076 #ifdef DEBUG_PUSH
6077 		xmlGenericError(xmlGenericErrorContext,
6078 			"HPP: entering CONTENT\n");
6079 #endif
6080 	        break;
6081             case XML_PARSER_CDATA_SECTION:
6082 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6083 			"HPP: internal error, state == CDATA\n",
6084 			     NULL, NULL);
6085 		ctxt->instate = XML_PARSER_CONTENT;
6086 		ctxt->checkIndex = 0;
6087 #ifdef DEBUG_PUSH
6088 		xmlGenericError(xmlGenericErrorContext,
6089 			"HPP: entering CONTENT\n");
6090 #endif
6091 		break;
6092             case XML_PARSER_DTD:
6093 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6094 			"HPP: internal error, state == DTD\n",
6095 			     NULL, NULL);
6096 		ctxt->instate = XML_PARSER_CONTENT;
6097 		ctxt->checkIndex = 0;
6098 #ifdef DEBUG_PUSH
6099 		xmlGenericError(xmlGenericErrorContext,
6100 			"HPP: entering CONTENT\n");
6101 #endif
6102 		break;
6103             case XML_PARSER_COMMENT:
6104 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6105 			"HPP: internal error, state == COMMENT\n",
6106 			     NULL, NULL);
6107 		ctxt->instate = XML_PARSER_CONTENT;
6108 		ctxt->checkIndex = 0;
6109 #ifdef DEBUG_PUSH
6110 		xmlGenericError(xmlGenericErrorContext,
6111 			"HPP: entering CONTENT\n");
6112 #endif
6113 		break;
6114             case XML_PARSER_PI:
6115 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6116 			"HPP: internal error, state == PI\n",
6117 			     NULL, NULL);
6118 		ctxt->instate = XML_PARSER_CONTENT;
6119 		ctxt->checkIndex = 0;
6120 #ifdef DEBUG_PUSH
6121 		xmlGenericError(xmlGenericErrorContext,
6122 			"HPP: entering CONTENT\n");
6123 #endif
6124 		break;
6125             case XML_PARSER_ENTITY_DECL:
6126 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6127 			"HPP: internal error, state == ENTITY_DECL\n",
6128 			     NULL, NULL);
6129 		ctxt->instate = XML_PARSER_CONTENT;
6130 		ctxt->checkIndex = 0;
6131 #ifdef DEBUG_PUSH
6132 		xmlGenericError(xmlGenericErrorContext,
6133 			"HPP: entering CONTENT\n");
6134 #endif
6135 		break;
6136             case XML_PARSER_ENTITY_VALUE:
6137 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6138 			"HPP: internal error, state == ENTITY_VALUE\n",
6139 			     NULL, NULL);
6140 		ctxt->instate = XML_PARSER_CONTENT;
6141 		ctxt->checkIndex = 0;
6142 #ifdef DEBUG_PUSH
6143 		xmlGenericError(xmlGenericErrorContext,
6144 			"HPP: entering DTD\n");
6145 #endif
6146 		break;
6147             case XML_PARSER_ATTRIBUTE_VALUE:
6148 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6149 			"HPP: internal error, state == ATTRIBUTE_VALUE\n",
6150 			     NULL, NULL);
6151 		ctxt->instate = XML_PARSER_START_TAG;
6152 		ctxt->checkIndex = 0;
6153 #ifdef DEBUG_PUSH
6154 		xmlGenericError(xmlGenericErrorContext,
6155 			"HPP: entering START_TAG\n");
6156 #endif
6157 		break;
6158 	    case XML_PARSER_SYSTEM_LITERAL:
6159 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6160 		    "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
6161 			     NULL, NULL);
6162 		ctxt->instate = XML_PARSER_CONTENT;
6163 		ctxt->checkIndex = 0;
6164 #ifdef DEBUG_PUSH
6165 		xmlGenericError(xmlGenericErrorContext,
6166 			"HPP: entering CONTENT\n");
6167 #endif
6168 		break;
6169 	    case XML_PARSER_IGNORE:
6170 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6171 			"HPP: internal error, state == XML_PARSER_IGNORE\n",
6172 			     NULL, NULL);
6173 		ctxt->instate = XML_PARSER_CONTENT;
6174 		ctxt->checkIndex = 0;
6175 #ifdef DEBUG_PUSH
6176 		xmlGenericError(xmlGenericErrorContext,
6177 			"HPP: entering CONTENT\n");
6178 #endif
6179 		break;
6180 	    case XML_PARSER_PUBLIC_LITERAL:
6181 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6182 			"HPP: internal error, state == XML_PARSER_LITERAL\n",
6183 			     NULL, NULL);
6184 		ctxt->instate = XML_PARSER_CONTENT;
6185 		ctxt->checkIndex = 0;
6186 #ifdef DEBUG_PUSH
6187 		xmlGenericError(xmlGenericErrorContext,
6188 			"HPP: entering CONTENT\n");
6189 #endif
6190 		break;
6191 
6192 	}
6193     }
6194 done:
6195     if ((avail == 0) && (terminate)) {
6196 	htmlAutoCloseOnEnd(ctxt);
6197 	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
6198 	    /*
6199 	     * SAX: end of the document processing.
6200 	     */
6201 	    ctxt->instate = XML_PARSER_EOF;
6202 	    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6203 		ctxt->sax->endDocument(ctxt->userData);
6204 	}
6205     }
6206     if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
6207 	((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
6208 	 (ctxt->instate == XML_PARSER_EPILOG))) {
6209 	xmlDtdPtr dtd;
6210 	dtd = xmlGetIntSubset(ctxt->myDoc);
6211 	if (dtd == NULL)
6212 	    ctxt->myDoc->intSubset =
6213 		xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
6214 		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
6215 		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
6216     }
6217 #ifdef DEBUG_PUSH
6218     xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
6219 #endif
6220     return(ret);
6221 }
6222 
6223 /**
6224  * htmlParseChunk:
6225  * @ctxt:  an HTML parser context
6226  * @chunk:  an char array
6227  * @size:  the size in byte of the chunk
6228  * @terminate:  last chunk indicator
6229  *
6230  * Parse a Chunk of memory
6231  *
6232  * Returns zero if no error, the xmlParserErrors otherwise.
6233  */
6234 int
htmlParseChunk(htmlParserCtxtPtr ctxt,const char * chunk,int size,int terminate)6235 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
6236               int terminate) {
6237     if ((ctxt == NULL) || (ctxt->input == NULL)) {
6238 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6239 		     "htmlParseChunk: context error\n", NULL, NULL);
6240 	return(XML_ERR_INTERNAL_ERROR);
6241     }
6242     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6243         (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF))  {
6244 	size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6245 	size_t cur = ctxt->input->cur - ctxt->input->base;
6246 	int res;
6247 
6248 	res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6249         xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6250 	if (res < 0) {
6251 	    ctxt->errNo = XML_PARSER_EOF;
6252 	    ctxt->disableSAX = 1;
6253 	    return (XML_PARSER_EOF);
6254 	}
6255 #ifdef DEBUG_PUSH
6256 	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6257 #endif
6258 
6259 #if 0
6260 	if ((terminate) || (ctxt->input->buf->buffer->use > 80))
6261 	    htmlParseTryOrFinish(ctxt, terminate);
6262 #endif
6263     } else if (ctxt->instate != XML_PARSER_EOF) {
6264 	if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
6265 	    xmlParserInputBufferPtr in = ctxt->input->buf;
6266 	    if ((in->encoder != NULL) && (in->buffer != NULL) &&
6267 		    (in->raw != NULL)) {
6268 		int nbchars;
6269 		size_t base = xmlBufGetInputBase(in->buffer, ctxt->input);
6270 		size_t current = ctxt->input->cur - ctxt->input->base;
6271 
6272 		nbchars = xmlCharEncInput(in, terminate);
6273 		xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
6274 		if (nbchars < 0) {
6275 		    htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
6276 			         "encoder error\n", NULL, NULL);
6277 		    return(XML_ERR_INVALID_ENCODING);
6278 		}
6279 	    }
6280 	}
6281     }
6282     htmlParseTryOrFinish(ctxt, terminate);
6283     if (terminate) {
6284 	if ((ctxt->instate != XML_PARSER_EOF) &&
6285 	    (ctxt->instate != XML_PARSER_EPILOG) &&
6286 	    (ctxt->instate != XML_PARSER_MISC)) {
6287 	    ctxt->errNo = XML_ERR_DOCUMENT_END;
6288 	    ctxt->wellFormed = 0;
6289 	}
6290 	if (ctxt->instate != XML_PARSER_EOF) {
6291 	    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6292 		ctxt->sax->endDocument(ctxt->userData);
6293 	}
6294 	ctxt->instate = XML_PARSER_EOF;
6295     }
6296     return((xmlParserErrors) ctxt->errNo);
6297 }
6298 
6299 /************************************************************************
6300  *									*
6301  *			User entry points				*
6302  *									*
6303  ************************************************************************/
6304 
6305 /**
6306  * htmlCreatePushParserCtxt:
6307  * @sax:  a SAX handler
6308  * @user_data:  The user data returned on SAX callbacks
6309  * @chunk:  a pointer to an array of chars
6310  * @size:  number of chars in the array
6311  * @filename:  an optional file name or URI
6312  * @enc:  an optional encoding
6313  *
6314  * Create a parser context for using the HTML parser in push mode
6315  * The value of @filename is used for fetching external entities
6316  * and error/warning reports.
6317  *
6318  * Returns the new parser context or NULL
6319  */
6320 htmlParserCtxtPtr
htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax,void * user_data,const char * chunk,int size,const char * filename,xmlCharEncoding enc)6321 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
6322                          const char *chunk, int size, const char *filename,
6323 			 xmlCharEncoding enc) {
6324     htmlParserCtxtPtr ctxt;
6325     htmlParserInputPtr inputStream;
6326     xmlParserInputBufferPtr buf;
6327 
6328     xmlInitParser();
6329 
6330     buf = xmlAllocParserInputBuffer(enc);
6331     if (buf == NULL) return(NULL);
6332 
6333     ctxt = htmlNewParserCtxt();
6334     if (ctxt == NULL) {
6335 	xmlFreeParserInputBuffer(buf);
6336 	return(NULL);
6337     }
6338     if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
6339 	ctxt->charset=XML_CHAR_ENCODING_UTF8;
6340     if (sax != NULL) {
6341 	if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
6342 	    xmlFree(ctxt->sax);
6343 	ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
6344 	if (ctxt->sax == NULL) {
6345 	    xmlFree(buf);
6346 	    xmlFree(ctxt);
6347 	    return(NULL);
6348 	}
6349 	memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
6350 	if (user_data != NULL)
6351 	    ctxt->userData = user_data;
6352     }
6353     if (filename == NULL) {
6354 	ctxt->directory = NULL;
6355     } else {
6356         ctxt->directory = xmlParserGetDirectory(filename);
6357     }
6358 
6359     inputStream = htmlNewInputStream(ctxt);
6360     if (inputStream == NULL) {
6361 	xmlFreeParserCtxt(ctxt);
6362 	xmlFree(buf);
6363 	return(NULL);
6364     }
6365 
6366     if (filename == NULL)
6367 	inputStream->filename = NULL;
6368     else
6369 	inputStream->filename = (char *)
6370 	    xmlCanonicPath((const xmlChar *) filename);
6371     inputStream->buf = buf;
6372     xmlBufResetInput(buf->buffer, inputStream);
6373 
6374     inputPush(ctxt, inputStream);
6375 
6376     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6377         (ctxt->input->buf != NULL))  {
6378 	size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6379 	size_t cur = ctxt->input->cur - ctxt->input->base;
6380 
6381 	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6382 
6383         xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6384 #ifdef DEBUG_PUSH
6385 	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6386 #endif
6387     }
6388     ctxt->progressive = 1;
6389 
6390     return(ctxt);
6391 }
6392 #endif /* LIBXML_PUSH_ENABLED */
6393 
6394 /**
6395  * htmlSAXParseDoc:
6396  * @cur:  a pointer to an array of xmlChar
6397  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6398  * @sax:  the SAX handler block
6399  * @userData: if using SAX, this pointer will be provided on callbacks.
6400  *
6401  * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6402  * to handle parse events. If sax is NULL, fallback to the default DOM
6403  * behavior and return a tree.
6404  *
6405  * Returns the resulting document tree unless SAX is NULL or the document is
6406  *     not well formed.
6407  */
6408 
6409 htmlDocPtr
htmlSAXParseDoc(const xmlChar * cur,const char * encoding,htmlSAXHandlerPtr sax,void * userData)6410 htmlSAXParseDoc(const xmlChar *cur, const char *encoding,
6411                 htmlSAXHandlerPtr sax, void *userData) {
6412     htmlDocPtr ret;
6413     htmlParserCtxtPtr ctxt;
6414 
6415     xmlInitParser();
6416 
6417     if (cur == NULL) return(NULL);
6418 
6419 
6420     ctxt = htmlCreateDocParserCtxt(cur, encoding);
6421     if (ctxt == NULL) return(NULL);
6422     if (sax != NULL) {
6423         if (ctxt->sax != NULL) xmlFree (ctxt->sax);
6424         ctxt->sax = sax;
6425         ctxt->userData = userData;
6426     }
6427 
6428     htmlParseDocument(ctxt);
6429     ret = ctxt->myDoc;
6430     if (sax != NULL) {
6431 	ctxt->sax = NULL;
6432 	ctxt->userData = NULL;
6433     }
6434     htmlFreeParserCtxt(ctxt);
6435 
6436     return(ret);
6437 }
6438 
6439 /**
6440  * htmlParseDoc:
6441  * @cur:  a pointer to an array of xmlChar
6442  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6443  *
6444  * parse an HTML in-memory document and build a tree.
6445  *
6446  * Returns the resulting document tree
6447  */
6448 
6449 htmlDocPtr
htmlParseDoc(const xmlChar * cur,const char * encoding)6450 htmlParseDoc(const xmlChar *cur, const char *encoding) {
6451     return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6452 }
6453 
6454 
6455 /**
6456  * htmlCreateFileParserCtxt:
6457  * @filename:  the filename
6458  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6459  *
6460  * Create a parser context for a file content.
6461  * Automatic support for ZLIB/Compress compressed document is provided
6462  * by default if found at compile-time.
6463  *
6464  * Returns the new parser context or NULL
6465  */
6466 htmlParserCtxtPtr
htmlCreateFileParserCtxt(const char * filename,const char * encoding)6467 htmlCreateFileParserCtxt(const char *filename, const char *encoding)
6468 {
6469     htmlParserCtxtPtr ctxt;
6470     htmlParserInputPtr inputStream;
6471     char *canonicFilename;
6472     /* htmlCharEncoding enc; */
6473     xmlChar *content, *content_line = (xmlChar *) "charset=";
6474 
6475     if (filename == NULL)
6476         return(NULL);
6477 
6478     ctxt = htmlNewParserCtxt();
6479     if (ctxt == NULL) {
6480 	return(NULL);
6481     }
6482     canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
6483     if (canonicFilename == NULL) {
6484 #ifdef LIBXML_SAX1_ENABLED
6485 	if (xmlDefaultSAXHandler.error != NULL) {
6486 	    xmlDefaultSAXHandler.error(NULL, "out of memory\n");
6487 	}
6488 #endif
6489 	xmlFreeParserCtxt(ctxt);
6490 	return(NULL);
6491     }
6492 
6493     inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6494     xmlFree(canonicFilename);
6495     if (inputStream == NULL) {
6496 	xmlFreeParserCtxt(ctxt);
6497 	return(NULL);
6498     }
6499 
6500     inputPush(ctxt, inputStream);
6501 
6502     /* set encoding */
6503     if (encoding) {
6504         size_t l = strlen(encoding);
6505 
6506 	if (l < 1000) {
6507 	    content = xmlMallocAtomic (xmlStrlen(content_line) + l + 1);
6508 	    if (content) {
6509 		strcpy ((char *)content, (char *)content_line);
6510 		strcat ((char *)content, (char *)encoding);
6511 		htmlCheckEncoding (ctxt, content);
6512 		xmlFree (content);
6513 	    }
6514 	}
6515     }
6516 
6517     return(ctxt);
6518 }
6519 
6520 /**
6521  * htmlSAXParseFile:
6522  * @filename:  the filename
6523  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6524  * @sax:  the SAX handler block
6525  * @userData: if using SAX, this pointer will be provided on callbacks.
6526  *
6527  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6528  * compressed document is provided by default if found at compile-time.
6529  * It use the given SAX function block to handle the parsing callback.
6530  * If sax is NULL, fallback to the default DOM tree building routines.
6531  *
6532  * Returns the resulting document tree unless SAX is NULL or the document is
6533  *     not well formed.
6534  */
6535 
6536 htmlDocPtr
htmlSAXParseFile(const char * filename,const char * encoding,htmlSAXHandlerPtr sax,void * userData)6537 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
6538                  void *userData) {
6539     htmlDocPtr ret;
6540     htmlParserCtxtPtr ctxt;
6541     htmlSAXHandlerPtr oldsax = NULL;
6542 
6543     xmlInitParser();
6544 
6545     ctxt = htmlCreateFileParserCtxt(filename, encoding);
6546     if (ctxt == NULL) return(NULL);
6547     if (sax != NULL) {
6548 	oldsax = ctxt->sax;
6549         ctxt->sax = sax;
6550         ctxt->userData = userData;
6551     }
6552 
6553     htmlParseDocument(ctxt);
6554 
6555     ret = ctxt->myDoc;
6556     if (sax != NULL) {
6557         ctxt->sax = oldsax;
6558         ctxt->userData = NULL;
6559     }
6560     htmlFreeParserCtxt(ctxt);
6561 
6562     return(ret);
6563 }
6564 
6565 /**
6566  * htmlParseFile:
6567  * @filename:  the filename
6568  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6569  *
6570  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6571  * compressed document is provided by default if found at compile-time.
6572  *
6573  * Returns the resulting document tree
6574  */
6575 
6576 htmlDocPtr
htmlParseFile(const char * filename,const char * encoding)6577 htmlParseFile(const char *filename, const char *encoding) {
6578     return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6579 }
6580 
6581 /**
6582  * htmlHandleOmittedElem:
6583  * @val:  int 0 or 1
6584  *
6585  * Set and return the previous value for handling HTML omitted tags.
6586  *
6587  * Returns the last value for 0 for no handling, 1 for auto insertion.
6588  */
6589 
6590 int
htmlHandleOmittedElem(int val)6591 htmlHandleOmittedElem(int val) {
6592     int old = htmlOmittedDefaultValue;
6593 
6594     htmlOmittedDefaultValue = val;
6595     return(old);
6596 }
6597 
6598 /**
6599  * htmlElementAllowedHere:
6600  * @parent: HTML parent element
6601  * @elt: HTML element
6602  *
6603  * Checks whether an HTML element may be a direct child of a parent element.
6604  * Note - doesn't check for deprecated elements
6605  *
6606  * Returns 1 if allowed; 0 otherwise.
6607  */
6608 int
htmlElementAllowedHere(const htmlElemDesc * parent,const xmlChar * elt)6609 htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6610   const char** p ;
6611 
6612   if ( ! elt || ! parent || ! parent->subelts )
6613 	return 0 ;
6614 
6615   for ( p = parent->subelts; *p; ++p )
6616     if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6617       return 1 ;
6618 
6619   return 0 ;
6620 }
6621 /**
6622  * htmlElementStatusHere:
6623  * @parent: HTML parent element
6624  * @elt: HTML element
6625  *
6626  * Checks whether an HTML element may be a direct child of a parent element.
6627  * and if so whether it is valid or deprecated.
6628  *
6629  * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6630  */
6631 htmlStatus
htmlElementStatusHere(const htmlElemDesc * parent,const htmlElemDesc * elt)6632 htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6633   if ( ! parent || ! elt )
6634     return HTML_INVALID ;
6635   if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6636     return HTML_INVALID ;
6637 
6638   return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6639 }
6640 /**
6641  * htmlAttrAllowed:
6642  * @elt: HTML element
6643  * @attr: HTML attribute
6644  * @legacy: whether to allow deprecated attributes
6645  *
6646  * Checks whether an attribute is valid for an element
6647  * Has full knowledge of Required and Deprecated attributes
6648  *
6649  * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6650  */
6651 htmlStatus
htmlAttrAllowed(const htmlElemDesc * elt,const xmlChar * attr,int legacy)6652 htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6653   const char** p ;
6654 
6655   if ( !elt || ! attr )
6656 	return HTML_INVALID ;
6657 
6658   if ( elt->attrs_req )
6659     for ( p = elt->attrs_req; *p; ++p)
6660       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6661         return HTML_REQUIRED ;
6662 
6663   if ( elt->attrs_opt )
6664     for ( p = elt->attrs_opt; *p; ++p)
6665       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6666         return HTML_VALID ;
6667 
6668   if ( legacy && elt->attrs_depr )
6669     for ( p = elt->attrs_depr; *p; ++p)
6670       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6671         return HTML_DEPRECATED ;
6672 
6673   return HTML_INVALID ;
6674 }
6675 /**
6676  * htmlNodeStatus:
6677  * @node: an htmlNodePtr in a tree
6678  * @legacy: whether to allow deprecated elements (YES is faster here
6679  *	for Element nodes)
6680  *
6681  * Checks whether the tree node is valid.  Experimental (the author
6682  *     only uses the HTML enhancements in a SAX parser)
6683  *
6684  * Return: for Element nodes, a return from htmlElementAllowedHere (if
6685  *	legacy allowed) or htmlElementStatusHere (otherwise).
6686  *	for Attribute nodes, a return from htmlAttrAllowed
6687  *	for other nodes, HTML_NA (no checks performed)
6688  */
6689 htmlStatus
htmlNodeStatus(const htmlNodePtr node,int legacy)6690 htmlNodeStatus(const htmlNodePtr node, int legacy) {
6691   if ( ! node )
6692     return HTML_INVALID ;
6693 
6694   switch ( node->type ) {
6695     case XML_ELEMENT_NODE:
6696       return legacy
6697 	? ( htmlElementAllowedHere (
6698 		htmlTagLookup(node->parent->name) , node->name
6699 		) ? HTML_VALID : HTML_INVALID )
6700 	: htmlElementStatusHere(
6701 		htmlTagLookup(node->parent->name) ,
6702 		htmlTagLookup(node->name) )
6703 	;
6704     case XML_ATTRIBUTE_NODE:
6705       return htmlAttrAllowed(
6706 	htmlTagLookup(node->parent->name) , node->name, legacy) ;
6707     default: return HTML_NA ;
6708   }
6709 }
6710 /************************************************************************
6711  *									*
6712  *	New set (2.6.0) of simpler and more flexible APIs		*
6713  *									*
6714  ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope (a NULL string is left alone; with a NULL "dict" every
 * non-NULL string is freed).
 *
 * The expansion is wrapped in do { } while (0) so that it behaves as one
 * statement and is safe inside an unbraced if/else; the invocation must be
 * followed by a semicolon. @str is evaluated more than once.
 */
#define DICT_FREE(str)						\
    do {							\
	if ((str) && ((!dict) ||				\
	    (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
	    xmlFree((char *)(str));				\
    } while (0)
6726 
6727 /**
6728  * htmlCtxtReset:
6729  * @ctxt: an HTML parser context
6730  *
6731  * Reset a parser context
6732  */
6733 void
htmlCtxtReset(htmlParserCtxtPtr ctxt)6734 htmlCtxtReset(htmlParserCtxtPtr ctxt)
6735 {
6736     xmlParserInputPtr input;
6737     xmlDictPtr dict;
6738 
6739     if (ctxt == NULL)
6740         return;
6741 
6742     xmlInitParser();
6743     dict = ctxt->dict;
6744 
6745     while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6746         xmlFreeInputStream(input);
6747     }
6748     ctxt->inputNr = 0;
6749     ctxt->input = NULL;
6750 
6751     ctxt->spaceNr = 0;
6752     if (ctxt->spaceTab != NULL) {
6753 	ctxt->spaceTab[0] = -1;
6754 	ctxt->space = &ctxt->spaceTab[0];
6755     } else {
6756 	ctxt->space = NULL;
6757     }
6758 
6759 
6760     ctxt->nodeNr = 0;
6761     ctxt->node = NULL;
6762 
6763     ctxt->nameNr = 0;
6764     ctxt->name = NULL;
6765 
6766     ctxt->nsNr = 0;
6767 
6768     DICT_FREE(ctxt->version);
6769     ctxt->version = NULL;
6770     DICT_FREE(ctxt->encoding);
6771     ctxt->encoding = NULL;
6772     DICT_FREE(ctxt->directory);
6773     ctxt->directory = NULL;
6774     DICT_FREE(ctxt->extSubURI);
6775     ctxt->extSubURI = NULL;
6776     DICT_FREE(ctxt->extSubSystem);
6777     ctxt->extSubSystem = NULL;
6778     if (ctxt->myDoc != NULL)
6779         xmlFreeDoc(ctxt->myDoc);
6780     ctxt->myDoc = NULL;
6781 
6782     ctxt->standalone = -1;
6783     ctxt->hasExternalSubset = 0;
6784     ctxt->hasPErefs = 0;
6785     ctxt->html = 1;
6786     ctxt->external = 0;
6787     ctxt->instate = XML_PARSER_START;
6788     ctxt->token = 0;
6789 
6790     ctxt->wellFormed = 1;
6791     ctxt->nsWellFormed = 1;
6792     ctxt->disableSAX = 0;
6793     ctxt->valid = 1;
6794     ctxt->vctxt.userData = ctxt;
6795     ctxt->vctxt.error = xmlParserValidityError;
6796     ctxt->vctxt.warning = xmlParserValidityWarning;
6797     ctxt->record_info = 0;
6798     ctxt->checkIndex = 0;
6799     ctxt->inSubset = 0;
6800     ctxt->errNo = XML_ERR_OK;
6801     ctxt->depth = 0;
6802     ctxt->charset = XML_CHAR_ENCODING_NONE;
6803     ctxt->catalogs = NULL;
6804     xmlInitNodeInfoSeq(&ctxt->node_seq);
6805 
6806     if (ctxt->attsDefault != NULL) {
6807         xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
6808         ctxt->attsDefault = NULL;
6809     }
6810     if (ctxt->attsSpecial != NULL) {
6811         xmlHashFree(ctxt->attsSpecial, NULL);
6812         ctxt->attsSpecial = NULL;
6813     }
6814 }
6815 
6816 /**
6817  * htmlCtxtUseOptions:
6818  * @ctxt: an HTML parser context
6819  * @options:  a combination of htmlParserOption(s)
6820  *
6821  * Applies the options to the parser context
6822  *
6823  * Returns 0 in case of success, the set of unknown or unimplemented options
6824  *         in case of error.
6825  */
6826 int
htmlCtxtUseOptions(htmlParserCtxtPtr ctxt,int options)6827 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6828 {
6829     if (ctxt == NULL)
6830         return(-1);
6831 
6832     if (options & HTML_PARSE_NOWARNING) {
6833         ctxt->sax->warning = NULL;
6834         ctxt->vctxt.warning = NULL;
6835         options -= XML_PARSE_NOWARNING;
6836 	ctxt->options |= XML_PARSE_NOWARNING;
6837     }
6838     if (options & HTML_PARSE_NOERROR) {
6839         ctxt->sax->error = NULL;
6840         ctxt->vctxt.error = NULL;
6841         ctxt->sax->fatalError = NULL;
6842         options -= XML_PARSE_NOERROR;
6843 	ctxt->options |= XML_PARSE_NOERROR;
6844     }
6845     if (options & HTML_PARSE_PEDANTIC) {
6846         ctxt->pedantic = 1;
6847         options -= XML_PARSE_PEDANTIC;
6848 	ctxt->options |= XML_PARSE_PEDANTIC;
6849     } else
6850         ctxt->pedantic = 0;
6851     if (options & XML_PARSE_NOBLANKS) {
6852         ctxt->keepBlanks = 0;
6853         ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6854         options -= XML_PARSE_NOBLANKS;
6855 	ctxt->options |= XML_PARSE_NOBLANKS;
6856     } else
6857         ctxt->keepBlanks = 1;
6858     if (options & HTML_PARSE_RECOVER) {
6859         ctxt->recovery = 1;
6860 	options -= HTML_PARSE_RECOVER;
6861     } else
6862         ctxt->recovery = 0;
6863     if (options & HTML_PARSE_COMPACT) {
6864 	ctxt->options |= HTML_PARSE_COMPACT;
6865         options -= HTML_PARSE_COMPACT;
6866     }
6867     if (options & XML_PARSE_HUGE) {
6868 	ctxt->options |= XML_PARSE_HUGE;
6869         options -= XML_PARSE_HUGE;
6870     }
6871     if (options & HTML_PARSE_NODEFDTD) {
6872 	ctxt->options |= HTML_PARSE_NODEFDTD;
6873         options -= HTML_PARSE_NODEFDTD;
6874     }
6875     if (options & HTML_PARSE_IGNORE_ENC) {
6876 	ctxt->options |= HTML_PARSE_IGNORE_ENC;
6877         options -= HTML_PARSE_IGNORE_ENC;
6878     }
6879     if (options & HTML_PARSE_NOIMPLIED) {
6880         ctxt->options |= HTML_PARSE_NOIMPLIED;
6881         options -= HTML_PARSE_NOIMPLIED;
6882     }
6883     ctxt->dictNames = 0;
6884     return (options);
6885 }
6886 
6887 /**
6888  * htmlDoRead:
6889  * @ctxt:  an HTML parser context
6890  * @URL:  the base URL to use for the document
6891  * @encoding:  the document encoding, or NULL
6892  * @options:  a combination of htmlParserOption(s)
6893  * @reuse:  keep the context for reuse
6894  *
6895  * Common front-end for the htmlRead functions
6896  *
6897  * Returns the resulting document tree or NULL
6898  */
6899 static htmlDocPtr
htmlDoRead(htmlParserCtxtPtr ctxt,const char * URL,const char * encoding,int options,int reuse)6900 htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6901           int options, int reuse)
6902 {
6903     htmlDocPtr ret;
6904 
6905     htmlCtxtUseOptions(ctxt, options);
6906     ctxt->html = 1;
6907     if (encoding != NULL) {
6908         xmlCharEncodingHandlerPtr hdlr;
6909 
6910 	hdlr = xmlFindCharEncodingHandler(encoding);
6911 	if (hdlr != NULL) {
6912 	    xmlSwitchToEncoding(ctxt, hdlr);
6913 	    if (ctxt->input->encoding != NULL)
6914 	      xmlFree((xmlChar *) ctxt->input->encoding);
6915             ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6916         }
6917     }
6918     if ((URL != NULL) && (ctxt->input != NULL) &&
6919         (ctxt->input->filename == NULL))
6920         ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6921     htmlParseDocument(ctxt);
6922     ret = ctxt->myDoc;
6923     ctxt->myDoc = NULL;
6924     if (!reuse) {
6925         if ((ctxt->dictNames) &&
6926 	    (ret != NULL) &&
6927 	    (ret->dict == ctxt->dict))
6928 	    ctxt->dict = NULL;
6929 	xmlFreeParserCtxt(ctxt);
6930     }
6931     return (ret);
6932 }
6933 
6934 /**
6935  * htmlReadDoc:
6936  * @cur:  a pointer to a zero terminated string
6937  * @URL:  the base URL to use for the document
6938  * @encoding:  the document encoding, or NULL
6939  * @options:  a combination of htmlParserOption(s)
6940  *
6941  * parse an XML in-memory document and build a tree.
6942  *
6943  * Returns the resulting document tree
6944  */
6945 htmlDocPtr
htmlReadDoc(const xmlChar * cur,const char * URL,const char * encoding,int options)6946 htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6947 {
6948     htmlParserCtxtPtr ctxt;
6949 
6950     if (cur == NULL)
6951         return (NULL);
6952 
6953     xmlInitParser();
6954     ctxt = htmlCreateDocParserCtxt(cur, NULL);
6955     if (ctxt == NULL)
6956         return (NULL);
6957     return (htmlDoRead(ctxt, URL, encoding, options, 0));
6958 }
6959 
6960 /**
6961  * htmlReadFile:
6962  * @filename:  a file or URL
6963  * @encoding:  the document encoding, or NULL
6964  * @options:  a combination of htmlParserOption(s)
6965  *
6966  * parse an XML file from the filesystem or the network.
6967  *
6968  * Returns the resulting document tree
6969  */
6970 htmlDocPtr
htmlReadFile(const char * filename,const char * encoding,int options)6971 htmlReadFile(const char *filename, const char *encoding, int options)
6972 {
6973     htmlParserCtxtPtr ctxt;
6974 
6975     xmlInitParser();
6976     ctxt = htmlCreateFileParserCtxt(filename, encoding);
6977     if (ctxt == NULL)
6978         return (NULL);
6979     return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6980 }
6981 
6982 /**
6983  * htmlReadMemory:
6984  * @buffer:  a pointer to a char array
6985  * @size:  the size of the array
6986  * @URL:  the base URL to use for the document
6987  * @encoding:  the document encoding, or NULL
6988  * @options:  a combination of htmlParserOption(s)
6989  *
6990  * parse an XML in-memory document and build a tree.
6991  *
6992  * Returns the resulting document tree
6993  */
6994 htmlDocPtr
htmlReadMemory(const char * buffer,int size,const char * URL,const char * encoding,int options)6995 htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6996 {
6997     htmlParserCtxtPtr ctxt;
6998 
6999     xmlInitParser();
7000     ctxt = xmlCreateMemoryParserCtxt(buffer, size);
7001     if (ctxt == NULL)
7002         return (NULL);
7003     htmlDefaultSAXHandlerInit();
7004     if (ctxt->sax != NULL)
7005         memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
7006     return (htmlDoRead(ctxt, URL, encoding, options, 0));
7007 }
7008 
7009 /**
7010  * htmlReadFd:
7011  * @fd:  an open file descriptor
7012  * @URL:  the base URL to use for the document
7013  * @encoding:  the document encoding, or NULL
7014  * @options:  a combination of htmlParserOption(s)
7015  *
7016  * parse an HTML from a file descriptor and build a tree.
7017  * NOTE that the file descriptor will not be closed when the
7018  *      reader is closed or reset.
7019  *
7020  * Returns the resulting document tree
7021  */
7022 htmlDocPtr
htmlReadFd(int fd,const char * URL,const char * encoding,int options)7023 htmlReadFd(int fd, const char *URL, const char *encoding, int options)
7024 {
7025     htmlParserCtxtPtr ctxt;
7026     xmlParserInputBufferPtr input;
7027     htmlParserInputPtr stream;
7028 
7029     if (fd < 0)
7030         return (NULL);
7031 
7032     xmlInitParser();
7033     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7034     if (input == NULL)
7035         return (NULL);
7036     input->closecallback = NULL;
7037     ctxt = htmlNewParserCtxt();
7038     if (ctxt == NULL) {
7039         xmlFreeParserInputBuffer(input);
7040         return (NULL);
7041     }
7042     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7043     if (stream == NULL) {
7044         xmlFreeParserInputBuffer(input);
7045 	htmlFreeParserCtxt(ctxt);
7046         return (NULL);
7047     }
7048     inputPush(ctxt, stream);
7049     return (htmlDoRead(ctxt, URL, encoding, options, 0));
7050 }
7051 
7052 /**
7053  * htmlReadIO:
7054  * @ioread:  an I/O read function
7055  * @ioclose:  an I/O close function
7056  * @ioctx:  an I/O handler
7057  * @URL:  the base URL to use for the document
7058  * @encoding:  the document encoding, or NULL
7059  * @options:  a combination of htmlParserOption(s)
7060  *
7061  * parse an HTML document from I/O functions and source and build a tree.
7062  *
7063  * Returns the resulting document tree
7064  */
7065 htmlDocPtr
htmlReadIO(xmlInputReadCallback ioread,xmlInputCloseCallback ioclose,void * ioctx,const char * URL,const char * encoding,int options)7066 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
7067           void *ioctx, const char *URL, const char *encoding, int options)
7068 {
7069     htmlParserCtxtPtr ctxt;
7070     xmlParserInputBufferPtr input;
7071     xmlParserInputPtr stream;
7072 
7073     if (ioread == NULL)
7074         return (NULL);
7075     xmlInitParser();
7076 
7077     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7078                                          XML_CHAR_ENCODING_NONE);
7079     if (input == NULL) {
7080         if (ioclose != NULL)
7081             ioclose(ioctx);
7082         return (NULL);
7083     }
7084     ctxt = htmlNewParserCtxt();
7085     if (ctxt == NULL) {
7086         xmlFreeParserInputBuffer(input);
7087         return (NULL);
7088     }
7089     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7090     if (stream == NULL) {
7091         xmlFreeParserInputBuffer(input);
7092 	xmlFreeParserCtxt(ctxt);
7093         return (NULL);
7094     }
7095     inputPush(ctxt, stream);
7096     return (htmlDoRead(ctxt, URL, encoding, options, 0));
7097 }
7098 
7099 /**
7100  * htmlCtxtReadDoc:
7101  * @ctxt:  an HTML parser context
7102  * @cur:  a pointer to a zero terminated string
7103  * @URL:  the base URL to use for the document
7104  * @encoding:  the document encoding, or NULL
7105  * @options:  a combination of htmlParserOption(s)
7106  *
7107  * parse an XML in-memory document and build a tree.
7108  * This reuses the existing @ctxt parser context
7109  *
7110  * Returns the resulting document tree
7111  */
7112 htmlDocPtr
htmlCtxtReadDoc(htmlParserCtxtPtr ctxt,const xmlChar * cur,const char * URL,const char * encoding,int options)7113 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
7114                const char *URL, const char *encoding, int options)
7115 {
7116     if (cur == NULL)
7117         return (NULL);
7118     return (htmlCtxtReadMemory(ctxt, (const char *) cur, xmlStrlen(cur), URL,
7119                                encoding, options));
7120 }
7121 
7122 /**
7123  * htmlCtxtReadFile:
7124  * @ctxt:  an HTML parser context
7125  * @filename:  a file or URL
7126  * @encoding:  the document encoding, or NULL
7127  * @options:  a combination of htmlParserOption(s)
7128  *
7129  * parse an XML file from the filesystem or the network.
7130  * This reuses the existing @ctxt parser context
7131  *
7132  * Returns the resulting document tree
7133  */
7134 htmlDocPtr
htmlCtxtReadFile(htmlParserCtxtPtr ctxt,const char * filename,const char * encoding,int options)7135 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
7136                 const char *encoding, int options)
7137 {
7138     xmlParserInputPtr stream;
7139 
7140     if (filename == NULL)
7141         return (NULL);
7142     if (ctxt == NULL)
7143         return (NULL);
7144     xmlInitParser();
7145 
7146     htmlCtxtReset(ctxt);
7147 
7148     stream = xmlLoadExternalEntity(filename, NULL, ctxt);
7149     if (stream == NULL) {
7150         return (NULL);
7151     }
7152     inputPush(ctxt, stream);
7153     return (htmlDoRead(ctxt, NULL, encoding, options, 1));
7154 }
7155 
7156 /**
7157  * htmlCtxtReadMemory:
7158  * @ctxt:  an HTML parser context
7159  * @buffer:  a pointer to a char array
7160  * @size:  the size of the array
7161  * @URL:  the base URL to use for the document
7162  * @encoding:  the document encoding, or NULL
7163  * @options:  a combination of htmlParserOption(s)
7164  *
7165  * parse an XML in-memory document and build a tree.
7166  * This reuses the existing @ctxt parser context
7167  *
7168  * Returns the resulting document tree
7169  */
7170 htmlDocPtr
htmlCtxtReadMemory(htmlParserCtxtPtr ctxt,const char * buffer,int size,const char * URL,const char * encoding,int options)7171 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
7172                   const char *URL, const char *encoding, int options)
7173 {
7174     xmlParserInputBufferPtr input;
7175     xmlParserInputPtr stream;
7176 
7177     if (ctxt == NULL)
7178         return (NULL);
7179     if (buffer == NULL)
7180         return (NULL);
7181     xmlInitParser();
7182 
7183     htmlCtxtReset(ctxt);
7184 
7185     input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
7186     if (input == NULL) {
7187 	return(NULL);
7188     }
7189 
7190     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7191     if (stream == NULL) {
7192 	xmlFreeParserInputBuffer(input);
7193 	return(NULL);
7194     }
7195 
7196     inputPush(ctxt, stream);
7197     return (htmlDoRead(ctxt, URL, encoding, options, 1));
7198 }
7199 
7200 /**
7201  * htmlCtxtReadFd:
7202  * @ctxt:  an HTML parser context
7203  * @fd:  an open file descriptor
7204  * @URL:  the base URL to use for the document
7205  * @encoding:  the document encoding, or NULL
7206  * @options:  a combination of htmlParserOption(s)
7207  *
7208  * parse an XML from a file descriptor and build a tree.
7209  * This reuses the existing @ctxt parser context
7210  *
7211  * Returns the resulting document tree
7212  */
7213 htmlDocPtr
htmlCtxtReadFd(htmlParserCtxtPtr ctxt,int fd,const char * URL,const char * encoding,int options)7214 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
7215               const char *URL, const char *encoding, int options)
7216 {
7217     xmlParserInputBufferPtr input;
7218     xmlParserInputPtr stream;
7219 
7220     if (fd < 0)
7221         return (NULL);
7222     if (ctxt == NULL)
7223         return (NULL);
7224     xmlInitParser();
7225 
7226     htmlCtxtReset(ctxt);
7227 
7228 
7229     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7230     if (input == NULL)
7231         return (NULL);
7232     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7233     if (stream == NULL) {
7234         xmlFreeParserInputBuffer(input);
7235         return (NULL);
7236     }
7237     inputPush(ctxt, stream);
7238     return (htmlDoRead(ctxt, URL, encoding, options, 1));
7239 }
7240 
7241 /**
7242  * htmlCtxtReadIO:
7243  * @ctxt:  an HTML parser context
7244  * @ioread:  an I/O read function
7245  * @ioclose:  an I/O close function
7246  * @ioctx:  an I/O handler
7247  * @URL:  the base URL to use for the document
7248  * @encoding:  the document encoding, or NULL
7249  * @options:  a combination of htmlParserOption(s)
7250  *
7251  * parse an HTML document from I/O functions and source and build a tree.
7252  * This reuses the existing @ctxt parser context
7253  *
7254  * Returns the resulting document tree
7255  */
7256 htmlDocPtr
htmlCtxtReadIO(htmlParserCtxtPtr ctxt,xmlInputReadCallback ioread,xmlInputCloseCallback ioclose,void * ioctx,const char * URL,const char * encoding,int options)7257 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
7258               xmlInputCloseCallback ioclose, void *ioctx,
7259 	      const char *URL,
7260               const char *encoding, int options)
7261 {
7262     xmlParserInputBufferPtr input;
7263     xmlParserInputPtr stream;
7264 
7265     if (ioread == NULL)
7266         return (NULL);
7267     if (ctxt == NULL)
7268         return (NULL);
7269     xmlInitParser();
7270 
7271     htmlCtxtReset(ctxt);
7272 
7273     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7274                                          XML_CHAR_ENCODING_NONE);
7275     if (input == NULL) {
7276         if (ioclose != NULL)
7277             ioclose(ioctx);
7278         return (NULL);
7279     }
7280     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7281     if (stream == NULL) {
7282         xmlFreeParserInputBuffer(input);
7283         return (NULL);
7284     }
7285     inputPush(ctxt, stream);
7286     return (htmlDoRead(ctxt, URL, encoding, options, 1));
7287 }
7288 
7289 #endif /* LIBXML_HTML_ENABLED */
7290