xref: /reactos/sdk/lib/3rdparty/libxml2/HTMLparser.c (revision 40462c92)
1 /*
2  * HTMLparser.c : an HTML 4.0 non-verifying parser
3  *
4  * See Copyright for the status of this software.
5  *
6  * daniel@veillard.com
7  */
8 
9 #define IN_LIBXML
10 #include "libxml.h"
11 #ifdef LIBXML_HTML_ENABLED
12 
13 #include <string.h>
14 #ifdef HAVE_CTYPE_H
15 #include <ctype.h>
16 #endif
17 #ifdef HAVE_STDLIB_H
18 #include <stdlib.h>
19 #endif
20 #ifdef HAVE_SYS_STAT_H
21 #include <sys/stat.h>
22 #endif
23 #ifdef HAVE_FCNTL_H
24 #include <fcntl.h>
25 #endif
26 #ifdef HAVE_UNISTD_H
27 #include <unistd.h>
28 #endif
29 #ifdef LIBXML_ZLIB_ENABLED
30 #include <zlib.h>
31 #endif
32 
33 #include <libxml/xmlmemory.h>
34 #include <libxml/tree.h>
35 #include <libxml/parser.h>
36 #include <libxml/parserInternals.h>
37 #include <libxml/xmlerror.h>
38 #include <libxml/HTMLparser.h>
39 #include <libxml/HTMLtree.h>
40 #include <libxml/entities.h>
41 #include <libxml/encoding.h>
42 #include <libxml/valid.h>
43 #include <libxml/xmlIO.h>
44 #include <libxml/globals.h>
45 #include <libxml/uri.h>
46 
47 #include "buf.h"
48 #include "enc.h"
49 
50 #define HTML_MAX_NAMELEN 1000
51 #define HTML_PARSER_BIG_BUFFER_SIZE 1000
52 #define HTML_PARSER_BUFFER_SIZE 100
53 
54 /* #define DEBUG */
55 /* #define DEBUG_PUSH */
56 
57 static int htmlOmittedDefaultValue = 1;
58 
59 xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
60 			     xmlChar end, xmlChar  end2, xmlChar end3);
61 static void htmlParseComment(htmlParserCtxtPtr ctxt);
62 
63 /************************************************************************
64  *									*
65  *		Some factorized error routines				*
66  *									*
67  ************************************************************************/
68 
69 /**
70  * htmlErrMemory:
71  * @ctxt:  an HTML parser context
72  * @extra:  extra informations
73  *
74  * Handle a redefinition of attribute error
75  */
76 static void
77 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
78 {
79     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
80         (ctxt->instate == XML_PARSER_EOF))
81 	return;
82     if (ctxt != NULL) {
83         ctxt->errNo = XML_ERR_NO_MEMORY;
84         ctxt->instate = XML_PARSER_EOF;
85         ctxt->disableSAX = 1;
86     }
87     if (extra)
88         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
89                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
90                         NULL, NULL, 0, 0,
91                         "Memory allocation failed : %s\n", extra);
92     else
93         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
94                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
95                         NULL, NULL, 0, 0, "Memory allocation failed\n");
96 }
97 
98 /**
99  * htmlParseErr:
100  * @ctxt:  an HTML parser context
101  * @error:  the error number
102  * @msg:  the error message
103  * @str1:  string infor
104  * @str2:  string infor
105  *
106  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
107  */
108 static void LIBXML_ATTR_FORMAT(3,0)
109 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
110              const char *msg, const xmlChar *str1, const xmlChar *str2)
111 {
112     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
113         (ctxt->instate == XML_PARSER_EOF))
114 	return;
115     if (ctxt != NULL)
116 	ctxt->errNo = error;
117     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
118                     XML_ERR_ERROR, NULL, 0,
119 		    (const char *) str1, (const char *) str2,
120 		    NULL, 0, 0,
121 		    msg, str1, str2);
122     if (ctxt != NULL)
123 	ctxt->wellFormed = 0;
124 }
125 
126 /**
127  * htmlParseErrInt:
128  * @ctxt:  an HTML parser context
129  * @error:  the error number
130  * @msg:  the error message
131  * @val:  integer info
132  *
133  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
134  */
135 static void LIBXML_ATTR_FORMAT(3,0)
136 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
137              const char *msg, int val)
138 {
139     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
140         (ctxt->instate == XML_PARSER_EOF))
141 	return;
142     if (ctxt != NULL)
143 	ctxt->errNo = error;
144     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
145                     XML_ERR_ERROR, NULL, 0, NULL, NULL,
146 		    NULL, val, 0, msg, val);
147     if (ctxt != NULL)
148 	ctxt->wellFormed = 0;
149 }
150 
151 /************************************************************************
152  *									*
153  *	Parser stacks related functions and macros		*
154  *									*
155  ************************************************************************/
156 
157 /**
158  * htmlnamePush:
159  * @ctxt:  an HTML parser context
160  * @value:  the element name
161  *
162  * Pushes a new element name on top of the name stack
163  *
164  * Returns 0 in case of error, the index in the stack otherwise
165  */
166 static int
167 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
168 {
169     if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
170         ctxt->html = 3;
171     if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
172         ctxt->html = 10;
173     if (ctxt->nameNr >= ctxt->nameMax) {
174         ctxt->nameMax *= 2;
175         ctxt->nameTab = (const xmlChar * *)
176                          xmlRealloc((xmlChar * *)ctxt->nameTab,
177                                     ctxt->nameMax *
178                                     sizeof(ctxt->nameTab[0]));
179         if (ctxt->nameTab == NULL) {
180             htmlErrMemory(ctxt, NULL);
181             return (0);
182         }
183     }
184     ctxt->nameTab[ctxt->nameNr] = value;
185     ctxt->name = value;
186     return (ctxt->nameNr++);
187 }
188 /**
189  * htmlnamePop:
190  * @ctxt: an HTML parser context
191  *
192  * Pops the top element name from the name stack
193  *
194  * Returns the name just removed
195  */
196 static const xmlChar *
197 htmlnamePop(htmlParserCtxtPtr ctxt)
198 {
199     const xmlChar *ret;
200 
201     if (ctxt->nameNr <= 0)
202         return (NULL);
203     ctxt->nameNr--;
204     if (ctxt->nameNr < 0)
205         return (NULL);
206     if (ctxt->nameNr > 0)
207         ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
208     else
209         ctxt->name = NULL;
210     ret = ctxt->nameTab[ctxt->nameNr];
211     ctxt->nameTab[ctxt->nameNr] = NULL;
212     return (ret);
213 }
214 
215 /**
216  * htmlNodeInfoPush:
217  * @ctxt:  an HTML parser context
218  * @value:  the node info
219  *
220  * Pushes a new element name on top of the node info stack
221  *
222  * Returns 0 in case of error, the index in the stack otherwise
223  */
224 static int
225 htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
226 {
227     if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
228         if (ctxt->nodeInfoMax == 0)
229                 ctxt->nodeInfoMax = 5;
230         ctxt->nodeInfoMax *= 2;
231         ctxt->nodeInfoTab = (htmlParserNodeInfo *)
232                          xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
233                                     ctxt->nodeInfoMax *
234                                     sizeof(ctxt->nodeInfoTab[0]));
235         if (ctxt->nodeInfoTab == NULL) {
236             htmlErrMemory(ctxt, NULL);
237             return (0);
238         }
239     }
240     ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
241     ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
242     return (ctxt->nodeInfoNr++);
243 }
244 
245 /**
246  * htmlNodeInfoPop:
247  * @ctxt:  an HTML parser context
248  *
249  * Pops the top element name from the node info stack
250  *
251  * Returns 0 in case of error, the pointer to NodeInfo otherwise
252  */
253 static htmlParserNodeInfo *
254 htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
255 {
256     if (ctxt->nodeInfoNr <= 0)
257         return (NULL);
258     ctxt->nodeInfoNr--;
259     if (ctxt->nodeInfoNr < 0)
260         return (NULL);
261     if (ctxt->nodeInfoNr > 0)
262         ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
263     else
264         ctxt->nodeInfo = NULL;
265     return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
266 }
267 
/*
 * Macros for accessing the input. They are private to the parser and
 * every one of them expects a variable named ctxt (the parser context)
 * to be in scope. Statement-like macros are wrapped in do { } while (0)
 * so that they behave as a single statement, e.g. under if/else.
 *
 * Byte-level macros. They read raw xmlChar values without any decoding
 * and should only be used to match or skip ASCII constructs:
 *
 *   CUR_PTR   the current input pointer (ctxt->input->cur)
 *   BASE_PTR  ctxt->input->base
 *   CUR       the xmlChar at the current position, as an int
 *   CURRENT   same value as CUR
 *   RAW       the xmlChar at the current position, or -1 when
 *             ctxt->token is set
 *   NXT(n)    the n'th xmlChar after the current one
 *   UPPER     the current xmlChar passed through toupper()
 *   UPP(n)    the n'th next xmlChar passed through toupper()
 *   SKIP(n)   advance the input pointer, the column and ctxt->nbChars
 *             by n; the line counter is not touched, so it must not be
 *             used across newlines
 *
 * Other helpers:
 *
 *   NEXT          calls xmlNextChar(ctxt)
 *   NEXTL(l)      account for one character (line/column, nbChars),
 *                 clear ctxt->token and advance the input by l xmlChars
 *   CUR_CHAR(l)   calls htmlCurrentChar(ctxt, &l)
 *   CUR_SCHAR(s, l) calls xmlStringCurrentChar(ctxt, s, &l)
 *   COPY_BUF(l,b,i,v) append character v to buffer b at index i: stored
 *                 directly when l is 1, through xmlCopyChar() otherwise;
 *                 i is advanced accordingly
 *   SKIP_BLANKS   calls htmlSkipBlankChars(ctxt)
 *   SHRINK        calls xmlParserInputShrink() when more than
 *                 2 * INPUT_CHUNK xmlChars lie before the current
 *                 position and fewer than 2 * INPUT_CHUNK remain
 *   GROW          calls xmlParserInputGrow() when ctxt->progressive is 0
 *                 and fewer than INPUT_CHUNK xmlChars remain
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) (ctxt->nbChars += (val),				\
		   ctxt->input->cur += (val),				\
		   ctxt->input->col += (val))

#define NXT(val) (ctxt->input->cur[(val)])

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR (ctxt->input->cur)
#define BASE_PTR (ctxt->input->base)

#define SHRINK do {							\
    if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) &&	\
	(ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK))	\
	xmlParserInputShrink(ctxt->input);				\
  } while (0)

#define GROW do {							\
    if ((ctxt->progressive == 0) &&					\
	(ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))		\
	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);			\
  } while (0)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;		\
  } while (0)

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

#define COPY_BUF(l,b,i,v) do {						\
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);				\
    else (i) += xmlCopyChar((l), &(b)[(i)], (v));			\
  } while (0)
348 
349 /**
350  * htmlFindEncoding:
351  * @the HTML parser context
352  *
353  * Ty to find and encoding in the current data available in the input
354  * buffer this is needed to try to switch to the proper encoding when
355  * one face a character error.
356  * That's an heuristic, since it's operating outside of parsing it could
357  * try to use a meta which had been commented out, that's the reason it
358  * should only be used in case of error, not as a default.
359  *
360  * Returns an encoding string or NULL if not found, the string need to
361  *   be freed
362  */
363 static xmlChar *
364 htmlFindEncoding(xmlParserCtxtPtr ctxt) {
365     const xmlChar *start, *cur, *end;
366 
367     if ((ctxt == NULL) || (ctxt->input == NULL) ||
368         (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
369         (ctxt->input->buf->encoder != NULL))
370         return(NULL);
371     if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
372         return(NULL);
373 
374     start = ctxt->input->cur;
375     end = ctxt->input->end;
376     /* we also expect the input buffer to be zero terminated */
377     if (*end != 0)
378         return(NULL);
379 
380     cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
381     if (cur == NULL)
382         return(NULL);
383     cur = xmlStrcasestr(cur, BAD_CAST  "CONTENT");
384     if (cur == NULL)
385         return(NULL);
386     cur = xmlStrcasestr(cur, BAD_CAST  "CHARSET=");
387     if (cur == NULL)
388         return(NULL);
389     cur += 8;
390     start = cur;
391     while (((*cur >= 'A') && (*cur <= 'Z')) ||
392            ((*cur >= 'a') && (*cur <= 'z')) ||
393            ((*cur >= '0') && (*cur <= '9')) ||
394            (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
395            cur++;
396     if (cur == start)
397         return(NULL);
398     return(xmlStrndup(start, cur - start));
399 }
400 
401 /**
402  * htmlCurrentChar:
403  * @ctxt:  the HTML parser context
404  * @len:  pointer to the length of the char read
405  *
406  * The current char value, if using UTF-8 this may actually span multiple
407  * bytes in the input buffer. Implement the end of line normalization:
408  * 2.11 End-of-Line Handling
409  * If the encoding is unspecified, in the case we find an ISO-Latin-1
410  * char, then the encoding converter is plugged in automatically.
411  *
412  * Returns the current char value and its length
413  */
414 
415 static int
416 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
417     if (ctxt->instate == XML_PARSER_EOF)
418 	return(0);
419 
420     if (ctxt->token != 0) {
421 	*len = 0;
422 	return(ctxt->token);
423     }
424     if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
425 	/*
426 	 * We are supposed to handle UTF8, check it's valid
427 	 * From rfc2044: encoding of the Unicode values on UTF-8:
428 	 *
429 	 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
430 	 * 0000 0000-0000 007F   0xxxxxxx
431 	 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
432 	 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
433 	 *
434 	 * Check for the 0x110000 limit too
435 	 */
436 	const unsigned char *cur = ctxt->input->cur;
437 	unsigned char c;
438 	unsigned int val;
439 
440 	c = *cur;
441 	if (c & 0x80) {
442 	    if (cur[1] == 0) {
443 		xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
444                 cur = ctxt->input->cur;
445             }
446 	    if ((cur[1] & 0xc0) != 0x80)
447 		goto encoding_error;
448 	    if ((c & 0xe0) == 0xe0) {
449 
450 		if (cur[2] == 0) {
451 		    xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
452                     cur = ctxt->input->cur;
453                 }
454 		if ((cur[2] & 0xc0) != 0x80)
455 		    goto encoding_error;
456 		if ((c & 0xf0) == 0xf0) {
457 		    if (cur[3] == 0) {
458 			xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
459                         cur = ctxt->input->cur;
460                     }
461 		    if (((c & 0xf8) != 0xf0) ||
462 			((cur[3] & 0xc0) != 0x80))
463 			goto encoding_error;
464 		    /* 4-byte code */
465 		    *len = 4;
466 		    val = (cur[0] & 0x7) << 18;
467 		    val |= (cur[1] & 0x3f) << 12;
468 		    val |= (cur[2] & 0x3f) << 6;
469 		    val |= cur[3] & 0x3f;
470 		} else {
471 		  /* 3-byte code */
472 		    *len = 3;
473 		    val = (cur[0] & 0xf) << 12;
474 		    val |= (cur[1] & 0x3f) << 6;
475 		    val |= cur[2] & 0x3f;
476 		}
477 	    } else {
478 	      /* 2-byte code */
479 		*len = 2;
480 		val = (cur[0] & 0x1f) << 6;
481 		val |= cur[1] & 0x3f;
482 	    }
483 	    if (!IS_CHAR(val)) {
484 	        htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
485 				"Char 0x%X out of allowed range\n", val);
486 	    }
487 	    return(val);
488 	} else {
489             if ((*ctxt->input->cur == 0) &&
490                 (ctxt->input->cur < ctxt->input->end)) {
491                     htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
492 				"Char 0x%X out of allowed range\n", 0);
493                 *len = 1;
494                 return(' ');
495             }
496 	    /* 1-byte code */
497 	    *len = 1;
498 	    return((int) *ctxt->input->cur);
499 	}
500     }
501     /*
502      * Assume it's a fixed length encoding (1) with
503      * a compatible encoding for the ASCII set, since
504      * XML constructs only use < 128 chars
505      */
506     *len = 1;
507     if ((int) *ctxt->input->cur < 0x80)
508 	return((int) *ctxt->input->cur);
509 
510     /*
511      * Humm this is bad, do an automatic flow conversion
512      */
513     {
514         xmlChar * guess;
515         xmlCharEncodingHandlerPtr handler;
516 
517         guess = htmlFindEncoding(ctxt);
518         if (guess == NULL) {
519             xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
520         } else {
521             if (ctxt->input->encoding != NULL)
522                 xmlFree((xmlChar *) ctxt->input->encoding);
523             ctxt->input->encoding = guess;
524             handler = xmlFindCharEncodingHandler((const char *) guess);
525             if (handler != NULL) {
526                 xmlSwitchToEncoding(ctxt, handler);
527             } else {
528                 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
529                              "Unsupported encoding %s", guess, NULL);
530             }
531         }
532         ctxt->charset = XML_CHAR_ENCODING_UTF8;
533     }
534 
535     return(xmlCurrentChar(ctxt, len));
536 
537 encoding_error:
538     /*
539      * If we detect an UTF8 error that probably mean that the
540      * input encoding didn't get properly advertised in the
541      * declaration header. Report the error and switch the encoding
542      * to ISO-Latin-1 (if you don't like this policy, just declare the
543      * encoding !)
544      */
545     {
546         char buffer[150];
547 
548 	if (ctxt->input->end - ctxt->input->cur >= 4) {
549 	    snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
550 			    ctxt->input->cur[0], ctxt->input->cur[1],
551 			    ctxt->input->cur[2], ctxt->input->cur[3]);
552 	} else {
553 	    snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
554 	}
555 	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
556 		     "Input is not proper UTF-8, indicate encoding !\n",
557 		     BAD_CAST buffer, NULL);
558     }
559 
560     ctxt->charset = XML_CHAR_ENCODING_8859_1;
561     *len = 1;
562     return((int) *ctxt->input->cur);
563 }
564 
565 /**
566  * htmlSkipBlankChars:
567  * @ctxt:  the HTML parser context
568  *
569  * skip all blanks character found at that point in the input streams.
570  *
571  * Returns the number of space chars skipped
572  */
573 
574 static int
575 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
576     int res = 0;
577 
578     while (IS_BLANK_CH(*(ctxt->input->cur))) {
579 	if ((*ctxt->input->cur == 0) &&
580 	    (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
581 		xmlPopInput(ctxt);
582 	} else {
583 	    if (*(ctxt->input->cur) == '\n') {
584 		ctxt->input->line++; ctxt->input->col = 1;
585 	    } else ctxt->input->col++;
586 	    ctxt->input->cur++;
587 	    ctxt->nbChars++;
588 	    if (*ctxt->input->cur == 0)
589 		xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
590 	}
591 	res++;
592     }
593     return(res);
594 }
595 
596 
597 
598 /************************************************************************
599  *									*
600  *	The list of HTML elements and their properties		*
601  *									*
602  ************************************************************************/
603 
604 /*
605  *  Start Tag: 1 means the start tag can be omitted
606  *  End Tag:   1 means the end tag can be omitted
607  *             2 means it's forbidden (empty elements)
608  *             3 means the tag is stylistic and should be closed easily
609  *  Depr:      this element is deprecated
610  *  DTD:       1 means that this element is valid only in the Loose DTD
611  *             2 means that this element is valid only in the Frameset DTD
612  *
613  * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
614 	, subElements , impliedsubelt , Attributes, userdata
615  */
616 
/*
 * Definitions and a couple of vars for HTML Elements.
 *
 * Each list macro expands to a comma-separated run of element names
 * meant to be dropped inside an array initializer; the matching NB_
 * macro is the number of names in that run. PCDATA and MODIFIER expand
 * to nothing. The NB_ sums are parenthesized so that they can be used
 * safely inside larger expressions.
 */

#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL
642 
643 
644 static const char* const html_flow[] = { FLOW, NULL } ;
645 static const char* const html_inline[] = { INLINE, NULL } ;
646 
647 /* placeholders: elts with content but no subelements */
648 static const char* const html_pcdata[] = { NULL } ;
649 #define html_cdata html_pcdata
650 
651 
/*
 * ... and for HTML Attributes.
 *
 * Each list macro expands to a comma-separated run of attribute names
 * meant to be dropped inside an array initializer; the matching NB_
 * macro is the number of names in that run.
 */

#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1
666 
667 static const char* const html_attrs[] = { ATTRS, NULL } ;
668 static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
669 static const char* const core_attrs[] = { COREATTRS, NULL } ;
670 static const char* const i18n_attrs[] = { I18N, NULL } ;
671 
672 
673 /* Other declarations that should go inline ... */
674 static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
675 	"href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
676 	"tabindex", "onfocus", "onblur", NULL } ;
677 static const char* const target_attr[] = { "target", NULL } ;
678 static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
679 static const char* const alt_attr[] = { "alt", NULL } ;
680 static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
681 static const char* const href_attrs[] = { "href", NULL } ;
682 static const char* const clear_attrs[] = { "clear", NULL } ;
683 static const char* const inline_p[] = { INLINE, "p", NULL } ;
684 
685 static const char* const flow_param[] = { FLOW, "param", NULL } ;
686 static const char* const applet_attrs[] = { COREATTRS , "codebase",
687 		"archive", "alt", "name", "height", "width", "align",
688 		"hspace", "vspace", NULL } ;
689 static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
690 	"tabindex", "accesskey", "onfocus", "onblur", NULL } ;
691 static const char* const basefont_attrs[] =
692 	{ "id", "size", "color", "face", NULL } ;
693 static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
694 static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
695 static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
696 static const char* const body_depr[] = { "background", "bgcolor", "text",
697 	"link", "vlink", "alink", NULL } ;
698 static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
699 	"disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
700 
701 
702 static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
703 static const char* const col_elt[] = { "col", NULL } ;
704 static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
705 static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
706 static const char* const dl_contents[] = { "dt", "dd", NULL } ;
707 static const char* const compact_attr[] = { "compact", NULL } ;
708 static const char* const label_attr[] = { "label", NULL } ;
709 static const char* const fieldset_contents[] = { FLOW, "legend" } ;
710 static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
711 static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
712 static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
713 static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
714 static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
715 static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
716 static const char* const head_attrs[] = { I18N, "profile", NULL } ;
717 static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
718 static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
719 static const char* const version_attr[] = { "version", NULL } ;
720 static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
721 static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
722 static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
723 static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
724 static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
725 static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
726 static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
727 static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
728 static const char* const align_attr[] = { "align", NULL } ;
729 static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
730 static const char* const map_contents[] = { BLOCK, "area", NULL } ;
731 static const char* const name_attr[] = { "name", NULL } ;
732 static const char* const action_attr[] = { "action", NULL } ;
733 static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
734 static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
735 static const char* const content_attr[] = { "content", NULL } ;
736 static const char* const type_attr[] = { "type", NULL } ;
737 static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
738 static const char* const object_contents[] = { FLOW, "param", NULL } ;
739 static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
740 static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
741 static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
742 static const char* const option_elt[] = { "option", NULL } ;
743 static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
744 static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
745 static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
746 static const char* const width_attr[] = { "width", NULL } ;
747 static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
748 static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
749 static const char* const language_attr[] = { "language", NULL } ;
750 static const char* const select_content[] = { "optgroup", "option", NULL } ;
751 static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
752 static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
753 static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
754 static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
755 static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
756 static const char* const tr_elt[] = { "tr", NULL } ;
757 static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
758 static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
759 static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
760 static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
761 static const char* const tr_contents[] = { "th", "td", NULL } ;
762 static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
763 static const char* const li_elt[] = { "li", NULL } ;
764 static const char* const ul_depr[] = { "type", "compact", NULL} ;
765 static const char* const dir_attr[] = { "dir", NULL} ;
766 
/*
 * DECL casts a "const char* const[]" name list to "const char**" so it
 * can be stored in the pointer members of htmlElemDesc.
 */
#define DECL (const char**)

/*
 * html40ElementTable: one htmlElemDesc per known HTML element,
 * searched linearly by htmlTagLookup().
 *
 * The initializers are positional. Per the declaration of
 * struct _htmlElemDesc in <libxml/HTMLparser.h> the columns are:
 *   name, startTag, endTag, saveEndTag, empty, depr, dtd, isinline, desc,
 * followed on the second line of each entry by:
 *   subelts, defaultsubelt, attrs_opt, attrs_depr, attrs_req.
 * EMPTY and MODIFIER are macros defined earlier in the file (not
 * visible here).
 * NOTE(review): the entries appear to be in alphabetical order of
 * name; keep new entries sorted in case other code relies on it.
 */
static const htmlElemDesc
html40ElementTable[] = {
{ "a",		0, 0, 0, 0, 0, 0, 1, "anchor ",
	DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
},
{ "abbr",	0, 0, 0, 0, 0, 0, 1, "abbreviated form",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "acronym",	0, 0, 0, 0, 0, 0, 1, "",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "address",	0, 0, 0, 0, 0, 0, 0, "information on author ",
	DECL inline_p  , NULL , DECL html_attrs, NULL, NULL
},
{ "applet",	0, 0, 0, 0, 1, 1, 2, "java applet ",
	DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
},
{ "area",	0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
	EMPTY ,  NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
},
{ "b",		0, 3, 0, 0, 0, 0, 1, "bold text style",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "base",	0, 2, 2, 1, 0, 0, 0, "document base uri ",
	EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
},
{ "basefont",	0, 2, 2, 1, 1, 1, 1, "base font size " ,
	EMPTY , NULL , NULL, DECL basefont_attrs, NULL
},
{ "bdo",	0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
	DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
},
{ "big",	0, 3, 0, 0, 0, 0, 1, "large text style",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "blockquote",	0, 0, 0, 0, 0, 0, 0, "long quotation ",
	DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
},
{ "body",	1, 1, 0, 0, 0, 0, 0, "document body ",
	DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
},
{ "br",		0, 2, 2, 1, 0, 0, 1, "forced line break ",
	EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
},
{ "button",	0, 0, 0, 0, 0, 0, 2, "push button ",
	DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
},
{ "caption",	0, 0, 0, 0, 0, 0, 0, "table caption ",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "center",	0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
	DECL html_flow , NULL , NULL, DECL html_attrs, NULL
},
{ "cite",	0, 0, 0, 0, 0, 0, 1, "citation",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "code",	0, 0, 0, 0, 0, 0, 1, "computer code fragment",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "col",	0, 2, 2, 1, 0, 0, 0, "table column ",
	EMPTY , NULL , DECL col_attrs , NULL, NULL
},
{ "colgroup",	0, 1, 0, 0, 0, 0, 0, "table column group ",
	DECL col_elt , "col" , DECL col_attrs , NULL, NULL
},
{ "dd",		0, 1, 0, 0, 0, 0, 0, "definition description ",
	DECL html_flow , NULL , DECL html_attrs, NULL, NULL
},
{ "del",	0, 0, 0, 0, 0, 0, 2, "deleted text ",
	DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
},
{ "dfn",	0, 0, 0, 0, 0, 0, 1, "instance definition",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "dir",	0, 0, 0, 0, 1, 1, 0, "directory list",
	DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
},
{ "div",	0, 0, 0, 0, 0, 0, 0, "generic language/style container",
	DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "dl",		0, 0, 0, 0, 0, 0, 0, "definition list ",
	DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
},
{ "dt",		0, 1, 0, 0, 0, 0, 0, "definition term ",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "em",		0, 3, 0, 0, 0, 0, 1, "emphasis",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "embed",	0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
	EMPTY, NULL, DECL embed_attrs, NULL, NULL
},
{ "fieldset",	0, 0, 0, 0, 0, 0, 0, "form control group ",
	DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
},
{ "font",	0, 3, 0, 0, 1, 1, 1, "local change to font ",
	DECL html_inline, NULL, NULL, DECL font_attrs, NULL
},
{ "form",	0, 0, 0, 0, 0, 0, 0, "interactive form ",
	DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
},
{ "frame",	0, 2, 2, 1, 0, 2, 0, "subwindow " ,
	EMPTY, NULL, NULL, DECL frame_attrs, NULL
},
{ "frameset",	0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
	DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
},
{ "h1",		0, 0, 0, 0, 0, 0, 0, "heading ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h2",		0, 0, 0, 0, 0, 0, 0, "heading ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h3",		0, 0, 0, 0, 0, 0, 0, "heading ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h4",		0, 0, 0, 0, 0, 0, 0, "heading ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h5",		0, 0, 0, 0, 0, 0, 0, "heading ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h6",		0, 0, 0, 0, 0, 0, 0, "heading ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "head",	1, 1, 0, 0, 0, 0, 0, "document head ",
	DECL head_contents, NULL, DECL head_attrs, NULL, NULL
},
{ "hr",		0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
	EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
},
{ "html",	1, 1, 0, 0, 0, 0, 0, "document root element ",
	DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
},
{ "i",		0, 3, 0, 0, 0, 0, 1, "italic text style",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "iframe",	0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
	DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
},
{ "img",	0, 2, 2, 1, 0, 0, 1, "embedded image ",
	EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
},
{ "input",	0, 2, 2, 1, 0, 0, 1, "form control ",
	EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
},
{ "ins",	0, 0, 0, 0, 0, 0, 2, "inserted text",
	DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
},
{ "isindex",	0, 2, 2, 1, 1, 1, 0, "single line prompt ",
	EMPTY, NULL, NULL, DECL prompt_attrs, NULL
},
{ "kbd",	0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "label",	0, 0, 0, 0, 0, 0, 1, "form field label text ",
	DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
},
{ "legend",	0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
	DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
},
{ "li",		0, 1, 1, 0, 0, 0, 0, "list item ",
	DECL html_flow, NULL, DECL html_attrs, NULL, NULL
},
{ "link",	0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
	EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
},
{ "map",	0, 0, 0, 0, 0, 0, 2, "client-side image map ",
	DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
},
{ "menu",	0, 0, 0, 0, 1, 1, 0, "menu list ",
	DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
},
{ "meta",	0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
	EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
},
{ "noframes",	0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
	DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
},
{ "noscript",	0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
	DECL html_flow, "div", DECL html_attrs, NULL, NULL
},
{ "object",	0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
	DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
},
{ "ol",		0, 0, 0, 0, 0, 0, 0, "ordered list ",
	DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
},
{ "optgroup",	0, 0, 0, 0, 0, 0, 0, "option group ",
	DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
},
{ "option",	0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
	DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
},
{ "p",		0, 1, 0, 0, 0, 0, 0, "paragraph ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "param",	0, 2, 2, 1, 0, 0, 0, "named property value ",
	EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
},
{ "pre",	0, 0, 0, 0, 0, 0, 0, "preformatted text ",
	DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
},
{ "q",		0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
	DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
},
{ "s",		0, 3, 0, 0, 1, 1, 1, "strike-through text style",
	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
},
{ "samp",	0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "script",	0, 0, 0, 0, 0, 0, 2, "script statements ",
	DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
},
{ "select",	0, 0, 0, 0, 0, 0, 1, "option selector ",
	DECL select_content, NULL, DECL select_attrs, NULL, NULL
},
{ "small",	0, 3, 0, 0, 0, 0, 1, "small text style",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "span",	0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "strike",	0, 3, 0, 0, 1, 1, 1, "strike-through text",
	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
},
{ "strong",	0, 3, 0, 0, 0, 0, 1, "strong emphasis",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "style",	0, 0, 0, 0, 0, 0, 0, "style info ",
	DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
},
{ "sub",	0, 3, 0, 0, 0, 0, 1, "subscript",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "sup",	0, 3, 0, 0, 0, 0, 1, "superscript ",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "table",	0, 0, 0, 0, 0, 0, 0, "",
	DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
},
{ "tbody",	1, 0, 0, 0, 0, 0, 0, "table body ",
	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
},
{ "td",		0, 0, 0, 0, 0, 0, 0, "table data cell",
	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
},
{ "textarea",	0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
	DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
},
{ "tfoot",	0, 1, 0, 0, 0, 0, 0, "table footer ",
	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
},
{ "th",		0, 1, 0, 0, 0, 0, 0, "table header cell",
	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
},
{ "thead",	0, 1, 0, 0, 0, 0, 0, "table header ",
	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
},
{ "title",	0, 0, 0, 0, 0, 0, 0, "document title ",
	DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
},
{ "tr",		0, 0, 0, 0, 0, 0, 0, "table row ",
	DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
},
{ "tt",		0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "u",		0, 3, 0, 0, 1, 1, 1, "underlined text style",
	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
},
{ "ul",		0, 0, 0, 0, 0, 0, 0, "unordered list ",
	DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
},
{ "var",	0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
}
};
1048 
/*
 * start tags that imply the end of current element
 *
 * Layout: a flat sequence of NULL-terminated groups. The first string
 * of a group is the name of a start tag; the strings after it, up to
 * the group's NULL, are the names of currently open elements which
 * that start tag closes. One extra NULL after the last group ends the
 * table.
 *
 * htmlInitAutoClose() stores the address of the first string of each
 * group in htmlStartCloseIndex, which has room for 99 groups plus a
 * NULL terminator; htmlCheckAutoClose() then scans a single group.
 * FONTSTYLE (used in the "p" group) is a macro defined earlier in the
 * file (not visible here).
 */
static const char * const htmlStartClose[] = {
"form",		"form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
		"dl", "ul", "ol", "menu", "dir", "address", "pre",
		"listing", "xmp", "head", NULL,
"head",		"p", NULL,
"title",	"p", NULL,
"body",		"head", "style", "link", "title", "p", NULL,
"frameset",	"head", "style", "link", "title", "p", NULL,
"li",		"p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
		"pre", "listing", "xmp", "head", "li", NULL,
"hr",		"p", "head", NULL,
"h1",		"p", "head", NULL,
"h2",		"p", "head", NULL,
"h3",		"p", "head", NULL,
"h4",		"p", "head", NULL,
"h5",		"p", "head", NULL,
"h6",		"p", "head", NULL,
"dir",		"p", "head", NULL,
"address",	"p", "head", "ul", NULL,
"pre",		"p", "head", "ul", NULL,
"listing",	"p", "head", NULL,
"xmp",		"p", "head", NULL,
"blockquote",	"p", "head", NULL,
"dl",		"p", "dt", "menu", "dir", "address", "pre", "listing",
		"xmp", "head", NULL,
"dt",		"p", "menu", "dir", "address", "pre", "listing", "xmp",
                "head", "dd", NULL,
"dd",		"p", "menu", "dir", "address", "pre", "listing", "xmp",
                "head", "dt", NULL,
"ul",		"p", "head", "ol", "menu", "dir", "address", "pre",
		"listing", "xmp", NULL,
"ol",		"p", "head", "ul", NULL,
"menu",		"p", "head", "ul", NULL,
"p",		"p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
"div",		"p", "head", NULL,
"noscript",	"script", NULL,
"center",	"font", "b", "i", "p", "head", NULL,
"a",		"a", "head", NULL,
"caption",	"p", NULL,
"colgroup",	"caption", "colgroup", "col", "p", NULL,
"col",		"caption", "col", "p", NULL,
"table",	"p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
		"listing", "xmp", "a", NULL,
"th",		"th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"td",		"th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"tr",		"th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
"thead",	"caption", "col", "colgroup", NULL,
"tfoot",	"th", "td", "tr", "caption", "col", "colgroup", "thead",
		"tbody", "p", NULL,
"tbody",	"th", "td", "tr", "caption", "col", "colgroup", "thead",
		"tfoot", "tbody", "p", NULL,
"optgroup",	"option", NULL,
"option",	"option", NULL,
"fieldset",	"legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
		"pre", "listing", "xmp", "a", NULL,
/* most tags in FONTSTYLE, PHRASE and SPECIAL should close <head> */
"tt",		"head", NULL,
"i",		"head", NULL,
"b",		"head", NULL,
"u",		"head", NULL,
"s",		"head", NULL,
"strike",	"head", NULL,
"big",		"head", NULL,
"small",	"head", NULL,

"em",		"head", NULL,
"strong",	"head", NULL,
"dfn",		"head", NULL,
"code",		"head", NULL,
"samp",		"head", NULL,
"kbd",		"head", NULL,
"var",		"head", NULL,
"cite",		"head", NULL,
"abbr",		"head", NULL,
"acronym",	"head", NULL,

/* "a" */
"img",		"head", NULL,
/* "applet" */
/* "embed" */
/* "object" */
"font",		"head", NULL,
/* "basefont" */
"br",		"head", NULL,
/* "script" */
"map",		"head", NULL,
"q",		"head", NULL,
"sub",		"head", NULL,
"sup",		"head", NULL,
"span",		"head", NULL,
"bdo",		"head", NULL,
"iframe",	"head", NULL,
NULL
};
1146 
/*
 * The list of HTML elements which are supposed not to have
 * CDATA content and where a p element will be implied.
 *
 * NULL-terminated. htmlCheckParagraph() walks this list and opens an
 * implied <p> when the current element name matches one of the entries.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = {
    "html",
    "head",
    NULL
};
1159 
/*
 * The list of HTML attributes which are of content %Script;
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 *
 * The array has no NULL terminator: htmlIsScriptAttribute() derives the
 * element count with sizeof, so entries may simply be appended.
 */
static const char *const htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",
    "onchange",
    "onselect"
};
1185 
/*
 * This table is used by the htmlparser to know what to do with
 * broken html pages. By assigning different priorities to different
 * elements the parser can decide how to handle extra endtags.
 * Endtags are only allowed to close elements with lower or equal
 * priority.
 *
 * The entry whose name is NULL terminates the table and carries the
 * priority returned by htmlGetEndPriority() for every element name
 * that is not listed explicitly.
 */

/* Pairs an element name with its end tag priority. */
typedef struct {
    const char *name;
    int priority;
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    {"div",   150},
    {"td",    160},
    {"th",    160},
    {"tr",    170},
    {"thead", 180},
    {"tbody", 180},
    {"tfoot", 180},
    {"table", 190},
    {"head",  200},
    {"body",  200},
    {"html",  220},
    {NULL,    100} /* Default priority */
};
1213 
/*
 * Lookup index over htmlStartClose: htmlInitAutoClose() stores in each
 * slot the address of the first string of one group and leaves the
 * remaining slots NULL. The size (100) is also hard-coded in
 * htmlInitAutoClose() and htmlCheckAutoClose().
 */
static const char** htmlStartCloseIndex[100];
/* Set to 1 by htmlInitAutoClose() once the index has been built. */
static int htmlStartCloseIndexinitialized = 0;
1216 
1217 /************************************************************************
1218  *									*
1219  *	functions to handle HTML specific data			*
1220  *									*
1221  ************************************************************************/
1222 
1223 /**
1224  * htmlInitAutoClose:
1225  *
1226  * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1227  * This is not reentrant. Call xmlInitParser() once before processing in
1228  * case of use in multithreaded programs.
1229  */
1230 void
1231 htmlInitAutoClose(void) {
1232     int indx, i = 0;
1233 
1234     if (htmlStartCloseIndexinitialized) return;
1235 
1236     for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1237     indx = 0;
1238     while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
1239         htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
1240 	while (htmlStartClose[i] != NULL) i++;
1241 	i++;
1242     }
1243     htmlStartCloseIndexinitialized = 1;
1244 }
1245 
1246 /**
1247  * htmlTagLookup:
1248  * @tag:  The tag name in lowercase
1249  *
1250  * Lookup the HTML tag in the ElementTable
1251  *
1252  * Returns the related htmlElemDescPtr or NULL if not found.
1253  */
1254 const htmlElemDesc *
1255 htmlTagLookup(const xmlChar *tag) {
1256     unsigned int i;
1257 
1258     for (i = 0; i < (sizeof(html40ElementTable) /
1259                      sizeof(html40ElementTable[0]));i++) {
1260         if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
1261 	    return((htmlElemDescPtr) &html40ElementTable[i]);
1262     }
1263     return(NULL);
1264 }
1265 
1266 /**
1267  * htmlGetEndPriority:
1268  * @name: The name of the element to look up the priority for.
1269  *
1270  * Return value: The "endtag" priority.
1271  **/
1272 static int
1273 htmlGetEndPriority (const xmlChar *name) {
1274     int i = 0;
1275 
1276     while ((htmlEndPriority[i].name != NULL) &&
1277 	   (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1278 	i++;
1279 
1280     return(htmlEndPriority[i].priority);
1281 }
1282 
1283 
1284 /**
1285  * htmlCheckAutoClose:
1286  * @newtag:  The new tag name
1287  * @oldtag:  The old tag name
1288  *
1289  * Checks whether the new tag is one of the registered valid tags for
1290  * closing old.
1291  * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1292  *
1293  * Returns 0 if no, 1 if yes.
1294  */
1295 static int
1296 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1297 {
1298     int i, indx;
1299     const char **closed = NULL;
1300 
1301     if (htmlStartCloseIndexinitialized == 0)
1302         htmlInitAutoClose();
1303 
1304     /* inefficient, but not a big deal */
1305     for (indx = 0; indx < 100; indx++) {
1306         closed = htmlStartCloseIndex[indx];
1307         if (closed == NULL)
1308             return (0);
1309         if (xmlStrEqual(BAD_CAST * closed, newtag))
1310             break;
1311     }
1312 
1313     i = closed - htmlStartClose;
1314     i++;
1315     while (htmlStartClose[i] != NULL) {
1316         if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
1317             return (1);
1318         }
1319         i++;
1320     }
1321     return (0);
1322 }
1323 
1324 /**
1325  * htmlAutoCloseOnClose:
1326  * @ctxt:  an HTML parser context
1327  * @newtag:  The new tag name
1328  * @force:  force the tag closure
1329  *
1330  * The HTML DTD allows an ending tag to implicitly close other tags.
1331  */
1332 static void
1333 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1334 {
1335     const htmlElemDesc *info;
1336     int i, priority;
1337 
1338     priority = htmlGetEndPriority(newtag);
1339 
1340     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1341 
1342         if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1343             break;
1344         /*
1345          * A misplaced endtag can only close elements with lower
1346          * or equal priority, so if we find an element with higher
1347          * priority before we find an element with
1348          * matching name, we just ignore this endtag
1349          */
1350         if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1351             return;
1352     }
1353     if (i < 0)
1354         return;
1355 
1356     while (!xmlStrEqual(newtag, ctxt->name)) {
1357         info = htmlTagLookup(ctxt->name);
1358         if ((info != NULL) && (info->endTag == 3)) {
1359             htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1360 	                 "Opening and ending tag mismatch: %s and %s\n",
1361 			 newtag, ctxt->name);
1362         }
1363         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1364             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1365 	htmlnamePop(ctxt);
1366     }
1367 }
1368 
1369 /**
1370  * htmlAutoCloseOnEnd:
1371  * @ctxt:  an HTML parser context
1372  *
1373  * Close all remaining tags at the end of the stream
1374  */
1375 static void
1376 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1377 {
1378     int i;
1379 
1380     if (ctxt->nameNr == 0)
1381         return;
1382     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1383         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1384             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1385 	htmlnamePop(ctxt);
1386     }
1387 }
1388 
1389 /**
1390  * htmlAutoClose:
1391  * @ctxt:  an HTML parser context
1392  * @newtag:  The new tag name or NULL
1393  *
1394  * The HTML DTD allows a tag to implicitly close other tags.
1395  * The list is kept in htmlStartClose array. This function is
1396  * called when a new tag has been detected and generates the
1397  * appropriates closes if possible/needed.
1398  * If newtag is NULL this mean we are at the end of the resource
1399  * and we should check
1400  */
1401 static void
1402 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1403 {
1404     while ((newtag != NULL) && (ctxt->name != NULL) &&
1405            (htmlCheckAutoClose(newtag, ctxt->name))) {
1406         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1407             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1408 	htmlnamePop(ctxt);
1409     }
1410     if (newtag == NULL) {
1411         htmlAutoCloseOnEnd(ctxt);
1412         return;
1413     }
1414     while ((newtag == NULL) && (ctxt->name != NULL) &&
1415            ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1416             (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1417             (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1418         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1419             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1420 	htmlnamePop(ctxt);
1421     }
1422 }
1423 
1424 /**
1425  * htmlAutoCloseTag:
1426  * @doc:  the HTML document
1427  * @name:  The tag name
1428  * @elem:  the HTML element
1429  *
1430  * The HTML DTD allows a tag to implicitly close other tags.
1431  * The list is kept in htmlStartClose array. This function checks
1432  * if the element or one of it's children would autoclose the
1433  * given tag.
1434  *
1435  * Returns 1 if autoclose, 0 otherwise
1436  */
1437 int
1438 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1439     htmlNodePtr child;
1440 
1441     if (elem == NULL) return(1);
1442     if (xmlStrEqual(name, elem->name)) return(0);
1443     if (htmlCheckAutoClose(elem->name, name)) return(1);
1444     child = elem->children;
1445     while (child != NULL) {
1446         if (htmlAutoCloseTag(doc, name, child)) return(1);
1447 	child = child->next;
1448     }
1449     return(0);
1450 }
1451 
1452 /**
1453  * htmlIsAutoClosed:
1454  * @doc:  the HTML document
1455  * @elem:  the HTML element
1456  *
1457  * The HTML DTD allows a tag to implicitly close other tags.
1458  * The list is kept in htmlStartClose array. This function checks
1459  * if a tag is autoclosed by one of it's child
1460  *
1461  * Returns 1 if autoclosed, 0 otherwise
1462  */
1463 int
1464 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1465     htmlNodePtr child;
1466 
1467     if (elem == NULL) return(1);
1468     child = elem->children;
1469     while (child != NULL) {
1470 	if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1471 	child = child->next;
1472     }
1473     return(0);
1474 }
1475 
1476 /**
1477  * htmlCheckImplied:
1478  * @ctxt:  an HTML parser context
1479  * @newtag:  The new tag name
1480  *
1481  * The HTML DTD allows a tag to exists only implicitly
1482  * called when a new tag has been detected and generates the
1483  * appropriates implicit tags if missing
1484  */
1485 static void
1486 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1487     int i;
1488 
1489     if (ctxt->options & HTML_PARSE_NOIMPLIED)
1490         return;
1491     if (!htmlOmittedDefaultValue)
1492 	return;
1493     if (xmlStrEqual(newtag, BAD_CAST"html"))
1494 	return;
1495     if (ctxt->nameNr <= 0) {
1496 	htmlnamePush(ctxt, BAD_CAST"html");
1497 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1498 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1499     }
1500     if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1501         return;
1502     if ((ctxt->nameNr <= 1) &&
1503         ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1504 	 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1505 	 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1506 	 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1507 	 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1508 	 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1509         if (ctxt->html >= 3) {
1510             /* we already saw or generated an <head> before */
1511             return;
1512         }
1513         /*
1514          * dropped OBJECT ... i you put it first BODY will be
1515          * assumed !
1516          */
1517         htmlnamePush(ctxt, BAD_CAST"head");
1518         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1519             ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1520     } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1521 	       (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1522 	       (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1523         if (ctxt->html >= 10) {
1524             /* we already saw or generated a <body> before */
1525             return;
1526         }
1527 	for (i = 0;i < ctxt->nameNr;i++) {
1528 	    if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1529 		return;
1530 	    }
1531 	    if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1532 		return;
1533 	    }
1534 	}
1535 
1536 	htmlnamePush(ctxt, BAD_CAST"body");
1537 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1538 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1539     }
1540 }
1541 
1542 /**
1543  * htmlCheckParagraph
1544  * @ctxt:  an HTML parser context
1545  *
1546  * Check whether a p element need to be implied before inserting
1547  * characters in the current element.
1548  *
1549  * Returns 1 if a paragraph has been inserted, 0 if not and -1
1550  *         in case of error.
1551  */
1552 
1553 static int
1554 htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1555     const xmlChar *tag;
1556     int i;
1557 
1558     if (ctxt == NULL)
1559 	return(-1);
1560     tag = ctxt->name;
1561     if (tag == NULL) {
1562 	htmlAutoClose(ctxt, BAD_CAST"p");
1563 	htmlCheckImplied(ctxt, BAD_CAST"p");
1564 	htmlnamePush(ctxt, BAD_CAST"p");
1565 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1566 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1567 	return(1);
1568     }
1569     if (!htmlOmittedDefaultValue)
1570 	return(0);
1571     for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1572 	if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1573 	    htmlAutoClose(ctxt, BAD_CAST"p");
1574 	    htmlCheckImplied(ctxt, BAD_CAST"p");
1575 	    htmlnamePush(ctxt, BAD_CAST"p");
1576 	    if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1577 		ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1578 	    return(1);
1579 	}
1580     }
1581     return(0);
1582 }
1583 
1584 /**
1585  * htmlIsScriptAttribute:
1586  * @name:  an attribute name
1587  *
1588  * Check if an attribute is of content type Script
1589  *
1590  * Returns 1 is the attribute is a script 0 otherwise
1591  */
1592 int
1593 htmlIsScriptAttribute(const xmlChar *name) {
1594     unsigned int i;
1595 
1596     if (name == NULL)
1597       return(0);
1598     /*
1599      * all script attributes start with 'on'
1600      */
1601     if ((name[0] != 'o') || (name[1] != 'n'))
1602       return(0);
1603     for (i = 0;
1604 	 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1605 	 i++) {
1606 	if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1607 	    return(1);
1608     }
1609     return(0);
1610 }
1611 
1612 /************************************************************************
1613  *									*
1614  *	The list of HTML predefined entities			*
1615  *									*
1616  ************************************************************************/
1617 
1618 
1619 static const htmlEntityDesc  html40EntitiesTable[] = {
1620 /*
1621  * the 4 absolute ones, plus apostrophe.
1622  */
1623 { 34,	"quot",	"quotation mark = APL quote, U+0022 ISOnum" },
1624 { 38,	"amp",	"ampersand, U+0026 ISOnum" },
1625 { 39,	"apos",	"single quote" },
1626 { 60,	"lt",	"less-than sign, U+003C ISOnum" },
1627 { 62,	"gt",	"greater-than sign, U+003E ISOnum" },
1628 
1629 /*
1630  * A bunch still in the 128-255 range
1631  * Replacing them depend really on the charset used.
1632  */
1633 { 160,	"nbsp",	"no-break space = non-breaking space, U+00A0 ISOnum" },
1634 { 161,	"iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1635 { 162,	"cent",	"cent sign, U+00A2 ISOnum" },
1636 { 163,	"pound","pound sign, U+00A3 ISOnum" },
1637 { 164,	"curren","currency sign, U+00A4 ISOnum" },
1638 { 165,	"yen",	"yen sign = yuan sign, U+00A5 ISOnum" },
1639 { 166,	"brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1640 { 167,	"sect",	"section sign, U+00A7 ISOnum" },
1641 { 168,	"uml",	"diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1642 { 169,	"copy",	"copyright sign, U+00A9 ISOnum" },
1643 { 170,	"ordf",	"feminine ordinal indicator, U+00AA ISOnum" },
1644 { 171,	"laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1645 { 172,	"not",	"not sign, U+00AC ISOnum" },
1646 { 173,	"shy",	"soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1647 { 174,	"reg",	"registered sign = registered trade mark sign, U+00AE ISOnum" },
1648 { 175,	"macr",	"macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1649 { 176,	"deg",	"degree sign, U+00B0 ISOnum" },
1650 { 177,	"plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1651 { 178,	"sup2",	"superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1652 { 179,	"sup3",	"superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1653 { 180,	"acute","acute accent = spacing acute, U+00B4 ISOdia" },
1654 { 181,	"micro","micro sign, U+00B5 ISOnum" },
1655 { 182,	"para",	"pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1656 { 183,	"middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1657 { 184,	"cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1658 { 185,	"sup1",	"superscript one = superscript digit one, U+00B9 ISOnum" },
1659 { 186,	"ordm",	"masculine ordinal indicator, U+00BA ISOnum" },
1660 { 187,	"raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1661 { 188,	"frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1662 { 189,	"frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1663 { 190,	"frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1664 { 191,	"iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1665 { 192,	"Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1666 { 193,	"Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1667 { 194,	"Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1668 { 195,	"Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1669 { 196,	"Auml",	"latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1670 { 197,	"Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1671 { 198,	"AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1672 { 199,	"Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1673 { 200,	"Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1674 { 201,	"Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1675 { 202,	"Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1676 { 203,	"Euml",	"latin capital letter E with diaeresis, U+00CB ISOlat1" },
1677 { 204,	"Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1678 { 205,	"Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1679 { 206,	"Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1680 { 207,	"Iuml",	"latin capital letter I with diaeresis, U+00CF ISOlat1" },
1681 { 208,	"ETH",	"latin capital letter ETH, U+00D0 ISOlat1" },
1682 { 209,	"Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1683 { 210,	"Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1684 { 211,	"Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1685 { 212,	"Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1686 { 213,	"Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1687 { 214,	"Ouml",	"latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1688 { 215,	"times","multiplication sign, U+00D7 ISOnum" },
1689 { 216,	"Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1690 { 217,	"Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1691 { 218,	"Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1692 { 219,	"Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1693 { 220,	"Uuml",	"latin capital letter U with diaeresis, U+00DC ISOlat1" },
1694 { 221,	"Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1695 { 222,	"THORN","latin capital letter THORN, U+00DE ISOlat1" },
1696 { 223,	"szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1697 { 224,	"agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1698 { 225,	"aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1699 { 226,	"acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1700 { 227,	"atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1701 { 228,	"auml",	"latin small letter a with diaeresis, U+00E4 ISOlat1" },
1702 { 229,	"aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1703 { 230,	"aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1704 { 231,	"ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1705 { 232,	"egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1706 { 233,	"eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1707 { 234,	"ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1708 { 235,	"euml",	"latin small letter e with diaeresis, U+00EB ISOlat1" },
1709 { 236,	"igrave","latin small letter i with grave, U+00EC ISOlat1" },
1710 { 237,	"iacute","latin small letter i with acute, U+00ED ISOlat1" },
1711 { 238,	"icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1712 { 239,	"iuml",	"latin small letter i with diaeresis, U+00EF ISOlat1" },
1713 { 240,	"eth",	"latin small letter eth, U+00F0 ISOlat1" },
1714 { 241,	"ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1715 { 242,	"ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1716 { 243,	"oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1717 { 244,	"ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1718 { 245,	"otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1719 { 246,	"ouml",	"latin small letter o with diaeresis, U+00F6 ISOlat1" },
1720 { 247,	"divide","division sign, U+00F7 ISOnum" },
1721 { 248,	"oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1722 { 249,	"ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1723 { 250,	"uacute","latin small letter u with acute, U+00FA ISOlat1" },
1724 { 251,	"ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1725 { 252,	"uuml",	"latin small letter u with diaeresis, U+00FC ISOlat1" },
1726 { 253,	"yacute","latin small letter y with acute, U+00FD ISOlat1" },
1727 { 254,	"thorn","latin small letter thorn with, U+00FE ISOlat1" },
1728 { 255,	"yuml",	"latin small letter y with diaeresis, U+00FF ISOlat1" },
1729 
1730 { 338,	"OElig","latin capital ligature OE, U+0152 ISOlat2" },
1731 { 339,	"oelig","latin small ligature oe, U+0153 ISOlat2" },
1732 { 352,	"Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1733 { 353,	"scaron","latin small letter s with caron, U+0161 ISOlat2" },
1734 { 376,	"Yuml",	"latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1735 
/*
 * Anything below should really be kept as entity references
 */
1739 { 402,	"fnof",	"latin small f with hook = function = florin, U+0192 ISOtech" },
1740 
1741 { 710,	"circ",	"modifier letter circumflex accent, U+02C6 ISOpub" },
1742 { 732,	"tilde","small tilde, U+02DC ISOdia" },
1743 
1744 { 913,	"Alpha","greek capital letter alpha, U+0391" },
1745 { 914,	"Beta",	"greek capital letter beta, U+0392" },
1746 { 915,	"Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1747 { 916,	"Delta","greek capital letter delta, U+0394 ISOgrk3" },
1748 { 917,	"Epsilon","greek capital letter epsilon, U+0395" },
1749 { 918,	"Zeta",	"greek capital letter zeta, U+0396" },
1750 { 919,	"Eta",	"greek capital letter eta, U+0397" },
1751 { 920,	"Theta","greek capital letter theta, U+0398 ISOgrk3" },
1752 { 921,	"Iota",	"greek capital letter iota, U+0399" },
1753 { 922,	"Kappa","greek capital letter kappa, U+039A" },
1754 { 923,	"Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1755 { 924,	"Mu",	"greek capital letter mu, U+039C" },
1756 { 925,	"Nu",	"greek capital letter nu, U+039D" },
1757 { 926,	"Xi",	"greek capital letter xi, U+039E ISOgrk3" },
1758 { 927,	"Omicron","greek capital letter omicron, U+039F" },
1759 { 928,	"Pi",	"greek capital letter pi, U+03A0 ISOgrk3" },
1760 { 929,	"Rho",	"greek capital letter rho, U+03A1" },
1761 { 931,	"Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1762 { 932,	"Tau",	"greek capital letter tau, U+03A4" },
1763 { 933,	"Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1764 { 934,	"Phi",	"greek capital letter phi, U+03A6 ISOgrk3" },
1765 { 935,	"Chi",	"greek capital letter chi, U+03A7" },
1766 { 936,	"Psi",	"greek capital letter psi, U+03A8 ISOgrk3" },
1767 { 937,	"Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1768 
1769 { 945,	"alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1770 { 946,	"beta",	"greek small letter beta, U+03B2 ISOgrk3" },
1771 { 947,	"gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1772 { 948,	"delta","greek small letter delta, U+03B4 ISOgrk3" },
1773 { 949,	"epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1774 { 950,	"zeta",	"greek small letter zeta, U+03B6 ISOgrk3" },
1775 { 951,	"eta",	"greek small letter eta, U+03B7 ISOgrk3" },
1776 { 952,	"theta","greek small letter theta, U+03B8 ISOgrk3" },
1777 { 953,	"iota",	"greek small letter iota, U+03B9 ISOgrk3" },
1778 { 954,	"kappa","greek small letter kappa, U+03BA ISOgrk3" },
1779 { 955,	"lambda","greek small letter lambda, U+03BB ISOgrk3" },
1780 { 956,	"mu",	"greek small letter mu, U+03BC ISOgrk3" },
1781 { 957,	"nu",	"greek small letter nu, U+03BD ISOgrk3" },
1782 { 958,	"xi",	"greek small letter xi, U+03BE ISOgrk3" },
1783 { 959,	"omicron","greek small letter omicron, U+03BF NEW" },
1784 { 960,	"pi",	"greek small letter pi, U+03C0 ISOgrk3" },
1785 { 961,	"rho",	"greek small letter rho, U+03C1 ISOgrk3" },
1786 { 962,	"sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1787 { 963,	"sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1788 { 964,	"tau",	"greek small letter tau, U+03C4 ISOgrk3" },
1789 { 965,	"upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1790 { 966,	"phi",	"greek small letter phi, U+03C6 ISOgrk3" },
1791 { 967,	"chi",	"greek small letter chi, U+03C7 ISOgrk3" },
1792 { 968,	"psi",	"greek small letter psi, U+03C8 ISOgrk3" },
1793 { 969,	"omega","greek small letter omega, U+03C9 ISOgrk3" },
1794 { 977,	"thetasym","greek small letter theta symbol, U+03D1 NEW" },
1795 { 978,	"upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1796 { 982,	"piv",	"greek pi symbol, U+03D6 ISOgrk3" },
1797 
1798 { 8194,	"ensp",	"en space, U+2002 ISOpub" },
1799 { 8195,	"emsp",	"em space, U+2003 ISOpub" },
1800 { 8201,	"thinsp","thin space, U+2009 ISOpub" },
1801 { 8204,	"zwnj",	"zero width non-joiner, U+200C NEW RFC 2070" },
1802 { 8205,	"zwj",	"zero width joiner, U+200D NEW RFC 2070" },
1803 { 8206,	"lrm",	"left-to-right mark, U+200E NEW RFC 2070" },
1804 { 8207,	"rlm",	"right-to-left mark, U+200F NEW RFC 2070" },
1805 { 8211,	"ndash","en dash, U+2013 ISOpub" },
1806 { 8212,	"mdash","em dash, U+2014 ISOpub" },
1807 { 8216,	"lsquo","left single quotation mark, U+2018 ISOnum" },
1808 { 8217,	"rsquo","right single quotation mark, U+2019 ISOnum" },
1809 { 8218,	"sbquo","single low-9 quotation mark, U+201A NEW" },
1810 { 8220,	"ldquo","left double quotation mark, U+201C ISOnum" },
1811 { 8221,	"rdquo","right double quotation mark, U+201D ISOnum" },
1812 { 8222,	"bdquo","double low-9 quotation mark, U+201E NEW" },
1813 { 8224,	"dagger","dagger, U+2020 ISOpub" },
1814 { 8225,	"Dagger","double dagger, U+2021 ISOpub" },
1815 
1816 { 8226,	"bull",	"bullet = black small circle, U+2022 ISOpub" },
1817 { 8230,	"hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1818 
1819 { 8240,	"permil","per mille sign, U+2030 ISOtech" },
1820 
1821 { 8242,	"prime","prime = minutes = feet, U+2032 ISOtech" },
1822 { 8243,	"Prime","double prime = seconds = inches, U+2033 ISOtech" },
1823 
1824 { 8249,	"lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1825 { 8250,	"rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1826 
1827 { 8254,	"oline","overline = spacing overscore, U+203E NEW" },
1828 { 8260,	"frasl","fraction slash, U+2044 NEW" },
1829 
1830 { 8364,	"euro",	"euro sign, U+20AC NEW" },
1831 
1832 { 8465,	"image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1833 { 8472,	"weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1834 { 8476,	"real",	"blackletter capital R = real part symbol, U+211C ISOamso" },
1835 { 8482,	"trade","trade mark sign, U+2122 ISOnum" },
1836 { 8501,	"alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1837 { 8592,	"larr",	"leftwards arrow, U+2190 ISOnum" },
1838 { 8593,	"uarr",	"upwards arrow, U+2191 ISOnum" },
1839 { 8594,	"rarr",	"rightwards arrow, U+2192 ISOnum" },
1840 { 8595,	"darr",	"downwards arrow, U+2193 ISOnum" },
1841 { 8596,	"harr",	"left right arrow, U+2194 ISOamsa" },
1842 { 8629,	"crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1843 { 8656,	"lArr",	"leftwards double arrow, U+21D0 ISOtech" },
1844 { 8657,	"uArr",	"upwards double arrow, U+21D1 ISOamsa" },
1845 { 8658,	"rArr",	"rightwards double arrow, U+21D2 ISOtech" },
1846 { 8659,	"dArr",	"downwards double arrow, U+21D3 ISOamsa" },
1847 { 8660,	"hArr",	"left right double arrow, U+21D4 ISOamsa" },
1848 
1849 { 8704,	"forall","for all, U+2200 ISOtech" },
1850 { 8706,	"part",	"partial differential, U+2202 ISOtech" },
1851 { 8707,	"exist","there exists, U+2203 ISOtech" },
1852 { 8709,	"empty","empty set = null set = diameter, U+2205 ISOamso" },
1853 { 8711,	"nabla","nabla = backward difference, U+2207 ISOtech" },
1854 { 8712,	"isin",	"element of, U+2208 ISOtech" },
1855 { 8713,	"notin","not an element of, U+2209 ISOtech" },
1856 { 8715,	"ni",	"contains as member, U+220B ISOtech" },
1857 { 8719,	"prod",	"n-ary product = product sign, U+220F ISOamsb" },
1858 { 8721,	"sum",	"n-ary summation, U+2211 ISOamsb" },
1859 { 8722,	"minus","minus sign, U+2212 ISOtech" },
1860 { 8727,	"lowast","asterisk operator, U+2217 ISOtech" },
1861 { 8730,	"radic","square root = radical sign, U+221A ISOtech" },
1862 { 8733,	"prop",	"proportional to, U+221D ISOtech" },
1863 { 8734,	"infin","infinity, U+221E ISOtech" },
1864 { 8736,	"ang",	"angle, U+2220 ISOamso" },
1865 { 8743,	"and",	"logical and = wedge, U+2227 ISOtech" },
1866 { 8744,	"or",	"logical or = vee, U+2228 ISOtech" },
1867 { 8745,	"cap",	"intersection = cap, U+2229 ISOtech" },
1868 { 8746,	"cup",	"union = cup, U+222A ISOtech" },
1869 { 8747,	"int",	"integral, U+222B ISOtech" },
1870 { 8756,	"there4","therefore, U+2234 ISOtech" },
1871 { 8764,	"sim",	"tilde operator = varies with = similar to, U+223C ISOtech" },
1872 { 8773,	"cong",	"approximately equal to, U+2245 ISOtech" },
1873 { 8776,	"asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1874 { 8800,	"ne",	"not equal to, U+2260 ISOtech" },
1875 { 8801,	"equiv","identical to, U+2261 ISOtech" },
1876 { 8804,	"le",	"less-than or equal to, U+2264 ISOtech" },
1877 { 8805,	"ge",	"greater-than or equal to, U+2265 ISOtech" },
1878 { 8834,	"sub",	"subset of, U+2282 ISOtech" },
1879 { 8835,	"sup",	"superset of, U+2283 ISOtech" },
1880 { 8836,	"nsub",	"not a subset of, U+2284 ISOamsn" },
1881 { 8838,	"sube",	"subset of or equal to, U+2286 ISOtech" },
1882 { 8839,	"supe",	"superset of or equal to, U+2287 ISOtech" },
1883 { 8853,	"oplus","circled plus = direct sum, U+2295 ISOamsb" },
1884 { 8855,	"otimes","circled times = vector product, U+2297 ISOamsb" },
1885 { 8869,	"perp",	"up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1886 { 8901,	"sdot",	"dot operator, U+22C5 ISOamsb" },
1887 { 8968,	"lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1888 { 8969,	"rceil","right ceiling, U+2309 ISOamsc" },
1889 { 8970,	"lfloor","left floor = apl downstile, U+230A ISOamsc" },
1890 { 8971,	"rfloor","right floor, U+230B ISOamsc" },
1891 { 9001,	"lang",	"left-pointing angle bracket = bra, U+2329 ISOtech" },
1892 { 9002,	"rang",	"right-pointing angle bracket = ket, U+232A ISOtech" },
1893 { 9674,	"loz",	"lozenge, U+25CA ISOpub" },
1894 
1895 { 9824,	"spades","black spade suit, U+2660 ISOpub" },
1896 { 9827,	"clubs","black club suit = shamrock, U+2663 ISOpub" },
1897 { 9829,	"hearts","black heart suit = valentine, U+2665 ISOpub" },
1898 { 9830,	"diams","black diamond suit, U+2666 ISOpub" },
1899 
1900 };
1901 
1902 /************************************************************************
1903  *									*
1904  *		Commodity functions to handle entities			*
1905  *									*
1906  ************************************************************************/
1907 
/*
 * growBuffer:
 * @buffer:  name of an xmlChar * variable; a companion variable called
 *           <buffer>_size must be visible at the expansion site
 *
 * Doubles <buffer>_size and reallocates @buffer to hold that many xmlChar.
 * When the reallocation fails, a memory error is reported through the
 * `ctxt` variable of the expanding function, the old buffer is freed and
 * the expanding function returns NULL.
 */
#define growBuffer(buffer) {						\
    xmlChar *tmp;							\
    buffer##_size *= 2;							\
    tmp = (xmlChar *) xmlRealloc(buffer,				\
                                 buffer##_size * sizeof(xmlChar));	\
    if (tmp != NULL) {							\
	buffer = tmp;							\
    } else {								\
	htmlErrMemory(ctxt, "growing buffer\n");			\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
}
1922 
1923 /**
1924  * htmlEntityLookup:
1925  * @name: the entity name
1926  *
1927  * Lookup the given entity in EntitiesTable
1928  *
1929  * TODO: the linear scan is really ugly, an hash table is really needed.
1930  *
1931  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1932  */
1933 const htmlEntityDesc *
1934 htmlEntityLookup(const xmlChar *name) {
1935     unsigned int i;
1936 
1937     for (i = 0;i < (sizeof(html40EntitiesTable)/
1938                     sizeof(html40EntitiesTable[0]));i++) {
1939         if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1940             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1941 	}
1942     }
1943     return(NULL);
1944 }
1945 
1946 /**
1947  * htmlEntityValueLookup:
1948  * @value: the entity's unicode value
1949  *
1950  * Lookup the given entity in EntitiesTable
1951  *
1952  * TODO: the linear scan is really ugly, an hash table is really needed.
1953  *
1954  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1955  */
1956 const htmlEntityDesc *
1957 htmlEntityValueLookup(unsigned int value) {
1958     unsigned int i;
1959 
1960     for (i = 0;i < (sizeof(html40EntitiesTable)/
1961                     sizeof(html40EntitiesTable[0]));i++) {
1962         if (html40EntitiesTable[i].value >= value) {
1963 	    if (html40EntitiesTable[i].value > value)
1964 		break;
1965             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1966 	}
1967     }
1968     return(NULL);
1969 }
1970 
1971 /**
1972  * UTF8ToHtml:
1973  * @out:  a pointer to an array of bytes to store the result
1974  * @outlen:  the length of @out
1975  * @in:  a pointer to an array of UTF-8 chars
1976  * @inlen:  the length of @in
1977  *
1978  * Take a block of UTF-8 chars in and try to convert it to an ASCII
1979  * plus HTML entities block of chars out.
1980  *
1981  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1982  * The value of @inlen after return is the number of octets consumed
1983  *     as the return value is positive, else unpredictable.
1984  * The value of @outlen after return is the number of octets consumed.
1985  */
1986 int
1987 UTF8ToHtml(unsigned char* out, int *outlen,
1988               const unsigned char* in, int *inlen) {
1989     const unsigned char* processed = in;
1990     const unsigned char* outend;
1991     const unsigned char* outstart = out;
1992     const unsigned char* instart = in;
1993     const unsigned char* inend;
1994     unsigned int c, d;
1995     int trailing;
1996 
1997     if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
1998     if (in == NULL) {
1999         /*
2000 	 * initialization nothing to do
2001 	 */
2002 	*outlen = 0;
2003 	*inlen = 0;
2004 	return(0);
2005     }
2006     inend = in + (*inlen);
2007     outend = out + (*outlen);
2008     while (in < inend) {
2009 	d = *in++;
2010 	if      (d < 0x80)  { c= d; trailing= 0; }
2011 	else if (d < 0xC0) {
2012 	    /* trailing byte in leading position */
2013 	    *outlen = out - outstart;
2014 	    *inlen = processed - instart;
2015 	    return(-2);
2016         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2017         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2018         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2019 	else {
2020 	    /* no chance for this in Ascii */
2021 	    *outlen = out - outstart;
2022 	    *inlen = processed - instart;
2023 	    return(-2);
2024 	}
2025 
2026 	if (inend - in < trailing) {
2027 	    break;
2028 	}
2029 
2030 	for ( ; trailing; trailing--) {
2031 	    if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
2032 		break;
2033 	    c <<= 6;
2034 	    c |= d & 0x3F;
2035 	}
2036 
2037 	/* assertion: c is a single UTF-4 value */
2038 	if (c < 0x80) {
2039 	    if (out + 1 >= outend)
2040 		break;
2041 	    *out++ = c;
2042 	} else {
2043 	    int len;
2044 	    const htmlEntityDesc * ent;
2045 	    const char *cp;
2046 	    char nbuf[16];
2047 
2048 	    /*
2049 	     * Try to lookup a predefined HTML entity for it
2050 	     */
2051 
2052 	    ent = htmlEntityValueLookup(c);
2053 	    if (ent == NULL) {
2054 	      snprintf(nbuf, sizeof(nbuf), "#%u", c);
2055 	      cp = nbuf;
2056 	    }
2057 	    else
2058 	      cp = ent->name;
2059 	    len = strlen(cp);
2060 	    if (out + 2 + len >= outend)
2061 		break;
2062 	    *out++ = '&';
2063 	    memcpy(out, cp, len);
2064 	    out += len;
2065 	    *out++ = ';';
2066 	}
2067 	processed = in;
2068     }
2069     *outlen = out - outstart;
2070     *inlen = processed - instart;
2071     return(0);
2072 }
2073 
2074 /**
2075  * htmlEncodeEntities:
2076  * @out:  a pointer to an array of bytes to store the result
2077  * @outlen:  the length of @out
2078  * @in:  a pointer to an array of UTF-8 chars
2079  * @inlen:  the length of @in
2080  * @quoteChar: the quote character to escape (' or ") or zero.
2081  *
2082  * Take a block of UTF-8 chars in and try to convert it to an ASCII
2083  * plus HTML entities block of chars out.
2084  *
2085  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2086  * The value of @inlen after return is the number of octets consumed
2087  *     as the return value is positive, else unpredictable.
2088  * The value of @outlen after return is the number of octets consumed.
2089  */
2090 int
2091 htmlEncodeEntities(unsigned char* out, int *outlen,
2092 		   const unsigned char* in, int *inlen, int quoteChar) {
2093     const unsigned char* processed = in;
2094     const unsigned char* outend;
2095     const unsigned char* outstart = out;
2096     const unsigned char* instart = in;
2097     const unsigned char* inend;
2098     unsigned int c, d;
2099     int trailing;
2100 
2101     if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2102         return(-1);
2103     outend = out + (*outlen);
2104     inend = in + (*inlen);
2105     while (in < inend) {
2106 	d = *in++;
2107 	if      (d < 0x80)  { c= d; trailing= 0; }
2108 	else if (d < 0xC0) {
2109 	    /* trailing byte in leading position */
2110 	    *outlen = out - outstart;
2111 	    *inlen = processed - instart;
2112 	    return(-2);
2113         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2114         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2115         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2116 	else {
2117 	    /* no chance for this in Ascii */
2118 	    *outlen = out - outstart;
2119 	    *inlen = processed - instart;
2120 	    return(-2);
2121 	}
2122 
2123 	if (inend - in < trailing)
2124 	    break;
2125 
2126 	while (trailing--) {
2127 	    if (((d= *in++) & 0xC0) != 0x80) {
2128 		*outlen = out - outstart;
2129 		*inlen = processed - instart;
2130 		return(-2);
2131 	    }
2132 	    c <<= 6;
2133 	    c |= d & 0x3F;
2134 	}
2135 
2136 	/* assertion: c is a single UTF-4 value */
2137 	if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2138 	    (c != '&') && (c != '<') && (c != '>')) {
2139 	    if (out >= outend)
2140 		break;
2141 	    *out++ = c;
2142 	} else {
2143 	    const htmlEntityDesc * ent;
2144 	    const char *cp;
2145 	    char nbuf[16];
2146 	    int len;
2147 
2148 	    /*
2149 	     * Try to lookup a predefined HTML entity for it
2150 	     */
2151 	    ent = htmlEntityValueLookup(c);
2152 	    if (ent == NULL) {
2153 		snprintf(nbuf, sizeof(nbuf), "#%u", c);
2154 		cp = nbuf;
2155 	    }
2156 	    else
2157 		cp = ent->name;
2158 	    len = strlen(cp);
2159 	    if (out + 2 + len > outend)
2160 		break;
2161 	    *out++ = '&';
2162 	    memcpy(out, cp, len);
2163 	    out += len;
2164 	    *out++ = ';';
2165 	}
2166 	processed = in;
2167     }
2168     *outlen = out - outstart;
2169     *inlen = processed - instart;
2170     return(0);
2171 }
2172 
2173 /************************************************************************
2174  *									*
2175  *		Commodity functions to handle streams			*
2176  *									*
2177  ************************************************************************/
2178 
2179 #ifdef LIBXML_PUSH_ENABLED
2180 /**
2181  * htmlNewInputStream:
2182  * @ctxt:  an HTML parser context
2183  *
2184  * Create a new input stream structure
2185  * Returns the new input stream or NULL
2186  */
2187 static htmlParserInputPtr
2188 htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2189     htmlParserInputPtr input;
2190 
2191     input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2192     if (input == NULL) {
2193         htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
2194 	return(NULL);
2195     }
2196     memset(input, 0, sizeof(htmlParserInput));
2197     input->filename = NULL;
2198     input->directory = NULL;
2199     input->base = NULL;
2200     input->cur = NULL;
2201     input->buf = NULL;
2202     input->line = 1;
2203     input->col = 1;
2204     input->buf = NULL;
2205     input->free = NULL;
2206     input->version = NULL;
2207     input->consumed = 0;
2208     input->length = 0;
2209     return(input);
2210 }
2211 #endif
2212 
2213 
2214 /************************************************************************
2215  *									*
2216  *		Commodity functions, cleanup needed ?			*
2217  *									*
2218  ************************************************************************/
/*
 * Names of all tags allowing PCDATA, taken from the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array but I don't want to risk any
 * binary incompatibility
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
2233 
2234 /**
2235  * areBlanks:
2236  * @ctxt:  an HTML parser context
2237  * @str:  a xmlChar *
2238  * @len:  the size of @str
2239  *
2240  * Is this a sequence of blank chars that one can ignore ?
2241  *
2242  * Returns 1 if ignorable 0 otherwise.
2243  */
2244 
2245 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2246     unsigned int i;
2247     int j;
2248     xmlNodePtr lastChild;
2249     xmlDtdPtr dtd;
2250 
2251     for (j = 0;j < len;j++)
2252         if (!(IS_BLANK_CH(str[j]))) return(0);
2253 
2254     if (CUR == 0) return(1);
2255     if (CUR != '<') return(0);
2256     if (ctxt->name == NULL)
2257 	return(1);
2258     if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2259 	return(1);
2260     if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2261 	return(1);
2262 
2263     /* Only strip CDATA children of the body tag for strict HTML DTDs */
2264     if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2265         dtd = xmlGetIntSubset(ctxt->myDoc);
2266         if (dtd != NULL && dtd->ExternalID != NULL) {
2267             if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2268                     !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2269                 return(1);
2270         }
2271     }
2272 
2273     if (ctxt->node == NULL) return(0);
2274     lastChild = xmlGetLastChild(ctxt->node);
2275     while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2276 	lastChild = lastChild->prev;
2277     if (lastChild == NULL) {
2278         if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2279             (ctxt->node->content != NULL)) return(0);
2280 	/* keep ws in constructs like ...<b> </b>...
2281 	   for all tags "b" allowing PCDATA */
2282 	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2283 	    if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2284 		return(0);
2285 	    }
2286 	}
2287     } else if (xmlNodeIsText(lastChild)) {
2288         return(0);
2289     } else {
2290 	/* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2291 	   for all tags "p" allowing PCDATA */
2292 	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2293 	    if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2294 		return(0);
2295 	    }
2296 	}
2297     }
2298     return(1);
2299 }
2300 
2301 /**
2302  * htmlNewDocNoDtD:
2303  * @URI:  URI for the dtd, or NULL
2304  * @ExternalID:  the external ID of the DTD, or NULL
2305  *
2306  * Creates a new HTML document without a DTD node if @URI and @ExternalID
2307  * are NULL
2308  *
2309  * Returns a new document, do not initialize the DTD if not provided
2310  */
2311 htmlDocPtr
2312 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2313     xmlDocPtr cur;
2314 
2315     /*
2316      * Allocate a new document and fill the fields.
2317      */
2318     cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2319     if (cur == NULL) {
2320 	htmlErrMemory(NULL, "HTML document creation failed\n");
2321 	return(NULL);
2322     }
2323     memset(cur, 0, sizeof(xmlDoc));
2324 
2325     cur->type = XML_HTML_DOCUMENT_NODE;
2326     cur->version = NULL;
2327     cur->intSubset = NULL;
2328     cur->doc = cur;
2329     cur->name = NULL;
2330     cur->children = NULL;
2331     cur->extSubset = NULL;
2332     cur->oldNs = NULL;
2333     cur->encoding = NULL;
2334     cur->standalone = 1;
2335     cur->compression = 0;
2336     cur->ids = NULL;
2337     cur->refs = NULL;
2338     cur->_private = NULL;
2339     cur->charset = XML_CHAR_ENCODING_UTF8;
2340     cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2341     if ((ExternalID != NULL) ||
2342 	(URI != NULL))
2343 	xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2344     return(cur);
2345 }
2346 
2347 /**
2348  * htmlNewDoc:
2349  * @URI:  URI for the dtd, or NULL
2350  * @ExternalID:  the external ID of the DTD, or NULL
2351  *
2352  * Creates a new HTML document
2353  *
2354  * Returns a new document
2355  */
2356 htmlDocPtr
2357 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2358     if ((URI == NULL) && (ExternalID == NULL))
2359 	return(htmlNewDocNoDtD(
2360 		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2361 		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2362 
2363     return(htmlNewDocNoDtD(URI, ExternalID));
2364 }
2365 
2366 
2367 /************************************************************************
2368  *									*
2369  *			The parser itself				*
2370  *	Relates to http://www.w3.org/TR/html40				*
2371  *									*
2372  ************************************************************************/
2373 
2374 /************************************************************************
2375  *									*
2376  *			The parser itself				*
2377  *									*
2378  ************************************************************************/
2379 
2380 static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2381 
2382 /**
2383  * htmlParseHTMLName:
2384  * @ctxt:  an HTML parser context
2385  *
2386  * parse an HTML tag or attribute name, note that we convert it to lowercase
2387  * since HTML names are not case-sensitive.
2388  *
2389  * Returns the Tag Name parsed or NULL
2390  */
2391 
2392 static const xmlChar *
2393 htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2394     int i = 0;
2395     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2396 
2397     if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2398         (CUR != ':') && (CUR != '.')) return(NULL);
2399 
2400     while ((i < HTML_PARSER_BUFFER_SIZE) &&
2401            ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2402 	   (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2403            (CUR == '.'))) {
2404 	if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2405         else loc[i] = CUR;
2406 	i++;
2407 
2408 	NEXT;
2409     }
2410 
2411     return(xmlDictLookup(ctxt->dict, loc, i));
2412 }
2413 
2414 
2415 /**
2416  * htmlParseHTMLName_nonInvasive:
2417  * @ctxt:  an HTML parser context
2418  *
2419  * parse an HTML tag or attribute name, note that we convert it to lowercase
2420  * since HTML names are not case-sensitive, this doesn't consume the data
2421  * from the stream, it's a look-ahead
2422  *
2423  * Returns the Tag Name parsed or NULL
2424  */
2425 
2426 static const xmlChar *
2427 htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2428     int i = 0;
2429     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2430 
2431     if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2432         (NXT(1) != ':')) return(NULL);
2433 
2434     while ((i < HTML_PARSER_BUFFER_SIZE) &&
2435            ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2436 	   (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2437 	if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2438         else loc[i] = NXT(1+i);
2439 	i++;
2440     }
2441 
2442     return(xmlDictLookup(ctxt->dict, loc, i));
2443 }
2444 
2445 
2446 /**
2447  * htmlParseName:
2448  * @ctxt:  an HTML parser context
2449  *
2450  * parse an HTML name, this routine is case sensitive.
2451  *
2452  * Returns the Name parsed or NULL
2453  */
2454 
2455 static const xmlChar *
2456 htmlParseName(htmlParserCtxtPtr ctxt) {
2457     const xmlChar *in;
2458     const xmlChar *ret;
2459     int count = 0;
2460 
2461     GROW;
2462 
2463     /*
2464      * Accelerator for simple ASCII names
2465      */
2466     in = ctxt->input->cur;
2467     if (((*in >= 0x61) && (*in <= 0x7A)) ||
2468 	((*in >= 0x41) && (*in <= 0x5A)) ||
2469 	(*in == '_') || (*in == ':')) {
2470 	in++;
2471 	while (((*in >= 0x61) && (*in <= 0x7A)) ||
2472 	       ((*in >= 0x41) && (*in <= 0x5A)) ||
2473 	       ((*in >= 0x30) && (*in <= 0x39)) ||
2474 	       (*in == '_') || (*in == '-') ||
2475 	       (*in == ':') || (*in == '.'))
2476 	    in++;
2477 
2478 	if (in == ctxt->input->end)
2479 	    return(NULL);
2480 
2481 	if ((*in > 0) && (*in < 0x80)) {
2482 	    count = in - ctxt->input->cur;
2483 	    ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2484 	    ctxt->input->cur = in;
2485 	    ctxt->nbChars += count;
2486 	    ctxt->input->col += count;
2487 	    return(ret);
2488 	}
2489     }
2490     return(htmlParseNameComplex(ctxt));
2491 }
2492 
2493 static const xmlChar *
2494 htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2495     int len = 0, l;
2496     int c;
2497     int count = 0;
2498     const xmlChar *base = ctxt->input->base;
2499 
2500     /*
2501      * Handler for more complex cases
2502      */
2503     GROW;
2504     c = CUR_CHAR(l);
2505     if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2506 	(!IS_LETTER(c) && (c != '_') &&
2507          (c != ':'))) {
2508 	return(NULL);
2509     }
2510 
2511     while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2512 	   ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2513             (c == '.') || (c == '-') ||
2514 	    (c == '_') || (c == ':') ||
2515 	    (IS_COMBINING(c)) ||
2516 	    (IS_EXTENDER(c)))) {
2517 	if (count++ > 100) {
2518 	    count = 0;
2519 	    GROW;
2520 	}
2521 	len += l;
2522 	NEXTL(l);
2523 	c = CUR_CHAR(l);
2524 	if (ctxt->input->base != base) {
2525 	    /*
2526 	     * We changed encoding from an unknown encoding
2527 	     * Input buffer changed location, so we better start again
2528 	     */
2529 	    return(htmlParseNameComplex(ctxt));
2530 	}
2531     }
2532 
2533     if (ctxt->input->cur - ctxt->input->base < len) {
2534         /* Sanity check */
2535 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
2536                      "unexpected change of input buffer", NULL, NULL);
2537         return (NULL);
2538     }
2539 
2540     return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2541 }
2542 
2543 
2544 /**
2545  * htmlParseHTMLAttribute:
2546  * @ctxt:  an HTML parser context
2547  * @stop:  a char stop value
2548  *
2549  * parse an HTML attribute value till the stop (quote), if
2550  * stop is 0 then it stops at the first space
2551  *
2552  * Returns the attribute parsed or NULL
2553  */
2554 
2555 static xmlChar *
2556 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2557     xmlChar *buffer = NULL;
2558     int buffer_size = 0;
2559     xmlChar *out = NULL;
2560     const xmlChar *name = NULL;
2561     const xmlChar *cur = NULL;
2562     const htmlEntityDesc * ent;
2563 
2564     /*
2565      * allocate a translation buffer.
2566      */
2567     buffer_size = HTML_PARSER_BUFFER_SIZE;
2568     buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
2569     if (buffer == NULL) {
2570 	htmlErrMemory(ctxt, "buffer allocation failed\n");
2571 	return(NULL);
2572     }
2573     out = buffer;
2574 
2575     /*
2576      * Ok loop until we reach one of the ending chars
2577      */
2578     while ((CUR != 0) && (CUR != stop)) {
2579 	if ((stop == 0) && (CUR == '>')) break;
2580 	if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2581         if (CUR == '&') {
2582 	    if (NXT(1) == '#') {
2583 		unsigned int c;
2584 		int bits;
2585 
2586 		c = htmlParseCharRef(ctxt);
2587 		if      (c <    0x80)
2588 		        { *out++  = c;                bits= -6; }
2589 		else if (c <   0x800)
2590 		        { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2591 		else if (c < 0x10000)
2592 		        { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2593 		else
2594 		        { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2595 
2596 		for ( ; bits >= 0; bits-= 6) {
2597 		    *out++  = ((c >> bits) & 0x3F) | 0x80;
2598 		}
2599 
2600 		if (out - buffer > buffer_size - 100) {
2601 			int indx = out - buffer;
2602 
2603 			growBuffer(buffer);
2604 			out = &buffer[indx];
2605 		}
2606 	    } else {
2607 		ent = htmlParseEntityRef(ctxt, &name);
2608 		if (name == NULL) {
2609 		    *out++ = '&';
2610 		    if (out - buffer > buffer_size - 100) {
2611 			int indx = out - buffer;
2612 
2613 			growBuffer(buffer);
2614 			out = &buffer[indx];
2615 		    }
2616 		} else if (ent == NULL) {
2617 		    *out++ = '&';
2618 		    cur = name;
2619 		    while (*cur != 0) {
2620 			if (out - buffer > buffer_size - 100) {
2621 			    int indx = out - buffer;
2622 
2623 			    growBuffer(buffer);
2624 			    out = &buffer[indx];
2625 			}
2626 			*out++ = *cur++;
2627 		    }
2628 		} else {
2629 		    unsigned int c;
2630 		    int bits;
2631 
2632 		    if (out - buffer > buffer_size - 100) {
2633 			int indx = out - buffer;
2634 
2635 			growBuffer(buffer);
2636 			out = &buffer[indx];
2637 		    }
2638 		    c = ent->value;
2639 		    if      (c <    0x80)
2640 			{ *out++  = c;                bits= -6; }
2641 		    else if (c <   0x800)
2642 			{ *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2643 		    else if (c < 0x10000)
2644 			{ *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2645 		    else
2646 			{ *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2647 
2648 		    for ( ; bits >= 0; bits-= 6) {
2649 			*out++  = ((c >> bits) & 0x3F) | 0x80;
2650 		    }
2651 		}
2652 	    }
2653 	} else {
2654 	    unsigned int c;
2655 	    int bits, l;
2656 
2657 	    if (out - buffer > buffer_size - 100) {
2658 		int indx = out - buffer;
2659 
2660 		growBuffer(buffer);
2661 		out = &buffer[indx];
2662 	    }
2663 	    c = CUR_CHAR(l);
2664 	    if      (c <    0x80)
2665 		    { *out++  = c;                bits= -6; }
2666 	    else if (c <   0x800)
2667 		    { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2668 	    else if (c < 0x10000)
2669 		    { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2670 	    else
2671 		    { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2672 
2673 	    for ( ; bits >= 0; bits-= 6) {
2674 		*out++  = ((c >> bits) & 0x3F) | 0x80;
2675 	    }
2676 	    NEXT;
2677 	}
2678     }
2679     *out = 0;
2680     return(buffer);
2681 }
2682 
2683 /**
2684  * htmlParseEntityRef:
2685  * @ctxt:  an HTML parser context
2686  * @str:  location to store the entity name
2687  *
2688  * parse an HTML ENTITY references
2689  *
2690  * [68] EntityRef ::= '&' Name ';'
2691  *
2692  * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2693  *         if non-NULL *str will have to be freed by the caller.
2694  */
2695 const htmlEntityDesc *
2696 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2697     const xmlChar *name;
2698     const htmlEntityDesc * ent = NULL;
2699 
2700     if (str != NULL) *str = NULL;
2701     if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2702 
2703     if (CUR == '&') {
2704         NEXT;
2705         name = htmlParseName(ctxt);
2706 	if (name == NULL) {
2707 	    htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2708 	                 "htmlParseEntityRef: no name\n", NULL, NULL);
2709 	} else {
2710 	    GROW;
2711 	    if (CUR == ';') {
2712 	        if (str != NULL)
2713 		    *str = name;
2714 
2715 		/*
2716 		 * Lookup the entity in the table.
2717 		 */
2718 		ent = htmlEntityLookup(name);
2719 		if (ent != NULL) /* OK that's ugly !!! */
2720 		    NEXT;
2721 	    } else {
2722 		htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2723 		             "htmlParseEntityRef: expecting ';'\n",
2724 			     NULL, NULL);
2725 	        if (str != NULL)
2726 		    *str = name;
2727 	    }
2728 	}
2729     }
2730     return(ent);
2731 }
2732 
2733 /**
2734  * htmlParseAttValue:
2735  * @ctxt:  an HTML parser context
2736  *
2737  * parse a value for an attribute
2738  * Note: the parser won't do substitution of entities here, this
2739  * will be handled later in xmlStringGetNodeList, unless it was
2740  * asked for ctxt->replaceEntities != 0
2741  *
2742  * Returns the AttValue parsed or NULL.
2743  */
2744 
2745 static xmlChar *
2746 htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2747     xmlChar *ret = NULL;
2748 
2749     if (CUR == '"') {
2750         NEXT;
2751 	ret = htmlParseHTMLAttribute(ctxt, '"');
2752         if (CUR != '"') {
2753 	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2754 	                 "AttValue: \" expected\n", NULL, NULL);
2755 	} else
2756 	    NEXT;
2757     } else if (CUR == '\'') {
2758         NEXT;
2759 	ret = htmlParseHTMLAttribute(ctxt, '\'');
2760         if (CUR != '\'') {
2761 	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2762 	                 "AttValue: ' expected\n", NULL, NULL);
2763 	} else
2764 	    NEXT;
2765     } else {
2766         /*
2767 	 * That's an HTMLism, the attribute value may not be quoted
2768 	 */
2769 	ret = htmlParseHTMLAttribute(ctxt, 0);
2770 	if (ret == NULL) {
2771 	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2772 	                 "AttValue: no value found\n", NULL, NULL);
2773 	}
2774     }
2775     return(ret);
2776 }
2777 
2778 /**
2779  * htmlParseSystemLiteral:
2780  * @ctxt:  an HTML parser context
2781  *
2782  * parse an HTML Literal
2783  *
2784  * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2785  *
2786  * Returns the SystemLiteral parsed or NULL
2787  */
2788 
2789 static xmlChar *
2790 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2791     size_t len = 0, startPosition = 0;
2792     xmlChar *ret = NULL;
2793 
2794     if (CUR == '"') {
2795         NEXT;
2796 
2797         if (CUR_PTR < BASE_PTR)
2798             return(ret);
2799         startPosition = CUR_PTR - BASE_PTR;
2800 
2801 	while ((IS_CHAR_CH(CUR)) && (CUR != '"')) {
2802 	    NEXT;
2803 	    len++;
2804 	}
2805 	if (!IS_CHAR_CH(CUR)) {
2806 	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2807 			 "Unfinished SystemLiteral\n", NULL, NULL);
2808 	} else {
2809 	    ret = xmlStrndup((BASE_PTR+startPosition), len);
2810 	    NEXT;
2811         }
2812     } else if (CUR == '\'') {
2813         NEXT;
2814 
2815         if (CUR_PTR < BASE_PTR)
2816             return(ret);
2817         startPosition = CUR_PTR - BASE_PTR;
2818 
2819 	while ((IS_CHAR_CH(CUR)) && (CUR != '\'')) {
2820 	    NEXT;
2821 	    len++;
2822 	}
2823 	if (!IS_CHAR_CH(CUR)) {
2824 	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2825 			 "Unfinished SystemLiteral\n", NULL, NULL);
2826 	} else {
2827 	    ret = xmlStrndup((BASE_PTR+startPosition), len);
2828 	    NEXT;
2829         }
2830     } else {
2831 	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2832 	             " or ' expected\n", NULL, NULL);
2833     }
2834 
2835     return(ret);
2836 }
2837 
2838 /**
2839  * htmlParsePubidLiteral:
2840  * @ctxt:  an HTML parser context
2841  *
2842  * parse an HTML public literal
2843  *
2844  * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2845  *
2846  * Returns the PubidLiteral parsed or NULL.
2847  */
2848 
2849 static xmlChar *
2850 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2851     size_t len = 0, startPosition = 0;
2852     xmlChar *ret = NULL;
2853     /*
2854      * Name ::= (Letter | '_') (NameChar)*
2855      */
2856     if (CUR == '"') {
2857         NEXT;
2858 
2859         if (CUR_PTR < BASE_PTR)
2860             return(ret);
2861         startPosition = CUR_PTR - BASE_PTR;
2862 
2863         while (IS_PUBIDCHAR_CH(CUR)) {
2864             len++;
2865             NEXT;
2866         }
2867 
2868 	if (CUR != '"') {
2869 	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2870 	                 "Unfinished PubidLiteral\n", NULL, NULL);
2871 	} else {
2872 	    ret = xmlStrndup((BASE_PTR + startPosition), len);
2873 	    NEXT;
2874 	}
2875     } else if (CUR == '\'') {
2876         NEXT;
2877 
2878         if (CUR_PTR < BASE_PTR)
2879             return(ret);
2880         startPosition = CUR_PTR - BASE_PTR;
2881 
2882         while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\'')){
2883             len++;
2884             NEXT;
2885         }
2886 
2887 	if (CUR != '\'') {
2888 	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2889 	                 "Unfinished PubidLiteral\n", NULL, NULL);
2890 	} else {
2891 	    ret = xmlStrndup((BASE_PTR + startPosition), len);
2892 	    NEXT;
2893 	}
2894     } else {
2895 	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2896 	             "PubidLiteral \" or ' expected\n", NULL, NULL);
2897     }
2898 
2899     return(ret);
2900 }
2901 
2902 /**
2903  * htmlParseScript:
2904  * @ctxt:  an HTML parser context
2905  *
2906  * parse the content of an HTML SCRIPT or STYLE element
2907  * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2908  * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2909  * http://www.w3.org/TR/html4/types.html#type-script
2910  * http://www.w3.org/TR/html4/types.html#h-6.15
2911  * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2912  *
2913  * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2914  * element and the value of intrinsic event attributes. User agents must
2915  * not evaluate script data as HTML markup but instead must pass it on as
2916  * data to a script engine.
2917  * NOTES:
2918  * - The content is passed like CDATA
2919  * - the attributes for style and scripting "onXXX" are also described
2920  *   as CDATA but SGML allows entities references in attributes so their
2921  *   processing is identical as other attributes
2922  */
2923 static void
2924 htmlParseScript(htmlParserCtxtPtr ctxt) {
2925     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2926     int nbchar = 0;
2927     int cur,l;
2928 
2929     SHRINK;
2930     cur = CUR_CHAR(l);
2931     while (IS_CHAR_CH(cur)) {
2932 	if ((cur == '<') && (NXT(1) == '/')) {
2933             /*
2934              * One should break here, the specification is clear:
2935              * Authors should therefore escape "</" within the content.
2936              * Escape mechanisms are specific to each scripting or
2937              * style sheet language.
2938              *
2939              * In recovery mode, only break if end tag match the
2940              * current tag, effectively ignoring all tags inside the
2941              * script/style block and treating the entire block as
2942              * CDATA.
2943              */
2944             if (ctxt->recovery) {
2945                 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2946 				   xmlStrlen(ctxt->name)) == 0)
2947                 {
2948                     break; /* while */
2949                 } else {
2950 		    htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
2951 				 "Element %s embeds close tag\n",
2952 		                 ctxt->name, NULL);
2953 		}
2954             } else {
2955                 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2956                     ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2957                 {
2958                     break; /* while */
2959                 }
2960             }
2961 	}
2962 	COPY_BUF(l,buf,nbchar,cur);
2963 	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2964             buf[nbchar] = 0;
2965 	    if (ctxt->sax->cdataBlock!= NULL) {
2966 		/*
2967 		 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2968 		 */
2969 		ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2970 	    } else if (ctxt->sax->characters != NULL) {
2971 		ctxt->sax->characters(ctxt->userData, buf, nbchar);
2972 	    }
2973 	    nbchar = 0;
2974 	}
2975 	GROW;
2976 	NEXTL(l);
2977 	cur = CUR_CHAR(l);
2978     }
2979 
2980     if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
2981         htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2982                     "Invalid char in CDATA 0x%X\n", cur);
2983         if (ctxt->input->cur < ctxt->input->end) {
2984             NEXT;
2985         }
2986     }
2987 
2988     if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2989         buf[nbchar] = 0;
2990 	if (ctxt->sax->cdataBlock!= NULL) {
2991 	    /*
2992 	     * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2993 	     */
2994 	    ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2995 	} else if (ctxt->sax->characters != NULL) {
2996 	    ctxt->sax->characters(ctxt->userData, buf, nbchar);
2997 	}
2998     }
2999 }
3000 
3001 
3002 /**
3003  * htmlParseCharDataInternal:
3004  * @ctxt:  an HTML parser context
3005  * @readahead: optional read ahead character in ascii range
3006  *
3007  * parse a CharData section.
3008  * if we are within a CDATA section ']]>' marks an end of section.
3009  *
3010  * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3011  */
3012 
3013 static void
3014 htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
3015     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
3016     int nbchar = 0;
3017     int cur, l;
3018     int chunk = 0;
3019 
3020     if (readahead)
3021         buf[nbchar++] = readahead;
3022 
3023     SHRINK;
3024     cur = CUR_CHAR(l);
3025     while (((cur != '<') || (ctxt->token == '<')) &&
3026            ((cur != '&') || (ctxt->token == '&')) &&
3027 	   (cur != 0)) {
3028 	if (!(IS_CHAR(cur))) {
3029 	    htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3030 	                "Invalid char in CDATA 0x%X\n", cur);
3031 	} else {
3032 	    COPY_BUF(l,buf,nbchar,cur);
3033 	}
3034 	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3035             buf[nbchar] = 0;
3036 
3037 	    /*
3038 	     * Ok the segment is to be consumed as chars.
3039 	     */
3040 	    if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3041 		if (areBlanks(ctxt, buf, nbchar)) {
3042 		    if (ctxt->keepBlanks) {
3043 			if (ctxt->sax->characters != NULL)
3044 			    ctxt->sax->characters(ctxt->userData, buf, nbchar);
3045 		    } else {
3046 			if (ctxt->sax->ignorableWhitespace != NULL)
3047 			    ctxt->sax->ignorableWhitespace(ctxt->userData,
3048 			                                   buf, nbchar);
3049 		    }
3050 		} else {
3051 		    htmlCheckParagraph(ctxt);
3052 		    if (ctxt->sax->characters != NULL)
3053 			ctxt->sax->characters(ctxt->userData, buf, nbchar);
3054 		}
3055 	    }
3056 	    nbchar = 0;
3057 	}
3058 	NEXTL(l);
3059         chunk++;
3060         if (chunk > HTML_PARSER_BUFFER_SIZE) {
3061             chunk = 0;
3062             SHRINK;
3063             GROW;
3064         }
3065 	cur = CUR_CHAR(l);
3066 	if (cur == 0) {
3067 	    SHRINK;
3068 	    GROW;
3069 	    cur = CUR_CHAR(l);
3070 	}
3071     }
3072     if (nbchar != 0) {
3073         buf[nbchar] = 0;
3074 
3075 	/*
3076 	 * Ok the segment is to be consumed as chars.
3077 	 */
3078 	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3079 	    if (areBlanks(ctxt, buf, nbchar)) {
3080 		if (ctxt->keepBlanks) {
3081 		    if (ctxt->sax->characters != NULL)
3082 			ctxt->sax->characters(ctxt->userData, buf, nbchar);
3083 		} else {
3084 		    if (ctxt->sax->ignorableWhitespace != NULL)
3085 			ctxt->sax->ignorableWhitespace(ctxt->userData,
3086 			                               buf, nbchar);
3087 		}
3088 	    } else {
3089 		htmlCheckParagraph(ctxt);
3090 		if (ctxt->sax->characters != NULL)
3091 		    ctxt->sax->characters(ctxt->userData, buf, nbchar);
3092 	    }
3093 	}
3094     } else {
3095 	/*
3096 	 * Loop detection
3097 	 */
3098 	if (cur == 0)
3099 	    ctxt->instate = XML_PARSER_EOF;
3100     }
3101 }
3102 
3103 /**
3104  * htmlParseCharData:
3105  * @ctxt:  an HTML parser context
3106  *
3107  * parse a CharData section.
3108  * if we are within a CDATA section ']]>' marks an end of section.
3109  *
3110  * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3111  */
3112 
3113 static void
3114 htmlParseCharData(htmlParserCtxtPtr ctxt) {
3115     htmlParseCharDataInternal(ctxt, 0);
3116 }
3117 
3118 /**
3119  * htmlParseExternalID:
3120  * @ctxt:  an HTML parser context
3121  * @publicID:  a xmlChar** receiving PubidLiteral
3122  *
3123  * Parse an External ID or a Public ID
3124  *
3125  * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3126  *                   | 'PUBLIC' S PubidLiteral S SystemLiteral
3127  *
3128  * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3129  *
3130  * Returns the function returns SystemLiteral and in the second
3131  *                case publicID receives PubidLiteral, is strict is off
3132  *                it is possible to return NULL and have publicID set.
3133  */
3134 
3135 static xmlChar *
3136 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
3137     xmlChar *URI = NULL;
3138 
3139     if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3140          (UPP(2) == 'S') && (UPP(3) == 'T') &&
3141 	 (UPP(4) == 'E') && (UPP(5) == 'M')) {
3142         SKIP(6);
3143 	if (!IS_BLANK_CH(CUR)) {
3144 	    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3145 	                 "Space required after 'SYSTEM'\n", NULL, NULL);
3146 	}
3147         SKIP_BLANKS;
3148 	URI = htmlParseSystemLiteral(ctxt);
3149 	if (URI == NULL) {
3150 	    htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3151 	                 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3152         }
3153     } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3154 	       (UPP(2) == 'B') && (UPP(3) == 'L') &&
3155 	       (UPP(4) == 'I') && (UPP(5) == 'C')) {
3156         SKIP(6);
3157 	if (!IS_BLANK_CH(CUR)) {
3158 	    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3159 	                 "Space required after 'PUBLIC'\n", NULL, NULL);
3160 	}
3161         SKIP_BLANKS;
3162 	*publicID = htmlParsePubidLiteral(ctxt);
3163 	if (*publicID == NULL) {
3164 	    htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3165 	                 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3166 			 NULL, NULL);
3167 	}
3168         SKIP_BLANKS;
3169         if ((CUR == '"') || (CUR == '\'')) {
3170 	    URI = htmlParseSystemLiteral(ctxt);
3171 	}
3172     }
3173     return(URI);
3174 }
3175 
3176 /**
3177  * xmlParsePI:
3178  * @ctxt:  an XML parser context
3179  *
3180  * parse an XML Processing Instruction.
3181  *
3182  * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3183  */
3184 static void
3185 htmlParsePI(htmlParserCtxtPtr ctxt) {
3186     xmlChar *buf = NULL;
3187     int len = 0;
3188     int size = HTML_PARSER_BUFFER_SIZE;
3189     int cur, l;
3190     const xmlChar *target;
3191     xmlParserInputState state;
3192     int count = 0;
3193 
3194     if ((RAW == '<') && (NXT(1) == '?')) {
3195 	state = ctxt->instate;
3196         ctxt->instate = XML_PARSER_PI;
3197 	/*
3198 	 * this is a Processing Instruction.
3199 	 */
3200 	SKIP(2);
3201 	SHRINK;
3202 
3203 	/*
3204 	 * Parse the target name and check for special support like
3205 	 * namespace.
3206 	 */
3207         target = htmlParseName(ctxt);
3208 	if (target != NULL) {
3209 	    if (RAW == '>') {
3210 		SKIP(1);
3211 
3212 		/*
3213 		 * SAX: PI detected.
3214 		 */
3215 		if ((ctxt->sax) && (!ctxt->disableSAX) &&
3216 		    (ctxt->sax->processingInstruction != NULL))
3217 		    ctxt->sax->processingInstruction(ctxt->userData,
3218 		                                     target, NULL);
3219 		ctxt->instate = state;
3220 		return;
3221 	    }
3222 	    buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3223 	    if (buf == NULL) {
3224 		htmlErrMemory(ctxt, NULL);
3225 		ctxt->instate = state;
3226 		return;
3227 	    }
3228 	    cur = CUR;
3229 	    if (!IS_BLANK(cur)) {
3230 		htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3231 			  "ParsePI: PI %s space expected\n", target, NULL);
3232 	    }
3233             SKIP_BLANKS;
3234 	    cur = CUR_CHAR(l);
3235 	    while (IS_CHAR(cur) && (cur != '>')) {
3236 		if (len + 5 >= size) {
3237 		    xmlChar *tmp;
3238 
3239 		    size *= 2;
3240 		    tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3241 		    if (tmp == NULL) {
3242 			htmlErrMemory(ctxt, NULL);
3243 			xmlFree(buf);
3244 			ctxt->instate = state;
3245 			return;
3246 		    }
3247 		    buf = tmp;
3248 		}
3249 		count++;
3250 		if (count > 50) {
3251 		    GROW;
3252 		    count = 0;
3253 		}
3254 		COPY_BUF(l,buf,len,cur);
3255 		NEXTL(l);
3256 		cur = CUR_CHAR(l);
3257 		if (cur == 0) {
3258 		    SHRINK;
3259 		    GROW;
3260 		    cur = CUR_CHAR(l);
3261 		}
3262 	    }
3263 	    buf[len] = 0;
3264 	    if (cur != '>') {
3265 		htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3266 		      "ParsePI: PI %s never end ...\n", target, NULL);
3267 	    } else {
3268 		SKIP(1);
3269 
3270 		/*
3271 		 * SAX: PI detected.
3272 		 */
3273 		if ((ctxt->sax) && (!ctxt->disableSAX) &&
3274 		    (ctxt->sax->processingInstruction != NULL))
3275 		    ctxt->sax->processingInstruction(ctxt->userData,
3276 		                                     target, buf);
3277 	    }
3278 	    xmlFree(buf);
3279 	} else {
3280 	    htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3281                          "PI is not started correctly", NULL, NULL);
3282 	}
3283 	ctxt->instate = state;
3284     }
3285 }
3286 
3287 /**
3288  * htmlParseComment:
3289  * @ctxt:  an HTML parser context
3290  *
3291  * Parse an XML (SGML) comment <!-- .... -->
3292  *
3293  * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3294  */
3295 static void
3296 htmlParseComment(htmlParserCtxtPtr ctxt) {
3297     xmlChar *buf = NULL;
3298     int len;
3299     int size = HTML_PARSER_BUFFER_SIZE;
3300     int q, ql;
3301     int r, rl;
3302     int cur, l;
3303     xmlParserInputState state;
3304 
3305     /*
3306      * Check that there is a comment right here.
3307      */
3308     if ((RAW != '<') || (NXT(1) != '!') ||
3309         (NXT(2) != '-') || (NXT(3) != '-')) return;
3310 
3311     state = ctxt->instate;
3312     ctxt->instate = XML_PARSER_COMMENT;
3313     SHRINK;
3314     SKIP(4);
3315     buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3316     if (buf == NULL) {
3317         htmlErrMemory(ctxt, "buffer allocation failed\n");
3318 	ctxt->instate = state;
3319 	return;
3320     }
3321     len = 0;
3322     buf[len] = 0;
3323     q = CUR_CHAR(ql);
3324     if (!IS_CHAR(q))
3325         goto unfinished;
3326     NEXTL(ql);
3327     r = CUR_CHAR(rl);
3328     if (!IS_CHAR(r))
3329         goto unfinished;
3330     NEXTL(rl);
3331     cur = CUR_CHAR(l);
3332     while (IS_CHAR(cur) &&
3333            ((cur != '>') ||
3334 	    (r != '-') || (q != '-'))) {
3335 	if (len + 5 >= size) {
3336 	    xmlChar *tmp;
3337 
3338 	    size *= 2;
3339 	    tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3340 	    if (tmp == NULL) {
3341 	        xmlFree(buf);
3342 	        htmlErrMemory(ctxt, "growing buffer failed\n");
3343 		ctxt->instate = state;
3344 		return;
3345 	    }
3346 	    buf = tmp;
3347 	}
3348 	COPY_BUF(ql,buf,len,q);
3349 	q = r;
3350 	ql = rl;
3351 	r = cur;
3352 	rl = l;
3353 	NEXTL(l);
3354 	cur = CUR_CHAR(l);
3355 	if (cur == 0) {
3356 	    SHRINK;
3357 	    GROW;
3358 	    cur = CUR_CHAR(l);
3359 	}
3360     }
3361     buf[len] = 0;
3362     if (IS_CHAR(cur)) {
3363         NEXT;
3364 	if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3365 	    (!ctxt->disableSAX))
3366 	    ctxt->sax->comment(ctxt->userData, buf);
3367 	xmlFree(buf);
3368 	ctxt->instate = state;
3369 	return;
3370     }
3371 
3372 unfinished:
3373     htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3374 		 "Comment not terminated \n<!--%.50s\n", buf, NULL);
3375     xmlFree(buf);
3376 }
3377 
3378 /**
3379  * htmlParseCharRef:
3380  * @ctxt:  an HTML parser context
3381  *
3382  * parse Reference declarations
3383  *
3384  * [66] CharRef ::= '&#' [0-9]+ ';' |
3385  *                  '&#x' [0-9a-fA-F]+ ';'
3386  *
3387  * Returns the value parsed (as an int)
3388  */
3389 int
3390 htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3391     int val = 0;
3392 
3393     if ((ctxt == NULL) || (ctxt->input == NULL)) {
3394 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3395 		     "htmlParseCharRef: context error\n",
3396 		     NULL, NULL);
3397         return(0);
3398     }
3399     if ((CUR == '&') && (NXT(1) == '#') &&
3400         ((NXT(2) == 'x') || NXT(2) == 'X')) {
3401 	SKIP(3);
3402 	while (CUR != ';') {
3403 	    if ((CUR >= '0') && (CUR <= '9'))
3404 	        val = val * 16 + (CUR - '0');
3405 	    else if ((CUR >= 'a') && (CUR <= 'f'))
3406 	        val = val * 16 + (CUR - 'a') + 10;
3407 	    else if ((CUR >= 'A') && (CUR <= 'F'))
3408 	        val = val * 16 + (CUR - 'A') + 10;
3409 	    else {
3410 	        htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3411 		             "htmlParseCharRef: missing semicolon\n",
3412 			     NULL, NULL);
3413 		break;
3414 	    }
3415 	    NEXT;
3416 	}
3417 	if (CUR == ';')
3418 	    NEXT;
3419     } else if  ((CUR == '&') && (NXT(1) == '#')) {
3420 	SKIP(2);
3421 	while (CUR != ';') {
3422 	    if ((CUR >= '0') && (CUR <= '9'))
3423 	        val = val * 10 + (CUR - '0');
3424 	    else {
3425 	        htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3426 		             "htmlParseCharRef: missing semicolon\n",
3427 			     NULL, NULL);
3428 		break;
3429 	    }
3430 	    NEXT;
3431 	}
3432 	if (CUR == ';')
3433 	    NEXT;
3434     } else {
3435 	htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3436 	             "htmlParseCharRef: invalid value\n", NULL, NULL);
3437     }
3438     /*
3439      * Check the value IS_CHAR ...
3440      */
3441     if (IS_CHAR(val)) {
3442         return(val);
3443     } else {
3444 	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3445 			"htmlParseCharRef: invalid xmlChar value %d\n",
3446 			val);
3447     }
3448     return(0);
3449 }
3450 
3451 
3452 /**
3453  * htmlParseDocTypeDecl:
3454  * @ctxt:  an HTML parser context
3455  *
3456  * parse a DOCTYPE declaration
3457  *
3458  * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3459  *                      ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3460  */
3461 
3462 static void
3463 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3464     const xmlChar *name;
3465     xmlChar *ExternalID = NULL;
3466     xmlChar *URI = NULL;
3467 
3468     /*
3469      * We know that '<!DOCTYPE' has been detected.
3470      */
3471     SKIP(9);
3472 
3473     SKIP_BLANKS;
3474 
3475     /*
3476      * Parse the DOCTYPE name.
3477      */
3478     name = htmlParseName(ctxt);
3479     if (name == NULL) {
3480 	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3481 	             "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3482 		     NULL, NULL);
3483     }
3484     /*
3485      * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3486      */
3487 
3488     SKIP_BLANKS;
3489 
3490     /*
3491      * Check for SystemID and ExternalID
3492      */
3493     URI = htmlParseExternalID(ctxt, &ExternalID);
3494     SKIP_BLANKS;
3495 
3496     /*
3497      * We should be at the end of the DOCTYPE declaration.
3498      */
3499     if (CUR != '>') {
3500 	htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3501 	             "DOCTYPE improperly terminated\n", NULL, NULL);
3502         /* We shouldn't try to resynchronize ... */
3503     }
3504     NEXT;
3505 
3506     /*
3507      * Create or update the document accordingly to the DOCTYPE
3508      */
3509     if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3510 	(!ctxt->disableSAX))
3511 	ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3512 
3513     /*
3514      * Cleanup, since we don't use all those identifiers
3515      */
3516     if (URI != NULL) xmlFree(URI);
3517     if (ExternalID != NULL) xmlFree(ExternalID);
3518 }
3519 
3520 /**
3521  * htmlParseAttribute:
3522  * @ctxt:  an HTML parser context
3523  * @value:  a xmlChar ** used to store the value of the attribute
3524  *
3525  * parse an attribute
3526  *
3527  * [41] Attribute ::= Name Eq AttValue
3528  *
3529  * [25] Eq ::= S? '=' S?
3530  *
3531  * With namespace:
3532  *
3533  * [NS 11] Attribute ::= QName Eq AttValue
3534  *
3535  * Also the case QName == xmlns:??? is handled independently as a namespace
3536  * definition.
3537  *
3538  * Returns the attribute name, and the value in *value.
3539  */
3540 
3541 static const xmlChar *
3542 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3543     const xmlChar *name;
3544     xmlChar *val = NULL;
3545 
3546     *value = NULL;
3547     name = htmlParseHTMLName(ctxt);
3548     if (name == NULL) {
3549 	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3550 	             "error parsing attribute name\n", NULL, NULL);
3551         return(NULL);
3552     }
3553 
3554     /*
3555      * read the value
3556      */
3557     SKIP_BLANKS;
3558     if (CUR == '=') {
3559         NEXT;
3560 	SKIP_BLANKS;
3561 	val = htmlParseAttValue(ctxt);
3562     }
3563 
3564     *value = val;
3565     return(name);
3566 }
3567 
3568 /**
3569  * htmlCheckEncodingDirect:
3570  * @ctxt:  an HTML parser context
3571  * @attvalue: the attribute value
3572  *
3573  * Checks an attribute value to detect
3574  * the encoding
3575  * If a new encoding is detected the parser is switched to decode
3576  * it and pass UTF8
3577  */
3578 static void
3579 htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
3580 
3581     if ((ctxt == NULL) || (encoding == NULL) ||
3582         (ctxt->options & HTML_PARSE_IGNORE_ENC))
3583 	return;
3584 
3585     /* do not change encoding */
3586     if (ctxt->input->encoding != NULL)
3587         return;
3588 
3589     if (encoding != NULL) {
3590 	xmlCharEncoding enc;
3591 	xmlCharEncodingHandlerPtr handler;
3592 
3593 	while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3594 
3595 	if (ctxt->input->encoding != NULL)
3596 	    xmlFree((xmlChar *) ctxt->input->encoding);
3597 	ctxt->input->encoding = xmlStrdup(encoding);
3598 
3599 	enc = xmlParseCharEncoding((const char *) encoding);
3600 	/*
3601 	 * registered set of known encodings
3602 	 */
3603 	if (enc != XML_CHAR_ENCODING_ERROR) {
3604 	    if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3605 	         (enc == XML_CHAR_ENCODING_UTF16BE) ||
3606 		 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3607 		 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3608 		(ctxt->input->buf != NULL) &&
3609 		(ctxt->input->buf->encoder == NULL)) {
3610 		htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3611 		             "htmlCheckEncoding: wrong encoding meta\n",
3612 			     NULL, NULL);
3613 	    } else {
3614 		xmlSwitchEncoding(ctxt, enc);
3615 	    }
3616 	    ctxt->charset = XML_CHAR_ENCODING_UTF8;
3617 	} else {
3618 	    /*
3619 	     * fallback for unknown encodings
3620 	     */
3621 	    handler = xmlFindCharEncodingHandler((const char *) encoding);
3622 	    if (handler != NULL) {
3623 		xmlSwitchToEncoding(ctxt, handler);
3624 		ctxt->charset = XML_CHAR_ENCODING_UTF8;
3625 	    } else {
3626 		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
3627 		             "htmlCheckEncoding: unknown encoding %s\n",
3628 			     encoding, NULL);
3629 	    }
3630 	}
3631 
3632 	if ((ctxt->input->buf != NULL) &&
3633 	    (ctxt->input->buf->encoder != NULL) &&
3634 	    (ctxt->input->buf->raw != NULL) &&
3635 	    (ctxt->input->buf->buffer != NULL)) {
3636 	    int nbchars;
3637 	    int processed;
3638 
3639 	    /*
3640 	     * convert as much as possible to the parser reading buffer.
3641 	     */
3642 	    processed = ctxt->input->cur - ctxt->input->base;
3643 	    xmlBufShrink(ctxt->input->buf->buffer, processed);
3644 	    nbchars = xmlCharEncInput(ctxt->input->buf, 1);
3645             xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input);
3646 	    if (nbchars < 0) {
3647 		htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3648 		             "htmlCheckEncoding: encoder error\n",
3649 			     NULL, NULL);
3650 	    }
3651 	}
3652     }
3653 }
3654 
3655 /**
3656  * htmlCheckEncoding:
3657  * @ctxt:  an HTML parser context
3658  * @attvalue: the attribute value
3659  *
3660  * Checks an http-equiv attribute from a Meta tag to detect
3661  * the encoding
3662  * If a new encoding is detected the parser is switched to decode
3663  * it and pass UTF8
3664  */
3665 static void
3666 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3667     const xmlChar *encoding;
3668 
3669     if (!attvalue)
3670 	return;
3671 
3672     encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3673     if (encoding != NULL) {
3674 	encoding += 7;
3675     }
3676     /*
3677      * skip blank
3678      */
3679     if (encoding && IS_BLANK_CH(*encoding))
3680 	encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3681     if (encoding && *encoding == '=') {
3682 	encoding ++;
3683 	htmlCheckEncodingDirect(ctxt, encoding);
3684     }
3685 }
3686 
3687 /**
3688  * htmlCheckMeta:
3689  * @ctxt:  an HTML parser context
3690  * @atts:  the attributes values
3691  *
3692  * Checks an attributes from a Meta tag
3693  */
3694 static void
3695 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3696     int i;
3697     const xmlChar *att, *value;
3698     int http = 0;
3699     const xmlChar *content = NULL;
3700 
3701     if ((ctxt == NULL) || (atts == NULL))
3702 	return;
3703 
3704     i = 0;
3705     att = atts[i++];
3706     while (att != NULL) {
3707 	value = atts[i++];
3708 	if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3709 	 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3710 	    http = 1;
3711 	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
3712 	    htmlCheckEncodingDirect(ctxt, value);
3713 	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3714 	    content = value;
3715 	att = atts[i++];
3716     }
3717     if ((http) && (content != NULL))
3718 	htmlCheckEncoding(ctxt, content);
3719 
3720 }
3721 
/**
 * htmlParseStartTag:
 * @ctxt:  an HTML parser context
 *
 * parse a start of tag either for rule element or
 * EmptyElement. In both case we don't parse the tag closing chars.
 *
 * [40] STag ::= '<' Name (S Attribute)* S? '>'
 *
 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
 *
 * With namespace:
 *
 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
 *
 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
 *
 * Unless the tag is discarded, the element name is pushed on the name
 * stack and the SAX startElement callback, if any, is invoked with the
 * collected attributes (or NULL when there are none).
 *
 * Returns 0 in case of success, -1 in case of error and 1 if discarded
 */

static int
htmlParseStartTag(htmlParserCtxtPtr ctxt) {
    const xmlChar *name;
    const xmlChar *attname;
    xmlChar *attvalue;
    const xmlChar **atts;	/* name/value pairs, NULL terminated */
    int nbatts = 0;		/* number of used slots in atts */
    int maxatts;		/* number of allocated slots in atts */
    int meta = 0;		/* set when the element is <meta> */
    int i;
    int discardtag = 0;		/* set for misplaced html/head/body */

    if ((ctxt == NULL) || (ctxt->input == NULL)) {
	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
		     "htmlParseStartTag: context error\n", NULL, NULL);
	return -1;
    }
    if (ctxt->instate == XML_PARSER_EOF)
        return(-1);
    if (CUR != '<') return -1;
    NEXT;

    /* Reuse the attribute array kept in the context by earlier calls. */
    atts = ctxt->atts;
    maxatts = ctxt->maxatts;

    GROW;
    name = htmlParseHTMLName(ctxt);
    if (name == NULL) {
	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
	             "htmlParseStartTag: invalid element name\n",
		     NULL, NULL);
	/* if recover preserve text on classic misconstructs */
	if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') ||
	    (CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) {
	    htmlParseCharDataInternal(ctxt, '<');
	    return(-1);
	}


	/* Dump the bogus tag like browsers do */
	while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
               (ctxt->instate != XML_PARSER_EOF))
	    NEXT;
        return -1;
    }
    if (xmlStrEqual(name, BAD_CAST"meta"))
	meta = 1;

    /*
     * Check for auto-closure of HTML elements.
     */
    htmlAutoClose(ctxt, name);

    /*
     * Check for implied HTML elements.
     */
    htmlCheckImplied(ctxt, name);

    /*
     * Avoid html at any level > 0, head at any level != 1
     * or any attempt to recurse body
     *
     * Each discarded tag bumps ctxt->depth; htmlParseEndTag decrements
     * it again when it meets an html/body/head end tag while depth > 0.
     */
    if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
	             "htmlParseStartTag: misplaced <html> tag\n",
		     name, NULL);
	discardtag = 1;
	ctxt->depth++;
    }
    if ((ctxt->nameNr != 1) &&
	(xmlStrEqual(name, BAD_CAST"head"))) {
	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
	             "htmlParseStartTag: misplaced <head> tag\n",
		     name, NULL);
	discardtag = 1;
	ctxt->depth++;
    }
    if (xmlStrEqual(name, BAD_CAST"body")) {
	int indx;
	/*
	 * NOTE(review): the loop does not stop at the first match, so the
	 * error is raised and depth incremented once per "body" entry
	 * found on the name stack.
	 */
	for (indx = 0;indx < ctxt->nameNr;indx++) {
	    if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
		             "htmlParseStartTag: misplaced <body> tag\n",
			     name, NULL);
		discardtag = 1;
		ctxt->depth++;
	    }
	}
    }

    /*
     * Now parse the attributes, it ends up with the ending
     *
     * (S Attribute)* S?
     */
    SKIP_BLANKS;
    while ((IS_CHAR_CH(CUR)) &&
           (CUR != '>') &&
	   ((CUR != '/') || (NXT(1) != '>'))) {
	/* snapshot of the consumed count, used below to detect a stall */
	long cons = ctxt->nbChars;

	GROW;
	attname = htmlParseAttribute(ctxt, &attvalue);
        if (attname != NULL) {

	    /*
	     * Well formedness requires at most one declaration of an attribute
	     * A repeated attribute is reported and its value dropped.
	     */
	    for (i = 0; i < nbatts;i += 2) {
	        if (xmlStrEqual(atts[i], attname)) {
		    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
		                 "Attribute %s redefined\n", attname, NULL);
		    if (attvalue != NULL)
			xmlFree(attvalue);
		    goto failed;
		}
	    }

	    /*
	     * Add the pair to atts
	     */
	    if (atts == NULL) {
	        maxatts = 22; /* allow for 10 attrs by default */
	        atts = (const xmlChar **)
		       xmlMalloc(maxatts * sizeof(xmlChar *));
		if (atts == NULL) {
		    htmlErrMemory(ctxt, NULL);
		    if (attvalue != NULL)
			xmlFree(attvalue);
		    goto failed;
		}
		ctxt->atts = atts;
		ctxt->maxatts = maxatts;
	    } else if (nbatts + 4 > maxatts) {
		/* need room for the new pair plus the two NULL terminators */
	        const xmlChar **n;

	        maxatts *= 2;
	        n = (const xmlChar **) xmlRealloc((void *) atts,
					     maxatts * sizeof(const xmlChar *));
		if (n == NULL) {
		    htmlErrMemory(ctxt, NULL);
		    if (attvalue != NULL)
			xmlFree(attvalue);
		    goto failed;
		}
		atts = n;
		ctxt->atts = atts;
		ctxt->maxatts = maxatts;
	    }
	    atts[nbatts++] = attname;
	    atts[nbatts++] = attvalue;
	    atts[nbatts] = NULL;
	    atts[nbatts + 1] = NULL;
	}
	else {
	    if (attvalue != NULL)
	        xmlFree(attvalue);
	    /* Dump the bogus attribute string up to the next blank or
	     * the end of the tag. */
	    while ((IS_CHAR_CH(CUR)) &&
	           !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
		   ((CUR != '/') || (NXT(1) != '>')))
		NEXT;
	}

failed:
	SKIP_BLANKS;
	/* nothing was consumed in this iteration: stop instead of looping */
        if (cons == ctxt->nbChars) {
	    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
	                 "htmlParseStartTag: problem parsing attributes\n",
			 NULL, NULL);
	    break;
	}
    }

    /*
     * Handle specific association to the META tag
     */
    if (meta && (nbatts != 0))
	htmlCheckMeta(ctxt, atts);

    /*
     * SAX: Start of Element !
     */
    if (!discardtag) {
	htmlnamePush(ctxt, name);
	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
	    if (nbatts != 0)
		ctxt->sax->startElement(ctxt->userData, name, atts);
	    else
		ctxt->sax->startElement(ctxt->userData, name, NULL);
	}
    }

    /*
     * Release the attribute values (odd slots). The names are not freed
     * here and the array itself stays in ctxt->atts.
     */
    if (atts != NULL) {
        for (i = 1;i < nbatts;i += 2) {
	    if (atts[i] != NULL)
		xmlFree((xmlChar *) atts[i]);
	}
    }

    return(discardtag);
}
3945 
3946 /**
3947  * htmlParseEndTag:
3948  * @ctxt:  an HTML parser context
3949  *
3950  * parse an end of tag
3951  *
3952  * [42] ETag ::= '</' Name S? '>'
3953  *
3954  * With namespace
3955  *
3956  * [NS 9] ETag ::= '</' QName S? '>'
3957  *
3958  * Returns 1 if the current level should be closed.
3959  */
3960 
3961 static int
3962 htmlParseEndTag(htmlParserCtxtPtr ctxt)
3963 {
3964     const xmlChar *name;
3965     const xmlChar *oldname;
3966     int i, ret;
3967 
3968     if ((CUR != '<') || (NXT(1) != '/')) {
3969         htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3970 	             "htmlParseEndTag: '</' not found\n", NULL, NULL);
3971         return (0);
3972     }
3973     SKIP(2);
3974 
3975     name = htmlParseHTMLName(ctxt);
3976     if (name == NULL)
3977         return (0);
3978     /*
3979      * We should definitely be at the ending "S? '>'" part
3980      */
3981     SKIP_BLANKS;
3982     if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
3983         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3984 	             "End tag : expected '>'\n", NULL, NULL);
3985 	if (ctxt->recovery) {
3986 	    /*
3987 	     * We're not at the ending > !!
3988 	     * Error, unless in recover mode where we search forwards
3989 	     * until we find a >
3990 	     */
3991 	    while (CUR != '\0' && CUR != '>') NEXT;
3992 	    NEXT;
3993 	}
3994     } else
3995         NEXT;
3996 
3997     /*
3998      * if we ignored misplaced tags in htmlParseStartTag don't pop them
3999      * out now.
4000      */
4001     if ((ctxt->depth > 0) &&
4002         (xmlStrEqual(name, BAD_CAST "html") ||
4003          xmlStrEqual(name, BAD_CAST "body") ||
4004 	 xmlStrEqual(name, BAD_CAST "head"))) {
4005 	ctxt->depth--;
4006 	return (0);
4007     }
4008 
4009     /*
4010      * If the name read is not one of the element in the parsing stack
4011      * then return, it's just an error.
4012      */
4013     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4014         if (xmlStrEqual(name, ctxt->nameTab[i]))
4015             break;
4016     }
4017     if (i < 0) {
4018         htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4019 	             "Unexpected end tag : %s\n", name, NULL);
4020         return (0);
4021     }
4022 
4023 
4024     /*
4025      * Check for auto-closure of HTML elements.
4026      */
4027 
4028     htmlAutoCloseOnClose(ctxt, name);
4029 
4030     /*
4031      * Well formedness constraints, opening and closing must match.
4032      * With the exception that the autoclose may have popped stuff out
4033      * of the stack.
4034      */
4035     if (!xmlStrEqual(name, ctxt->name)) {
4036         if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4037             htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4038 	                 "Opening and ending tag mismatch: %s and %s\n",
4039 			 name, ctxt->name);
4040         }
4041     }
4042 
4043     /*
4044      * SAX: End of Tag
4045      */
4046     oldname = ctxt->name;
4047     if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4048         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4049             ctxt->sax->endElement(ctxt->userData, name);
4050 	htmlNodeInfoPop(ctxt);
4051         htmlnamePop(ctxt);
4052         ret = 1;
4053     } else {
4054         ret = 0;
4055     }
4056 
4057     return (ret);
4058 }
4059 
4060 
4061 /**
4062  * htmlParseReference:
4063  * @ctxt:  an HTML parser context
4064  *
4065  * parse and handle entity references in content,
4066  * this will end-up in a call to character() since this is either a
4067  * CharRef, or a predefined entity.
4068  */
4069 static void
4070 htmlParseReference(htmlParserCtxtPtr ctxt) {
4071     const htmlEntityDesc * ent;
4072     xmlChar out[6];
4073     const xmlChar *name;
4074     if (CUR != '&') return;
4075 
4076     if (NXT(1) == '#') {
4077 	unsigned int c;
4078 	int bits, i = 0;
4079 
4080 	c = htmlParseCharRef(ctxt);
4081 	if (c == 0)
4082 	    return;
4083 
4084         if      (c <    0x80) { out[i++]= c;                bits= -6; }
4085         else if (c <   0x800) { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
4086         else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
4087         else                  { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
4088 
4089         for ( ; bits >= 0; bits-= 6) {
4090             out[i++]= ((c >> bits) & 0x3F) | 0x80;
4091         }
4092 	out[i] = 0;
4093 
4094 	htmlCheckParagraph(ctxt);
4095 	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4096 	    ctxt->sax->characters(ctxt->userData, out, i);
4097     } else {
4098 	ent = htmlParseEntityRef(ctxt, &name);
4099 	if (name == NULL) {
4100 	    htmlCheckParagraph(ctxt);
4101 	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4102 	        ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4103 	    return;
4104 	}
4105 	if ((ent == NULL) || !(ent->value > 0)) {
4106 	    htmlCheckParagraph(ctxt);
4107 	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4108 		ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4109 		ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4110 		/* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
4111 	    }
4112 	} else {
4113 	    unsigned int c;
4114 	    int bits, i = 0;
4115 
4116 	    c = ent->value;
4117 	    if      (c <    0x80)
4118 	            { out[i++]= c;                bits= -6; }
4119 	    else if (c <   0x800)
4120 	            { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
4121 	    else if (c < 0x10000)
4122 	            { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
4123 	    else
4124 	            { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
4125 
4126 	    for ( ; bits >= 0; bits-= 6) {
4127 		out[i++]= ((c >> bits) & 0x3F) | 0x80;
4128 	    }
4129 	    out[i] = 0;
4130 
4131 	    htmlCheckParagraph(ctxt);
4132 	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4133 		ctxt->sax->characters(ctxt->userData, out, i);
4134 	}
4135     }
4136 }
4137 
/**
 * htmlParseContent:
 * @ctxt:  an HTML parser context
 *
 * Parse a content: comment, sub-element, reference or text.
 * Kept for compatibility with old code
 *
 * The loop runs until the end of the input, until an end tag closes an
 * element, until the element that was current on entry is no longer the
 * one on top of the name stack, or until an iteration consumes nothing.
 */

static void
htmlParseContent(htmlParserCtxtPtr ctxt) {
    xmlChar *currentNode;	/* copy of the element name current on entry */
    int depth;			/* name stack depth on entry */
    const xmlChar *name;

    currentNode = xmlStrdup(ctxt->name);
    depth = ctxt->nameNr;
    while (1) {
	/* snapshot of the consumed count, used below to detect a stall */
	long cons = ctxt->nbChars;

        GROW;

        if (ctxt->instate == XML_PARSER_EOF)
            break;

	/*
	 * Our tag or one of it's parent or children is ending.
	 */
        if ((CUR == '<') && (NXT(1) == '/')) {
	    if (htmlParseEndTag(ctxt) &&
		((currentNode != NULL) || (ctxt->nameNr == 0))) {
		if (currentNode != NULL)
		    xmlFree(currentNode);
		return;
	    }
	    continue; /* while */
        }

	/*
	 * A start tag is coming: peek at its name to see whether it
	 * auto-closes the current element before parsing anything else.
	 */
	else if ((CUR == '<') &&
	         ((IS_ASCII_LETTER(NXT(1))) ||
		  (NXT(1) == '_') || (NXT(1) == ':'))) {
	    name = htmlParseHTMLName_nonInvasive(ctxt);
	    if (name == NULL) {
	        htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
			 "htmlParseStartTag: invalid element name\n",
			 NULL, NULL);
	        /* Dump the bogus tag like browsers do */
        while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
	            NEXT;

	        if (currentNode != NULL)
	            xmlFree(currentNode);
	        return;
	    }

	    if (ctxt->name != NULL) {
	        if (htmlCheckAutoClose(name, ctxt->name) == 1) {
	            htmlAutoClose(ctxt, name);
	            continue;
	        }
	    }
	}

	/*
	 * Has this node been popped out during parsing of
	 * the next element
	 */
        if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
	    (!xmlStrEqual(currentNode, ctxt->name)))
	     {
	    if (currentNode != NULL) xmlFree(currentNode);
	    return;
	}

	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
	    (xmlStrEqual(currentNode, BAD_CAST"style")))) {
	    /*
	     * Handle SCRIPT/STYLE separately
	     */
	    htmlParseScript(ctxt);
	} else {
	    /*
	     * Sometimes DOCTYPE arrives in the middle of the document
	     * It is reported as an error but still parsed.
	     */
	    if ((CUR == '<') && (NXT(1) == '!') &&
		(UPP(2) == 'D') && (UPP(3) == 'O') &&
		(UPP(4) == 'C') && (UPP(5) == 'T') &&
		(UPP(6) == 'Y') && (UPP(7) == 'P') &&
		(UPP(8) == 'E')) {
		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
		             "Misplaced DOCTYPE declaration\n",
			     BAD_CAST "DOCTYPE" , NULL);
		htmlParseDocTypeDecl(ctxt);
	    }

	    /*
	     * First case :  a comment
	     */
	    if ((CUR == '<') && (NXT(1) == '!') &&
		(NXT(2) == '-') && (NXT(3) == '-')) {
		htmlParseComment(ctxt);
	    }

	    /*
	     * Second case : a Processing Instruction.
	     */
	    else if ((CUR == '<') && (NXT(1) == '?')) {
		htmlParsePI(ctxt);
	    }

	    /*
	     * Third case :  a sub-element.
	     */
	    else if (CUR == '<') {
		htmlParseElement(ctxt);
	    }

	    /*
	     * Fourth case : a reference. If if has not been resolved,
	     *    parsing returns it's Name, create the node
	     */
	    else if (CUR == '&') {
		htmlParseReference(ctxt);
	    }

	    /*
	     * Fifth case : end of the resource
	     */
	    else if (CUR == 0) {
		htmlAutoCloseOnEnd(ctxt);
		break;
	    }

	    /*
	     * Last case, text. Note that References are handled directly.
	     */
	    else {
		htmlParseCharData(ctxt);
	    }

	    /*
	     * Nothing was consumed by this iteration: leave the loop
	     * rather than spin. The error is only raised when a node is
	     * currently open.
	     */
	    if (cons == ctxt->nbChars) {
		if (ctxt->node != NULL) {
		    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
		                 "detected an error in element content\n",
				 NULL, NULL);
		}
		break;
	    }
	}
        GROW;
    }
    if (currentNode != NULL) xmlFree(currentNode);
}
4290 
4291 /**
4292  * htmlParseElement:
4293  * @ctxt:  an HTML parser context
4294  *
4295  * parse an HTML element, this is highly recursive
4296  * this is kept for compatibility with previous code versions
4297  *
4298  * [39] element ::= EmptyElemTag | STag content ETag
4299  *
4300  * [41] Attribute ::= Name Eq AttValue
4301  */
4302 
4303 void
4304 htmlParseElement(htmlParserCtxtPtr ctxt) {
4305     const xmlChar *name;
4306     xmlChar *currentNode = NULL;
4307     const htmlElemDesc * info;
4308     htmlParserNodeInfo node_info;
4309     int failed;
4310     int depth;
4311     const xmlChar *oldptr;
4312 
4313     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4314 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4315 		     "htmlParseElement: context error\n", NULL, NULL);
4316 	return;
4317     }
4318 
4319     if (ctxt->instate == XML_PARSER_EOF)
4320         return;
4321 
4322     /* Capture start position */
4323     if (ctxt->record_info) {
4324         node_info.begin_pos = ctxt->input->consumed +
4325                           (CUR_PTR - ctxt->input->base);
4326 	node_info.begin_line = ctxt->input->line;
4327     }
4328 
4329     failed = htmlParseStartTag(ctxt);
4330     name = ctxt->name;
4331     if ((failed == -1) || (name == NULL)) {
4332 	if (CUR == '>')
4333 	    NEXT;
4334         return;
4335     }
4336 
4337     /*
4338      * Lookup the info for that element.
4339      */
4340     info = htmlTagLookup(name);
4341     if (info == NULL) {
4342 	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4343 	             "Tag %s invalid\n", name, NULL);
4344     }
4345 
4346     /*
4347      * Check for an Empty Element labeled the XML/SGML way
4348      */
4349     if ((CUR == '/') && (NXT(1) == '>')) {
4350         SKIP(2);
4351 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4352 	    ctxt->sax->endElement(ctxt->userData, name);
4353 	htmlnamePop(ctxt);
4354 	return;
4355     }
4356 
4357     if (CUR == '>') {
4358         NEXT;
4359     } else {
4360 	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4361 	             "Couldn't find end of Start Tag %s\n", name, NULL);
4362 
4363 	/*
4364 	 * end of parsing of this node.
4365 	 */
4366 	if (xmlStrEqual(name, ctxt->name)) {
4367 	    nodePop(ctxt);
4368 	    htmlnamePop(ctxt);
4369 	}
4370 
4371 	/*
4372 	 * Capture end position and add node
4373 	 */
4374 	if (ctxt->record_info) {
4375 	   node_info.end_pos = ctxt->input->consumed +
4376 			      (CUR_PTR - ctxt->input->base);
4377 	   node_info.end_line = ctxt->input->line;
4378 	   node_info.node = ctxt->node;
4379 	   xmlParserAddNodeInfo(ctxt, &node_info);
4380 	}
4381 	return;
4382     }
4383 
4384     /*
4385      * Check for an Empty Element from DTD definition
4386      */
4387     if ((info != NULL) && (info->empty)) {
4388 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4389 	    ctxt->sax->endElement(ctxt->userData, name);
4390 	htmlnamePop(ctxt);
4391 	return;
4392     }
4393 
4394     /*
4395      * Parse the content of the element:
4396      */
4397     currentNode = xmlStrdup(ctxt->name);
4398     depth = ctxt->nameNr;
4399     while (IS_CHAR_CH(CUR)) {
4400 	oldptr = ctxt->input->cur;
4401 	htmlParseContent(ctxt);
4402 	if (oldptr==ctxt->input->cur) break;
4403 	if (ctxt->nameNr < depth) break;
4404     }
4405 
4406     /*
4407      * Capture end position and add node
4408      */
4409     if ( currentNode != NULL && ctxt->record_info ) {
4410        node_info.end_pos = ctxt->input->consumed +
4411                           (CUR_PTR - ctxt->input->base);
4412        node_info.end_line = ctxt->input->line;
4413        node_info.node = ctxt->node;
4414        xmlParserAddNodeInfo(ctxt, &node_info);
4415     }
4416     if (!IS_CHAR_CH(CUR)) {
4417 	htmlAutoCloseOnEnd(ctxt);
4418     }
4419 
4420     if (currentNode != NULL)
4421 	xmlFree(currentNode);
4422 }
4423 
4424 static void
4425 htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4426     /*
4427      * Capture end position and add node
4428      */
4429     if ( ctxt->node != NULL && ctxt->record_info ) {
4430        ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4431                                 (CUR_PTR - ctxt->input->base);
4432        ctxt->nodeInfo->end_line = ctxt->input->line;
4433        ctxt->nodeInfo->node = ctxt->node;
4434        xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4435        htmlNodeInfoPop(ctxt);
4436     }
4437     if (!IS_CHAR_CH(CUR)) {
4438        htmlAutoCloseOnEnd(ctxt);
4439     }
4440 }
4441 
4442 /**
4443  * htmlParseElementInternal:
4444  * @ctxt:  an HTML parser context
4445  *
4446  * parse an HTML element, new version, non recursive
4447  *
4448  * [39] element ::= EmptyElemTag | STag content ETag
4449  *
4450  * [41] Attribute ::= Name Eq AttValue
4451  */
4452 
4453 static void
4454 htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4455     const xmlChar *name;
4456     const htmlElemDesc * info;
4457     htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
4458     int failed;
4459 
4460     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4461 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4462 		     "htmlParseElementInternal: context error\n", NULL, NULL);
4463 	return;
4464     }
4465 
4466     if (ctxt->instate == XML_PARSER_EOF)
4467         return;
4468 
4469     /* Capture start position */
4470     if (ctxt->record_info) {
4471         node_info.begin_pos = ctxt->input->consumed +
4472                           (CUR_PTR - ctxt->input->base);
4473 	node_info.begin_line = ctxt->input->line;
4474     }
4475 
4476     failed = htmlParseStartTag(ctxt);
4477     name = ctxt->name;
4478     if ((failed == -1) || (name == NULL)) {
4479 	if (CUR == '>')
4480 	    NEXT;
4481         return;
4482     }
4483 
4484     /*
4485      * Lookup the info for that element.
4486      */
4487     info = htmlTagLookup(name);
4488     if (info == NULL) {
4489 	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4490 	             "Tag %s invalid\n", name, NULL);
4491     }
4492 
4493     /*
4494      * Check for an Empty Element labeled the XML/SGML way
4495      */
4496     if ((CUR == '/') && (NXT(1) == '>')) {
4497         SKIP(2);
4498 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4499 	    ctxt->sax->endElement(ctxt->userData, name);
4500 	htmlnamePop(ctxt);
4501 	return;
4502     }
4503 
4504     if (CUR == '>') {
4505         NEXT;
4506     } else {
4507 	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4508 	             "Couldn't find end of Start Tag %s\n", name, NULL);
4509 
4510 	/*
4511 	 * end of parsing of this node.
4512 	 */
4513 	if (xmlStrEqual(name, ctxt->name)) {
4514 	    nodePop(ctxt);
4515 	    htmlnamePop(ctxt);
4516 	}
4517 
4518         if (ctxt->record_info)
4519             htmlNodeInfoPush(ctxt, &node_info);
4520         htmlParserFinishElementParsing(ctxt);
4521 	return;
4522     }
4523 
4524     /*
4525      * Check for an Empty Element from DTD definition
4526      */
4527     if ((info != NULL) && (info->empty)) {
4528 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4529 	    ctxt->sax->endElement(ctxt->userData, name);
4530 	htmlnamePop(ctxt);
4531 	return;
4532     }
4533 
4534     if (ctxt->record_info)
4535         htmlNodeInfoPush(ctxt, &node_info);
4536 }
4537 
4538 /**
4539  * htmlParseContentInternal:
4540  * @ctxt:  an HTML parser context
4541  *
4542  * Parse a content: comment, sub-element, reference or text.
4543  * New version for non recursive htmlParseElementInternal
4544  */
4545 
4546 static void
4547 htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4548     xmlChar *currentNode;
4549     int depth;
4550     const xmlChar *name;
4551 
4552     currentNode = xmlStrdup(ctxt->name);
4553     depth = ctxt->nameNr;
4554     while (1) {
4555 	long cons = ctxt->nbChars;
4556 
4557         GROW;
4558 
4559         if (ctxt->instate == XML_PARSER_EOF)
4560             break;
4561 
4562 	/*
4563 	 * Our tag or one of it's parent or children is ending.
4564 	 */
4565         if ((CUR == '<') && (NXT(1) == '/')) {
4566 	    if (htmlParseEndTag(ctxt) &&
4567 		((currentNode != NULL) || (ctxt->nameNr == 0))) {
4568 		if (currentNode != NULL)
4569 		    xmlFree(currentNode);
4570 
4571 	        currentNode = xmlStrdup(ctxt->name);
4572 	        depth = ctxt->nameNr;
4573 	    }
4574 	    continue; /* while */
4575         }
4576 
4577 	else if ((CUR == '<') &&
4578 	         ((IS_ASCII_LETTER(NXT(1))) ||
4579 		  (NXT(1) == '_') || (NXT(1) == ':'))) {
4580 	    name = htmlParseHTMLName_nonInvasive(ctxt);
4581 	    if (name == NULL) {
4582 	        htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4583 			 "htmlParseStartTag: invalid element name\n",
4584 			 NULL, NULL);
4585 	        /* Dump the bogus tag like browsers do */
4586 	        while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
4587 	            NEXT;
4588 
4589 	        htmlParserFinishElementParsing(ctxt);
4590 	        if (currentNode != NULL)
4591 	            xmlFree(currentNode);
4592 
4593 	        currentNode = xmlStrdup(ctxt->name);
4594 	        depth = ctxt->nameNr;
4595 	        continue;
4596 	    }
4597 
4598 	    if (ctxt->name != NULL) {
4599 	        if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4600 	            htmlAutoClose(ctxt, name);
4601 	            continue;
4602 	        }
4603 	    }
4604 	}
4605 
4606 	/*
4607 	 * Has this node been popped out during parsing of
4608 	 * the next element
4609 	 */
4610         if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4611 	    (!xmlStrEqual(currentNode, ctxt->name)))
4612 	     {
4613 	    htmlParserFinishElementParsing(ctxt);
4614 	    if (currentNode != NULL) xmlFree(currentNode);
4615 
4616 	    currentNode = xmlStrdup(ctxt->name);
4617 	    depth = ctxt->nameNr;
4618 	    continue;
4619 	}
4620 
4621 	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4622 	    (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4623 	    /*
4624 	     * Handle SCRIPT/STYLE separately
4625 	     */
4626 	    htmlParseScript(ctxt);
4627 	} else {
4628 	    /*
4629 	     * Sometimes DOCTYPE arrives in the middle of the document
4630 	     */
4631 	    if ((CUR == '<') && (NXT(1) == '!') &&
4632 		(UPP(2) == 'D') && (UPP(3) == 'O') &&
4633 		(UPP(4) == 'C') && (UPP(5) == 'T') &&
4634 		(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4635 		(UPP(8) == 'E')) {
4636 		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4637 		             "Misplaced DOCTYPE declaration\n",
4638 			     BAD_CAST "DOCTYPE" , NULL);
4639 		htmlParseDocTypeDecl(ctxt);
4640 	    }
4641 
4642 	    /*
4643 	     * First case :  a comment
4644 	     */
4645 	    if ((CUR == '<') && (NXT(1) == '!') &&
4646 		(NXT(2) == '-') && (NXT(3) == '-')) {
4647 		htmlParseComment(ctxt);
4648 	    }
4649 
4650 	    /*
4651 	     * Second case : a Processing Instruction.
4652 	     */
4653 	    else if ((CUR == '<') && (NXT(1) == '?')) {
4654 		htmlParsePI(ctxt);
4655 	    }
4656 
4657 	    /*
4658 	     * Third case :  a sub-element.
4659 	     */
4660 	    else if (CUR == '<') {
4661 		htmlParseElementInternal(ctxt);
4662 		if (currentNode != NULL) xmlFree(currentNode);
4663 
4664 		currentNode = xmlStrdup(ctxt->name);
4665 		depth = ctxt->nameNr;
4666 	    }
4667 
4668 	    /*
4669 	     * Fourth case : a reference. If if has not been resolved,
4670 	     *    parsing returns it's Name, create the node
4671 	     */
4672 	    else if (CUR == '&') {
4673 		htmlParseReference(ctxt);
4674 	    }
4675 
4676 	    /*
4677 	     * Fifth case : end of the resource
4678 	     */
4679 	    else if (CUR == 0) {
4680 		htmlAutoCloseOnEnd(ctxt);
4681 		break;
4682 	    }
4683 
4684 	    /*
4685 	     * Last case, text. Note that References are handled directly.
4686 	     */
4687 	    else {
4688 		htmlParseCharData(ctxt);
4689 	    }
4690 
4691 	    if (cons == ctxt->nbChars) {
4692 		if (ctxt->node != NULL) {
4693 		    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4694 		                 "detected an error in element content\n",
4695 				 NULL, NULL);
4696 		}
4697 		break;
4698 	    }
4699 	}
4700         GROW;
4701     }
4702     if (currentNode != NULL) xmlFree(currentNode);
4703 }
4704 
4705 /**
4706  * htmlParseContent:
4707  * @ctxt:  an HTML parser context
4708  *
4709  * Parse a content: comment, sub-element, reference or text.
4710  * This is the entry point when called from parser.c
4711  */
4712 
4713 void
4714 __htmlParseContent(void *ctxt) {
4715     if (ctxt != NULL)
4716 	htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4717 }
4718 
4719 /**
4720  * htmlParseDocument:
4721  * @ctxt:  an HTML parser context
4722  *
4723  * parse an HTML document (and build a tree if using the standard SAX
4724  * interface).
4725  *
4726  * Returns 0, -1 in case of error. the parser context is augmented
4727  *                as a result of the parsing.
4728  */
4729 
4730 int
4731 htmlParseDocument(htmlParserCtxtPtr ctxt) {
4732     xmlChar start[4];
4733     xmlCharEncoding enc;
4734     xmlDtdPtr dtd;
4735 
4736     xmlInitParser();
4737 
4738     htmlDefaultSAXHandlerInit();
4739 
4740     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4741 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4742 		     "htmlParseDocument: context error\n", NULL, NULL);
4743 	return(XML_ERR_INTERNAL_ERROR);
4744     }
4745     ctxt->html = 1;
4746     ctxt->linenumbers = 1;
4747     GROW;
4748     /*
4749      * SAX: beginning of the document processing.
4750      */
4751     if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4752         ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4753 
4754     if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4755         ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4756 	/*
4757 	 * Get the 4 first bytes and decode the charset
4758 	 * if enc != XML_CHAR_ENCODING_NONE
4759 	 * plug some encoding conversion routines.
4760 	 */
4761 	start[0] = RAW;
4762 	start[1] = NXT(1);
4763 	start[2] = NXT(2);
4764 	start[3] = NXT(3);
4765 	enc = xmlDetectCharEncoding(&start[0], 4);
4766 	if (enc != XML_CHAR_ENCODING_NONE) {
4767 	    xmlSwitchEncoding(ctxt, enc);
4768 	}
4769     }
4770 
4771     /*
4772      * Wipe out everything which is before the first '<'
4773      */
4774     SKIP_BLANKS;
4775     if (CUR == 0) {
4776 	htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4777 	             "Document is empty\n", NULL, NULL);
4778     }
4779 
4780     if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4781 	ctxt->sax->startDocument(ctxt->userData);
4782 
4783 
4784     /*
4785      * Parse possible comments and PIs before any content
4786      */
4787     while (((CUR == '<') && (NXT(1) == '!') &&
4788             (NXT(2) == '-') && (NXT(3) == '-')) ||
4789 	   ((CUR == '<') && (NXT(1) == '?'))) {
4790         htmlParseComment(ctxt);
4791         htmlParsePI(ctxt);
4792 	SKIP_BLANKS;
4793     }
4794 
4795 
4796     /*
4797      * Then possibly doc type declaration(s) and more Misc
4798      * (doctypedecl Misc*)?
4799      */
4800     if ((CUR == '<') && (NXT(1) == '!') &&
4801 	(UPP(2) == 'D') && (UPP(3) == 'O') &&
4802 	(UPP(4) == 'C') && (UPP(5) == 'T') &&
4803 	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4804 	(UPP(8) == 'E')) {
4805 	htmlParseDocTypeDecl(ctxt);
4806     }
4807     SKIP_BLANKS;
4808 
4809     /*
4810      * Parse possible comments and PIs before any content
4811      */
4812     while (((CUR == '<') && (NXT(1) == '!') &&
4813             (NXT(2) == '-') && (NXT(3) == '-')) ||
4814 	   ((CUR == '<') && (NXT(1) == '?'))) {
4815         htmlParseComment(ctxt);
4816         htmlParsePI(ctxt);
4817 	SKIP_BLANKS;
4818     }
4819 
4820     /*
4821      * Time to start parsing the tree itself
4822      */
4823     htmlParseContentInternal(ctxt);
4824 
4825     /*
4826      * autoclose
4827      */
4828     if (CUR == 0)
4829 	htmlAutoCloseOnEnd(ctxt);
4830 
4831 
4832     /*
4833      * SAX: end of the document processing.
4834      */
4835     if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4836         ctxt->sax->endDocument(ctxt->userData);
4837 
4838     if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
4839 	dtd = xmlGetIntSubset(ctxt->myDoc);
4840 	if (dtd == NULL)
4841 	    ctxt->myDoc->intSubset =
4842 		xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4843 		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4844 		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4845     }
4846     if (! ctxt->wellFormed) return(-1);
4847     return(0);
4848 }
4849 
4850 
4851 /************************************************************************
4852  *									*
4853  *			Parser contexts handling			*
4854  *									*
4855  ************************************************************************/
4856 
4857 /**
4858  * htmlInitParserCtxt:
4859  * @ctxt:  an HTML parser context
4860  *
4861  * Initialize a parser context
4862  *
4863  * Returns 0 in case of success and -1 in case of error
4864  */
4865 
4866 static int
4867 htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4868 {
4869     htmlSAXHandler *sax;
4870 
4871     if (ctxt == NULL) return(-1);
4872     memset(ctxt, 0, sizeof(htmlParserCtxt));
4873 
4874     ctxt->dict = xmlDictCreate();
4875     if (ctxt->dict == NULL) {
4876         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4877 	return(-1);
4878     }
4879     sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4880     if (sax == NULL) {
4881         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4882 	return(-1);
4883     }
4884     else
4885         memset(sax, 0, sizeof(htmlSAXHandler));
4886 
4887     /* Allocate the Input stack */
4888     ctxt->inputTab = (htmlParserInputPtr *)
4889                       xmlMalloc(5 * sizeof(htmlParserInputPtr));
4890     if (ctxt->inputTab == NULL) {
4891         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4892 	ctxt->inputNr = 0;
4893 	ctxt->inputMax = 0;
4894 	ctxt->input = NULL;
4895 	return(-1);
4896     }
4897     ctxt->inputNr = 0;
4898     ctxt->inputMax = 5;
4899     ctxt->input = NULL;
4900     ctxt->version = NULL;
4901     ctxt->encoding = NULL;
4902     ctxt->standalone = -1;
4903     ctxt->instate = XML_PARSER_START;
4904 
4905     /* Allocate the Node stack */
4906     ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4907     if (ctxt->nodeTab == NULL) {
4908         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4909 	ctxt->nodeNr = 0;
4910 	ctxt->nodeMax = 0;
4911 	ctxt->node = NULL;
4912 	ctxt->inputNr = 0;
4913 	ctxt->inputMax = 0;
4914 	ctxt->input = NULL;
4915 	return(-1);
4916     }
4917     ctxt->nodeNr = 0;
4918     ctxt->nodeMax = 10;
4919     ctxt->node = NULL;
4920 
4921     /* Allocate the Name stack */
4922     ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
4923     if (ctxt->nameTab == NULL) {
4924         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4925 	ctxt->nameNr = 0;
4926 	ctxt->nameMax = 0;
4927 	ctxt->name = NULL;
4928 	ctxt->nodeNr = 0;
4929 	ctxt->nodeMax = 0;
4930 	ctxt->node = NULL;
4931 	ctxt->inputNr = 0;
4932 	ctxt->inputMax = 0;
4933 	ctxt->input = NULL;
4934 	return(-1);
4935     }
4936     ctxt->nameNr = 0;
4937     ctxt->nameMax = 10;
4938     ctxt->name = NULL;
4939 
4940     ctxt->nodeInfoTab = NULL;
4941     ctxt->nodeInfoNr  = 0;
4942     ctxt->nodeInfoMax = 0;
4943 
4944     if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
4945     else {
4946         ctxt->sax = sax;
4947 	memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
4948     }
4949     ctxt->userData = ctxt;
4950     ctxt->myDoc = NULL;
4951     ctxt->wellFormed = 1;
4952     ctxt->replaceEntities = 0;
4953     ctxt->linenumbers = xmlLineNumbersDefaultValue;
4954     ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
4955     ctxt->html = 1;
4956     ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
4957     ctxt->vctxt.userData = ctxt;
4958     ctxt->vctxt.error = xmlParserValidityError;
4959     ctxt->vctxt.warning = xmlParserValidityWarning;
4960     ctxt->record_info = 0;
4961     ctxt->validate = 0;
4962     ctxt->nbChars = 0;
4963     ctxt->checkIndex = 0;
4964     ctxt->catalogs = NULL;
4965     xmlInitNodeInfoSeq(&ctxt->node_seq);
4966     return(0);
4967 }
4968 
4969 /**
4970  * htmlFreeParserCtxt:
4971  * @ctxt:  an HTML parser context
4972  *
4973  * Free all the memory used by a parser context. However the parsed
4974  * document in ctxt->myDoc is not freed.
4975  */
4976 
4977 void
4978 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4979 {
4980     xmlFreeParserCtxt(ctxt);
4981 }
4982 
4983 /**
4984  * htmlNewParserCtxt:
4985  *
4986  * Allocate and initialize a new parser context.
4987  *
4988  * Returns the htmlParserCtxtPtr or NULL in case of allocation error
4989  */
4990 
4991 htmlParserCtxtPtr
4992 htmlNewParserCtxt(void)
4993 {
4994     xmlParserCtxtPtr ctxt;
4995 
4996     ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4997     if (ctxt == NULL) {
4998         htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
4999 	return(NULL);
5000     }
5001     memset(ctxt, 0, sizeof(xmlParserCtxt));
5002     if (htmlInitParserCtxt(ctxt) < 0) {
5003         htmlFreeParserCtxt(ctxt);
5004 	return(NULL);
5005     }
5006     return(ctxt);
5007 }
5008 
5009 /**
5010  * htmlCreateMemoryParserCtxt:
5011  * @buffer:  a pointer to a char array
5012  * @size:  the size of the array
5013  *
5014  * Create a parser context for an HTML in-memory document.
5015  *
5016  * Returns the new parser context or NULL
5017  */
5018 htmlParserCtxtPtr
5019 htmlCreateMemoryParserCtxt(const char *buffer, int size) {
5020     xmlParserCtxtPtr ctxt;
5021     xmlParserInputPtr input;
5022     xmlParserInputBufferPtr buf;
5023 
5024     if (buffer == NULL)
5025 	return(NULL);
5026     if (size <= 0)
5027 	return(NULL);
5028 
5029     ctxt = htmlNewParserCtxt();
5030     if (ctxt == NULL)
5031 	return(NULL);
5032 
5033     buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
5034     if (buf == NULL) return(NULL);
5035 
5036     input = xmlNewInputStream(ctxt);
5037     if (input == NULL) {
5038 	xmlFreeParserCtxt(ctxt);
5039 	return(NULL);
5040     }
5041 
5042     input->filename = NULL;
5043     input->buf = buf;
5044     xmlBufResetInput(buf->buffer, input);
5045 
5046     inputPush(ctxt, input);
5047     return(ctxt);
5048 }
5049 
5050 /**
5051  * htmlCreateDocParserCtxt:
5052  * @cur:  a pointer to an array of xmlChar
5053  * @encoding:  a free form C string describing the HTML document encoding, or NULL
5054  *
5055  * Create a parser context for an HTML document.
5056  *
5057  * TODO: check the need to add encoding handling there
5058  *
5059  * Returns the new parser context or NULL
5060  */
5061 static htmlParserCtxtPtr
5062 htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
5063     int len;
5064     htmlParserCtxtPtr ctxt;
5065 
5066     if (cur == NULL)
5067 	return(NULL);
5068     len = xmlStrlen(cur);
5069     ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
5070     if (ctxt == NULL)
5071 	return(NULL);
5072 
5073     if (encoding != NULL) {
5074 	xmlCharEncoding enc;
5075 	xmlCharEncodingHandlerPtr handler;
5076 
5077 	if (ctxt->input->encoding != NULL)
5078 	    xmlFree((xmlChar *) ctxt->input->encoding);
5079 	ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
5080 
5081 	enc = xmlParseCharEncoding(encoding);
5082 	/*
5083 	 * registered set of known encodings
5084 	 */
5085 	if (enc != XML_CHAR_ENCODING_ERROR) {
5086 	    xmlSwitchEncoding(ctxt, enc);
5087 	    if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
5088 		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5089 		             "Unsupported encoding %s\n",
5090 			     (const xmlChar *) encoding, NULL);
5091 	    }
5092 	} else {
5093 	    /*
5094 	     * fallback for unknown encodings
5095 	     */
5096 	    handler = xmlFindCharEncodingHandler((const char *) encoding);
5097 	    if (handler != NULL) {
5098 		xmlSwitchToEncoding(ctxt, handler);
5099 	    } else {
5100 		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5101 		             "Unsupported encoding %s\n",
5102 			     (const xmlChar *) encoding, NULL);
5103 	    }
5104 	}
5105     }
5106     return(ctxt);
5107 }
5108 
5109 #ifdef LIBXML_PUSH_ENABLED
5110 /************************************************************************
5111  *									*
5112  *	Progressive parsing interfaces				*
5113  *									*
5114  ************************************************************************/
5115 
5116 /**
5117  * htmlParseLookupSequence:
5118  * @ctxt:  an HTML parser context
5119  * @first:  the first char to lookup
5120  * @next:  the next char to lookup or zero
5121  * @third:  the next char to lookup or zero
5122  * @comment: flag to force checking inside comments
5123  *
5124  * Try to find if a sequence (first, next, third) or  just (first next) or
5125  * (first) is available in the input stream.
5126  * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5127  * to avoid rescanning sequences of bytes, it DOES change the state of the
5128  * parser, do not use liberally.
5129  * This is basically similar to xmlParseLookupSequence()
5130  *
5131  * Returns the index to the current parsing point if the full sequence
5132  *      is available, -1 otherwise.
5133  */
5134 static int
5135 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
5136                         xmlChar next, xmlChar third, int iscomment,
5137                         int ignoreattrval)
5138 {
5139     int base, len;
5140     htmlParserInputPtr in;
5141     const xmlChar *buf;
5142     int incomment = 0;
5143     int invalue = 0;
5144     char valdellim = 0x0;
5145 
5146     in = ctxt->input;
5147     if (in == NULL)
5148         return (-1);
5149 
5150     base = in->cur - in->base;
5151     if (base < 0)
5152         return (-1);
5153 
5154     if (ctxt->checkIndex > base)
5155         base = ctxt->checkIndex;
5156 
5157     if (in->buf == NULL) {
5158         buf = in->base;
5159         len = in->length;
5160     } else {
5161         buf = xmlBufContent(in->buf->buffer);
5162         len = xmlBufUse(in->buf->buffer);
5163     }
5164 
5165     /* take into account the sequence length */
5166     if (third)
5167         len -= 2;
5168     else if (next)
5169         len--;
5170     for (; base < len; base++) {
5171         if ((!incomment) && (base + 4 < len) && (!iscomment)) {
5172             if ((buf[base] == '<') && (buf[base + 1] == '!') &&
5173                 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
5174                 incomment = 1;
5175                 /* do not increment past <! - some people use <!--> */
5176                 base += 2;
5177             }
5178         }
5179         if (ignoreattrval) {
5180             if (buf[base] == '"' || buf[base] == '\'') {
5181                 if (invalue) {
5182                     if (buf[base] == valdellim) {
5183                         invalue = 0;
5184                         continue;
5185                     }
5186                 } else {
5187                     valdellim = buf[base];
5188                     invalue = 1;
5189                     continue;
5190                 }
5191             } else if (invalue) {
5192                 continue;
5193             }
5194         }
5195         if (incomment) {
5196             if (base + 3 > len)
5197                 return (-1);
5198             if ((buf[base] == '-') && (buf[base + 1] == '-') &&
5199                 (buf[base + 2] == '>')) {
5200                 incomment = 0;
5201                 base += 2;
5202             }
5203             continue;
5204         }
5205         if (buf[base] == first) {
5206             if (third != 0) {
5207                 if ((buf[base + 1] != next) || (buf[base + 2] != third))
5208                     continue;
5209             } else if (next != 0) {
5210                 if (buf[base + 1] != next)
5211                     continue;
5212             }
5213             ctxt->checkIndex = 0;
5214 #ifdef DEBUG_PUSH
5215             if (next == 0)
5216                 xmlGenericError(xmlGenericErrorContext,
5217                                 "HPP: lookup '%c' found at %d\n",
5218                                 first, base);
5219             else if (third == 0)
5220                 xmlGenericError(xmlGenericErrorContext,
5221                                 "HPP: lookup '%c%c' found at %d\n",
5222                                 first, next, base);
5223             else
5224                 xmlGenericError(xmlGenericErrorContext,
5225                                 "HPP: lookup '%c%c%c' found at %d\n",
5226                                 first, next, third, base);
5227 #endif
5228             return (base - (in->cur - in->base));
5229         }
5230     }
5231     if ((!incomment) && (!invalue))
5232         ctxt->checkIndex = base;
5233 #ifdef DEBUG_PUSH
5234     if (next == 0)
5235         xmlGenericError(xmlGenericErrorContext,
5236                         "HPP: lookup '%c' failed\n", first);
5237     else if (third == 0)
5238         xmlGenericError(xmlGenericErrorContext,
5239                         "HPP: lookup '%c%c' failed\n", first, next);
5240     else
5241         xmlGenericError(xmlGenericErrorContext,
5242                         "HPP: lookup '%c%c%c' failed\n", first, next,
5243                         third);
5244 #endif
5245     return (-1);
5246 }
5247 
5248 /**
5249  * htmlParseLookupChars:
5250  * @ctxt: an HTML parser context
5251  * @stop: Array of chars, which stop the lookup.
5252  * @stopLen: Length of stop-Array
5253  *
5254  * Try to find if any char of the stop-Array is available in the input
5255  * stream.
5256  * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5257  * to avoid rescanning sequences of bytes, it DOES change the state of the
5258  * parser, do not use liberally.
5259  *
5260  * Returns the index to the current parsing point if a stopChar
5261  *      is available, -1 otherwise.
5262  */
5263 static int
5264 htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
5265                      int stopLen)
5266 {
5267     int base, len;
5268     htmlParserInputPtr in;
5269     const xmlChar *buf;
5270     int incomment = 0;
5271     int i;
5272 
5273     in = ctxt->input;
5274     if (in == NULL)
5275         return (-1);
5276 
5277     base = in->cur - in->base;
5278     if (base < 0)
5279         return (-1);
5280 
5281     if (ctxt->checkIndex > base)
5282         base = ctxt->checkIndex;
5283 
5284     if (in->buf == NULL) {
5285         buf = in->base;
5286         len = in->length;
5287     } else {
5288         buf = xmlBufContent(in->buf->buffer);
5289         len = xmlBufUse(in->buf->buffer);
5290     }
5291 
5292     for (; base < len; base++) {
5293         if (!incomment && (base + 4 < len)) {
5294             if ((buf[base] == '<') && (buf[base + 1] == '!') &&
5295                 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
5296                 incomment = 1;
5297                 /* do not increment past <! - some people use <!--> */
5298                 base += 2;
5299             }
5300         }
5301         if (incomment) {
5302             if (base + 3 > len)
5303                 return (-1);
5304             if ((buf[base] == '-') && (buf[base + 1] == '-') &&
5305                 (buf[base + 2] == '>')) {
5306                 incomment = 0;
5307                 base += 2;
5308             }
5309             continue;
5310         }
5311         for (i = 0; i < stopLen; ++i) {
5312             if (buf[base] == stop[i]) {
5313                 ctxt->checkIndex = 0;
5314                 return (base - (in->cur - in->base));
5315             }
5316         }
5317     }
5318     ctxt->checkIndex = base;
5319     return (-1);
5320 }
5321 
5322 /**
5323  * htmlParseTryOrFinish:
5324  * @ctxt:  an HTML parser context
5325  * @terminate:  last chunk indicator
5326  *
5327  * Try to progress on parsing
5328  *
5329  * Returns zero if no parsing was possible
5330  */
5331 static int
5332 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5333     int ret = 0;
5334     htmlParserInputPtr in;
5335     int avail = 0;
5336     xmlChar cur, next;
5337 
5338     htmlParserNodeInfo node_info;
5339 
5340 #ifdef DEBUG_PUSH
5341     switch (ctxt->instate) {
5342 	case XML_PARSER_EOF:
5343 	    xmlGenericError(xmlGenericErrorContext,
5344 		    "HPP: try EOF\n"); break;
5345 	case XML_PARSER_START:
5346 	    xmlGenericError(xmlGenericErrorContext,
5347 		    "HPP: try START\n"); break;
5348 	case XML_PARSER_MISC:
5349 	    xmlGenericError(xmlGenericErrorContext,
5350 		    "HPP: try MISC\n");break;
5351 	case XML_PARSER_COMMENT:
5352 	    xmlGenericError(xmlGenericErrorContext,
5353 		    "HPP: try COMMENT\n");break;
5354 	case XML_PARSER_PROLOG:
5355 	    xmlGenericError(xmlGenericErrorContext,
5356 		    "HPP: try PROLOG\n");break;
5357 	case XML_PARSER_START_TAG:
5358 	    xmlGenericError(xmlGenericErrorContext,
5359 		    "HPP: try START_TAG\n");break;
5360 	case XML_PARSER_CONTENT:
5361 	    xmlGenericError(xmlGenericErrorContext,
5362 		    "HPP: try CONTENT\n");break;
5363 	case XML_PARSER_CDATA_SECTION:
5364 	    xmlGenericError(xmlGenericErrorContext,
5365 		    "HPP: try CDATA_SECTION\n");break;
5366 	case XML_PARSER_END_TAG:
5367 	    xmlGenericError(xmlGenericErrorContext,
5368 		    "HPP: try END_TAG\n");break;
5369 	case XML_PARSER_ENTITY_DECL:
5370 	    xmlGenericError(xmlGenericErrorContext,
5371 		    "HPP: try ENTITY_DECL\n");break;
5372 	case XML_PARSER_ENTITY_VALUE:
5373 	    xmlGenericError(xmlGenericErrorContext,
5374 		    "HPP: try ENTITY_VALUE\n");break;
5375 	case XML_PARSER_ATTRIBUTE_VALUE:
5376 	    xmlGenericError(xmlGenericErrorContext,
5377 		    "HPP: try ATTRIBUTE_VALUE\n");break;
5378 	case XML_PARSER_DTD:
5379 	    xmlGenericError(xmlGenericErrorContext,
5380 		    "HPP: try DTD\n");break;
5381 	case XML_PARSER_EPILOG:
5382 	    xmlGenericError(xmlGenericErrorContext,
5383 		    "HPP: try EPILOG\n");break;
5384 	case XML_PARSER_PI:
5385 	    xmlGenericError(xmlGenericErrorContext,
5386 		    "HPP: try PI\n");break;
5387 	case XML_PARSER_SYSTEM_LITERAL:
5388 	    xmlGenericError(xmlGenericErrorContext,
5389 		    "HPP: try SYSTEM_LITERAL\n");break;
5390     }
5391 #endif
5392 
5393     while (1) {
5394 
5395 	in = ctxt->input;
5396 	if (in == NULL) break;
5397 	if (in->buf == NULL)
5398 	    avail = in->length - (in->cur - in->base);
5399 	else
5400 	    avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
5401 	if ((avail == 0) && (terminate)) {
5402 	    htmlAutoCloseOnEnd(ctxt);
5403 	    if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5404 		/*
5405 		 * SAX: end of the document processing.
5406 		 */
5407 		ctxt->instate = XML_PARSER_EOF;
5408 		if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5409 		    ctxt->sax->endDocument(ctxt->userData);
5410 	    }
5411 	}
5412         if (avail < 1)
5413 	    goto done;
5414 	cur = in->cur[0];
5415 	if (cur == 0) {
5416 	    SKIP(1);
5417 	    continue;
5418 	}
5419 
5420         switch (ctxt->instate) {
5421             case XML_PARSER_EOF:
5422 	        /*
5423 		 * Document parsing is done !
5424 		 */
5425 	        goto done;
5426             case XML_PARSER_START:
5427 	        /*
5428 		 * Very first chars read from the document flow.
5429 		 */
5430 		cur = in->cur[0];
5431 		if (IS_BLANK_CH(cur)) {
5432 		    SKIP_BLANKS;
5433 		    if (in->buf == NULL)
5434 			avail = in->length - (in->cur - in->base);
5435 		    else
5436 			avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
5437 		}
5438 		if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5439 		    ctxt->sax->setDocumentLocator(ctxt->userData,
5440 						  &xmlDefaultSAXLocator);
5441 		if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5442 	            (!ctxt->disableSAX))
5443 		    ctxt->sax->startDocument(ctxt->userData);
5444 
5445 		cur = in->cur[0];
5446 		next = in->cur[1];
5447 		if ((cur == '<') && (next == '!') &&
5448 		    (UPP(2) == 'D') && (UPP(3) == 'O') &&
5449 		    (UPP(4) == 'C') && (UPP(5) == 'T') &&
5450 		    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5451 		    (UPP(8) == 'E')) {
5452 		    if ((!terminate) &&
5453 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5454 			goto done;
5455 #ifdef DEBUG_PUSH
5456 		    xmlGenericError(xmlGenericErrorContext,
5457 			    "HPP: Parsing internal subset\n");
5458 #endif
5459 		    htmlParseDocTypeDecl(ctxt);
5460 		    ctxt->instate = XML_PARSER_PROLOG;
5461 #ifdef DEBUG_PUSH
5462 		    xmlGenericError(xmlGenericErrorContext,
5463 			    "HPP: entering PROLOG\n");
5464 #endif
5465                 } else {
5466 		    ctxt->instate = XML_PARSER_MISC;
5467 #ifdef DEBUG_PUSH
5468 		    xmlGenericError(xmlGenericErrorContext,
5469 			    "HPP: entering MISC\n");
5470 #endif
5471 		}
5472 		break;
5473             case XML_PARSER_MISC:
5474 		SKIP_BLANKS;
5475 		if (in->buf == NULL)
5476 		    avail = in->length - (in->cur - in->base);
5477 		else
5478 		    avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
5479 		/*
5480 		 * no chars in buffer
5481 		 */
5482 		if (avail < 1)
5483 		    goto done;
5484 		/*
5485 		 * not enouth chars in buffer
5486 		 */
5487 		if (avail < 2) {
5488 		    if (!terminate)
5489 			goto done;
5490 		    else
5491 			next = ' ';
5492 		} else {
5493 		    next = in->cur[1];
5494 		}
5495 		cur = in->cur[0];
5496 	        if ((cur == '<') && (next == '!') &&
5497 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
5498 		    if ((!terminate) &&
5499 		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
5500 			goto done;
5501 #ifdef DEBUG_PUSH
5502 		    xmlGenericError(xmlGenericErrorContext,
5503 			    "HPP: Parsing Comment\n");
5504 #endif
5505 		    htmlParseComment(ctxt);
5506 		    ctxt->instate = XML_PARSER_MISC;
5507 	        } else if ((cur == '<') && (next == '?')) {
5508 		    if ((!terminate) &&
5509 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5510 			goto done;
5511 #ifdef DEBUG_PUSH
5512 		    xmlGenericError(xmlGenericErrorContext,
5513 			    "HPP: Parsing PI\n");
5514 #endif
5515 		    htmlParsePI(ctxt);
5516 		    ctxt->instate = XML_PARSER_MISC;
5517 		} else if ((cur == '<') && (next == '!') &&
5518 		    (UPP(2) == 'D') && (UPP(3) == 'O') &&
5519 		    (UPP(4) == 'C') && (UPP(5) == 'T') &&
5520 		    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5521 		    (UPP(8) == 'E')) {
5522 		    if ((!terminate) &&
5523 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5524 			goto done;
5525 #ifdef DEBUG_PUSH
5526 		    xmlGenericError(xmlGenericErrorContext,
5527 			    "HPP: Parsing internal subset\n");
5528 #endif
5529 		    htmlParseDocTypeDecl(ctxt);
5530 		    ctxt->instate = XML_PARSER_PROLOG;
5531 #ifdef DEBUG_PUSH
5532 		    xmlGenericError(xmlGenericErrorContext,
5533 			    "HPP: entering PROLOG\n");
5534 #endif
5535 		} else if ((cur == '<') && (next == '!') &&
5536 		           (avail < 9)) {
5537 		    goto done;
5538 		} else {
5539 		    ctxt->instate = XML_PARSER_START_TAG;
5540 #ifdef DEBUG_PUSH
5541 		    xmlGenericError(xmlGenericErrorContext,
5542 			    "HPP: entering START_TAG\n");
5543 #endif
5544 		}
5545 		break;
5546             case XML_PARSER_PROLOG:
5547 		SKIP_BLANKS;
5548 		if (in->buf == NULL)
5549 		    avail = in->length - (in->cur - in->base);
5550 		else
5551 		    avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
5552 		if (avail < 2)
5553 		    goto done;
5554 		cur = in->cur[0];
5555 		next = in->cur[1];
5556 		if ((cur == '<') && (next == '!') &&
5557 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
5558 		    if ((!terminate) &&
5559 		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
5560 			goto done;
5561 #ifdef DEBUG_PUSH
5562 		    xmlGenericError(xmlGenericErrorContext,
5563 			    "HPP: Parsing Comment\n");
5564 #endif
5565 		    htmlParseComment(ctxt);
5566 		    ctxt->instate = XML_PARSER_PROLOG;
5567 	        } else if ((cur == '<') && (next == '?')) {
5568 		    if ((!terminate) &&
5569 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5570 			goto done;
5571 #ifdef DEBUG_PUSH
5572 		    xmlGenericError(xmlGenericErrorContext,
5573 			    "HPP: Parsing PI\n");
5574 #endif
5575 		    htmlParsePI(ctxt);
5576 		    ctxt->instate = XML_PARSER_PROLOG;
5577 		} else if ((cur == '<') && (next == '!') &&
5578 		           (avail < 4)) {
5579 		    goto done;
5580 		} else {
5581 		    ctxt->instate = XML_PARSER_START_TAG;
5582 #ifdef DEBUG_PUSH
5583 		    xmlGenericError(xmlGenericErrorContext,
5584 			    "HPP: entering START_TAG\n");
5585 #endif
5586 		}
5587 		break;
5588             case XML_PARSER_EPILOG:
5589 		if (in->buf == NULL)
5590 		    avail = in->length - (in->cur - in->base);
5591 		else
5592 		    avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
5593 		if (avail < 1)
5594 		    goto done;
5595 		cur = in->cur[0];
5596 		if (IS_BLANK_CH(cur)) {
5597 		    htmlParseCharData(ctxt);
5598 		    goto done;
5599 		}
5600 		if (avail < 2)
5601 		    goto done;
5602 		next = in->cur[1];
5603 	        if ((cur == '<') && (next == '!') &&
5604 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
5605 		    if ((!terminate) &&
5606 		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
5607 			goto done;
5608 #ifdef DEBUG_PUSH
5609 		    xmlGenericError(xmlGenericErrorContext,
5610 			    "HPP: Parsing Comment\n");
5611 #endif
5612 		    htmlParseComment(ctxt);
5613 		    ctxt->instate = XML_PARSER_EPILOG;
5614 	        } else if ((cur == '<') && (next == '?')) {
5615 		    if ((!terminate) &&
5616 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5617 			goto done;
5618 #ifdef DEBUG_PUSH
5619 		    xmlGenericError(xmlGenericErrorContext,
5620 			    "HPP: Parsing PI\n");
5621 #endif
5622 		    htmlParsePI(ctxt);
5623 		    ctxt->instate = XML_PARSER_EPILOG;
5624 		} else if ((cur == '<') && (next == '!') &&
5625 		           (avail < 4)) {
5626 		    goto done;
5627 		} else {
5628 		    ctxt->errNo = XML_ERR_DOCUMENT_END;
5629 		    ctxt->wellFormed = 0;
5630 		    ctxt->instate = XML_PARSER_EOF;
5631 #ifdef DEBUG_PUSH
5632 		    xmlGenericError(xmlGenericErrorContext,
5633 			    "HPP: entering EOF\n");
5634 #endif
5635 		    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5636 			ctxt->sax->endDocument(ctxt->userData);
5637 		    goto done;
5638 		}
5639 		break;
5640             case XML_PARSER_START_TAG: {
5641 	        const xmlChar *name;
5642 		int failed;
5643 		const htmlElemDesc * info;
5644 
5645 		/*
5646 		 * no chars in buffer
5647 		 */
5648 		if (avail < 1)
5649 		    goto done;
5650 		/*
5651 		 * not enouth chars in buffer
5652 		 */
5653 		if (avail < 2) {
5654 		    if (!terminate)
5655 			goto done;
5656 		    else
5657 			next = ' ';
5658 		} else {
5659 		    next = in->cur[1];
5660 		}
5661 		cur = in->cur[0];
5662 	        if (cur != '<') {
5663 		    ctxt->instate = XML_PARSER_CONTENT;
5664 #ifdef DEBUG_PUSH
5665 		    xmlGenericError(xmlGenericErrorContext,
5666 			    "HPP: entering CONTENT\n");
5667 #endif
5668 		    break;
5669 		}
5670 		if (next == '/') {
5671 		    ctxt->instate = XML_PARSER_END_TAG;
5672 		    ctxt->checkIndex = 0;
5673 #ifdef DEBUG_PUSH
5674 		    xmlGenericError(xmlGenericErrorContext,
5675 			    "HPP: entering END_TAG\n");
5676 #endif
5677 		    break;
5678 		}
5679 		if ((!terminate) &&
5680 		    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5681 		    goto done;
5682 
5683                 /* Capture start position */
5684 	        if (ctxt->record_info) {
5685 	             node_info.begin_pos = ctxt->input->consumed +
5686 	                                (CUR_PTR - ctxt->input->base);
5687 	             node_info.begin_line = ctxt->input->line;
5688 	        }
5689 
5690 
5691 		failed = htmlParseStartTag(ctxt);
5692 		name = ctxt->name;
5693 		if ((failed == -1) ||
5694 		    (name == NULL)) {
5695 		    if (CUR == '>')
5696 			NEXT;
5697 		    break;
5698 		}
5699 
5700 		/*
5701 		 * Lookup the info for that element.
5702 		 */
5703 		info = htmlTagLookup(name);
5704 		if (info == NULL) {
5705 		    htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5706 		                 "Tag %s invalid\n", name, NULL);
5707 		}
5708 
5709 		/*
5710 		 * Check for an Empty Element labeled the XML/SGML way
5711 		 */
5712 		if ((CUR == '/') && (NXT(1) == '>')) {
5713 		    SKIP(2);
5714 		    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5715 			ctxt->sax->endElement(ctxt->userData, name);
5716 		    htmlnamePop(ctxt);
5717 		    ctxt->instate = XML_PARSER_CONTENT;
5718 #ifdef DEBUG_PUSH
5719 		    xmlGenericError(xmlGenericErrorContext,
5720 			    "HPP: entering CONTENT\n");
5721 #endif
5722 		    break;
5723 		}
5724 
5725 		if (CUR == '>') {
5726 		    NEXT;
5727 		} else {
5728 		    htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5729 		                 "Couldn't find end of Start Tag %s\n",
5730 				 name, NULL);
5731 
5732 		    /*
5733 		     * end of parsing of this node.
5734 		     */
5735 		    if (xmlStrEqual(name, ctxt->name)) {
5736 			nodePop(ctxt);
5737 			htmlnamePop(ctxt);
5738 		    }
5739 
5740 		    if (ctxt->record_info)
5741 		        htmlNodeInfoPush(ctxt, &node_info);
5742 
5743 		    ctxt->instate = XML_PARSER_CONTENT;
5744 #ifdef DEBUG_PUSH
5745 		    xmlGenericError(xmlGenericErrorContext,
5746 			    "HPP: entering CONTENT\n");
5747 #endif
5748 		    break;
5749 		}
5750 
5751 		/*
5752 		 * Check for an Empty Element from DTD definition
5753 		 */
5754 		if ((info != NULL) && (info->empty)) {
5755 		    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5756 			ctxt->sax->endElement(ctxt->userData, name);
5757 		    htmlnamePop(ctxt);
5758 		}
5759 
5760                 if (ctxt->record_info)
5761 	            htmlNodeInfoPush(ctxt, &node_info);
5762 
5763 		ctxt->instate = XML_PARSER_CONTENT;
5764 #ifdef DEBUG_PUSH
5765 		xmlGenericError(xmlGenericErrorContext,
5766 			"HPP: entering CONTENT\n");
5767 #endif
5768                 break;
5769 	    }
5770             case XML_PARSER_CONTENT: {
5771 		xmlChar chr[2] = { 0, 0 };
5772 		long cons;
5773 
5774                 /*
5775 		 * Handle preparsed entities and charRef
5776 		 */
5777 		if (ctxt->token != 0) {
5778 		    chr[0] = (xmlChar) ctxt->token;
5779 		    htmlCheckParagraph(ctxt);
5780 		    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5781 			ctxt->sax->characters(ctxt->userData, chr, 1);
5782 		    ctxt->token = 0;
5783 		    ctxt->checkIndex = 0;
5784 		}
5785 		if ((avail == 1) && (terminate)) {
5786 		    cur = in->cur[0];
5787 		    if ((cur != '<') && (cur != '&')) {
5788 			if (ctxt->sax != NULL) {
5789                             chr[0] = cur;
5790 			    if (IS_BLANK_CH(cur)) {
5791 				if (ctxt->keepBlanks) {
5792 				    if (ctxt->sax->characters != NULL)
5793 					ctxt->sax->characters(
5794 						ctxt->userData, chr, 1);
5795 				} else {
5796 				    if (ctxt->sax->ignorableWhitespace != NULL)
5797 					ctxt->sax->ignorableWhitespace(
5798 						ctxt->userData, chr, 1);
5799 				}
5800 			    } else {
5801 				htmlCheckParagraph(ctxt);
5802 				if (ctxt->sax->characters != NULL)
5803 				    ctxt->sax->characters(
5804 					    ctxt->userData, chr, 1);
5805 			    }
5806 			}
5807 			ctxt->token = 0;
5808 			ctxt->checkIndex = 0;
5809 			in->cur++;
5810 			break;
5811 		    }
5812 		}
5813 		if (avail < 2)
5814 		    goto done;
5815 		cur = in->cur[0];
5816 		next = in->cur[1];
5817 		cons = ctxt->nbChars;
5818 		if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5819 		    (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5820 		    /*
5821 		     * Handle SCRIPT/STYLE separately
5822 		     */
5823 		    if (!terminate) {
5824 		        int idx;
5825 			xmlChar val;
5826 
5827 			idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 0);
5828 			if (idx < 0)
5829 			    goto done;
5830 		        val = in->cur[idx + 2];
5831 			if (val == 0) /* bad cut of input */
5832 			    goto done;
5833 		    }
5834 		    htmlParseScript(ctxt);
5835 		    if ((cur == '<') && (next == '/')) {
5836 			ctxt->instate = XML_PARSER_END_TAG;
5837 			ctxt->checkIndex = 0;
5838 #ifdef DEBUG_PUSH
5839 			xmlGenericError(xmlGenericErrorContext,
5840 				"HPP: entering END_TAG\n");
5841 #endif
5842 			break;
5843 		    }
5844 		} else {
5845 		    /*
5846 		     * Sometimes DOCTYPE arrives in the middle of the document
5847 		     */
5848 		    if ((cur == '<') && (next == '!') &&
5849 			(UPP(2) == 'D') && (UPP(3) == 'O') &&
5850 			(UPP(4) == 'C') && (UPP(5) == 'T') &&
5851 			(UPP(6) == 'Y') && (UPP(7) == 'P') &&
5852 			(UPP(8) == 'E')) {
5853 			if ((!terminate) &&
5854 			    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5855 			    goto done;
5856 			htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5857 			             "Misplaced DOCTYPE declaration\n",
5858 				     BAD_CAST "DOCTYPE" , NULL);
5859 			htmlParseDocTypeDecl(ctxt);
5860 		    } else if ((cur == '<') && (next == '!') &&
5861 			(in->cur[2] == '-') && (in->cur[3] == '-')) {
5862 			if ((!terminate) &&
5863 			    (htmlParseLookupSequence(
5864 				ctxt, '-', '-', '>', 1, 1) < 0))
5865 			    goto done;
5866 #ifdef DEBUG_PUSH
5867 			xmlGenericError(xmlGenericErrorContext,
5868 				"HPP: Parsing Comment\n");
5869 #endif
5870 			htmlParseComment(ctxt);
5871 			ctxt->instate = XML_PARSER_CONTENT;
5872 		    } else if ((cur == '<') && (next == '?')) {
5873 			if ((!terminate) &&
5874 			    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5875 			    goto done;
5876 #ifdef DEBUG_PUSH
5877 			xmlGenericError(xmlGenericErrorContext,
5878 				"HPP: Parsing PI\n");
5879 #endif
5880 			htmlParsePI(ctxt);
5881 			ctxt->instate = XML_PARSER_CONTENT;
5882 		    } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5883 			goto done;
5884 		    } else if ((cur == '<') && (next == '/')) {
5885 			ctxt->instate = XML_PARSER_END_TAG;
5886 			ctxt->checkIndex = 0;
5887 #ifdef DEBUG_PUSH
5888 			xmlGenericError(xmlGenericErrorContext,
5889 				"HPP: entering END_TAG\n");
5890 #endif
5891 			break;
5892 		    } else if (cur == '<') {
5893 			ctxt->instate = XML_PARSER_START_TAG;
5894 			ctxt->checkIndex = 0;
5895 #ifdef DEBUG_PUSH
5896 			xmlGenericError(xmlGenericErrorContext,
5897 				"HPP: entering START_TAG\n");
5898 #endif
5899 			break;
5900 		    } else if (cur == '&') {
5901 			if ((!terminate) &&
5902 			    (htmlParseLookupChars(ctxt,
5903                                                   BAD_CAST "; >/", 4) < 0))
5904 			    goto done;
5905 #ifdef DEBUG_PUSH
5906 			xmlGenericError(xmlGenericErrorContext,
5907 				"HPP: Parsing Reference\n");
5908 #endif
5909 			/* TODO: check generation of subtrees if noent !!! */
5910 			htmlParseReference(ctxt);
5911 		    } else {
5912 		        /*
5913 			 * check that the text sequence is complete
5914 			 * before handing out the data to the parser
5915 			 * to avoid problems with erroneous end of
5916 			 * data detection.
5917 			 */
5918 			if ((!terminate) &&
5919                             (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0))
5920 			    goto done;
5921 			ctxt->checkIndex = 0;
5922 #ifdef DEBUG_PUSH
5923 			xmlGenericError(xmlGenericErrorContext,
5924 				"HPP: Parsing char data\n");
5925 #endif
5926 			htmlParseCharData(ctxt);
5927 		    }
5928 		}
5929 		if (cons == ctxt->nbChars) {
5930 		    if (ctxt->node != NULL) {
5931 			htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5932 			             "detected an error in element content\n",
5933 				     NULL, NULL);
5934 		    }
5935 		    NEXT;
5936 		    break;
5937 		}
5938 
5939 		break;
5940 	    }
5941             case XML_PARSER_END_TAG:
5942 		if (avail < 2)
5943 		    goto done;
5944 		if ((!terminate) &&
5945 		    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5946 		    goto done;
5947 		htmlParseEndTag(ctxt);
5948 		if (ctxt->nameNr == 0) {
5949 		    ctxt->instate = XML_PARSER_EPILOG;
5950 		} else {
5951 		    ctxt->instate = XML_PARSER_CONTENT;
5952 		}
5953 		ctxt->checkIndex = 0;
5954 #ifdef DEBUG_PUSH
5955 		xmlGenericError(xmlGenericErrorContext,
5956 			"HPP: entering CONTENT\n");
5957 #endif
5958 	        break;
5959             case XML_PARSER_CDATA_SECTION:
5960 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5961 			"HPP: internal error, state == CDATA\n",
5962 			     NULL, NULL);
5963 		ctxt->instate = XML_PARSER_CONTENT;
5964 		ctxt->checkIndex = 0;
5965 #ifdef DEBUG_PUSH
5966 		xmlGenericError(xmlGenericErrorContext,
5967 			"HPP: entering CONTENT\n");
5968 #endif
5969 		break;
5970             case XML_PARSER_DTD:
5971 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5972 			"HPP: internal error, state == DTD\n",
5973 			     NULL, NULL);
5974 		ctxt->instate = XML_PARSER_CONTENT;
5975 		ctxt->checkIndex = 0;
5976 #ifdef DEBUG_PUSH
5977 		xmlGenericError(xmlGenericErrorContext,
5978 			"HPP: entering CONTENT\n");
5979 #endif
5980 		break;
5981             case XML_PARSER_COMMENT:
5982 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5983 			"HPP: internal error, state == COMMENT\n",
5984 			     NULL, NULL);
5985 		ctxt->instate = XML_PARSER_CONTENT;
5986 		ctxt->checkIndex = 0;
5987 #ifdef DEBUG_PUSH
5988 		xmlGenericError(xmlGenericErrorContext,
5989 			"HPP: entering CONTENT\n");
5990 #endif
5991 		break;
5992             case XML_PARSER_PI:
5993 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5994 			"HPP: internal error, state == PI\n",
5995 			     NULL, NULL);
5996 		ctxt->instate = XML_PARSER_CONTENT;
5997 		ctxt->checkIndex = 0;
5998 #ifdef DEBUG_PUSH
5999 		xmlGenericError(xmlGenericErrorContext,
6000 			"HPP: entering CONTENT\n");
6001 #endif
6002 		break;
6003             case XML_PARSER_ENTITY_DECL:
6004 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6005 			"HPP: internal error, state == ENTITY_DECL\n",
6006 			     NULL, NULL);
6007 		ctxt->instate = XML_PARSER_CONTENT;
6008 		ctxt->checkIndex = 0;
6009 #ifdef DEBUG_PUSH
6010 		xmlGenericError(xmlGenericErrorContext,
6011 			"HPP: entering CONTENT\n");
6012 #endif
6013 		break;
6014             case XML_PARSER_ENTITY_VALUE:
6015 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6016 			"HPP: internal error, state == ENTITY_VALUE\n",
6017 			     NULL, NULL);
6018 		ctxt->instate = XML_PARSER_CONTENT;
6019 		ctxt->checkIndex = 0;
6020 #ifdef DEBUG_PUSH
6021 		xmlGenericError(xmlGenericErrorContext,
6022 			"HPP: entering DTD\n");
6023 #endif
6024 		break;
6025             case XML_PARSER_ATTRIBUTE_VALUE:
6026 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6027 			"HPP: internal error, state == ATTRIBUTE_VALUE\n",
6028 			     NULL, NULL);
6029 		ctxt->instate = XML_PARSER_START_TAG;
6030 		ctxt->checkIndex = 0;
6031 #ifdef DEBUG_PUSH
6032 		xmlGenericError(xmlGenericErrorContext,
6033 			"HPP: entering START_TAG\n");
6034 #endif
6035 		break;
6036 	    case XML_PARSER_SYSTEM_LITERAL:
6037 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6038 		    "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
6039 			     NULL, NULL);
6040 		ctxt->instate = XML_PARSER_CONTENT;
6041 		ctxt->checkIndex = 0;
6042 #ifdef DEBUG_PUSH
6043 		xmlGenericError(xmlGenericErrorContext,
6044 			"HPP: entering CONTENT\n");
6045 #endif
6046 		break;
6047 	    case XML_PARSER_IGNORE:
6048 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6049 			"HPP: internal error, state == XML_PARSER_IGNORE\n",
6050 			     NULL, NULL);
6051 		ctxt->instate = XML_PARSER_CONTENT;
6052 		ctxt->checkIndex = 0;
6053 #ifdef DEBUG_PUSH
6054 		xmlGenericError(xmlGenericErrorContext,
6055 			"HPP: entering CONTENT\n");
6056 #endif
6057 		break;
6058 	    case XML_PARSER_PUBLIC_LITERAL:
6059 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6060 			"HPP: internal error, state == XML_PARSER_LITERAL\n",
6061 			     NULL, NULL);
6062 		ctxt->instate = XML_PARSER_CONTENT;
6063 		ctxt->checkIndex = 0;
6064 #ifdef DEBUG_PUSH
6065 		xmlGenericError(xmlGenericErrorContext,
6066 			"HPP: entering CONTENT\n");
6067 #endif
6068 		break;
6069 
6070 	}
6071     }
6072 done:
6073     if ((avail == 0) && (terminate)) {
6074 	htmlAutoCloseOnEnd(ctxt);
6075 	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
6076 	    /*
6077 	     * SAX: end of the document processing.
6078 	     */
6079 	    ctxt->instate = XML_PARSER_EOF;
6080 	    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6081 		ctxt->sax->endDocument(ctxt->userData);
6082 	}
6083     }
6084     if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
6085 	((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
6086 	 (ctxt->instate == XML_PARSER_EPILOG))) {
6087 	xmlDtdPtr dtd;
6088 	dtd = xmlGetIntSubset(ctxt->myDoc);
6089 	if (dtd == NULL)
6090 	    ctxt->myDoc->intSubset =
6091 		xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
6092 		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
6093 		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
6094     }
6095 #ifdef DEBUG_PUSH
6096     xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
6097 #endif
6098     return(ret);
6099 }
6100 
6101 /**
6102  * htmlParseChunk:
6103  * @ctxt:  an HTML parser context
6104  * @chunk:  an char array
6105  * @size:  the size in byte of the chunk
6106  * @terminate:  last chunk indicator
6107  *
6108  * Parse a Chunk of memory
6109  *
6110  * Returns zero if no error, the xmlParserErrors otherwise.
6111  */
6112 int
6113 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
6114               int terminate) {
6115     if ((ctxt == NULL) || (ctxt->input == NULL)) {
6116 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6117 		     "htmlParseChunk: context error\n", NULL, NULL);
6118 	return(XML_ERR_INTERNAL_ERROR);
6119     }
6120     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6121         (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF))  {
6122 	size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6123 	size_t cur = ctxt->input->cur - ctxt->input->base;
6124 	int res;
6125 
6126 	res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6127 	if (res < 0) {
6128 	    ctxt->errNo = XML_PARSER_EOF;
6129 	    ctxt->disableSAX = 1;
6130 	    return (XML_PARSER_EOF);
6131 	}
6132         xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6133 #ifdef DEBUG_PUSH
6134 	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6135 #endif
6136 
6137 #if 0
6138 	if ((terminate) || (ctxt->input->buf->buffer->use > 80))
6139 	    htmlParseTryOrFinish(ctxt, terminate);
6140 #endif
6141     } else if (ctxt->instate != XML_PARSER_EOF) {
6142 	if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
6143 	    xmlParserInputBufferPtr in = ctxt->input->buf;
6144 	    if ((in->encoder != NULL) && (in->buffer != NULL) &&
6145 		    (in->raw != NULL)) {
6146 		int nbchars;
6147 		size_t base = xmlBufGetInputBase(in->buffer, ctxt->input);
6148 		size_t current = ctxt->input->cur - ctxt->input->base;
6149 
6150 		nbchars = xmlCharEncInput(in, terminate);
6151 		if (nbchars < 0) {
6152 		    htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
6153 			         "encoder error\n", NULL, NULL);
6154 		    return(XML_ERR_INVALID_ENCODING);
6155 		}
6156 		xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
6157 	    }
6158 	}
6159     }
6160     htmlParseTryOrFinish(ctxt, terminate);
6161     if (terminate) {
6162 	if ((ctxt->instate != XML_PARSER_EOF) &&
6163 	    (ctxt->instate != XML_PARSER_EPILOG) &&
6164 	    (ctxt->instate != XML_PARSER_MISC)) {
6165 	    ctxt->errNo = XML_ERR_DOCUMENT_END;
6166 	    ctxt->wellFormed = 0;
6167 	}
6168 	if (ctxt->instate != XML_PARSER_EOF) {
6169 	    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6170 		ctxt->sax->endDocument(ctxt->userData);
6171 	}
6172 	ctxt->instate = XML_PARSER_EOF;
6173     }
6174     return((xmlParserErrors) ctxt->errNo);
6175 }
6176 
6177 /************************************************************************
6178  *									*
6179  *			User entry points				*
6180  *									*
6181  ************************************************************************/
6182 
6183 /**
6184  * htmlCreatePushParserCtxt:
6185  * @sax:  a SAX handler
6186  * @user_data:  The user data returned on SAX callbacks
6187  * @chunk:  a pointer to an array of chars
6188  * @size:  number of chars in the array
6189  * @filename:  an optional file name or URI
6190  * @enc:  an optional encoding
6191  *
6192  * Create a parser context for using the HTML parser in push mode
6193  * The value of @filename is used for fetching external entities
6194  * and error/warning reports.
6195  *
6196  * Returns the new parser context or NULL
6197  */
6198 htmlParserCtxtPtr
6199 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
6200                          const char *chunk, int size, const char *filename,
6201 			 xmlCharEncoding enc) {
6202     htmlParserCtxtPtr ctxt;
6203     htmlParserInputPtr inputStream;
6204     xmlParserInputBufferPtr buf;
6205 
6206     xmlInitParser();
6207 
6208     buf = xmlAllocParserInputBuffer(enc);
6209     if (buf == NULL) return(NULL);
6210 
6211     ctxt = htmlNewParserCtxt();
6212     if (ctxt == NULL) {
6213 	xmlFreeParserInputBuffer(buf);
6214 	return(NULL);
6215     }
6216     if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
6217 	ctxt->charset=XML_CHAR_ENCODING_UTF8;
6218     if (sax != NULL) {
6219 	if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
6220 	    xmlFree(ctxt->sax);
6221 	ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
6222 	if (ctxt->sax == NULL) {
6223 	    xmlFree(buf);
6224 	    xmlFree(ctxt);
6225 	    return(NULL);
6226 	}
6227 	memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
6228 	if (user_data != NULL)
6229 	    ctxt->userData = user_data;
6230     }
6231     if (filename == NULL) {
6232 	ctxt->directory = NULL;
6233     } else {
6234         ctxt->directory = xmlParserGetDirectory(filename);
6235     }
6236 
6237     inputStream = htmlNewInputStream(ctxt);
6238     if (inputStream == NULL) {
6239 	xmlFreeParserCtxt(ctxt);
6240 	xmlFree(buf);
6241 	return(NULL);
6242     }
6243 
6244     if (filename == NULL)
6245 	inputStream->filename = NULL;
6246     else
6247 	inputStream->filename = (char *)
6248 	    xmlCanonicPath((const xmlChar *) filename);
6249     inputStream->buf = buf;
6250     xmlBufResetInput(buf->buffer, inputStream);
6251 
6252     inputPush(ctxt, inputStream);
6253 
6254     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6255         (ctxt->input->buf != NULL))  {
6256 	size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6257 	size_t cur = ctxt->input->cur - ctxt->input->base;
6258 
6259 	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6260 
6261         xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6262 #ifdef DEBUG_PUSH
6263 	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6264 #endif
6265     }
6266     ctxt->progressive = 1;
6267 
6268     return(ctxt);
6269 }
6270 #endif /* LIBXML_PUSH_ENABLED */
6271 
6272 /**
6273  * htmlSAXParseDoc:
6274  * @cur:  a pointer to an array of xmlChar
6275  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6276  * @sax:  the SAX handler block
6277  * @userData: if using SAX, this pointer will be provided on callbacks.
6278  *
6279  * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6280  * to handle parse events. If sax is NULL, fallback to the default DOM
6281  * behavior and return a tree.
6282  *
6283  * Returns the resulting document tree unless SAX is NULL or the document is
6284  *     not well formed.
6285  */
6286 
6287 htmlDocPtr
6288 htmlSAXParseDoc(const xmlChar *cur, const char *encoding,
6289                 htmlSAXHandlerPtr sax, void *userData) {
6290     htmlDocPtr ret;
6291     htmlParserCtxtPtr ctxt;
6292 
6293     xmlInitParser();
6294 
6295     if (cur == NULL) return(NULL);
6296 
6297 
6298     ctxt = htmlCreateDocParserCtxt(cur, encoding);
6299     if (ctxt == NULL) return(NULL);
6300     if (sax != NULL) {
6301         if (ctxt->sax != NULL) xmlFree (ctxt->sax);
6302         ctxt->sax = sax;
6303         ctxt->userData = userData;
6304     }
6305 
6306     htmlParseDocument(ctxt);
6307     ret = ctxt->myDoc;
6308     if (sax != NULL) {
6309 	ctxt->sax = NULL;
6310 	ctxt->userData = NULL;
6311     }
6312     htmlFreeParserCtxt(ctxt);
6313 
6314     return(ret);
6315 }
6316 
6317 /**
6318  * htmlParseDoc:
6319  * @cur:  a pointer to an array of xmlChar
6320  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6321  *
6322  * parse an HTML in-memory document and build a tree.
6323  *
6324  * Returns the resulting document tree
6325  */
6326 
6327 htmlDocPtr
6328 htmlParseDoc(const xmlChar *cur, const char *encoding) {
6329     return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6330 }
6331 
6332 
6333 /**
6334  * htmlCreateFileParserCtxt:
6335  * @filename:  the filename
6336  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6337  *
6338  * Create a parser context for a file content.
6339  * Automatic support for ZLIB/Compress compressed document is provided
6340  * by default if found at compile-time.
6341  *
6342  * Returns the new parser context or NULL
6343  */
6344 htmlParserCtxtPtr
6345 htmlCreateFileParserCtxt(const char *filename, const char *encoding)
6346 {
6347     htmlParserCtxtPtr ctxt;
6348     htmlParserInputPtr inputStream;
6349     char *canonicFilename;
6350     /* htmlCharEncoding enc; */
6351     xmlChar *content, *content_line = (xmlChar *) "charset=";
6352 
6353     if (filename == NULL)
6354         return(NULL);
6355 
6356     ctxt = htmlNewParserCtxt();
6357     if (ctxt == NULL) {
6358 	return(NULL);
6359     }
6360     canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
6361     if (canonicFilename == NULL) {
6362 #ifdef LIBXML_SAX1_ENABLED
6363 	if (xmlDefaultSAXHandler.error != NULL) {
6364 	    xmlDefaultSAXHandler.error(NULL, "out of memory\n");
6365 	}
6366 #endif
6367 	xmlFreeParserCtxt(ctxt);
6368 	return(NULL);
6369     }
6370 
6371     inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6372     xmlFree(canonicFilename);
6373     if (inputStream == NULL) {
6374 	xmlFreeParserCtxt(ctxt);
6375 	return(NULL);
6376     }
6377 
6378     inputPush(ctxt, inputStream);
6379 
6380     /* set encoding */
6381     if (encoding) {
6382         size_t l = strlen(encoding);
6383 
6384 	if (l < 1000) {
6385 	    content = xmlMallocAtomic (xmlStrlen(content_line) + l + 1);
6386 	    if (content) {
6387 		strcpy ((char *)content, (char *)content_line);
6388 		strcat ((char *)content, (char *)encoding);
6389 		htmlCheckEncoding (ctxt, content);
6390 		xmlFree (content);
6391 	    }
6392 	}
6393     }
6394 
6395     return(ctxt);
6396 }
6397 
6398 /**
6399  * htmlSAXParseFile:
6400  * @filename:  the filename
6401  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6402  * @sax:  the SAX handler block
6403  * @userData: if using SAX, this pointer will be provided on callbacks.
6404  *
6405  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6406  * compressed document is provided by default if found at compile-time.
6407  * It use the given SAX function block to handle the parsing callback.
6408  * If sax is NULL, fallback to the default DOM tree building routines.
6409  *
6410  * Returns the resulting document tree unless SAX is NULL or the document is
6411  *     not well formed.
6412  */
6413 
6414 htmlDocPtr
6415 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
6416                  void *userData) {
6417     htmlDocPtr ret;
6418     htmlParserCtxtPtr ctxt;
6419     htmlSAXHandlerPtr oldsax = NULL;
6420 
6421     xmlInitParser();
6422 
6423     ctxt = htmlCreateFileParserCtxt(filename, encoding);
6424     if (ctxt == NULL) return(NULL);
6425     if (sax != NULL) {
6426 	oldsax = ctxt->sax;
6427         ctxt->sax = sax;
6428         ctxt->userData = userData;
6429     }
6430 
6431     htmlParseDocument(ctxt);
6432 
6433     ret = ctxt->myDoc;
6434     if (sax != NULL) {
6435         ctxt->sax = oldsax;
6436         ctxt->userData = NULL;
6437     }
6438     htmlFreeParserCtxt(ctxt);
6439 
6440     return(ret);
6441 }
6442 
6443 /**
6444  * htmlParseFile:
6445  * @filename:  the filename
6446  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6447  *
6448  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6449  * compressed document is provided by default if found at compile-time.
6450  *
6451  * Returns the resulting document tree
6452  */
6453 
6454 htmlDocPtr
6455 htmlParseFile(const char *filename, const char *encoding) {
6456     return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6457 }
6458 
6459 /**
6460  * htmlHandleOmittedElem:
6461  * @val:  int 0 or 1
6462  *
6463  * Set and return the previous value for handling HTML omitted tags.
6464  *
6465  * Returns the last value for 0 for no handling, 1 for auto insertion.
6466  */
6467 
6468 int
6469 htmlHandleOmittedElem(int val) {
6470     int old = htmlOmittedDefaultValue;
6471 
6472     htmlOmittedDefaultValue = val;
6473     return(old);
6474 }
6475 
6476 /**
6477  * htmlElementAllowedHere:
6478  * @parent: HTML parent element
6479  * @elt: HTML element
6480  *
6481  * Checks whether an HTML element may be a direct child of a parent element.
6482  * Note - doesn't check for deprecated elements
6483  *
6484  * Returns 1 if allowed; 0 otherwise.
6485  */
6486 int
6487 htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6488   const char** p ;
6489 
6490   if ( ! elt || ! parent || ! parent->subelts )
6491 	return 0 ;
6492 
6493   for ( p = parent->subelts; *p; ++p )
6494     if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6495       return 1 ;
6496 
6497   return 0 ;
6498 }
6499 /**
6500  * htmlElementStatusHere:
6501  * @parent: HTML parent element
6502  * @elt: HTML element
6503  *
6504  * Checks whether an HTML element may be a direct child of a parent element.
6505  * and if so whether it is valid or deprecated.
6506  *
6507  * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6508  */
6509 htmlStatus
6510 htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6511   if ( ! parent || ! elt )
6512     return HTML_INVALID ;
6513   if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6514     return HTML_INVALID ;
6515 
6516   return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6517 }
6518 /**
6519  * htmlAttrAllowed:
6520  * @elt: HTML element
6521  * @attr: HTML attribute
6522  * @legacy: whether to allow deprecated attributes
6523  *
6524  * Checks whether an attribute is valid for an element
6525  * Has full knowledge of Required and Deprecated attributes
6526  *
6527  * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6528  */
6529 htmlStatus
6530 htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6531   const char** p ;
6532 
6533   if ( !elt || ! attr )
6534 	return HTML_INVALID ;
6535 
6536   if ( elt->attrs_req )
6537     for ( p = elt->attrs_req; *p; ++p)
6538       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6539         return HTML_REQUIRED ;
6540 
6541   if ( elt->attrs_opt )
6542     for ( p = elt->attrs_opt; *p; ++p)
6543       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6544         return HTML_VALID ;
6545 
6546   if ( legacy && elt->attrs_depr )
6547     for ( p = elt->attrs_depr; *p; ++p)
6548       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6549         return HTML_DEPRECATED ;
6550 
6551   return HTML_INVALID ;
6552 }
6553 /**
6554  * htmlNodeStatus:
6555  * @node: an htmlNodePtr in a tree
6556  * @legacy: whether to allow deprecated elements (YES is faster here
6557  *	for Element nodes)
6558  *
6559  * Checks whether the tree node is valid.  Experimental (the author
6560  *     only uses the HTML enhancements in a SAX parser)
6561  *
6562  * Return: for Element nodes, a return from htmlElementAllowedHere (if
6563  *	legacy allowed) or htmlElementStatusHere (otherwise).
6564  *	for Attribute nodes, a return from htmlAttrAllowed
6565  *	for other nodes, HTML_NA (no checks performed)
6566  */
6567 htmlStatus
6568 htmlNodeStatus(const htmlNodePtr node, int legacy) {
6569   if ( ! node )
6570     return HTML_INVALID ;
6571 
6572   switch ( node->type ) {
6573     case XML_ELEMENT_NODE:
6574       return legacy
6575 	? ( htmlElementAllowedHere (
6576 		htmlTagLookup(node->parent->name) , node->name
6577 		) ? HTML_VALID : HTML_INVALID )
6578 	: htmlElementStatusHere(
6579 		htmlTagLookup(node->parent->name) ,
6580 		htmlTagLookup(node->name) )
6581 	;
6582     case XML_ATTRIBUTE_NODE:
6583       return htmlAttrAllowed(
6584 	htmlTagLookup(node->parent->name) , node->name, legacy) ;
6585     default: return HTML_NA ;
6586   }
6587 }
6588 /************************************************************************
6589  *									*
6590  *	New set (2.6.0) of simpler and more flexible APIs		*
6591  *									*
6592  ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope.  A variable named dict must be visible where the macro
 * is expanded; a NULL dict means the string is always freed.
 *
 * The expansion is a single statement and must be followed by a
 * semicolon, so it can be used safely as the body of an if/else.
 */
#define DICT_FREE(str)						\
    do {							\
	if ((str) && ((!dict) ||				\
	    (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
	    xmlFree((char *)(str));				\
    } while (0)
6604 
6605 /**
6606  * htmlCtxtReset:
6607  * @ctxt: an HTML parser context
6608  *
6609  * Reset a parser context
6610  */
6611 void
6612 htmlCtxtReset(htmlParserCtxtPtr ctxt)
6613 {
6614     xmlParserInputPtr input;
6615     xmlDictPtr dict;
6616 
6617     if (ctxt == NULL)
6618         return;
6619 
6620     xmlInitParser();
6621     dict = ctxt->dict;
6622 
6623     while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6624         xmlFreeInputStream(input);
6625     }
6626     ctxt->inputNr = 0;
6627     ctxt->input = NULL;
6628 
6629     ctxt->spaceNr = 0;
6630     if (ctxt->spaceTab != NULL) {
6631 	ctxt->spaceTab[0] = -1;
6632 	ctxt->space = &ctxt->spaceTab[0];
6633     } else {
6634 	ctxt->space = NULL;
6635     }
6636 
6637 
6638     ctxt->nodeNr = 0;
6639     ctxt->node = NULL;
6640 
6641     ctxt->nameNr = 0;
6642     ctxt->name = NULL;
6643 
6644     DICT_FREE(ctxt->version);
6645     ctxt->version = NULL;
6646     DICT_FREE(ctxt->encoding);
6647     ctxt->encoding = NULL;
6648     DICT_FREE(ctxt->directory);
6649     ctxt->directory = NULL;
6650     DICT_FREE(ctxt->extSubURI);
6651     ctxt->extSubURI = NULL;
6652     DICT_FREE(ctxt->extSubSystem);
6653     ctxt->extSubSystem = NULL;
6654     if (ctxt->myDoc != NULL)
6655         xmlFreeDoc(ctxt->myDoc);
6656     ctxt->myDoc = NULL;
6657 
6658     ctxt->standalone = -1;
6659     ctxt->hasExternalSubset = 0;
6660     ctxt->hasPErefs = 0;
6661     ctxt->html = 1;
6662     ctxt->external = 0;
6663     ctxt->instate = XML_PARSER_START;
6664     ctxt->token = 0;
6665 
6666     ctxt->wellFormed = 1;
6667     ctxt->nsWellFormed = 1;
6668     ctxt->disableSAX = 0;
6669     ctxt->valid = 1;
6670     ctxt->vctxt.userData = ctxt;
6671     ctxt->vctxt.error = xmlParserValidityError;
6672     ctxt->vctxt.warning = xmlParserValidityWarning;
6673     ctxt->record_info = 0;
6674     ctxt->nbChars = 0;
6675     ctxt->checkIndex = 0;
6676     ctxt->inSubset = 0;
6677     ctxt->errNo = XML_ERR_OK;
6678     ctxt->depth = 0;
6679     ctxt->charset = XML_CHAR_ENCODING_NONE;
6680     ctxt->catalogs = NULL;
6681     xmlInitNodeInfoSeq(&ctxt->node_seq);
6682 
6683     if (ctxt->attsDefault != NULL) {
6684         xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
6685         ctxt->attsDefault = NULL;
6686     }
6687     if (ctxt->attsSpecial != NULL) {
6688         xmlHashFree(ctxt->attsSpecial, NULL);
6689         ctxt->attsSpecial = NULL;
6690     }
6691 }
6692 
6693 /**
6694  * htmlCtxtUseOptions:
6695  * @ctxt: an HTML parser context
6696  * @options:  a combination of htmlParserOption(s)
6697  *
6698  * Applies the options to the parser context
6699  *
6700  * Returns 0 in case of success, the set of unknown or unimplemented options
6701  *         in case of error.
6702  */
6703 int
6704 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6705 {
6706     if (ctxt == NULL)
6707         return(-1);
6708 
6709     if (options & HTML_PARSE_NOWARNING) {
6710         ctxt->sax->warning = NULL;
6711         ctxt->vctxt.warning = NULL;
6712         options -= XML_PARSE_NOWARNING;
6713 	ctxt->options |= XML_PARSE_NOWARNING;
6714     }
6715     if (options & HTML_PARSE_NOERROR) {
6716         ctxt->sax->error = NULL;
6717         ctxt->vctxt.error = NULL;
6718         ctxt->sax->fatalError = NULL;
6719         options -= XML_PARSE_NOERROR;
6720 	ctxt->options |= XML_PARSE_NOERROR;
6721     }
6722     if (options & HTML_PARSE_PEDANTIC) {
6723         ctxt->pedantic = 1;
6724         options -= XML_PARSE_PEDANTIC;
6725 	ctxt->options |= XML_PARSE_PEDANTIC;
6726     } else
6727         ctxt->pedantic = 0;
6728     if (options & XML_PARSE_NOBLANKS) {
6729         ctxt->keepBlanks = 0;
6730         ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6731         options -= XML_PARSE_NOBLANKS;
6732 	ctxt->options |= XML_PARSE_NOBLANKS;
6733     } else
6734         ctxt->keepBlanks = 1;
6735     if (options & HTML_PARSE_RECOVER) {
6736         ctxt->recovery = 1;
6737 	options -= HTML_PARSE_RECOVER;
6738     } else
6739         ctxt->recovery = 0;
6740     if (options & HTML_PARSE_COMPACT) {
6741 	ctxt->options |= HTML_PARSE_COMPACT;
6742         options -= HTML_PARSE_COMPACT;
6743     }
6744     if (options & XML_PARSE_HUGE) {
6745 	ctxt->options |= XML_PARSE_HUGE;
6746         options -= XML_PARSE_HUGE;
6747     }
6748     if (options & HTML_PARSE_NODEFDTD) {
6749 	ctxt->options |= HTML_PARSE_NODEFDTD;
6750         options -= HTML_PARSE_NODEFDTD;
6751     }
6752     if (options & HTML_PARSE_IGNORE_ENC) {
6753 	ctxt->options |= HTML_PARSE_IGNORE_ENC;
6754         options -= HTML_PARSE_IGNORE_ENC;
6755     }
6756     if (options & HTML_PARSE_NOIMPLIED) {
6757         ctxt->options |= HTML_PARSE_NOIMPLIED;
6758         options -= HTML_PARSE_NOIMPLIED;
6759     }
6760     ctxt->dictNames = 0;
6761     return (options);
6762 }
6763 
6764 /**
6765  * htmlDoRead:
6766  * @ctxt:  an HTML parser context
6767  * @URL:  the base URL to use for the document
6768  * @encoding:  the document encoding, or NULL
6769  * @options:  a combination of htmlParserOption(s)
6770  * @reuse:  keep the context for reuse
6771  *
6772  * Common front-end for the htmlRead functions
6773  *
6774  * Returns the resulting document tree or NULL
6775  */
6776 static htmlDocPtr
6777 htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6778           int options, int reuse)
6779 {
6780     htmlDocPtr ret;
6781 
6782     htmlCtxtUseOptions(ctxt, options);
6783     ctxt->html = 1;
6784     if (encoding != NULL) {
6785         xmlCharEncodingHandlerPtr hdlr;
6786 
6787 	hdlr = xmlFindCharEncodingHandler(encoding);
6788 	if (hdlr != NULL) {
6789 	    xmlSwitchToEncoding(ctxt, hdlr);
6790 	    if (ctxt->input->encoding != NULL)
6791 	      xmlFree((xmlChar *) ctxt->input->encoding);
6792             ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6793         }
6794     }
6795     if ((URL != NULL) && (ctxt->input != NULL) &&
6796         (ctxt->input->filename == NULL))
6797         ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6798     htmlParseDocument(ctxt);
6799     ret = ctxt->myDoc;
6800     ctxt->myDoc = NULL;
6801     if (!reuse) {
6802         if ((ctxt->dictNames) &&
6803 	    (ret != NULL) &&
6804 	    (ret->dict == ctxt->dict))
6805 	    ctxt->dict = NULL;
6806 	xmlFreeParserCtxt(ctxt);
6807     }
6808     return (ret);
6809 }
6810 
6811 /**
6812  * htmlReadDoc:
6813  * @cur:  a pointer to a zero terminated string
6814  * @URL:  the base URL to use for the document
6815  * @encoding:  the document encoding, or NULL
6816  * @options:  a combination of htmlParserOption(s)
6817  *
6818  * parse an XML in-memory document and build a tree.
6819  *
6820  * Returns the resulting document tree
6821  */
6822 htmlDocPtr
6823 htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6824 {
6825     htmlParserCtxtPtr ctxt;
6826 
6827     if (cur == NULL)
6828         return (NULL);
6829 
6830     xmlInitParser();
6831     ctxt = htmlCreateDocParserCtxt(cur, NULL);
6832     if (ctxt == NULL)
6833         return (NULL);
6834     return (htmlDoRead(ctxt, URL, encoding, options, 0));
6835 }
6836 
6837 /**
6838  * htmlReadFile:
6839  * @filename:  a file or URL
6840  * @encoding:  the document encoding, or NULL
6841  * @options:  a combination of htmlParserOption(s)
6842  *
6843  * parse an XML file from the filesystem or the network.
6844  *
6845  * Returns the resulting document tree
6846  */
6847 htmlDocPtr
6848 htmlReadFile(const char *filename, const char *encoding, int options)
6849 {
6850     htmlParserCtxtPtr ctxt;
6851 
6852     xmlInitParser();
6853     ctxt = htmlCreateFileParserCtxt(filename, encoding);
6854     if (ctxt == NULL)
6855         return (NULL);
6856     return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6857 }
6858 
6859 /**
6860  * htmlReadMemory:
6861  * @buffer:  a pointer to a char array
6862  * @size:  the size of the array
6863  * @URL:  the base URL to use for the document
6864  * @encoding:  the document encoding, or NULL
6865  * @options:  a combination of htmlParserOption(s)
6866  *
6867  * parse an XML in-memory document and build a tree.
6868  *
6869  * Returns the resulting document tree
6870  */
6871 htmlDocPtr
6872 htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6873 {
6874     htmlParserCtxtPtr ctxt;
6875 
6876     xmlInitParser();
6877     ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6878     if (ctxt == NULL)
6879         return (NULL);
6880     htmlDefaultSAXHandlerInit();
6881     if (ctxt->sax != NULL)
6882         memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
6883     return (htmlDoRead(ctxt, URL, encoding, options, 0));
6884 }
6885 
6886 /**
6887  * htmlReadFd:
6888  * @fd:  an open file descriptor
6889  * @URL:  the base URL to use for the document
6890  * @encoding:  the document encoding, or NULL
6891  * @options:  a combination of htmlParserOption(s)
6892  *
6893  * parse an XML from a file descriptor and build a tree.
6894  *
6895  * Returns the resulting document tree
6896  */
6897 htmlDocPtr
6898 htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6899 {
6900     htmlParserCtxtPtr ctxt;
6901     xmlParserInputBufferPtr input;
6902     xmlParserInputPtr stream;
6903 
6904     if (fd < 0)
6905         return (NULL);
6906     xmlInitParser();
6907 
6908     xmlInitParser();
6909     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6910     if (input == NULL)
6911         return (NULL);
6912     ctxt = xmlNewParserCtxt();
6913     if (ctxt == NULL) {
6914         xmlFreeParserInputBuffer(input);
6915         return (NULL);
6916     }
6917     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6918     if (stream == NULL) {
6919         xmlFreeParserInputBuffer(input);
6920 	xmlFreeParserCtxt(ctxt);
6921         return (NULL);
6922     }
6923     inputPush(ctxt, stream);
6924     return (htmlDoRead(ctxt, URL, encoding, options, 0));
6925 }
6926 
6927 /**
6928  * htmlReadIO:
6929  * @ioread:  an I/O read function
6930  * @ioclose:  an I/O close function
6931  * @ioctx:  an I/O handler
6932  * @URL:  the base URL to use for the document
6933  * @encoding:  the document encoding, or NULL
6934  * @options:  a combination of htmlParserOption(s)
6935  *
6936  * parse an HTML document from I/O functions and source and build a tree.
6937  *
6938  * Returns the resulting document tree
6939  */
6940 htmlDocPtr
6941 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6942           void *ioctx, const char *URL, const char *encoding, int options)
6943 {
6944     htmlParserCtxtPtr ctxt;
6945     xmlParserInputBufferPtr input;
6946     xmlParserInputPtr stream;
6947 
6948     if (ioread == NULL)
6949         return (NULL);
6950     xmlInitParser();
6951 
6952     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6953                                          XML_CHAR_ENCODING_NONE);
6954     if (input == NULL) {
6955         if (ioclose != NULL)
6956             ioclose(ioctx);
6957         return (NULL);
6958     }
6959     ctxt = htmlNewParserCtxt();
6960     if (ctxt == NULL) {
6961         xmlFreeParserInputBuffer(input);
6962         return (NULL);
6963     }
6964     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6965     if (stream == NULL) {
6966         xmlFreeParserInputBuffer(input);
6967 	xmlFreeParserCtxt(ctxt);
6968         return (NULL);
6969     }
6970     inputPush(ctxt, stream);
6971     return (htmlDoRead(ctxt, URL, encoding, options, 0));
6972 }
6973 
6974 /**
6975  * htmlCtxtReadDoc:
6976  * @ctxt:  an HTML parser context
6977  * @cur:  a pointer to a zero terminated string
6978  * @URL:  the base URL to use for the document
6979  * @encoding:  the document encoding, or NULL
6980  * @options:  a combination of htmlParserOption(s)
6981  *
6982  * parse an XML in-memory document and build a tree.
6983  * This reuses the existing @ctxt parser context
6984  *
6985  * Returns the resulting document tree
6986  */
6987 htmlDocPtr
6988 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6989                const char *URL, const char *encoding, int options)
6990 {
6991     xmlParserInputPtr stream;
6992 
6993     if (cur == NULL)
6994         return (NULL);
6995     if (ctxt == NULL)
6996         return (NULL);
6997     xmlInitParser();
6998 
6999     htmlCtxtReset(ctxt);
7000 
7001     stream = xmlNewStringInputStream(ctxt, cur);
7002     if (stream == NULL) {
7003         return (NULL);
7004     }
7005     inputPush(ctxt, stream);
7006     return (htmlDoRead(ctxt, URL, encoding, options, 1));
7007 }
7008 
7009 /**
7010  * htmlCtxtReadFile:
7011  * @ctxt:  an HTML parser context
7012  * @filename:  a file or URL
7013  * @encoding:  the document encoding, or NULL
7014  * @options:  a combination of htmlParserOption(s)
7015  *
7016  * parse an XML file from the filesystem or the network.
7017  * This reuses the existing @ctxt parser context
7018  *
7019  * Returns the resulting document tree
7020  */
7021 htmlDocPtr
7022 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
7023                 const char *encoding, int options)
7024 {
7025     xmlParserInputPtr stream;
7026 
7027     if (filename == NULL)
7028         return (NULL);
7029     if (ctxt == NULL)
7030         return (NULL);
7031     xmlInitParser();
7032 
7033     htmlCtxtReset(ctxt);
7034 
7035     stream = xmlLoadExternalEntity(filename, NULL, ctxt);
7036     if (stream == NULL) {
7037         return (NULL);
7038     }
7039     inputPush(ctxt, stream);
7040     return (htmlDoRead(ctxt, NULL, encoding, options, 1));
7041 }
7042 
7043 /**
7044  * htmlCtxtReadMemory:
7045  * @ctxt:  an HTML parser context
7046  * @buffer:  a pointer to a char array
7047  * @size:  the size of the array
7048  * @URL:  the base URL to use for the document
7049  * @encoding:  the document encoding, or NULL
7050  * @options:  a combination of htmlParserOption(s)
7051  *
7052  * parse an XML in-memory document and build a tree.
7053  * This reuses the existing @ctxt parser context
7054  *
7055  * Returns the resulting document tree
7056  */
7057 htmlDocPtr
7058 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
7059                   const char *URL, const char *encoding, int options)
7060 {
7061     xmlParserInputBufferPtr input;
7062     xmlParserInputPtr stream;
7063 
7064     if (ctxt == NULL)
7065         return (NULL);
7066     if (buffer == NULL)
7067         return (NULL);
7068     xmlInitParser();
7069 
7070     htmlCtxtReset(ctxt);
7071 
7072     input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
7073     if (input == NULL) {
7074 	return(NULL);
7075     }
7076 
7077     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7078     if (stream == NULL) {
7079 	xmlFreeParserInputBuffer(input);
7080 	return(NULL);
7081     }
7082 
7083     inputPush(ctxt, stream);
7084     return (htmlDoRead(ctxt, URL, encoding, options, 1));
7085 }
7086 
7087 /**
7088  * htmlCtxtReadFd:
7089  * @ctxt:  an HTML parser context
7090  * @fd:  an open file descriptor
7091  * @URL:  the base URL to use for the document
7092  * @encoding:  the document encoding, or NULL
7093  * @options:  a combination of htmlParserOption(s)
7094  *
7095  * parse an XML from a file descriptor and build a tree.
7096  * This reuses the existing @ctxt parser context
7097  *
7098  * Returns the resulting document tree
7099  */
7100 htmlDocPtr
7101 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
7102               const char *URL, const char *encoding, int options)
7103 {
7104     xmlParserInputBufferPtr input;
7105     xmlParserInputPtr stream;
7106 
7107     if (fd < 0)
7108         return (NULL);
7109     if (ctxt == NULL)
7110         return (NULL);
7111     xmlInitParser();
7112 
7113     htmlCtxtReset(ctxt);
7114 
7115 
7116     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7117     if (input == NULL)
7118         return (NULL);
7119     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7120     if (stream == NULL) {
7121         xmlFreeParserInputBuffer(input);
7122         return (NULL);
7123     }
7124     inputPush(ctxt, stream);
7125     return (htmlDoRead(ctxt, URL, encoding, options, 1));
7126 }
7127 
7128 /**
7129  * htmlCtxtReadIO:
7130  * @ctxt:  an HTML parser context
7131  * @ioread:  an I/O read function
7132  * @ioclose:  an I/O close function
7133  * @ioctx:  an I/O handler
7134  * @URL:  the base URL to use for the document
7135  * @encoding:  the document encoding, or NULL
7136  * @options:  a combination of htmlParserOption(s)
7137  *
7138  * parse an HTML document from I/O functions and source and build a tree.
7139  * This reuses the existing @ctxt parser context
7140  *
7141  * Returns the resulting document tree
7142  */
7143 htmlDocPtr
7144 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
7145               xmlInputCloseCallback ioclose, void *ioctx,
7146 	      const char *URL,
7147               const char *encoding, int options)
7148 {
7149     xmlParserInputBufferPtr input;
7150     xmlParserInputPtr stream;
7151 
7152     if (ioread == NULL)
7153         return (NULL);
7154     if (ctxt == NULL)
7155         return (NULL);
7156     xmlInitParser();
7157 
7158     htmlCtxtReset(ctxt);
7159 
7160     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7161                                          XML_CHAR_ENCODING_NONE);
7162     if (input == NULL) {
7163         if (ioclose != NULL)
7164             ioclose(ioctx);
7165         return (NULL);
7166     }
7167     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7168     if (stream == NULL) {
7169         xmlFreeParserInputBuffer(input);
7170         return (NULL);
7171     }
7172     inputPush(ctxt, stream);
7173     return (htmlDoRead(ctxt, URL, encoding, options, 1));
7174 }
7175 
7176 #define bottom_HTMLparser
7177 #include "elfgcchack.h"
7178 #endif /* LIBXML_HTML_ENABLED */
7179