1 /* libxml2 - Library for parsing XML documents
2  * Copyright (C) 2006-2019 Free Software Foundation, Inc.
3  *
4  * This file is not part of the GNU gettext program, but is used with
5  * GNU gettext.
6  *
7  * The original copyright notice is as follows:
8  */
9 
10 /*
11  * Copyright (C) 1998-2012 Daniel Veillard.  All Rights Reserved.
12  *
13  * Permission is hereby granted, free of charge, to any person obtaining a copy
14  * of this software and associated documentation files (the "Software"), to deal
15  * in the Software without restriction, including without limitation the rights
16  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
17  * copies of the Software, and to permit persons to whom the Software is fur-
18  * nished to do so, subject to the following conditions:
19  *
20  * The above copyright notice and this permission notice shall be included in
21  * all copies or substantial portions of the Software.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FIT-
25  * NESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
26  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
28  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
29  * THE SOFTWARE.
30  *
31  * daniel@veillard.com
32  */
33 
34 /*
35  * HTMLparser.c : an HTML 4.0 non-verifying parser
36  */
37 
38 #define IN_LIBXML
39 #include "libxml.h"
40 #ifdef LIBXML_HTML_ENABLED
41 
42 #include <string.h>
43 #ifdef HAVE_CTYPE_H
44 #include <ctype.h>
45 #endif
46 #ifdef HAVE_STDLIB_H
47 #include <stdlib.h>
48 #endif
49 #ifdef HAVE_SYS_STAT_H
50 #include <sys/stat.h>
51 #endif
52 #ifdef HAVE_FCNTL_H
53 #include <fcntl.h>
54 #endif
55 #ifdef HAVE_UNISTD_H
56 #include <unistd.h>
57 #endif
58 #ifdef LIBXML_ZLIB_ENABLED
59 #include <zlib.h>
60 #endif
61 
62 #include <libxml/xmlmemory.h>
63 #include <libxml/tree.h>
64 #include <libxml/parser.h>
65 #include <libxml/parserInternals.h>
66 #include <libxml/xmlerror.h>
67 #include <libxml/HTMLparser.h>
68 #include <libxml/HTMLtree.h>
69 #include <libxml/entities.h>
70 #include <libxml/encoding.h>
71 #include <libxml/valid.h>
72 #include <libxml/xmlIO.h>
73 #include <libxml/globals.h>
74 #include <libxml/uri.h>
75 
76 #include "buf.h"
77 #include "enc.h"
78 
/*
 * Size constants for the HTML parser. The code that consumes them lies
 * outside this section of the file.
 */
#define HTML_MAX_NAMELEN 1000
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
#define HTML_PARSER_BUFFER_SIZE 100

/* Uncomment to enable the debugging code paths guarded by these symbols. */
/* #define DEBUG */
/* #define DEBUG_PUSH */

/*
 * File-local flag, initialised to 1.
 * NOTE(review): presumably controls insertion of omitted elements;
 * confirm against the code that reads it.
 */
static int htmlOmittedDefaultValue = 1;

/* Forward declarations of routines defined later in the file. */
xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
			     xmlChar end, xmlChar  end2, xmlChar end3);
static void htmlParseComment(htmlParserCtxtPtr ctxt);
91 
92 /************************************************************************
93  *									*
94  *		Some factorized error routines				*
95  *									*
96  ************************************************************************/
97 
98 /**
99  * htmlErrMemory:
100  * @ctxt:  an HTML parser context
101  * @extra:  extra informations
102  *
103  * Handle a redefinition of attribute error
104  */
105 static void
htmlErrMemory(xmlParserCtxtPtr ctxt,const char * extra)106 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
107 {
108     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
109         (ctxt->instate == XML_PARSER_EOF))
110 	return;
111     if (ctxt != NULL) {
112         ctxt->errNo = XML_ERR_NO_MEMORY;
113         ctxt->instate = XML_PARSER_EOF;
114         ctxt->disableSAX = 1;
115     }
116     if (extra)
117         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
118                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
119                         NULL, NULL, 0, 0,
120                         "Memory allocation failed : %s\n", extra);
121     else
122         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
123                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
124                         NULL, NULL, 0, 0, "Memory allocation failed\n");
125 }
126 
127 /**
128  * htmlParseErr:
129  * @ctxt:  an HTML parser context
130  * @error:  the error number
131  * @msg:  the error message
132  * @str1:  string infor
133  * @str2:  string infor
134  *
135  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
136  */
137 static void LIBXML_ATTR_FORMAT(3,0)
htmlParseErr(xmlParserCtxtPtr ctxt,xmlParserErrors error,const char * msg,const xmlChar * str1,const xmlChar * str2)138 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
139              const char *msg, const xmlChar *str1, const xmlChar *str2)
140 {
141     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
142         (ctxt->instate == XML_PARSER_EOF))
143 	return;
144     if (ctxt != NULL)
145 	ctxt->errNo = error;
146     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
147                     XML_ERR_ERROR, NULL, 0,
148 		    (const char *) str1, (const char *) str2,
149 		    NULL, 0, 0,
150 		    msg, str1, str2);
151     if (ctxt != NULL)
152 	ctxt->wellFormed = 0;
153 }
154 
155 /**
156  * htmlParseErrInt:
157  * @ctxt:  an HTML parser context
158  * @error:  the error number
159  * @msg:  the error message
160  * @val:  integer info
161  *
162  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
163  */
164 static void LIBXML_ATTR_FORMAT(3,0)
htmlParseErrInt(xmlParserCtxtPtr ctxt,xmlParserErrors error,const char * msg,int val)165 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
166              const char *msg, int val)
167 {
168     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
169         (ctxt->instate == XML_PARSER_EOF))
170 	return;
171     if (ctxt != NULL)
172 	ctxt->errNo = error;
173     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
174                     XML_ERR_ERROR, NULL, 0, NULL, NULL,
175 		    NULL, val, 0, msg, val);
176     if (ctxt != NULL)
177 	ctxt->wellFormed = 0;
178 }
179 
180 /************************************************************************
181  *									*
182  *	Parser stacks related functions and macros		*
183  *									*
184  ************************************************************************/
185 
186 /**
187  * htmlnamePush:
188  * @ctxt:  an HTML parser context
189  * @value:  the element name
190  *
191  * Pushes a new element name on top of the name stack
192  *
193  * Returns 0 in case of error, the index in the stack otherwise
194  */
195 static int
htmlnamePush(htmlParserCtxtPtr ctxt,const xmlChar * value)196 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
197 {
198     if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
199         ctxt->html = 3;
200     if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
201         ctxt->html = 10;
202     if (ctxt->nameNr >= ctxt->nameMax) {
203         ctxt->nameMax *= 2;
204         ctxt->nameTab = (const xmlChar * *)
205                          xmlRealloc((xmlChar * *)ctxt->nameTab,
206                                     ctxt->nameMax *
207                                     sizeof(ctxt->nameTab[0]));
208         if (ctxt->nameTab == NULL) {
209             htmlErrMemory(ctxt, NULL);
210             return (0);
211         }
212     }
213     ctxt->nameTab[ctxt->nameNr] = value;
214     ctxt->name = value;
215     return (ctxt->nameNr++);
216 }
217 /**
218  * htmlnamePop:
219  * @ctxt: an HTML parser context
220  *
221  * Pops the top element name from the name stack
222  *
223  * Returns the name just removed
224  */
225 static const xmlChar *
htmlnamePop(htmlParserCtxtPtr ctxt)226 htmlnamePop(htmlParserCtxtPtr ctxt)
227 {
228     const xmlChar *ret;
229 
230     if (ctxt->nameNr <= 0)
231         return (NULL);
232     ctxt->nameNr--;
233     if (ctxt->nameNr < 0)
234         return (NULL);
235     if (ctxt->nameNr > 0)
236         ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
237     else
238         ctxt->name = NULL;
239     ret = ctxt->nameTab[ctxt->nameNr];
240     ctxt->nameTab[ctxt->nameNr] = NULL;
241     return (ret);
242 }
243 
244 /**
245  * htmlNodeInfoPush:
246  * @ctxt:  an HTML parser context
247  * @value:  the node info
248  *
249  * Pushes a new element name on top of the node info stack
250  *
251  * Returns 0 in case of error, the index in the stack otherwise
252  */
253 static int
htmlNodeInfoPush(htmlParserCtxtPtr ctxt,htmlParserNodeInfo * value)254 htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
255 {
256     if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
257         if (ctxt->nodeInfoMax == 0)
258                 ctxt->nodeInfoMax = 5;
259         ctxt->nodeInfoMax *= 2;
260         ctxt->nodeInfoTab = (htmlParserNodeInfo *)
261                          xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
262                                     ctxt->nodeInfoMax *
263                                     sizeof(ctxt->nodeInfoTab[0]));
264         if (ctxt->nodeInfoTab == NULL) {
265             htmlErrMemory(ctxt, NULL);
266             return (0);
267         }
268     }
269     ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
270     ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
271     return (ctxt->nodeInfoNr++);
272 }
273 
274 /**
275  * htmlNodeInfoPop:
276  * @ctxt:  an HTML parser context
277  *
278  * Pops the top element name from the node info stack
279  *
280  * Returns 0 in case of error, the pointer to NodeInfo otherwise
281  */
282 static htmlParserNodeInfo *
htmlNodeInfoPop(htmlParserCtxtPtr ctxt)283 htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
284 {
285     if (ctxt->nodeInfoNr <= 0)
286         return (NULL);
287     ctxt->nodeInfoNr--;
288     if (ctxt->nodeInfoNr < 0)
289         return (NULL);
290     if (ctxt->nodeInfoNr > 0)
291         ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
292     else
293         ctxt->nodeInfo = NULL;
294     return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
295 }
296 
/*
 * Macros for accessing the content. These should be used only by the parser,
 * and not exported.
 *
 * Dirty macros, i.e. one needs to make assumptions about the context to use
 * them
 *
 *   CUR_PTR returns the current pointer to the xmlChar to be parsed.
 *   CUR     returns the current xmlChar value, i.e. an 8 bit value if compiled
 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 *           in UNICODE mode. This should be used internally by the parser
 *           only to compare to ASCII values, otherwise it would break when
 *           running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used only
 *           to compare on ASCII based substrings.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR,
 *           it should be used only to compare on ASCII based substrings.
 *   SKIP(n) skips n xmlChars, and must also be used only to skip ASCII defined
 *           strings without newlines within the parser.
 *
 * Clean macros, not dependent on an ASCII context, expect UTF-8 encoding
 *
 *   CURRENT returns the current char value, with the full decoding of
 *           UTF-8 if we are using this mode. It returns an int.
 *   NEXT    skips to the next character, this does the proper decoding
 *           in UTF-8 mode. It also pops up unfinished entities on the fly.
 *   NEXTL(l) skips the current unicode character of l xmlChars long.
 *   COPY(to) copies one char to *to, incrementing CUR_PTR and to accordingly
 */
325 
/*
 * All macros below expect a parser context variable named `ctxt` to be in
 * scope at the point of use.
 *
 * SHRINK, GROW, NEXTL and COPY_BUF are statement macros: each is wrapped in
 * do { ... } while (0), so it behaves as a single statement (no dangling
 * else when used under an unbraced if) and must be followed by ';'.
 */

/* Current byte converted to upper case. */
#define UPPER (toupper(*ctxt->input->cur))

/* Advance by val bytes, updating nbChars and the column counter alike. */
#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val)

/* Byte located val positions after the current one. */
#define NXT(val) ctxt->input->cur[(val)]

/* Same as NXT(val), converted to upper case. */
#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur
#define BASE_PTR ctxt->input->base

/*
 * Shrink the input when more than 2 * INPUT_CHUNK bytes lie behind the
 * cursor and fewer than 2 * INPUT_CHUNK bytes remain ahead of it.
 */
#define SHRINK do {							\
    if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) &&	\
        (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK))	\
        xmlParserInputShrink(ctxt->input);				\
  } while (0)

/*
 * Grow the input when ctxt->progressive is 0 and fewer than INPUT_CHUNK
 * bytes remain ahead of the cursor.
 */
#define GROW do {							\
    if ((ctxt->progressive == 0) &&					\
        (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))		\
        xmlParserInputGrow(ctxt->input, INPUT_CHUNK);			\
  } while (0)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

/* -1 while a token is pending in ctxt->token, the current byte otherwise. */
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


/*
 * Consume one character that is l bytes long: update the line/column
 * bookkeeping, clear any pending token and count one character.
 */
#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;		\
  } while (0)

/************
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);	\
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

/* l must be an int lvalue: it receives the byte length of the character. */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

/*
 * Append character v, encoded on l bytes, to buffer b at index i and
 * advance i. i must be an lvalue.
 */
#define COPY_BUF(l,b,i,v) do {						\
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);				\
    else (i) += xmlCopyChar((l), &(b)[(i)], (v));			\
  } while (0)
377 
378 /**
379  * htmlFindEncoding:
380  * @the HTML parser context
381  *
382  * Ty to find and encoding in the current data available in the input
383  * buffer this is needed to try to switch to the proper encoding when
384  * one face a character error.
385  * That's an heuristic, since it's operating outside of parsing it could
386  * try to use a meta which had been commented out, that's the reason it
387  * should only be used in case of error, not as a default.
388  *
389  * Returns an encoding string or NULL if not found, the string need to
390  *   be freed
391  */
392 static xmlChar *
htmlFindEncoding(xmlParserCtxtPtr ctxt)393 htmlFindEncoding(xmlParserCtxtPtr ctxt) {
394     const xmlChar *start, *cur, *end;
395 
396     if ((ctxt == NULL) || (ctxt->input == NULL) ||
397         (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
398         (ctxt->input->buf->encoder != NULL))
399         return(NULL);
400     if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
401         return(NULL);
402 
403     start = ctxt->input->cur;
404     end = ctxt->input->end;
405     /* we also expect the input buffer to be zero terminated */
406     if (*end != 0)
407         return(NULL);
408 
409     cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
410     if (cur == NULL)
411         return(NULL);
412     cur = xmlStrcasestr(cur, BAD_CAST  "CONTENT");
413     if (cur == NULL)
414         return(NULL);
415     cur = xmlStrcasestr(cur, BAD_CAST  "CHARSET=");
416     if (cur == NULL)
417         return(NULL);
418     cur += 8;
419     start = cur;
420     while (((*cur >= 'A') && (*cur <= 'Z')) ||
421            ((*cur >= 'a') && (*cur <= 'z')) ||
422            ((*cur >= '0') && (*cur <= '9')) ||
423            (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
424            cur++;
425     if (cur == start)
426         return(NULL);
427     return(xmlStrndup(start, cur - start));
428 }
429 
430 /**
431  * htmlCurrentChar:
432  * @ctxt:  the HTML parser context
433  * @len:  pointer to the length of the char read
434  *
435  * The current char value, if using UTF-8 this may actually span multiple
436  * bytes in the input buffer. Implement the end of line normalization:
437  * 2.11 End-of-Line Handling
438  * If the encoding is unspecified, in the case we find an ISO-Latin-1
439  * char, then the encoding converter is plugged in automatically.
440  *
441  * Returns the current char value and its length
442  */
443 
444 static int
htmlCurrentChar(xmlParserCtxtPtr ctxt,int * len)445 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
446     if (ctxt->instate == XML_PARSER_EOF)
447 	return(0);
448 
449     if (ctxt->token != 0) {
450 	*len = 0;
451 	return(ctxt->token);
452     }
453     if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
454 	/*
455 	 * We are supposed to handle UTF8, check it's valid
456 	 * From rfc2044: encoding of the Unicode values on UTF-8:
457 	 *
458 	 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
459 	 * 0000 0000-0000 007F   0xxxxxxx
460 	 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
461 	 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
462 	 *
463 	 * Check for the 0x110000 limit too
464 	 */
465 	const unsigned char *cur = ctxt->input->cur;
466 	unsigned char c;
467 	unsigned int val;
468 
469 	c = *cur;
470 	if (c & 0x80) {
471 	    if (cur[1] == 0) {
472 		xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
473                 cur = ctxt->input->cur;
474             }
475 	    if ((cur[1] & 0xc0) != 0x80)
476 		goto encoding_error;
477 	    if ((c & 0xe0) == 0xe0) {
478 
479 		if (cur[2] == 0) {
480 		    xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
481                     cur = ctxt->input->cur;
482                 }
483 		if ((cur[2] & 0xc0) != 0x80)
484 		    goto encoding_error;
485 		if ((c & 0xf0) == 0xf0) {
486 		    if (cur[3] == 0) {
487 			xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
488                         cur = ctxt->input->cur;
489                     }
490 		    if (((c & 0xf8) != 0xf0) ||
491 			((cur[3] & 0xc0) != 0x80))
492 			goto encoding_error;
493 		    /* 4-byte code */
494 		    *len = 4;
495 		    val = (cur[0] & 0x7) << 18;
496 		    val |= (cur[1] & 0x3f) << 12;
497 		    val |= (cur[2] & 0x3f) << 6;
498 		    val |= cur[3] & 0x3f;
499 		} else {
500 		  /* 3-byte code */
501 		    *len = 3;
502 		    val = (cur[0] & 0xf) << 12;
503 		    val |= (cur[1] & 0x3f) << 6;
504 		    val |= cur[2] & 0x3f;
505 		}
506 	    } else {
507 	      /* 2-byte code */
508 		*len = 2;
509 		val = (cur[0] & 0x1f) << 6;
510 		val |= cur[1] & 0x3f;
511 	    }
512 	    if (!IS_CHAR(val)) {
513 	        htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
514 				"Char 0x%X out of allowed range\n", val);
515 	    }
516 	    return(val);
517 	} else {
518             if ((*ctxt->input->cur == 0) &&
519                 (ctxt->input->cur < ctxt->input->end)) {
520                     htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
521 				"Char 0x%X out of allowed range\n", 0);
522                 *len = 1;
523                 return(' ');
524             }
525 	    /* 1-byte code */
526 	    *len = 1;
527 	    return((int) *ctxt->input->cur);
528 	}
529     }
530     /*
531      * Assume it's a fixed length encoding (1) with
532      * a compatible encoding for the ASCII set, since
533      * XML constructs only use < 128 chars
534      */
535     *len = 1;
536     if ((int) *ctxt->input->cur < 0x80)
537 	return((int) *ctxt->input->cur);
538 
539     /*
540      * Humm this is bad, do an automatic flow conversion
541      */
542     {
543         xmlChar * guess;
544         xmlCharEncodingHandlerPtr handler;
545 
546         guess = htmlFindEncoding(ctxt);
547         if (guess == NULL) {
548             xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
549         } else {
550             if (ctxt->input->encoding != NULL)
551                 xmlFree((xmlChar *) ctxt->input->encoding);
552             ctxt->input->encoding = guess;
553             handler = xmlFindCharEncodingHandler((const char *) guess);
554             if (handler != NULL) {
555                 xmlSwitchToEncoding(ctxt, handler);
556             } else {
557                 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
558                              "Unsupported encoding %s", guess, NULL);
559             }
560         }
561         ctxt->charset = XML_CHAR_ENCODING_UTF8;
562     }
563 
564     return(xmlCurrentChar(ctxt, len));
565 
566 encoding_error:
567     /*
568      * If we detect an UTF8 error that probably mean that the
569      * input encoding didn't get properly advertized in the
570      * declaration header. Report the error and switch the encoding
571      * to ISO-Latin-1 (if you don't like this policy, just declare the
572      * encoding !)
573      */
574     {
575         char buffer[150];
576 
577 	if (ctxt->input->end - ctxt->input->cur >= 4) {
578 	    snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
579 			    ctxt->input->cur[0], ctxt->input->cur[1],
580 			    ctxt->input->cur[2], ctxt->input->cur[3]);
581 	} else {
582 	    snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
583 	}
584 	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
585 		     "Input is not proper UTF-8, indicate encoding !\n",
586 		     BAD_CAST buffer, NULL);
587     }
588 
589     ctxt->charset = XML_CHAR_ENCODING_8859_1;
590     *len = 1;
591     return((int) *ctxt->input->cur);
592 }
593 
594 /**
595  * htmlSkipBlankChars:
596  * @ctxt:  the HTML parser context
597  *
598  * skip all blanks character found at that point in the input streams.
599  *
600  * Returns the number of space chars skipped
601  */
602 
603 static int
htmlSkipBlankChars(xmlParserCtxtPtr ctxt)604 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
605     int res = 0;
606 
607     while (IS_BLANK_CH(*(ctxt->input->cur))) {
608 	if ((*ctxt->input->cur == 0) &&
609 	    (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
610 		xmlPopInput(ctxt);
611 	} else {
612 	    if (*(ctxt->input->cur) == '\n') {
613 		ctxt->input->line++; ctxt->input->col = 1;
614 	    } else ctxt->input->col++;
615 	    ctxt->input->cur++;
616 	    ctxt->nbChars++;
617 	    if (*ctxt->input->cur == 0)
618 		xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
619 	}
620 	res++;
621     }
622     return(res);
623 }
624 
625 
626 
627 /************************************************************************
628  *									*
629  *	The list of HTML elements and their properties		*
630  *									*
631  ************************************************************************/
632 
/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *             3 means the tag is stylistic and should be closed easily
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
 *	, subElements , impliedsubelt , Attributes, userdata
 */
645 
646 /* Definitions and a couple of vars for HTML Elements */
647 
648 #define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
649 #define NB_FONTSTYLE 8
650 #define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
651 #define NB_PHRASE 10
652 #define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
653 #define NB_SPECIAL 16
654 #define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
655 #define NB_INLINE NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL
656 #define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
657 #define NB_BLOCK NB_HEADING + NB_LIST + 14
658 #define FORMCTRL "input", "select", "textarea", "label", "button"
659 #define NB_FORMCTRL 5
660 #define PCDATA
661 #define NB_PCDATA 0
662 #define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
663 #define NB_HEADING 6
664 #define LIST "ul", "ol", "dir", "menu"
665 #define NB_LIST 4
666 #define MODIFIER
667 #define NB_MODIFIER 0
668 #define FLOW BLOCK,INLINE
669 #define NB_FLOW NB_BLOCK + NB_INLINE
670 #define EMPTY NULL
671 
672 
673 static const char* const html_flow[] = { FLOW, NULL } ;
674 static const char* const html_inline[] = { INLINE, NULL } ;
675 
676 /* placeholders: elts with content but no subelements */
677 static const char* const html_pcdata[] = { NULL } ;
678 #define html_cdata html_pcdata
679 
680 
681 /* ... and for HTML Attributes */
682 
683 #define COREATTRS "id", "class", "style", "title"
684 #define NB_COREATTRS 4
685 #define I18N "lang", "dir"
686 #define NB_I18N 2
687 #define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
688 #define NB_EVENTS 9
689 #define ATTRS COREATTRS,I18N,EVENTS
690 #define NB_ATTRS NB_NB_COREATTRS + NB_I18N + NB_EVENTS
691 #define CELLHALIGN "align", "char", "charoff"
692 #define NB_CELLHALIGN 3
693 #define CELLVALIGN "valign"
694 #define NB_CELLVALIGN 1
695 
696 static const char* const html_attrs[] = { ATTRS, NULL } ;
697 static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
698 static const char* const core_attrs[] = { COREATTRS, NULL } ;
699 static const char* const i18n_attrs[] = { I18N, NULL } ;
700 
701 
702 /* Other declarations that should go inline ... */
703 static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
704 	"href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
705 	"tabindex", "onfocus", "onblur", NULL } ;
706 static const char* const target_attr[] = { "target", NULL } ;
707 static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
708 static const char* const alt_attr[] = { "alt", NULL } ;
709 static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
710 static const char* const href_attrs[] = { "href", NULL } ;
711 static const char* const clear_attrs[] = { "clear", NULL } ;
712 static const char* const inline_p[] = { INLINE, "p", NULL } ;
713 
714 static const char* const flow_param[] = { FLOW, "param", NULL } ;
715 static const char* const applet_attrs[] = { COREATTRS , "codebase",
716 		"archive", "alt", "name", "height", "width", "align",
717 		"hspace", "vspace", NULL } ;
718 static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
719 	"tabindex", "accesskey", "onfocus", "onblur", NULL } ;
720 static const char* const basefont_attrs[] =
721 	{ "id", "size", "color", "face", NULL } ;
722 static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
723 static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
724 static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
725 static const char* const body_depr[] = { "background", "bgcolor", "text",
726 	"link", "vlink", "alink", NULL } ;
727 static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
728 	"disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
729 
730 
731 static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
732 static const char* const col_elt[] = { "col", NULL } ;
733 static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
734 static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
735 static const char* const dl_contents[] = { "dt", "dd", NULL } ;
736 static const char* const compact_attr[] = { "compact", NULL } ;
737 static const char* const label_attr[] = { "label", NULL } ;
738 static const char* const fieldset_contents[] = { FLOW, "legend" } ;
739 static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
740 static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
741 static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
742 static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
743 static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
744 static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
745 static const char* const head_attrs[] = { I18N, "profile", NULL } ;
746 static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
747 static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
748 static const char* const version_attr[] = { "version", NULL } ;
749 static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
750 static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
751 static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
752 static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
753 static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
754 static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
755 static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
756 static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
757 static const char* const align_attr[] = { "align", NULL } ;
/*
 * The declarations below are NULL-terminated lists of element or attribute
 * names.  html40ElementTable refers to them (through DECL casts) as the
 * content model and the optional / deprecated / required attribute sets of
 * individual elements.
 * ATTRS, I18N, BLOCK, FLOW, PHRASE, MODIFIER, CELLHALIGN and CELLVALIGN are
 * macros defined outside this section; they are spliced directly into the
 * initializer lists.
 */
static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
static const char* const map_contents[] = { BLOCK, "area", NULL } ;
static const char* const name_attr[] = { "name", NULL } ;
static const char* const action_attr[] = { "action", NULL } ;
static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
static const char* const content_attr[] = { "content", NULL } ;
static const char* const type_attr[] = { "type", NULL } ;
static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
static const char* const object_contents[] = { FLOW, "param", NULL } ;
static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
static const char* const option_elt[] = { "option", NULL } ;
static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
static const char* const width_attr[] = { "width", NULL } ;
static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
static const char* const language_attr[] = { "language", NULL } ;
static const char* const select_content[] = { "optgroup", "option", NULL } ;
static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
static const char* const tr_elt[] = { "tr", NULL } ;
static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
static const char* const tr_contents[] = { "th", "td", NULL } ;
static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
static const char* const li_elt[] = { "li", NULL } ;
static const char* const ul_depr[] = { "type", "compact", NULL} ;
static const char* const dir_attr[] = { "dir", NULL} ;
795 
/*
 * DECL casts one of the `const char* const []` name lists declared above
 * to the plain `const char **` type used in the initializers below.
 */
#define DECL (const char**)

/*
 * html40ElementTable: one htmlElemDesc record per known HTML element,
 * listed in alphabetical order of the element name.  htmlTagLookup() scans
 * it linearly and compares names case-insensitively.
 *
 * Each record is: the element name, seven small integer flags, a
 * description string, then five pointers (a content list, a default
 * sub-element name and three attribute lists).
 * NOTE(review): per the htmlElemDesc declaration in HTMLparser.h the fields
 * are presumably, in order: name, startTag, endTag, saveEndTag, empty,
 * depr, dtd, isinline, desc, subelts, defaultsubelt, attrs_opt, attrs_depr,
 * attrs_req -- confirm against the header before editing a row.
 * The only flag whose use is visible nearby is endTag: htmlAutoCloseOnClose()
 * reports a tag mismatch when it has to close an element whose endTag is 3.
 */
static const htmlElemDesc
html40ElementTable[] = {
{ "a",		0, 0, 0, 0, 0, 0, 1, "anchor ",
	DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
},
{ "abbr",	0, 0, 0, 0, 0, 0, 1, "abbreviated form",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "acronym",	0, 0, 0, 0, 0, 0, 1, "",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "address",	0, 0, 0, 0, 0, 0, 0, "information on author ",
	DECL inline_p  , NULL , DECL html_attrs, NULL, NULL
},
{ "applet",	0, 0, 0, 0, 1, 1, 2, "java applet ",
	DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
},
{ "area",	0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
	EMPTY ,  NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
},
{ "b",		0, 3, 0, 0, 0, 0, 1, "bold text style",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "base",	0, 2, 2, 1, 0, 0, 0, "document base uri ",
	EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
},
{ "basefont",	0, 2, 2, 1, 1, 1, 1, "base font size " ,
	EMPTY , NULL , NULL, DECL basefont_attrs, NULL
},
{ "bdo",	0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
	DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
},
{ "big",	0, 3, 0, 0, 0, 0, 1, "large text style",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "blockquote",	0, 0, 0, 0, 0, 0, 0, "long quotation ",
	DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
},
{ "body",	1, 1, 0, 0, 0, 0, 0, "document body ",
	DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
},
{ "br",		0, 2, 2, 1, 0, 0, 1, "forced line break ",
	EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
},
{ "button",	0, 0, 0, 0, 0, 0, 2, "push button ",
	DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
},
{ "caption",	0, 0, 0, 0, 0, 0, 0, "table caption ",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "center",	0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
	DECL html_flow , NULL , NULL, DECL html_attrs, NULL
},
{ "cite",	0, 0, 0, 0, 0, 0, 1, "citation",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "code",	0, 0, 0, 0, 0, 0, 1, "computer code fragment",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "col",	0, 2, 2, 1, 0, 0, 0, "table column ",
	EMPTY , NULL , DECL col_attrs , NULL, NULL
},
{ "colgroup",	0, 1, 0, 0, 0, 0, 0, "table column group ",
	DECL col_elt , "col" , DECL col_attrs , NULL, NULL
},
{ "dd",		0, 1, 0, 0, 0, 0, 0, "definition description ",
	DECL html_flow , NULL , DECL html_attrs, NULL, NULL
},
{ "del",	0, 0, 0, 0, 0, 0, 2, "deleted text ",
	DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
},
{ "dfn",	0, 0, 0, 0, 0, 0, 1, "instance definition",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "dir",	0, 0, 0, 0, 1, 1, 0, "directory list",
	DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
},
{ "div",	0, 0, 0, 0, 0, 0, 0, "generic language/style container",
	DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "dl",		0, 0, 0, 0, 0, 0, 0, "definition list ",
	DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
},
{ "dt",		0, 1, 0, 0, 0, 0, 0, "definition term ",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "em",		0, 3, 0, 0, 0, 0, 1, "emphasis",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "embed",	0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
	EMPTY, NULL, DECL embed_attrs, NULL, NULL
},
{ "fieldset",	0, 0, 0, 0, 0, 0, 0, "form control group ",
	DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
},
{ "font",	0, 3, 0, 0, 1, 1, 1, "local change to font ",
	DECL html_inline, NULL, NULL, DECL font_attrs, NULL
},
{ "form",	0, 0, 0, 0, 0, 0, 0, "interactive form ",
	DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
},
{ "frame",	0, 2, 2, 1, 0, 2, 0, "subwindow " ,
	EMPTY, NULL, NULL, DECL frame_attrs, NULL
},
{ "frameset",	0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
	DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
},
{ "h1",		0, 0, 0, 0, 0, 0, 0, "heading ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h2",		0, 0, 0, 0, 0, 0, 0, "heading ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h3",		0, 0, 0, 0, 0, 0, 0, "heading ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h4",		0, 0, 0, 0, 0, 0, 0, "heading ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h5",		0, 0, 0, 0, 0, 0, 0, "heading ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h6",		0, 0, 0, 0, 0, 0, 0, "heading ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "head",	1, 1, 0, 0, 0, 0, 0, "document head ",
	DECL head_contents, NULL, DECL head_attrs, NULL, NULL
},
{ "hr",		0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
	EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
},
{ "html",	1, 1, 0, 0, 0, 0, 0, "document root element ",
	DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
},
{ "i",		0, 3, 0, 0, 0, 0, 1, "italic text style",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "iframe",	0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
	DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
},
{ "img",	0, 2, 2, 1, 0, 0, 1, "embedded image ",
	EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
},
{ "input",	0, 2, 2, 1, 0, 0, 1, "form control ",
	EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
},
{ "ins",	0, 0, 0, 0, 0, 0, 2, "inserted text",
	DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
},
{ "isindex",	0, 2, 2, 1, 1, 1, 0, "single line prompt ",
	EMPTY, NULL, NULL, DECL prompt_attrs, NULL
},
{ "kbd",	0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "label",	0, 0, 0, 0, 0, 0, 1, "form field label text ",
	DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
},
{ "legend",	0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
	DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
},
{ "li",		0, 1, 1, 0, 0, 0, 0, "list item ",
	DECL html_flow, NULL, DECL html_attrs, NULL, NULL
},
{ "link",	0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
	EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
},
{ "map",	0, 0, 0, 0, 0, 0, 2, "client-side image map ",
	DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
},
{ "menu",	0, 0, 0, 0, 1, 1, 0, "menu list ",
	DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
},
{ "meta",	0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
	EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
},
{ "noframes",	0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
	DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
},
{ "noscript",	0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
	DECL html_flow, "div", DECL html_attrs, NULL, NULL
},
{ "object",	0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
	DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
},
{ "ol",		0, 0, 0, 0, 0, 0, 0, "ordered list ",
	DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
},
{ "optgroup",	0, 0, 0, 0, 0, 0, 0, "option group ",
	DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
},
{ "option",	0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
	DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
},
{ "p",		0, 1, 0, 0, 0, 0, 0, "paragraph ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "param",	0, 2, 2, 1, 0, 0, 0, "named property value ",
	EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
},
{ "pre",	0, 0, 0, 0, 0, 0, 0, "preformatted text ",
	DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
},
{ "q",		0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
	DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
},
{ "s",		0, 3, 0, 0, 1, 1, 1, "strike-through text style",
	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
},
{ "samp",	0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "script",	0, 0, 0, 0, 0, 0, 2, "script statements ",
	DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
},
{ "select",	0, 0, 0, 0, 0, 0, 1, "option selector ",
	DECL select_content, NULL, DECL select_attrs, NULL, NULL
},
{ "small",	0, 3, 0, 0, 0, 0, 1, "small text style",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "span",	0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "strike",	0, 3, 0, 0, 1, 1, 1, "strike-through text",
	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
},
{ "strong",	0, 3, 0, 0, 0, 0, 1, "strong emphasis",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "style",	0, 0, 0, 0, 0, 0, 0, "style info ",
	DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
},
{ "sub",	0, 3, 0, 0, 0, 0, 1, "subscript",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "sup",	0, 3, 0, 0, 0, 0, 1, "superscript ",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "table",	0, 0, 0, 0, 0, 0, 0, "",
	DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
},
{ "tbody",	1, 0, 0, 0, 0, 0, 0, "table body ",
	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
},
{ "td",		0, 0, 0, 0, 0, 0, 0, "table data cell",
	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
},
{ "textarea",	0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
	DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
},
{ "tfoot",	0, 1, 0, 0, 0, 0, 0, "table footer ",
	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
},
{ "th",		0, 1, 0, 0, 0, 0, 0, "table header cell",
	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
},
{ "thead",	0, 1, 0, 0, 0, 0, 0, "table header ",
	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
},
{ "title",	0, 0, 0, 0, 0, 0, 0, "document title ",
	DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
},
{ "tr",		0, 0, 0, 0, 0, 0, 0, "table row ",
	DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
},
{ "tt",		0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "u",		0, 3, 0, 0, 1, 1, 1, "underlined text style",
	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
},
{ "ul",		0, 0, 0, 0, 0, 0, 0, "unordered list ",
	DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
},
{ "var",	0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
}
};
1077 
/*
 * Start tags that imply the end of the current element.
 *
 * The array is a sequence of NULL-terminated groups.  The first string of
 * a group is the name of a start tag; the strings after it, up to the
 * group's NULL, are the open elements that this start tag closes
 * implicitly.  One extra NULL after the last group ends the array.
 * htmlInitAutoClose() records the start of every group in
 * htmlStartCloseIndex and htmlCheckAutoClose() searches the groups.
 * FONTSTYLE is a macro defined outside this section.
 */
static const char * const htmlStartClose[] = {
"form",		"form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
		"dl", "ul", "ol", "menu", "dir", "address", "pre",
		"listing", "xmp", "head", NULL,
"head",		"p", NULL,
"title",	"p", NULL,
"body",		"head", "style", "link", "title", "p", NULL,
"frameset",	"head", "style", "link", "title", "p", NULL,
"li",		"p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
		"pre", "listing", "xmp", "head", "li", NULL,
"hr",		"p", "head", NULL,
"h1",		"p", "head", NULL,
"h2",		"p", "head", NULL,
"h3",		"p", "head", NULL,
"h4",		"p", "head", NULL,
"h5",		"p", "head", NULL,
"h6",		"p", "head", NULL,
"dir",		"p", "head", NULL,
"address",	"p", "head", "ul", NULL,
"pre",		"p", "head", "ul", NULL,
"listing",	"p", "head", NULL,
"xmp",		"p", "head", NULL,
"blockquote",	"p", "head", NULL,
"dl",		"p", "dt", "menu", "dir", "address", "pre", "listing",
		"xmp", "head", NULL,
"dt",		"p", "menu", "dir", "address", "pre", "listing", "xmp",
                "head", "dd", NULL,
"dd",		"p", "menu", "dir", "address", "pre", "listing", "xmp",
                "head", "dt", NULL,
"ul",		"p", "head", "ol", "menu", "dir", "address", "pre",
		"listing", "xmp", NULL,
"ol",		"p", "head", "ul", NULL,
"menu",		"p", "head", "ul", NULL,
"p",		"p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
"div",		"p", "head", NULL,
"noscript",	"script", NULL,
"center",	"font", "b", "i", "p", "head", NULL,
"a",		"a", "head", NULL,
"caption",	"p", NULL,
"colgroup",	"caption", "colgroup", "col", "p", NULL,
"col",		"caption", "col", "p", NULL,
"table",	"p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
		"listing", "xmp", "a", NULL,
"th",		"th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"td",		"th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"tr",		"th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
"thead",	"caption", "col", "colgroup", NULL,
"tfoot",	"th", "td", "tr", "caption", "col", "colgroup", "thead",
		"tbody", "p", NULL,
"tbody",	"th", "td", "tr", "caption", "col", "colgroup", "thead",
		"tfoot", "tbody", "p", NULL,
"optgroup",	"option", NULL,
"option",	"option", NULL,
"fieldset",	"legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
		"pre", "listing", "xmp", "a", NULL,
/* most tags in FONTSTYLE, PHRASE and SPECIAL should close <head> */
"tt",		"head", NULL,
"i",		"head", NULL,
"b",		"head", NULL,
"u",		"head", NULL,
"s",		"head", NULL,
"strike",	"head", NULL,
"big",		"head", NULL,
"small",	"head", NULL,

"em",		"head", NULL,
"strong",	"head", NULL,
"dfn",		"head", NULL,
"code",		"head", NULL,
"samp",		"head", NULL,
"kbd",		"head", NULL,
"var",		"head", NULL,
"cite",		"head", NULL,
"abbr",		"head", NULL,
"acronym",	"head", NULL,

/* "a" */
"img",		"head", NULL,
/* "applet" */
/* "embed" */
/* "object" */
"font",		"head", NULL,
/* "basefont" */
"br",		"head", NULL,
/* "script" */
"map",		"head", NULL,
"q",		"head", NULL,
"sub",		"head", NULL,
"sup",		"head", NULL,
"span",		"head", NULL,
"bdo",		"head", NULL,
"iframe",	"head", NULL,
NULL
};
1175 
/*
 * NULL-terminated list of the HTML elements which are supposed not to
 * have CDATA content.  htmlCheckParagraph() implies a p element when one
 * of them is the current element.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = { "html", "head", NULL };
1188 
/*
 * The HTML attributes whose content is of type %Script;.
 * The array has no terminating NULL: users iterate with sizeof.
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static const char *const htmlScriptAttributes[] = {
    /* mouse events */
    "onclick", "ondblclick", "onmousedown", "onmouseup",
    "onmouseover", "onmousemove", "onmouseout",
    /* keyboard events */
    "onkeypress", "onkeydown", "onkeyup",
    /* document events */
    "onload", "onunload",
    /* focus and form events */
    "onfocus", "onblur", "onsubmit", "onreset", "onchange", "onselect"
};
1214 
/*
 * End tag priorities, used by the HTML parser to cope with broken pages.
 * Each element name is given a priority; a stray end tag may only close
 * open elements whose priority is lower than or equal to its own, which
 * lets the parser decide what to do with extra end tags.
 * The table stops at an entry whose name is NULL; that entry carries the
 * priority applied to every element not listed.
 */

typedef struct {
    const char *name;       /* element name, NULL in the last entry */
    int priority;           /* end tag priority of that element */
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    /* table cells, rows, row groups, table */
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    /* document structure */
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    /* Default priority */
    { NULL,    100 }
};
1242 
/*
 * htmlStartCloseIndex[n] points at the first string (the start tag name)
 * of the n-th NULL-terminated group of htmlStartClose; slots that are not
 * used stay NULL.  The table is filled by htmlInitAutoClose(), which then
 * sets htmlStartCloseIndexinitialized to 1 so the work is done only once.
 */
static const char** htmlStartCloseIndex[100];
static int htmlStartCloseIndexinitialized = 0;
1245 
1246 /************************************************************************
1247  *									*
1248  *	functions to handle HTML specific data			*
1249  *									*
1250  ************************************************************************/
1251 
1252 /**
1253  * htmlInitAutoClose:
1254  *
1255  * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1256  * This is not reentrant. Call xmlInitParser() once before processing in
1257  * case of use in multithreaded programs.
1258  */
1259 void
htmlInitAutoClose(void)1260 htmlInitAutoClose(void) {
1261     int indx, i = 0;
1262 
1263     if (htmlStartCloseIndexinitialized) return;
1264 
1265     for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1266     indx = 0;
1267     while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
1268         htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
1269 	while (htmlStartClose[i] != NULL) i++;
1270 	i++;
1271     }
1272     htmlStartCloseIndexinitialized = 1;
1273 }
1274 
1275 /**
1276  * htmlTagLookup:
1277  * @tag:  The tag name in lowercase
1278  *
1279  * Lookup the HTML tag in the ElementTable
1280  *
1281  * Returns the related htmlElemDescPtr or NULL if not found.
1282  */
1283 const htmlElemDesc *
htmlTagLookup(const xmlChar * tag)1284 htmlTagLookup(const xmlChar *tag) {
1285     unsigned int i;
1286 
1287     for (i = 0; i < (sizeof(html40ElementTable) /
1288                      sizeof(html40ElementTable[0]));i++) {
1289         if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
1290 	    return((htmlElemDescPtr) &html40ElementTable[i]);
1291     }
1292     return(NULL);
1293 }
1294 
1295 /**
1296  * htmlGetEndPriority:
1297  * @name: The name of the element to look up the priority for.
1298  *
1299  * Return value: The "endtag" priority.
1300  **/
1301 static int
htmlGetEndPriority(const xmlChar * name)1302 htmlGetEndPriority (const xmlChar *name) {
1303     int i = 0;
1304 
1305     while ((htmlEndPriority[i].name != NULL) &&
1306 	   (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1307 	i++;
1308 
1309     return(htmlEndPriority[i].priority);
1310 }
1311 
1312 
1313 /**
1314  * htmlCheckAutoClose:
1315  * @newtag:  The new tag name
1316  * @oldtag:  The old tag name
1317  *
1318  * Checks whether the new tag is one of the registered valid tags for
1319  * closing old.
1320  * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1321  *
1322  * Returns 0 if no, 1 if yes.
1323  */
1324 static int
htmlCheckAutoClose(const xmlChar * newtag,const xmlChar * oldtag)1325 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1326 {
1327     int i, indx;
1328     const char **closed = NULL;
1329 
1330     if (htmlStartCloseIndexinitialized == 0)
1331         htmlInitAutoClose();
1332 
1333     /* inefficient, but not a big deal */
1334     for (indx = 0; indx < 100; indx++) {
1335         closed = htmlStartCloseIndex[indx];
1336         if (closed == NULL)
1337             return (0);
1338         if (xmlStrEqual(BAD_CAST * closed, newtag))
1339             break;
1340     }
1341 
1342     i = closed - htmlStartClose;
1343     i++;
1344     while (htmlStartClose[i] != NULL) {
1345         if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
1346             return (1);
1347         }
1348         i++;
1349     }
1350     return (0);
1351 }
1352 
1353 /**
1354  * htmlAutoCloseOnClose:
1355  * @ctxt:  an HTML parser context
1356  * @newtag:  The new tag name
1357  * @force:  force the tag closure
1358  *
1359  * The HTML DTD allows an ending tag to implicitly close other tags.
1360  */
1361 static void
htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1362 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1363 {
1364     const htmlElemDesc *info;
1365     int i, priority;
1366 
1367     priority = htmlGetEndPriority(newtag);
1368 
1369     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1370 
1371         if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1372             break;
1373         /*
1374          * A missplaced endtag can only close elements with lower
1375          * or equal priority, so if we find an element with higher
1376          * priority before we find an element with
1377          * matching name, we just ignore this endtag
1378          */
1379         if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1380             return;
1381     }
1382     if (i < 0)
1383         return;
1384 
1385     while (!xmlStrEqual(newtag, ctxt->name)) {
1386         info = htmlTagLookup(ctxt->name);
1387         if ((info != NULL) && (info->endTag == 3)) {
1388             htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1389 	                 "Opening and ending tag mismatch: %s and %s\n",
1390 			 newtag, ctxt->name);
1391         }
1392         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1393             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1394 	htmlnamePop(ctxt);
1395     }
1396 }
1397 
1398 /**
1399  * htmlAutoCloseOnEnd:
1400  * @ctxt:  an HTML parser context
1401  *
1402  * Close all remaining tags at the end of the stream
1403  */
1404 static void
htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)1405 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1406 {
1407     int i;
1408 
1409     if (ctxt->nameNr == 0)
1410         return;
1411     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1412         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1413             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1414 	htmlnamePop(ctxt);
1415     }
1416 }
1417 
1418 /**
1419  * htmlAutoClose:
1420  * @ctxt:  an HTML parser context
1421  * @newtag:  The new tag name or NULL
1422  *
1423  * The HTML DTD allows a tag to implicitly close other tags.
1424  * The list is kept in htmlStartClose array. This function is
1425  * called when a new tag has been detected and generates the
1426  * appropriates closes if possible/needed.
1427  * If newtag is NULL this mean we are at the end of the resource
1428  * and we should check
1429  */
1430 static void
htmlAutoClose(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1431 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1432 {
1433     while ((newtag != NULL) && (ctxt->name != NULL) &&
1434            (htmlCheckAutoClose(newtag, ctxt->name))) {
1435         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1436             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1437 	htmlnamePop(ctxt);
1438     }
1439     if (newtag == NULL) {
1440         htmlAutoCloseOnEnd(ctxt);
1441         return;
1442     }
1443     while ((newtag == NULL) && (ctxt->name != NULL) &&
1444            ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1445             (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1446             (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1447         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1448             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1449 	htmlnamePop(ctxt);
1450     }
1451 }
1452 
1453 /**
1454  * htmlAutoCloseTag:
1455  * @doc:  the HTML document
1456  * @name:  The tag name
1457  * @elem:  the HTML element
1458  *
1459  * The HTML DTD allows a tag to implicitly close other tags.
1460  * The list is kept in htmlStartClose array. This function checks
1461  * if the element or one of it's children would autoclose the
1462  * given tag.
1463  *
1464  * Returns 1 if autoclose, 0 otherwise
1465  */
1466 int
htmlAutoCloseTag(htmlDocPtr doc,const xmlChar * name,htmlNodePtr elem)1467 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1468     htmlNodePtr child;
1469 
1470     if (elem == NULL) return(1);
1471     if (xmlStrEqual(name, elem->name)) return(0);
1472     if (htmlCheckAutoClose(elem->name, name)) return(1);
1473     child = elem->children;
1474     while (child != NULL) {
1475         if (htmlAutoCloseTag(doc, name, child)) return(1);
1476 	child = child->next;
1477     }
1478     return(0);
1479 }
1480 
1481 /**
1482  * htmlIsAutoClosed:
1483  * @doc:  the HTML document
1484  * @elem:  the HTML element
1485  *
1486  * The HTML DTD allows a tag to implicitly close other tags.
1487  * The list is kept in htmlStartClose array. This function checks
1488  * if a tag is autoclosed by one of it's child
1489  *
1490  * Returns 1 if autoclosed, 0 otherwise
1491  */
1492 int
htmlIsAutoClosed(htmlDocPtr doc,htmlNodePtr elem)1493 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1494     htmlNodePtr child;
1495 
1496     if (elem == NULL) return(1);
1497     child = elem->children;
1498     while (child != NULL) {
1499 	if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1500 	child = child->next;
1501     }
1502     return(0);
1503 }
1504 
1505 /**
1506  * htmlCheckImplied:
1507  * @ctxt:  an HTML parser context
1508  * @newtag:  The new tag name
1509  *
1510  * The HTML DTD allows a tag to exists only implicitly
1511  * called when a new tag has been detected and generates the
1512  * appropriates implicit tags if missing
1513  */
1514 static void
htmlCheckImplied(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1515 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1516     int i;
1517 
1518     if (ctxt->options & HTML_PARSE_NOIMPLIED)
1519         return;
1520     if (!htmlOmittedDefaultValue)
1521 	return;
1522     if (xmlStrEqual(newtag, BAD_CAST"html"))
1523 	return;
1524     if (ctxt->nameNr <= 0) {
1525 	htmlnamePush(ctxt, BAD_CAST"html");
1526 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1527 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1528     }
1529     if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1530         return;
1531     if ((ctxt->nameNr <= 1) &&
1532         ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1533 	 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1534 	 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1535 	 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1536 	 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1537 	 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1538         if (ctxt->html >= 3) {
1539             /* we already saw or generated an <head> before */
1540             return;
1541         }
1542         /*
1543          * dropped OBJECT ... i you put it first BODY will be
1544          * assumed !
1545          */
1546         htmlnamePush(ctxt, BAD_CAST"head");
1547         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1548             ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1549     } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1550 	       (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1551 	       (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1552         if (ctxt->html >= 10) {
1553             /* we already saw or generated a <body> before */
1554             return;
1555         }
1556 	for (i = 0;i < ctxt->nameNr;i++) {
1557 	    if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1558 		return;
1559 	    }
1560 	    if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1561 		return;
1562 	    }
1563 	}
1564 
1565 	htmlnamePush(ctxt, BAD_CAST"body");
1566 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1567 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1568     }
1569 }
1570 
1571 /**
1572  * htmlCheckParagraph
1573  * @ctxt:  an HTML parser context
1574  *
1575  * Check whether a p element need to be implied before inserting
1576  * characters in the current element.
1577  *
1578  * Returns 1 if a paragraph has been inserted, 0 if not and -1
1579  *         in case of error.
1580  */
1581 
1582 static int
htmlCheckParagraph(htmlParserCtxtPtr ctxt)1583 htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1584     const xmlChar *tag;
1585     int i;
1586 
1587     if (ctxt == NULL)
1588 	return(-1);
1589     tag = ctxt->name;
1590     if (tag == NULL) {
1591 	htmlAutoClose(ctxt, BAD_CAST"p");
1592 	htmlCheckImplied(ctxt, BAD_CAST"p");
1593 	htmlnamePush(ctxt, BAD_CAST"p");
1594 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1595 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1596 	return(1);
1597     }
1598     if (!htmlOmittedDefaultValue)
1599 	return(0);
1600     for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1601 	if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1602 	    htmlAutoClose(ctxt, BAD_CAST"p");
1603 	    htmlCheckImplied(ctxt, BAD_CAST"p");
1604 	    htmlnamePush(ctxt, BAD_CAST"p");
1605 	    if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1606 		ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1607 	    return(1);
1608 	}
1609     }
1610     return(0);
1611 }
1612 
1613 /**
1614  * htmlIsScriptAttribute:
1615  * @name:  an attribute name
1616  *
1617  * Check if an attribute is of content type Script
1618  *
1619  * Returns 1 is the attribute is a script 0 otherwise
1620  */
1621 int
htmlIsScriptAttribute(const xmlChar * name)1622 htmlIsScriptAttribute(const xmlChar *name) {
1623     unsigned int i;
1624 
1625     if (name == NULL)
1626       return(0);
1627     /*
1628      * all script attributes start with 'on'
1629      */
1630     if ((name[0] != 'o') || (name[1] != 'n'))
1631       return(0);
1632     for (i = 0;
1633 	 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1634 	 i++) {
1635 	if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1636 	    return(1);
1637     }
1638     return(0);
1639 }
1640 
1641 /************************************************************************
1642  *									*
1643  *	The list of HTML predefined entities			*
1644  *									*
1645  ************************************************************************/
1646 
1647 
/*
 * Table of the predefined HTML entities. Each entry gives the Unicode
 * value of the character, the entity name (without '&' and ';') and a
 * descriptive string.
 *
 * The entries are listed in strictly ascending order of their value.
 * htmlEntityValueLookup() stops scanning at the first entry whose value
 * is greater than the one searched for, so a new entry must be inserted
 * at its sorted position or it will never be found by value.
 */
static const htmlEntityDesc  html40EntitiesTable[] = {
/*
 * the 4 absolute ones, plus apostrophe.
 */
{ 34,	"quot",	"quotation mark = APL quote, U+0022 ISOnum" },
{ 38,	"amp",	"ampersand, U+0026 ISOnum" },
{ 39,	"apos",	"single quote" },
{ 60,	"lt",	"less-than sign, U+003C ISOnum" },
{ 62,	"gt",	"greater-than sign, U+003E ISOnum" },

/*
 * A bunch still in the 128-255 range.
 * Replacing them really depends on the charset used.
 */
{ 160,	"nbsp",	"no-break space = non-breaking space, U+00A0 ISOnum" },
{ 161,	"iexcl","inverted exclamation mark, U+00A1 ISOnum" },
{ 162,	"cent",	"cent sign, U+00A2 ISOnum" },
{ 163,	"pound","pound sign, U+00A3 ISOnum" },
{ 164,	"curren","currency sign, U+00A4 ISOnum" },
{ 165,	"yen",	"yen sign = yuan sign, U+00A5 ISOnum" },
{ 166,	"brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
{ 167,	"sect",	"section sign, U+00A7 ISOnum" },
{ 168,	"uml",	"diaeresis = spacing diaeresis, U+00A8 ISOdia" },
{ 169,	"copy",	"copyright sign, U+00A9 ISOnum" },
{ 170,	"ordf",	"feminine ordinal indicator, U+00AA ISOnum" },
{ 171,	"laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
{ 172,	"not",	"not sign, U+00AC ISOnum" },
{ 173,	"shy",	"soft hyphen = discretionary hyphen, U+00AD ISOnum" },
{ 174,	"reg",	"registered sign = registered trade mark sign, U+00AE ISOnum" },
{ 175,	"macr",	"macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
{ 176,	"deg",	"degree sign, U+00B0 ISOnum" },
{ 177,	"plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
{ 178,	"sup2",	"superscript two = superscript digit two = squared, U+00B2 ISOnum" },
{ 179,	"sup3",	"superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
{ 180,	"acute","acute accent = spacing acute, U+00B4 ISOdia" },
{ 181,	"micro","micro sign, U+00B5 ISOnum" },
{ 182,	"para",	"pilcrow sign = paragraph sign, U+00B6 ISOnum" },
{ 183,	"middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
{ 184,	"cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
{ 185,	"sup1",	"superscript one = superscript digit one, U+00B9 ISOnum" },
{ 186,	"ordm",	"masculine ordinal indicator, U+00BA ISOnum" },
{ 187,	"raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
{ 188,	"frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
{ 189,	"frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
{ 190,	"frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
{ 191,	"iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
{ 192,	"Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
{ 193,	"Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
{ 194,	"Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
{ 195,	"Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
{ 196,	"Auml",	"latin capital letter A with diaeresis, U+00C4 ISOlat1" },
{ 197,	"Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
{ 198,	"AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
{ 199,	"Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
{ 200,	"Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
{ 201,	"Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
{ 202,	"Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
{ 203,	"Euml",	"latin capital letter E with diaeresis, U+00CB ISOlat1" },
{ 204,	"Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
{ 205,	"Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
{ 206,	"Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
{ 207,	"Iuml",	"latin capital letter I with diaeresis, U+00CF ISOlat1" },
{ 208,	"ETH",	"latin capital letter ETH, U+00D0 ISOlat1" },
{ 209,	"Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
{ 210,	"Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
{ 211,	"Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
{ 212,	"Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
{ 213,	"Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
{ 214,	"Ouml",	"latin capital letter O with diaeresis, U+00D6 ISOlat1" },
{ 215,	"times","multiplication sign, U+00D7 ISOnum" },
{ 216,	"Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
{ 217,	"Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
{ 218,	"Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
{ 219,	"Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
{ 220,	"Uuml",	"latin capital letter U with diaeresis, U+00DC ISOlat1" },
{ 221,	"Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
{ 222,	"THORN","latin capital letter THORN, U+00DE ISOlat1" },
{ 223,	"szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
{ 224,	"agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
{ 225,	"aacute","latin small letter a with acute, U+00E1 ISOlat1" },
{ 226,	"acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
{ 227,	"atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
{ 228,	"auml",	"latin small letter a with diaeresis, U+00E4 ISOlat1" },
{ 229,	"aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
{ 230,	"aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
{ 231,	"ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
{ 232,	"egrave","latin small letter e with grave, U+00E8 ISOlat1" },
{ 233,	"eacute","latin small letter e with acute, U+00E9 ISOlat1" },
{ 234,	"ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
{ 235,	"euml",	"latin small letter e with diaeresis, U+00EB ISOlat1" },
{ 236,	"igrave","latin small letter i with grave, U+00EC ISOlat1" },
{ 237,	"iacute","latin small letter i with acute, U+00ED ISOlat1" },
{ 238,	"icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
{ 239,	"iuml",	"latin small letter i with diaeresis, U+00EF ISOlat1" },
{ 240,	"eth",	"latin small letter eth, U+00F0 ISOlat1" },
{ 241,	"ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
{ 242,	"ograve","latin small letter o with grave, U+00F2 ISOlat1" },
{ 243,	"oacute","latin small letter o with acute, U+00F3 ISOlat1" },
{ 244,	"ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
{ 245,	"otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
{ 246,	"ouml",	"latin small letter o with diaeresis, U+00F6 ISOlat1" },
{ 247,	"divide","division sign, U+00F7 ISOnum" },
{ 248,	"oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
{ 249,	"ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
{ 250,	"uacute","latin small letter u with acute, U+00FA ISOlat1" },
{ 251,	"ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
{ 252,	"uuml",	"latin small letter u with diaeresis, U+00FC ISOlat1" },
{ 253,	"yacute","latin small letter y with acute, U+00FD ISOlat1" },
{ 254,	"thorn","latin small letter thorn with, U+00FE ISOlat1" },
{ 255,	"yuml",	"latin small letter y with diaeresis, U+00FF ISOlat1" },

{ 338,	"OElig","latin capital ligature OE, U+0152 ISOlat2" },
{ 339,	"oelig","latin small ligature oe, U+0153 ISOlat2" },
{ 352,	"Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
{ 353,	"scaron","latin small letter s with caron, U+0161 ISOlat2" },
{ 376,	"Yuml",	"latin capital letter Y with diaeresis, U+0178 ISOlat2" },

/*
 * Anything below should really be kept as entity references
 */
{ 402,	"fnof",	"latin small f with hook = function = florin, U+0192 ISOtech" },

{ 710,	"circ",	"modifier letter circumflex accent, U+02C6 ISOpub" },
{ 732,	"tilde","small tilde, U+02DC ISOdia" },

{ 913,	"Alpha","greek capital letter alpha, U+0391" },
{ 914,	"Beta",	"greek capital letter beta, U+0392" },
{ 915,	"Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
{ 916,	"Delta","greek capital letter delta, U+0394 ISOgrk3" },
{ 917,	"Epsilon","greek capital letter epsilon, U+0395" },
{ 918,	"Zeta",	"greek capital letter zeta, U+0396" },
{ 919,	"Eta",	"greek capital letter eta, U+0397" },
{ 920,	"Theta","greek capital letter theta, U+0398 ISOgrk3" },
{ 921,	"Iota",	"greek capital letter iota, U+0399" },
{ 922,	"Kappa","greek capital letter kappa, U+039A" },
{ 923,	"Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
{ 924,	"Mu",	"greek capital letter mu, U+039C" },
{ 925,	"Nu",	"greek capital letter nu, U+039D" },
{ 926,	"Xi",	"greek capital letter xi, U+039E ISOgrk3" },
{ 927,	"Omicron","greek capital letter omicron, U+039F" },
{ 928,	"Pi",	"greek capital letter pi, U+03A0 ISOgrk3" },
{ 929,	"Rho",	"greek capital letter rho, U+03A1" },
{ 931,	"Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
{ 932,	"Tau",	"greek capital letter tau, U+03A4" },
{ 933,	"Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
{ 934,	"Phi",	"greek capital letter phi, U+03A6 ISOgrk3" },
{ 935,	"Chi",	"greek capital letter chi, U+03A7" },
{ 936,	"Psi",	"greek capital letter psi, U+03A8 ISOgrk3" },
{ 937,	"Omega","greek capital letter omega, U+03A9 ISOgrk3" },

{ 945,	"alpha","greek small letter alpha, U+03B1 ISOgrk3" },
{ 946,	"beta",	"greek small letter beta, U+03B2 ISOgrk3" },
{ 947,	"gamma","greek small letter gamma, U+03B3 ISOgrk3" },
{ 948,	"delta","greek small letter delta, U+03B4 ISOgrk3" },
{ 949,	"epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
{ 950,	"zeta",	"greek small letter zeta, U+03B6 ISOgrk3" },
{ 951,	"eta",	"greek small letter eta, U+03B7 ISOgrk3" },
{ 952,	"theta","greek small letter theta, U+03B8 ISOgrk3" },
{ 953,	"iota",	"greek small letter iota, U+03B9 ISOgrk3" },
{ 954,	"kappa","greek small letter kappa, U+03BA ISOgrk3" },
{ 955,	"lambda","greek small letter lambda, U+03BB ISOgrk3" },
{ 956,	"mu",	"greek small letter mu, U+03BC ISOgrk3" },
{ 957,	"nu",	"greek small letter nu, U+03BD ISOgrk3" },
{ 958,	"xi",	"greek small letter xi, U+03BE ISOgrk3" },
{ 959,	"omicron","greek small letter omicron, U+03BF NEW" },
{ 960,	"pi",	"greek small letter pi, U+03C0 ISOgrk3" },
{ 961,	"rho",	"greek small letter rho, U+03C1 ISOgrk3" },
{ 962,	"sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
{ 963,	"sigma","greek small letter sigma, U+03C3 ISOgrk3" },
{ 964,	"tau",	"greek small letter tau, U+03C4 ISOgrk3" },
{ 965,	"upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
{ 966,	"phi",	"greek small letter phi, U+03C6 ISOgrk3" },
{ 967,	"chi",	"greek small letter chi, U+03C7 ISOgrk3" },
{ 968,	"psi",	"greek small letter psi, U+03C8 ISOgrk3" },
{ 969,	"omega","greek small letter omega, U+03C9 ISOgrk3" },
{ 977,	"thetasym","greek small letter theta symbol, U+03D1 NEW" },
{ 978,	"upsih","greek upsilon with hook symbol, U+03D2 NEW" },
{ 982,	"piv",	"greek pi symbol, U+03D6 ISOgrk3" },

{ 8194,	"ensp",	"en space, U+2002 ISOpub" },
{ 8195,	"emsp",	"em space, U+2003 ISOpub" },
{ 8201,	"thinsp","thin space, U+2009 ISOpub" },
{ 8204,	"zwnj",	"zero width non-joiner, U+200C NEW RFC 2070" },
{ 8205,	"zwj",	"zero width joiner, U+200D NEW RFC 2070" },
{ 8206,	"lrm",	"left-to-right mark, U+200E NEW RFC 2070" },
{ 8207,	"rlm",	"right-to-left mark, U+200F NEW RFC 2070" },
{ 8211,	"ndash","en dash, U+2013 ISOpub" },
{ 8212,	"mdash","em dash, U+2014 ISOpub" },
{ 8216,	"lsquo","left single quotation mark, U+2018 ISOnum" },
{ 8217,	"rsquo","right single quotation mark, U+2019 ISOnum" },
{ 8218,	"sbquo","single low-9 quotation mark, U+201A NEW" },
{ 8220,	"ldquo","left double quotation mark, U+201C ISOnum" },
{ 8221,	"rdquo","right double quotation mark, U+201D ISOnum" },
{ 8222,	"bdquo","double low-9 quotation mark, U+201E NEW" },
{ 8224,	"dagger","dagger, U+2020 ISOpub" },
{ 8225,	"Dagger","double dagger, U+2021 ISOpub" },

{ 8226,	"bull",	"bullet = black small circle, U+2022 ISOpub" },
{ 8230,	"hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },

{ 8240,	"permil","per mille sign, U+2030 ISOtech" },

{ 8242,	"prime","prime = minutes = feet, U+2032 ISOtech" },
{ 8243,	"Prime","double prime = seconds = inches, U+2033 ISOtech" },

{ 8249,	"lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
{ 8250,	"rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },

{ 8254,	"oline","overline = spacing overscore, U+203E NEW" },
{ 8260,	"frasl","fraction slash, U+2044 NEW" },

{ 8364,	"euro",	"euro sign, U+20AC NEW" },

{ 8465,	"image","blackletter capital I = imaginary part, U+2111 ISOamso" },
{ 8472,	"weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
{ 8476,	"real",	"blackletter capital R = real part symbol, U+211C ISOamso" },
{ 8482,	"trade","trade mark sign, U+2122 ISOnum" },
{ 8501,	"alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
{ 8592,	"larr",	"leftwards arrow, U+2190 ISOnum" },
{ 8593,	"uarr",	"upwards arrow, U+2191 ISOnum" },
{ 8594,	"rarr",	"rightwards arrow, U+2192 ISOnum" },
{ 8595,	"darr",	"downwards arrow, U+2193 ISOnum" },
{ 8596,	"harr",	"left right arrow, U+2194 ISOamsa" },
{ 8629,	"crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
{ 8656,	"lArr",	"leftwards double arrow, U+21D0 ISOtech" },
{ 8657,	"uArr",	"upwards double arrow, U+21D1 ISOamsa" },
{ 8658,	"rArr",	"rightwards double arrow, U+21D2 ISOtech" },
{ 8659,	"dArr",	"downwards double arrow, U+21D3 ISOamsa" },
{ 8660,	"hArr",	"left right double arrow, U+21D4 ISOamsa" },

{ 8704,	"forall","for all, U+2200 ISOtech" },
{ 8706,	"part",	"partial differential, U+2202 ISOtech" },
{ 8707,	"exist","there exists, U+2203 ISOtech" },
{ 8709,	"empty","empty set = null set = diameter, U+2205 ISOamso" },
{ 8711,	"nabla","nabla = backward difference, U+2207 ISOtech" },
{ 8712,	"isin",	"element of, U+2208 ISOtech" },
{ 8713,	"notin","not an element of, U+2209 ISOtech" },
{ 8715,	"ni",	"contains as member, U+220B ISOtech" },
{ 8719,	"prod",	"n-ary product = product sign, U+220F ISOamsb" },
{ 8721,	"sum",	"n-ary summation, U+2211 ISOamsb" },
{ 8722,	"minus","minus sign, U+2212 ISOtech" },
{ 8727,	"lowast","asterisk operator, U+2217 ISOtech" },
{ 8730,	"radic","square root = radical sign, U+221A ISOtech" },
{ 8733,	"prop",	"proportional to, U+221D ISOtech" },
{ 8734,	"infin","infinity, U+221E ISOtech" },
{ 8736,	"ang",	"angle, U+2220 ISOamso" },
{ 8743,	"and",	"logical and = wedge, U+2227 ISOtech" },
{ 8744,	"or",	"logical or = vee, U+2228 ISOtech" },
{ 8745,	"cap",	"intersection = cap, U+2229 ISOtech" },
{ 8746,	"cup",	"union = cup, U+222A ISOtech" },
{ 8747,	"int",	"integral, U+222B ISOtech" },
{ 8756,	"there4","therefore, U+2234 ISOtech" },
{ 8764,	"sim",	"tilde operator = varies with = similar to, U+223C ISOtech" },
{ 8773,	"cong",	"approximately equal to, U+2245 ISOtech" },
{ 8776,	"asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
{ 8800,	"ne",	"not equal to, U+2260 ISOtech" },
{ 8801,	"equiv","identical to, U+2261 ISOtech" },
{ 8804,	"le",	"less-than or equal to, U+2264 ISOtech" },
{ 8805,	"ge",	"greater-than or equal to, U+2265 ISOtech" },
{ 8834,	"sub",	"subset of, U+2282 ISOtech" },
{ 8835,	"sup",	"superset of, U+2283 ISOtech" },
{ 8836,	"nsub",	"not a subset of, U+2284 ISOamsn" },
{ 8838,	"sube",	"subset of or equal to, U+2286 ISOtech" },
{ 8839,	"supe",	"superset of or equal to, U+2287 ISOtech" },
{ 8853,	"oplus","circled plus = direct sum, U+2295 ISOamsb" },
{ 8855,	"otimes","circled times = vector product, U+2297 ISOamsb" },
{ 8869,	"perp",	"up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
{ 8901,	"sdot",	"dot operator, U+22C5 ISOamsb" },
{ 8968,	"lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
{ 8969,	"rceil","right ceiling, U+2309 ISOamsc" },
{ 8970,	"lfloor","left floor = apl downstile, U+230A ISOamsc" },
{ 8971,	"rfloor","right floor, U+230B ISOamsc" },
{ 9001,	"lang",	"left-pointing angle bracket = bra, U+2329 ISOtech" },
{ 9002,	"rang",	"right-pointing angle bracket = ket, U+232A ISOtech" },
{ 9674,	"loz",	"lozenge, U+25CA ISOpub" },

{ 9824,	"spades","black spade suit, U+2660 ISOpub" },
{ 9827,	"clubs","black club suit = shamrock, U+2663 ISOpub" },
{ 9829,	"hearts","black heart suit = valentine, U+2665 ISOpub" },
{ 9830,	"diams","black diamond suit, U+2666 ISOpub" },

};
1930 
1931 /************************************************************************
1932  *									*
1933  *		Commodity functions to handle entities			*
1934  *									*
1935  ************************************************************************/
1936 
/*
 * growBuffer:
 * @buffer:  name of the xmlChar * variable holding the buffer to enlarge
 *
 * Macro used to grow the current buffer: it doubles <buffer>_size and
 * reallocates @buffer to that many xmlChar with xmlRealloc().
 *
 * The expansion is not self-contained. The function using it must have:
 *   - a variable named <buffer>_size (the name is token-pasted from the
 *     macro argument) holding the current size;
 *   - a variable named ctxt, handed to htmlErrMemory() on failure;
 *   - a pointer return type: when the reallocation fails the macro frees
 *     @buffer and executes return(NULL) in the calling function.
 *
 * The doubling is not checked for overflow. The expansion is a bare
 * { ... } block rather than a do { } while (0) statement, so it must not
 * be used as the unbraced body of an if that has an else branch.
 * NOTE(review): pointers the caller keeps into the old buffer presumably
 * have to be recomputed afterwards, as the block may have moved.
 */
#define growBuffer(buffer) {						\
    xmlChar *tmp;							\
    buffer##_size *= 2;							\
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (tmp == NULL) {						\
	htmlErrMemory(ctxt, "growing buffer\n");			\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
    buffer = tmp;							\
}
1951 
1952 /**
1953  * htmlEntityLookup:
1954  * @name: the entity name
1955  *
1956  * Lookup the given entity in EntitiesTable
1957  *
1958  * TODO: the linear scan is really ugly, an hash table is really needed.
1959  *
1960  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1961  */
1962 const htmlEntityDesc *
htmlEntityLookup(const xmlChar * name)1963 htmlEntityLookup(const xmlChar *name) {
1964     unsigned int i;
1965 
1966     for (i = 0;i < (sizeof(html40EntitiesTable)/
1967                     sizeof(html40EntitiesTable[0]));i++) {
1968         if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1969             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1970 	}
1971     }
1972     return(NULL);
1973 }
1974 
1975 /**
1976  * htmlEntityValueLookup:
1977  * @value: the entity's unicode value
1978  *
1979  * Lookup the given entity in EntitiesTable
1980  *
1981  * TODO: the linear scan is really ugly, an hash table is really needed.
1982  *
1983  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1984  */
1985 const htmlEntityDesc *
htmlEntityValueLookup(unsigned int value)1986 htmlEntityValueLookup(unsigned int value) {
1987     unsigned int i;
1988 
1989     for (i = 0;i < (sizeof(html40EntitiesTable)/
1990                     sizeof(html40EntitiesTable[0]));i++) {
1991         if (html40EntitiesTable[i].value >= value) {
1992 	    if (html40EntitiesTable[i].value > value)
1993 		break;
1994             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1995 	}
1996     }
1997     return(NULL);
1998 }
1999 
2000 /**
2001  * UTF8ToHtml:
2002  * @out:  a pointer to an array of bytes to store the result
2003  * @outlen:  the length of @out
2004  * @in:  a pointer to an array of UTF-8 chars
2005  * @inlen:  the length of @in
2006  *
2007  * Take a block of UTF-8 chars in and try to convert it to an ASCII
2008  * plus HTML entities block of chars out.
2009  *
2010  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2011  * The value of @inlen after return is the number of octets consumed
2012  *     as the return value is positive, else unpredictable.
2013  * The value of @outlen after return is the number of octets consumed.
2014  */
2015 int
UTF8ToHtml(unsigned char * out,int * outlen,const unsigned char * in,int * inlen)2016 UTF8ToHtml(unsigned char* out, int *outlen,
2017               const unsigned char* in, int *inlen) {
2018     const unsigned char* processed = in;
2019     const unsigned char* outend;
2020     const unsigned char* outstart = out;
2021     const unsigned char* instart = in;
2022     const unsigned char* inend;
2023     unsigned int c, d;
2024     int trailing;
2025 
2026     if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
2027     if (in == NULL) {
2028         /*
2029 	 * initialization nothing to do
2030 	 */
2031 	*outlen = 0;
2032 	*inlen = 0;
2033 	return(0);
2034     }
2035     inend = in + (*inlen);
2036     outend = out + (*outlen);
2037     while (in < inend) {
2038 	d = *in++;
2039 	if      (d < 0x80)  { c= d; trailing= 0; }
2040 	else if (d < 0xC0) {
2041 	    /* trailing byte in leading position */
2042 	    *outlen = out - outstart;
2043 	    *inlen = processed - instart;
2044 	    return(-2);
2045         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2046         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2047         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2048 	else {
2049 	    /* no chance for this in Ascii */
2050 	    *outlen = out - outstart;
2051 	    *inlen = processed - instart;
2052 	    return(-2);
2053 	}
2054 
2055 	if (inend - in < trailing) {
2056 	    break;
2057 	}
2058 
2059 	for ( ; trailing; trailing--) {
2060 	    if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
2061 		break;
2062 	    c <<= 6;
2063 	    c |= d & 0x3F;
2064 	}
2065 
2066 	/* assertion: c is a single UTF-4 value */
2067 	if (c < 0x80) {
2068 	    if (out + 1 >= outend)
2069 		break;
2070 	    *out++ = c;
2071 	} else {
2072 	    int len;
2073 	    const htmlEntityDesc * ent;
2074 	    const char *cp;
2075 	    char nbuf[16];
2076 
2077 	    /*
2078 	     * Try to lookup a predefined HTML entity for it
2079 	     */
2080 
2081 	    ent = htmlEntityValueLookup(c);
2082 	    if (ent == NULL) {
2083 	      snprintf(nbuf, sizeof(nbuf), "#%u", c);
2084 	      cp = nbuf;
2085 	    }
2086 	    else
2087 	      cp = ent->name;
2088 	    len = strlen(cp);
2089 	    if (out + 2 + len >= outend)
2090 		break;
2091 	    *out++ = '&';
2092 	    memcpy(out, cp, len);
2093 	    out += len;
2094 	    *out++ = ';';
2095 	}
2096 	processed = in;
2097     }
2098     *outlen = out - outstart;
2099     *inlen = processed - instart;
2100     return(0);
2101 }
2102 
2103 /**
2104  * htmlEncodeEntities:
2105  * @out:  a pointer to an array of bytes to store the result
2106  * @outlen:  the length of @out
2107  * @in:  a pointer to an array of UTF-8 chars
2108  * @inlen:  the length of @in
2109  * @quoteChar: the quote character to escape (' or ") or zero.
2110  *
2111  * Take a block of UTF-8 chars in and try to convert it to an ASCII
2112  * plus HTML entities block of chars out.
2113  *
2114  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2115  * The value of @inlen after return is the number of octets consumed
2116  *     as the return value is positive, else unpredictable.
2117  * The value of @outlen after return is the number of octets consumed.
2118  */
2119 int
htmlEncodeEntities(unsigned char * out,int * outlen,const unsigned char * in,int * inlen,int quoteChar)2120 htmlEncodeEntities(unsigned char* out, int *outlen,
2121 		   const unsigned char* in, int *inlen, int quoteChar) {
2122     const unsigned char* processed = in;
2123     const unsigned char* outend;
2124     const unsigned char* outstart = out;
2125     const unsigned char* instart = in;
2126     const unsigned char* inend;
2127     unsigned int c, d;
2128     int trailing;
2129 
2130     if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2131         return(-1);
2132     outend = out + (*outlen);
2133     inend = in + (*inlen);
2134     while (in < inend) {
2135 	d = *in++;
2136 	if      (d < 0x80)  { c= d; trailing= 0; }
2137 	else if (d < 0xC0) {
2138 	    /* trailing byte in leading position */
2139 	    *outlen = out - outstart;
2140 	    *inlen = processed - instart;
2141 	    return(-2);
2142         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2143         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2144         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2145 	else {
2146 	    /* no chance for this in Ascii */
2147 	    *outlen = out - outstart;
2148 	    *inlen = processed - instart;
2149 	    return(-2);
2150 	}
2151 
2152 	if (inend - in < trailing)
2153 	    break;
2154 
2155 	while (trailing--) {
2156 	    if (((d= *in++) & 0xC0) != 0x80) {
2157 		*outlen = out - outstart;
2158 		*inlen = processed - instart;
2159 		return(-2);
2160 	    }
2161 	    c <<= 6;
2162 	    c |= d & 0x3F;
2163 	}
2164 
2165 	/* assertion: c is a single UTF-4 value */
2166 	if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2167 	    (c != '&') && (c != '<') && (c != '>')) {
2168 	    if (out >= outend)
2169 		break;
2170 	    *out++ = c;
2171 	} else {
2172 	    const htmlEntityDesc * ent;
2173 	    const char *cp;
2174 	    char nbuf[16];
2175 	    int len;
2176 
2177 	    /*
2178 	     * Try to lookup a predefined HTML entity for it
2179 	     */
2180 	    ent = htmlEntityValueLookup(c);
2181 	    if (ent == NULL) {
2182 		snprintf(nbuf, sizeof(nbuf), "#%u", c);
2183 		cp = nbuf;
2184 	    }
2185 	    else
2186 		cp = ent->name;
2187 	    len = strlen(cp);
2188 	    if (out + 2 + len > outend)
2189 		break;
2190 	    *out++ = '&';
2191 	    memcpy(out, cp, len);
2192 	    out += len;
2193 	    *out++ = ';';
2194 	}
2195 	processed = in;
2196     }
2197     *outlen = out - outstart;
2198     *inlen = processed - instart;
2199     return(0);
2200 }
2201 
2202 /************************************************************************
2203  *									*
2204  *		Commodity functions to handle streams			*
2205  *									*
2206  ************************************************************************/
2207 
2208 /**
2209  * htmlNewInputStream:
2210  * @ctxt:  an HTML parser context
2211  *
2212  * Create a new input stream structure
2213  * Returns the new input stream or NULL
2214  */
2215 static htmlParserInputPtr
htmlNewInputStream(htmlParserCtxtPtr ctxt)2216 htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2217     htmlParserInputPtr input;
2218 
2219     input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2220     if (input == NULL) {
2221         htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
2222 	return(NULL);
2223     }
2224     memset(input, 0, sizeof(htmlParserInput));
2225     input->filename = NULL;
2226     input->directory = NULL;
2227     input->base = NULL;
2228     input->cur = NULL;
2229     input->buf = NULL;
2230     input->line = 1;
2231     input->col = 1;
2232     input->buf = NULL;
2233     input->free = NULL;
2234     input->version = NULL;
2235     input->consumed = 0;
2236     input->length = 0;
2237     return(input);
2238 }
2239 
2240 
2241 /************************************************************************
2242  *									*
2243  *		Commodity functions, cleanup needed ?			*
2244  *									*
2245  ************************************************************************/
/*
 * All tags allowing PCDATA, from the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array but I don't want to risk any
 * binary incompatibility.
 *
 * areBlanks() compares element names against this list to decide
 * whether a whitespace-only text must be kept.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
    "blockquote", "body", "button", "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
    "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
    "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
    "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
};
2260 
2261 /**
2262  * areBlanks:
2263  * @ctxt:  an HTML parser context
2264  * @str:  a xmlChar *
2265  * @len:  the size of @str
2266  *
2267  * Is this a sequence of blank chars that one can ignore ?
2268  *
2269  * Returns 1 if ignorable 0 otherwise.
2270  */
2271 
areBlanks(htmlParserCtxtPtr ctxt,const xmlChar * str,int len)2272 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2273     unsigned int i;
2274     int j;
2275     xmlNodePtr lastChild;
2276     xmlDtdPtr dtd;
2277 
2278     for (j = 0;j < len;j++)
2279         if (!(IS_BLANK_CH(str[j]))) return(0);
2280 
2281     if (CUR == 0) return(1);
2282     if (CUR != '<') return(0);
2283     if (ctxt->name == NULL)
2284 	return(1);
2285     if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2286 	return(1);
2287     if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2288 	return(1);
2289 
2290     /* Only strip CDATA children of the body tag for strict HTML DTDs */
2291     if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2292         dtd = xmlGetIntSubset(ctxt->myDoc);
2293         if (dtd != NULL && dtd->ExternalID != NULL) {
2294             if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2295                     !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2296                 return(1);
2297         }
2298     }
2299 
2300     if (ctxt->node == NULL) return(0);
2301     lastChild = xmlGetLastChild(ctxt->node);
2302     while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2303 	lastChild = lastChild->prev;
2304     if (lastChild == NULL) {
2305         if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2306             (ctxt->node->content != NULL)) return(0);
2307 	/* keep ws in constructs like ...<b> </b>...
2308 	   for all tags "b" allowing PCDATA */
2309 	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2310 	    if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2311 		return(0);
2312 	    }
2313 	}
2314     } else if (xmlNodeIsText(lastChild)) {
2315         return(0);
2316     } else {
2317 	/* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2318 	   for all tags "p" allowing PCDATA */
2319 	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2320 	    if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2321 		return(0);
2322 	    }
2323 	}
2324     }
2325     return(1);
2326 }
2327 
2328 /**
2329  * htmlNewDocNoDtD:
2330  * @URI:  URI for the dtd, or NULL
2331  * @ExternalID:  the external ID of the DTD, or NULL
2332  *
2333  * Creates a new HTML document without a DTD node if @URI and @ExternalID
2334  * are NULL
2335  *
2336  * Returns a new document, do not initialize the DTD if not provided
2337  */
2338 htmlDocPtr
htmlNewDocNoDtD(const xmlChar * URI,const xmlChar * ExternalID)2339 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2340     xmlDocPtr cur;
2341 
2342     /*
2343      * Allocate a new document and fill the fields.
2344      */
2345     cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2346     if (cur == NULL) {
2347 	htmlErrMemory(NULL, "HTML document creation failed\n");
2348 	return(NULL);
2349     }
2350     memset(cur, 0, sizeof(xmlDoc));
2351 
2352     cur->type = XML_HTML_DOCUMENT_NODE;
2353     cur->version = NULL;
2354     cur->intSubset = NULL;
2355     cur->doc = cur;
2356     cur->name = NULL;
2357     cur->children = NULL;
2358     cur->extSubset = NULL;
2359     cur->oldNs = NULL;
2360     cur->encoding = NULL;
2361     cur->standalone = 1;
2362     cur->compression = 0;
2363     cur->ids = NULL;
2364     cur->refs = NULL;
2365     cur->_private = NULL;
2366     cur->charset = XML_CHAR_ENCODING_UTF8;
2367     cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2368     if ((ExternalID != NULL) ||
2369 	(URI != NULL))
2370 	xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2371     return(cur);
2372 }
2373 
2374 /**
2375  * htmlNewDoc:
2376  * @URI:  URI for the dtd, or NULL
2377  * @ExternalID:  the external ID of the DTD, or NULL
2378  *
2379  * Creates a new HTML document
2380  *
2381  * Returns a new document
2382  */
2383 htmlDocPtr
htmlNewDoc(const xmlChar * URI,const xmlChar * ExternalID)2384 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2385     if ((URI == NULL) && (ExternalID == NULL))
2386 	return(htmlNewDocNoDtD(
2387 		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2388 		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2389 
2390     return(htmlNewDocNoDtD(URI, ExternalID));
2391 }
2392 
2393 
2394 /************************************************************************
2395  *									*
2396  *			The parser itself				*
2397  *	Relates to http://www.w3.org/TR/html40				*
2398  *									*
2399  ************************************************************************/
2400 
2401 /************************************************************************
2402  *									*
2403  *			The parser itself				*
2404  *									*
2405  ************************************************************************/
2406 
/* Forward declaration: htmlParseName() falls back to this routine, which is
 * defined after it. */
static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2408 
2409 /**
2410  * htmlParseHTMLName:
2411  * @ctxt:  an HTML parser context
2412  *
2413  * parse an HTML tag or attribute name, note that we convert it to lowercase
2414  * since HTML names are not case-sensitive.
2415  *
2416  * Returns the Tag Name parsed or NULL
2417  */
2418 
2419 static const xmlChar *
htmlParseHTMLName(htmlParserCtxtPtr ctxt)2420 htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2421     int i = 0;
2422     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2423 
2424     if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2425         (CUR != ':') && (CUR != '.')) return(NULL);
2426 
2427     while ((i < HTML_PARSER_BUFFER_SIZE) &&
2428            ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2429 	   (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2430            (CUR == '.'))) {
2431 	if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2432         else loc[i] = CUR;
2433 	i++;
2434 
2435 	NEXT;
2436     }
2437 
2438     return(xmlDictLookup(ctxt->dict, loc, i));
2439 }
2440 
2441 
2442 /**
2443  * htmlParseHTMLName_nonInvasive:
2444  * @ctxt:  an HTML parser context
2445  *
2446  * parse an HTML tag or attribute name, note that we convert it to lowercase
2447  * since HTML names are not case-sensitive, this doesn't consume the data
2448  * from the stream, it's a look-ahead
2449  *
2450  * Returns the Tag Name parsed or NULL
2451  */
2452 
2453 static const xmlChar *
htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt)2454 htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2455     int i = 0;
2456     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2457 
2458     if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2459         (NXT(1) != ':')) return(NULL);
2460 
2461     while ((i < HTML_PARSER_BUFFER_SIZE) &&
2462            ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2463 	   (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2464 	if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2465         else loc[i] = NXT(1+i);
2466 	i++;
2467     }
2468 
2469     return(xmlDictLookup(ctxt->dict, loc, i));
2470 }
2471 
2472 
2473 /**
2474  * htmlParseName:
2475  * @ctxt:  an HTML parser context
2476  *
2477  * parse an HTML name, this routine is case sensitive.
2478  *
2479  * Returns the Name parsed or NULL
2480  */
2481 
2482 static const xmlChar *
htmlParseName(htmlParserCtxtPtr ctxt)2483 htmlParseName(htmlParserCtxtPtr ctxt) {
2484     const xmlChar *in;
2485     const xmlChar *ret;
2486     int count = 0;
2487 
2488     GROW;
2489 
2490     /*
2491      * Accelerator for simple ASCII names
2492      */
2493     in = ctxt->input->cur;
2494     if (((*in >= 0x61) && (*in <= 0x7A)) ||
2495 	((*in >= 0x41) && (*in <= 0x5A)) ||
2496 	(*in == '_') || (*in == ':')) {
2497 	in++;
2498 	while (((*in >= 0x61) && (*in <= 0x7A)) ||
2499 	       ((*in >= 0x41) && (*in <= 0x5A)) ||
2500 	       ((*in >= 0x30) && (*in <= 0x39)) ||
2501 	       (*in == '_') || (*in == '-') ||
2502 	       (*in == ':') || (*in == '.'))
2503 	    in++;
2504 
2505 	if (in == ctxt->input->end)
2506 	    return(NULL);
2507 
2508 	if ((*in > 0) && (*in < 0x80)) {
2509 	    count = in - ctxt->input->cur;
2510 	    ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2511 	    ctxt->input->cur = in;
2512 	    ctxt->nbChars += count;
2513 	    ctxt->input->col += count;
2514 	    return(ret);
2515 	}
2516     }
2517     return(htmlParseNameComplex(ctxt));
2518 }
2519 
2520 static const xmlChar *
htmlParseNameComplex(xmlParserCtxtPtr ctxt)2521 htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2522     int len = 0, l;
2523     int c;
2524     int count = 0;
2525     const xmlChar *base = ctxt->input->base;
2526 
2527     /*
2528      * Handler for more complex cases
2529      */
2530     GROW;
2531     c = CUR_CHAR(l);
2532     if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2533 	(!IS_LETTER(c) && (c != '_') &&
2534          (c != ':'))) {
2535 	return(NULL);
2536     }
2537 
2538     while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2539 	   ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2540             (c == '.') || (c == '-') ||
2541 	    (c == '_') || (c == ':') ||
2542 	    (IS_COMBINING(c)) ||
2543 	    (IS_EXTENDER(c)))) {
2544 	if (count++ > 100) {
2545 	    count = 0;
2546 	    GROW;
2547 	}
2548 	len += l;
2549 	NEXTL(l);
2550 	c = CUR_CHAR(l);
2551 	if (ctxt->input->base != base) {
2552 	    /*
2553 	     * We changed encoding from an unknown encoding
2554 	     * Input buffer changed location, so we better start again
2555 	     */
2556 	    return(htmlParseNameComplex(ctxt));
2557 	}
2558     }
2559 
2560     if (ctxt->input->cur - ctxt->input->base < len) {
2561         /* Sanity check */
2562 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
2563                      "unexpected change of input buffer", NULL, NULL);
2564         return (NULL);
2565     }
2566 
2567     return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2568 }
2569 
2570 
2571 /**
2572  * htmlParseHTMLAttribute:
2573  * @ctxt:  an HTML parser context
2574  * @stop:  a char stop value
2575  *
2576  * parse an HTML attribute value till the stop (quote), if
2577  * stop is 0 then it stops at the first space
2578  *
2579  * Returns the attribute parsed or NULL
2580  */
2581 
2582 static xmlChar *
htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt,const xmlChar stop)2583 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2584     xmlChar *buffer = NULL;
2585     int buffer_size = 0;
2586     xmlChar *out = NULL;
2587     const xmlChar *name = NULL;
2588     const xmlChar *cur = NULL;
2589     const htmlEntityDesc * ent;
2590 
2591     /*
2592      * allocate a translation buffer.
2593      */
2594     buffer_size = HTML_PARSER_BUFFER_SIZE;
2595     buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
2596     if (buffer == NULL) {
2597 	htmlErrMemory(ctxt, "buffer allocation failed\n");
2598 	return(NULL);
2599     }
2600     out = buffer;
2601 
2602     /*
2603      * Ok loop until we reach one of the ending chars
2604      */
2605     while ((CUR != 0) && (CUR != stop)) {
2606 	if ((stop == 0) && (CUR == '>')) break;
2607 	if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2608         if (CUR == '&') {
2609 	    if (NXT(1) == '#') {
2610 		unsigned int c;
2611 		int bits;
2612 
2613 		c = htmlParseCharRef(ctxt);
2614 		if      (c <    0x80)
2615 		        { *out++  = c;                bits= -6; }
2616 		else if (c <   0x800)
2617 		        { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2618 		else if (c < 0x10000)
2619 		        { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2620 		else
2621 		        { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2622 
2623 		for ( ; bits >= 0; bits-= 6) {
2624 		    *out++  = ((c >> bits) & 0x3F) | 0x80;
2625 		}
2626 
2627 		if (out - buffer > buffer_size - 100) {
2628 			int indx = out - buffer;
2629 
2630 			growBuffer(buffer);
2631 			out = &buffer[indx];
2632 		}
2633 	    } else {
2634 		ent = htmlParseEntityRef(ctxt, &name);
2635 		if (name == NULL) {
2636 		    *out++ = '&';
2637 		    if (out - buffer > buffer_size - 100) {
2638 			int indx = out - buffer;
2639 
2640 			growBuffer(buffer);
2641 			out = &buffer[indx];
2642 		    }
2643 		} else if (ent == NULL) {
2644 		    *out++ = '&';
2645 		    cur = name;
2646 		    while (*cur != 0) {
2647 			if (out - buffer > buffer_size - 100) {
2648 			    int indx = out - buffer;
2649 
2650 			    growBuffer(buffer);
2651 			    out = &buffer[indx];
2652 			}
2653 			*out++ = *cur++;
2654 		    }
2655 		} else {
2656 		    unsigned int c;
2657 		    int bits;
2658 
2659 		    if (out - buffer > buffer_size - 100) {
2660 			int indx = out - buffer;
2661 
2662 			growBuffer(buffer);
2663 			out = &buffer[indx];
2664 		    }
2665 		    c = ent->value;
2666 		    if      (c <    0x80)
2667 			{ *out++  = c;                bits= -6; }
2668 		    else if (c <   0x800)
2669 			{ *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2670 		    else if (c < 0x10000)
2671 			{ *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2672 		    else
2673 			{ *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2674 
2675 		    for ( ; bits >= 0; bits-= 6) {
2676 			*out++  = ((c >> bits) & 0x3F) | 0x80;
2677 		    }
2678 		}
2679 	    }
2680 	} else {
2681 	    unsigned int c;
2682 	    int bits, l;
2683 
2684 	    if (out - buffer > buffer_size - 100) {
2685 		int indx = out - buffer;
2686 
2687 		growBuffer(buffer);
2688 		out = &buffer[indx];
2689 	    }
2690 	    c = CUR_CHAR(l);
2691 	    if      (c <    0x80)
2692 		    { *out++  = c;                bits= -6; }
2693 	    else if (c <   0x800)
2694 		    { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2695 	    else if (c < 0x10000)
2696 		    { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2697 	    else
2698 		    { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2699 
2700 	    for ( ; bits >= 0; bits-= 6) {
2701 		*out++  = ((c >> bits) & 0x3F) | 0x80;
2702 	    }
2703 	    NEXT;
2704 	}
2705     }
2706     *out = 0;
2707     return(buffer);
2708 }
2709 
2710 /**
2711  * htmlParseEntityRef:
2712  * @ctxt:  an HTML parser context
2713  * @str:  location to store the entity name
2714  *
2715  * parse an HTML ENTITY references
2716  *
2717  * [68] EntityRef ::= '&' Name ';'
2718  *
2719  * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2720  *         if non-NULL *str will have to be freed by the caller.
2721  */
2722 const htmlEntityDesc *
htmlParseEntityRef(htmlParserCtxtPtr ctxt,const xmlChar ** str)2723 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2724     const xmlChar *name;
2725     const htmlEntityDesc * ent = NULL;
2726 
2727     if (str != NULL) *str = NULL;
2728     if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2729 
2730     if (CUR == '&') {
2731         NEXT;
2732         name = htmlParseName(ctxt);
2733 	if (name == NULL) {
2734 	    htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2735 	                 "htmlParseEntityRef: no name\n", NULL, NULL);
2736 	} else {
2737 	    GROW;
2738 	    if (CUR == ';') {
2739 	        if (str != NULL)
2740 		    *str = name;
2741 
2742 		/*
2743 		 * Lookup the entity in the table.
2744 		 */
2745 		ent = htmlEntityLookup(name);
2746 		if (ent != NULL) /* OK that's ugly !!! */
2747 		    NEXT;
2748 	    } else {
2749 		htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2750 		             "htmlParseEntityRef: expecting ';'\n",
2751 			     NULL, NULL);
2752 	        if (str != NULL)
2753 		    *str = name;
2754 	    }
2755 	}
2756     }
2757     return(ent);
2758 }
2759 
2760 /**
2761  * htmlParseAttValue:
2762  * @ctxt:  an HTML parser context
2763  *
2764  * parse a value for an attribute
2765  * Note: the parser won't do substitution of entities here, this
2766  * will be handled later in xmlStringGetNodeList, unless it was
2767  * asked for ctxt->replaceEntities != 0
2768  *
2769  * Returns the AttValue parsed or NULL.
2770  */
2771 
2772 static xmlChar *
htmlParseAttValue(htmlParserCtxtPtr ctxt)2773 htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2774     xmlChar *ret = NULL;
2775 
2776     if (CUR == '"') {
2777         NEXT;
2778 	ret = htmlParseHTMLAttribute(ctxt, '"');
2779         if (CUR != '"') {
2780 	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2781 	                 "AttValue: \" expected\n", NULL, NULL);
2782 	} else
2783 	    NEXT;
2784     } else if (CUR == '\'') {
2785         NEXT;
2786 	ret = htmlParseHTMLAttribute(ctxt, '\'');
2787         if (CUR != '\'') {
2788 	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2789 	                 "AttValue: ' expected\n", NULL, NULL);
2790 	} else
2791 	    NEXT;
2792     } else {
2793         /*
2794 	 * That's an HTMLism, the attribute value may not be quoted
2795 	 */
2796 	ret = htmlParseHTMLAttribute(ctxt, 0);
2797 	if (ret == NULL) {
2798 	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2799 	                 "AttValue: no value found\n", NULL, NULL);
2800 	}
2801     }
2802     return(ret);
2803 }
2804 
2805 /**
2806  * htmlParseSystemLiteral:
2807  * @ctxt:  an HTML parser context
2808  *
2809  * parse an HTML Literal
2810  *
2811  * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2812  *
2813  * Returns the SystemLiteral parsed or NULL
2814  */
2815 
2816 static xmlChar *
htmlParseSystemLiteral(htmlParserCtxtPtr ctxt)2817 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2818     size_t len = 0, startPosition = 0;
2819     xmlChar *ret = NULL;
2820 
2821     if (CUR == '"') {
2822         NEXT;
2823 
2824         if (CUR_PTR < BASE_PTR)
2825             return(ret);
2826         startPosition = CUR_PTR - BASE_PTR;
2827 
2828 	while ((IS_CHAR_CH(CUR)) && (CUR != '"')) {
2829 	    NEXT;
2830 	    len++;
2831 	}
2832 	if (!IS_CHAR_CH(CUR)) {
2833 	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2834 			 "Unfinished SystemLiteral\n", NULL, NULL);
2835 	} else {
2836 	    ret = xmlStrndup((BASE_PTR+startPosition), len);
2837 	    NEXT;
2838         }
2839     } else if (CUR == '\'') {
2840         NEXT;
2841 
2842         if (CUR_PTR < BASE_PTR)
2843             return(ret);
2844         startPosition = CUR_PTR - BASE_PTR;
2845 
2846 	while ((IS_CHAR_CH(CUR)) && (CUR != '\'')) {
2847 	    NEXT;
2848 	    len++;
2849 	}
2850 	if (!IS_CHAR_CH(CUR)) {
2851 	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2852 			 "Unfinished SystemLiteral\n", NULL, NULL);
2853 	} else {
2854 	    ret = xmlStrndup((BASE_PTR+startPosition), len);
2855 	    NEXT;
2856         }
2857     } else {
2858 	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2859 	             " or ' expected\n", NULL, NULL);
2860     }
2861 
2862     return(ret);
2863 }
2864 
2865 /**
2866  * htmlParsePubidLiteral:
2867  * @ctxt:  an HTML parser context
2868  *
2869  * parse an HTML public literal
2870  *
2871  * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2872  *
2873  * Returns the PubidLiteral parsed or NULL.
2874  */
2875 
2876 static xmlChar *
htmlParsePubidLiteral(htmlParserCtxtPtr ctxt)2877 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2878     size_t len = 0, startPosition = 0;
2879     xmlChar *ret = NULL;
2880     /*
2881      * Name ::= (Letter | '_') (NameChar)*
2882      */
2883     if (CUR == '"') {
2884         NEXT;
2885 
2886         if (CUR_PTR < BASE_PTR)
2887             return(ret);
2888         startPosition = CUR_PTR - BASE_PTR;
2889 
2890         while (IS_PUBIDCHAR_CH(CUR)) {
2891             len++;
2892             NEXT;
2893         }
2894 
2895 	if (CUR != '"') {
2896 	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2897 	                 "Unfinished PubidLiteral\n", NULL, NULL);
2898 	} else {
2899 	    ret = xmlStrndup((BASE_PTR + startPosition), len);
2900 	    NEXT;
2901 	}
2902     } else if (CUR == '\'') {
2903         NEXT;
2904 
2905         if (CUR_PTR < BASE_PTR)
2906             return(ret);
2907         startPosition = CUR_PTR - BASE_PTR;
2908 
2909         while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\'')){
2910             len++;
2911             NEXT;
2912         }
2913 
2914 	if (CUR != '\'') {
2915 	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2916 	                 "Unfinished PubidLiteral\n", NULL, NULL);
2917 	} else {
2918 	    ret = xmlStrndup((BASE_PTR + startPosition), len);
2919 	    NEXT;
2920 	}
2921     } else {
2922 	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2923 	             "PubidLiteral \" or ' expected\n", NULL, NULL);
2924     }
2925 
2926     return(ret);
2927 }
2928 
2929 /**
2930  * htmlParseScript:
2931  * @ctxt:  an HTML parser context
2932  *
2933  * parse the content of an HTML SCRIPT or STYLE element
2934  * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2935  * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2936  * http://www.w3.org/TR/html4/types.html#type-script
2937  * http://www.w3.org/TR/html4/types.html#h-6.15
2938  * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2939  *
2940  * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2941  * element and the value of intrinsic event attributes. User agents must
2942  * not evaluate script data as HTML markup but instead must pass it on as
2943  * data to a script engine.
2944  * NOTES:
2945  * - The content is passed like CDATA
2946  * - the attributes for style and scripting "onXXX" are also described
2947  *   as CDATA but SGML allows entities references in attributes so their
2948  *   processing is identical as other attributes
2949  */
2950 static void
htmlParseScript(htmlParserCtxtPtr ctxt)2951 htmlParseScript(htmlParserCtxtPtr ctxt) {
2952     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2953     int nbchar = 0;
2954     int cur,l;
2955 
2956     SHRINK;
2957     cur = CUR_CHAR(l);
2958     while (IS_CHAR_CH(cur)) {
2959 	if ((cur == '<') && (NXT(1) == '/')) {
2960             /*
2961              * One should break here, the specification is clear:
2962              * Authors should therefore escape "</" within the content.
2963              * Escape mechanisms are specific to each scripting or
2964              * style sheet language.
2965              *
2966              * In recovery mode, only break if end tag match the
2967              * current tag, effectively ignoring all tags inside the
2968              * script/style block and treating the entire block as
2969              * CDATA.
2970              */
2971             if (ctxt->recovery) {
2972                 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2973 				   xmlStrlen(ctxt->name)) == 0)
2974                 {
2975                     break; /* while */
2976                 } else {
2977 		    htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
2978 				 "Element %s embeds close tag\n",
2979 		                 ctxt->name, NULL);
2980 		}
2981             } else {
2982                 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2983                     ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2984                 {
2985                     break; /* while */
2986                 }
2987             }
2988 	}
2989 	COPY_BUF(l,buf,nbchar,cur);
2990 	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2991 	    if (ctxt->sax->cdataBlock!= NULL) {
2992 		/*
2993 		 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2994 		 */
2995 		ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2996 	    } else if (ctxt->sax->characters != NULL) {
2997 		ctxt->sax->characters(ctxt->userData, buf, nbchar);
2998 	    }
2999 	    nbchar = 0;
3000 	}
3001 	GROW;
3002 	NEXTL(l);
3003 	cur = CUR_CHAR(l);
3004     }
3005 
3006     if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
3007         htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3008                     "Invalid char in CDATA 0x%X\n", cur);
3009         if (ctxt->input->cur < ctxt->input->end) {
3010             NEXT;
3011         }
3012     }
3013 
3014     if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3015 	if (ctxt->sax->cdataBlock!= NULL) {
3016 	    /*
3017 	     * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3018 	     */
3019 	    ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3020 	} else if (ctxt->sax->characters != NULL) {
3021 	    ctxt->sax->characters(ctxt->userData, buf, nbchar);
3022 	}
3023     }
3024 }
3025 
3026 
3027 /**
3028  * htmlParseCharDataInternal:
3029  * @ctxt:  an HTML parser context
3030  * @readahead: optional read ahead character in ascii range
3031  *
3032  * parse a CharData section.
3033  * if we are within a CDATA section ']]>' marks an end of section.
3034  *
3035  * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3036  */
3037 
3038 static void
htmlParseCharDataInternal(htmlParserCtxtPtr ctxt,int readahead)3039 htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
3040     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
3041     int nbchar = 0;
3042     int cur, l;
3043     int chunk = 0;
3044 
3045     if (readahead)
3046         buf[nbchar++] = readahead;
3047 
3048     SHRINK;
3049     cur = CUR_CHAR(l);
3050     while (((cur != '<') || (ctxt->token == '<')) &&
3051            ((cur != '&') || (ctxt->token == '&')) &&
3052 	   (cur != 0)) {
3053 	if (!(IS_CHAR(cur))) {
3054 	    htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3055 	                "Invalid char in CDATA 0x%X\n", cur);
3056 	} else {
3057 	    COPY_BUF(l,buf,nbchar,cur);
3058 	}
3059 	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3060 	    /*
3061 	     * Ok the segment is to be consumed as chars.
3062 	     */
3063 	    if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3064 		if (areBlanks(ctxt, buf, nbchar)) {
3065 		    if (ctxt->keepBlanks) {
3066 			if (ctxt->sax->characters != NULL)
3067 			    ctxt->sax->characters(ctxt->userData, buf, nbchar);
3068 		    } else {
3069 			if (ctxt->sax->ignorableWhitespace != NULL)
3070 			    ctxt->sax->ignorableWhitespace(ctxt->userData,
3071 			                                   buf, nbchar);
3072 		    }
3073 		} else {
3074 		    htmlCheckParagraph(ctxt);
3075 		    if (ctxt->sax->characters != NULL)
3076 			ctxt->sax->characters(ctxt->userData, buf, nbchar);
3077 		}
3078 	    }
3079 	    nbchar = 0;
3080 	}
3081 	NEXTL(l);
3082         chunk++;
3083         if (chunk > HTML_PARSER_BUFFER_SIZE) {
3084             chunk = 0;
3085             SHRINK;
3086             GROW;
3087         }
3088 	cur = CUR_CHAR(l);
3089 	if (cur == 0) {
3090 	    SHRINK;
3091 	    GROW;
3092 	    cur = CUR_CHAR(l);
3093 	}
3094     }
3095     if (nbchar != 0) {
3096         buf[nbchar] = 0;
3097 
3098 	/*
3099 	 * Ok the segment is to be consumed as chars.
3100 	 */
3101 	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3102 	    if (areBlanks(ctxt, buf, nbchar)) {
3103 		if (ctxt->keepBlanks) {
3104 		    if (ctxt->sax->characters != NULL)
3105 			ctxt->sax->characters(ctxt->userData, buf, nbchar);
3106 		} else {
3107 		    if (ctxt->sax->ignorableWhitespace != NULL)
3108 			ctxt->sax->ignorableWhitespace(ctxt->userData,
3109 			                               buf, nbchar);
3110 		}
3111 	    } else {
3112 		htmlCheckParagraph(ctxt);
3113 		if (ctxt->sax->characters != NULL)
3114 		    ctxt->sax->characters(ctxt->userData, buf, nbchar);
3115 	    }
3116 	}
3117     } else {
3118 	/*
3119 	 * Loop detection
3120 	 */
3121 	if (cur == 0)
3122 	    ctxt->instate = XML_PARSER_EOF;
3123     }
3124 }
3125 
3126 /**
3127  * htmlParseCharData:
3128  * @ctxt:  an HTML parser context
3129  *
3130  * parse a CharData section.
3131  * if we are within a CDATA section ']]>' marks an end of section.
3132  *
3133  * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3134  */
3135 
3136 static void
htmlParseCharData(htmlParserCtxtPtr ctxt)3137 htmlParseCharData(htmlParserCtxtPtr ctxt) {
3138     htmlParseCharDataInternal(ctxt, 0);
3139 }
3140 
3141 /**
3142  * htmlParseExternalID:
3143  * @ctxt:  an HTML parser context
3144  * @publicID:  a xmlChar** receiving PubidLiteral
3145  *
3146  * Parse an External ID or a Public ID
3147  *
3148  * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3149  *                   | 'PUBLIC' S PubidLiteral S SystemLiteral
3150  *
3151  * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3152  *
3153  * Returns the function returns SystemLiteral and in the second
3154  *                case publicID receives PubidLiteral, is strict is off
3155  *                it is possible to return NULL and have publicID set.
3156  */
3157 
3158 static xmlChar *
htmlParseExternalID(htmlParserCtxtPtr ctxt,xmlChar ** publicID)3159 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
3160     xmlChar *URI = NULL;
3161 
3162     if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3163          (UPP(2) == 'S') && (UPP(3) == 'T') &&
3164 	 (UPP(4) == 'E') && (UPP(5) == 'M')) {
3165         SKIP(6);
3166 	if (!IS_BLANK_CH(CUR)) {
3167 	    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3168 	                 "Space required after 'SYSTEM'\n", NULL, NULL);
3169 	}
3170         SKIP_BLANKS;
3171 	URI = htmlParseSystemLiteral(ctxt);
3172 	if (URI == NULL) {
3173 	    htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3174 	                 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3175         }
3176     } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3177 	       (UPP(2) == 'B') && (UPP(3) == 'L') &&
3178 	       (UPP(4) == 'I') && (UPP(5) == 'C')) {
3179         SKIP(6);
3180 	if (!IS_BLANK_CH(CUR)) {
3181 	    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3182 	                 "Space required after 'PUBLIC'\n", NULL, NULL);
3183 	}
3184         SKIP_BLANKS;
3185 	*publicID = htmlParsePubidLiteral(ctxt);
3186 	if (*publicID == NULL) {
3187 	    htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3188 	                 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3189 			 NULL, NULL);
3190 	}
3191         SKIP_BLANKS;
3192         if ((CUR == '"') || (CUR == '\'')) {
3193 	    URI = htmlParseSystemLiteral(ctxt);
3194 	}
3195     }
3196     return(URI);
3197 }
3198 
3199 /**
3200  * xmlParsePI:
3201  * @ctxt:  an XML parser context
3202  *
3203  * parse an XML Processing Instruction.
3204  *
3205  * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3206  */
3207 static void
htmlParsePI(htmlParserCtxtPtr ctxt)3208 htmlParsePI(htmlParserCtxtPtr ctxt) {
3209     xmlChar *buf = NULL;
3210     int len = 0;
3211     int size = HTML_PARSER_BUFFER_SIZE;
3212     int cur, l;
3213     const xmlChar *target;
3214     xmlParserInputState state;
3215     int count = 0;
3216 
3217     if ((RAW == '<') && (NXT(1) == '?')) {
3218 	state = ctxt->instate;
3219         ctxt->instate = XML_PARSER_PI;
3220 	/*
3221 	 * this is a Processing Instruction.
3222 	 */
3223 	SKIP(2);
3224 	SHRINK;
3225 
3226 	/*
3227 	 * Parse the target name and check for special support like
3228 	 * namespace.
3229 	 */
3230         target = htmlParseName(ctxt);
3231 	if (target != NULL) {
3232 	    if (RAW == '>') {
3233 		SKIP(1);
3234 
3235 		/*
3236 		 * SAX: PI detected.
3237 		 */
3238 		if ((ctxt->sax) && (!ctxt->disableSAX) &&
3239 		    (ctxt->sax->processingInstruction != NULL))
3240 		    ctxt->sax->processingInstruction(ctxt->userData,
3241 		                                     target, NULL);
3242 		ctxt->instate = state;
3243 		return;
3244 	    }
3245 	    buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3246 	    if (buf == NULL) {
3247 		htmlErrMemory(ctxt, NULL);
3248 		ctxt->instate = state;
3249 		return;
3250 	    }
3251 	    cur = CUR;
3252 	    if (!IS_BLANK(cur)) {
3253 		htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3254 			  "ParsePI: PI %s space expected\n", target, NULL);
3255 	    }
3256             SKIP_BLANKS;
3257 	    cur = CUR_CHAR(l);
3258 	    while (IS_CHAR(cur) && (cur != '>')) {
3259 		if (len + 5 >= size) {
3260 		    xmlChar *tmp;
3261 
3262 		    size *= 2;
3263 		    tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3264 		    if (tmp == NULL) {
3265 			htmlErrMemory(ctxt, NULL);
3266 			xmlFree(buf);
3267 			ctxt->instate = state;
3268 			return;
3269 		    }
3270 		    buf = tmp;
3271 		}
3272 		count++;
3273 		if (count > 50) {
3274 		    GROW;
3275 		    count = 0;
3276 		}
3277 		COPY_BUF(l,buf,len,cur);
3278 		NEXTL(l);
3279 		cur = CUR_CHAR(l);
3280 		if (cur == 0) {
3281 		    SHRINK;
3282 		    GROW;
3283 		    cur = CUR_CHAR(l);
3284 		}
3285 	    }
3286 	    buf[len] = 0;
3287 	    if (cur != '>') {
3288 		htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3289 		      "ParsePI: PI %s never end ...\n", target, NULL);
3290 	    } else {
3291 		SKIP(1);
3292 
3293 		/*
3294 		 * SAX: PI detected.
3295 		 */
3296 		if ((ctxt->sax) && (!ctxt->disableSAX) &&
3297 		    (ctxt->sax->processingInstruction != NULL))
3298 		    ctxt->sax->processingInstruction(ctxt->userData,
3299 		                                     target, buf);
3300 	    }
3301 	    xmlFree(buf);
3302 	} else {
3303 	    htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3304                          "PI is not started correctly", NULL, NULL);
3305 	}
3306 	ctxt->instate = state;
3307     }
3308 }
3309 
3310 /**
3311  * htmlParseComment:
3312  * @ctxt:  an HTML parser context
3313  *
3314  * Parse an XML (SGML) comment <!-- .... -->
3315  *
3316  * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3317  */
3318 static void
htmlParseComment(htmlParserCtxtPtr ctxt)3319 htmlParseComment(htmlParserCtxtPtr ctxt) {
3320     xmlChar *buf = NULL;
3321     int len;
3322     int size = HTML_PARSER_BUFFER_SIZE;
3323     int q, ql;
3324     int r, rl;
3325     int cur, l;
3326     xmlParserInputState state;
3327 
3328     /*
3329      * Check that there is a comment right here.
3330      */
3331     if ((RAW != '<') || (NXT(1) != '!') ||
3332         (NXT(2) != '-') || (NXT(3) != '-')) return;
3333 
3334     state = ctxt->instate;
3335     ctxt->instate = XML_PARSER_COMMENT;
3336     SHRINK;
3337     SKIP(4);
3338     buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3339     if (buf == NULL) {
3340         htmlErrMemory(ctxt, "buffer allocation failed\n");
3341 	ctxt->instate = state;
3342 	return;
3343     }
3344     len = 0;
3345     buf[len] = 0;
3346     q = CUR_CHAR(ql);
3347     if (!IS_CHAR(q))
3348         goto unfinished;
3349     NEXTL(ql);
3350     r = CUR_CHAR(rl);
3351     if (!IS_CHAR(r))
3352         goto unfinished;
3353     NEXTL(rl);
3354     cur = CUR_CHAR(l);
3355     while (IS_CHAR(cur) &&
3356            ((cur != '>') ||
3357 	    (r != '-') || (q != '-'))) {
3358 	if (len + 5 >= size) {
3359 	    xmlChar *tmp;
3360 
3361 	    size *= 2;
3362 	    tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3363 	    if (tmp == NULL) {
3364 	        xmlFree(buf);
3365 	        htmlErrMemory(ctxt, "growing buffer failed\n");
3366 		ctxt->instate = state;
3367 		return;
3368 	    }
3369 	    buf = tmp;
3370 	}
3371 	COPY_BUF(ql,buf,len,q);
3372 	q = r;
3373 	ql = rl;
3374 	r = cur;
3375 	rl = l;
3376 	NEXTL(l);
3377 	cur = CUR_CHAR(l);
3378 	if (cur == 0) {
3379 	    SHRINK;
3380 	    GROW;
3381 	    cur = CUR_CHAR(l);
3382 	}
3383     }
3384     buf[len] = 0;
3385     if (IS_CHAR(cur)) {
3386         NEXT;
3387 	if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3388 	    (!ctxt->disableSAX))
3389 	    ctxt->sax->comment(ctxt->userData, buf);
3390 	xmlFree(buf);
3391 	ctxt->instate = state;
3392 	return;
3393     }
3394 
3395 unfinished:
3396     htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3397 		 "Comment not terminated \n<!--%.50s\n", buf, NULL);
3398     xmlFree(buf);
3399 }
3400 
3401 /**
3402  * htmlParseCharRef:
3403  * @ctxt:  an HTML parser context
3404  *
3405  * parse Reference declarations
3406  *
3407  * [66] CharRef ::= '&#' [0-9]+ ';' |
3408  *                  '&#x' [0-9a-fA-F]+ ';'
3409  *
3410  * Returns the value parsed (as an int)
3411  */
3412 int
htmlParseCharRef(htmlParserCtxtPtr ctxt)3413 htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3414     int val = 0;
3415 
3416     if ((ctxt == NULL) || (ctxt->input == NULL)) {
3417 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3418 		     "htmlParseCharRef: context error\n",
3419 		     NULL, NULL);
3420         return(0);
3421     }
3422     if ((CUR == '&') && (NXT(1) == '#') &&
3423         ((NXT(2) == 'x') || NXT(2) == 'X')) {
3424 	SKIP(3);
3425 	while (CUR != ';') {
3426 	    if ((CUR >= '0') && (CUR <= '9'))
3427 	        val = val * 16 + (CUR - '0');
3428 	    else if ((CUR >= 'a') && (CUR <= 'f'))
3429 	        val = val * 16 + (CUR - 'a') + 10;
3430 	    else if ((CUR >= 'A') && (CUR <= 'F'))
3431 	        val = val * 16 + (CUR - 'A') + 10;
3432 	    else {
3433 	        htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3434 		             "htmlParseCharRef: missing semicolon\n",
3435 			     NULL, NULL);
3436 		break;
3437 	    }
3438 	    NEXT;
3439 	}
3440 	if (CUR == ';')
3441 	    NEXT;
3442     } else if  ((CUR == '&') && (NXT(1) == '#')) {
3443 	SKIP(2);
3444 	while (CUR != ';') {
3445 	    if ((CUR >= '0') && (CUR <= '9'))
3446 	        val = val * 10 + (CUR - '0');
3447 	    else {
3448 	        htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3449 		             "htmlParseCharRef: missing semicolon\n",
3450 			     NULL, NULL);
3451 		break;
3452 	    }
3453 	    NEXT;
3454 	}
3455 	if (CUR == ';')
3456 	    NEXT;
3457     } else {
3458 	htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3459 	             "htmlParseCharRef: invalid value\n", NULL, NULL);
3460     }
3461     /*
3462      * Check the value IS_CHAR ...
3463      */
3464     if (IS_CHAR(val)) {
3465         return(val);
3466     } else {
3467 	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3468 			"htmlParseCharRef: invalid xmlChar value %d\n",
3469 			val);
3470     }
3471     return(0);
3472 }
3473 
3474 
3475 /**
3476  * htmlParseDocTypeDecl:
3477  * @ctxt:  an HTML parser context
3478  *
3479  * parse a DOCTYPE declaration
3480  *
3481  * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3482  *                      ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3483  */
3484 
3485 static void
htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt)3486 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3487     const xmlChar *name;
3488     xmlChar *ExternalID = NULL;
3489     xmlChar *URI = NULL;
3490 
3491     /*
3492      * We know that '<!DOCTYPE' has been detected.
3493      */
3494     SKIP(9);
3495 
3496     SKIP_BLANKS;
3497 
3498     /*
3499      * Parse the DOCTYPE name.
3500      */
3501     name = htmlParseName(ctxt);
3502     if (name == NULL) {
3503 	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3504 	             "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3505 		     NULL, NULL);
3506     }
3507     /*
3508      * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3509      */
3510 
3511     SKIP_BLANKS;
3512 
3513     /*
3514      * Check for SystemID and ExternalID
3515      */
3516     URI = htmlParseExternalID(ctxt, &ExternalID);
3517     SKIP_BLANKS;
3518 
3519     /*
3520      * We should be at the end of the DOCTYPE declaration.
3521      */
3522     if (CUR != '>') {
3523 	htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3524 	             "DOCTYPE improperly terminated\n", NULL, NULL);
3525         /* We shouldn't try to resynchronize ... */
3526     }
3527     NEXT;
3528 
3529     /*
3530      * Create or update the document accordingly to the DOCTYPE
3531      */
3532     if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3533 	(!ctxt->disableSAX))
3534 	ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3535 
3536     /*
3537      * Cleanup, since we don't use all those identifiers
3538      */
3539     if (URI != NULL) xmlFree(URI);
3540     if (ExternalID != NULL) xmlFree(ExternalID);
3541 }
3542 
3543 /**
3544  * htmlParseAttribute:
3545  * @ctxt:  an HTML parser context
3546  * @value:  a xmlChar ** used to store the value of the attribute
3547  *
3548  * parse an attribute
3549  *
3550  * [41] Attribute ::= Name Eq AttValue
3551  *
3552  * [25] Eq ::= S? '=' S?
3553  *
3554  * With namespace:
3555  *
3556  * [NS 11] Attribute ::= QName Eq AttValue
3557  *
3558  * Also the case QName == xmlns:??? is handled independently as a namespace
3559  * definition.
3560  *
3561  * Returns the attribute name, and the value in *value.
3562  */
3563 
3564 static const xmlChar *
htmlParseAttribute(htmlParserCtxtPtr ctxt,xmlChar ** value)3565 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3566     const xmlChar *name;
3567     xmlChar *val = NULL;
3568 
3569     *value = NULL;
3570     name = htmlParseHTMLName(ctxt);
3571     if (name == NULL) {
3572 	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3573 	             "error parsing attribute name\n", NULL, NULL);
3574         return(NULL);
3575     }
3576 
3577     /*
3578      * read the value
3579      */
3580     SKIP_BLANKS;
3581     if (CUR == '=') {
3582         NEXT;
3583 	SKIP_BLANKS;
3584 	val = htmlParseAttValue(ctxt);
3585     }
3586 
3587     *value = val;
3588     return(name);
3589 }
3590 
3591 /**
3592  * htmlCheckEncodingDirect:
3593  * @ctxt:  an HTML parser context
3594  * @attvalue: the attribute value
3595  *
3596  * Checks an attribute value to detect
3597  * the encoding
3598  * If a new encoding is detected the parser is switched to decode
3599  * it and pass UTF8
3600  */
3601 static void
htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt,const xmlChar * encoding)3602 htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
3603 
3604     if ((ctxt == NULL) || (encoding == NULL) ||
3605         (ctxt->options & HTML_PARSE_IGNORE_ENC))
3606 	return;
3607 
3608     /* do not change encoding */
3609     if (ctxt->input->encoding != NULL)
3610         return;
3611 
3612     if (encoding != NULL) {
3613 	xmlCharEncoding enc;
3614 	xmlCharEncodingHandlerPtr handler;
3615 
3616 	while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3617 
3618 	if (ctxt->input->encoding != NULL)
3619 	    xmlFree((xmlChar *) ctxt->input->encoding);
3620 	ctxt->input->encoding = xmlStrdup(encoding);
3621 
3622 	enc = xmlParseCharEncoding((const char *) encoding);
3623 	/*
3624 	 * registered set of known encodings
3625 	 */
3626 	if (enc != XML_CHAR_ENCODING_ERROR) {
3627 	    if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3628 	         (enc == XML_CHAR_ENCODING_UTF16BE) ||
3629 		 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3630 		 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3631 		(ctxt->input->buf != NULL) &&
3632 		(ctxt->input->buf->encoder == NULL)) {
3633 		htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3634 		             "htmlCheckEncoding: wrong encoding meta\n",
3635 			     NULL, NULL);
3636 	    } else {
3637 		xmlSwitchEncoding(ctxt, enc);
3638 	    }
3639 	    ctxt->charset = XML_CHAR_ENCODING_UTF8;
3640 	} else {
3641 	    /*
3642 	     * fallback for unknown encodings
3643 	     */
3644 	    handler = xmlFindCharEncodingHandler((const char *) encoding);
3645 	    if (handler != NULL) {
3646 		xmlSwitchToEncoding(ctxt, handler);
3647 		ctxt->charset = XML_CHAR_ENCODING_UTF8;
3648 	    } else {
3649 		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
3650 		             "htmlCheckEncoding: unknown encoding %s\n",
3651 			     encoding, NULL);
3652 	    }
3653 	}
3654 
3655 	if ((ctxt->input->buf != NULL) &&
3656 	    (ctxt->input->buf->encoder != NULL) &&
3657 	    (ctxt->input->buf->raw != NULL) &&
3658 	    (ctxt->input->buf->buffer != NULL)) {
3659 	    int nbchars;
3660 	    int processed;
3661 
3662 	    /*
3663 	     * convert as much as possible to the parser reading buffer.
3664 	     */
3665 	    processed = ctxt->input->cur - ctxt->input->base;
3666 	    xmlBufShrink(ctxt->input->buf->buffer, processed);
3667 	    nbchars = xmlCharEncInput(ctxt->input->buf, 1);
3668             xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input);
3669 	    if (nbchars < 0) {
3670 		htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3671 		             "htmlCheckEncoding: encoder error\n",
3672 			     NULL, NULL);
3673 	    }
3674 	}
3675     }
3676 }
3677 
3678 /**
3679  * htmlCheckEncoding:
3680  * @ctxt:  an HTML parser context
3681  * @attvalue: the attribute value
3682  *
3683  * Checks an http-equiv attribute from a Meta tag to detect
3684  * the encoding
3685  * If a new encoding is detected the parser is switched to decode
3686  * it and pass UTF8
3687  */
3688 static void
htmlCheckEncoding(htmlParserCtxtPtr ctxt,const xmlChar * attvalue)3689 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3690     const xmlChar *encoding;
3691 
3692     if (!attvalue)
3693 	return;
3694 
3695     encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3696     if (encoding != NULL) {
3697 	encoding += 7;
3698     }
3699     /*
3700      * skip blank
3701      */
3702     if (encoding && IS_BLANK_CH(*encoding))
3703 	encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3704     if (encoding && *encoding == '=') {
3705 	encoding ++;
3706 	htmlCheckEncodingDirect(ctxt, encoding);
3707     }
3708 }
3709 
3710 /**
3711  * htmlCheckMeta:
3712  * @ctxt:  an HTML parser context
3713  * @atts:  the attributes values
3714  *
3715  * Checks an attributes from a Meta tag
3716  */
3717 static void
htmlCheckMeta(htmlParserCtxtPtr ctxt,const xmlChar ** atts)3718 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3719     int i;
3720     const xmlChar *att, *value;
3721     int http = 0;
3722     const xmlChar *content = NULL;
3723 
3724     if ((ctxt == NULL) || (atts == NULL))
3725 	return;
3726 
3727     i = 0;
3728     att = atts[i++];
3729     while (att != NULL) {
3730 	value = atts[i++];
3731 	if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3732 	 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3733 	    http = 1;
3734 	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
3735 	    htmlCheckEncodingDirect(ctxt, value);
3736 	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3737 	    content = value;
3738 	att = atts[i++];
3739     }
3740     if ((http) && (content != NULL))
3741 	htmlCheckEncoding(ctxt, content);
3742 
3743 }
3744 
/**
 * htmlParseStartTag:
 * @ctxt:  an HTML parser context
 *
 * parse a start of tag either for rule element or
 * EmptyElement. In both case we don't parse the tag closing chars.
 *
 * [40] STag ::= '<' Name (S Attribute)* S? '>'
 *
 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
 *
 * With namespace:
 *
 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
 *
 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
 *
 * Attributes are collected as name/value pairs in the array cached in
 * ctxt->atts (allocated or grown here when needed). The attribute values
 * are freed before returning; the array itself stays in the context.
 *
 * A misplaced <html>, <head> or <body> tag is reported and discarded: its
 * name is not pushed, no startElement is emitted, and ctxt->depth is
 * incremented instead.
 *
 * Returns 0 in case of success, -1 in case of error and 1 if discarded
 */

static int
htmlParseStartTag(htmlParserCtxtPtr ctxt) {
    const xmlChar *name;
    const xmlChar *attname;
    xmlChar *attvalue;
    const xmlChar **atts;
    int nbatts = 0;
    int maxatts;
    int meta = 0;
    int i;
    int discardtag = 0;

    if ((ctxt == NULL) || (ctxt->input == NULL)) {
	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
		     "htmlParseStartTag: context error\n", NULL, NULL);
	return -1;
    }
    if (ctxt->instate == XML_PARSER_EOF)
        return(-1);
    if (CUR != '<') return -1;
    NEXT;

    /* Work on the attribute table cached in the context. */
    atts = ctxt->atts;
    maxatts = ctxt->maxatts;

    GROW;
    name = htmlParseHTMLName(ctxt);
    if (name == NULL) {
	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
	             "htmlParseStartTag: invalid element name\n",
		     NULL, NULL);
	/* if recover preserve text on classic misconstructs */
	if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') ||
	    (CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) {
	    htmlParseCharDataInternal(ctxt, '<');
	    return(-1);
	}


	/* Dump the bogus tag like browsers do */
	while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
               (ctxt->instate != XML_PARSER_EOF))
	    NEXT;
        return -1;
    }
    /* Remember META tags: their attributes may declare the encoding. */
    if (xmlStrEqual(name, BAD_CAST"meta"))
	meta = 1;

    /*
     * Check for auto-closure of HTML elements.
     */
    htmlAutoClose(ctxt, name);

    /*
     * Check for implied HTML elements.
     */
    htmlCheckImplied(ctxt, name);

    /*
     * Avoid html at any level > 0, head at any level != 1
     * or any attempt to recurse body
     */
    if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
	             "htmlParseStartTag: misplaced <html> tag\n",
		     name, NULL);
	discardtag = 1;
	ctxt->depth++;
    }
    if ((ctxt->nameNr != 1) &&
	(xmlStrEqual(name, BAD_CAST"head"))) {
	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
	             "htmlParseStartTag: misplaced <head> tag\n",
		     name, NULL);
	discardtag = 1;
	ctxt->depth++;
    }
    if (xmlStrEqual(name, BAD_CAST"body")) {
	int indx;
	/* A <body> is misplaced if one is already open on the name stack. */
	for (indx = 0;indx < ctxt->nameNr;indx++) {
	    if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
		             "htmlParseStartTag: misplaced <body> tag\n",
			     name, NULL);
		discardtag = 1;
		ctxt->depth++;
	    }
	}
    }

    /*
     * Now parse the attributes, it ends up with the ending
     *
     * (S Attribute)* S?
     */
    SKIP_BLANKS;
    while ((IS_CHAR_CH(CUR)) &&
           (CUR != '>') &&
	   ((CUR != '/') || (NXT(1) != '>'))) {
	/* Snapshot of the consumed-character count, for the progress check. */
	long cons = ctxt->nbChars;

	GROW;
	attname = htmlParseAttribute(ctxt, &attvalue);
        if (attname != NULL) {

	    /*
	     * Well formedness requires at most one declaration of an attribute
	     */
	    for (i = 0; i < nbatts;i += 2) {
	        if (xmlStrEqual(atts[i], attname)) {
		    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
		                 "Attribute %s redefined\n", attname, NULL);
		    if (attvalue != NULL)
			xmlFree(attvalue);
		    goto failed;
		}
	    }

	    /*
	     * Add the pair to atts
	     */
	    if (atts == NULL) {
	        maxatts = 22; /* allow for 10 attrs by default */
	        atts = (const xmlChar **)
		       xmlMalloc(maxatts * sizeof(xmlChar *));
		if (atts == NULL) {
		    htmlErrMemory(ctxt, NULL);
		    if (attvalue != NULL)
			xmlFree(attvalue);
		    goto failed;
		}
		ctxt->atts = atts;
		ctxt->maxatts = maxatts;
	    } else if (nbatts + 4 > maxatts) {
		/* Room is needed for this pair plus the two NULL terminators. */
	        const xmlChar **n;

	        maxatts *= 2;
	        n = (const xmlChar **) xmlRealloc((void *) atts,
					     maxatts * sizeof(const xmlChar *));
		if (n == NULL) {
		    htmlErrMemory(ctxt, NULL);
		    if (attvalue != NULL)
			xmlFree(attvalue);
		    goto failed;
		}
		atts = n;
		ctxt->atts = atts;
		ctxt->maxatts = maxatts;
	    }
	    /* Even slots hold names, odd slots hold values (possibly NULL). */
	    atts[nbatts++] = attname;
	    atts[nbatts++] = attvalue;
	    atts[nbatts] = NULL;
	    atts[nbatts + 1] = NULL;
	}
	else {
	    if (attvalue != NULL)
	        xmlFree(attvalue);
	    /* Dump the bogus attribute string up to the next blank or
	     * the end of the tag. */
	    while ((IS_CHAR_CH(CUR)) &&
	           !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
		   ((CUR != '/') || (NXT(1) != '>')))
		NEXT;
	}

failed:
	SKIP_BLANKS;
	/* No input consumed in this iteration: bail out rather than loop. */
        if (cons == ctxt->nbChars) {
	    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
	                 "htmlParseStartTag: problem parsing attributes\n",
			 NULL, NULL);
	    break;
	}
    }

    /*
     * Handle specific association to the META tag
     */
    if (meta && (nbatts != 0))
	htmlCheckMeta(ctxt, atts);

    /*
     * SAX: Start of Element !
     */
    if (!discardtag) {
	htmlnamePush(ctxt, name);
	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
	    if (nbatts != 0)
		ctxt->sax->startElement(ctxt->userData, name, atts);
	    else
		ctxt->sax->startElement(ctxt->userData, name, NULL);
	}
    }

    /* Only the value slots (odd indexes) are freed here. */
    if (atts != NULL) {
        for (i = 1;i < nbatts;i += 2) {
	    if (atts[i] != NULL)
		xmlFree((xmlChar *) atts[i]);
	}
    }

    return(discardtag);
}
3968 
3969 /**
3970  * htmlParseEndTag:
3971  * @ctxt:  an HTML parser context
3972  *
3973  * parse an end of tag
3974  *
3975  * [42] ETag ::= '</' Name S? '>'
3976  *
3977  * With namespace
3978  *
3979  * [NS 9] ETag ::= '</' QName S? '>'
3980  *
3981  * Returns 1 if the current level should be closed.
3982  */
3983 
3984 static int
htmlParseEndTag(htmlParserCtxtPtr ctxt)3985 htmlParseEndTag(htmlParserCtxtPtr ctxt)
3986 {
3987     const xmlChar *name;
3988     const xmlChar *oldname;
3989     int i, ret;
3990 
3991     if ((CUR != '<') || (NXT(1) != '/')) {
3992         htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3993 	             "htmlParseEndTag: '</' not found\n", NULL, NULL);
3994         return (0);
3995     }
3996     SKIP(2);
3997 
3998     name = htmlParseHTMLName(ctxt);
3999     if (name == NULL)
4000         return (0);
4001     /*
4002      * We should definitely be at the ending "S? '>'" part
4003      */
4004     SKIP_BLANKS;
4005     if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
4006         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4007 	             "End tag : expected '>'\n", NULL, NULL);
4008 	if (ctxt->recovery) {
4009 	    /*
4010 	     * We're not at the ending > !!
4011 	     * Error, unless in recover mode where we search forwards
4012 	     * until we find a >
4013 	     */
4014 	    while (CUR != '\0' && CUR != '>') NEXT;
4015 	    NEXT;
4016 	}
4017     } else
4018         NEXT;
4019 
4020     /*
4021      * if we ignored misplaced tags in htmlParseStartTag don't pop them
4022      * out now.
4023      */
4024     if ((ctxt->depth > 0) &&
4025         (xmlStrEqual(name, BAD_CAST "html") ||
4026          xmlStrEqual(name, BAD_CAST "body") ||
4027 	 xmlStrEqual(name, BAD_CAST "head"))) {
4028 	ctxt->depth--;
4029 	return (0);
4030     }
4031 
4032     /*
4033      * If the name read is not one of the element in the parsing stack
4034      * then return, it's just an error.
4035      */
4036     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4037         if (xmlStrEqual(name, ctxt->nameTab[i]))
4038             break;
4039     }
4040     if (i < 0) {
4041         htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4042 	             "Unexpected end tag : %s\n", name, NULL);
4043         return (0);
4044     }
4045 
4046 
4047     /*
4048      * Check for auto-closure of HTML elements.
4049      */
4050 
4051     htmlAutoCloseOnClose(ctxt, name);
4052 
4053     /*
4054      * Well formedness constraints, opening and closing must match.
4055      * With the exception that the autoclose may have popped stuff out
4056      * of the stack.
4057      */
4058     if (!xmlStrEqual(name, ctxt->name)) {
4059         if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4060             htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4061 	                 "Opening and ending tag mismatch: %s and %s\n",
4062 			 name, ctxt->name);
4063         }
4064     }
4065 
4066     /*
4067      * SAX: End of Tag
4068      */
4069     oldname = ctxt->name;
4070     if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4071         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4072             ctxt->sax->endElement(ctxt->userData, name);
4073 	htmlNodeInfoPop(ctxt);
4074         htmlnamePop(ctxt);
4075         ret = 1;
4076     } else {
4077         ret = 0;
4078     }
4079 
4080     return (ret);
4081 }
4082 
4083 
4084 /**
4085  * htmlParseReference:
4086  * @ctxt:  an HTML parser context
4087  *
4088  * parse and handle entity references in content,
4089  * this will end-up in a call to character() since this is either a
4090  * CharRef, or a predefined entity.
4091  */
4092 static void
htmlParseReference(htmlParserCtxtPtr ctxt)4093 htmlParseReference(htmlParserCtxtPtr ctxt) {
4094     const htmlEntityDesc * ent;
4095     xmlChar out[6];
4096     const xmlChar *name;
4097     if (CUR != '&') return;
4098 
4099     if (NXT(1) == '#') {
4100 	unsigned int c;
4101 	int bits, i = 0;
4102 
4103 	c = htmlParseCharRef(ctxt);
4104 	if (c == 0)
4105 	    return;
4106 
4107         if      (c <    0x80) { out[i++]= c;                bits= -6; }
4108         else if (c <   0x800) { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
4109         else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
4110         else                  { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
4111 
4112         for ( ; bits >= 0; bits-= 6) {
4113             out[i++]= ((c >> bits) & 0x3F) | 0x80;
4114         }
4115 	out[i] = 0;
4116 
4117 	htmlCheckParagraph(ctxt);
4118 	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4119 	    ctxt->sax->characters(ctxt->userData, out, i);
4120     } else {
4121 	ent = htmlParseEntityRef(ctxt, &name);
4122 	if (name == NULL) {
4123 	    htmlCheckParagraph(ctxt);
4124 	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4125 	        ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4126 	    return;
4127 	}
4128 	if ((ent == NULL) || !(ent->value > 0)) {
4129 	    htmlCheckParagraph(ctxt);
4130 	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4131 		ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4132 		ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4133 		/* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
4134 	    }
4135 	} else {
4136 	    unsigned int c;
4137 	    int bits, i = 0;
4138 
4139 	    c = ent->value;
4140 	    if      (c <    0x80)
4141 	            { out[i++]= c;                bits= -6; }
4142 	    else if (c <   0x800)
4143 	            { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
4144 	    else if (c < 0x10000)
4145 	            { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
4146 	    else
4147 	            { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
4148 
4149 	    for ( ; bits >= 0; bits-= 6) {
4150 		out[i++]= ((c >> bits) & 0x3F) | 0x80;
4151 	    }
4152 	    out[i] = 0;
4153 
4154 	    htmlCheckParagraph(ctxt);
4155 	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4156 		ctxt->sax->characters(ctxt->userData, out, i);
4157 	}
4158     }
4159 }
4160 
/**
 * htmlParseContent:
 * @ctxt:  an HTML parser context
 *
 * Parse a content: comment, sub-element, reference or text.
 * Kept for compatibility with old code
 *
 * The function remembers a copy of the element name that is current on
 * entry, together with the depth of the name stack, and loops over the
 * input until an end tag closes a level, that element is no longer the
 * current one, the end of input is reached, or an iteration consumes no
 * input.
 */

static void
htmlParseContent(htmlParserCtxtPtr ctxt) {
    xmlChar *currentNode;
    int depth;
    const xmlChar *name;

    /* Private copy of the current element name; freed on every exit. */
    currentNode = xmlStrdup(ctxt->name);
    depth = ctxt->nameNr;
    while (1) {
	/* Snapshot of the consumed-character count, for the progress check. */
	long cons = ctxt->nbChars;

        GROW;

        if (ctxt->instate == XML_PARSER_EOF)
            break;

	/*
	 * Our tag or one of it's parent or children is ending.
	 */
        if ((CUR == '<') && (NXT(1) == '/')) {
	    if (htmlParseEndTag(ctxt) &&
		((currentNode != NULL) || (ctxt->nameNr == 0))) {
		if (currentNode != NULL)
		    xmlFree(currentNode);
		return;
	    }
	    continue; /* while */
        }

	/*
	 * A start tag is coming: look at its name first, so that an
	 * element it implicitly closes can be auto-closed.
	 */
	else if ((CUR == '<') &&
	         ((IS_ASCII_LETTER(NXT(1))) ||
		  (NXT(1) == '_') || (NXT(1) == ':'))) {
	    name = htmlParseHTMLName_nonInvasive(ctxt);
	    if (name == NULL) {
	        htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
			 "htmlParseStartTag: invalid element name\n",
			 NULL, NULL);
	        /* Dump the bogus tag like browsers do */
        while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
	            NEXT;

	        if (currentNode != NULL)
	            xmlFree(currentNode);
	        return;
	    }

	    if (ctxt->name != NULL) {
	        if (htmlCheckAutoClose(name, ctxt->name) == 1) {
	            htmlAutoClose(ctxt, name);
	            continue;
	        }
	    }
	}

	/*
	 * Has this node been popped out during parsing of
	 * the next element
	 */
        if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
	    (!xmlStrEqual(currentNode, ctxt->name)))
	     {
	    if (currentNode != NULL) xmlFree(currentNode);
	    return;
	}

	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
	    (xmlStrEqual(currentNode, BAD_CAST"style")))) {
	    /*
	     * Handle SCRIPT/STYLE separately
	     */
	    htmlParseScript(ctxt);
	} else {
	    /*
	     * Sometimes DOCTYPE arrives in the middle of the document
	     */
	    if ((CUR == '<') && (NXT(1) == '!') &&
		(UPP(2) == 'D') && (UPP(3) == 'O') &&
		(UPP(4) == 'C') && (UPP(5) == 'T') &&
		(UPP(6) == 'Y') && (UPP(7) == 'P') &&
		(UPP(8) == 'E')) {
		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
		             "Misplaced DOCTYPE declaration\n",
			     BAD_CAST "DOCTYPE" , NULL);
		htmlParseDocTypeDecl(ctxt);
	    }

	    /*
	     * First case :  a comment
	     */
	    if ((CUR == '<') && (NXT(1) == '!') &&
		(NXT(2) == '-') && (NXT(3) == '-')) {
		htmlParseComment(ctxt);
	    }

	    /*
	     * Second case : a Processing Instruction.
	     */
	    else if ((CUR == '<') && (NXT(1) == '?')) {
		htmlParsePI(ctxt);
	    }

	    /*
	     * Third case :  a sub-element.
	     */
	    else if (CUR == '<') {
		htmlParseElement(ctxt);
	    }

	    /*
	     * Fourth case : a reference. If if has not been resolved,
	     *    parsing returns it's Name, create the node
	     */
	    else if (CUR == '&') {
		htmlParseReference(ctxt);
	    }

	    /*
	     * Fifth case : end of the resource
	     */
	    else if (CUR == 0) {
		htmlAutoCloseOnEnd(ctxt);
		break;
	    }

	    /*
	     * Last case, text. Note that References are handled directly.
	     */
	    else {
		htmlParseCharData(ctxt);
	    }

	    /*
	     * Nothing was consumed by this iteration: stop instead of
	     * looping forever. This check does not cover the SCRIPT/STYLE
	     * branch above.
	     */
	    if (cons == ctxt->nbChars) {
		if (ctxt->node != NULL) {
		    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
		                 "detected an error in element content\n",
				 NULL, NULL);
		}
		break;
	    }
	}
        GROW;
    }
    if (currentNode != NULL) xmlFree(currentNode);
}
4313 
/**
 * htmlParseElement:
 * @ctxt:  an HTML parser context
 *
 * parse an HTML element, this is highly recursive
 * this is kept for compatibility with previous code versions
 *
 * [39] element ::= EmptyElemTag | STag content ETag
 *
 * [41] Attribute ::= Name Eq AttValue
 *
 * The start tag is parsed first. The element ends immediately if it is
 * closed with "/>", if its start tag is not terminated by '>', or if the
 * tag lookup describes it as empty. Otherwise its content is parsed by
 * repeated calls to htmlParseContent(). When ctxt->record_info is set,
 * begin/end positions are recorded through xmlParserAddNodeInfo().
 */

void
htmlParseElement(htmlParserCtxtPtr ctxt) {
    const xmlChar *name;
    xmlChar *currentNode = NULL;
    const htmlElemDesc * info;
    htmlParserNodeInfo node_info;
    int failed;
    int depth;
    const xmlChar *oldptr;

    if ((ctxt == NULL) || (ctxt->input == NULL)) {
	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
		     "htmlParseElement: context error\n", NULL, NULL);
	return;
    }

    if (ctxt->instate == XML_PARSER_EOF)
        return;

    /* Capture start position */
    if (ctxt->record_info) {
        node_info.begin_pos = ctxt->input->consumed +
                          (CUR_PTR - ctxt->input->base);
	node_info.begin_line = ctxt->input->line;
    }

    failed = htmlParseStartTag(ctxt);
    /* The element name is taken from the context, not from the tag parser. */
    name = ctxt->name;
    if ((failed == -1) || (name == NULL)) {
	if (CUR == '>')
	    NEXT;
        return;
    }

    /*
     * Lookup the info for that element.
     */
    info = htmlTagLookup(name);
    if (info == NULL) {
	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
	             "Tag %s invalid\n", name, NULL);
    }

    /*
     * Check for an Empty Element labeled the XML/SGML way
     */
    if ((CUR == '/') && (NXT(1) == '>')) {
        SKIP(2);
	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
	    ctxt->sax->endElement(ctxt->userData, name);
	htmlnamePop(ctxt);
	return;
    }

    if (CUR == '>') {
        NEXT;
    } else {
	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
	             "Couldn't find end of Start Tag %s\n", name, NULL);

	/*
	 * end of parsing of this node.
	 */
	if (xmlStrEqual(name, ctxt->name)) {
	    nodePop(ctxt);
	    htmlnamePop(ctxt);
	}

	/*
	 * Capture end position and add node
	 */
	if (ctxt->record_info) {
	   node_info.end_pos = ctxt->input->consumed +
			      (CUR_PTR - ctxt->input->base);
	   node_info.end_line = ctxt->input->line;
	   node_info.node = ctxt->node;
	   xmlParserAddNodeInfo(ctxt, &node_info);
	}
	return;
    }

    /*
     * Check for an Empty Element from DTD definition
     */
    if ((info != NULL) && (info->empty)) {
	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
	    ctxt->sax->endElement(ctxt->userData, name);
	htmlnamePop(ctxt);
	return;
    }

    /*
     * Parse the content of the element:
     */
    currentNode = xmlStrdup(ctxt->name);
    depth = ctxt->nameNr;
    while (IS_CHAR_CH(CUR)) {
	oldptr = ctxt->input->cur;
	htmlParseContent(ctxt);
	/* Stop when nothing was consumed or the stack fell below our depth. */
	if (oldptr==ctxt->input->cur) break;
	if (ctxt->nameNr < depth) break;
    }

    /*
     * Capture end position and add node
     */
    if ( currentNode != NULL && ctxt->record_info ) {
       node_info.end_pos = ctxt->input->consumed +
                          (CUR_PTR - ctxt->input->base);
       node_info.end_line = ctxt->input->line;
       node_info.node = ctxt->node;
       xmlParserAddNodeInfo(ctxt, &node_info);
    }
    /* Input exhausted: close whatever is still open. */
    if (!IS_CHAR_CH(CUR)) {
	htmlAutoCloseOnEnd(ctxt);
    }

    if (currentNode != NULL)
	xmlFree(currentNode);
}
4446 
4447 static void
htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt)4448 htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4449     /*
4450      * Capture end position and add node
4451      */
4452     if ( ctxt->node != NULL && ctxt->record_info ) {
4453        ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4454                                 (CUR_PTR - ctxt->input->base);
4455        ctxt->nodeInfo->end_line = ctxt->input->line;
4456        ctxt->nodeInfo->node = ctxt->node;
4457        xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4458        htmlNodeInfoPop(ctxt);
4459     }
4460     if (!IS_CHAR_CH(CUR)) {
4461        htmlAutoCloseOnEnd(ctxt);
4462     }
4463 }
4464 
4465 /**
4466  * htmlParseElementInternal:
4467  * @ctxt:  an HTML parser context
4468  *
4469  * parse an HTML element, new version, non recursive
4470  *
4471  * [39] element ::= EmptyElemTag | STag content ETag
4472  *
4473  * [41] Attribute ::= Name Eq AttValue
4474  */
4475 
4476 static void
htmlParseElementInternal(htmlParserCtxtPtr ctxt)4477 htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4478     const xmlChar *name;
4479     const htmlElemDesc * info;
4480     htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
4481     int failed;
4482 
4483     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4484 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4485 		     "htmlParseElementInternal: context error\n", NULL, NULL);
4486 	return;
4487     }
4488 
4489     if (ctxt->instate == XML_PARSER_EOF)
4490         return;
4491 
4492     /* Capture start position */
4493     if (ctxt->record_info) {
4494         node_info.begin_pos = ctxt->input->consumed +
4495                           (CUR_PTR - ctxt->input->base);
4496 	node_info.begin_line = ctxt->input->line;
4497     }
4498 
4499     failed = htmlParseStartTag(ctxt);
4500     name = ctxt->name;
4501     if ((failed == -1) || (name == NULL)) {
4502 	if (CUR == '>')
4503 	    NEXT;
4504         return;
4505     }
4506 
4507     /*
4508      * Lookup the info for that element.
4509      */
4510     info = htmlTagLookup(name);
4511     if (info == NULL) {
4512 	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4513 	             "Tag %s invalid\n", name, NULL);
4514     }
4515 
4516     /*
4517      * Check for an Empty Element labeled the XML/SGML way
4518      */
4519     if ((CUR == '/') && (NXT(1) == '>')) {
4520         SKIP(2);
4521 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4522 	    ctxt->sax->endElement(ctxt->userData, name);
4523 	htmlnamePop(ctxt);
4524 	return;
4525     }
4526 
4527     if (CUR == '>') {
4528         NEXT;
4529     } else {
4530 	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4531 	             "Couldn't find end of Start Tag %s\n", name, NULL);
4532 
4533 	/*
4534 	 * end of parsing of this node.
4535 	 */
4536 	if (xmlStrEqual(name, ctxt->name)) {
4537 	    nodePop(ctxt);
4538 	    htmlnamePop(ctxt);
4539 	}
4540 
4541         if (ctxt->record_info)
4542             htmlNodeInfoPush(ctxt, &node_info);
4543         htmlParserFinishElementParsing(ctxt);
4544 	return;
4545     }
4546 
4547     /*
4548      * Check for an Empty Element from DTD definition
4549      */
4550     if ((info != NULL) && (info->empty)) {
4551 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4552 	    ctxt->sax->endElement(ctxt->userData, name);
4553 	htmlnamePop(ctxt);
4554 	return;
4555     }
4556 
4557     if (ctxt->record_info)
4558         htmlNodeInfoPush(ctxt, &node_info);
4559 }
4560 
4561 /**
4562  * htmlParseContentInternal:
4563  * @ctxt:  an HTML parser context
4564  *
4565  * Parse a content: comment, sub-element, reference or text.
4566  * New version for non recursive htmlParseElementInternal
4567  */
4568 
4569 static void
htmlParseContentInternal(htmlParserCtxtPtr ctxt)4570 htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4571     xmlChar *currentNode;
4572     int depth;
4573     const xmlChar *name;
4574 
4575     currentNode = xmlStrdup(ctxt->name);
4576     depth = ctxt->nameNr;
4577     while (1) {
4578 	long cons = ctxt->nbChars;
4579 
4580         GROW;
4581 
4582         if (ctxt->instate == XML_PARSER_EOF)
4583             break;
4584 
4585 	/*
4586 	 * Our tag or one of it's parent or children is ending.
4587 	 */
4588         if ((CUR == '<') && (NXT(1) == '/')) {
4589 	    if (htmlParseEndTag(ctxt) &&
4590 		((currentNode != NULL) || (ctxt->nameNr == 0))) {
4591 		if (currentNode != NULL)
4592 		    xmlFree(currentNode);
4593 
4594 	        currentNode = xmlStrdup(ctxt->name);
4595 	        depth = ctxt->nameNr;
4596 	    }
4597 	    continue; /* while */
4598         }
4599 
4600 	else if ((CUR == '<') &&
4601 	         ((IS_ASCII_LETTER(NXT(1))) ||
4602 		  (NXT(1) == '_') || (NXT(1) == ':'))) {
4603 	    name = htmlParseHTMLName_nonInvasive(ctxt);
4604 	    if (name == NULL) {
4605 	        htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4606 			 "htmlParseStartTag: invalid element name\n",
4607 			 NULL, NULL);
4608 	        /* Dump the bogus tag like browsers do */
4609 	        while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
4610 	            NEXT;
4611 
4612 	        htmlParserFinishElementParsing(ctxt);
4613 	        if (currentNode != NULL)
4614 	            xmlFree(currentNode);
4615 
4616 	        currentNode = xmlStrdup(ctxt->name);
4617 	        depth = ctxt->nameNr;
4618 	        continue;
4619 	    }
4620 
4621 	    if (ctxt->name != NULL) {
4622 	        if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4623 	            htmlAutoClose(ctxt, name);
4624 	            continue;
4625 	        }
4626 	    }
4627 	}
4628 
4629 	/*
4630 	 * Has this node been popped out during parsing of
4631 	 * the next element
4632 	 */
4633         if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4634 	    (!xmlStrEqual(currentNode, ctxt->name)))
4635 	     {
4636 	    htmlParserFinishElementParsing(ctxt);
4637 	    if (currentNode != NULL) xmlFree(currentNode);
4638 
4639 	    currentNode = xmlStrdup(ctxt->name);
4640 	    depth = ctxt->nameNr;
4641 	    continue;
4642 	}
4643 
4644 	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4645 	    (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4646 	    /*
4647 	     * Handle SCRIPT/STYLE separately
4648 	     */
4649 	    htmlParseScript(ctxt);
4650 	} else {
4651 	    /*
4652 	     * Sometimes DOCTYPE arrives in the middle of the document
4653 	     */
4654 	    if ((CUR == '<') && (NXT(1) == '!') &&
4655 		(UPP(2) == 'D') && (UPP(3) == 'O') &&
4656 		(UPP(4) == 'C') && (UPP(5) == 'T') &&
4657 		(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4658 		(UPP(8) == 'E')) {
4659 		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4660 		             "Misplaced DOCTYPE declaration\n",
4661 			     BAD_CAST "DOCTYPE" , NULL);
4662 		htmlParseDocTypeDecl(ctxt);
4663 	    }
4664 
4665 	    /*
4666 	     * First case :  a comment
4667 	     */
4668 	    if ((CUR == '<') && (NXT(1) == '!') &&
4669 		(NXT(2) == '-') && (NXT(3) == '-')) {
4670 		htmlParseComment(ctxt);
4671 	    }
4672 
4673 	    /*
4674 	     * Second case : a Processing Instruction.
4675 	     */
4676 	    else if ((CUR == '<') && (NXT(1) == '?')) {
4677 		htmlParsePI(ctxt);
4678 	    }
4679 
4680 	    /*
4681 	     * Third case :  a sub-element.
4682 	     */
4683 	    else if (CUR == '<') {
4684 		htmlParseElementInternal(ctxt);
4685 		if (currentNode != NULL) xmlFree(currentNode);
4686 
4687 		currentNode = xmlStrdup(ctxt->name);
4688 		depth = ctxt->nameNr;
4689 	    }
4690 
4691 	    /*
4692 	     * Fourth case : a reference. If if has not been resolved,
4693 	     *    parsing returns it's Name, create the node
4694 	     */
4695 	    else if (CUR == '&') {
4696 		htmlParseReference(ctxt);
4697 	    }
4698 
4699 	    /*
4700 	     * Fifth case : end of the resource
4701 	     */
4702 	    else if (CUR == 0) {
4703 		htmlAutoCloseOnEnd(ctxt);
4704 		break;
4705 	    }
4706 
4707 	    /*
4708 	     * Last case, text. Note that References are handled directly.
4709 	     */
4710 	    else {
4711 		htmlParseCharData(ctxt);
4712 	    }
4713 
4714 	    if (cons == ctxt->nbChars) {
4715 		if (ctxt->node != NULL) {
4716 		    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4717 		                 "detected an error in element content\n",
4718 				 NULL, NULL);
4719 		}
4720 		break;
4721 	    }
4722 	}
4723         GROW;
4724     }
4725     if (currentNode != NULL) xmlFree(currentNode);
4726 }
4727 
4728 /**
4729  * htmlParseContent:
4730  * @ctxt:  an HTML parser context
4731  *
4732  * Parse a content: comment, sub-element, reference or text.
4733  * This is the entry point when called from parser.c
4734  */
4735 
4736 void
__htmlParseContent(void * ctxt)4737 __htmlParseContent(void *ctxt) {
4738     if (ctxt != NULL)
4739 	htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4740 }
4741 
4742 /**
4743  * htmlParseDocument:
4744  * @ctxt:  an HTML parser context
4745  *
4746  * parse an HTML document (and build a tree if using the standard SAX
4747  * interface).
4748  *
4749  * Returns 0, -1 in case of error. the parser context is augmented
4750  *                as a result of the parsing.
4751  */
4752 
4753 int
htmlParseDocument(htmlParserCtxtPtr ctxt)4754 htmlParseDocument(htmlParserCtxtPtr ctxt) {
4755     xmlChar start[4];
4756     xmlCharEncoding enc;
4757     xmlDtdPtr dtd;
4758 
4759     xmlInitParser();
4760 
4761     htmlDefaultSAXHandlerInit();
4762 
4763     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4764 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4765 		     "htmlParseDocument: context error\n", NULL, NULL);
4766 	return(XML_ERR_INTERNAL_ERROR);
4767     }
4768     ctxt->html = 1;
4769     ctxt->linenumbers = 1;
4770     GROW;
4771     /*
4772      * SAX: beginning of the document processing.
4773      */
4774     if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4775         ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4776 
4777     if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4778         ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4779 	/*
4780 	 * Get the 4 first bytes and decode the charset
4781 	 * if enc != XML_CHAR_ENCODING_NONE
4782 	 * plug some encoding conversion routines.
4783 	 */
4784 	start[0] = RAW;
4785 	start[1] = NXT(1);
4786 	start[2] = NXT(2);
4787 	start[3] = NXT(3);
4788 	enc = xmlDetectCharEncoding(&start[0], 4);
4789 	if (enc != XML_CHAR_ENCODING_NONE) {
4790 	    xmlSwitchEncoding(ctxt, enc);
4791 	}
4792     }
4793 
4794     /*
4795      * Wipe out everything which is before the first '<'
4796      */
4797     SKIP_BLANKS;
4798     if (CUR == 0) {
4799 	htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4800 	             "Document is empty\n", NULL, NULL);
4801     }
4802 
4803     if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4804 	ctxt->sax->startDocument(ctxt->userData);
4805 
4806 
4807     /*
4808      * Parse possible comments and PIs before any content
4809      */
4810     while (((CUR == '<') && (NXT(1) == '!') &&
4811             (NXT(2) == '-') && (NXT(3) == '-')) ||
4812 	   ((CUR == '<') && (NXT(1) == '?'))) {
4813         htmlParseComment(ctxt);
4814         htmlParsePI(ctxt);
4815 	SKIP_BLANKS;
4816     }
4817 
4818 
4819     /*
4820      * Then possibly doc type declaration(s) and more Misc
4821      * (doctypedecl Misc*)?
4822      */
4823     if ((CUR == '<') && (NXT(1) == '!') &&
4824 	(UPP(2) == 'D') && (UPP(3) == 'O') &&
4825 	(UPP(4) == 'C') && (UPP(5) == 'T') &&
4826 	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4827 	(UPP(8) == 'E')) {
4828 	htmlParseDocTypeDecl(ctxt);
4829     }
4830     SKIP_BLANKS;
4831 
4832     /*
4833      * Parse possible comments and PIs before any content
4834      */
4835     while (((CUR == '<') && (NXT(1) == '!') &&
4836             (NXT(2) == '-') && (NXT(3) == '-')) ||
4837 	   ((CUR == '<') && (NXT(1) == '?'))) {
4838         htmlParseComment(ctxt);
4839         htmlParsePI(ctxt);
4840 	SKIP_BLANKS;
4841     }
4842 
4843     /*
4844      * Time to start parsing the tree itself
4845      */
4846     htmlParseContentInternal(ctxt);
4847 
4848     /*
4849      * autoclose
4850      */
4851     if (CUR == 0)
4852 	htmlAutoCloseOnEnd(ctxt);
4853 
4854 
4855     /*
4856      * SAX: end of the document processing.
4857      */
4858     if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4859         ctxt->sax->endDocument(ctxt->userData);
4860 
4861     if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
4862 	dtd = xmlGetIntSubset(ctxt->myDoc);
4863 	if (dtd == NULL)
4864 	    ctxt->myDoc->intSubset =
4865 		xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4866 		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4867 		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4868     }
4869     if (! ctxt->wellFormed) return(-1);
4870     return(0);
4871 }
4872 
4873 
4874 /************************************************************************
4875  *									*
4876  *			Parser contexts handling			*
4877  *									*
4878  ************************************************************************/
4879 
4880 /**
4881  * htmlInitParserCtxt:
4882  * @ctxt:  an HTML parser context
4883  *
4884  * Initialize a parser context
4885  *
4886  * Returns 0 in case of success and -1 in case of error
4887  */
4888 
4889 static int
htmlInitParserCtxt(htmlParserCtxtPtr ctxt)4890 htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4891 {
4892     htmlSAXHandler *sax;
4893 
4894     if (ctxt == NULL) return(-1);
4895     memset(ctxt, 0, sizeof(htmlParserCtxt));
4896 
4897     ctxt->dict = xmlDictCreate();
4898     if (ctxt->dict == NULL) {
4899         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4900 	return(-1);
4901     }
4902     sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4903     if (sax == NULL) {
4904         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4905 	return(-1);
4906     }
4907     else
4908         memset(sax, 0, sizeof(htmlSAXHandler));
4909 
4910     /* Allocate the Input stack */
4911     ctxt->inputTab = (htmlParserInputPtr *)
4912                       xmlMalloc(5 * sizeof(htmlParserInputPtr));
4913     if (ctxt->inputTab == NULL) {
4914         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4915 	ctxt->inputNr = 0;
4916 	ctxt->inputMax = 0;
4917 	ctxt->input = NULL;
4918 	return(-1);
4919     }
4920     ctxt->inputNr = 0;
4921     ctxt->inputMax = 5;
4922     ctxt->input = NULL;
4923     ctxt->version = NULL;
4924     ctxt->encoding = NULL;
4925     ctxt->standalone = -1;
4926     ctxt->instate = XML_PARSER_START;
4927 
4928     /* Allocate the Node stack */
4929     ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4930     if (ctxt->nodeTab == NULL) {
4931         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4932 	ctxt->nodeNr = 0;
4933 	ctxt->nodeMax = 0;
4934 	ctxt->node = NULL;
4935 	ctxt->inputNr = 0;
4936 	ctxt->inputMax = 0;
4937 	ctxt->input = NULL;
4938 	return(-1);
4939     }
4940     ctxt->nodeNr = 0;
4941     ctxt->nodeMax = 10;
4942     ctxt->node = NULL;
4943 
4944     /* Allocate the Name stack */
4945     ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
4946     if (ctxt->nameTab == NULL) {
4947         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4948 	ctxt->nameNr = 0;
4949 	ctxt->nameMax = 0;
4950 	ctxt->name = NULL;
4951 	ctxt->nodeNr = 0;
4952 	ctxt->nodeMax = 0;
4953 	ctxt->node = NULL;
4954 	ctxt->inputNr = 0;
4955 	ctxt->inputMax = 0;
4956 	ctxt->input = NULL;
4957 	return(-1);
4958     }
4959     ctxt->nameNr = 0;
4960     ctxt->nameMax = 10;
4961     ctxt->name = NULL;
4962 
4963     ctxt->nodeInfoTab = NULL;
4964     ctxt->nodeInfoNr  = 0;
4965     ctxt->nodeInfoMax = 0;
4966 
4967     if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
4968     else {
4969         ctxt->sax = sax;
4970 	memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
4971     }
4972     ctxt->userData = ctxt;
4973     ctxt->myDoc = NULL;
4974     ctxt->wellFormed = 1;
4975     ctxt->replaceEntities = 0;
4976     ctxt->linenumbers = xmlLineNumbersDefaultValue;
4977     ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
4978     ctxt->html = 1;
4979     ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
4980     ctxt->vctxt.userData = ctxt;
4981     ctxt->vctxt.error = xmlParserValidityError;
4982     ctxt->vctxt.warning = xmlParserValidityWarning;
4983     ctxt->record_info = 0;
4984     ctxt->validate = 0;
4985     ctxt->nbChars = 0;
4986     ctxt->checkIndex = 0;
4987     ctxt->catalogs = NULL;
4988     xmlInitNodeInfoSeq(&ctxt->node_seq);
4989     return(0);
4990 }
4991 
4992 /**
4993  * htmlFreeParserCtxt:
4994  * @ctxt:  an HTML parser context
4995  *
4996  * Free all the memory used by a parser context. However the parsed
4997  * document in ctxt->myDoc is not freed.
4998  */
4999 
5000 void
htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)5001 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
5002 {
5003     xmlFreeParserCtxt(ctxt);
5004 }
5005 
5006 /**
5007  * htmlNewParserCtxt:
5008  *
5009  * Allocate and initialize a new parser context.
5010  *
5011  * Returns the htmlParserCtxtPtr or NULL in case of allocation error
5012  */
5013 
5014 htmlParserCtxtPtr
htmlNewParserCtxt(void)5015 htmlNewParserCtxt(void)
5016 {
5017     xmlParserCtxtPtr ctxt;
5018 
5019     ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
5020     if (ctxt == NULL) {
5021         htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
5022 	return(NULL);
5023     }
5024     memset(ctxt, 0, sizeof(xmlParserCtxt));
5025     if (htmlInitParserCtxt(ctxt) < 0) {
5026         htmlFreeParserCtxt(ctxt);
5027 	return(NULL);
5028     }
5029     return(ctxt);
5030 }
5031 
5032 /**
5033  * htmlCreateMemoryParserCtxt:
5034  * @buffer:  a pointer to a char array
5035  * @size:  the size of the array
5036  *
5037  * Create a parser context for an HTML in-memory document.
5038  *
5039  * Returns the new parser context or NULL
5040  */
5041 htmlParserCtxtPtr
htmlCreateMemoryParserCtxt(const char * buffer,int size)5042 htmlCreateMemoryParserCtxt(const char *buffer, int size) {
5043     xmlParserCtxtPtr ctxt;
5044     xmlParserInputPtr input;
5045     xmlParserInputBufferPtr buf;
5046 
5047     if (buffer == NULL)
5048 	return(NULL);
5049     if (size <= 0)
5050 	return(NULL);
5051 
5052     ctxt = htmlNewParserCtxt();
5053     if (ctxt == NULL)
5054 	return(NULL);
5055 
5056     buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
5057     if (buf == NULL) return(NULL);
5058 
5059     input = xmlNewInputStream(ctxt);
5060     if (input == NULL) {
5061 	xmlFreeParserCtxt(ctxt);
5062 	return(NULL);
5063     }
5064 
5065     input->filename = NULL;
5066     input->buf = buf;
5067     xmlBufResetInput(buf->buffer, input);
5068 
5069     inputPush(ctxt, input);
5070     return(ctxt);
5071 }
5072 
5073 /**
5074  * htmlCreateDocParserCtxt:
5075  * @cur:  a pointer to an array of xmlChar
5076  * @encoding:  a free form C string describing the HTML document encoding, or NULL
5077  *
5078  * Create a parser context for an HTML document.
5079  *
5080  * TODO: check the need to add encoding handling there
5081  *
5082  * Returns the new parser context or NULL
5083  */
5084 static htmlParserCtxtPtr
htmlCreateDocParserCtxt(const xmlChar * cur,const char * encoding)5085 htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
5086     int len;
5087     htmlParserCtxtPtr ctxt;
5088 
5089     if (cur == NULL)
5090 	return(NULL);
5091     len = xmlStrlen(cur);
5092     ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
5093     if (ctxt == NULL)
5094 	return(NULL);
5095 
5096     if (encoding != NULL) {
5097 	xmlCharEncoding enc;
5098 	xmlCharEncodingHandlerPtr handler;
5099 
5100 	if (ctxt->input->encoding != NULL)
5101 	    xmlFree((xmlChar *) ctxt->input->encoding);
5102 	ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
5103 
5104 	enc = xmlParseCharEncoding(encoding);
5105 	/*
5106 	 * registered set of known encodings
5107 	 */
5108 	if (enc != XML_CHAR_ENCODING_ERROR) {
5109 	    xmlSwitchEncoding(ctxt, enc);
5110 	    if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
5111 		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5112 		             "Unsupported encoding %s\n",
5113 			     (const xmlChar *) encoding, NULL);
5114 	    }
5115 	} else {
5116 	    /*
5117 	     * fallback for unknown encodings
5118 	     */
5119 	    handler = xmlFindCharEncodingHandler((const char *) encoding);
5120 	    if (handler != NULL) {
5121 		xmlSwitchToEncoding(ctxt, handler);
5122 	    } else {
5123 		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5124 		             "Unsupported encoding %s\n",
5125 			     (const xmlChar *) encoding, NULL);
5126 	    }
5127 	}
5128     }
5129     return(ctxt);
5130 }
5131 
5132 #ifdef LIBXML_PUSH_ENABLED
5133 /************************************************************************
5134  *									*
5135  *	Progressive parsing interfaces				*
5136  *									*
5137  ************************************************************************/
5138 
5139 /**
5140  * htmlParseLookupSequence:
5141  * @ctxt:  an HTML parser context
5142  * @first:  the first char to lookup
5143  * @next:  the next char to lookup or zero
5144  * @third:  the next char to lookup or zero
5145  * @comment: flag to force checking inside comments
5146  *
5147  * Try to find if a sequence (first, next, third) or  just (first next) or
5148  * (first) is available in the input stream.
5149  * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5150  * to avoid rescanning sequences of bytes, it DOES change the state of the
5151  * parser, do not use liberally.
5152  * This is basically similar to xmlParseLookupSequence()
5153  *
5154  * Returns the index to the current parsing point if the full sequence
5155  *      is available, -1 otherwise.
5156  */
5157 static int
htmlParseLookupSequence(htmlParserCtxtPtr ctxt,xmlChar first,xmlChar next,xmlChar third,int iscomment,int ignoreattrval)5158 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
5159                         xmlChar next, xmlChar third, int iscomment,
5160                         int ignoreattrval)
5161 {
5162     int base, len;
5163     htmlParserInputPtr in;
5164     const xmlChar *buf;
5165     int incomment = 0;
5166     int invalue = 0;
5167     char valdellim = 0x0;
5168 
5169     in = ctxt->input;
5170     if (in == NULL)
5171         return (-1);
5172 
5173     base = in->cur - in->base;
5174     if (base < 0)
5175         return (-1);
5176 
5177     if (ctxt->checkIndex > base)
5178         base = ctxt->checkIndex;
5179 
5180     if (in->buf == NULL) {
5181         buf = in->base;
5182         len = in->length;
5183     } else {
5184         buf = xmlBufContent(in->buf->buffer);
5185         len = xmlBufUse(in->buf->buffer);
5186     }
5187 
5188     /* take into account the sequence length */
5189     if (third)
5190         len -= 2;
5191     else if (next)
5192         len--;
5193     for (; base < len; base++) {
5194         if ((!incomment) && (base + 4 < len) && (!iscomment)) {
5195             if ((buf[base] == '<') && (buf[base + 1] == '!') &&
5196                 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
5197                 incomment = 1;
5198                 /* do not increment past <! - some people use <!--> */
5199                 base += 2;
5200             }
5201         }
5202         if (ignoreattrval) {
5203             if (buf[base] == '"' || buf[base] == '\'') {
5204                 if (invalue) {
5205                     if (buf[base] == valdellim) {
5206                         invalue = 0;
5207                         continue;
5208                     }
5209                 } else {
5210                     valdellim = buf[base];
5211                     invalue = 1;
5212                     continue;
5213                 }
5214             } else if (invalue) {
5215                 continue;
5216             }
5217         }
5218         if (incomment) {
5219             if (base + 3 > len)
5220                 return (-1);
5221             if ((buf[base] == '-') && (buf[base + 1] == '-') &&
5222                 (buf[base + 2] == '>')) {
5223                 incomment = 0;
5224                 base += 2;
5225             }
5226             continue;
5227         }
5228         if (buf[base] == first) {
5229             if (third != 0) {
5230                 if ((buf[base + 1] != next) || (buf[base + 2] != third))
5231                     continue;
5232             } else if (next != 0) {
5233                 if (buf[base + 1] != next)
5234                     continue;
5235             }
5236             ctxt->checkIndex = 0;
5237 #ifdef DEBUG_PUSH
5238             if (next == 0)
5239                 xmlGenericError(xmlGenericErrorContext,
5240                                 "HPP: lookup '%c' found at %d\n",
5241                                 first, base);
5242             else if (third == 0)
5243                 xmlGenericError(xmlGenericErrorContext,
5244                                 "HPP: lookup '%c%c' found at %d\n",
5245                                 first, next, base);
5246             else
5247                 xmlGenericError(xmlGenericErrorContext,
5248                                 "HPP: lookup '%c%c%c' found at %d\n",
5249                                 first, next, third, base);
5250 #endif
5251             return (base - (in->cur - in->base));
5252         }
5253     }
5254     if ((!incomment) && (!invalue))
5255         ctxt->checkIndex = base;
5256 #ifdef DEBUG_PUSH
5257     if (next == 0)
5258         xmlGenericError(xmlGenericErrorContext,
5259                         "HPP: lookup '%c' failed\n", first);
5260     else if (third == 0)
5261         xmlGenericError(xmlGenericErrorContext,
5262                         "HPP: lookup '%c%c' failed\n", first, next);
5263     else
5264         xmlGenericError(xmlGenericErrorContext,
5265                         "HPP: lookup '%c%c%c' failed\n", first, next,
5266                         third);
5267 #endif
5268     return (-1);
5269 }
5270 
5271 /**
5272  * htmlParseLookupChars:
5273  * @ctxt: an HTML parser context
5274  * @stop: Array of chars, which stop the lookup.
5275  * @stopLen: Length of stop-Array
5276  *
5277  * Try to find if any char of the stop-Array is available in the input
5278  * stream.
5279  * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5280  * to avoid rescanning sequences of bytes, it DOES change the state of the
5281  * parser, do not use liberally.
5282  *
5283  * Returns the index to the current parsing point if a stopChar
5284  *      is available, -1 otherwise.
5285  */
5286 static int
htmlParseLookupChars(htmlParserCtxtPtr ctxt,const xmlChar * stop,int stopLen)5287 htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
5288                      int stopLen)
5289 {
5290     int base, len;
5291     htmlParserInputPtr in;
5292     const xmlChar *buf;
5293     int incomment = 0;
5294     int i;
5295 
5296     in = ctxt->input;
5297     if (in == NULL)
5298         return (-1);
5299 
5300     base = in->cur - in->base;
5301     if (base < 0)
5302         return (-1);
5303 
5304     if (ctxt->checkIndex > base)
5305         base = ctxt->checkIndex;
5306 
5307     if (in->buf == NULL) {
5308         buf = in->base;
5309         len = in->length;
5310     } else {
5311         buf = xmlBufContent(in->buf->buffer);
5312         len = xmlBufUse(in->buf->buffer);
5313     }
5314 
5315     for (; base < len; base++) {
5316         if (!incomment && (base + 4 < len)) {
5317             if ((buf[base] == '<') && (buf[base + 1] == '!') &&
5318                 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
5319                 incomment = 1;
5320                 /* do not increment past <! - some people use <!--> */
5321                 base += 2;
5322             }
5323         }
5324         if (incomment) {
5325             if (base + 3 > len)
5326                 return (-1);
5327             if ((buf[base] == '-') && (buf[base + 1] == '-') &&
5328                 (buf[base + 2] == '>')) {
5329                 incomment = 0;
5330                 base += 2;
5331             }
5332             continue;
5333         }
5334         for (i = 0; i < stopLen; ++i) {
5335             if (buf[base] == stop[i]) {
5336                 ctxt->checkIndex = 0;
5337                 return (base - (in->cur - in->base));
5338             }
5339         }
5340     }
5341     ctxt->checkIndex = base;
5342     return (-1);
5343 }
5344 
5345 /**
5346  * htmlParseTryOrFinish:
5347  * @ctxt:  an HTML parser context
5348  * @terminate:  last chunk indicator
5349  *
5350  * Try to progress on parsing
5351  *
5352  * Returns zero if no parsing was possible
5353  */
5354 static int
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt,int terminate)5355 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5356     int ret = 0;
5357     htmlParserInputPtr in;
5358     int avail = 0;
5359     xmlChar cur, next;
5360 
5361     htmlParserNodeInfo node_info;
5362 
5363 #ifdef DEBUG_PUSH
5364     switch (ctxt->instate) {
5365 	case XML_PARSER_EOF:
5366 	    xmlGenericError(xmlGenericErrorContext,
5367 		    "HPP: try EOF\n"); break;
5368 	case XML_PARSER_START:
5369 	    xmlGenericError(xmlGenericErrorContext,
5370 		    "HPP: try START\n"); break;
5371 	case XML_PARSER_MISC:
5372 	    xmlGenericError(xmlGenericErrorContext,
5373 		    "HPP: try MISC\n");break;
5374 	case XML_PARSER_COMMENT:
5375 	    xmlGenericError(xmlGenericErrorContext,
5376 		    "HPP: try COMMENT\n");break;
5377 	case XML_PARSER_PROLOG:
5378 	    xmlGenericError(xmlGenericErrorContext,
5379 		    "HPP: try PROLOG\n");break;
5380 	case XML_PARSER_START_TAG:
5381 	    xmlGenericError(xmlGenericErrorContext,
5382 		    "HPP: try START_TAG\n");break;
5383 	case XML_PARSER_CONTENT:
5384 	    xmlGenericError(xmlGenericErrorContext,
5385 		    "HPP: try CONTENT\n");break;
5386 	case XML_PARSER_CDATA_SECTION:
5387 	    xmlGenericError(xmlGenericErrorContext,
5388 		    "HPP: try CDATA_SECTION\n");break;
5389 	case XML_PARSER_END_TAG:
5390 	    xmlGenericError(xmlGenericErrorContext,
5391 		    "HPP: try END_TAG\n");break;
5392 	case XML_PARSER_ENTITY_DECL:
5393 	    xmlGenericError(xmlGenericErrorContext,
5394 		    "HPP: try ENTITY_DECL\n");break;
5395 	case XML_PARSER_ENTITY_VALUE:
5396 	    xmlGenericError(xmlGenericErrorContext,
5397 		    "HPP: try ENTITY_VALUE\n");break;
5398 	case XML_PARSER_ATTRIBUTE_VALUE:
5399 	    xmlGenericError(xmlGenericErrorContext,
5400 		    "HPP: try ATTRIBUTE_VALUE\n");break;
5401 	case XML_PARSER_DTD:
5402 	    xmlGenericError(xmlGenericErrorContext,
5403 		    "HPP: try DTD\n");break;
5404 	case XML_PARSER_EPILOG:
5405 	    xmlGenericError(xmlGenericErrorContext,
5406 		    "HPP: try EPILOG\n");break;
5407 	case XML_PARSER_PI:
5408 	    xmlGenericError(xmlGenericErrorContext,
5409 		    "HPP: try PI\n");break;
5410 	case XML_PARSER_SYSTEM_LITERAL:
5411 	    xmlGenericError(xmlGenericErrorContext,
5412 		    "HPP: try SYSTEM_LITERAL\n");break;
5413     }
5414 #endif
5415 
5416     while (1) {
5417 
5418 	in = ctxt->input;
5419 	if (in == NULL) break;
5420 	if (in->buf == NULL)
5421 	    avail = in->length - (in->cur - in->base);
5422 	else
5423 	    avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
5424 	if ((avail == 0) && (terminate)) {
5425 	    htmlAutoCloseOnEnd(ctxt);
5426 	    if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5427 		/*
5428 		 * SAX: end of the document processing.
5429 		 */
5430 		ctxt->instate = XML_PARSER_EOF;
5431 		if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5432 		    ctxt->sax->endDocument(ctxt->userData);
5433 	    }
5434 	}
5435         if (avail < 1)
5436 	    goto done;
5437 	cur = in->cur[0];
5438 	if (cur == 0) {
5439 	    SKIP(1);
5440 	    continue;
5441 	}
5442 
5443         switch (ctxt->instate) {
5444             case XML_PARSER_EOF:
5445 	        /*
5446 		 * Document parsing is done !
5447 		 */
5448 	        goto done;
5449             case XML_PARSER_START:
5450 	        /*
5451 		 * Very first chars read from the document flow.
5452 		 */
5453 		cur = in->cur[0];
5454 		if (IS_BLANK_CH(cur)) {
5455 		    SKIP_BLANKS;
5456 		    if (in->buf == NULL)
5457 			avail = in->length - (in->cur - in->base);
5458 		    else
5459 			avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
5460 		}
5461 		if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5462 		    ctxt->sax->setDocumentLocator(ctxt->userData,
5463 						  &xmlDefaultSAXLocator);
5464 		if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5465 	            (!ctxt->disableSAX))
5466 		    ctxt->sax->startDocument(ctxt->userData);
5467 
5468 		cur = in->cur[0];
5469 		next = in->cur[1];
5470 		if ((cur == '<') && (next == '!') &&
5471 		    (UPP(2) == 'D') && (UPP(3) == 'O') &&
5472 		    (UPP(4) == 'C') && (UPP(5) == 'T') &&
5473 		    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5474 		    (UPP(8) == 'E')) {
5475 		    if ((!terminate) &&
5476 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5477 			goto done;
5478 #ifdef DEBUG_PUSH
5479 		    xmlGenericError(xmlGenericErrorContext,
5480 			    "HPP: Parsing internal subset\n");
5481 #endif
5482 		    htmlParseDocTypeDecl(ctxt);
5483 		    ctxt->instate = XML_PARSER_PROLOG;
5484 #ifdef DEBUG_PUSH
5485 		    xmlGenericError(xmlGenericErrorContext,
5486 			    "HPP: entering PROLOG\n");
5487 #endif
5488                 } else {
5489 		    ctxt->instate = XML_PARSER_MISC;
5490 #ifdef DEBUG_PUSH
5491 		    xmlGenericError(xmlGenericErrorContext,
5492 			    "HPP: entering MISC\n");
5493 #endif
5494 		}
5495 		break;
5496             case XML_PARSER_MISC:
5497 		SKIP_BLANKS;
5498 		if (in->buf == NULL)
5499 		    avail = in->length - (in->cur - in->base);
5500 		else
5501 		    avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
5502 		/*
5503 		 * no chars in buffer
5504 		 */
5505 		if (avail < 1)
5506 		    goto done;
5507 		/*
5508 		 * not enouth chars in buffer
5509 		 */
5510 		if (avail < 2) {
5511 		    if (!terminate)
5512 			goto done;
5513 		    else
5514 			next = ' ';
5515 		} else {
5516 		    next = in->cur[1];
5517 		}
5518 		cur = in->cur[0];
5519 	        if ((cur == '<') && (next == '!') &&
5520 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
5521 		    if ((!terminate) &&
5522 		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
5523 			goto done;
5524 #ifdef DEBUG_PUSH
5525 		    xmlGenericError(xmlGenericErrorContext,
5526 			    "HPP: Parsing Comment\n");
5527 #endif
5528 		    htmlParseComment(ctxt);
5529 		    ctxt->instate = XML_PARSER_MISC;
5530 	        } else if ((cur == '<') && (next == '?')) {
5531 		    if ((!terminate) &&
5532 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5533 			goto done;
5534 #ifdef DEBUG_PUSH
5535 		    xmlGenericError(xmlGenericErrorContext,
5536 			    "HPP: Parsing PI\n");
5537 #endif
5538 		    htmlParsePI(ctxt);
5539 		    ctxt->instate = XML_PARSER_MISC;
5540 		} else if ((cur == '<') && (next == '!') &&
5541 		    (UPP(2) == 'D') && (UPP(3) == 'O') &&
5542 		    (UPP(4) == 'C') && (UPP(5) == 'T') &&
5543 		    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5544 		    (UPP(8) == 'E')) {
5545 		    if ((!terminate) &&
5546 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5547 			goto done;
5548 #ifdef DEBUG_PUSH
5549 		    xmlGenericError(xmlGenericErrorContext,
5550 			    "HPP: Parsing internal subset\n");
5551 #endif
5552 		    htmlParseDocTypeDecl(ctxt);
5553 		    ctxt->instate = XML_PARSER_PROLOG;
5554 #ifdef DEBUG_PUSH
5555 		    xmlGenericError(xmlGenericErrorContext,
5556 			    "HPP: entering PROLOG\n");
5557 #endif
5558 		} else if ((cur == '<') && (next == '!') &&
5559 		           (avail < 9)) {
5560 		    goto done;
5561 		} else {
5562 		    ctxt->instate = XML_PARSER_START_TAG;
5563 #ifdef DEBUG_PUSH
5564 		    xmlGenericError(xmlGenericErrorContext,
5565 			    "HPP: entering START_TAG\n");
5566 #endif
5567 		}
5568 		break;
5569             case XML_PARSER_PROLOG:
5570 		SKIP_BLANKS;
5571 		if (in->buf == NULL)
5572 		    avail = in->length - (in->cur - in->base);
5573 		else
5574 		    avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
5575 		if (avail < 2)
5576 		    goto done;
5577 		cur = in->cur[0];
5578 		next = in->cur[1];
5579 		if ((cur == '<') && (next == '!') &&
5580 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
5581 		    if ((!terminate) &&
5582 		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
5583 			goto done;
5584 #ifdef DEBUG_PUSH
5585 		    xmlGenericError(xmlGenericErrorContext,
5586 			    "HPP: Parsing Comment\n");
5587 #endif
5588 		    htmlParseComment(ctxt);
5589 		    ctxt->instate = XML_PARSER_PROLOG;
5590 	        } else if ((cur == '<') && (next == '?')) {
5591 		    if ((!terminate) &&
5592 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5593 			goto done;
5594 #ifdef DEBUG_PUSH
5595 		    xmlGenericError(xmlGenericErrorContext,
5596 			    "HPP: Parsing PI\n");
5597 #endif
5598 		    htmlParsePI(ctxt);
5599 		    ctxt->instate = XML_PARSER_PROLOG;
5600 		} else if ((cur == '<') && (next == '!') &&
5601 		           (avail < 4)) {
5602 		    goto done;
5603 		} else {
5604 		    ctxt->instate = XML_PARSER_START_TAG;
5605 #ifdef DEBUG_PUSH
5606 		    xmlGenericError(xmlGenericErrorContext,
5607 			    "HPP: entering START_TAG\n");
5608 #endif
5609 		}
5610 		break;
5611             case XML_PARSER_EPILOG:
5612 		if (in->buf == NULL)
5613 		    avail = in->length - (in->cur - in->base);
5614 		else
5615 		    avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
5616 		if (avail < 1)
5617 		    goto done;
5618 		cur = in->cur[0];
5619 		if (IS_BLANK_CH(cur)) {
5620 		    htmlParseCharData(ctxt);
5621 		    goto done;
5622 		}
5623 		if (avail < 2)
5624 		    goto done;
5625 		next = in->cur[1];
5626 	        if ((cur == '<') && (next == '!') &&
5627 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
5628 		    if ((!terminate) &&
5629 		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
5630 			goto done;
5631 #ifdef DEBUG_PUSH
5632 		    xmlGenericError(xmlGenericErrorContext,
5633 			    "HPP: Parsing Comment\n");
5634 #endif
5635 		    htmlParseComment(ctxt);
5636 		    ctxt->instate = XML_PARSER_EPILOG;
5637 	        } else if ((cur == '<') && (next == '?')) {
5638 		    if ((!terminate) &&
5639 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5640 			goto done;
5641 #ifdef DEBUG_PUSH
5642 		    xmlGenericError(xmlGenericErrorContext,
5643 			    "HPP: Parsing PI\n");
5644 #endif
5645 		    htmlParsePI(ctxt);
5646 		    ctxt->instate = XML_PARSER_EPILOG;
5647 		} else if ((cur == '<') && (next == '!') &&
5648 		           (avail < 4)) {
5649 		    goto done;
5650 		} else {
5651 		    ctxt->errNo = XML_ERR_DOCUMENT_END;
5652 		    ctxt->wellFormed = 0;
5653 		    ctxt->instate = XML_PARSER_EOF;
5654 #ifdef DEBUG_PUSH
5655 		    xmlGenericError(xmlGenericErrorContext,
5656 			    "HPP: entering EOF\n");
5657 #endif
5658 		    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5659 			ctxt->sax->endDocument(ctxt->userData);
5660 		    goto done;
5661 		}
5662 		break;
5663             case XML_PARSER_START_TAG: {
5664 	        const xmlChar *name;
5665 		int failed;
5666 		const htmlElemDesc * info;
5667 
5668 		/*
5669 		 * no chars in buffer
5670 		 */
5671 		if (avail < 1)
5672 		    goto done;
5673 		/*
5674 		 * not enouth chars in buffer
5675 		 */
5676 		if (avail < 2) {
5677 		    if (!terminate)
5678 			goto done;
5679 		    else
5680 			next = ' ';
5681 		} else {
5682 		    next = in->cur[1];
5683 		}
5684 		cur = in->cur[0];
5685 	        if (cur != '<') {
5686 		    ctxt->instate = XML_PARSER_CONTENT;
5687 #ifdef DEBUG_PUSH
5688 		    xmlGenericError(xmlGenericErrorContext,
5689 			    "HPP: entering CONTENT\n");
5690 #endif
5691 		    break;
5692 		}
5693 		if (next == '/') {
5694 		    ctxt->instate = XML_PARSER_END_TAG;
5695 		    ctxt->checkIndex = 0;
5696 #ifdef DEBUG_PUSH
5697 		    xmlGenericError(xmlGenericErrorContext,
5698 			    "HPP: entering END_TAG\n");
5699 #endif
5700 		    break;
5701 		}
5702 		if ((!terminate) &&
5703 		    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5704 		    goto done;
5705 
5706                 /* Capture start position */
5707 	        if (ctxt->record_info) {
5708 	             node_info.begin_pos = ctxt->input->consumed +
5709 	                                (CUR_PTR - ctxt->input->base);
5710 	             node_info.begin_line = ctxt->input->line;
5711 	        }
5712 
5713 
5714 		failed = htmlParseStartTag(ctxt);
5715 		name = ctxt->name;
5716 		if ((failed == -1) ||
5717 		    (name == NULL)) {
5718 		    if (CUR == '>')
5719 			NEXT;
5720 		    break;
5721 		}
5722 
5723 		/*
5724 		 * Lookup the info for that element.
5725 		 */
5726 		info = htmlTagLookup(name);
5727 		if (info == NULL) {
5728 		    htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5729 		                 "Tag %s invalid\n", name, NULL);
5730 		}
5731 
5732 		/*
5733 		 * Check for an Empty Element labeled the XML/SGML way
5734 		 */
5735 		if ((CUR == '/') && (NXT(1) == '>')) {
5736 		    SKIP(2);
5737 		    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5738 			ctxt->sax->endElement(ctxt->userData, name);
5739 		    htmlnamePop(ctxt);
5740 		    ctxt->instate = XML_PARSER_CONTENT;
5741 #ifdef DEBUG_PUSH
5742 		    xmlGenericError(xmlGenericErrorContext,
5743 			    "HPP: entering CONTENT\n");
5744 #endif
5745 		    break;
5746 		}
5747 
5748 		if (CUR == '>') {
5749 		    NEXT;
5750 		} else {
5751 		    htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5752 		                 "Couldn't find end of Start Tag %s\n",
5753 				 name, NULL);
5754 
5755 		    /*
5756 		     * end of parsing of this node.
5757 		     */
5758 		    if (xmlStrEqual(name, ctxt->name)) {
5759 			nodePop(ctxt);
5760 			htmlnamePop(ctxt);
5761 		    }
5762 
5763 		    if (ctxt->record_info)
5764 		        htmlNodeInfoPush(ctxt, &node_info);
5765 
5766 		    ctxt->instate = XML_PARSER_CONTENT;
5767 #ifdef DEBUG_PUSH
5768 		    xmlGenericError(xmlGenericErrorContext,
5769 			    "HPP: entering CONTENT\n");
5770 #endif
5771 		    break;
5772 		}
5773 
5774 		/*
5775 		 * Check for an Empty Element from DTD definition
5776 		 */
5777 		if ((info != NULL) && (info->empty)) {
5778 		    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5779 			ctxt->sax->endElement(ctxt->userData, name);
5780 		    htmlnamePop(ctxt);
5781 		}
5782 
5783                 if (ctxt->record_info)
5784 	            htmlNodeInfoPush(ctxt, &node_info);
5785 
5786 		ctxt->instate = XML_PARSER_CONTENT;
5787 #ifdef DEBUG_PUSH
5788 		xmlGenericError(xmlGenericErrorContext,
5789 			"HPP: entering CONTENT\n");
5790 #endif
5791                 break;
5792 	    }
5793             case XML_PARSER_CONTENT: {
5794 		long cons;
5795                 /*
5796 		 * Handle preparsed entities and charRef
5797 		 */
5798 		if (ctxt->token != 0) {
5799 		    xmlChar chr[2] = { 0 , 0 } ;
5800 
5801 		    chr[0] = (xmlChar) ctxt->token;
5802 		    htmlCheckParagraph(ctxt);
5803 		    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5804 			ctxt->sax->characters(ctxt->userData, chr, 1);
5805 		    ctxt->token = 0;
5806 		    ctxt->checkIndex = 0;
5807 		}
5808 		if ((avail == 1) && (terminate)) {
5809 		    cur = in->cur[0];
5810 		    if ((cur != '<') && (cur != '&')) {
5811 			if (ctxt->sax != NULL) {
5812 			    if (IS_BLANK_CH(cur)) {
5813 				if (ctxt->keepBlanks) {
5814 				    if (ctxt->sax->characters != NULL)
5815 					ctxt->sax->characters(
5816 						ctxt->userData, &in->cur[0], 1);
5817 				} else {
5818 				    if (ctxt->sax->ignorableWhitespace != NULL)
5819 					ctxt->sax->ignorableWhitespace(
5820 						ctxt->userData, &in->cur[0], 1);
5821 				}
5822 			    } else {
5823 				htmlCheckParagraph(ctxt);
5824 				if (ctxt->sax->characters != NULL)
5825 				    ctxt->sax->characters(
5826 					    ctxt->userData, &in->cur[0], 1);
5827 			    }
5828 			}
5829 			ctxt->token = 0;
5830 			ctxt->checkIndex = 0;
5831 			in->cur++;
5832 			break;
5833 		    }
5834 		}
5835 		if (avail < 2)
5836 		    goto done;
5837 		cur = in->cur[0];
5838 		next = in->cur[1];
5839 		cons = ctxt->nbChars;
5840 		if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5841 		    (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5842 		    /*
5843 		     * Handle SCRIPT/STYLE separately
5844 		     */
5845 		    if (!terminate) {
5846 		        int idx;
5847 			xmlChar val;
5848 
5849 			idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 0);
5850 			if (idx < 0)
5851 			    goto done;
5852 		        val = in->cur[idx + 2];
5853 			if (val == 0) /* bad cut of input */
5854 			    goto done;
5855 		    }
5856 		    htmlParseScript(ctxt);
5857 		    if ((cur == '<') && (next == '/')) {
5858 			ctxt->instate = XML_PARSER_END_TAG;
5859 			ctxt->checkIndex = 0;
5860 #ifdef DEBUG_PUSH
5861 			xmlGenericError(xmlGenericErrorContext,
5862 				"HPP: entering END_TAG\n");
5863 #endif
5864 			break;
5865 		    }
5866 		} else {
5867 		    /*
5868 		     * Sometimes DOCTYPE arrives in the middle of the document
5869 		     */
5870 		    if ((cur == '<') && (next == '!') &&
5871 			(UPP(2) == 'D') && (UPP(3) == 'O') &&
5872 			(UPP(4) == 'C') && (UPP(5) == 'T') &&
5873 			(UPP(6) == 'Y') && (UPP(7) == 'P') &&
5874 			(UPP(8) == 'E')) {
5875 			if ((!terminate) &&
5876 			    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5877 			    goto done;
5878 			htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5879 			             "Misplaced DOCTYPE declaration\n",
5880 				     BAD_CAST "DOCTYPE" , NULL);
5881 			htmlParseDocTypeDecl(ctxt);
5882 		    } else if ((cur == '<') && (next == '!') &&
5883 			(in->cur[2] == '-') && (in->cur[3] == '-')) {
5884 			if ((!terminate) &&
5885 			    (htmlParseLookupSequence(
5886 				ctxt, '-', '-', '>', 1, 1) < 0))
5887 			    goto done;
5888 #ifdef DEBUG_PUSH
5889 			xmlGenericError(xmlGenericErrorContext,
5890 				"HPP: Parsing Comment\n");
5891 #endif
5892 			htmlParseComment(ctxt);
5893 			ctxt->instate = XML_PARSER_CONTENT;
5894 		    } else if ((cur == '<') && (next == '?')) {
5895 			if ((!terminate) &&
5896 			    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5897 			    goto done;
5898 #ifdef DEBUG_PUSH
5899 			xmlGenericError(xmlGenericErrorContext,
5900 				"HPP: Parsing PI\n");
5901 #endif
5902 			htmlParsePI(ctxt);
5903 			ctxt->instate = XML_PARSER_CONTENT;
5904 		    } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5905 			goto done;
5906 		    } else if ((cur == '<') && (next == '/')) {
5907 			ctxt->instate = XML_PARSER_END_TAG;
5908 			ctxt->checkIndex = 0;
5909 #ifdef DEBUG_PUSH
5910 			xmlGenericError(xmlGenericErrorContext,
5911 				"HPP: entering END_TAG\n");
5912 #endif
5913 			break;
5914 		    } else if (cur == '<') {
5915 			ctxt->instate = XML_PARSER_START_TAG;
5916 			ctxt->checkIndex = 0;
5917 #ifdef DEBUG_PUSH
5918 			xmlGenericError(xmlGenericErrorContext,
5919 				"HPP: entering START_TAG\n");
5920 #endif
5921 			break;
5922 		    } else if (cur == '&') {
5923 			if ((!terminate) &&
5924 			    (htmlParseLookupChars(ctxt,
5925                                                   BAD_CAST "; >/", 4) < 0))
5926 			    goto done;
5927 #ifdef DEBUG_PUSH
5928 			xmlGenericError(xmlGenericErrorContext,
5929 				"HPP: Parsing Reference\n");
5930 #endif
5931 			/* TODO: check generation of subtrees if noent !!! */
5932 			htmlParseReference(ctxt);
5933 		    } else {
5934 		        /*
5935 			 * check that the text sequence is complete
5936 			 * before handing out the data to the parser
5937 			 * to avoid problems with erroneous end of
5938 			 * data detection.
5939 			 */
5940 			if ((!terminate) &&
5941                             (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0))
5942 			    goto done;
5943 			ctxt->checkIndex = 0;
5944 #ifdef DEBUG_PUSH
5945 			xmlGenericError(xmlGenericErrorContext,
5946 				"HPP: Parsing char data\n");
5947 #endif
5948 			htmlParseCharData(ctxt);
5949 		    }
5950 		}
5951 		if (cons == ctxt->nbChars) {
5952 		    if (ctxt->node != NULL) {
5953 			htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5954 			             "detected an error in element content\n",
5955 				     NULL, NULL);
5956 		    }
5957 		    NEXT;
5958 		    break;
5959 		}
5960 
5961 		break;
5962 	    }
5963             case XML_PARSER_END_TAG:
5964 		if (avail < 2)
5965 		    goto done;
5966 		if ((!terminate) &&
5967 		    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5968 		    goto done;
5969 		htmlParseEndTag(ctxt);
5970 		if (ctxt->nameNr == 0) {
5971 		    ctxt->instate = XML_PARSER_EPILOG;
5972 		} else {
5973 		    ctxt->instate = XML_PARSER_CONTENT;
5974 		}
5975 		ctxt->checkIndex = 0;
5976 #ifdef DEBUG_PUSH
5977 		xmlGenericError(xmlGenericErrorContext,
5978 			"HPP: entering CONTENT\n");
5979 #endif
5980 	        break;
5981             case XML_PARSER_CDATA_SECTION:
5982 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5983 			"HPP: internal error, state == CDATA\n",
5984 			     NULL, NULL);
5985 		ctxt->instate = XML_PARSER_CONTENT;
5986 		ctxt->checkIndex = 0;
5987 #ifdef DEBUG_PUSH
5988 		xmlGenericError(xmlGenericErrorContext,
5989 			"HPP: entering CONTENT\n");
5990 #endif
5991 		break;
5992             case XML_PARSER_DTD:
5993 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5994 			"HPP: internal error, state == DTD\n",
5995 			     NULL, NULL);
5996 		ctxt->instate = XML_PARSER_CONTENT;
5997 		ctxt->checkIndex = 0;
5998 #ifdef DEBUG_PUSH
5999 		xmlGenericError(xmlGenericErrorContext,
6000 			"HPP: entering CONTENT\n");
6001 #endif
6002 		break;
6003             case XML_PARSER_COMMENT:
6004 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6005 			"HPP: internal error, state == COMMENT\n",
6006 			     NULL, NULL);
6007 		ctxt->instate = XML_PARSER_CONTENT;
6008 		ctxt->checkIndex = 0;
6009 #ifdef DEBUG_PUSH
6010 		xmlGenericError(xmlGenericErrorContext,
6011 			"HPP: entering CONTENT\n");
6012 #endif
6013 		break;
6014             case XML_PARSER_PI:
6015 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6016 			"HPP: internal error, state == PI\n",
6017 			     NULL, NULL);
6018 		ctxt->instate = XML_PARSER_CONTENT;
6019 		ctxt->checkIndex = 0;
6020 #ifdef DEBUG_PUSH
6021 		xmlGenericError(xmlGenericErrorContext,
6022 			"HPP: entering CONTENT\n");
6023 #endif
6024 		break;
6025             case XML_PARSER_ENTITY_DECL:
6026 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6027 			"HPP: internal error, state == ENTITY_DECL\n",
6028 			     NULL, NULL);
6029 		ctxt->instate = XML_PARSER_CONTENT;
6030 		ctxt->checkIndex = 0;
6031 #ifdef DEBUG_PUSH
6032 		xmlGenericError(xmlGenericErrorContext,
6033 			"HPP: entering CONTENT\n");
6034 #endif
6035 		break;
6036             case XML_PARSER_ENTITY_VALUE:
6037 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6038 			"HPP: internal error, state == ENTITY_VALUE\n",
6039 			     NULL, NULL);
6040 		ctxt->instate = XML_PARSER_CONTENT;
6041 		ctxt->checkIndex = 0;
6042 #ifdef DEBUG_PUSH
6043 		xmlGenericError(xmlGenericErrorContext,
6044 			"HPP: entering DTD\n");
6045 #endif
6046 		break;
6047             case XML_PARSER_ATTRIBUTE_VALUE:
6048 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6049 			"HPP: internal error, state == ATTRIBUTE_VALUE\n",
6050 			     NULL, NULL);
6051 		ctxt->instate = XML_PARSER_START_TAG;
6052 		ctxt->checkIndex = 0;
6053 #ifdef DEBUG_PUSH
6054 		xmlGenericError(xmlGenericErrorContext,
6055 			"HPP: entering START_TAG\n");
6056 #endif
6057 		break;
6058 	    case XML_PARSER_SYSTEM_LITERAL:
6059 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6060 		    "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
6061 			     NULL, NULL);
6062 		ctxt->instate = XML_PARSER_CONTENT;
6063 		ctxt->checkIndex = 0;
6064 #ifdef DEBUG_PUSH
6065 		xmlGenericError(xmlGenericErrorContext,
6066 			"HPP: entering CONTENT\n");
6067 #endif
6068 		break;
6069 	    case XML_PARSER_IGNORE:
6070 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6071 			"HPP: internal error, state == XML_PARSER_IGNORE\n",
6072 			     NULL, NULL);
6073 		ctxt->instate = XML_PARSER_CONTENT;
6074 		ctxt->checkIndex = 0;
6075 #ifdef DEBUG_PUSH
6076 		xmlGenericError(xmlGenericErrorContext,
6077 			"HPP: entering CONTENT\n");
6078 #endif
6079 		break;
6080 	    case XML_PARSER_PUBLIC_LITERAL:
6081 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6082 			"HPP: internal error, state == XML_PARSER_LITERAL\n",
6083 			     NULL, NULL);
6084 		ctxt->instate = XML_PARSER_CONTENT;
6085 		ctxt->checkIndex = 0;
6086 #ifdef DEBUG_PUSH
6087 		xmlGenericError(xmlGenericErrorContext,
6088 			"HPP: entering CONTENT\n");
6089 #endif
6090 		break;
6091 
6092 	}
6093     }
6094 done:
6095     if ((avail == 0) && (terminate)) {
6096 	htmlAutoCloseOnEnd(ctxt);
6097 	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
6098 	    /*
6099 	     * SAX: end of the document processing.
6100 	     */
6101 	    ctxt->instate = XML_PARSER_EOF;
6102 	    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6103 		ctxt->sax->endDocument(ctxt->userData);
6104 	}
6105     }
6106     if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
6107 	((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
6108 	 (ctxt->instate == XML_PARSER_EPILOG))) {
6109 	xmlDtdPtr dtd;
6110 	dtd = xmlGetIntSubset(ctxt->myDoc);
6111 	if (dtd == NULL)
6112 	    ctxt->myDoc->intSubset =
6113 		xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
6114 		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
6115 		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
6116     }
6117 #ifdef DEBUG_PUSH
6118     xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
6119 #endif
6120     return(ret);
6121 }
6122 
6123 /**
6124  * htmlParseChunk:
6125  * @ctxt:  an HTML parser context
6126  * @chunk:  an char array
6127  * @size:  the size in byte of the chunk
6128  * @terminate:  last chunk indicator
6129  *
6130  * Parse a Chunk of memory
6131  *
6132  * Returns zero if no error, the xmlParserErrors otherwise.
6133  */
6134 int
htmlParseChunk(htmlParserCtxtPtr ctxt,const char * chunk,int size,int terminate)6135 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
6136               int terminate) {
6137     if ((ctxt == NULL) || (ctxt->input == NULL)) {
6138 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6139 		     "htmlParseChunk: context error\n", NULL, NULL);
6140 	return(XML_ERR_INTERNAL_ERROR);
6141     }
6142     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6143         (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF))  {
6144 	size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6145 	size_t cur = ctxt->input->cur - ctxt->input->base;
6146 	int res;
6147 
6148 	res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6149 	if (res < 0) {
6150 	    ctxt->errNo = XML_PARSER_EOF;
6151 	    ctxt->disableSAX = 1;
6152 	    return (XML_PARSER_EOF);
6153 	}
6154         xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6155 #ifdef DEBUG_PUSH
6156 	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6157 #endif
6158 
6159 #if 0
6160 	if ((terminate) || (ctxt->input->buf->buffer->use > 80))
6161 	    htmlParseTryOrFinish(ctxt, terminate);
6162 #endif
6163     } else if (ctxt->instate != XML_PARSER_EOF) {
6164 	if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
6165 	    xmlParserInputBufferPtr in = ctxt->input->buf;
6166 	    if ((in->encoder != NULL) && (in->buffer != NULL) &&
6167 		    (in->raw != NULL)) {
6168 		int nbchars;
6169 		size_t base = xmlBufGetInputBase(in->buffer, ctxt->input);
6170 		size_t current = ctxt->input->cur - ctxt->input->base;
6171 
6172 		nbchars = xmlCharEncInput(in, terminate);
6173 		if (nbchars < 0) {
6174 		    htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
6175 			         "encoder error\n", NULL, NULL);
6176 		    return(XML_ERR_INVALID_ENCODING);
6177 		}
6178 		xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
6179 	    }
6180 	}
6181     }
6182     htmlParseTryOrFinish(ctxt, terminate);
6183     if (terminate) {
6184 	if ((ctxt->instate != XML_PARSER_EOF) &&
6185 	    (ctxt->instate != XML_PARSER_EPILOG) &&
6186 	    (ctxt->instate != XML_PARSER_MISC)) {
6187 	    ctxt->errNo = XML_ERR_DOCUMENT_END;
6188 	    ctxt->wellFormed = 0;
6189 	}
6190 	if (ctxt->instate != XML_PARSER_EOF) {
6191 	    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6192 		ctxt->sax->endDocument(ctxt->userData);
6193 	}
6194 	ctxt->instate = XML_PARSER_EOF;
6195     }
6196     return((xmlParserErrors) ctxt->errNo);
6197 }
6198 
6199 /************************************************************************
6200  *									*
6201  *			User entry points				*
6202  *									*
6203  ************************************************************************/
6204 
6205 /**
6206  * htmlCreatePushParserCtxt:
6207  * @sax:  a SAX handler
6208  * @user_data:  The user data returned on SAX callbacks
6209  * @chunk:  a pointer to an array of chars
6210  * @size:  number of chars in the array
6211  * @filename:  an optional file name or URI
6212  * @enc:  an optional encoding
6213  *
6214  * Create a parser context for using the HTML parser in push mode
6215  * The value of @filename is used for fetching external entities
6216  * and error/warning reports.
6217  *
6218  * Returns the new parser context or NULL
6219  */
6220 htmlParserCtxtPtr
htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax,void * user_data,const char * chunk,int size,const char * filename,xmlCharEncoding enc)6221 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
6222                          const char *chunk, int size, const char *filename,
6223 			 xmlCharEncoding enc) {
6224     htmlParserCtxtPtr ctxt;
6225     htmlParserInputPtr inputStream;
6226     xmlParserInputBufferPtr buf;
6227 
6228     xmlInitParser();
6229 
6230     buf = xmlAllocParserInputBuffer(enc);
6231     if (buf == NULL) return(NULL);
6232 
6233     ctxt = htmlNewParserCtxt();
6234     if (ctxt == NULL) {
6235 	xmlFreeParserInputBuffer(buf);
6236 	return(NULL);
6237     }
6238     if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
6239 	ctxt->charset=XML_CHAR_ENCODING_UTF8;
6240     if (sax != NULL) {
6241 	if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
6242 	    xmlFree(ctxt->sax);
6243 	ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
6244 	if (ctxt->sax == NULL) {
6245 	    xmlFree(buf);
6246 	    xmlFree(ctxt);
6247 	    return(NULL);
6248 	}
6249 	memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
6250 	if (user_data != NULL)
6251 	    ctxt->userData = user_data;
6252     }
6253     if (filename == NULL) {
6254 	ctxt->directory = NULL;
6255     } else {
6256         ctxt->directory = xmlParserGetDirectory(filename);
6257     }
6258 
6259     inputStream = htmlNewInputStream(ctxt);
6260     if (inputStream == NULL) {
6261 	xmlFreeParserCtxt(ctxt);
6262 	xmlFree(buf);
6263 	return(NULL);
6264     }
6265 
6266     if (filename == NULL)
6267 	inputStream->filename = NULL;
6268     else
6269 	inputStream->filename = (char *)
6270 	    xmlCanonicPath((const xmlChar *) filename);
6271     inputStream->buf = buf;
6272     xmlBufResetInput(buf->buffer, inputStream);
6273 
6274     inputPush(ctxt, inputStream);
6275 
6276     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6277         (ctxt->input->buf != NULL))  {
6278 	size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6279 	size_t cur = ctxt->input->cur - ctxt->input->base;
6280 
6281 	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6282 
6283         xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6284 #ifdef DEBUG_PUSH
6285 	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6286 #endif
6287     }
6288     ctxt->progressive = 1;
6289 
6290     return(ctxt);
6291 }
6292 #endif /* LIBXML_PUSH_ENABLED */
6293 
6294 /**
6295  * htmlSAXParseDoc:
6296  * @cur:  a pointer to an array of xmlChar
6297  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6298  * @sax:  the SAX handler block
6299  * @userData: if using SAX, this pointer will be provided on callbacks.
6300  *
6301  * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6302  * to handle parse events. If sax is NULL, fallback to the default DOM
6303  * behavior and return a tree.
6304  *
6305  * Returns the resulting document tree unless SAX is NULL or the document is
6306  *     not well formed.
6307  */
6308 
6309 htmlDocPtr
htmlSAXParseDoc(const xmlChar * cur,const char * encoding,htmlSAXHandlerPtr sax,void * userData)6310 htmlSAXParseDoc(const xmlChar *cur, const char *encoding,
6311                 htmlSAXHandlerPtr sax, void *userData) {
6312     htmlDocPtr ret;
6313     htmlParserCtxtPtr ctxt;
6314 
6315     xmlInitParser();
6316 
6317     if (cur == NULL) return(NULL);
6318 
6319 
6320     ctxt = htmlCreateDocParserCtxt(cur, encoding);
6321     if (ctxt == NULL) return(NULL);
6322     if (sax != NULL) {
6323         if (ctxt->sax != NULL) xmlFree (ctxt->sax);
6324         ctxt->sax = sax;
6325         ctxt->userData = userData;
6326     }
6327 
6328     htmlParseDocument(ctxt);
6329     ret = ctxt->myDoc;
6330     if (sax != NULL) {
6331 	ctxt->sax = NULL;
6332 	ctxt->userData = NULL;
6333     }
6334     htmlFreeParserCtxt(ctxt);
6335 
6336     return(ret);
6337 }
6338 
6339 /**
6340  * htmlParseDoc:
6341  * @cur:  a pointer to an array of xmlChar
6342  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6343  *
6344  * parse an HTML in-memory document and build a tree.
6345  *
6346  * Returns the resulting document tree
6347  */
6348 
6349 htmlDocPtr
htmlParseDoc(const xmlChar * cur,const char * encoding)6350 htmlParseDoc(const xmlChar *cur, const char *encoding) {
6351     return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6352 }
6353 
6354 
6355 /**
6356  * htmlCreateFileParserCtxt:
6357  * @filename:  the filename
6358  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6359  *
6360  * Create a parser context for a file content.
6361  * Automatic support for ZLIB/Compress compressed document is provided
6362  * by default if found at compile-time.
6363  *
6364  * Returns the new parser context or NULL
6365  */
6366 htmlParserCtxtPtr
htmlCreateFileParserCtxt(const char * filename,const char * encoding)6367 htmlCreateFileParserCtxt(const char *filename, const char *encoding)
6368 {
6369     htmlParserCtxtPtr ctxt;
6370     htmlParserInputPtr inputStream;
6371     char *canonicFilename;
6372     /* htmlCharEncoding enc; */
6373     xmlChar *content, *content_line = (xmlChar *) "charset=";
6374 
6375     if (filename == NULL)
6376         return(NULL);
6377 
6378     ctxt = htmlNewParserCtxt();
6379     if (ctxt == NULL) {
6380 	return(NULL);
6381     }
6382     canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
6383     if (canonicFilename == NULL) {
6384 #ifdef LIBXML_SAX1_ENABLED
6385 	if (xmlDefaultSAXHandler.error != NULL) {
6386 	    xmlDefaultSAXHandler.error(NULL, "out of memory\n");
6387 	}
6388 #endif
6389 	xmlFreeParserCtxt(ctxt);
6390 	return(NULL);
6391     }
6392 
6393     inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6394     xmlFree(canonicFilename);
6395     if (inputStream == NULL) {
6396 	xmlFreeParserCtxt(ctxt);
6397 	return(NULL);
6398     }
6399 
6400     inputPush(ctxt, inputStream);
6401 
6402     /* set encoding */
6403     if (encoding) {
6404         size_t l = strlen(encoding);
6405 
6406 	if (l < 1000) {
6407 	    content = xmlMallocAtomic (xmlStrlen(content_line) + l + 1);
6408 	    if (content) {
6409 		strcpy ((char *)content, (char *)content_line);
6410 		strcat ((char *)content, (char *)encoding);
6411 		htmlCheckEncoding (ctxt, content);
6412 		xmlFree (content);
6413 	    }
6414 	}
6415     }
6416 
6417     return(ctxt);
6418 }
6419 
6420 /**
6421  * htmlSAXParseFile:
6422  * @filename:  the filename
6423  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6424  * @sax:  the SAX handler block
6425  * @userData: if using SAX, this pointer will be provided on callbacks.
6426  *
6427  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6428  * compressed document is provided by default if found at compile-time.
6429  * It use the given SAX function block to handle the parsing callback.
6430  * If sax is NULL, fallback to the default DOM tree building routines.
6431  *
6432  * Returns the resulting document tree unless SAX is NULL or the document is
6433  *     not well formed.
6434  */
6435 
6436 htmlDocPtr
htmlSAXParseFile(const char * filename,const char * encoding,htmlSAXHandlerPtr sax,void * userData)6437 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
6438                  void *userData) {
6439     htmlDocPtr ret;
6440     htmlParserCtxtPtr ctxt;
6441     htmlSAXHandlerPtr oldsax = NULL;
6442 
6443     xmlInitParser();
6444 
6445     ctxt = htmlCreateFileParserCtxt(filename, encoding);
6446     if (ctxt == NULL) return(NULL);
6447     if (sax != NULL) {
6448 	oldsax = ctxt->sax;
6449         ctxt->sax = sax;
6450         ctxt->userData = userData;
6451     }
6452 
6453     htmlParseDocument(ctxt);
6454 
6455     ret = ctxt->myDoc;
6456     if (sax != NULL) {
6457         ctxt->sax = oldsax;
6458         ctxt->userData = NULL;
6459     }
6460     htmlFreeParserCtxt(ctxt);
6461 
6462     return(ret);
6463 }
6464 
6465 /**
6466  * htmlParseFile:
6467  * @filename:  the filename
6468  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6469  *
6470  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6471  * compressed document is provided by default if found at compile-time.
6472  *
6473  * Returns the resulting document tree
6474  */
6475 
6476 htmlDocPtr
htmlParseFile(const char * filename,const char * encoding)6477 htmlParseFile(const char *filename, const char *encoding) {
6478     return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6479 }
6480 
6481 /**
6482  * htmlHandleOmittedElem:
6483  * @val:  int 0 or 1
6484  *
6485  * Set and return the previous value for handling HTML omitted tags.
6486  *
6487  * Returns the last value for 0 for no handling, 1 for auto insertion.
6488  */
6489 
6490 int
htmlHandleOmittedElem(int val)6491 htmlHandleOmittedElem(int val) {
6492     int old = htmlOmittedDefaultValue;
6493 
6494     htmlOmittedDefaultValue = val;
6495     return(old);
6496 }
6497 
6498 /**
6499  * htmlElementAllowedHere:
6500  * @parent: HTML parent element
6501  * @elt: HTML element
6502  *
6503  * Checks whether an HTML element may be a direct child of a parent element.
6504  * Note - doesn't check for deprecated elements
6505  *
6506  * Returns 1 if allowed; 0 otherwise.
6507  */
6508 int
htmlElementAllowedHere(const htmlElemDesc * parent,const xmlChar * elt)6509 htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6510   const char** p ;
6511 
6512   if ( ! elt || ! parent || ! parent->subelts )
6513 	return 0 ;
6514 
6515   for ( p = parent->subelts; *p; ++p )
6516     if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6517       return 1 ;
6518 
6519   return 0 ;
6520 }
6521 /**
6522  * htmlElementStatusHere:
6523  * @parent: HTML parent element
6524  * @elt: HTML element
6525  *
6526  * Checks whether an HTML element may be a direct child of a parent element.
6527  * and if so whether it is valid or deprecated.
6528  *
6529  * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6530  */
6531 htmlStatus
htmlElementStatusHere(const htmlElemDesc * parent,const htmlElemDesc * elt)6532 htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6533   if ( ! parent || ! elt )
6534     return HTML_INVALID ;
6535   if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6536     return HTML_INVALID ;
6537 
6538   return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6539 }
6540 /**
6541  * htmlAttrAllowed:
6542  * @elt: HTML element
6543  * @attr: HTML attribute
6544  * @legacy: whether to allow deprecated attributes
6545  *
6546  * Checks whether an attribute is valid for an element
6547  * Has full knowledge of Required and Deprecated attributes
6548  *
6549  * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6550  */
6551 htmlStatus
htmlAttrAllowed(const htmlElemDesc * elt,const xmlChar * attr,int legacy)6552 htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6553   const char** p ;
6554 
6555   if ( !elt || ! attr )
6556 	return HTML_INVALID ;
6557 
6558   if ( elt->attrs_req )
6559     for ( p = elt->attrs_req; *p; ++p)
6560       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6561         return HTML_REQUIRED ;
6562 
6563   if ( elt->attrs_opt )
6564     for ( p = elt->attrs_opt; *p; ++p)
6565       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6566         return HTML_VALID ;
6567 
6568   if ( legacy && elt->attrs_depr )
6569     for ( p = elt->attrs_depr; *p; ++p)
6570       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6571         return HTML_DEPRECATED ;
6572 
6573   return HTML_INVALID ;
6574 }
6575 /**
6576  * htmlNodeStatus:
6577  * @node: an htmlNodePtr in a tree
6578  * @legacy: whether to allow deprecated elements (YES is faster here
6579  *	for Element nodes)
6580  *
6581  * Checks whether the tree node is valid.  Experimental (the author
6582  *     only uses the HTML enhancements in a SAX parser)
6583  *
6584  * Return: for Element nodes, a return from htmlElementAllowedHere (if
6585  *	legacy allowed) or htmlElementStatusHere (otherwise).
6586  *	for Attribute nodes, a return from htmlAttrAllowed
6587  *	for other nodes, HTML_NA (no checks performed)
6588  */
6589 htmlStatus
htmlNodeStatus(const htmlNodePtr node,int legacy)6590 htmlNodeStatus(const htmlNodePtr node, int legacy) {
6591   if ( ! node )
6592     return HTML_INVALID ;
6593 
6594   switch ( node->type ) {
6595     case XML_ELEMENT_NODE:
6596       return legacy
6597 	? ( htmlElementAllowedHere (
6598 		htmlTagLookup(node->parent->name) , node->name
6599 		) ? HTML_VALID : HTML_INVALID )
6600 	: htmlElementStatusHere(
6601 		htmlTagLookup(node->parent->name) ,
6602 		htmlTagLookup(node->name) )
6603 	;
6604     case XML_ATTRIBUTE_NODE:
6605       return htmlAttrAllowed(
6606 	htmlTagLookup(node->parent->name) , node->name, legacy) ;
6607     default: return HTML_NA ;
6608   }
6609 }
6610 /************************************************************************
6611  *									*
6612  *	New set (2.6.0) of simpler and more flexible APIs		*
6613  *									*
6614  ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A variable named "dict" must be visible where the macro
 * is expanded; @str is evaluated more than once.
 *
 * The body is wrapped in do { } while (0) so that "DICT_FREE(x);" is a
 * single statement and can be used safely in an unbraced if/else.
 */
#define DICT_FREE(str)						\
    do {							\
	if ((str) && ((!dict) ||				\
	    (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
	    xmlFree((char *)(str));				\
    } while (0)
6626 
6627 /**
6628  * htmlCtxtReset:
6629  * @ctxt: an HTML parser context
6630  *
6631  * Reset a parser context
6632  */
6633 void
htmlCtxtReset(htmlParserCtxtPtr ctxt)6634 htmlCtxtReset(htmlParserCtxtPtr ctxt)
6635 {
6636     xmlParserInputPtr input;
6637     xmlDictPtr dict;
6638 
6639     if (ctxt == NULL)
6640         return;
6641 
6642     xmlInitParser();
6643     dict = ctxt->dict;
6644 
6645     while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6646         xmlFreeInputStream(input);
6647     }
6648     ctxt->inputNr = 0;
6649     ctxt->input = NULL;
6650 
6651     ctxt->spaceNr = 0;
6652     if (ctxt->spaceTab != NULL) {
6653 	ctxt->spaceTab[0] = -1;
6654 	ctxt->space = &ctxt->spaceTab[0];
6655     } else {
6656 	ctxt->space = NULL;
6657     }
6658 
6659 
6660     ctxt->nodeNr = 0;
6661     ctxt->node = NULL;
6662 
6663     ctxt->nameNr = 0;
6664     ctxt->name = NULL;
6665 
6666     DICT_FREE(ctxt->version);
6667     ctxt->version = NULL;
6668     DICT_FREE(ctxt->encoding);
6669     ctxt->encoding = NULL;
6670     DICT_FREE(ctxt->directory);
6671     ctxt->directory = NULL;
6672     DICT_FREE(ctxt->extSubURI);
6673     ctxt->extSubURI = NULL;
6674     DICT_FREE(ctxt->extSubSystem);
6675     ctxt->extSubSystem = NULL;
6676     if (ctxt->myDoc != NULL)
6677         xmlFreeDoc(ctxt->myDoc);
6678     ctxt->myDoc = NULL;
6679 
6680     ctxt->standalone = -1;
6681     ctxt->hasExternalSubset = 0;
6682     ctxt->hasPErefs = 0;
6683     ctxt->html = 1;
6684     ctxt->external = 0;
6685     ctxt->instate = XML_PARSER_START;
6686     ctxt->token = 0;
6687 
6688     ctxt->wellFormed = 1;
6689     ctxt->nsWellFormed = 1;
6690     ctxt->disableSAX = 0;
6691     ctxt->valid = 1;
6692     ctxt->vctxt.userData = ctxt;
6693     ctxt->vctxt.error = xmlParserValidityError;
6694     ctxt->vctxt.warning = xmlParserValidityWarning;
6695     ctxt->record_info = 0;
6696     ctxt->nbChars = 0;
6697     ctxt->checkIndex = 0;
6698     ctxt->inSubset = 0;
6699     ctxt->errNo = XML_ERR_OK;
6700     ctxt->depth = 0;
6701     ctxt->charset = XML_CHAR_ENCODING_NONE;
6702     ctxt->catalogs = NULL;
6703     xmlInitNodeInfoSeq(&ctxt->node_seq);
6704 
6705     if (ctxt->attsDefault != NULL) {
6706         xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
6707         ctxt->attsDefault = NULL;
6708     }
6709     if (ctxt->attsSpecial != NULL) {
6710         xmlHashFree(ctxt->attsSpecial, NULL);
6711         ctxt->attsSpecial = NULL;
6712     }
6713 }
6714 
6715 /**
6716  * htmlCtxtUseOptions:
6717  * @ctxt: an HTML parser context
6718  * @options:  a combination of htmlParserOption(s)
6719  *
6720  * Applies the options to the parser context
6721  *
6722  * Returns 0 in case of success, the set of unknown or unimplemented options
6723  *         in case of error.
6724  */
6725 int
htmlCtxtUseOptions(htmlParserCtxtPtr ctxt,int options)6726 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6727 {
6728     if (ctxt == NULL)
6729         return(-1);
6730 
6731     if (options & HTML_PARSE_NOWARNING) {
6732         ctxt->sax->warning = NULL;
6733         ctxt->vctxt.warning = NULL;
6734         options -= XML_PARSE_NOWARNING;
6735 	ctxt->options |= XML_PARSE_NOWARNING;
6736     }
6737     if (options & HTML_PARSE_NOERROR) {
6738         ctxt->sax->error = NULL;
6739         ctxt->vctxt.error = NULL;
6740         ctxt->sax->fatalError = NULL;
6741         options -= XML_PARSE_NOERROR;
6742 	ctxt->options |= XML_PARSE_NOERROR;
6743     }
6744     if (options & HTML_PARSE_PEDANTIC) {
6745         ctxt->pedantic = 1;
6746         options -= XML_PARSE_PEDANTIC;
6747 	ctxt->options |= XML_PARSE_PEDANTIC;
6748     } else
6749         ctxt->pedantic = 0;
6750     if (options & XML_PARSE_NOBLANKS) {
6751         ctxt->keepBlanks = 0;
6752         ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6753         options -= XML_PARSE_NOBLANKS;
6754 	ctxt->options |= XML_PARSE_NOBLANKS;
6755     } else
6756         ctxt->keepBlanks = 1;
6757     if (options & HTML_PARSE_RECOVER) {
6758         ctxt->recovery = 1;
6759 	options -= HTML_PARSE_RECOVER;
6760     } else
6761         ctxt->recovery = 0;
6762     if (options & HTML_PARSE_COMPACT) {
6763 	ctxt->options |= HTML_PARSE_COMPACT;
6764         options -= HTML_PARSE_COMPACT;
6765     }
6766     if (options & XML_PARSE_HUGE) {
6767 	ctxt->options |= XML_PARSE_HUGE;
6768         options -= XML_PARSE_HUGE;
6769     }
6770     if (options & HTML_PARSE_NODEFDTD) {
6771 	ctxt->options |= HTML_PARSE_NODEFDTD;
6772         options -= HTML_PARSE_NODEFDTD;
6773     }
6774     if (options & HTML_PARSE_IGNORE_ENC) {
6775 	ctxt->options |= HTML_PARSE_IGNORE_ENC;
6776         options -= HTML_PARSE_IGNORE_ENC;
6777     }
6778     if (options & HTML_PARSE_NOIMPLIED) {
6779         ctxt->options |= HTML_PARSE_NOIMPLIED;
6780         options -= HTML_PARSE_NOIMPLIED;
6781     }
6782     ctxt->dictNames = 0;
6783     return (options);
6784 }
6785 
6786 /**
6787  * htmlDoRead:
6788  * @ctxt:  an HTML parser context
6789  * @URL:  the base URL to use for the document
6790  * @encoding:  the document encoding, or NULL
6791  * @options:  a combination of htmlParserOption(s)
6792  * @reuse:  keep the context for reuse
6793  *
6794  * Common front-end for the htmlRead functions
6795  *
6796  * Returns the resulting document tree or NULL
6797  */
6798 static htmlDocPtr
htmlDoRead(htmlParserCtxtPtr ctxt,const char * URL,const char * encoding,int options,int reuse)6799 htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6800           int options, int reuse)
6801 {
6802     htmlDocPtr ret;
6803 
6804     htmlCtxtUseOptions(ctxt, options);
6805     ctxt->html = 1;
6806     if (encoding != NULL) {
6807         xmlCharEncodingHandlerPtr hdlr;
6808 
6809 	hdlr = xmlFindCharEncodingHandler(encoding);
6810 	if (hdlr != NULL) {
6811 	    xmlSwitchToEncoding(ctxt, hdlr);
6812 	    if (ctxt->input->encoding != NULL)
6813 	      xmlFree((xmlChar *) ctxt->input->encoding);
6814             ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6815         }
6816     }
6817     if ((URL != NULL) && (ctxt->input != NULL) &&
6818         (ctxt->input->filename == NULL))
6819         ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6820     htmlParseDocument(ctxt);
6821     ret = ctxt->myDoc;
6822     ctxt->myDoc = NULL;
6823     if (!reuse) {
6824         if ((ctxt->dictNames) &&
6825 	    (ret != NULL) &&
6826 	    (ret->dict == ctxt->dict))
6827 	    ctxt->dict = NULL;
6828 	xmlFreeParserCtxt(ctxt);
6829     }
6830     return (ret);
6831 }
6832 
6833 /**
6834  * htmlReadDoc:
6835  * @cur:  a pointer to a zero terminated string
6836  * @URL:  the base URL to use for the document
6837  * @encoding:  the document encoding, or NULL
6838  * @options:  a combination of htmlParserOption(s)
6839  *
6840  * parse an XML in-memory document and build a tree.
6841  *
6842  * Returns the resulting document tree
6843  */
6844 htmlDocPtr
htmlReadDoc(const xmlChar * cur,const char * URL,const char * encoding,int options)6845 htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6846 {
6847     htmlParserCtxtPtr ctxt;
6848 
6849     if (cur == NULL)
6850         return (NULL);
6851 
6852     xmlInitParser();
6853     ctxt = htmlCreateDocParserCtxt(cur, NULL);
6854     if (ctxt == NULL)
6855         return (NULL);
6856     return (htmlDoRead(ctxt, URL, encoding, options, 0));
6857 }
6858 
6859 /**
6860  * htmlReadFile:
6861  * @filename:  a file or URL
6862  * @encoding:  the document encoding, or NULL
6863  * @options:  a combination of htmlParserOption(s)
6864  *
6865  * parse an XML file from the filesystem or the network.
6866  *
6867  * Returns the resulting document tree
6868  */
6869 htmlDocPtr
htmlReadFile(const char * filename,const char * encoding,int options)6870 htmlReadFile(const char *filename, const char *encoding, int options)
6871 {
6872     htmlParserCtxtPtr ctxt;
6873 
6874     xmlInitParser();
6875     ctxt = htmlCreateFileParserCtxt(filename, encoding);
6876     if (ctxt == NULL)
6877         return (NULL);
6878     return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6879 }
6880 
6881 /**
6882  * htmlReadMemory:
6883  * @buffer:  a pointer to a char array
6884  * @size:  the size of the array
6885  * @URL:  the base URL to use for the document
6886  * @encoding:  the document encoding, or NULL
6887  * @options:  a combination of htmlParserOption(s)
6888  *
6889  * parse an XML in-memory document and build a tree.
6890  *
6891  * Returns the resulting document tree
6892  */
6893 htmlDocPtr
htmlReadMemory(const char * buffer,int size,const char * URL,const char * encoding,int options)6894 htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6895 {
6896     htmlParserCtxtPtr ctxt;
6897 
6898     xmlInitParser();
6899     ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6900     if (ctxt == NULL)
6901         return (NULL);
6902     htmlDefaultSAXHandlerInit();
6903     if (ctxt->sax != NULL)
6904         memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
6905     return (htmlDoRead(ctxt, URL, encoding, options, 0));
6906 }
6907 
6908 /**
6909  * htmlReadFd:
6910  * @fd:  an open file descriptor
6911  * @URL:  the base URL to use for the document
6912  * @encoding:  the document encoding, or NULL
6913  * @options:  a combination of htmlParserOption(s)
6914  *
6915  * parse an XML from a file descriptor and build a tree.
6916  *
6917  * Returns the resulting document tree
6918  */
6919 htmlDocPtr
htmlReadFd(int fd,const char * URL,const char * encoding,int options)6920 htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6921 {
6922     htmlParserCtxtPtr ctxt;
6923     xmlParserInputBufferPtr input;
6924     xmlParserInputPtr stream;
6925 
6926     if (fd < 0)
6927         return (NULL);
6928     xmlInitParser();
6929 
6930     xmlInitParser();
6931     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6932     if (input == NULL)
6933         return (NULL);
6934     ctxt = xmlNewParserCtxt();
6935     if (ctxt == NULL) {
6936         xmlFreeParserInputBuffer(input);
6937         return (NULL);
6938     }
6939     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6940     if (stream == NULL) {
6941         xmlFreeParserInputBuffer(input);
6942 	xmlFreeParserCtxt(ctxt);
6943         return (NULL);
6944     }
6945     inputPush(ctxt, stream);
6946     return (htmlDoRead(ctxt, URL, encoding, options, 0));
6947 }
6948 
6949 /**
6950  * htmlReadIO:
6951  * @ioread:  an I/O read function
6952  * @ioclose:  an I/O close function
6953  * @ioctx:  an I/O handler
6954  * @URL:  the base URL to use for the document
6955  * @encoding:  the document encoding, or NULL
6956  * @options:  a combination of htmlParserOption(s)
6957  *
6958  * parse an HTML document from I/O functions and source and build a tree.
6959  *
6960  * Returns the resulting document tree
6961  */
6962 htmlDocPtr
htmlReadIO(xmlInputReadCallback ioread,xmlInputCloseCallback ioclose,void * ioctx,const char * URL,const char * encoding,int options)6963 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6964           void *ioctx, const char *URL, const char *encoding, int options)
6965 {
6966     htmlParserCtxtPtr ctxt;
6967     xmlParserInputBufferPtr input;
6968     xmlParserInputPtr stream;
6969 
6970     if (ioread == NULL)
6971         return (NULL);
6972     xmlInitParser();
6973 
6974     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6975                                          XML_CHAR_ENCODING_NONE);
6976     if (input == NULL) {
6977         if (ioclose != NULL)
6978             ioclose(ioctx);
6979         return (NULL);
6980     }
6981     ctxt = htmlNewParserCtxt();
6982     if (ctxt == NULL) {
6983         xmlFreeParserInputBuffer(input);
6984         return (NULL);
6985     }
6986     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6987     if (stream == NULL) {
6988         xmlFreeParserInputBuffer(input);
6989 	xmlFreeParserCtxt(ctxt);
6990         return (NULL);
6991     }
6992     inputPush(ctxt, stream);
6993     return (htmlDoRead(ctxt, URL, encoding, options, 0));
6994 }
6995 
6996 /**
6997  * htmlCtxtReadDoc:
6998  * @ctxt:  an HTML parser context
6999  * @cur:  a pointer to a zero terminated string
7000  * @URL:  the base URL to use for the document
7001  * @encoding:  the document encoding, or NULL
7002  * @options:  a combination of htmlParserOption(s)
7003  *
7004  * parse an XML in-memory document and build a tree.
7005  * This reuses the existing @ctxt parser context
7006  *
7007  * Returns the resulting document tree
7008  */
7009 htmlDocPtr
htmlCtxtReadDoc(htmlParserCtxtPtr ctxt,const xmlChar * cur,const char * URL,const char * encoding,int options)7010 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
7011                const char *URL, const char *encoding, int options)
7012 {
7013     xmlParserInputPtr stream;
7014 
7015     if (cur == NULL)
7016         return (NULL);
7017     if (ctxt == NULL)
7018         return (NULL);
7019     xmlInitParser();
7020 
7021     htmlCtxtReset(ctxt);
7022 
7023     stream = xmlNewStringInputStream(ctxt, cur);
7024     if (stream == NULL) {
7025         return (NULL);
7026     }
7027     inputPush(ctxt, stream);
7028     return (htmlDoRead(ctxt, URL, encoding, options, 1));
7029 }
7030 
7031 /**
7032  * htmlCtxtReadFile:
7033  * @ctxt:  an HTML parser context
7034  * @filename:  a file or URL
7035  * @encoding:  the document encoding, or NULL
7036  * @options:  a combination of htmlParserOption(s)
7037  *
7038  * parse an XML file from the filesystem or the network.
7039  * This reuses the existing @ctxt parser context
7040  *
7041  * Returns the resulting document tree
7042  */
7043 htmlDocPtr
htmlCtxtReadFile(htmlParserCtxtPtr ctxt,const char * filename,const char * encoding,int options)7044 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
7045                 const char *encoding, int options)
7046 {
7047     xmlParserInputPtr stream;
7048 
7049     if (filename == NULL)
7050         return (NULL);
7051     if (ctxt == NULL)
7052         return (NULL);
7053     xmlInitParser();
7054 
7055     htmlCtxtReset(ctxt);
7056 
7057     stream = xmlLoadExternalEntity(filename, NULL, ctxt);
7058     if (stream == NULL) {
7059         return (NULL);
7060     }
7061     inputPush(ctxt, stream);
7062     return (htmlDoRead(ctxt, NULL, encoding, options, 1));
7063 }
7064 
7065 /**
7066  * htmlCtxtReadMemory:
7067  * @ctxt:  an HTML parser context
7068  * @buffer:  a pointer to a char array
7069  * @size:  the size of the array
7070  * @URL:  the base URL to use for the document
7071  * @encoding:  the document encoding, or NULL
7072  * @options:  a combination of htmlParserOption(s)
7073  *
7074  * parse an XML in-memory document and build a tree.
7075  * This reuses the existing @ctxt parser context
7076  *
7077  * Returns the resulting document tree
7078  */
7079 htmlDocPtr
htmlCtxtReadMemory(htmlParserCtxtPtr ctxt,const char * buffer,int size,const char * URL,const char * encoding,int options)7080 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
7081                   const char *URL, const char *encoding, int options)
7082 {
7083     xmlParserInputBufferPtr input;
7084     xmlParserInputPtr stream;
7085 
7086     if (ctxt == NULL)
7087         return (NULL);
7088     if (buffer == NULL)
7089         return (NULL);
7090     xmlInitParser();
7091 
7092     htmlCtxtReset(ctxt);
7093 
7094     input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
7095     if (input == NULL) {
7096 	return(NULL);
7097     }
7098 
7099     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7100     if (stream == NULL) {
7101 	xmlFreeParserInputBuffer(input);
7102 	return(NULL);
7103     }
7104 
7105     inputPush(ctxt, stream);
7106     return (htmlDoRead(ctxt, URL, encoding, options, 1));
7107 }
7108 
7109 /**
7110  * htmlCtxtReadFd:
7111  * @ctxt:  an HTML parser context
7112  * @fd:  an open file descriptor
7113  * @URL:  the base URL to use for the document
7114  * @encoding:  the document encoding, or NULL
7115  * @options:  a combination of htmlParserOption(s)
7116  *
7117  * parse an XML from a file descriptor and build a tree.
7118  * This reuses the existing @ctxt parser context
7119  *
7120  * Returns the resulting document tree
7121  */
7122 htmlDocPtr
htmlCtxtReadFd(htmlParserCtxtPtr ctxt,int fd,const char * URL,const char * encoding,int options)7123 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
7124               const char *URL, const char *encoding, int options)
7125 {
7126     xmlParserInputBufferPtr input;
7127     xmlParserInputPtr stream;
7128 
7129     if (fd < 0)
7130         return (NULL);
7131     if (ctxt == NULL)
7132         return (NULL);
7133     xmlInitParser();
7134 
7135     htmlCtxtReset(ctxt);
7136 
7137 
7138     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7139     if (input == NULL)
7140         return (NULL);
7141     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7142     if (stream == NULL) {
7143         xmlFreeParserInputBuffer(input);
7144         return (NULL);
7145     }
7146     inputPush(ctxt, stream);
7147     return (htmlDoRead(ctxt, URL, encoding, options, 1));
7148 }
7149 
7150 /**
7151  * htmlCtxtReadIO:
7152  * @ctxt:  an HTML parser context
7153  * @ioread:  an I/O read function
7154  * @ioclose:  an I/O close function
7155  * @ioctx:  an I/O handler
7156  * @URL:  the base URL to use for the document
7157  * @encoding:  the document encoding, or NULL
7158  * @options:  a combination of htmlParserOption(s)
7159  *
7160  * parse an HTML document from I/O functions and source and build a tree.
7161  * This reuses the existing @ctxt parser context
7162  *
7163  * Returns the resulting document tree
7164  */
7165 htmlDocPtr
htmlCtxtReadIO(htmlParserCtxtPtr ctxt,xmlInputReadCallback ioread,xmlInputCloseCallback ioclose,void * ioctx,const char * URL,const char * encoding,int options)7166 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
7167               xmlInputCloseCallback ioclose, void *ioctx,
7168 	      const char *URL,
7169               const char *encoding, int options)
7170 {
7171     xmlParserInputBufferPtr input;
7172     xmlParserInputPtr stream;
7173 
7174     if (ioread == NULL)
7175         return (NULL);
7176     if (ctxt == NULL)
7177         return (NULL);
7178     xmlInitParser();
7179 
7180     htmlCtxtReset(ctxt);
7181 
7182     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7183                                          XML_CHAR_ENCODING_NONE);
7184     if (input == NULL) {
7185         if (ioclose != NULL)
7186             ioclose(ioctx);
7187         return (NULL);
7188     }
7189     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7190     if (stream == NULL) {
7191         xmlFreeParserInputBuffer(input);
7192         return (NULL);
7193     }
7194     inputPush(ctxt, stream);
7195     return (htmlDoRead(ctxt, URL, encoding, options, 1));
7196 }
7197 
7198 #define bottom_HTMLparser
7199 #include "elfgcchack.h"
7200 #endif /* LIBXML_HTML_ENABLED */
7201