1 /* libxml2 - Library for parsing XML documents
2 * Copyright (C) 2006-2019 Free Software Foundation, Inc.
3 *
4 * This file is not part of the GNU gettext program, but is used with
5 * GNU gettext.
6 *
7 * The original copyright notice is as follows:
8 */
9
10 /*
11 * Copyright (C) 1998-2012 Daniel Veillard. All Rights Reserved.
12 *
13 * Permission is hereby granted, free of charge, to any person obtaining a copy
14 * of this software and associated documentation files (the "Software"), to deal
15 * in the Software without restriction, including without limitation the rights
16 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
17 * copies of the Software, and to permit persons to whom the Software is fur-
18 * nished to do so, subject to the following conditions:
19 *
20 * The above copyright notice and this permission notice shall be included in
21 * all copies or substantial portions of the Software.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FIT-
25 * NESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
28 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
29 * THE SOFTWARE.
30 *
31 * daniel@veillard.com
32 */
33
34 /*
35 * HTMLparser.c : an HTML 4.0 non-verifying parser
36 */
37
38 #define IN_LIBXML
39 #include "libxml.h"
40 #ifdef LIBXML_HTML_ENABLED
41
42 #include <string.h>
43 #ifdef HAVE_CTYPE_H
44 #include <ctype.h>
45 #endif
46 #ifdef HAVE_STDLIB_H
47 #include <stdlib.h>
48 #endif
49 #ifdef HAVE_SYS_STAT_H
50 #include <sys/stat.h>
51 #endif
52 #ifdef HAVE_FCNTL_H
53 #include <fcntl.h>
54 #endif
55 #ifdef HAVE_UNISTD_H
56 #include <unistd.h>
57 #endif
58 #ifdef LIBXML_ZLIB_ENABLED
59 #include <zlib.h>
60 #endif
61
62 #include <libxml/xmlmemory.h>
63 #include <libxml/tree.h>
64 #include <libxml/parser.h>
65 #include <libxml/parserInternals.h>
66 #include <libxml/xmlerror.h>
67 #include <libxml/HTMLparser.h>
68 #include <libxml/HTMLtree.h>
69 #include <libxml/entities.h>
70 #include <libxml/encoding.h>
71 #include <libxml/valid.h>
72 #include <libxml/xmlIO.h>
73 #include <libxml/globals.h>
74 #include <libxml/uri.h>
75
76 #include "buf.h"
77 #include "enc.h"
78
79 #define HTML_MAX_NAMELEN 1000
80 #define HTML_PARSER_BIG_BUFFER_SIZE 1000
81 #define HTML_PARSER_BUFFER_SIZE 100
82
83 /* #define DEBUG */
84 /* #define DEBUG_PUSH */
85
86 static int htmlOmittedDefaultValue = 1;
87
88 xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
89 xmlChar end, xmlChar end2, xmlChar end3);
90 static void htmlParseComment(htmlParserCtxtPtr ctxt);
91
92 /************************************************************************
93 * *
94 * Some factorized error routines *
95 * *
96 ************************************************************************/
97
98 /**
99 * htmlErrMemory:
100 * @ctxt: an HTML parser context
101 * @extra: extra informations
102 *
103 * Handle a redefinition of attribute error
104 */
105 static void
htmlErrMemory(xmlParserCtxtPtr ctxt,const char * extra)106 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
107 {
108 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
109 (ctxt->instate == XML_PARSER_EOF))
110 return;
111 if (ctxt != NULL) {
112 ctxt->errNo = XML_ERR_NO_MEMORY;
113 ctxt->instate = XML_PARSER_EOF;
114 ctxt->disableSAX = 1;
115 }
116 if (extra)
117 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
118 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
119 NULL, NULL, 0, 0,
120 "Memory allocation failed : %s\n", extra);
121 else
122 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
123 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
124 NULL, NULL, 0, 0, "Memory allocation failed\n");
125 }
126
127 /**
128 * htmlParseErr:
129 * @ctxt: an HTML parser context
130 * @error: the error number
131 * @msg: the error message
132 * @str1: string infor
133 * @str2: string infor
134 *
135 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
136 */
137 static void LIBXML_ATTR_FORMAT(3,0)
htmlParseErr(xmlParserCtxtPtr ctxt,xmlParserErrors error,const char * msg,const xmlChar * str1,const xmlChar * str2)138 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
139 const char *msg, const xmlChar *str1, const xmlChar *str2)
140 {
141 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
142 (ctxt->instate == XML_PARSER_EOF))
143 return;
144 if (ctxt != NULL)
145 ctxt->errNo = error;
146 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
147 XML_ERR_ERROR, NULL, 0,
148 (const char *) str1, (const char *) str2,
149 NULL, 0, 0,
150 msg, str1, str2);
151 if (ctxt != NULL)
152 ctxt->wellFormed = 0;
153 }
154
155 /**
156 * htmlParseErrInt:
157 * @ctxt: an HTML parser context
158 * @error: the error number
159 * @msg: the error message
160 * @val: integer info
161 *
162 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
163 */
164 static void LIBXML_ATTR_FORMAT(3,0)
htmlParseErrInt(xmlParserCtxtPtr ctxt,xmlParserErrors error,const char * msg,int val)165 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
166 const char *msg, int val)
167 {
168 if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
169 (ctxt->instate == XML_PARSER_EOF))
170 return;
171 if (ctxt != NULL)
172 ctxt->errNo = error;
173 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
174 XML_ERR_ERROR, NULL, 0, NULL, NULL,
175 NULL, val, 0, msg, val);
176 if (ctxt != NULL)
177 ctxt->wellFormed = 0;
178 }
179
180 /************************************************************************
181 * *
182 * Parser stacks related functions and macros *
183 * *
184 ************************************************************************/
185
186 /**
187 * htmlnamePush:
188 * @ctxt: an HTML parser context
189 * @value: the element name
190 *
191 * Pushes a new element name on top of the name stack
192 *
193 * Returns 0 in case of error, the index in the stack otherwise
194 */
195 static int
htmlnamePush(htmlParserCtxtPtr ctxt,const xmlChar * value)196 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
197 {
198 if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
199 ctxt->html = 3;
200 if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
201 ctxt->html = 10;
202 if (ctxt->nameNr >= ctxt->nameMax) {
203 ctxt->nameMax *= 2;
204 ctxt->nameTab = (const xmlChar * *)
205 xmlRealloc((xmlChar * *)ctxt->nameTab,
206 ctxt->nameMax *
207 sizeof(ctxt->nameTab[0]));
208 if (ctxt->nameTab == NULL) {
209 htmlErrMemory(ctxt, NULL);
210 return (0);
211 }
212 }
213 ctxt->nameTab[ctxt->nameNr] = value;
214 ctxt->name = value;
215 return (ctxt->nameNr++);
216 }
217 /**
218 * htmlnamePop:
219 * @ctxt: an HTML parser context
220 *
221 * Pops the top element name from the name stack
222 *
223 * Returns the name just removed
224 */
225 static const xmlChar *
htmlnamePop(htmlParserCtxtPtr ctxt)226 htmlnamePop(htmlParserCtxtPtr ctxt)
227 {
228 const xmlChar *ret;
229
230 if (ctxt->nameNr <= 0)
231 return (NULL);
232 ctxt->nameNr--;
233 if (ctxt->nameNr < 0)
234 return (NULL);
235 if (ctxt->nameNr > 0)
236 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
237 else
238 ctxt->name = NULL;
239 ret = ctxt->nameTab[ctxt->nameNr];
240 ctxt->nameTab[ctxt->nameNr] = NULL;
241 return (ret);
242 }
243
244 /**
245 * htmlNodeInfoPush:
246 * @ctxt: an HTML parser context
247 * @value: the node info
248 *
249 * Pushes a new element name on top of the node info stack
250 *
251 * Returns 0 in case of error, the index in the stack otherwise
252 */
253 static int
htmlNodeInfoPush(htmlParserCtxtPtr ctxt,htmlParserNodeInfo * value)254 htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
255 {
256 if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
257 if (ctxt->nodeInfoMax == 0)
258 ctxt->nodeInfoMax = 5;
259 ctxt->nodeInfoMax *= 2;
260 ctxt->nodeInfoTab = (htmlParserNodeInfo *)
261 xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
262 ctxt->nodeInfoMax *
263 sizeof(ctxt->nodeInfoTab[0]));
264 if (ctxt->nodeInfoTab == NULL) {
265 htmlErrMemory(ctxt, NULL);
266 return (0);
267 }
268 }
269 ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
270 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
271 return (ctxt->nodeInfoNr++);
272 }
273
274 /**
275 * htmlNodeInfoPop:
276 * @ctxt: an HTML parser context
277 *
278 * Pops the top element name from the node info stack
279 *
280 * Returns 0 in case of error, the pointer to NodeInfo otherwise
281 */
282 static htmlParserNodeInfo *
htmlNodeInfoPop(htmlParserCtxtPtr ctxt)283 htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
284 {
285 if (ctxt->nodeInfoNr <= 0)
286 return (NULL);
287 ctxt->nodeInfoNr--;
288 if (ctxt->nodeInfoNr < 0)
289 return (NULL);
290 if (ctxt->nodeInfoNr > 0)
291 ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
292 else
293 ctxt->nodeInfo = NULL;
294 return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
295 }
296
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported.
 *
 * All of them expect a variable named `ctxt` (the parser context) to be in
 * scope at the point of use.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use them
 *
 *   CUR_PTR return the current pointer to the xmlChar to be parsed.
 *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 *           in UNICODE mode. This should be used internally by the parser
 *           only to compare to ASCII values otherwise it would break when
 *           running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used
 *           only to compare on ASCII based substring.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR
 *           it should be used only to compare on ASCII based substring.
 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
 *           strings without newlines within the parser. The argument is
 *           evaluated three times, so it must not have side effects.
 *
 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
 *
 *   CURRENT Returns the current char value, with the full decoding of
 *           UTF-8 if we are using this mode. It returns an int.
 *   NEXT    Skip to the next character, this does the proper decoding
 *           in UTF-8 mode. It also pop-up unfinished entities on the fly.
 *   NEXTL(l) Skip the current unicode character of l xmlChars long.
 *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
 *
 * The statement-like macros (SHRINK, GROW, NEXTL, COPY_BUF) are wrapped in
 * do { } while (0) so that each expands to exactly one statement and can be
 * used safely in an unbraced if/else. Use them as `MACRO;`.
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val)                                                       \
    (ctxt->nbChars += (val), ctxt->input->cur += (val),                 \
     ctxt->input->col += (val))

#define NXT(val) (ctxt->input->cur[(val)])

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur
#define BASE_PTR ctxt->input->base

/* Discard already consumed input once enough of it has accumulated. */
#define SHRINK do {                                                     \
    if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) &&    \
        (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK))       \
        xmlParserInputShrink(ctxt->input);                              \
  } while (0)

/* Pull more input when running low; does nothing in progressive mode. */
#define GROW do {                                                       \
    if ((ctxt->progressive == 0) &&                                     \
        (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))            \
        xmlParserInputGrow(ctxt->input, INPUT_CHUNK);                   \
  } while (0)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


#define NEXTL(l) do {                                                   \
    if (*(ctxt->input->cur) == '\n') {                                  \
        ctxt->input->line++; ctxt->input->col = 1;                      \
    } else ctxt->input->col++;                                          \
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;          \
  } while (0)

/************
    Disabled legacy tail of NEXTL, kept for reference:
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);	\
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

/* Append char v, which is l bytes long, to buffer b at index i. */
#define COPY_BUF(l,b,i,v) do {                                          \
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);                           \
    else (i) += xmlCopyChar((l), &(b)[(i)], (v));                       \
  } while (0)
377
378 /**
379 * htmlFindEncoding:
380 * @the HTML parser context
381 *
382 * Ty to find and encoding in the current data available in the input
383 * buffer this is needed to try to switch to the proper encoding when
384 * one face a character error.
385 * That's an heuristic, since it's operating outside of parsing it could
386 * try to use a meta which had been commented out, that's the reason it
387 * should only be used in case of error, not as a default.
388 *
389 * Returns an encoding string or NULL if not found, the string need to
390 * be freed
391 */
392 static xmlChar *
htmlFindEncoding(xmlParserCtxtPtr ctxt)393 htmlFindEncoding(xmlParserCtxtPtr ctxt) {
394 const xmlChar *start, *cur, *end;
395
396 if ((ctxt == NULL) || (ctxt->input == NULL) ||
397 (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
398 (ctxt->input->buf->encoder != NULL))
399 return(NULL);
400 if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
401 return(NULL);
402
403 start = ctxt->input->cur;
404 end = ctxt->input->end;
405 /* we also expect the input buffer to be zero terminated */
406 if (*end != 0)
407 return(NULL);
408
409 cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
410 if (cur == NULL)
411 return(NULL);
412 cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
413 if (cur == NULL)
414 return(NULL);
415 cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
416 if (cur == NULL)
417 return(NULL);
418 cur += 8;
419 start = cur;
420 while (((*cur >= 'A') && (*cur <= 'Z')) ||
421 ((*cur >= 'a') && (*cur <= 'z')) ||
422 ((*cur >= '0') && (*cur <= '9')) ||
423 (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
424 cur++;
425 if (cur == start)
426 return(NULL);
427 return(xmlStrndup(start, cur - start));
428 }
429
430 /**
431 * htmlCurrentChar:
432 * @ctxt: the HTML parser context
433 * @len: pointer to the length of the char read
434 *
435 * The current char value, if using UTF-8 this may actually span multiple
436 * bytes in the input buffer. Implement the end of line normalization:
437 * 2.11 End-of-Line Handling
438 * If the encoding is unspecified, in the case we find an ISO-Latin-1
439 * char, then the encoding converter is plugged in automatically.
440 *
441 * Returns the current char value and its length
442 */
443
444 static int
htmlCurrentChar(xmlParserCtxtPtr ctxt,int * len)445 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
446 if (ctxt->instate == XML_PARSER_EOF)
447 return(0);
448
449 if (ctxt->token != 0) {
450 *len = 0;
451 return(ctxt->token);
452 }
453 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
454 /*
455 * We are supposed to handle UTF8, check it's valid
456 * From rfc2044: encoding of the Unicode values on UTF-8:
457 *
458 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
459 * 0000 0000-0000 007F 0xxxxxxx
460 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
461 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
462 *
463 * Check for the 0x110000 limit too
464 */
465 const unsigned char *cur = ctxt->input->cur;
466 unsigned char c;
467 unsigned int val;
468
469 c = *cur;
470 if (c & 0x80) {
471 if (cur[1] == 0) {
472 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
473 cur = ctxt->input->cur;
474 }
475 if ((cur[1] & 0xc0) != 0x80)
476 goto encoding_error;
477 if ((c & 0xe0) == 0xe0) {
478
479 if (cur[2] == 0) {
480 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
481 cur = ctxt->input->cur;
482 }
483 if ((cur[2] & 0xc0) != 0x80)
484 goto encoding_error;
485 if ((c & 0xf0) == 0xf0) {
486 if (cur[3] == 0) {
487 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
488 cur = ctxt->input->cur;
489 }
490 if (((c & 0xf8) != 0xf0) ||
491 ((cur[3] & 0xc0) != 0x80))
492 goto encoding_error;
493 /* 4-byte code */
494 *len = 4;
495 val = (cur[0] & 0x7) << 18;
496 val |= (cur[1] & 0x3f) << 12;
497 val |= (cur[2] & 0x3f) << 6;
498 val |= cur[3] & 0x3f;
499 } else {
500 /* 3-byte code */
501 *len = 3;
502 val = (cur[0] & 0xf) << 12;
503 val |= (cur[1] & 0x3f) << 6;
504 val |= cur[2] & 0x3f;
505 }
506 } else {
507 /* 2-byte code */
508 *len = 2;
509 val = (cur[0] & 0x1f) << 6;
510 val |= cur[1] & 0x3f;
511 }
512 if (!IS_CHAR(val)) {
513 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
514 "Char 0x%X out of allowed range\n", val);
515 }
516 return(val);
517 } else {
518 if ((*ctxt->input->cur == 0) &&
519 (ctxt->input->cur < ctxt->input->end)) {
520 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
521 "Char 0x%X out of allowed range\n", 0);
522 *len = 1;
523 return(' ');
524 }
525 /* 1-byte code */
526 *len = 1;
527 return((int) *ctxt->input->cur);
528 }
529 }
530 /*
531 * Assume it's a fixed length encoding (1) with
532 * a compatible encoding for the ASCII set, since
533 * XML constructs only use < 128 chars
534 */
535 *len = 1;
536 if ((int) *ctxt->input->cur < 0x80)
537 return((int) *ctxt->input->cur);
538
539 /*
540 * Humm this is bad, do an automatic flow conversion
541 */
542 {
543 xmlChar * guess;
544 xmlCharEncodingHandlerPtr handler;
545
546 guess = htmlFindEncoding(ctxt);
547 if (guess == NULL) {
548 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
549 } else {
550 if (ctxt->input->encoding != NULL)
551 xmlFree((xmlChar *) ctxt->input->encoding);
552 ctxt->input->encoding = guess;
553 handler = xmlFindCharEncodingHandler((const char *) guess);
554 if (handler != NULL) {
555 xmlSwitchToEncoding(ctxt, handler);
556 } else {
557 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
558 "Unsupported encoding %s", guess, NULL);
559 }
560 }
561 ctxt->charset = XML_CHAR_ENCODING_UTF8;
562 }
563
564 return(xmlCurrentChar(ctxt, len));
565
566 encoding_error:
567 /*
568 * If we detect an UTF8 error that probably mean that the
569 * input encoding didn't get properly advertized in the
570 * declaration header. Report the error and switch the encoding
571 * to ISO-Latin-1 (if you don't like this policy, just declare the
572 * encoding !)
573 */
574 {
575 char buffer[150];
576
577 if (ctxt->input->end - ctxt->input->cur >= 4) {
578 snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
579 ctxt->input->cur[0], ctxt->input->cur[1],
580 ctxt->input->cur[2], ctxt->input->cur[3]);
581 } else {
582 snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
583 }
584 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
585 "Input is not proper UTF-8, indicate encoding !\n",
586 BAD_CAST buffer, NULL);
587 }
588
589 ctxt->charset = XML_CHAR_ENCODING_8859_1;
590 *len = 1;
591 return((int) *ctxt->input->cur);
592 }
593
594 /**
595 * htmlSkipBlankChars:
596 * @ctxt: the HTML parser context
597 *
598 * skip all blanks character found at that point in the input streams.
599 *
600 * Returns the number of space chars skipped
601 */
602
603 static int
htmlSkipBlankChars(xmlParserCtxtPtr ctxt)604 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
605 int res = 0;
606
607 while (IS_BLANK_CH(*(ctxt->input->cur))) {
608 if ((*ctxt->input->cur == 0) &&
609 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
610 xmlPopInput(ctxt);
611 } else {
612 if (*(ctxt->input->cur) == '\n') {
613 ctxt->input->line++; ctxt->input->col = 1;
614 } else ctxt->input->col++;
615 ctxt->input->cur++;
616 ctxt->nbChars++;
617 if (*ctxt->input->cur == 0)
618 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
619 }
620 res++;
621 }
622 return(res);
623 }
624
625
626
627 /************************************************************************
628 * *
629 * The list of HTML elements and their properties *
630 * *
631 ************************************************************************/
632
/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *             3 means the tag is stylistic and should be closed easily
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
 *	, subElements , impliedsubelt , Attributes, userdata
 */
645
/*
 * Definitions and a couple of vars for HTML Elements.
 *
 * Each list macro expands to a comma separated run of element names; the
 * matching NB_* macro is the number of names in that run. The NB_* sums are
 * parenthesized so they can be used safely inside larger expressions.
 */

#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


/* NULL terminated lists of element names */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
679
680
/*
 * ... and for HTML Attributes
 *
 * Same convention as for the elements: each list macro expands to a comma
 * separated run of attribute names and NB_* is the number of names in it.
 */

#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

/* NULL terminated lists of attribute names */
static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;
700
701
702 /* Other declarations that should go inline ... */
703 static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
704 "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
705 "tabindex", "onfocus", "onblur", NULL } ;
706 static const char* const target_attr[] = { "target", NULL } ;
707 static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
708 static const char* const alt_attr[] = { "alt", NULL } ;
709 static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
710 static const char* const href_attrs[] = { "href", NULL } ;
711 static const char* const clear_attrs[] = { "clear", NULL } ;
712 static const char* const inline_p[] = { INLINE, "p", NULL } ;
713
714 static const char* const flow_param[] = { FLOW, "param", NULL } ;
715 static const char* const applet_attrs[] = { COREATTRS , "codebase",
716 "archive", "alt", "name", "height", "width", "align",
717 "hspace", "vspace", NULL } ;
718 static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
719 "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
720 static const char* const basefont_attrs[] =
721 { "id", "size", "color", "face", NULL } ;
722 static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
723 static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
724 static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
725 static const char* const body_depr[] = { "background", "bgcolor", "text",
726 "link", "vlink", "alink", NULL } ;
727 static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
728 "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
729
730
731 static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
732 static const char* const col_elt[] = { "col", NULL } ;
733 static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
734 static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
735 static const char* const dl_contents[] = { "dt", "dd", NULL } ;
736 static const char* const compact_attr[] = { "compact", NULL } ;
737 static const char* const label_attr[] = { "label", NULL } ;
738 static const char* const fieldset_contents[] = { FLOW, "legend" } ;
739 static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
740 static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
741 static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
742 static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
743 static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
744 static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
745 static const char* const head_attrs[] = { I18N, "profile", NULL } ;
746 static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
747 static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
748 static const char* const version_attr[] = { "version", NULL } ;
749 static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
750 static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
751 static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
752 static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
753 static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
754 static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
755 static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
756 static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
757 static const char* const align_attr[] = { "align", NULL } ;
758 static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
759 static const char* const map_contents[] = { BLOCK, "area", NULL } ;
760 static const char* const name_attr[] = { "name", NULL } ;
761 static const char* const action_attr[] = { "action", NULL } ;
762 static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
763 static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
764 static const char* const content_attr[] = { "content", NULL } ;
765 static const char* const type_attr[] = { "type", NULL } ;
766 static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
767 static const char* const object_contents[] = { FLOW, "param", NULL } ;
768 static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
769 static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
770 static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
771 static const char* const option_elt[] = { "option", NULL } ;
772 static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
773 static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
774 static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
775 static const char* const width_attr[] = { "width", NULL } ;
776 static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
777 static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
778 static const char* const language_attr[] = { "language", NULL } ;
779 static const char* const select_content[] = { "optgroup", "option", NULL } ;
780 static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
781 static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
782 static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
783 static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
784 static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
785 static const char* const tr_elt[] = { "tr", NULL } ;
786 static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
787 static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
788 static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
789 static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
790 static const char* const tr_contents[] = { "th", "td", NULL } ;
791 static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
792 static const char* const li_elt[] = { "li", NULL } ;
793 static const char* const ul_depr[] = { "type", "compact", NULL} ;
794 static const char* const dir_attr[] = { "dir", NULL} ;
795
796 #define DECL (const char**)
797
798 static const htmlElemDesc
799 html40ElementTable[] = {
800 { "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
801 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
802 },
803 { "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
804 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
805 },
806 { "acronym", 0, 0, 0, 0, 0, 0, 1, "",
807 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
808 },
809 { "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
810 DECL inline_p , NULL , DECL html_attrs, NULL, NULL
811 },
812 { "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
813 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
814 },
815 { "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
816 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
817 },
818 { "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
819 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
820 },
821 { "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
822 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
823 },
824 { "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
825 EMPTY , NULL , NULL, DECL basefont_attrs, NULL
826 },
827 { "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
828 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
829 },
830 { "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
831 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
832 },
833 { "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
834 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
835 },
836 { "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
837 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
838 },
839 { "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
840 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
841 },
842 { "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
843 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
844 },
845 { "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
846 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
847 },
848 { "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
849 DECL html_flow , NULL , NULL, DECL html_attrs, NULL
850 },
851 { "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
852 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
853 },
854 { "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
855 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
856 },
857 { "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
858 EMPTY , NULL , DECL col_attrs , NULL, NULL
859 },
860 { "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
861 DECL col_elt , "col" , DECL col_attrs , NULL, NULL
862 },
863 { "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
864 DECL html_flow , NULL , DECL html_attrs, NULL, NULL
865 },
866 { "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
867 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
868 },
869 { "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
870 DECL html_inline , NULL , DECL html_attrs, NULL, NULL
871 },
872 { "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
873 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
874 },
875 { "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
876 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
877 },
878 { "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
879 DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
880 },
881 { "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
882 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
883 },
884 { "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
885 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
886 },
887 { "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
888 EMPTY, NULL, DECL embed_attrs, NULL, NULL
889 },
890 { "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
891 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
892 },
893 { "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
894 DECL html_inline, NULL, NULL, DECL font_attrs, NULL
895 },
896 { "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
897 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
898 },
899 { "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
900 EMPTY, NULL, NULL, DECL frame_attrs, NULL
901 },
902 { "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
903 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
904 },
905 { "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
906 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
907 },
908 { "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
909 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
910 },
911 { "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
912 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
913 },
914 { "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
915 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
916 },
917 { "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
918 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
919 },
920 { "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
921 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
922 },
923 { "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
924 DECL head_contents, NULL, DECL head_attrs, NULL, NULL
925 },
926 { "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
927 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
928 },
929 { "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
930 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
931 },
932 { "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
933 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
934 },
935 { "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
936 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
937 },
938 { "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
939 EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
940 },
941 { "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
942 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
943 },
944 { "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
945 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
946 },
947 { "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
948 EMPTY, NULL, NULL, DECL prompt_attrs, NULL
949 },
950 { "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
951 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
952 },
953 { "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
954 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
955 },
956 { "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
957 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
958 },
959 { "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
960 DECL html_flow, NULL, DECL html_attrs, NULL, NULL
961 },
962 { "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
963 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
964 },
965 { "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
966 DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
967 },
968 { "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
969 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
970 },
971 { "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
972 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
973 },
974 { "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
975 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
976 },
977 { "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
978 DECL html_flow, "div", DECL html_attrs, NULL, NULL
979 },
980 { "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
981 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
982 },
983 { "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
984 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
985 },
986 { "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
987 DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
988 },
989 { "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
990 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
991 },
992 { "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
993 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
994 },
995 { "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
996 EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
997 },
998 { "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
999 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
1000 },
1001 { "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
1002 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
1003 },
1004 { "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
1005 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1006 },
1007 { "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
1008 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1009 },
1010 { "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
1011 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
1012 },
1013 { "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
1014 DECL select_content, NULL, DECL select_attrs, NULL, NULL
1015 },
1016 { "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
1017 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1018 },
1019 { "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
1020 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1021 },
1022 { "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
1023 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1024 },
1025 { "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
1026 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1027 },
1028 { "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
1029 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
1030 },
1031 { "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
1032 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1033 },
1034 { "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
1035 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1036 },
1037 { "table", 0, 0, 0, 0, 0, 0, 0, "",
1038 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
1039 },
1040 { "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
1041 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1042 },
1043 { "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
1044 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1045 },
1046 { "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
1047 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
1048 },
1049 { "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
1050 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1051 },
1052 { "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
1053 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1054 },
1055 { "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
1056 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1057 },
1058 { "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
1059 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
1060 },
1061 { "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
1062 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
1063 },
1064 { "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
1065 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1066 },
1067 { "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
1068 DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1069 },
1070 { "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
1071 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
1072 },
1073 { "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
1074 DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1075 }
1076 };
1077
1078 /*
1079 * start tags that imply the end of current element
1080 */
1081 static const char * const htmlStartClose[] = {
1082 "form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
1083 "dl", "ul", "ol", "menu", "dir", "address", "pre",
1084 "listing", "xmp", "head", NULL,
1085 "head", "p", NULL,
1086 "title", "p", NULL,
1087 "body", "head", "style", "link", "title", "p", NULL,
1088 "frameset", "head", "style", "link", "title", "p", NULL,
1089 "li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
1090 "pre", "listing", "xmp", "head", "li", NULL,
1091 "hr", "p", "head", NULL,
1092 "h1", "p", "head", NULL,
1093 "h2", "p", "head", NULL,
1094 "h3", "p", "head", NULL,
1095 "h4", "p", "head", NULL,
1096 "h5", "p", "head", NULL,
1097 "h6", "p", "head", NULL,
1098 "dir", "p", "head", NULL,
1099 "address", "p", "head", "ul", NULL,
1100 "pre", "p", "head", "ul", NULL,
1101 "listing", "p", "head", NULL,
1102 "xmp", "p", "head", NULL,
1103 "blockquote", "p", "head", NULL,
1104 "dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
1105 "xmp", "head", NULL,
1106 "dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
1107 "head", "dd", NULL,
1108 "dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
1109 "head", "dt", NULL,
1110 "ul", "p", "head", "ol", "menu", "dir", "address", "pre",
1111 "listing", "xmp", NULL,
1112 "ol", "p", "head", "ul", NULL,
1113 "menu", "p", "head", "ul", NULL,
1114 "p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
1115 "div", "p", "head", NULL,
1116 "noscript", "script", NULL,
1117 "center", "font", "b", "i", "p", "head", NULL,
1118 "a", "a", "head", NULL,
1119 "caption", "p", NULL,
1120 "colgroup", "caption", "colgroup", "col", "p", NULL,
1121 "col", "caption", "col", "p", NULL,
1122 "table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
1123 "listing", "xmp", "a", NULL,
1124 "th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
1125 "td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
1126 "tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
1127 "thead", "caption", "col", "colgroup", NULL,
1128 "tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
1129 "tbody", "p", NULL,
1130 "tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
1131 "tfoot", "tbody", "p", NULL,
1132 "optgroup", "option", NULL,
1133 "option", "option", NULL,
1134 "fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
1135 "pre", "listing", "xmp", "a", NULL,
1136 /* most tags in in FONTSTYLE, PHRASE and SPECIAL should close <head> */
1137 "tt", "head", NULL,
1138 "i", "head", NULL,
1139 "b", "head", NULL,
1140 "u", "head", NULL,
1141 "s", "head", NULL,
1142 "strike", "head", NULL,
1143 "big", "head", NULL,
1144 "small", "head", NULL,
1145
1146 "em", "head", NULL,
1147 "strong", "head", NULL,
1148 "dfn", "head", NULL,
1149 "code", "head", NULL,
1150 "samp", "head", NULL,
1151 "kbd", "head", NULL,
1152 "var", "head", NULL,
1153 "cite", "head", NULL,
1154 "abbr", "head", NULL,
1155 "acronym", "head", NULL,
1156
1157 /* "a" */
1158 "img", "head", NULL,
1159 /* "applet" */
1160 /* "embed" */
1161 /* "object" */
1162 "font", "head", NULL,
1163 /* "basefont" */
1164 "br", "head", NULL,
1165 /* "script" */
1166 "map", "head", NULL,
1167 "q", "head", NULL,
1168 "sub", "head", NULL,
1169 "sup", "head", NULL,
1170 "span", "head", NULL,
1171 "bdo", "head", NULL,
1172 "iframe", "head", NULL,
1173 NULL
1174 };
1175
/*
 * NULL-terminated list of HTML elements that are not supposed to hold
 * character data directly. htmlCheckParagraph() implies a <p> when the
 * current element is one of them.
 *
 * TODO: extend this list by reading the HTML SGML DTD on implied paragraphs.
 */
static const char *const htmlNoContentElements[] = { "html", "head", NULL };
1188
/*
 * The HTML attributes whose content type is %Script;.
 * The array has no terminator; htmlIsScriptAttribute() bounds its scan
 * with sizeof.
 *
 * NOTE: when adding an entry, check htmlIsScriptAttribute(): it rejects
 * any name that does not start with "on" before looking at this list.
 */
static const char *const htmlScriptAttributes[] = {
    /* mouse */
    "onclick", "ondblclick", "onmousedown", "onmouseup",
    "onmouseover", "onmousemove", "onmouseout",
    /* keyboard */
    "onkeypress", "onkeydown", "onkeyup",
    /* loading */
    "onload", "onunload",
    /* focus */
    "onfocus", "onblur",
    /* forms and controls */
    "onsubmit", "onreset", "onchange", "onselect"
};
1214
/*
 * End-tag priorities, used to cope with broken HTML pages.
 * Giving elements different priorities lets the parser decide what to do
 * with an unexpected end tag: an end tag may only close elements whose
 * priority is lower than or equal to its own.
 */

/* One (element name, priority) pair. */
typedef struct {
    const char *name;
    int priority;
} elementPriority;

/*
 * The last entry has a NULL name; it ends the table and carries the
 * priority returned for every element not listed above it.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 } /* Default priority */
};
1242
/*
 * Lookup index over htmlStartClose: each used slot points at the first
 * string of one group. It is filled by htmlInitAutoClose(); unused slots
 * stay NULL.
 */
static const char **htmlStartCloseIndex[100];

/* Set to 1 once htmlInitAutoClose() has filled htmlStartCloseIndex. */
static int htmlStartCloseIndexinitialized = 0;
1245
1246 /************************************************************************
1247 * *
1248 * functions to handle HTML specific data *
1249 * *
1250 ************************************************************************/
1251
1252 /**
1253 * htmlInitAutoClose:
1254 *
1255 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1256 * This is not reentrant. Call xmlInitParser() once before processing in
1257 * case of use in multithreaded programs.
1258 */
1259 void
htmlInitAutoClose(void)1260 htmlInitAutoClose(void) {
1261 int indx, i = 0;
1262
1263 if (htmlStartCloseIndexinitialized) return;
1264
1265 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1266 indx = 0;
1267 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
1268 htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
1269 while (htmlStartClose[i] != NULL) i++;
1270 i++;
1271 }
1272 htmlStartCloseIndexinitialized = 1;
1273 }
1274
1275 /**
1276 * htmlTagLookup:
1277 * @tag: The tag name in lowercase
1278 *
1279 * Lookup the HTML tag in the ElementTable
1280 *
1281 * Returns the related htmlElemDescPtr or NULL if not found.
1282 */
1283 const htmlElemDesc *
htmlTagLookup(const xmlChar * tag)1284 htmlTagLookup(const xmlChar *tag) {
1285 unsigned int i;
1286
1287 for (i = 0; i < (sizeof(html40ElementTable) /
1288 sizeof(html40ElementTable[0]));i++) {
1289 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
1290 return((htmlElemDescPtr) &html40ElementTable[i]);
1291 }
1292 return(NULL);
1293 }
1294
1295 /**
1296 * htmlGetEndPriority:
1297 * @name: The name of the element to look up the priority for.
1298 *
1299 * Return value: The "endtag" priority.
1300 **/
1301 static int
htmlGetEndPriority(const xmlChar * name)1302 htmlGetEndPriority (const xmlChar *name) {
1303 int i = 0;
1304
1305 while ((htmlEndPriority[i].name != NULL) &&
1306 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1307 i++;
1308
1309 return(htmlEndPriority[i].priority);
1310 }
1311
1312
1313 /**
1314 * htmlCheckAutoClose:
1315 * @newtag: The new tag name
1316 * @oldtag: The old tag name
1317 *
1318 * Checks whether the new tag is one of the registered valid tags for
1319 * closing old.
1320 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1321 *
1322 * Returns 0 if no, 1 if yes.
1323 */
1324 static int
htmlCheckAutoClose(const xmlChar * newtag,const xmlChar * oldtag)1325 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1326 {
1327 int i, indx;
1328 const char **closed = NULL;
1329
1330 if (htmlStartCloseIndexinitialized == 0)
1331 htmlInitAutoClose();
1332
1333 /* inefficient, but not a big deal */
1334 for (indx = 0; indx < 100; indx++) {
1335 closed = htmlStartCloseIndex[indx];
1336 if (closed == NULL)
1337 return (0);
1338 if (xmlStrEqual(BAD_CAST * closed, newtag))
1339 break;
1340 }
1341
1342 i = closed - htmlStartClose;
1343 i++;
1344 while (htmlStartClose[i] != NULL) {
1345 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
1346 return (1);
1347 }
1348 i++;
1349 }
1350 return (0);
1351 }
1352
1353 /**
1354 * htmlAutoCloseOnClose:
1355 * @ctxt: an HTML parser context
1356 * @newtag: The new tag name
1357 * @force: force the tag closure
1358 *
1359 * The HTML DTD allows an ending tag to implicitly close other tags.
1360 */
1361 static void
htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1362 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1363 {
1364 const htmlElemDesc *info;
1365 int i, priority;
1366
1367 priority = htmlGetEndPriority(newtag);
1368
1369 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1370
1371 if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1372 break;
1373 /*
1374 * A missplaced endtag can only close elements with lower
1375 * or equal priority, so if we find an element with higher
1376 * priority before we find an element with
1377 * matching name, we just ignore this endtag
1378 */
1379 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1380 return;
1381 }
1382 if (i < 0)
1383 return;
1384
1385 while (!xmlStrEqual(newtag, ctxt->name)) {
1386 info = htmlTagLookup(ctxt->name);
1387 if ((info != NULL) && (info->endTag == 3)) {
1388 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1389 "Opening and ending tag mismatch: %s and %s\n",
1390 newtag, ctxt->name);
1391 }
1392 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1393 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1394 htmlnamePop(ctxt);
1395 }
1396 }
1397
1398 /**
1399 * htmlAutoCloseOnEnd:
1400 * @ctxt: an HTML parser context
1401 *
1402 * Close all remaining tags at the end of the stream
1403 */
1404 static void
htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)1405 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1406 {
1407 int i;
1408
1409 if (ctxt->nameNr == 0)
1410 return;
1411 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1412 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1413 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1414 htmlnamePop(ctxt);
1415 }
1416 }
1417
1418 /**
1419 * htmlAutoClose:
1420 * @ctxt: an HTML parser context
1421 * @newtag: The new tag name or NULL
1422 *
1423 * The HTML DTD allows a tag to implicitly close other tags.
1424 * The list is kept in htmlStartClose array. This function is
1425 * called when a new tag has been detected and generates the
1426 * appropriates closes if possible/needed.
1427 * If newtag is NULL this mean we are at the end of the resource
1428 * and we should check
1429 */
1430 static void
htmlAutoClose(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1431 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1432 {
1433 while ((newtag != NULL) && (ctxt->name != NULL) &&
1434 (htmlCheckAutoClose(newtag, ctxt->name))) {
1435 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1436 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1437 htmlnamePop(ctxt);
1438 }
1439 if (newtag == NULL) {
1440 htmlAutoCloseOnEnd(ctxt);
1441 return;
1442 }
1443 while ((newtag == NULL) && (ctxt->name != NULL) &&
1444 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1445 (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1446 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1447 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1448 ctxt->sax->endElement(ctxt->userData, ctxt->name);
1449 htmlnamePop(ctxt);
1450 }
1451 }
1452
1453 /**
1454 * htmlAutoCloseTag:
1455 * @doc: the HTML document
1456 * @name: The tag name
1457 * @elem: the HTML element
1458 *
1459 * The HTML DTD allows a tag to implicitly close other tags.
1460 * The list is kept in htmlStartClose array. This function checks
1461 * if the element or one of it's children would autoclose the
1462 * given tag.
1463 *
1464 * Returns 1 if autoclose, 0 otherwise
1465 */
1466 int
htmlAutoCloseTag(htmlDocPtr doc,const xmlChar * name,htmlNodePtr elem)1467 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1468 htmlNodePtr child;
1469
1470 if (elem == NULL) return(1);
1471 if (xmlStrEqual(name, elem->name)) return(0);
1472 if (htmlCheckAutoClose(elem->name, name)) return(1);
1473 child = elem->children;
1474 while (child != NULL) {
1475 if (htmlAutoCloseTag(doc, name, child)) return(1);
1476 child = child->next;
1477 }
1478 return(0);
1479 }
1480
1481 /**
1482 * htmlIsAutoClosed:
1483 * @doc: the HTML document
1484 * @elem: the HTML element
1485 *
1486 * The HTML DTD allows a tag to implicitly close other tags.
1487 * The list is kept in htmlStartClose array. This function checks
1488 * if a tag is autoclosed by one of it's child
1489 *
1490 * Returns 1 if autoclosed, 0 otherwise
1491 */
1492 int
htmlIsAutoClosed(htmlDocPtr doc,htmlNodePtr elem)1493 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1494 htmlNodePtr child;
1495
1496 if (elem == NULL) return(1);
1497 child = elem->children;
1498 while (child != NULL) {
1499 if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1500 child = child->next;
1501 }
1502 return(0);
1503 }
1504
1505 /**
1506 * htmlCheckImplied:
1507 * @ctxt: an HTML parser context
1508 * @newtag: The new tag name
1509 *
1510 * The HTML DTD allows a tag to exists only implicitly
1511 * called when a new tag has been detected and generates the
1512 * appropriates implicit tags if missing
1513 */
1514 static void
htmlCheckImplied(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1515 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1516 int i;
1517
1518 if (ctxt->options & HTML_PARSE_NOIMPLIED)
1519 return;
1520 if (!htmlOmittedDefaultValue)
1521 return;
1522 if (xmlStrEqual(newtag, BAD_CAST"html"))
1523 return;
1524 if (ctxt->nameNr <= 0) {
1525 htmlnamePush(ctxt, BAD_CAST"html");
1526 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1527 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1528 }
1529 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1530 return;
1531 if ((ctxt->nameNr <= 1) &&
1532 ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1533 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1534 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1535 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1536 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1537 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1538 if (ctxt->html >= 3) {
1539 /* we already saw or generated an <head> before */
1540 return;
1541 }
1542 /*
1543 * dropped OBJECT ... i you put it first BODY will be
1544 * assumed !
1545 */
1546 htmlnamePush(ctxt, BAD_CAST"head");
1547 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1548 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1549 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1550 (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1551 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1552 if (ctxt->html >= 10) {
1553 /* we already saw or generated a <body> before */
1554 return;
1555 }
1556 for (i = 0;i < ctxt->nameNr;i++) {
1557 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1558 return;
1559 }
1560 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1561 return;
1562 }
1563 }
1564
1565 htmlnamePush(ctxt, BAD_CAST"body");
1566 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1567 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1568 }
1569 }
1570
1571 /**
1572 * htmlCheckParagraph
1573 * @ctxt: an HTML parser context
1574 *
1575 * Check whether a p element need to be implied before inserting
1576 * characters in the current element.
1577 *
1578 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1579 * in case of error.
1580 */
1581
1582 static int
htmlCheckParagraph(htmlParserCtxtPtr ctxt)1583 htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1584 const xmlChar *tag;
1585 int i;
1586
1587 if (ctxt == NULL)
1588 return(-1);
1589 tag = ctxt->name;
1590 if (tag == NULL) {
1591 htmlAutoClose(ctxt, BAD_CAST"p");
1592 htmlCheckImplied(ctxt, BAD_CAST"p");
1593 htmlnamePush(ctxt, BAD_CAST"p");
1594 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1595 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1596 return(1);
1597 }
1598 if (!htmlOmittedDefaultValue)
1599 return(0);
1600 for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1601 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1602 htmlAutoClose(ctxt, BAD_CAST"p");
1603 htmlCheckImplied(ctxt, BAD_CAST"p");
1604 htmlnamePush(ctxt, BAD_CAST"p");
1605 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1606 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1607 return(1);
1608 }
1609 }
1610 return(0);
1611 }
1612
1613 /**
1614 * htmlIsScriptAttribute:
1615 * @name: an attribute name
1616 *
1617 * Check if an attribute is of content type Script
1618 *
1619 * Returns 1 is the attribute is a script 0 otherwise
1620 */
1621 int
htmlIsScriptAttribute(const xmlChar * name)1622 htmlIsScriptAttribute(const xmlChar *name) {
1623 unsigned int i;
1624
1625 if (name == NULL)
1626 return(0);
1627 /*
1628 * all script attributes start with 'on'
1629 */
1630 if ((name[0] != 'o') || (name[1] != 'n'))
1631 return(0);
1632 for (i = 0;
1633 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1634 i++) {
1635 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1636 return(1);
1637 }
1638 return(0);
1639 }
1640
1641 /************************************************************************
1642 * *
1643 * The list of HTML predefined entities *
1644 * *
1645 ************************************************************************/
1646
1647
1648 static const htmlEntityDesc html40EntitiesTable[] = {
1649 /*
1650 * the 4 absolute ones, plus apostrophe.
1651 */
1652 { 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1653 { 38, "amp", "ampersand, U+0026 ISOnum" },
1654 { 39, "apos", "single quote" },
1655 { 60, "lt", "less-than sign, U+003C ISOnum" },
1656 { 62, "gt", "greater-than sign, U+003E ISOnum" },
1657
1658 /*
1659 * A bunch still in the 128-255 range
1660 * Replacing them depend really on the charset used.
1661 */
1662 { 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1663 { 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1664 { 162, "cent", "cent sign, U+00A2 ISOnum" },
1665 { 163, "pound","pound sign, U+00A3 ISOnum" },
1666 { 164, "curren","currency sign, U+00A4 ISOnum" },
1667 { 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1668 { 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1669 { 167, "sect", "section sign, U+00A7 ISOnum" },
1670 { 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1671 { 169, "copy", "copyright sign, U+00A9 ISOnum" },
1672 { 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1673 { 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1674 { 172, "not", "not sign, U+00AC ISOnum" },
1675 { 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1676 { 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1677 { 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1678 { 176, "deg", "degree sign, U+00B0 ISOnum" },
1679 { 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1680 { 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1681 { 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1682 { 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1683 { 181, "micro","micro sign, U+00B5 ISOnum" },
1684 { 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1685 { 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1686 { 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1687 { 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1688 { 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1689 { 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1690 { 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1691 { 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1692 { 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1693 { 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1694 { 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1695 { 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1696 { 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1697 { 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1698 { 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1699 { 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1700 { 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1701 { 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1702 { 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1703 { 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1704 { 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1705 { 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1706 { 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1707 { 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1708 { 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1709 { 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1710 { 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1711 { 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1712 { 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1713 { 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1714 { 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1715 { 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1716 { 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1717 { 215, "times","multiplication sign, U+00D7 ISOnum" },
1718 { 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1719 { 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1720 { 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1721 { 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1722 { 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1723 { 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1724 { 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1725 { 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1726 { 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1727 { 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1728 { 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1729 { 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1730 { 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1731 { 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1732 { 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1733 { 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1734 { 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1735 { 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1736 { 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1737 { 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1738 { 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1739 { 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1740 { 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1741 { 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1742 { 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1743 { 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1744 { 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1745 { 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1746 { 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1747 { 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1748 { 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1749 { 247, "divide","division sign, U+00F7 ISOnum" },
1750 { 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1751 { 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1752 { 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1753 { 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1754 { 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1755 { 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1756 { 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1757 { 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1758
1759 { 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1760 { 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1761 { 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1762 { 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1763 { 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1764
1765 /*
1766 * Anything below should really be kept as entities references
1767 */
1768 { 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1769
1770 { 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1771 { 732, "tilde","small tilde, U+02DC ISOdia" },
1772
1773 { 913, "Alpha","greek capital letter alpha, U+0391" },
1774 { 914, "Beta", "greek capital letter beta, U+0392" },
1775 { 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1776 { 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1777 { 917, "Epsilon","greek capital letter epsilon, U+0395" },
1778 { 918, "Zeta", "greek capital letter zeta, U+0396" },
1779 { 919, "Eta", "greek capital letter eta, U+0397" },
1780 { 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1781 { 921, "Iota", "greek capital letter iota, U+0399" },
1782 { 922, "Kappa","greek capital letter kappa, U+039A" },
1783 { 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1784 { 924, "Mu", "greek capital letter mu, U+039C" },
1785 { 925, "Nu", "greek capital letter nu, U+039D" },
1786 { 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1787 { 927, "Omicron","greek capital letter omicron, U+039F" },
1788 { 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1789 { 929, "Rho", "greek capital letter rho, U+03A1" },
1790 { 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1791 { 932, "Tau", "greek capital letter tau, U+03A4" },
1792 { 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1793 { 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1794 { 935, "Chi", "greek capital letter chi, U+03A7" },
1795 { 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1796 { 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1797
1798 { 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1799 { 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1800 { 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1801 { 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1802 { 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1803 { 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1804 { 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1805 { 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1806 { 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1807 { 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1808 { 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1809 { 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1810 { 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1811 { 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1812 { 959, "omicron","greek small letter omicron, U+03BF NEW" },
1813 { 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1814 { 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1815 { 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1816 { 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1817 { 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1818 { 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1819 { 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1820 { 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1821 { 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1822 { 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1823 { 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1824 { 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1825 { 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1826
1827 { 8194, "ensp", "en space, U+2002 ISOpub" },
1828 { 8195, "emsp", "em space, U+2003 ISOpub" },
1829 { 8201, "thinsp","thin space, U+2009 ISOpub" },
1830 { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1831 { 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1832 { 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1833 { 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1834 { 8211, "ndash","en dash, U+2013 ISOpub" },
1835 { 8212, "mdash","em dash, U+2014 ISOpub" },
1836 { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1837 { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1838 { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1839 { 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1840 { 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1841 { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1842 { 8224, "dagger","dagger, U+2020 ISOpub" },
1843 { 8225, "Dagger","double dagger, U+2021 ISOpub" },
1844
1845 { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1846 { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1847
1848 { 8240, "permil","per mille sign, U+2030 ISOtech" },
1849
1850 { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1851 { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1852
1853 { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1854 { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1855
1856 { 8254, "oline","overline = spacing overscore, U+203E NEW" },
1857 { 8260, "frasl","fraction slash, U+2044 NEW" },
1858
1859 { 8364, "euro", "euro sign, U+20AC NEW" },
1860
1861 { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1862 { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1863 { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1864 { 8482, "trade","trade mark sign, U+2122 ISOnum" },
1865 { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1866 { 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1867 { 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1868 { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1869 { 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1870 { 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1871 { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1872 { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1873 { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1874 { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1875 { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1876 { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1877
1878 { 8704, "forall","for all, U+2200 ISOtech" },
1879 { 8706, "part", "partial differential, U+2202 ISOtech" },
1880 { 8707, "exist","there exists, U+2203 ISOtech" },
1881 { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1882 { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1883 { 8712, "isin", "element of, U+2208 ISOtech" },
1884 { 8713, "notin","not an element of, U+2209 ISOtech" },
1885 { 8715, "ni", "contains as member, U+220B ISOtech" },
1886 { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1887 { 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
1888 { 8722, "minus","minus sign, U+2212 ISOtech" },
1889 { 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1890 { 8730, "radic","square root = radical sign, U+221A ISOtech" },
1891 { 8733, "prop", "proportional to, U+221D ISOtech" },
1892 { 8734, "infin","infinity, U+221E ISOtech" },
1893 { 8736, "ang", "angle, U+2220 ISOamso" },
1894 { 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1895 { 8744, "or", "logical or = vee, U+2228 ISOtech" },
1896 { 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1897 { 8746, "cup", "union = cup, U+222A ISOtech" },
1898 { 8747, "int", "integral, U+222B ISOtech" },
1899 { 8756, "there4","therefore, U+2234 ISOtech" },
1900 { 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1901 { 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1902 { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1903 { 8800, "ne", "not equal to, U+2260 ISOtech" },
1904 { 8801, "equiv","identical to, U+2261 ISOtech" },
1905 { 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1906 { 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1907 { 8834, "sub", "subset of, U+2282 ISOtech" },
1908 { 8835, "sup", "superset of, U+2283 ISOtech" },
1909 { 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1910 { 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1911 { 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1912 { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1913 { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1914 { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1915 { 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1916 { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1917 { 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1918 { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1919 { 8971, "rfloor","right floor, U+230B ISOamsc" },
1920 { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1921 { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1922 { 9674, "loz", "lozenge, U+25CA ISOpub" },
1923
1924 { 9824, "spades","black spade suit, U+2660 ISOpub" },
1925 { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1926 { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1927 { 9830, "diams","black diamond suit, U+2666 ISOpub" },
1928
1929 };
1930
1931 /************************************************************************
1932 * *
1933 * Commodity functions to handle entities *
1934 * *
1935 ************************************************************************/
1936
/*
 * growBuffer:
 *
 * Double the capacity of the xmlChar array @buffer.  The capacity is kept
 * in a companion variable whose name is the buffer's name followed by
 * "_size"; it is doubled before the reallocation is attempted.
 *
 * The expansion refers to a variable named ctxt for error reporting and
 * must be used inside a function returning a pointer: if xmlRealloc()
 * fails, the error is reported through htmlErrMemory(), the old buffer is
 * freed and the enclosing function returns NULL.
 */
#define growBuffer(buffer) {						\
    xmlChar *tmp;							\
									\
    buffer##_size *= 2;							\
    tmp = (xmlChar *) xmlRealloc(buffer,				\
                                 buffer##_size * sizeof(xmlChar));	\
    if (tmp != NULL) {							\
        buffer = tmp;							\
    } else {								\
        htmlErrMemory(ctxt, "growing buffer\n");			\
        xmlFree(buffer);						\
        return(NULL);							\
    }									\
}
1951
1952 /**
1953 * htmlEntityLookup:
1954 * @name: the entity name
1955 *
1956 * Lookup the given entity in EntitiesTable
1957 *
1958 * TODO: the linear scan is really ugly, an hash table is really needed.
1959 *
1960 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1961 */
1962 const htmlEntityDesc *
htmlEntityLookup(const xmlChar * name)1963 htmlEntityLookup(const xmlChar *name) {
1964 unsigned int i;
1965
1966 for (i = 0;i < (sizeof(html40EntitiesTable)/
1967 sizeof(html40EntitiesTable[0]));i++) {
1968 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1969 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1970 }
1971 }
1972 return(NULL);
1973 }
1974
1975 /**
1976 * htmlEntityValueLookup:
1977 * @value: the entity's unicode value
1978 *
1979 * Lookup the given entity in EntitiesTable
1980 *
1981 * TODO: the linear scan is really ugly, an hash table is really needed.
1982 *
1983 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1984 */
1985 const htmlEntityDesc *
htmlEntityValueLookup(unsigned int value)1986 htmlEntityValueLookup(unsigned int value) {
1987 unsigned int i;
1988
1989 for (i = 0;i < (sizeof(html40EntitiesTable)/
1990 sizeof(html40EntitiesTable[0]));i++) {
1991 if (html40EntitiesTable[i].value >= value) {
1992 if (html40EntitiesTable[i].value > value)
1993 break;
1994 return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1995 }
1996 }
1997 return(NULL);
1998 }
1999
2000 /**
2001 * UTF8ToHtml:
2002 * @out: a pointer to an array of bytes to store the result
2003 * @outlen: the length of @out
2004 * @in: a pointer to an array of UTF-8 chars
2005 * @inlen: the length of @in
2006 *
2007 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2008 * plus HTML entities block of chars out.
2009 *
2010 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2011 * The value of @inlen after return is the number of octets consumed
2012 * as the return value is positive, else unpredictable.
2013 * The value of @outlen after return is the number of octets consumed.
2014 */
2015 int
UTF8ToHtml(unsigned char * out,int * outlen,const unsigned char * in,int * inlen)2016 UTF8ToHtml(unsigned char* out, int *outlen,
2017 const unsigned char* in, int *inlen) {
2018 const unsigned char* processed = in;
2019 const unsigned char* outend;
2020 const unsigned char* outstart = out;
2021 const unsigned char* instart = in;
2022 const unsigned char* inend;
2023 unsigned int c, d;
2024 int trailing;
2025
2026 if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
2027 if (in == NULL) {
2028 /*
2029 * initialization nothing to do
2030 */
2031 *outlen = 0;
2032 *inlen = 0;
2033 return(0);
2034 }
2035 inend = in + (*inlen);
2036 outend = out + (*outlen);
2037 while (in < inend) {
2038 d = *in++;
2039 if (d < 0x80) { c= d; trailing= 0; }
2040 else if (d < 0xC0) {
2041 /* trailing byte in leading position */
2042 *outlen = out - outstart;
2043 *inlen = processed - instart;
2044 return(-2);
2045 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2046 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2047 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2048 else {
2049 /* no chance for this in Ascii */
2050 *outlen = out - outstart;
2051 *inlen = processed - instart;
2052 return(-2);
2053 }
2054
2055 if (inend - in < trailing) {
2056 break;
2057 }
2058
2059 for ( ; trailing; trailing--) {
2060 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
2061 break;
2062 c <<= 6;
2063 c |= d & 0x3F;
2064 }
2065
2066 /* assertion: c is a single UTF-4 value */
2067 if (c < 0x80) {
2068 if (out + 1 >= outend)
2069 break;
2070 *out++ = c;
2071 } else {
2072 int len;
2073 const htmlEntityDesc * ent;
2074 const char *cp;
2075 char nbuf[16];
2076
2077 /*
2078 * Try to lookup a predefined HTML entity for it
2079 */
2080
2081 ent = htmlEntityValueLookup(c);
2082 if (ent == NULL) {
2083 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2084 cp = nbuf;
2085 }
2086 else
2087 cp = ent->name;
2088 len = strlen(cp);
2089 if (out + 2 + len >= outend)
2090 break;
2091 *out++ = '&';
2092 memcpy(out, cp, len);
2093 out += len;
2094 *out++ = ';';
2095 }
2096 processed = in;
2097 }
2098 *outlen = out - outstart;
2099 *inlen = processed - instart;
2100 return(0);
2101 }
2102
2103 /**
2104 * htmlEncodeEntities:
2105 * @out: a pointer to an array of bytes to store the result
2106 * @outlen: the length of @out
2107 * @in: a pointer to an array of UTF-8 chars
2108 * @inlen: the length of @in
2109 * @quoteChar: the quote character to escape (' or ") or zero.
2110 *
2111 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2112 * plus HTML entities block of chars out.
2113 *
2114 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2115 * The value of @inlen after return is the number of octets consumed
2116 * as the return value is positive, else unpredictable.
2117 * The value of @outlen after return is the number of octets consumed.
2118 */
2119 int
htmlEncodeEntities(unsigned char * out,int * outlen,const unsigned char * in,int * inlen,int quoteChar)2120 htmlEncodeEntities(unsigned char* out, int *outlen,
2121 const unsigned char* in, int *inlen, int quoteChar) {
2122 const unsigned char* processed = in;
2123 const unsigned char* outend;
2124 const unsigned char* outstart = out;
2125 const unsigned char* instart = in;
2126 const unsigned char* inend;
2127 unsigned int c, d;
2128 int trailing;
2129
2130 if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2131 return(-1);
2132 outend = out + (*outlen);
2133 inend = in + (*inlen);
2134 while (in < inend) {
2135 d = *in++;
2136 if (d < 0x80) { c= d; trailing= 0; }
2137 else if (d < 0xC0) {
2138 /* trailing byte in leading position */
2139 *outlen = out - outstart;
2140 *inlen = processed - instart;
2141 return(-2);
2142 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2143 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2144 else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2145 else {
2146 /* no chance for this in Ascii */
2147 *outlen = out - outstart;
2148 *inlen = processed - instart;
2149 return(-2);
2150 }
2151
2152 if (inend - in < trailing)
2153 break;
2154
2155 while (trailing--) {
2156 if (((d= *in++) & 0xC0) != 0x80) {
2157 *outlen = out - outstart;
2158 *inlen = processed - instart;
2159 return(-2);
2160 }
2161 c <<= 6;
2162 c |= d & 0x3F;
2163 }
2164
2165 /* assertion: c is a single UTF-4 value */
2166 if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2167 (c != '&') && (c != '<') && (c != '>')) {
2168 if (out >= outend)
2169 break;
2170 *out++ = c;
2171 } else {
2172 const htmlEntityDesc * ent;
2173 const char *cp;
2174 char nbuf[16];
2175 int len;
2176
2177 /*
2178 * Try to lookup a predefined HTML entity for it
2179 */
2180 ent = htmlEntityValueLookup(c);
2181 if (ent == NULL) {
2182 snprintf(nbuf, sizeof(nbuf), "#%u", c);
2183 cp = nbuf;
2184 }
2185 else
2186 cp = ent->name;
2187 len = strlen(cp);
2188 if (out + 2 + len > outend)
2189 break;
2190 *out++ = '&';
2191 memcpy(out, cp, len);
2192 out += len;
2193 *out++ = ';';
2194 }
2195 processed = in;
2196 }
2197 *outlen = out - outstart;
2198 *inlen = processed - instart;
2199 return(0);
2200 }
2201
2202 /************************************************************************
2203 * *
2204 * Commodity functions to handle streams *
2205 * *
2206 ************************************************************************/
2207
2208 /**
2209 * htmlNewInputStream:
2210 * @ctxt: an HTML parser context
2211 *
2212 * Create a new input stream structure
2213 * Returns the new input stream or NULL
2214 */
2215 static htmlParserInputPtr
htmlNewInputStream(htmlParserCtxtPtr ctxt)2216 htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2217 htmlParserInputPtr input;
2218
2219 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2220 if (input == NULL) {
2221 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
2222 return(NULL);
2223 }
2224 memset(input, 0, sizeof(htmlParserInput));
2225 input->filename = NULL;
2226 input->directory = NULL;
2227 input->base = NULL;
2228 input->cur = NULL;
2229 input->buf = NULL;
2230 input->line = 1;
2231 input->col = 1;
2232 input->buf = NULL;
2233 input->free = NULL;
2234 input->version = NULL;
2235 input->consumed = 0;
2236 input->length = 0;
2237 return(input);
2238 }
2239
2240
2241 /************************************************************************
2242 * *
2243 * Commodity functions, cleanup needed ? *
2244 * *
2245 ************************************************************************/
/*
 * All tags allowing PCDATA according to the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array but I don't want to risk any
 *       binary incompatibility
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em", "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd", "label", "legend", "li",
    "noframes", "noscript", "object",
    "p", "pre", "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u", "var"
};
2260
2261 /**
2262 * areBlanks:
2263 * @ctxt: an HTML parser context
2264 * @str: a xmlChar *
2265 * @len: the size of @str
2266 *
2267 * Is this a sequence of blank chars that one can ignore ?
2268 *
2269 * Returns 1 if ignorable 0 otherwise.
2270 */
2271
areBlanks(htmlParserCtxtPtr ctxt,const xmlChar * str,int len)2272 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2273 unsigned int i;
2274 int j;
2275 xmlNodePtr lastChild;
2276 xmlDtdPtr dtd;
2277
2278 for (j = 0;j < len;j++)
2279 if (!(IS_BLANK_CH(str[j]))) return(0);
2280
2281 if (CUR == 0) return(1);
2282 if (CUR != '<') return(0);
2283 if (ctxt->name == NULL)
2284 return(1);
2285 if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2286 return(1);
2287 if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2288 return(1);
2289
2290 /* Only strip CDATA children of the body tag for strict HTML DTDs */
2291 if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2292 dtd = xmlGetIntSubset(ctxt->myDoc);
2293 if (dtd != NULL && dtd->ExternalID != NULL) {
2294 if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2295 !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2296 return(1);
2297 }
2298 }
2299
2300 if (ctxt->node == NULL) return(0);
2301 lastChild = xmlGetLastChild(ctxt->node);
2302 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2303 lastChild = lastChild->prev;
2304 if (lastChild == NULL) {
2305 if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2306 (ctxt->node->content != NULL)) return(0);
2307 /* keep ws in constructs like ...<b> </b>...
2308 for all tags "b" allowing PCDATA */
2309 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2310 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2311 return(0);
2312 }
2313 }
2314 } else if (xmlNodeIsText(lastChild)) {
2315 return(0);
2316 } else {
2317 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2318 for all tags "p" allowing PCDATA */
2319 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2320 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2321 return(0);
2322 }
2323 }
2324 }
2325 return(1);
2326 }
2327
2328 /**
2329 * htmlNewDocNoDtD:
2330 * @URI: URI for the dtd, or NULL
2331 * @ExternalID: the external ID of the DTD, or NULL
2332 *
2333 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2334 * are NULL
2335 *
2336 * Returns a new document, do not initialize the DTD if not provided
2337 */
2338 htmlDocPtr
htmlNewDocNoDtD(const xmlChar * URI,const xmlChar * ExternalID)2339 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2340 xmlDocPtr cur;
2341
2342 /*
2343 * Allocate a new document and fill the fields.
2344 */
2345 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2346 if (cur == NULL) {
2347 htmlErrMemory(NULL, "HTML document creation failed\n");
2348 return(NULL);
2349 }
2350 memset(cur, 0, sizeof(xmlDoc));
2351
2352 cur->type = XML_HTML_DOCUMENT_NODE;
2353 cur->version = NULL;
2354 cur->intSubset = NULL;
2355 cur->doc = cur;
2356 cur->name = NULL;
2357 cur->children = NULL;
2358 cur->extSubset = NULL;
2359 cur->oldNs = NULL;
2360 cur->encoding = NULL;
2361 cur->standalone = 1;
2362 cur->compression = 0;
2363 cur->ids = NULL;
2364 cur->refs = NULL;
2365 cur->_private = NULL;
2366 cur->charset = XML_CHAR_ENCODING_UTF8;
2367 cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2368 if ((ExternalID != NULL) ||
2369 (URI != NULL))
2370 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2371 return(cur);
2372 }
2373
2374 /**
2375 * htmlNewDoc:
2376 * @URI: URI for the dtd, or NULL
2377 * @ExternalID: the external ID of the DTD, or NULL
2378 *
2379 * Creates a new HTML document
2380 *
2381 * Returns a new document
2382 */
2383 htmlDocPtr
htmlNewDoc(const xmlChar * URI,const xmlChar * ExternalID)2384 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2385 if ((URI == NULL) && (ExternalID == NULL))
2386 return(htmlNewDocNoDtD(
2387 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2388 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2389
2390 return(htmlNewDocNoDtD(URI, ExternalID));
2391 }
2392
2393
2394 /************************************************************************
2395 * *
2396 * The parser itself *
2397 * Relates to http://www.w3.org/TR/html40 *
2398 * *
2399 ************************************************************************/
2400
2401 /************************************************************************
2402 * *
2403 * The parser itself *
2404 * *
2405 ************************************************************************/
2406
2407 static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2408
2409 /**
2410 * htmlParseHTMLName:
2411 * @ctxt: an HTML parser context
2412 *
2413 * parse an HTML tag or attribute name, note that we convert it to lowercase
2414 * since HTML names are not case-sensitive.
2415 *
2416 * Returns the Tag Name parsed or NULL
2417 */
2418
2419 static const xmlChar *
htmlParseHTMLName(htmlParserCtxtPtr ctxt)2420 htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2421 int i = 0;
2422 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2423
2424 if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2425 (CUR != ':') && (CUR != '.')) return(NULL);
2426
2427 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2428 ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2429 (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2430 (CUR == '.'))) {
2431 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2432 else loc[i] = CUR;
2433 i++;
2434
2435 NEXT;
2436 }
2437
2438 return(xmlDictLookup(ctxt->dict, loc, i));
2439 }
2440
2441
2442 /**
2443 * htmlParseHTMLName_nonInvasive:
2444 * @ctxt: an HTML parser context
2445 *
2446 * parse an HTML tag or attribute name, note that we convert it to lowercase
2447 * since HTML names are not case-sensitive, this doesn't consume the data
2448 * from the stream, it's a look-ahead
2449 *
2450 * Returns the Tag Name parsed or NULL
2451 */
2452
2453 static const xmlChar *
htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt)2454 htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2455 int i = 0;
2456 xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2457
2458 if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2459 (NXT(1) != ':')) return(NULL);
2460
2461 while ((i < HTML_PARSER_BUFFER_SIZE) &&
2462 ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2463 (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2464 if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2465 else loc[i] = NXT(1+i);
2466 i++;
2467 }
2468
2469 return(xmlDictLookup(ctxt->dict, loc, i));
2470 }
2471
2472
2473 /**
2474 * htmlParseName:
2475 * @ctxt: an HTML parser context
2476 *
2477 * parse an HTML name, this routine is case sensitive.
2478 *
2479 * Returns the Name parsed or NULL
2480 */
2481
2482 static const xmlChar *
htmlParseName(htmlParserCtxtPtr ctxt)2483 htmlParseName(htmlParserCtxtPtr ctxt) {
2484 const xmlChar *in;
2485 const xmlChar *ret;
2486 int count = 0;
2487
2488 GROW;
2489
2490 /*
2491 * Accelerator for simple ASCII names
2492 */
2493 in = ctxt->input->cur;
2494 if (((*in >= 0x61) && (*in <= 0x7A)) ||
2495 ((*in >= 0x41) && (*in <= 0x5A)) ||
2496 (*in == '_') || (*in == ':')) {
2497 in++;
2498 while (((*in >= 0x61) && (*in <= 0x7A)) ||
2499 ((*in >= 0x41) && (*in <= 0x5A)) ||
2500 ((*in >= 0x30) && (*in <= 0x39)) ||
2501 (*in == '_') || (*in == '-') ||
2502 (*in == ':') || (*in == '.'))
2503 in++;
2504
2505 if (in == ctxt->input->end)
2506 return(NULL);
2507
2508 if ((*in > 0) && (*in < 0x80)) {
2509 count = in - ctxt->input->cur;
2510 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2511 ctxt->input->cur = in;
2512 ctxt->nbChars += count;
2513 ctxt->input->col += count;
2514 return(ret);
2515 }
2516 }
2517 return(htmlParseNameComplex(ctxt));
2518 }
2519
2520 static const xmlChar *
htmlParseNameComplex(xmlParserCtxtPtr ctxt)2521 htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2522 int len = 0, l;
2523 int c;
2524 int count = 0;
2525 const xmlChar *base = ctxt->input->base;
2526
2527 /*
2528 * Handler for more complex cases
2529 */
2530 GROW;
2531 c = CUR_CHAR(l);
2532 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2533 (!IS_LETTER(c) && (c != '_') &&
2534 (c != ':'))) {
2535 return(NULL);
2536 }
2537
2538 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2539 ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2540 (c == '.') || (c == '-') ||
2541 (c == '_') || (c == ':') ||
2542 (IS_COMBINING(c)) ||
2543 (IS_EXTENDER(c)))) {
2544 if (count++ > 100) {
2545 count = 0;
2546 GROW;
2547 }
2548 len += l;
2549 NEXTL(l);
2550 c = CUR_CHAR(l);
2551 if (ctxt->input->base != base) {
2552 /*
2553 * We changed encoding from an unknown encoding
2554 * Input buffer changed location, so we better start again
2555 */
2556 return(htmlParseNameComplex(ctxt));
2557 }
2558 }
2559
2560 if (ctxt->input->cur - ctxt->input->base < len) {
2561 /* Sanity check */
2562 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
2563 "unexpected change of input buffer", NULL, NULL);
2564 return (NULL);
2565 }
2566
2567 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2568 }
2569
2570
2571 /**
2572 * htmlParseHTMLAttribute:
2573 * @ctxt: an HTML parser context
2574 * @stop: a char stop value
2575 *
2576 * parse an HTML attribute value till the stop (quote), if
2577 * stop is 0 then it stops at the first space
2578 *
2579 * Returns the attribute parsed or NULL
2580 */
2581
2582 static xmlChar *
htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt,const xmlChar stop)2583 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2584 xmlChar *buffer = NULL;
2585 int buffer_size = 0;
2586 xmlChar *out = NULL;
2587 const xmlChar *name = NULL;
2588 const xmlChar *cur = NULL;
2589 const htmlEntityDesc * ent;
2590
2591 /*
2592 * allocate a translation buffer.
2593 */
2594 buffer_size = HTML_PARSER_BUFFER_SIZE;
2595 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
2596 if (buffer == NULL) {
2597 htmlErrMemory(ctxt, "buffer allocation failed\n");
2598 return(NULL);
2599 }
2600 out = buffer;
2601
2602 /*
2603 * Ok loop until we reach one of the ending chars
2604 */
2605 while ((CUR != 0) && (CUR != stop)) {
2606 if ((stop == 0) && (CUR == '>')) break;
2607 if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2608 if (CUR == '&') {
2609 if (NXT(1) == '#') {
2610 unsigned int c;
2611 int bits;
2612
2613 c = htmlParseCharRef(ctxt);
2614 if (c < 0x80)
2615 { *out++ = c; bits= -6; }
2616 else if (c < 0x800)
2617 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2618 else if (c < 0x10000)
2619 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2620 else
2621 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2622
2623 for ( ; bits >= 0; bits-= 6) {
2624 *out++ = ((c >> bits) & 0x3F) | 0x80;
2625 }
2626
2627 if (out - buffer > buffer_size - 100) {
2628 int indx = out - buffer;
2629
2630 growBuffer(buffer);
2631 out = &buffer[indx];
2632 }
2633 } else {
2634 ent = htmlParseEntityRef(ctxt, &name);
2635 if (name == NULL) {
2636 *out++ = '&';
2637 if (out - buffer > buffer_size - 100) {
2638 int indx = out - buffer;
2639
2640 growBuffer(buffer);
2641 out = &buffer[indx];
2642 }
2643 } else if (ent == NULL) {
2644 *out++ = '&';
2645 cur = name;
2646 while (*cur != 0) {
2647 if (out - buffer > buffer_size - 100) {
2648 int indx = out - buffer;
2649
2650 growBuffer(buffer);
2651 out = &buffer[indx];
2652 }
2653 *out++ = *cur++;
2654 }
2655 } else {
2656 unsigned int c;
2657 int bits;
2658
2659 if (out - buffer > buffer_size - 100) {
2660 int indx = out - buffer;
2661
2662 growBuffer(buffer);
2663 out = &buffer[indx];
2664 }
2665 c = ent->value;
2666 if (c < 0x80)
2667 { *out++ = c; bits= -6; }
2668 else if (c < 0x800)
2669 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2670 else if (c < 0x10000)
2671 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2672 else
2673 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2674
2675 for ( ; bits >= 0; bits-= 6) {
2676 *out++ = ((c >> bits) & 0x3F) | 0x80;
2677 }
2678 }
2679 }
2680 } else {
2681 unsigned int c;
2682 int bits, l;
2683
2684 if (out - buffer > buffer_size - 100) {
2685 int indx = out - buffer;
2686
2687 growBuffer(buffer);
2688 out = &buffer[indx];
2689 }
2690 c = CUR_CHAR(l);
2691 if (c < 0x80)
2692 { *out++ = c; bits= -6; }
2693 else if (c < 0x800)
2694 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; }
2695 else if (c < 0x10000)
2696 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; }
2697 else
2698 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; }
2699
2700 for ( ; bits >= 0; bits-= 6) {
2701 *out++ = ((c >> bits) & 0x3F) | 0x80;
2702 }
2703 NEXT;
2704 }
2705 }
2706 *out = 0;
2707 return(buffer);
2708 }
2709
2710 /**
2711 * htmlParseEntityRef:
2712 * @ctxt: an HTML parser context
2713 * @str: location to store the entity name
2714 *
2715 * parse an HTML ENTITY references
2716 *
2717 * [68] EntityRef ::= '&' Name ';'
2718 *
2719 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2720 * if non-NULL *str will have to be freed by the caller.
2721 */
2722 const htmlEntityDesc *
htmlParseEntityRef(htmlParserCtxtPtr ctxt,const xmlChar ** str)2723 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2724 const xmlChar *name;
2725 const htmlEntityDesc * ent = NULL;
2726
2727 if (str != NULL) *str = NULL;
2728 if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2729
2730 if (CUR == '&') {
2731 NEXT;
2732 name = htmlParseName(ctxt);
2733 if (name == NULL) {
2734 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2735 "htmlParseEntityRef: no name\n", NULL, NULL);
2736 } else {
2737 GROW;
2738 if (CUR == ';') {
2739 if (str != NULL)
2740 *str = name;
2741
2742 /*
2743 * Lookup the entity in the table.
2744 */
2745 ent = htmlEntityLookup(name);
2746 if (ent != NULL) /* OK that's ugly !!! */
2747 NEXT;
2748 } else {
2749 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2750 "htmlParseEntityRef: expecting ';'\n",
2751 NULL, NULL);
2752 if (str != NULL)
2753 *str = name;
2754 }
2755 }
2756 }
2757 return(ent);
2758 }
2759
2760 /**
2761 * htmlParseAttValue:
2762 * @ctxt: an HTML parser context
2763 *
2764 * parse a value for an attribute
2765 * Note: the parser won't do substitution of entities here, this
2766 * will be handled later in xmlStringGetNodeList, unless it was
2767 * asked for ctxt->replaceEntities != 0
2768 *
2769 * Returns the AttValue parsed or NULL.
2770 */
2771
2772 static xmlChar *
htmlParseAttValue(htmlParserCtxtPtr ctxt)2773 htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2774 xmlChar *ret = NULL;
2775
2776 if (CUR == '"') {
2777 NEXT;
2778 ret = htmlParseHTMLAttribute(ctxt, '"');
2779 if (CUR != '"') {
2780 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2781 "AttValue: \" expected\n", NULL, NULL);
2782 } else
2783 NEXT;
2784 } else if (CUR == '\'') {
2785 NEXT;
2786 ret = htmlParseHTMLAttribute(ctxt, '\'');
2787 if (CUR != '\'') {
2788 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2789 "AttValue: ' expected\n", NULL, NULL);
2790 } else
2791 NEXT;
2792 } else {
2793 /*
2794 * That's an HTMLism, the attribute value may not be quoted
2795 */
2796 ret = htmlParseHTMLAttribute(ctxt, 0);
2797 if (ret == NULL) {
2798 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2799 "AttValue: no value found\n", NULL, NULL);
2800 }
2801 }
2802 return(ret);
2803 }
2804
2805 /**
2806 * htmlParseSystemLiteral:
2807 * @ctxt: an HTML parser context
2808 *
2809 * parse an HTML Literal
2810 *
2811 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2812 *
2813 * Returns the SystemLiteral parsed or NULL
2814 */
2815
2816 static xmlChar *
htmlParseSystemLiteral(htmlParserCtxtPtr ctxt)2817 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2818 size_t len = 0, startPosition = 0;
2819 xmlChar *ret = NULL;
2820
2821 if (CUR == '"') {
2822 NEXT;
2823
2824 if (CUR_PTR < BASE_PTR)
2825 return(ret);
2826 startPosition = CUR_PTR - BASE_PTR;
2827
2828 while ((IS_CHAR_CH(CUR)) && (CUR != '"')) {
2829 NEXT;
2830 len++;
2831 }
2832 if (!IS_CHAR_CH(CUR)) {
2833 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2834 "Unfinished SystemLiteral\n", NULL, NULL);
2835 } else {
2836 ret = xmlStrndup((BASE_PTR+startPosition), len);
2837 NEXT;
2838 }
2839 } else if (CUR == '\'') {
2840 NEXT;
2841
2842 if (CUR_PTR < BASE_PTR)
2843 return(ret);
2844 startPosition = CUR_PTR - BASE_PTR;
2845
2846 while ((IS_CHAR_CH(CUR)) && (CUR != '\'')) {
2847 NEXT;
2848 len++;
2849 }
2850 if (!IS_CHAR_CH(CUR)) {
2851 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2852 "Unfinished SystemLiteral\n", NULL, NULL);
2853 } else {
2854 ret = xmlStrndup((BASE_PTR+startPosition), len);
2855 NEXT;
2856 }
2857 } else {
2858 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2859 " or ' expected\n", NULL, NULL);
2860 }
2861
2862 return(ret);
2863 }
2864
2865 /**
2866 * htmlParsePubidLiteral:
2867 * @ctxt: an HTML parser context
2868 *
2869 * parse an HTML public literal
2870 *
2871 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2872 *
2873 * Returns the PubidLiteral parsed or NULL.
2874 */
2875
2876 static xmlChar *
htmlParsePubidLiteral(htmlParserCtxtPtr ctxt)2877 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2878 size_t len = 0, startPosition = 0;
2879 xmlChar *ret = NULL;
2880 /*
2881 * Name ::= (Letter | '_') (NameChar)*
2882 */
2883 if (CUR == '"') {
2884 NEXT;
2885
2886 if (CUR_PTR < BASE_PTR)
2887 return(ret);
2888 startPosition = CUR_PTR - BASE_PTR;
2889
2890 while (IS_PUBIDCHAR_CH(CUR)) {
2891 len++;
2892 NEXT;
2893 }
2894
2895 if (CUR != '"') {
2896 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2897 "Unfinished PubidLiteral\n", NULL, NULL);
2898 } else {
2899 ret = xmlStrndup((BASE_PTR + startPosition), len);
2900 NEXT;
2901 }
2902 } else if (CUR == '\'') {
2903 NEXT;
2904
2905 if (CUR_PTR < BASE_PTR)
2906 return(ret);
2907 startPosition = CUR_PTR - BASE_PTR;
2908
2909 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\'')){
2910 len++;
2911 NEXT;
2912 }
2913
2914 if (CUR != '\'') {
2915 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2916 "Unfinished PubidLiteral\n", NULL, NULL);
2917 } else {
2918 ret = xmlStrndup((BASE_PTR + startPosition), len);
2919 NEXT;
2920 }
2921 } else {
2922 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2923 "PubidLiteral \" or ' expected\n", NULL, NULL);
2924 }
2925
2926 return(ret);
2927 }
2928
2929 /**
2930 * htmlParseScript:
2931 * @ctxt: an HTML parser context
2932 *
2933 * parse the content of an HTML SCRIPT or STYLE element
2934 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2935 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2936 * http://www.w3.org/TR/html4/types.html#type-script
2937 * http://www.w3.org/TR/html4/types.html#h-6.15
2938 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2939 *
2940 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2941 * element and the value of intrinsic event attributes. User agents must
2942 * not evaluate script data as HTML markup but instead must pass it on as
2943 * data to a script engine.
2944 * NOTES:
2945 * - The content is passed like CDATA
2946 * - the attributes for style and scripting "onXXX" are also described
2947 * as CDATA but SGML allows entities references in attributes so their
2948 * processing is identical as other attributes
2949 */
2950 static void
htmlParseScript(htmlParserCtxtPtr ctxt)2951 htmlParseScript(htmlParserCtxtPtr ctxt) {
2952 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2953 int nbchar = 0;
2954 int cur,l;
2955
2956 SHRINK;
2957 cur = CUR_CHAR(l);
2958 while (IS_CHAR_CH(cur)) {
2959 if ((cur == '<') && (NXT(1) == '/')) {
2960 /*
2961 * One should break here, the specification is clear:
2962 * Authors should therefore escape "</" within the content.
2963 * Escape mechanisms are specific to each scripting or
2964 * style sheet language.
2965 *
2966 * In recovery mode, only break if end tag match the
2967 * current tag, effectively ignoring all tags inside the
2968 * script/style block and treating the entire block as
2969 * CDATA.
2970 */
2971 if (ctxt->recovery) {
2972 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2973 xmlStrlen(ctxt->name)) == 0)
2974 {
2975 break; /* while */
2976 } else {
2977 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
2978 "Element %s embeds close tag\n",
2979 ctxt->name, NULL);
2980 }
2981 } else {
2982 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2983 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2984 {
2985 break; /* while */
2986 }
2987 }
2988 }
2989 COPY_BUF(l,buf,nbchar,cur);
2990 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2991 if (ctxt->sax->cdataBlock!= NULL) {
2992 /*
2993 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2994 */
2995 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2996 } else if (ctxt->sax->characters != NULL) {
2997 ctxt->sax->characters(ctxt->userData, buf, nbchar);
2998 }
2999 nbchar = 0;
3000 }
3001 GROW;
3002 NEXTL(l);
3003 cur = CUR_CHAR(l);
3004 }
3005
3006 if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
3007 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3008 "Invalid char in CDATA 0x%X\n", cur);
3009 if (ctxt->input->cur < ctxt->input->end) {
3010 NEXT;
3011 }
3012 }
3013
3014 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3015 if (ctxt->sax->cdataBlock!= NULL) {
3016 /*
3017 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3018 */
3019 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3020 } else if (ctxt->sax->characters != NULL) {
3021 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3022 }
3023 }
3024 }
3025
3026
3027 /**
3028 * htmlParseCharDataInternal:
3029 * @ctxt: an HTML parser context
3030 * @readahead: optional read ahead character in ascii range
3031 *
3032 * parse a CharData section.
3033 * if we are within a CDATA section ']]>' marks an end of section.
3034 *
3035 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3036 */
3037
3038 static void
htmlParseCharDataInternal(htmlParserCtxtPtr ctxt,int readahead)3039 htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
3040 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
3041 int nbchar = 0;
3042 int cur, l;
3043 int chunk = 0;
3044
3045 if (readahead)
3046 buf[nbchar++] = readahead;
3047
3048 SHRINK;
3049 cur = CUR_CHAR(l);
3050 while (((cur != '<') || (ctxt->token == '<')) &&
3051 ((cur != '&') || (ctxt->token == '&')) &&
3052 (cur != 0)) {
3053 if (!(IS_CHAR(cur))) {
3054 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3055 "Invalid char in CDATA 0x%X\n", cur);
3056 } else {
3057 COPY_BUF(l,buf,nbchar,cur);
3058 }
3059 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3060 /*
3061 * Ok the segment is to be consumed as chars.
3062 */
3063 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3064 if (areBlanks(ctxt, buf, nbchar)) {
3065 if (ctxt->keepBlanks) {
3066 if (ctxt->sax->characters != NULL)
3067 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3068 } else {
3069 if (ctxt->sax->ignorableWhitespace != NULL)
3070 ctxt->sax->ignorableWhitespace(ctxt->userData,
3071 buf, nbchar);
3072 }
3073 } else {
3074 htmlCheckParagraph(ctxt);
3075 if (ctxt->sax->characters != NULL)
3076 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3077 }
3078 }
3079 nbchar = 0;
3080 }
3081 NEXTL(l);
3082 chunk++;
3083 if (chunk > HTML_PARSER_BUFFER_SIZE) {
3084 chunk = 0;
3085 SHRINK;
3086 GROW;
3087 }
3088 cur = CUR_CHAR(l);
3089 if (cur == 0) {
3090 SHRINK;
3091 GROW;
3092 cur = CUR_CHAR(l);
3093 }
3094 }
3095 if (nbchar != 0) {
3096 buf[nbchar] = 0;
3097
3098 /*
3099 * Ok the segment is to be consumed as chars.
3100 */
3101 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3102 if (areBlanks(ctxt, buf, nbchar)) {
3103 if (ctxt->keepBlanks) {
3104 if (ctxt->sax->characters != NULL)
3105 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3106 } else {
3107 if (ctxt->sax->ignorableWhitespace != NULL)
3108 ctxt->sax->ignorableWhitespace(ctxt->userData,
3109 buf, nbchar);
3110 }
3111 } else {
3112 htmlCheckParagraph(ctxt);
3113 if (ctxt->sax->characters != NULL)
3114 ctxt->sax->characters(ctxt->userData, buf, nbchar);
3115 }
3116 }
3117 } else {
3118 /*
3119 * Loop detection
3120 */
3121 if (cur == 0)
3122 ctxt->instate = XML_PARSER_EOF;
3123 }
3124 }
3125
3126 /**
3127 * htmlParseCharData:
3128 * @ctxt: an HTML parser context
3129 *
3130 * parse a CharData section.
3131 * if we are within a CDATA section ']]>' marks an end of section.
3132 *
3133 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3134 */
3135
3136 static void
htmlParseCharData(htmlParserCtxtPtr ctxt)3137 htmlParseCharData(htmlParserCtxtPtr ctxt) {
3138 htmlParseCharDataInternal(ctxt, 0);
3139 }
3140
3141 /**
3142 * htmlParseExternalID:
3143 * @ctxt: an HTML parser context
3144 * @publicID: a xmlChar** receiving PubidLiteral
3145 *
3146 * Parse an External ID or a Public ID
3147 *
3148 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3149 * | 'PUBLIC' S PubidLiteral S SystemLiteral
3150 *
3151 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3152 *
3153 * Returns the function returns SystemLiteral and in the second
3154 * case publicID receives PubidLiteral, is strict is off
3155 * it is possible to return NULL and have publicID set.
3156 */
3157
3158 static xmlChar *
htmlParseExternalID(htmlParserCtxtPtr ctxt,xmlChar ** publicID)3159 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
3160 xmlChar *URI = NULL;
3161
3162 if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3163 (UPP(2) == 'S') && (UPP(3) == 'T') &&
3164 (UPP(4) == 'E') && (UPP(5) == 'M')) {
3165 SKIP(6);
3166 if (!IS_BLANK_CH(CUR)) {
3167 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3168 "Space required after 'SYSTEM'\n", NULL, NULL);
3169 }
3170 SKIP_BLANKS;
3171 URI = htmlParseSystemLiteral(ctxt);
3172 if (URI == NULL) {
3173 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3174 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3175 }
3176 } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3177 (UPP(2) == 'B') && (UPP(3) == 'L') &&
3178 (UPP(4) == 'I') && (UPP(5) == 'C')) {
3179 SKIP(6);
3180 if (!IS_BLANK_CH(CUR)) {
3181 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3182 "Space required after 'PUBLIC'\n", NULL, NULL);
3183 }
3184 SKIP_BLANKS;
3185 *publicID = htmlParsePubidLiteral(ctxt);
3186 if (*publicID == NULL) {
3187 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3188 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3189 NULL, NULL);
3190 }
3191 SKIP_BLANKS;
3192 if ((CUR == '"') || (CUR == '\'')) {
3193 URI = htmlParseSystemLiteral(ctxt);
3194 }
3195 }
3196 return(URI);
3197 }
3198
3199 /**
3200 * xmlParsePI:
3201 * @ctxt: an XML parser context
3202 *
3203 * parse an XML Processing Instruction.
3204 *
3205 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3206 */
3207 static void
htmlParsePI(htmlParserCtxtPtr ctxt)3208 htmlParsePI(htmlParserCtxtPtr ctxt) {
3209 xmlChar *buf = NULL;
3210 int len = 0;
3211 int size = HTML_PARSER_BUFFER_SIZE;
3212 int cur, l;
3213 const xmlChar *target;
3214 xmlParserInputState state;
3215 int count = 0;
3216
3217 if ((RAW == '<') && (NXT(1) == '?')) {
3218 state = ctxt->instate;
3219 ctxt->instate = XML_PARSER_PI;
3220 /*
3221 * this is a Processing Instruction.
3222 */
3223 SKIP(2);
3224 SHRINK;
3225
3226 /*
3227 * Parse the target name and check for special support like
3228 * namespace.
3229 */
3230 target = htmlParseName(ctxt);
3231 if (target != NULL) {
3232 if (RAW == '>') {
3233 SKIP(1);
3234
3235 /*
3236 * SAX: PI detected.
3237 */
3238 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3239 (ctxt->sax->processingInstruction != NULL))
3240 ctxt->sax->processingInstruction(ctxt->userData,
3241 target, NULL);
3242 ctxt->instate = state;
3243 return;
3244 }
3245 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3246 if (buf == NULL) {
3247 htmlErrMemory(ctxt, NULL);
3248 ctxt->instate = state;
3249 return;
3250 }
3251 cur = CUR;
3252 if (!IS_BLANK(cur)) {
3253 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3254 "ParsePI: PI %s space expected\n", target, NULL);
3255 }
3256 SKIP_BLANKS;
3257 cur = CUR_CHAR(l);
3258 while (IS_CHAR(cur) && (cur != '>')) {
3259 if (len + 5 >= size) {
3260 xmlChar *tmp;
3261
3262 size *= 2;
3263 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3264 if (tmp == NULL) {
3265 htmlErrMemory(ctxt, NULL);
3266 xmlFree(buf);
3267 ctxt->instate = state;
3268 return;
3269 }
3270 buf = tmp;
3271 }
3272 count++;
3273 if (count > 50) {
3274 GROW;
3275 count = 0;
3276 }
3277 COPY_BUF(l,buf,len,cur);
3278 NEXTL(l);
3279 cur = CUR_CHAR(l);
3280 if (cur == 0) {
3281 SHRINK;
3282 GROW;
3283 cur = CUR_CHAR(l);
3284 }
3285 }
3286 buf[len] = 0;
3287 if (cur != '>') {
3288 htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3289 "ParsePI: PI %s never end ...\n", target, NULL);
3290 } else {
3291 SKIP(1);
3292
3293 /*
3294 * SAX: PI detected.
3295 */
3296 if ((ctxt->sax) && (!ctxt->disableSAX) &&
3297 (ctxt->sax->processingInstruction != NULL))
3298 ctxt->sax->processingInstruction(ctxt->userData,
3299 target, buf);
3300 }
3301 xmlFree(buf);
3302 } else {
3303 htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3304 "PI is not started correctly", NULL, NULL);
3305 }
3306 ctxt->instate = state;
3307 }
3308 }
3309
3310 /**
3311 * htmlParseComment:
3312 * @ctxt: an HTML parser context
3313 *
3314 * Parse an XML (SGML) comment <!-- .... -->
3315 *
3316 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3317 */
3318 static void
htmlParseComment(htmlParserCtxtPtr ctxt)3319 htmlParseComment(htmlParserCtxtPtr ctxt) {
3320 xmlChar *buf = NULL;
3321 int len;
3322 int size = HTML_PARSER_BUFFER_SIZE;
3323 int q, ql;
3324 int r, rl;
3325 int cur, l;
3326 xmlParserInputState state;
3327
3328 /*
3329 * Check that there is a comment right here.
3330 */
3331 if ((RAW != '<') || (NXT(1) != '!') ||
3332 (NXT(2) != '-') || (NXT(3) != '-')) return;
3333
3334 state = ctxt->instate;
3335 ctxt->instate = XML_PARSER_COMMENT;
3336 SHRINK;
3337 SKIP(4);
3338 buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3339 if (buf == NULL) {
3340 htmlErrMemory(ctxt, "buffer allocation failed\n");
3341 ctxt->instate = state;
3342 return;
3343 }
3344 len = 0;
3345 buf[len] = 0;
3346 q = CUR_CHAR(ql);
3347 if (!IS_CHAR(q))
3348 goto unfinished;
3349 NEXTL(ql);
3350 r = CUR_CHAR(rl);
3351 if (!IS_CHAR(r))
3352 goto unfinished;
3353 NEXTL(rl);
3354 cur = CUR_CHAR(l);
3355 while (IS_CHAR(cur) &&
3356 ((cur != '>') ||
3357 (r != '-') || (q != '-'))) {
3358 if (len + 5 >= size) {
3359 xmlChar *tmp;
3360
3361 size *= 2;
3362 tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3363 if (tmp == NULL) {
3364 xmlFree(buf);
3365 htmlErrMemory(ctxt, "growing buffer failed\n");
3366 ctxt->instate = state;
3367 return;
3368 }
3369 buf = tmp;
3370 }
3371 COPY_BUF(ql,buf,len,q);
3372 q = r;
3373 ql = rl;
3374 r = cur;
3375 rl = l;
3376 NEXTL(l);
3377 cur = CUR_CHAR(l);
3378 if (cur == 0) {
3379 SHRINK;
3380 GROW;
3381 cur = CUR_CHAR(l);
3382 }
3383 }
3384 buf[len] = 0;
3385 if (IS_CHAR(cur)) {
3386 NEXT;
3387 if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3388 (!ctxt->disableSAX))
3389 ctxt->sax->comment(ctxt->userData, buf);
3390 xmlFree(buf);
3391 ctxt->instate = state;
3392 return;
3393 }
3394
3395 unfinished:
3396 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3397 "Comment not terminated \n<!--%.50s\n", buf, NULL);
3398 xmlFree(buf);
3399 }
3400
3401 /**
3402 * htmlParseCharRef:
3403 * @ctxt: an HTML parser context
3404 *
3405 * parse Reference declarations
3406 *
3407 * [66] CharRef ::= '&#' [0-9]+ ';' |
3408 * '&#x' [0-9a-fA-F]+ ';'
3409 *
3410 * Returns the value parsed (as an int)
3411 */
3412 int
htmlParseCharRef(htmlParserCtxtPtr ctxt)3413 htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3414 int val = 0;
3415
3416 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3417 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3418 "htmlParseCharRef: context error\n",
3419 NULL, NULL);
3420 return(0);
3421 }
3422 if ((CUR == '&') && (NXT(1) == '#') &&
3423 ((NXT(2) == 'x') || NXT(2) == 'X')) {
3424 SKIP(3);
3425 while (CUR != ';') {
3426 if ((CUR >= '0') && (CUR <= '9'))
3427 val = val * 16 + (CUR - '0');
3428 else if ((CUR >= 'a') && (CUR <= 'f'))
3429 val = val * 16 + (CUR - 'a') + 10;
3430 else if ((CUR >= 'A') && (CUR <= 'F'))
3431 val = val * 16 + (CUR - 'A') + 10;
3432 else {
3433 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3434 "htmlParseCharRef: missing semicolon\n",
3435 NULL, NULL);
3436 break;
3437 }
3438 NEXT;
3439 }
3440 if (CUR == ';')
3441 NEXT;
3442 } else if ((CUR == '&') && (NXT(1) == '#')) {
3443 SKIP(2);
3444 while (CUR != ';') {
3445 if ((CUR >= '0') && (CUR <= '9'))
3446 val = val * 10 + (CUR - '0');
3447 else {
3448 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3449 "htmlParseCharRef: missing semicolon\n",
3450 NULL, NULL);
3451 break;
3452 }
3453 NEXT;
3454 }
3455 if (CUR == ';')
3456 NEXT;
3457 } else {
3458 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3459 "htmlParseCharRef: invalid value\n", NULL, NULL);
3460 }
3461 /*
3462 * Check the value IS_CHAR ...
3463 */
3464 if (IS_CHAR(val)) {
3465 return(val);
3466 } else {
3467 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3468 "htmlParseCharRef: invalid xmlChar value %d\n",
3469 val);
3470 }
3471 return(0);
3472 }
3473
3474
3475 /**
3476 * htmlParseDocTypeDecl:
3477 * @ctxt: an HTML parser context
3478 *
3479 * parse a DOCTYPE declaration
3480 *
3481 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3482 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3483 */
3484
3485 static void
htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt)3486 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3487 const xmlChar *name;
3488 xmlChar *ExternalID = NULL;
3489 xmlChar *URI = NULL;
3490
3491 /*
3492 * We know that '<!DOCTYPE' has been detected.
3493 */
3494 SKIP(9);
3495
3496 SKIP_BLANKS;
3497
3498 /*
3499 * Parse the DOCTYPE name.
3500 */
3501 name = htmlParseName(ctxt);
3502 if (name == NULL) {
3503 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3504 "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3505 NULL, NULL);
3506 }
3507 /*
3508 * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3509 */
3510
3511 SKIP_BLANKS;
3512
3513 /*
3514 * Check for SystemID and ExternalID
3515 */
3516 URI = htmlParseExternalID(ctxt, &ExternalID);
3517 SKIP_BLANKS;
3518
3519 /*
3520 * We should be at the end of the DOCTYPE declaration.
3521 */
3522 if (CUR != '>') {
3523 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3524 "DOCTYPE improperly terminated\n", NULL, NULL);
3525 /* We shouldn't try to resynchronize ... */
3526 }
3527 NEXT;
3528
3529 /*
3530 * Create or update the document accordingly to the DOCTYPE
3531 */
3532 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3533 (!ctxt->disableSAX))
3534 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3535
3536 /*
3537 * Cleanup, since we don't use all those identifiers
3538 */
3539 if (URI != NULL) xmlFree(URI);
3540 if (ExternalID != NULL) xmlFree(ExternalID);
3541 }
3542
3543 /**
3544 * htmlParseAttribute:
3545 * @ctxt: an HTML parser context
3546 * @value: a xmlChar ** used to store the value of the attribute
3547 *
3548 * parse an attribute
3549 *
3550 * [41] Attribute ::= Name Eq AttValue
3551 *
3552 * [25] Eq ::= S? '=' S?
3553 *
3554 * With namespace:
3555 *
3556 * [NS 11] Attribute ::= QName Eq AttValue
3557 *
3558 * Also the case QName == xmlns:??? is handled independently as a namespace
3559 * definition.
3560 *
3561 * Returns the attribute name, and the value in *value.
3562 */
3563
3564 static const xmlChar *
htmlParseAttribute(htmlParserCtxtPtr ctxt,xmlChar ** value)3565 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3566 const xmlChar *name;
3567 xmlChar *val = NULL;
3568
3569 *value = NULL;
3570 name = htmlParseHTMLName(ctxt);
3571 if (name == NULL) {
3572 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3573 "error parsing attribute name\n", NULL, NULL);
3574 return(NULL);
3575 }
3576
3577 /*
3578 * read the value
3579 */
3580 SKIP_BLANKS;
3581 if (CUR == '=') {
3582 NEXT;
3583 SKIP_BLANKS;
3584 val = htmlParseAttValue(ctxt);
3585 }
3586
3587 *value = val;
3588 return(name);
3589 }
3590
3591 /**
3592 * htmlCheckEncodingDirect:
3593 * @ctxt: an HTML parser context
3594 * @attvalue: the attribute value
3595 *
3596 * Checks an attribute value to detect
3597 * the encoding
3598 * If a new encoding is detected the parser is switched to decode
3599 * it and pass UTF8
3600 */
3601 static void
htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt,const xmlChar * encoding)3602 htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
3603
3604 if ((ctxt == NULL) || (encoding == NULL) ||
3605 (ctxt->options & HTML_PARSE_IGNORE_ENC))
3606 return;
3607
3608 /* do not change encoding */
3609 if (ctxt->input->encoding != NULL)
3610 return;
3611
3612 if (encoding != NULL) {
3613 xmlCharEncoding enc;
3614 xmlCharEncodingHandlerPtr handler;
3615
3616 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3617
3618 if (ctxt->input->encoding != NULL)
3619 xmlFree((xmlChar *) ctxt->input->encoding);
3620 ctxt->input->encoding = xmlStrdup(encoding);
3621
3622 enc = xmlParseCharEncoding((const char *) encoding);
3623 /*
3624 * registered set of known encodings
3625 */
3626 if (enc != XML_CHAR_ENCODING_ERROR) {
3627 if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3628 (enc == XML_CHAR_ENCODING_UTF16BE) ||
3629 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3630 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3631 (ctxt->input->buf != NULL) &&
3632 (ctxt->input->buf->encoder == NULL)) {
3633 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3634 "htmlCheckEncoding: wrong encoding meta\n",
3635 NULL, NULL);
3636 } else {
3637 xmlSwitchEncoding(ctxt, enc);
3638 }
3639 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3640 } else {
3641 /*
3642 * fallback for unknown encodings
3643 */
3644 handler = xmlFindCharEncodingHandler((const char *) encoding);
3645 if (handler != NULL) {
3646 xmlSwitchToEncoding(ctxt, handler);
3647 ctxt->charset = XML_CHAR_ENCODING_UTF8;
3648 } else {
3649 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
3650 "htmlCheckEncoding: unknown encoding %s\n",
3651 encoding, NULL);
3652 }
3653 }
3654
3655 if ((ctxt->input->buf != NULL) &&
3656 (ctxt->input->buf->encoder != NULL) &&
3657 (ctxt->input->buf->raw != NULL) &&
3658 (ctxt->input->buf->buffer != NULL)) {
3659 int nbchars;
3660 int processed;
3661
3662 /*
3663 * convert as much as possible to the parser reading buffer.
3664 */
3665 processed = ctxt->input->cur - ctxt->input->base;
3666 xmlBufShrink(ctxt->input->buf->buffer, processed);
3667 nbchars = xmlCharEncInput(ctxt->input->buf, 1);
3668 xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input);
3669 if (nbchars < 0) {
3670 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3671 "htmlCheckEncoding: encoder error\n",
3672 NULL, NULL);
3673 }
3674 }
3675 }
3676 }
3677
3678 /**
3679 * htmlCheckEncoding:
3680 * @ctxt: an HTML parser context
3681 * @attvalue: the attribute value
3682 *
3683 * Checks an http-equiv attribute from a Meta tag to detect
3684 * the encoding
3685 * If a new encoding is detected the parser is switched to decode
3686 * it and pass UTF8
3687 */
3688 static void
htmlCheckEncoding(htmlParserCtxtPtr ctxt,const xmlChar * attvalue)3689 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3690 const xmlChar *encoding;
3691
3692 if (!attvalue)
3693 return;
3694
3695 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3696 if (encoding != NULL) {
3697 encoding += 7;
3698 }
3699 /*
3700 * skip blank
3701 */
3702 if (encoding && IS_BLANK_CH(*encoding))
3703 encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3704 if (encoding && *encoding == '=') {
3705 encoding ++;
3706 htmlCheckEncodingDirect(ctxt, encoding);
3707 }
3708 }
3709
3710 /**
3711 * htmlCheckMeta:
3712 * @ctxt: an HTML parser context
3713 * @atts: the attributes values
3714 *
3715 * Checks an attributes from a Meta tag
3716 */
3717 static void
htmlCheckMeta(htmlParserCtxtPtr ctxt,const xmlChar ** atts)3718 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3719 int i;
3720 const xmlChar *att, *value;
3721 int http = 0;
3722 const xmlChar *content = NULL;
3723
3724 if ((ctxt == NULL) || (atts == NULL))
3725 return;
3726
3727 i = 0;
3728 att = atts[i++];
3729 while (att != NULL) {
3730 value = atts[i++];
3731 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3732 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3733 http = 1;
3734 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
3735 htmlCheckEncodingDirect(ctxt, value);
3736 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3737 content = value;
3738 att = atts[i++];
3739 }
3740 if ((http) && (content != NULL))
3741 htmlCheckEncoding(ctxt, content);
3742
3743 }
3744
3745 /**
3746 * htmlParseStartTag:
3747 * @ctxt: an HTML parser context
3748 *
3749 * parse a start of tag either for rule element or
3750 * EmptyElement. In both case we don't parse the tag closing chars.
3751 *
3752 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3753 *
3754 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3755 *
3756 * With namespace:
3757 *
3758 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3759 *
3760 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3761 *
3762 * Returns 0 in case of success, -1 in case of error and 1 if discarded
3763 */
3764
3765 static int
htmlParseStartTag(htmlParserCtxtPtr ctxt)3766 htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3767 const xmlChar *name;
3768 const xmlChar *attname;
3769 xmlChar *attvalue;
3770 const xmlChar **atts;
3771 int nbatts = 0;
3772 int maxatts;
3773 int meta = 0;
3774 int i;
3775 int discardtag = 0;
3776
3777 if ((ctxt == NULL) || (ctxt->input == NULL)) {
3778 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3779 "htmlParseStartTag: context error\n", NULL, NULL);
3780 return -1;
3781 }
3782 if (ctxt->instate == XML_PARSER_EOF)
3783 return(-1);
3784 if (CUR != '<') return -1;
3785 NEXT;
3786
3787 atts = ctxt->atts;
3788 maxatts = ctxt->maxatts;
3789
3790 GROW;
3791 name = htmlParseHTMLName(ctxt);
3792 if (name == NULL) {
3793 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3794 "htmlParseStartTag: invalid element name\n",
3795 NULL, NULL);
3796 /* if recover preserve text on classic misconstructs */
3797 if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') ||
3798 (CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) {
3799 htmlParseCharDataInternal(ctxt, '<');
3800 return(-1);
3801 }
3802
3803
3804 /* Dump the bogus tag like browsers do */
3805 while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
3806 (ctxt->instate != XML_PARSER_EOF))
3807 NEXT;
3808 return -1;
3809 }
3810 if (xmlStrEqual(name, BAD_CAST"meta"))
3811 meta = 1;
3812
3813 /*
3814 * Check for auto-closure of HTML elements.
3815 */
3816 htmlAutoClose(ctxt, name);
3817
3818 /*
3819 * Check for implied HTML elements.
3820 */
3821 htmlCheckImplied(ctxt, name);
3822
3823 /*
3824 * Avoid html at any level > 0, head at any level != 1
3825 * or any attempt to recurse body
3826 */
3827 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3828 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3829 "htmlParseStartTag: misplaced <html> tag\n",
3830 name, NULL);
3831 discardtag = 1;
3832 ctxt->depth++;
3833 }
3834 if ((ctxt->nameNr != 1) &&
3835 (xmlStrEqual(name, BAD_CAST"head"))) {
3836 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3837 "htmlParseStartTag: misplaced <head> tag\n",
3838 name, NULL);
3839 discardtag = 1;
3840 ctxt->depth++;
3841 }
3842 if (xmlStrEqual(name, BAD_CAST"body")) {
3843 int indx;
3844 for (indx = 0;indx < ctxt->nameNr;indx++) {
3845 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
3846 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3847 "htmlParseStartTag: misplaced <body> tag\n",
3848 name, NULL);
3849 discardtag = 1;
3850 ctxt->depth++;
3851 }
3852 }
3853 }
3854
3855 /*
3856 * Now parse the attributes, it ends up with the ending
3857 *
3858 * (S Attribute)* S?
3859 */
3860 SKIP_BLANKS;
3861 while ((IS_CHAR_CH(CUR)) &&
3862 (CUR != '>') &&
3863 ((CUR != '/') || (NXT(1) != '>'))) {
3864 long cons = ctxt->nbChars;
3865
3866 GROW;
3867 attname = htmlParseAttribute(ctxt, &attvalue);
3868 if (attname != NULL) {
3869
3870 /*
3871 * Well formedness requires at most one declaration of an attribute
3872 */
3873 for (i = 0; i < nbatts;i += 2) {
3874 if (xmlStrEqual(atts[i], attname)) {
3875 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3876 "Attribute %s redefined\n", attname, NULL);
3877 if (attvalue != NULL)
3878 xmlFree(attvalue);
3879 goto failed;
3880 }
3881 }
3882
3883 /*
3884 * Add the pair to atts
3885 */
3886 if (atts == NULL) {
3887 maxatts = 22; /* allow for 10 attrs by default */
3888 atts = (const xmlChar **)
3889 xmlMalloc(maxatts * sizeof(xmlChar *));
3890 if (atts == NULL) {
3891 htmlErrMemory(ctxt, NULL);
3892 if (attvalue != NULL)
3893 xmlFree(attvalue);
3894 goto failed;
3895 }
3896 ctxt->atts = atts;
3897 ctxt->maxatts = maxatts;
3898 } else if (nbatts + 4 > maxatts) {
3899 const xmlChar **n;
3900
3901 maxatts *= 2;
3902 n = (const xmlChar **) xmlRealloc((void *) atts,
3903 maxatts * sizeof(const xmlChar *));
3904 if (n == NULL) {
3905 htmlErrMemory(ctxt, NULL);
3906 if (attvalue != NULL)
3907 xmlFree(attvalue);
3908 goto failed;
3909 }
3910 atts = n;
3911 ctxt->atts = atts;
3912 ctxt->maxatts = maxatts;
3913 }
3914 atts[nbatts++] = attname;
3915 atts[nbatts++] = attvalue;
3916 atts[nbatts] = NULL;
3917 atts[nbatts + 1] = NULL;
3918 }
3919 else {
3920 if (attvalue != NULL)
3921 xmlFree(attvalue);
3922 /* Dump the bogus attribute string up to the next blank or
3923 * the end of the tag. */
3924 while ((IS_CHAR_CH(CUR)) &&
3925 !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
3926 ((CUR != '/') || (NXT(1) != '>')))
3927 NEXT;
3928 }
3929
3930 failed:
3931 SKIP_BLANKS;
3932 if (cons == ctxt->nbChars) {
3933 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3934 "htmlParseStartTag: problem parsing attributes\n",
3935 NULL, NULL);
3936 break;
3937 }
3938 }
3939
3940 /*
3941 * Handle specific association to the META tag
3942 */
3943 if (meta && (nbatts != 0))
3944 htmlCheckMeta(ctxt, atts);
3945
3946 /*
3947 * SAX: Start of Element !
3948 */
3949 if (!discardtag) {
3950 htmlnamePush(ctxt, name);
3951 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3952 if (nbatts != 0)
3953 ctxt->sax->startElement(ctxt->userData, name, atts);
3954 else
3955 ctxt->sax->startElement(ctxt->userData, name, NULL);
3956 }
3957 }
3958
3959 if (atts != NULL) {
3960 for (i = 1;i < nbatts;i += 2) {
3961 if (atts[i] != NULL)
3962 xmlFree((xmlChar *) atts[i]);
3963 }
3964 }
3965
3966 return(discardtag);
3967 }
3968
3969 /**
3970 * htmlParseEndTag:
3971 * @ctxt: an HTML parser context
3972 *
3973 * parse an end of tag
3974 *
3975 * [42] ETag ::= '</' Name S? '>'
3976 *
3977 * With namespace
3978 *
3979 * [NS 9] ETag ::= '</' QName S? '>'
3980 *
3981 * Returns 1 if the current level should be closed.
3982 */
3983
3984 static int
htmlParseEndTag(htmlParserCtxtPtr ctxt)3985 htmlParseEndTag(htmlParserCtxtPtr ctxt)
3986 {
3987 const xmlChar *name;
3988 const xmlChar *oldname;
3989 int i, ret;
3990
3991 if ((CUR != '<') || (NXT(1) != '/')) {
3992 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3993 "htmlParseEndTag: '</' not found\n", NULL, NULL);
3994 return (0);
3995 }
3996 SKIP(2);
3997
3998 name = htmlParseHTMLName(ctxt);
3999 if (name == NULL)
4000 return (0);
4001 /*
4002 * We should definitely be at the ending "S? '>'" part
4003 */
4004 SKIP_BLANKS;
4005 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
4006 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4007 "End tag : expected '>'\n", NULL, NULL);
4008 if (ctxt->recovery) {
4009 /*
4010 * We're not at the ending > !!
4011 * Error, unless in recover mode where we search forwards
4012 * until we find a >
4013 */
4014 while (CUR != '\0' && CUR != '>') NEXT;
4015 NEXT;
4016 }
4017 } else
4018 NEXT;
4019
4020 /*
4021 * if we ignored misplaced tags in htmlParseStartTag don't pop them
4022 * out now.
4023 */
4024 if ((ctxt->depth > 0) &&
4025 (xmlStrEqual(name, BAD_CAST "html") ||
4026 xmlStrEqual(name, BAD_CAST "body") ||
4027 xmlStrEqual(name, BAD_CAST "head"))) {
4028 ctxt->depth--;
4029 return (0);
4030 }
4031
4032 /*
4033 * If the name read is not one of the element in the parsing stack
4034 * then return, it's just an error.
4035 */
4036 for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4037 if (xmlStrEqual(name, ctxt->nameTab[i]))
4038 break;
4039 }
4040 if (i < 0) {
4041 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4042 "Unexpected end tag : %s\n", name, NULL);
4043 return (0);
4044 }
4045
4046
4047 /*
4048 * Check for auto-closure of HTML elements.
4049 */
4050
4051 htmlAutoCloseOnClose(ctxt, name);
4052
4053 /*
4054 * Well formedness constraints, opening and closing must match.
4055 * With the exception that the autoclose may have popped stuff out
4056 * of the stack.
4057 */
4058 if (!xmlStrEqual(name, ctxt->name)) {
4059 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4060 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4061 "Opening and ending tag mismatch: %s and %s\n",
4062 name, ctxt->name);
4063 }
4064 }
4065
4066 /*
4067 * SAX: End of Tag
4068 */
4069 oldname = ctxt->name;
4070 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4071 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4072 ctxt->sax->endElement(ctxt->userData, name);
4073 htmlNodeInfoPop(ctxt);
4074 htmlnamePop(ctxt);
4075 ret = 1;
4076 } else {
4077 ret = 0;
4078 }
4079
4080 return (ret);
4081 }
4082
4083
4084 /**
4085 * htmlParseReference:
4086 * @ctxt: an HTML parser context
4087 *
4088 * parse and handle entity references in content,
4089 * this will end-up in a call to character() since this is either a
4090 * CharRef, or a predefined entity.
4091 */
4092 static void
htmlParseReference(htmlParserCtxtPtr ctxt)4093 htmlParseReference(htmlParserCtxtPtr ctxt) {
4094 const htmlEntityDesc * ent;
4095 xmlChar out[6];
4096 const xmlChar *name;
4097 if (CUR != '&') return;
4098
4099 if (NXT(1) == '#') {
4100 unsigned int c;
4101 int bits, i = 0;
4102
4103 c = htmlParseCharRef(ctxt);
4104 if (c == 0)
4105 return;
4106
4107 if (c < 0x80) { out[i++]= c; bits= -6; }
4108 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
4109 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
4110 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
4111
4112 for ( ; bits >= 0; bits-= 6) {
4113 out[i++]= ((c >> bits) & 0x3F) | 0x80;
4114 }
4115 out[i] = 0;
4116
4117 htmlCheckParagraph(ctxt);
4118 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4119 ctxt->sax->characters(ctxt->userData, out, i);
4120 } else {
4121 ent = htmlParseEntityRef(ctxt, &name);
4122 if (name == NULL) {
4123 htmlCheckParagraph(ctxt);
4124 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4125 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4126 return;
4127 }
4128 if ((ent == NULL) || !(ent->value > 0)) {
4129 htmlCheckParagraph(ctxt);
4130 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4131 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4132 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4133 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
4134 }
4135 } else {
4136 unsigned int c;
4137 int bits, i = 0;
4138
4139 c = ent->value;
4140 if (c < 0x80)
4141 { out[i++]= c; bits= -6; }
4142 else if (c < 0x800)
4143 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; }
4144 else if (c < 0x10000)
4145 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; }
4146 else
4147 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; }
4148
4149 for ( ; bits >= 0; bits-= 6) {
4150 out[i++]= ((c >> bits) & 0x3F) | 0x80;
4151 }
4152 out[i] = 0;
4153
4154 htmlCheckParagraph(ctxt);
4155 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4156 ctxt->sax->characters(ctxt->userData, out, i);
4157 }
4158 }
4159 }
4160
4161 /**
4162 * htmlParseContent:
4163 * @ctxt: an HTML parser context
4164 *
4165 * Parse a content: comment, sub-element, reference or text.
4166 * Kept for compatibility with old code
4167 */
4168
4169 static void
htmlParseContent(htmlParserCtxtPtr ctxt)4170 htmlParseContent(htmlParserCtxtPtr ctxt) {
4171 xmlChar *currentNode;
4172 int depth;
4173 const xmlChar *name;
4174
4175 currentNode = xmlStrdup(ctxt->name);
4176 depth = ctxt->nameNr;
4177 while (1) {
4178 long cons = ctxt->nbChars;
4179
4180 GROW;
4181
4182 if (ctxt->instate == XML_PARSER_EOF)
4183 break;
4184
4185 /*
4186 * Our tag or one of it's parent or children is ending.
4187 */
4188 if ((CUR == '<') && (NXT(1) == '/')) {
4189 if (htmlParseEndTag(ctxt) &&
4190 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4191 if (currentNode != NULL)
4192 xmlFree(currentNode);
4193 return;
4194 }
4195 continue; /* while */
4196 }
4197
4198 else if ((CUR == '<') &&
4199 ((IS_ASCII_LETTER(NXT(1))) ||
4200 (NXT(1) == '_') || (NXT(1) == ':'))) {
4201 name = htmlParseHTMLName_nonInvasive(ctxt);
4202 if (name == NULL) {
4203 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4204 "htmlParseStartTag: invalid element name\n",
4205 NULL, NULL);
4206 /* Dump the bogus tag like browsers do */
4207 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
4208 NEXT;
4209
4210 if (currentNode != NULL)
4211 xmlFree(currentNode);
4212 return;
4213 }
4214
4215 if (ctxt->name != NULL) {
4216 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4217 htmlAutoClose(ctxt, name);
4218 continue;
4219 }
4220 }
4221 }
4222
4223 /*
4224 * Has this node been popped out during parsing of
4225 * the next element
4226 */
4227 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4228 (!xmlStrEqual(currentNode, ctxt->name)))
4229 {
4230 if (currentNode != NULL) xmlFree(currentNode);
4231 return;
4232 }
4233
4234 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4235 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4236 /*
4237 * Handle SCRIPT/STYLE separately
4238 */
4239 htmlParseScript(ctxt);
4240 } else {
4241 /*
4242 * Sometimes DOCTYPE arrives in the middle of the document
4243 */
4244 if ((CUR == '<') && (NXT(1) == '!') &&
4245 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4246 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4247 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4248 (UPP(8) == 'E')) {
4249 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4250 "Misplaced DOCTYPE declaration\n",
4251 BAD_CAST "DOCTYPE" , NULL);
4252 htmlParseDocTypeDecl(ctxt);
4253 }
4254
4255 /*
4256 * First case : a comment
4257 */
4258 if ((CUR == '<') && (NXT(1) == '!') &&
4259 (NXT(2) == '-') && (NXT(3) == '-')) {
4260 htmlParseComment(ctxt);
4261 }
4262
4263 /*
4264 * Second case : a Processing Instruction.
4265 */
4266 else if ((CUR == '<') && (NXT(1) == '?')) {
4267 htmlParsePI(ctxt);
4268 }
4269
4270 /*
4271 * Third case : a sub-element.
4272 */
4273 else if (CUR == '<') {
4274 htmlParseElement(ctxt);
4275 }
4276
4277 /*
4278 * Fourth case : a reference. If if has not been resolved,
4279 * parsing returns it's Name, create the node
4280 */
4281 else if (CUR == '&') {
4282 htmlParseReference(ctxt);
4283 }
4284
4285 /*
4286 * Fifth case : end of the resource
4287 */
4288 else if (CUR == 0) {
4289 htmlAutoCloseOnEnd(ctxt);
4290 break;
4291 }
4292
4293 /*
4294 * Last case, text. Note that References are handled directly.
4295 */
4296 else {
4297 htmlParseCharData(ctxt);
4298 }
4299
4300 if (cons == ctxt->nbChars) {
4301 if (ctxt->node != NULL) {
4302 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4303 "detected an error in element content\n",
4304 NULL, NULL);
4305 }
4306 break;
4307 }
4308 }
4309 GROW;
4310 }
4311 if (currentNode != NULL) xmlFree(currentNode);
4312 }
4313
4314 /**
4315 * htmlParseElement:
4316 * @ctxt: an HTML parser context
4317 *
4318 * parse an HTML element, this is highly recursive
4319 * this is kept for compatibility with previous code versions
4320 *
4321 * [39] element ::= EmptyElemTag | STag content ETag
4322 *
4323 * [41] Attribute ::= Name Eq AttValue
4324 */
4325
4326 void
htmlParseElement(htmlParserCtxtPtr ctxt)4327 htmlParseElement(htmlParserCtxtPtr ctxt) {
4328 const xmlChar *name;
4329 xmlChar *currentNode = NULL;
4330 const htmlElemDesc * info;
4331 htmlParserNodeInfo node_info;
4332 int failed;
4333 int depth;
4334 const xmlChar *oldptr;
4335
4336 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4337 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4338 "htmlParseElement: context error\n", NULL, NULL);
4339 return;
4340 }
4341
4342 if (ctxt->instate == XML_PARSER_EOF)
4343 return;
4344
4345 /* Capture start position */
4346 if (ctxt->record_info) {
4347 node_info.begin_pos = ctxt->input->consumed +
4348 (CUR_PTR - ctxt->input->base);
4349 node_info.begin_line = ctxt->input->line;
4350 }
4351
4352 failed = htmlParseStartTag(ctxt);
4353 name = ctxt->name;
4354 if ((failed == -1) || (name == NULL)) {
4355 if (CUR == '>')
4356 NEXT;
4357 return;
4358 }
4359
4360 /*
4361 * Lookup the info for that element.
4362 */
4363 info = htmlTagLookup(name);
4364 if (info == NULL) {
4365 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4366 "Tag %s invalid\n", name, NULL);
4367 }
4368
4369 /*
4370 * Check for an Empty Element labeled the XML/SGML way
4371 */
4372 if ((CUR == '/') && (NXT(1) == '>')) {
4373 SKIP(2);
4374 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4375 ctxt->sax->endElement(ctxt->userData, name);
4376 htmlnamePop(ctxt);
4377 return;
4378 }
4379
4380 if (CUR == '>') {
4381 NEXT;
4382 } else {
4383 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4384 "Couldn't find end of Start Tag %s\n", name, NULL);
4385
4386 /*
4387 * end of parsing of this node.
4388 */
4389 if (xmlStrEqual(name, ctxt->name)) {
4390 nodePop(ctxt);
4391 htmlnamePop(ctxt);
4392 }
4393
4394 /*
4395 * Capture end position and add node
4396 */
4397 if (ctxt->record_info) {
4398 node_info.end_pos = ctxt->input->consumed +
4399 (CUR_PTR - ctxt->input->base);
4400 node_info.end_line = ctxt->input->line;
4401 node_info.node = ctxt->node;
4402 xmlParserAddNodeInfo(ctxt, &node_info);
4403 }
4404 return;
4405 }
4406
4407 /*
4408 * Check for an Empty Element from DTD definition
4409 */
4410 if ((info != NULL) && (info->empty)) {
4411 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4412 ctxt->sax->endElement(ctxt->userData, name);
4413 htmlnamePop(ctxt);
4414 return;
4415 }
4416
4417 /*
4418 * Parse the content of the element:
4419 */
4420 currentNode = xmlStrdup(ctxt->name);
4421 depth = ctxt->nameNr;
4422 while (IS_CHAR_CH(CUR)) {
4423 oldptr = ctxt->input->cur;
4424 htmlParseContent(ctxt);
4425 if (oldptr==ctxt->input->cur) break;
4426 if (ctxt->nameNr < depth) break;
4427 }
4428
4429 /*
4430 * Capture end position and add node
4431 */
4432 if ( currentNode != NULL && ctxt->record_info ) {
4433 node_info.end_pos = ctxt->input->consumed +
4434 (CUR_PTR - ctxt->input->base);
4435 node_info.end_line = ctxt->input->line;
4436 node_info.node = ctxt->node;
4437 xmlParserAddNodeInfo(ctxt, &node_info);
4438 }
4439 if (!IS_CHAR_CH(CUR)) {
4440 htmlAutoCloseOnEnd(ctxt);
4441 }
4442
4443 if (currentNode != NULL)
4444 xmlFree(currentNode);
4445 }
4446
4447 static void
htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt)4448 htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4449 /*
4450 * Capture end position and add node
4451 */
4452 if ( ctxt->node != NULL && ctxt->record_info ) {
4453 ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4454 (CUR_PTR - ctxt->input->base);
4455 ctxt->nodeInfo->end_line = ctxt->input->line;
4456 ctxt->nodeInfo->node = ctxt->node;
4457 xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4458 htmlNodeInfoPop(ctxt);
4459 }
4460 if (!IS_CHAR_CH(CUR)) {
4461 htmlAutoCloseOnEnd(ctxt);
4462 }
4463 }
4464
4465 /**
4466 * htmlParseElementInternal:
4467 * @ctxt: an HTML parser context
4468 *
4469 * parse an HTML element, new version, non recursive
4470 *
4471 * [39] element ::= EmptyElemTag | STag content ETag
4472 *
4473 * [41] Attribute ::= Name Eq AttValue
4474 */
4475
4476 static void
htmlParseElementInternal(htmlParserCtxtPtr ctxt)4477 htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4478 const xmlChar *name;
4479 const htmlElemDesc * info;
4480 htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
4481 int failed;
4482
4483 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4484 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4485 "htmlParseElementInternal: context error\n", NULL, NULL);
4486 return;
4487 }
4488
4489 if (ctxt->instate == XML_PARSER_EOF)
4490 return;
4491
4492 /* Capture start position */
4493 if (ctxt->record_info) {
4494 node_info.begin_pos = ctxt->input->consumed +
4495 (CUR_PTR - ctxt->input->base);
4496 node_info.begin_line = ctxt->input->line;
4497 }
4498
4499 failed = htmlParseStartTag(ctxt);
4500 name = ctxt->name;
4501 if ((failed == -1) || (name == NULL)) {
4502 if (CUR == '>')
4503 NEXT;
4504 return;
4505 }
4506
4507 /*
4508 * Lookup the info for that element.
4509 */
4510 info = htmlTagLookup(name);
4511 if (info == NULL) {
4512 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4513 "Tag %s invalid\n", name, NULL);
4514 }
4515
4516 /*
4517 * Check for an Empty Element labeled the XML/SGML way
4518 */
4519 if ((CUR == '/') && (NXT(1) == '>')) {
4520 SKIP(2);
4521 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4522 ctxt->sax->endElement(ctxt->userData, name);
4523 htmlnamePop(ctxt);
4524 return;
4525 }
4526
4527 if (CUR == '>') {
4528 NEXT;
4529 } else {
4530 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4531 "Couldn't find end of Start Tag %s\n", name, NULL);
4532
4533 /*
4534 * end of parsing of this node.
4535 */
4536 if (xmlStrEqual(name, ctxt->name)) {
4537 nodePop(ctxt);
4538 htmlnamePop(ctxt);
4539 }
4540
4541 if (ctxt->record_info)
4542 htmlNodeInfoPush(ctxt, &node_info);
4543 htmlParserFinishElementParsing(ctxt);
4544 return;
4545 }
4546
4547 /*
4548 * Check for an Empty Element from DTD definition
4549 */
4550 if ((info != NULL) && (info->empty)) {
4551 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4552 ctxt->sax->endElement(ctxt->userData, name);
4553 htmlnamePop(ctxt);
4554 return;
4555 }
4556
4557 if (ctxt->record_info)
4558 htmlNodeInfoPush(ctxt, &node_info);
4559 }
4560
4561 /**
4562 * htmlParseContentInternal:
4563 * @ctxt: an HTML parser context
4564 *
4565 * Parse a content: comment, sub-element, reference or text.
4566 * New version for non recursive htmlParseElementInternal
4567 */
4568
4569 static void
htmlParseContentInternal(htmlParserCtxtPtr ctxt)4570 htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4571 xmlChar *currentNode;
4572 int depth;
4573 const xmlChar *name;
4574
4575 currentNode = xmlStrdup(ctxt->name);
4576 depth = ctxt->nameNr;
4577 while (1) {
4578 long cons = ctxt->nbChars;
4579
4580 GROW;
4581
4582 if (ctxt->instate == XML_PARSER_EOF)
4583 break;
4584
4585 /*
4586 * Our tag or one of it's parent or children is ending.
4587 */
4588 if ((CUR == '<') && (NXT(1) == '/')) {
4589 if (htmlParseEndTag(ctxt) &&
4590 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4591 if (currentNode != NULL)
4592 xmlFree(currentNode);
4593
4594 currentNode = xmlStrdup(ctxt->name);
4595 depth = ctxt->nameNr;
4596 }
4597 continue; /* while */
4598 }
4599
4600 else if ((CUR == '<') &&
4601 ((IS_ASCII_LETTER(NXT(1))) ||
4602 (NXT(1) == '_') || (NXT(1) == ':'))) {
4603 name = htmlParseHTMLName_nonInvasive(ctxt);
4604 if (name == NULL) {
4605 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4606 "htmlParseStartTag: invalid element name\n",
4607 NULL, NULL);
4608 /* Dump the bogus tag like browsers do */
4609 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
4610 NEXT;
4611
4612 htmlParserFinishElementParsing(ctxt);
4613 if (currentNode != NULL)
4614 xmlFree(currentNode);
4615
4616 currentNode = xmlStrdup(ctxt->name);
4617 depth = ctxt->nameNr;
4618 continue;
4619 }
4620
4621 if (ctxt->name != NULL) {
4622 if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4623 htmlAutoClose(ctxt, name);
4624 continue;
4625 }
4626 }
4627 }
4628
4629 /*
4630 * Has this node been popped out during parsing of
4631 * the next element
4632 */
4633 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4634 (!xmlStrEqual(currentNode, ctxt->name)))
4635 {
4636 htmlParserFinishElementParsing(ctxt);
4637 if (currentNode != NULL) xmlFree(currentNode);
4638
4639 currentNode = xmlStrdup(ctxt->name);
4640 depth = ctxt->nameNr;
4641 continue;
4642 }
4643
4644 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4645 (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4646 /*
4647 * Handle SCRIPT/STYLE separately
4648 */
4649 htmlParseScript(ctxt);
4650 } else {
4651 /*
4652 * Sometimes DOCTYPE arrives in the middle of the document
4653 */
4654 if ((CUR == '<') && (NXT(1) == '!') &&
4655 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4656 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4657 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4658 (UPP(8) == 'E')) {
4659 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4660 "Misplaced DOCTYPE declaration\n",
4661 BAD_CAST "DOCTYPE" , NULL);
4662 htmlParseDocTypeDecl(ctxt);
4663 }
4664
4665 /*
4666 * First case : a comment
4667 */
4668 if ((CUR == '<') && (NXT(1) == '!') &&
4669 (NXT(2) == '-') && (NXT(3) == '-')) {
4670 htmlParseComment(ctxt);
4671 }
4672
4673 /*
4674 * Second case : a Processing Instruction.
4675 */
4676 else if ((CUR == '<') && (NXT(1) == '?')) {
4677 htmlParsePI(ctxt);
4678 }
4679
4680 /*
4681 * Third case : a sub-element.
4682 */
4683 else if (CUR == '<') {
4684 htmlParseElementInternal(ctxt);
4685 if (currentNode != NULL) xmlFree(currentNode);
4686
4687 currentNode = xmlStrdup(ctxt->name);
4688 depth = ctxt->nameNr;
4689 }
4690
4691 /*
4692 * Fourth case : a reference. If if has not been resolved,
4693 * parsing returns it's Name, create the node
4694 */
4695 else if (CUR == '&') {
4696 htmlParseReference(ctxt);
4697 }
4698
4699 /*
4700 * Fifth case : end of the resource
4701 */
4702 else if (CUR == 0) {
4703 htmlAutoCloseOnEnd(ctxt);
4704 break;
4705 }
4706
4707 /*
4708 * Last case, text. Note that References are handled directly.
4709 */
4710 else {
4711 htmlParseCharData(ctxt);
4712 }
4713
4714 if (cons == ctxt->nbChars) {
4715 if (ctxt->node != NULL) {
4716 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4717 "detected an error in element content\n",
4718 NULL, NULL);
4719 }
4720 break;
4721 }
4722 }
4723 GROW;
4724 }
4725 if (currentNode != NULL) xmlFree(currentNode);
4726 }
4727
4728 /**
4729 * htmlParseContent:
4730 * @ctxt: an HTML parser context
4731 *
4732 * Parse a content: comment, sub-element, reference or text.
4733 * This is the entry point when called from parser.c
4734 */
4735
4736 void
__htmlParseContent(void * ctxt)4737 __htmlParseContent(void *ctxt) {
4738 if (ctxt != NULL)
4739 htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4740 }
4741
4742 /**
4743 * htmlParseDocument:
4744 * @ctxt: an HTML parser context
4745 *
4746 * parse an HTML document (and build a tree if using the standard SAX
4747 * interface).
4748 *
4749 * Returns 0, -1 in case of error. the parser context is augmented
4750 * as a result of the parsing.
4751 */
4752
4753 int
htmlParseDocument(htmlParserCtxtPtr ctxt)4754 htmlParseDocument(htmlParserCtxtPtr ctxt) {
4755 xmlChar start[4];
4756 xmlCharEncoding enc;
4757 xmlDtdPtr dtd;
4758
4759 xmlInitParser();
4760
4761 htmlDefaultSAXHandlerInit();
4762
4763 if ((ctxt == NULL) || (ctxt->input == NULL)) {
4764 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4765 "htmlParseDocument: context error\n", NULL, NULL);
4766 return(XML_ERR_INTERNAL_ERROR);
4767 }
4768 ctxt->html = 1;
4769 ctxt->linenumbers = 1;
4770 GROW;
4771 /*
4772 * SAX: beginning of the document processing.
4773 */
4774 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4775 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4776
4777 if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4778 ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4779 /*
4780 * Get the 4 first bytes and decode the charset
4781 * if enc != XML_CHAR_ENCODING_NONE
4782 * plug some encoding conversion routines.
4783 */
4784 start[0] = RAW;
4785 start[1] = NXT(1);
4786 start[2] = NXT(2);
4787 start[3] = NXT(3);
4788 enc = xmlDetectCharEncoding(&start[0], 4);
4789 if (enc != XML_CHAR_ENCODING_NONE) {
4790 xmlSwitchEncoding(ctxt, enc);
4791 }
4792 }
4793
4794 /*
4795 * Wipe out everything which is before the first '<'
4796 */
4797 SKIP_BLANKS;
4798 if (CUR == 0) {
4799 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4800 "Document is empty\n", NULL, NULL);
4801 }
4802
4803 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4804 ctxt->sax->startDocument(ctxt->userData);
4805
4806
4807 /*
4808 * Parse possible comments and PIs before any content
4809 */
4810 while (((CUR == '<') && (NXT(1) == '!') &&
4811 (NXT(2) == '-') && (NXT(3) == '-')) ||
4812 ((CUR == '<') && (NXT(1) == '?'))) {
4813 htmlParseComment(ctxt);
4814 htmlParsePI(ctxt);
4815 SKIP_BLANKS;
4816 }
4817
4818
4819 /*
4820 * Then possibly doc type declaration(s) and more Misc
4821 * (doctypedecl Misc*)?
4822 */
4823 if ((CUR == '<') && (NXT(1) == '!') &&
4824 (UPP(2) == 'D') && (UPP(3) == 'O') &&
4825 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4826 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4827 (UPP(8) == 'E')) {
4828 htmlParseDocTypeDecl(ctxt);
4829 }
4830 SKIP_BLANKS;
4831
4832 /*
4833 * Parse possible comments and PIs before any content
4834 */
4835 while (((CUR == '<') && (NXT(1) == '!') &&
4836 (NXT(2) == '-') && (NXT(3) == '-')) ||
4837 ((CUR == '<') && (NXT(1) == '?'))) {
4838 htmlParseComment(ctxt);
4839 htmlParsePI(ctxt);
4840 SKIP_BLANKS;
4841 }
4842
4843 /*
4844 * Time to start parsing the tree itself
4845 */
4846 htmlParseContentInternal(ctxt);
4847
4848 /*
4849 * autoclose
4850 */
4851 if (CUR == 0)
4852 htmlAutoCloseOnEnd(ctxt);
4853
4854
4855 /*
4856 * SAX: end of the document processing.
4857 */
4858 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4859 ctxt->sax->endDocument(ctxt->userData);
4860
4861 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
4862 dtd = xmlGetIntSubset(ctxt->myDoc);
4863 if (dtd == NULL)
4864 ctxt->myDoc->intSubset =
4865 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4866 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4867 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4868 }
4869 if (! ctxt->wellFormed) return(-1);
4870 return(0);
4871 }
4872
4873
4874 /************************************************************************
4875 * *
4876 * Parser contexts handling *
4877 * *
4878 ************************************************************************/
4879
4880 /**
4881 * htmlInitParserCtxt:
4882 * @ctxt: an HTML parser context
4883 *
4884 * Initialize a parser context
4885 *
4886 * Returns 0 in case of success and -1 in case of error
4887 */
4888
4889 static int
htmlInitParserCtxt(htmlParserCtxtPtr ctxt)4890 htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4891 {
4892 htmlSAXHandler *sax;
4893
4894 if (ctxt == NULL) return(-1);
4895 memset(ctxt, 0, sizeof(htmlParserCtxt));
4896
4897 ctxt->dict = xmlDictCreate();
4898 if (ctxt->dict == NULL) {
4899 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4900 return(-1);
4901 }
4902 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4903 if (sax == NULL) {
4904 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4905 return(-1);
4906 }
4907 else
4908 memset(sax, 0, sizeof(htmlSAXHandler));
4909
4910 /* Allocate the Input stack */
4911 ctxt->inputTab = (htmlParserInputPtr *)
4912 xmlMalloc(5 * sizeof(htmlParserInputPtr));
4913 if (ctxt->inputTab == NULL) {
4914 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4915 ctxt->inputNr = 0;
4916 ctxt->inputMax = 0;
4917 ctxt->input = NULL;
4918 return(-1);
4919 }
4920 ctxt->inputNr = 0;
4921 ctxt->inputMax = 5;
4922 ctxt->input = NULL;
4923 ctxt->version = NULL;
4924 ctxt->encoding = NULL;
4925 ctxt->standalone = -1;
4926 ctxt->instate = XML_PARSER_START;
4927
4928 /* Allocate the Node stack */
4929 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4930 if (ctxt->nodeTab == NULL) {
4931 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4932 ctxt->nodeNr = 0;
4933 ctxt->nodeMax = 0;
4934 ctxt->node = NULL;
4935 ctxt->inputNr = 0;
4936 ctxt->inputMax = 0;
4937 ctxt->input = NULL;
4938 return(-1);
4939 }
4940 ctxt->nodeNr = 0;
4941 ctxt->nodeMax = 10;
4942 ctxt->node = NULL;
4943
4944 /* Allocate the Name stack */
4945 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
4946 if (ctxt->nameTab == NULL) {
4947 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4948 ctxt->nameNr = 0;
4949 ctxt->nameMax = 0;
4950 ctxt->name = NULL;
4951 ctxt->nodeNr = 0;
4952 ctxt->nodeMax = 0;
4953 ctxt->node = NULL;
4954 ctxt->inputNr = 0;
4955 ctxt->inputMax = 0;
4956 ctxt->input = NULL;
4957 return(-1);
4958 }
4959 ctxt->nameNr = 0;
4960 ctxt->nameMax = 10;
4961 ctxt->name = NULL;
4962
4963 ctxt->nodeInfoTab = NULL;
4964 ctxt->nodeInfoNr = 0;
4965 ctxt->nodeInfoMax = 0;
4966
4967 if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
4968 else {
4969 ctxt->sax = sax;
4970 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
4971 }
4972 ctxt->userData = ctxt;
4973 ctxt->myDoc = NULL;
4974 ctxt->wellFormed = 1;
4975 ctxt->replaceEntities = 0;
4976 ctxt->linenumbers = xmlLineNumbersDefaultValue;
4977 ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
4978 ctxt->html = 1;
4979 ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
4980 ctxt->vctxt.userData = ctxt;
4981 ctxt->vctxt.error = xmlParserValidityError;
4982 ctxt->vctxt.warning = xmlParserValidityWarning;
4983 ctxt->record_info = 0;
4984 ctxt->validate = 0;
4985 ctxt->nbChars = 0;
4986 ctxt->checkIndex = 0;
4987 ctxt->catalogs = NULL;
4988 xmlInitNodeInfoSeq(&ctxt->node_seq);
4989 return(0);
4990 }
4991
/**
 * htmlFreeParserCtxt:
 * @ctxt:  an HTML parser context
 *
 * Free all the memory used by a parser context. However the parsed
 * document in ctxt->myDoc is not freed.
 *
 * This is a thin wrapper: @ctxt is passed unchanged to xmlFreeParserCtxt().
 */

void
htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
{
    /* HTML and XML parser contexts are released by the same routine. */
    xmlFreeParserCtxt(ctxt);
}
5005
5006 /**
5007 * htmlNewParserCtxt:
5008 *
5009 * Allocate and initialize a new parser context.
5010 *
5011 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
5012 */
5013
5014 htmlParserCtxtPtr
htmlNewParserCtxt(void)5015 htmlNewParserCtxt(void)
5016 {
5017 xmlParserCtxtPtr ctxt;
5018
5019 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
5020 if (ctxt == NULL) {
5021 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
5022 return(NULL);
5023 }
5024 memset(ctxt, 0, sizeof(xmlParserCtxt));
5025 if (htmlInitParserCtxt(ctxt) < 0) {
5026 htmlFreeParserCtxt(ctxt);
5027 return(NULL);
5028 }
5029 return(ctxt);
5030 }
5031
5032 /**
5033 * htmlCreateMemoryParserCtxt:
5034 * @buffer: a pointer to a char array
5035 * @size: the size of the array
5036 *
5037 * Create a parser context for an HTML in-memory document.
5038 *
5039 * Returns the new parser context or NULL
5040 */
5041 htmlParserCtxtPtr
htmlCreateMemoryParserCtxt(const char * buffer,int size)5042 htmlCreateMemoryParserCtxt(const char *buffer, int size) {
5043 xmlParserCtxtPtr ctxt;
5044 xmlParserInputPtr input;
5045 xmlParserInputBufferPtr buf;
5046
5047 if (buffer == NULL)
5048 return(NULL);
5049 if (size <= 0)
5050 return(NULL);
5051
5052 ctxt = htmlNewParserCtxt();
5053 if (ctxt == NULL)
5054 return(NULL);
5055
5056 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
5057 if (buf == NULL) return(NULL);
5058
5059 input = xmlNewInputStream(ctxt);
5060 if (input == NULL) {
5061 xmlFreeParserCtxt(ctxt);
5062 return(NULL);
5063 }
5064
5065 input->filename = NULL;
5066 input->buf = buf;
5067 xmlBufResetInput(buf->buffer, input);
5068
5069 inputPush(ctxt, input);
5070 return(ctxt);
5071 }
5072
5073 /**
5074 * htmlCreateDocParserCtxt:
5075 * @cur: a pointer to an array of xmlChar
5076 * @encoding: a free form C string describing the HTML document encoding, or NULL
5077 *
5078 * Create a parser context for an HTML document.
5079 *
5080 * TODO: check the need to add encoding handling there
5081 *
5082 * Returns the new parser context or NULL
5083 */
5084 static htmlParserCtxtPtr
htmlCreateDocParserCtxt(const xmlChar * cur,const char * encoding)5085 htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
5086 int len;
5087 htmlParserCtxtPtr ctxt;
5088
5089 if (cur == NULL)
5090 return(NULL);
5091 len = xmlStrlen(cur);
5092 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
5093 if (ctxt == NULL)
5094 return(NULL);
5095
5096 if (encoding != NULL) {
5097 xmlCharEncoding enc;
5098 xmlCharEncodingHandlerPtr handler;
5099
5100 if (ctxt->input->encoding != NULL)
5101 xmlFree((xmlChar *) ctxt->input->encoding);
5102 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
5103
5104 enc = xmlParseCharEncoding(encoding);
5105 /*
5106 * registered set of known encodings
5107 */
5108 if (enc != XML_CHAR_ENCODING_ERROR) {
5109 xmlSwitchEncoding(ctxt, enc);
5110 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
5111 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5112 "Unsupported encoding %s\n",
5113 (const xmlChar *) encoding, NULL);
5114 }
5115 } else {
5116 /*
5117 * fallback for unknown encodings
5118 */
5119 handler = xmlFindCharEncodingHandler((const char *) encoding);
5120 if (handler != NULL) {
5121 xmlSwitchToEncoding(ctxt, handler);
5122 } else {
5123 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5124 "Unsupported encoding %s\n",
5125 (const xmlChar *) encoding, NULL);
5126 }
5127 }
5128 }
5129 return(ctxt);
5130 }
5131
5132 #ifdef LIBXML_PUSH_ENABLED
5133 /************************************************************************
5134 * *
5135 * Progressive parsing interfaces *
5136 * *
5137 ************************************************************************/
5138
5139 /**
5140 * htmlParseLookupSequence:
5141 * @ctxt: an HTML parser context
5142 * @first: the first char to lookup
5143 * @next: the next char to lookup or zero
5144 * @third: the next char to lookup or zero
5145 * @comment: flag to force checking inside comments
5146 *
5147 * Try to find if a sequence (first, next, third) or just (first next) or
5148 * (first) is available in the input stream.
5149 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5150 * to avoid rescanning sequences of bytes, it DOES change the state of the
5151 * parser, do not use liberally.
5152 * This is basically similar to xmlParseLookupSequence()
5153 *
5154 * Returns the index to the current parsing point if the full sequence
5155 * is available, -1 otherwise.
5156 */
5157 static int
htmlParseLookupSequence(htmlParserCtxtPtr ctxt,xmlChar first,xmlChar next,xmlChar third,int iscomment,int ignoreattrval)5158 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
5159 xmlChar next, xmlChar third, int iscomment,
5160 int ignoreattrval)
5161 {
5162 int base, len;
5163 htmlParserInputPtr in;
5164 const xmlChar *buf;
5165 int incomment = 0;
5166 int invalue = 0;
5167 char valdellim = 0x0;
5168
5169 in = ctxt->input;
5170 if (in == NULL)
5171 return (-1);
5172
5173 base = in->cur - in->base;
5174 if (base < 0)
5175 return (-1);
5176
5177 if (ctxt->checkIndex > base)
5178 base = ctxt->checkIndex;
5179
5180 if (in->buf == NULL) {
5181 buf = in->base;
5182 len = in->length;
5183 } else {
5184 buf = xmlBufContent(in->buf->buffer);
5185 len = xmlBufUse(in->buf->buffer);
5186 }
5187
5188 /* take into account the sequence length */
5189 if (third)
5190 len -= 2;
5191 else if (next)
5192 len--;
5193 for (; base < len; base++) {
5194 if ((!incomment) && (base + 4 < len) && (!iscomment)) {
5195 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
5196 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
5197 incomment = 1;
5198 /* do not increment past <! - some people use <!--> */
5199 base += 2;
5200 }
5201 }
5202 if (ignoreattrval) {
5203 if (buf[base] == '"' || buf[base] == '\'') {
5204 if (invalue) {
5205 if (buf[base] == valdellim) {
5206 invalue = 0;
5207 continue;
5208 }
5209 } else {
5210 valdellim = buf[base];
5211 invalue = 1;
5212 continue;
5213 }
5214 } else if (invalue) {
5215 continue;
5216 }
5217 }
5218 if (incomment) {
5219 if (base + 3 > len)
5220 return (-1);
5221 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
5222 (buf[base + 2] == '>')) {
5223 incomment = 0;
5224 base += 2;
5225 }
5226 continue;
5227 }
5228 if (buf[base] == first) {
5229 if (third != 0) {
5230 if ((buf[base + 1] != next) || (buf[base + 2] != third))
5231 continue;
5232 } else if (next != 0) {
5233 if (buf[base + 1] != next)
5234 continue;
5235 }
5236 ctxt->checkIndex = 0;
5237 #ifdef DEBUG_PUSH
5238 if (next == 0)
5239 xmlGenericError(xmlGenericErrorContext,
5240 "HPP: lookup '%c' found at %d\n",
5241 first, base);
5242 else if (third == 0)
5243 xmlGenericError(xmlGenericErrorContext,
5244 "HPP: lookup '%c%c' found at %d\n",
5245 first, next, base);
5246 else
5247 xmlGenericError(xmlGenericErrorContext,
5248 "HPP: lookup '%c%c%c' found at %d\n",
5249 first, next, third, base);
5250 #endif
5251 return (base - (in->cur - in->base));
5252 }
5253 }
5254 if ((!incomment) && (!invalue))
5255 ctxt->checkIndex = base;
5256 #ifdef DEBUG_PUSH
5257 if (next == 0)
5258 xmlGenericError(xmlGenericErrorContext,
5259 "HPP: lookup '%c' failed\n", first);
5260 else if (third == 0)
5261 xmlGenericError(xmlGenericErrorContext,
5262 "HPP: lookup '%c%c' failed\n", first, next);
5263 else
5264 xmlGenericError(xmlGenericErrorContext,
5265 "HPP: lookup '%c%c%c' failed\n", first, next,
5266 third);
5267 #endif
5268 return (-1);
5269 }
5270
5271 /**
5272 * htmlParseLookupChars:
5273 * @ctxt: an HTML parser context
5274 * @stop: Array of chars, which stop the lookup.
5275 * @stopLen: Length of stop-Array
5276 *
5277 * Try to find if any char of the stop-Array is available in the input
5278 * stream.
5279 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5280 * to avoid rescanning sequences of bytes, it DOES change the state of the
5281 * parser, do not use liberally.
5282 *
5283 * Returns the index to the current parsing point if a stopChar
5284 * is available, -1 otherwise.
5285 */
5286 static int
htmlParseLookupChars(htmlParserCtxtPtr ctxt,const xmlChar * stop,int stopLen)5287 htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
5288 int stopLen)
5289 {
5290 int base, len;
5291 htmlParserInputPtr in;
5292 const xmlChar *buf;
5293 int incomment = 0;
5294 int i;
5295
5296 in = ctxt->input;
5297 if (in == NULL)
5298 return (-1);
5299
5300 base = in->cur - in->base;
5301 if (base < 0)
5302 return (-1);
5303
5304 if (ctxt->checkIndex > base)
5305 base = ctxt->checkIndex;
5306
5307 if (in->buf == NULL) {
5308 buf = in->base;
5309 len = in->length;
5310 } else {
5311 buf = xmlBufContent(in->buf->buffer);
5312 len = xmlBufUse(in->buf->buffer);
5313 }
5314
5315 for (; base < len; base++) {
5316 if (!incomment && (base + 4 < len)) {
5317 if ((buf[base] == '<') && (buf[base + 1] == '!') &&
5318 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
5319 incomment = 1;
5320 /* do not increment past <! - some people use <!--> */
5321 base += 2;
5322 }
5323 }
5324 if (incomment) {
5325 if (base + 3 > len)
5326 return (-1);
5327 if ((buf[base] == '-') && (buf[base + 1] == '-') &&
5328 (buf[base + 2] == '>')) {
5329 incomment = 0;
5330 base += 2;
5331 }
5332 continue;
5333 }
5334 for (i = 0; i < stopLen; ++i) {
5335 if (buf[base] == stop[i]) {
5336 ctxt->checkIndex = 0;
5337 return (base - (in->cur - in->base));
5338 }
5339 }
5340 }
5341 ctxt->checkIndex = base;
5342 return (-1);
5343 }
5344
5345 /**
5346 * htmlParseTryOrFinish:
5347 * @ctxt: an HTML parser context
5348 * @terminate: last chunk indicator
5349 *
5350 * Try to progress on parsing
5351 *
5352 * Returns zero if no parsing was possible
5353 */
5354 static int
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt,int terminate)5355 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5356 int ret = 0;
5357 htmlParserInputPtr in;
5358 int avail = 0;
5359 xmlChar cur, next;
5360
5361 htmlParserNodeInfo node_info;
5362
5363 #ifdef DEBUG_PUSH
5364 switch (ctxt->instate) {
5365 case XML_PARSER_EOF:
5366 xmlGenericError(xmlGenericErrorContext,
5367 "HPP: try EOF\n"); break;
5368 case XML_PARSER_START:
5369 xmlGenericError(xmlGenericErrorContext,
5370 "HPP: try START\n"); break;
5371 case XML_PARSER_MISC:
5372 xmlGenericError(xmlGenericErrorContext,
5373 "HPP: try MISC\n");break;
5374 case XML_PARSER_COMMENT:
5375 xmlGenericError(xmlGenericErrorContext,
5376 "HPP: try COMMENT\n");break;
5377 case XML_PARSER_PROLOG:
5378 xmlGenericError(xmlGenericErrorContext,
5379 "HPP: try PROLOG\n");break;
5380 case XML_PARSER_START_TAG:
5381 xmlGenericError(xmlGenericErrorContext,
5382 "HPP: try START_TAG\n");break;
5383 case XML_PARSER_CONTENT:
5384 xmlGenericError(xmlGenericErrorContext,
5385 "HPP: try CONTENT\n");break;
5386 case XML_PARSER_CDATA_SECTION:
5387 xmlGenericError(xmlGenericErrorContext,
5388 "HPP: try CDATA_SECTION\n");break;
5389 case XML_PARSER_END_TAG:
5390 xmlGenericError(xmlGenericErrorContext,
5391 "HPP: try END_TAG\n");break;
5392 case XML_PARSER_ENTITY_DECL:
5393 xmlGenericError(xmlGenericErrorContext,
5394 "HPP: try ENTITY_DECL\n");break;
5395 case XML_PARSER_ENTITY_VALUE:
5396 xmlGenericError(xmlGenericErrorContext,
5397 "HPP: try ENTITY_VALUE\n");break;
5398 case XML_PARSER_ATTRIBUTE_VALUE:
5399 xmlGenericError(xmlGenericErrorContext,
5400 "HPP: try ATTRIBUTE_VALUE\n");break;
5401 case XML_PARSER_DTD:
5402 xmlGenericError(xmlGenericErrorContext,
5403 "HPP: try DTD\n");break;
5404 case XML_PARSER_EPILOG:
5405 xmlGenericError(xmlGenericErrorContext,
5406 "HPP: try EPILOG\n");break;
5407 case XML_PARSER_PI:
5408 xmlGenericError(xmlGenericErrorContext,
5409 "HPP: try PI\n");break;
5410 case XML_PARSER_SYSTEM_LITERAL:
5411 xmlGenericError(xmlGenericErrorContext,
5412 "HPP: try SYSTEM_LITERAL\n");break;
5413 }
5414 #endif
5415
5416 while (1) {
5417
5418 in = ctxt->input;
5419 if (in == NULL) break;
5420 if (in->buf == NULL)
5421 avail = in->length - (in->cur - in->base);
5422 else
5423 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
5424 if ((avail == 0) && (terminate)) {
5425 htmlAutoCloseOnEnd(ctxt);
5426 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5427 /*
5428 * SAX: end of the document processing.
5429 */
5430 ctxt->instate = XML_PARSER_EOF;
5431 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5432 ctxt->sax->endDocument(ctxt->userData);
5433 }
5434 }
5435 if (avail < 1)
5436 goto done;
5437 cur = in->cur[0];
5438 if (cur == 0) {
5439 SKIP(1);
5440 continue;
5441 }
5442
5443 switch (ctxt->instate) {
5444 case XML_PARSER_EOF:
5445 /*
5446 * Document parsing is done !
5447 */
5448 goto done;
5449 case XML_PARSER_START:
5450 /*
5451 * Very first chars read from the document flow.
5452 */
5453 cur = in->cur[0];
5454 if (IS_BLANK_CH(cur)) {
5455 SKIP_BLANKS;
5456 if (in->buf == NULL)
5457 avail = in->length - (in->cur - in->base);
5458 else
5459 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
5460 }
5461 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5462 ctxt->sax->setDocumentLocator(ctxt->userData,
5463 &xmlDefaultSAXLocator);
5464 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5465 (!ctxt->disableSAX))
5466 ctxt->sax->startDocument(ctxt->userData);
5467
5468 cur = in->cur[0];
5469 next = in->cur[1];
5470 if ((cur == '<') && (next == '!') &&
5471 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5472 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5473 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5474 (UPP(8) == 'E')) {
5475 if ((!terminate) &&
5476 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5477 goto done;
5478 #ifdef DEBUG_PUSH
5479 xmlGenericError(xmlGenericErrorContext,
5480 "HPP: Parsing internal subset\n");
5481 #endif
5482 htmlParseDocTypeDecl(ctxt);
5483 ctxt->instate = XML_PARSER_PROLOG;
5484 #ifdef DEBUG_PUSH
5485 xmlGenericError(xmlGenericErrorContext,
5486 "HPP: entering PROLOG\n");
5487 #endif
5488 } else {
5489 ctxt->instate = XML_PARSER_MISC;
5490 #ifdef DEBUG_PUSH
5491 xmlGenericError(xmlGenericErrorContext,
5492 "HPP: entering MISC\n");
5493 #endif
5494 }
5495 break;
5496 case XML_PARSER_MISC:
5497 SKIP_BLANKS;
5498 if (in->buf == NULL)
5499 avail = in->length - (in->cur - in->base);
5500 else
5501 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
5502 /*
5503 * no chars in buffer
5504 */
5505 if (avail < 1)
5506 goto done;
5507 /*
5508 * not enouth chars in buffer
5509 */
5510 if (avail < 2) {
5511 if (!terminate)
5512 goto done;
5513 else
5514 next = ' ';
5515 } else {
5516 next = in->cur[1];
5517 }
5518 cur = in->cur[0];
5519 if ((cur == '<') && (next == '!') &&
5520 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5521 if ((!terminate) &&
5522 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
5523 goto done;
5524 #ifdef DEBUG_PUSH
5525 xmlGenericError(xmlGenericErrorContext,
5526 "HPP: Parsing Comment\n");
5527 #endif
5528 htmlParseComment(ctxt);
5529 ctxt->instate = XML_PARSER_MISC;
5530 } else if ((cur == '<') && (next == '?')) {
5531 if ((!terminate) &&
5532 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5533 goto done;
5534 #ifdef DEBUG_PUSH
5535 xmlGenericError(xmlGenericErrorContext,
5536 "HPP: Parsing PI\n");
5537 #endif
5538 htmlParsePI(ctxt);
5539 ctxt->instate = XML_PARSER_MISC;
5540 } else if ((cur == '<') && (next == '!') &&
5541 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5542 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5543 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5544 (UPP(8) == 'E')) {
5545 if ((!terminate) &&
5546 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5547 goto done;
5548 #ifdef DEBUG_PUSH
5549 xmlGenericError(xmlGenericErrorContext,
5550 "HPP: Parsing internal subset\n");
5551 #endif
5552 htmlParseDocTypeDecl(ctxt);
5553 ctxt->instate = XML_PARSER_PROLOG;
5554 #ifdef DEBUG_PUSH
5555 xmlGenericError(xmlGenericErrorContext,
5556 "HPP: entering PROLOG\n");
5557 #endif
5558 } else if ((cur == '<') && (next == '!') &&
5559 (avail < 9)) {
5560 goto done;
5561 } else {
5562 ctxt->instate = XML_PARSER_START_TAG;
5563 #ifdef DEBUG_PUSH
5564 xmlGenericError(xmlGenericErrorContext,
5565 "HPP: entering START_TAG\n");
5566 #endif
5567 }
5568 break;
5569 case XML_PARSER_PROLOG:
5570 SKIP_BLANKS;
5571 if (in->buf == NULL)
5572 avail = in->length - (in->cur - in->base);
5573 else
5574 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
5575 if (avail < 2)
5576 goto done;
5577 cur = in->cur[0];
5578 next = in->cur[1];
5579 if ((cur == '<') && (next == '!') &&
5580 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5581 if ((!terminate) &&
5582 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
5583 goto done;
5584 #ifdef DEBUG_PUSH
5585 xmlGenericError(xmlGenericErrorContext,
5586 "HPP: Parsing Comment\n");
5587 #endif
5588 htmlParseComment(ctxt);
5589 ctxt->instate = XML_PARSER_PROLOG;
5590 } else if ((cur == '<') && (next == '?')) {
5591 if ((!terminate) &&
5592 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5593 goto done;
5594 #ifdef DEBUG_PUSH
5595 xmlGenericError(xmlGenericErrorContext,
5596 "HPP: Parsing PI\n");
5597 #endif
5598 htmlParsePI(ctxt);
5599 ctxt->instate = XML_PARSER_PROLOG;
5600 } else if ((cur == '<') && (next == '!') &&
5601 (avail < 4)) {
5602 goto done;
5603 } else {
5604 ctxt->instate = XML_PARSER_START_TAG;
5605 #ifdef DEBUG_PUSH
5606 xmlGenericError(xmlGenericErrorContext,
5607 "HPP: entering START_TAG\n");
5608 #endif
5609 }
5610 break;
5611 case XML_PARSER_EPILOG:
5612 if (in->buf == NULL)
5613 avail = in->length - (in->cur - in->base);
5614 else
5615 avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
5616 if (avail < 1)
5617 goto done;
5618 cur = in->cur[0];
5619 if (IS_BLANK_CH(cur)) {
5620 htmlParseCharData(ctxt);
5621 goto done;
5622 }
5623 if (avail < 2)
5624 goto done;
5625 next = in->cur[1];
5626 if ((cur == '<') && (next == '!') &&
5627 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5628 if ((!terminate) &&
5629 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
5630 goto done;
5631 #ifdef DEBUG_PUSH
5632 xmlGenericError(xmlGenericErrorContext,
5633 "HPP: Parsing Comment\n");
5634 #endif
5635 htmlParseComment(ctxt);
5636 ctxt->instate = XML_PARSER_EPILOG;
5637 } else if ((cur == '<') && (next == '?')) {
5638 if ((!terminate) &&
5639 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5640 goto done;
5641 #ifdef DEBUG_PUSH
5642 xmlGenericError(xmlGenericErrorContext,
5643 "HPP: Parsing PI\n");
5644 #endif
5645 htmlParsePI(ctxt);
5646 ctxt->instate = XML_PARSER_EPILOG;
5647 } else if ((cur == '<') && (next == '!') &&
5648 (avail < 4)) {
5649 goto done;
5650 } else {
5651 ctxt->errNo = XML_ERR_DOCUMENT_END;
5652 ctxt->wellFormed = 0;
5653 ctxt->instate = XML_PARSER_EOF;
5654 #ifdef DEBUG_PUSH
5655 xmlGenericError(xmlGenericErrorContext,
5656 "HPP: entering EOF\n");
5657 #endif
5658 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5659 ctxt->sax->endDocument(ctxt->userData);
5660 goto done;
5661 }
5662 break;
5663 case XML_PARSER_START_TAG: {
5664 const xmlChar *name;
5665 int failed;
5666 const htmlElemDesc * info;
5667
5668 /*
5669 * no chars in buffer
5670 */
5671 if (avail < 1)
5672 goto done;
5673 /*
5674 * not enouth chars in buffer
5675 */
5676 if (avail < 2) {
5677 if (!terminate)
5678 goto done;
5679 else
5680 next = ' ';
5681 } else {
5682 next = in->cur[1];
5683 }
5684 cur = in->cur[0];
5685 if (cur != '<') {
5686 ctxt->instate = XML_PARSER_CONTENT;
5687 #ifdef DEBUG_PUSH
5688 xmlGenericError(xmlGenericErrorContext,
5689 "HPP: entering CONTENT\n");
5690 #endif
5691 break;
5692 }
5693 if (next == '/') {
5694 ctxt->instate = XML_PARSER_END_TAG;
5695 ctxt->checkIndex = 0;
5696 #ifdef DEBUG_PUSH
5697 xmlGenericError(xmlGenericErrorContext,
5698 "HPP: entering END_TAG\n");
5699 #endif
5700 break;
5701 }
5702 if ((!terminate) &&
5703 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5704 goto done;
5705
5706 /* Capture start position */
5707 if (ctxt->record_info) {
5708 node_info.begin_pos = ctxt->input->consumed +
5709 (CUR_PTR - ctxt->input->base);
5710 node_info.begin_line = ctxt->input->line;
5711 }
5712
5713
5714 failed = htmlParseStartTag(ctxt);
5715 name = ctxt->name;
5716 if ((failed == -1) ||
5717 (name == NULL)) {
5718 if (CUR == '>')
5719 NEXT;
5720 break;
5721 }
5722
5723 /*
5724 * Lookup the info for that element.
5725 */
5726 info = htmlTagLookup(name);
5727 if (info == NULL) {
5728 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5729 "Tag %s invalid\n", name, NULL);
5730 }
5731
5732 /*
5733 * Check for an Empty Element labeled the XML/SGML way
5734 */
5735 if ((CUR == '/') && (NXT(1) == '>')) {
5736 SKIP(2);
5737 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5738 ctxt->sax->endElement(ctxt->userData, name);
5739 htmlnamePop(ctxt);
5740 ctxt->instate = XML_PARSER_CONTENT;
5741 #ifdef DEBUG_PUSH
5742 xmlGenericError(xmlGenericErrorContext,
5743 "HPP: entering CONTENT\n");
5744 #endif
5745 break;
5746 }
5747
5748 if (CUR == '>') {
5749 NEXT;
5750 } else {
5751 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5752 "Couldn't find end of Start Tag %s\n",
5753 name, NULL);
5754
5755 /*
5756 * end of parsing of this node.
5757 */
5758 if (xmlStrEqual(name, ctxt->name)) {
5759 nodePop(ctxt);
5760 htmlnamePop(ctxt);
5761 }
5762
5763 if (ctxt->record_info)
5764 htmlNodeInfoPush(ctxt, &node_info);
5765
5766 ctxt->instate = XML_PARSER_CONTENT;
5767 #ifdef DEBUG_PUSH
5768 xmlGenericError(xmlGenericErrorContext,
5769 "HPP: entering CONTENT\n");
5770 #endif
5771 break;
5772 }
5773
5774 /*
5775 * Check for an Empty Element from DTD definition
5776 */
5777 if ((info != NULL) && (info->empty)) {
5778 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5779 ctxt->sax->endElement(ctxt->userData, name);
5780 htmlnamePop(ctxt);
5781 }
5782
5783 if (ctxt->record_info)
5784 htmlNodeInfoPush(ctxt, &node_info);
5785
5786 ctxt->instate = XML_PARSER_CONTENT;
5787 #ifdef DEBUG_PUSH
5788 xmlGenericError(xmlGenericErrorContext,
5789 "HPP: entering CONTENT\n");
5790 #endif
5791 break;
5792 }
5793 case XML_PARSER_CONTENT: {
5794 long cons;
5795 /*
5796 * Handle preparsed entities and charRef
5797 */
5798 if (ctxt->token != 0) {
5799 xmlChar chr[2] = { 0 , 0 } ;
5800
5801 chr[0] = (xmlChar) ctxt->token;
5802 htmlCheckParagraph(ctxt);
5803 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5804 ctxt->sax->characters(ctxt->userData, chr, 1);
5805 ctxt->token = 0;
5806 ctxt->checkIndex = 0;
5807 }
5808 if ((avail == 1) && (terminate)) {
5809 cur = in->cur[0];
5810 if ((cur != '<') && (cur != '&')) {
5811 if (ctxt->sax != NULL) {
5812 if (IS_BLANK_CH(cur)) {
5813 if (ctxt->keepBlanks) {
5814 if (ctxt->sax->characters != NULL)
5815 ctxt->sax->characters(
5816 ctxt->userData, &in->cur[0], 1);
5817 } else {
5818 if (ctxt->sax->ignorableWhitespace != NULL)
5819 ctxt->sax->ignorableWhitespace(
5820 ctxt->userData, &in->cur[0], 1);
5821 }
5822 } else {
5823 htmlCheckParagraph(ctxt);
5824 if (ctxt->sax->characters != NULL)
5825 ctxt->sax->characters(
5826 ctxt->userData, &in->cur[0], 1);
5827 }
5828 }
5829 ctxt->token = 0;
5830 ctxt->checkIndex = 0;
5831 in->cur++;
5832 break;
5833 }
5834 }
5835 if (avail < 2)
5836 goto done;
5837 cur = in->cur[0];
5838 next = in->cur[1];
5839 cons = ctxt->nbChars;
5840 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5841 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5842 /*
5843 * Handle SCRIPT/STYLE separately
5844 */
5845 if (!terminate) {
5846 int idx;
5847 xmlChar val;
5848
5849 idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 0);
5850 if (idx < 0)
5851 goto done;
5852 val = in->cur[idx + 2];
5853 if (val == 0) /* bad cut of input */
5854 goto done;
5855 }
5856 htmlParseScript(ctxt);
5857 if ((cur == '<') && (next == '/')) {
5858 ctxt->instate = XML_PARSER_END_TAG;
5859 ctxt->checkIndex = 0;
5860 #ifdef DEBUG_PUSH
5861 xmlGenericError(xmlGenericErrorContext,
5862 "HPP: entering END_TAG\n");
5863 #endif
5864 break;
5865 }
5866 } else {
5867 /*
5868 * Sometimes DOCTYPE arrives in the middle of the document
5869 */
5870 if ((cur == '<') && (next == '!') &&
5871 (UPP(2) == 'D') && (UPP(3) == 'O') &&
5872 (UPP(4) == 'C') && (UPP(5) == 'T') &&
5873 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5874 (UPP(8) == 'E')) {
5875 if ((!terminate) &&
5876 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5877 goto done;
5878 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5879 "Misplaced DOCTYPE declaration\n",
5880 BAD_CAST "DOCTYPE" , NULL);
5881 htmlParseDocTypeDecl(ctxt);
5882 } else if ((cur == '<') && (next == '!') &&
5883 (in->cur[2] == '-') && (in->cur[3] == '-')) {
5884 if ((!terminate) &&
5885 (htmlParseLookupSequence(
5886 ctxt, '-', '-', '>', 1, 1) < 0))
5887 goto done;
5888 #ifdef DEBUG_PUSH
5889 xmlGenericError(xmlGenericErrorContext,
5890 "HPP: Parsing Comment\n");
5891 #endif
5892 htmlParseComment(ctxt);
5893 ctxt->instate = XML_PARSER_CONTENT;
5894 } else if ((cur == '<') && (next == '?')) {
5895 if ((!terminate) &&
5896 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5897 goto done;
5898 #ifdef DEBUG_PUSH
5899 xmlGenericError(xmlGenericErrorContext,
5900 "HPP: Parsing PI\n");
5901 #endif
5902 htmlParsePI(ctxt);
5903 ctxt->instate = XML_PARSER_CONTENT;
5904 } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5905 goto done;
5906 } else if ((cur == '<') && (next == '/')) {
5907 ctxt->instate = XML_PARSER_END_TAG;
5908 ctxt->checkIndex = 0;
5909 #ifdef DEBUG_PUSH
5910 xmlGenericError(xmlGenericErrorContext,
5911 "HPP: entering END_TAG\n");
5912 #endif
5913 break;
5914 } else if (cur == '<') {
5915 ctxt->instate = XML_PARSER_START_TAG;
5916 ctxt->checkIndex = 0;
5917 #ifdef DEBUG_PUSH
5918 xmlGenericError(xmlGenericErrorContext,
5919 "HPP: entering START_TAG\n");
5920 #endif
5921 break;
5922 } else if (cur == '&') {
5923 if ((!terminate) &&
5924 (htmlParseLookupChars(ctxt,
5925 BAD_CAST "; >/", 4) < 0))
5926 goto done;
5927 #ifdef DEBUG_PUSH
5928 xmlGenericError(xmlGenericErrorContext,
5929 "HPP: Parsing Reference\n");
5930 #endif
5931 /* TODO: check generation of subtrees if noent !!! */
5932 htmlParseReference(ctxt);
5933 } else {
5934 /*
5935 * check that the text sequence is complete
5936 * before handing out the data to the parser
5937 * to avoid problems with erroneous end of
5938 * data detection.
5939 */
5940 if ((!terminate) &&
5941 (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0))
5942 goto done;
5943 ctxt->checkIndex = 0;
5944 #ifdef DEBUG_PUSH
5945 xmlGenericError(xmlGenericErrorContext,
5946 "HPP: Parsing char data\n");
5947 #endif
5948 htmlParseCharData(ctxt);
5949 }
5950 }
5951 if (cons == ctxt->nbChars) {
5952 if (ctxt->node != NULL) {
5953 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5954 "detected an error in element content\n",
5955 NULL, NULL);
5956 }
5957 NEXT;
5958 break;
5959 }
5960
5961 break;
5962 }
5963 case XML_PARSER_END_TAG:
5964 if (avail < 2)
5965 goto done;
5966 if ((!terminate) &&
5967 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5968 goto done;
5969 htmlParseEndTag(ctxt);
5970 if (ctxt->nameNr == 0) {
5971 ctxt->instate = XML_PARSER_EPILOG;
5972 } else {
5973 ctxt->instate = XML_PARSER_CONTENT;
5974 }
5975 ctxt->checkIndex = 0;
5976 #ifdef DEBUG_PUSH
5977 xmlGenericError(xmlGenericErrorContext,
5978 "HPP: entering CONTENT\n");
5979 #endif
5980 break;
5981 case XML_PARSER_CDATA_SECTION:
5982 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5983 "HPP: internal error, state == CDATA\n",
5984 NULL, NULL);
5985 ctxt->instate = XML_PARSER_CONTENT;
5986 ctxt->checkIndex = 0;
5987 #ifdef DEBUG_PUSH
5988 xmlGenericError(xmlGenericErrorContext,
5989 "HPP: entering CONTENT\n");
5990 #endif
5991 break;
5992 case XML_PARSER_DTD:
5993 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5994 "HPP: internal error, state == DTD\n",
5995 NULL, NULL);
5996 ctxt->instate = XML_PARSER_CONTENT;
5997 ctxt->checkIndex = 0;
5998 #ifdef DEBUG_PUSH
5999 xmlGenericError(xmlGenericErrorContext,
6000 "HPP: entering CONTENT\n");
6001 #endif
6002 break;
6003 case XML_PARSER_COMMENT:
6004 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6005 "HPP: internal error, state == COMMENT\n",
6006 NULL, NULL);
6007 ctxt->instate = XML_PARSER_CONTENT;
6008 ctxt->checkIndex = 0;
6009 #ifdef DEBUG_PUSH
6010 xmlGenericError(xmlGenericErrorContext,
6011 "HPP: entering CONTENT\n");
6012 #endif
6013 break;
6014 case XML_PARSER_PI:
6015 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6016 "HPP: internal error, state == PI\n",
6017 NULL, NULL);
6018 ctxt->instate = XML_PARSER_CONTENT;
6019 ctxt->checkIndex = 0;
6020 #ifdef DEBUG_PUSH
6021 xmlGenericError(xmlGenericErrorContext,
6022 "HPP: entering CONTENT\n");
6023 #endif
6024 break;
6025 case XML_PARSER_ENTITY_DECL:
6026 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6027 "HPP: internal error, state == ENTITY_DECL\n",
6028 NULL, NULL);
6029 ctxt->instate = XML_PARSER_CONTENT;
6030 ctxt->checkIndex = 0;
6031 #ifdef DEBUG_PUSH
6032 xmlGenericError(xmlGenericErrorContext,
6033 "HPP: entering CONTENT\n");
6034 #endif
6035 break;
6036 case XML_PARSER_ENTITY_VALUE:
6037 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6038 "HPP: internal error, state == ENTITY_VALUE\n",
6039 NULL, NULL);
6040 ctxt->instate = XML_PARSER_CONTENT;
6041 ctxt->checkIndex = 0;
6042 #ifdef DEBUG_PUSH
6043 xmlGenericError(xmlGenericErrorContext,
6044 "HPP: entering DTD\n");
6045 #endif
6046 break;
6047 case XML_PARSER_ATTRIBUTE_VALUE:
6048 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6049 "HPP: internal error, state == ATTRIBUTE_VALUE\n",
6050 NULL, NULL);
6051 ctxt->instate = XML_PARSER_START_TAG;
6052 ctxt->checkIndex = 0;
6053 #ifdef DEBUG_PUSH
6054 xmlGenericError(xmlGenericErrorContext,
6055 "HPP: entering START_TAG\n");
6056 #endif
6057 break;
6058 case XML_PARSER_SYSTEM_LITERAL:
6059 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6060 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
6061 NULL, NULL);
6062 ctxt->instate = XML_PARSER_CONTENT;
6063 ctxt->checkIndex = 0;
6064 #ifdef DEBUG_PUSH
6065 xmlGenericError(xmlGenericErrorContext,
6066 "HPP: entering CONTENT\n");
6067 #endif
6068 break;
6069 case XML_PARSER_IGNORE:
6070 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6071 "HPP: internal error, state == XML_PARSER_IGNORE\n",
6072 NULL, NULL);
6073 ctxt->instate = XML_PARSER_CONTENT;
6074 ctxt->checkIndex = 0;
6075 #ifdef DEBUG_PUSH
6076 xmlGenericError(xmlGenericErrorContext,
6077 "HPP: entering CONTENT\n");
6078 #endif
6079 break;
6080 case XML_PARSER_PUBLIC_LITERAL:
6081 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6082 "HPP: internal error, state == XML_PARSER_LITERAL\n",
6083 NULL, NULL);
6084 ctxt->instate = XML_PARSER_CONTENT;
6085 ctxt->checkIndex = 0;
6086 #ifdef DEBUG_PUSH
6087 xmlGenericError(xmlGenericErrorContext,
6088 "HPP: entering CONTENT\n");
6089 #endif
6090 break;
6091
6092 }
6093 }
6094 done:
6095 if ((avail == 0) && (terminate)) {
6096 htmlAutoCloseOnEnd(ctxt);
6097 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
6098 /*
6099 * SAX: end of the document processing.
6100 */
6101 ctxt->instate = XML_PARSER_EOF;
6102 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6103 ctxt->sax->endDocument(ctxt->userData);
6104 }
6105 }
6106 if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
6107 ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
6108 (ctxt->instate == XML_PARSER_EPILOG))) {
6109 xmlDtdPtr dtd;
6110 dtd = xmlGetIntSubset(ctxt->myDoc);
6111 if (dtd == NULL)
6112 ctxt->myDoc->intSubset =
6113 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
6114 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
6115 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
6116 }
6117 #ifdef DEBUG_PUSH
6118 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
6119 #endif
6120 return(ret);
6121 }
6122
6123 /**
6124 * htmlParseChunk:
6125 * @ctxt: an HTML parser context
6126 * @chunk: an char array
6127 * @size: the size in byte of the chunk
6128 * @terminate: last chunk indicator
6129 *
6130 * Parse a Chunk of memory
6131 *
6132 * Returns zero if no error, the xmlParserErrors otherwise.
6133 */
6134 int
htmlParseChunk(htmlParserCtxtPtr ctxt,const char * chunk,int size,int terminate)6135 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
6136 int terminate) {
6137 if ((ctxt == NULL) || (ctxt->input == NULL)) {
6138 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6139 "htmlParseChunk: context error\n", NULL, NULL);
6140 return(XML_ERR_INTERNAL_ERROR);
6141 }
6142 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6143 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
6144 size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6145 size_t cur = ctxt->input->cur - ctxt->input->base;
6146 int res;
6147
6148 res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6149 if (res < 0) {
6150 ctxt->errNo = XML_PARSER_EOF;
6151 ctxt->disableSAX = 1;
6152 return (XML_PARSER_EOF);
6153 }
6154 xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6155 #ifdef DEBUG_PUSH
6156 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6157 #endif
6158
6159 #if 0
6160 if ((terminate) || (ctxt->input->buf->buffer->use > 80))
6161 htmlParseTryOrFinish(ctxt, terminate);
6162 #endif
6163 } else if (ctxt->instate != XML_PARSER_EOF) {
6164 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
6165 xmlParserInputBufferPtr in = ctxt->input->buf;
6166 if ((in->encoder != NULL) && (in->buffer != NULL) &&
6167 (in->raw != NULL)) {
6168 int nbchars;
6169 size_t base = xmlBufGetInputBase(in->buffer, ctxt->input);
6170 size_t current = ctxt->input->cur - ctxt->input->base;
6171
6172 nbchars = xmlCharEncInput(in, terminate);
6173 if (nbchars < 0) {
6174 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
6175 "encoder error\n", NULL, NULL);
6176 return(XML_ERR_INVALID_ENCODING);
6177 }
6178 xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
6179 }
6180 }
6181 }
6182 htmlParseTryOrFinish(ctxt, terminate);
6183 if (terminate) {
6184 if ((ctxt->instate != XML_PARSER_EOF) &&
6185 (ctxt->instate != XML_PARSER_EPILOG) &&
6186 (ctxt->instate != XML_PARSER_MISC)) {
6187 ctxt->errNo = XML_ERR_DOCUMENT_END;
6188 ctxt->wellFormed = 0;
6189 }
6190 if (ctxt->instate != XML_PARSER_EOF) {
6191 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6192 ctxt->sax->endDocument(ctxt->userData);
6193 }
6194 ctxt->instate = XML_PARSER_EOF;
6195 }
6196 return((xmlParserErrors) ctxt->errNo);
6197 }
6198
6199 /************************************************************************
6200 * *
6201 * User entry points *
6202 * *
6203 ************************************************************************/
6204
6205 /**
6206 * htmlCreatePushParserCtxt:
6207 * @sax: a SAX handler
6208 * @user_data: The user data returned on SAX callbacks
6209 * @chunk: a pointer to an array of chars
6210 * @size: number of chars in the array
6211 * @filename: an optional file name or URI
6212 * @enc: an optional encoding
6213 *
6214 * Create a parser context for using the HTML parser in push mode
6215 * The value of @filename is used for fetching external entities
6216 * and error/warning reports.
6217 *
6218 * Returns the new parser context or NULL
6219 */
6220 htmlParserCtxtPtr
htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax,void * user_data,const char * chunk,int size,const char * filename,xmlCharEncoding enc)6221 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
6222 const char *chunk, int size, const char *filename,
6223 xmlCharEncoding enc) {
6224 htmlParserCtxtPtr ctxt;
6225 htmlParserInputPtr inputStream;
6226 xmlParserInputBufferPtr buf;
6227
6228 xmlInitParser();
6229
6230 buf = xmlAllocParserInputBuffer(enc);
6231 if (buf == NULL) return(NULL);
6232
6233 ctxt = htmlNewParserCtxt();
6234 if (ctxt == NULL) {
6235 xmlFreeParserInputBuffer(buf);
6236 return(NULL);
6237 }
6238 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
6239 ctxt->charset=XML_CHAR_ENCODING_UTF8;
6240 if (sax != NULL) {
6241 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
6242 xmlFree(ctxt->sax);
6243 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
6244 if (ctxt->sax == NULL) {
6245 xmlFree(buf);
6246 xmlFree(ctxt);
6247 return(NULL);
6248 }
6249 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
6250 if (user_data != NULL)
6251 ctxt->userData = user_data;
6252 }
6253 if (filename == NULL) {
6254 ctxt->directory = NULL;
6255 } else {
6256 ctxt->directory = xmlParserGetDirectory(filename);
6257 }
6258
6259 inputStream = htmlNewInputStream(ctxt);
6260 if (inputStream == NULL) {
6261 xmlFreeParserCtxt(ctxt);
6262 xmlFree(buf);
6263 return(NULL);
6264 }
6265
6266 if (filename == NULL)
6267 inputStream->filename = NULL;
6268 else
6269 inputStream->filename = (char *)
6270 xmlCanonicPath((const xmlChar *) filename);
6271 inputStream->buf = buf;
6272 xmlBufResetInput(buf->buffer, inputStream);
6273
6274 inputPush(ctxt, inputStream);
6275
6276 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6277 (ctxt->input->buf != NULL)) {
6278 size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6279 size_t cur = ctxt->input->cur - ctxt->input->base;
6280
6281 xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6282
6283 xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6284 #ifdef DEBUG_PUSH
6285 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6286 #endif
6287 }
6288 ctxt->progressive = 1;
6289
6290 return(ctxt);
6291 }
6292 #endif /* LIBXML_PUSH_ENABLED */
6293
6294 /**
6295 * htmlSAXParseDoc:
6296 * @cur: a pointer to an array of xmlChar
6297 * @encoding: a free form C string describing the HTML document encoding, or NULL
6298 * @sax: the SAX handler block
6299 * @userData: if using SAX, this pointer will be provided on callbacks.
6300 *
6301 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6302 * to handle parse events. If sax is NULL, fallback to the default DOM
6303 * behavior and return a tree.
6304 *
6305 * Returns the resulting document tree unless SAX is NULL or the document is
6306 * not well formed.
6307 */
6308
6309 htmlDocPtr
htmlSAXParseDoc(const xmlChar * cur,const char * encoding,htmlSAXHandlerPtr sax,void * userData)6310 htmlSAXParseDoc(const xmlChar *cur, const char *encoding,
6311 htmlSAXHandlerPtr sax, void *userData) {
6312 htmlDocPtr ret;
6313 htmlParserCtxtPtr ctxt;
6314
6315 xmlInitParser();
6316
6317 if (cur == NULL) return(NULL);
6318
6319
6320 ctxt = htmlCreateDocParserCtxt(cur, encoding);
6321 if (ctxt == NULL) return(NULL);
6322 if (sax != NULL) {
6323 if (ctxt->sax != NULL) xmlFree (ctxt->sax);
6324 ctxt->sax = sax;
6325 ctxt->userData = userData;
6326 }
6327
6328 htmlParseDocument(ctxt);
6329 ret = ctxt->myDoc;
6330 if (sax != NULL) {
6331 ctxt->sax = NULL;
6332 ctxt->userData = NULL;
6333 }
6334 htmlFreeParserCtxt(ctxt);
6335
6336 return(ret);
6337 }
6338
6339 /**
6340 * htmlParseDoc:
6341 * @cur: a pointer to an array of xmlChar
6342 * @encoding: a free form C string describing the HTML document encoding, or NULL
6343 *
6344 * parse an HTML in-memory document and build a tree.
6345 *
6346 * Returns the resulting document tree
6347 */
6348
6349 htmlDocPtr
htmlParseDoc(const xmlChar * cur,const char * encoding)6350 htmlParseDoc(const xmlChar *cur, const char *encoding) {
6351 return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6352 }
6353
6354
6355 /**
6356 * htmlCreateFileParserCtxt:
6357 * @filename: the filename
6358 * @encoding: a free form C string describing the HTML document encoding, or NULL
6359 *
6360 * Create a parser context for a file content.
6361 * Automatic support for ZLIB/Compress compressed document is provided
6362 * by default if found at compile-time.
6363 *
6364 * Returns the new parser context or NULL
6365 */
6366 htmlParserCtxtPtr
htmlCreateFileParserCtxt(const char * filename,const char * encoding)6367 htmlCreateFileParserCtxt(const char *filename, const char *encoding)
6368 {
6369 htmlParserCtxtPtr ctxt;
6370 htmlParserInputPtr inputStream;
6371 char *canonicFilename;
6372 /* htmlCharEncoding enc; */
6373 xmlChar *content, *content_line = (xmlChar *) "charset=";
6374
6375 if (filename == NULL)
6376 return(NULL);
6377
6378 ctxt = htmlNewParserCtxt();
6379 if (ctxt == NULL) {
6380 return(NULL);
6381 }
6382 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
6383 if (canonicFilename == NULL) {
6384 #ifdef LIBXML_SAX1_ENABLED
6385 if (xmlDefaultSAXHandler.error != NULL) {
6386 xmlDefaultSAXHandler.error(NULL, "out of memory\n");
6387 }
6388 #endif
6389 xmlFreeParserCtxt(ctxt);
6390 return(NULL);
6391 }
6392
6393 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6394 xmlFree(canonicFilename);
6395 if (inputStream == NULL) {
6396 xmlFreeParserCtxt(ctxt);
6397 return(NULL);
6398 }
6399
6400 inputPush(ctxt, inputStream);
6401
6402 /* set encoding */
6403 if (encoding) {
6404 size_t l = strlen(encoding);
6405
6406 if (l < 1000) {
6407 content = xmlMallocAtomic (xmlStrlen(content_line) + l + 1);
6408 if (content) {
6409 strcpy ((char *)content, (char *)content_line);
6410 strcat ((char *)content, (char *)encoding);
6411 htmlCheckEncoding (ctxt, content);
6412 xmlFree (content);
6413 }
6414 }
6415 }
6416
6417 return(ctxt);
6418 }
6419
6420 /**
6421 * htmlSAXParseFile:
6422 * @filename: the filename
6423 * @encoding: a free form C string describing the HTML document encoding, or NULL
6424 * @sax: the SAX handler block
6425 * @userData: if using SAX, this pointer will be provided on callbacks.
6426 *
6427 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6428 * compressed document is provided by default if found at compile-time.
6429 * It use the given SAX function block to handle the parsing callback.
6430 * If sax is NULL, fallback to the default DOM tree building routines.
6431 *
6432 * Returns the resulting document tree unless SAX is NULL or the document is
6433 * not well formed.
6434 */
6435
6436 htmlDocPtr
htmlSAXParseFile(const char * filename,const char * encoding,htmlSAXHandlerPtr sax,void * userData)6437 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
6438 void *userData) {
6439 htmlDocPtr ret;
6440 htmlParserCtxtPtr ctxt;
6441 htmlSAXHandlerPtr oldsax = NULL;
6442
6443 xmlInitParser();
6444
6445 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6446 if (ctxt == NULL) return(NULL);
6447 if (sax != NULL) {
6448 oldsax = ctxt->sax;
6449 ctxt->sax = sax;
6450 ctxt->userData = userData;
6451 }
6452
6453 htmlParseDocument(ctxt);
6454
6455 ret = ctxt->myDoc;
6456 if (sax != NULL) {
6457 ctxt->sax = oldsax;
6458 ctxt->userData = NULL;
6459 }
6460 htmlFreeParserCtxt(ctxt);
6461
6462 return(ret);
6463 }
6464
6465 /**
6466 * htmlParseFile:
6467 * @filename: the filename
6468 * @encoding: a free form C string describing the HTML document encoding, or NULL
6469 *
6470 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6471 * compressed document is provided by default if found at compile-time.
6472 *
6473 * Returns the resulting document tree
6474 */
6475
6476 htmlDocPtr
htmlParseFile(const char * filename,const char * encoding)6477 htmlParseFile(const char *filename, const char *encoding) {
6478 return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6479 }
6480
6481 /**
6482 * htmlHandleOmittedElem:
6483 * @val: int 0 or 1
6484 *
6485 * Set and return the previous value for handling HTML omitted tags.
6486 *
6487 * Returns the last value for 0 for no handling, 1 for auto insertion.
6488 */
6489
6490 int
htmlHandleOmittedElem(int val)6491 htmlHandleOmittedElem(int val) {
6492 int old = htmlOmittedDefaultValue;
6493
6494 htmlOmittedDefaultValue = val;
6495 return(old);
6496 }
6497
6498 /**
6499 * htmlElementAllowedHere:
6500 * @parent: HTML parent element
6501 * @elt: HTML element
6502 *
6503 * Checks whether an HTML element may be a direct child of a parent element.
6504 * Note - doesn't check for deprecated elements
6505 *
6506 * Returns 1 if allowed; 0 otherwise.
6507 */
6508 int
htmlElementAllowedHere(const htmlElemDesc * parent,const xmlChar * elt)6509 htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6510 const char** p ;
6511
6512 if ( ! elt || ! parent || ! parent->subelts )
6513 return 0 ;
6514
6515 for ( p = parent->subelts; *p; ++p )
6516 if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6517 return 1 ;
6518
6519 return 0 ;
6520 }
6521 /**
6522 * htmlElementStatusHere:
6523 * @parent: HTML parent element
6524 * @elt: HTML element
6525 *
6526 * Checks whether an HTML element may be a direct child of a parent element.
6527 * and if so whether it is valid or deprecated.
6528 *
6529 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6530 */
6531 htmlStatus
htmlElementStatusHere(const htmlElemDesc * parent,const htmlElemDesc * elt)6532 htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6533 if ( ! parent || ! elt )
6534 return HTML_INVALID ;
6535 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6536 return HTML_INVALID ;
6537
6538 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6539 }
6540 /**
6541 * htmlAttrAllowed:
6542 * @elt: HTML element
6543 * @attr: HTML attribute
6544 * @legacy: whether to allow deprecated attributes
6545 *
6546 * Checks whether an attribute is valid for an element
6547 * Has full knowledge of Required and Deprecated attributes
6548 *
6549 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6550 */
6551 htmlStatus
htmlAttrAllowed(const htmlElemDesc * elt,const xmlChar * attr,int legacy)6552 htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6553 const char** p ;
6554
6555 if ( !elt || ! attr )
6556 return HTML_INVALID ;
6557
6558 if ( elt->attrs_req )
6559 for ( p = elt->attrs_req; *p; ++p)
6560 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6561 return HTML_REQUIRED ;
6562
6563 if ( elt->attrs_opt )
6564 for ( p = elt->attrs_opt; *p; ++p)
6565 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6566 return HTML_VALID ;
6567
6568 if ( legacy && elt->attrs_depr )
6569 for ( p = elt->attrs_depr; *p; ++p)
6570 if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6571 return HTML_DEPRECATED ;
6572
6573 return HTML_INVALID ;
6574 }
6575 /**
6576 * htmlNodeStatus:
6577 * @node: an htmlNodePtr in a tree
6578 * @legacy: whether to allow deprecated elements (YES is faster here
6579 * for Element nodes)
6580 *
6581 * Checks whether the tree node is valid. Experimental (the author
6582 * only uses the HTML enhancements in a SAX parser)
6583 *
6584 * Return: for Element nodes, a return from htmlElementAllowedHere (if
6585 * legacy allowed) or htmlElementStatusHere (otherwise).
6586 * for Attribute nodes, a return from htmlAttrAllowed
6587 * for other nodes, HTML_NA (no checks performed)
6588 */
6589 htmlStatus
htmlNodeStatus(const htmlNodePtr node,int legacy)6590 htmlNodeStatus(const htmlNodePtr node, int legacy) {
6591 if ( ! node )
6592 return HTML_INVALID ;
6593
6594 switch ( node->type ) {
6595 case XML_ELEMENT_NODE:
6596 return legacy
6597 ? ( htmlElementAllowedHere (
6598 htmlTagLookup(node->parent->name) , node->name
6599 ) ? HTML_VALID : HTML_INVALID )
6600 : htmlElementStatusHere(
6601 htmlTagLookup(node->parent->name) ,
6602 htmlTagLookup(node->name) )
6603 ;
6604 case XML_ATTRIBUTE_NODE:
6605 return htmlAttrAllowed(
6606 htmlTagLookup(node->parent->name) , node->name, legacy) ;
6607 default: return HTML_NA ;
6608 }
6609 }
6610 /************************************************************************
6611 * *
6612 * New set (2.6.0) of simpler and more flexible APIs *
6613 * *
6614 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope
 *
 * The expansion is a single statement (do/while(0)) so the macro can be
 * used safely inside unbraced if/else bodies; invoke it with a trailing
 * semicolon. A variable named "dict" must be visible at the call site.
 */
#define DICT_FREE(str)                                          \
    do {                                                        \
        if ((str) && ((!dict) ||                                \
            (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))  \
            xmlFree((char *)(str));                             \
    } while (0)
6626
6627 /**
6628 * htmlCtxtReset:
6629 * @ctxt: an HTML parser context
6630 *
6631 * Reset a parser context
6632 */
6633 void
htmlCtxtReset(htmlParserCtxtPtr ctxt)6634 htmlCtxtReset(htmlParserCtxtPtr ctxt)
6635 {
6636 xmlParserInputPtr input;
6637 xmlDictPtr dict;
6638
6639 if (ctxt == NULL)
6640 return;
6641
6642 xmlInitParser();
6643 dict = ctxt->dict;
6644
6645 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6646 xmlFreeInputStream(input);
6647 }
6648 ctxt->inputNr = 0;
6649 ctxt->input = NULL;
6650
6651 ctxt->spaceNr = 0;
6652 if (ctxt->spaceTab != NULL) {
6653 ctxt->spaceTab[0] = -1;
6654 ctxt->space = &ctxt->spaceTab[0];
6655 } else {
6656 ctxt->space = NULL;
6657 }
6658
6659
6660 ctxt->nodeNr = 0;
6661 ctxt->node = NULL;
6662
6663 ctxt->nameNr = 0;
6664 ctxt->name = NULL;
6665
6666 DICT_FREE(ctxt->version);
6667 ctxt->version = NULL;
6668 DICT_FREE(ctxt->encoding);
6669 ctxt->encoding = NULL;
6670 DICT_FREE(ctxt->directory);
6671 ctxt->directory = NULL;
6672 DICT_FREE(ctxt->extSubURI);
6673 ctxt->extSubURI = NULL;
6674 DICT_FREE(ctxt->extSubSystem);
6675 ctxt->extSubSystem = NULL;
6676 if (ctxt->myDoc != NULL)
6677 xmlFreeDoc(ctxt->myDoc);
6678 ctxt->myDoc = NULL;
6679
6680 ctxt->standalone = -1;
6681 ctxt->hasExternalSubset = 0;
6682 ctxt->hasPErefs = 0;
6683 ctxt->html = 1;
6684 ctxt->external = 0;
6685 ctxt->instate = XML_PARSER_START;
6686 ctxt->token = 0;
6687
6688 ctxt->wellFormed = 1;
6689 ctxt->nsWellFormed = 1;
6690 ctxt->disableSAX = 0;
6691 ctxt->valid = 1;
6692 ctxt->vctxt.userData = ctxt;
6693 ctxt->vctxt.error = xmlParserValidityError;
6694 ctxt->vctxt.warning = xmlParserValidityWarning;
6695 ctxt->record_info = 0;
6696 ctxt->nbChars = 0;
6697 ctxt->checkIndex = 0;
6698 ctxt->inSubset = 0;
6699 ctxt->errNo = XML_ERR_OK;
6700 ctxt->depth = 0;
6701 ctxt->charset = XML_CHAR_ENCODING_NONE;
6702 ctxt->catalogs = NULL;
6703 xmlInitNodeInfoSeq(&ctxt->node_seq);
6704
6705 if (ctxt->attsDefault != NULL) {
6706 xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
6707 ctxt->attsDefault = NULL;
6708 }
6709 if (ctxt->attsSpecial != NULL) {
6710 xmlHashFree(ctxt->attsSpecial, NULL);
6711 ctxt->attsSpecial = NULL;
6712 }
6713 }
6714
6715 /**
6716 * htmlCtxtUseOptions:
6717 * @ctxt: an HTML parser context
6718 * @options: a combination of htmlParserOption(s)
6719 *
6720 * Applies the options to the parser context
6721 *
6722 * Returns 0 in case of success, the set of unknown or unimplemented options
6723 * in case of error.
6724 */
6725 int
htmlCtxtUseOptions(htmlParserCtxtPtr ctxt,int options)6726 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6727 {
6728 if (ctxt == NULL)
6729 return(-1);
6730
6731 if (options & HTML_PARSE_NOWARNING) {
6732 ctxt->sax->warning = NULL;
6733 ctxt->vctxt.warning = NULL;
6734 options -= XML_PARSE_NOWARNING;
6735 ctxt->options |= XML_PARSE_NOWARNING;
6736 }
6737 if (options & HTML_PARSE_NOERROR) {
6738 ctxt->sax->error = NULL;
6739 ctxt->vctxt.error = NULL;
6740 ctxt->sax->fatalError = NULL;
6741 options -= XML_PARSE_NOERROR;
6742 ctxt->options |= XML_PARSE_NOERROR;
6743 }
6744 if (options & HTML_PARSE_PEDANTIC) {
6745 ctxt->pedantic = 1;
6746 options -= XML_PARSE_PEDANTIC;
6747 ctxt->options |= XML_PARSE_PEDANTIC;
6748 } else
6749 ctxt->pedantic = 0;
6750 if (options & XML_PARSE_NOBLANKS) {
6751 ctxt->keepBlanks = 0;
6752 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6753 options -= XML_PARSE_NOBLANKS;
6754 ctxt->options |= XML_PARSE_NOBLANKS;
6755 } else
6756 ctxt->keepBlanks = 1;
6757 if (options & HTML_PARSE_RECOVER) {
6758 ctxt->recovery = 1;
6759 options -= HTML_PARSE_RECOVER;
6760 } else
6761 ctxt->recovery = 0;
6762 if (options & HTML_PARSE_COMPACT) {
6763 ctxt->options |= HTML_PARSE_COMPACT;
6764 options -= HTML_PARSE_COMPACT;
6765 }
6766 if (options & XML_PARSE_HUGE) {
6767 ctxt->options |= XML_PARSE_HUGE;
6768 options -= XML_PARSE_HUGE;
6769 }
6770 if (options & HTML_PARSE_NODEFDTD) {
6771 ctxt->options |= HTML_PARSE_NODEFDTD;
6772 options -= HTML_PARSE_NODEFDTD;
6773 }
6774 if (options & HTML_PARSE_IGNORE_ENC) {
6775 ctxt->options |= HTML_PARSE_IGNORE_ENC;
6776 options -= HTML_PARSE_IGNORE_ENC;
6777 }
6778 if (options & HTML_PARSE_NOIMPLIED) {
6779 ctxt->options |= HTML_PARSE_NOIMPLIED;
6780 options -= HTML_PARSE_NOIMPLIED;
6781 }
6782 ctxt->dictNames = 0;
6783 return (options);
6784 }
6785
6786 /**
6787 * htmlDoRead:
6788 * @ctxt: an HTML parser context
6789 * @URL: the base URL to use for the document
6790 * @encoding: the document encoding, or NULL
6791 * @options: a combination of htmlParserOption(s)
6792 * @reuse: keep the context for reuse
6793 *
6794 * Common front-end for the htmlRead functions
6795 *
6796 * Returns the resulting document tree or NULL
6797 */
6798 static htmlDocPtr
htmlDoRead(htmlParserCtxtPtr ctxt,const char * URL,const char * encoding,int options,int reuse)6799 htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6800 int options, int reuse)
6801 {
6802 htmlDocPtr ret;
6803
6804 htmlCtxtUseOptions(ctxt, options);
6805 ctxt->html = 1;
6806 if (encoding != NULL) {
6807 xmlCharEncodingHandlerPtr hdlr;
6808
6809 hdlr = xmlFindCharEncodingHandler(encoding);
6810 if (hdlr != NULL) {
6811 xmlSwitchToEncoding(ctxt, hdlr);
6812 if (ctxt->input->encoding != NULL)
6813 xmlFree((xmlChar *) ctxt->input->encoding);
6814 ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6815 }
6816 }
6817 if ((URL != NULL) && (ctxt->input != NULL) &&
6818 (ctxt->input->filename == NULL))
6819 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6820 htmlParseDocument(ctxt);
6821 ret = ctxt->myDoc;
6822 ctxt->myDoc = NULL;
6823 if (!reuse) {
6824 if ((ctxt->dictNames) &&
6825 (ret != NULL) &&
6826 (ret->dict == ctxt->dict))
6827 ctxt->dict = NULL;
6828 xmlFreeParserCtxt(ctxt);
6829 }
6830 return (ret);
6831 }
6832
6833 /**
6834 * htmlReadDoc:
6835 * @cur: a pointer to a zero terminated string
6836 * @URL: the base URL to use for the document
6837 * @encoding: the document encoding, or NULL
6838 * @options: a combination of htmlParserOption(s)
6839 *
6840 * parse an XML in-memory document and build a tree.
6841 *
6842 * Returns the resulting document tree
6843 */
6844 htmlDocPtr
htmlReadDoc(const xmlChar * cur,const char * URL,const char * encoding,int options)6845 htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6846 {
6847 htmlParserCtxtPtr ctxt;
6848
6849 if (cur == NULL)
6850 return (NULL);
6851
6852 xmlInitParser();
6853 ctxt = htmlCreateDocParserCtxt(cur, NULL);
6854 if (ctxt == NULL)
6855 return (NULL);
6856 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6857 }
6858
6859 /**
6860 * htmlReadFile:
6861 * @filename: a file or URL
6862 * @encoding: the document encoding, or NULL
6863 * @options: a combination of htmlParserOption(s)
6864 *
6865 * parse an XML file from the filesystem or the network.
6866 *
6867 * Returns the resulting document tree
6868 */
6869 htmlDocPtr
htmlReadFile(const char * filename,const char * encoding,int options)6870 htmlReadFile(const char *filename, const char *encoding, int options)
6871 {
6872 htmlParserCtxtPtr ctxt;
6873
6874 xmlInitParser();
6875 ctxt = htmlCreateFileParserCtxt(filename, encoding);
6876 if (ctxt == NULL)
6877 return (NULL);
6878 return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6879 }
6880
6881 /**
6882 * htmlReadMemory:
6883 * @buffer: a pointer to a char array
6884 * @size: the size of the array
6885 * @URL: the base URL to use for the document
6886 * @encoding: the document encoding, or NULL
6887 * @options: a combination of htmlParserOption(s)
6888 *
6889 * parse an XML in-memory document and build a tree.
6890 *
6891 * Returns the resulting document tree
6892 */
6893 htmlDocPtr
htmlReadMemory(const char * buffer,int size,const char * URL,const char * encoding,int options)6894 htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6895 {
6896 htmlParserCtxtPtr ctxt;
6897
6898 xmlInitParser();
6899 ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6900 if (ctxt == NULL)
6901 return (NULL);
6902 htmlDefaultSAXHandlerInit();
6903 if (ctxt->sax != NULL)
6904 memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
6905 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6906 }
6907
6908 /**
6909 * htmlReadFd:
6910 * @fd: an open file descriptor
6911 * @URL: the base URL to use for the document
6912 * @encoding: the document encoding, or NULL
6913 * @options: a combination of htmlParserOption(s)
6914 *
6915 * parse an XML from a file descriptor and build a tree.
6916 *
6917 * Returns the resulting document tree
6918 */
6919 htmlDocPtr
htmlReadFd(int fd,const char * URL,const char * encoding,int options)6920 htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6921 {
6922 htmlParserCtxtPtr ctxt;
6923 xmlParserInputBufferPtr input;
6924 xmlParserInputPtr stream;
6925
6926 if (fd < 0)
6927 return (NULL);
6928 xmlInitParser();
6929
6930 xmlInitParser();
6931 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6932 if (input == NULL)
6933 return (NULL);
6934 ctxt = xmlNewParserCtxt();
6935 if (ctxt == NULL) {
6936 xmlFreeParserInputBuffer(input);
6937 return (NULL);
6938 }
6939 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6940 if (stream == NULL) {
6941 xmlFreeParserInputBuffer(input);
6942 xmlFreeParserCtxt(ctxt);
6943 return (NULL);
6944 }
6945 inputPush(ctxt, stream);
6946 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6947 }
6948
6949 /**
6950 * htmlReadIO:
6951 * @ioread: an I/O read function
6952 * @ioclose: an I/O close function
6953 * @ioctx: an I/O handler
6954 * @URL: the base URL to use for the document
6955 * @encoding: the document encoding, or NULL
6956 * @options: a combination of htmlParserOption(s)
6957 *
6958 * parse an HTML document from I/O functions and source and build a tree.
6959 *
6960 * Returns the resulting document tree
6961 */
6962 htmlDocPtr
htmlReadIO(xmlInputReadCallback ioread,xmlInputCloseCallback ioclose,void * ioctx,const char * URL,const char * encoding,int options)6963 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6964 void *ioctx, const char *URL, const char *encoding, int options)
6965 {
6966 htmlParserCtxtPtr ctxt;
6967 xmlParserInputBufferPtr input;
6968 xmlParserInputPtr stream;
6969
6970 if (ioread == NULL)
6971 return (NULL);
6972 xmlInitParser();
6973
6974 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6975 XML_CHAR_ENCODING_NONE);
6976 if (input == NULL) {
6977 if (ioclose != NULL)
6978 ioclose(ioctx);
6979 return (NULL);
6980 }
6981 ctxt = htmlNewParserCtxt();
6982 if (ctxt == NULL) {
6983 xmlFreeParserInputBuffer(input);
6984 return (NULL);
6985 }
6986 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6987 if (stream == NULL) {
6988 xmlFreeParserInputBuffer(input);
6989 xmlFreeParserCtxt(ctxt);
6990 return (NULL);
6991 }
6992 inputPush(ctxt, stream);
6993 return (htmlDoRead(ctxt, URL, encoding, options, 0));
6994 }
6995
6996 /**
6997 * htmlCtxtReadDoc:
6998 * @ctxt: an HTML parser context
6999 * @cur: a pointer to a zero terminated string
7000 * @URL: the base URL to use for the document
7001 * @encoding: the document encoding, or NULL
7002 * @options: a combination of htmlParserOption(s)
7003 *
7004 * parse an XML in-memory document and build a tree.
7005 * This reuses the existing @ctxt parser context
7006 *
7007 * Returns the resulting document tree
7008 */
7009 htmlDocPtr
htmlCtxtReadDoc(htmlParserCtxtPtr ctxt,const xmlChar * cur,const char * URL,const char * encoding,int options)7010 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
7011 const char *URL, const char *encoding, int options)
7012 {
7013 xmlParserInputPtr stream;
7014
7015 if (cur == NULL)
7016 return (NULL);
7017 if (ctxt == NULL)
7018 return (NULL);
7019 xmlInitParser();
7020
7021 htmlCtxtReset(ctxt);
7022
7023 stream = xmlNewStringInputStream(ctxt, cur);
7024 if (stream == NULL) {
7025 return (NULL);
7026 }
7027 inputPush(ctxt, stream);
7028 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7029 }
7030
7031 /**
7032 * htmlCtxtReadFile:
7033 * @ctxt: an HTML parser context
7034 * @filename: a file or URL
7035 * @encoding: the document encoding, or NULL
7036 * @options: a combination of htmlParserOption(s)
7037 *
7038 * parse an XML file from the filesystem or the network.
7039 * This reuses the existing @ctxt parser context
7040 *
7041 * Returns the resulting document tree
7042 */
7043 htmlDocPtr
htmlCtxtReadFile(htmlParserCtxtPtr ctxt,const char * filename,const char * encoding,int options)7044 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
7045 const char *encoding, int options)
7046 {
7047 xmlParserInputPtr stream;
7048
7049 if (filename == NULL)
7050 return (NULL);
7051 if (ctxt == NULL)
7052 return (NULL);
7053 xmlInitParser();
7054
7055 htmlCtxtReset(ctxt);
7056
7057 stream = xmlLoadExternalEntity(filename, NULL, ctxt);
7058 if (stream == NULL) {
7059 return (NULL);
7060 }
7061 inputPush(ctxt, stream);
7062 return (htmlDoRead(ctxt, NULL, encoding, options, 1));
7063 }
7064
7065 /**
7066 * htmlCtxtReadMemory:
7067 * @ctxt: an HTML parser context
7068 * @buffer: a pointer to a char array
7069 * @size: the size of the array
7070 * @URL: the base URL to use for the document
7071 * @encoding: the document encoding, or NULL
7072 * @options: a combination of htmlParserOption(s)
7073 *
7074 * parse an XML in-memory document and build a tree.
7075 * This reuses the existing @ctxt parser context
7076 *
7077 * Returns the resulting document tree
7078 */
7079 htmlDocPtr
htmlCtxtReadMemory(htmlParserCtxtPtr ctxt,const char * buffer,int size,const char * URL,const char * encoding,int options)7080 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
7081 const char *URL, const char *encoding, int options)
7082 {
7083 xmlParserInputBufferPtr input;
7084 xmlParserInputPtr stream;
7085
7086 if (ctxt == NULL)
7087 return (NULL);
7088 if (buffer == NULL)
7089 return (NULL);
7090 xmlInitParser();
7091
7092 htmlCtxtReset(ctxt);
7093
7094 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
7095 if (input == NULL) {
7096 return(NULL);
7097 }
7098
7099 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7100 if (stream == NULL) {
7101 xmlFreeParserInputBuffer(input);
7102 return(NULL);
7103 }
7104
7105 inputPush(ctxt, stream);
7106 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7107 }
7108
7109 /**
7110 * htmlCtxtReadFd:
7111 * @ctxt: an HTML parser context
7112 * @fd: an open file descriptor
7113 * @URL: the base URL to use for the document
7114 * @encoding: the document encoding, or NULL
7115 * @options: a combination of htmlParserOption(s)
7116 *
7117 * parse an XML from a file descriptor and build a tree.
7118 * This reuses the existing @ctxt parser context
7119 *
7120 * Returns the resulting document tree
7121 */
7122 htmlDocPtr
htmlCtxtReadFd(htmlParserCtxtPtr ctxt,int fd,const char * URL,const char * encoding,int options)7123 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
7124 const char *URL, const char *encoding, int options)
7125 {
7126 xmlParserInputBufferPtr input;
7127 xmlParserInputPtr stream;
7128
7129 if (fd < 0)
7130 return (NULL);
7131 if (ctxt == NULL)
7132 return (NULL);
7133 xmlInitParser();
7134
7135 htmlCtxtReset(ctxt);
7136
7137
7138 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7139 if (input == NULL)
7140 return (NULL);
7141 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7142 if (stream == NULL) {
7143 xmlFreeParserInputBuffer(input);
7144 return (NULL);
7145 }
7146 inputPush(ctxt, stream);
7147 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7148 }
7149
7150 /**
7151 * htmlCtxtReadIO:
7152 * @ctxt: an HTML parser context
7153 * @ioread: an I/O read function
7154 * @ioclose: an I/O close function
7155 * @ioctx: an I/O handler
7156 * @URL: the base URL to use for the document
7157 * @encoding: the document encoding, or NULL
7158 * @options: a combination of htmlParserOption(s)
7159 *
7160 * parse an HTML document from I/O functions and source and build a tree.
7161 * This reuses the existing @ctxt parser context
7162 *
7163 * Returns the resulting document tree
7164 */
7165 htmlDocPtr
htmlCtxtReadIO(htmlParserCtxtPtr ctxt,xmlInputReadCallback ioread,xmlInputCloseCallback ioclose,void * ioctx,const char * URL,const char * encoding,int options)7166 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
7167 xmlInputCloseCallback ioclose, void *ioctx,
7168 const char *URL,
7169 const char *encoding, int options)
7170 {
7171 xmlParserInputBufferPtr input;
7172 xmlParserInputPtr stream;
7173
7174 if (ioread == NULL)
7175 return (NULL);
7176 if (ctxt == NULL)
7177 return (NULL);
7178 xmlInitParser();
7179
7180 htmlCtxtReset(ctxt);
7181
7182 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7183 XML_CHAR_ENCODING_NONE);
7184 if (input == NULL) {
7185 if (ioclose != NULL)
7186 ioclose(ioctx);
7187 return (NULL);
7188 }
7189 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7190 if (stream == NULL) {
7191 xmlFreeParserInputBuffer(input);
7192 return (NULL);
7193 }
7194 inputPush(ctxt, stream);
7195 return (htmlDoRead(ctxt, URL, encoding, options, 1));
7196 }
7197
7198 #define bottom_HTMLparser
7199 #include "elfgcchack.h"
7200 #endif /* LIBXML_HTML_ENABLED */
7201