1 /* markup.c -- simple XML-like parser
2 Copyright (C) 2015, 2018 Free Software Foundation, Inc.
3
4 This file is not part of the GNU gettext program, but is used with
5 GNU gettext.
6
7 This is a stripped down version of GLib's gmarkup.c. The original
8 copyright notice is as follows:
9 */
10
11 /* gmarkup.c - Simple XML-like parser
12 *
13 * Copyright 2000, 2003 Red Hat, Inc.
14 * Copyright 2007, 2008 Ryan Lortie <desrt@desrt.ca>
15 *
16 * GLib is free software; you can redistribute it and/or modify it
17 * under the terms of the GNU General Public License as
18 * published by the Free Software Foundation; either version 3 of the
19 * License, or (at your option) any later version.
20 *
21 * GLib is distributed in the hope that it will be useful,
22 * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
24 * General Public License for more details.
25 *
26 * You should have received a copy of the GNU General Public
27 * License along with GLib; see the file COPYING.LIB. If not,
28 * see <https://www.gnu.org/licenses/>.
29 */
30
31 #include "config.h"
32
33 #include <assert.h>
34 #include <stdarg.h>
35 #include <string.h>
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <errno.h>
39
40 /* Specification */
41 #include "markup.h"
42
43 #include "c-ctype.h"
44 #include "gettext.h"
45 #include "gl_linked_list.h"
46 #include "gl_xlist.h"
47 #include "unictype.h"
48 #include "unistr.h"
49 #include "xalloc.h"
50 #include "xvasprintf.h"
51
52 #define _(s) gettext(s)
53
54 /**
55 * The "markup" parser is intended to parse a simple markup format
56 * that's a subset of XML. This is a small, efficient, easy-to-use
57 * parser. It should not be used if you expect to interoperate with
58 * other applications generating full-scale XML. However, it's very
59 * useful for application data files, config files, etc. where you
60 * know your application will be the only one writing the file.
61 * Full-scale XML parsers should be able to parse the subset used by
62 * markup, so you can easily migrate to full-scale XML at a later
63 * time if the need arises.
64 *
65 * The parser is not guaranteed to signal an error on all invalid XML;
66 * the parser may accept documents that an XML parser would not.
67 * However, XML documents which are not well-formed (which is a weaker
68 * condition than being valid. See the XML specification
69 * <https://www.w3.org/TR/REC-xml/> for definitions of these terms.)
70 * are not considered valid GMarkup documents.
71 *
72 * Simplifications to XML include:
73 *
74 * - Only UTF-8 encoding is allowed
75 *
76 * - No user-defined entities
77 *
78 * - Processing instructions, comments and the doctype declaration
79 * are "passed through" but are not interpreted in any way
80 *
81 * - No DTD or validation
82 *
83 * The markup format does support:
84 *
85 * - Elements
86 *
87 * - Attributes
88 *
 *  - 5 standard entities: &amp; &lt; &gt; &quot; &apos;
90 *
91 * - Character references
92 *
93 * - Sections marked as CDATA
94 */
95
/* States of the parser's state machine.  Each state names the position of
   the read cursor relative to the markup construct currently being
   scanned.  */
typedef enum
{
  STATE_START,                  /* outside any element; whitespace is
                                   skipped and a '<' is expected */
  STATE_AFTER_OPEN_ANGLE,       /* just consumed a '<' */
  STATE_AFTER_CLOSE_ANGLE,      /* just consumed a '>' */
  STATE_AFTER_ELISION_SLASH, /* the slash that obviates need for end element */
  STATE_INSIDE_OPEN_TAG_NAME,
  STATE_INSIDE_ATTRIBUTE_NAME,
  STATE_AFTER_ATTRIBUTE_NAME,
  STATE_BETWEEN_ATTRIBUTES,
  STATE_AFTER_ATTRIBUTE_EQUALS_SIGN,
  STATE_INSIDE_ATTRIBUTE_VALUE_SQ,      /* value delimited by '\'' */
  STATE_INSIDE_ATTRIBUTE_VALUE_DQ,      /* value delimited by '"' */
  STATE_INSIDE_TEXT,
  STATE_AFTER_CLOSE_TAG_SLASH,
  STATE_INSIDE_CLOSE_TAG_NAME,
  STATE_AFTER_CLOSE_TAG_NAME,
  STATE_INSIDE_PASSTHROUGH,     /* entered after "<?" or "<!" */
  STATE_ERROR                   /* set by emit_error; no further input may
                                   be parsed in this state */
} markup_parse_state_ty;
116
/* One saved level of the subparser stack.  markup_parse_context_push
   records the context's current element/parser/user_data here before
   installing a subparser; pop_subparser_stack restores them.  */
typedef struct
{
  const char *prev_element;             /* saved context->subparser_element */
  const markup_parser_ty *prev_parser;  /* saved context->parser */
  void *prev_user_data;                 /* saved context->user_data */
} markup_recursion_tracker_ty;
123
/* A growable, NUL-terminated byte string.  A freshly created string is
   all-zero: BUFFER stays NULL until the first append.  */
typedef struct
{
  char *buffer;         /* heap storage, or NULL if nothing was appended */
  size_t bufmax;        /* number of bytes allocated for BUFFER */
  size_t buflen;        /* length of the contents, excluding the NUL */
} markup_string_ty;
130
/* The complete state of one parser instance.  */
struct _markup_parse_context_ty
{
  /* Callbacks that currently receive parse events.  Replaced while a
     subparser is pushed.  */
  const markup_parser_ty *parser;

  markup_parse_flags_ty flags;

  /* Current position in the input, both starting at 1.  Maintained by
     advance_char.  */
  int line_number;
  int char_number;

  markup_parse_state_ty state;

  /* Opaque pointer handed to every callback of PARSER.  */
  void *user_data;

  /* A piece of character data or an element that
   * hasn't "ended" yet so we haven't yet called
   * the callback for it.
   */
  markup_string_ty *partial_chunk;

  /* Names of the currently open elements; the innermost element is at
     index 0.  */
  gl_list_t tag_stack; /* <markup_string_ty> */

  /* Attributes collected for the open tag being parsed: two parallel,
     NULL-terminated arrays.  CUR_ATTR is the index of the last attribute
     (-1 when there is none); ALLOC_ATTRS is the number of allocated
     slots in each array.  */
  char **attr_names;
  char **attr_values;
  int cur_attr;
  int alloc_attrs;

  /* The chunk of input passed to the current markup_parse_context_parse
     call.  */
  const char *current_text;
  ssize_t current_text_len;
  const char *current_text_end;

  /* used to save the start of the last interesting thingy */
  const char *start;

  /* Read cursor within CURRENT_TEXT.  */
  const char *iter;

  /* Copy of the most recent error message, or NULL (see emit_error).  */
  char *error_text;

  unsigned int document_empty : 1;      /* cleared once the first '<' is seen */
  unsigned int parsing : 1;             /* set while a parse call is running */
  unsigned int awaiting_pop : 1;        /* a subparser was popped and
                                           markup_parse_context_pop has not
                                           acknowledged it yet */
  /* Set to 1 when a passthrough section starts.  NOTE(review): presumably
     tracks '<' / '>' nesting inside passthrough; confirm against the
     passthrough state handler.  */
  int balance;

  /* subparser support */
  gl_list_t subparser_stack; /* <markup_recursion_tracker_ty *> */
  /* Name of the element for which the innermost subparser was pushed;
     compared by pointer identity in possibly_finish_subparser.  */
  const char *subparser_element;
};
177
178 static markup_string_ty *
markup_string_new(void)179 markup_string_new (void)
180 {
181 return XZALLOC (markup_string_ty);
182 }
183
184 static char *
markup_string_free(markup_string_ty * string,bool free_segment)185 markup_string_free (markup_string_ty *string, bool free_segment)
186 {
187 if (free_segment)
188 {
189 free (string->buffer);
190 free (string);
191 return NULL;
192 }
193 else
194 {
195 char *result = string->buffer;
196 free (string);
197 return result;
198 }
199 }
200
201 static void
markup_string_free1(markup_string_ty * string)202 markup_string_free1 (markup_string_ty *string)
203 {
204 markup_string_free (string, true);
205 }
206
207 static void
markup_string_truncate(markup_string_ty * string,size_t length)208 markup_string_truncate (markup_string_ty *string, size_t length)
209 {
210 assert (string && length < string->buflen - 1);
211 string->buffer[length] = '\0';
212 string->buflen = length;
213 }
214
215 static void
markup_string_append(markup_string_ty * string,const char * to_append,size_t length)216 markup_string_append (markup_string_ty *string, const char *to_append,
217 size_t length)
218 {
219 if (string->buflen + length + 1 > string->bufmax)
220 {
221 string->bufmax *= 2;
222 if (string->buflen + length + 1 > string->bufmax)
223 string->bufmax = string->buflen + length + 1;
224 string->buffer = xrealloc (string->buffer, string->bufmax);
225 }
226 memcpy (string->buffer + string->buflen, to_append, length);
227 string->buffer[length] = '\0';
228 string->buflen = length;
229 }
230
231 static inline void
string_blank(markup_string_ty * string)232 string_blank (markup_string_ty *string)
233 {
234 if (string->bufmax > 0)
235 {
236 *string->buffer = '\0';
237 string->buflen = 0;
238 }
239 }
240
241 /* Creates a new parse context. A parse context is used to parse
242 marked-up documents. You can feed any number of documents into a
243 context, as long as no errors occur; once an error occurs, the
244 parse context can't continue to parse text (you have to free it and
245 create a new parse context). */
246 markup_parse_context_ty *
markup_parse_context_new(const markup_parser_ty * parser,markup_parse_flags_ty flags,void * user_data)247 markup_parse_context_new (const markup_parser_ty *parser,
248 markup_parse_flags_ty flags,
249 void *user_data)
250 {
251 markup_parse_context_ty *context;
252
253 assert (parser != NULL);
254
255 context = XMALLOC (markup_parse_context_ty);
256
257 context->parser = parser;
258 context->flags = flags;
259 context->user_data = user_data;
260
261 context->line_number = 1;
262 context->char_number = 1;
263
264 context->partial_chunk = NULL;
265
266 context->state = STATE_START;
267 context->tag_stack =
268 gl_list_create_empty (GL_LINKED_LIST,
269 NULL, NULL,
270 (gl_listelement_dispose_fn) markup_string_free1,
271 true);
272 context->attr_names = NULL;
273 context->attr_values = NULL;
274 context->cur_attr = -1;
275 context->alloc_attrs = 0;
276
277 context->current_text = NULL;
278 context->current_text_len = -1;
279 context->current_text_end = NULL;
280
281 context->start = NULL;
282 context->iter = NULL;
283
284 context->error_text = NULL;
285
286 context->document_empty = true;
287 context->parsing = false;
288
289 context->awaiting_pop = false;
290 context->subparser_stack =
291 gl_list_create_empty (GL_LINKED_LIST,
292 NULL, NULL,
293 (gl_listelement_dispose_fn) free,
294 true);
295 context->subparser_element = NULL;
296
297 context->balance = 0;
298
299 return context;
300 }
301
302 static void clear_attributes (markup_parse_context_ty *context);
303
304 /* Frees a parse context. This function can't be called from inside
305 one of the markup_parser_ty functions or while a subparser is
306 pushed. */
307 void
markup_parse_context_free(markup_parse_context_ty * context)308 markup_parse_context_free (markup_parse_context_ty *context)
309 {
310 assert (context != NULL);
311 assert (!context->parsing);
312 assert (gl_list_size (context->subparser_stack) == 0);
313 assert (!context->awaiting_pop);
314
315 clear_attributes (context);
316 free (context->attr_names);
317 free (context->attr_values);
318
319 gl_list_free (context->tag_stack);
320 gl_list_free (context->subparser_stack);
321
322 if (context->partial_chunk)
323 markup_string_free (context->partial_chunk, true);
324
325 free (context->error_text);
326
327 free (context);
328 }
329
330 static void pop_subparser_stack (markup_parse_context_ty *context);
331
332 static void
emit_error(markup_parse_context_ty * context,const char * error_text)333 emit_error (markup_parse_context_ty *context, const char *error_text)
334 {
335 context->state = STATE_ERROR;
336
337 if (context->parser->error)
338 (*context->parser->error) (context, error_text, context->user_data);
339
340 /* report the error all the way up to free all the user-data */
341 while (gl_list_size (context->subparser_stack) > 0)
342 {
343 pop_subparser_stack (context);
344 context->awaiting_pop = false; /* already been freed */
345
346 if (context->parser->error)
347 (*context->parser->error) (context, error_text, context->user_data);
348 }
349
350 if (context->error_text)
351 free (context->error_text);
352 context->error_text = xstrdup (error_text);
353 }
354
/* True if C is one of the ASCII characters that commonly follow an element
   or attribute name: '=', '/', '>' or ' '.  C may be evaluated up to four
   times, so it must not have side effects.  */
#define IS_COMMON_NAME_END_CHAR(c) \
  ((c) == '=' || (c) == '/' || (c) == '>' || (c) == ' ')
357
358 static bool
slow_name_validate(markup_parse_context_ty * context,const char * name)359 slow_name_validate (markup_parse_context_ty *context, const char *name)
360 {
361 const char *p = name;
362 ucs4_t uc;
363
364 if (u8_check ((uint8_t *) name, strlen (name)) != NULL)
365 {
366 emit_error (context, _("invalid UTF-8 sequence"));
367 return false;
368 }
369
370 if (!(c_isalpha (*p)
371 || (!IS_COMMON_NAME_END_CHAR (*p)
372 && (*p == '_'
373 || *p == ':'
374 || (u8_mbtouc (&uc, (uint8_t *) name, strlen (name)) > 0
375 && uc_is_alpha (uc))))))
376 {
377 char *error_text = xasprintf (_("'%s' is not a valid name: %c"),
378 name, *p);
379 emit_error (context, error_text);
380 free (error_text);
381 return false;
382 }
383
384 for (p = (char *) u8_next (&uc, (uint8_t *) name);
385 p != NULL;
386 p = (char *) u8_next (&uc, (uint8_t *) p))
387 {
388 /* is_name_char */
389 if (!(c_isalnum (*p) ||
390 (!IS_COMMON_NAME_END_CHAR (*p) &&
391 (*p == '.' ||
392 *p == '-' ||
393 *p == '_' ||
394 *p == ':' ||
395 uc_is_alpha (uc)))))
396 {
397 char *error_text = xasprintf (_("'%s' is not a valid name: '%c'"),
398 name, *p);
399 emit_error (context, error_text);
400 free (error_text);
401 return false;
402 }
403 }
404 return true;
405 }
406
407 /*
408 * Use me for elements, attributes etc.
409 */
410 static bool
name_validate(markup_parse_context_ty * context,const char * name)411 name_validate (markup_parse_context_ty *context, const char *name)
412 {
413 char mask;
414 const char *p;
415
416 /* name start char */
417 p = name;
418 if (IS_COMMON_NAME_END_CHAR (*p)
419 || !(c_isalpha (*p) || *p == '_' || *p == ':'))
420 goto slow_validate;
421
422 for (mask = *p++; *p != '\0'; p++)
423 {
424 mask |= *p;
425
426 /* is_name_char */
427 if (!(c_isalnum (*p)
428 || (!IS_COMMON_NAME_END_CHAR (*p)
429 && (*p == '.' || *p == '-' || *p == '_' || *p == ':'))))
430 goto slow_validate;
431 }
432
433 if (mask & 0x80) /* un-common / non-ascii */
434 goto slow_validate;
435
436 return true;
437
438 slow_validate:
439 return slow_name_validate (context, name);
440 }
441
442 static bool
text_validate(markup_parse_context_ty * context,const char * p,int len)443 text_validate (markup_parse_context_ty *context,
444 const char *p,
445 int len)
446 {
447 if (u8_check ((const uint8_t *) p, len) != NULL)
448 {
449 emit_error (context, _("invalid UTF-8 sequence"));
450 return false;
451 }
452 else
453 return true;
454 }
455
456 /*
457 * re-write the GString in-place, unescaping anything that escaped.
458 * most XML does not contain entities, or escaping.
459 */
460 static bool
unescape_string_inplace(markup_parse_context_ty * context,markup_string_ty * string,bool * is_ascii)461 unescape_string_inplace (markup_parse_context_ty *context,
462 markup_string_ty *string,
463 bool *is_ascii)
464 {
465 char mask, *to;
466 const char *from;
467 bool normalize_attribute;
468
469 if (string->buflen == 0)
470 return true;
471
472 *is_ascii = false;
473
474 /* are we unescaping an attribute or not ? */
475 if (context->state == STATE_INSIDE_ATTRIBUTE_VALUE_SQ
476 || context->state == STATE_INSIDE_ATTRIBUTE_VALUE_DQ)
477 normalize_attribute = true;
478 else
479 normalize_attribute = false;
480
481 /*
482 * Meeks' theorem: unescaping can only shrink text.
483 * for < etc. this is obvious, for  more
484 * thought is required, but this is patently so.
485 */
486 mask = 0;
487 for (from = to = string->buffer; *from != '\0'; from++, to++)
488 {
489 *to = *from;
490
491 mask |= *to;
492 if (normalize_attribute && (*to == '\t' || *to == '\n'))
493 *to = ' ';
494 if (*to == '\r')
495 {
496 *to = normalize_attribute ? ' ' : '\n';
497 if (from[1] == '\n')
498 from++;
499 }
500 if (*from == '&')
501 {
502 from++;
503 if (*from == '#')
504 {
505 int base = 10;
506 unsigned long l;
507 char *end = NULL;
508
509 from++;
510
511 if (*from == 'x')
512 {
513 base = 16;
514 from++;
515 }
516
517 errno = 0;
518 l = strtoul (from, &end, base);
519
520 if (end == from || errno != 0)
521 {
522 char *error_text =
523 xasprintf (_("invalid character reference: %s"),
524 errno != 0
525 ? strerror (errno)
526 : _("not a valid number specification"));
527 emit_error (context, error_text);
528 free (error_text);
529 return false;
530 }
531 else if (*end != ';')
532 {
533 char *error_text =
534 xasprintf (_("invalid character reference: %s"),
535 _("no ending ';'"));
536 emit_error (context, error_text);
537 free (error_text);
538 return false;
539 }
540 else
541 {
542 /* characters XML 1.1 permits */
543 if ((0 < l && l <= 0xD7FF) ||
544 (0xE000 <= l && l <= 0xFFFD) ||
545 (0x10000 <= l && l <= 0x10FFFF))
546 {
547 char buf[8];
548 int length;
549 length = u8_uctomb ((uint8_t *) buf, l, 8);
550 memcpy (to, buf, length);
551 to += length - 1;
552 from = end;
553 if (l >= 0x80) /* not ascii */
554 mask |= 0x80;
555 }
556 else
557 {
558 char *error_text =
559 xasprintf (_("invalid character reference: %s"),
560 _("non-permitted character"));
561 emit_error (context, error_text);
562 free (error_text);
563 return false;
564 }
565 }
566 }
567
568 else if (strncmp (from, "lt;", 3) == 0)
569 {
570 *to = '<';
571 from += 2;
572 }
573 else if (strncmp (from, "gt;", 3) == 0)
574 {
575 *to = '>';
576 from += 2;
577 }
578 else if (strncmp (from, "amp;", 4) == 0)
579 {
580 *to = '&';
581 from += 3;
582 }
583 else if (strncmp (from, "quot;", 5) == 0)
584 {
585 *to = '"';
586 from += 4;
587 }
588 else if (strncmp (from, "apos;", 5) == 0)
589 {
590 *to = '\'';
591 from += 4;
592 }
593 else
594 {
595 const char *reason;
596 char *error_text;
597
598 if (*from == ';')
599 reason = _("empty");
600 else
601 {
602 const char *end = strchr (from, ';');
603 if (end)
604 reason = _("unknown");
605 else
606 reason = _("no ending ';'");
607 }
608 error_text = xasprintf (_("invalid entity reference: %s"),
609 reason);
610 emit_error (context, error_text);
611 free (error_text);
612 return false;
613 }
614 }
615 }
616
617 assert (to - string->buffer <= string->buflen);
618 if (to - string->buffer != string->buflen)
619 markup_string_truncate (string, to - string->buffer);
620
621 *is_ascii = !(mask & 0x80);
622
623 return true;
624 }
625
626 static inline bool
advance_char(markup_parse_context_ty * context)627 advance_char (markup_parse_context_ty *context)
628 {
629 context->iter++;
630 context->char_number++;
631
632 if (context->iter == context->current_text_end)
633 return false;
634
635 else if (*context->iter == '\n')
636 {
637 context->line_number++;
638 context->char_number = 1;
639 }
640
641 return true;
642 }
643
/* Returns true if C is one of the four whitespace characters recognized
   here: space, tab, newline, carriage return.  */
static inline bool
xml_isspace (char c)
{
  switch (c)
    {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
      return true;
    default:
      return false;
    }
}
649
650 static void
skip_spaces(markup_parse_context_ty * context)651 skip_spaces (markup_parse_context_ty *context)
652 {
653 do
654 {
655 if (!xml_isspace (*context->iter))
656 return;
657 }
658 while (advance_char (context));
659 }
660
661 static void
advance_to_name_end(markup_parse_context_ty * context)662 advance_to_name_end (markup_parse_context_ty *context)
663 {
664 do
665 {
666 if (IS_COMMON_NAME_END_CHAR (*(context->iter)))
667 return;
668 if (xml_isspace (*(context->iter)))
669 return;
670 }
671 while (advance_char (context));
672 }
673
674 static void
add_to_partial(markup_parse_context_ty * context,const char * text_start,const char * text_end)675 add_to_partial (markup_parse_context_ty *context,
676 const char *text_start,
677 const char *text_end)
678 {
679 if (context->partial_chunk == NULL)
680 { /* allocate a new chunk to parse into */
681
682 context->partial_chunk = markup_string_new ();
683 }
684
685 if (text_start != text_end)
686 markup_string_append (context->partial_chunk,
687 text_start, text_end - text_start);
688 }
689
690 static inline void
truncate_partial(markup_parse_context_ty * context)691 truncate_partial (markup_parse_context_ty *context)
692 {
693 if (context->partial_chunk != NULL)
694 string_blank (context->partial_chunk);
695 }
696
697 static inline const char*
current_element(markup_parse_context_ty * context)698 current_element (markup_parse_context_ty *context)
699 {
700 const markup_string_ty *string = gl_list_get_at (context->tag_stack, 0);
701 return string->buffer;
702 }
703
704 static void
pop_subparser_stack(markup_parse_context_ty * context)705 pop_subparser_stack (markup_parse_context_ty *context)
706 {
707 markup_recursion_tracker_ty *tracker;
708
709 assert (gl_list_size (context->subparser_stack) > 0);
710
711 tracker = (markup_recursion_tracker_ty *) gl_list_get_at (context->subparser_stack, 0);
712
713 context->awaiting_pop = true;
714
715 context->user_data = tracker->prev_user_data;
716 context->parser = tracker->prev_parser;
717 context->subparser_element = tracker->prev_element;
718 free (tracker);
719
720 gl_list_remove_at (context->subparser_stack, 0);
721 }
722
723 static void
push_partial_as_tag(markup_parse_context_ty * context)724 push_partial_as_tag (markup_parse_context_ty *context)
725 {
726 gl_list_add_first (context->tag_stack, context->partial_chunk);
727 context->partial_chunk = NULL;
728 }
729
730 static void
pop_tag(markup_parse_context_ty * context)731 pop_tag (markup_parse_context_ty *context)
732 {
733 gl_list_remove_at (context->tag_stack, 0);
734 }
735
736 static void
possibly_finish_subparser(markup_parse_context_ty * context)737 possibly_finish_subparser (markup_parse_context_ty *context)
738 {
739 if (current_element (context) == context->subparser_element)
740 pop_subparser_stack (context);
741 }
742
743 static void
ensure_no_outstanding_subparser(markup_parse_context_ty * context)744 ensure_no_outstanding_subparser (markup_parse_context_ty *context)
745 {
746 context->awaiting_pop = false;
747 }
748
749 static void
add_attribute(markup_parse_context_ty * context,markup_string_ty * string)750 add_attribute (markup_parse_context_ty *context, markup_string_ty *string)
751 {
752 if (context->cur_attr + 2 >= context->alloc_attrs)
753 {
754 context->alloc_attrs += 5; /* silly magic number */
755 context->attr_names = xrealloc (context->attr_names, sizeof (char *) * context->alloc_attrs);
756 context->attr_values = xrealloc (context->attr_values, sizeof(char *) * context->alloc_attrs);
757 }
758 context->cur_attr++;
759 context->attr_names[context->cur_attr] = xstrdup (string->buffer);
760 context->attr_values[context->cur_attr] = NULL;
761 context->attr_names[context->cur_attr+1] = NULL;
762 context->attr_values[context->cur_attr+1] = NULL;
763 }
764
765 static void
clear_attributes(markup_parse_context_ty * context)766 clear_attributes (markup_parse_context_ty *context)
767 {
768 /* Go ahead and free the attributes. */
769 for (; context->cur_attr >= 0; context->cur_attr--)
770 {
771 int pos = context->cur_attr;
772 free (context->attr_names[pos]);
773 free (context->attr_values[pos]);
774 context->attr_names[pos] = context->attr_values[pos] = NULL;
775 }
776 assert (context->cur_attr == -1);
777 assert (context->attr_names == NULL ||
778 context->attr_names[0] == NULL);
779 assert (context->attr_values == NULL ||
780 context->attr_values[0] == NULL);
781 }
782
783 static void
markup_parse_context_push(markup_parse_context_ty * context,const markup_parser_ty * parser,void * user_data)784 markup_parse_context_push (markup_parse_context_ty *context,
785 const markup_parser_ty *parser,
786 void *user_data)
787 {
788 markup_recursion_tracker_ty *tracker;
789
790 tracker = XMALLOC (markup_recursion_tracker_ty);
791 tracker->prev_element = context->subparser_element;
792 tracker->prev_parser = context->parser;
793 tracker->prev_user_data = context->user_data;
794
795 context->subparser_element = current_element (context);
796 context->parser = parser;
797 context->user_data = user_data;
798
799 gl_list_add_first (context->subparser_stack, tracker);
800 }
801
802 static void
markup_parse_context_pop(markup_parse_context_ty * context)803 markup_parse_context_pop (markup_parse_context_ty *context)
804 {
805 if (!context->awaiting_pop)
806 possibly_finish_subparser (context);
807
808 assert (context->awaiting_pop);
809
810 context->awaiting_pop = false;
811 }
812
813 /* This has to be a separate function to ensure the alloca's
814 * are unwound on exit - otherwise we grow & blow the stack
815 * with large documents
816 */
817 static inline void
emit_start_element(markup_parse_context_ty * context)818 emit_start_element (markup_parse_context_ty *context)
819 {
820 int i, j = 0;
821 const char *start_name;
822 const char **attr_names;
823 const char **attr_values;
824
825 /* In case we want to ignore qualified tags and we see that we have
826 * one here, we push a subparser. This will ignore all tags inside of
827 * the qualified tag.
828 *
829 * We deal with the end of the subparser from emit_end_element.
830 */
831 if ((context->flags & MARKUP_IGNORE_QUALIFIED)
832 && strchr (current_element (context), ':'))
833 {
834 static const markup_parser_ty ignore_parser;
835 markup_parse_context_push (context, &ignore_parser, NULL);
836 clear_attributes (context);
837 return;
838 }
839
840 attr_names = XCALLOC (context->cur_attr + 2, const char *);
841 attr_values = XCALLOC (context->cur_attr + 2, const char *);
842 for (i = 0; i < context->cur_attr + 1; i++)
843 {
844 /* Possibly omit qualified attribute names from the list */
845 if ((context->flags & MARKUP_IGNORE_QUALIFIED)
846 && strchr (context->attr_names[i], ':'))
847 continue;
848
849 attr_names[j] = context->attr_names[i];
850 attr_values[j] = context->attr_values[i];
851 j++;
852 }
853 attr_names[j] = NULL;
854 attr_values[j] = NULL;
855
856 /* Call user callback for element start */
857 start_name = current_element (context);
858
859 if (context->parser->start_element && name_validate (context, start_name))
860 (* context->parser->start_element) (context,
861 start_name,
862 (const char **)attr_names,
863 (const char **)attr_values,
864 context->user_data);
865 free (attr_names);
866 free (attr_values);
867 clear_attributes (context);
868 }
869
870 static void
emit_end_element(markup_parse_context_ty * context)871 emit_end_element (markup_parse_context_ty *context)
872 {
873 assert (gl_list_size (context->tag_stack) != 0);
874
875 possibly_finish_subparser (context);
876
877 /* We might have just returned from our ignore subparser */
878 if ((context->flags & MARKUP_IGNORE_QUALIFIED)
879 && strchr (current_element (context), ':'))
880 {
881 markup_parse_context_pop (context);
882 pop_tag (context);
883 return;
884 }
885
886 if (context->parser->end_element)
887 (* context->parser->end_element) (context,
888 current_element (context),
889 context->user_data);
890
891 ensure_no_outstanding_subparser (context);
892
893 pop_tag (context);
894 }
895
896 /* Feed some data to the parse context. The data need not be valid
897 UTF-8; an error will be signaled if it's invalid. The data need
898 not be an entire document; you can feed a document into the parser
899 incrementally, via multiple calls to this function. Typically, as
900 you receive data from a network connection or file, you feed each
901 received chunk of data into this function, aborting the process if
902 an error occurs. Once an error is reported, no further data may be
903 fed to the parse context; all errors are fatal. */
904 bool
markup_parse_context_parse(markup_parse_context_ty * context,const char * text,ssize_t text_len)905 markup_parse_context_parse (markup_parse_context_ty *context,
906 const char *text,
907 ssize_t text_len)
908 {
909 assert (context != NULL);
910 assert (text != NULL);
911 assert (context->state != STATE_ERROR);
912 assert (!context->parsing);
913
914 if (text_len < 0)
915 text_len = strlen (text);
916
917 if (text_len == 0)
918 return true;
919
920 context->parsing = true;
921
922
923 context->current_text = text;
924 context->current_text_len = text_len;
925 context->current_text_end = context->current_text + text_len;
926 context->iter = context->current_text;
927 context->start = context->iter;
928
929 while (context->iter != context->current_text_end)
930 {
931 switch (context->state)
932 {
933 case STATE_START:
934 /* Possible next state: AFTER_OPEN_ANGLE */
935
936 assert (gl_list_size (context->tag_stack) == 0);
937
938 /* whitespace is ignored outside of any elements */
939 skip_spaces (context);
940
941 if (context->iter != context->current_text_end)
942 {
943 if (*context->iter == '<')
944 {
945 /* Move after the open angle */
946 advance_char (context);
947
948 context->state = STATE_AFTER_OPEN_ANGLE;
949
950 /* this could start a passthrough */
951 context->start = context->iter;
952
953 /* document is now non-empty */
954 context->document_empty = false;
955 }
956 else
957 {
958 emit_error (context,
959 _("document must begin with an element"));
960 }
961 }
962 break;
963
964 case STATE_AFTER_OPEN_ANGLE:
965 /* Possible next states: INSIDE_OPEN_TAG_NAME,
966 * AFTER_CLOSE_TAG_SLASH, INSIDE_PASSTHROUGH
967 */
968 if (*context->iter == '?' ||
969 *context->iter == '!')
970 {
971 /* include < in the passthrough */
972 const char *openangle = "<";
973 add_to_partial (context, openangle, openangle + 1);
974 context->start = context->iter;
975 context->balance = 1;
976 context->state = STATE_INSIDE_PASSTHROUGH;
977 }
978 else if (*context->iter == '/')
979 {
980 /* move after it */
981 advance_char (context);
982
983 context->state = STATE_AFTER_CLOSE_TAG_SLASH;
984 }
985 else if (!IS_COMMON_NAME_END_CHAR (*(context->iter)))
986 {
987 context->state = STATE_INSIDE_OPEN_TAG_NAME;
988
989 /* start of tag name */
990 context->start = context->iter;
991 }
992 else
993 {
994 char *error_text = xasprintf (_("invalid character after '%s'"),
995 "<");
996 emit_error (context, error_text);
997 free (error_text);
998 }
999 break;
1000
1001 /* The AFTER_CLOSE_ANGLE state is actually sort of
1002 * broken, because it doesn't correspond to a range
1003 * of characters in the input stream as the others do,
1004 * and thus makes things harder to conceptualize
1005 */
1006 case STATE_AFTER_CLOSE_ANGLE:
1007 /* Possible next states: INSIDE_TEXT, STATE_START */
1008 if (gl_list_size (context->tag_stack) == 0)
1009 {
1010 context->start = NULL;
1011 context->state = STATE_START;
1012 }
1013 else
1014 {
1015 context->start = context->iter;
1016 context->state = STATE_INSIDE_TEXT;
1017 }
1018 break;
1019
1020 case STATE_AFTER_ELISION_SLASH:
1021 /* Possible next state: AFTER_CLOSE_ANGLE */
1022 if (*context->iter == '>')
1023 {
1024 /* move after the close angle */
1025 advance_char (context);
1026 context->state = STATE_AFTER_CLOSE_ANGLE;
1027 emit_end_element (context);
1028 }
1029 else
1030 {
1031 char *error_text = xasprintf (_("missing '%c'"), '>');
1032 emit_error (context, error_text);
1033 free (error_text);
1034 }
1035 break;
1036
1037 case STATE_INSIDE_OPEN_TAG_NAME:
1038 /* Possible next states: BETWEEN_ATTRIBUTES */
1039
1040 /* if there's a partial chunk then it's the first part of the
1041 * tag name. If there's a context->start then it's the start
1042 * of the tag name in current_text, the partial chunk goes
1043 * before that start though.
1044 */
1045 advance_to_name_end (context);
1046
1047 if (context->iter == context->current_text_end)
1048 {
1049 /* The name hasn't necessarily ended. Merge with
1050 * partial chunk, leave state unchanged.
1051 */
1052 add_to_partial (context, context->start, context->iter);
1053 }
1054 else
1055 {
1056 /* The name has ended. Combine it with the partial chunk
1057 * if any; push it on the stack; enter next state.
1058 */
1059 add_to_partial (context, context->start, context->iter);
1060 push_partial_as_tag (context);
1061
1062 context->state = STATE_BETWEEN_ATTRIBUTES;
1063 context->start = NULL;
1064 }
1065 break;
1066
1067 case STATE_INSIDE_ATTRIBUTE_NAME:
1068 /* Possible next states: AFTER_ATTRIBUTE_NAME */
1069
1070 advance_to_name_end (context);
1071 add_to_partial (context, context->start, context->iter);
1072
1073 /* read the full name, if we enter the equals sign state
1074 * then add the attribute to the list (without the value),
1075 * otherwise store a partial chunk to be prepended later.
1076 */
1077 if (context->iter != context->current_text_end)
1078 context->state = STATE_AFTER_ATTRIBUTE_NAME;
1079 break;
1080
1081 case STATE_AFTER_ATTRIBUTE_NAME:
1082 /* Possible next states: AFTER_ATTRIBUTE_EQUALS_SIGN */
1083
1084 skip_spaces (context);
1085
1086 if (context->iter != context->current_text_end)
1087 {
1088 /* The name has ended. Combine it with the partial chunk
1089 * if any; push it on the stack; enter next state.
1090 */
1091 if (!name_validate (context, context->partial_chunk->buffer))
1092 break;
1093
1094 add_attribute (context, context->partial_chunk);
1095
1096 markup_string_free (context->partial_chunk, true);
1097 context->partial_chunk = NULL;
1098 context->start = NULL;
1099
1100 if (*context->iter == '=')
1101 {
1102 advance_char (context);
1103 context->state = STATE_AFTER_ATTRIBUTE_EQUALS_SIGN;
1104 }
1105 else
1106 {
1107 char *error_text = xasprintf (_("missing '%c'"), '=');
1108 emit_error (context, error_text);
1109 free (error_text);
1110 }
1111 }
1112 break;
1113
1114 case STATE_BETWEEN_ATTRIBUTES:
1115 /* Possible next states: AFTER_CLOSE_ANGLE,
1116 * AFTER_ELISION_SLASH, INSIDE_ATTRIBUTE_NAME
1117 */
1118 skip_spaces (context);
1119
1120 if (context->iter != context->current_text_end)
1121 {
1122 if (*context->iter == '/')
1123 {
1124 advance_char (context);
1125 context->state = STATE_AFTER_ELISION_SLASH;
1126 }
1127 else if (*context->iter == '>')
1128 {
1129 advance_char (context);
1130 context->state = STATE_AFTER_CLOSE_ANGLE;
1131 }
1132 else if (!IS_COMMON_NAME_END_CHAR (*(context->iter)))
1133 {
1134 context->state = STATE_INSIDE_ATTRIBUTE_NAME;
1135 /* start of attribute name */
1136 context->start = context->iter;
1137 }
1138 else
1139 {
1140 char *error_text = xasprintf (_("missing '%c' or '%c'"),
1141 '>', '/');
1142 emit_error (context, error_text);
1143 free (error_text);
1144 }
1145
1146 /* If we're done with attributes, invoke
1147 * the start_element callback
1148 */
1149 if (context->state == STATE_AFTER_ELISION_SLASH ||
1150 context->state == STATE_AFTER_CLOSE_ANGLE)
1151 emit_start_element (context);
1152 }
1153 break;
1154
1155 case STATE_AFTER_ATTRIBUTE_EQUALS_SIGN:
1156 /* Possible next state: INSIDE_ATTRIBUTE_VALUE_[SQ/DQ] */
1157
1158 skip_spaces (context);
1159
1160 if (context->iter != context->current_text_end)
1161 {
1162 if (*context->iter == '"')
1163 {
1164 advance_char (context);
1165 context->state = STATE_INSIDE_ATTRIBUTE_VALUE_DQ;
1166 context->start = context->iter;
1167 }
1168 else if (*context->iter == '\'')
1169 {
1170 advance_char (context);
1171 context->state = STATE_INSIDE_ATTRIBUTE_VALUE_SQ;
1172 context->start = context->iter;
1173 }
1174 else
1175 {
1176 char *error_text = xasprintf (_("missing '%c' or '%c'"),
1177 '\'', '"');
1178 emit_error (context, error_text);
1179 free (error_text);
1180 }
1181 }
1182 break;
1183
1184 case STATE_INSIDE_ATTRIBUTE_VALUE_SQ:
1185 case STATE_INSIDE_ATTRIBUTE_VALUE_DQ:
1186 /* Possible next states: BETWEEN_ATTRIBUTES */
1187 {
1188 char delim;
1189
1190 if (context->state == STATE_INSIDE_ATTRIBUTE_VALUE_SQ)
1191 {
1192 delim = '\'';
1193 }
1194 else
1195 {
1196 delim = '"';
1197 }
1198
1199 do
1200 {
1201 if (*context->iter == delim)
1202 break;
1203 }
1204 while (advance_char (context));
1205 }
1206 if (context->iter == context->current_text_end)
1207 {
1208 /* The value hasn't necessarily ended. Merge with
1209 * partial chunk, leave state unchanged.
1210 */
1211 add_to_partial (context, context->start, context->iter);
1212 }
1213 else
1214 {
1215 bool is_ascii;
1216 /* The value has ended at the quote mark. Combine it
1217 * with the partial chunk if any; set it for the current
1218 * attribute.
1219 */
1220 add_to_partial (context, context->start, context->iter);
1221
1222 assert (context->cur_attr >= 0);
1223
1224 if (unescape_string_inplace (context, context->partial_chunk,
1225 &is_ascii)
1226 && (is_ascii
1227 || text_validate (context,
1228 context->partial_chunk->buffer,
1229 context->partial_chunk->buflen)))
1230 {
1231 /* success, advance past quote and set state. */
1232 context->attr_values[context->cur_attr] =
1233 markup_string_free (context->partial_chunk, false);
1234 context->partial_chunk = NULL;
1235 advance_char (context);
1236 context->state = STATE_BETWEEN_ATTRIBUTES;
1237 context->start = NULL;
1238 }
1239
1240 truncate_partial (context);
1241 }
1242 break;
1243
1244 case STATE_INSIDE_TEXT:
1245 /* Possible next states: AFTER_OPEN_ANGLE */
1246 do
1247 {
1248 if (*context->iter == '<')
1249 break;
1250 }
1251 while (advance_char (context));
1252
1253 /* The text hasn't necessarily ended. Merge with
1254 * partial chunk, leave state unchanged.
1255 */
1256
1257 add_to_partial (context, context->start, context->iter);
1258
1259 if (context->iter != context->current_text_end)
1260 {
1261 bool is_ascii;
1262
1263 /* The text has ended at the open angle. Call the text
1264 * callback.
1265 */
1266 if (unescape_string_inplace (context, context->partial_chunk,
1267 &is_ascii)
1268 && (is_ascii
1269 || text_validate (context,
1270 context->partial_chunk->buffer,
1271 context->partial_chunk->buflen)))
1272 {
1273 if (context->parser->text)
1274 (*context->parser->text) (context,
1275 context->partial_chunk->buffer,
1276 context->partial_chunk->buflen,
1277 context->user_data);
1278
1279 /* advance past open angle and set state. */
1280 advance_char (context);
1281 context->state = STATE_AFTER_OPEN_ANGLE;
1282 /* could begin a passthrough */
1283 context->start = context->iter;
1284 }
1285
1286 truncate_partial (context);
1287 }
1288 break;
1289
1290 case STATE_AFTER_CLOSE_TAG_SLASH:
1291 /* Possible next state: INSIDE_CLOSE_TAG_NAME */
1292 if (!IS_COMMON_NAME_END_CHAR (*(context->iter)))
1293 {
1294 context->state = STATE_INSIDE_CLOSE_TAG_NAME;
1295
1296 /* start of tag name */
1297 context->start = context->iter;
1298 }
1299 else
1300 {
1301 char *error_text = xasprintf (_("invalid character after '%s'"),
1302 "</");
1303 emit_error (context, error_text);
1304 free (error_text);
1305 }
1306 break;
1307
1308 case STATE_INSIDE_CLOSE_TAG_NAME:
1309 /* Possible next state: AFTER_CLOSE_TAG_NAME */
1310 advance_to_name_end (context);
1311 add_to_partial (context, context->start, context->iter);
1312
1313 if (context->iter != context->current_text_end)
1314 context->state = STATE_AFTER_CLOSE_TAG_NAME;
1315 break;
1316
1317 case STATE_AFTER_CLOSE_TAG_NAME:
1318 /* Possible next state: AFTER_CLOSE_TAG_SLASH */
1319
1320 skip_spaces (context);
1321
1322 if (context->iter != context->current_text_end)
1323 {
1324 markup_string_ty *close_name;
1325
1326 close_name = context->partial_chunk;
1327 context->partial_chunk = NULL;
1328
1329 if (*context->iter != '>')
1330 {
1331 char *error_text =
1332 xasprintf (_("invalid character after '%s'"),
1333 _("a close element name"));
1334 emit_error (context, error_text);
1335 free (error_text);
1336 }
1337 else if (gl_list_size (context->tag_stack) == 0)
1338 {
1339 emit_error (context, _("element is closed"));
1340 }
1341 else if (strcmp (close_name->buffer, current_element (context))
1342 != 0)
1343 {
1344 emit_error (context, _("element is closed"));
1345 }
1346 else
1347 {
1348 advance_char (context);
1349 context->state = STATE_AFTER_CLOSE_ANGLE;
1350 context->start = NULL;
1351
1352 emit_end_element (context);
1353 }
1354 context->partial_chunk = close_name;
1355 truncate_partial (context);
1356 }
1357 break;
1358
1359 case STATE_INSIDE_PASSTHROUGH:
1360 /* Possible next state: AFTER_CLOSE_ANGLE */
1361 do
1362 {
1363 if (*context->iter == '<')
1364 context->balance++;
1365 if (*context->iter == '>')
1366 {
1367 char *str;
1368 size_t len;
1369
1370 context->balance--;
1371 add_to_partial (context, context->start, context->iter);
1372 context->start = context->iter;
1373
1374 str = context->partial_chunk->buffer;
1375 len = context->partial_chunk->buflen;
1376
1377 if (str[1] == '?' && str[len - 1] == '?')
1378 break;
1379 if (strncmp (str, "<!--", 4) == 0 &&
1380 strcmp (str + len - 2, "--") == 0)
1381 break;
1382 if (strncmp (str, "<![CDATA[", 9) == 0 &&
1383 strcmp (str + len - 2, "]]") == 0)
1384 break;
1385 if (strncmp (str, "<!DOCTYPE", 9) == 0 &&
1386 context->balance == 0)
1387 break;
1388 }
1389 }
1390 while (advance_char (context));
1391
1392 if (context->iter == context->current_text_end)
1393 {
1394 /* The passthrough hasn't necessarily ended. Merge with
1395 * partial chunk, leave state unchanged.
1396 */
1397 add_to_partial (context, context->start, context->iter);
1398 }
1399 else
1400 {
1401 /* The passthrough has ended at the close angle. Combine
1402 * it with the partial chunk if any. Call the passthrough
1403 * callback. Note that the open/close angles are
1404 * included in the text of the passthrough.
1405 */
1406 advance_char (context); /* advance past close angle */
1407 add_to_partial (context, context->start, context->iter);
1408
1409 if (context->flags & MARKUP_TREAT_CDATA_AS_TEXT &&
1410 strncmp (context->partial_chunk->buffer, "<![CDATA[", 9) == 0)
1411 {
1412 if (context->parser->text &&
1413 text_validate (context,
1414 context->partial_chunk->buffer + 9,
1415 context->partial_chunk->buflen - 12))
1416 (*context->parser->text) (context,
1417 context->partial_chunk->buffer + 9,
1418 context->partial_chunk->buflen - 12,
1419 context->user_data);
1420 }
1421 else if (context->parser->passthrough &&
1422 text_validate (context,
1423 context->partial_chunk->buffer,
1424 context->partial_chunk->buflen))
1425 (*context->parser->passthrough) (context,
1426 context->partial_chunk->buffer,
1427 context->partial_chunk->buflen,
1428 context->user_data);
1429
1430 truncate_partial (context);
1431
1432 context->state = STATE_AFTER_CLOSE_ANGLE;
1433 context->start = context->iter; /* could begin text */
1434 }
1435 break;
1436
1437 case STATE_ERROR:
1438 goto finished;
1439 break;
1440
1441 default:
1442 abort ();
1443 break;
1444 }
1445 }
1446
1447 finished:
1448 context->parsing = false;
1449
1450 return context->state != STATE_ERROR;
1451 }
1452
1453 /* Signals to the parse context that all data has been fed into the
1454 * parse context with markup_parse_context_parse.
1455 *
1456 * This function reports an error if the document isn't complete,
1457 * for example if elements are still open. */
1458 bool
markup_parse_context_end_parse(markup_parse_context_ty * context)1459 markup_parse_context_end_parse (markup_parse_context_ty *context)
1460 {
1461 const char *location = NULL;
1462
1463 assert (context != NULL);
1464 assert (!context->parsing);
1465 assert (context->state != STATE_ERROR);
1466
1467 if (context->partial_chunk != NULL)
1468 {
1469 markup_string_free (context->partial_chunk, true);
1470 context->partial_chunk = NULL;
1471 }
1472
1473 if (context->document_empty)
1474 {
1475 emit_error (context, _("empty document"));
1476 return false;
1477 }
1478
1479 context->parsing = true;
1480
1481 switch (context->state)
1482 {
1483 case STATE_START:
1484 /* Nothing to do */
1485 break;
1486
1487 case STATE_AFTER_OPEN_ANGLE:
1488 location = _("after '<'");
1489 break;
1490
1491 case STATE_AFTER_CLOSE_ANGLE:
1492 if (gl_list_size (context->tag_stack) > 0)
1493 {
1494 /* Error message the same as for INSIDE_TEXT */
1495 location = _("elements still open");
1496 }
1497 break;
1498
1499 case STATE_AFTER_ELISION_SLASH:
1500 location = _("missing '>'");
1501 break;
1502
1503 case STATE_INSIDE_OPEN_TAG_NAME:
1504 location = _("inside an element name");
1505 break;
1506
1507 case STATE_INSIDE_ATTRIBUTE_NAME:
1508 case STATE_AFTER_ATTRIBUTE_NAME:
1509 location = _("inside an attribute name");
1510 break;
1511
1512 case STATE_BETWEEN_ATTRIBUTES:
1513 location = _("inside an open tag");
1514 break;
1515
1516 case STATE_AFTER_ATTRIBUTE_EQUALS_SIGN:
1517 location = _("after '='");
1518 break;
1519
1520 case STATE_INSIDE_ATTRIBUTE_VALUE_SQ:
1521 case STATE_INSIDE_ATTRIBUTE_VALUE_DQ:
1522 location = _("inside an attribute value");
1523 break;
1524
1525 case STATE_INSIDE_TEXT:
1526 assert (gl_list_size (context->tag_stack) > 0);
1527 location = _("elements still open");
1528 break;
1529
1530 case STATE_AFTER_CLOSE_TAG_SLASH:
1531 case STATE_INSIDE_CLOSE_TAG_NAME:
1532 case STATE_AFTER_CLOSE_TAG_NAME:
1533 location = _("inside the close tag");
1534 break;
1535
1536 case STATE_INSIDE_PASSTHROUGH:
1537 location = _("inside a comment or processing instruction");
1538 break;
1539
1540 case STATE_ERROR:
1541 default:
1542 abort ();
1543 break;
1544 }
1545
1546 if (location != NULL)
1547 {
1548 char *error_text = xasprintf (_("document ended unexpectedly: %s"),
1549 location);
1550 emit_error (context, error_text);
1551 free (error_text);
1552 }
1553
1554 context->parsing = false;
1555
1556 return context->state != STATE_ERROR;
1557 }
1558
1559 const char *
markup_parse_context_get_error(markup_parse_context_ty * context)1560 markup_parse_context_get_error (markup_parse_context_ty *context)
1561 {
1562 return context->error_text;
1563 }
1564