1 /* markup.c -- simple XML-like parser
2    Copyright (C) 2015, 2018 Free Software Foundation, Inc.
3 
4    This file is not part of the GNU gettext program, but is used with
5    GNU gettext.
6 
7    This is a stripped down version of GLib's gmarkup.c.  The original
8    copyright notice is as follows:
9 */
10 
11 /* gmarkup.c - Simple XML-like parser
12  *
13  *  Copyright 2000, 2003 Red Hat, Inc.
14  *  Copyright 2007, 2008 Ryan Lortie <desrt@desrt.ca>
15  *
16  * GLib is free software; you can redistribute it and/or modify it
17  * under the terms of the GNU General Public License as
18  * published by the Free Software Foundation; either version 3 of the
19  * License, or (at your option) any later version.
20  *
21  * GLib is distributed in the hope that it will be useful,
22  * but WITHOUT ANY WARRANTY; without even the implied warranty of
23  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
24  * General Public License for more details.
25  *
26  * You should have received a copy of the GNU General Public
27  * License along with GLib; see the file COPYING.LIB.  If not,
28  * see <https://www.gnu.org/licenses/>.
29  */
30 
31 #include "config.h"
32 
33 #include <assert.h>
34 #include <stdarg.h>
35 #include <string.h>
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <errno.h>
39 
40 /* Specification */
41 #include "markup.h"
42 
43 #include "c-ctype.h"
44 #include "gettext.h"
45 #include "gl_linked_list.h"
46 #include "gl_xlist.h"
47 #include "unictype.h"
48 #include "unistr.h"
49 #include "xalloc.h"
50 #include "xvasprintf.h"
51 
52 #define _(s) gettext(s)
53 
54 /**
55  * The "markup" parser is intended to parse a simple markup format
56  * that's a subset of XML.  This is a small, efficient, easy-to-use
57  * parser.  It should not be used if you expect to interoperate with
58  * other applications generating full-scale XML.  However, it's very
59  * useful for application data files, config files, etc. where you
60  * know your application will be the only one writing the file.
61  * Full-scale XML parsers should be able to parse the subset used by
62  * markup, so you can easily migrate to full-scale XML at a later
63  * time if the need arises.
64  *
65  * The parser is not guaranteed to signal an error on all invalid XML;
66  * the parser may accept documents that an XML parser would not.
67  * However, XML documents which are not well-formed (which is a weaker
68  * condition than being valid.  See the XML specification
69  * <https://www.w3.org/TR/REC-xml/> for definitions of these terms.)
70  * are not considered valid GMarkup documents.
71  *
72  * Simplifications to XML include:
73  *
74  * - Only UTF-8 encoding is allowed
75  *
76  * - No user-defined entities
77  *
78  * - Processing instructions, comments and the doctype declaration
79  *   are "passed through" but are not interpreted in any way
80  *
81  * - No DTD or validation
82  *
83  * The markup format does support:
84  *
85  * - Elements
86  *
87  * - Attributes
88  *
89  * - 5 standard entities: &amp; &lt; &gt; &quot; &apos;
90  *
91  * - Character references
92  *
93  * - Sections marked as CDATA
94  */
95 
/* States of the parser's state machine.  The parser is fed text
   incrementally, so the current state is stored in the parse context
   between calls.  */
typedef enum
{
  /* Outside of any element; the tag stack is empty.  */
  STATE_START,
  /* Positioned right after a '<'.  */
  STATE_AFTER_OPEN_ANGLE,
  /* Pseudo-state that dispatches to STATE_START or STATE_INSIDE_TEXT.  */
  STATE_AFTER_CLOSE_ANGLE,
  /* After the slash that obviates the need for an end element.  */
  STATE_AFTER_ELISION_SLASH,
  STATE_INSIDE_OPEN_TAG_NAME,
  STATE_INSIDE_ATTRIBUTE_NAME,
  STATE_AFTER_ATTRIBUTE_NAME,
  STATE_BETWEEN_ATTRIBUTES,
  STATE_AFTER_ATTRIBUTE_EQUALS_SIGN,
  /* Inside an attribute value (SQ/DQ presumably name the quote style).  */
  STATE_INSIDE_ATTRIBUTE_VALUE_SQ,
  STATE_INSIDE_ATTRIBUTE_VALUE_DQ,
  STATE_INSIDE_TEXT,
  STATE_AFTER_CLOSE_TAG_SLASH,
  STATE_INSIDE_CLOSE_TAG_NAME,
  STATE_AFTER_CLOSE_TAG_NAME,
  /* Inside a construct starting with "<?" or "<!".  */
  STATE_INSIDE_PASSTHROUGH,
  /* An error has been emitted; no further parsing is allowed.  */
  STATE_ERROR
} markup_parse_state_ty;
116 
117 typedef struct
118 {
119   const char *prev_element;
120   const markup_parser_ty *prev_parser;
121   void *prev_user_data;
122 } markup_recursion_tracker_ty;
123 
/* A growable, NUL-terminated character buffer.  */
typedef struct
{
  /* Heap-allocated storage; NULL as long as nothing has been appended.  */
  char *buffer;
  /* Number of bytes allocated for BUFFER.  */
  size_t bufmax;
  /* Number of bytes in use, not counting the terminating NUL.  */
  size_t buflen;
} markup_string_ty;
130 
131 struct _markup_parse_context_ty
132 {
133   const markup_parser_ty *parser;
134 
135   markup_parse_flags_ty flags;
136 
137   int line_number;
138   int char_number;
139 
140   markup_parse_state_ty state;
141 
142   void *user_data;
143 
144   /* A piece of character data or an element that
145    * hasn't "ended" yet so we haven't yet called
146    * the callback for it.
147    */
148   markup_string_ty *partial_chunk;
149 
150   gl_list_t tag_stack;          /* <markup_string_ty> */
151 
152   char **attr_names;
153   char **attr_values;
154   int cur_attr;
155   int alloc_attrs;
156 
157   const char *current_text;
158   ssize_t current_text_len;
159   const char *current_text_end;
160 
161   /* used to save the start of the last interesting thingy */
162   const char *start;
163 
164   const char *iter;
165 
166   char *error_text;
167 
168   unsigned int document_empty : 1;
169   unsigned int parsing : 1;
170   unsigned int awaiting_pop : 1;
171   int balance;
172 
173   /* subparser support */
174   gl_list_t subparser_stack;    /* <markup_recursion_tracker_ty *> */
175   const char *subparser_element;
176 };
177 
178 static markup_string_ty *
markup_string_new(void)179 markup_string_new (void)
180 {
181   return XZALLOC (markup_string_ty);
182 }
183 
184 static char *
markup_string_free(markup_string_ty * string,bool free_segment)185 markup_string_free (markup_string_ty *string, bool free_segment)
186 {
187   if (free_segment)
188     {
189       free (string->buffer);
190       free (string);
191       return NULL;
192     }
193   else
194     {
195       char *result = string->buffer;
196       free (string);
197       return result;
198     }
199 }
200 
201 static void
markup_string_free1(markup_string_ty * string)202 markup_string_free1 (markup_string_ty *string)
203 {
204   markup_string_free (string, true);
205 }
206 
207 static void
markup_string_truncate(markup_string_ty * string,size_t length)208 markup_string_truncate (markup_string_ty *string, size_t length)
209 {
210   assert (string && length < string->buflen - 1);
211   string->buffer[length] = '\0';
212   string->buflen = length;
213 }
214 
215 static void
markup_string_append(markup_string_ty * string,const char * to_append,size_t length)216 markup_string_append (markup_string_ty *string, const char *to_append,
217                       size_t length)
218 {
219   if (string->buflen + length + 1 > string->bufmax)
220     {
221       string->bufmax *= 2;
222       if (string->buflen + length + 1 > string->bufmax)
223         string->bufmax = string->buflen + length + 1;
224       string->buffer = xrealloc (string->buffer, string->bufmax);
225     }
226   memcpy (string->buffer + string->buflen, to_append, length);
227   string->buffer[length] = '\0';
228   string->buflen = length;
229 }
230 
231 static inline void
string_blank(markup_string_ty * string)232 string_blank (markup_string_ty *string)
233 {
234   if (string->bufmax > 0)
235     {
236       *string->buffer = '\0';
237       string->buflen = 0;
238     }
239 }
240 
241 /* Creates a new parse context.  A parse context is used to parse
242    marked-up documents.  You can feed any number of documents into a
243    context, as long as no errors occur; once an error occurs, the
244    parse context can't continue to parse text (you have to free it and
245    create a new parse context).  */
246 markup_parse_context_ty *
markup_parse_context_new(const markup_parser_ty * parser,markup_parse_flags_ty flags,void * user_data)247 markup_parse_context_new (const markup_parser_ty *parser,
248                           markup_parse_flags_ty flags,
249                           void *user_data)
250 {
251   markup_parse_context_ty *context;
252 
253   assert (parser != NULL);
254 
255   context = XMALLOC (markup_parse_context_ty);
256 
257   context->parser = parser;
258   context->flags = flags;
259   context->user_data = user_data;
260 
261   context->line_number = 1;
262   context->char_number = 1;
263 
264   context->partial_chunk = NULL;
265 
266   context->state = STATE_START;
267   context->tag_stack =
268     gl_list_create_empty (GL_LINKED_LIST,
269                           NULL, NULL,
270                           (gl_listelement_dispose_fn) markup_string_free1,
271                           true);
272   context->attr_names = NULL;
273   context->attr_values = NULL;
274   context->cur_attr = -1;
275   context->alloc_attrs = 0;
276 
277   context->current_text = NULL;
278   context->current_text_len = -1;
279   context->current_text_end = NULL;
280 
281   context->start = NULL;
282   context->iter = NULL;
283 
284   context->error_text = NULL;
285 
286   context->document_empty = true;
287   context->parsing = false;
288 
289   context->awaiting_pop = false;
290   context->subparser_stack =
291     gl_list_create_empty (GL_LINKED_LIST,
292                           NULL, NULL,
293                           (gl_listelement_dispose_fn) free,
294                           true);
295   context->subparser_element = NULL;
296 
297   context->balance = 0;
298 
299   return context;
300 }
301 
302 static void clear_attributes (markup_parse_context_ty *context);
303 
304 /* Frees a parse context.  This function can't be called from inside
305    one of the markup_parser_ty functions or while a subparser is
306    pushed.  */
307 void
markup_parse_context_free(markup_parse_context_ty * context)308 markup_parse_context_free (markup_parse_context_ty *context)
309 {
310   assert (context != NULL);
311   assert (!context->parsing);
312   assert (gl_list_size (context->subparser_stack) == 0);
313   assert (!context->awaiting_pop);
314 
315   clear_attributes (context);
316   free (context->attr_names);
317   free (context->attr_values);
318 
319   gl_list_free (context->tag_stack);
320   gl_list_free (context->subparser_stack);
321 
322   if (context->partial_chunk)
323     markup_string_free (context->partial_chunk, true);
324 
325   free (context->error_text);
326 
327   free (context);
328 }
329 
330 static void pop_subparser_stack (markup_parse_context_ty *context);
331 
332 static void
emit_error(markup_parse_context_ty * context,const char * error_text)333 emit_error (markup_parse_context_ty *context, const char *error_text)
334 {
335   context->state = STATE_ERROR;
336 
337   if (context->parser->error)
338     (*context->parser->error) (context, error_text, context->user_data);
339 
340   /* report the error all the way up to free all the user-data */
341   while (gl_list_size (context->subparser_stack) > 0)
342     {
343       pop_subparser_stack (context);
344       context->awaiting_pop = false; /* already been freed */
345 
346       if (context->parser->error)
347         (*context->parser->error) (context, error_text, context->user_data);
348     }
349 
350   if (context->error_text)
351     free (context->error_text);
352   context->error_text = xstrdup (error_text);
353 }
354 
/* True if C is one of the ASCII characters that end a name: '=', '/',
   '>' or a space.  C is evaluated several times, so it must not have
   side effects.  */
#define IS_COMMON_NAME_END_CHAR(c) \
  ((c) == '=' || (c) == '/' || (c) == '>' || (c) == ' ')
357 
358 static bool
slow_name_validate(markup_parse_context_ty * context,const char * name)359 slow_name_validate (markup_parse_context_ty *context, const char *name)
360 {
361   const char *p = name;
362   ucs4_t uc;
363 
364   if (u8_check ((uint8_t *) name, strlen (name)) != NULL)
365     {
366       emit_error (context, _("invalid UTF-8 sequence"));
367       return false;
368     }
369 
370   if (!(c_isalpha (*p)
371         || (!IS_COMMON_NAME_END_CHAR (*p)
372             && (*p == '_'
373                 || *p == ':'
374                 || (u8_mbtouc (&uc, (uint8_t *) name, strlen (name)) > 0
375                     && uc_is_alpha (uc))))))
376     {
377       char *error_text = xasprintf (_("'%s' is not a valid name: %c"),
378                                     name, *p);
379       emit_error (context, error_text);
380       free (error_text);
381       return false;
382     }
383 
384   for (p = (char *) u8_next (&uc, (uint8_t *) name);
385        p != NULL;
386        p = (char *) u8_next (&uc, (uint8_t *) p))
387     {
388       /* is_name_char */
389       if (!(c_isalnum (*p) ||
390             (!IS_COMMON_NAME_END_CHAR (*p) &&
391              (*p == '.' ||
392               *p == '-' ||
393               *p == '_' ||
394               *p == ':' ||
395               uc_is_alpha (uc)))))
396         {
397           char *error_text = xasprintf (_("'%s' is not a valid name: '%c'"),
398                                         name, *p);
399           emit_error (context, error_text);
400           free (error_text);
401           return false;
402         }
403     }
404   return true;
405 }
406 
407 /*
408  * Use me for elements, attributes etc.
409  */
410 static bool
name_validate(markup_parse_context_ty * context,const char * name)411 name_validate (markup_parse_context_ty *context, const char *name)
412 {
413   char mask;
414   const char *p;
415 
416   /* name start char */
417   p = name;
418   if (IS_COMMON_NAME_END_CHAR (*p)
419       || !(c_isalpha (*p) || *p == '_' || *p == ':'))
420     goto slow_validate;
421 
422   for (mask = *p++; *p != '\0'; p++)
423     {
424       mask |= *p;
425 
426       /* is_name_char */
427       if (!(c_isalnum (*p)
428             || (!IS_COMMON_NAME_END_CHAR (*p)
429                 && (*p == '.' || *p == '-' || *p == '_' || *p == ':'))))
430         goto slow_validate;
431     }
432 
433   if (mask & 0x80) /* un-common / non-ascii */
434     goto slow_validate;
435 
436   return true;
437 
438  slow_validate:
439   return slow_name_validate (context, name);
440 }
441 
442 static bool
text_validate(markup_parse_context_ty * context,const char * p,int len)443 text_validate (markup_parse_context_ty *context,
444                const char *p,
445                int len)
446 {
447   if (u8_check ((const uint8_t *) p, len) != NULL)
448     {
449       emit_error (context, _("invalid UTF-8 sequence"));
450       return false;
451     }
452   else
453     return true;
454 }
455 
456 /*
457  * re-write the GString in-place, unescaping anything that escaped.
458  * most XML does not contain entities, or escaping.
459  */
460 static bool
unescape_string_inplace(markup_parse_context_ty * context,markup_string_ty * string,bool * is_ascii)461 unescape_string_inplace (markup_parse_context_ty *context,
462                          markup_string_ty *string,
463                          bool *is_ascii)
464 {
465   char mask, *to;
466   const char *from;
467   bool normalize_attribute;
468 
469   if (string->buflen == 0)
470     return true;
471 
472   *is_ascii = false;
473 
474   /* are we unescaping an attribute or not ? */
475   if (context->state == STATE_INSIDE_ATTRIBUTE_VALUE_SQ
476       || context->state == STATE_INSIDE_ATTRIBUTE_VALUE_DQ)
477     normalize_attribute = true;
478   else
479     normalize_attribute = false;
480 
481   /*
482    * Meeks' theorem: unescaping can only shrink text.
483    * for &lt; etc. this is obvious, for &#xffff; more
484    * thought is required, but this is patently so.
485    */
486   mask = 0;
487   for (from = to = string->buffer; *from != '\0'; from++, to++)
488     {
489       *to = *from;
490 
491       mask |= *to;
492       if (normalize_attribute && (*to == '\t' || *to == '\n'))
493         *to = ' ';
494       if (*to == '\r')
495         {
496           *to = normalize_attribute ? ' ' : '\n';
497           if (from[1] == '\n')
498             from++;
499         }
500       if (*from == '&')
501         {
502           from++;
503           if (*from == '#')
504             {
505               int base = 10;
506               unsigned long l;
507               char *end = NULL;
508 
509               from++;
510 
511               if (*from == 'x')
512                 {
513                   base = 16;
514                   from++;
515                 }
516 
517               errno = 0;
518               l = strtoul (from, &end, base);
519 
520               if (end == from || errno != 0)
521                 {
522                   char *error_text =
523                     xasprintf (_("invalid character reference: %s"),
524                                errno != 0
525                                ? strerror (errno)
526                                : _("not a valid number specification"));
527                   emit_error (context, error_text);
528                   free (error_text);
529                   return false;
530                 }
531               else if (*end != ';')
532                 {
533                   char *error_text =
534                     xasprintf (_("invalid character reference: %s"),
535                                _("no ending ';'"));
536                   emit_error (context, error_text);
537                   free (error_text);
538                   return false;
539                 }
540               else
541                 {
542                   /* characters XML 1.1 permits */
543                   if ((0 < l && l <= 0xD7FF) ||
544                       (0xE000 <= l && l <= 0xFFFD) ||
545                       (0x10000 <= l && l <= 0x10FFFF))
546                     {
547                       char buf[8];
548                       int length;
549                       length = u8_uctomb ((uint8_t *) buf, l, 8);
550                       memcpy (to, buf, length);
551                       to += length - 1;
552                       from = end;
553                       if (l >= 0x80) /* not ascii */
554                         mask |= 0x80;
555                     }
556                   else
557                     {
558                       char *error_text =
559                         xasprintf (_("invalid character reference: %s"),
560                                    _("non-permitted character"));
561                       emit_error (context, error_text);
562                       free (error_text);
563                       return false;
564                     }
565                 }
566             }
567 
568           else if (strncmp (from, "lt;", 3) == 0)
569             {
570               *to = '<';
571               from += 2;
572             }
573           else if (strncmp (from, "gt;", 3) == 0)
574             {
575               *to = '>';
576               from += 2;
577             }
578           else if (strncmp (from, "amp;", 4) == 0)
579             {
580               *to = '&';
581               from += 3;
582             }
583           else if (strncmp (from, "quot;", 5) == 0)
584             {
585               *to = '"';
586               from += 4;
587             }
588           else if (strncmp (from, "apos;", 5) == 0)
589             {
590               *to = '\'';
591               from += 4;
592             }
593           else
594             {
595               const char *reason;
596               char *error_text;
597 
598               if (*from == ';')
599                 reason = _("empty");
600               else
601                 {
602                   const char *end = strchr (from, ';');
603                   if (end)
604                     reason = _("unknown");
605                   else
606                     reason = _("no ending ';'");
607                 }
608               error_text = xasprintf (_("invalid entity reference: %s"),
609                                       reason);
610               emit_error (context, error_text);
611               free (error_text);
612               return false;
613             }
614         }
615     }
616 
617   assert (to - string->buffer <= string->buflen);
618   if (to - string->buffer != string->buflen)
619     markup_string_truncate (string, to - string->buffer);
620 
621   *is_ascii = !(mask & 0x80);
622 
623   return true;
624 }
625 
626 static inline bool
advance_char(markup_parse_context_ty * context)627 advance_char (markup_parse_context_ty *context)
628 {
629   context->iter++;
630   context->char_number++;
631 
632   if (context->iter == context->current_text_end)
633       return false;
634 
635   else if (*context->iter == '\n')
636     {
637       context->line_number++;
638       context->char_number = 1;
639     }
640 
641   return true;
642 }
643 
/* Returns true if C is one of the four whitespace characters
   recognized here: space, tab, newline, carriage return.  */
static inline bool
xml_isspace (char c)
{
  switch (c)
    {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
      return true;
    default:
      return false;
    }
}
649 
650 static void
skip_spaces(markup_parse_context_ty * context)651 skip_spaces (markup_parse_context_ty *context)
652 {
653   do
654     {
655       if (!xml_isspace (*context->iter))
656         return;
657     }
658   while (advance_char (context));
659 }
660 
661 static void
advance_to_name_end(markup_parse_context_ty * context)662 advance_to_name_end (markup_parse_context_ty *context)
663 {
664   do
665     {
666       if (IS_COMMON_NAME_END_CHAR (*(context->iter)))
667         return;
668       if (xml_isspace (*(context->iter)))
669         return;
670     }
671   while (advance_char (context));
672 }
673 
674 static void
add_to_partial(markup_parse_context_ty * context,const char * text_start,const char * text_end)675 add_to_partial (markup_parse_context_ty *context,
676                 const char         *text_start,
677                 const char         *text_end)
678 {
679   if (context->partial_chunk == NULL)
680     { /* allocate a new chunk to parse into */
681 
682       context->partial_chunk = markup_string_new ();
683     }
684 
685   if (text_start != text_end)
686     markup_string_append (context->partial_chunk,
687                           text_start, text_end - text_start);
688 }
689 
690 static inline void
truncate_partial(markup_parse_context_ty * context)691 truncate_partial (markup_parse_context_ty *context)
692 {
693   if (context->partial_chunk != NULL)
694     string_blank (context->partial_chunk);
695 }
696 
697 static inline const char*
current_element(markup_parse_context_ty * context)698 current_element (markup_parse_context_ty *context)
699 {
700   const markup_string_ty *string = gl_list_get_at (context->tag_stack, 0);
701   return string->buffer;
702 }
703 
704 static void
pop_subparser_stack(markup_parse_context_ty * context)705 pop_subparser_stack (markup_parse_context_ty *context)
706 {
707   markup_recursion_tracker_ty *tracker;
708 
709   assert (gl_list_size (context->subparser_stack) > 0);
710 
711   tracker = (markup_recursion_tracker_ty *) gl_list_get_at (context->subparser_stack, 0);
712 
713   context->awaiting_pop = true;
714 
715   context->user_data = tracker->prev_user_data;
716   context->parser = tracker->prev_parser;
717   context->subparser_element = tracker->prev_element;
718   free (tracker);
719 
720   gl_list_remove_at (context->subparser_stack, 0);
721 }
722 
723 static void
push_partial_as_tag(markup_parse_context_ty * context)724 push_partial_as_tag (markup_parse_context_ty *context)
725 {
726   gl_list_add_first (context->tag_stack, context->partial_chunk);
727   context->partial_chunk = NULL;
728 }
729 
730 static void
pop_tag(markup_parse_context_ty * context)731 pop_tag (markup_parse_context_ty *context)
732 {
733   gl_list_remove_at (context->tag_stack, 0);
734 }
735 
736 static void
possibly_finish_subparser(markup_parse_context_ty * context)737 possibly_finish_subparser (markup_parse_context_ty *context)
738 {
739   if (current_element (context) == context->subparser_element)
740     pop_subparser_stack (context);
741 }
742 
743 static void
ensure_no_outstanding_subparser(markup_parse_context_ty * context)744 ensure_no_outstanding_subparser (markup_parse_context_ty *context)
745 {
746   context->awaiting_pop = false;
747 }
748 
749 static void
add_attribute(markup_parse_context_ty * context,markup_string_ty * string)750 add_attribute (markup_parse_context_ty *context, markup_string_ty *string)
751 {
752   if (context->cur_attr + 2 >= context->alloc_attrs)
753     {
754       context->alloc_attrs += 5; /* silly magic number */
755       context->attr_names = xrealloc (context->attr_names, sizeof (char *) * context->alloc_attrs);
756       context->attr_values = xrealloc (context->attr_values, sizeof(char *) * context->alloc_attrs);
757     }
758   context->cur_attr++;
759   context->attr_names[context->cur_attr] = xstrdup (string->buffer);
760   context->attr_values[context->cur_attr] = NULL;
761   context->attr_names[context->cur_attr+1] = NULL;
762   context->attr_values[context->cur_attr+1] = NULL;
763 }
764 
765 static void
clear_attributes(markup_parse_context_ty * context)766 clear_attributes (markup_parse_context_ty *context)
767 {
768   /* Go ahead and free the attributes. */
769   for (; context->cur_attr >= 0; context->cur_attr--)
770     {
771       int pos = context->cur_attr;
772       free (context->attr_names[pos]);
773       free (context->attr_values[pos]);
774       context->attr_names[pos] = context->attr_values[pos] = NULL;
775     }
776   assert (context->cur_attr == -1);
777   assert (context->attr_names == NULL ||
778           context->attr_names[0] == NULL);
779   assert (context->attr_values == NULL ||
780           context->attr_values[0] == NULL);
781 }
782 
783 static void
markup_parse_context_push(markup_parse_context_ty * context,const markup_parser_ty * parser,void * user_data)784 markup_parse_context_push (markup_parse_context_ty *context,
785                            const markup_parser_ty *parser,
786                            void *user_data)
787 {
788   markup_recursion_tracker_ty *tracker;
789 
790   tracker = XMALLOC (markup_recursion_tracker_ty);
791   tracker->prev_element = context->subparser_element;
792   tracker->prev_parser = context->parser;
793   tracker->prev_user_data = context->user_data;
794 
795   context->subparser_element = current_element (context);
796   context->parser = parser;
797   context->user_data = user_data;
798 
799   gl_list_add_first (context->subparser_stack, tracker);
800 }
801 
802 static void
markup_parse_context_pop(markup_parse_context_ty * context)803 markup_parse_context_pop (markup_parse_context_ty *context)
804 {
805   if (!context->awaiting_pop)
806     possibly_finish_subparser (context);
807 
808   assert (context->awaiting_pop);
809 
810   context->awaiting_pop = false;
811 }
812 
813 /* This has to be a separate function to ensure the alloca's
814  * are unwound on exit - otherwise we grow & blow the stack
815  * with large documents
816  */
817 static inline void
emit_start_element(markup_parse_context_ty * context)818 emit_start_element (markup_parse_context_ty *context)
819 {
820   int i, j = 0;
821   const char *start_name;
822   const char **attr_names;
823   const char **attr_values;
824 
825   /* In case we want to ignore qualified tags and we see that we have
826    * one here, we push a subparser.  This will ignore all tags inside of
827    * the qualified tag.
828    *
829    * We deal with the end of the subparser from emit_end_element.
830    */
831   if ((context->flags & MARKUP_IGNORE_QUALIFIED)
832       && strchr (current_element (context), ':'))
833     {
834       static const markup_parser_ty ignore_parser;
835       markup_parse_context_push (context, &ignore_parser, NULL);
836       clear_attributes (context);
837       return;
838     }
839 
840   attr_names = XCALLOC (context->cur_attr + 2, const char *);
841   attr_values = XCALLOC (context->cur_attr + 2, const char *);
842   for (i = 0; i < context->cur_attr + 1; i++)
843     {
844       /* Possibly omit qualified attribute names from the list */
845       if ((context->flags & MARKUP_IGNORE_QUALIFIED)
846           && strchr (context->attr_names[i], ':'))
847         continue;
848 
849       attr_names[j] = context->attr_names[i];
850       attr_values[j] = context->attr_values[i];
851       j++;
852     }
853   attr_names[j] = NULL;
854   attr_values[j] = NULL;
855 
856   /* Call user callback for element start */
857   start_name = current_element (context);
858 
859   if (context->parser->start_element && name_validate (context, start_name))
860     (* context->parser->start_element) (context,
861                                         start_name,
862                                         (const char **)attr_names,
863                                         (const char **)attr_values,
864                                         context->user_data);
865   free (attr_names);
866   free (attr_values);
867   clear_attributes (context);
868 }
869 
870 static void
emit_end_element(markup_parse_context_ty * context)871 emit_end_element (markup_parse_context_ty *context)
872 {
873   assert (gl_list_size (context->tag_stack) != 0);
874 
875   possibly_finish_subparser (context);
876 
877   /* We might have just returned from our ignore subparser */
878   if ((context->flags & MARKUP_IGNORE_QUALIFIED)
879       && strchr (current_element (context), ':'))
880     {
881       markup_parse_context_pop (context);
882       pop_tag (context);
883       return;
884     }
885 
886   if (context->parser->end_element)
887     (* context->parser->end_element) (context,
888                                       current_element (context),
889                                       context->user_data);
890 
891   ensure_no_outstanding_subparser (context);
892 
893   pop_tag (context);
894 }
895 
896 /* Feed some data to the parse context.  The data need not be valid
897    UTF-8; an error will be signaled if it's invalid.  The data need
898    not be an entire document; you can feed a document into the parser
899    incrementally, via multiple calls to this function.  Typically, as
900    you receive data from a network connection or file, you feed each
901    received chunk of data into this function, aborting the process if
902    an error occurs. Once an error is reported, no further data may be
903    fed to the parse context; all errors are fatal.  */
904 bool
markup_parse_context_parse (markup_parse_context_ty *context,
                            const char *text,
                            ssize_t text_len)
{
  /* Run the parser state machine over one chunk of input.
   *
   * CONTEXT   parse context; its state is read on entry and left in
   *           place on exit, so a document may be fed in several
   *           calls.
   * TEXT      the chunk to parse; must not be NULL.
   * TEXT_LEN  number of bytes in TEXT, or a negative value to have
   *           the length computed with strlen.
   *
   * Returns true for an empty chunk, otherwise true exactly when the
   * context is not in STATE_ERROR after the chunk has been processed.
   *
   * A token that is cut off by the end of the chunk is saved with
   * add_to_partial and the state is left unchanged, so that the next
   * call continues the same token.
   */
  assert (context != NULL);
  assert (text != NULL);
  assert (context->state != STATE_ERROR);
  assert (!context->parsing);

  if (text_len < 0)
    text_len = strlen (text);

  /* An empty chunk changes nothing. */
  if (text_len == 0)
    return true;

  /* The flag stays set until the 'finished' label below. */
  context->parsing = true;


  context->current_text = text;
  context->current_text_len = text_len;
  context->current_text_end = context->current_text + text_len;
  context->iter = context->current_text;
  context->start = context->iter;

  /* Dispatch on the current state until the chunk is used up.  Some
   * cases only switch state without consuming a character.  The
   * error branches do not move context->iter either; emit_error is
   * expected to put the context into STATE_ERROR (not visible here
   * -- confirm), which leaves the loop through the STATE_ERROR case.
   */
  while (context->iter != context->current_text_end)
    {
      switch (context->state)
        {
        case STATE_START:
          /* Possible next state: AFTER_OPEN_ANGLE */

          assert (gl_list_size (context->tag_stack) == 0);

          /* whitespace is ignored outside of any elements */
          skip_spaces (context);

          if (context->iter != context->current_text_end)
            {
              if (*context->iter == '<')
                {
                  /* Move after the open angle */
                  advance_char (context);

                  context->state = STATE_AFTER_OPEN_ANGLE;

                  /* this could start a passthrough */
                  context->start = context->iter;

                  /* document is now non-empty */
                  context->document_empty = false;
                }
              else
                {
                  emit_error (context,
                              _("document must begin with an element"));
                }
            }
          break;

        case STATE_AFTER_OPEN_ANGLE:
          /* Possible next states: INSIDE_OPEN_TAG_NAME,
           *  AFTER_CLOSE_TAG_SLASH, INSIDE_PASSTHROUGH
           */
          /* The loop condition guarantees that at least one character
           * is available, so *context->iter may be read directly.
           */
          if (*context->iter == '?' ||
              *context->iter == '!')
            {
              /* include < in the passthrough */
              const char *openangle = "<";
              add_to_partial (context, openangle, openangle + 1);
              context->start = context->iter;
              /* One '<' is open so far; see INSIDE_PASSTHROUGH. */
              context->balance = 1;
              context->state = STATE_INSIDE_PASSTHROUGH;
            }
          else if (*context->iter == '/')
            {
              /* move after it */
              advance_char (context);

              context->state = STATE_AFTER_CLOSE_TAG_SLASH;
            }
          else if (!IS_COMMON_NAME_END_CHAR (*(context->iter)))
            {
              context->state = STATE_INSIDE_OPEN_TAG_NAME;

              /* start of tag name */
              context->start = context->iter;
            }
          else
            {
              char *error_text = xasprintf (_("invalid character after '%s'"),
                                            "<");
              emit_error (context, error_text);
              free (error_text);
            }
          break;

          /* The AFTER_CLOSE_ANGLE state is actually sort of
           * broken, because it doesn't correspond to a range
           * of characters in the input stream as the others do,
           * and thus makes things harder to conceptualize
           */
        case STATE_AFTER_CLOSE_ANGLE:
          /* Possible next states: INSIDE_TEXT, STATE_START */
          /* No character is consumed here: with no open element we
           * are back at document level, otherwise text content
           * starts at the current position.
           */
          if (gl_list_size (context->tag_stack) == 0)
            {
              context->start = NULL;
              context->state = STATE_START;
            }
          else
            {
              context->start = context->iter;
              context->state = STATE_INSIDE_TEXT;
            }
          break;

        case STATE_AFTER_ELISION_SLASH:
          /* Possible next state: AFTER_CLOSE_ANGLE */
          /* We are just past the '/' of an empty-element tag; only
           * '>' may follow.
           */
          if (*context->iter == '>')
            {
              /* move after the close angle */
              advance_char (context);
              context->state = STATE_AFTER_CLOSE_ANGLE;
              emit_end_element (context);
            }
          else
            {
              char *error_text = xasprintf (_("missing '%c'"), '>');
              emit_error (context, error_text);
              free (error_text);
            }
          break;

        case STATE_INSIDE_OPEN_TAG_NAME:
          /* Possible next states: BETWEEN_ATTRIBUTES */

          /* if there's a partial chunk then it's the first part of the
           * tag name. If there's a context->start then it's the start
           * of the tag name in current_text, the partial chunk goes
           * before that start though.
           */
          advance_to_name_end (context);

          if (context->iter == context->current_text_end)
            {
              /* The name hasn't necessarily ended. Merge with
               * partial chunk, leave state unchanged.
               */
              add_to_partial (context, context->start, context->iter);
            }
          else
            {
              /* The name has ended. Combine it with the partial chunk
               * if any; push it on the stack; enter next state.
               */
              add_to_partial (context, context->start, context->iter);
              push_partial_as_tag (context);

              context->state = STATE_BETWEEN_ATTRIBUTES;
              context->start = NULL;
            }
          break;

        case STATE_INSIDE_ATTRIBUTE_NAME:
          /* Possible next states: AFTER_ATTRIBUTE_NAME */

          advance_to_name_end (context);
          add_to_partial (context, context->start, context->iter);

          /* The part of the name seen in this chunk is now in the
           * partial chunk.  If the chunk ended first, stay in this
           * state so that the rest of the name is appended on the
           * next call; otherwise the name is complete and the
           * AFTER_ATTRIBUTE_NAME state registers the attribute.
           */
          if (context->iter != context->current_text_end)
            context->state = STATE_AFTER_ATTRIBUTE_NAME;
          break;

        case STATE_AFTER_ATTRIBUTE_NAME:
          /* Possible next states: AFTER_ATTRIBUTE_EQUALS_SIGN */

          skip_spaces (context);

          if (context->iter != context->current_text_end)
            {
              /* The complete attribute name is in the partial chunk.
               * Validate it, register it with add_attribute (the
               * value is stored later, in the INSIDE_ATTRIBUTE_VALUE
               * states), then release the partial chunk.
               */
              if (!name_validate (context, context->partial_chunk->buffer))
                break;

              add_attribute (context, context->partial_chunk);

              markup_string_free (context->partial_chunk, true);
              context->partial_chunk = NULL;
              context->start = NULL;

              if (*context->iter == '=')
                {
                  advance_char (context);
                  context->state = STATE_AFTER_ATTRIBUTE_EQUALS_SIGN;
                }
              else
                {
                  char *error_text = xasprintf (_("missing '%c'"), '=');
                  emit_error (context, error_text);
                  free (error_text);
                }
            }
          break;

        case STATE_BETWEEN_ATTRIBUTES:
          /* Possible next states: AFTER_CLOSE_ANGLE,
           * AFTER_ELISION_SLASH, INSIDE_ATTRIBUTE_NAME
           */
          skip_spaces (context);

          if (context->iter != context->current_text_end)
            {
              if (*context->iter == '/')
                {
                  advance_char (context);
                  context->state = STATE_AFTER_ELISION_SLASH;
                }
              else if (*context->iter == '>')
                {
                  advance_char (context);
                  context->state = STATE_AFTER_CLOSE_ANGLE;
                }
              else if (!IS_COMMON_NAME_END_CHAR (*(context->iter)))
                {
                  context->state = STATE_INSIDE_ATTRIBUTE_NAME;
                  /* start of attribute name */
                  context->start = context->iter;
                }
              else
                {
                  char *error_text = xasprintf (_("missing '%c' or '%c'"),
                                                '>', '/');
                  emit_error (context, error_text);
                  free (error_text);
                }

              /* If we're done with attributes, invoke
               * the start_element callback
               */
              if (context->state == STATE_AFTER_ELISION_SLASH ||
                  context->state == STATE_AFTER_CLOSE_ANGLE)
                emit_start_element (context);
            }
          break;

        case STATE_AFTER_ATTRIBUTE_EQUALS_SIGN:
          /* Possible next state: INSIDE_ATTRIBUTE_VALUE_[SQ/DQ] */

          skip_spaces (context);

          if (context->iter != context->current_text_end)
            {
              /* The opening quote selects the state, and thereby the
               * delimiter that ends the value; the value itself
               * starts after the quote.
               */
              if (*context->iter == '"')
                {
                  advance_char (context);
                  context->state = STATE_INSIDE_ATTRIBUTE_VALUE_DQ;
                  context->start = context->iter;
                }
              else if (*context->iter == '\'')
                {
                  advance_char (context);
                  context->state = STATE_INSIDE_ATTRIBUTE_VALUE_SQ;
                  context->start = context->iter;
                }
              else
                {
                  char *error_text = xasprintf (_("missing '%c' or '%c'"),
                                                '\'', '"');
                  emit_error (context, error_text);
                  free (error_text);
                }
            }
          break;

        case STATE_INSIDE_ATTRIBUTE_VALUE_SQ:
        case STATE_INSIDE_ATTRIBUTE_VALUE_DQ:
          /* Possible next states: BETWEEN_ATTRIBUTES */
          {
            char delim;

            if (context->state == STATE_INSIDE_ATTRIBUTE_VALUE_SQ)
              {
                delim = '\'';
              }
            else
              {
                delim = '"';
              }

            /* Scan forward to the closing quote or the end of the
             * chunk, whichever comes first.
             */
            do
              {
                if (*context->iter == delim)
                  break;
              }
            while (advance_char (context));
          }
          if (context->iter == context->current_text_end)
            {
              /* The value hasn't necessarily ended. Merge with
               * partial chunk, leave state unchanged.
               */
              add_to_partial (context, context->start, context->iter);
            }
          else
            {
              bool is_ascii;
              /* The value has ended at the quote mark. Combine it
               * with the partial chunk if any; set it for the current
               * attribute.
               */
              add_to_partial (context, context->start, context->iter);

              assert (context->cur_attr >= 0);

              /* text_validate is only consulted when the unescaped
               * value is not reported as pure ASCII.
               */
              if (unescape_string_inplace (context, context->partial_chunk,
                                           &is_ascii)
                  && (is_ascii
                      || text_validate (context,
                                        context->partial_chunk->buffer,
                                        context->partial_chunk->buflen)))
                {
                  /* success, advance past quote and set state. */
                  /* NOTE(review): markup_string_free (..., false)
                   * presumably hands back the character buffer
                   * instead of freeing it -- confirm.
                   */
                  context->attr_values[context->cur_attr] =
                    markup_string_free (context->partial_chunk, false);
                  context->partial_chunk = NULL;
                  advance_char (context);
                  context->state = STATE_BETWEEN_ATTRIBUTES;
                  context->start = NULL;
                }

              /* NOTE(review): on the success path partial_chunk is
               * NULL here, so truncate_partial presumably tolerates
               * that -- confirm.
               */
              truncate_partial (context);
            }
          break;

        case STATE_INSIDE_TEXT:
          /* Possible next states: AFTER_OPEN_ANGLE */
          /* Scan forward to the next '<' or the end of the chunk. */
          do
            {
              if (*context->iter == '<')
                break;
            }
          while (advance_char (context));

          /* The text hasn't necessarily ended. Merge with
           * partial chunk, leave state unchanged.
           */

          add_to_partial (context, context->start, context->iter);

          if (context->iter != context->current_text_end)
            {
              bool is_ascii;

              /* The text has ended at the open angle. Call the text
               * callback.
               */
              if (unescape_string_inplace (context, context->partial_chunk,
                                           &is_ascii)
                  && (is_ascii
                      || text_validate (context,
                                        context->partial_chunk->buffer,
                                        context->partial_chunk->buflen)))
                {
                  if (context->parser->text)
                    (*context->parser->text) (context,
                                              context->partial_chunk->buffer,
                                              context->partial_chunk->buflen,
                                              context->user_data);

                  /* advance past open angle and set state. */
                  advance_char (context);
                  context->state = STATE_AFTER_OPEN_ANGLE;
                  /* could begin a passthrough */
                  context->start = context->iter;
                }

              truncate_partial (context);
            }
          break;

        case STATE_AFTER_CLOSE_TAG_SLASH:
          /* Possible next state: INSIDE_CLOSE_TAG_NAME */
          if (!IS_COMMON_NAME_END_CHAR (*(context->iter)))
            {
              context->state = STATE_INSIDE_CLOSE_TAG_NAME;

              /* start of tag name */
              context->start = context->iter;
            }
          else
            {
              char *error_text = xasprintf (_("invalid character after '%s'"),
                                            "</");
              emit_error (context, error_text);
              free (error_text);
            }
          break;

        case STATE_INSIDE_CLOSE_TAG_NAME:
          /* Possible next state: AFTER_CLOSE_TAG_NAME */
          advance_to_name_end (context);
          add_to_partial (context, context->start, context->iter);

          /* If the chunk ended first, stay in this state so that the
           * rest of the name is appended on the next call.
           */
          if (context->iter != context->current_text_end)
            context->state = STATE_AFTER_CLOSE_TAG_NAME;
          break;

        case STATE_AFTER_CLOSE_TAG_NAME:
          /* Possible next state: AFTER_CLOSE_ANGLE */

          skip_spaces (context);

          if (context->iter != context->current_text_end)
            {
              markup_string_ty *close_name;

              /* Detach the accumulated close-tag name from the
               * context while it is examined; it is re-attached
               * below, before truncate_partial is called.
               */
              close_name = context->partial_chunk;
              context->partial_chunk = NULL;

              if (*context->iter != '>')
                {
                  char *error_text =
                    xasprintf (_("invalid character after '%s'"),
                               _("a close element name"));
                  emit_error (context, error_text);
                  free (error_text);
                }
              else if (gl_list_size (context->tag_stack) == 0)
                {
                  /* A close tag although no element is open. */
                  emit_error (context, _("element is closed"));
                }
              else if (strcmp (close_name->buffer, current_element (context))
                       != 0)
                {
                  /* The close tag's name differs from the one returned
                   * by current_element (presumably the innermost open
                   * element -- confirm).
                   */
                  emit_error (context, _("element is closed"));
                }
              else
                {
                  advance_char (context);
                  context->state = STATE_AFTER_CLOSE_ANGLE;
                  context->start = NULL;

                  emit_end_element (context);
                }
              context->partial_chunk = close_name;
              truncate_partial (context);
            }
          break;

        case STATE_INSIDE_PASSTHROUGH:
          /* Possible next state: AFTER_CLOSE_ANGLE */
          /* context->balance counts the '<' seen minus the '>' seen
           * since the passthrough began (the leading '<' included).
           * At every '>' the text before it is merged into the
           * partial chunk, which then holds the whole passthrough so
           * far without that '>', and is tested against the known
           * terminators.
           */
          do
            {
              if (*context->iter == '<')
                context->balance++;
              if (*context->iter == '>')
                {
                  char *str;
                  size_t len;

                  context->balance--;
                  add_to_partial (context, context->start, context->iter);
                  context->start = context->iter;

                  str = context->partial_chunk->buffer;
                  len = context->partial_chunk->buflen;

                  /* "<?" ... "?" followed by '>' */
                  if (str[1] == '?' && str[len - 1] == '?')
                    break;
                  /* "<!--" ... "--" followed by '>' */
                  if (strncmp (str, "<!--", 4) == 0 &&
                      strcmp (str + len - 2, "--") == 0)
                    break;
                  /* "<![CDATA[" ... "]]" followed by '>' */
                  if (strncmp (str, "<![CDATA[", 9) == 0 &&
                      strcmp (str + len - 2, "]]") == 0)
                    break;
                  /* "<!DOCTYPE" ends at the '>' that balances all
                   * the '<' seen so far.
                   */
                  if (strncmp (str, "<!DOCTYPE", 9) == 0 &&
                      context->balance == 0)
                    break;
                }
            }
          while (advance_char (context));

          if (context->iter == context->current_text_end)
            {
              /* The passthrough hasn't necessarily ended. Merge with
               * partial chunk, leave state unchanged.
               */
               add_to_partial (context, context->start, context->iter);
            }
          else
            {
              /* The passthrough has ended at the close angle. Combine
               * it with the partial chunk if any. Call the passthrough
               * callback. Note that the open/close angles are
               * included in the text of the passthrough.
               */
              advance_char (context); /* advance past close angle */
              add_to_partial (context, context->start, context->iter);

              /* With MARKUP_TREAT_CDATA_AS_TEXT, the content of a
               * CDATA section goes to the text callback instead of
               * the passthrough callback.  The 9-byte "<![CDATA["
               * prefix is skipped and the 3-byte "]]>" suffix is
               * dropped, hence the length minus 12.
               */
              if (context->flags & MARKUP_TREAT_CDATA_AS_TEXT &&
                  strncmp (context->partial_chunk->buffer, "<![CDATA[", 9) == 0)
                {
                  if (context->parser->text &&
                      text_validate (context,
                                     context->partial_chunk->buffer + 9,
                                     context->partial_chunk->buflen - 12))
                    (*context->parser->text) (context,
                                              context->partial_chunk->buffer + 9,
                                              context->partial_chunk->buflen - 12,
                                              context->user_data);
                }
              else if (context->parser->passthrough &&
                       text_validate (context,
                                      context->partial_chunk->buffer,
                                      context->partial_chunk->buflen))
                (*context->parser->passthrough) (context,
                                                 context->partial_chunk->buffer,
                                                 context->partial_chunk->buflen,
                                                 context->user_data);

              truncate_partial (context);

              context->state = STATE_AFTER_CLOSE_ANGLE;
              context->start = context->iter; /* could begin text */
            }
          break;

        case STATE_ERROR:
          /* Stop consuming input once the context is in error. */
          goto finished;
          break;

        default:
          abort ();
          break;
        }
    }

 finished:
  context->parsing = false;

  return context->state != STATE_ERROR;
}
1452 
1453 /* Signals to the parse context that all data has been fed into the
1454  * parse context with markup_parse_context_parse.
1455  *
1456  * This function reports an error if the document isn't complete,
1457  * for example if elements are still open.  */
1458 bool
markup_parse_context_end_parse(markup_parse_context_ty * context)1459 markup_parse_context_end_parse (markup_parse_context_ty *context)
1460 {
1461   const char *location = NULL;
1462 
1463   assert (context != NULL);
1464   assert (!context->parsing);
1465   assert (context->state != STATE_ERROR);
1466 
1467   if (context->partial_chunk != NULL)
1468     {
1469       markup_string_free (context->partial_chunk, true);
1470       context->partial_chunk = NULL;
1471     }
1472 
1473   if (context->document_empty)
1474     {
1475       emit_error (context, _("empty document"));
1476       return false;
1477     }
1478 
1479   context->parsing = true;
1480 
1481   switch (context->state)
1482     {
1483     case STATE_START:
1484       /* Nothing to do */
1485       break;
1486 
1487     case STATE_AFTER_OPEN_ANGLE:
1488       location = _("after '<'");
1489       break;
1490 
1491     case STATE_AFTER_CLOSE_ANGLE:
1492       if (gl_list_size (context->tag_stack) > 0)
1493         {
1494           /* Error message the same as for INSIDE_TEXT */
1495           location = _("elements still open");
1496         }
1497       break;
1498 
1499     case STATE_AFTER_ELISION_SLASH:
1500       location = _("missing '>'");
1501       break;
1502 
1503     case STATE_INSIDE_OPEN_TAG_NAME:
1504       location = _("inside an element name");
1505       break;
1506 
1507     case STATE_INSIDE_ATTRIBUTE_NAME:
1508     case STATE_AFTER_ATTRIBUTE_NAME:
1509       location = _("inside an attribute name");
1510       break;
1511 
1512     case STATE_BETWEEN_ATTRIBUTES:
1513       location = _("inside an open tag");
1514       break;
1515 
1516     case STATE_AFTER_ATTRIBUTE_EQUALS_SIGN:
1517       location = _("after '='");
1518       break;
1519 
1520     case STATE_INSIDE_ATTRIBUTE_VALUE_SQ:
1521     case STATE_INSIDE_ATTRIBUTE_VALUE_DQ:
1522       location = _("inside an attribute value");
1523       break;
1524 
1525     case STATE_INSIDE_TEXT:
1526       assert (gl_list_size (context->tag_stack) > 0);
1527       location = _("elements still open");
1528       break;
1529 
1530     case STATE_AFTER_CLOSE_TAG_SLASH:
1531     case STATE_INSIDE_CLOSE_TAG_NAME:
1532     case STATE_AFTER_CLOSE_TAG_NAME:
1533       location = _("inside the close tag");
1534       break;
1535 
1536     case STATE_INSIDE_PASSTHROUGH:
1537       location = _("inside a comment or processing instruction");
1538       break;
1539 
1540     case STATE_ERROR:
1541     default:
1542       abort ();
1543       break;
1544     }
1545 
1546   if (location != NULL)
1547     {
1548       char *error_text = xasprintf (_("document ended unexpectedly: %s"),
1549                                     location);
1550       emit_error (context, error_text);
1551       free (error_text);
1552     }
1553 
1554   context->parsing = false;
1555 
1556   return context->state != STATE_ERROR;
1557 }
1558 
1559 const char *
markup_parse_context_get_error(markup_parse_context_ty * context)1560 markup_parse_context_get_error (markup_parse_context_ty *context)
1561 {
1562   return context->error_text;
1563 }
1564