1 /* 	$Id: xmlparser.c,v 1.135 2008/10/03 13:07:51 richard Exp $
2 */
3 
4 #define DEBUG_FSM 0
5 
6 #ifndef lint
7 static char vcid[] = "$Id: xmlparser.c,v 1.135 2008/10/03 13:07:51 richard Exp $";
8 #endif /* lint */
9 
10 /*
11  * XML (and nSGML) parser.
12  * Author: Richard Tobin.
13  */
14 
15 #include <stdarg.h>
16 #include <stdlib.h>
17 
18 #ifdef FOR_LT
19 
20 #include "lt-memory.h"
21 #include "nsllib.h"
22 
23 #define Malloc salloc
24 #define Realloc srealloc
25 #define Free sfree
26 
27 #else
28 
29 #include "system.h"
30 
31 #endif
32 
33 #include "charset.h"
34 #include "string16.h"
35 #include "ctype16.h"
36 #include "dtd.h"
37 #include "input.h"
38 #include "stdio16.h"
39 #include "url.h"
40 #include "namespaces.h"
41 #include "xmlparser.h"
42 
43 #ifdef FOR_LT
44 
45 #include "lt-hash.h"
46 
47 typedef HashList *HashEntry;
48 typedef HashList HashEntryStruct;
49 #define create_hash_table NewSizedHashStruct
50 #define free_hash_table(table) FreeHashStructM((table), 1)
51 #define hash_map MapHashLists1
52 #define hash_set_value(entry, value) ((entry)->index = (value))
53 #define hash_get_value(entry) ((entry)->index)
54 #define hash_get_key(entry) ((entry)->word)
55 #define hash_get_key_len(entry) ((entry)->length * sizeof(Char))
56 #define HashMapRetType boolean
57 
hash_find_or_add(HashTable table,const Char * key,int key_len,int * foundp)58 static HashEntry hash_find_or_add(HashTable table, const Char *key,
59 				  int key_len, int *foundp)
60 {
61     HashEntry entry;
62 
63     key_len /= sizeof(Char);
64     entry = FindWordInTableX(table, key, key_len);
65     if(!entry)
66     {
67 	*foundp = 0;
68 	entry = AddWordToTableXM(table, key, key_len);
69 	if(!entry)
70 	    return 0;
71     }
72     else
73 	*foundp = 1;
74 
75     return entry;
76 }
77 
78 #else
79 
80 #include "hash.h"
81 
82 #define hash_set_value(entry, _value) ((entry)->value = (_value))
83 #define hash_get_value(entry) ((entry)->value)
84 #define hash_get_key(entry) ((entry)->key)
85 #define hash_get_key_len(entry) ((entry)->key_len)
86 #define HashMapRetType void
87 
88 #endif
89 
90 static int transcribe(Parser p, int back, int count);
91 static void pop_while_at_eoe(Parser p);
92 static void maybe_uppercase(Parser p, Char *s);
93 static void maybe_uppercase_name(Parser p);
94 static int str_maybecase_cmp8(Parser p, const char8 *a, const char8 *b);
95 static int is_ascii_alpha(int c);
96 static int is_ascii_digit(int c);
97 static int parse_external_id(Parser p, int required,
98 			     char8 **publicid, char8 **systemid,
99 			     int preq, int sreq);
100 static int parse_conditional(Parser p, Entity ent);
101 static int parse_notation_decl(Parser p, Entity ent);
102 static int parse_entity_decl(Parser p, Entity ent, int line, int chpos,
103 			     Entity ext_ent);
104 static int parsing_internal(Parser p);
105 static int parsing_external_subset(Parser p);
106 static int parse_attlist_decl(Parser p, Entity ent);
107 static int parse_element_decl(Parser p, Entity ent);
108 static ContentParticle parse_cp(Parser p);
109 static ContentParticle parse_choice_or_seq(Parser p, Entity ent);
110 static ContentParticle parse_choice_or_seq_1(Parser p, int nchildren,
111 					     char sep, Entity ent);
112 static int check_content_decl(Parser p, ContentParticle cp);
113 static int check_content_decl_1(Parser p, ContentParticle cp);
114 static Char *stringify_cp(ContentParticle cp);
115 static void print_cp(ContentParticle cp, FILE16 *f);
116 static int size_cp(ContentParticle cp);
117 static int check_qualname_syntax(Parser p, const Char *name, const char *type);
118 static int parse_reference(Parser p, int pe, int expand, int allow_external);
119 static int parse_character_reference(Parser p, int expand);
120 static const char8 *escape(int c, char8 *buf);
121 static int parse_name(Parser p, const char8 *where);
122 static int parse_nmtoken(Parser p, const char8 *where);
123 static int looking_at(Parser p, const char8 *string);
124 static void clear_xbit(XBit xbit);
125 static int expect(Parser p, int expected, const char8 *where);
126 static int expect_dtd_whitespace(Parser p, const char8 *where);
127 static void skip_whitespace(InputSource s);
128 static int skip_dtd_whitespace(Parser p, int allow_pe);
129 static int parse_cdata(Parser p);
130 static int process_nsl_decl(Parser p);
131 static int process_xml_decl(Parser p);
132 static int is_v1x(const char *version);
133 static int parse_dtd(Parser p);
134 static int read_markupdecls(Parser p);
135 static int error(Parser p, const char8 *format, ...);
136 static int warn(Parser p, const char8 *format, ...);
137 static void verror(char8 *buf, int size, XBit bit, const char8 *format, va_list args);
138 enum literal_type {
139     LT_cdata_attr, LT_tok_attr, LT_plain, LT_entity, LT_param_entity,
140     LT_pubid
141 };
142 static int parse_string(Parser p, const char8 *where, enum literal_type type, int *normalised);
143 static int parse_pi(Parser p, Entity ent);
144 static int parse_comment(Parser p, int skip, Entity ent);
145 static int parse_pcdata(Parser p);
146 static int parse_starttag(Parser p);
147 Namespace LookupNamespace(NamespaceBinding dictionary, const Char *prefix);
148 static int process_namespace(Parser p,
149 			     AttributeDefinition d, const Char *value);
150 static int parse_attribute(Parser p);
151 static WhiteSpaceMode process_xml_space(Parser p, const Char *value);
152 static int parse_endtag(Parser p);
153 static int parse_markup(Parser p);
154 static int parse(Parser p);
155 static int parse_markupdecl(Parser p);
156 static int validate_dtd(Parser p);
157 static int validate_final(Parser p);
158 static HashMapRetType check_id(const HashEntryStruct *id_entry, void *p);
159 static int validate_attribute(Parser p, AttributeDefinition a, ElementDefinition e, const Char *value);
160 static int validate_xml_lang_attribute(Parser p, ElementDefinition e, const Char *value);
161 static int check_attribute_syntax(Parser p, AttributeDefinition a, ElementDefinition e, const Char *value, const char *message, int real_use);
162 static int check_attribute_token(Parser p, AttributeDefinition a, ElementDefinition e, const Char *value, int length, const char *message, int real_use);
163 #if not_yet
164 static int magically_transform_dtd(Parser p, Char *name, int namelen);
165 #endif
166 
167 static struct element_definition pcdata_element;
168 const ElementDefinition Epsilon = 0, PCDataElement = &pcdata_element;
169 
170 static FSM NewFSM(void);
171 void FreeFSM(FSM fsm);
172 static FSMNode AddNode(FSM fsm);
173 static FSMEdge AddEdge(FSMNode source, FSMNode destination, void *label);
174 static void UnMarkFSM(FSM fsm, int value);
175 static void DeleteNode(FSMNode node);
176 static void DeleteEdge(FSMEdge edge);
177 static void CleanupFSM(FSM fsm);
178 static void CleanupNode(FSMNode node);
179 
180 #if DEBUG_FSM
181 static void PrintFSM(FILE16 *out, FSM fsm, int relabelled);
182 #endif
183 static int SimplifyFSM(FSM fsm);
184 static int add_epsilon_closure(FSMNode base, FSMNode node);
185 static FSMNode translate_particle(FSM fsm, ContentParticle cp, FSMNode next);
186 static FSMNode translate_particle_1(FSM fsm, ContentParticle cp, FSMNode next);
187 static FSMNode validate_content(FSMNode context, ElementDefinition e);
188 static int check_deterministic(Parser p, ElementDefinition element);
189 static int check_deterministic_1(Parser p, ElementDefinition element,
190 				  FSMNode node, ElementDefinition previous);
191 
/*
 * Report a validity error.  Records that one has been seen and evaluates
 * to error or warn depending on the ErrorOnValidityErrors flag, so it is
 * called like  validity_error(p, format, ...).  Relies on a Parser named
 * p being in scope at the point of use.
 */
#define validity_error (p->seen_validity_error=1, ParserGetFlag(p, ErrorOnValidityErrors) ? error : warn)

#define namespace_error error
#define namespace_validity_error validity_error

/*
 * Return -1 (require) or 0 (require0) from the enclosing function when x
 * evaluates to a negative value.  The argument is parenthesised so that
 * expressions containing operators of lower precedence than >= are
 * compared as a whole.
 */
#define require(x) if((x) >= 0) {} else return -1
#define require0(x) if((x) >= 0) {} else return 0

/* Zero a buffer pointer and its companion <buf>size variable (nothing is freed). */
#define Consume(buf) (buf = 0, buf##size = 0)

/*
 * Make sure buf has room for sz Chars plus one more, reallocating if
 * necessary; on allocation failure the enclosing function returns
 * error(p, "System error").  Note that sz is evaluated more than once, and
 * that a Parser named p must be in scope.
 */
#define ExpandBuf(buf, sz) \
    if(buf##size >= (sz)+1) {} else if((buf = Realloc(buf, (buf##size = (sz) + 1) * sizeof(Char)))) {} else return error(p, "System error")

/*
 * Copy the most recently parsed name (p->name, p->namelen) into a freshly
 * malloced, null-terminated string stored in n.  On allocation failure the
 * enclosing function returns the result of error() (CopyName) or 0
 * (CopyName0).
 */
#define CopyName(n) if((n = Malloc((p->namelen + 1)*sizeof(Char)))) {memcpy(n, p->name, p->namelen * sizeof(Char)); n[p->namelen] = 0;} else return error(p, "System error");

#define CopyName0(n) if((n = Malloc((p->namelen + 1)*sizeof(Char)))) {memcpy(n, p->name, p->namelen * sizeof(Char)); n[p->namelen] = 0;} else {error(p, "System error"); return 0;}

/*
 * Normalisation-checker helpers.  Each one is an unbraced "if" that does
 * nothing when the parser has no checker; ifNF16wrong must be followed by
 * the statement to execute when the l Chars ending b before the current
 * input position fail the check.
 */
#define ifNF16wrong(p,b,l) if((p)->checker && NF16wrong==nf16checkL((p)->checker, (p)->source->line + (p)->source->next - (b), (l)))
#define NF16StartCheck(p) if((p)->checker) nf16checkStart((p)->checker)
#define NF16noStartCheck(p) if((p)->checker) nf16checkNoStart((p)->checker)

/*
 * Conversion of Char strings to char8 strings.  With 8-bit Chars no
 * conversion is needed; otherwise tochar8 translates into the parser's
 * reusable transbuf (so needs p in scope) and duptochar8 translates into
 * a new buffer.
 */
#if CHAR_SIZE == 8
#define tochar8(s) s
#define duptochar8(s) strdup8(s)
#else
#define tochar8(s) (p->transbuf = translate_utf16_latin1_m(s, p->transbuf))
#define duptochar8(s) translate_utf16_latin1_m(s, 0)
#endif
219 
220 const char8 *XBitTypeName[XBIT_enum_count] = {
221     "dtd",
222     "start",
223     "empty",
224     "end",
225     "eof",
226     "pcdata",
227     "pi",
228     "comment",
229     "cdsect",
230     "error",
231     "warning",
232     "none"
233 };
234 
235 static Entity xml_builtin_entity;
236 static Entity xml_predefined_entities;
237 
238 static int parser_initialised = 0;
239 
240 static Char xml_ns[] = {'h','t','t','p',':','/','/','w','w','w','.','w','3',
241 			'.','o','r','g','/','X','M','L','/','1','9','9','8',
242 			'/','n','a','m','e','s','p','a','c','e',0};
243 static Char xmlns_ns[] = {'h','t','t','p',':','/','/','w','w','w','.','w', '3',
244 			  '.','o','r','g','/','2','0','0','0','/','x', 'm','l',
245 			  'n','s','/',0};
246 
init_parser(void)247 int init_parser(void)
248 {
249     Entity e, f;
250     int i;
251     static const Char lt[] = {'l','t',0}, ltval[] = {'&','#','6','0',';',0};
252     static const Char gt[] = {'g','t',0}, gtval[] = {'>',0};
253     static const Char amp[] = {'a','m','p',0},
254 		      ampval[] = {'&','#','3','8',';',0};
255     static const Char apos[] = {'a','p','o','s',0}, aposval[] = {'\'',0};
256     static const Char quot[] = {'q','u','o','t',0}, quotval[] = {'"',0};
257     static const Char *builtins[5][2] = {
258 	{lt, ltval}, {gt, gtval}, {amp, ampval},
259 	{apos, aposval}, {quot, quotval}
260     };
261 
262     if(parser_initialised)
263 	return 0;
264     parser_initialised = 1;
265 
266     if(init_charset() == -1 ||
267        init_ctype16() == -1 ||
268        init_stdio16() == -1 ||
269        init_url() == -1 ||
270        init_namespaces() == -1)
271 	return -1;
272 
273     xml_builtin_entity = NewInternalEntity(0, 0, 0, 0, 0, 0);
274 
275     for(i=0, f=0; i<5; i++, f=e)
276     {
277 	e = NewInternalEntity(builtins[i][0], builtins[i][1],
278 			      xml_builtin_entity, 0, 0, 0);
279 	if(!e)
280 	    return -1;
281 	e->next = f;
282     }
283 
284     xml_predefined_entities = e;
285 
286     return 0;
287 }
288 
deinit_parser(void)289 void deinit_parser(void)
290 {
291     Entity e, f;
292 
293     if(!parser_initialised)
294 	return;
295     parser_initialised = 0;
296 
297     deinit_charset();
298     deinit_ctype16();
299     deinit_stdio16();
300     deinit_namespaces();
301     deinit_url();
302 
303     for(e = xml_predefined_entities; e; e=f)
304     {
305 	f = e->next;
306 	e->text = 0;		/* it wasn't malloced so we mustn't free it */
307 	FreeEntity(e);
308     }
309 
310     FreeEntity(xml_builtin_entity);
311 }
312 
/*
 * Consume XML whitespace from s, stopping at end-of-entity or at the
 * first non-whitespace character; the character that stopped the scan is
 * pushed back.
 */
static void skip_whitespace(InputSource s)
{
    for(;;)
    {
        int ch = get(s);

        if(ch == XEOE || !is_xml_whitespace(ch))
            break;
    }

    unget(s);
}
321 
322 /*
323  * Skip whitespace and (optionally) the start and end of PEs.  Return 1 if
324  * there actually *was* some whitespace or a PE start/end, -1 if
325  * an error occurred, 0 otherwise.
326  */
327 
/*
 * Result is 1 if any whitespace, PE reference or PE end was consumed,
 * 0 if nothing was, and negative on error.  The current source may have
 * changed on return, because PE ends pop sources and PE references push
 * them; p->external_pe_depth is adjusted to match.
 */
static int skip_dtd_whitespace(Parser p, int allow_pe)
{
    InputSource src = p->source;
    int seen = 0;
    int ch;

    for(;;)
    {
        ch = get(src);

        if(ch == XEOE)
        {
            seen = 1;

            if(!src->parent)
            {
                unget(src);     /* leave the final EOE waiting to be read */
                return seen;
            }

            if(!allow_pe)
                return error(p,
                             "PE end not allowed here in internal subset");

            if(src->entity->type == ET_external)
                p->external_pe_depth--;
            ParserPop(p);
            src = p->source;
            continue;
        }

        if(is_xml_whitespace(ch))
        {
            seen = 1;
            continue;
        }

        if(ch != '%')
        {
            unget(src);
            return seen;
        }

        /* A '%' is only a PE reference if a name follows (it might be the
           '%' of <!ENTITY % ...), so peek at the next character. */
        ch = get(src);
        unget(src);

        if(ch == XEOE || !is_xml_namestart(ch, p->map))
        {
            unget(src);         /* put the '%' back as well */
            return seen;
        }

        if(!allow_pe)
        {
            unget(src);         /* For error position */
            return error(p,
                         "PE ref not allowed here in internal subset");
        }

        if(parse_reference(p, 1, 1, 1) < 0)
            return -1;

        src = p->source;
        if(src->entity->type == ET_external)
            p->external_pe_depth++;
        seen = 1;
    }
}
394 
/*
 * Read one character and check that it is the expected one.  Returns 0
 * if it is; otherwise the character is pushed back and the result of
 * error() is returned, the message mentioning where.
 */
static int expect(Parser p, int expected, const char8 *where)
{
    InputSource src = p->source;
    int got;

    got = get(src);
    if(got == expected)
        return 0;

    unget(src);                 /* For error position */

    if(got == BADCHAR)
        return error(p, "Input error: %s", src->error_msg);

    return error(p, "Expected %s %s, but got %s",
                 escape(expected, p->escbuf[0]), where,
                 escape(got, p->escbuf[1]));
}
414 
415 /*
416  * Expects whitespace or the start or end of a PE.
417  */
418 
/*
 * Returns 0 if something was skipped, -1 if skipping failed, and the
 * result of error() if there was nothing to skip.  PE starts and ends are
 * only permitted while inside an external PE.
 */
static int expect_dtd_whitespace(Parser p, const char8 *where)
{
    int skipped = skip_dtd_whitespace(p, p->external_pe_depth > 0);

    if(skipped < 0)
        return -1;

    return skipped == 0 ? error(p, "Expected whitespace %s", where) : 0;
}
431 
/*
 * Reset an XBit to the empty state: type XBIT_none and all the pointers
 * it carries set to null.  Nothing is freed.
 */
static void clear_xbit(XBit xbit)
{
    xbit->type = XBIT_none;

    /* string payloads */
    xbit->s1 = 0;
    xbit->S1 = 0;
    xbit->S2 = 0;

    /* element-related data */
    xbit->attributes = 0;
    xbit->element_definition = 0;
    xbit->ns_dict = 0;
}
441 
/*
 * Release everything an XBit owns (its strings, its namespace bindings
 * when it owns them, and its attribute list) and reset it with
 * clear_xbit().  The XBit structure itself is not freed.
 */
void FreeXBit(XBit xbit)
{
    Attribute attr, following;

    if(xbit->S1)
        Free(xbit->S1);
    if(xbit->S2)
        Free(xbit->S2);

    /* s1 is left alone for error and warning bits */
    if(xbit->s1 && xbit->type != XBIT_error && xbit->type != XBIT_warning)
        Free(xbit->s1);

    if(xbit->ns_dict && xbit->nsowned)
    {
        /* Free the nsc bindings at the head of the chain */
        NamespaceBinding binding = xbit->ns_dict;
        int left;

        for(left = xbit->nsc; left > 0; left--)
        {
            NamespaceBinding up = binding->parent;

            Free(binding);
            binding = up;
        }
    }

    attr = xbit->attributes;
    while(attr)
    {
        following = attr->next;
        if(attr->value)
            Free(attr->value);
        Free(attr);
        attr = following;
    }

    clear_xbit(xbit);
}
470 
471 /*
 * Returns 1 if the input matches string (and consumes the input).
 * Otherwise returns 0 and leaves the input stream where it was.
 * Case-sensitivity depends on the CaseInsensitive flag.
 * A space character at the end of string matches any (non-zero) amount of
 * whitespace; spaces are treated literally elsewhere.
477  * Never reads beyond an end-of-line, except to consume
478  * extra whitespace when the last character of string is a space.
479  * Never reads beyond end-of-entity.
480  */
481 
static int looking_at(Parser p, const char8 *string)
{
    InputSource src = p->source;
    const char8 *pat;
    int start = src->next;      /* position to rewind to on a mismatch */
    int want, got;

    /* we got a bad character before, don't try again */
    if(p->state == PS_error)
        return 0;

    for(pat = string; (want = *pat) != 0; pat++)
    {
        /* We would go over a line end */
        if(at_eol(src))
            goto mismatch;

        got = get(src);

        if(got == BADCHAR)
        {
            error(p, "Input error: %s", src->error_msg);
            goto mismatch;
        }

        if(want == ' ' && pat[1] == 0)
        {
            /* A final space stands for one or more whitespace characters */
            if(got == XEOE || !is_xml_whitespace(got))
                goto mismatch;
            skip_whitespace(src);
        }
        else if(ParserGetFlag(p, CaseInsensitive) ?
                Toupper(got) != Toupper(want) : got != want)
            goto mismatch;
    }

    return 1;

mismatch:
    src->next = start;
    return 0;
}
524 
/*
 * Parse a name at the current input position.  On success returns 0 with
 * p->name pointing at the name inside the source's line buffer (it is not
 * copied) and p->namelen holding its length.  On failure an error is
 * reported, the message mentioning where.
 */
static int parse_name(Parser p, const char8 *where)
{
    InputSource src = p->source;
    int ch, len;

    ch = get(src);
    if(ch == BADCHAR)
        return error(p, "Input error: %s", src->error_msg);

    if(ch == XEOE || !is_xml_namestart(ch, p->map))
    {
        unget(src);             /* For error position */
        error(p, "Expected name, but got %s %s",
              escape(ch, p->escbuf[0]), where);
        return -1;
    }

    /* The start character counts; now take the following name characters */
    len = 1;
    for(;;)
    {
        ch = get(src);
        if(ch == XEOE || !is_xml_namechar(ch, p->map))
            break;
        len++;
    }
    unget(src);

    p->name = src->line + src->next - len;
    p->namelen = len;

    NF16StartCheck(p);
    if(p->namechecker &&
       NF16wrong == nf16checkL(p->namechecker,
                               src->line + src->next - len, len))
        return error(p, "Name not normalized after %s", where);

    return 0;
}
557 
/*
 * Parse an nmtoken (one or more name characters) at the current input
 * position.  On success returns 0 with p->name pointing at the token
 * inside the source's line buffer and p->namelen holding its length.
 * On failure an error is reported, the message mentioning where.
 */
static int parse_nmtoken(Parser p, const char8 *where)
{
    InputSource src = p->source;
    int ch, len;

    ch = get(src);
    if(ch == BADCHAR)
        return error(p, "Input error: %s", src->error_msg);

    for(len = 0; ch != XEOE && is_xml_namechar(ch, p->map); len++)
        ch = get(src);
    unget(src);

    if(len == 0)
        return error(p, "Expected nmtoken, but got %s %s",
                     escape(ch, p->escbuf[0]), where);

    p->name = src->line + src->next - len;
    p->namelen = len;

    NF16StartCheck(p);
    if(p->namechecker &&
       NF16wrong == nf16checkL(p->namechecker,
                               src->line + src->next - len, len))
        return error(p, "nmtoken not normalized after %s", where);

    return 0;
}
588 
/* Escape a character for printing in an error message. */
590 
escape(int c,char8 * buf)591 static const char8 *escape(int c, char8 *buf)
592 {
593 #if CHAR_SIZE == 8
594     if(c != XEOE)
595 	c &= 0xff;
596 #endif
597 
598     if(c == XEOE)
599 	return "<EOE>";
600     else if(c >= 33 && c <= 126)
601 	sprintf(buf, "%c", c);
602     else if(c == ' ')
603 	sprintf(buf, "<space>");
604     else
605 	sprintf(buf, "<0x%x>", c);
606 
607     return buf;
608 }
609 
NewParser(void)610 Parser NewParser(void)
611 {
612     Parser p;
613     static Char xml[] = {'x','m','l',0};
614 
615     if(init_parser() == -1)
616 	return 0;
617 
618     p = Malloc(sizeof(*p));
619     if(!p)
620 	return 0;
621     p->state = PS_prolog1;
622     p->seen_validity_error = 0;
623     p->document_entity = 0;	/* Set at first ParserPush */
624     p->have_dtd = 0;
625     p->standalone = SDD_unspecified;
626     p->flags[0] = p->flags[1] = 0;
627     p->source = 0;
628     clear_xbit(&p->xbit);
629 #ifndef FOR_LT
630     p->xbit.nchildren = 0;	/* These three should never be changed */
631     p->xbit.children = 0;
632     p->xbit.parent = 0;
633 #endif
634     p->pbufsize = p->pbufnext = 0;
635     p->pbuf = 0;
636     p->save_pbufsize = p->save_pbufnext = 0;
637     p->save_pbuf = 0;
638     p->transbuf = 0;
639 
640     p->peeked = 0;
641     p->dtd = NewDtd();
642     p->dtd_callback = p->warning_callback = 0;
643     p->entity_opener = 0;
644     p->dtd_callback_arg = 0;
645     p->warning_callback_arg = 0;
646     p->entity_opener_arg = 0;
647     p->external_pe_depth = 0;
648 
649     p->checker = 0;
650     p->namechecker = 0;
651 
652     VectorInit(p->element_stack);
653 
654     p->base_ns.parent = 0;
655     p->base_ns.prefix = xml;
656     p->base_ns.namespace =
657 	FindNamespace(p->dtd->namespace_universe, xml_ns, 1);
658     if(!p->base_ns.namespace)
659 	return 0;
660 
661     p->id_table = create_hash_table(100);
662     if(!p->id_table)
663 	return 0;
664 
665     ParserSetFlag(p, XMLSyntax, 1);
666     ParserSetFlag(p, XMLPredefinedEntities, 1);
667     ParserSetFlag(p, XMLExternalIDs, 1);
668     ParserSetFlag(p, XMLMiscWFErrors, 1);
669     ParserSetFlag(p, ErrorOnUnquotedAttributeValues, 1);
670     ParserSetFlag(p, XMLLessThan, 1);
671     ParserSetFlag(p, ExpandGeneralEntities, 1);
672     ParserSetFlag(p, ExpandCharacterEntities, 1);
673     ParserSetFlag(p, NormaliseAttributeValues, 1);
674     ParserSetFlag(p, WarnOnRedefinitions, 1);
675     ParserSetFlag(p, TrustSDD, 1);
676     ParserSetFlag(p, ReturnComments, 1);
677     ParserSetFlag(p, MaintainElementStack, 1);
678     ParserSetFlag(p, XMLSpace, 0);
679     ParserSetFlag(p, XMLNamespaces, 0);
680     ParserSetFlag(p, XML11CheckNF, 0);
681     ParserSetFlag(p, XML11CheckExists, 0);
682 
683     /* These are set here because LTXML sometimes pushes an internal
684        entity (for string reading), and the version-determining code
685        never gets run. */
686     p->xml_version = XV_1_0;
687     p->map = xml_char_map_105;
688 
689     return p;
690 }
691 
/*
 * Destroy a parser: pop (and so close) any sources still on its stack,
 * then release its buffers, element stack, ID table, normalisation
 * checkers and finally the Parser structure.  p->dtd is not touched.
 */
void FreeParser(Parser p)
{
    /* Popping a source closes it (and will close its file) */
    while(p->source != 0)
        ParserPop(p);

    /* Work buffers */
    Free(p->pbuf);
    Free(p->save_pbuf);
    Free(p->transbuf);

    Free(p->element_stack);
    free_hash_table(p->id_table);

    /* The checkers only exist if normalisation checking was switched on */
    if(p->checker)
        nf16checkDelete(p->checker);
    if(p->namechecker)
        nf16checkDelete(p->namechecker);

    Free(p);
}
709 
ParserRootSource(Parser p)710 InputSource ParserRootSource(Parser p)
711 {
712     InputSource s;
713 
714     for(s=p->source; s && s->parent; s = s->parent)
715 	;
716 
717     return s;
718 }
719 
ParserRootEntity(Parser p)720 Entity ParserRootEntity(Parser p)
721 {
722     return ParserRootSource(p)->entity;
723 }
724 
/* Record arg as the parser's dtd_callback_arg (presumably the value handed
   to the DTD callback when it is invoked -- confirm at the call site). */
void ParserSetDtdCallbackArg(Parser p, void *arg)
{
    p->dtd_callback_arg = arg;
}
729 
/* Record arg as the parser's warning_callback_arg (presumably the value
   handed to the warning callback -- confirm at the call site). */
void ParserSetWarningCallbackArg(Parser p, void *arg)
{
    p->warning_callback_arg = arg;
}
734 
/* Record arg as the parser's entity_opener_arg (presumably the value
   handed to the entity opener -- confirm at the call site). */
void ParserSetEntityOpenerArg(Parser p, void *arg)
{
    p->entity_opener_arg = arg;
}
739 
/* Install cb as the parser's DTD callback; NewParser() leaves it null. */
void ParserSetDtdCallback(Parser p, CallbackProc cb)
{
    p->dtd_callback = cb;
}
744 
/* Install cb as the parser's warning callback; NewParser() leaves it null. */
void ParserSetWarningCallback(Parser p, CallbackProc cb)
{
    p->warning_callback = cb;
}
749 
/* Install opener as the parser's entity opener; NewParser() leaves it null. */
void ParserSetEntityOpener(Parser p, EntityOpenerProc opener)
{
    p->entity_opener = opener;
}
754 
755 #ifndef FOR_LT
756 
ReadXTree(Parser p)757 XBit ReadXTree(Parser p)
758 {
759     XBit bit, tree, child;
760     XBit *children;
761 
762     bit = ReadXBit(p);
763 
764     switch(bit->type)
765     {
766     case XBIT_error:
767 	return bit;
768 
769     case XBIT_start:
770 	if(!(tree = Malloc(sizeof(*tree))))
771 	{
772 	    error(p, "System error");
773 	    return &p->xbit;
774 	}
775 	*tree = *bit;
776 	while(1)
777 	{
778 	    child = ReadXTree(p);
779 	    switch(child->type)
780 	    {
781 	    case XBIT_error:
782 		FreeXTree(tree);
783 		return child;
784 
785 	    case XBIT_eof:
786 		FreeXTree(tree);
787 		{
788 		    error(p, "EOF in element");
789 		    return &p->xbit;
790 		}
791 
792 	    case XBIT_end:
793 		if(child->element_definition != tree->element_definition)
794 		{
795 		    const Char *name1 = tree->element_definition->name,
796 			       *name2 = child->element_definition->name;
797 		    FreeXTree(tree);
798 		    FreeXTree(child);
799 		    error(p, "Mismatched end tag: expected </%S>, got </%S>",
800 			  name1, name2);
801 		    return &p->xbit;
802 		}
803 		/* Transfer ns records to start bit so that ns gets freed
804 		   when the tree is freed, rather than now. */
805 		tree->nsowned = child->nsowned;
806 		child->nsowned = 0;
807 		FreeXTree(child);
808 		return tree;
809 
810 	    default:
811 		children = Realloc(tree->children,
812 				   (tree->nchildren + 1) * sizeof(XBit));
813 		if(!children)
814 		{
815 		    FreeXTree(tree);
816 		    FreeXTree(child);
817 		    error(p, "System error");
818 		    return &p->xbit;
819 		}
820 		child->parent = tree;
821 		children[tree->nchildren] = child;
822 		tree->nchildren++;
823 		tree->children = children;
824 		break;
825 	    }
826 	}
827 
828     default:
829 	if(!(tree = Malloc(sizeof(*tree))))
830 	{
831 	    error(p, "System error");
832 	    return &p->xbit;
833 	}
834 	*tree = *bit;
835 	return tree;
836     }
837 }
838 
/*
 * Free a tree built by ReadXTree(): its children (recursively), the child
 * array, the contents of the bit and, unless it is an error bit, the bit
 * itself.
 */
void FreeXTree(XBit tree)
{
    /* Remember the type now: FreeXBit() resets it */
    XBitType kind = tree->type;
    int n;

    for(n = 0; n < tree->nchildren; n++)
        FreeXTree(tree->children[n]);
    Free(tree->children);

    FreeXBit(tree);

    /* error "trees" are always in the Parser structure, not malloced */
    if(kind != XBIT_error)
        Free(tree);
}
857 
858 #endif /* (not) FOR_LT */
859 
ReadXBit(Parser p)860 XBit ReadXBit(Parser p)
861 {
862     if(p->peeked)
863 	p->peeked = 0;
864     else
865 	parse(p);
866 
867     return &p->xbit;
868 }
869 
PeekXBit(Parser p)870 XBit PeekXBit(Parser p)
871 {
872     if(p->peeked)
873 	error(p, "Attempt to peek twice");
874     else
875     {
876 	parse(p);
877 	p->peeked = 1;
878     }
879 
880     return &p->xbit;
881 }
882 
/*
 * Push source onto the parser's source stack.  The first entity ever
 * pushed becomes the document entity.
 *
 * Internal entities need nothing more.  For external entities the
 * character encoding is determined, a leading NSL, XML or text
 * declaration is processed and checked, and for the document entity the
 * XML version selects the character map used from then on.
 *
 * Returns 0 on success and a negative value on error; note that the
 * source stays pushed even when an error is returned.
 */
int ParserPush(Parser p, InputSource source)
{
    Entity e = source->entity;

    if(!p->source && !p->document_entity)
        p->document_entity = e;

    source->parent = p->source;
    p->source = source;

    if(e->type == ET_internal)
        return 0;

    /* Entities other than the document entity start with the map
       currently in force */
    if(e != p->document_entity)
        source->map = p->map;

    /* Look at first few bytes of external entities to guess encoding,
       then look for an XMLDecl or TextDecl.  */

    /* Check encoding even if we have already determined it for this
       entity, because otherwise we might leave a BOM unread. */
    determine_character_encoding(source);

#if CHAR_SIZE == 8
    if(!EncodingIsAsciiSuperset(e->encoding))
        return error(p, "Unsupported character encoding %s",
                     CharacterEncodingName[e->encoding]);
#else
    if(e->encoding == CE_unknown)
        return error(p, "Unknown character encoding");
#endif

    /* To get the first line read */
    get(source);
    unget(source);

    if(looking_at(p, "<?NSL "))
    {
        if(process_nsl_decl(p) < 0)
            return -1;
        source->read_carefully = 0;
        return 0;
    }

    if(looking_at(p, "<?xml "))
    {
        if(process_xml_decl(p) < 0)
            return -1;

        if(e == p->document_entity)
        {
            /* An XML declaration must give the version */
            if(!e->version_decl)
                return error(p, "XML declaration in document entity lacked "
                                "version number");
        }
        else
        {
            /* A text declaration: no standalone, encoding compulsory */
            if(e->standalone_decl != SDD_unspecified)
                return error(p, "Standalone attribute not allowed except in "
                                "document entity");
            if(e->encoding_decl == CE_unknown)
                return error(p, "Encoding declaration is required in text "
                                "declaration");
        }
    }
    else if(looking_at(p, "<?xml?"))
        return error(p, "Empty XML or text declaration");
    else if(looking_at(p, "<?XML "))
        return error(p, "Wrong case XML declaration, must be <?xml ...");
    else if(p->state == PS_error) /* looking_at may have set it */
        return -1;

    source->read_carefully = 0;

    if(e == p->document_entity)
    {
        /* The document entity decides the version and character map */
        p->xml_version = e->xml_version;
        if(p->xml_version >= XV_1_1)
        {
            ParserSetFlag(p, XML11Syntax, 1);
#if CHAR_SIZE == 16
            p->map = xml_char_map_11;
            /* XXX is this the best place to do this? */
            if(ParserGetFlag(p, XML11CheckNF))
            {
                p->checker = nf16checkNew(ParserGetFlag(p, XML11CheckExists));
                NF16StartCheck(p);
                p->namechecker =
                    nf16checkNew(ParserGetFlag(p, XML11CheckExists));
            }
#endif
        }
        else if(ParserGetFlag(p, Pre105Chars))
            p->map = xml_char_map;
        else
            p->map = xml_char_map_105;

        source->map = p->map;
    }
    else if(e->xml_version > p->xml_version)
    {
        /* A document entity without a version declaration counts as 1.0 */
        const char8 *doc_ver = p->document_entity->version_decl ?
                               p->document_entity->version_decl : "1.0";

        if(ParserGetFlag(p, XMLStrictWFErrors))
            return error(p, "Referenced entity has later version number "
                            "(%s) than document entity (%s)",
                         e->version_decl, doc_ver);

        warn(p, "Referenced entity has later version number "
                "(%s) than document entity (%s)",
             e->version_decl, doc_ver);
    }
#if 0
    Fprintf(Stderr, "\npushing %s, map = %s\n",
            EntityDescription(e), source->map == xml_char_map ? "1.0" : "1.1");
#endif
    return 0;
}
996 
/*
 * Remove the current source from the parser's source stack, making its
 * parent current, and close it.  There must be a current source.
 */
void ParserPop(Parser p)
{
    InputSource top = p->source;

    p->source = top->parent;
    SourceClose(top);
}
1006 
1007 /* Returns true if the source is at EOE. If so, the EOE will have been read. */
1008 
static int at_eoe(InputSource s)
{
    /* Only at the end of a line can we be at the end of the entity */
    if(at_eol(s))
    {
        if(s->seen_eoe || get_with_fill(s) == XEOE)
            return 1;

        /* There was another line: put back the character just read */
        unget(s);
    }

    return 0;
}
1018 
1019 /* Pops any sources that are at EOE.  Leaves source buffer with at least
1020    one character in it (except at EOF, where it leaves the EOE unread). */
1021 
static void pop_while_at_eoe(Parser p)
{
    InputSource s;

    for(s = p->source; at_eoe(s); s = p->source)
    {
        if(!s->parent)
        {
            /* The root source is never popped; un-read its EOE instead */
            unget(s);
            return;
        }

        ParserPop(p);
    }
}
1038 
/*
 * Turn a parser flag on (value non-zero) or off (value zero).  Flags are
 * kept as bits in p->flags, 32 to a word.  Changing XMLPredefinedEntities
 * also attaches the predefined entity list to, or detaches it from, the
 * parser's Dtd.
 */
void ParserSetFlag(Parser p, ParserFlag flag, int value)
{
    int word = flag >> 5;                       /* which element of flags */
    unsigned int mask = 1u << (flag & 31);      /* which bit within it */

    if(value)
        p->flags[word] |= mask;
    else
        p->flags[word] &= ~mask;

    if(flag == XMLPredefinedEntities)
        p->dtd->predefined_entities = value ? xml_predefined_entities : 0;
}
1060 
/*
 * Print the error or warning held in bit to Stderr, together with a
 * description of where it was detected.
 *
 * With the SimpleErrorFormat flag a single "file:line:char: message" line
 * is printed, where file is the last component of the root entity's
 * description.  Otherwise the message is followed by one line for each
 * source on the stack, innermost first.  Problems found while validating
 * at the end of the prolog or body are reported against that point rather
 * than against a position.
 */
void ParserPerror(Parser p, XBit bit)
{
    int linenum, charnum;
    InputSource s, root;

    root = ParserRootSource(p);

    if(ParserGetFlag(p, SimpleErrorFormat))
    {
        const char8 *desc, *base;

        /* Strip everything up to the last '/' from the description */
        desc = EntityDescription(root->entity);
        for(base = desc + strlen8(desc); base > desc && base[-1] != '/'; )
            --base;

        if(p->state == PS_validate_dtd)
            Fprintf(Stderr, "%s:-1(end of prolog):-1: ", base);
        else if(p->state == PS_validate_final)
            Fprintf(Stderr, "%s:-1(end of body):-1: ", base);
        else
            Fprintf(Stderr, "%s:%d:%d: ", base,
                    root->line_number+1, root->next+1);

        if(bit->type == XBIT_warning)
            Fprintf(Stderr, "warning: ");
        Fprintf(Stderr, "%s\n", bit->error_message);

        return;
    }

    Fprintf(Stderr, "%s: %s\n",
            bit->type == XBIT_error ? "Error" : "Warning",
            bit->error_message);

    if(p->state == PS_validate_dtd || p->state == PS_validate_final)
    {
        Fprintf(Stderr, " (detected at end of %s of document %s)\n",
                p->state == PS_validate_final ? "body" : "prolog",
                EntityDescription(root->entity));

        return;
    }

    /* One line per source, from the current one out to the root */
    s = p->source;
    while(s)
    {
        int where;

        if(s->entity->name)
            Fprintf(Stderr, " in entity \"%S\"", s->entity->name);
        else
            Fprintf(Stderr, " in unnamed entity");

        where = SourceLineAndChar(s, &linenum, &charnum);
        if(where == 1)
            Fprintf(Stderr, " at line %d char %d of", linenum+1, charnum+1);
        else if(where == 0)
            Fprintf(Stderr, " defined at line %d char %d of",
                    linenum+1, charnum+1);
        else if(where == -1)
            Fprintf(Stderr, " defined in");

        Fprintf(Stderr, " %s\n", EntityDescription(s->entity));

        s = s->parent;
    }
}
1128 
1129 
/*
 * Read the next "bit" of the document into p->xbit.
 *
 * Once the parser is in state PS_end or PS_error this only reports
 * XBIT_eof.  Otherwise it clears the previous bit, skips whitespace when
 * in the prolog or epilog, pops any sources that are at EOE, records the
 * position of the bit, and dispatches on the next character.
 *
 * Returns 0 when EOF is reported here; in all other cases returns
 * whatever the routine handling the construct (or error()) returns.
 */
static int parse(Parser p)
{
    int c;
    InputSource s;

    if(p->state == PS_end || p->state == PS_error)
    {
	/* After an error or EOF, just keep returning EOF */
	p->xbit.type = XBIT_eof;
	return 0;
    }

    clear_xbit(&p->xbit);

    /* Whitespace is skipped (not returned as data) in the prolog states
       and in the epilog */
    if(p->state <= PS_prolog2 || p->state == PS_epilog)
	skip_whitespace(p->source);

restart:
    /* Re-entered after pushing an entity reference: the current source
       may have changed, so re-read it and the bit's position */
    pop_while_at_eoe(p);
    s = p->source;
    SourcePosition(s, &p->xbit.entity, &p->xbit.byte_offset);

    switch(c = get(s))
    {
    case XEOE:
	/* pop_while_at_eoe only leaves an EOE unread at EOF, so this is
	   the end of the document; it is only legal in the epilog */
	if(p->state != PS_epilog)
	    return error(p, "Document ends too soon");
	p->state = PS_end;
	p->xbit.type = XBIT_eof;
        NF16StartCheck(p);
	return 0;
    case '<':
        NF16StartCheck(p); /* only effective after markup */
	return parse_markup(p);
    case '&':
	if(ParserGetFlag(p, IgnoreEntities))
	    goto pcdata;
	if(p->state <= PS_prolog2)
	    return error(p, "Entity reference not allowed in prolog");
	if(looking_at(p, "#"))
	{
	    /* a character reference - go back and parse as pcdata */
	    /* (this unget undoes the '#'; the one at pcdata undoes the '&') */
	    unget(s);
	    goto pcdata;
	}
	if(p->state == PS_error)	/* looking_at may have set it */
	    return -1;
	if(ParserGetFlag(p, ExpandGeneralEntities))
	{
	    /* an entity reference - push it and start again */
	    require(parse_reference(p, 0, 1, 1));
            NF16StartCheck(p);
	    goto restart;
	}
	/* not expanding general entities, so treat as pcdata */
	goto pcdata;
    case BADCHAR:
	return error(p, "Input error: %s", s->error_msg);
    default:
    pcdata:
	/* Push back the character just read so that parse_pcdata sees
	   the data from its first character */
	unget(s);
	return parse_pcdata(p);
    }
}
1194 
1195 /* Called after reading '<' */
1196 
parse_markup(Parser p)1197 static int parse_markup(Parser p)
1198 {
1199     InputSource s = p->source;
1200     int c = get(s);
1201 
1202     switch(c)
1203     {
1204     case '!':
1205 	if(looking_at(p, "--"))
1206 	{
1207 	    if(ParserGetFlag(p, ReturnComments))
1208 		return parse_comment(p, 0, 0);
1209 	    else
1210 	    {
1211 		require(parse_comment(p, 1, 0));
1212 		/* XXX avoid recursion here */
1213 		return parse(p);
1214 	    }
1215 	}
1216 	else if(looking_at(p, "DOCTYPE "))
1217 	    return parse_dtd(p);
1218 	else if(looking_at(p, "[CDATA["))
1219 	    return parse_cdata(p);
1220 	else if(p->state == PS_error)	/* looking_at may have set it */
1221 	    return -1;
1222 	else
1223 	    return error(p, "Syntax error after <!");
1224 
1225     case '/':
1226 	return parse_endtag(p);
1227 
1228     case '?':
1229 	return parse_pi(p, 0);
1230 
1231     case BADCHAR:
1232 	return error(p, "Input error: %s", s->error_msg);
1233 
1234     default:
1235 	unget(s);
1236 	if(!ParserGetFlag(p, XMLLessThan) &&
1237 	   (c == XEOE || !is_xml_namestart(c, p->map)))
1238 	{
1239 	    /* In nSGML, recognise < as stago only if followed by namestart */
1240 
1241 	    unget(s);	/* put back the < */
1242 	    return parse_pcdata(p);
1243 	}
1244 	return parse_starttag(p);
1245     }
1246 }
1247 
/*
 * Parse an end tag.  Called (from parse_markup) after "</" has been read.
 *
 * Sets p->xbit.type to XBIT_end and fills in the element definition.
 * With MaintainElementStack the innermost open element is popped and the
 * tag is checked against it (same name, same entity); when the stack
 * becomes empty the parser moves to the epilog, running final validation
 * first if validating.  Without MaintainElementStack the element is just
 * looked up by name in the DTD.
 *
 * Returns the result of the final expect() for '>' on success, or the
 * result of error()/require on failure.
 */
static int parse_endtag(Parser p)
{
    ElementDefinition e;
    NSElementDefinition nse;
    Entity ent;

    p->xbit.type = XBIT_end;
    require(parse_name(p, "after </"));
    maybe_uppercase_name(p);

    if(ParserGetFlag(p, MaintainElementStack))
    {
	if(VectorCount(p->element_stack) <= 0)
	    return error(p, "End tag </%.*S> outside of any element",
			 p->namelen, p->name);
    }

    if(ParserGetFlag(p, Validate))
    {
	/* NOTE(review): this reads the top of the element stack; the
	   non-empty check above only runs when MaintainElementStack is
	   set - confirm that Validate always implies that flag. */
	struct element_info *info = &VectorLast(p->element_stack);
	ElementDefinition parent = info->definition;

	/* The element being closed has element content and its content
	   model has not reached an end node */
	if(parent->type == CT_element && info->context &&
	   !info->context->end_node)
	{
	    require(validity_error(p, "Content model for %S does not "
			          "allow it to end here",
			      parent->name));
	}
    }

    if(ParserGetFlag(p, MaintainElementStack))
    {
	/* Copy what we need out of the top stack entry before popping it */
	ent = VectorLast(p->element_stack).entity;
	e = VectorLast(p->element_stack).definition;
	nse = VectorLast(p->element_stack).ns_definition;
	p->xbit.ns_dict = VectorLast(p->element_stack).ns;
	p->xbit.nsc = VectorLast(p->element_stack).nsc;
	/* nsowned is set unless the dictionary is the parser's base one */
	p->xbit.nsowned = (p->xbit.ns_dict != &p->base_ns);
	(void)VectorPop(p->element_stack);

	if(p->namelen != e->namelen ||
	   memcmp(p->name, e->name, p->namelen * sizeof(Char)) != 0)
	    return error(p, "Mismatched end tag: expected </%S>, got </%.*S>",
			 e->name, p->namelen, p->name);

	p->xbit.element_definition = e;
	p->xbit.ns_element_definition = nse;

	/* ent is the entity recorded when the start tag was parsed */
	if(ent != p->source->entity)
	    return error(p, "Element ends in different entity from that "
			    "in which it starts");

	if(VectorCount(p->element_stack) == 0)
	{
	    /* That was the outermost element: the body is finished */
	    if(ParserGetFlag(p, Validate))
	    {
		p->state = PS_validate_final;
		require(validate_final(p));
	    }
	    p->state = PS_epilog;
	}
    }
    else
    {
	/* No stack to match against: just require a known element */
	e = FindElementN(p->dtd, p->name, p->namelen);
	p->xbit.element_definition = e;
	if(!p->xbit.element_definition)
	    return error(p, "End tag for unknown element %.*S",
			 p->namelen, p->name);
    }

    skip_whitespace(p->source);
    NF16StartCheck(p);
    return expect(p, '>', "after name in end tag");
}
1324 
/* Checks that name is a well-formed qualified name: if it contains a
   colon, both the prefix and the local part must be non-empty, the local
   part must begin with a name start character, and there must be no
   second colon.  The first problem found is reported with
   namespace_error(), using type ("Element", "Attribute") in the message.
   Returns 0 unless require propagates a failure. */

static int check_qualname_syntax(Parser p, const Char *name, const char *type)
{
    Char *colon;

    colon = Strchr(name, ':');

    /* An unprefixed name needs no further checking */
    if(!colon)
	return 0;

    if(colon == name)
    {
	require(namespace_error(p, "%s name %S has empty prefix", type, name));
	return 0;
    }

    if(colon[1] == 0)
    {
	require(namespace_error(p, "%s name %S has empty local part",
				type, name));
	return 0;
    }

    if(!is_xml_namestart(colon[1], p->map))
    {
	require(namespace_error(p, "%s name %S has illegal local part",
				type, name));
	return 0;
    }

    if(Strchr(colon+1, ':'))
    {
	require(namespace_error(p, "%s name %S has multiple colons",
				type, name));
    }

    return 0;
}
1356 
/*
 * Parse a start tag or empty-element tag.  Called (from parse_markup)
 * with the '<' consumed and the first character of the name pushed back.
 *
 * In order, this:
 *   - moves the parser into PS_body (validating the DTD first if this is
 *     the first start tag and Validate is set);
 *   - parses the element name and finds or creates its definition;
 *   - when validating, checks the element against the root name or the
 *     parent's content model;
 *   - parses the attributes up to '>' or '/>' and sets p->xbit.type to
 *     XBIT_start or XBIT_empty;
 *   - pushes an element_info entry for a start tag (MaintainElementStack);
 *   - checks required attributes and collects defaulted ones;
 *   - handles xml:space, xml:id, ID uniqueness and namespaces, each under
 *     its own parser flag.
 *
 * On success p->xbit.attributes holds the attribute list to return and
 * the function returns 0; otherwise it returns the result of error() or
 * of a failed require.
 */
static int parse_starttag(Parser p)
{
    int c, is_top_level = 0;
    ElementDefinition e;
    AttributeDefinition d;
    Attribute a, aa, all_attrs;
    struct element_info *this_info = 0, *parent_info = 0;

    if(p->state == PS_epilog && !ParserGetFlag(p, AllowMultipleElements))
	return error(p, "Document contains multiple elements");

    if(p->state < PS_body)
    {
	/* First start tag seen: the prolog is complete */
	if(ParserGetFlag(p, Validate))
	{
	    p->state = PS_validate_dtd;
	    require(validate_dtd(p));
	}
	/* is_top_level is only read by the disabled not_yet code below */
	is_top_level = 1;
    }

    p->state = PS_body;

    require(parse_name(p, "after <"));
    maybe_uppercase_name(p);

#if not_yet
    if(is_top_level && p->magic_prefix)
	require(magically_transform_dtd(p, p->name, p->namelen));
#endif

    e = FindElementN(p->dtd, p->name, p->namelen);
    if(!e || e->tentative)
    {
	/* No definition, or only a tentative one: report as the flags
	   require, then (re)define the element with content type CT_any */
	if(p->have_dtd && ParserGetFlag(p, ErrorOnUndefinedElements))
	    return error(p, "Start tag for undeclared element %.*S",
			 p->namelen, p->name);
	if(ParserGetFlag(p, Validate) &&
	   !(ParserGetFlag(p, RelaxedAny) &&
	     VectorCount(p->element_stack) != 0 &&
	     VectorLast(p->element_stack).definition->type == CT_any))
	{
	    require(validity_error(p,
				   "Start tag for undeclared element %.*S",
				   p->namelen, p->name));
	}
	if(e)
	    RedefineElement(e, CT_any, 0, 0, 0);
	else
	{
	    if(!(e =
		 DefineElementN(p->dtd, p->name, p->namelen, CT_any, 0, 0, 0)))
		return error(p, "System error");
	    if(ParserGetFlag(p, XMLNamespaces))
	    {
		require(check_qualname_syntax(p, e->name, "Element"));
	    }
	}
    }

    p->xbit.element_definition = e;

    if(ParserGetFlag(p, Validate))
    {
	if(VectorCount(p->element_stack) == 0)
	{
	    /* Root element: its name must match the DTD's name */
	    if(Strcmp(p->dtd->name, e->name) != 0)
	    {
		require(validity_error(p, "Root element is %S, should be %S",
				       e->name, p->dtd->name));
	    }
	}
	else
	{
	    /* Nested element: check it against the parent's content model */
	    struct element_info *info = &VectorLast(p->element_stack);
	    ElementDefinition parent = info->definition;

	    if(parent->type == CT_empty)
	    {
		require(validity_error(p, "Content model for %S does not "
				          "allow anything here",
				       parent->name));
	    }
	    else if(info->context)
	    {
		/* Advance the parent's content-model context; a null
		   result means the element is not allowed here */
		info->context = validate_content(info->context, e);
		if(!info->context)
		{
		    require(validity_error(p, "Content model for %S does not "
					      "allow element %S here",
					   parent->name, e->name));
		}
	    }
	}
    }

    /* Parse attributes until the end of the tag */

    while(1)
    {
	InputSource s = p->source;

	/* We could just do skip_whitespace here, but we will get a
	   better error message if we look a bit closer. */

	c = get(s);

	if(c == BADCHAR)
	    return error(p, "Input error: %s", s->error_msg);

	if(c !=XEOE && is_xml_whitespace(c))
	{
	    skip_whitespace(s);
	    c = get(s);
	}
	else if(c != '>' &&
		!(ParserGetFlag(p, XMLSyntax) && c == '/'))
	{
	    unget(s);		/* For error position */
	    return error(p, "Expected whitespace or tag end in start tag");
	}

	if(c == '>')
	{
	    p->xbit.type = XBIT_start;
	    break;
	}

	if((ParserGetFlag(p, XMLSyntax)) && c == '/')
	{
	    require(expect(p, '>', "after / in start tag"));
	    p->xbit.type = XBIT_empty;
	    break;
	}

	/* Not the end of the tag, so c starts an attribute: put it back */
	unget(s);

	require(parse_attribute(p));
    }

    if(ParserGetFlag(p, MaintainElementStack))
    {
	if(p->xbit.type == XBIT_start)
	{
	    /* Push a new stack entry for this element and initialise it */
	    if(!VectorPushNothing(p->element_stack))
		return error(p, "System error");
	    /* The parent, if any, is the entry just below the new one */
	    if(VectorCount(p->element_stack) > 1)
		parent_info = &VectorLast(p->element_stack) - 1;
	    this_info = &VectorLast(p->element_stack);
	    this_info->definition = e;
	    this_info->context = e->fsm ? e->fsm->start_node : 0;
	    this_info->wsm = WSM_unspecified;
	    this_info->ns = 0;
	    this_info->entity = p->source->entity;
	    /* Set these here even if not doing namespace processing, to
	       avoid rui errors from dbx. */
	    this_info->ns_definition = 0;
	    this_info->nsc = 0;
	}
	else
	{
	    /* Empty-element tag: nothing is pushed, so this_info stays null */

	    /* Is this element allowed to be empty? */

	    if(ParserGetFlag(p, Validate) && e->fsm &&
	       !e->fsm->start_node->end_node)
	    {
		require(validity_error(p, "Content model for %S does not "
				       "allow it to be empty",
				       e->name));
	    }

	    /* Is it the (empty) top-level element? */

	    if(VectorCount(p->element_stack) == 0)
	    {
		if(ParserGetFlag(p, Validate))
		{
		    p->state = PS_validate_final;
		    require(validate_final(p));
		}
		p->state = PS_epilog;
	    }
	    else
		parent_info = &VectorLast(p->element_stack);
	}
    }

    if(ParserGetFlag(p, Validate))
    {
	/* check for required attributes */

	/* (these locals shadow the function-level d and a) */
	AttributeDefinition d;
	Attribute a;

	for(d=NextAttributeDefinition(e, 0);
	    d;
	    d=NextAttributeDefinition(e, d))
	{
	    if(d->default_type != DT_required)
		continue;
	    for(a=p->xbit.attributes; a; a=a->next)
		if(a->definition == d)
		    break;
	    if(!a)
	    {
		require(validity_error(p,
				       "Required attribute %S for element %S "
				       "is not present",
				       d->name, e->name));
	    }
	}
    }

    /* Find defaulted attributes if we need them */

    /* p->xbit.attributes only points to actually present attributes
       until the end of this function. */
    all_attrs = p->xbit.attributes;

    if(ParserGetFlag(p, ReturnDefaultedAttributes) ||
       ParserGetFlag(p, XMLNamespaces))
    {

	for(d=NextAttributeDefinition(e, 0);
	    d;
	    d=NextAttributeDefinition(e, d))
	{
	    if(!d->default_value)
		continue;
	    for(a=p->xbit.attributes; a; a=a->next)
		if(a->definition == d)
		    break;
	    if(!a)
	    {
		/* Not present in the tag: build an attribute from the
		   default and put it at the front of all_attrs, so the
		   defaulted ones all precede p->xbit.attributes */
		if(!(a = Malloc(sizeof(*a))))
		    return error(p, "System error");
		a->definition = d;
		/* NOTE(review): a is not freed (and not yet linked into
		   any list) if this Strdup fails */
		if(!(a->value = Strdup(d->default_value)))
		    return error(p, "System error");
		a->specified = 0;
		a->quoted = 1;
		a->next = all_attrs;
		all_attrs = a;
	    }
	}
    }

    /* Do some checks on defaulted attributes if validating */

    if(ParserGetFlag(p, Validate))
    {
	for(d=NextAttributeDefinition(e, 0);
	    d;
	    d=NextAttributeDefinition(e, d))
	{
	    int ed, sem;

	    if(!d->default_value)
		continue;

	    /* Check no externally-declared defaults in standalone document,
	       and do "non-lexical" validation of some attribute types */

	    ed = (p->standalone == SDD_yes && d->is_externally_declared);
	    sem =
		(d->type == AT_entity || d->type == AT_entities ||
		 d->type == AT_id ||
		 d->type == AT_idref || d->type == AT_idrefs);

	    if(ed || sem)
	    {
		/* was it actually defaulted? */

		for(a=p->xbit.attributes; a; a=a->next)
		    if(a->definition == d)
			break;
		if(a)
		    /* no */
		    continue;
	    }

	    if(sem)
	    {
		require(check_attribute_syntax(p, d, e, d->default_value,
					       "defaulted value for attribute",
					       1));
	    }

	    if(ed)
	    {
		require(validity_error(p, "Externally declared attribute %S "
		    "for element %S defaulted in document declared standalone",
				       d->name, e->name));
	    }
	}
    }

    /* Look for xml:space attribute */

    if(ParserGetFlag(p, XMLSpace))
    {
	d = e->xml_space_attribute;

	if(d)
	{
	    /* A value given in the tag takes precedence ... */
	    for(a=p->xbit.attributes; a; a=a->next)
		if(a->definition == d)
		{
		    p->xbit.wsm = process_xml_space(p, a->value);
		    goto done;
		}

	    /* ... then a declared default of type DT_none or DT_fixed ... */
	    if(d->default_type == DT_none || d->default_type == DT_fixed)
	    {
		p->xbit.wsm = process_xml_space(p, d->default_value);
		goto done;
	    }
	}

	/* ... otherwise the mode is inherited from the parent element */
	p->xbit.wsm = parent_info ? parent_info->wsm : WSM_unspecified;

    done:
	if(this_info)
	    this_info->wsm = p->xbit.wsm;
    }
    else
	p->xbit.wsm = WSM_unspecified;

    /* Look for xml:id attribute */

    if(ParserGetFlag(p, XMLID) && e->xml_id_attribute)
    {
	Char *s;

	d = e->xml_id_attribute;

	for(a=p->xbit.attributes; a; a=a->next)
	    if(a->definition == d)
		break;
	if(!a)
	    goto id_done;

	/* check that it's an NCName */

	if(!is_xml_namestart(a->value[0], p->map))
	{
	    warn(p, "xml:id error: value \"%S\" does not start with a name start character",
		a->value);
	    goto id_done;
	}

	for(s=a->value; *s; s++)
	{
	    if(*s == ':')
	    {
		warn(p, "xml:id error: value \"%S\" contains a colon", a->value);
		goto id_done;
	    }
	    else if(!is_xml_namechar(*s, p->map))
	    {
		warn(p, "xml:id error: value \"%S\" contains a character which is not a name character",
		    a->value);
		goto id_done;
	    }
	}

    id_done:
	;
    }

    if(ParserGetFlag(p, XMLIDCheckUnique))
    {
	int found;
	HashEntry id_entry;

	/* Record each ID-typed attribute value in p->id_table and warn
	   if the value was already there */
	for(a=p->xbit.attributes; a; a=a->next)
	{
	    d = a->definition;
	    if(d->type != AT_id)
		continue;
	    if(ParserGetFlag(p, Validate) && d->declared)
		/* declared attributes will have been checked during validation */
		continue;

	    id_entry = hash_find_or_add(p->id_table,
					a->value,
					Strlen(a->value)*sizeof(Char),
					&found);
	    if(!id_entry)
		return error(p, "System error");

	    if(!found)
		hash_set_value(id_entry, (void *)2);
	    else
		warn(p, "xml:id error: duplicate ID attribute value %S",
		     a->value);
	}
    }

    if(ParserGetFlag(p, XMLNamespaces))
    {
	Attribute *attp;
	Namespace ns;
	NSElementDefinition nselt;
	NSAttributeDefinition nsattr;

	/* Start from the parent's bindings (or the base ones) and count
	   the declarations this tag adds in nsc */
	p->xbit.ns_dict = parent_info ? parent_info->ns : &p->base_ns;
	p->xbit.nsc = 0;

	/* NOTE(review): the error returns below happen while defaulted
	   attributes are reachable only through all_attrs, not through
	   p->xbit.attributes - check whether they are freed on that path */

	/* Look for xmlns attributes */

	/* attp always points at the link that refers to the current
	   attribute, so that the attribute can be unlinked in place */
	for(attp=&all_attrs; *attp; )
	{
	    a = *attp;
	    if(a->definition->ns_attr_prefix)
	    {
		require(process_namespace(p, a->definition, a->value));
		p->xbit.nsc++;

		/* remove the attribute now we've processed it */

		if(!ParserGetFlag(p, ReturnNamespaceAttributes))
		{
		    /* Keep the head of the "present" list valid too */
		    if(p->xbit.attributes == a)
			p->xbit.attributes = a->next;
		    *attp = a->next;
		    Free(a->value);
		    Free(a);
		}
		else
		    attp = &a->next;
	    }
	    else
		attp = &a->next;
	}

	p->xbit.nsowned = (p->xbit.type == XBIT_empty &&
			   p->xbit.ns_dict != &p->base_ns);

	/* Find namespace for element */

	if(e->prefix)
	{
	    ns = LookupNamespace(p->xbit.ns_dict, e->prefix);
	    if(!ns)
	    {
		require(namespace_error(p,
					"Element name %S has unbound prefix",
					e->name));
	    }
	}
	else
	    ns = LookupNamespace(p->xbit.ns_dict, 0);

	nselt = 0;
	if(ns)
	    if(!(nselt = NamespacifyElementDefinition(e, ns)))
		return error(p, "System error");

	p->xbit.ns_element_definition = nselt;

	if(this_info)
	{
	    this_info->ns = p->xbit.ns_dict;
	    this_info->nsc = p->xbit.nsc;
	    this_info->ns_definition = nselt;
	}

	/* Find namespaces for attributes */

	for(a=all_attrs; a; a=a->next)
	{
	    d = a->definition;
	    nsattr = 0;

	    if(!d->ns_attr_prefix) /* Ignore namespace attributes themselves */
	    {
		if(d->prefix)
		{
		    ns = LookupNamespace(p->xbit.ns_dict, d->prefix);
		    if(!ns)
		    {
			require(namespace_error(p,
				     "Attribute name %S has unbound prefix",
						d->name));
		    }
		    else
			if(!(nsattr =
			     NamespacifyGlobalAttributeDefinition(d, ns)))
			    return error(p, "System error");
		}
		else if(nselt)
		{
		    if(!(nsattr =
			 NamespacifyElementAttributeDefinition(d, nselt)))
			return error(p, "System error");
		}
	    }

	    a->ns_definition = nsattr;
	}

	/* Check for repeated qualified attributes */

	/* Each attribute whose namespace definition has no element is
	   compared with the attributes before it in the list */
	for(a=all_attrs; a; a=a->next)
	{
	    d = a->definition;
	    if(a->ns_definition && !a->ns_definition->element)
		for(aa=all_attrs; aa != a; aa=aa->next)
		{
		    if(aa->ns_definition == a->ns_definition)
		    {
			require(namespace_error(p,
				    "Repeated attribute %S in namespace %S",
				 d->local, a->ns_definition->namespace->nsname));
		    }
		}
	}

	/* Free defaulted attrs if we only got them for namespace stuff */

	if(!ParserGetFlag(p, ReturnDefaultedAttributes))
	{
	    /* The defaulted attributes are exactly those in front of
	       p->xbit.attributes in all_attrs */
	    for(a=all_attrs; a != p->xbit.attributes; a = aa)
	    {
		aa = a->next;
		Free(a->value);
		Free(a);
	    }
	    all_attrs = p->xbit.attributes;
	}
    }

    /* From here on the bit's attribute list includes any defaulted
       attributes that are to be returned */
    p->xbit.attributes = all_attrs;

    NF16StartCheck(p);
    return 0;
}
1893 
/*
 * Process one namespace declaration attribute.
 *
 * d is the definition of the xmlns / xmlns:prefix attribute (its
 * ns_attr_prefix is the declared prefix, empty for the default
 * namespace) and value is the attribute value, i.e. the namespace name
 * (empty to undeclare).
 *
 * Reports, via namespace_error(), an empty URI for a prefix before
 * XML 1.1 and any misuse of the reserved "xml"/"xmlns" prefixes and
 * namespace names.  If none of those reports makes require return, a
 * new binding is pushed on the front of p->xbit.ns_dict.
 *
 * Returns 0 on success, or the result of error()/require on failure.
 */
static int process_namespace(Parser p, AttributeDefinition d,const Char *value)
{
    NamespaceBinding nb;
    const Char *prefix;
    const Char *nsname;
    Namespace ns;

    static Char xmlns[] = {'x','m','l','n','s',0};
    static Char xml[] = {'x','m','l',0};

    int xml_prefix = 0, xmlns_prefix = 0;
    int xml_uri = 0, xmlns_uri = 0;

    /* Null prefix means the default namespace; null nsname means the
       declaration has an empty value */
    prefix = *d->ns_attr_prefix ? d->ns_attr_prefix : 0;
    nsname = *value == 0 ? 0 : value;

    if(prefix && !nsname && p->xml_version < XV_1_1)
    {
	require(namespace_error(p,
				"Namespace declaration for %S has empty URI",
				prefix));
    }
    if(prefix)
    {
	if(Strcmp(prefix, xml) == 0)
	    xml_prefix = 1;
	else if(Strcmp(prefix, xmlns) == 0)
	    xmlns_prefix = 1;
    }

    if(nsname)
    {
	if(Strcmp(nsname, xml_ns) == 0)
	    xml_uri = 1;
	else if(Strcmp(nsname, xmlns_ns) == 0)
	    xmlns_uri = 1;
    }

    if(xml_prefix && !xml_uri)
    {
	/* nsname is null here when the value is empty, so print value
	   (never null, and the same string whenever nsname is set) */
	require(namespace_error(p,
			    "Declaration of xml prefix has wrong URI \"%S\"",
				value));
    }

    if(xmlns_prefix)
    {
	require(namespace_error(p,
				"Declaration of xmlns prefix is not allowed"));
    }

    if(xml_uri && !xml_prefix)
    {
	/* prefix is null for a default-namespace declaration, so print
	   the attribute's own prefix string (empty in that case) */
	require(namespace_error(p, "Declaration of xml namespace with "
	        "prefix \"%S\" (must be \"xml\")", d->ns_attr_prefix));
    }

    if(xmlns_uri)
    {
	require(namespace_error(p,
			     "Declaration of xmlns namespace is not allowed"));
    }

    if(nsname)
    {
	if(!(ns = FindNamespace(p->dtd->namespace_universe, nsname, 1)))
	    return error(p, "System error");
    }
    else
	ns = 0;

    if(!(nb = Malloc(sizeof(*nb))))
	return error(p, "System error");

    /* Link the new binding in front of the current dictionary */
    nb->prefix = prefix;
    nb->namespace = ns;
    nb->parent = p->xbit.ns_dict;
    p->xbit.ns_dict = nb;

    return 0;
}
1975 
LookupNamespace(NamespaceBinding dictionary,const Char * prefix)1976 Namespace LookupNamespace(NamespaceBinding dictionary, const Char *prefix)
1977 {
1978     NamespaceBinding n;
1979 
1980     for(n=dictionary; n; n=n->parent)
1981     {
1982 	if(prefix == 0)
1983 	{
1984 	    if(n->prefix == 0)
1985 		return n->namespace;
1986 	}
1987 	else if(n->prefix && Strcmp(prefix, n->prefix) == 0)
1988 	    return n->namespace;
1989     }
1990 
1991     return 0;
1992 }
1993 
/*
 * Parse one attribute (name = value) of the start tag being read and
 * add it to the front of p->xbit.attributes.
 *
 * The attribute's definition is looked up on the element in
 * p->xbit.element_definition; an undeclared attribute is reported as the
 * parser flags require and then defined as an implied CDATA attribute.
 * A second occurrence of the same attribute is an error.  When
 * validating, the value is checked with validate_attribute().
 *
 * Returns 0 on success, or the result of error()/require on failure.
 */
static int parse_attribute(Parser p)
{
    InputSource s = p->source;
    ElementDefinition elt = p->xbit.element_definition;
    AttributeDefinition def;
    struct attribute *a;
    int c;
    int normalised = 0;
    static Char xmlns[] = {'x','m','l','n','s',0};

    require(parse_name(p, "for attribute"));
    maybe_uppercase_name(p);

    def = FindAttributeN(elt, p->name, p->namelen);
    if(!def)
    {
	if(p->have_dtd && ParserGetFlag(p, ErrorOnUndefinedAttributes))
	    return error(p, "Undeclared attribute %.*S for element %S",
			 p->namelen, p->name, elt->name);
	/* When validating, an undeclared attribute is a validity error
	   unless it is "xmlns" or "xmlns:..." and the
	   AllowUndeclaredNSAttributes flag is set */
	if(ParserGetFlag(p, Validate) &&
	   (elt->declared || elt->has_attlist) &&
	   !(ParserGetFlag(p, AllowUndeclaredNSAttributes) &&
	     p->namelen >= 5 && Strncmp(p->name, xmlns, 5) == 0 &&
	     (p->namelen == 5 || p->name[5] == ':')))
	{
	    require(validity_error(p,
				   "Undeclared attribute %.*S for element %S",
				   p->namelen, p->name, elt->name));
	}
	if(!(def = DefineAttributeN(elt, p->name, p->namelen,
				    AT_cdata, 0, DT_implied, 0, 0)))
	    return error(p, "System error");
	/* If the new definition is the element's xml:id attribute, give
	   it type ID */
	if(ParserGetFlag(p, XMLID) && elt->xml_id_attribute == def)
	    def->type = AT_id;
	if(ParserGetFlag(p, XMLNamespaces))
	{
	    require(check_qualname_syntax(p, def->name, "Attribute"));
	}
    }

    for(a = p->xbit.attributes; a; a = a->next)
	if(a->definition == def)
	    return error(p, "Repeated attribute %.*S", p->namelen, p->name);

    if(!(a = Malloc(sizeof(*a))))
	return error(p, "System error");

    /* Link the attribute into the list straight away, with a null value,
       so that it is reachable from p->xbit if parsing fails below */
    a->value = 0;		/* in case of error */
    a->next = p->xbit.attributes;
    p->xbit.attributes = a;
    a->definition = def;
    a->specified = 1;

    skip_whitespace(s);
    require(expect(p, '=', "after attribute name"));

    skip_whitespace(s);
    /* Peek at the first character of the value without consuming it */
    c = get(s);
    unget(s);
    switch(c)
    {
    /* NOTE(review): BADCHAR takes the quoted path, presumably so that
       parse_string reports the input error - confirm */
    case BADCHAR:
    case '"':
    case '\'':
	a->quoted = 1;
	require(parse_string(p, "in attribute value",
			     a->definition->type == AT_cdata ? LT_cdata_attr :
			                                       LT_tok_attr,
			     &normalised));
	a->value = p->pbuf;
	Consume(p->pbuf);
	break;
    default:
	if(ParserGetFlag(p, ErrorOnUnquotedAttributeValues))
	    return error(p, "Value of attribute is unquoted");
	a->quoted = 0;
	require(parse_nmtoken(p, "in unquoted attribute value"));
	CopyName(a->value);
	break;
    }

    if(ParserGetFlag(p, Validate))
    {
	if(p->standalone == SDD_yes && normalised &&
	   a->definition->is_externally_declared)
	{
	    require(validity_error(p, "Externally declared attribute %S for "
		   "element %S was normalised in document declared standalone",
				   a->definition->name, elt->name));
	}

	require(validate_attribute(p, a->definition, elt, a->value));
    }

    return 0;
}
2090 
process_xml_space(Parser p,const Char * value)2091 static WhiteSpaceMode process_xml_space(Parser p, const Char *value)
2092 {
2093     static Char _preserve[9] = {'p','r','e','s','e','r','v','e',0};
2094     static Char _default[8] = {'d','e','f','a','u','l','t',0};
2095     Char buf[9];
2096     const Char *v;
2097     int i;
2098 
2099     /* It's possible that it hasn't been normalised (sigh) */
2100 
2101     for(v=value; is_xml_whitespace(*v); v++)
2102 	;
2103     for(i=0; i<8; i++)
2104     {
2105 	if(!v[i] || is_xml_whitespace(v[i]))
2106 	    break;
2107 	buf[i] = v[i];
2108     }
2109     buf[i] = '\0';
2110     for(; v[i]; i++)
2111 	if(!is_xml_whitespace(v[i]))
2112 	    /* If you want validation, you know where to find it */
2113 	    return WSM_unspecified;
2114 
2115     if(Strcmp(v, _preserve) == 0)
2116 	return WSM_preserve;
2117     if(Strcmp(v, _default) == 0)
2118 	return WSM_default;
2119     return WSM_unspecified;
2120 }
2121 
/* Appends count characters from the current source line to p->pbuf.
   The characters copied start back characters before the source's
   current position (p->source->next).  p->pbufnext is advanced by
   count.  Returns 0 (ExpandBuf is a macro and may leave the function
   itself if it cannot grow the buffer - TODO confirm). */

static int transcribe(Parser p, int back, int count)
{
    const Char *from;
    Char *to;

    /* Make room before taking a pointer into the buffer */
    ExpandBuf(p->pbuf, p->pbufnext + count);

    from = p->source->line + p->source->next - back;
    to = p->pbuf + p->pbufnext;

    memcpy(to, from, count * sizeof(Char));
    p->pbufnext += count;

    return 0;
}
2131 
2132 /* Called after pushing back the first character of the pcdata */
2133 
parse_pcdata(Parser p)2134 static int parse_pcdata(Parser p)
2135 {
2136     int count = 0;
2137     int had_charref = 0;
2138     InputSource s;
2139     Char *buf;
2140     int next, buflen;
2141 
2142     if(p->state <= PS_prolog2)
2143 	return error(p, "Character data not allowed in prolog");
2144     if(p->state == PS_epilog)
2145 	return error(p, "Character data not allowed after body");
2146 
2147     s = p->source;
2148     buf = s->line;
2149     next = s->next;
2150     buflen = s->line_length;
2151 
2152     p->pbufnext = 0;
2153 
2154     while(1)
2155     {
2156 	if(next == buflen)
2157 	{
2158 	    s->next = next;
2159 	    if(count > 0)
2160 	    {
2161   	        ifNF16wrong(p,count,count)
2162  		    return error(p, "pcdata not normalized");
2163 		require(transcribe(p, count, count));
2164 	    }
2165 	    count = 0;
2166 	    if(at_eoe(s))
2167 	    {
2168 	        NF16StartCheck(p);
2169 		if(!ParserGetFlag(p, MergePCData))
2170 		    goto done;
2171 		else
2172 		    pop_while_at_eoe(p);
2173        	    }
2174 	    s = p->source;
2175 	    buf = s->line;
2176 	    next = s->next;
2177 	    buflen = s->line_length;
2178 	    if(next == buflen)
2179 		goto done;	/* must be EOF */
2180 	}
2181 
2182 	switch(buf[next++])
2183 	{
2184 	case BADCHAR:
2185 	    return error(p, "Input error: %s", s->error_msg);
2186 	case '<':
2187 	    if(!ParserGetFlag(p, XMLLessThan))
2188 	    {
2189 		/* In nSGML, don't recognise < as markup unless it looks ok */
2190 		if(next == buflen)
2191 		    goto deflt;
2192 		if(buf[next] != '!' && buf[next] != '/' && buf[next] != '?' &&
2193 		   !is_xml_namestart(buf[next], p->map))
2194 		    goto deflt;
2195 	    }
2196 	    s->next = next;
2197 	    if(count > 0)
2198 	    {
2199 	        ifNF16wrong(p,count+1,count)
2200 		    return error(p, "pcdata not normalized");
2201 		require(transcribe(p, count+1, count));
2202 	    }
2203 	    count = 0;
2204 	    if(!ParserGetFlag(p, ReturnComments) &&
2205 	       buflen >= next + 3 &&
2206 	       buf[next] == '!' && buf[next+1] == '-' && buf[next+2] == '-')
2207 	    {
2208 		s->next = next + 3;
2209 		require(parse_comment(p, 1, 0));
2210                 NF16StartCheck(p);
2211 		buflen = s->line_length;
2212 		next = s->next;
2213 		buf = s->line; 	/* thanks to robin@reportlab.com for this */
2214 	    }
2215 	    else
2216 	    {
2217 		s->next = next-1;
2218 		goto done;
2219 	    }
2220 	    break;
2221 	case '&':
2222 	    if(ParserGetFlag(p, IgnoreEntities))
2223 		goto deflt;
2224 	    if(!ParserGetFlag(p, MergePCData) &&
2225 	       (p->pbufnext > 0  || count > 0))
2226 	    {
2227 		/* We're returning references as separate bits, and we've
2228 		 come to one, and we've already got some data to return,
2229 		 so return what we've got and get the reference next time. */
2230 
2231 		s->next = next-1;
2232 		if(count > 0)
2233 		{
2234 		    ifNF16wrong(p,count,count)
2235 		        return error(p, "pcdata not normalized");
2236 		    require(transcribe(p, count, count));
2237 		}
2238 		goto done;
2239 	    }
2240 	    if(buflen >= next+1 && buf[next] == '#')
2241 	    {
2242 		/* It's a character reference */
2243 
2244 		had_charref = 1;
2245 		s->next = next+1;
2246 		if(count > 0)
2247 		{
2248 		    ifNF16wrong(p,count,count+2)
2249 		        return error(p,"pcdata not normalized");
2250 		    require(transcribe(p, count+2, count));
2251 		}
2252 		count = 0;
2253 		require(parse_character_reference(p,
2254 				   ParserGetFlag(p, ExpandCharacterEntities)));
2255 		NF16StartCheck(p);
2256 		next = s->next;
2257 
2258 		if(!ParserGetFlag(p, MergePCData))
2259 		    goto done;
2260 	    }
2261 	    else
2262 	    {
2263 		/* It's a general entity reference */
2264 
2265 		s->next = next;
2266 		if(count > 0)
2267 		{
2268 		    ifNF16wrong(p,count,count+1)
2269 		        return error(p, "pcdata not normalized");
2270 		    require(transcribe(p, count+1, count));
2271 		}
2272 		count = 0;
2273 		require(parse_reference(p, 0,
2274 				       ParserGetFlag(p, ExpandGeneralEntities),
2275 					1));
2276                 NF16StartCheck(p);
2277 		s = p->source;
2278 		buf = s->line;
2279 		buflen = s->line_length;
2280 		next = s->next;
2281 
2282 		if(!ParserGetFlag(p, MergePCData))
2283 		    goto done;
2284 	    }
2285 	    break;
2286 	case ']':
2287 	    if(ParserGetFlag(p, XMLMiscWFErrors) &&
2288 	       buflen >= next + 2 &&
2289 	       buf[next] == ']' && buf[next+1] == '>')
2290 		return error(p, "Illegal character sequence ']]>' in pcdata");
2291 	    /* fall through */
2292 	default:
2293 	deflt:
2294 	    count++;
2295 	    break;
2296 	}
2297     }
2298 
2299   done:
2300     ExpandBuf(p->pbuf, 0);	/* In case we got nothing */
2301     p->pbuf[p->pbufnext++] = 0;
2302     p->xbit.type = XBIT_pcdata;
2303     p->xbit.pcdata_chars = p->pbuf;
2304     Consume(p->pbuf);
2305     p->xbit.pcdata_ignorable_whitespace = 0;
2306 
2307     if(ParserGetFlag(p, Validate))
2308     {
2309 	ElementDefinition e = VectorLast(p->element_stack).definition;
2310 	if(e->type == CT_empty)
2311 	{
2312 	    require(validity_error(p, "PCDATA not allowed in EMPTY element %S",
2313 				   e->name));
2314 	}
2315 	else if(e->type == CT_element)
2316 	{
2317 	    Char *t;
2318 
2319 	    for(t = p->xbit.pcdata_chars; *t; t++)
2320 		if(!is_xml_whitespace(*t))
2321 		    break;
2322 	    if(*t)
2323 	    {
2324 		require(validity_error(p,
2325 				 "Content model for %S does not allow PCDATA",
2326 				       e->name));
2327 	    }
2328 	    else if(had_charref)
2329 	    {
2330 		/* E15 to 2nd edition */
2331 		require(validity_error(p,
2332 		     "Content model for %S does not allow character reference",
2333 				       e->name));
2334 	    }
2335 	    else
2336 	    {
2337 		p->xbit.pcdata_ignorable_whitespace = 1;
2338 		if(p->standalone == SDD_yes && e->is_externally_declared)
2339 		{
2340 		    require(validity_error(p, "Ignorable whitespace in "
2341 	    "externally declared element %S in document declared standalone",
2342 					   e->name));
2343 		}
2344 	    }
2345 	}
2346     }
2347 
2348     return 0;
2349 }
2350 
/*
 * Parse a comment.  Called after reading '<!--'.  Won't go over an entity
 * end.
 *
 *  p     the parser; characters are read from p->source.
 *  skip  non-zero: the comment text is discarded.  In that case this
 *        function does not itself write p->pbuf or p->xbit; only the
 *        checks are made.
 *        zero: the text is collected in p->pbuf (restarted from offset 0)
 *        and handed over in p->xbit as an XBIT_comment bit.
 *  ent   not referenced in this function.
 *
 * Returns 0 on success.  Failures return the value of error(), or leave
 * through require() (macro defined elsewhere; presumably it returns early
 * when its argument reports failure -- confirm).
 */

static int parse_comment(Parser p, int skip, Entity ent)
{
    InputSource s = p->source;
    int c, c1=0, c2=0;		/* c1 = previous char, c2 = the one before */
    int count = 0;		/* chars read since the last flush */
    NF16noStartCheck(p);

    /* When validating, report a comment whose enclosing element (top of
       the element stack) is declared EMPTY. */
    if(ParserGetFlag(p, Validate) && VectorCount(p->element_stack) > 0)
    {
	ElementDefinition parent = VectorLast(p->element_stack).definition;

	if(parent->type == CT_empty)
	{
	   require(validity_error(p, "Comment not allowed in EMPTY element %S",
				  parent->name));
	}
    }

    /* Only restart the output buffer if we are going to fill it */
    if(!skip)
	p->pbufnext = 0;

    while((c = get(s)) != XEOE)
    {
	if(c == BADCHAR)
	    return error(p, "Input error: %s", s->error_msg);

	count++;
	/* The two previous characters were "--": the only thing allowed
	   next is the '>' that closes the comment. */
	if(c1 == '-' && c2 == '-')
	{
	    if(c == '>')
		break;
	    unget(s);		/* For error position */
	    return error(p, "-- in comment");
	}

	/* At the end of a line, check (and unless skipping, copy) what has
	   been read so far and start counting afresh.
	   NOTE(review): presumably needed because the line buffer is
	   replaced when the next line is read -- confirm. */
	if(at_eol(s))
	{
	    ifNF16wrong(p,count,count)
                return error(p, "comment not normalized");
	    if(!skip)
	    {
		require(transcribe(p, count, count));
	    }
	    count = 0;
	}
	c2 = c1; c1 = c;
    }

    /* XXX comment going over PE end should be only a validity error,
       but we treat it as a WF error */

    if(c == XEOE)
	return error(p, "EOE in comment");

    /* The last three characters counted are the closing "-->", so they
       are left out of both the check and the copy (count-3). */
    ifNF16wrong(p,count,count-3)
        return error(p, "comment not normalized");
    NF16StartCheck(p);
    if(skip)
	return 0;

    require(transcribe(p, count, count-3));
    p->pbuf[p->pbufnext++] = 0;	/* nul-terminate the collected text */
    p->xbit.type = XBIT_comment;
    p->xbit.comment_chars = p->pbuf;
    /* NOTE(review): Consume presumably detaches the buffer from p->pbuf now
       that the xbit refers to it -- confirm against the macro. */
    Consume(p->pbuf);

    return 0;
}
2421 
/*
 * Parse a processing instruction, starting at its name (the name is read
 * with the error context "after <?", so the opening delimiter has
 * presumably been consumed by the caller -- confirm).
 *
 *  p    the parser; characters are read from p->source.
 *  ent  not referenced in this function.
 *
 * On success p->xbit is an XBIT_pi bit: pi_name holds a copy of the name
 * and pi_chars the nul-terminated content without the closing delimiter.
 * The closing delimiter is "?>" when the XMLSyntax flag is set and ">"
 * otherwise.
 *
 * Returns 0 on success, -1 if looking_at() put the parser in the error
 * state; other failures return the value of error() or leave through
 * require().
 */
static int parse_pi(Parser p, Entity ent)
{
    InputSource s = p->source;
    int c, c1=0;		/* c1 = previous character */
    int count = 0;		/* chars read since the last flush */
    Char xml[] = {'x', 'm', 'l', 0};

    /* When validating, report a PI whose enclosing element (top of the
       element stack) is declared EMPTY. */
    if(ParserGetFlag(p, Validate) && VectorCount(p->element_stack) > 0)
    {
	ElementDefinition parent = VectorLast(p->element_stack).definition;

	if(parent->type == CT_empty)
	{
	    require(validity_error(p, "PI not allowed in EMPTY element %S",
				   parent->name));
	}
    }

    require(parse_name(p, "after <?"));
    CopyName(p->xbit.pi_name);

    p->pbufnext = 0;
    NF16noStartCheck(p);

    /* A PI whose name is "xml" in any letter case reaching this function
       is reported as a misplaced XML declaration: an error under
       XMLStrictWFErrors, otherwise a warning unless IgnorePlacementErrors
       is set. */
    if(Strcasecmp(p->xbit.pi_name, xml) == 0)
    {
	if(ParserGetFlag(p, XMLStrictWFErrors))
	    return error(p, "Misplaced xml declaration");
	else if(!ParserGetFlag(p, IgnorePlacementErrors))
	    warn(p, "Misplaced xml declaration; treating as PI");
    }

    if(ParserGetFlag(p, XMLNamespaces) && Strchr(p->xbit.pi_name, ':'))
    {
	require(namespace_error(p, "PI name %S contains colon",
				p->xbit.pi_name));
    }

    /* Empty PI? */

    if(looking_at(p, ParserGetFlag(p, XMLSyntax) ? "?>" : ">"))
    {
	/* Nothing was transcribed; make sure there is a buffer to hold the
	   terminating nul written at "done" (same idiom as the pcdata code,
	   "in case we got nothing"). */
	ExpandBuf(p->pbuf, 0);
	goto done;
    }
    if(p->state == PS_error)	/* looking_at may have set it */
	return -1;

    /* If non-empty, must be white space after name */

    c = get(s);
    if(c == BADCHAR)
	return error(p, "Input error: %s", s->error_msg);
    if(c == XEOE || !is_xml_whitespace(c))
	return error(p, "Expected whitespace after PI name");
    skip_whitespace(s);

    while((c = get(s)) != XEOE)
    {
	if(c == BADCHAR)
	    return error(p, "Input error: %s", s->error_msg);
	count++;
	/* '>' ends the PI on its own in non-XML syntax; in XML syntax it
	   must be immediately preceded by '?'. */
	if(c == '>' &&
	   (!ParserGetFlag(p, XMLSyntax) || c1 == '?'))
	    break;
	/* At the end of a line, check and copy what has been read so far
	   and start counting afresh. */
	if(at_eol(s))
	{
	    ifNF16wrong(p,count,count)
                return error(p, "PI not normalized");
	    require(transcribe(p, count, count));
	    count = 0;
	}
	c1 = c;
    }

    /* XXX pi going over PE end should (perhaps?) only be a validity error,
       but we treat it as a WF error */

    if(c == XEOE)
	return error(p, "EOE in PI");

    /* Leave the closing delimiter (2 chars for "?>", 1 for ">") out of
       the check and the copy. */
    ifNF16wrong(p,count,count-(ParserGetFlag(p, XMLSyntax) ? 2 : 1))
        return error(p, "PI not normalized");
    require(transcribe(p, count, count-(ParserGetFlag(p, XMLSyntax) ? 2 : 1)));
done:
    p->pbuf[p->pbufnext++] = 0;	/* nul-terminate the content */
    p->xbit.type = XBIT_pi;
    p->xbit.pi_chars = p->pbuf;
    Consume(p->pbuf);

    NF16StartCheck(p);
    return 0;
}
2515 
/*
 * Parse a quoted literal and leave its nul-terminated value in p->pbuf.
 *
 *  p           the parser; reading starts at the opening quote in
 *              p->source.
 *  where       text inserted into error messages to say what was being
 *              parsed.
 *  type        the kind of literal.  It decides:
 *                - whether CR/LF/TAB are turned into spaces (LT_pubid,
 *                  except for TAB; LT_cdata_attr and LT_tok_attr when the
 *                  NormaliseAttributeValues flag is set);
 *                - whether '<' is an error (attribute types, when
 *                  XMLMiscWFErrors is set);
 *                - whether '%' starts a parameter entity reference
 *                  (LT_entity and LT_param_entity only);
 *                - whether '&' starts a reference (every type except
 *                  LT_plain and LT_pubid, unless IgnoreEntities is set);
 *                - whether the result is whitespace-compressed at the end
 *                  (LT_pubid always, LT_tok_attr with
 *                  NormaliseAttributeValues).
 *  normalised  if non-null, receives 1 when the final compression pass
 *              removed any space and 0 otherwise.  Replacing CR/LF/TAB by
 *              a space does not by itself set it.
 *
 * The literal ends at a matching quote seen while p->source is the source
 * the literal started in; a quote character met inside an expanded entity
 * is ordinary data.
 *
 * Returns 0 on success, -1 if looking_at() put the parser in the error
 * state; other failures return the value of error() or leave through
 * require().
 */
static int parse_string(Parser p, const char8 *where, enum literal_type type, int *normalised)
{
    int c, quote;
    int count = 0;		/* chars read since the last flush */
    InputSource start_source, s;
    int changed = 0;		/* set by the compression pass below */

    /* entities cannot start with combiner, other things can */
    if (type==LT_param_entity||type==LT_entity) {
        NF16StartCheck(p);
    }
    else {
        NF16noStartCheck(p);
    }

    s = start_source = p->source;

    quote = get(s);
    if(quote == BADCHAR)
	return error(p, "Input error: %s", s->error_msg);
    if(quote != '\'' && quote != '"')
    {
	unget(s);		/* For error position */
	return error(p, "Expected quoted string %s, but got %s",
		     where, escape(quote, p->escbuf[0]));
    }

    p->pbufnext = 0;

    while(1)
    {
	switch(c = get(s))
	{
	case BADCHAR:
	    return error(p, "Input error: %s", s->error_msg);

	case '\r':
	case '\n':
	case '\t':
	    /* Unless this literal type gets whitespace normalisation, the
	       character is ordinary data. */
	    if(!((type == LT_pubid && c != '\t') || /* no tab in pubid */
		 ((type == LT_cdata_attr || type == LT_tok_attr) &&
		  ParserGetFlag(p, NormaliseAttributeValues))))
	    {
		count++;
		break;
	    }
	    /* Flush the data before the whitespace character (count+1 back
	       from the read position, which is already past it), then
	       store a single space in its place. */
	    if(count > 0)
	    {
 	        ifNF16wrong(p,count+1,count)
		    return error(p, "not normalized: %s", where);
		require(transcribe(p, count+1, count));
	    }
	    count = 0;
	    ExpandBuf(p->pbuf, p->pbufnext+1);
	    p->pbuf[p->pbufnext++] = ' ';
            NF16noStartCheck(p); /* space resets normalization checking */
	    break;

	case '<':
	    if((type == LT_tok_attr || type == LT_cdata_attr) &&
	       ParserGetFlag(p, XMLMiscWFErrors))
		return error(p, "Illegal character '<' %s", where);
	    count++;
	    break;

	case XEOE:
	    /* End of the current entity.  In the source the literal began
	       in, that means the closing quote is missing; otherwise an
	       entity opened inside the literal has finished, so flush, pop
	       it and carry on in the source underneath. */
	    if(s == start_source)
		return error(p, "Quoted string goes past entity end");
	    if(count > 0)
	    {
 	        ifNF16wrong(p,count,count)
		    return error(p, "not normalized: %s", where);
		require(transcribe(p, count, count));
	    }
	    count = 0;
	    ParserPop(p);
	    s = p->source;
	    break;

	case '%':
	    /* Only entity values recognise parameter entity references */
	    if(!(type == LT_entity || type == LT_param_entity))
	    {
		count++;
		break;
	    }
	    if(count > 0)
	    {
 	        ifNF16wrong(p,count+1,count)
		    return error(p, "not normalized: %s", where);
		require(transcribe(p, count+1, count));
	    }
	    count = 0;
	    if(p->external_pe_depth == 0)
	    {
		unget(s);	/* For error position */
		return error(p, "PE ref not allowed here in internal subset");
	    }
	    require(parse_reference(p, 1, 1, 1));
	    s = p->source;	/* the reference may have changed the source */
	    break;

	case '&':
	    if(ParserGetFlag(p, IgnoreEntities))
		goto deflt;
	    /* Plain and public-id literals keep '&' as data */
	    if(type == LT_plain || type == LT_pubid)
	    {
		count++;
		break;
	    }

	    if(count > 0)
	    {
 	        ifNF16wrong(p,count+1,count)
		    return error(p, "not normalized: %s", where);
		require(transcribe(p, count+1, count));
	    }
	    count = 0;
	    if(looking_at(p, "#"))
		/* We *must* expand character references in parameter
		   entity definitions otherwise the result when it is
		   used may be syntactically incorrect. */
	    {
		require(parse_character_reference(p,
				 type == LT_param_entity ||
				 ParserGetFlag(p, ExpandCharacterEntities)));
	    }
	    else
	    {
		if(p->state == PS_error) /* looking_at may have set it */
		    return -1;
		/* General entity references are never expanded inside
		   entity values; elsewhere it depends on the
		   ExpandGeneralEntities flag. */
		require(parse_reference(p, 0,
			    !(type == LT_entity || type == LT_param_entity) &&
			    ParserGetFlag(p, ExpandGeneralEntities),
					!ParserGetFlag(p, XMLMiscWFErrors)));
		s = p->source;
	    }
	    break;

	default:
	deflt:
	    /* The closing quote only counts in the source we started in */
	    if(c == quote && p->source == start_source)
		goto done;
	    count++;
	}

	/* At the end of a line, check and copy what has been read so far
	   and start counting afresh. */
	if(at_eol(s) && count > 0)
	{
 	    ifNF16wrong(p,count,count)
		return error(p, "not normalized: %s", where);
	    require(transcribe(p, count, count));
	    count = 0;
	}
    }

done:
    /* The closing quote has been read, so the pending data starts count+1
       characters back.  With nothing pending, just make room for the
       terminating nul. */
    if(count > 0)
    {
 	ifNF16wrong(p,count+1,count)
	    return error(p, "not normalized: %s", where);
	require(transcribe(p, count+1, count));
    }
    else
	ExpandBuf(p->pbuf, p->pbufnext+1);
    p->pbuf[p->pbufnext++] = 0;

    /* In-place whitespace compression: strip leading and trailing spaces
       and collapse runs of spaces to one.  "new" is the write position,
       "old" the read position; new never overtakes old. */
    if((ParserGetFlag(p, NormaliseAttributeValues) && type == LT_tok_attr) ||
       type == LT_pubid)
    {
	Char *old, *new;

	new = old = p->pbuf;

	/* Skip leading whitespace */

	while(*old == ' ')
	{
	    changed = 1;
	    old++;
	}

	/* Compress whitespace */

	for( ; *old; old++)
	{
	    if(*old == ' ')
	    {
		/* NB can't be at start because we skipped whitespace */
		if(new[-1] == ' ')
		    changed = 1;
		else
		    *new++ = ' ';
	    }
	    else
		*new++ = *old;
	}

	/* Trim trailing space (only one possible because we compressed) */

	if(new > p->pbuf && new[-1] == ' ')
	{
	    changed = 1;
	    new--;
	}

	*new = 0;
    }

    if(normalised)
	*normalised = changed;

    return 0;
}
2728 
/*
 * Parse a document type declaration, starting at the document type name.
 *
 * Reads the name, an optional external identifier and an optional
 * internal subset in "[...]", then the closing '>'.  If this is the
 * first DOCTYPE seen in the prolog (state PS_prolog1) and no DTD name has
 * been set already, the name and the internal/external parts are stored
 * in p->dtd and, depending on the TrustSDD, ProcessDTD and Validate flags
 * and the standalone declaration, the parts are parsed with ParseDtd().
 * On that path p->xbit is set to an XBIT_dtd bit carrying the position
 * that p->xbit had on entry.
 *
 * A misplaced or repeated declaration, or one met when p->dtd->name is
 * already set, is discarded and the result of parse(p) is returned
 * instead.
 *
 * Ownership: the copied name and the two entity objects belong to this
 * function until they are stored in p->dtd.  Every exit taken before that
 * point releases them.  The failure exits that used to return straight
 * out of the function (leaking them) now go through the "fail" label.
 *
 * Returns 0 on success, -1 (or the value of error()) on failure.
 */
static int parse_dtd(Parser p)
{
    InputSource s = p->source;
    Entity parent = s->entity;
    Entity internal_part = 0, external_part = 0;
    Char *name;
    char8 *publicid = 0, *systemid = 0;
    struct xbit xbit;

    xbit = p->xbit;		/* copy start position */
    xbit.type = XBIT_dtd;

    require(parse_name(p, "for name in dtd"));
    CopyName(name);
    maybe_uppercase(p, name);

    /* From here on "name" is allocated, so failures must not return
       directly: they go to "fail", which frees whatever we hold. */

    if(ParserGetFlag(p, XMLNamespaces))
    {
	if(check_qualname_syntax(p, name, "Doctype") < 0)
	    goto fail;
    }

    skip_whitespace(s);

    if(parse_external_id(p, 0, &publicid, &systemid,
			 ParserGetFlag(p, XMLExternalIDs),
			 ParserGetFlag(p, XMLExternalIDs)) < 0)
	goto fail;

    if(systemid || publicid)
    {
	external_part = NewExternalEntityN(0,0, publicid, systemid, 0, parent);
	if(!external_part)
	{
	    Free(name);
	    return error(p, "System error");
	}
	skip_whitespace(s);
    }

    if(looking_at(p, "["))
    {
	/* Remember where the internal subset starts in the document */
	int line = s->line_number, cpos = s->next;

	if(read_markupdecls(p) < 0)
	    goto fail;
	skip_whitespace(s);
	/* The subset text collected in p->pbuf becomes an internal entity */
	internal_part = NewInternalEntity(0, p->pbuf, parent, line, cpos, 1);
	Consume(p->pbuf);
	if(!internal_part)
	{
	    Free(name);
	    FreeEntity(external_part);
	    return error(p, "System error");
	}
	internal_part->is_internal_subset = 1;
    }
    if(p->state == PS_error)	/* looking_at may have set it */
	goto fail;

    if(expect(p, '>', "at end of dtd") < 0)
	goto fail;

    if(p->state == PS_prolog1)
	p->state = PS_prolog2;
    else
    {
	Free(name);
	FreeEntity(external_part);
	FreeEntity(internal_part);

	if(ParserGetFlag(p, XMLStrictWFErrors))
	    return error(p, "Misplaced or repeated DOCTYPE declaration");
	else if(!ParserGetFlag(p, IgnorePlacementErrors))
	    warn(p, "Misplaced or repeated DOCTYPE declaration");

	/* Ignore it and return the next bit */
	return parse(p);
    }

    if(p->dtd->name)
    {
	Free(name);
	FreeEntity(external_part);
	FreeEntity(internal_part);

	/* This happens if we manually set the dtd */
	return parse(p);
    }

    /* Hand everything over to the DTD; nothing is freed here after this */
    p->dtd->name = name;
    p->dtd->internal_part = internal_part;
    p->dtd->external_part = external_part;

    if(internal_part)
    {
	if(ParserGetFlag(p, TrustSDD) || ParserGetFlag(p, ProcessDTD))
	{
	    ParseDtd(p, internal_part);
	    if(p->xbit.type == XBIT_error)
		return -1;
	}
    }

    if(external_part)
    {
	/* With TrustSDD the external subset is read when validating or
	   when the document is not declared standalone; without it, when
	   ProcessDTD is set. */
	if((ParserGetFlag(p, TrustSDD) &&
	    (ParserGetFlag(p, Validate) || p->standalone != SDD_yes)) ||
	   (!ParserGetFlag(p, TrustSDD) &&
	    ParserGetFlag(p, ProcessDTD)))
	{
	    ParseDtd(p, external_part);
	    if(p->xbit.type == XBIT_error)
		return -1;
	}
    }

    p->xbit = xbit;
    return 0;

  fail:
    /* An error has already been recorded by the failing call.  The entity
       pointers may still be null here; FreeEntity is called with possibly
       null pointers on the other cleanup paths of this function too. */
    Free(name);
    FreeEntity(external_part);
    FreeEntity(internal_part);
    return -1;
}
2845 
/*
 * Copy the text of a bracketed group of markup declarations into p->pbuf.
 *
 * Reading starts just inside a '[' (the nesting depth starts at 1) and
 * stops at the ']' that brings the depth back to zero.  That final ']' is
 * consumed but not copied, and the copied text is nul-terminated.
 * Brackets are not counted inside quoted strings ("..." or '...') nor
 * between a run of two hyphens and the next run of two hyphens.
 *
 * Nothing is interpreted here; the text is only collected.
 *
 * Returns 0 on success.  Failures return the value of error() or leave
 * through require().
 */
static int read_markupdecls(Parser p)
{
    InputSource s = p->source;
    int depth=1;		/* bracket nesting; we are inside one '[' */
    int c, d, hyphens=0;	/* hyphens = length of current run of '-' */
    int count = 0;		/* chars read since the last flush */

    p->pbufnext = 0;

    while(1)
    {
	c = get(s);
	if(c == BADCHAR)
	    return error(p, "Input error: %s", s->error_msg);
	if(c == XEOE)
	    return error(p, "EOE in DTD");
	if(c == '-')
	    hyphens++;
	else
	    hyphens = 0;

	count++;

	switch(c)
	{
	case ']':
	    if(--depth == 0)
	    {
		count--;	/* We don't want the final ']' */
		/* The ']' has been read, so the data to keep starts
		   count+1 characters back and is count long. */
		require(transcribe(p, count+1, count));
		p->pbuf[p->pbufnext++] = 0;
		return 0;
	    }
	    break;

	case '[':
	    depth++;
	    break;

	case '"':
	case '\'':
	    /* Quoted string: copy through to the matching quote (the same
	       character as the opening one) without looking at brackets
	       or hyphens. */
	    while((d = get(s)) != XEOE)
	    {
		if(d == BADCHAR)
		    return error(p, "Input error: %s", s->error_msg);
		count++;
		if(at_eol(s))
		{
		    require(transcribe(p, count, count));
		    count = 0;
		}
		if(d == c)
		    break;
	    }
	    if(d == XEOE)
		return error(p, "EOE in DTD");
	    break;

	case '-':
	    if(hyphens < 2)
		break;
	    /* Two hyphens in a row: copy through to the next two hyphens
	       in a row without looking at brackets or quotes. */
	    hyphens = 0;
	    while((d = get(s)) != XEOE)
	    {
		if(d == BADCHAR)
		    return error(p, "Input error: %s", s->error_msg);
		count++;
		if(at_eol(s))
		{
		    require(transcribe(p, count, count));
		    count = 0;
		}
		if(d == '-')
		    hyphens++;
		else
		    hyphens = 0;
		if(hyphens == 2)
		    break;
	    }
	    if(d == XEOE)
		return error(p, "EOE in DTD");
	    hyphens = 0;
	    break;

	default:
	    break;
	}

	/* At the end of a line, copy what has been read so far and start
	   counting afresh. */
	if(at_eol(s) && count > 0)
	{
	    require(transcribe(p, count, count));
	    count = 0;
	}
    }
}
2941 
/*
 * Handle the rest of an nSGML declaration, whose syntax is
 *
 *     <?NSL DDB unquoted-filename 0>
 *
 * Reading starts at "DDB".  The current entity is marked ML_nsl, a UTF-8
 * encoding is downgraded to "unspecified ascii superset", and the
 * filename is stored (as a char8 copy) in the entity's ddb_filename.
 *
 * Returns 0 on success, -1 if looking_at() put the parser in the error
 * state; other failures return the value of error() or leave through
 * require().
 */
static int process_nsl_decl(Parser p)
{
    InputSource src = p->source;
    int ch;
    int namelen = 0;		/* length of the unquoted filename */

    src->entity->ml_decl = ML_nsl;

    /* The default character encoding for nSGML files is ascii-ash */
    if(src->entity->encoding == CE_UTF_8)
	src->entity->encoding = CE_unspecified_ascii_superset;

    if(!looking_at(p, "DDB "))
    {
	/* looking_at may have set the error state itself */
	return (p->state == PS_error)
	    ? -1
	    : error(p, "Expected \"DDB\" in NSL declaration");
    }

    /* Measure the filename: everything up to the next whitespace.  The
       whitespace test comes first, exactly as in the loop condition it
       replaces. */
    for(;;)
    {
	ch = get(src);
	if(is_xml_whitespace(ch))
	    break;

	if(ch == BADCHAR)
	    return error(p, "Input error: %s", src->error_msg);
	if(ch == XEOE)
	    return error(p, "EOE in NSL declaration");
	if(ch == '>')
	    return error(p, "Syntax error in NSL declaration");

	namelen++;
    }

    /* The terminating whitespace has been read too, so the filename
       starts namelen+1 characters back. */
    p->pbufnext = 0;
    require(transcribe(p, namelen+1, namelen));
    p->pbuf[p->pbufnext++] = 0;

    skip_whitespace(src);
    if(!looking_at(p, "0>"))
    {
	return (p->state == PS_error)
	    ? -1
	    : error(p, "Expected \"0>\" at end of NSL declaration");
    }

    src->entity->ddb_filename = duptochar8(p->pbuf);
    if(!src->entity->ddb_filename)
	return error(p, "System error");

    return 0;
}
2995 
/*
 * Process the pseudo-attributes of an XML (or text) declaration, up to
 * and including the closing "?>".
 * NOTE(review): the opening "<?xml" is presumably consumed by the caller;
 * confirm at the call site.
 *
 * Recognised attributes are "version", "encoding" and "standalone".  They
 * are expected in that order without repetition; a violation is an error
 * under XMLStrictWFErrors and a warning otherwise.  Results are recorded
 * in the current entity (ml_decl, version_decl, xml_version,
 * encoding_decl, standalone_decl and, except in 8-bit builds, encoding)
 * and in p->standalone.
 *
 * The parser's string buffer is set aside in p->save_pbuf* while the
 * attribute values are parsed and put back before a successful return.
 * NOTE(review): the error returns inside the loop leave without putting
 * it back, so the original buffer stays in p->save_pbuf; confirm that
 * whoever tears the parser down releases it.
 *
 * Returns 0 on success, -1 if looking_at() put the parser in the error
 * state; other failures return the value of error() or leave through
 * require().
 */
static int process_xml_decl(Parser p)
{
    InputSource s = p->source;
    /* Declaration order of the enumerators is the required order of the
       attributes; see the "which <= last" test below. */
    enum {None, V, E, S} which, last = None;
    Char *Value, *cp;
    char8 *value;
    CharacterEncoding enc = CE_unknown;
    /* NOTE(review): c is a Char here, whereas the other readers in this
       file keep the result of get() in an int; an out-of-range return
       such as XEOE would not survive the narrowing.  It ends up in the
       "Expected whitespace" error below -- confirm this is intended. */
    Char c;

    /*
     * If we are reading an external entity, should the XML declaration
     * (actually "text declaration") be included as part of the replacement
     * text?  The standard does not as far as I can see define the
     * replacement text of an external entity, but it says "a parsed
     * entity's contents are referred to as its replacement text" and
     * the production for extParsedEnt is "TextDecl? content".  If the
     * "contents" of the entity are identified with "content" in the
     * production, then clearly the text declaration is not part of the
     * replacement text.  On the other hand, this would be an inconsistency
     * between XML and SGML (which regards the declaration as just another
     * processing instruction), and there aren't any of those.
     *
     * It seems quite reasonable to want to put an encoding declaration
     * on an external entity containing only PCDATA, and this would be
     * illegal if the text declaration were inserted.  Furthermore, it's
     * way too much trouble to save the text declaration as well as parse it.
     */

    s->entity->ml_decl = ML_xml;

    /* Save the string buffer because it may already be in use */
    p->save_pbuf = p->pbuf;
    p->save_pbufsize = p->pbufsize;
    p->save_pbufnext = p->pbufnext;
    Consume(p->pbuf);

    /* One iteration per attribute, until the closing "?>" */
    while(!looking_at(p, "?>"))
    {
	if(looking_at(p, "version"))
	    which = V;
	else if(looking_at(p, "encoding"))
	    which = E;
	else if(looking_at(p, "standalone"))
	    which = S;
	else if(p->state == PS_error)	/* looking_at may have set it */
	    return -1;
	else
	    return error(p, "Expected \"version\", \"encoding\" or "
			 "\"standalone\" in XML declaration");

	/* Attributes must come in strictly increasing enum order */
	if(which <= last)
	{
	    if(ParserGetFlag(p, XMLStrictWFErrors))
		return error(p, "Repeated or misordered attributes "
			        "in XML declaration");
	    warn(p, "Repeated or misordered attributes in XML declaration");
	}
	last = which;

	skip_whitespace(s);
	require(expect(p, '=', "after attribute name in XML declaration"));
	skip_whitespace(s);

	/* The value is parsed as a plain literal into p->pbuf */
	require(parse_string(p, "for attribute value in XML declaration",
			     LT_plain, 0));

	maybe_uppercase(p, p->pbuf);
	Value = p->pbuf;

	if(which == E)
	{
	    /* Encoding name: an ASCII letter followed by ASCII letters,
	       digits, '.', '_' or '-' */
	    if(!is_ascii_alpha(Value[0]))
		return error(p, "Encoding name does not begin with letter");
	    for(cp=Value+1; *cp; cp++)
		if(!is_ascii_alpha(*cp) && !is_ascii_digit(*cp) &&
		   *cp != '.' && *cp != '_' && *cp != '-')
		    return error(p, "Illegal character %s in encoding name",
				 escape(*cp, p->escbuf[0]));

	    value = tochar8(Value);

	    enc = FindEncoding(value);
	    if(enc == CE_unknown)
		return error(p, "Unknown declared encoding %s", value);

	    /* The declared encoding must be compatible with the one the
	       entity is actually being read with.  EncodingsCompatible
	       may replace enc through its third argument. */
	    if(EncodingsCompatible(p->source->entity->encoding, enc, &enc))
	    {
#if CHAR_SIZE == 8
		/* We ignore the declared encoding in 8-bit mode,
		   and treat it as a random ascii superset. */
#else
		p->source->entity->encoding = enc;
#endif
	    }
	    else
		return error(p, "Declared encoding %s is incompatible with %s "
			        "which was used to read it",
		     CharacterEncodingName[enc],
		     CharacterEncodingName[p->source->entity->encoding]);

	    s->entity->encoding_decl = enc;
	}

	if(which == S)
	{
	    value = tochar8(Value);

	    if(str_maybecase_cmp8(p, value, "no") == 0)
		p->standalone = SDD_no;
	    else if(str_maybecase_cmp8(p, value, "yes") == 0)
		p->standalone = SDD_yes;
	    else
		return error(p, "Expected \"yes\" or \"no\" "
			        "for standalone in XML declaration");

	    s->entity->standalone_decl = p->standalone;
	}

	if(which == V)
	{
	    /* Version value: ASCII letters, digits, '.', '_', '-', ':' */
	    for(cp=Value; *cp; cp++)
		if(!is_ascii_alpha(*cp) && !is_ascii_digit(*cp) &&
		   *cp != '.' && *cp != '_' && *cp != '-' && *cp != ':')
		    return error(p, "Illegal character %s in version number",
				 escape(*cp, p->escbuf[0]));

	    /* Only the first version declaration seen for the entity is
	       recorded */
	    if(!s->entity->version_decl)
	    {
		if(!(s->entity->version_decl = duptochar8(Value)))
		    return error(p, "System error");

		/* "1.0" and "1.1" map to themselves.  Any other 1.x is
		   parsed as 1.0 unless Pre105VersionCheck is set.
		   Everything else is an error under XMLStrictWFErrors,
		   and otherwise parsed as 1.1 with a warning. */
		if(strcmp8(s->entity->version_decl, "1.0") == 0)
		    s->entity->xml_version = XV_1_0;
		else if(strcmp8(s->entity->version_decl, "1.1") == 0)
		    s->entity->xml_version = XV_1_1;
		else if(!ParserGetFlag(p, Pre105VersionCheck) && is_v1x(s->entity->version_decl))
		    s->entity->xml_version = XV_1_0;
		else
		{
		    if(ParserGetFlag(p, XMLStrictWFErrors))
			return error(p, "Version number \"%s\" not supported",
				     s->entity->version_decl);
		    warn(p, "Version number \"%s\" not supported, "
			    "parsing as XML 1.1",
			 s->entity->version_decl);
		    s->entity->xml_version = XV_1_1;
		}
	    }
	}

	/* After a value: either whitespace, or the '?' of the closing
	   "?>", which is pushed back for the loop condition to match. */
	c = get(s);
	if(c == BADCHAR)
	    return error(p, "Input error: %s", s->error_msg);
	if(c == '?')
	    unget(s);
	else if(!is_xml_whitespace(c))
	    return error(p, "Expected whitespace or \"?>\" after attribute "
			    "in XML declaration");
	skip_whitespace(s);
    }

    /* Restore the string buffer */
    Free(p->pbuf);
    p->pbuf = p->save_pbuf;
    p->pbufsize = p->save_pbufsize;
    p->pbufnext = p->save_pbufnext;
    Consume(p->save_pbuf);

    return 0;
}
3166 
/*
 * Test whether a version string has the form "1." followed by one or
 * more decimal digits and nothing else.
 *
 *  version  nul-terminated version string.
 *
 * Returns 1 if it has that form, 0 otherwise.
 */
static int is_v1x(const char *version)
{
    const char *digit;

    /* Fixed prefix "1."; the second test is only reached when the first
       character matched, so a shorter string is never read past its nul. */
    if(version[0] != '1')
	return 0;
    if(version[1] != '.')
	return 0;

    /* At least one character must follow the dot ... */
    digit = version + 2;
    if(*digit == '\0')
	return 0;

    /* ... and every one of them must be a decimal digit */
    while(*digit != '\0')
    {
	if(*digit < '0' || *digit > '9')
	    return 0;
	digit++;
    }

    return 1;
}
3182 
/*
 * Parse the content of a CDATA section up to and including the closing
 * "]]>".
 * NOTE(review): the opening delimiter is presumably consumed by the
 * caller; confirm at the call site.
 *
 * CDATA sections are rejected in the prolog and after the document body.
 * When validating, one that appears in an element whose content type is
 * neither mixed nor ANY is reported as a validity error.
 *
 * On success p->xbit is an XBIT_cdsect bit whose cdsect_chars is the
 * nul-terminated content without the closing "]]>".
 *
 * Returns 0 on success.  Failures return the value of error() or leave
 * through require().
 */
static int parse_cdata(Parser p)
{
    InputSource s = p->source;
    int c, c1=0, c2=0;		/* c1 = previous char, c2 = the one before */
    int count = 0;		/* chars read since the last flush */
    NF16StartCheck(p);

    if(p->state <= PS_prolog2)
	return error(p, "CDATA section not allowed in prolog");
    if(p->state == PS_epilog)
	return error(p, "CDATA section not allowed after body");
    if(ParserGetFlag(p, Validate))
    {
	ElementDefinition e = VectorLast(p->element_stack).definition;
	if(!(e->type == CT_mixed || e->type == CT_any))
	{
	    require(validity_error(p, "CDATA section not allowed here"));
	    /* NOTE(review): clearing the context presumably stops further
	       content-model checking of this element -- confirm. */
	    VectorLast(p->element_stack).context = 0;
	}
    }

    p->pbufnext = 0;

    while((c = get(s)) != XEOE)
    {
	if(c == BADCHAR)
	    return error(p, "Input error: %s", s->error_msg);
	count++;
	/* "]]" followed by '>' closes the section */
	if(c == '>' && c1 == ']' && c2 == ']')
	    break;
	/* At the end of a line, check and copy what has been read so far
	   and start counting afresh. */
	if(at_eol(s))
	{
            ifNF16wrong(p,count,count)
                return error(p, "CDATA section not normalized");
	    require(transcribe(p, count, count));
	    count = 0;
	}
	c2 = c1; c1 = c;
    }

    if(c == XEOE)
	return error(p, "EOE in CDATA section");

    /* NOTE(review): this check is passed count as its last argument while
       the copy below uses count-3 to drop the closing "]]>";
       parse_comment passes count-3 to both.  Confirm whether including
       the delimiter in the check is intended. */
    ifNF16wrong(p,count,count)
        return error(p, "CDATA section not normalized");
    require(transcribe(p, count, count-3));
    p->pbuf[p->pbufnext++] = 0;	/* nul-terminate the content */
    p->xbit.type = XBIT_cdsect;
    p->xbit.cdsect_chars = p->pbuf;
    Consume(p->pbuf);

    NF16StartCheck(p);
    return 0;
}
3237 
ParseDtd(Parser p,Entity e)3238 XBit ParseDtd(Parser p, Entity e)
3239 {
3240     InputSource source, save;
3241 
3242     if(e->type == ET_external && p->entity_opener)
3243 	source = p->entity_opener(e, p->entity_opener_arg);
3244     else
3245 	source = EntityOpen(e);
3246     if(!source)
3247     {
3248 	error(p, "Couldn't open dtd entity %s", EntityDescription(e));
3249 	return &p->xbit;
3250     }
3251 
3252     save = p->source;
3253     p->source = 0;
3254     if(ParserPush(p, source) == -1)
3255 	return &p->xbit;
3256 
3257     p->have_dtd = 1;
3258 
3259     p->external_pe_depth = (source->entity->type == ET_external);
3260 
3261     while(parse_markupdecl(p) == 0)
3262 	;
3263 
3264     p->external_pe_depth = 0;
3265 
3266     /* don't restore after error, so user can call ParserPerror */
3267     if(p->xbit.type != XBIT_error)
3268     {
3269 	ParserPop(p);		/* to free the input source */
3270 	p->source = save;
3271     }
3272 
3273     return &p->xbit;
3274 }
3275 
3276 /*
3277  * Returns 0 normally, -1 if error, 1 at EOF.
3278  */
parse_markupdecl(Parser p)3279 static int parse_markupdecl(Parser p)
3280 {
3281     InputSource s, t;
3282     int c;
3283     int cur_line, cur_char;
3284     Entity cur_ent, cur_ext_ent = 0;
3285 
3286     if(p->state == PS_error)
3287 	return error(p, "Attempt to continue reading DTD after error");
3288 
3289     clear_xbit(&p->xbit);
3290 
3291     require(skip_dtd_whitespace(p, 1));	/* allow PE even in internal subset */
3292     s = p->source;
3293     SourcePosition(s, &p->xbit.entity, &p->xbit.byte_offset);
3294 
3295     cur_ent = s->entity;
3296     cur_line = s->line_number;
3297     cur_char = s->next;
3298 
3299     /* Find the current *external* entity, to use as base URI for system
3300        identifiers */
3301 
3302     for(t = s; t; t = t->parent)
3303 	if(t->entity->type == ET_external)
3304 	{
3305 	    cur_ext_ent = t->entity;
3306 	    break;
3307 	}
3308     if(!cur_ext_ent)
3309 	cur_ext_ent = p->document_entity;
3310 
3311     c = get(s);
3312     switch(c)
3313     {
3314     case BADCHAR:
3315 	return error(p, "Input error: %s", s->error_msg);
3316     case XEOE:
3317 	p->xbit.type = XBIT_none;
3318 	return 1;
3319     case '<':
3320 	if(looking_at(p, "!ELEMENT"))
3321 	{
3322 	    require(expect_dtd_whitespace(p, "after ELEMENT"));
3323 	    return parse_element_decl(p, cur_ent);
3324 	}
3325 	else if(looking_at(p, "!ATTLIST"))
3326 	{
3327 	    require(expect_dtd_whitespace(p, "after ATTLIST"));
3328 	    return parse_attlist_decl(p, cur_ent);
3329 	}
3330 	else if(looking_at(p, "!ENTITY"))
3331 	{
3332 	    require(expect_dtd_whitespace(p, "after ENTITY"));
3333 	    return parse_entity_decl(p, cur_ent, cur_line, cur_char,
3334 				     cur_ext_ent);
3335 	}
3336 	else if(looking_at(p, "!NOTATION"))
3337 	{
3338 	    require(expect_dtd_whitespace(p, "after NOTATION"));
3339 	    return parse_notation_decl(p, cur_ent);
3340 	}
3341 	else if(looking_at(p, "!["))
3342 	    return parse_conditional(p, cur_ent);
3343 	else if(looking_at(p, "?"))
3344 	{
3345 	    require(parse_pi(p, cur_ent));
3346 	    if(p->dtd_callback)
3347 		p->dtd_callback(&p->xbit, p->dtd_callback_arg);
3348 	    else
3349 		FreeXBit(&p->xbit);
3350 	    return 0;
3351 	}
3352 	else if(looking_at(p, "!--"))
3353 	{
3354 	    if(ParserGetFlag(p, ReturnComments))
3355 	    {
3356 		require(parse_comment(p, 0, cur_ent));
3357 		if(p->dtd_callback)
3358 		    p->dtd_callback(&p->xbit, p->dtd_callback_arg);
3359 		else
3360 		    FreeXBit(&p->xbit);
3361 		return 0;
3362 	    }
3363 	    else
3364 		return parse_comment(p, 1, cur_ent);
3365 	}
3366 	else if(p->state == PS_error)	/* looking_at may have set it */
3367 	    return -1;
3368 	else
3369 	    return error(p, "Syntax error after < in dtd");
3370     default:
3371 	unget(s);		/* For error position */
3372 	return error(p, "Expected \"<\" in dtd, but got %s",
3373 		     escape(c, p->escbuf[0]));
3374     }
3375 }
3376 
/*
 * Parse the name and ';' of an entity reference and act on it.
 *
 *  pe              non-zero for a parameter entity reference
 *  expand          if zero, the reference is just transcribed
 *  allow_external  if zero, a reference to an external entity is an error
 *
 * When expanding, the entity is looked up in the DTD.  An undefined
 * general entity (with ErrorOnUndefinedEntities off) gets a warning and a
 * fake internal definition whose text is "&#38;" followed by the name and
 * ';'.  The entity is then opened and pushed as the current input source.
 *
 * Returns 0 on success; failures are reported through error() or the
 * require() macro.
 */
static int parse_reference(Parser p, int pe, int expand, int allow_external)
{
    Entity ref;
    InputSource src;

    require(parse_name(p, pe ? "for parameter entity" : "for entity"));
    require(expect(p, ';', "after entity name"));

    if(ParserGetFlag(p, Validate) && VectorCount(p->element_stack) > 0)
    {
	ElementDefinition parent = VectorLast(p->element_stack).definition;

	if(parent->type == CT_empty)
	{
	    require(validity_error(p, "Entity reference not allowed in EMPTY element %S",
				   parent->name));
	}
    }

    if(!expand)
    {
	/* one delimiter character, the name, and the ';' */
	int reflen = 1 + p->namelen + 1;

	return transcribe(p, reflen, reflen);
    }

    ref = FindEntityN(p->dtd, p->name, p->namelen, pe);
    if(!ref)
    {
	static const char amp_ref[5] = {'&', '#', '3', '8', ';'};
	Char *fake;
	int shown = p->namelen > 50 ? 50 : p->namelen;
	int k;

	if(pe || ParserGetFlag(p, ErrorOnUndefinedEntities))
	    return error(p, "Undefined%s entity %.*S",
			 pe ? " parameter" : "" ,
			 shown, p->name);

	warn(p, "Undefined%s entity %.*S",
	     pe ? " parameter" : "",
	     shown, p->name);

	/* Fake a definition for it: "&#38;" + name + ";" + terminator */

	fake = Malloc((5 + p->namelen + 1 + 1) * sizeof(Char));
	if(!fake)
	    return error(p, "System error");
	for(k = 0; k < 5; k++)
	    fake[k] = amp_ref[k];
	memcpy(fake + 5, p->name, p->namelen * sizeof(Char));
	fake[5 + p->namelen] = ';';
	fake[5 + p->namelen + 1] = 0;

	/* NOTE(review): fake is not released if the entity cannot be
	   created -- whether NewInternalEntityN takes ownership of the text
	   on failure is not visible here; confirm */
	ref = NewInternalEntityN(p->name, p->namelen, fake, 0, 0, 0, 0);
	if(!ref)
	    return error(p, "System error");
	if(!DefineEntity(p->dtd, ref, 0))
	    return error(p, "System error");

	if(ParserGetFlag(p, XMLNamespaces) && Strchr(ref->name, ':'))
	{
	    require(namespace_error(p, "Entity name %S contains colon",
				    ref->name));
	}
    }

    if(ref->type == ET_external)
    {
	if(ref->notation)
	    return error(p, "Illegal reference to unparsed entity \"%S\"",
			 ref->name);

	if(!allow_external)
	    return error(p, "Illegal reference to external entity \"%S\"",
			 ref->name);
    }

    /* An entity that is already open somewhere up the source stack must
       not be opened again */

    src = p->source;
    while(src)
    {
	if(src->entity == ref)
	    return error(p, "Recursive reference to entity \"%S\"", ref->name);
	src = src->parent;
    }

    if(p->standalone == SDD_yes &&
       parsing_internal(p) && ref->is_externally_declared)
    {
	/* This is a WF error by erratum 34 */
	require(error(p, "Internal reference to externally declared entity "
		      "\"%S\" in document declared standalone",
		      ref->name));
    }
    else if(ParserGetFlag(p, Validate) && p->standalone == SDD_yes &&
	    p->state == PS_body && ref->is_externally_declared)
    {
	require(validity_error(p, "Reference to externally declared entity "
			          "\"%S\" in document declared standalone",
			       ref->name));
    }

    /* External entities go through the user's opener when one is set */

    src = (ref->type == ET_external && p->entity_opener)
	? p->entity_opener(ref, p->entity_opener_arg)
	: EntityOpen(ref);
    if(!src)
	return error(p, "Couldn't open entity %S, %s",
		     ref->name, EntityDescription(ref));

    require(ParserPush(p, src));
    NF16StartCheck(p);

    return 0;
}
3480 
/*
 * Parse a numeric character reference: an optional "x" selecting
 * hexadecimal, the digits, and the terminating ';'.
 *
 * If expand is zero the reference is transcribed unchanged (the length
 * passed to transcribe() allows for two characters before the digits,
 * presumably "&#").  Otherwise the character is appended to p->pbuf; when
 * CHAR_SIZE is not 8, a code of 0x10000 or more is stored as a surrogate
 * pair.  If p->checker is set the appended character(s) are run through
 * the normalization checker.
 *
 * A code that is too big or not a legal XML character is an error when
 * ErrorOnBadCharacterEntities is set; otherwise a warning is issued, the
 * reference is ignored and 0 is returned.
 *
 * Returns 0 on success, otherwise the result of error().
 */
static int parse_character_reference(Parser p, int expand)
{
    InputSource s = p->source;
    int c, base = 10;
    int count = 0;
    unsigned int code = 0;
    Char *ch = s->line + s->next;	/* start of the digits, re-read below */

    if(looking_at(p, "x"))
    {
	ch++;
	base = 16;
    }
    if(p->state == PS_error)	/* looking_at may have set it */
	return -1;

    /* First pass: check the syntax and count the digits */

    while((c = get(s)) != ';')
    {
	if(c == BADCHAR)
	    return error(p, "Input error: %s", s->error_msg);
	if((c >= '0' && c <= '9') ||
	   (base == 16 && ((c >= 'A' && c <= 'F') ||
			   (c >= 'a' && c <= 'f'))))
	    count++;
	else
	{
	    unget(s);		/* For error position */
	    return error(p,
			 "Illegal character %s in base-%d character reference",
			 escape(c, p->escbuf[0]), base);
	}
    }

    if(!expand)
	return transcribe(p, 2 + (base == 16) + count + 1,
			     2 + (base == 16) + count + 1);

    /* Second pass: compute the value from the digits counted above */

    while(count-- > 0)
    {
	c = *ch++;
	if(c >= '0' && c <= '9')
	    code = code * base + (c - '0');
	else if(c >= 'A' && c <= 'F')
	    code = code * base + 10 + (c - 'A');
	else
	    code = code * base + 10 + (c - 'a');

	/* Test here rather than just at the end to avoid undetected overflow */
	if(code >= 0x110000)
	{
	    if(ParserGetFlag(p, ErrorOnBadCharacterEntities))
		return error(p, "Character reference code too big");
	    else
		warn(p, "Character reference code too big, ignored");
	    return 0;
	}
    }

/* allow refs to C0 and C1 controls except NUL in XML 1.1 */
#define is_xml11_legal_control(c) \
    (((c) >= 0x01 && (c) <= 0x1f) || ((c) >= 0x7f && (c) <= 0x9f))

#if CHAR_SIZE == 8
    if(code > 255 ||
       !(is_xml_legal(code, p->map) ||
	 (p->xml_version >= XV_1_1 && is_xml11_legal_control(code))))
    {
	if(ParserGetFlag(p, ErrorOnBadCharacterEntities))
	    return error(p, "0x%x is not a valid 8-bit XML character", code);
	else
	    warn(p, "0x%x is not a valid 8-bit XML character; ignored", code);
	return 0;
    }
#else
    /* Logical OR, matching the 8-bit branch above (this was a bitwise OR) */
    if(!(is_xml_legal(code, p->map) ||
	 (p->xml_version >= XV_1_1 && is_xml11_legal_control(code))))
    {
	if(ParserGetFlag(p, ErrorOnBadCharacterEntities))
	    return error(p, "0x%x is not a valid XML character", code);
	else
	    warn(p, "0x%x is not a valid XML character; ignored", code);
	return 0;
    }

    if(code >= 0x10000)
    {
	/* Use surrogates */

	ExpandBuf(p->pbuf, p->pbufnext+2);
	code -= 0x10000;

	p->pbuf[p->pbufnext++] = (code >> 10) + 0xd800;
	p->pbuf[p->pbufnext++] = (code & 0x3ff) + 0xdc00;
	if(p->checker && NF16wrong==nf16checkL(p->checker,
			    p->pbuf + p->pbufnext - 2, 2))
	    return error(p, "numeric character reference not normalized");

	return 0;
    }
#endif

    ExpandBuf(p->pbuf, p->pbufnext+1);
    p->pbuf[p->pbufnext++] = code;
    if(p->checker && NF16wrong==nf16checkL(p->checker,
		        p->pbuf + p->pbufnext - 1, 1))
	return error(p, "numeric character reference not normalized");

    return 0;
}
3591 
3592 /* Called after reading '<!ELEMENT ' */
3593 
parse_element_decl(Parser p,Entity ent)3594 static int parse_element_decl(Parser p, Entity ent)
3595 {
3596     Char *name;
3597     ContentType type;
3598     ElementDefinition def;
3599     Entity tent;
3600     ContentParticle cp = 0;
3601     Char *content = 0;
3602 
3603     require(parse_name(p, "for name in element declaration"));
3604     CopyName(name);
3605     maybe_uppercase(p, name);
3606 
3607     require(expect_dtd_whitespace(p, "after name in element declaration"));
3608 
3609     if(looking_at(p, "EMPTY"))
3610     {
3611 	type = CT_empty;
3612 	content = 0;
3613     }
3614     else if(looking_at(p, "ANY"))
3615     {
3616 	type = CT_any;
3617 	content = 0;
3618     }
3619     else if(looking_at(p, "("))
3620     {
3621 	unget(p->source);
3622 	if(!(cp = parse_cp(p)) ||
3623 	   check_content_decl(p, cp) < 0 ||
3624 	   !(content = stringify_cp(cp)))
3625 	{
3626 	    FreeContentParticle(cp);
3627 	    Free(content);
3628 	    Free(name);
3629 	    return -1;
3630 	}
3631 
3632 	if(cp->type == CP_choice && cp->children[0]->type == CP_pcdata)
3633 	    type = CT_mixed;
3634 	else
3635 	    type = CT_element;
3636 	{
3637 	}
3638     }
3639     else if(p->state == PS_error)	/* looking_at may have set it */
3640 	return -1;
3641     else
3642     {
3643 	Free(name);
3644 	return error(p, "Expected \"EMPTY\", \"ANY\", or \"(\" after name in "
3645 		        "element declaration");
3646     }
3647 
3648     require(skip_dtd_whitespace(p, p->external_pe_depth > 0));
3649     tent = p->source->entity;
3650     require(expect(p, '>', "at end of element declaration"));
3651     if(ParserGetFlag(p, Validate) && tent != ent)
3652     {
3653 	require(validity_error(p, "Element declaration ends in different "
3654 			          "entity from that in which it starts"));
3655     }
3656 
3657     if((def = FindElement(p->dtd, name)))
3658     {
3659 	if(def->tentative)
3660 	{
3661 	    RedefineElement(def, type, content, cp, 1);
3662 	    if(parsing_external_subset(p))
3663 		def->is_externally_declared = 1;
3664 	}
3665 	else
3666 	{
3667 	  FreeContentParticle(cp);
3668 	  Free(content);
3669 	  if(ParserGetFlag(p, Validate))
3670 	  {
3671 	      require(validity_error(p, "Element %S declared more than once",
3672 				     name));
3673 	  }
3674 	  else if(ParserGetFlag(p, WarnOnRedefinitions))
3675 	      warn(p, "Ignoring redeclaration of element %S", name);
3676 	}
3677     }
3678     else
3679     {
3680 	if (!(def = DefineElement(p->dtd, name, type, content, cp, 1))) {
3681 	    return error(p, "System error");
3682 	};
3683 	if(parsing_external_subset(p))
3684 	    def->is_externally_declared = 1;
3685 	if(ParserGetFlag(p, XMLNamespaces))
3686 	{
3687 	    require(check_qualname_syntax(p, name, "Element"));
3688 	}
3689     }
3690 
3691     Free(name);
3692 
3693     return 0;
3694 }
3695 
3696 /* Content model parsing */
3697 
parse_cp(Parser p)3698 static ContentParticle parse_cp(Parser p)
3699 {
3700     ContentParticle cp;
3701     Entity ent;
3702 
3703     ent = p->source->entity;
3704     if(looking_at(p, "("))
3705     {
3706 	if(!(cp = parse_choice_or_seq(p, ent)))
3707 	    return 0;
3708     }
3709     else if(looking_at(p, "#PCDATA"))
3710     {
3711 	if(!(cp = Malloc(sizeof(*cp))))
3712 	{
3713 	    error(p, "System error");
3714 	    return 0;
3715 	}
3716 
3717 	cp->type = CP_pcdata;
3718     }
3719     else if(p->state == PS_error)	/* looking_at may have set it */
3720 	return 0;
3721     else
3722     {
3723 	if(parse_name(p, "in content declaration") < 0)
3724 	    return 0;
3725 	maybe_uppercase_name(p);
3726 
3727 	if(!(cp = Malloc(sizeof(*cp))))
3728 	{
3729 	    error(p, "System error");
3730 	    return 0;
3731 	}
3732 
3733 	cp->type = CP_name;
3734 	if(!(cp->element = FindElementN(p->dtd, p->name, p->namelen)))
3735 	{
3736 	    if(!(cp->element = TentativelyDefineElementN(p->dtd,
3737 							 p->name, p->namelen)))
3738 	    {
3739 		error(p, "System error");
3740 		return 0;
3741 	    }
3742 	    if(ParserGetFlag(p, XMLNamespaces))
3743 		if(check_qualname_syntax(p, cp->element->name, "Element") < 0)
3744 		    return 0;
3745 	}
3746 	cp->name = cp->element->name;
3747     }
3748 
3749 
3750     if(looking_at(p, "*"))
3751 	cp->repetition = '*';
3752     else if(looking_at(p, "+"))
3753 	cp->repetition = '+';
3754     else if(looking_at(p, "?"))
3755 	cp->repetition = '?';
3756     else if(p->state == PS_error)	/* looking_at may have set it */
3757 	return 0;
3758     else
3759 	cp->repetition = 0;
3760 
3761     return cp;
3762 }
3763 
3764 /* Called after '(' */
3765 
parse_choice_or_seq(Parser p,Entity ent)3766 static ContentParticle parse_choice_or_seq(Parser p, Entity ent)
3767 {
3768     ContentParticle cp, cp1;
3769 
3770 
3771     require0(skip_dtd_whitespace(p, p->external_pe_depth > 0));
3772 
3773     if(!(cp1 = parse_cp(p)))
3774 	return 0;
3775 
3776     require0(skip_dtd_whitespace(p, p->external_pe_depth > 0));
3777 
3778     if(!(cp = parse_choice_or_seq_1(p, 1, 0, ent)))
3779 	FreeContentParticle(cp1);
3780     else
3781 	cp->children[0] = cp1;
3782 
3783     return cp;
3784 }
3785 
3786 /* Called before '|', ',', or ')' */
3787 
parse_choice_or_seq_1(Parser p,int nchildren,char sep,Entity ent)3788 static ContentParticle parse_choice_or_seq_1(Parser p, int nchildren,
3789 					     char sep, Entity ent)
3790 {
3791     ContentParticle cp = 0, cp1;
3792     int nsep = get(p->source);
3793 
3794     if(nsep == BADCHAR)
3795     {
3796 	error(p, "Input error: %s", p->source->error_msg);
3797 	return 0;
3798     }
3799 
3800     if(nsep == ')')
3801     {
3802 	/* We've reached the end */
3803 
3804 	if(ParserGetFlag(p, Validate) && p->source->entity != ent)
3805 	{
3806 	    if(validity_error(p, "Content particle ends in different "
3807 			         "entity from that in which it starts") < 0)
3808 		return 0;
3809 	}
3810 
3811 	if(!(cp = Malloc(sizeof(*cp))) ||
3812 	   !(cp->children = Malloc(nchildren * sizeof(cp))))
3813 	{
3814 	    Free(cp);
3815 	    error(p, "System error");
3816 	    return 0;
3817 	}
3818 
3819 	/* The standard does not specify whether '(foo)' is a choice or a
3820 	   sequence.  We make it a choice so that (#PCDATA) comes out as
3821 	   a choice, like other mixed models. */
3822 	/* Erratum E50 has now resolved this the other way, but I don't
3823 	   see any reason to change it, since it makes no difference. */
3824 
3825 	cp->type = sep == ',' ? CP_seq : CP_choice;
3826 	cp->nchildren = nchildren;
3827 
3828 	return cp;
3829     }
3830 
3831     if(nsep != '|' && nsep != ',')
3832     {
3833 	error(p, "Expected | or , or ) in content declaration, got %s",
3834 	      escape(nsep, p->escbuf[0]));
3835 	return 0;
3836     }
3837 
3838     if(sep && nsep != sep)
3839     {
3840 	error(p, "Content particle contains both | and ,");
3841 	return 0;
3842     }
3843 
3844     require0(skip_dtd_whitespace(p, p->external_pe_depth > 0));
3845 
3846     if(!(cp1 = parse_cp(p)))
3847 	return 0;
3848 
3849     require0(skip_dtd_whitespace(p, p->external_pe_depth > 0));
3850 
3851     if(!(cp = parse_choice_or_seq_1(p, nchildren+1, (char)nsep, ent)))
3852 	FreeContentParticle(cp1);
3853     else
3854 	cp->children[nchildren] = cp1;
3855 
3856     return cp;
3857 }
3858 
3859 /* Check content particle matches Mixed or children */
3860 
check_content_decl(Parser p,ContentParticle cp)3861 static int check_content_decl(Parser p, ContentParticle cp)
3862 {
3863     int i, j;
3864 
3865     if(cp->type == CP_choice && cp->children[0]->type == CP_pcdata)
3866     {
3867 	if(cp->children[0]->repetition != 0)
3868 	    return error(p, "Malformed mixed content declaration");
3869 	for(i=1; i<cp->nchildren; i++)
3870 	    if(cp->children[i]->type != CP_name ||
3871 	       cp->children[i]->repetition != 0)
3872 		return error(p, "Malformed mixed content declaration");
3873 
3874 	if(cp->repetition != '*' &&
3875 	   !(cp->nchildren == 1 && cp->repetition == 0))
3876 	    return error(p, "Malformed mixed content declaration");
3877 
3878 	if(ParserGetFlag(p, Validate))
3879 	{
3880 	    for(i=1; i<cp->nchildren; i++)
3881 		for(j=i+1; j<cp->nchildren; j++)
3882 		    if(Strcmp(cp->children[i]->name,
3883 			      cp->children[j]->name) == 0)
3884 		    {
3885 			require(validity_error(p,
3886 					  "Type %S appears more than once in "
3887 					  "mixed content declaration",
3888 					       cp->children[i]->name));
3889 		    }
3890 	}
3891 
3892 	return 0;
3893     }
3894     else
3895 	return check_content_decl_1(p, cp);
3896 }
3897 
/*
 * Check a non-mixed content model: #PCDATA must not occur anywhere in
 * it.  Choices and sequences are checked recursively; any other kind of
 * particle is accepted.
 *
 * Returns 0 if OK, the result of error() for a misplaced #PCDATA, or -1
 * if a child fails the check.
 */
static int check_content_decl_1(Parser p, ContentParticle cp)
{
    int k;

    if(cp->type == CP_pcdata)
	return error(p, "Misplaced #PCDATA in content declaration");

    if(cp->type == CP_seq || cp->type == CP_choice)
    {
	for(k = 0; k < cp->nchildren; k++)
	    if(check_content_decl_1(p, cp->children[k]) < 0)
		return -1;
    }

    return 0;
}
3916 
3917 /* Reconstruct the content model as a string */
3918 
stringify_cp(ContentParticle cp)3919 static Char *stringify_cp(ContentParticle cp)
3920 {
3921     int size = size_cp(cp);
3922     Char *s;
3923     FILE16 *f;
3924 
3925     if(!(s = Malloc((size+1) * sizeof(Char))) ||
3926        !(f = MakeFILE16FromString(s, (size + 1) * sizeof(Char), "w")))
3927     {
3928 	Free(s);
3929 	return 0;
3930     }
3931 
3932     print_cp(cp, f);
3933     s[size] = 0;
3934 
3935     Fclose(f);
3936 
3937     return s;
3938 }
3939 
/*
 * Print a content particle to f in DTD syntax: "#PCDATA", an element
 * name, or a parenthesised list of children separated by ',' (sequence)
 * or '|' (choice), followed by the repetition character if there is one.
 * A particle of any other type prints only its repetition character.
 */
static void print_cp(ContentParticle cp, FILE16 *f)
{
    int k;

    if(cp->type == CP_pcdata)
    {
	Fprintf(f, "#PCDATA");
    }
    else if(cp->type == CP_name)
    {
	Fprintf(f, "%S", cp->name);
    }
    else if(cp->type == CP_seq || cp->type == CP_choice)
    {
	Fprintf(f, "(");
	for(k = 0; k < cp->nchildren; k++)
	{
	    if(k > 0)
	    {
		if(cp->type == CP_seq)
		    Fprintf(f, ",");
		else
		    Fprintf(f, "|");
	    }
	    print_cp(cp->children[k], f);
	}
	Fprintf(f, ")");
    }

    if(cp->repetition)
	Fprintf(f, "%c", cp->repetition);
}
3970 
/*
 * Return the number of characters print_cp will produce for cp:
 * 7 for "#PCDATA", the name's length for a name, and otherwise 2 for the
 * parentheses plus the children and one separator between each pair of
 * them.  One more is added if there is a repetition character.
 */
static int size_cp(ContentParticle cp)
{
    int k, total;

    if(cp->type == CP_pcdata)
	total = 7;
    else if(cp->type == CP_name)
	total = Strlen(cp->name);
    else
    {
	total = 2;
	for(k = 0; k < cp->nchildren; k++)
	{
	    if(k > 0)
		total += 1;
	    total += size_cp(cp->children[k]);
	}
    }

    if(cp->repetition)
	total += 1;

    return total;
}
3999 
/*
 * Free a content particle.  A null pointer is ignored.  For a choice or
 * sequence the children and the child array are freed first.
 */
void FreeContentParticle(ContentParticle cp)
{
    int k;

    if(!cp)
	return;

    /* Nothing extra to free for #PCDATA; for a name particle, the name is
       part of the element definition, so don't free it */

    if(cp->type == CP_seq || cp->type == CP_choice)
    {
	for(k = 0; k < cp->nchildren; k++)
	    FreeContentParticle(cp->children[k]);
	Free(cp->children);
    }

    Free(cp);
}
4026 
4027 /* Called after reading '<!ATTLIST ' */
4028 
parse_attlist_decl(Parser p,Entity ent)4029 static int parse_attlist_decl(Parser p, Entity ent)
4030 {
4031     Char *name;
4032     ElementDefinition element;
4033     Entity tent;
4034     AttributeType type;
4035     DefaultType default_type;
4036     AttributeDefinition a;
4037     Char **allowed_values, *t;
4038     Char *default_value;
4039     int nvalues=0, i, j;
4040     static Char s_xml_space[] = {'x','m','l',':','s','p','a','c','e',0},
4041 		s_default[]   = {'d','e','f','a','u','l','t',0},
4042 		s_preserve[]   = {'p','r','e','s','e','r','v','e',0};
4043 
4044     require(parse_name(p, "for name in attlist declaration"));
4045     CopyName(name);
4046     maybe_uppercase(p, name);
4047 
4048     if(!(element = FindElement(p->dtd, name)))
4049     {
4050 	if(!(element = TentativelyDefineElement(p->dtd, name)))
4051 	    return error(p, "System error");
4052 	if(ParserGetFlag(p, XMLNamespaces))
4053 	{
4054 	    require(check_qualname_syntax(p, element->name, "Element"));
4055 	}
4056     }
4057 
4058     Free(name);
4059 
4060     if(looking_at(p, ">"))
4061 	unget(p->source);
4062     else
4063     {
4064 	if(p->state == PS_error)	/* looking_at may have set it */
4065 	    return -1;
4066 	require(expect_dtd_whitespace(p,
4067 				"after element name in attlist declaration"));
4068     }
4069 
4070     while(tent = p->source->entity, !looking_at(p, ">"))
4071     {
4072 	if(p->state == PS_error)	/* looking_at may have set it */
4073 	    return -1;
4074 	require(skip_dtd_whitespace(p, p->external_pe_depth > 0));
4075 	require(parse_name(p, "for attribute in attlist declaration"));
4076 	CopyName(name);
4077 	maybe_uppercase(p, name);
4078 
4079 	require(expect_dtd_whitespace(p, "after name in attlist declaration"));
4080 
4081 	if(looking_at(p, "CDATA"))
4082 	    type = AT_cdata;
4083 	else if(looking_at(p, "IDREFS"))
4084 	    type = AT_idrefs;
4085 	else if(looking_at(p, "IDREF"))
4086 	    type = AT_idref;
4087 	else if(looking_at(p, "ID"))
4088 	    type = AT_id;
4089 	else if(looking_at(p, "ENTITIES"))
4090 	    type = AT_entities;
4091 	else if(looking_at(p, "ENTITY"))
4092 	    type = AT_entity;
4093 	else if(looking_at(p, "NMTOKENS"))
4094 	    type = AT_nmtokens;
4095 	else if(looking_at(p, "NMTOKEN"))
4096 	    type = AT_nmtoken;
4097 	else if(looking_at(p, "NOTATION"))
4098 	    type = AT_notation;
4099 	else if(p->state == PS_error)	/* looking_at may have set it */
4100 	    return -1;
4101 	else
4102 	    type = AT_enumeration;
4103 
4104 	if(type != AT_enumeration)
4105 	{
4106 	    require(expect_dtd_whitespace(p, "after attribute type"));
4107 	}
4108 
4109 	if(type == AT_notation || type == AT_enumeration)
4110 	{
4111 	    require(expect(p, '(',
4112 			   "or keyword for type in attlist declaration"));
4113 
4114 	    nvalues = 0;
4115 	    p->pbufnext = 0;
4116 	    do
4117 	    {
4118 		require(skip_dtd_whitespace(p, p->external_pe_depth > 0));
4119 		if(type == AT_notation)
4120 		{
4121 		    require(parse_name(p,
4122 			       "for notation value in attlist declaration"));
4123 		}
4124 		else
4125 		{
4126 		    require(parse_nmtoken(p,
4127 			       "for enumerated value in attlist declaration"));
4128 		}
4129 		maybe_uppercase_name(p);
4130 		ExpandBuf(p->pbuf, p->pbufnext + p->namelen + 1);
4131 		memcpy(p->pbuf+p->pbufnext,
4132 		       p->name,
4133 		       p->namelen * sizeof(Char));
4134 		p->pbuf[p->pbufnext + p->namelen] = 0;
4135 		p->pbufnext += (p->namelen + 1);
4136 		nvalues++;
4137 		require(skip_dtd_whitespace(p, p->external_pe_depth > 0));
4138 	    }
4139 	    while(looking_at(p, "|"));
4140 
4141 	    if(p->state == PS_error)	/* looking_at may have set it */
4142 		return -1;
4143 
4144 	    require(expect(p, ')',
4145 		  "at end of enumerated value list in attlist declaration"));
4146 	    require(expect_dtd_whitespace(p, "after enumerated value list "
4147 					     "in attlist declaration"));
4148 
4149 	    allowed_values = Malloc((nvalues+1)*sizeof(Char *));
4150 	    if(!allowed_values)
4151 		return error(p, "System error");
4152 	    for(i=0, t=p->pbuf; i<nvalues; i++)
4153 	    {
4154 		allowed_values[i] = t;
4155 		while(*t++)
4156 		    ;
4157 	    }
4158 	    allowed_values[nvalues] = 0;
4159 
4160 	    Consume(p->pbuf);
4161 	}
4162 	else
4163 	    allowed_values = 0;
4164 
4165 	if(looking_at(p, "#REQUIRED"))
4166 	    default_type = DT_required;
4167 	else if(looking_at(p, "#IMPLIED"))
4168 	    default_type = DT_implied;
4169 	else if(looking_at(p, "#FIXED"))
4170 	{
4171 	    default_type = DT_fixed;
4172 	    require(expect_dtd_whitespace(p, "after #FIXED"));
4173 	}
4174 	else if(p->state == PS_error)	/* looking_at may have set it */
4175 	    return -1;
4176 	else
4177 	    default_type = DT_none;
4178 
4179 	if(default_type == DT_fixed || default_type == DT_none)
4180 	{
4181 	    require(parse_string(p,
4182 				 "for default value in attlist declaration",
4183 				 type == AT_cdata ? LT_cdata_attr :
4184 				                    LT_tok_attr, 0));
4185 	    default_value = p->pbuf;
4186 	    Consume(p->pbuf);
4187 	    if(type != AT_cdata && type != AT_entity && type != AT_entities)
4188 		maybe_uppercase(p, default_value);
4189 	}
4190 	else
4191 	    default_value = 0;
4192 
4193 	if(FindAttribute(element, name))
4194 	{
4195 	    if(ParserGetFlag(p, WarnOnRedefinitions))
4196 		warn(p, "Ignoring redeclaration of attribute %S", name);
4197 	    if(allowed_values)
4198 	    {
4199 		Free(allowed_values[0]);
4200 		Free(allowed_values);
4201 	    }
4202 	    if(default_value)
4203 		Free(default_value);
4204 
4205 	    goto done;
4206 	}
4207 
4208 	if(ParserGetFlag(p, Validate) && type == AT_id)
4209 	{
4210 	    if(element->id_attribute)
4211 	    {
4212 		require(validity_error(p,
4213 				       "ID attribute %S declared for element"
4214 				       " %S which already had one (%S)",
4215 				       name, element->name,
4216 				       element->id_attribute->name));
4217 	    }
4218 	    if(default_type != DT_implied && default_type != DT_required)
4219 	    {
4220 		require(validity_error(p,
4221 				    "ID attribute %S must have declared "
4222 				    "default of #IMPLIED or #REQUIRED, not %s",
4223 				       name, DefaultTypeName[default_type]));
4224 	    }
4225 	}
4226 
4227 	if(ParserGetFlag(p, Validate) &&
4228 	   (type == AT_notation || type == AT_enumeration))
4229 	    /* Duplicate enumerated values were made invalid by
4230 	       an erratum of 2 Nov 2000 */
4231 	{
4232 	    for(i=0; i<nvalues; i++)
4233 		for(j=i+1; j<nvalues; j++)
4234 		    if(Strcmp(allowed_values[i], allowed_values[j]) == 0)
4235 		    {
4236 			require(validity_error(p,
4237 					       "Enumerated attribute %S has "
4238 					       "duplicate allowed value %S",
4239 					       name,
4240 					       allowed_values[i],
4241 					       allowed_values[j]));
4242 			break;
4243 
4244 		    }
4245 	}
4246 
4247 	if(ParserGetFlag(p, Validate) && type == AT_notation)
4248 	{
4249 	    /* Requirement for at most one notation attribute was
4250 	       added in the errata of 17 Feb 1999 */
4251 	    if(element->notation_attribute)
4252 	    {
4253 		require(validity_error(p,
4254 				  "NOTATION attribute %S declared for element"
4255 				       " %S which already had one (%S)",
4256 				       name, element->name,
4257 				       element->notation_attribute->name));
4258 	    }
4259 	}
4260 
4261 	if(ParserGetFlag(p, Validate) && Strcmp(name, s_xml_space) == 0)
4262 	{
4263 	    if(type != AT_enumeration)
4264 	    {
4265 		require(validity_error(p,
4266 			  "xml:space attribute must have enumerated type"));
4267 	    }
4268 	    else for(i=0; i<nvalues; i++)
4269 		if(Strcmp(allowed_values[i], s_default) != 0 &&
4270 		   Strcmp(allowed_values[i], s_preserve) != 0)
4271 		{
4272 		    require(validity_error(p,
4273 	"xml:space attribute values may only be \"default\" or \"preserve\""));
4274 		    break;
4275 		}
4276 	}
4277 
4278 	/* It doesn't seem to be required that xml:lang be declared
4279 	   NMTOKEN, so don't check it */
4280 
4281 	a = DefineAttribute(element, name, type, allowed_values,
4282 			    default_type, default_value, 1);
4283 	if(!a)
4284 	    return error(p, "System error");
4285 	if(parsing_external_subset(p))
4286 	    a->is_externally_declared = 1;
4287 	if(ParserGetFlag(p, XMLID) &&
4288 	   element->xml_id_attribute == a && a->type != AT_id)
4289 	{
4290 	    warn(p, "xml:id error: xml:id attribute must be declared as type ID");
4291 	    /* Fix the declaration so that we treat it as type ID */
4292 	    a->type = AT_id;
4293 	}
4294 	if(ParserGetFlag(p, XMLNamespaces))
4295 	{
4296 	    require(check_qualname_syntax(p, a->name, "Attribute"));
4297 	}
4298 
4299     done:
4300 	Free(name);
4301 
4302 	require(skip_dtd_whitespace(p, p->external_pe_depth > 0));
4303     }
4304 
4305     if(ParserGetFlag(p, Validate) && tent != ent)
4306     {
4307 	require(validity_error(p, "Attlist declaration ends in different "
4308 			          "entity from that in which it starts"));
4309     }
4310 
4311     return 0;
4312 }
4313 
4314 /* Used for external dtd part, entity definitions and notation definitions. */
4315 /* NB PE references are not allowed here (why not?) */
4316 
parse_external_id(Parser p,int required,char8 ** publicid,char8 ** systemid,int preq,int sreq)4317 static int parse_external_id(Parser p, int required,
4318 			     char8 **publicid, char8 **systemid,
4319 			     int preq, int sreq)
4320 {
4321     InputSource s = p->source;
4322     int c;
4323     Char *cp;
4324 
4325     *publicid = 0;
4326     *systemid = 0;
4327 
4328     if(looking_at(p, "SYSTEM"))
4329     {
4330 	if(!sreq)
4331 	{
4332 	    skip_whitespace(s);
4333 	    c = get(s); unget(s);
4334 	    if(c == BADCHAR)
4335 		return error(p, "Input error: %s", s->error_msg);
4336 	    if(c != '"' && c != '\'')
4337 		return 0;
4338 	}
4339 	else
4340 	{
4341 	    require(expect_dtd_whitespace(p, "after SYSTEM"));
4342 	}
4343 
4344 	require(parse_string(p, "for system ID", LT_plain, 0));
4345 	if(!(*systemid = duptochar8(p->pbuf)))
4346 	    return error(p, "System error");
4347     }
4348     else if(looking_at(p, "PUBLIC"))
4349     {
4350 	if(!preq && !sreq)
4351 	{
4352 	    skip_whitespace(s);
4353 	    c = get(s); unget(s);
4354 	    if(c == BADCHAR)
4355 		return error(p, "Input error: %s", s->error_msg);
4356 	    if(c != '"' && c != '\'')
4357 		return 0;
4358 	}
4359 	else
4360 	{
4361 	    require(expect_dtd_whitespace(p, "after PUBLIC"));
4362 	}
4363 
4364 	require(parse_string(p, "for public ID", LT_pubid, 0));
4365 
4366 	for(cp=p->pbuf; *cp; cp++)
4367 	    if(!is_ascii_alpha(*cp) && !is_ascii_digit(*cp) &&
4368 	       strchr8("-'()+,./:=?;!*#@$_% \r\n", *cp) == 0)
4369 		return error(p, "Illegal character %s in public id",
4370 			     escape(*cp, p->escbuf[0]));
4371 
4372 	if(!(*publicid = duptochar8(p->pbuf)))
4373 	    return error(p, "System error");
4374 
4375 	if(!sreq)
4376 	{
4377 	    skip_whitespace(s);
4378 	    c = get(s); unget(s);
4379 	    if(c == BADCHAR)
4380 		return error(p, "Input error: %s", s->error_msg);
4381 	    if(c != '"' && c != '\'')
4382 		return 0;
4383 	}
4384 	else
4385 	{
4386 	    require(expect_dtd_whitespace(p, "after public id"));
4387 	}
4388 
4389 	require(parse_string(p, "for system ID", LT_plain, 0));
4390 	if(!(*systemid = duptochar8(p->pbuf)))
4391 	    return error(p, "System error");
4392     }
4393     else if(p->state == PS_error)	/* looking_at may have set it */
4394 	    return -1;
4395     else if(required)
4396 	return error(p, "Missing or malformed external ID");
4397 
4398     return 0;
4399 }
4400 
/*
 * Parse the rest of an entity declaration.  Called after reading
 * '<!ENTITY '.
 *
 *   p        the parser; input is taken from p->source
 *   ent      the entity in which the declaration began.  It is passed as
 *            the parent of an internal entity, and when validating it is
 *            compared with the entity containing the closing '>'
 *   line,
 *   chpos    passed on to NewInternalEntity() for an internal entity
 *   ext_ent  passed as the parent of an external entity
 *
 * Returns 0 on success and -1 once an error has been reported.  The
 * require() wrappers presumably also return -1 when the wrapped call
 * fails (the macro is defined outside this section -- confirm).
 *
 * NOTE(review): the early returns between CopyName() and Free(name) do
 * not release name, and e is not released if a step after its creation
 * fails.  These look like leaks on error paths -- confirm.
 */

static int parse_entity_decl(Parser p, Entity ent, int line, int chpos,
			     Entity ext_ent)
{
    Entity e, old, tent;
    int pe, t, namelen;
    Char *name;

    /* A '%' here marks a parameter entity declaration */
    pe = looking_at(p, "%");	/* If it were a PE ref, we would
				   already have pushed it */
    if(p->state == PS_error)	/* looking_at may have set it */
	return -1;

    require(skip_dtd_whitespace(p, p->external_pe_depth > 0));
    require(parse_name(p, "for name in entity declaration"));
    namelen = p->namelen;
    CopyName(name);

    if(ParserGetFlag(p, XMLNamespaces) && Strchr(name, ':'))
    {
	require(namespace_error(p, "Entity name %S contains colon", name));
    }

    require(expect_dtd_whitespace(p, "after name in entity declaration"));

    /* A quote starts a literal value, ie an internal entity */
    if(looking_at(p, "'") || looking_at(p, "\""))
    {
	Char *value;

	/* Put the quote back so that parse_string sees it */
	unget(p->source);
	require(parse_string(p, "for value in entity declaration",
			     pe ? LT_param_entity : LT_entity, 0));
	/* Presumably Consume() detaches the buffer from the parser so that
	   value now owns it -- confirm against the macro definition */
	value = p->pbuf;
	Consume(p->pbuf);

	if(!(e = NewInternalEntity(name, value, ent, line, chpos, 0)))
	    return error(p, "System error");
	if(parsing_external_subset(p))
	    e->is_externally_declared = 1;
#if 0
	Fprintf(Stderr, "internal %s entity %S\n",
		pe ? "parameter" : "general", name);
	Fprintf(Stderr, "base: %s\nreplacement text: %S\n",
		e->base_url ? e->base_url : "<null>", e->text);
#endif
    }
    else if(p->state == PS_error)	/* looking_at may have set it */
	return -1;
    else
    {
	/* Otherwise it must be an external entity, with an optional
	   NDATA notation name */
	char8 *publicid, *systemid;
	NotationDefinition notation = 0;

	require(parse_external_id(p, 1, &publicid, &systemid, 1, 1));

	/* t records the result of the whitespace skip; a value of 0 is
	   treated below as "no whitespace before NDATA" */
	require((t = skip_dtd_whitespace(p, p->external_pe_depth > 0)));
	if(looking_at(p, "NDATA"))
	{
	    if(t == 0)
		return error(p, "Whitespace missing before NDATA");
	    if(pe)
		return error(p, "NDATA not allowed for parameter entity");
	    require(expect_dtd_whitespace(p, "after NDATA"));
	    require(parse_name(p, "for notation name in entity declaration"));
	    maybe_uppercase_name(p);
	    notation = FindNotationN(p->dtd, p->name, p->namelen);
	    if(!notation)
	    {
		/* The notation has not been declared yet, so record a
		   tentative definition for now */
		notation =
		    TentativelyDefineNotationN(p->dtd, p->name, p->namelen);
		if(!notation)
		    return error(p, "System error");
		if(ParserGetFlag(p, XMLNamespaces) &&
		   Strchr(notation->name, ':'))
		{
		    require(namespace_error(p,
					    "Notation name %S contains colon",
					    notation->name));
		}
	    }
	}
	if(p->state == PS_error)	/* looking_at may have set it */
	    return -1;

	/* XXX we make the current external entity the parent so that
	   system IDs are resolved correctly.  Should we instead record
	   both parents? */
	if(!(e = NewExternalEntityN(name, namelen,
				    publicid, systemid, notation, ext_ent)))
	    return error(p, "System error");
	if(parsing_external_subset(p) || ent->is_externally_declared)
	    e->is_externally_declared = 1;
#if 0
	Fprintf(Stderr, "external %s entity %S\n",
		pe ? "parameter" : "general", name);
	Fprintf(Stderr, "base: %s\nsystem identifier: %s\n",
		e->base_url ? e->base_url : "<null>", e->systemid);
#endif
    }

    Free(name);

    require(skip_dtd_whitespace(p, p->external_pe_depth > 0));
    /* Note which entity we are in before the '>' is consumed */
    tent = p->source->entity;
    require(expect(p, '>', "at end of entity declaration"));
    if(ParserGetFlag(p, Validate) && tent != ent)
    {
	require(validity_error(p, "Entity declaration ends in different "
			          "entity from that in which it starts"));
    }

    /* An existing declaration of the same name is kept: the new entity
       is discarded, with a warning where appropriate */
    if((old = FindEntity(p->dtd, e->name, pe)))
    {
	if(old->parent == xml_builtin_entity)
	{
	    if(e->type != ET_internal ||
	       (ParserGetFlag(p, ExpandCharacterEntities) &&
	        Strcmp(e->text, old->text) != 0))
		warn(p, "Non-standard declaration of predefined "
		        "entity %S (ignored)",
		     e->name);
	}
	else
	{
	    if(ParserGetFlag(p, WarnOnRedefinitions))
		warn(p, "Ignoring redefinition of%s entity %S",
		     pe ? " parameter" : "", e->name);
	}

	FreeEntity(e);
    }
    else
	if(!DefineEntity(p->dtd, e, pe))
	    return error(p, "System error");

    return 0;
}
4539 
/* Return 1 if the entity currently being read is the document entity, or
   is neither an external entity nor externally declared; otherwise 0. */

static int parsing_internal(Parser p)
{
    Entity current = p->source->entity;

    /* The document entity always counts as internal */
    if(current == p->document_entity)
	return 1;

    return current->type != ET_external && !current->is_externally_declared;
}
4552 
4553 /* NB assumes we are parsing the DTD */
4554 
parsing_external_subset(Parser p)4555 static int parsing_external_subset(Parser p)
4556 {
4557     Entity e = p->source->entity;
4558 
4559     return !e->is_internal_subset;
4560 }
4561 
4562 /* Called after reading '<!NOTATION ' */
4563 
parse_notation_decl(Parser p,Entity ent)4564 static int parse_notation_decl(Parser p, Entity ent)
4565 {
4566     Char *name;
4567     char8 *publicid, *systemid;
4568     NotationDefinition def;
4569     Entity tent;
4570 
4571     require(parse_name(p, "for name in notation declaration"));
4572     CopyName(name);
4573     maybe_uppercase(p, name);
4574 
4575     require(expect_dtd_whitespace(p, "after name in notation declaration"));
4576 
4577     require(parse_external_id(p, 1, &publicid, &systemid, 1, 0));
4578 
4579     require(skip_dtd_whitespace(p, p->external_pe_depth > 0));
4580     tent = p->source->entity;
4581     require(expect(p, '>', "at end of notation declaration"));
4582     if(ParserGetFlag(p, Validate) && tent != ent)
4583     {
4584 	require(validity_error(p, "Notation declaration ends in different "
4585 			          "entity from that in which it starts"));
4586     }
4587 
4588     if((def = FindNotation(p->dtd, name)))
4589     {
4590 	if(def->tentative)
4591 	    RedefineNotation(def, publicid, systemid, ent);
4592 	else
4593 	    if(ParserGetFlag(p, WarnOnRedefinitions))
4594 	    {
4595 		warn(p, "Ignoring redefinition of notation %S", name);
4596 		if(publicid) Free(publicid);
4597 		if(systemid) Free(systemid);
4598 	    }
4599     }
4600     else
4601     {
4602 	if(!DefineNotation(p->dtd, name, publicid, systemid, ent))
4603 	    return error(p, "System error");
4604 	if(ParserGetFlag(p, XMLNamespaces) && Strchr(name, ':'))
4605 	{
4606 	    require(namespace_error(p, "Notation name %S contains colon",
4607 				    name));
4608 	}
4609     }
4610 
4611     Free(name);
4612 
4613     return 0;
4614 }
4615 
/*
 * Parse a conditional section of the DTD, from the INCLUDE or IGNORE
 * keyword up to and including the closing "]]>".
 *
 *   p    the parser; input is taken from p->source
 *   ent  the entity in which the section began; when validating, the
 *        entities containing the '[' and the closing brackets are
 *        compared with it
 *
 * Returns 0 on success and -1 once an error has been reported.
 */

static int parse_conditional(Parser p, Entity ent)
{
    int depth=1;		/* nesting level inside an IGNORE section */
    Entity tent;

    if(p->external_pe_depth == 0)
	return error(p, "Conditional section not allowed in internal subset");

    require(skip_dtd_whitespace(p, p->external_pe_depth > 0));
    if(looking_at(p, "INCLUDE"))
    {
	require(skip_dtd_whitespace(p, p->external_pe_depth > 0));

	tent = p->source->entity;
	require(expect(p, '[', "at start of conditional section"));
	if(ParserGetFlag(p, Validate) && tent != ent)
	{
	    require(validity_error(p, "[ of conditional section in "
				      "different entity from <!["));
	}

	require(skip_dtd_whitespace(p, p->external_pe_depth > 0));

	/* The contents are ordinary declarations; parse them until the
	   first ']' of the terminator is seen */
	while(!looking_at(p, "]"))
	{
	    switch(parse_markupdecl(p))
	    {
	    case 1:
		return error(p, "EOF in conditional section");
	    case -1:
		return -1;
	    }
	    require(skip_dtd_whitespace(p, p->external_pe_depth > 0));
	}
	tent = p->source->entity;

	/* One ']' has been consumed, so "]>" completes the "]]>" */
	if(!looking_at(p, "]>"))
	    return error(p, "]> required after ] in conditional section");

	if(ParserGetFlag(p, Validate) && tent != ent)
	{
	    require(validity_error(p, "] of conditional section in "
				      "different entity from <!["));
	}
    }
    else if(looking_at(p, "IGNORE"))
    {
	/* Easy, because ]]> not even allowed in strings! */

	require(skip_dtd_whitespace(p, p->external_pe_depth > 0));
	tent = p->source->entity;
	require(expect(p, '[', "at start of conditional section"));
	if(ParserGetFlag(p, Validate) && tent != ent)
	{
	    require(validity_error(p, "[ of conditional section in "
				      "different entity from <!["));
	}

	/* Skip characters, counting nested "<![" and "]]>" until the
	   section opened above has been closed */
	while(depth > 0)
	{
	    switch(get(p->source))
	    {
	    case BADCHAR:
		return error(p, "Input error: %s", p->source->error_msg);
	    case XEOE:
		/* End of the current source: carry on in the parent if
		   there is one */
		if(p->source->parent)
		    ParserPop(p);
		else
		    return error(p, "EOF in ignored conditional section");
		break;
	    case '<':
		if(looking_at(p, "!["))
		    depth++;
		break;
	    case ']':
		/* Remember where this ']' was in case it ends the section */
		tent = p->source->entity;
		if(looking_at(p, "]>"))
		    depth--;
	    }
	}
	if(ParserGetFlag(p, Validate) && tent != ent)
	{
	    require(validity_error(p, "]]> of conditional section in "
				      "different entity from <!["));
	}
    }
    else if(p->state == PS_error)	/* looking_at may have set it */
	return -1;
    else
	return error(p, "INCLUDE or IGNORE required in conditional section");

    return 0;
}
4709 
/* Convert the null-terminated string s to upper case in place, but only
   if the parser's CaseInsensitive flag is set. */

static void maybe_uppercase(Parser p, Char *s)
{
    Char *cp;

    if(!ParserGetFlag(p, CaseInsensitive))
	return;

    for(cp = s; *cp; cp++)
	*cp = Toupper(*cp);
}
4719 
/* Convert the first p->namelen characters of p->name to upper case in
   place, but only if the parser's CaseInsensitive flag is set. */

static void maybe_uppercase_name(Parser p)
{
    int pos = 0;

    if(!ParserGetFlag(p, CaseInsensitive))
	return;

    while(pos < p->namelen)
    {
	p->name[pos] = Toupper(p->name[pos]);
	pos++;
    }
}
4728 
/* Compare two 8-bit strings, using strcasecmp8 if the parser's
   CaseInsensitive flag is set and strcmp8 otherwise. */

static int str_maybecase_cmp8(Parser p, const char8 *a, const char8 *b)
{
    if(ParserGetFlag(p, CaseInsensitive))
	return strcasecmp8(a, b);

    return strcmp8(a, b);
}
4734 
/* Return 1 if c is one of the letters a-z or A-Z, otherwise 0. */

static int is_ascii_alpha(int c)
{
    if(c >= 'a' && c <= 'z')
	return 1;
    if(c >= 'A' && c <= 'Z')
	return 1;

    return 0;
}
4739 
/* Return 1 if c is one of the digits 0-9, otherwise 0. */

static int is_ascii_digit(int c)
{
    return !(c < '0' || c > '9');
}
4744 
4745 /* Error handling */
4746 
/*
 * Format an error message into buf and turn bit into an error bit that
 * refers to it.
 *
 *   buf, size  destination buffer and its size, as passed to Vsnprintf
 *   bit        the bit to overwrite: it is passed to FreeXBit, then its
 *              type is set to XBIT_error and its error_message to buf
 *   format,
 *   args       printf-style format and arguments for the message
 *
 * bit->error_message points at buf itself, not at a copy.
 */
static void verror(char8 *buf, int size, XBit bit, const char8 *format, va_list args)
{
    /* Print message before freeing xbit, so we can print data from it */
    Vsnprintf(buf, size, CE_ISO_8859_1, format, args);

    FreeXBit(bit);
    bit->type = XBIT_error;
    bit->error_message = buf;
}
4756 
error(Parser p,const char8 * format,...)4757 static int error(Parser p, const char8 *format, ...)
4758 {
4759     va_list args;
4760 
4761     va_start(args, format);
4762     verror(p->errbuf, sizeof(p->errbuf), &p->xbit, format, args);
4763 
4764     p->state = PS_error;
4765 
4766     return -1;
4767 }
4768 
/*
 * Report a warning without changing the parser state.
 *
 * The message is formatted into p->errbuf and delivered in a temporary
 * bit of type XBIT_warning, to p->warning_callback if one is set and to
 * ParserPerror otherwise.
 *
 * Always returns 0.
 */
static int warn(Parser p, const char8 *format, ...)
{
    va_list args;
    struct xbit bit;

    clear_xbit(&bit);

    va_start(args, format);
    verror(p->errbuf, sizeof(p->errbuf), &bit, format, args);
    va_end(args);		/* every va_start needs a matching va_end */

    /* verror marks the bit as an error; this is only a warning */
    bit.type = XBIT_warning;

    if(p->warning_callback)
	p->warning_callback(&bit, p->warning_callback_arg);
    else
	ParserPerror(p, &bit);

    return 0;
}
4788 
4789 /* Validity checks applied when the prolog is complete. */
4790 
validate_dtd(Parser p)4791 static int validate_dtd(Parser p)
4792 {
4793     Dtd d = p->dtd;
4794     ElementDefinition e;
4795     AttributeDefinition a;
4796     Entity ent;
4797     int i;
4798 
4799     if(!p->have_dtd)
4800     {
4801 	if(!ParserGetFlag(p, NoNoDTDWarning))
4802 	{
4803 	    require(validity_error(p,
4804 			       "Document has no DTD, validating abandoned"));
4805 	}
4806 	ParserSetFlag(p, Validate, 0);
4807 	return 0;
4808     }
4809 
4810     if(!(e = FindElement(d, d->name)) || e->tentative)
4811     {
4812 	require(validity_error(p,
4813 			       "Root element name %S not declared", d->name));
4814     }
4815 
4816     for(e = NextElementDefinition(d, 0); e; e = NextElementDefinition(d, e))
4817 	if(e->type == CT_element || e->type == CT_mixed)
4818 	{
4819 	    FSMNode endnode;
4820 	    e->fsm = NewFSM();
4821 	    if(!e->fsm)
4822 		error(p, "System error");
4823 	    endnode = AddNode(e->fsm);
4824 	    if(!endnode)
4825 		error(p, "System error");
4826 	    endnode->end_node = 1;
4827 	    e->fsm->start_node =
4828 		translate_particle(e->fsm, e->particle, endnode);
4829 	    if(!e->fsm->start_node)
4830 		error(p, "System error");
4831 	    if(e->type == CT_mixed)
4832 		/* Mixed content may always be empty, even (#PCDATA) */
4833 		e->fsm->start_node->end_node = 1;
4834 #if DEBUG_FSM
4835 	    Printf("\nContent model for element %S is %S\n",
4836 		   e->name, e->content);
4837 	    PrintFSM(Stdout, e->fsm, 0);
4838 #endif
4839 	    SimplifyFSM(e->fsm);
4840 	    if(e->type == CT_element)
4841 	    {
4842 		/* Don't do this for mixed content, to prevent extra error
4843 		   message for (#PCDATA|a|a)* which we already reported */
4844 		require(check_deterministic(p, e));
4845 	    }
4846 #if DEBUG_FSM
4847 	    Printf("\nContent model for element %S is %S\n",
4848 		   e->name, e->content);
4849 	    PrintFSM(Stdout, e->fsm, 1);
4850 #endif
4851 	}
4852 
4853     /* check all NDATA notations declared */
4854 
4855     for(ent = NextEntity(d, 0); ent; ent = NextEntity(d, ent))
4856 	if(ent->notation && ent->notation->tentative)
4857 	{
4858 	    require(validity_error(p, "In declaration of unparsed entity %S, "
4859 				       "notation %S is undefined",
4860 				   ent->name, ent->notation->name));
4861 	}
4862 
4863     /* validate attribute defaults (do it here so all entities/notations
4864        declared) and check notations in enumeration all declared */
4865 
4866     for(e = NextElementDefinition(d, 0); e; e = NextElementDefinition(d, e))
4867 	for(a = NextAttributeDefinition(e, 0); a;
4868 	    a = NextAttributeDefinition(e, a))
4869 	{
4870 	    if(a->default_value)
4871 	    {
4872 		require(check_attribute_syntax(p, a, e, a->default_value,
4873 					       "default value for attribute",
4874 					       0));
4875 	    }
4876 	    if(a->type == AT_notation)
4877 	    {
4878 		if(e->type == CT_empty)
4879 		{
4880 		    require(validity_error(p,
4881 					   "NOTATION attribute %S not allowed "
4882 					   "on EMPTY element %S",
4883 					   a->name, e->name));
4884 
4885 		}
4886 
4887 		for(i=0; a->allowed_values[i]; i++)
4888 		    if(!FindNotation(d, a->allowed_values[i]))
4889 		    {
4890 			require(validity_error(p,
4891 				  "In allowed values for attribute %S of "
4892 				  "element %S, notation %S is not defined",
4893 					       a->name, e->name,
4894 					       a->allowed_values[i]));
4895 		    }
4896 	    }
4897 	}
4898 
4899     return 0;
4900 }
4901 
/* Final validity check: run check_id over every entry in the ID table,
   so that IDs referred to but never defined are reported.  Returns -1 if
   the parser's xbit is an error bit afterwards, otherwise 0. */

static int validate_final(Parser p)
{
    hash_map(p->id_table, check_id, p);

    return p->xbit.type == XBIT_error ? -1 : 0;
}
4913 
check_id(const HashEntryStruct * id_entry,void * pp)4914 static HashMapRetType check_id(const HashEntryStruct *id_entry, void *pp)
4915 {
4916     Parser p = (Parser)pp;
4917 
4918     if(!(int)hash_get_value(id_entry))
4919 	validity_error(p,
4920 		       "The ID %.*S was referred to but never defined",
4921 		       hash_get_key_len(id_entry) / sizeof(Char),
4922 		       hash_get_key(id_entry));
4923 
4924 #ifdef FOR_LT
4925     return 1;
4926 #endif
4927 }
4928 
4929 /* Determine whether an element is valid at this point.
4930  * Returns the new context, or NULL if invalid.
4931  */
4932 
validate_content(FSMNode context,ElementDefinition e)4933 static FSMNode validate_content(FSMNode context, ElementDefinition e)
4934 {
4935     int i;
4936 
4937     for(i=0; i<VectorCount(context->edges); i++)
4938 	if(context->edges[i]->label == e)
4939 	    return context->edges[i]->destination;
4940 
4941     return 0;
4942 }
4943 
NewFSM(void)4944 static FSM NewFSM(void)
4945 {
4946     FSM fsm;
4947 
4948     if(!(fsm = Malloc(sizeof(*fsm))))
4949 	return 0;
4950     VectorInit(fsm->nodes);
4951     fsm->start_node = 0;
4952 
4953     return fsm;
4954 }
4955 
/* Free an FSM together with all its nodes and their edges.
   A null fsm is ignored. */

void FreeFSM(FSM fsm)
{
    int n, e;
    FSMNode node;

    if(fsm)
    {
	for(n = 0; n < VectorCount(fsm->nodes); n++)
	{
	    node = fsm->nodes[n];

	    /* A node owns its outgoing edges */
	    e = 0;
	    while(e < VectorCount(node->edges))
	    {
		Free(node->edges[e]);
		e++;
	    }
	    Free(node->edges);
	    Free(node);
	}

	Free(fsm->nodes);
	Free(fsm);
    }
}
4975 
AddNode(FSM fsm)4976 static FSMNode AddNode(FSM fsm)
4977 {
4978     FSMNode node;
4979 
4980     if(!(node = Malloc(sizeof(*node))))
4981 	return 0;
4982     node->fsm = fsm;
4983     node->mark = node->end_node = 0;
4984     node->id = VectorCount(fsm->nodes);
4985     VectorInit(node->edges);
4986     if(!VectorPush(fsm->nodes, node))
4987 	return 0;
4988 
4989     return node;
4990 }
4991 
/* Free a node and its outgoing edges.  The node's slot in its FSM's node
   list is set to null rather than removed; CleanupFSM closes the gaps. */

static void DeleteNode(FSMNode node)
{
    int e;

    node->fsm->nodes[node->id] = 0;

    for(e = 0; e < VectorCount(node->edges); e++)
	Free(node->edges[e]);
    Free(node->edges);

    Free(node);
}
5003 
/* Free an edge.  Its slot in the source node's edge list is set to null
   rather than removed; CleanupNode closes the gaps. */

static void DeleteEdge(FSMEdge edge)
{
    FSMNode owner = edge->source;

    owner->edges[edge->id] = 0;
    Free(edge);
}
5009 
5010 /* After deleting nodes there will be null nodes in the node list.
5011    This function removes them. */
5012 
CleanupFSM(FSM fsm)5013 static void CleanupFSM(FSM fsm)
5014 {
5015     int i, j;
5016 
5017     for(i=j=0; i<VectorCount(fsm->nodes); i++)
5018     {
5019 	if(fsm->nodes[i])
5020 	{
5021 	    if(i > j)
5022 	    {
5023 		fsm->nodes[j] = fsm->nodes[i];
5024 		fsm->nodes[j]->id = j;
5025 	    }
5026 	    j++;
5027 	}
5028     }
5029     VectorCount(fsm->nodes) = j;
5030 }
5031 
5032 /* After deleting edges there will be null edges in the edge list.
5033    This function removes them. */
5034 
CleanupNode(FSMNode node)5035 static void CleanupNode(FSMNode node)
5036 {
5037     int i, j;
5038 
5039     for(i=j=0; i<VectorCount(node->edges); i++)
5040     {
5041 	if(node->edges[i])
5042 	{
5043 	    if(i > j)
5044 	    {
5045 		node->edges[j] = node->edges[i];
5046 		node->edges[j]->id = j;
5047 	    }
5048 	    j++;
5049 	}
5050     }
5051     VectorCount(node->edges) = j;
5052 }
5053 
AddEdge(FSMNode source,FSMNode destination,void * label)5054 static FSMEdge AddEdge(FSMNode source, FSMNode destination, void *label)
5055 {
5056     FSMEdge edge;
5057 
5058     if(!(edge = Malloc(sizeof(*edge))))
5059 	return 0;
5060     edge->label = label;
5061     edge->source = source;
5062     edge->destination = destination;
5063     edge->id = VectorCount(source->edges);
5064     if(!VectorPush(source->edges, edge))
5065 	return 0;
5066 
5067     return edge;
5068 }
5069 
/* Clear the bits in value from the mark of every node in the FSM. */

static void UnMarkFSM(FSM fsm, int value)
{
    int n = 0;

    while(n < VectorCount(fsm->nodes))
    {
	FSMNode node = fsm->nodes[n];

	node->mark &= ~value;
	n++;
    }
}
5077 
/* Remove all epsilon links from a FSM.
 *
 * Works in four passes over the node list: mark the useful nodes, copy
 * into each useful node the non-epsilon edges reachable from it through
 * epsilon edges, delete the useless nodes and the epsilon edges, and
 * finally relabel the remaining edges with element definitions.
 *
 * Returns 1 on success and 0 if add_epsilon_closure fails; in that case
 * the function returns with the first two passes only partly applied.
 */

/* Bits used in a node's mark field */
#define useful 1		/* node is kept in the simplified FSM */
#define busy 2			/* node already visited in this traversal */

static int SimplifyFSM(FSM fsm)
{
    int i, j;
    FSMNode node;
    FSMEdge edge;

    /* First find all the useful nodes, ie those pointed to by a
       non-epsilon edge. */

    /* The start node is always kept */
    fsm->start_node->mark |= useful;
    for(i=0; i<VectorCount(fsm->nodes); i++)
    {
	node = fsm->nodes[i];
	for(j=0; j<VectorCount(node->edges); j++)
	{
	    edge = node->edges[j];
	    if(edge->label != Epsilon)
		edge->destination->mark |= useful;
	}
    }

    /* Now add to each useful node all the non-epsilon edges of
       the nodes in its epsilon-closure. */

    for(i=0; i<VectorCount(fsm->nodes); i++)
    {
	node = fsm->nodes[i];
	if(!(node->mark & useful))
	    continue;
	/* Mark the node itself so that the closure does not re-enter it */
	node->mark |= busy;
	/* add_epsilon_closure may append edges to this node while we
	   iterate, so the count and the edge are re-read each time */
	for(j=0; j<VectorCount(node->edges); j++)
	{
	    edge = node->edges[j];
	    if(edge->label == Epsilon)
		if(!add_epsilon_closure(node, edge->destination))
		    return 0;
	}
	UnMarkFSM(fsm, busy);
    }

    /* Now remove all useless nodes and epsilon edges from useful nodes */

    for(i=0; i<VectorCount(fsm->nodes); i++)
    {
	node = fsm->nodes[i];
	if(node->mark & useful)
	{
	    for(j=0; j<VectorCount(node->edges); j++)
	    {
		edge = node->edges[j];
		if(edge->label == Epsilon)
		    DeleteEdge(edge);
	    }
	    /* Close the gaps left by the deleted edges */
	    CleanupNode(node);
	}
	else
	    DeleteNode(node);
    }
    /* Close the gaps left by the deleted nodes */
    CleanupFSM(fsm);

    UnMarkFSM(fsm, useful);

    /* Now change the edge labels to be ElementDefinitions instead of CPs */

    for(i=0; i<VectorCount(fsm->nodes); i++)
    {
	node = fsm->nodes[i];
	for(j=0; j<VectorCount(node->edges); j++)
	{
	    edge = node->edges[j];
	    if(edge->label == Epsilon || edge->label == PCDataElement)
		continue;
	    edge->label = ((ContentParticle)edge->label)->element;
	}
    }

    return 1;
}
5161 
/* Add to base every non-epsilon edge of node and of the nodes reachable
   from node through epsilon edges, skipping edges base already has with
   the same label and destination.  base becomes an end node if any node
   visited is one.  Nodes already marked busy are not visited again.
   Returns 1 on success and 0 if an edge could not be added. */

static int add_epsilon_closure(FSMNode base, FSMNode node)
{
    int n, b, present;
    FSMEdge candidate;

    if(node->mark & busy)
	return 1;
    node->mark |= busy;

    if(node->end_node)
	base->end_node = 1;

    for(n = 0; n < VectorCount(node->edges); n++)
    {
	candidate = node->edges[n];

	if(candidate->label == Epsilon)
	{
	    /* Follow the epsilon edge and collect from its destination */
	    if(!add_epsilon_closure(base, candidate->destination))
		return 0;
	    continue;
	}

	/* Do we already have an edge corresponding to this very
	   content particle? */
	present = 0;
	for(b = 0; b < VectorCount(base->edges) && !present; b++)
	    if(base->edges[b]->label == candidate->label &&
	       base->edges[b]->destination == candidate->destination)
		present = 1;

	if(!present &&
	   !AddEdge(base, candidate->destination, candidate->label))
	    return 0;
    }

    return 1;
}
5200 
#if DEBUG_FSM
/*
 * Debugging aid: write a description of the FSM to out.
 *
 * Each node is shown as its id, followed by "S" if it is the start node
 * and "E" if it is an end node, and then one line per outgoing edge
 * giving the label and the id of the destination.
 *
 *   relabelled  non-zero if the edge labels are already element
 *               definitions, zero if they are still content particles
 */
static void PrintFSM(FILE16 *out, FSM fsm, int relabelled)
{
    int i, j;
    FSMNode node;
    FSMEdge edge;
    ElementDefinition elt;

    for(i=0; i<VectorCount(fsm->nodes); i++)
    {
	node = fsm->nodes[i];
	Fprintf(out, "%d", node->id);
	if(node == fsm->start_node)
	    Fprintf(out, "S");
	if(node->end_node)
	    Fprintf(out, "E");

	for(j=0; j<VectorCount(node->edges); j++)
	{
	    edge = node->edges[j];
	    if(edge->label == Epsilon)
		Fprintf(out, "\t{Epsilon} -> %d\n", edge->destination->id);
	    else if(edge->label == PCDataElement)
		Fprintf(out, "\t#PCDATA -> %d\n", edge->destination->id);
	    else
	    {
		if(relabelled)
		    elt = (ElementDefinition)edge->label;
		else
		    elt = ((ContentParticle)edge->label)->element;
		Fprintf(out, "\t%S -> %d\n", elt->name, edge->destination->id);
	    }
	}
	/* A node without edges still needs its line ending; it must go
	   to the same stream as the rest of the output */
	if(VectorCount(node->edges) == 0)
	    Fprintf(out, "\n");
    }
}
#endif
5239 
/* Build the part of the FSM for one occurrence of content particle cp,
   ignoring its repetition character.  A new node is created for the
   particle, and matching it leads to next.  Returns the new node, or
   NULL if a node or edge could not be allocated. */

static FSMNode translate_particle_1(FSM fsm, ContentParticle cp, FSMNode next)
{
    FSMNode entry, child;
    int k;

    entry = AddNode(fsm);
    if(!entry)
	return 0;

    switch(cp->type)
    {
    case CP_pcdata:
	if(!AddEdge(entry, next, PCDataElement))
	    return 0;
	break;
    case CP_name:
	/* We initially label the edges with the content particles, so
	   that we can recognise two "a" edges as being from different
	   CPs for the purpose of determinism checking.  We will change
	   the label to be the element definition later. */
	if(!AddEdge(entry, next, cp))
	    return 0;
	break;
    case CP_seq:
	/* Build the children last to first, so that each child leads to
	   the one that follows it */
	child = next;
	k = cp->nchildren;
	while(--k >= 0)
	{
	    child = translate_particle(fsm, cp->children[k], child);
	    if(!child)
		return 0;
	}
	if(!AddEdge(entry, child, Epsilon))
	    return 0;
	break;
    case CP_choice:
	/* Every alternative is reached by an epsilon edge and leads
	   to next */
	for(k = 0; k < cp->nchildren; k++)
	{
	    child = translate_particle(fsm, cp->children[k], next);
	    if(!child)
		return 0;
	    if(!AddEdge(entry, child, Epsilon))
		return 0;
	}
	break;
    default:
	break;
    }

    return entry;
}
5286 
/* Build the part of the FSM for content particle cp, including its
   repetition character (none, '*', '+' or '?'), so that matching it
   leads to next.  Returns the entry node, or NULL if a node or edge
   could not be allocated. */

static FSMNode translate_particle(FSM fsm, ContentParticle cp, FSMNode next)
{
    FSMNode head, loop, body;

    switch(cp->repetition)
    {
    case 0:
	return translate_particle_1(fsm, cp, next);

    case '*':
	/* head either enters the body, which returns to head, or
	   skips straight to next */
	if(!(head = AddNode(fsm)))
	    return 0;
	if(!(body = translate_particle_1(fsm, cp, head)))
	    return 0;
	if(!AddEdge(head, body, Epsilon))
	    return 0;
	if(!AddEdge(head, next, Epsilon))
	    return 0;
	return head;

    case '+':
	/* head must enter the body; after it, loop either repeats the
	   body or moves on to next */
	if(!(head = AddNode(fsm)))
	    return 0;
	if(!(loop = AddNode(fsm)))
	    return 0;
	if(!(body = translate_particle_1(fsm, cp, loop)))
	    return 0;
	if(!AddEdge(head, body, Epsilon))
	    return 0;
	if(!AddEdge(loop, body, Epsilon))
	    return 0;
	if(!AddEdge(loop, next, Epsilon))
	    return 0;
	return head;

    case '?':
	/* head either enters the body, which leads to next, or skips
	   straight to next */
	if(!(head = AddNode(fsm)))
	    return 0;
	if(!(body = translate_particle_1(fsm, cp, next)))
	    return 0;
	if(!AddEdge(head, body, Epsilon))
	    return 0;
	if(!AddEdge(head, next, Epsilon))
	    return 0;
	return head;

    default:
	break;
    }

    return 0;			/* can't happen */
}
5322 
/* Check that the content model of element is deterministic, starting
   from the start node of its FSM.  The busy marks used by the traversal
   are cleared again before returning.  Returns the result of
   check_deterministic_1. */

static int check_deterministic(Parser p, ElementDefinition element)
{
    FSM fsm = element->fsm;
    int result = check_deterministic_1(p, element, fsm->start_node, 0);

    UnMarkFSM(fsm, busy);

    return result;
}
5331 
/* Check node, and the nodes reachable from it, for pairs of outgoing
   edges with the same label.  At most one validity error is reported per
   node.  previous is the label of the edge by which node was reached, or
   null at the start of the content; it is used only in the message.
   Nodes already marked busy are not visited again. */

static int check_deterministic_1(Parser p, ElementDefinition element,
				 FSMNode node, ElementDefinition previous)
{
    int cur, prior, reported = 0;
    FSMEdge edge;
    Char empty_string[] = {0};

    if(node->mark & busy)
	return 0;
    node->mark |= busy;

    /* Does this node have two or more edges labelled the same?
       The first edge has nothing before it to clash with. */

    for(cur = 1; cur < VectorCount(node->edges) && !reported; cur++)
    {
	edge = node->edges[cur];
	for(prior = 0; prior < cur && !reported; prior++)
	{
	    if(node->edges[prior]->label != edge->label)
		continue;

	    require(validity_error(p,
		 "Content model for %S is not deterministic.   %s%S "
		 "there are multiple choices when the next element is %S.",
		 element->name,
		 previous ? "After element " : "At start of content",
		 previous ? previous->name : empty_string,
		 ((ElementDefinition)edge->label)->name));
	    reported = 1;	/* Don't report more errors for this node */
	}
    }

    /* Check its children */
    for(cur = 0; cur < VectorCount(node->edges); cur++)
    {
	edge = node->edges[cur];
	require(check_deterministic_1(p, element, edge->destination,
				      (ElementDefinition)edge->label));
    }

    return 0;
}
5374 
/* Validate the value of attribute a on element e: check its syntax
   against the declared type, compare it with the declared value if the
   attribute is #FIXED, and apply the xml:lang check if a is the
   element's xml:lang attribute. */

static int validate_attribute(Parser p, AttributeDefinition a, ElementDefinition e, const Char *value)
{
    int is_fixed = (a->default_type == DT_fixed);

    require(check_attribute_syntax(p, a, e, value, "attribute", 1));

    if(is_fixed && Strcmp(value, a->default_value) != 0)
    {
	require(validity_error(p,
			       "The attribute %S of element %S does not "
			       "match the declared #FIXED value",
			       a->name, e->name));
    }

    if(e->xml_lang_attribute == a)
    {
	require(validate_xml_lang_attribute(p, e, value));
    }

    return 0;
}
5395 
/*
 * Check the value of an xml:lang attribute.
 *
 * The check itself is compiled out (see below), so this function
 * currently does nothing and always returns 0.
 */
static int validate_xml_lang_attribute(Parser p, ElementDefinition e, const Char *value)
{
    /* 1.1 will allow empty xml:lang values (and maybe 1.0 will be amended
       to), and it no longer seems worth checking anything here. */
#if 0
    /* Disabled check: accepts a language code, optionally followed by
       "-" and an alphabetic subcode, and warns about anything else */
    const Char *t;

    /* Look for the Langcode */

    if((value[0] == 'i' || value[0] == 'I' ||
	value[0] == 'x' || value[0] == 'X') &&
       value[1] == '-')
    {
	/* IANA or user code */

	if(!is_ascii_alpha(value[2]))
	    goto bad;
	for(t = value+3; is_ascii_alpha(*t); t++)
	    ;

    }
    else if(is_ascii_alpha(value[0]) && is_ascii_alpha(value[1]))
    {
	/* ISO639 code */
	t = value+2;
    }
    else
	goto bad;

    /* Look for a subcode */

    if(!*t)
	return 0;
    if(t[0] != '-' || !is_ascii_alpha(t[1]))
	goto bad;

    for(t=t+2; is_ascii_alpha(*t); t++)
	;

    if(!*t)
	return 0;

 bad:
    /* Not a validity error since erratum 73 */
    warn(p, "Dubious xml:lang attribute for element %S", e->name);
#endif
    return 0;
}
5444 
/* Check an attribute matches Name[s] or Nmtoken[s].
   Assume it has already been normalised (no leading or trailing
   whitespace, other whitespace normalised to single space).

     a, e      the attribute and the element it belongs to
     value     the normalised value to check
     message   describes what is being checked; used in error messages
     real_use  passed on to check_attribute_token

   CDATA attributes are not checked.  For other types the value is split
   at spaces and each token is passed to check_attribute_token.

   Returns 0 if the value was checked (including cases where a validity
   error was reported but not treated as fatal); otherwise returns
   whatever require() returns for a failed check. */

static int check_attribute_syntax(Parser p, AttributeDefinition a, ElementDefinition e, const Char *value, const char *message, int real_use)
{
    /* Types whose tokens are Nmtokens, so need no name start character */
    int nmchar = (a->type == AT_nmtoken || a->type == AT_nmtokens ||
		  a->type == AT_enumeration);
    /* Types whose value may hold several space-separated tokens */
    int multiple = (a->type == AT_nmtokens || a->type == AT_entities ||
		    a->type == AT_idrefs);

    /* start is the beginning of the token currently being scanned */
    const Char *q, *start = value;

    if(a->type == AT_cdata)
	return 0;		/* Nothing to check */

    if(!*value)
    {
	require(validity_error(p, "The %s %S of element %S "
			          "is declared as %s but is empty",
			       message, a->name, e->name,
			       AttributeTypeName[a->type]));
	return 0;
    }

    for(q=value; *q; q++)
    {
	/* The first character of a name token must be a name start
	   character */
	if(!nmchar && q == start && !is_xml_namestart(*q, p->map))
	{
	    require(validity_error(p, "The %s %S of element %S "
				   "is declared as %s but contains a token "
				   "that does not start with a name start character",
				   message, a->name, e->name,
				   AttributeTypeName[a->type]));
	    return 0;
	}

	if(*q == ' ')
	{
	    /* End of a token: check it, then start the next one after
	       the space */
	    require(check_attribute_token(p, a, e, start, q-start, message,
					  real_use));
	    start = q+1;

	    if(!multiple)
	    {
		require(validity_error(p, "The %s %S of element %S "
				          "is declared as %s but "
				          "contains more than one token",
				       message, a->name, e->name,
				       AttributeTypeName[a->type]));
	    }
	}
	else if(!is_xml_namechar(*q, p->map))
	{
	    require(validity_error(p, "The %s %S of element %S is declared "
				      "as %s but contains a character which "
				      "is not a name character",
				   message, a->name, e->name,
				   AttributeTypeName[a->type]));
	    return 0;
	}
    }

    /* Check the last (or only) token */
    return check_attribute_token(p, a, e, start, q-start, message, real_use);
}
5510 
/* Check one token of an attribute value against the constraints of the
   attribute's declared type.

   value points at the first character of the token and length is the
   number of characters in it.  The token is not necessarily
   nul-terminated (it may be followed by further tokens of the same
   attribute value), so it must always be printed with %.*S and length.

   message is inserted into diagnostics as a description of what is being
   checked.  real_use is non-zero when the value is actually in use; when
   it is zero the entity and ID/IDREF checks are skipped.

   Returns 0 when checking completes.  Failures are returned through
   require() and error(), which are defined elsewhere in this file. */

static int check_attribute_token(Parser p, AttributeDefinition a, ElementDefinition e, const Char *value, int length, const char *message, int real_use)
{
    Entity entity;
    NotationDefinition notation;
    int i, found;
    HashEntry id_entry;

    switch(a->type)
    {
    case AT_entity:
    case AT_entities:
	if(!real_use)
	    return 0;		/* don't check defaults unless they're used */
	/* XXX Should maybe check for colons, but it must be invalid anyway
	   because otherwise the declaration would have been not-nwf */
	entity = FindEntityN(p->dtd, value, length, 0);
	if(!entity)
	{
	    require(validity_error(p, "In the %s %S of element %S, "
				      "entity %.*S is undefined",
				   message, a->name, e->name, length, value));
	}
	else if(!entity->notation)
	{
	    /* An entity without a notation is reported as "not unparsed" */
	    require(validity_error(p, "In the %s %S of element %S, "
				      "entity %.*S is not unparsed",
				   message, a->name, e->name, length, value));
	}
	break;
    case AT_id:
	if(!a->declared)
	    /* don't validate undeclared xml:id attributes */
	    return 0;
	/* fall through */
    case AT_idref:
    case AT_idrefs:
	if(!real_use)
	    return 0;		/* don't check defaults unless they're used */
	/* IDs and IDREFs share one table.  The value stored for each name
	   is a small bit set; bit 1 records that the name has been seen
	   as the value of a declared ID attribute. */
	id_entry = hash_find_or_add(p->id_table, value, length*sizeof(Char),
				    &found);
	if(!id_entry)
	    return error(p, "System error");
	if(!found)
	{
	    /* First sighting of this name: store 1 for an ID, 0 for a
	       reference. */
	    hash_set_value(id_entry, (void *)(a->type == AT_id));
	    if(ParserGetFlag(p, XMLNamespaces))
		for(i=0; i<length; i++)
		    if(value[i] == ':')
		    {
			require(namespace_validity_error(p, "ID %.*S contains colon", length, value));
		    }
	}
	else if(a->type == AT_id)
	{
	    int idinfo = (int)hash_get_value(id_entry);
	    if(idinfo & 1)
	    {
		require(validity_error(p, "Duplicate ID attribute value %.*S",
				       length, value));
	    }
	    else
	    {
		/* Bit 2 is only tested here; presumably it is set by the
		   xml:id processing elsewhere in the file. */
		if(idinfo & 2)
		    /* Print exactly this token: value is delimited by
		       length, not by a nul. */
		    warn(p, "xml:id error: duplicate ID attribute value %.*S",
			 length, value);
		hash_set_value(id_entry, (void *)(idinfo | 1));
	    }
	}
	break;
    case AT_notation:
	/* XXX Should maybe check for colons, but it must be invalid anyway
	   because otherwise the declaration would have been not-nwf */
	notation = FindNotationN(p->dtd, value, length);
	if(!notation)
	{
	    require(validity_error(p, "In the %s %S of element %S, "
				      "notation %.*S is undefined",
				   message, a->name, e->name, length, value));
	    break;
	}
	/* A declared notation must also be one of the allowed values */
	/* fall through */
    case AT_enumeration:
	/* Look for an allowed value that matches the token exactly: the
	   first length characters agree and the allowed value ends there. */
	for(i=0; a->allowed_values[i]; i++)
	    if(Strncmp(value, a->allowed_values[i], length) == 0 &&
	       a->allowed_values[i][length] == 0)
		break;
	if(!a->allowed_values[i])
	{
	    require(validity_error(p, "In the %s %S of element %S, "
				      "%.*S is not one of the allowed values",
				   message, a->name, e->name, length, value));
	}
	break;
    default:
	/* Nothing to check */
	break;
    }

    return 0;
}
5610 
5611 #if not_yet
/* Re-prefix the DTD using the prefix of the given name.

   The prefix is the part of name (namelen characters long) before its
   first colon; if name contains no colon, a null prefix is passed.
   The prefix is handed to ReprefixDtd() together with p->magic_prefix.

   Returns 0 on success.  Failures are returned through error() and
   require(), which are defined elsewhere in this file. */

static int magically_transform_dtd(Parser p, Char *name, int namelen)
{
    int i, result;
    Char *prefix;

    /* Find the first colon, if any */
    for(i=0; i<namelen; i++)
	if(name[i] == ':')
	    break;

    if(i < namelen)
    {
	if(!(prefix = Strndup(name, i)))
	    return error(p, "System error");
    }
    else
	prefix = 0;

    result = ReprefixDtd(p->dtd, p->magic_prefix, prefix);

    /* Free our copy of the prefix before testing the result, so that it
       is not leaked when ReprefixDtd() fails.  (Assumes ReprefixDtd()
       does not keep the pointer; the success path has always freed it
       here.) */
    Free(prefix);

    require(result);

    return 0;
}
5635 #endif
5636 
5637