1 /* $Id: xmlparser.c,v 1.135 2008/10/03 13:07:51 richard Exp $
2 */
3
4 #define DEBUG_FSM 0
5
6 #ifndef lint
7 static char vcid[] = "$Id: xmlparser.c,v 1.135 2008/10/03 13:07:51 richard Exp $";
8 #endif /* lint */
9
10 /*
11 * XML (and nSGML) parser.
12 * Author: Richard Tobin.
13 */
14
15 #include <stdarg.h>
16 #include <stdlib.h>
17
18 #ifdef FOR_LT
19
20 #include "lt-memory.h"
21 #include "nsllib.h"
22
23 #define Malloc salloc
24 #define Realloc srealloc
25 #define Free sfree
26
27 #else
28
29 #include "system.h"
30
31 #endif
32
33 #include "charset.h"
34 #include "string16.h"
35 #include "ctype16.h"
36 #include "dtd.h"
37 #include "input.h"
38 #include "stdio16.h"
39 #include "url.h"
40 #include "namespaces.h"
41 #include "xmlparser.h"
42
43 #ifdef FOR_LT
44
45 #include "lt-hash.h"
46
47 typedef HashList *HashEntry;
48 typedef HashList HashEntryStruct;
49 #define create_hash_table NewSizedHashStruct
50 #define free_hash_table(table) FreeHashStructM((table), 1)
51 #define hash_map MapHashLists1
52 #define hash_set_value(entry, value) ((entry)->index = (value))
53 #define hash_get_value(entry) ((entry)->index)
54 #define hash_get_key(entry) ((entry)->word)
55 #define hash_get_key_len(entry) ((entry)->length * sizeof(Char))
56 #define HashMapRetType boolean
57
hash_find_or_add(HashTable table,const Char * key,int key_len,int * foundp)58 static HashEntry hash_find_or_add(HashTable table, const Char *key,
59 int key_len, int *foundp)
60 {
61 HashEntry entry;
62
63 key_len /= sizeof(Char);
64 entry = FindWordInTableX(table, key, key_len);
65 if(!entry)
66 {
67 *foundp = 0;
68 entry = AddWordToTableXM(table, key, key_len);
69 if(!entry)
70 return 0;
71 }
72 else
73 *foundp = 1;
74
75 return entry;
76 }
77
78 #else
79
80 #include "hash.h"
81
82 #define hash_set_value(entry, _value) ((entry)->value = (_value))
83 #define hash_get_value(entry) ((entry)->value)
84 #define hash_get_key(entry) ((entry)->key)
85 #define hash_get_key_len(entry) ((entry)->key_len)
86 #define HashMapRetType void
87
88 #endif
89
90 static int transcribe(Parser p, int back, int count);
91 static void pop_while_at_eoe(Parser p);
92 static void maybe_uppercase(Parser p, Char *s);
93 static void maybe_uppercase_name(Parser p);
94 static int str_maybecase_cmp8(Parser p, const char8 *a, const char8 *b);
95 static int is_ascii_alpha(int c);
96 static int is_ascii_digit(int c);
97 static int parse_external_id(Parser p, int required,
98 char8 **publicid, char8 **systemid,
99 int preq, int sreq);
100 static int parse_conditional(Parser p, Entity ent);
101 static int parse_notation_decl(Parser p, Entity ent);
102 static int parse_entity_decl(Parser p, Entity ent, int line, int chpos,
103 Entity ext_ent);
104 static int parsing_internal(Parser p);
105 static int parsing_external_subset(Parser p);
106 static int parse_attlist_decl(Parser p, Entity ent);
107 static int parse_element_decl(Parser p, Entity ent);
108 static ContentParticle parse_cp(Parser p);
109 static ContentParticle parse_choice_or_seq(Parser p, Entity ent);
110 static ContentParticle parse_choice_or_seq_1(Parser p, int nchildren,
111 char sep, Entity ent);
112 static int check_content_decl(Parser p, ContentParticle cp);
113 static int check_content_decl_1(Parser p, ContentParticle cp);
114 static Char *stringify_cp(ContentParticle cp);
115 static void print_cp(ContentParticle cp, FILE16 *f);
116 static int size_cp(ContentParticle cp);
117 static int check_qualname_syntax(Parser p, const Char *name, const char *type);
118 static int parse_reference(Parser p, int pe, int expand, int allow_external);
119 static int parse_character_reference(Parser p, int expand);
120 static const char8 *escape(int c, char8 *buf);
121 static int parse_name(Parser p, const char8 *where);
122 static int parse_nmtoken(Parser p, const char8 *where);
123 static int looking_at(Parser p, const char8 *string);
124 static void clear_xbit(XBit xbit);
125 static int expect(Parser p, int expected, const char8 *where);
126 static int expect_dtd_whitespace(Parser p, const char8 *where);
127 static void skip_whitespace(InputSource s);
128 static int skip_dtd_whitespace(Parser p, int allow_pe);
129 static int parse_cdata(Parser p);
130 static int process_nsl_decl(Parser p);
131 static int process_xml_decl(Parser p);
132 static int is_v1x(const char *version);
133 static int parse_dtd(Parser p);
134 static int read_markupdecls(Parser p);
135 static int error(Parser p, const char8 *format, ...);
136 static int warn(Parser p, const char8 *format, ...);
137 static void verror(char8 *buf, int size, XBit bit, const char8 *format, va_list args);
138 enum literal_type {
139 LT_cdata_attr, LT_tok_attr, LT_plain, LT_entity, LT_param_entity,
140 LT_pubid
141 };
142 static int parse_string(Parser p, const char8 *where, enum literal_type type, int *normalised);
143 static int parse_pi(Parser p, Entity ent);
144 static int parse_comment(Parser p, int skip, Entity ent);
145 static int parse_pcdata(Parser p);
146 static int parse_starttag(Parser p);
147 Namespace LookupNamespace(NamespaceBinding dictionary, const Char *prefix);
148 static int process_namespace(Parser p,
149 AttributeDefinition d, const Char *value);
150 static int parse_attribute(Parser p);
151 static WhiteSpaceMode process_xml_space(Parser p, const Char *value);
152 static int parse_endtag(Parser p);
153 static int parse_markup(Parser p);
154 static int parse(Parser p);
155 static int parse_markupdecl(Parser p);
156 static int validate_dtd(Parser p);
157 static int validate_final(Parser p);
158 static HashMapRetType check_id(const HashEntryStruct *id_entry, void *p);
159 static int validate_attribute(Parser p, AttributeDefinition a, ElementDefinition e, const Char *value);
160 static int validate_xml_lang_attribute(Parser p, ElementDefinition e, const Char *value);
161 static int check_attribute_syntax(Parser p, AttributeDefinition a, ElementDefinition e, const Char *value, const char *message, int real_use);
162 static int check_attribute_token(Parser p, AttributeDefinition a, ElementDefinition e, const Char *value, int length, const char *message, int real_use);
163 #if not_yet
164 static int magically_transform_dtd(Parser p, Char *name, int namelen);
165 #endif
166
167 static struct element_definition pcdata_element;
168 const ElementDefinition Epsilon = 0, PCDataElement = &pcdata_element;
169
170 static FSM NewFSM(void);
171 void FreeFSM(FSM fsm);
172 static FSMNode AddNode(FSM fsm);
173 static FSMEdge AddEdge(FSMNode source, FSMNode destination, void *label);
174 static void UnMarkFSM(FSM fsm, int value);
175 static void DeleteNode(FSMNode node);
176 static void DeleteEdge(FSMEdge edge);
177 static void CleanupFSM(FSM fsm);
178 static void CleanupNode(FSMNode node);
179
180 #if DEBUG_FSM
181 static void PrintFSM(FILE16 *out, FSM fsm, int relabelled);
182 #endif
183 static int SimplifyFSM(FSM fsm);
184 static int add_epsilon_closure(FSMNode base, FSMNode node);
185 static FSMNode translate_particle(FSM fsm, ContentParticle cp, FSMNode next);
186 static FSMNode translate_particle_1(FSM fsm, ContentParticle cp, FSMNode next);
187 static FSMNode validate_content(FSMNode context, ElementDefinition e);
188 static int check_deterministic(Parser p, ElementDefinition element);
189 static int check_deterministic_1(Parser p, ElementDefinition element,
190 FSMNode node, ElementDefinition previous);
191
192 #define validity_error (p->seen_validity_error=1, ParserGetFlag(p, ErrorOnValidityErrors) ? error : warn)
193
194 #define namespace_error error
195 #define namespace_validity_error validity_error
196
/*
 * Early-exit helpers: if x evaluates to a negative value, return -1
 * (require) or 0 (require0) from the enclosing function.  The argument
 * is parenthesised so that expressions containing operators of lower
 * precedence than >= are compared as a whole.
 */
#define require(x) if((x) >= 0) {} else return -1
#define require0(x) if((x) >= 0) {} else return 0

/* Zero a buffer pointer and its companion <buf>size variable (nothing is freed). */
#define Consume(buf) (buf = 0, buf##size = 0)

/*
 * Ensure <buf>size is at least sz+1; otherwise Realloc buf to hold sz+1
 * Chars.  On allocation failure, returns error(p, "System error") from the
 * enclosing function, so a Parser named p must be in scope.
 * Note that sz is evaluated more than once.
 */
#define ExpandBuf(buf, sz) \
    if(buf##size >= (sz)+1) {} else if((buf = Realloc(buf, (buf##size = (sz) + 1) * sizeof(Char)))) {} else return error(p, "System error")

/*
 * Copy the current name (p->name, p->namelen Chars) into a freshly
 * allocated, zero-terminated buffer assigned to n.  On allocation failure
 * the enclosing function returns the result of error(p, "System error").
 * The expansion already ends in a semicolon.
 */
#define CopyName(n) if((n = Malloc((p->namelen + 1)*sizeof(Char)))) {memcpy(n, p->name, p->namelen * sizeof(Char)); n[p->namelen] = 0;} else return error(p, "System error");

/* As CopyName, but the enclosing function returns 0 on allocation failure. */
#define CopyName0(n) if((n = Malloc((p->namelen + 1)*sizeof(Char)))) {memcpy(n, p->name, p->namelen * sizeof(Char)); n[p->namelen] = 0;} else {error(p, "System error"); return 0;}
207
208 #define ifNF16wrong(p,b,l) if((p)->checker && NF16wrong==nf16checkL((p)->checker, (p)->source->line + (p)->source->next - (b), (l)))
209 #define NF16StartCheck(p) if((p)->checker) nf16checkStart((p)->checker)
210 #define NF16noStartCheck(p) if((p)->checker) nf16checkNoStart((p)->checker)
211
212 #if CHAR_SIZE == 8
213 #define tochar8(s) s
214 #define duptochar8(s) strdup8(s)
215 #else
216 #define tochar8(s) (p->transbuf = translate_utf16_latin1_m(s, p->transbuf))
217 #define duptochar8(s) translate_utf16_latin1_m(s, 0)
218 #endif
219
220 const char8 *XBitTypeName[XBIT_enum_count] = {
221 "dtd",
222 "start",
223 "empty",
224 "end",
225 "eof",
226 "pcdata",
227 "pi",
228 "comment",
229 "cdsect",
230 "error",
231 "warning",
232 "none"
233 };
234
235 static Entity xml_builtin_entity;
236 static Entity xml_predefined_entities;
237
238 static int parser_initialised = 0;
239
240 static Char xml_ns[] = {'h','t','t','p',':','/','/','w','w','w','.','w','3',
241 '.','o','r','g','/','X','M','L','/','1','9','9','8',
242 '/','n','a','m','e','s','p','a','c','e',0};
243 static Char xmlns_ns[] = {'h','t','t','p',':','/','/','w','w','w','.','w', '3',
244 '.','o','r','g','/','2','0','0','0','/','x', 'm','l',
245 'n','s','/',0};
246
init_parser(void)247 int init_parser(void)
248 {
249 Entity e, f;
250 int i;
251 static const Char lt[] = {'l','t',0}, ltval[] = {'&','#','6','0',';',0};
252 static const Char gt[] = {'g','t',0}, gtval[] = {'>',0};
253 static const Char amp[] = {'a','m','p',0},
254 ampval[] = {'&','#','3','8',';',0};
255 static const Char apos[] = {'a','p','o','s',0}, aposval[] = {'\'',0};
256 static const Char quot[] = {'q','u','o','t',0}, quotval[] = {'"',0};
257 static const Char *builtins[5][2] = {
258 {lt, ltval}, {gt, gtval}, {amp, ampval},
259 {apos, aposval}, {quot, quotval}
260 };
261
262 if(parser_initialised)
263 return 0;
264 parser_initialised = 1;
265
266 if(init_charset() == -1 ||
267 init_ctype16() == -1 ||
268 init_stdio16() == -1 ||
269 init_url() == -1 ||
270 init_namespaces() == -1)
271 return -1;
272
273 xml_builtin_entity = NewInternalEntity(0, 0, 0, 0, 0, 0);
274
275 for(i=0, f=0; i<5; i++, f=e)
276 {
277 e = NewInternalEntity(builtins[i][0], builtins[i][1],
278 xml_builtin_entity, 0, 0, 0);
279 if(!e)
280 return -1;
281 e->next = f;
282 }
283
284 xml_predefined_entities = e;
285
286 return 0;
287 }
288
deinit_parser(void)289 void deinit_parser(void)
290 {
291 Entity e, f;
292
293 if(!parser_initialised)
294 return;
295 parser_initialised = 0;
296
297 deinit_charset();
298 deinit_ctype16();
299 deinit_stdio16();
300 deinit_namespaces();
301 deinit_url();
302
303 for(e = xml_predefined_entities; e; e=f)
304 {
305 f = e->next;
306 e->text = 0; /* it wasn't malloced so we mustn't free it */
307 FreeEntity(e);
308 }
309
310 FreeEntity(xml_builtin_entity);
311 }
312
/*
 * Consume XML whitespace characters from s.  Stops at the first
 * non-whitespace character or at end-of-entity, which is pushed back.
 */
static void skip_whitespace(InputSource s)
{
    for(;;)
    {
        int ch = get(s);

        if(ch == XEOE || !is_xml_whitespace(ch))
            break;
    }

    /* put back whatever ended the run */
    unget(s);
}
321
322 /*
323 * Skip whitespace and (optionally) the start and end of PEs. Return 1 if
324 * there actually *was* some whitespace or a PE start/end, -1 if
325 * an error occurred, 0 otherwise.
326 */
327
skip_dtd_whitespace(Parser p,int allow_pe)328 static int skip_dtd_whitespace(Parser p, int allow_pe)
329 {
330 int c;
331 int got_some = 0;
332 InputSource s = p->source;
333
334 while(1)
335 {
336 c = get(s);
337
338 if(c == XEOE)
339 {
340 got_some = 1;
341 if(s->parent)
342 {
343 if(!allow_pe)
344 return error(p,
345 "PE end not allowed here in internal subset");
346 if(s->entity->type == ET_external)
347 p->external_pe_depth--;
348 ParserPop(p);
349 s = p->source;
350 }
351 else
352 {
353 unget(s); /* leave the final EOE waiting to be read */
354 return got_some;
355 }
356 }
357 else if(is_xml_whitespace(c))
358 {
359 got_some = 1;
360 }
361 else if(c == '%')
362 {
363 /* this complication is needed for <!ENTITY % ...
364 otherwise we could just assume it was a PE reference. */
365
366 c = get(s); unget(s);
367 if(c != XEOE && is_xml_namestart(c, p->map))
368 {
369 if(!allow_pe)
370 {
371 unget(s); /* For error position */
372 return error(p,
373 "PE ref not allowed here in internal subset");
374 }
375 require(parse_reference(p, 1, 1, 1));
376 s = p->source;
377 if(s->entity->type == ET_external)
378 p->external_pe_depth++;
379 got_some = 1;
380 }
381 else
382 {
383 unget(s);
384 return got_some;
385 }
386 }
387 else
388 {
389 unget(s);
390 return got_some;
391 }
392 }
393 }
394
/*
 * Read one character and check that it is `expected'.
 * Returns 0 on a match.  Otherwise the character is pushed back and an
 * error is reported: an input error if BADCHAR was read, else a message
 * naming the expected and actual characters and `where'.
 */
static int expect(Parser p, int expected, const char8 *where)
{
    InputSource src = p->source;
    int got = get(src);

    if(got == expected)
        return 0;

    unget(src);         /* For error position */

    if(got == BADCHAR)
        return error(p, "Input error: %s", src->error_msg);

    return error(p, "Expected %s %s, but got %s",
                 escape(expected, p->escbuf[0]), where,
                 escape(got, p->escbuf[1]));
}
414
415 /*
416 * Expects whitespace or the start or end of a PE.
417 */
418
expect_dtd_whitespace(Parser p,const char8 * where)419 static int expect_dtd_whitespace(Parser p, const char8 *where)
420 {
421 int r = skip_dtd_whitespace(p, p->external_pe_depth > 0);
422
423 if(r < 0)
424 return -1;
425
426 if(r == 0)
427 return error(p, "Expected whitespace %s", where);
428
429 return 0;
430 }
431
/*
 * Reset an XBit to the empty state: type XBIT_none and all the pointer
 * fields touched here set to 0.  Nothing is freed.
 */
static void clear_xbit(XBit xbit)
{
    xbit->type = XBIT_none;

    xbit->s1 = 0;
    xbit->S1 = 0;
    xbit->S2 = 0;

    xbit->attributes = 0;
    xbit->ns_dict = 0;
    xbit->element_definition = 0;
}
441
/*
 * Free the data hanging off an XBit (strings, owned namespace bindings,
 * attributes and their values) and reset it with clear_xbit().
 * The XBit structure itself is not freed.
 */
void FreeXBit(XBit xbit)
{
    Attribute attr, next_attr;

    if(xbit->S1)
        Free(xbit->S1);
    if(xbit->S2)
        Free(xbit->S2);

    /* s1 is left alone for error and warning bits */
    if(xbit->s1 && xbit->type != XBIT_error && xbit->type != XBIT_warning)
        Free(xbit->s1);

    if(xbit->ns_dict && xbit->nsowned)
    {
        /* Free nsc bindings, walking up the parent chain from ns_dict. */
        NamespaceBinding binding = xbit->ns_dict;
        int done;

        for(done = 0; done < xbit->nsc; done++)
        {
            NamespaceBinding up = binding->parent;

            Free(binding);
            binding = up;
        }
    }

    attr = xbit->attributes;
    while(attr)
    {
        next_attr = attr->next;
        if(attr->value)
            Free(attr->value);
        Free(attr);
        attr = next_attr;
    }

    clear_xbit(xbit);
}
470
/*
 * Returns 1 if the input matches string (and consumes the input).
 * Otherwise returns 0 and leaves the input stream where it was.
 * Case-sensitivity depends on the CaseInsensitive flag.
 * A space character at the end of string matches any (non-zero) amount of
 * whitespace; spaces are treated literally elsewhere.
 * Never reads beyond an end-of-line, except to consume
 * extra whitespace when the last character of string is a space.
 * Never reads beyond end-of-entity.
 */
481
static int looking_at(Parser p, const char8 *string)
{
    InputSource src = p->source;
    int start = src->next;      /* position to restore on a mismatch */
    int want, got;

    /* we got a bad character before, don't try again */
    if(p->state == PS_error)
        return 0;

    while((want = *string++) != 0)
    {
        /* We would go over a line end */
        if(at_eol(src))
            goto mismatch;

        got = get(src);

        if(got == BADCHAR)
        {
            error(p, "Input error: %s", src->error_msg);
            goto mismatch;
        }

        if(want == ' ' && *string == 0)
        {
            /* trailing space in the pattern: one or more whitespace */
            if(got == XEOE || !is_xml_whitespace(got))
                goto mismatch;
            skip_whitespace(src);
            continue;
        }

        if(ParserGetFlag(p, CaseInsensitive))
        {
            if(Toupper(got) != Toupper(want))
                goto mismatch;
        }
        else if(got != want)
            goto mismatch;
    }

    return 1;

mismatch:
    src->next = start;
    return 0;
}
524
/*
 * Parse a name at the current input position.  On success p->name points
 * at the name inside the source's line buffer, p->namelen holds its
 * length, and 0 is returned.  An error is reported (using `where' in the
 * message) if the input is bad, does not start with a name-start
 * character, or fails the name normalization check.
 */
static int parse_name(Parser p, const char8 *where)
{
    InputSource src = p->source;
    int ch, len;

    ch = get(src);
    if(ch == BADCHAR)
        return error(p, "Input error: %s", src->error_msg);

    if(ch == XEOE || !is_xml_namestart(ch, p->map))
    {
        unget(src);     /* For error position */
        error(p, "Expected name, but got %s %s",
              escape(ch, p->escbuf[0]), where);
        return -1;
    }

    /* Count the name characters that follow the start character. */
    len = 1;
    for(;;)
    {
        ch = get(src);
        if(ch == XEOE || !is_xml_namechar(ch, p->map))
            break;
        len++;
    }
    unget(src);

    p->name = src->line + src->next - len;
    p->namelen = len;

    NF16StartCheck(p);
    if(p->namechecker &&
       NF16wrong == nf16checkL(p->namechecker,
                               src->line + src->next - len, len))
        return error(p, "Name not normalized after %s", where);

    return 0;
}
557
/*
 * Parse an nmtoken (one or more name characters) at the current input
 * position.  On success p->name / p->namelen describe the token inside
 * the source's line buffer and 0 is returned.  An error is reported
 * (using `where' in the message) for bad input, an empty token, or a
 * token that fails the normalization check.
 */
static int parse_nmtoken(Parser p, const char8 *where)
{
    InputSource src = p->source;
    int ch, len;

    ch = get(src);
    if(ch == BADCHAR)
        return error(p, "Input error: %s", src->error_msg);

    for(len = 0; ch != XEOE && is_xml_namechar(ch, p->map); len++)
        ch = get(src);
    unget(src);

    if(len == 0)
        return error(p, "Expected nmtoken, but got %s %s",
                     escape(ch, p->escbuf[0]), where);

    p->name = src->line + src->next - len;
    p->namelen = len;

    NF16StartCheck(p);
    if(p->namechecker &&
       NF16wrong == nf16checkL(p->namechecker,
                               src->line + src->next - len, len))
        return error(p, "nmtoken not normalized after %s", where);

    return 0;
}
588
/* Escape a character for printing in an error message. */
590
escape(int c,char8 * buf)591 static const char8 *escape(int c, char8 *buf)
592 {
593 #if CHAR_SIZE == 8
594 if(c != XEOE)
595 c &= 0xff;
596 #endif
597
598 if(c == XEOE)
599 return "<EOE>";
600 else if(c >= 33 && c <= 126)
601 sprintf(buf, "%c", c);
602 else if(c == ' ')
603 sprintf(buf, "<space>");
604 else
605 sprintf(buf, "<0x%x>", c);
606
607 return buf;
608 }
609
NewParser(void)610 Parser NewParser(void)
611 {
612 Parser p;
613 static Char xml[] = {'x','m','l',0};
614
615 if(init_parser() == -1)
616 return 0;
617
618 p = Malloc(sizeof(*p));
619 if(!p)
620 return 0;
621 p->state = PS_prolog1;
622 p->seen_validity_error = 0;
623 p->document_entity = 0; /* Set at first ParserPush */
624 p->have_dtd = 0;
625 p->standalone = SDD_unspecified;
626 p->flags[0] = p->flags[1] = 0;
627 p->source = 0;
628 clear_xbit(&p->xbit);
629 #ifndef FOR_LT
630 p->xbit.nchildren = 0; /* These three should never be changed */
631 p->xbit.children = 0;
632 p->xbit.parent = 0;
633 #endif
634 p->pbufsize = p->pbufnext = 0;
635 p->pbuf = 0;
636 p->save_pbufsize = p->save_pbufnext = 0;
637 p->save_pbuf = 0;
638 p->transbuf = 0;
639
640 p->peeked = 0;
641 p->dtd = NewDtd();
642 p->dtd_callback = p->warning_callback = 0;
643 p->entity_opener = 0;
644 p->dtd_callback_arg = 0;
645 p->warning_callback_arg = 0;
646 p->entity_opener_arg = 0;
647 p->external_pe_depth = 0;
648
649 p->checker = 0;
650 p->namechecker = 0;
651
652 VectorInit(p->element_stack);
653
654 p->base_ns.parent = 0;
655 p->base_ns.prefix = xml;
656 p->base_ns.namespace =
657 FindNamespace(p->dtd->namespace_universe, xml_ns, 1);
658 if(!p->base_ns.namespace)
659 return 0;
660
661 p->id_table = create_hash_table(100);
662 if(!p->id_table)
663 return 0;
664
665 ParserSetFlag(p, XMLSyntax, 1);
666 ParserSetFlag(p, XMLPredefinedEntities, 1);
667 ParserSetFlag(p, XMLExternalIDs, 1);
668 ParserSetFlag(p, XMLMiscWFErrors, 1);
669 ParserSetFlag(p, ErrorOnUnquotedAttributeValues, 1);
670 ParserSetFlag(p, XMLLessThan, 1);
671 ParserSetFlag(p, ExpandGeneralEntities, 1);
672 ParserSetFlag(p, ExpandCharacterEntities, 1);
673 ParserSetFlag(p, NormaliseAttributeValues, 1);
674 ParserSetFlag(p, WarnOnRedefinitions, 1);
675 ParserSetFlag(p, TrustSDD, 1);
676 ParserSetFlag(p, ReturnComments, 1);
677 ParserSetFlag(p, MaintainElementStack, 1);
678 ParserSetFlag(p, XMLSpace, 0);
679 ParserSetFlag(p, XMLNamespaces, 0);
680 ParserSetFlag(p, XML11CheckNF, 0);
681 ParserSetFlag(p, XML11CheckExists, 0);
682
683 /* These are set here because LTXML sometimes pushes an internal
684 entity (for string reading), and the version-determining code
685 never gets run. */
686 p->xml_version = XV_1_0;
687 p->map = xml_char_map_105;
688
689 return p;
690 }
691
/*
 * Destroy a parser: pop every remaining input source, free the parser's
 * buffers, element stack, ID table and normalization checkers, then the
 * parser structure itself.  p->dtd is not freed here.
 */
void FreeParser(Parser p)
{
    /* ParserPop closes each source as it is popped */
    while(p->source != 0)
        ParserPop(p);

    Free(p->pbuf);
    Free(p->save_pbuf);
    Free(p->transbuf);
    Free(p->element_stack);

    free_hash_table(p->id_table);

    if(p->checker)
        nf16checkDelete(p->checker);
    if(p->namechecker)
        nf16checkDelete(p->namechecker);

    Free(p);
}
709
ParserRootSource(Parser p)710 InputSource ParserRootSource(Parser p)
711 {
712 InputSource s;
713
714 for(s=p->source; s && s->parent; s = s->parent)
715 ;
716
717 return s;
718 }
719
ParserRootEntity(Parser p)720 Entity ParserRootEntity(Parser p)
721 {
722 return ParserRootSource(p)->entity;
723 }
724
/* Record arg in the parser's dtd_callback_arg field. */
void ParserSetDtdCallbackArg(Parser p, void *arg)
{
    p->dtd_callback_arg = arg;
}
729
/* Record arg in the parser's warning_callback_arg field. */
void ParserSetWarningCallbackArg(Parser p, void *arg)
{
    p->warning_callback_arg = arg;
}
734
/* Record arg in the parser's entity_opener_arg field. */
void ParserSetEntityOpenerArg(Parser p, void *arg)
{
    p->entity_opener_arg = arg;
}
739
/* Install cb as the parser's DTD callback (0 clears it). */
void ParserSetDtdCallback(Parser p, CallbackProc cb)
{
    p->dtd_callback = cb;
}
744
/* Install cb as the parser's warning callback (0 clears it). */
void ParserSetWarningCallback(Parser p, CallbackProc cb)
{
    p->warning_callback = cb;
}
749
/* Install opener as the parser's entity opener (0 clears it). */
void ParserSetEntityOpener(Parser p, EntityOpenerProc opener)
{
    p->entity_opener = opener;
}
754
755 #ifndef FOR_LT
756
ReadXTree(Parser p)757 XBit ReadXTree(Parser p)
758 {
759 XBit bit, tree, child;
760 XBit *children;
761
762 bit = ReadXBit(p);
763
764 switch(bit->type)
765 {
766 case XBIT_error:
767 return bit;
768
769 case XBIT_start:
770 if(!(tree = Malloc(sizeof(*tree))))
771 {
772 error(p, "System error");
773 return &p->xbit;
774 }
775 *tree = *bit;
776 while(1)
777 {
778 child = ReadXTree(p);
779 switch(child->type)
780 {
781 case XBIT_error:
782 FreeXTree(tree);
783 return child;
784
785 case XBIT_eof:
786 FreeXTree(tree);
787 {
788 error(p, "EOF in element");
789 return &p->xbit;
790 }
791
792 case XBIT_end:
793 if(child->element_definition != tree->element_definition)
794 {
795 const Char *name1 = tree->element_definition->name,
796 *name2 = child->element_definition->name;
797 FreeXTree(tree);
798 FreeXTree(child);
799 error(p, "Mismatched end tag: expected </%S>, got </%S>",
800 name1, name2);
801 return &p->xbit;
802 }
803 /* Transfer ns records to start bit so that ns gets freed
804 when the tree is freed, rather than now. */
805 tree->nsowned = child->nsowned;
806 child->nsowned = 0;
807 FreeXTree(child);
808 return tree;
809
810 default:
811 children = Realloc(tree->children,
812 (tree->nchildren + 1) * sizeof(XBit));
813 if(!children)
814 {
815 FreeXTree(tree);
816 FreeXTree(child);
817 error(p, "System error");
818 return &p->xbit;
819 }
820 child->parent = tree;
821 children[tree->nchildren] = child;
822 tree->nchildren++;
823 tree->children = children;
824 break;
825 }
826 }
827
828 default:
829 if(!(tree = Malloc(sizeof(*tree))))
830 {
831 error(p, "System error");
832 return &p->xbit;
833 }
834 *tree = *bit;
835 return tree;
836 }
837 }
838
/*
 * Free a tree returned by ReadXTree(): all children recursively, the
 * children array, the bit's own data, and finally the node itself,
 * unless it is an error bit.
 */
void FreeXTree(XBit tree)
{
    /* Remember this now: FreeXBit() resets the type. */
    int is_error = (tree->type == XBIT_error);
    int n;

    for(n = 0; n < tree->nchildren; n++)
        FreeXTree(tree->children[n]);

    Free(tree->children);

    FreeXBit(tree);

    /* error "trees" are always in the Parser structure, not malloced */
    if(!is_error)
        Free(tree);
}
857
858 #endif /* (not) FOR_LT */
859
ReadXBit(Parser p)860 XBit ReadXBit(Parser p)
861 {
862 if(p->peeked)
863 p->peeked = 0;
864 else
865 parse(p);
866
867 return &p->xbit;
868 }
869
PeekXBit(Parser p)870 XBit PeekXBit(Parser p)
871 {
872 if(p->peeked)
873 error(p, "Attempt to peek twice");
874 else
875 {
876 parse(p);
877 p->peeked = 1;
878 }
879
880 return &p->xbit;
881 }
882
/*
 * Make `source' the parser's current input source, stacked on top of the
 * previous one.  The first source pushed defines the document entity.
 *
 * For external entities the character encoding is determined and any
 * leading <?NSL ...?>, XML or text declaration is processed; for the
 * document entity the XML version then selects the character map.
 *
 * Returns 0 on success, or a negative value after reporting an error.
 */
int ParserPush(Parser p, InputSource source)
{
    Entity ent = source->entity;

    if(!p->source && !p->document_entity)
        p->document_entity = ent;

    source->parent = p->source;
    p->source = source;

    /* Internal entities need none of the processing below. */
    if(ent->type == ET_internal)
        return 0;

    if(ent != p->document_entity)
        source->map = p->map;

    /* Look at first few bytes of external entities to guess encoding,
       then look for an XMLDecl or TextDecl. */

    /* Check encoding even if we have already determined it for this
       entity, because otherwise we might leave a BOM unread. */
    determine_character_encoding(source);

#if CHAR_SIZE == 8
    if(!EncodingIsAsciiSuperset(ent->encoding))
        return error(p, "Unsupported character encoding %s",
                     CharacterEncodingName[ent->encoding]);
#else
    if(ent->encoding == CE_unknown)
        return error(p, "Unknown character encoding");
#endif

    get(source); unget(source); /* To get the first line read */

    if(looking_at(p, "<?NSL "))
    {
        require(process_nsl_decl(p));
        source->read_carefully = 0;
        return 0;
    }

    if(looking_at(p, "<?xml "))
    {
        require(process_xml_decl(p));

        if(ent == p->document_entity)
        {
            if(!ent->version_decl)
                return error(p, "XML declaration in document entity lacked "
                                "version number");
        }
        else
        {
            if(ent->standalone_decl != SDD_unspecified)
                return error(p, "Standalone attribute not allowed except in "
                                "document entity");
            if(ent->encoding_decl == CE_unknown)
                return error(p, "Encoding declaration is required in text "
                                "declaration");
        }
    }
    else if(looking_at(p, "<?xml?"))
        return error(p, "Empty XML or text declaration");
    else if(looking_at(p, "<?XML "))
        return error(p, "Wrong case XML declaration, must be <?xml ...");
    else if(p->state == PS_error)       /* looking_at may have set it */
        return -1;

    source->read_carefully = 0;

    if(ent == p->document_entity)
    {
        /* The document entity fixes the version for the whole parse. */
        p->xml_version = ent->xml_version;

        if(p->xml_version >= XV_1_1)
        {
            ParserSetFlag(p, XML11Syntax, 1);
#if CHAR_SIZE == 16
            p->map = xml_char_map_11;
            /* XXX is this the best place to do this? */
            if(ParserGetFlag(p, XML11CheckNF))
            {
                p->checker = nf16checkNew(ParserGetFlag(p, XML11CheckExists));
                NF16StartCheck(p);
                p->namechecker =
                    nf16checkNew(ParserGetFlag(p, XML11CheckExists));
            }
#endif
        }
        else if(ParserGetFlag(p, Pre105Chars))
            p->map = xml_char_map;
        else
            p->map = xml_char_map_105;

        source->map = p->map;
    }
    else if(ent->xml_version > p->xml_version)
    {
        const char8 *doc_ver = p->document_entity->version_decl ?
            p->document_entity->version_decl : "1.0";

        if(ParserGetFlag(p, XMLStrictWFErrors))
            return error(p, "Referenced entity has later version number "
                            "(%s) than document entity (%s)",
                         ent->version_decl, doc_ver);

        warn(p, "Referenced entity has later version number "
                "(%s) than document entity (%s)",
             ent->version_decl, doc_ver);
    }

    return 0;
}
996
/*
 * Remove the current input source from the parser's stack, making its
 * parent current, and close the removed source.
 */
void ParserPop(Parser p)
{
    InputSource top = p->source;

    p->source = top->parent;
    SourceClose(top);
}
1006
1007 /* Returns true if the source is at EOE. If so, the EOE will have been read. */
1008
at_eoe(InputSource s)1009 static int at_eoe(InputSource s)
1010 {
1011 if(!at_eol(s))
1012 return 0;
1013 if(s->seen_eoe || get_with_fill(s) == XEOE)
1014 return 1;
1015 unget(s);
1016 return 0;
1017 }
1018
1019 /* Pops any sources that are at EOE. Leaves source buffer with at least
1020 one character in it (except at EOF, where it leaves the EOE unread). */
1021
pop_while_at_eoe(Parser p)1022 static void pop_while_at_eoe(Parser p)
1023 {
1024 while(1)
1025 {
1026 InputSource s = p->source;
1027
1028 if(!at_eoe(s))
1029 return;
1030 if(!s->parent)
1031 {
1032 unget(s);
1033 return;
1034 }
1035 ParserPop(p);
1036 }
1037 }
1038
/*
 * Set (value non-zero) or clear (value zero) a parser flag.  Flags are
 * kept 32 to a word in p->flags.  Changing XMLPredefinedEntities also
 * attaches the predefined entity list to the Dtd, or detaches it.
 */
void ParserSetFlag(Parser p, ParserFlag flag, int value)
{
    int word = flag >> 5;
    unsigned int mask = 1u << (flag & 31);

    if(value)
        p->flags[word] |= mask;
    else
        p->flags[word] &= ~mask;

    if(flag == XMLPredefinedEntities)
        p->dtd->predefined_entities = value ? xml_predefined_entities : 0;
}
1060
/*
 * Print the error or warning held in `bit' to Stderr.
 *
 * With the SimpleErrorFormat flag, a single line is printed, prefixed by
 * the part of the root entity's description after its last '/' and a
 * position.  Otherwise the message is followed by one line for each
 * source on the parser's stack, giving its entity and position.
 */
void ParserPerror(Parser p, XBit bit)
{
    InputSource root = ParserRootSource(p);
    InputSource src;
    int linenum, charnum;

    if(ParserGetFlag(p, SimpleErrorFormat))
    {
        const char8 *desc = EntityDescription(root->entity);
        const char8 *base;

        /* Step back from the end to just after the last '/'. */
        for(base = desc + strlen8(desc);
            base > desc && base[-1] != '/';
            base--)
            ;

        if(p->state == PS_validate_dtd)
            Fprintf(Stderr, "%s:-1(end of prolog):-1: ", base);
        else if(p->state == PS_validate_final)
            Fprintf(Stderr, "%s:-1(end of body):-1: ", base);
        else
            Fprintf(Stderr, "%s:%d:%d: ", base,
                    root->line_number+1, root->next+1);

        if(bit->type == XBIT_warning)
            Fprintf(Stderr, "warning: ");
        Fprintf(Stderr, "%s\n", bit->error_message);

        return;
    }

    Fprintf(Stderr, "%s: %s\n",
            bit->type == XBIT_error ? "Error" : "Warning",
            bit->error_message);

    if(p->state == PS_validate_dtd || p->state == PS_validate_final)
    {
        /* No source position is printed in these two states. */
        Fprintf(Stderr, " (detected at end of %s of document %s)\n",
                p->state == PS_validate_final ? "body" : "prolog",
                EntityDescription(root->entity));

        return;
    }

    /* One line per source, innermost first. */
    for(src = p->source; src; src = src->parent)
    {
        int where;

        if(src->entity->name)
            Fprintf(Stderr, " in entity \"%S\"", src->entity->name);
        else
            Fprintf(Stderr, " in unnamed entity");

        where = SourceLineAndChar(src, &linenum, &charnum);
        if(where == 1)
            Fprintf(Stderr, " at line %d char %d of", linenum+1, charnum+1);
        else if(where == 0)
            Fprintf(Stderr, " defined at line %d char %d of",
                    linenum+1, charnum+1);
        else if(where == -1)
            Fprintf(Stderr, " defined in");

        Fprintf(Stderr, " %s\n", EntityDescription(src->entity));
    }
}
1128
1129
/*
 * Parse the next bit into p->xbit.
 *
 * Once the parser is in state PS_end or PS_error this just keeps
 * producing XBIT_eof.  Otherwise finished entities are popped and the
 * next character decides what is parsed: markup after '<', an entity
 * reference after '&' (expanded by pushing it and starting again, when
 * enabled), and pcdata for anything else.
 */
static int parse(Parser p)
{
    InputSource s;
    int c;

    if(p->state == PS_end || p->state == PS_error)
    {
        /* After an error or EOF, just keep returning EOF */
        p->xbit.type = XBIT_eof;
        return 0;
    }

    clear_xbit(&p->xbit);

    if(p->state <= PS_prolog2 || p->state == PS_epilog)
        skip_whitespace(p->source);

    /* Each pass handles one character; the loop repeats only after an
       entity reference has been expanded.  Leaving it with break means
       "treat what was read as pcdata". */
    for(;;)
    {
        pop_while_at_eoe(p);
        s = p->source;
        SourcePosition(s, &p->xbit.entity, &p->xbit.byte_offset);

        c = get(s);

        if(c == XEOE)
        {
            if(p->state != PS_epilog)
                return error(p, "Document ends too soon");
            p->state = PS_end;
            p->xbit.type = XBIT_eof;
            NF16StartCheck(p);
            return 0;
        }

        if(c == '<')
        {
            NF16StartCheck(p);  /* only effective after markup */
            return parse_markup(p);
        }

        if(c == BADCHAR)
            return error(p, "Input error: %s", s->error_msg);

        if(c != '&' || ParserGetFlag(p, IgnoreEntities))
            break;

        if(p->state <= PS_prolog2)
            return error(p, "Entity reference not allowed in prolog");

        if(looking_at(p, "#"))
        {
            /* a character reference - go back and parse as pcdata */
            unget(s);
            break;
        }

        if(p->state == PS_error)        /* looking_at may have set it */
            return -1;

        /* not expanding general entities, so treat as pcdata */
        if(!ParserGetFlag(p, ExpandGeneralEntities))
            break;

        /* an entity reference - push it and start again */
        require(parse_reference(p, 0, 1, 1));
        NF16StartCheck(p);
    }

    unget(s);
    return parse_pcdata(p);
}
1194
1195 /* Called after reading '<' */
1196
parse_markup(Parser p)1197 static int parse_markup(Parser p)
1198 {
1199 InputSource s = p->source;
1200 int c = get(s);
1201
1202 switch(c)
1203 {
1204 case '!':
1205 if(looking_at(p, "--"))
1206 {
1207 if(ParserGetFlag(p, ReturnComments))
1208 return parse_comment(p, 0, 0);
1209 else
1210 {
1211 require(parse_comment(p, 1, 0));
1212 /* XXX avoid recursion here */
1213 return parse(p);
1214 }
1215 }
1216 else if(looking_at(p, "DOCTYPE "))
1217 return parse_dtd(p);
1218 else if(looking_at(p, "[CDATA["))
1219 return parse_cdata(p);
1220 else if(p->state == PS_error) /* looking_at may have set it */
1221 return -1;
1222 else
1223 return error(p, "Syntax error after <!");
1224
1225 case '/':
1226 return parse_endtag(p);
1227
1228 case '?':
1229 return parse_pi(p, 0);
1230
1231 case BADCHAR:
1232 return error(p, "Input error: %s", s->error_msg);
1233
1234 default:
1235 unget(s);
1236 if(!ParserGetFlag(p, XMLLessThan) &&
1237 (c == XEOE || !is_xml_namestart(c, p->map)))
1238 {
1239 /* In nSGML, recognise < as stago only if followed by namestart */
1240
1241 unget(s); /* put back the < */
1242 return parse_pcdata(p);
1243 }
1244 return parse_starttag(p);
1245 }
1246 }
1247
/*
 * Parse an end tag; the "</" has already been read.
 *
 * Sets p->xbit.type to XBIT_end and fills in the element definition.
 * When the element stack is maintained the tag is matched against the
 * innermost open element, which is popped; closing the last open
 * element moves the parser to PS_epilog (after final validation if
 * validating).  Otherwise the name is simply looked up in the DTD.
 */
static int parse_endtag(Parser p)
{
    ElementDefinition e;
    NSElementDefinition nse;
    Entity ent;

    p->xbit.type = XBIT_end;
    require(parse_name(p, "after </"));
    maybe_uppercase_name(p);

    if(ParserGetFlag(p, MaintainElementStack) &&
       VectorCount(p->element_stack) <= 0)
        return error(p, "End tag </%.*S> outside of any element",
                     p->namelen, p->name);

    if(ParserGetFlag(p, Validate))
    {
        /* The open element's content model must be in an accepting
           state for it to be closed here */
        struct element_info *top = &VectorLast(p->element_stack);
        ElementDefinition open_elt = top->definition;

        if(open_elt->type == CT_element && top->context &&
           !top->context->end_node)
        {
            require(validity_error(p, "Content model for %S does not "
                                      "allow it to end here",
                                   open_elt->name));
        }
    }

    if(!ParserGetFlag(p, MaintainElementStack))
    {
        /* No stack to match against: just find the definition */
        e = FindElementN(p->dtd, p->name, p->namelen);
        p->xbit.element_definition = e;
        if(!e)
            return error(p, "End tag for unknown element %.*S",
                         p->namelen, p->name);
    }
    else
    {
        struct element_info *top = &VectorLast(p->element_stack);

        /* Copy out everything we need before the entry is popped */
        ent = top->entity;
        e = top->definition;
        nse = top->ns_definition;
        p->xbit.ns_dict = top->ns;
        p->xbit.nsc = top->nsc;
        p->xbit.nsowned = (p->xbit.ns_dict != &p->base_ns);
        (void)VectorPop(p->element_stack);

        if(p->namelen != e->namelen ||
           memcmp(p->name, e->name, p->namelen * sizeof(Char)) != 0)
            return error(p, "Mismatched end tag: expected </%S>, got </%.*S>",
                         e->name, p->namelen, p->name);

        p->xbit.element_definition = e;
        p->xbit.ns_element_definition = nse;

        if(ent != p->source->entity)
            return error(p, "Element ends in different entity from that "
                            "in which it starts");

        if(VectorCount(p->element_stack) == 0)
        {
            /* That was the outermost element */
            if(ParserGetFlag(p, Validate))
            {
                p->state = PS_validate_final;
                require(validate_final(p));
            }
            p->state = PS_epilog;
        }
    }

    skip_whitespace(p->source);
    NF16StartCheck(p);
    return expect(p, '>', "after name in end tag");
}
1324
/*
 * Check that a name containing a colon has the shape prefix:local.
 *
 * A name without a colon is accepted as it is.  Otherwise at most one
 * problem is reported through namespace_error(): empty prefix, empty
 * local part, local part not starting with a name start character, or
 * more than one colon.  `type' is used in the message ("Element",
 * "Attribute").  Returns 0 unless reporting the problem fails the
 * require() check.
 */
static int check_qualname_syntax(Parser p, const Char *name, const char *type)
{
    Char *colon = Strchr(name, ':');

    if(!colon)
        return 0;

    if(colon == name)
    {
        require(namespace_error(p, "%s name %S has empty prefix", type, name));
        return 0;
    }

    if(colon[1] == 0)
    {
        require(namespace_error(p, "%s name %S has empty local part",
                                type, name));
        return 0;
    }

    if(!is_xml_namestart(colon[1], p->map))
    {
        require(namespace_error(p, "%s name %S has illegal local part",
                                type, name));
        return 0;
    }

    if(Strchr(colon+1, ':'))
    {
        require(namespace_error(p, "%s name %S has multiple colons",
                                type, name));
    }

    return 0;
}
1356
/*
 * Parse a start tag or empty-element tag, beginning at the element name.
 *
 * In order, the function:
 *   - moves the parser into PS_body (validating the DTD first if this is
 *     the first element and Validate is set);
 *   - finds or creates the element definition and, when validating,
 *     checks the element against the root name or the parent's content
 *     model;
 *   - parses the attributes up to '>' or '/>' (the latter only with
 *     XMLSyntax), setting p->xbit.type to XBIT_start or XBIT_empty;
 *   - pushes an element_info entry for XBIT_start when the element stack
 *     is maintained, or handles an empty top-level element;
 *   - checks required attributes, builds defaulted attributes, and
 *     processes xml:space, xml:id, ID uniqueness and namespaces
 *     according to the parser flags.
 *
 * On success p->xbit.attributes holds the attribute list to return and
 * the result is 0.  Failures are reported through error() or propagated
 * by require().
 *
 * NOTE(review): defaulted attributes are held only in the local
 * all_attrs list until the end of the function, so an error return
 * after they have been built appears to leave them unreachable from
 * p->xbit - confirm whether that matters to callers.
 */
static int parse_starttag(Parser p)
{
    int c, is_top_level = 0;
    ElementDefinition e;
    AttributeDefinition d;
    Attribute a, aa, all_attrs;
    struct element_info *this_info = 0, *parent_info = 0;

    if(p->state == PS_epilog && !ParserGetFlag(p, AllowMultipleElements))
        return error(p, "Document contains multiple elements");

    if(p->state < PS_body)
    {
        if(ParserGetFlag(p, Validate))
        {
            p->state = PS_validate_dtd;
            require(validate_dtd(p));
        }
        is_top_level = 1;
    }

    p->state = PS_body;

    require(parse_name(p, "after <"));
    maybe_uppercase_name(p);

#if not_yet
    if(is_top_level && p->magic_prefix)
        require(magically_transform_dtd(p, p->name, p->namelen));
#endif

    /* Find the element definition, creating or completing it as CT_any
       if the element has not been declared */

    e = FindElementN(p->dtd, p->name, p->namelen);
    if(!e || e->tentative)
    {
        if(p->have_dtd && ParserGetFlag(p, ErrorOnUndefinedElements))
            return error(p, "Start tag for undeclared element %.*S",
                         p->namelen, p->name);
        if(ParserGetFlag(p, Validate) &&
           !(ParserGetFlag(p, RelaxedAny) &&
             VectorCount(p->element_stack) != 0 &&
             VectorLast(p->element_stack).definition->type == CT_any))
        {
            require(validity_error(p,
                                   "Start tag for undeclared element %.*S",
                                   p->namelen, p->name));
        }
        if(e)
            RedefineElement(e, CT_any, 0, 0, 0);
        else
        {
            if(!(e =
                 DefineElementN(p->dtd, p->name, p->namelen, CT_any, 0, 0, 0)))
                return error(p, "System error");
            if(ParserGetFlag(p, XMLNamespaces))
            {
                require(check_qualname_syntax(p, e->name, "Element"));
            }
        }
    }

    p->xbit.element_definition = e;

    /* Check the element against the root name or the parent's content
       model */

    if(ParserGetFlag(p, Validate))
    {
        if(VectorCount(p->element_stack) == 0)
        {
            if(Strcmp(p->dtd->name, e->name) != 0)
            {
                require(validity_error(p, "Root element is %S, should be %S",
                                       e->name, p->dtd->name));
            }
        }
        else
        {
            struct element_info *info = &VectorLast(p->element_stack);
            ElementDefinition parent = info->definition;

            if(parent->type == CT_empty)
            {
                require(validity_error(p, "Content model for %S does not "
                                          "allow anything here",
                                       parent->name));
            }
            else if(info->context)
            {
                info->context = validate_content(info->context, e);
                if(!info->context)
                {
                    require(validity_error(p, "Content model for %S does not "
                                              "allow element %S here",
                                           parent->name, e->name));
                }
            }
        }
    }

    /* Parse attributes until the tag ends */

    while(1)
    {
        InputSource s = p->source;

        /* We could just do skip_whitespace here, but we will get a
           better error message if we look a bit closer. */

        c = get(s);

        if(c == BADCHAR)
            return error(p, "Input error: %s", s->error_msg);

        if(c !=XEOE && is_xml_whitespace(c))
        {
            skip_whitespace(s);
            c = get(s);
        }
        else if(c != '>' &&
                !(ParserGetFlag(p, XMLSyntax) && c == '/'))
        {
            unget(s);		/* For error position */
            return error(p, "Expected whitespace or tag end in start tag");
        }

        if(c == '>')
        {
            p->xbit.type = XBIT_start;
            break;
        }

        if((ParserGetFlag(p, XMLSyntax)) && c == '/')
        {
            require(expect(p, '>', "after / in start tag"));
            p->xbit.type = XBIT_empty;
            break;
        }

        unget(s);

        require(parse_attribute(p));
    }

    if(ParserGetFlag(p, MaintainElementStack))
    {
        if(p->xbit.type == XBIT_start)
        {
            if(!VectorPushNothing(p->element_stack))
                return error(p, "System error");
            if(VectorCount(p->element_stack) > 1)
                parent_info = &VectorLast(p->element_stack) - 1;
            this_info = &VectorLast(p->element_stack);
            this_info->definition = e;
            this_info->context = e->fsm ? e->fsm->start_node : 0;
            this_info->wsm = WSM_unspecified;
            this_info->ns = 0;
            this_info->entity = p->source->entity;
            /* Set these here even if not doing namespace processing, to
               avoid rui errors from dbx. */
            this_info->ns_definition = 0;
            this_info->nsc = 0;
        }
        else
        {
            /* Is this element allowed to be empty? */

            if(ParserGetFlag(p, Validate) && e->fsm &&
               !e->fsm->start_node->end_node)
            {
                require(validity_error(p, "Content model for %S does not "
                                          "allow it to be empty",
                                       e->name));
            }

            /* Is it the (empty) top-level element? */

            if(VectorCount(p->element_stack) == 0)
            {
                if(ParserGetFlag(p, Validate))
                {
                    p->state = PS_validate_final;
                    require(validate_final(p));
                }
                p->state = PS_epilog;
            }
            else
                parent_info = &VectorLast(p->element_stack);
        }
    }

    if(ParserGetFlag(p, Validate))
    {
        /* check for required attributes */

        for(d=NextAttributeDefinition(e, 0);
            d;
            d=NextAttributeDefinition(e, d))
        {
            if(d->default_type != DT_required)
                continue;
            for(a=p->xbit.attributes; a; a=a->next)
                if(a->definition == d)
                    break;
            if(!a)
            {
                require(validity_error(p,
                                       "Required attribute %S for element %S "
                                       "is not present",
                                       d->name, e->name));
            }
        }
    }

    /* Find defaulted attributes if we need them */

    /* p->xbit.attributes only points to actually present attributes
       until the end of this function. */
    all_attrs = p->xbit.attributes;

    if(ParserGetFlag(p, ReturnDefaultedAttributes) ||
       ParserGetFlag(p, XMLNamespaces))
    {

        for(d=NextAttributeDefinition(e, 0);
            d;
            d=NextAttributeDefinition(e, d))
        {
            if(!d->default_value)
                continue;
            for(a=p->xbit.attributes; a; a=a->next)
                if(a->definition == d)
                    break;
            if(!a)
            {
                if(!(a = Malloc(sizeof(*a))))
                    return error(p, "System error");
                a->definition = d;
                if(!(a->value = Strdup(d->default_value)))
                {
                    /* The node is not on any list yet, so release it
                       here rather than leak it */
                    Free(a);
                    return error(p, "System error");
                }
                a->specified = 0;
                a->quoted = 1;
                /* Only set for real during namespace processing; give
                   it a defined value in case that is switched off */
                a->ns_definition = 0;
                a->next = all_attrs;
                all_attrs = a;
            }
        }
    }

    /* Do some checks on defaulted attributes if validating */

    if(ParserGetFlag(p, Validate))
    {
        for(d=NextAttributeDefinition(e, 0);
            d;
            d=NextAttributeDefinition(e, d))
        {
            int ed, sem;

            if(!d->default_value)
                continue;

            /* Check no externally-declared defaults in standalone document,
               and do "non-lexical" validation of some attribute types */

            ed = (p->standalone == SDD_yes && d->is_externally_declared);
            sem =
                (d->type == AT_entity || d->type == AT_entities ||
                 d->type == AT_id ||
                 d->type == AT_idref || d->type == AT_idrefs);

            if(ed || sem)
            {
                /* was it actually defaulted? */

                for(a=p->xbit.attributes; a; a=a->next)
                    if(a->definition == d)
                        break;
                if(a)
                    /* no */
                    continue;
            }

            if(sem)
            {
                require(check_attribute_syntax(p, d, e, d->default_value,
                                               "defaulted value for attribute",
                                               1));
            }

            if(ed)
            {
                require(validity_error(p, "Externally declared attribute %S "
                        "for element %S defaulted in document declared standalone",
                                       d->name, e->name));
            }
        }
    }

    /* Look for xml:space attribute */

    if(ParserGetFlag(p, XMLSpace))
    {
        d = e->xml_space_attribute;

        if(d)
        {
            /* An explicitly present attribute wins ... */
            for(a=p->xbit.attributes; a; a=a->next)
                if(a->definition == d)
                {
                    p->xbit.wsm = process_xml_space(p, a->value);
                    goto done;
                }

            /* ... then a declared default ... */
            if(d->default_type == DT_none || d->default_type == DT_fixed)
            {
                p->xbit.wsm = process_xml_space(p, d->default_value);
                goto done;
            }
        }

        /* ... otherwise inherit from the parent element, if any */
        p->xbit.wsm = parent_info ? parent_info->wsm : WSM_unspecified;

    done:
        if(this_info)
            this_info->wsm = p->xbit.wsm;
    }
    else
        p->xbit.wsm = WSM_unspecified;

    /* Look for xml:id attribute */

    if(ParserGetFlag(p, XMLID) && e->xml_id_attribute)
    {
        Char *s;

        d = e->xml_id_attribute;

        for(a=p->xbit.attributes; a; a=a->next)
            if(a->definition == d)
                break;
        if(!a)
            goto id_done;

        /* check that it's an NCName */

        if(!is_xml_namestart(a->value[0], p->map))
        {
            warn(p, "xml:id error: value \"%S\" does not start with a name start character",
                 a->value);
            goto id_done;
        }

        for(s=a->value; *s; s++)
        {
            if(*s == ':')
            {
                warn(p, "xml:id error: value \"%S\" contains a colon", a->value);
                goto id_done;
            }
            else if(!is_xml_namechar(*s, p->map))
            {
                warn(p, "xml:id error: value \"%S\" contains a character which is not a name character",
                     a->value);
                goto id_done;
            }
        }

    id_done:
        ;
    }

    if(ParserGetFlag(p, XMLIDCheckUnique))
    {
        int found;
        HashEntry id_entry;

        for(a=p->xbit.attributes; a; a=a->next)
        {
            d = a->definition;
            if(d->type != AT_id)
                continue;
            if(ParserGetFlag(p, Validate) && d->declared)
                /* declared attributes will have been checked during validation */
                continue;

            id_entry = hash_find_or_add(p->id_table,
                                        a->value,
                                        Strlen(a->value)*sizeof(Char),
                                        &found);
            if(!id_entry)
                return error(p, "System error");

            if(!found)
                hash_set_value(id_entry, (void *)2);
            else
                warn(p, "xml:id error: duplicate ID attribute value %S",
                     a->value);
        }
    }

    if(ParserGetFlag(p, XMLNamespaces))
    {
        Attribute *attp;
        Namespace ns;
        NSElementDefinition nselt;
        NSAttributeDefinition nsattr;

        p->xbit.ns_dict = parent_info ? parent_info->ns : &p->base_ns;
        p->xbit.nsc = 0;

        /* Look for xmlns attributes */

        for(attp=&all_attrs; *attp; )
        {
            a = *attp;
            if(a->definition->ns_attr_prefix)
            {
                require(process_namespace(p, a->definition, a->value));
                p->xbit.nsc++;

                /* remove the attribute now we've processed it */

                if(!ParserGetFlag(p, ReturnNamespaceAttributes))
                {
                    /* Keep the "present attributes" head valid if this
                       node was it; the two lists share their tail */
                    if(p->xbit.attributes == a)
                        p->xbit.attributes = a->next;
                    *attp = a->next;
                    Free(a->value);
                    Free(a);
                }
                else
                    attp = &a->next;
            }
            else
                attp = &a->next;
        }

        p->xbit.nsowned = (p->xbit.type == XBIT_empty &&
                           p->xbit.ns_dict != &p->base_ns);

        /* Find namespace for element */

        if(e->prefix)
        {
            ns = LookupNamespace(p->xbit.ns_dict, e->prefix);
            if(!ns)
            {
                require(namespace_error(p,
                                        "Element name %S has unbound prefix",
                                        e->name));
            }
        }
        else
            ns = LookupNamespace(p->xbit.ns_dict, 0);

        nselt = 0;
        if(ns)
            if(!(nselt = NamespacifyElementDefinition(e, ns)))
                return error(p, "System error");

        p->xbit.ns_element_definition = nselt;

        if(this_info)
        {
            this_info->ns = p->xbit.ns_dict;
            this_info->nsc = p->xbit.nsc;
            this_info->ns_definition = nselt;
        }

        /* Find namespaces for attributes */

        for(a=all_attrs; a; a=a->next)
        {
            d = a->definition;
            nsattr = 0;

            if(!d->ns_attr_prefix) /* Ignore namespace attributes themselves */
            {
                if(d->prefix)
                {
                    ns = LookupNamespace(p->xbit.ns_dict, d->prefix);
                    if(!ns)
                    {
                        require(namespace_error(p,
                                        "Attribute name %S has unbound prefix",
                                                d->name));
                    }
                    else
                        if(!(nsattr =
                             NamespacifyGlobalAttributeDefinition(d, ns)))
                            return error(p, "System error");
                }
                else if(nselt)
                {
                    if(!(nsattr =
                         NamespacifyElementAttributeDefinition(d, nselt)))
                        return error(p, "System error");
                }
            }

            a->ns_definition = nsattr;
        }

        /* Check for repeated qualified attributes */

        for(a=all_attrs; a; a=a->next)
        {
            d = a->definition;
            if(a->ns_definition && !a->ns_definition->element)
                for(aa=all_attrs; aa != a; aa=aa->next)
                {
                    if(aa->ns_definition == a->ns_definition)
                    {
                        require(namespace_error(p,
                                    "Repeated attribute %S in namespace %S",
                                    d->local, a->ns_definition->namespace->nsname));
                    }
                }
        }

        /* Free defaulted attrs if we only got them for namespace stuff */

        if(!ParserGetFlag(p, ReturnDefaultedAttributes))
        {
            for(a=all_attrs; a != p->xbit.attributes; a = aa)
            {
                aa = a->next;
                Free(a->value);
                Free(a);
            }
            all_attrs = p->xbit.attributes;
        }
    }

    p->xbit.attributes = all_attrs;

    NF16StartCheck(p);
    return 0;
}
1893
/*
 * Process one namespace declaration attribute (xmlns or xmlns:prefix).
 *
 * d is the attribute's definition; d->ns_attr_prefix is the declared
 * prefix, an empty string standing for the default namespace.  value is
 * the namespace name, an empty string meaning "no namespace".
 *
 * Illegal declarations are reported through namespace_error(): an empty
 * URI for a prefix before XML 1.1, any binding of the xmlns prefix or
 * the xmlns namespace, the xml prefix bound to the wrong URI, and the
 * xml namespace bound to any other prefix.  If none of these stops the
 * parse, a new binding is allocated and pushed onto the front of
 * p->xbit.ns_dict.
 *
 * Returns 0 on success; allocation failures are reported via error().
 */
static int process_namespace(Parser p, AttributeDefinition d,const Char *value)
{
    NamespaceBinding nb;
    const Char *prefix;
    const Char *nsname;
    Namespace ns;

    static Char xmlns[] = {'x','m','l','n','s',0};
    static Char xml[] = {'x','m','l',0};

    int xml_prefix = 0, xmlns_prefix = 0;
    int xml_uri = 0, xmlns_uri = 0;

    /* Represent "default namespace" and "no namespace" as null pointers */
    prefix = *d->ns_attr_prefix ? d->ns_attr_prefix : 0;
    nsname = *value == 0 ? 0 : value;

    if(prefix && !nsname && p->xml_version < XV_1_1)
    {
        require(namespace_error(p,
                                "Namespace declaration for %S has empty URI",
                                prefix));
    }
    if(prefix)
    {
        if(Strcmp(prefix, xml) == 0)
            xml_prefix = 1;
        else if(Strcmp(prefix, xmlns) == 0)
            xmlns_prefix = 1;
    }

    if(nsname)
    {
        if(Strcmp(nsname, xml_ns) == 0)
            xml_uri = 1;
        else if(Strcmp(nsname, xmlns_ns) == 0)
            xmlns_uri = 1;
    }

    if(xml_prefix && !xml_uri)
    {
        /* nsname is null when the URI is empty, so print the original
           (never null) value instead of handing a null pointer to %S */
        require(namespace_error(p,
                                "Declaration of xml prefix has wrong URI \"%S\"",
                                value));
    }

    if(xmlns_prefix)
    {
        require(namespace_error(p,
                                "Declaration of xmlns prefix is not allowed"));
    }

    if(xml_uri && !xml_prefix)
    {
        /* prefix is null for a default-namespace declaration, so print
           the original (possibly empty, never null) prefix string */
        require(namespace_error(p, "Declaration of xml namespace with "
                                   " prefix \"%S\" (must be \"xml\")",
                                d->ns_attr_prefix));
    }

    if(xmlns_uri)
    {
        require(namespace_error(p,
                                "Declaration of xmlns namespace is not allowed"));
    }

    if(nsname)
    {
        if(!(ns = FindNamespace(p->dtd->namespace_universe, nsname, 1)))
            return error(p, "System error");
    }
    else
        ns = 0;

    if(!(nb = Malloc(sizeof(*nb))))
        return error(p, "System error");

    /* Push the new binding in front of the current dictionary */
    nb->prefix = prefix;
    nb->namespace = ns;
    nb->parent = p->xbit.ns_dict;
    p->xbit.ns_dict = nb;

    return 0;
}
1975
LookupNamespace(NamespaceBinding dictionary,const Char * prefix)1976 Namespace LookupNamespace(NamespaceBinding dictionary, const Char *prefix)
1977 {
1978 NamespaceBinding n;
1979
1980 for(n=dictionary; n; n=n->parent)
1981 {
1982 if(prefix == 0)
1983 {
1984 if(n->prefix == 0)
1985 return n->namespace;
1986 }
1987 else if(n->prefix && Strcmp(prefix, n->prefix) == 0)
1988 return n->namespace;
1989 }
1990
1991 return 0;
1992 }
1993
/*
 * Parse one attribute (name, '=', value) of the start tag whose element
 * definition is in p->xbit.element_definition.
 *
 * An undeclared attribute is given an implied CDATA definition (after
 * the checks selected by ErrorOnUndefinedAttributes, Validate and
 * AllowUndeclaredNSAttributes).  A repeated attribute is an error.  The
 * new attribute is linked at the head of p->xbit.attributes before its
 * value is read, with value set to 0 in case of error.  Quoted values
 * are read with parse_string(); unquoted ones are read as a name token
 * unless ErrorOnUnquotedAttributeValues is set.  When validating, the
 * value is checked with validate_attribute().
 *
 * Returns 0 on success; failures are reported through error() or
 * propagated by require().
 */
static int parse_attribute(Parser p)
{
    InputSource s = p->source;
    ElementDefinition elt = p->xbit.element_definition;
    AttributeDefinition def;
    struct attribute *a;
    int c;
    int normalised = 0;
    static Char xmlns[] = {'x','m','l','n','s',0};

    require(parse_name(p, "for attribute"));
    maybe_uppercase_name(p);

    def = FindAttributeN(elt, p->name, p->namelen);
    if(!def)
    {
        if(p->have_dtd && ParserGetFlag(p, ErrorOnUndefinedAttributes))
            return error(p, "Undeclared attribute %.*S for element %S",
                         p->namelen, p->name, elt->name);
        /* When validating, complain unless this is an xmlns / xmlns:*
           attribute and those are allowed to go undeclared */
        if(ParserGetFlag(p, Validate) &&
           (elt->declared || elt->has_attlist) &&
           !(ParserGetFlag(p, AllowUndeclaredNSAttributes) &&
             p->namelen >= 5 && Strncmp(p->name, xmlns, 5) == 0 &&
             (p->namelen == 5 || p->name[5] == ':')))
        {
            require(validity_error(p,
                                   "Undeclared attribute %.*S for element %S",
                                   p->namelen, p->name, elt->name));
        }
        if(!(def = DefineAttributeN(elt, p->name, p->namelen,
                                    AT_cdata, 0, DT_implied, 0, 0)))
            return error(p, "System error");
        if(ParserGetFlag(p, XMLID) && elt->xml_id_attribute == def)
            def->type = AT_id;
        if(ParserGetFlag(p, XMLNamespaces))
        {
            require(check_qualname_syntax(p, def->name, "Attribute"));
        }
    }

    for(a = p->xbit.attributes; a; a = a->next)
        if(a->definition == def)
            return error(p, "Repeated attribute %.*S", p->namelen, p->name);

    if(!(a = Malloc(sizeof(*a))))
        return error(p, "System error");

    a->value = 0;		/* in case of error */
    a->next = p->xbit.attributes;
    p->xbit.attributes = a;
    a->definition = def;
    a->specified = 1;
    /* Nothing in this function sets the namespace definition, so give
       the freshly allocated field a defined value */
    a->ns_definition = 0;

    skip_whitespace(s);
    require(expect(p, '=', "after attribute name"));

    skip_whitespace(s);
    /* Peek at the next character to see whether the value is quoted */
    c = get(s);
    unget(s);
    switch(c)
    {
    case BADCHAR:
    case '"':
    case '\'':
        a->quoted = 1;
        require(parse_string(p, "in attribute value",
                             a->definition->type == AT_cdata ? LT_cdata_attr :
                                                               LT_tok_attr,
                             &normalised));
        a->value = p->pbuf;
        Consume(p->pbuf);
        break;
    default:
        if(ParserGetFlag(p, ErrorOnUnquotedAttributeValues))
            return error(p, "Value of attribute is unquoted");
        a->quoted = 0;
        require(parse_nmtoken(p, "in unquoted attribute value"));
        CopyName(a->value);
        break;
    }

    if(ParserGetFlag(p, Validate))
    {
        if(p->standalone == SDD_yes && normalised &&
           a->definition->is_externally_declared)
        {
            require(validity_error(p, "Externally declared attribute %S for "
                    "element %S was normalised in document declared standalone",
                                   a->definition->name, elt->name));
        }

        require(validate_attribute(p, a->definition, elt, a->value));
    }

    return 0;
}
2090
process_xml_space(Parser p,const Char * value)2091 static WhiteSpaceMode process_xml_space(Parser p, const Char *value)
2092 {
2093 static Char _preserve[9] = {'p','r','e','s','e','r','v','e',0};
2094 static Char _default[8] = {'d','e','f','a','u','l','t',0};
2095 Char buf[9];
2096 const Char *v;
2097 int i;
2098
2099 /* It's possible that it hasn't been normalised (sigh) */
2100
2101 for(v=value; is_xml_whitespace(*v); v++)
2102 ;
2103 for(i=0; i<8; i++)
2104 {
2105 if(!v[i] || is_xml_whitespace(v[i]))
2106 break;
2107 buf[i] = v[i];
2108 }
2109 buf[i] = '\0';
2110 for(; v[i]; i++)
2111 if(!is_xml_whitespace(v[i]))
2112 /* If you want validation, you know where to find it */
2113 return WSM_unspecified;
2114
2115 if(Strcmp(v, _preserve) == 0)
2116 return WSM_preserve;
2117 if(Strcmp(v, _default) == 0)
2118 return WSM_default;
2119 return WSM_unspecified;
2120 }
2121
/*
 * Append `count' characters from the current source line to p->pbuf.
 * The characters copied start `back' characters before the source's
 * current read position.  The buffer is grown first via ExpandBuf.
 * Returns 0 after copying.
 */
static int transcribe(Parser p, int back, int count)
{
    const Char *from;

    ExpandBuf(p->pbuf, p->pbufnext + count);

    from = p->source->line + p->source->next - back;
    memcpy(p->pbuf + p->pbufnext, from, count * sizeof(Char));
    p->pbufnext += count;

    return 0;
}
2131
/*
 * Parse a run of character data into p->pbuf and return it as an
 * XBIT_pcdata bit.
 *
 * Called after pushing back the first character of the pcdata.
 *
 * The source line is scanned through local copies of the line pointer,
 * read position and line length.  Characters are only counted while
 * scanning; they are copied to p->pbuf (by transcribe) when something
 * interrupts the run: the end of the line, a '<', or a '&'.
 *
 * Rejected with an error in the prolog and in the epilog.  When
 * validating, the collected data is checked against the content type
 * of the innermost open element.
 *
 * Returns 0 on success; failures are reported through error() or
 * propagated by require().
 */

static int parse_pcdata(Parser p)
{
    int count = 0;		/* characters scanned but not yet transcribed */
    int had_charref = 0;	/* set when a character reference is seen */
    InputSource s;
    Char *buf;			/* local copy of s->line */
    int next, buflen;		/* local copies of s->next, s->line_length */

    if(p->state <= PS_prolog2)
        return error(p, "Character data not allowed in prolog");
    if(p->state == PS_epilog)
        return error(p, "Character data not allowed after body");

    s = p->source;
    buf = s->line;
    next = s->next;
    buflen = s->line_length;

    p->pbufnext = 0;

    while(1)
    {
        if(next == buflen)
        {
            /* Reached the end of the current line: write the position
               back to the source and save what we have counted */
            s->next = next;
            if(count > 0)
            {
                ifNF16wrong(p,count,count)
                    return error(p, "pcdata not normalized");
                require(transcribe(p, count, count));
            }
            count = 0;
            if(at_eoe(s))
            {
                /* At the end of the entity: stop here unless pcdata is
                   being merged across entity boundaries */
                NF16StartCheck(p);
                if(!ParserGetFlag(p, MergePCData))
                    goto done;
                else
                    pop_while_at_eoe(p);
            }
            /* Reload the local copies; the source may have changed */
            s = p->source;
            buf = s->line;
            next = s->next;
            buflen = s->line_length;
            if(next == buflen)
                goto done;	/* must be EOF */
        }

        switch(buf[next++])
        {
        case BADCHAR:
            return error(p, "Input error: %s", s->error_msg);
        case '<':
            if(!ParserGetFlag(p, XMLLessThan))
            {
                /* In nSGML, don't recognise < as markup unless it looks ok */
                if(next == buflen)
                    goto deflt;
                if(buf[next] != '!' && buf[next] != '/' && buf[next] != '?' &&
                   !is_xml_namestart(buf[next], p->map))
                    goto deflt;
            }
            /* The position is now just past the '<', so the counted
               characters start count+1 back */
            s->next = next;
            if(count > 0)
            {
                ifNF16wrong(p,count+1,count)
                    return error(p, "pcdata not normalized");
                require(transcribe(p, count+1, count));
            }
            count = 0;
            if(!ParserGetFlag(p, ReturnComments) &&
               buflen >= next + 3 &&
               buf[next] == '!' && buf[next+1] == '-' && buf[next+2] == '-')
            {
                /* A comment that is not being returned: skip it and
                   carry on collecting pcdata after it */
                s->next = next + 3;
                require(parse_comment(p, 1, 0));
                NF16StartCheck(p);
                /* Reload the local copies after the comment parser has
                   advanced the source */
                buflen = s->line_length;
                next = s->next;
                buf = s->line; /* thanks to robin@reportlab.com for this */
            }
            else
            {
                /* Other markup ends the pcdata; leave the '<' unread */
                s->next = next-1;
                goto done;
            }
            break;
        case '&':
            if(ParserGetFlag(p, IgnoreEntities))
                goto deflt;
            if(!ParserGetFlag(p, MergePCData) &&
               (p->pbufnext > 0 || count > 0))
            {
                /* We're returning references as separate bits, and we've
                   come to one, and we've already got some data to return,
                   so return what we've got and get the reference next time. */

                s->next = next-1;
                if(count > 0)
                {
                    ifNF16wrong(p,count,count)
                        return error(p, "pcdata not normalized");
                    require(transcribe(p, count, count));
                }
                goto done;
            }
            if(buflen >= next+1 && buf[next] == '#')
            {
                /* It's a character reference */

                had_charref = 1;
                /* Position just past "&#", so the counted characters
                   start count+2 back */
                s->next = next+1;
                if(count > 0)
                {
                    ifNF16wrong(p,count,count+2)
                        return error(p,"pcdata not normalized");
                    require(transcribe(p, count+2, count));
                }
                count = 0;
                require(parse_character_reference(p,
                                   ParserGetFlag(p, ExpandCharacterEntities)));
                NF16StartCheck(p);
                next = s->next;

                if(!ParserGetFlag(p, MergePCData))
                    goto done;
            }
            else
            {
                /* It's a general entity reference */

                /* Position just past '&', so the counted characters
                   start count+1 back */
                s->next = next;
                if(count > 0)
                {
                    ifNF16wrong(p,count,count+1)
                        return error(p, "pcdata not normalized");
                    require(transcribe(p, count+1, count));
                }
                count = 0;
                require(parse_reference(p, 0,
                                        ParserGetFlag(p, ExpandGeneralEntities),
                                        1));
                NF16StartCheck(p);
                /* Reload the local copies from the (possibly new)
                   current source */
                s = p->source;
                buf = s->line;
                buflen = s->line_length;
                next = s->next;

                if(!ParserGetFlag(p, MergePCData))
                    goto done;
            }
            break;
        case ']':
            if(ParserGetFlag(p, XMLMiscWFErrors) &&
               buflen >= next + 2 &&
               buf[next] == ']' && buf[next+1] == '>')
                return error(p, "Illegal character sequence ']]>' in pcdata");
            /* fall through */
        default:
        deflt:
            /* An ordinary data character: just count it for now */
            count++;
            break;
        }
    }

 done:
    /* Terminate the collected text and hand the buffer over to the xbit */
    ExpandBuf(p->pbuf, 0);	/* In case we got nothing */
    p->pbuf[p->pbufnext++] = 0;
    p->xbit.type = XBIT_pcdata;
    p->xbit.pcdata_chars = p->pbuf;
    Consume(p->pbuf);
    p->xbit.pcdata_ignorable_whitespace = 0;

    if(ParserGetFlag(p, Validate))
    {
        ElementDefinition e = VectorLast(p->element_stack).definition;
        if(e->type == CT_empty)
        {
            require(validity_error(p, "PCDATA not allowed in EMPTY element %S",
                                   e->name));
        }
        else if(e->type == CT_element)
        {
            Char *t;

            /* Element content may contain only whitespace, and that
               whitespace must not come from a character reference */
            for(t = p->xbit.pcdata_chars; *t; t++)
                if(!is_xml_whitespace(*t))
                    break;
            if(*t)
            {
                require(validity_error(p,
                                 "Content model for %S does not allow PCDATA",
                                       e->name));
            }
            else if(had_charref)
            {
                /* E15 to 2nd edition */
                require(validity_error(p,
                     "Content model for %S does not allow character reference",
                                       e->name));
            }
            else
            {
                p->xbit.pcdata_ignorable_whitespace = 1;
                if(p->standalone == SDD_yes && e->is_externally_declared)
                {
                    require(validity_error(p, "Ignorable whitespace in "
                      "externally declared element %S in document declared standalone",
                                           e->name));
                }
            }
        }
    }

    return 0;
}
2350
2351 /* Called after reading '<!--'. Won't go over an entity end. */
2352
parse_comment(Parser p,int skip,Entity ent)2353 static int parse_comment(Parser p, int skip, Entity ent)
2354 {
2355 InputSource s = p->source;
2356 int c, c1=0, c2=0;
2357 int count = 0;
2358 NF16noStartCheck(p);
2359
2360 if(ParserGetFlag(p, Validate) && VectorCount(p->element_stack) > 0)
2361 {
2362 ElementDefinition parent = VectorLast(p->element_stack).definition;
2363
2364 if(parent->type == CT_empty)
2365 {
2366 require(validity_error(p, "Comment not allowed in EMPTY element %S",
2367 parent->name));
2368 }
2369 }
2370
2371 if(!skip)
2372 p->pbufnext = 0;
2373
2374 while((c = get(s)) != XEOE)
2375 {
2376 if(c == BADCHAR)
2377 return error(p, "Input error: %s", s->error_msg);
2378
2379 count++;
2380 if(c1 == '-' && c2 == '-')
2381 {
2382 if(c == '>')
2383 break;
2384 unget(s); /* For error position */
2385 return error(p, "-- in comment");
2386 }
2387
2388 if(at_eol(s))
2389 {
2390 ifNF16wrong(p,count,count)
2391 return error(p, "comment not normalized");
2392 if(!skip)
2393 {
2394 require(transcribe(p, count, count));
2395 }
2396 count = 0;
2397 }
2398 c2 = c1; c1 = c;
2399 }
2400
2401 /* XXX comment going over PE end should be only a validity error,
2402 but we treat it as a WF error */
2403
2404 if(c == XEOE)
2405 return error(p, "EOE in comment");
2406
2407 ifNF16wrong(p,count,count-3)
2408 return error(p, "comment not normalized");
2409 NF16StartCheck(p);
2410 if(skip)
2411 return 0;
2412
2413 require(transcribe(p, count, count-3));
2414 p->pbuf[p->pbufnext++] = 0;
2415 p->xbit.type = XBIT_comment;
2416 p->xbit.comment_chars = p->pbuf;
2417 Consume(p->pbuf);
2418
2419 return 0;
2420 }
2421
/*
 * Parse a processing instruction; the "<?" has already been read.
 *
 * The target name is copied to p->xbit.pi_name and the remaining text,
 * without the terminator, to p->xbit.pi_chars; p->xbit.type becomes
 * XBIT_pi.  The terminator is "?>" with XMLSyntax and ">" otherwise.
 * A target of "xml" (in any case) is an error with XMLStrictWFErrors
 * and otherwise a warning unless IgnorePlacementErrors is set.  The
 * `ent' argument is not used here.
 */
static int parse_pi(Parser p, Entity ent)
{
    InputSource s = p->source;
    int c, prev = 0;
    int pending = 0;		/* chars read on this line, not yet saved */
    int tail;			/* length of the PI terminator */
    Char xml[] = {'x', 'm', 'l', 0};

    if(ParserGetFlag(p, Validate) && VectorCount(p->element_stack) > 0)
    {
        ElementDefinition open_elt = VectorLast(p->element_stack).definition;

        if(open_elt->type == CT_empty)
        {
            require(validity_error(p, "PI not allowed in EMPTY element %S",
                                   open_elt->name));
        }
    }

    require(parse_name(p, "after <?"));
    CopyName(p->xbit.pi_name);

    p->pbufnext = 0;
    NF16noStartCheck(p);

    if(Strcasecmp(p->xbit.pi_name, xml) == 0)
    {
        if(ParserGetFlag(p, XMLStrictWFErrors))
            return error(p, "Misplaced xml declaration");
        else if(!ParserGetFlag(p, IgnorePlacementErrors))
            warn(p, "Misplaced xml declaration; treating as PI");
    }

    if(ParserGetFlag(p, XMLNamespaces) && Strchr(p->xbit.pi_name, ':'))
    {
        require(namespace_error(p, "PI name %S contains colon",
                                p->xbit.pi_name));
    }

    if(looking_at(p, ParserGetFlag(p, XMLSyntax) ? "?>" : ">"))
    {
        /* Empty PI: just make sure there is a buffer to terminate */
        ExpandBuf(p->pbuf, 0);
    }
    else
    {
        if(p->state == PS_error)	/* looking_at may have set it */
            return -1;

        /* If non-empty, must be white space after name */

        c = get(s);
        if(c == BADCHAR)
            return error(p, "Input error: %s", s->error_msg);
        if(c == XEOE || !is_xml_whitespace(c))
            return error(p, "Expected whitespace after PI name");
        skip_whitespace(s);

        for(;;)
        {
            c = get(s);

            /* XXX pi going over PE end should (perhaps?) only be a
               validity error, but we treat it as a WF error */
            if(c == XEOE)
                return error(p, "EOE in PI");

            if(c == BADCHAR)
                return error(p, "Input error: %s", s->error_msg);

            pending++;

            if(c == '>' &&
               (!ParserGetFlag(p, XMLSyntax) || prev == '?'))
                break;

            if(at_eol(s))
            {
                /* Save this line's characters before moving on */
                ifNF16wrong(p,pending,pending)
                    return error(p, "PI not normalized");
                require(transcribe(p, pending, pending));
                pending = 0;
            }

            prev = c;
        }

        /* Leave the terminator out of the saved text */
        tail = ParserGetFlag(p, XMLSyntax) ? 2 : 1;

        ifNF16wrong(p,pending,pending-tail)
            return error(p, "PI not normalized");
        require(transcribe(p, pending, pending-tail));
    }

    p->pbuf[p->pbufnext++] = 0;
    p->xbit.type = XBIT_pi;
    p->xbit.pi_chars = p->pbuf;
    Consume(p->pbuf);

    NF16StartCheck(p);
    return 0;
}
2515
/*
 * Read a quoted literal (delimited by ' or ") into p->pbuf, leaving it
 * null-terminated there.
 *
 *   where      - description of the context, used only in error messages
 *   type       - kind of literal; selects whether white space is replaced
 *                by spaces, whether '%' and '&' introduce references, and
 *                whether '<' is rejected
 *   normalised - if non-null, receives 1 when the final space
 *                trimming/compression step altered the value, else 0
 *
 * Characters are counted in "count" and copied in runs with transcribe();
 * a run is flushed before anything that is handled specially and whenever
 * the end of the current input line is reached.
 *
 * Returns 0 on success; on failure returns the result of error() or -1.
 */
static int parse_string(Parser p, const char8 *where, enum literal_type type, int *normalised)
{
    int c, quote;
    int count = 0;              /* characters read but not yet transcribed */
    InputSource start_source, s;
    int changed = 0;            /* set if space normalisation changed the value */

    /* entities cannot start with combiner, other things can */
    if (type==LT_param_entity||type==LT_entity) {
        NF16StartCheck(p);
    }
    else {
        NF16noStartCheck(p);
    }

    s = start_source = p->source;

    quote = get(s);
    if(quote == BADCHAR)
        return error(p, "Input error: %s", s->error_msg);
    if(quote != '\'' && quote != '"')
    {
        unget(s);               /* For error position */
        return error(p, "Expected quoted string %s, but got %s",
                     where, escape(quote, p->escbuf[0]));
    }

    p->pbufnext = 0;

    while(1)
    {
        switch(c = get(s))
        {
        case BADCHAR:
            return error(p, "Input error: %s", s->error_msg);

        case '\r':
        case '\n':
        case '\t':
            /* Keep the character as it is unless this is a public id
               (other than tab) or an attribute value being normalised */
            if(!((type == LT_pubid && c != '\t') || /* no tab in pubid */
                 ((type == LT_cdata_attr || type == LT_tok_attr) &&
                  ParserGetFlag(p, NormaliseAttributeValues))))
            {
                count++;
                break;
            }
            /* Otherwise flush the pending run and store a space instead */
            if(count > 0)
            {
                ifNF16wrong(p,count+1,count)
                    return error(p, "not normalized: %s", where);
                require(transcribe(p, count+1, count));
            }
            count = 0;
            ExpandBuf(p->pbuf, p->pbufnext+1);
            p->pbuf[p->pbufnext++] = ' ';
            NF16noStartCheck(p); /* space resets normalization checking */
            break;

        case '<':
            if((type == LT_tok_attr || type == LT_cdata_attr) &&
               ParserGetFlag(p, XMLMiscWFErrors))
                return error(p, "Illegal character '<' %s", where);
            count++;
            break;

        case XEOE:
            /* End of an entity pushed while reading this literal is fine;
               end of the entity the literal started in is an error */
            if(s == start_source)
                return error(p, "Quoted string goes past entity end");
            if(count > 0)
            {
                ifNF16wrong(p,count,count)
                    return error(p, "not normalized: %s", where);
                require(transcribe(p, count, count));
            }
            count = 0;
            ParserPop(p);
            s = p->source;
            break;

        case '%':
            /* Only entity values treat '%' as a reference introducer */
            if(!(type == LT_entity || type == LT_param_entity))
            {
                count++;
                break;
            }
            if(count > 0)
            {
                ifNF16wrong(p,count+1,count)
                    return error(p, "not normalized: %s", where);
                require(transcribe(p, count+1, count));
            }
            count = 0;
            if(p->external_pe_depth == 0)
            {
                unget(s);       /* For error position */
                return error(p, "PE ref not allowed here in internal subset");
            }
            require(parse_reference(p, 1, 1, 1));
            s = p->source;      /* parse_reference pushes a new source */
            break;

        case '&':
            /* With IgnoreEntities set, '&' is an ordinary character */
            if(ParserGetFlag(p, IgnoreEntities))
                goto deflt;
            if(type == LT_plain || type == LT_pubid)
            {
                count++;
                break;
            }

            if(count > 0)
            {
                ifNF16wrong(p,count+1,count)
                    return error(p, "not normalized: %s", where);
                require(transcribe(p, count+1, count));
            }
            count = 0;
            if(looking_at(p, "#"))
                /* We *must* expand character references in parameter
                   entity definitions otherwise the result when it is
                   used may be syntactically incorrect. */
            {
                require(parse_character_reference(p,
                                type == LT_param_entity ||
                                ParserGetFlag(p, ExpandCharacterEntities)));
            }
            else
            {
                if(p->state == PS_error) /* looking_at may have set it */
                    return -1;
                require(parse_reference(p, 0,
                            !(type == LT_entity || type == LT_param_entity) &&
                                ParserGetFlag(p, ExpandGeneralEntities),
                            !ParserGetFlag(p, XMLMiscWFErrors)));
                s = p->source;
            }
            break;

        default:
        deflt:
            /* The closing quote only counts in the entity we started in */
            if(c == quote && p->source == start_source)
                goto done;
            count++;
        }

        /* Flush the pending run when the end of the input line is reached */
        if(at_eol(s) && count > 0)
        {
            ifNF16wrong(p,count,count)
                return error(p, "not normalized: %s", where);
            require(transcribe(p, count, count));
            count = 0;
        }
    }

done:
    /* Flush what precedes the closing quote (count+1 back, count copied),
       or at least make room for the terminating null */
    if(count > 0)
    {
        ifNF16wrong(p,count+1,count)
            return error(p, "not normalized: %s", where);
        require(transcribe(p, count+1, count));
    }
    else
        ExpandBuf(p->pbuf, p->pbufnext+1);
    p->pbuf[p->pbufnext++] = 0;

    /* Tokenised attribute values (when normalising) and public ids have
       leading and trailing spaces removed and runs of spaces collapsed.
       This is done in place; p->pbufnext is not adjusted. */
    if((ParserGetFlag(p, NormaliseAttributeValues) && type == LT_tok_attr) ||
       type == LT_pubid)
    {
        Char *old, *new;

        new = old = p->pbuf;

        /* Skip leading whitespace */

        while(*old == ' ')
        {
            changed = 1;
            old++;
        }

        /* Compress whitespace */

        for( ; *old; old++)
        {
            if(*old == ' ')
            {
                /* NB can't be at start because we skipped whitespace */
                if(new[-1] == ' ')
                    changed = 1;
                else
                    *new++ = ' ';
            }
            else
                *new++ = *old;
        }

        /* Trim trailing space (only one possible because we compressed) */

        if(new > p->pbuf && new[-1] == ' ')
        {
            changed = 1;
            new--;
        }

        *new = 0;
    }

    if(normalised)
        *normalised = changed;

    return 0;
}
2728
/*
 * Parse a document type declaration: the doctype name, an optional
 * external identifier and an optional internal subset in [...], up to the
 * closing '>'.
 *
 * If the declaration is in the right place and no dtd name has been set
 * already, the name and the internal/external parts are stored in p->dtd
 * and the parts are parsed as the TrustSDD / ProcessDTD / Validate flags
 * and the standalone setting dictate; p->xbit is then set to a bit of
 * type XBIT_dtd carrying the start position saved on entry.
 *
 * A misplaced or repeated declaration is an error under XMLStrictWFErrors;
 * otherwise it is discarded (with a warning unless IgnorePlacementErrors
 * is set) and the result of parse(p) is returned instead.
 *
 * Returns 0 on success; on failure returns the result of error() or -1.
 * Every error return made before ownership passes to p->dtd releases the
 * name and the entity parts allocated here.
 */
static int parse_dtd(Parser p)
{
    InputSource s = p->source;
    Entity parent = s->entity;
    Entity internal_part = 0, external_part = 0;
    Char *name;
    char8 *publicid = 0, *systemid = 0;
    struct xbit xbit;

    xbit = p->xbit;             /* copy start position */
    xbit.type = XBIT_dtd;

    require(parse_name(p, "for name in dtd"));
    CopyName(name);
    maybe_uppercase(p, name);

    if(ParserGetFlag(p, XMLNamespaces) &&
       check_qualname_syntax(p, name, "Doctype") < 0)
        goto fail;

    skip_whitespace(s);

    if(parse_external_id(p, 0, &publicid, &systemid,
                         ParserGetFlag(p, XMLExternalIDs),
                         ParserGetFlag(p, XMLExternalIDs)) < 0)
        goto fail;

    if(systemid || publicid)
    {
        external_part = NewExternalEntityN(0,0, publicid, systemid, 0, parent);
        if(!external_part)
        {
            Free(name);
            return error(p, "System error");
        }
        skip_whitespace(s);
    }

    if(looking_at(p, "["))
    {
        int line = s->line_number, cpos = s->next;

        if(read_markupdecls(p) < 0)
            goto fail;
        skip_whitespace(s);
        /* The text collected in p->pbuf becomes the internal subset entity */
        internal_part = NewInternalEntity(0, p->pbuf, parent, line, cpos, 1);
        Consume(p->pbuf);
        if(!internal_part)
        {
            Free(name);
            FreeEntity(external_part);
            return error(p, "System error");
        }
        internal_part->is_internal_subset = 1;
    }
    if(p->state == PS_error)    /* looking_at may have set it */
        goto fail;

    if(expect(p, '>', "at end of dtd") < 0)
        goto fail;

    if(p->state == PS_prolog1)
        p->state = PS_prolog2;
    else
    {
        Free(name);
        FreeEntity(external_part);
        FreeEntity(internal_part);

        if(ParserGetFlag(p, XMLStrictWFErrors))
            return error(p, "Misplaced or repeated DOCTYPE declaration");
        else if(!ParserGetFlag(p, IgnorePlacementErrors))
            warn(p, "Misplaced or repeated DOCTYPE declaration");

        /* Ignore it and return the next bit */
        return parse(p);
    }

    if(p->dtd->name)
    {
        Free(name);
        FreeEntity(external_part);
        FreeEntity(internal_part);

        /* This happens if we manually set the dtd */
        return parse(p);
    }

    /* From here on the dtd owns the name and both parts */
    p->dtd->name = name;
    p->dtd->internal_part = internal_part;
    p->dtd->external_part = external_part;

    if(internal_part)
    {
        if(ParserGetFlag(p, TrustSDD) || ParserGetFlag(p, ProcessDTD))
        {
            ParseDtd(p, internal_part);
            if(p->xbit.type == XBIT_error)
                return -1;
        }
    }

    if(external_part)
    {
        if((ParserGetFlag(p, TrustSDD) &&
            (ParserGetFlag(p, Validate) || p->standalone != SDD_yes)) ||
           (!ParserGetFlag(p, TrustSDD) &&
            ParserGetFlag(p, ProcessDTD)))
        {
            ParseDtd(p, external_part);
            if(p->xbit.type == XBIT_error)
                return -1;
        }
    }

    p->xbit = xbit;
    return 0;

fail:
    /* Either part may still be null here; the existing cleanup paths
       above already pass possibly-null parts to FreeEntity */
    Free(name);
    FreeEntity(external_part);
    FreeEntity(internal_part);
    return -1;
}
2845
/*
 * Collect the text of an internal subset into p->pbuf.  The caller has
 * already consumed the opening '['; reading stops at the matching ']',
 * which is consumed but not stored, and the buffer is null-terminated.
 *
 * Brackets nest.  Quoted strings and anything following two consecutive
 * hyphens (up to the next two consecutive hyphens) are copied without
 * being examined for brackets or quotes.
 *
 * Returns 0 on success; on failure returns the result of error() or -1.
 */
static int read_markupdecls(Parser p)
{
    InputSource s = p->source;
    int nesting = 1;            /* unmatched '[' seen, including the caller's */
    int ch, inner;
    int dashes = 0;             /* length of the current run of '-' */
    int pending = 0;            /* characters read but not yet transcribed */

    p->pbufnext = 0;

    for(;;)
    {
        ch = get(s);
        if(ch == BADCHAR)
            return error(p, "Input error: %s", s->error_msg);
        if(ch == XEOE)
            return error(p, "EOE in DTD");

        dashes = (ch == '-') ? dashes + 1 : 0;
        pending++;

        if(ch == ']')
        {
            if(--nesting == 0)
            {
                pending--;      /* We don't want the final ']' */
                require(transcribe(p, pending+1, pending));
                p->pbuf[p->pbufnext++] = 0;
                return 0;
            }
        }
        else if(ch == '[')
        {
            nesting++;
        }
        else if(ch == '"' || ch == '\'')
        {
            /* Copy up to and including the matching quote */
            for(;;)
            {
                inner = get(s);
                if(inner == XEOE)
                    return error(p, "EOE in DTD");
                if(inner == BADCHAR)
                    return error(p, "Input error: %s", s->error_msg);
                pending++;
                if(at_eol(s))
                {
                    require(transcribe(p, pending, pending));
                    pending = 0;
                }
                if(inner == ch)
                    break;
            }
        }
        else if(ch == '-' && dashes >= 2)
        {
            /* Copy up to and including the next pair of hyphens */
            dashes = 0;
            for(;;)
            {
                inner = get(s);
                if(inner == XEOE)
                    return error(p, "EOE in DTD");
                if(inner == BADCHAR)
                    return error(p, "Input error: %s", s->error_msg);
                pending++;
                if(at_eol(s))
                {
                    require(transcribe(p, pending, pending));
                    pending = 0;
                }
                dashes = (inner == '-') ? dashes + 1 : 0;
                if(dashes == 2)
                    break;
            }
            dashes = 0;
        }

        /* Flush before the current input line goes away */
        if(at_eol(s) && pending > 0)
        {
            require(transcribe(p, pending, pending));
            pending = 0;
        }
    }
}
2941
/*
 * Process the remainder of an NSL declaration, whose syntax is
 *     <?NSL DDB unquoted-filename 0>
 * The entity is marked ML_nsl, a UTF-8 encoding is replaced by
 * CE_unspecified_ascii_superset, and the filename is stored (converted to
 * char8) in the entity's ddb_filename.
 *
 * Returns 0 on success; on failure returns the result of error() or -1.
 */
static int process_nsl_decl(Parser p)
{
    InputSource s = p->source;
    int c, count = 0;

    s->entity->ml_decl = ML_nsl;

    /* The default character encoding for nSGML files is ascii-ash */
    if(s->entity->encoding == CE_UTF_8)
        s->entity->encoding = CE_unspecified_ascii_superset;

    /* Syntax is <?NSL DDB unquoted-filename 0> */

    if(!looking_at(p, "DDB "))
    {
        if(p->state == PS_error) /* looking_at may have set it */
            return -1;
        return error(p, "Expected \"DDB\" in NSL declaration");
    }

    /* Count the filename characters up to the first white space.  The
       BADCHAR and XEOE sentinels are tested before the character is
       classified, as parse_pi does for XEOE, so that a sentinel is never
       handed to is_xml_whitespace. */
    for(;;)
    {
        c = get(s);
        if(c == BADCHAR)
            return error(p, "Input error: %s", s->error_msg);
        if(c == XEOE)
            return error(p, "EOE in NSL declaration");
        if(is_xml_whitespace(c))
            break;
        if(c == '>')
            return error(p, "Syntax error in NSL declaration");
        count++;
    }

    /* Copy the filename; the extra 1 steps back over the white space
       character that ended it */
    p->pbufnext = 0;
    require(transcribe(p, count+1, count));
    p->pbuf[p->pbufnext++] = 0;

    skip_whitespace(s);
    if(!looking_at(p, "0>"))
    {
        if(p->state == PS_error) /* looking_at may have set it */
            return -1;
        return error(p, "Expected \"0>\" at end of NSL declaration");
    }

    if(!(s->entity->ddb_filename = duptochar8(p->pbuf)))
        return error(p, "System error");

    return 0;
}
2995
/*
 * Process the pseudo-attributes of an XML (or text) declaration up to and
 * including the closing "?>".
 *
 * Recognised attributes are version, encoding and standalone.  They are
 * expected in that order without repeats; a violation is an error under
 * XMLStrictWFErrors and a warning otherwise.  The results are recorded in
 * the current entity (ml_decl, version_decl, xml_version, encoding,
 * encoding_decl, standalone_decl) and in p->standalone.
 *
 * The parser's string buffer is set aside in p->save_pbuf* while the
 * attribute values are read and is put back before a successful return.
 * NOTE(review): the error returns inside the loop leave the saved buffer
 * in p->save_pbuf - presumably it is released elsewhere; confirm.
 *
 * Returns 0 on success; on failure returns the result of error() or -1.
 */
static int process_xml_decl(Parser p)
{
    InputSource s = p->source;
    enum {None, V, E, S} which, last = None;
    Char *Value, *cp;
    char8 *value;
    CharacterEncoding enc = CE_unknown;
    int c;      /* int, like every other reader here, so that the
                   sentinels returned by get() are not narrowed */

    /*
     * If we are reading an external entity, should the XML declaration
     * (actually "text declaration") be included as part of the replacement
     * text?  The standard does not as far as I can see define the
     * replacement text of an external entity, but it says "a parsed
     * entity's contents are referred to as its replacement text" and
     * the production for extParsedEnt is "TextDecl? content".  If the
     * "contents" of the entity are identified with "content" in the
     * production, then clearly the text declaration is not part of the
     * replacement text.  On the other hand, this would be an inconsistency
     * between XML and SGML (which regards the declaration as just another
     * processing instruction), and there aren't any of those.
     *
     * It seems quite reasonable to want to put an encoding declaration
     * on an external entity containing only PCDATA, and this would be
     * illegal if the text declaration were inserted.  Furthermore, it's
     * way too much trouble to save the text declaration as well as parse it.
     */

    s->entity->ml_decl = ML_xml;

    /* Save the string buffer because it may already be in use */
    p->save_pbuf = p->pbuf;
    p->save_pbufsize = p->pbufsize;
    p->save_pbufnext = p->pbufnext;
    Consume(p->pbuf);

    while(!looking_at(p, "?>"))
    {
        if(looking_at(p, "version"))
            which = V;
        else if(looking_at(p, "encoding"))
            which = E;
        else if(looking_at(p, "standalone"))
            which = S;
        else if(p->state == PS_error) /* looking_at may have set it */
            return -1;
        else
            return error(p, "Expected \"version\", \"encoding\" or "
                         "\"standalone\" in XML declaration");

        /* The enum order V < E < S is the required attribute order */
        if(which <= last)
        {
            if(ParserGetFlag(p, XMLStrictWFErrors))
                return error(p, "Repeated or misordered attributes "
                                "in XML declaration");
            warn(p, "Repeated or misordered attributes in XML declaration");
        }
        last = which;

        skip_whitespace(s);
        require(expect(p, '=', "after attribute name in XML declaration"));
        skip_whitespace(s);

        require(parse_string(p, "for attribute value in XML declaration",
                             LT_plain, 0));

        maybe_uppercase(p, p->pbuf);
        Value = p->pbuf;

        if(which == E)
        {
            if(!is_ascii_alpha(Value[0]))
                return error(p, "Encoding name does not begin with letter");
            for(cp=Value+1; *cp; cp++)
                if(!is_ascii_alpha(*cp) && !is_ascii_digit(*cp) &&
                   *cp != '.' && *cp != '_' && *cp != '-')
                    return error(p, "Illegal character %s in encoding name",
                                 escape(*cp, p->escbuf[0]));

            value = tochar8(Value);

            enc = FindEncoding(value);
            if(enc == CE_unknown)
                return error(p, "Unknown declared encoding %s", value);

            if(EncodingsCompatible(p->source->entity->encoding, enc, &enc))
            {
#if CHAR_SIZE == 8
                /* We ignore the declared encoding in 8-bit mode,
                   and treat it as a random ascii superset. */
#else
                p->source->entity->encoding = enc;
#endif
            }
            else
                return error(p, "Declared encoding %s is incompatible with %s "
                                "which was used to read it",
                             CharacterEncodingName[enc],
                             CharacterEncodingName[p->source->entity->encoding]);

            s->entity->encoding_decl = enc;
        }

        if(which == S)
        {
            value = tochar8(Value);

            if(str_maybecase_cmp8(p, value, "no") == 0)
                p->standalone = SDD_no;
            else if(str_maybecase_cmp8(p, value, "yes") == 0)
                p->standalone = SDD_yes;
            else
                return error(p, "Expected \"yes\" or \"no\" "
                                "for standalone in XML declaration");

            s->entity->standalone_decl = p->standalone;
        }

        if(which == V)
        {
            for(cp=Value; *cp; cp++)
                if(!is_ascii_alpha(*cp) && !is_ascii_digit(*cp) &&
                   *cp != '.' && *cp != '_' && *cp != '-' && *cp != ':')
                    return error(p, "Illegal character %s in version number",
                                 escape(*cp, p->escbuf[0]));

            /* Only the first version seen for this entity is recorded */
            if(!s->entity->version_decl)
            {
                if(!(s->entity->version_decl = duptochar8(Value)))
                    return error(p, "System error");

                if(strcmp8(s->entity->version_decl, "1.0") == 0)
                    s->entity->xml_version = XV_1_0;
                else if(strcmp8(s->entity->version_decl, "1.1") == 0)
                    s->entity->xml_version = XV_1_1;
                else if(!ParserGetFlag(p, Pre105VersionCheck) && is_v1x(s->entity->version_decl))
                    s->entity->xml_version = XV_1_0;
                else
                {
                    if(ParserGetFlag(p, XMLStrictWFErrors))
                        return error(p, "Version number \"%s\" not supported",
                                     s->entity->version_decl);
                    warn(p, "Version number \"%s\" not supported, "
                            "parsing as XML 1.1",
                         s->entity->version_decl);
                    s->entity->xml_version = XV_1_1;
                }
            }
        }

        /* After a value there must be white space or the closing "?>" */
        c = get(s);
        if(c == BADCHAR)
            return error(p, "Input error: %s", s->error_msg);
        if(c == '?')
            unget(s);
        else if(c == XEOE || !is_xml_whitespace(c))
            return error(p, "Expected whitespace or \"?>\" after attribute "
                            "in XML declaration");
        skip_whitespace(s);
    }

    /* Restore the string buffer */
    Free(p->pbuf);
    p->pbuf = p->save_pbuf;
    p->pbufsize = p->save_pbufsize;
    p->pbufnext = p->save_pbufnext;
    Consume(p->save_pbuf);

    return 0;
}
3166
/*
 * Return 1 if version has the form "1." followed by one or more decimal
 * digits and nothing else, otherwise 0.
 */
static int is_v1x(const char *version)
{
    const char *digit;

    if(version[0] != '1' || version[1] != '.')
        return 0;

    /* There must be at least one character after the dot ... */
    digit = version + 2;
    if(*digit == '\0')
        return 0;

    /* ... and every one of them must be a digit */
    do
    {
        if(*digit < '0' || *digit > '9')
            return 0;
    } while(*++digit != '\0');

    return 1;
}
3182
/*
 * Parse a CDATA section up to and including the closing "]]>".
 *
 * CDATA sections are rejected in the prolog and after the body.  When
 * validating, one inside an element whose content type is neither mixed
 * nor ANY is reported as a validity error and the element's context is
 * cleared.
 *
 * On success p->xbit.type is XBIT_cdsect and p->xbit.cdsect_chars holds
 * the null-terminated content without the "]]>".
 *
 * Returns 0 on success; on failure returns the result of error() or -1.
 */
static int parse_cdata(Parser p)
{
    InputSource s = p->source;
    int c, c1=0, c2=0;          /* current and two previous characters */
    int count = 0;              /* characters read but not yet transcribed */
    NF16StartCheck(p);

    if(p->state <= PS_prolog2)
        return error(p, "CDATA section not allowed in prolog");
    if(p->state == PS_epilog)
        return error(p, "CDATA section not allowed after body");

    /* Guard against an empty element stack before using VectorLast, as
       parse_pi and parse_reference do */
    if(ParserGetFlag(p, Validate) && VectorCount(p->element_stack) > 0)
    {
        ElementDefinition e = VectorLast(p->element_stack).definition;
        if(!(e->type == CT_mixed || e->type == CT_any))
        {
            require(validity_error(p, "CDATA section not allowed here"));
            VectorLast(p->element_stack).context = 0;
        }
    }

    p->pbufnext = 0;

    while((c = get(s)) != XEOE)
    {
        if(c == BADCHAR)
            return error(p, "Input error: %s", s->error_msg);
        count++;
        if(c == '>' && c1 == ']' && c2 == ']')
            break;
        /* Flush before the current input line goes away */
        if(at_eol(s))
        {
            ifNF16wrong(p,count,count)
                return error(p, "CDATA section not normalized");
            require(transcribe(p, count, count));
            count = 0;
        }
        c2 = c1; c1 = c;
    }

    if(c == XEOE)
        return error(p, "EOE in CDATA section");

    /* Check and copy the final run without the three characters of "]]>",
       the same range parse_comment uses for its "-->" */
    ifNF16wrong(p,count,count-3)
        return error(p, "CDATA section not normalized");
    require(transcribe(p, count, count-3));
    p->pbuf[p->pbufnext++] = 0;
    p->xbit.type = XBIT_cdsect;
    p->xbit.cdsect_chars = p->pbuf;
    Consume(p->pbuf);

    NF16StartCheck(p);
    return 0;
}
3237
ParseDtd(Parser p,Entity e)3238 XBit ParseDtd(Parser p, Entity e)
3239 {
3240 InputSource source, save;
3241
3242 if(e->type == ET_external && p->entity_opener)
3243 source = p->entity_opener(e, p->entity_opener_arg);
3244 else
3245 source = EntityOpen(e);
3246 if(!source)
3247 {
3248 error(p, "Couldn't open dtd entity %s", EntityDescription(e));
3249 return &p->xbit;
3250 }
3251
3252 save = p->source;
3253 p->source = 0;
3254 if(ParserPush(p, source) == -1)
3255 return &p->xbit;
3256
3257 p->have_dtd = 1;
3258
3259 p->external_pe_depth = (source->entity->type == ET_external);
3260
3261 while(parse_markupdecl(p) == 0)
3262 ;
3263
3264 p->external_pe_depth = 0;
3265
3266 /* don't restore after error, so user can call ParserPerror */
3267 if(p->xbit.type != XBIT_error)
3268 {
3269 ParserPop(p); /* to free the input source */
3270 p->source = save;
3271 }
3272
3273 return &p->xbit;
3274 }
3275
3276 /*
3277 * Returns 0 normally, -1 if error, 1 at EOF.
3278 */
parse_markupdecl(Parser p)3279 static int parse_markupdecl(Parser p)
3280 {
3281 InputSource s, t;
3282 int c;
3283 int cur_line, cur_char;
3284 Entity cur_ent, cur_ext_ent = 0;
3285
3286 if(p->state == PS_error)
3287 return error(p, "Attempt to continue reading DTD after error");
3288
3289 clear_xbit(&p->xbit);
3290
3291 require(skip_dtd_whitespace(p, 1)); /* allow PE even in internal subset */
3292 s = p->source;
3293 SourcePosition(s, &p->xbit.entity, &p->xbit.byte_offset);
3294
3295 cur_ent = s->entity;
3296 cur_line = s->line_number;
3297 cur_char = s->next;
3298
3299 /* Find the current *external* entity, to use as base URI for system
3300 identifiers */
3301
3302 for(t = s; t; t = t->parent)
3303 if(t->entity->type == ET_external)
3304 {
3305 cur_ext_ent = t->entity;
3306 break;
3307 }
3308 if(!cur_ext_ent)
3309 cur_ext_ent = p->document_entity;
3310
3311 c = get(s);
3312 switch(c)
3313 {
3314 case BADCHAR:
3315 return error(p, "Input error: %s", s->error_msg);
3316 case XEOE:
3317 p->xbit.type = XBIT_none;
3318 return 1;
3319 case '<':
3320 if(looking_at(p, "!ELEMENT"))
3321 {
3322 require(expect_dtd_whitespace(p, "after ELEMENT"));
3323 return parse_element_decl(p, cur_ent);
3324 }
3325 else if(looking_at(p, "!ATTLIST"))
3326 {
3327 require(expect_dtd_whitespace(p, "after ATTLIST"));
3328 return parse_attlist_decl(p, cur_ent);
3329 }
3330 else if(looking_at(p, "!ENTITY"))
3331 {
3332 require(expect_dtd_whitespace(p, "after ENTITY"));
3333 return parse_entity_decl(p, cur_ent, cur_line, cur_char,
3334 cur_ext_ent);
3335 }
3336 else if(looking_at(p, "!NOTATION"))
3337 {
3338 require(expect_dtd_whitespace(p, "after NOTATION"));
3339 return parse_notation_decl(p, cur_ent);
3340 }
3341 else if(looking_at(p, "!["))
3342 return parse_conditional(p, cur_ent);
3343 else if(looking_at(p, "?"))
3344 {
3345 require(parse_pi(p, cur_ent));
3346 if(p->dtd_callback)
3347 p->dtd_callback(&p->xbit, p->dtd_callback_arg);
3348 else
3349 FreeXBit(&p->xbit);
3350 return 0;
3351 }
3352 else if(looking_at(p, "!--"))
3353 {
3354 if(ParserGetFlag(p, ReturnComments))
3355 {
3356 require(parse_comment(p, 0, cur_ent));
3357 if(p->dtd_callback)
3358 p->dtd_callback(&p->xbit, p->dtd_callback_arg);
3359 else
3360 FreeXBit(&p->xbit);
3361 return 0;
3362 }
3363 else
3364 return parse_comment(p, 1, cur_ent);
3365 }
3366 else if(p->state == PS_error) /* looking_at may have set it */
3367 return -1;
3368 else
3369 return error(p, "Syntax error after < in dtd");
3370 default:
3371 unget(s); /* For error position */
3372 return error(p, "Expected \"<\" in dtd, but got %s",
3373 escape(c, p->escbuf[0]));
3374 }
3375 }
3376
/*
 * Parse an entity reference; the introducing character has already been
 * read, so this reads the name and the closing ';'.
 *
 *   pe             - non-zero for a parameter entity reference
 *   expand         - if zero, the reference text is copied through with
 *                    transcribe() instead of being expanded
 *   allow_external - if zero, a reference to an external entity is an error
 *
 * When expanding, the entity is looked up, checked (unparsed, external,
 * recursive, standalone constraints), opened and pushed as the parser's
 * current source.  An undefined general entity is an error under
 * ErrorOnUndefinedEntities; otherwise a warning is given and a definition
 * is faked whose text is "&#38;" followed by the name and ';'.  An
 * undefined parameter entity is always an error.
 *
 * Returns 0 on success (or the result of transcribe() when not
 * expanding); on failure returns the result of error() or -1.
 */
static int parse_reference(Parser p, int pe, int expand, int allow_external)
{
    Entity target;
    InputSource s;

    require(parse_name(p, pe ? "for parameter entity" : "for entity"));
    require(expect(p, ';', "after entity name"));

    if(ParserGetFlag(p, Validate) && VectorCount(p->element_stack) > 0)
    {
        ElementDefinition container = VectorLast(p->element_stack).definition;

        if(container->type == CT_empty)
        {
            require(validity_error(p, "Entity reference not allowed in EMPTY element %S",
                                   container->name));
        }
    }

    if(!expand)
    {
        /* Introducer + name + ';' */
        int reflen = 1 + p->namelen + 1;

        return transcribe(p, reflen, reflen);
    }

    target = FindEntityN(p->dtd, p->name, p->namelen, pe);
    if(!target)
    {
        Char *text;
        int namelen = p->namelen;
        int i;

        if(pe || ParserGetFlag(p, ErrorOnUndefinedEntities))
            return error(p, "Undefined%s entity %.*S",
                         pe ? " parameter" : "" ,
                         p->namelen > 50 ? 50 : p->namelen, p->name);

        warn(p, "Undefined%s entity %.*S",
             pe ? " parameter" : "",
             p->namelen > 50 ? 50 : p->namelen, p->name);

        /* Fake a definition for it: "&#38;" + name + ";" + null */

        text = Malloc((5 + namelen + 1 + 1) * sizeof(Char));
        if(!text)
            return error(p, "System error");

        text[0] = '&';
        text[1] = '#';
        text[2] = '3';
        text[3] = '8';
        text[4] = ';';
        for(i = 0; i < namelen; i++)
            text[5 + i] = p->name[i];
        text[5 + namelen] = ';';
        text[5 + namelen + 1] = 0;

        target = NewInternalEntityN(p->name, p->namelen, text, 0, 0, 0, 0);
        if(!target)
            return error(p, "System error");
        if(!DefineEntity(p->dtd, target, 0))
            return error(p, "System error");

        if(ParserGetFlag(p, XMLNamespaces) && Strchr(target->name, ':'))
        {
            require(namespace_error(p, "Entity name %S contains colon",
                                    target->name));
        }
    }

    if(target->type == ET_external)
    {
        if(target->notation)
            return error(p, "Illegal reference to unparsed entity \"%S\"",
                         target->name);

        if(!allow_external)
            return error(p, "Illegal reference to external entity \"%S\"",
                         target->name);
    }

    /* The entity must not already be open somewhere up the source stack */
    s = p->source;
    while(s)
    {
        if(s->entity == target)
            return error(p, "Recursive reference to entity \"%S\"",
                         target->name);
        s = s->parent;
    }

    if(p->standalone == SDD_yes &&
       parsing_internal(p) && target->is_externally_declared)
    {
        /* This is a WF error by erratum 34 */
        require(error(p, "Internal reference to externally declared entity "
                         "\"%S\" in document declared standalone",
                      target->name));
    }
    else if(ParserGetFlag(p, Validate) && p->standalone == SDD_yes &&
            p->state == PS_body && target->is_externally_declared)
    {
        require(validity_error(p, "Reference to externally declared entity "
                                  "\"%S\" in document declared standalone",
                               target->name));
    }

    /* Open the entity, through the user's opener if it is external */
    s = (target->type == ET_external && p->entity_opener)
        ? p->entity_opener(target, p->entity_opener_arg)
        : EntityOpen(target);
    if(!s)
        return error(p, "Couldn't open entity %S, %s",
                     target->name, EntityDescription(target));

    require(ParserPush(p, s));
    NF16StartCheck(p);

    return 0;
}
3480
/*
 * Parse a numeric character reference; "&#" has already been read, so
 * this reads an optional 'x', the digits and the closing ';'.
 *
 *   expand - if zero, the reference text is copied through with
 *            transcribe() instead of being converted to a character
 *
 * When expanding, the character is appended to p->pbuf (as a surrogate
 * pair for codes of 0x10000 and above in the non-8-bit build).  A code
 * that is too big or is not a legal character is an error under
 * ErrorOnBadCharacterEntities; otherwise a warning is given, nothing is
 * appended and 0 is returned.
 *
 * Returns 0 on success (or the result of transcribe() when not
 * expanding); on failure returns the result of error() or -1.
 */
static int parse_character_reference(Parser p, int expand)
{
    InputSource s = p->source;
    int c, base = 10;
    int count = 0;              /* number of digits read */
    unsigned int code = 0;
    Char *ch = s->line + s->next;   /* start of the digits in the line */

    if(looking_at(p, "x"))
    {
        ch++;
        base = 16;
    }
    if(p->state == PS_error)    /* looking_at may have set it */
        return -1;

    /* First pass: check and count the digits up to the ';' */
    while((c = get(s)) != ';')
    {
        if(c == BADCHAR)
            return error(p, "Input error: %s", s->error_msg);
        if((c >= '0' && c <= '9') ||
           (base == 16 && ((c >= 'A' && c <= 'F') ||
                           (c >= 'a' && c <= 'f'))))
            count++;
        else
        {
            unget(s);           /* For error position */
            return error(p,
                         "Illegal character %s in base-%d character reference",
                         escape(c, p->escbuf[0]), base);
        }
    }

    /* "&#" + optional 'x' + digits + ';' */
    if(!expand)
        return transcribe(p, 2 + (base == 16) + count + 1,
                          2 + (base == 16) + count + 1);

    /* Second pass: re-read the digits from the line and build the code */
    while(count-- > 0)
    {
        c = *ch++;
        if(c >= '0' && c <= '9')
            code = code * base + (c - '0');
        else if(c >= 'A' && c <= 'F')
            code = code * base + 10 + (c - 'A');
        else
            code = code * base + 10 + (c - 'a');

        /* Test here rather than just at the end to avoid undetected overflow */
        if(code >= 0x110000)
        {
            if(ParserGetFlag(p, ErrorOnBadCharacterEntities))
                return error(p, "Character reference code too big");
            else
                warn(p, "Character reference code too big, ignored");
            return 0;
        }
    }

    /* allow refs to C0 and C1 controls except NUL in XML 1.1 */
#define is_xml11_legal_control(c) \
    (((c) >= 0x01 && (c) <= 0x1f) || ((c) >= 0x7f && (c) <= 0x9f))

#if CHAR_SIZE == 8
    if(code > 255 ||
       !(is_xml_legal(code, p->map) ||
         (p->xml_version >= XV_1_1 && is_xml11_legal_control(code))))
    {
        if(ParserGetFlag(p, ErrorOnBadCharacterEntities))
            return error(p, "0x%x is not a valid 8-bit XML character", code);
        else
            warn(p, "0x%x is not a valid 8-bit XML character; ignored", code);
        return 0;
    }
#else
    /* Logical ||, matching the 8-bit branch above */
    if(!(is_xml_legal(code, p->map) ||
         (p->xml_version >= XV_1_1 && is_xml11_legal_control(code))))
    {
        if(ParserGetFlag(p, ErrorOnBadCharacterEntities))
            return error(p, "0x%x is not a valid XML character", code);
        else
            warn(p, "0x%x is not a valid XML character; ignored", code);
        return 0;
    }

    if(code >= 0x10000)
    {
        /* Use surrogates */

        ExpandBuf(p->pbuf, p->pbufnext+2);
        code -= 0x10000;

        p->pbuf[p->pbufnext++] = (code >> 10) + 0xd800;
        p->pbuf[p->pbufnext++] = (code & 0x3ff) + 0xdc00;
        if(p->checker && NF16wrong==nf16checkL(p->checker,
                                               p->pbuf + p->pbufnext - 2, 2))
            return error(p, "numeric character reference not normalized");

        return 0;
    }
#endif

    ExpandBuf(p->pbuf, p->pbufnext+1);
    p->pbuf[p->pbufnext++] = code;
    if(p->checker && NF16wrong==nf16checkL(p->checker,
                                           p->pbuf + p->pbufnext - 1, 1))
        return error(p, "numeric character reference not normalized");


    return 0;
}
3591
3592 /* Called after reading '<!ELEMENT ' */
3593
/*
 * Parse the rest of an element declaration.
 *
 * p   - the parser
 * ent - the entity in which the declaration started; compared with the
 *       entity containing the closing '>' when validating
 *
 * Returns 0 on success, -1 on error.
 *
 * The content spec is EMPTY, ANY, or a parenthesised content model.  A
 * content model is parsed into a content particle tree (cp), checked, and
 * also rendered back to a string (content).  Both are handed to
 * DefineElement / RedefineElement, or freed if the element was already
 * (non-tentatively) declared.
 *
 * Error checks are written as explicit "< 0" tests that jump to a common
 * cleanup, so that the copied name (and cp / content while this function
 * still holds them) are released on every error path.
 */
static int parse_element_decl(Parser p, Entity ent)
{
    Char *name = 0;
    ContentType type;
    ElementDefinition def;
    Entity tent;
    ContentParticle cp = 0;
    Char *content = 0;

    if(parse_name(p, "for name in element declaration") < 0)
        return -1;
    CopyName(name);
    maybe_uppercase(p, name);

    if(expect_dtd_whitespace(p, "after name in element declaration") < 0)
        goto fail;

    if(looking_at(p, "EMPTY"))
    {
        type = CT_empty;
    }
    else if(looking_at(p, "ANY"))
    {
        type = CT_any;
    }
    else if(looking_at(p, "("))
    {
        /* parse_cp wants to see the '(' itself */
        unget(p->source);
        if(!(cp = parse_cp(p)) ||
           check_content_decl(p, cp) < 0 ||
           !(content = stringify_cp(cp)))
            goto fail;

        /* A choice whose first alternative is #PCDATA is mixed content */
        if(cp->type == CP_choice && cp->children[0]->type == CP_pcdata)
            type = CT_mixed;
        else
            type = CT_element;
    }
    else if(p->state == PS_error) /* looking_at may have set it */
        goto fail;
    else
    {
        error(p, "Expected \"EMPTY\", \"ANY\", or \"(\" after name in "
                 "element declaration");
        goto fail;
    }

    if(skip_dtd_whitespace(p, p->external_pe_depth > 0) < 0)
        goto fail;
    /* Note the entity before consuming '>' for the validity check below */
    tent = p->source->entity;
    if(expect(p, '>', "at end of element declaration") < 0)
        goto fail;
    if(ParserGetFlag(p, Validate) && tent != ent)
    {
        if(validity_error(p, "Element declaration ends in different "
                             "entity from that in which it starts") < 0)
            goto fail;
    }

    if((def = FindElement(p->dtd, name)))
    {
        if(def->tentative)
        {
            /* Only referred to so far: fill in the real definition */
            RedefineElement(def, type, content, cp, 1);
            cp = 0;
            content = 0;
            if(parsing_external_subset(p))
                def->is_externally_declared = 1;
        }
        else
        {
            /* Already declared: this declaration is discarded */
            FreeContentParticle(cp);
            Free(content);
            cp = 0;
            content = 0;
            if(ParserGetFlag(p, Validate))
            {
                if(validity_error(p, "Element %S declared more than once",
                                  name) < 0)
                    goto fail;
            }
            else if(ParserGetFlag(p, WarnOnRedefinitions))
                warn(p, "Ignoring redeclaration of element %S", name);
        }
    }
    else
    {
        def = DefineElement(p->dtd, name, type, content, cp, 1);
        /* cp and content are not released here after this call, whether
           or not it succeeded (as before).
           NOTE(review): DefineElement is not visible here; confirm it
           does not retain them on failure before freeing them too. */
        cp = 0;
        content = 0;
        if(!def)
        {
            error(p, "System error");
            goto fail;
        }
        if(parsing_external_subset(p))
            def->is_externally_declared = 1;
        if(ParserGetFlag(p, XMLNamespaces))
        {
            if(check_qualname_syntax(p, name, "Element") < 0)
                goto fail;
        }
    }

    Free(name);

    return 0;

fail:
    FreeContentParticle(cp);
    Free(content);
    Free(name);
    return -1;
}
3695
3696 /* Content model parsing */
3697
parse_cp(Parser p)3698 static ContentParticle parse_cp(Parser p)
3699 {
3700 ContentParticle cp;
3701 Entity ent;
3702
3703 ent = p->source->entity;
3704 if(looking_at(p, "("))
3705 {
3706 if(!(cp = parse_choice_or_seq(p, ent)))
3707 return 0;
3708 }
3709 else if(looking_at(p, "#PCDATA"))
3710 {
3711 if(!(cp = Malloc(sizeof(*cp))))
3712 {
3713 error(p, "System error");
3714 return 0;
3715 }
3716
3717 cp->type = CP_pcdata;
3718 }
3719 else if(p->state == PS_error) /* looking_at may have set it */
3720 return 0;
3721 else
3722 {
3723 if(parse_name(p, "in content declaration") < 0)
3724 return 0;
3725 maybe_uppercase_name(p);
3726
3727 if(!(cp = Malloc(sizeof(*cp))))
3728 {
3729 error(p, "System error");
3730 return 0;
3731 }
3732
3733 cp->type = CP_name;
3734 if(!(cp->element = FindElementN(p->dtd, p->name, p->namelen)))
3735 {
3736 if(!(cp->element = TentativelyDefineElementN(p->dtd,
3737 p->name, p->namelen)))
3738 {
3739 error(p, "System error");
3740 return 0;
3741 }
3742 if(ParserGetFlag(p, XMLNamespaces))
3743 if(check_qualname_syntax(p, cp->element->name, "Element") < 0)
3744 return 0;
3745 }
3746 cp->name = cp->element->name;
3747 }
3748
3749
3750 if(looking_at(p, "*"))
3751 cp->repetition = '*';
3752 else if(looking_at(p, "+"))
3753 cp->repetition = '+';
3754 else if(looking_at(p, "?"))
3755 cp->repetition = '?';
3756 else if(p->state == PS_error) /* looking_at may have set it */
3757 return 0;
3758 else
3759 cp->repetition = 0;
3760
3761 return cp;
3762 }
3763
3764 /* Called after '(' */
3765
parse_choice_or_seq(Parser p,Entity ent)3766 static ContentParticle parse_choice_or_seq(Parser p, Entity ent)
3767 {
3768 ContentParticle cp, cp1;
3769
3770
3771 require0(skip_dtd_whitespace(p, p->external_pe_depth > 0));
3772
3773 if(!(cp1 = parse_cp(p)))
3774 return 0;
3775
3776 require0(skip_dtd_whitespace(p, p->external_pe_depth > 0));
3777
3778 if(!(cp = parse_choice_or_seq_1(p, 1, 0, ent)))
3779 FreeContentParticle(cp1);
3780 else
3781 cp->children[0] = cp1;
3782
3783 return cp;
3784 }
3785
3786 /* Called before '|', ',', or ')' */
3787
parse_choice_or_seq_1(Parser p,int nchildren,char sep,Entity ent)3788 static ContentParticle parse_choice_or_seq_1(Parser p, int nchildren,
3789 char sep, Entity ent)
3790 {
3791 ContentParticle cp = 0, cp1;
3792 int nsep = get(p->source);
3793
3794 if(nsep == BADCHAR)
3795 {
3796 error(p, "Input error: %s", p->source->error_msg);
3797 return 0;
3798 }
3799
3800 if(nsep == ')')
3801 {
3802 /* We've reached the end */
3803
3804 if(ParserGetFlag(p, Validate) && p->source->entity != ent)
3805 {
3806 if(validity_error(p, "Content particle ends in different "
3807 "entity from that in which it starts") < 0)
3808 return 0;
3809 }
3810
3811 if(!(cp = Malloc(sizeof(*cp))) ||
3812 !(cp->children = Malloc(nchildren * sizeof(cp))))
3813 {
3814 Free(cp);
3815 error(p, "System error");
3816 return 0;
3817 }
3818
3819 /* The standard does not specify whether '(foo)' is a choice or a
3820 sequence. We make it a choice so that (#PCDATA) comes out as
3821 a choice, like other mixed models. */
3822 /* Erratum E50 has now resolved this the other way, but I don't
3823 see any reason to change it, since it makes no difference. */
3824
3825 cp->type = sep == ',' ? CP_seq : CP_choice;
3826 cp->nchildren = nchildren;
3827
3828 return cp;
3829 }
3830
3831 if(nsep != '|' && nsep != ',')
3832 {
3833 error(p, "Expected | or , or ) in content declaration, got %s",
3834 escape(nsep, p->escbuf[0]));
3835 return 0;
3836 }
3837
3838 if(sep && nsep != sep)
3839 {
3840 error(p, "Content particle contains both | and ,");
3841 return 0;
3842 }
3843
3844 require0(skip_dtd_whitespace(p, p->external_pe_depth > 0));
3845
3846 if(!(cp1 = parse_cp(p)))
3847 return 0;
3848
3849 require0(skip_dtd_whitespace(p, p->external_pe_depth > 0));
3850
3851 if(!(cp = parse_choice_or_seq_1(p, nchildren+1, (char)nsep, ent)))
3852 FreeContentParticle(cp1);
3853 else
3854 cp->children[nchildren] = cp1;
3855
3856 return cp;
3857 }
3858
3859 /* Check content particle matches Mixed or children */
3860
/*
 * Check a top-level content particle.
 *
 * A choice whose first child is #PCDATA is treated as a mixed content
 * declaration and checked here; anything else is passed to
 * check_content_decl_1.  Returns 0 if acceptable, -1 on error.
 */
static int check_content_decl(Parser p, ContentParticle cp)
{
    int a, b;
    ContentParticle kid;

    if(cp->type != CP_choice || cp->children[0]->type != CP_pcdata)
        return check_content_decl_1(p, cp);

    /* #PCDATA itself may not carry a repetition operator */
    if(cp->children[0]->repetition != 0)
        return error(p, "Malformed mixed content declaration");

    /* Every other alternative must be a plain, unrepeated name */
    for(a = 1; a < cp->nchildren; a++)
    {
        kid = cp->children[a];
        if(kid->type != CP_name || kid->repetition != 0)
            return error(p, "Malformed mixed content declaration");
    }

    /* The group must end in '*', except that a lone (#PCDATA) may have
       no repetition at all */
    if(cp->repetition != '*' &&
       (cp->nchildren != 1 || cp->repetition != 0))
        return error(p, "Malformed mixed content declaration");

    if(!ParserGetFlag(p, Validate))
        return 0;

    /* When validating, report every pair of identical names */
    for(a = 1; a < cp->nchildren; a++)
    {
        for(b = a + 1; b < cp->nchildren; b++)
        {
            if(Strcmp(cp->children[a]->name, cp->children[b]->name) == 0)
            {
                require(validity_error(p,
                                       "Type %S appears more than once in "
                                       "mixed content declaration",
                                       cp->children[a]->name));
            }
        }
    }

    return 0;
}
3897
/*
 * Check a content particle that is not a mixed content declaration:
 * #PCDATA must not appear anywhere inside it.
 *
 * Recurses through sequences and choices.  Returns 0 if acceptable,
 * -1 (after reporting the error) otherwise.
 */
static int check_content_decl_1(Parser p, ContentParticle cp)
{
    int k;

    if(cp->type == CP_pcdata)
        return error(p, "Misplaced #PCDATA in content declaration");

    if(cp->type == CP_seq || cp->type == CP_choice)
    {
        for(k = 0; k < cp->nchildren; k++)
        {
            if(check_content_decl_1(p, cp->children[k]) < 0)
                return -1;
        }
    }

    /* Names, and groups whose children all passed */
    return 0;
}
3916
3917 /* Reconstruct the content model as a string */
3918
stringify_cp(ContentParticle cp)3919 static Char *stringify_cp(ContentParticle cp)
3920 {
3921 int size = size_cp(cp);
3922 Char *s;
3923 FILE16 *f;
3924
3925 if(!(s = Malloc((size+1) * sizeof(Char))) ||
3926 !(f = MakeFILE16FromString(s, (size + 1) * sizeof(Char), "w")))
3927 {
3928 Free(s);
3929 return 0;
3930 }
3931
3932 print_cp(cp, f);
3933 s[size] = 0;
3934
3935 Fclose(f);
3936
3937 return s;
3938 }
3939
/*
 * Write the textual form of a content particle to f: "#PCDATA", a name,
 * or a parenthesised list of children separated by ',' (sequence) or
 * '|' (choice), followed by the repetition character if there is one.
 */
static void print_cp(ContentParticle cp, FILE16 *f)
{
    int k;

    if(cp->type == CP_pcdata)
    {
        Fprintf(f, "#PCDATA");
    }
    else if(cp->type == CP_name)
    {
        Fprintf(f, "%S", cp->name);
    }
    else if(cp->type == CP_seq || cp->type == CP_choice)
    {
        Fprintf(f, "(");
        for(k = 0; k < cp->nchildren; k++)
        {
            /* Separator goes before every child except the first */
            if(k > 0)
                Fprintf(f, cp->type == CP_seq ? "," : "|");
            print_cp(cp->children[k], f);
        }
        Fprintf(f, ")");
    }

    if(cp->repetition)
        Fprintf(f, "%c", cp->repetition);
}
3970
/*
 * Return the number of characters print_cp will write for this particle
 * (not counting any terminator).
 */
static int size_cp(ContentParticle cp)
{
    int total, k;

    if(cp->type == CP_pcdata)
    {
        total = 7;              /* length of "#PCDATA" */
    }
    else if(cp->type == CP_name)
    {
        total = Strlen(cp->name);
    }
    else
    {
        /* Two parentheses, the children, and a separator between each
           adjacent pair of children */
        total = 2;
        for(k = 0; k < cp->nchildren; k++)
            total += size_cp(cp->children[k]) + (k > 0);
    }

    /* One more for a repetition character */
    return cp->repetition ? total + 1 : total;
}
3999
/*
 * Free a content particle and, for sequences and choices, all of its
 * children and the child array.  A null argument is ignored.
 *
 * The name of a CP_name particle is not freed: it is part of the element
 * definition.
 */
void FreeContentParticle(ContentParticle cp)
{
    int k;

    if(cp)
    {
        if(cp->type == CP_seq || cp->type == CP_choice)
        {
            for(k = 0; k < cp->nchildren; k++)
                FreeContentParticle(cp->children[k]);
            Free(cp->children);
        }

        Free(cp);
    }
}
4026
4027 /* Called after reading '<!ATTLIST ' */
4028
/*
 * Parse the rest of an attribute-list declaration.
 *
 * p   - the parser
 * ent - the entity in which the declaration started; compared with the
 *       entity containing the closing '>' when validating
 *
 * Returns 0 on success, -1 on error.
 *
 * The element is looked up (and tentatively defined if unknown), then
 * each attribute definition is read in turn: name, type (with the list
 * of allowed values for NOTATION and enumerated types), default
 * declaration and default value.  A redeclaration of an existing
 * attribute is discarded.  Several validity constraints are checked when
 * validating.
 *
 * Memory: the copied name, the allowed-values array (whose first entry
 * points at the start of the buffer holding all the values) and the
 * default value are released on the error paths handled here, for as
 * long as this function still holds them.
 * NOTE(review): CopyName and ExpandBuf are macros defined outside this
 * chunk; if they return directly on allocation failure, a name held at
 * the ExpandBuf call is still not released.
 */
static int parse_attlist_decl(Parser p, Entity ent)
{
    Char *name = 0;
    ElementDefinition element;
    Entity tent;
    AttributeType type;
    DefaultType default_type;
    AttributeDefinition a;
    Char **allowed_values = 0, *t;
    Char *default_value = 0;
    int nvalues=0, i, j;
    static Char s_xml_space[] = {'x','m','l',':','s','p','a','c','e',0},
                s_default[] = {'d','e','f','a','u','l','t',0},
                s_preserve[] = {'p','r','e','s','e','r','v','e',0};

    if(parse_name(p, "for name in attlist declaration") < 0)
        return -1;
    CopyName(name);
    maybe_uppercase(p, name);

    if(!(element = FindElement(p->dtd, name)))
    {
        if(!(element = TentativelyDefineElement(p->dtd, name)))
        {
            error(p, "System error");
            goto fail;
        }
        if(ParserGetFlag(p, XMLNamespaces))
        {
            if(check_qualname_syntax(p, element->name, "Element") < 0)
                goto fail;
        }
    }

    Free(name);
    name = 0;

    if(looking_at(p, ">"))
        unget(p->source);
    else
    {
        if(p->state == PS_error) /* looking_at may have set it */
            return -1;
        if(expect_dtd_whitespace(p,
                         "after element name in attlist declaration") < 0)
            return -1;
    }

    while(tent = p->source->entity, !looking_at(p, ">"))
    {
        /* Nothing from a previous iteration is still held here */
        allowed_values = 0;
        default_value = 0;

        if(p->state == PS_error) /* looking_at may have set it */
            return -1;
        if(skip_dtd_whitespace(p, p->external_pe_depth > 0) < 0)
            return -1;
        if(parse_name(p, "for attribute in attlist declaration") < 0)
            return -1;
        CopyName(name);
        maybe_uppercase(p, name);

        if(expect_dtd_whitespace(p, "after name in attlist declaration") < 0)
            goto fail;

        /* Longer keywords are tried before their prefixes */
        if(looking_at(p, "CDATA"))
            type = AT_cdata;
        else if(looking_at(p, "IDREFS"))
            type = AT_idrefs;
        else if(looking_at(p, "IDREF"))
            type = AT_idref;
        else if(looking_at(p, "ID"))
            type = AT_id;
        else if(looking_at(p, "ENTITIES"))
            type = AT_entities;
        else if(looking_at(p, "ENTITY"))
            type = AT_entity;
        else if(looking_at(p, "NMTOKENS"))
            type = AT_nmtokens;
        else if(looking_at(p, "NMTOKEN"))
            type = AT_nmtoken;
        else if(looking_at(p, "NOTATION"))
            type = AT_notation;
        else if(p->state == PS_error) /* looking_at may have set it */
            goto fail;
        else
            type = AT_enumeration;

        if(type != AT_enumeration)
        {
            if(expect_dtd_whitespace(p, "after attribute type") < 0)
                goto fail;
        }

        if(type == AT_notation || type == AT_enumeration)
        {
            if(expect(p, '(',
                      "or keyword for type in attlist declaration") < 0)
                goto fail;

            /* Collect the values in p->pbuf, each followed by a null */
            nvalues = 0;
            p->pbufnext = 0;
            do
            {
                if(skip_dtd_whitespace(p, p->external_pe_depth > 0) < 0)
                    goto fail;
                if(type == AT_notation)
                {
                    if(parse_name(p,
                          "for notation value in attlist declaration") < 0)
                        goto fail;
                }
                else
                {
                    if(parse_nmtoken(p,
                          "for enumerated value in attlist declaration") < 0)
                        goto fail;
                }
                maybe_uppercase_name(p);
                ExpandBuf(p->pbuf, p->pbufnext + p->namelen + 1);
                memcpy(p->pbuf+p->pbufnext,
                       p->name,
                       p->namelen * sizeof(Char));
                p->pbuf[p->pbufnext + p->namelen] = 0;
                p->pbufnext += (p->namelen + 1);
                nvalues++;
                if(skip_dtd_whitespace(p, p->external_pe_depth > 0) < 0)
                    goto fail;
            }
            while(looking_at(p, "|"));

            if(p->state == PS_error) /* looking_at may have set it */
                goto fail;

            if(expect(p, ')',
                 "at end of enumerated value list in attlist declaration") < 0)
                goto fail;
            if(expect_dtd_whitespace(p, "after enumerated value list "
                                        "in attlist declaration") < 0)
                goto fail;

            allowed_values = Malloc((nvalues+1)*sizeof(Char *));
            if(!allowed_values)
            {
                error(p, "System error");
                goto fail;
            }
            /* Point each entry at its value inside the buffer */
            for(i=0, t=p->pbuf; i<nvalues; i++)
            {
                allowed_values[i] = t;
                while(*t++)
                    ;
            }
            allowed_values[nvalues] = 0;

            /* The buffer is now reached through allowed_values[0] */
            Consume(p->pbuf);
        }
        else
            allowed_values = 0;

        if(looking_at(p, "#REQUIRED"))
            default_type = DT_required;
        else if(looking_at(p, "#IMPLIED"))
            default_type = DT_implied;
        else if(looking_at(p, "#FIXED"))
        {
            default_type = DT_fixed;
            if(expect_dtd_whitespace(p, "after #FIXED") < 0)
                goto fail;
        }
        else if(p->state == PS_error) /* looking_at may have set it */
            goto fail;
        else
            default_type = DT_none;

        if(default_type == DT_fixed || default_type == DT_none)
        {
            if(parse_string(p,
                            "for default value in attlist declaration",
                            type == AT_cdata ? LT_cdata_attr :
                                               LT_tok_attr, 0) < 0)
                goto fail;
            default_value = p->pbuf;
            Consume(p->pbuf);
            if(type != AT_cdata && type != AT_entity && type != AT_entities)
                maybe_uppercase(p, default_value);
        }
        else
            default_value = 0;

        if(FindAttribute(element, name))
        {
            /* Already declared: discard this definition */
            if(ParserGetFlag(p, WarnOnRedefinitions))
                warn(p, "Ignoring redeclaration of attribute %S", name);
            if(allowed_values)
            {
                Free(allowed_values[0]);
                Free(allowed_values);
            }
            if(default_value)
                Free(default_value);

            goto done;
        }

        if(ParserGetFlag(p, Validate) && type == AT_id)
        {
            if(element->id_attribute)
            {
                if(validity_error(p,
                                  "ID attribute %S declared for element"
                                  " %S which already had one (%S)",
                                  name, element->name,
                                  element->id_attribute->name) < 0)
                    goto fail;
            }
            if(default_type != DT_implied && default_type != DT_required)
            {
                if(validity_error(p,
                                  "ID attribute %S must have declared "
                                  "default of #IMPLIED or #REQUIRED, not %s",
                                  name, DefaultTypeName[default_type]) < 0)
                    goto fail;
            }
        }

        if(ParserGetFlag(p, Validate) &&
           (type == AT_notation || type == AT_enumeration))
            /* Duplicate enumerated values were made invalid by
               an erratum of 2 Nov 2000 */
        {
            for(i=0; i<nvalues; i++)
                for(j=i+1; j<nvalues; j++)
                    if(Strcmp(allowed_values[i], allowed_values[j]) == 0)
                    {
                        /* Two %S conversions, so two arguments */
                        if(validity_error(p,
                                          "Enumerated attribute %S has "
                                          "duplicate allowed value %S",
                                          name,
                                          allowed_values[i]) < 0)
                            goto fail;
                        break;
                    }
        }

        if(ParserGetFlag(p, Validate) && type == AT_notation)
        {
            /* Requirement for at most one notation attribute was
               added in the errata of 17 Feb 1999 */
            if(element->notation_attribute)
            {
                if(validity_error(p,
                                  "NOTATION attribute %S declared for element"
                                  " %S which already had one (%S)",
                                  name, element->name,
                                  element->notation_attribute->name) < 0)
                    goto fail;
            }
        }

        if(ParserGetFlag(p, Validate) && Strcmp(name, s_xml_space) == 0)
        {
            if(type != AT_enumeration)
            {
                if(validity_error(p,
                        "xml:space attribute must have enumerated type") < 0)
                    goto fail;
            }
            else for(i=0; i<nvalues; i++)
                if(Strcmp(allowed_values[i], s_default) != 0 &&
                   Strcmp(allowed_values[i], s_preserve) != 0)
                {
                    if(validity_error(p,
                        "xml:space attribute values may only be \"default\" or \"preserve\"") < 0)
                        goto fail;
                    break;
                }
        }

        /* It doesn't seem to be required that xml:lang be declared
           NMTOKEN, so don't check it */

        a = DefineAttribute(element, name, type, allowed_values,
                            default_type, default_value, 1);
        /* The values are not released here after this call, whether or
           not it succeeded (as before).
           NOTE(review): DefineAttribute is not visible here; confirm it
           does not retain them on failure before freeing them too. */
        allowed_values = 0;
        default_value = 0;
        if(!a)
        {
            error(p, "System error");
            goto fail;
        }
        if(parsing_external_subset(p))
            a->is_externally_declared = 1;
        if(ParserGetFlag(p, XMLID) &&
           element->xml_id_attribute == a && a->type != AT_id)
        {
            warn(p, "xml:id error: xml:id attribute must be declared as type ID");
            /* Fix the declaration so that we treat it as type ID */
            a->type = AT_id;
        }
        if(ParserGetFlag(p, XMLNamespaces))
        {
            if(check_qualname_syntax(p, a->name, "Attribute") < 0)
                goto fail;
        }

    done:
        Free(name);
        name = 0;

        /* Nothing is held at this point, so a plain return is enough */
        if(skip_dtd_whitespace(p, p->external_pe_depth > 0) < 0)
            return -1;
    }

    if(ParserGetFlag(p, Validate) && tent != ent)
    {
        require(validity_error(p, "Attlist declaration ends in different "
                                  "entity from that in which it starts"));
    }

    return 0;

fail:
    if(allowed_values)
    {
        Free(allowed_values[0]);
        Free(allowed_values);
    }
    Free(default_value);
    Free(name);
    return -1;
}
4313
4314 /* Used for external dtd part, entity definitions and notation definitions. */
4315 /* NB PE references are not allowed here (why not?) */
4316
/*
 * Parse an external identifier: SYSTEM "sysid" or PUBLIC "pubid" "sysid".
 *
 * required - if true, it is an error for neither keyword to be present
 * publicid, systemid - receive newly allocated 8-bit copies of the ids,
 *            or 0 for an id that was not read
 * preq     - if false (together with sreq false), the public id literal
 *            after PUBLIC may be absent
 * sreq     - if false, the system id literal may be absent
 *
 * Returns 0 on success, -1 on error.  When an error occurs after the
 * public id has been copied, the copy is freed and *publicid reset to 0,
 * so callers that simply return on failure do not leak it.
 */
static int parse_external_id(Parser p, int required,
                             char8 **publicid, char8 **systemid,
                             int preq, int sreq)
{
    InputSource s = p->source;
    int c;
    Char *cp;

    *publicid = 0;
    *systemid = 0;

    if(looking_at(p, "SYSTEM"))
    {
        if(!sreq)
        {
            /* Peek at the next character: no quote means no system id */
            skip_whitespace(s);
            c = get(s); unget(s);
            if(c == BADCHAR)
                return error(p, "Input error: %s", s->error_msg);
            if(c != '"' && c != '\'')
                return 0;
        }
        else
        {
            require(expect_dtd_whitespace(p, "after SYSTEM"));
        }

        require(parse_string(p, "for system ID", LT_plain, 0));
        if(!(*systemid = duptochar8(p->pbuf)))
            return error(p, "System error");
    }
    else if(looking_at(p, "PUBLIC"))
    {
        if(!preq && !sreq)
        {
            skip_whitespace(s);
            c = get(s); unget(s);
            if(c == BADCHAR)
                return error(p, "Input error: %s", s->error_msg);
            if(c != '"' && c != '\'')
                return 0;
        }
        else
        {
            require(expect_dtd_whitespace(p, "after PUBLIC"));
        }

        require(parse_string(p, "for public ID", LT_pubid, 0));

        /* Only a restricted character set is allowed in a public id */
        for(cp=p->pbuf; *cp; cp++)
            if(!is_ascii_alpha(*cp) && !is_ascii_digit(*cp) &&
               strchr8("-'()+,./:=?;!*#@$_% \r\n", *cp) == 0)
                return error(p, "Illegal character %s in public id",
                             escape(*cp, p->escbuf[0]));

        if(!(*publicid = duptochar8(p->pbuf)))
            return error(p, "System error");

        /* From here on *publicid is allocated: errors go through fail */

        if(!sreq)
        {
            skip_whitespace(s);
            c = get(s); unget(s);
            if(c == BADCHAR)
            {
                error(p, "Input error: %s", s->error_msg);
                goto fail;
            }
            if(c != '"' && c != '\'')
                return 0;
        }
        else
        {
            if(expect_dtd_whitespace(p, "after public id") < 0)
                goto fail;
        }

        if(parse_string(p, "for system ID", LT_plain, 0) < 0)
            goto fail;
        if(!(*systemid = duptochar8(p->pbuf)))
        {
            error(p, "System error");
            goto fail;
        }
    }
    else if(p->state == PS_error) /* looking_at may have set it */
        return -1;
    else if(required)
        return error(p, "Missing or malformed external ID");

    return 0;

fail:
    Free(*publicid);
    *publicid = 0;
    return -1;
}
4400
4401 /* Called after reading '<!ENTITY ' */
4402
/*
 * Parse the rest of an entity declaration.
 *
 * p           - the parser
 * ent         - the entity in which the declaration started
 * line, chpos - passed to NewInternalEntity for an internal entity
 * ext_ent     - made the parent of an external entity (see below)
 *
 * Returns 0 on success, -1 on error.
 *
 * A leading '%' makes this a parameter entity.  A quoted value gives an
 * internal entity; otherwise an external id is required, optionally
 * followed by NDATA and a notation name (general entities only).  If an
 * entity of the same name and kind already exists, the new one is
 * discarded, with a warning where appropriate.
 *
 * The copied name, the external ids (until they have been passed to
 * NewExternalEntityN) and the new entity (until it has been defined or
 * discarded) are released on error paths.
 */
static int parse_entity_decl(Parser p, Entity ent, int line, int chpos,
                             Entity ext_ent)
{
    Entity e = 0, old, tent;
    int pe, t, namelen;
    Char *name = 0;
    char8 *publicid = 0, *systemid = 0;

    pe = looking_at(p, "%");    /* If it were a PE ref, we would
                                   already have pushed it */
    if(p->state == PS_error)    /* looking_at may have set it */
        return -1;

    if(skip_dtd_whitespace(p, p->external_pe_depth > 0) < 0)
        return -1;
    if(parse_name(p, "for name in entity declaration") < 0)
        return -1;
    namelen = p->namelen;
    CopyName(name);

    if(ParserGetFlag(p, XMLNamespaces) && Strchr(name, ':'))
    {
        if(namespace_error(p, "Entity name %S contains colon", name) < 0)
            goto fail;
    }

    if(expect_dtd_whitespace(p, "after name in entity declaration") < 0)
        goto fail;

    if(looking_at(p, "'") || looking_at(p, "\""))
    {
        Char *value;

        /* parse_string wants to see the opening quote */
        unget(p->source);
        if(parse_string(p, "for value in entity declaration",
                        pe ? LT_param_entity : LT_entity, 0) < 0)
            goto fail;
        value = p->pbuf;
        Consume(p->pbuf);

        if(!(e = NewInternalEntity(name, value, ent, line, chpos, 0)))
        {
            error(p, "System error");
            goto fail;
        }
        if(parsing_external_subset(p))
            e->is_externally_declared = 1;
#if 0
        Fprintf(Stderr, "internal %s entity %S\n",
                pe ? "parameter" : "general", name);
        Fprintf(Stderr, "base: %s\nreplacement text: %S\n",
                e->base_url ? e->base_url : "<null>", e->text);
#endif
    }
    else if(p->state == PS_error) /* looking_at may have set it */
        goto fail;
    else
    {
        NotationDefinition notation = 0;

        if(parse_external_id(p, 1, &publicid, &systemid, 1, 1) < 0)
            goto fail;

        /* t records whether any whitespace was skipped before NDATA */
        if((t = skip_dtd_whitespace(p, p->external_pe_depth > 0)) < 0)
            goto fail;
        if(looking_at(p, "NDATA"))
        {
            if(t == 0)
            {
                error(p, "Whitespace missing before NDATA");
                goto fail;
            }
            if(pe)
            {
                error(p, "NDATA not allowed for parameter entity");
                goto fail;
            }
            if(expect_dtd_whitespace(p, "after NDATA") < 0)
                goto fail;
            if(parse_name(p, "for notation name in entity declaration") < 0)
                goto fail;
            maybe_uppercase_name(p);
            notation = FindNotationN(p->dtd, p->name, p->namelen);
            if(!notation)
            {
                notation =
                    TentativelyDefineNotationN(p->dtd, p->name, p->namelen);
                if(!notation)
                {
                    error(p, "System error");
                    goto fail;
                }
                if(ParserGetFlag(p, XMLNamespaces) &&
                   Strchr(notation->name, ':'))
                {
                    if(namespace_error(p,
                                       "Notation name %S contains colon",
                                       notation->name) < 0)
                        goto fail;
                }
            }
        }
        if(p->state == PS_error) /* looking_at may have set it */
            goto fail;

        /* XXX we make the current external entity the parent so that
           system IDs are resolved correctly.  Should we instead record
           both parents? */
        e = NewExternalEntityN(name, namelen,
                               publicid, systemid, notation, ext_ent);
        /* The ids are not released here after this call, whether or not
           it succeeded (as before).
           NOTE(review): NewExternalEntityN is not visible here; confirm
           it does not retain them on failure before freeing them too. */
        publicid = 0;
        systemid = 0;
        if(!e)
        {
            error(p, "System error");
            goto fail;
        }
        if(parsing_external_subset(p) || ent->is_externally_declared)
            e->is_externally_declared = 1;
#if 0
        Fprintf(Stderr, "external %s entity %S\n",
                pe ? "parameter" : "general", name);
        Fprintf(Stderr, "base: %s\nsystem identifier: %s\n",
                e->base_url ? e->base_url : "<null>", e->systemid);
#endif
    }

    Free(name);
    name = 0;

    if(skip_dtd_whitespace(p, p->external_pe_depth > 0) < 0)
        goto fail;
    /* Note the entity before consuming '>' for the validity check below */
    tent = p->source->entity;
    if(expect(p, '>', "at end of entity declaration") < 0)
        goto fail;
    if(ParserGetFlag(p, Validate) && tent != ent)
    {
        if(validity_error(p, "Entity declaration ends in different "
                             "entity from that in which it starts") < 0)
            goto fail;
    }

    if((old = FindEntity(p->dtd, e->name, pe)))
    {
        if(old->parent == xml_builtin_entity)
        {
            if(e->type != ET_internal ||
               (ParserGetFlag(p, ExpandCharacterEntities) &&
                Strcmp(e->text, old->text) != 0))
                warn(p, "Non-standard declaration of predefined "
                        "entity %S (ignored)",
                     e->name);
        }
        else
        {
            if(ParserGetFlag(p, WarnOnRedefinitions))
                warn(p, "Ignoring redefinition of%s entity %S",
                     pe ? " parameter" : "", e->name);
        }

        /* The first declaration wins; drop the new one */
        FreeEntity(e);
    }
    else
        if(!DefineEntity(p->dtd, e, pe))
            return error(p, "System error");

    return 0;

fail:
    if(e)
        FreeEntity(e);
    Free(publicid);
    Free(systemid);
    Free(name);
    return -1;
}
4539
/*
 * Return 1 if the current source entity is the document entity, or is
 * neither an external entity nor marked as externally declared;
 * otherwise return 0.
 */
static int parsing_internal(Parser p)
{
    Entity cur = p->source->entity;

    if(cur == p->document_entity)
        return 1;

    return !(cur->type == ET_external || cur->is_externally_declared);
}
4552
4553 /* NB assumes we are parsing the DTD */
4554
/*
 * Return 1 unless the current source entity is flagged as the internal
 * subset, in which case return 0.  Only meaningful while parsing the DTD.
 */
static int parsing_external_subset(Parser p)
{
    return p->source->entity->is_internal_subset ? 0 : 1;
}
4561
4562 /* Called after reading '<!NOTATION ' */
4563
/*
 * Parse the rest of a notation declaration.
 *
 * p   - the parser
 * ent - the entity in which the declaration started
 *
 * Returns 0 on success, -1 on error.
 *
 * A notation that so far exists only tentatively is filled in; a
 * notation that is already fully defined keeps its first definition and
 * the new ids are freed, whether or not a warning is issued.
 *
 * The copied name and the ids (while this function still holds them) are
 * released on error paths.
 */
static int parse_notation_decl(Parser p, Entity ent)
{
    Char *name = 0;
    char8 *publicid = 0, *systemid = 0;
    NotationDefinition def;
    Entity tent;

    if(parse_name(p, "for name in notation declaration") < 0)
        return -1;
    CopyName(name);
    maybe_uppercase(p, name);

    if(expect_dtd_whitespace(p, "after name in notation declaration") < 0)
        goto fail;

    /* The system id literal is optional here (sreq == 0) */
    if(parse_external_id(p, 1, &publicid, &systemid, 1, 0) < 0)
        goto fail;

    if(skip_dtd_whitespace(p, p->external_pe_depth > 0) < 0)
        goto fail;
    /* Note the entity before consuming '>' for the validity check below */
    tent = p->source->entity;
    if(expect(p, '>', "at end of notation declaration") < 0)
        goto fail;
    if(ParserGetFlag(p, Validate) && tent != ent)
    {
        if(validity_error(p, "Notation declaration ends in different "
                             "entity from that in which it starts") < 0)
            goto fail;
    }

    if((def = FindNotation(p->dtd, name)))
    {
        if(def->tentative)
        {
            RedefineNotation(def, publicid, systemid, ent);
        }
        else
        {
            if(ParserGetFlag(p, WarnOnRedefinitions))
                warn(p, "Ignoring redefinition of notation %S", name);
            /* The ids are unused whether or not we warned */
            Free(publicid);
            Free(systemid);
        }
    }
    else
    {
        if(!DefineNotation(p->dtd, name, publicid, systemid, ent))
        {
            /* NOTE(review): DefineNotation is not visible here; the ids
               are left alone on failure, as before. */
            publicid = 0;
            systemid = 0;
            error(p, "System error");
            goto fail;
        }
        /* Passed to the new notation definition */
        publicid = 0;
        systemid = 0;
        if(ParserGetFlag(p, XMLNamespaces) && Strchr(name, ':'))
        {
            if(namespace_error(p, "Notation name %S contains colon",
                               name) < 0)
                goto fail;
        }
    }

    Free(name);

    return 0;

fail:
    Free(publicid);
    Free(systemid);
    Free(name);
    return -1;
}
4615
/*
 * Parse a conditional section.
 *
 * p   - the parser
 * ent - the entity in which the section started; the entities containing
 *       the '[' and the closing bracket are compared with it when
 *       validating
 *
 * Returns 0 on success, -1 on error.  It is an error if
 * p->external_pe_depth is zero.
 *
 * INCLUDE: markup declarations are parsed until a ']' is seen, which must
 * then be followed by "]>".
 * IGNORE: characters are skipped, counting nested "<![" and "]]>", until
 * the section that was opened is closed; exhausted sources are popped
 * along the way.
 */
static int parse_conditional(Parser p, Entity ent)
{
    int depth=1;
    int c, r;
    Entity tent;

    if(p->external_pe_depth == 0)
        return error(p, "Conditional section not allowed in internal subset");

    require(skip_dtd_whitespace(p, p->external_pe_depth > 0));

    if(looking_at(p, "INCLUDE"))
    {
        require(skip_dtd_whitespace(p, p->external_pe_depth > 0));

        tent = p->source->entity;
        require(expect(p, '[', "at start of conditional section"));
        if(ParserGetFlag(p, Validate) && tent != ent)
        {
            require(validity_error(p, "[ of conditional section in "
                                      "different entity from <!["));
        }

        require(skip_dtd_whitespace(p, p->external_pe_depth > 0));

        while(!looking_at(p, "]"))
        {
            r = parse_markupdecl(p);
            if(r == 1)
                return error(p, "EOF in conditional section");
            if(r == -1)
                return -1;
            require(skip_dtd_whitespace(p, p->external_pe_depth > 0));
        }
        tent = p->source->entity;

        if(!looking_at(p, "]>"))
            return error(p, "]> required after ] in conditional section");

        if(ParserGetFlag(p, Validate) && tent != ent)
        {
            require(validity_error(p, "] of conditional section in "
                                      "different entity from <!["));
        }

        return 0;
    }

    if(looking_at(p, "IGNORE"))
    {
        /* Easy, because ]]> not even allowed in strings! */

        require(skip_dtd_whitespace(p, p->external_pe_depth > 0));
        tent = p->source->entity;
        require(expect(p, '[', "at start of conditional section"));
        if(ParserGetFlag(p, Validate) && tent != ent)
        {
            require(validity_error(p, "[ of conditional section in "
                                      "different entity from <!["));
        }

        while(depth > 0)
        {
            c = get(p->source);
            if(c == BADCHAR)
            {
                return error(p, "Input error: %s", p->source->error_msg);
            }
            else if(c == XEOE)
            {
                /* End of this source: carry on in its parent if any */
                if(!p->source->parent)
                    return error(p, "EOF in ignored conditional section");
                ParserPop(p);
            }
            else if(c == '<')
            {
                if(looking_at(p, "!["))
                    depth++;
            }
            else if(c == ']')
            {
                /* Remember where a possible closing "]]>" began */
                tent = p->source->entity;
                if(looking_at(p, "]>"))
                    depth--;
            }
        }

        if(ParserGetFlag(p, Validate) && tent != ent)
        {
            require(validity_error(p, "]]> of conditional section in "
                                      "different entity from <!["));
        }

        return 0;
    }

    if(p->state == PS_error)    /* looking_at may have set it */
        return -1;

    return error(p, "INCLUDE or IGNORE required in conditional section");
}
4709
/*
 * If the parser's CaseInsensitive flag is set, convert the
 * null-terminated string s to upper case in place; otherwise do nothing.
 */
static void maybe_uppercase(Parser p, Char *s)
{
    Char *cur;

    if(!ParserGetFlag(p, CaseInsensitive))
        return;

    for(cur = s; *cur; cur++)
        *cur = Toupper(*cur);
}
4719
/*
 * If the parser's CaseInsensitive flag is set, convert the first
 * p->namelen characters of p->name to upper case in place.
 */
static void maybe_uppercase_name(Parser p)
{
    int k;

    if(!ParserGetFlag(p, CaseInsensitive))
        return;

    for(k = 0; k < p->namelen; k++)
        p->name[k] = Toupper(p->name[k]);
}
4728
/*
 * Compare two 8-bit strings, using strcasecmp8 if the parser's
 * CaseInsensitive flag is set and strcmp8 otherwise.  Returns whatever
 * the chosen comparison function returns.
 */
static int str_maybecase_cmp8(Parser p, const char8 *a, const char8 *b)
{
    if(ParserGetFlag(p, CaseInsensitive))
        return strcasecmp8(a, b);

    return strcmp8(a, b);
}
4734
/* Return 1 if c is one of the letters a-z or A-Z, otherwise 0. */
static int is_ascii_alpha(int c)
{
    if(c >= 'A' && c <= 'Z')
        return 1;
    if(c >= 'a' && c <= 'z')
        return 1;
    return 0;
}
4739
/* Return 1 if c is one of the digits 0-9, otherwise 0. */
static int is_ascii_digit(int c)
{
    if(c < '0')
        return 0;
    if(c > '9')
        return 0;
    return 1;
}
4744
4745 /* Error handling */
4746
/*
 * Format a message into buf (at most size bytes, as passed to Vsnprintf)
 * and turn bit into an error bit whose message points at buf.
 *
 * The previous contents of bit are freed, but only after formatting,
 * because the format arguments may refer to data held in the bit.
 */
static void verror(char8 *buf, int size, XBit bit, const char8 *format, va_list args)
{
    /* Format first: the arguments may still point into the xbit */
    Vsnprintf(buf, size, CE_ISO_8859_1, format, args);

    FreeXBit(bit);

    bit->error_message = buf;
    bit->type = XBIT_error;
}
4756
error(Parser p,const char8 * format,...)4757 static int error(Parser p, const char8 *format, ...)
4758 {
4759 va_list args;
4760
4761 va_start(args, format);
4762 verror(p->errbuf, sizeof(p->errbuf), &p->xbit, format, args);
4763
4764 p->state = PS_error;
4765
4766 return -1;
4767 }
4768
/*
 * Issue a warning: format the message into the parser's error buffer,
 * wrap it in a temporary warning bit, and pass that to the warning
 * callback if one is set, or to ParserPerror otherwise.
 *
 * The parser's own xbit and state are not touched.  Always returns 0.
 */
static int warn(Parser p, const char8 *format, ...)
{
    va_list args;
    struct xbit bit;

    clear_xbit(&bit);

    va_start(args, format);
    verror(p->errbuf, sizeof(p->errbuf), &bit, format, args);
    va_end(args);       /* every va_start needs a matching va_end */

    /* verror marks the bit as an error; this is only a warning */
    bit.type = XBIT_warning;

    if(p->warning_callback)
        p->warning_callback(&bit, p->warning_callback_arg);
    else
        ParserPerror(p, &bit);

    return 0;
}
4788
4789 /* Validity checks applied when the prolog is complete. */
4790
/*
 * Validity checks applied to the DTD as a whole.
 *
 * Returns 0 on success, -1 on error.
 *
 * - With no DTD, validation is switched off (after a validity error,
 *   unless the NoNoDTDWarning flag is set).
 * - The root element named by the DTD must be declared.
 * - For each element with element or mixed content, an FSM is built from
 *   its content particle and simplified; element content is also checked
 *   for determinism.
 * - Every notation referred to by an entity must be declared.
 * - Attribute default values are checked, and NOTATION attributes must
 *   not be on EMPTY elements and must list only declared notations.
 *
 * A failure to allocate the FSM, its end node or its start node returns
 * an error immediately instead of going on to use the null pointer.
 */
static int validate_dtd(Parser p)
{
    Dtd d = p->dtd;
    ElementDefinition e;
    AttributeDefinition a;
    Entity ent;
    int i;

    if(!p->have_dtd)
    {
        if(!ParserGetFlag(p, NoNoDTDWarning))
        {
            require(validity_error(p,
                                "Document has no DTD, validating abandoned"));
        }
        ParserSetFlag(p, Validate, 0);
        return 0;
    }

    if(!(e = FindElement(d, d->name)) || e->tentative)
    {
        require(validity_error(p,
                               "Root element name %S not declared", d->name));
    }

    for(e = NextElementDefinition(d, 0); e; e = NextElementDefinition(d, e))
        if(e->type == CT_element || e->type == CT_mixed)
        {
            FSMNode endnode;
            e->fsm = NewFSM();
            if(!e->fsm)
                return error(p, "System error");
            endnode = AddNode(e->fsm);
            if(!endnode)
                return error(p, "System error");
            endnode->end_node = 1;
            e->fsm->start_node =
                translate_particle(e->fsm, e->particle, endnode);
            if(!e->fsm->start_node)
                return error(p, "System error");
            if(e->type == CT_mixed)
                /* Mixed content may always be empty, even (#PCDATA) */
                e->fsm->start_node->end_node = 1;
#if DEBUG_FSM
            Printf("\nContent model for element %S is %S\n",
                   e->name, e->content);
            PrintFSM(Stdout, e->fsm, 0);
#endif
            SimplifyFSM(e->fsm);
            if(e->type == CT_element)
            {
                /* Don't do this for mixed content, to prevent extra error
                   message for (#PCDATA|a|a)* which we already reported */
                require(check_deterministic(p, e));
            }
#if DEBUG_FSM
            Printf("\nContent model for element %S is %S\n",
                   e->name, e->content);
            PrintFSM(Stdout, e->fsm, 1);
#endif
        }

    /* check all NDATA notations declared */

    for(ent = NextEntity(d, 0); ent; ent = NextEntity(d, ent))
        if(ent->notation && ent->notation->tentative)
        {
            require(validity_error(p, "In declaration of unparsed entity %S, "
                                      "notation %S is undefined",
                                   ent->name, ent->notation->name));
        }

    /* validate attribute defaults (do it here so all entities/notations
       declared) and check notations in enumeration all declared */

    for(e = NextElementDefinition(d, 0); e; e = NextElementDefinition(d, e))
        for(a = NextAttributeDefinition(e, 0); a;
            a = NextAttributeDefinition(e, a))
        {
            if(a->default_value)
            {
                require(check_attribute_syntax(p, a, e, a->default_value,
                                               "default value for attribute",
                                               0));
            }
            if(a->type == AT_notation)
            {
                if(e->type == CT_empty)
                {
                    require(validity_error(p,
                                           "NOTATION attribute %S not allowed "
                                           "on EMPTY element %S",
                                           a->name, e->name));

                }

                for(i=0; a->allowed_values[i]; i++)
                    if(!FindNotation(d, a->allowed_values[i]))
                    {
                        require(validity_error(p,
                                     "In allowed values for attribute %S of "
                                     "element %S, notation %S is not defined",
                                     a->name, e->name,
                                     a->allowed_values[i]));
                    }
            }
        }

    return 0;
}
4901
/* End-of-document validity check: every ID that was referred to must
   also have been defined.  check_id() is applied to each entry of the
   parser's ID table.  Returns -1 if the parser's current bit is an error
   afterwards, 0 otherwise. */

static int validate_final(Parser p)
{
    hash_map(p->id_table, check_id, p);

    return (p->xbit.type == XBIT_error) ? -1 : 0;
}
4913
check_id(const HashEntryStruct * id_entry,void * pp)4914 static HashMapRetType check_id(const HashEntryStruct *id_entry, void *pp)
4915 {
4916 Parser p = (Parser)pp;
4917
4918 if(!(int)hash_get_value(id_entry))
4919 validity_error(p,
4920 "The ID %.*S was referred to but never defined",
4921 hash_get_key_len(id_entry) / sizeof(Char),
4922 hash_get_key(id_entry));
4923
4924 #ifdef FOR_LT
4925 return 1;
4926 #endif
4927 }
4928
4929 /* Determine whether an element is valid at this point.
4930 * Returns the new context, or NULL if invalid.
4931 */
4932
validate_content(FSMNode context,ElementDefinition e)4933 static FSMNode validate_content(FSMNode context, ElementDefinition e)
4934 {
4935 int i;
4936
4937 for(i=0; i<VectorCount(context->edges); i++)
4938 if(context->edges[i]->label == e)
4939 return context->edges[i]->destination;
4940
4941 return 0;
4942 }
4943
NewFSM(void)4944 static FSM NewFSM(void)
4945 {
4946 FSM fsm;
4947
4948 if(!(fsm = Malloc(sizeof(*fsm))))
4949 return 0;
4950 VectorInit(fsm->nodes);
4951 fsm->start_node = 0;
4952
4953 return fsm;
4954 }
4955
/* Release an FSM together with all of its nodes and their edges.
   A null fsm is accepted and ignored. */

void FreeFSM(FSM fsm)
{
    int n, k;

    if(fsm)
    {
        for(n = 0; n < VectorCount(fsm->nodes); n++)
        {
            FSMNode node = fsm->nodes[n];

            /* Each node owns its edges and the vector that holds them */
            k = 0;
            while(k < VectorCount(node->edges))
            {
                Free(node->edges[k]);
                k++;
            }
            Free(node->edges);
            Free(node);
        }

        Free(fsm->nodes);
        Free(fsm);
    }
}
4975
AddNode(FSM fsm)4976 static FSMNode AddNode(FSM fsm)
4977 {
4978 FSMNode node;
4979
4980 if(!(node = Malloc(sizeof(*node))))
4981 return 0;
4982 node->fsm = fsm;
4983 node->mark = node->end_node = 0;
4984 node->id = VectorCount(fsm->nodes);
4985 VectorInit(node->edges);
4986 if(!VectorPush(fsm->nodes, node))
4987 return 0;
4988
4989 return node;
4990 }
4991
/* Destroy a node and its edges.  The node's slot in its FSM's node list
   is set to null rather than removed; CleanupFSM() closes the gaps. */

static void DeleteNode(FSMNode node)
{
    FSM owner = node->fsm;
    int k = 0;

    owner->nodes[node->id] = 0;

    while(k < VectorCount(node->edges))
    {
        Free(node->edges[k]);
        k++;
    }
    Free(node->edges);
    Free(node);
}
5003
/* Destroy an edge.  Its slot in the source node's edge list is set to
   null rather than removed; CleanupNode() closes the gaps. */

static void DeleteEdge(FSMEdge edge)
{
    int slot = edge->id;

    edge->source->edges[slot] = 0;
    Free(edge);
}
5009
5010 /* After deleting nodes there will be null nodes in the node list.
5011 This function removes them. */
5012
CleanupFSM(FSM fsm)5013 static void CleanupFSM(FSM fsm)
5014 {
5015 int i, j;
5016
5017 for(i=j=0; i<VectorCount(fsm->nodes); i++)
5018 {
5019 if(fsm->nodes[i])
5020 {
5021 if(i > j)
5022 {
5023 fsm->nodes[j] = fsm->nodes[i];
5024 fsm->nodes[j]->id = j;
5025 }
5026 j++;
5027 }
5028 }
5029 VectorCount(fsm->nodes) = j;
5030 }
5031
5032 /* After deleting edges there will be null edges in the edge list.
5033 This function removes them. */
5034
CleanupNode(FSMNode node)5035 static void CleanupNode(FSMNode node)
5036 {
5037 int i, j;
5038
5039 for(i=j=0; i<VectorCount(node->edges); i++)
5040 {
5041 if(node->edges[i])
5042 {
5043 if(i > j)
5044 {
5045 node->edges[j] = node->edges[i];
5046 node->edges[j]->id = j;
5047 }
5048 j++;
5049 }
5050 }
5051 VectorCount(node->edges) = j;
5052 }
5053
AddEdge(FSMNode source,FSMNode destination,void * label)5054 static FSMEdge AddEdge(FSMNode source, FSMNode destination, void *label)
5055 {
5056 FSMEdge edge;
5057
5058 if(!(edge = Malloc(sizeof(*edge))))
5059 return 0;
5060 edge->label = label;
5061 edge->source = source;
5062 edge->destination = destination;
5063 edge->id = VectorCount(source->edges);
5064 if(!VectorPush(source->edges, edge))
5065 return 0;
5066
5067 return edge;
5068 }
5069
/* Clear the mark bits given by value on every node of the FSM. */

static void UnMarkFSM(FSM fsm, int value)
{
    int n = 0;

    while(n < VectorCount(fsm->nodes))
    {
        fsm->nodes[n]->mark &= ~value;
        n++;
    }
}
5077
5078 /* Remove all epsilon links from a FSM */
5079
5080 #define useful 1
5081 #define busy 2
5082
SimplifyFSM(FSM fsm)5083 static int SimplifyFSM(FSM fsm)
5084 {
5085 int i, j;
5086 FSMNode node;
5087 FSMEdge edge;
5088
5089 /* First find all the useful nodes, ie those pointed to by a
5090 non-epsilon edge. */
5091
5092 fsm->start_node->mark |= useful;
5093 for(i=0; i<VectorCount(fsm->nodes); i++)
5094 {
5095 node = fsm->nodes[i];
5096 for(j=0; j<VectorCount(node->edges); j++)
5097 {
5098 edge = node->edges[j];
5099 if(edge->label != Epsilon)
5100 edge->destination->mark |= useful;
5101 }
5102 }
5103
5104 /* Now add to each useful node all the non-epsilon edges of
5105 the nodes in its epsilon-closure. */
5106
5107 for(i=0; i<VectorCount(fsm->nodes); i++)
5108 {
5109 node = fsm->nodes[i];
5110 if(!(node->mark & useful))
5111 continue;
5112 node->mark |= busy;
5113 for(j=0; j<VectorCount(node->edges); j++)
5114 {
5115 edge = node->edges[j];
5116 if(edge->label == Epsilon)
5117 if(!add_epsilon_closure(node, edge->destination))
5118 return 0;
5119 }
5120 UnMarkFSM(fsm, busy);
5121 }
5122
5123 /* Now remove all useless nodes and epsilon edges from useful nodes */
5124
5125 for(i=0; i<VectorCount(fsm->nodes); i++)
5126 {
5127 node = fsm->nodes[i];
5128 if(node->mark & useful)
5129 {
5130 for(j=0; j<VectorCount(node->edges); j++)
5131 {
5132 edge = node->edges[j];
5133 if(edge->label == Epsilon)
5134 DeleteEdge(edge);
5135 }
5136 CleanupNode(node);
5137 }
5138 else
5139 DeleteNode(node);
5140 }
5141 CleanupFSM(fsm);
5142
5143 UnMarkFSM(fsm, useful);
5144
5145 /* Now change the edge labels to be ElementDefinitions instead of CPs */
5146
5147 for(i=0; i<VectorCount(fsm->nodes); i++)
5148 {
5149 node = fsm->nodes[i];
5150 for(j=0; j<VectorCount(node->edges); j++)
5151 {
5152 edge = node->edges[j];
5153 if(edge->label == Epsilon || edge->label == PCDataElement)
5154 continue;
5155 edge->label = ((ContentParticle)edge->label)->element;
5156 }
5157 }
5158
5159 return 1;
5160 }
5161
/* Fold node, and everything reachable from it over epsilon edges, into
 * base: base becomes an end node if any of them is one, and receives a
 * copy of each of their non-epsilon edges unless it already has an edge
 * with the same label and destination.
 *
 * The busy mark stops a node being visited twice; the caller clears it.
 * Returns 1 on success, 0 if an edge could not be allocated.
 */

static int add_epsilon_closure(FSMNode base, FSMNode node)
{
    int n, k, present;
    FSMEdge edge, other;

    if(node->mark & busy)
        return 1;
    node->mark |= busy;

    if(node->end_node)
        base->end_node = 1;

    for(n = 0; n < VectorCount(node->edges); n++)
    {
        edge = node->edges[n];

        if(edge->label == Epsilon)
        {
            if(!add_epsilon_closure(base, edge->destination))
                return 0;
            continue;
        }

        /* Does base already have an edge for this very content particle
           going to the same place? */
        present = 0;
        for(k = 0; !present && k < VectorCount(base->edges); k++)
        {
            other = base->edges[k];
            present = (other->label == edge->label &&
                       other->destination == edge->destination);
        }

        if(!present && !AddEdge(base, edge->destination, edge->label))
            return 0;
    }

    return 1;
}
5200
#if DEBUG_FSM
/* Debugging aid: write a description of fsm to out.
 *
 * For each node its id is printed, followed by "S" for the start node
 * and "E" for an end node, then one tab-indented line per outgoing edge
 * giving the label and the destination id.
 *
 * relabelled says how to read the edge labels: non-zero if they are
 * already ElementDefinitions (as after SimplifyFSM), zero if they are
 * still ContentParticles.
 */
static void PrintFSM(FILE16 *out, FSM fsm, int relabelled)
{
    int i, j;
    FSMNode node;
    FSMEdge edge;
    ElementDefinition elt;

    for(i=0; i<VectorCount(fsm->nodes); i++)
    {
        node = fsm->nodes[i];
        Fprintf(out, "%d", node->id);
        if(node == fsm->start_node)
            Fprintf(out, "S");
        if(node->end_node)
            Fprintf(out, "E");

        for(j=0; j<VectorCount(node->edges); j++)
        {
            edge = node->edges[j];
            if(edge->label == Epsilon)
                Fprintf(out, "\t{Epsilon} -> %d\n", edge->destination->id);
            else if(edge->label == PCDataElement)
                Fprintf(out, "\t#PCDATA -> %d\n", edge->destination->id);
            else
            {
                if(relabelled)
                    elt = (ElementDefinition)edge->label;
                else
                    elt = ((ContentParticle)edge->label)->element;
                Fprintf(out, "\t%S -> %d\n", elt->name, edge->destination->id);
            }
        }
        /* A node without edges still needs its line terminated.  Write
           to the same stream as everything else, not to C stdout. */
        if(VectorCount(node->edges) == 0)
            Fprintf(out, "\n");
    }
}
#endif
5239
/* Build the FSM fragment for one occurrence of content particle cp,
 * ignoring its repetition (translate_particle handles that).
 * The fragment leads to next; its entry node is returned.
 * Returns 0 if a node or edge could not be allocated.
 */

static FSMNode translate_particle_1(FSM fsm, ContentParticle cp, FSMNode next)
{
    FSMNode entry, sub;
    int k;

    entry = AddNode(fsm);
    if(!entry)
        return 0;

    switch(cp->type)
    {
    case CP_name:
        /* The edge is labelled with the content particle itself for
           now, so that two "a" edges from different CPs can be told
           apart when checking determinism.  The label is changed to the
           element definition later. */
        return AddEdge(entry, next, cp) ? entry : 0;

    case CP_pcdata:
        return AddEdge(entry, next, PCDataElement) ? entry : 0;

    case CP_choice:
        /* One epsilon edge from the entry to each alternative */
        for(k = 0; k < cp->nchildren; k++)
        {
            sub = translate_particle(fsm, cp->children[k], next);
            if(!sub)
                return 0;
            if(!AddEdge(entry, sub, Epsilon))
                return 0;
        }
        return entry;

    case CP_seq:
        /* Chain the children together from last to first, so that each
           one leads to its successor. */
        sub = next;
        for(k = cp->nchildren - 1; k >= 0; k--)
        {
            sub = translate_particle(fsm, cp->children[k], sub);
            if(!sub)
                return 0;
        }
        return AddEdge(entry, sub, Epsilon) ? entry : 0;

    default:
        return entry;
    }
}
5286
/* Build the FSM fragment for content particle cp including its
 * repetition (none, '*', '+' or '?').  The fragment leads to next; its
 * entry node is returned.
 * Returns 0 if a node or edge could not be allocated.
 */

static FSMNode translate_particle(FSM fsm, ContentParticle cp, FSMNode next)
{
    FSMNode node1, node2, sub;

    if(cp->repetition == 0)
        return translate_particle_1(fsm, cp, next);

    if(cp->repetition == '*')
    {
        /* node1 is both entry and loop point: go round again through
           the particle, or leave to next. */
        if(!(node1 = AddNode(fsm)))
            return 0;
        if(!(sub = translate_particle_1(fsm, cp, node1)))
            return 0;
        if(!AddEdge(node1, sub, Epsilon))
            return 0;
        if(!AddEdge(node1, next, Epsilon))
            return 0;
        return node1;
    }

    if(cp->repetition == '+')
    {
        /* node1 is the entry and must go through the particle once;
           node2 follows the particle and either repeats it or leaves. */
        if(!(node1 = AddNode(fsm)))
            return 0;
        if(!(node2 = AddNode(fsm)))
            return 0;
        if(!(sub = translate_particle_1(fsm, cp, node2)))
            return 0;
        if(!AddEdge(node1, sub, Epsilon))
            return 0;
        if(!AddEdge(node2, sub, Epsilon))
            return 0;
        if(!AddEdge(node2, next, Epsilon))
            return 0;
        return node1;
    }

    if(cp->repetition == '?')
    {
        /* Either go through the particle once or skip straight to next */
        if(!(node1 = AddNode(fsm)))
            return 0;
        if(!(sub = translate_particle_1(fsm, cp, next)))
            return 0;
        if(!AddEdge(node1, sub, Epsilon))
            return 0;
        if(!AddEdge(node1, next, Epsilon))
            return 0;
        return node1;
    }

    return 0;			/* can't happen */
}
5322
/* Check that the content model of element is deterministic, walking its
   FSM from the start node.  The busy marks used by the walk are cleared
   before returning.  Returns the result of check_deterministic_1(). */

static int check_deterministic(Parser p, ElementDefinition element)
{
    int result =
        check_deterministic_1(p, element, element->fsm->start_node, 0);

    UnMarkFSM(element->fsm, busy);

    return result;
}
5331
/* Recursive worker for check_deterministic().
 *
 * Reports a validity error if node has two edges with the same label
 * (at most one report per node), then checks every node reachable from
 * it.  previous is the element whose edge led to node, or null at the
 * start of the content; it is only used in the message.
 *
 * Nodes already marked busy are skipped.  Returns 0, or leaves early
 * through require().
 */

static int check_deterministic_1(Parser p, ElementDefinition element,
                                 FSMNode node, ElementDefinition previous)
{
    int n, m;
    int reported = 0;
    FSMEdge edge;
    Char empty_string[] = {0};

    if(node->mark & busy)
        return 0;
    node->mark |= busy;

    /* Does this node have two or more edges labelled the same?
       Stop looking once one clash has been reported. */

    for(n = 0; !reported && n < VectorCount(node->edges); n++)
    {
        edge = node->edges[n];
        for(m = 0; m < n; m++)
        {
            if(node->edges[m]->label != edge->label)
                continue;

            require(validity_error(p,
                "Content model for %S is not deterministic.  %s%S "
                "there are multiple choices when the next element is %S.",
                                   element->name,
                                   previous ? "After element " : "At start of content",
                                   previous ? previous->name : empty_string,
                                   ((ElementDefinition)edge->label)->name));
            reported = 1;
            break;
        }
    }

    /* Check its children */

    for(n = 0; n < VectorCount(node->edges); n++)
    {
        edge = node->edges[n];
        require(check_deterministic_1(p, element, edge->destination,
                                      (ElementDefinition)edge->label));
    }

    return 0;
}
5374
/* Validate the value given for attribute a of element e.
 * Checks the value's syntax against the declared type, that it equals
 * the default when the attribute is declared #FIXED, and hands the
 * element's xml:lang attribute to validate_xml_lang_attribute().
 * Returns 0, or leaves early through require().
 */

static int validate_attribute(Parser p, AttributeDefinition a, ElementDefinition e, const Char *value)
{
    int fixed_mismatch;

    require(check_attribute_syntax(p, a, e, value, "attribute", 1));

    fixed_mismatch = (a->default_type == DT_fixed &&
                      Strcmp(value, a->default_value) != 0);
    if(fixed_mismatch)
    {
        require(validity_error(p,
                               "The attribute %S of element %S does not "
                               "match the declared #FIXED value",
                               a->name, e->name));
    }

    if(e->xml_lang_attribute == a)
    {
        require(validate_xml_lang_attribute(p, e, value));
    }

    return 0;
}
5395
/* Validate an xml:lang attribute value.  Currently accepts anything and
   always returns 0; the former check is kept below, disabled. */

static int validate_xml_lang_attribute(Parser p, ElementDefinition e, const Char *value)
{
    /* 1.1 will allow empty xml:lang values (and maybe 1.0 will be amended
       to), and it no longer seems worth checking anything here. */
#if 0
    const Char *rest;

    /* Look for the Langcode */

    if((value[0] == 'i' || value[0] == 'I' ||
        value[0] == 'x' || value[0] == 'X') &&
       value[1] == '-')
    {
        /* IANA or user code: at least one letter after the "x-" */

        if(!is_ascii_alpha(value[2]))
            goto bad;
        rest = value+3;
        while(is_ascii_alpha(*rest))
            rest++;
    }
    else if(is_ascii_alpha(value[0]) && is_ascii_alpha(value[1]))
    {
        /* ISO639 code */
        rest = value+2;
    }
    else
        goto bad;

    /* Look for a subcode */

    if(!*rest)
        return 0;
    if(rest[0] != '-' || !is_ascii_alpha(rest[1]))
        goto bad;

    rest += 2;
    while(is_ascii_alpha(*rest))
        rest++;

    if(!*rest)
        return 0;

 bad:
    /* Not a validity error since erratum 73 */
    warn(p, "Dubious xml:lang attribute for element %S", e->name);
#endif
    return 0;
}
5444
5445 /* Check an attribute matches Name[s] or Nmtoken[s].
5446 Assume it has already been normalised (no leading or trailing
5447 whitespace, other whitespace normalised to single space). */
5448
check_attribute_syntax(Parser p,AttributeDefinition a,ElementDefinition e,const Char * value,const char * message,int real_use)5449 static int check_attribute_syntax(Parser p, AttributeDefinition a, ElementDefinition e, const Char *value, const char *message, int real_use)
5450 {
5451 int nmchar = (a->type == AT_nmtoken || a->type == AT_nmtokens ||
5452 a->type == AT_enumeration);
5453 int multiple = (a->type == AT_nmtokens || a->type == AT_entities ||
5454 a->type == AT_idrefs);
5455
5456 const Char *q, *start = value;
5457
5458 if(a->type == AT_cdata)
5459 return 0; /* Nothing to check */
5460
5461 if(!*value)
5462 {
5463 require(validity_error(p, "The %s %S of element %S "
5464 "is declared as %s but is empty",
5465 message, a->name, e->name,
5466 AttributeTypeName[a->type]));
5467 return 0;
5468 }
5469
5470 for(q=value; *q; q++)
5471 {
5472 if(!nmchar && q == start && !is_xml_namestart(*q, p->map))
5473 {
5474 require(validity_error(p, "The %s %S of element %S "
5475 "is declared as %s but contains a token "
5476 "that does not start with a name start character",
5477 message, a->name, e->name,
5478 AttributeTypeName[a->type]));
5479 return 0;
5480 }
5481
5482 if(*q == ' ')
5483 {
5484 require(check_attribute_token(p, a, e, start, q-start, message,
5485 real_use));
5486 start = q+1;
5487
5488 if(!multiple)
5489 {
5490 require(validity_error(p, "The %s %S of element %S "
5491 "is declared as %s but "
5492 "contains more than one token",
5493 message, a->name, e->name,
5494 AttributeTypeName[a->type]));
5495 }
5496 }
5497 else if(!is_xml_namechar(*q, p->map))
5498 {
5499 require(validity_error(p, "The %s %S of element %S is declared "
5500 "as %s but contains a character which "
5501 "is not a name character",
5502 message, a->name, e->name,
5503 AttributeTypeName[a->type]));
5504 return 0;
5505 }
5506 }
5507
5508 return check_attribute_token(p, a, e, start, q-start, message, real_use);
5509 }
5510
/* Validate one token of an attribute value against the attribute's
 * declared type.
 *
 * value points at the token and length is its length in Chars.  The
 * token is not necessarily null-terminated at that length, because
 * check_attribute_syntax() passes pointers into the full value, so
 * messages must print it with %.*S.
 *
 * message names the kind of value in error messages.  real_use is zero
 * when checking a default value in the DTD; entity and ID/IDREF checks
 * are then skipped.
 *
 *   ENTITY/ENTITIES  the entity must exist and be unparsed
 *   ID/IDREF/IDREFS  the token is recorded in the parser's ID table;
 *                    bit 1 of the entry's value means "defined", and a
 *                    second definition is a validity error
 *   NOTATION         the notation must exist, then as for enumerations
 *   enumeration      the token must be one of the allowed values
 *
 * Returns 0, the result of error() if the ID table entry cannot be
 * created, or leaves early through require().
 */

static int check_attribute_token(Parser p, AttributeDefinition a, ElementDefinition e, const Char *value, int length, const char *message, int real_use)
{
    Entity entity;
    NotationDefinition notation;
    int i, found;
    HashEntry id_entry;

    switch(a->type)
    {
    case AT_entity:
    case AT_entities:
        if(!real_use)
            return 0;		/* don't check defaults unless they're used */
        /* XXX Should maybe check for colons, but it must be invalid anyway
           because otherwise the declaration would have been not-nwf */
        entity = FindEntityN(p->dtd, value, length, 0);
        if(!entity)
        {
            require(validity_error(p, "In the %s %S of element %S, "
                                   "entity %.*S is undefined",
                                   message, a->name, e->name, length, value));
        }
        else if(!entity->notation)
        {
            require(validity_error(p, "In the %s %S of element %S, "
                                   "entity %.*S is not unparsed",
                                   message, a->name, e->name, length, value));
        }
        break;
    case AT_id:
        if(!a->declared)
            /* don't validate undeclared xml:id attributes */
            return 0;
        /* fall through */
    case AT_idref:
    case AT_idrefs:
        if(!real_use)
            return 0;		/* don't check defaults unless they're used */
        id_entry = hash_find_or_add(p->id_table, value, length*sizeof(Char),
                                    &found);
        if(!id_entry)
            return error(p, "System error");
        if(!found)
        {
            /* First sighting: record whether it is a definition (1) or
               only a reference (0). */
            hash_set_value(id_entry, (void *)(a->type == AT_id));
            if(ParserGetFlag(p, XMLNamespaces))
                for(i=0; i<length; i++)
                    if(value[i] == ':')
                    {
                        require(namespace_validity_error(p, "ID %.*S contains colon", length, value));
                    }
        }
        else if(a->type == AT_id)
        {
            int idinfo = (int)hash_get_value(id_entry);
            if(idinfo & 1)
            {
                require(validity_error(p, "Duplicate ID attribute value %.*S",
                                       length, value));
            }
            else
            {
                /* Bit 2 is not set anywhere in this function;
                   NOTE(review): presumably it records an undeclared
                   xml:id use -- confirm where it is set. */
                if(idinfo & 2)
                    /* Print only this token: value need not end at
                       length. */
                    warn(p, "xml:id error: duplicate ID attribute value %.*S",
                         length, value);
                hash_set_value(id_entry, (void *)(idinfo | 1));
            }
        }
        break;
    case AT_notation:
        /* XXX Should maybe check for colons, but it must be invalid anyway
           because otherwise the declaration would have been not-nwf */
        notation = FindNotationN(p->dtd, value, length);
        if(!notation)
        {
            require(validity_error(p, "In the %s %S of element %S, "
                                   "notation %.*S is undefined",
                                   message, a->name, e->name, length, value));
            break;
        }
        /* fall through */
    case AT_enumeration:
        /* Match the whole of an allowed value, not just a prefix */
        for(i=0; a->allowed_values[i]; i++)
            if(Strncmp(value, a->allowed_values[i], length) == 0 &&
               a->allowed_values[i][length] == 0)
                break;
        if(!a->allowed_values[i])
        {
            require(validity_error(p, "In the %s %S of element %S, "
                                   "%.*S is not one of the allowed values",
                                   message, a->name, e->name, length, value));
        }
        break;
    default:
        /* Nothing to check */
        break;
    }

    return 0;
}
5610
#if not_yet
/* Re-prefix the DTD to match the prefix of name (namelen Chars long).
 *
 * The prefix is the part of name before its first colon, or null if
 * name contains no colon.  ReprefixDtd() is called with the parser's
 * magic prefix and a temporary copy of that prefix, which is released
 * again whether or not the call succeeds.
 *
 * Returns 0 on success, the result of error() if the copy cannot be
 * allocated, or leaves through require() if ReprefixDtd() fails.
 */
static int magically_transform_dtd(Parser p, Char *name, int namelen)
{
    int i, result;
    Char *prefix;

    for(i=0; i<namelen; i++)
        if(name[i] == ':')
            break;

    if(i < namelen)
    {
        if(!(prefix = Strndup(name, i)))
            return error(p, "System error");
    }
    else
        prefix = 0;

    /* Free the copy before looking at the result, so that the failure
       path does not leak it. */
    result = ReprefixDtd(p->dtd, p->magic_prefix, prefix);

    Free(prefix);

    require(result);

    return 0;
}
#endif
5636
5637