1 /*  Part of SWI-Prolog
2 
3     Author:        Jan Wielemaker
4     E-mail:        J.Wielemaker@vu.nl
5     WWW:           http://www.swi-prolog.org
6     Copyright (c)  2000-2017, University of Amsterdam
7                               Vu University Amsterdam
8     All rights reserved.
9 
10     Redistribution and use in source and binary forms, with or without
11     modification, are permitted provided that the following conditions
12     are met:
13 
14     1. Redistributions of source code must retain the above copyright
15        notice, this list of conditions and the following disclaimer.
16 
17     2. Redistributions in binary form must reproduce the above copyright
18        notice, this list of conditions and the following disclaimer in
19        the documentation and/or other materials provided with the
20        distribution.
21 
22     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
26     COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
28     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
30     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
32     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33     POSSIBILITY OF SUCH DAMAGE.
34 */
35 
36 #ifndef DTD_H_INCLUDED
37 #define DTD_H_INCLUDED
38 #include "sgmldefs.h"
39 
/* Character-class bits.  A character's class is the OR of the bits
   below; the composite masks further down test for several classes
   at once.
*/
#define CH_WHITE	0x01
#define CH_LCLETTER	0x02
#define CH_UCLETTER	0x04
#define CH_CNMSTRT	0x08		/* extra chars that may start a name */
#define CH_CNM		0x10		/* extra chars that may be in a name */
#define CH_DIGIT	0x20
#define CH_RE		0x40
#define CH_RS		0x80

/* Composite masks built from the bits above */
#define CH_LETTER	(CH_LCLETTER|CH_UCLETTER)
#define CH_NMSTART	(CH_LETTER|CH_CNMSTRT)
#define CH_NAME		(CH_NMSTART|CH_DIGIT|CH_CNM)
#define CH_BLANK	(CH_WHITE|CH_RE|CH_RS)
53 
/* SHORTREF blank markers */
#define CHR_BLANK	0x01		/* SHORTREF 'B' */
#define CHR_DBLANK	0x02		/* SHORTREF 'BB' */

/* Value stored in the `magic' field of a dtd structure */
#define SGML_DTD_MAGIC	0x07364573
58 
/* Syntactic functions a character can have.  The values index the
   `func' array of dtd_charfunc, which maps each function onto the
   character that performs it; CF_ENDTABLE only serves to size that
   array.  The comment on each member shows the character for the
   function.  Two-character delimiters are split over two members,
   e.g. MDO is "<!" and PRO is "<?".
*/
typedef enum {
  CF_STAGO = 0,				/* '<' */
  CF_STAGC,				/* '>' */
  CF_ETAGO1,				/* '<' */
  CF_ETAGO2,				/* '/' */
  CF_VI,				/* '=' */
  CF_NS,				/* ':' (XMLNS) */
  CF_LIT,				/* '"' */
  CF_LITA,				/* '\'' */
  CF_PERO,				/* '%' */
  CF_ERO,				/* '&' */
  CF_ERC,				/* ';' */
  CF_MDO1,				/* '<' */
  CF_MDO2,				/* '!' (second char of MDO) */
  CF_MDC,				/* '>' */
  CF_PRO1,				/* '<' */
  CF_PRO2,				/* '?' (second char of PRO) */
  CF_PRC,				/* '>' */
  CF_GRPO,				/* '(' */
  CF_GRPC,				/* ')' */
  CF_SEQ,				/* ',' */
  CF_AND,				/* '&' */
  CF_OR,				/* '|' */
  CF_OPT,				/* '?' */
  CF_PLUS,				/* '+' */
  CF_DSO,				/* '[' */
  CF_DSC,				/* ']' */
  CF_REP,				/* '*' */
  CF_RS,				/* '\n' */
  CF_RE,				/* '\r' */
  CF_CMT,				/* '-' */
  CF_NG,				/* ',' or '&' or '|' */
  CF_ENDTABLE				/* number of functions; keep last */
} charfunc;
93 
/* Character encodings a document can be declared to use (see the
   `encoding' field of the dtd structure).
*/
typedef enum {
  SGML_ENC_ISO_LATIN1 = 0,		/* single-byte ISO Latin-1 */
  SGML_ENC_UTF8				/* multi-byte UTF-8 */
} dtd_char_encoding;
98 
/* Declared content of an element (the `type' field of dtd_edef) */
typedef enum {
  C_CDATA = 0,				/* pure character data */
  C_PCDATA,				/* parsed character data */
  C_RCDATA,				/* character data + entities */
  C_EMPTY,				/* empty element */
  C_ANY					/* anything is allowed */
} contenttype;
106 
/* Cardinality of a content-model node (occurrence indicator) */
typedef enum {
  MC_ONE = 0,				/* exactly once (no indicator) */
  MC_OPT,				/* optional: ? */
  MC_REP,				/* any number of times: * */
  MC_PLUS				/* one or more times: + */
} modelcard;
113 
/* Kind of a content-model node (the `type' field of dtd_model) */
typedef enum {
  MT_UNDEF = 0,				/* not (yet) defined */
  MT_PCDATA,				/* contains PCDATA */
  MT_ELEMENT,				/* reference to an element */
  MT_SEQ,				/* sequence group: , */
  MT_AND,				/* any-order group: & */
  MT_OR					/* disjunction group: | */
} modeltype;
122 
/* Declared value type of an attribute (the `type' field of dtd_attr) */
typedef enum {
  AT_CDATA = 0,				/* character data */
  AT_ENTITY,				/* name of an entity */
  AT_ENTITIES,				/* list of entity names */
  AT_ID,				/* identifier */
  AT_IDREF,				/* reference to an identifier */
  AT_IDREFS,				/* list of identifier references */
  AT_NAME,				/* name token */
  AT_NAMES,				/* list of names */
  AT_NAMEOF,				/* one of an enumerated set of names */
  AT_NMTOKEN,				/* name-token */
  AT_NMTOKENS,				/* list of name-tokens */
  AT_NOTATION,				/* name of a notation */
  AT_NUMBER,				/* number */
  AT_NUMBERS,				/* list of numbers */
  AT_NUTOKEN,				/* number token */
  AT_NUTOKENS				/* list of number tokens */
} attrtype;
141 
/* How an attribute gets its value when the document does not specify
   one (the `def' field of dtd_attr).
*/
typedef enum {
  AT_FIXED = 0,				/* value is fixed */
  AT_REQUIRED,				/* attribute must be specified */
  AT_CURRENT,				/* use the most recent value */
  AT_CONREF,				/* cross-reference */
  AT_IMPLIED,				/* attribute is implied */
  AT_DEFAULT				/* a default value is declared */
} attrdef;
150 
151 
/* Where the value of an entity or notation comes from */
typedef enum {
  ET_SYSTEM = 0,			/* system (file) entity */
  ET_PUBLIC,				/* public (external) entity */
  ET_LITERAL				/* literal text */
} entity_type;
157 
158 
/* Kind of data an entity holds (the `content' field of dtd_entity) */
typedef enum {
  EC_SGML = 0,				/* SGML data */
  EC_STARTTAG,				/* SGML start-tag */
  EC_ENDTAG,				/* SGML end-tag */
  EC_CDATA,				/* CDATA entity */
  EC_SDATA,				/* SDATA entity */
  EC_NDATA,				/* non-SGML data */
  EC_PI					/* processing instruction */
} data_type;
168 
169 
/* Dialect the parser operates in.

   NOTE: the order of the members is significant.  The IS_*_DIALECT()
   macros below classify a dialect by range comparison: everything up
   to DL_HTML5 is SGML-based, everything from DL_XHTML on is XML-based
   and DL_HTML..DL_XHTML5 are the HTML flavours.  Keep the members in
   this order when adding one.
*/
typedef enum {
  DL_SGML = 0,				/* plain SGML */
  DL_HTML,				/* HTML before HTML5 */
  DL_HTML5,				/* HTML5 extensions of SGML */
  DL_XHTML,				/* XHTML before HTML5 */
  DL_XHTML5,				/* XHTML with HTML5 extensions */
  DL_XML,				/* plain XML */
  DL_XMLNS				/* XML + namespaces */
} dtd_dialect;

/* Dialect classification; each yields non-zero if `d' is in the class */
#define IS_SGML_DIALECT(d)  ((int)(d) <= (int)DL_HTML5)
#define IS_HTML_DIALECT(d)  ((d) >= DL_HTML && (d) <= DL_XHTML5)
#define IS_HTML5_DIALECT(d) ((d) == DL_HTML5 || (d) == DL_XHTML5)
#define IS_XML_DIALECT(d)   ((int)(d) >= (int)DL_XHTML)
184 
/* Options that can be switched on or off on a DTD; this is the type
   of the `option' argument of set_option_dtd().
*/
typedef enum {
  OPT_SHORTTAG = 0,			/* accept SHORTTAG or not */
  OPT_CASE_SENSITIVE_ATTRIBUTES,	/* attribute values case (in)sensitive */
  OPT_CASE_PRESERVING_ATTRIBUTES,	/* preserve attribute value case or not */
  OPT_SYSTEM_ENTITIES,			/* expand system entities */
  OPT_KEEP_PREFIX			/* keep the prefix identifiers */
} dtd_option;
192 
193 
/* White-space handling policy.  Used both as the DTD-wide default and
   per element (the `space_mode' fields of dtd and dtd_element).
*/
typedef enum {
  SP_PRESERVE = 0,			/* keep all white space */
  SP_DEFAULT,				/* default space handling */
  SP_REMOVE,				/* drop all blank CDATA elements */
  SP_SGML,				/* SGML-compliant handling */
  SP_INHERIT,				/* DTD: inherit from the environment */
  SP_STRICT				/* strict reading of spaces, for
					   signature verification */
} dtd_space_mode;
202 
203 
/* How number attributes are handled (`number_mode' field of dtd) */
typedef enum {
  NU_TOKEN = 0,				/* leave numbers as tokens */
  NU_INTEGER				/* convert numbers to integers */
} dtd_number_mode;
208 
209 
210 		 /*******************************
211 		 *	      ERRORS		*
212 		 *******************************/
213 
/* If DTD_IMPLEMENTATION is defined before this header is included,
   enable DTD_MINOR_ERRORS, which makes the additional (minor) codes
   of dtd_error_id visible.
*/
#if defined(DTD_IMPLEMENTATION)
#define DTD_MINOR_ERRORS 1
#endif
217 
/* Severity attached to a reported error (see dtd_error) */
typedef enum {
  ERS_WARNING = 0,			/* result is probably correct */
  ERS_ERROR,				/* result is probably incorrect */
  ERS_STYLE				/* dubious or bad style; the result
					   is correct */
} dtd_error_severity;
223 
224 
/* Error identifiers.  The first group is always available.  The
   second group (minor codes) is only declared if DTD_MINOR_ERRORS is
   defined; as it is appended, the values of the first group are the
   same in both configurations.

   "args:" names the arguments that go with each code (presumably the
   `argv' context arguments of dtd_error -- confirm in the
   implementation).
*/
typedef enum {
  ERC_REPRESENTATION = 0,		/* internal limit; args: id */
  ERC_RESOURCE,				/* external limit; args: id */
  ERC_LIMIT,				/* exceeded SGML limit; args: id */
  ERC_VALIDATE,				/* DTD validation; args: message */
  ERC_SYNTAX_ERROR,			/* syntax error; args: message, found */
  ERC_EXISTENCE,			/* existence error; args: type, name */
  ERC_REDEFINED,			/* redefined object; args: type, name */
  ERC_ET_SYSTEM				/* disallowed SYSTEM entity; args: name */
#ifdef DTD_MINOR_ERRORS
  ,					/* separator: the list continues */
  ERC_SYNTAX_WARNING,			/* syntax warning (i.e. fixed);
					   args: message, found */
  ERC_DOMAIN,				/* relative to declared type;
					   args: type, found */
  ERC_OMITTED_CLOSE,			/* args: element */
  ERC_OMITTED_OPEN,			/* args: element */
  ERC_NOT_OPEN,				/* args: element */
  ERC_NOT_ALLOWED,			/* args: element */
  ERC_NOT_ALLOWED_PCDATA,		/* args: text */
  ERC_NO_ATTRIBUTE,			/* args: element, attribute */
  ERC_NO_ATTRIBUTE_VALUE,		/* args: element, value */
  ERC_NO_VALUE,				/* args: entity */
  ERC_NO_DOCTYPE,			/* args: implicit, file */
  ERC_NO_CATALOGUE			/* args: file */
#endif
} dtd_error_id;
270 
271 
/* Kind of input a source location refers to; selects which member of
   the `name' union in dtd_srcloc is meaningful.
*/
typedef enum {
  IN_NONE = 0,				/* input is unspecified */
  IN_FILE,				/* reading from a file */
  IN_ENTITY				/* reading from an entity */
} input_type;
277 
278 
279 typedef struct _dtd_srcloc
280 { input_type  type;			/* type of input */
281   union
282   { const ichar *file;			/* name of the file */
283     const ichar *entity;		/* name of entity */
284   } name;
285   int	      line;			/* 1-based Line no */
286   int	      linepos;			/* 1-based char  */
287   long	      charpos;			/* 0-based file char  */
288   struct _dtd_srcloc *parent;		/* parent location */
289 } dtd_srcloc;
290 
291 
292 typedef struct _dtd_error
293 { dtd_error_id id;			/* ERC_* identifier */
294   dtd_error_id minor;			/* Minor code */
295   dtd_error_severity severity;		/* ERS_* severity */
296   dtd_srcloc *location;			/* location of the error */
297   wchar_t *plain_message;		/* Clean message */
298   wchar_t *message;			/* complete message */
299 					/* (Warning: file:line: <plain>) */
300   wchar_t *argv[2];			/* context arguments */
301 } dtd_error;
302 
303 
304 		 /*******************************
305 		 *	     DTD TYPES		*
306 		 *******************************/
307 
308 typedef struct _dtd_symbol
309 { const ichar *name;			/* name of the atom */
310   struct _dtd_symbol *next;		/* next in atom list */
311   struct _dtd_element *element;		/* connected element (if any) */
312   struct _dtd_entity  *entity;		/* connected entity (if any) */
313 } dtd_symbol;
314 
315 
316 typedef struct _dtd_symbol_table
317 { int		size;			/* Allocated size */
318   dtd_symbol  **entries;		/* Entries */
319 } dtd_symbol_table;
320 
321 
322 typedef struct _dtd_entity
323 { dtd_symbol *name;			/* its name */
324   entity_type type;			/* ET_* */
325   data_type content;			/* EC_* */
326   int catalog_location;			/* what catalog to use for lookup */
327   int length;				/* size of literal value */
328   ichar *value;				/* literal value */
329   ichar *extid;				/* external identifier */
330   ichar *exturl;			/* url to fetch from */
331   ichar *baseurl;			/* base url for exturl */
332   struct _dtd_entity *next;		/* list-link */
333 } dtd_entity;
334 
335 
336 typedef struct _dtd_notation
337 { dtd_symbol *name;			/* name of the notation */
338   entity_type type;			/* ET_{PUBLIC|SYSTEM} */
339   ichar *public;			/* public id */
340   ichar *system;			/* file with info */
341   struct _dtd_notation *next;		/* list-link */
342 } dtd_notation;
343 
344 
/* Cell of a singly linked list of elements */
typedef struct _dtd_element_list {
  struct _dtd_element *value;		/* the element in this cell */
  struct _dtd_element_list *next;	/* rest of the list */
} dtd_element_list;
349 
350 
351 typedef struct _dtd_name_list
352 { dtd_symbol	*value;
353   struct _dtd_name_list *next;
354 } dtd_name_list;
355 
356 
357 typedef struct _dtd_attr
358 { dtd_symbol  *name;			/* name of attribute */
359   attrtype type;			/* type (AT_*) */
360   attrdef  def;				/* AT_REQUIRED/AT_IMPLIED */
361   int islist;				/* attribute is a list */
362   union
363   { dtd_name_list *nameof;		/* (name1|name2|...) */
364   } typeex;
365   union
366   { ichar *cdata;			/* default for CDATA */
367     ichar *list;			/* text for list-data */
368     dtd_symbol *name;			/* AT_NAME or AT_NAMEOF */
369     long number;			/* AT_NUMBER */
370   } att_def;
371   int references;			/* reference count */
372 } dtd_attr;
373 
374 
375 typedef struct _dtd_attr_list
376 { dtd_attr	*attribute;
377   struct _dtd_attr_list *next;
378 } dtd_attr_list;
379 
380 
381 typedef struct _dtd_model
382 { modeltype type;			/* MT_* */
383   modelcard cardinality;		/* MC_* */
384 
385   union
386   { struct _dtd_model *group;		/* ,/|/& group */
387     struct _dtd_element *element;	/* element */
388   } content;
389   struct _dtd_model *next;		/* next in list (for groups) */
390 } dtd_model;
391 
392 
393 typedef struct _dtd_edef
394 { contenttype	type;			/* EMPTY, MIXED, ... */
395   int		omit_open;		/* allow omitted open tag? */
396   int		omit_close;		/* allow omitted close tag? */
397   dtd_model	*content;		/* the content model */
398   dtd_element_list *included;		/* +(namegroup) */
399   dtd_element_list *excluded;		/* -(namegroup) */
400   struct _dtd_state *initial_state;	/* Initial state in state engine */
401   struct _dtd_state *final_state;	/* Final state in state engine */
402   int		references;		/* #elements using this def */
403 } dtd_edef;
404 
405 
406 typedef struct _dtd_map
407 { ichar	       *from;			/* mapped text */
408   int		len;			/* length of mapped text */
409   dtd_symbol   *to;			/* name of symbol mapped onto */
410   struct _dtd_map *next;		/* next in shortref map */
411 } dtd_map;
412 
413 
414 typedef struct _dtd_shortref
415 { dtd_symbol	*name;			/* name of SHORTREF map */
416   dtd_map	*map;			/* implemented map */
417   char		ends[SHORTMAP_SIZE];	/* ending-characters in map */
418   int		defined;		/* has been defined */
419   struct _dtd_shortref *next;		/* next declared shortref */
420 } dtd_shortref;
421 
422 
423 typedef struct _dtd_element
424 { dtd_symbol	*name;			/* its name */
425   dtd_edef	*structure;		/* content structure of the element */
426   dtd_attr_list *attributes;		/* defined attributes */
427   dtd_space_mode space_mode;		/* How to handle white-space (SP_*) */
428   dtd_shortref	*map;			/* SHORTREF map */
429   int		undefined;		/* Only implicitely defined */
430   struct _dtd_element *next;		/* in DTD'e element list */
431 } dtd_element;
432 
433 
434 typedef struct _dtd_charclass
435 { unsigned char	class[INPUT_CHARSET_SIZE]; /* ichar --> class-mask */
436 } dtd_charclass;
437 
438 
439 typedef struct _dtd_charfunc
440 { ichar func[(int)CF_ENDTABLE];		/* CF_ --> ichar */
441 } dtd_charfunc;
442 
443 
444 typedef struct _dtd
445 { int		        magic;		/* SGML_DTD_MAGIC */
446   int			implicit;	/* There is no DTD */
447   dtd_dialect		dialect;	/* DL_* */
448   int			case_sensitive;	/* Tags are case-sensitive */
449   int			ent_case_sensitive; /* Entities are case-sensitive */
450   int			att_case_sensitive; /* Att values are case-sensitive */
451   int			att_case_preserving; /* Preserve attribute value case */
452   ichar		       *doctype;	/* defined document type */
453   dtd_symbol_table     *symbols;	/* symbol-table */
454   dtd_entity           *pentities;	/* defined parameter entities */
455   dtd_entity	       *entities;	/* defined entities */
456   dtd_entity	       *default_entity;	/* default-entity (if any) */
457   dtd_notation	       *notations;	/* Declared notations */
458   dtd_shortref	       *shortrefs;	/* SHORTREF declarations */
459   dtd_element          *elements;	/* defined elements */
460   dtd_charfunc	       *charfunc;	/* CF_ --> ichar */
461   dtd_charclass	       *charclass;	/* ichar -> CH_-mask */
462   dtd_char_encoding	encoding;	/* document encoding */
463   dtd_space_mode	space_mode;	/* Default for handling white-space */
464   dtd_number_mode	number_mode;	/* How to treat number attributes */
465   int			shorttag;	/* support SHORTTAG */
466   int			system_entities; /* expand SYSTEM entities */
467   int			keep_prefix;    /* keep namespace prefixes */
468   int			references;	/* destruction reference count */
469 } dtd;
470 
471 extern dtd_charfunc *new_charfunc(void);   /* default classification */
472 extern dtd_charclass *new_charclass(void); /* default classification */
473 
474 extern dtd_symbol*	dtd_find_symbol(dtd *dtd, const ichar *name);
475 extern dtd_symbol*	dtd_add_symbol(dtd *dtd, const ichar *name);
476 
477 
478 		 /*******************************
479 		 *	       PUBLIC		*
480 		 *******************************/
481 
482 #include "parser.h"
483 
484 dtd *		file_to_dtd(const ichar *file, const ichar *doctype,
485 			    dtd_dialect dialect);
486 int		sgml_process_file(dtd_parser *p,
487 				  const ichar *file, unsigned flags);
488 int		sgml_process_stream(dtd_parser *p, FILE *in,
489 				    unsigned flags);
490 dtd_parser *	new_dtd_parser(dtd *dtd);
491 void		free_dtd_parser(dtd_parser *p);
492 
493 void		free_dtd(dtd *dtd);
494 int		load_dtd_from_file(dtd_parser *p, const ichar *file);
495 dtd *		new_dtd(const ichar *doctype);
496 int		set_dialect_dtd(dtd *dtd, dtd_parser *p, dtd_dialect dialect);
497 int		set_option_dtd(dtd *dtd, dtd_option option, int set);
498 
499 int		putchar_dtd_parser(dtd_parser *p, int chr);
500 int		begin_document_dtd_parser(dtd_parser *p);
501 int		end_document_dtd_parser(dtd_parser *p);
502 void		reset_document_dtd_parser(dtd_parser *p);
503 void		set_file_dtd_parser(dtd_parser *p,
504 				    input_type in, const ichar *file);
505 void		set_mode_dtd_parser(dtd_parser *p, data_mode mode);
506 void		sgml_cplocation(dtd_srcloc *dst, dtd_srcloc *src);
507 int		xml_set_encoding(dtd_parser *p, const char *enc);
508 
509 #endif /*DTD_H_INCLUDED*/
510 
511 
512