1 /* Part of SWI-Prolog 2 3 Author: Jan Wielemaker 4 E-mail: J.Wielemaker@vu.nl 5 WWW: http://www.swi-prolog.org 6 Copyright (c) 2000-2017, University of Amsterdam 7 Vu University Amsterdam 8 All rights reserved. 9 10 Redistribution and use in source and binary forms, with or without 11 modification, are permitted provided that the following conditions 12 are met: 13 14 1. Redistributions of source code must retain the above copyright 15 notice, this list of conditions and the following disclaimer. 16 17 2. Redistributions in binary form must reproduce the above copyright 18 notice, this list of conditions and the following disclaimer in 19 the documentation and/or other materials provided with the 20 distribution. 21 22 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 25 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 26 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 27 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 28 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 29 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 30 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 32 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 33 POSSIBILITY OF SUCH DAMAGE. 34 */ 35 36 #ifndef DTD_H_INCLUDED 37 #define DTD_H_INCLUDED 38 #include "sgmldefs.h" 39 40 #define CH_WHITE 0x0001 41 #define CH_LCLETTER 0x0002 42 #define CH_UCLETTER 0x0004 43 #define CH_CNMSTRT 0x0008 /* may start a name */ 44 #define CH_CNM 0x0010 /* may be in a name */ 45 #define CH_DIGIT 0x0020 46 #define CH_RE 0x0040 47 #define CH_RS 0x0080 48 49 #define CH_LETTER (CH_LCLETTER|CH_UCLETTER) 50 #define CH_NMSTART (CH_LCLETTER|CH_UCLETTER|CH_CNMSTRT) 51 #define CH_NAME (CH_NMSTART|CH_DIGIT|CH_CNM) 52 #define CH_BLANK (CH_WHITE|CH_RE|CH_RS) 53 54 #define CHR_BLANK 0x1 /* SHORTREF 'B' */ 55 #define CHR_DBLANK 0x2 /* SHORTREF 'BB' */ 56 57 #define SGML_DTD_MAGIC 0x7364573 58 59 typedef enum 60 { CF_STAGO = 0, /* < */ 61 CF_STAGC, /* > */ 62 CF_ETAGO1, /* < */ 63 CF_ETAGO2, /* / */ 64 CF_VI, /* = */ 65 CF_NS, /* : (XMLNS) */ 66 CF_LIT, /* " */ 67 CF_LITA, /* ' */ 68 CF_PERO, /* % */ 69 CF_ERO, /* & */ 70 CF_ERC, /* ; */ 71 CF_MDO1, /* < */ 72 CF_MDO2, /* ! (MDO=<!) */ 73 CF_MDC, /* > */ 74 CF_PRO1, /* < */ 75 CF_PRO2, /* ? (PRO=<?) */ 76 CF_PRC, /* > */ 77 CF_GRPO, /* ( */ 78 CF_GRPC, /* ) */ 79 CF_SEQ, /* , */ 80 CF_AND, /* & */ 81 CF_OR, /* | */ 82 CF_OPT, /* ? */ 83 CF_PLUS, /* + */ 84 CF_DSO, /* [ */ 85 CF_DSC, /* ] */ 86 CF_REP, /* * */ 87 CF_RS, /* \n */ 88 CF_RE, /* \r */ 89 CF_CMT, /* - */ 90 CF_NG, /* , or & or | */ 91 CF_ENDTABLE /* to find size */ 92 } charfunc; /* function of characters */ 93 94 typedef enum 95 { SGML_ENC_ISO_LATIN1 = 0, /* ISO Latin-1 */ 96 SGML_ENC_UTF8 /* Multi-byte UTF-8 encoding */ 97 } dtd_char_encoding; 98 99 typedef enum 100 { C_CDATA, /* pure cdata */ 101 C_PCDATA, /* parsed character data */ 102 C_RCDATA, /* pure cdata + entities */ 103 C_EMPTY, /* empy element */ 104 C_ANY /* element may contain anything */ 105 } contenttype; 106 107 typedef enum 108 { MC_ONE, /* one time */ 109 MC_OPT, /* optional element (?) */ 110 MC_REP, /* any times (*) */ 111 MC_PLUS /* one-or-more (+) */ 112 } modelcard; 113 114 typedef enum 115 { MT_UNDEF = 0, /* undefined */ 116 MT_PCDATA, /* Contains PCDATA */ 117 MT_ELEMENT, /* refers to element */ 118 MT_SEQ, /* Sequence (,) */ 119 MT_AND, /* Ony order (&) */ 120 MT_OR /* Disjunction (|) */ 121 } modeltype; 122 123 typedef enum 124 { AT_CDATA, /* CDATA attribute */ 125 AT_ENTITY, /* entity-name */ 126 AT_ENTITIES, /* entity-name list */ 127 AT_ID, /* identifier */ 128 AT_IDREF, /* identifier reference */ 129 AT_IDREFS, /* list of identifier references */ 130 AT_NAME, /* name token */ 131 AT_NAMES, /* list of names */ 132 AT_NAMEOF, /* one of these names */ 133 AT_NMTOKEN, /* name-token */ 134 AT_NMTOKENS, /* name-token list */ 135 AT_NOTATION, /* notation-name */ 136 AT_NUMBER, /* number */ 137 AT_NUMBERS, /* number list */ 138 AT_NUTOKEN, /* number token */ 139 AT_NUTOKENS /* number token list */ 140 } attrtype; 141 142 typedef enum 143 { AT_FIXED, /* fixed value */ 144 AT_REQUIRED, /* Required attribute */ 145 AT_CURRENT, /* most recent value */ 146 AT_CONREF, /* cross-reference */ 147 AT_IMPLIED, /* Implied attribute */ 148 AT_DEFAULT /* has default */ 149 } attrdef; 150 151 152 typedef enum 153 { ET_SYSTEM, /* System (file) entity */ 154 ET_PUBLIC, /* Public (external) entity */ 155 ET_LITERAL /* Literal text */ 156 } entity_type; 157 158 159 typedef enum 160 { EC_SGML, /* SGML data */ 161 EC_STARTTAG, /* SGML start-tag */ 162 EC_ENDTAG, /* SGML end-tag */ 163 EC_CDATA, /* CDATA entity */ 164 EC_SDATA, /* SDATA entity */ 165 EC_NDATA, /* non-sgml data */ 166 EC_PI /* Programming instruction */ 167 } data_type; 168 169 170 typedef enum 171 { DL_SGML, /* Use SGML */ 172 DL_HTML, /* Pre-HTML5 */ 173 DL_HTML5, /* HTML5 extensions of SGML */ 174 DL_XHTML, /* Pre-HTML5 */ 175 DL_XHTML5, /* HTML5 extensions of SGML */ 176 DL_XML, /* Use XML */ 177 DL_XMLNS /* Use XML + Namespaces */ 178 } dtd_dialect; 179 180 #define IS_SGML_DIALECT(d) ((int)(d) <= (int)DL_HTML5) 181 #define IS_HTML_DIALECT(d) ((d) >= DL_HTML && (d) <= DL_XHTML5) 182 #define IS_HTML5_DIALECT(d) ((d) == DL_HTML5 || (d) == DL_XHTML5) 183 #define IS_XML_DIALECT(d) ((int)(d) >= (int)DL_XHTML) 184 185 typedef enum 186 { OPT_SHORTTAG, /* do/don't accept shorttag */ 187 OPT_CASE_SENSITIVE_ATTRIBUTES, /* attribute values case(in)sensitive */ 188 OPT_CASE_PRESERVING_ATTRIBUTES, /* attribute values case(in)preserving */ 189 OPT_SYSTEM_ENTITIES, /* expand system entities */ 190 OPT_KEEP_PREFIX /* keep the prefix identifiers */ 191 } dtd_option; 192 193 194 typedef enum 195 { SP_PRESERVE = 0, /* Preserve all white-space */ 196 SP_DEFAULT, /* Default space handling */ 197 SP_REMOVE, /* Remove all blank CDATA elements */ 198 SP_SGML, /* Compliant SGML mode */ 199 SP_INHERIT, /* DTD: inherit from environment */ 200 SP_STRICT /* Strict reading of spaces for signature verification */ 201 } dtd_space_mode; 202 203 204 typedef enum 205 { NU_TOKEN, /* Treat numbers as tokens */ 206 NU_INTEGER /* Convert to integer */ 207 } dtd_number_mode; 208 209 210 /******************************* 211 * ERRORS * 212 *******************************/ 213 214 #ifdef DTD_IMPLEMENTATION 215 #define DTD_MINOR_ERRORS 1 216 #endif 217 218 typedef enum 219 { ERS_WARNING, /* probably correct result */ 220 ERS_ERROR, /* probably incrorrect result */ 221 ERS_STYLE /* dubious/bad style; correct result */ 222 } dtd_error_severity; 223 224 225 typedef enum 226 { ERC_REPRESENTATION, /* Internal limit */ 227 /* id */ 228 ERC_RESOURCE, /* external limit */ 229 /* id */ 230 ERC_LIMIT, /* Exceeded SGML limit */ 231 /* id */ 232 ERC_VALIDATE, /* DTD Validation */ 233 /* Message */ 234 ERC_SYNTAX_ERROR, /* Syntax error */ 235 /* Message, found */ 236 ERC_EXISTENCE, /* Existence error */ 237 /* Type, name */ 238 ERC_REDEFINED, /* Redefined object */ 239 /* Type, name */ 240 ERC_ET_SYSTEM /* Disallowed SYSTEM entity */ 241 /* name */ 242 #ifdef DTD_MINOR_ERRORS 243 , /* reopen list */ 244 ERC_SYNTAX_WARNING, /* Syntax warning (i.e. fixed) */ 245 /* Message, found */ 246 ERC_DOMAIN, /* Relative to declared type */ 247 /* Type, found */ 248 ERC_OMITTED_CLOSE, 249 /* Element */ 250 ERC_OMITTED_OPEN, 251 /* Element */ 252 ERC_NOT_OPEN, 253 /* Element */ 254 ERC_NOT_ALLOWED, 255 /* Element */ 256 ERC_NOT_ALLOWED_PCDATA, 257 /* Text */ 258 ERC_NO_ATTRIBUTE, 259 /* Element, Attribute */ 260 ERC_NO_ATTRIBUTE_VALUE, 261 /* Element, Value */ 262 ERC_NO_VALUE, 263 /* Entity */ 264 ERC_NO_DOCTYPE, 265 /* Implicit, file */ 266 ERC_NO_CATALOGUE 267 /* file */ 268 #endif 269 } dtd_error_id; 270 271 272 typedef enum 273 { IN_NONE, /* unspecified input */ 274 IN_FILE, /* input from file */ 275 IN_ENTITY /* input from entity */ 276 } input_type; 277 278 279 typedef struct _dtd_srcloc 280 { input_type type; /* type of input */ 281 union 282 { const ichar *file; /* name of the file */ 283 const ichar *entity; /* name of entity */ 284 } name; 285 int line; /* 1-based Line no */ 286 int linepos; /* 1-based char */ 287 long charpos; /* 0-based file char */ 288 struct _dtd_srcloc *parent; /* parent location */ 289 } dtd_srcloc; 290 291 292 typedef struct _dtd_error 293 { dtd_error_id id; /* ERC_* identifier */ 294 dtd_error_id minor; /* Minor code */ 295 dtd_error_severity severity; /* ERS_* severity */ 296 dtd_srcloc *location; /* location of the error */ 297 wchar_t *plain_message; /* Clean message */ 298 wchar_t *message; /* complete message */ 299 /* (Warning: file:line: <plain>) */ 300 wchar_t *argv[2]; /* context arguments */ 301 } dtd_error; 302 303 304 /******************************* 305 * DTD TYPES * 306 *******************************/ 307 308 typedef struct _dtd_symbol 309 { const ichar *name; /* name of the atom */ 310 struct _dtd_symbol *next; /* next in atom list */ 311 struct _dtd_element *element; /* connected element (if any) */ 312 struct _dtd_entity *entity; /* connected entity (if any) */ 313 } dtd_symbol; 314 315 316 typedef struct _dtd_symbol_table 317 { int size; /* Allocated size */ 318 dtd_symbol **entries; /* Entries */ 319 } dtd_symbol_table; 320 321 322 typedef struct _dtd_entity 323 { dtd_symbol *name; /* its name */ 324 entity_type type; /* ET_* */ 325 data_type content; /* EC_* */ 326 int catalog_location; /* what catalog to use for lookup */ 327 int length; /* size of literal value */ 328 ichar *value; /* literal value */ 329 ichar *extid; /* external identifier */ 330 ichar *exturl; /* url to fetch from */ 331 ichar *baseurl; /* base url for exturl */ 332 struct _dtd_entity *next; /* list-link */ 333 } dtd_entity; 334 335 336 typedef struct _dtd_notation 337 { dtd_symbol *name; /* name of the notation */ 338 entity_type type; /* ET_{PUBLIC|SYSTEM} */ 339 ichar *public; /* public id */ 340 ichar *system; /* file with info */ 341 struct _dtd_notation *next; /* list-link */ 342 } dtd_notation; 343 344 345 typedef struct _dtd_element_list 346 { struct _dtd_element *value; /* element */ 347 struct _dtd_element_list *next; /* next in list */ 348 } dtd_element_list; 349 350 351 typedef struct _dtd_name_list 352 { dtd_symbol *value; 353 struct _dtd_name_list *next; 354 } dtd_name_list; 355 356 357 typedef struct _dtd_attr 358 { dtd_symbol *name; /* name of attribute */ 359 attrtype type; /* type (AT_*) */ 360 attrdef def; /* AT_REQUIRED/AT_IMPLIED */ 361 int islist; /* attribute is a list */ 362 union 363 { dtd_name_list *nameof; /* (name1|name2|...) */ 364 } typeex; 365 union 366 { ichar *cdata; /* default for CDATA */ 367 ichar *list; /* text for list-data */ 368 dtd_symbol *name; /* AT_NAME or AT_NAMEOF */ 369 long number; /* AT_NUMBER */ 370 } att_def; 371 int references; /* reference count */ 372 } dtd_attr; 373 374 375 typedef struct _dtd_attr_list 376 { dtd_attr *attribute; 377 struct _dtd_attr_list *next; 378 } dtd_attr_list; 379 380 381 typedef struct _dtd_model 382 { modeltype type; /* MT_* */ 383 modelcard cardinality; /* MC_* */ 384 385 union 386 { struct _dtd_model *group; /* ,/|/& group */ 387 struct _dtd_element *element; /* element */ 388 } content; 389 struct _dtd_model *next; /* next in list (for groups) */ 390 } dtd_model; 391 392 393 typedef struct _dtd_edef 394 { contenttype type; /* EMPTY, MIXED, ... */ 395 int omit_open; /* allow omitted open tag? */ 396 int omit_close; /* allow omitted close tag? */ 397 dtd_model *content; /* the content model */ 398 dtd_element_list *included; /* +(namegroup) */ 399 dtd_element_list *excluded; /* -(namegroup) */ 400 struct _dtd_state *initial_state; /* Initial state in state engine */ 401 struct _dtd_state *final_state; /* Final state in state engine */ 402 int references; /* #elements using this def */ 403 } dtd_edef; 404 405 406 typedef struct _dtd_map 407 { ichar *from; /* mapped text */ 408 int len; /* length of mapped text */ 409 dtd_symbol *to; /* name of symbol mapped onto */ 410 struct _dtd_map *next; /* next in shortref map */ 411 } dtd_map; 412 413 414 typedef struct _dtd_shortref 415 { dtd_symbol *name; /* name of SHORTREF map */ 416 dtd_map *map; /* implemented map */ 417 char ends[SHORTMAP_SIZE]; /* ending-characters in map */ 418 int defined; /* has been defined */ 419 struct _dtd_shortref *next; /* next declared shortref */ 420 } dtd_shortref; 421 422 423 typedef struct _dtd_element 424 { dtd_symbol *name; /* its name */ 425 dtd_edef *structure; /* content structure of the element */ 426 dtd_attr_list *attributes; /* defined attributes */ 427 dtd_space_mode space_mode; /* How to handle white-space (SP_*) */ 428 dtd_shortref *map; /* SHORTREF map */ 429 int undefined; /* Only implicitely defined */ 430 struct _dtd_element *next; /* in DTD'e element list */ 431 } dtd_element; 432 433 434 typedef struct _dtd_charclass 435 { unsigned char class[INPUT_CHARSET_SIZE]; /* ichar --> class-mask */ 436 } dtd_charclass; 437 438 439 typedef struct _dtd_charfunc 440 { ichar func[(int)CF_ENDTABLE]; /* CF_ --> ichar */ 441 } dtd_charfunc; 442 443 444 typedef struct _dtd 445 { int magic; /* SGML_DTD_MAGIC */ 446 int implicit; /* There is no DTD */ 447 dtd_dialect dialect; /* DL_* */ 448 int case_sensitive; /* Tags are case-sensitive */ 449 int ent_case_sensitive; /* Entities are case-sensitive */ 450 int att_case_sensitive; /* Att values are case-sensitive */ 451 int att_case_preserving; /* Preserve attribute value case */ 452 ichar *doctype; /* defined document type */ 453 dtd_symbol_table *symbols; /* symbol-table */ 454 dtd_entity *pentities; /* defined parameter entities */ 455 dtd_entity *entities; /* defined entities */ 456 dtd_entity *default_entity; /* default-entity (if any) */ 457 dtd_notation *notations; /* Declared notations */ 458 dtd_shortref *shortrefs; /* SHORTREF declarations */ 459 dtd_element *elements; /* defined elements */ 460 dtd_charfunc *charfunc; /* CF_ --> ichar */ 461 dtd_charclass *charclass; /* ichar -> CH_-mask */ 462 dtd_char_encoding encoding; /* document encoding */ 463 dtd_space_mode space_mode; /* Default for handling white-space */ 464 dtd_number_mode number_mode; /* How to treat number attributes */ 465 int shorttag; /* support SHORTTAG */ 466 int system_entities; /* expand SYSTEM entities */ 467 int keep_prefix; /* keep namespace prefixes */ 468 int references; /* destruction reference count */ 469 } dtd; 470 471 extern dtd_charfunc *new_charfunc(void); /* default classification */ 472 extern dtd_charclass *new_charclass(void); /* default classification */ 473 474 extern dtd_symbol* dtd_find_symbol(dtd *dtd, const ichar *name); 475 extern dtd_symbol* dtd_add_symbol(dtd *dtd, const ichar *name); 476 477 478 /******************************* 479 * PUBLIC * 480 *******************************/ 481 482 #include "parser.h" 483 484 dtd * file_to_dtd(const ichar *file, const ichar *doctype, 485 dtd_dialect dialect); 486 int sgml_process_file(dtd_parser *p, 487 const ichar *file, unsigned flags); 488 int sgml_process_stream(dtd_parser *p, FILE *in, 489 unsigned flags); 490 dtd_parser * new_dtd_parser(dtd *dtd); 491 void free_dtd_parser(dtd_parser *p); 492 493 void free_dtd(dtd *dtd); 494 int load_dtd_from_file(dtd_parser *p, const ichar *file); 495 dtd * new_dtd(const ichar *doctype); 496 int set_dialect_dtd(dtd *dtd, dtd_parser *p, dtd_dialect dialect); 497 int set_option_dtd(dtd *dtd, dtd_option option, int set); 498 499 int putchar_dtd_parser(dtd_parser *p, int chr); 500 int begin_document_dtd_parser(dtd_parser *p); 501 int end_document_dtd_parser(dtd_parser *p); 502 void reset_document_dtd_parser(dtd_parser *p); 503 void set_file_dtd_parser(dtd_parser *p, 504 input_type in, const ichar *file); 505 void set_mode_dtd_parser(dtd_parser *p, data_mode mode); 506 void sgml_cplocation(dtd_srcloc *dst, dtd_srcloc *src); 507 int xml_set_encoding(dtd_parser *p, const char *enc); 508 509 #endif /*DTD_H_INCLUDED*/ 510 511 512