1 /* 2 * $LynxId: SGML.h,v 1.47 2021/07/22 23:34:13 tom Exp $ 3 * SGML parse and stream definition for libwww 4 * SGML AND STRUCTURED STREAMS 5 * 6 * The SGML parser is a state machine. It is called for every character 7 * of the input stream. The DTD data structure contains pointers 8 * to functions which are called to implement the actual effect of the 9 * text read. When these functions are called, the attribute structures pointed to by the 10 * DTD are valid, and the function is passed a pointer to the current tag structure, and an 11 * "element stack" which represents the state of nesting within SGML elements. 12 * 13 * The following aspects are from Dan Connolly's suggestions: Binary search, 14 * Structured object scheme basically, SGML content enum type. 15 * 16 * (c) Copyright CERN 1991 - See Copyright.html 17 * 18 */ 19 #ifndef SGML_H 20 #define SGML_H 21 22 #include <HTStream.h> 23 #include <HTAnchor.h> 24 #include <LYJustify.h> 25 26 #ifdef __cplusplus 27 extern "C" { 28 #endif 29 /* 30 * 31 * SGML content types 32 * 33 */ typedef enum { 34 SGML_EMPTY, /* No content. */ 35 SGML_LITTERAL, /* Literal character data. Recognize exact close tag only. 36 Old www server compatibility only! Not SGML */ 37 SGML_CDATA, /* Character data. Recognize </ only. 38 (But we treat it just as SGML_LITTERAL.) */ 39 SGML_SCRIPT, /* Like CDATA, but allow it to be a comment */ 40 SGML_RCDATA, /* Replaceable character data. Should recognize </ and &ref; 41 (but we treat it like SGML_MIXED for old times' sake). */ 42 SGML_MIXED, /* Elements and parsed character data. 43 Recognize all markup. */ 44 SGML_ELEMENT, /* Any data found should be regarded as an error. 45 (But we treat it just like SGML_MIXED.) */ 46 SGML_PCDATA /* Should contain no elements but &ref; is parsed. 47 (We treat it like SGML_CDATA wrt. contained tags 48 i.e. pass them on literally, i.e. like we should 49 treat SGML_RCDATA) (added by KW). */ 50 } SGMLContent; 51 52 typedef struct { 53 const char *name; /* The name of the attribute */ 54 #ifdef USE_PRETTYSRC 55 char type; /* code of the type of the attribute. Code 56 values are in HTMLDTD.h */ 57 #endif 58 } attr; 59 60 typedef const attr *AttrList; 61 62 typedef struct { 63 const char *name; 64 AttrList list; 65 } AttrType; 66 67 typedef int TagClass; 68 69 /* textflow */ 70 #define Tgc_FONTlike 0x00001 /* S,STRIKE,I,B,TT,U,BIG,SMALL,STYLE,BLINK;BR,TAB */ 71 #define Tgc_EMlike 0x00002 /* EM,STRONG,DFN,CODE,SAMP,KBD,VAR,CITE,Q,INS,DEL,SPAN,.. */ 72 #define Tgc_MATHlike 0x00004 /* SUB,SUP,MATH,COMMENT */ 73 #define Tgc_Alike 0x00008 /* A */ 74 #define Tgc_formula 0x00010 /* not used until math is supported better... */ 75 /* used for special structures: forms, tables,... */ 76 #define Tgc_TRlike 0x00020 /* TR and similar */ 77 #define Tgc_SELECTlike 0x00040 /* SELECT,INPUT,TEXTAREA(,...) */ 78 /* structure */ 79 #define Tgc_FORMlike 0x00080 /* FORM itself */ 80 #define Tgc_Plike 0x00100 /* P,H1..H6,... structures containing text or 81 insertion but not other structures */ 82 #define Tgc_DIVlike 0x00200 /* ADDRESS,FIG,BDO,NOTE,FN,DIV,CENTER;FIG 83 structures which can contain other structures */ 84 #define Tgc_LIlike 0x00400 /* LH,LI,DT,DD;TH,TD structure-like, only valid 85 within certain other structures */ 86 #define Tgc_ULlike 0x00800 /* UL,OL,DL,DIR,MENU;TABLE;XMP,LISTING 87 special in some way, cannot contain (parsed) 88 text directly */ 89 /* insertions */ 90 #define Tgc_BRlike 0x01000 /* BR,IMG,TAB allowed in any text */ 91 #define Tgc_APPLETlike 0x02000 /* APPLET,OBJECT,EMBED,SCRIPT;BUTTON */ 92 #define Tgc_HRlike 0x04000 /* HR,MARQUEE can contain all kinds of things 93 and/or are not allowed (?) in running text */ 94 #define Tgc_MAPlike 0x08000 /* MAP,AREA some specials that never contain 95 (directly or indirectly) other things than 96 special insertions */ 97 #define Tgc_outer 0x10000 /* HTML,FRAMESET,FRAME,PLAINTEXT; */ 98 #define Tgc_BODYlike 0x20000 /* BODY,BODYTEXT,NOFRAMES,TEXTFLOW; */ 99 #define Tgc_HEADstuff 0x40000 /* HEAD,BASE,STYLE,TITLE; */ 100 /* special relations */ 101 #define Tgc_same 0x80000 102 #define Tgc_DELlike 0x100000 103 /* DELlike is a class of aliases for inline DEL/INS */ 104 typedef unsigned char TagAlias; 105 106 /* 107 * Groups for contains-data. 108 */ 109 #define Tgc_INLINElike (Tgc_Alike | Tgc_APPLETlike | Tgc_BRlike | Tgc_EMlike | Tgc_FONTlike | Tgc_SELECTlike) 110 #define Tgc_LISTlike (Tgc_LIlike | Tgc_ULlike) 111 #define Tgc_BLOCKlike (Tgc_DIVlike | Tgc_LISTlike) 112 113 /* Some more properties of tags (or rather, elements) and rules how 114 to deal with them. - kw */ 115 typedef int TagFlags; 116 117 #define Tgf_endO 0x00001 /* end tag can be Omitted */ 118 #define Tgf_startO 0x00002 /* start tag can be Omitted */ 119 #define Tgf_mafse 0x00004 /* Make Attribute-Free Start-tag End instead 120 (if found invalid) */ 121 #define Tgf_strict 0x00008 /* Ignore contained invalid elements, 122 don't pass them on; or other variant 123 handling for some content types */ 124 #define Tgf_nreie 0x00010 /* Not Really Empty If Empty, 125 used by color style code */ 126 #define Tgf_frecyc 0x00020 /* Pass element content on in a form that 127 allows recycling, i.e. don't translate to 128 output (display) character set yet (treat 129 content similar to attribute values) */ 130 #define Tgf_nolyspcl 0x00040 /* Don't generate lynx special characters 131 for soft hyphen and various spaces (nbsp, 132 ensp,..) */ 133 134 /* A tag structure describes an SGML element. 135 * ----------------------------------------- 136 * 137 * 138 * name is the string which comes after the tag opener "<". 139 * 140 * attributes points to a zero-terminated array 141 * of attribute names. 142 * 143 * litteral determines how the SGML engine parses the characters 144 * within the element. If set, tag openers are ignored 145 * except for that which opens a matching closing tag. 146 * 147 */ 148 typedef struct _tag HTTag; 149 struct _tag { 150 const char *name; /* The name of the tag */ 151 #ifdef USE_COLOR_STYLE 152 unsigned name_len; /* The length of the name */ 153 #endif 154 #ifdef USE_JUSTIFY_ELTS 155 BOOL can_justify; /* justification allowed? */ 156 #endif 157 AttrList attributes; /* The list of acceptable attributes */ 158 int number_of_attributes; /* Number of possible attributes */ 159 const AttrType *attr_types; 160 SGMLContent contents; /* End only on end tag @@ */ 161 TagClass tagclass; 162 TagClass contains; /* which classes of elements this one can contain directly */ 163 TagClass icontains; /* which classes of elements this one can contain indirectly */ 164 TagClass contained; /* in which classes can this tag be contained ? */ 165 TagClass icontained; /* in which classes can this tag be indirectly contained ? */ 166 TagClass canclose; /* which classes of elements can this one close 167 if something looks wrong ? */ 168 TagFlags flags; 169 TagAlias alias; /* extra levels, e.g, DEL/INS */ 170 TagAlias aliases; /* number of extra levels, e.g, DEL/INS */ 171 }; 172 173 /* DTD Information 174 * --------------- 175 * 176 * Not the whole DTD, but all this parser uses of it. 177 */ 178 typedef struct { 179 HTTag *tags; /* Must be in strcmp order by name */ 180 int number_of_tags; 181 STRING2PTR entity_names; /* Must be in strcmp order by name */ 182 size_t number_of_entities; 183 /* "entity_names" table probably unused, 184 * see comments in HTMLDTD.c near the top 185 */ 186 } SGML_dtd; 187 188 /* SGML context passed to parsers 189 */ 190 typedef struct _HTSGMLContext *HTSGMLContext; /* Hidden */ 191 192 /*__________________________________________________________________________ 193 */ 194 195 /* 196 197 Structured Object definition 198 199 A structured object is something which can reasonably be represented 200 in SGML. I'll rephrase that. A structured object is an ordered 201 tree-structured arrangement of data which is representable as text. 202 The SGML parser outputs to a Structured object. A Structured object 203 can output its contents to another Structured Object. It's a kind of 204 typed stream. The architecture is largely Dan Conolly's. Elements and 205 entities are passed to the sob by number, implying a knowledge of the 206 DTD. Knowledge of the SGML syntax is not here, though. 207 208 Superclass: HTStream 209 210 The creation methods will vary on the type of Structured Object. 211 Maybe the callerData is enough info to pass along. 212 213 */ 214 typedef struct _HTStructured HTStructured; 215 216 typedef struct _HTStructuredClass { 217 218 const char *name; /* Just for diagnostics */ 219 220 void (*_free) (HTStructured * me); 221 222 void (*_abort) (HTStructured * me, HTError e); 223 224 void (*put_character) (HTStructured * me, int ch); 225 226 void (*put_string) (HTStructured * me, const char *str); 227 228 void (*put_block) (HTStructured * me, const char *str, int len); 229 230 /* HTStreamClass ends here */ 231 232 int (*start_element) (HTStructured * me, int element_number, 233 const BOOL *attribute_present, 234 STRING2PTR attribute_value, 235 int charset, 236 char **include); 237 238 int (*end_element) (HTStructured * me, int element_number, 239 char **include); 240 241 int (*put_entity) (HTStructured * me, int entity_number); 242 243 } HTStructuredClass; 244 245 /* 246 Equivalents to the following functions possibly could be generalised 247 into additional HTStructuredClass members. For now they don't do 248 anything target-specific. - kw 249 */ 250 extern BOOLEAN LYCheckForCSI(HTParentAnchor *anchor, char **url); 251 extern void LYDoCSI(char *url, const char *comment, char **csi); 252 extern BOOLEAN LYCommentHacks(HTParentAnchor *anchor, const char *comment); 253 254 /* 255 256 Find a Tag by Name 257 258 Returns a pointer to the tag within the DTD. 259 260 */ 261 extern HTTag *SGMLFindTag(const SGML_dtd * dtd, 262 const char *string); 263 264 /* 265 * Return the current offset within the file that SGML is parsing 266 */ 267 extern int SGML_offset(void); 268 269 /* 270 271 Create an SGML parser 272 273 */ 274 /* 275 * On entry, 276 * dtd must point to a DTD structure as defined above 277 * callbacks must point to user routines. 278 * callData is returned in callbacks transparently. 279 * On exit, 280 * The default tag starter has been processed. 281 */ 282 extern HTStream *SGML_new(const SGML_dtd * dtd, 283 HTParentAnchor *anchor, 284 HTStructured * target); 285 286 extern const HTStreamClass SGMLParser; 287 288 #ifdef __cplusplus 289 } 290 #endif 291 #endif /* SGML_H */ 292