1 /* 2 * $LynxId: SGML.h,v 1.46 2012/02/10 18:32:26 tom Exp $ 3 * SGML parse and stream definition for libwww 4 * SGML AND STRUCTURED STREAMS 5 * 6 * The SGML parser is a state machine. It is called for every character 7 * of the input stream. The DTD data structure contains pointers 8 * to functions which are called to implement the actual effect of the 9 * text read. When these functions are called, the attribute structures pointed to by the 10 * DTD are valid, and the function is passed a pointer to the current tag structure, and an 11 * "element stack" which represents the state of nesting within SGML elements. 12 * 13 * The following aspects are from Dan Connolly's suggestions: Binary search, 14 * Structured object scheme basically, SGML content enum type. 15 * 16 * (c) Copyright CERN 1991 - See Copyright.html 17 * 18 */ 19 #ifndef SGML_H 20 #define SGML_H 21 22 #include <HTStream.h> 23 #include <HTAnchor.h> 24 #include <LYJustify.h> 25 26 #ifdef __cplusplus 27 extern "C" { 28 #endif 29 /* 30 * 31 * SGML content types 32 * 33 */ typedef enum { 34 SGML_EMPTY, /* No content. */ 35 SGML_LITTERAL, /* Literal character data. Recognize exact close tag only. 36 Old www server compatibility only! Not SGML */ 37 SGML_CDATA, /* Character data. Recognize </ only. 38 (But we treat it just as SGML_LITTERAL.) */ 39 SGML_SCRIPT, /* Like CDATA, but allow it to be a comment */ 40 SGML_RCDATA, /* Replaceable character data. Should recognize </ and &ref; 41 (but we treat it like SGML_MIXED for old times' sake). */ 42 SGML_MIXED, /* Elements and parsed character data. 43 Recognize all markup. */ 44 SGML_ELEMENT, /* Any data found should be regarded as an error. 45 (But we treat it just like SGML_MIXED.) */ 46 SGML_PCDATA /* Should contain no elements but &ref; is parsed. 47 (We treat it like SGML_CDATA wrt. contained tags 48 i.e. pass them on literally, i.e. like we should 49 treat SGML_RCDATA) (added by KW). */ 50 } SGMLContent; 51 52 typedef struct { 53 const char *name; /* The name of the attribute */ 54 #ifdef USE_PRETTYSRC 55 char type; /* code of the type of the attribute. Code 56 values are in HTMLDTD.h */ 57 #endif 58 } attr; 59 60 typedef const attr *AttrList; 61 62 typedef struct { 63 const char *name; 64 AttrList list; 65 } AttrType; 66 67 typedef int TagClass; 68 69 /* textflow */ 70 #define Tgc_FONTlike 0x00001 /* S,STRIKE,I,B,TT,U,BIG,SMALL,STYLE,BLINK;BR,TAB */ 71 #define Tgc_EMlike 0x00002 /* EM,STRONG,DFN,CODE,SAMP,KBD,VAR,CITE,Q,INS,DEL,SPAN,.. */ 72 #define Tgc_MATHlike 0x00004 /* SUB,SUP,MATH,COMMENT */ 73 #define Tgc_Alike 0x00008 /* A */ 74 #define Tgc_formula 0x00010 /* not used until math is supported better... */ 75 /* used for special structures: forms, tables,... */ 76 #define Tgc_TRlike 0x00020 /* TR and similar */ 77 #define Tgc_SELECTlike 0x00040 /* SELECT,INPUT,TEXTAREA(,...) */ 78 /* structure */ 79 #define Tgc_FORMlike 0x00080 /* FORM itself */ 80 #define Tgc_Plike 0x00100 /* P,H1..H6,... structures containing text or 81 insertion but not other structures */ 82 #define Tgc_DIVlike 0x00200 /* ADDRESS,FIG,BDO,NOTE,FN,DIV,CENTER;FIG 83 structures which can contain other structures */ 84 #define Tgc_LIlike 0x00400 /* LH,LI,DT,DD;TH,TD structure-like, only valid 85 within certain other structures */ 86 #define Tgc_ULlike 0x00800 /* UL,OL,DL,DIR,MENU;TABLE;XMP,LISTING 87 special in some way, cannot contain (parsed) 88 text directly */ 89 /* insertions */ 90 #define Tgc_BRlike 0x01000 /* BR,IMG,TAB allowed in any text */ 91 #define Tgc_APPLETlike 0x02000 /* APPLET,OBJECT,EMBED,SCRIPT;BUTTON */ 92 #define Tgc_HRlike 0x04000 /* HR,MARQUEE can contain all kinds of things 93 and/or are not allowed (?) in running text */ 94 #define Tgc_MAPlike 0x08000 /* MAP,AREA some specials that never contain 95 (directly or indirectly) other things than 96 special insertions */ 97 #define Tgc_outer 0x10000 /* HTML,FRAMESET,FRAME,PLAINTEXT; */ 98 #define Tgc_BODYlike 0x20000 /* BODY,BODYTEXT,NOFRAMES,TEXTFLOW; */ 99 #define Tgc_HEADstuff 0x40000 /* HEAD,BASE,STYLE,TITLE; */ 100 /* special relations */ 101 #define Tgc_same 0x80000 102 103 /* 104 * Groups for contains-data. 105 */ 106 #define Tgc_INLINElike (Tgc_Alike | Tgc_APPLETlike | Tgc_BRlike | Tgc_EMlike | Tgc_FONTlike | Tgc_SELECTlike) 107 #define Tgc_LISTlike (Tgc_LIlike | Tgc_ULlike) 108 #define Tgc_BLOCKlike (Tgc_DIVlike | Tgc_LISTlike) 109 110 /* Some more properties of tags (or rather, elements) and rules how 111 to deal with them. - kw */ 112 typedef int TagFlags; 113 114 #define Tgf_endO 0x00001 /* end tag can be Omitted */ 115 #define Tgf_startO 0x00002 /* start tag can be Omitted */ 116 #define Tgf_mafse 0x00004 /* Make Attribute-Free Start-tag End instead 117 (if found invalid) */ 118 #define Tgf_strict 0x00008 /* Ignore contained invalid elements, 119 don't pass them on; or other variant 120 handling for some content types */ 121 #define Tgf_nreie 0x00010 /* Not Really Empty If Empty, 122 used by color style code */ 123 #define Tgf_frecyc 0x00020 /* Pass element content on in a form that 124 allows recycling, i.e. don't translate to 125 output (display) character set yet (treat 126 content similar to attribute values) */ 127 #define Tgf_nolyspcl 0x00040 /* Don't generate lynx special characters 128 for soft hyphen and various spaces (nbsp, 129 ensp,..) */ 130 131 /* A tag structure describes an SGML element. 132 * ----------------------------------------- 133 * 134 * 135 * name is the string which comes after the tag opener "<". 136 * 137 * attributes points to a zero-terminated array 138 * of attribute names. 139 * 140 * litteral determines how the SGML engine parses the characters 141 * within the element. If set, tag openers are ignored 142 * except for that which opens a matching closing tag. 143 * 144 */ 145 typedef struct _tag HTTag; 146 struct _tag { 147 const char *name; /* The name of the tag */ 148 #ifdef USE_COLOR_STYLE 149 unsigned name_len; /* The length of the name */ 150 #endif 151 #ifdef USE_JUSTIFY_ELTS 152 BOOL can_justify; /* justification allowed? */ 153 #endif 154 AttrList attributes; /* The list of acceptable attributes */ 155 int number_of_attributes; /* Number of possible attributes */ 156 const AttrType *attr_types; 157 SGMLContent contents; /* End only on end tag @@ */ 158 TagClass tagclass; 159 TagClass contains; /* which classes of elements this one can contain directly */ 160 TagClass icontains; /* which classes of elements this one can contain indirectly */ 161 TagClass contained; /* in which classes can this tag be contained ? */ 162 TagClass icontained; /* in which classes can this tag be indirectly contained ? */ 163 TagClass canclose; /* which classes of elements can this one close 164 if something looks wrong ? */ 165 TagFlags flags; 166 }; 167 168 /* DTD Information 169 * --------------- 170 * 171 * Not the whole DTD, but all this parser uses of it. 172 */ 173 typedef struct { 174 HTTag *tags; /* Must be in strcmp order by name */ 175 int number_of_tags; 176 STRING2PTR entity_names; /* Must be in strcmp order by name */ 177 size_t number_of_entities; 178 /* "entity_names" table probably unused, 179 * see comments in HTMLDTD.c near the top 180 */ 181 } SGML_dtd; 182 183 /* SGML context passed to parsers 184 */ 185 typedef struct _HTSGMLContext *HTSGMLContext; /* Hidden */ 186 187 /*__________________________________________________________________________ 188 */ 189 190 /* 191 192 Structured Object definition 193 194 A structured object is something which can reasonably be represented 195 in SGML. I'll rephrase that. A structured object is an ordered 196 tree-structured arrangement of data which is representable as text. 197 The SGML parser outputs to a Structured object. A Structured object 198 can output its contents to another Structured Object. It's a kind of 199 typed stream. The architecture is largely Dan Conolly's. Elements and 200 entities are passed to the sob by number, implying a knowledge of the 201 DTD. Knowledge of the SGML syntax is not here, though. 202 203 Superclass: HTStream 204 205 The creation methods will vary on the type of Structured Object. 206 Maybe the callerData is enough info to pass along. 207 208 */ 209 typedef struct _HTStructured HTStructured; 210 211 typedef struct _HTStructuredClass { 212 213 const char *name; /* Just for diagnostics */ 214 215 void (*_free) (HTStructured * me); 216 217 void (*_abort) (HTStructured * me, HTError e); 218 219 void (*put_character) (HTStructured * me, int ch); 220 221 void (*put_string) (HTStructured * me, const char *str); 222 223 void (*put_block) (HTStructured * me, const char *str, int len); 224 225 /* HTStreamClass ends here */ 226 227 int (*start_element) (HTStructured * me, int element_number, 228 const BOOL *attribute_present, 229 STRING2PTR attribute_value, 230 int charset, 231 char **include); 232 233 int (*end_element) (HTStructured * me, int element_number, 234 char **include); 235 236 int (*put_entity) (HTStructured * me, int entity_number); 237 238 } HTStructuredClass; 239 240 /* 241 Equivalents to the following functions possibly could be generalised 242 into additional HTStructuredClass members. For now they don't do 243 anything target-specific. - kw 244 */ 245 extern BOOLEAN LYCheckForCSI(HTParentAnchor *anchor, char **url); 246 extern void LYDoCSI(char *url, const char *comment, char **csi); 247 extern BOOLEAN LYCommentHacks(HTParentAnchor *anchor, const char *comment); 248 249 /* 250 251 Find a Tag by Name 252 253 Returns a pointer to the tag within the DTD. 254 255 */ 256 extern HTTag *SGMLFindTag(const SGML_dtd * dtd, 257 const char *string); 258 259 /* 260 * Return the current offset within the file that SGML is parsing 261 */ 262 extern int SGML_offset(void); 263 264 /* 265 266 Create an SGML parser 267 268 */ 269 /* 270 * On entry, 271 * dtd must point to a DTD structure as defined above 272 * callbacks must point to user routines. 273 * callData is returned in callbacks transparently. 274 * On exit, 275 * The default tag starter has been processed. 276 */ 277 extern HTStream *SGML_new(const SGML_dtd * dtd, 278 HTParentAnchor *anchor, 279 HTStructured * target); 280 281 extern const HTStreamClass SGMLParser; 282 283 #ifdef __cplusplus 284 } 285 #endif 286 #endif /* SGML_H */ 287