1 /*
2  * $LynxId: SGML.h,v 1.46 2012/02/10 18:32:26 tom Exp $
3  *			       SGML parse and stream definition for libwww
4  *                             SGML AND STRUCTURED STREAMS
5  *
6  * The SGML parser is a state machine.	It is called for every character
7  * of the input stream.	 The DTD data structure contains pointers
8  * to functions which are called to implement the actual effect of the
9  * text read. When these functions are called, the attribute structures pointed to by the
10  * DTD are valid, and the function is passed a pointer to the current tag structure, and an
11  * "element stack" which represents the state of nesting within SGML elements.
12  *
13  * The following aspects are from Dan Connolly's suggestions:  Binary search,
14  * Structured object scheme basically, SGML content enum type.
15  *
16  * (c) Copyright CERN 1991 - See Copyright.html
17  *
18  */
19 #ifndef SGML_H
20 #define SGML_H
21 
22 #include <HTStream.h>
23 #include <HTAnchor.h>
24 #include <LYJustify.h>
25 
26 #ifdef __cplusplus
27 extern "C" {
28 #endif
29 /*
30  *
31  * SGML content types
32  *
33  */ typedef enum {
34 	SGML_EMPTY,		/* No content. */
35 	SGML_LITTERAL,		/* Literal character data.  Recognize exact close tag only.
36 				   Old www server compatibility only!  Not SGML */
37 	SGML_CDATA,		/* Character data.  Recognize </ only.
38 				   (But we treat it just as SGML_LITTERAL.) */
39 	SGML_SCRIPT,		/* Like CDATA, but allow it to be a comment */
40 	SGML_RCDATA,		/* Replaceable character data. Should recognize </ and &ref;
41 				   (but we treat it like SGML_MIXED for old times' sake). */
42 	SGML_MIXED,		/* Elements and parsed character data.
43 				   Recognize all markup. */
44 	SGML_ELEMENT,		/* Any data found should be regarded as an error.
45 				   (But we treat it just like SGML_MIXED.) */
46 	SGML_PCDATA		/* Should contain no elements but &ref; is parsed.
47 				   (We treat it like SGML_CDATA wrt. contained tags
48 				   i.e. pass them on literally, i.e. like we should
49 				   treat SGML_RCDATA) (added by KW). */
50     } SGMLContent;
51 
52     typedef struct {
53 	const char *name;	/* The name of the attribute */
54 #ifdef USE_PRETTYSRC
55 	char type;		/* code of the type of the attribute. Code
56 				   values are in HTMLDTD.h */
57 #endif
58     } attr;
59 
60     typedef const attr *AttrList;
61 
62     typedef struct {
63 	const char *name;
64 	AttrList list;
65     } AttrType;
66 
67     typedef int TagClass;
68 
69     /* textflow */
70 #define Tgc_FONTlike	0x00001	/* S,STRIKE,I,B,TT,U,BIG,SMALL,STYLE,BLINK;BR,TAB */
71 #define Tgc_EMlike	0x00002	/* EM,STRONG,DFN,CODE,SAMP,KBD,VAR,CITE,Q,INS,DEL,SPAN,.. */
72 #define Tgc_MATHlike	0x00004	/* SUB,SUP,MATH,COMMENT */
73 #define Tgc_Alike	0x00008	/* A */
74 #define Tgc_formula	0x00010	/* not used until math is supported better... */
75     /* used for special structures: forms, tables,... */
76 #define Tgc_TRlike	0x00020	/* TR and similar */
77 #define Tgc_SELECTlike	0x00040	/* SELECT,INPUT,TEXTAREA(,...) */
78     /* structure */
79 #define Tgc_FORMlike	0x00080	/* FORM itself */
80 #define Tgc_Plike	0x00100	/* P,H1..H6,... structures containing text or
81 				   insertion but not other structures */
82 #define Tgc_DIVlike	0x00200	/* ADDRESS,FIG,BDO,NOTE,FN,DIV,CENTER;FIG
83 				   structures which can contain other structures */
84 #define Tgc_LIlike	0x00400	/* LH,LI,DT,DD;TH,TD structure-like, only valid
85 				   within certain other structures */
86 #define Tgc_ULlike	0x00800	/* UL,OL,DL,DIR,MENU;TABLE;XMP,LISTING
87 				   special in some way, cannot contain (parsed)
88 				   text directly */
89     /* insertions */
90 #define Tgc_BRlike	0x01000	/* BR,IMG,TAB allowed in any text */
91 #define Tgc_APPLETlike	0x02000	/* APPLET,OBJECT,EMBED,SCRIPT;BUTTON */
92 #define Tgc_HRlike	0x04000	/* HR,MARQUEE can contain all kinds of things
93 				   and/or are not allowed (?) in running text */
94 #define Tgc_MAPlike	0x08000	/* MAP,AREA some specials that never contain
95 				   (directly or indirectly) other things than
96 				   special insertions */
97 #define Tgc_outer	0x10000	/* HTML,FRAMESET,FRAME,PLAINTEXT; */
98 #define Tgc_BODYlike	0x20000	/* BODY,BODYTEXT,NOFRAMES,TEXTFLOW; */
99 #define Tgc_HEADstuff	0x40000	/* HEAD,BASE,STYLE,TITLE; */
100     /* special relations */
101 #define Tgc_same	0x80000
102 
103 /*
104  * Groups for contains-data.
105  */
106 #define Tgc_INLINElike	(Tgc_Alike | Tgc_APPLETlike | Tgc_BRlike | Tgc_EMlike | Tgc_FONTlike | Tgc_SELECTlike)
107 #define Tgc_LISTlike	(Tgc_LIlike | Tgc_ULlike)
108 #define Tgc_BLOCKlike	(Tgc_DIVlike | Tgc_LISTlike)
109 
110 /* Some more properties of tags (or rather, elements) and rules how
111    to deal with them. - kw */
112     typedef int TagFlags;
113 
114 #define Tgf_endO	0x00001	/* end tag can be Omitted */
115 #define Tgf_startO	0x00002	/* start tag can be Omitted */
116 #define Tgf_mafse	0x00004	/* Make Attribute-Free Start-tag End instead
117 				   (if found invalid) */
118 #define Tgf_strict	0x00008	/* Ignore contained invalid elements,
119 				   don't pass them on; or other variant
120 				   handling for some content types */
121 #define Tgf_nreie	0x00010	/* Not Really Empty If Empty,
122 				   used by color style code */
123 #define Tgf_frecyc	0x00020	/* Pass element content on in a form that
124 				   allows recycling, i.e. don't translate to
125 				   output (display) character set yet (treat
126 				   content similar to attribute values) */
127 #define Tgf_nolyspcl	0x00040	/* Don't generate lynx special characters
128 				   for soft hyphen and various spaces (nbsp,
129 				   ensp,..) */
130 
131 /*		A tag structure describes an SGML element.
132  *		-----------------------------------------
133  *
134  *
135  *	name		is the string which comes after the tag opener "<".
136  *
137  *	attributes	points to a zero-terminated array
138  *			of attribute names.
139  *
140  *	litteral	determines how the SGML engine parses the characters
141  *			within the element.  If set, tag openers are ignored
142  *			except for that which opens a matching closing tag.
143  *
144  */
145     typedef struct _tag HTTag;
146     struct _tag {
147 	const char *name;	/* The name of the tag */
148 #ifdef USE_COLOR_STYLE
149 	unsigned name_len;	/* The length of the name */
150 #endif
151 #ifdef USE_JUSTIFY_ELTS
152 	BOOL can_justify;	/* justification allowed? */
153 #endif
154 	AttrList attributes;	/* The list of acceptable attributes */
155 	int number_of_attributes;	/* Number of possible attributes */
156 	const AttrType *attr_types;
157 	SGMLContent contents;	/* End only on end tag @@ */
158 	TagClass tagclass;
159 	TagClass contains;	/* which classes of elements this one can contain directly */
160 	TagClass icontains;	/* which classes of elements this one can contain indirectly */
161 	TagClass contained;	/* in which classes can this tag be contained ? */
162 	TagClass icontained;	/* in which classes can this tag be indirectly contained ? */
163 	TagClass canclose;	/* which classes of elements can this one close
164 				   if something looks wrong ? */
165 	TagFlags flags;
166     };
167 
168 /*		DTD Information
169  *		---------------
170  *
171  *  Not the whole DTD, but all this parser uses of it.
172  */
173     typedef struct {
174 	HTTag *tags;		/* Must be in strcmp order by name */
175 	int number_of_tags;
176 	STRING2PTR entity_names;	/* Must be in strcmp order by name */
177 	size_t number_of_entities;
178 	/*  "entity_names" table probably unused,
179 	 *  see comments in HTMLDTD.c near the top
180 	 */
181     } SGML_dtd;
182 
183 /*	SGML context passed to parsers
184 */
185     typedef struct _HTSGMLContext *HTSGMLContext;	/* Hidden */
186 
187 /*__________________________________________________________________________
188 */
189 
190 /*
191 
192 Structured Object definition
193 
194    A structured object is something which can reasonably be represented
195    in SGML.  I'll rephrase that.  A structured object is an ordered
196    tree-structured arrangement of data which is representable as text.
197    The SGML parser outputs to a Structured object.  A Structured object
198    can output its contents to another Structured Object.  It's a kind of
199    typed stream.  The architecture is largely Dan Conolly's.  Elements and
200    entities are passed to the sob by number, implying a knowledge of the
201    DTD.	 Knowledge of the SGML syntax is not here, though.
202 
203    Superclass: HTStream
204 
205    The creation methods will vary on the type of Structured Object.
206    Maybe the callerData is enough info to pass along.
207 
208  */
209     typedef struct _HTStructured HTStructured;
210 
211     typedef struct _HTStructuredClass {
212 
213 	const char *name;	/* Just for diagnostics */
214 
215 	void (*_free) (HTStructured * me);
216 
217 	void (*_abort) (HTStructured * me, HTError e);
218 
219 	void (*put_character) (HTStructured * me, int ch);
220 
221 	void (*put_string) (HTStructured * me, const char *str);
222 
223 	void (*put_block) (HTStructured * me, const char *str, int len);
224 
225 	/* HTStreamClass ends here */
226 
227 	int (*start_element) (HTStructured * me, int element_number,
228 			      const BOOL *attribute_present,
229 			      STRING2PTR attribute_value,
230 			      int charset,
231 			      char **include);
232 
233 	int (*end_element) (HTStructured * me, int element_number,
234 			    char **include);
235 
236 	int (*put_entity) (HTStructured * me, int entity_number);
237 
238     } HTStructuredClass;
239 
240 /*
241   Equivalents to the following functions possibly could be generalised
242   into additional HTStructuredClass members.  For now they don't do
243   anything target-specific. - kw
244   */
245     extern BOOLEAN LYCheckForCSI(HTParentAnchor *anchor, char **url);
246     extern void LYDoCSI(char *url, const char *comment, char **csi);
247     extern BOOLEAN LYCommentHacks(HTParentAnchor *anchor, const char *comment);
248 
249 /*
250 
251 Find a Tag by Name
252 
253    Returns a pointer to the tag within the DTD.
254 
255  */
256     extern HTTag *SGMLFindTag(const SGML_dtd * dtd,
257 			      const char *string);
258 
259 /*
260  * Return the current offset within the file that SGML is parsing
261  */
262     extern int SGML_offset(void);
263 
264 /*
265 
266 Create an SGML parser
267 
268  */
269 /*
270  * On entry,
271  *	dtd		must point to a DTD structure as defined above
272  *	callbacks	must point to user routines.
273  *	callData	is returned in callbacks transparently.
274  * On exit,
275  *		The default tag starter has been processed.
276  */
277     extern HTStream *SGML_new(const SGML_dtd * dtd,
278 			      HTParentAnchor *anchor,
279 			      HTStructured * target);
280 
281     extern const HTStreamClass SGMLParser;
282 
283 #ifdef __cplusplus
284 }
285 #endif
286 #endif				/* SGML_H */
287