1 /*
2  * $LynxId: SGML.h,v 1.47 2021/07/22 23:34:13 tom Exp $
3  *			       SGML parse and stream definition for libwww
4  *                             SGML AND STRUCTURED STREAMS
5  *
6  * The SGML parser is a state machine.	It is called for every character
7  * of the input stream.	 The DTD data structure contains pointers
8  * to functions which are called to implement the actual effect of the
9  * text read. When these functions are called, the attribute structures pointed to by the
10  * DTD are valid, and the function is passed a pointer to the current tag structure, and an
11  * "element stack" which represents the state of nesting within SGML elements.
12  *
13  * The following aspects are from Dan Connolly's suggestions:  Binary search,
14  * Structured object scheme basically, SGML content enum type.
15  *
16  * (c) Copyright CERN 1991 - See Copyright.html
17  *
18  */
19 #ifndef SGML_H
20 #define SGML_H
21 
22 #include <HTStream.h>
23 #include <HTAnchor.h>
24 #include <LYJustify.h>
25 
26 #ifdef __cplusplus
27 extern "C" {
28 #endif
29 /*
30  *
31  * SGML content types
32  * (the content model kept in the "contents" member of HTTag)
33  */ typedef enum {
34 	SGML_EMPTY,		/* No content. */
35 	SGML_LITTERAL,		/* Literal character data.  Recognize exact close tag only.
36 				   Old www server compatibility only!  Not SGML */
37 	SGML_CDATA,		/* Character data.  Recognize </ only.
38 				   (But we treat it just as SGML_LITTERAL.) */
39 	SGML_SCRIPT,		/* Like CDATA, but allow it to be a comment */
40 	SGML_RCDATA,		/* Replaceable character data. Should recognize </ and &ref;
41 				   (but we treat it like SGML_MIXED for old times' sake). */
42 	SGML_MIXED,		/* Elements and parsed character data.
43 				   Recognize all markup. */
44 	SGML_ELEMENT,		/* Any data found should be regarded as an error.
45 				   (But we treat it just like SGML_MIXED.) */
46 	SGML_PCDATA		/* Should contain no elements but &ref; is parsed.
47 				   (We treat it like SGML_CDATA wrt. contained tags
48 				   i.e. pass them on literally, i.e. like we should
49 				   treat SGML_RCDATA) (added by KW). */
50     } SGMLContent;
51 
52     typedef struct {		/* describes one attribute an element may carry */
53 	const char *name;	/* The name of the attribute */
54 #ifdef USE_PRETTYSRC
55 	char type;		/* code of the type of the attribute. Code
56 				   values are in HTMLDTD.h (member is present only when USE_PRETTYSRC is defined) */
57 #endif
58     } attr;
59 
60     typedef const attr *AttrList;	/* pointer to read-only attr entries */
61 
62     typedef struct {		/* pairs a name with an attribute list */
63 	const char *name;	/* NOTE(review): what is being named is not stated here -- confirm in HTMLDTD.c */
64 	AttrList list;		/* the attribute entries that go with that name */
65     } AttrType;
66 
67     typedef int TagClass;	/* bit mask built from the Tgc_* values below */
68 
69     /* textflow */
70 #define Tgc_FONTlike	0x00001	/* S,STRIKE,I,B,TT,U,BIG,SMALL,STYLE,BLINK;BR,TAB */
71 #define Tgc_EMlike	0x00002	/* EM,STRONG,DFN,CODE,SAMP,KBD,VAR,CITE,Q,INS,DEL,SPAN,.. */
72 #define Tgc_MATHlike	0x00004	/* SUB,SUP,MATH,COMMENT */
73 #define Tgc_Alike	0x00008	/* A */
74 #define Tgc_formula	0x00010	/* not used until math is supported better... */
75     /* used for special structures: forms, tables,... */
76 #define Tgc_TRlike	0x00020	/* TR and similar */
77 #define Tgc_SELECTlike	0x00040	/* SELECT,INPUT,TEXTAREA(,...) */
78     /* structure */
79 #define Tgc_FORMlike	0x00080	/* FORM itself */
80 #define Tgc_Plike	0x00100	/* P,H1..H6,... structures containing text or
81 				   insertion but not other structures */
82 #define Tgc_DIVlike	0x00200	/* ADDRESS,FIG,BDO,NOTE,FN,DIV,CENTER;FIG
83 				   structures which can contain other structures */
84 #define Tgc_LIlike	0x00400	/* LH,LI,DT,DD;TH,TD structure-like, only valid
85 				   within certain other structures */
86 #define Tgc_ULlike	0x00800	/* UL,OL,DL,DIR,MENU;TABLE;XMP,LISTING
87 				   special in some way, cannot contain (parsed)
88 				   text directly */
89     /* insertions */
90 #define Tgc_BRlike	0x01000	/* BR,IMG,TAB allowed in any text */
91 #define Tgc_APPLETlike	0x02000	/* APPLET,OBJECT,EMBED,SCRIPT;BUTTON */
92 #define Tgc_HRlike	0x04000	/* HR,MARQUEE can contain all kinds of things
93 				   and/or are not allowed (?) in running text */
94 #define Tgc_MAPlike	0x08000	/* MAP,AREA some specials that never contain
95 				   (directly or indirectly) other things than
96 				   special insertions */
97 #define Tgc_outer	0x10000	/* HTML,FRAMESET,FRAME,PLAINTEXT; */
98 #define Tgc_BODYlike	0x20000	/* BODY,BODYTEXT,NOFRAMES,TEXTFLOW; */
99 #define Tgc_HEADstuff	0x40000	/* HEAD,BASE,STYLE,TITLE; */
100     /* special relations */
101 #define Tgc_same	0x80000	/* NOTE(review): meaning is not documented in this header */
102 #define Tgc_DELlike	0x100000
103     /* DELlike is a class of aliases for inline DEL/INS */
104     typedef unsigned char TagAlias;	/* type of the alias/aliases members of HTTag */
105 
106 /*
107  * Groups for contains-data: each is the bitwise OR of several Tgc_* class bits.
108  */
109 #define Tgc_INLINElike	(Tgc_Alike | Tgc_APPLETlike | Tgc_BRlike | Tgc_EMlike | Tgc_FONTlike | Tgc_SELECTlike)
110 #define Tgc_LISTlike	(Tgc_LIlike | Tgc_ULlike)
111 #define Tgc_BLOCKlike	(Tgc_DIVlike | Tgc_LISTlike)
112 
113 /* Some more properties of tags (or rather, elements) and rules how
114    to deal with them; kept in the "flags" member of HTTag. - kw */
115     typedef int TagFlags;	/* bit mask built from the Tgf_* values below */
116 
117 #define Tgf_endO	0x00001	/* end tag can be Omitted */
118 #define Tgf_startO	0x00002	/* start tag can be Omitted */
119 #define Tgf_mafse	0x00004	/* Make Attribute-Free Start-tag End instead
120 				   (if found invalid) */
121 #define Tgf_strict	0x00008	/* Ignore contained invalid elements,
122 				   don't pass them on; or other variant
123 				   handling for some content types */
124 #define Tgf_nreie	0x00010	/* Not Really Empty If Empty,
125 				   used by color style code */
126 #define Tgf_frecyc	0x00020	/* Pass element content on in a form that
127 				   allows recycling, i.e. don't translate to
128 				   output (display) character set yet (treat
129 				   content similar to attribute values) */
130 #define Tgf_nolyspcl	0x00040	/* Don't generate lynx special characters
131 				   for soft hyphen and various spaces (nbsp,
132 				   ensp,..) */
133 
134 /*		A tag structure describes an SGML element.
135  *		-----------------------------------------
136  *
137  *
138  *	name		is the string which comes after the tag opener "<".
139  *
140  *	attributes	points to a zero-terminated array
141  *			of attribute names.
142  *
143  *	litteral	NOTE(review): no member of this name exists below; presumably
144  *			this is now "contents", which selects how the SGML engine
145  *			parses the characters within the element (see SGMLContent).
146  *	The remaining members are commented inline in the struct itself.
147  */
148     typedef struct _tag HTTag;
149     struct _tag {
150 	const char *name;	/* The name of the tag */
151 #ifdef USE_COLOR_STYLE
152 	unsigned name_len;	/* The length of the name (only with USE_COLOR_STYLE) */
153 #endif
154 #ifdef USE_JUSTIFY_ELTS
155 	BOOL can_justify;	/* justification allowed? (only with USE_JUSTIFY_ELTS) */
156 #endif
157 	AttrList attributes;	/* The list of acceptable attributes */
158 	int number_of_attributes;	/* Number of possible attributes */
159 	const AttrType *attr_types;
160 	SGMLContent contents;	/* End only on end tag @@ */
161 	TagClass tagclass;
162 	TagClass contains;	/* which classes of elements this one can contain directly */
163 	TagClass icontains;	/* which classes of elements this one can contain indirectly */
164 	TagClass contained;	/* in which classes can this tag be contained ? */
165 	TagClass icontained;	/* in which classes can this tag be indirectly contained ? */
166 	TagClass canclose;	/* which classes of elements can this one close
167 				   if something looks wrong ? */
168 	TagFlags flags;
169 	TagAlias alias;		/* extra levels, e.g., DEL/INS */
170 	TagAlias aliases;	/* number of extra levels, e.g., DEL/INS */
171     };
172 
173 /*		DTD Information
174  *		---------------
175  *
176  *  Not the whole DTD, but all this parser uses of it.
177  */
178     typedef struct {
179 	HTTag *tags;		/* Must be in strcmp order by name */
180 	int number_of_tags;	/* NOTE(review): presumably the entry count of "tags" */
181 	STRING2PTR entity_names;	/* Must be in strcmp order by name */
182 	size_t number_of_entities;	/* NOTE(review): presumably the entry count of "entity_names" */
183 	/*  "entity_names" table probably unused,
184 	 *  see comments in HTMLDTD.c near the top
185 	 */
186     } SGML_dtd;
187 
188 /*	SGML context passed to parsers.  Only the pointer type is declared
189 	here; struct _HTSGMLContext itself is not defined in this header. */
190     typedef struct _HTSGMLContext *HTSGMLContext;	/* Hidden */
191 
192 /*__________________________________________________________________________
193 */
194 
195 /*
196 
197 Structured Object definition
198 
199    A structured object is something which can reasonably be represented
200    in SGML.  I'll rephrase that.  A structured object is an ordered
201    tree-structured arrangement of data which is representable as text.
202    The SGML parser outputs to a Structured object.  A Structured object
203    can output its contents to another Structured Object.  It's a kind of
204    typed stream.  The architecture is largely Dan Connolly's.  Elements and
205    entities are passed to the sob by number, implying a knowledge of the
206    DTD.	 Knowledge of the SGML syntax is not here, though.
207 
208    Superclass: HTStream
209 
210    The creation methods will vary on the type of Structured Object.
211    Maybe the callerData is enough info to pass along.
212 
213  */
214     typedef struct _HTStructured HTStructured;	/* incomplete type in this header */
215 
216     typedef struct _HTStructuredClass {
217 	/* Table of function pointers; every one takes the object as "me". */
218 	const char *name;	/* Just for diagnostics */
219 
220 	void (*_free) (HTStructured * me);
221 
222 	void (*_abort) (HTStructured * me, HTError e);
223 	/* Text can be handed over as one character, a string, or str plus len. */
224 	void (*put_character) (HTStructured * me, int ch);
225 
226 	void (*put_string) (HTStructured * me, const char *str);
227 
228 	void (*put_block) (HTStructured * me, const char *str, int len);
229 
230 	/* HTStreamClass ends here */
231 	/* The members below are in addition to those before the marker above. */
232 	int (*start_element) (HTStructured * me, int element_number,
233 			      const BOOL *attribute_present,
234 			      STRING2PTR attribute_value,
235 			      int charset,
236 			      char **include);
237 	/* NOTE(review): the meaning of the int results and of "include" is not stated here. */
238 	int (*end_element) (HTStructured * me, int element_number,
239 			    char **include);
240 	/* Elements and entities are identified by number, not by name. */
241 	int (*put_entity) (HTStructured * me, int entity_number);
242 
243     } HTStructuredClass;
244 
245 /*
246   Declarations only; the definitions are not in this header.  Equivalents to
247   the following functions possibly could be generalised into additional
248   HTStructuredClass members.  For now they don't do anything target-specific. - kw
249   */
250     extern BOOLEAN LYCheckForCSI(HTParentAnchor *anchor, char **url);
251     extern void LYDoCSI(char *url, const char *comment, char **csi);
252     extern BOOLEAN LYCommentHacks(HTParentAnchor *anchor, const char *comment);
253 
254 /*
255 
256 Find a Tag by Name
257 
258    Returns a pointer to the tag within the DTD.
259    NOTE(review): the result for a name that is not found is not stated here -- confirm in SGML.c.
260  */
261     extern HTTag *SGMLFindTag(const SGML_dtd * dtd,
262 			      const char *string);
263 
264 /*
265  * Return the current offset within the file that SGML is parsing.
266  * NOTE(review): no parser argument is taken; which stream is meant is not visible here. */
267     extern int SGML_offset(void);
268 
269 /*
270 
271 Create an SGML parser
272    (returns an HTStream pointer; the parameters are dtd, anchor and target)
273  */
274 /*
275  * On entry,
276  *	dtd		must point to a DTD structure as defined above
277  *	anchor, target	NOTE(review): the original text here described "callbacks" (must
278  *			point to user routines) and "callData"; neither is a parameter now.
279  * On exit,
280  *		The default tag starter has been processed.
281  */
282     extern HTStream *SGML_new(const SGML_dtd * dtd,
283 			      HTParentAnchor *anchor,
284 			      HTStructured * target);
285 
286     extern const HTStreamClass SGMLParser;	/* declared here, defined outside this header */
287 
288 #ifdef __cplusplus
289 }
290 #endif
291 #endif				/* SGML_H */
292