1 #ifndef __LEXER_H__
2 #define __LEXER_H__
3 
4 /* lexer.h -- Lexer for html parser
5 
6    (c) 1998-2008 (W3C) MIT, ERCIM, Keio University
7    See tidy.h for the copyright notice.
8 
9    CVS Info:
10     $Author: arnaud02 $
11     $Date: 2008/03/22 21:06:11 $
12     $Revision: 1.41 $
13 
14 */
15 
16 /*
17   Given an input source, it returns a sequence of tokens.
18 
19      GetToken(source) gets the next token
20      UngetToken(source) provides one level undo
21 
22   The tags include an attribute list:
23 
24     - linked list of attribute/value nodes
25     - each node has 2 NULL-terminated strings.
26     - entities are replaced in attribute values
27 
28   white space is compacted if not in preformatted mode
29   If not in preformatted mode then leading white space
30   is discarded and subsequent white space sequences
31   compacted to single space characters.
32 
33   If XmlTags is no then Tag names are folded to upper
34   case and attribute names to lower case.
35 
36  Not yet done:
37     -   Doctype subset and marked sections
38 */
39 
40 #ifdef __cplusplus
41 extern "C" {
42 #endif
43 
44 #include "forward.h"
45 
/* Lexer character-class flags.

   Every value is a distinct single bit of an unsigned mask, so the
   classes of one character can be OR-ed together and tested with a
   bitwise AND.
*/
#define digit       1u
#define letter      2u
#define namechar    4u
#define white       8u
#define newline     16u
#define lowercase   32u
#define uppercase   64u
#define digithex    128u
56 
57 
/* Kind of a parse tree node: node->type is one of these values.

   The enumerators are numbered implicitly from 0 in declaration order;
   keep the order stable.
*/
typedef enum
{
  RootNode,
  DocTypeTag,
  CommentTag,
  ProcInsTag,
  TextNode,
  StartTag,
  EndTag,
  StartEndTag,
  CDATATag,
  SectionTag,
  AspTag,
  JsteTag,
  PhpTag,
  XmlDecl
} NodeType;
77 
78 
79 
/* States of the GetToken finite state machine (see Lexer.state).

   The enumerators are numbered implicitly from 0 in declaration order.
*/
typedef enum
{
  LEX_CONTENT,
  LEX_GT,
  LEX_ENDTAG,
  LEX_STARTTAG,
  LEX_COMMENT,
  LEX_DOCTYPE,
  LEX_PROCINSTR,
  LEX_CDATA,
  LEX_SECTION,
  LEX_ASP,
  LEX_JSTE,
  LEX_PHP,
  LEX_XMLDECL
} LexerState;
98 
/* State constants used by ParseDocTypeDecl.

   The enumerators are numbered implicitly from 0 in declaration order.
*/
typedef enum
{
  DT_INTERMEDIATE,
  DT_DOCTYPENAME,
  DT_PUBLICSYSTEM,
  DT_QUOTEDSTRING,
  DT_INTSUBSET
} ParseDocTypeDeclState;
108 
/* Content model shortcut encoding.

   CM_UNKNOWN is the empty mask; every other CM_* value is a distinct
   single bit, so several of them can be OR-ed into one content model.

   Descriptions are tentative.
*/
#define CM_UNKNOWN      0
/* Elements with no content. Map to HTML specification. */
#define CM_EMPTY        (1 << 0)
/* Elements that appear outside of "BODY". */
#define CM_HTML         (1 << 1)
/* Elements that can appear within HEAD. */
#define CM_HEAD         (1 << 2)
/* HTML "block" elements. */
#define CM_BLOCK        (1 << 3)
/* HTML "inline" elements. */
#define CM_INLINE       (1 << 4)
/* Elements that mark a list item ("LI"). */
#define CM_LIST         (1 << 5)
/* Elements that mark a definition list item ("DL", "DT"). */
#define CM_DEFLIST      (1 << 6)
/* Elements that can appear inside TABLE. */
#define CM_TABLE        (1 << 7)
/* Used for "THEAD", "TFOOT" or "TBODY". */
#define CM_ROWGRP       (1 << 8)
/* Used for "TD", "TH". */
#define CM_ROW          (1 << 9)
/* Elements whose content must be protected against white space movement.
   Includes some elements that can be found in forms. */
#define CM_FIELD        (1 << 10)
/* Used to avoid propagating inline emphasis inside some elements
   such as OBJECT or APPLET. */
#define CM_OBJECT       (1 << 11)
/* Elements that allow "PARAM". */
#define CM_PARAM        (1 << 12)
/* "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */
#define CM_FRAMES       (1 << 13)
/* Heading elements (h1, h2, ...). */
#define CM_HEADING      (1 << 14)
/* Elements with an optional end tag. */
#define CM_OPT          (1 << 15)
/* Elements that use the "align" attribute for vertical position. */
#define CM_IMG          (1 << 16)
/* Elements with inline and block model. Used to avoid calling InlineDup. */
#define CM_MIXED        (1 << 17)
/* Elements whose content needs to be indented only if containing one
   CM_BLOCK element. */
#define CM_NO_INDENT    (1 << 18)
/* Elements that are obsolete (such as "dir", "menu"). */
#define CM_OBSOLETE     (1 << 19)
/* User defined elements. Used to determine how attributes without a value
   should be printed. */
#define CM_NEW          (1 << 20)
/* Elements that cannot be omitted. */
#define CM_OMITST       (1 << 21)
162 
/* If the document uses just HTML 2.0 tags and attributes, describe
** it as HTML 2.0. Similarly for HTML 3.2 and the 3 flavors of HTML 4.0.
** If there are proprietary tags and attributes then describe it as
** HTML Proprietary. If it includes the xml-lang or xmlns attributes
** but is otherwise HTML 2.0, 3.2 or 4.0 then describe it as one of the
** flavors of Voyager (strict, loose or frameset).
**
** Every primitive version below is a distinct single bit of an unsigned
** mask; the composite symbols are bitwise ORs of those bits.
*/

/* unknown */
#define xxxx                   0u

/* W3C defined HTML/XHTML family document types */
#define HT20                   1u
#define HT32                   2u
#define H40S                   4u
#define H40T                   8u
#define H40F                  16u
#define H41S                  32u
#define H41T                  64u
#define H41F                 128u
#define X10S                 256u
#define X10T                 512u
#define X10F                1024u
#define XH11                2048u
#define XB10                4096u

/* proprietary stuff */
#define VERS_SUN            8192u
#define VERS_NETSCAPE      16384u
#define VERS_MICROSOFT     32768u

/* special flag */
#define VERS_XML           65536u

/* compatibility symbols */
#define VERS_UNKNOWN       (xxxx)
#define VERS_HTML20        (HT20)
#define VERS_HTML32        (HT32)
#define VERS_HTML40_STRICT (H40S|H41S|X10S)
#define VERS_HTML40_LOOSE  (H40T|H41T|X10T)
#define VERS_FRAMESET      (H40F|H41F|X10F)
#define VERS_XHTML11       (XH11)
#define VERS_BASIC         (XB10)

/* meta symbols */
#define VERS_HTML40        (VERS_HTML40_STRICT|VERS_HTML40_LOOSE|VERS_FRAMESET)
#define VERS_IFRAME        (VERS_HTML40_LOOSE|VERS_FRAMESET)
#define VERS_LOOSE         (VERS_HTML20|VERS_HTML32|VERS_IFRAME)
#define VERS_EVENTS        (VERS_HTML40|VERS_XHTML11)
#define VERS_FROM32        (VERS_HTML32|VERS_HTML40)
#define VERS_FROM40        (VERS_HTML40|VERS_XHTML11|VERS_BASIC)
#define VERS_XHTML         (X10S|X10T|X10F|XH11|XB10)

/* all W3C defined document types */
#define VERS_ALL           (VERS_HTML20|VERS_HTML32|VERS_FROM40)

/* all proprietary types */
#define VERS_PROPRIETARY   (VERS_NETSCAPE|VERS_MICROSOFT|VERS_SUN)
221 
222 /* Linked list of class names and styles
223 */
224 struct _Style;
225 typedef struct _Style TagStyle;
226 
227 struct _Style
228 {
229     tmbstr tag;
230     tmbstr tag_class;
231     tmbstr properties;
232     TagStyle *next;
233 };
234 
235 
236 /* Linked list of style properties
237 */
238 struct _StyleProp;
239 typedef struct _StyleProp StyleProp;
240 
241 struct _StyleProp
242 {
243     tmbstr name;
244     tmbstr value;
245     StyleProp *next;
246 };
247 
248 
249 
250 
251 /* Attribute/Value linked list node
252 */
253 
254 struct _AttVal
255 {
256     AttVal*           next;
257     const Attribute*  dict;
258     Node*             asp;
259     Node*             php;
260     int               delim;
261     tmbstr            attribute;
262     tmbstr            value;
263 };
264 
265 
266 
267 /*
268   Mosaic handles inlines via a separate stack from other elements
269   We duplicate this to recover from inline markup errors such as:
270 
271      <i>italic text
272      <p>more italic text</b> normal text
273 
274   which for compatibility with Mosaic is mapped to:
275 
276      <i>italic text</i>
277      <p><i>more italic text</i> normal text
278 
279   Note that any inline end tag pop's the effect of the current
280   inline start tag, so that </b> pop's <i> in the above example.
281 */
282 struct _IStack
283 {
284     IStack*     next;
285     const Dict* tag;        /* tag's dictionary definition */
286     tmbstr      element;    /* name (NULL for text nodes) */
287     AttVal*     attributes;
288 };
289 
290 
291 /* HTML/XHTML/XML Element, Comment, PI, DOCTYPE, XML Decl,
292 ** etc. etc.
293 */
294 
295 struct _Node
296 {
297     Node*       parent;         /* tree structure */
298     Node*       prev;
299     Node*       next;
300     Node*       content;
301     Node*       last;
302 
303     AttVal*     attributes;
304     const Dict* was;            /* old tag when it was changed */
305     const Dict* tag;            /* tag's dictionary definition */
306 
307     tmbstr      element;        /* name (NULL for text nodes) */
308 
309     uint        start;          /* start of span onto text array */
310     uint        end;            /* end of span onto text array */
311     NodeType    type;           /* TextNode, StartTag, EndTag etc. */
312 
313     uint        line;           /* current line of document */
314     uint        column;         /* current column of document */
315 
316     Bool        closed;         /* true if closed by explicit end tag */
317     Bool        implicit;       /* true if inferred */
318     Bool        linebreak;      /* true if followed by a line break */
319 
320 #ifdef TIDY_STORE_ORIGINAL_TEXT
321     tmbstr      otext;
322 #endif
323 };
324 
325 
326 /*
327   The following are private to the lexer
328   Use NewLexer() to create a lexer, and
329   FreeLexer() to free it.
330 */
331 
332 struct _Lexer
333 {
334 #if 0  /* Move to TidyDocImpl */
335     StreamIn* in;           /* document content input */
336     StreamOut* errout;      /* error output stream */
337 
338     uint badAccess;         /* for accessibility errors */
339     uint badLayout;         /* for bad style errors */
340     uint badChars;          /* for bad character encodings */
341     uint badForm;           /* for mismatched/mispositioned form tags */
342     uint warnings;          /* count of warnings in this document */
343     uint errors;            /* count of errors */
344 #endif
345 
346     uint lines;             /* lines seen */
347     uint columns;           /* at start of current token */
348     Bool waswhite;          /* used to collapse contiguous white space */
349     Bool pushed;            /* true after token has been pushed back */
350     Bool insertspace;       /* when space is moved after end tag */
351     Bool excludeBlocks;     /* Netscape compatibility */
352     Bool exiled;            /* true if moved out of table */
353     Bool isvoyager;         /* true if xmlns attribute on html element */
354     uint versions;          /* bit vector of HTML versions */
355     uint doctype;           /* version as given by doctype (if any) */
356     uint versionEmitted;    /* version of doctype emitted */
357     Bool bad_doctype;       /* e.g. if html or PUBLIC is missing */
358     uint txtstart;          /* start of current node */
359     uint txtend;            /* end of current node */
360     LexerState state;       /* state of lexer's finite state machine */
361 
362     Node* token;            /* last token returned by GetToken() */
363     Node* itoken;           /* last duplicate inline returned by GetToken() */
364     Node* root;             /* remember root node of the document */
365     Node* parent;           /* remember parent node for CDATA elements */
366 
367     Bool seenEndBody;       /* true if a </body> tag has been encountered */
368     Bool seenEndHtml;       /* true if a </html> tag has been encountered */
369 
370     /*
371       Lexer character buffer
372 
373       Parse tree nodes span onto this buffer
374       which contains the concatenated text
375       contents of all of the elements.
376 
377       lexsize must be reset for each file.
378     */
379     tmbstr lexbuf;          /* MB character buffer */
380     uint lexlength;         /* allocated */
381     uint lexsize;           /* used */
382 
383     /* Inline stack for compatibility with Mosaic */
384     Node* inode;            /* for deferring text node */
385     IStack* insert;         /* for inferring inline tags */
386     IStack* istack;
387     uint istacklength;      /* allocated */
388     uint istacksize;        /* used */
389     uint istackbase;        /* start of frame */
390 
391     TagStyle *styles;          /* used for cleaning up presentation markup */
392 
393     TidyAllocator* allocator; /* allocator */
394 
395 #if 0
396     TidyDocImpl* doc;       /* Pointer back to doc for error reporting */
397 #endif
398 };
399 
400 
401 /* Lexer Functions
402 */
403 
404 /* choose what version to use for new doctype */
405 int TY_(HTMLVersion)( TidyDocImpl* doc );
406 
407 /* everything is allowed in proprietary version of HTML */
408 /* this is handled here rather than in the tag/attr dicts */
409 
410 void TY_(ConstrainVersion)( TidyDocImpl* doc, uint vers );
411 
412 Bool TY_(IsWhite)(uint c);
413 Bool TY_(IsDigit)(uint c);
414 Bool TY_(IsLetter)(uint c);
415 Bool TY_(IsNewline)(uint c);
416 Bool TY_(IsNamechar)(uint c);
417 Bool TY_(IsXMLLetter)(uint c);
418 Bool TY_(IsXMLNamechar)(uint c);
419 
420 /* Bool IsLower(uint c); */
421 Bool TY_(IsUpper)(uint c);
422 uint TY_(ToLower)(uint c);
423 uint TY_(ToUpper)(uint c);
424 
425 Lexer* TY_(NewLexer)( TidyDocImpl* doc );
426 void TY_(FreeLexer)( TidyDocImpl* doc );
427 
428 /* store character c as UTF-8 encoded byte stream */
429 void TY_(AddCharToLexer)( Lexer *lexer, uint c );
430 
431 /*
432   Used for elements and text nodes
433   element name is NULL for text nodes
434   start and end are offsets into lexbuf
435   which contains the textual content of
436   all elements in the parse tree.
437 
438   parent and content allow traversal
439   of the parse tree in any direction.
440   attributes are represented as a linked
441   list of AttVal nodes which hold the
442   strings for attribute/value pairs.
443 */
444 Node* TY_(NewNode)( TidyAllocator* allocator, Lexer* lexer );
445 
446 
447 /* used to clone heading nodes when split by an <HR> */
448 Node* TY_(CloneNode)( TidyDocImpl* doc, Node *element );
449 
450 /* free node's attributes */
451 void TY_(FreeAttrs)( TidyDocImpl* doc, Node *node );
452 
453 /* doesn't repair attribute list linkage */
454 void TY_(FreeAttribute)( TidyDocImpl* doc, AttVal *av );
455 
456 /* detach attribute from node */
457 void TY_(DetachAttribute)( Node *node, AttVal *attr );
458 
459 /* detach attribute from node then free it
460 */
461 void TY_(RemoveAttribute)( TidyDocImpl* doc, Node *node, AttVal *attr );
462 
463 /*
464   Free document nodes by iterating through peers and recursing
465   through children. Set next to NULL before calling FreeNode()
466   to avoid freeing peer nodes. Doesn't patch up prev/next links.
467  */
468 void TY_(FreeNode)( TidyDocImpl* doc, Node *node );
469 
470 Node* TY_(TextToken)( Lexer *lexer );
471 
472 /* used for creating preformatted text from Word2000 */
473 Node* TY_(NewLineNode)( Lexer *lexer );
474 
475 /* used for adding a &nbsp; for Word2000 */
476 Node* TY_(NewLiteralTextNode)(Lexer *lexer, ctmbstr txt );
477 
478 void TY_(AddStringLiteral)( Lexer* lexer, ctmbstr str );
479 /* void AddStringLiteralLen( Lexer* lexer, ctmbstr str, int len ); */
480 
481 /* find element */
482 Node* TY_(FindDocType)( TidyDocImpl* doc );
483 Node* TY_(FindHTML)( TidyDocImpl* doc );
484 Node* TY_(FindHEAD)( TidyDocImpl* doc );
485 Node* TY_(FindTITLE)(TidyDocImpl* doc);
486 Node* TY_(FindBody)( TidyDocImpl* doc );
487 Node* TY_(FindXmlDecl)(TidyDocImpl* doc);
488 
489 /* Returns containing block element, if any */
490 Node* TY_(FindContainer)( Node* node );
491 
492 /* add meta element for Tidy */
493 Bool TY_(AddGenerator)( TidyDocImpl* doc );
494 
495 uint TY_(ApparentVersion)( TidyDocImpl* doc );
496 
497 ctmbstr TY_(HTMLVersionNameFromCode)( uint vers, Bool isXhtml );
498 
499 Bool TY_(WarnMissingSIInEmittedDocType)( TidyDocImpl* doc );
500 
501 Bool TY_(SetXHTMLDocType)( TidyDocImpl* doc );
502 
503 
504 /* fixup doctype if missing */
505 Bool TY_(FixDocType)( TidyDocImpl* doc );
506 
507 /* ensure XML document starts with <?xml version="1.0"?> */
508 /* add encoding attribute if not using ASCII or UTF-8 output */
509 Bool TY_(FixXmlDecl)( TidyDocImpl* doc );
510 
511 Node* TY_(InferredTag)(TidyDocImpl* doc, TidyTagId id);
512 
513 void TY_(UngetToken)( TidyDocImpl* doc );
514 
515 
/*
  Modes for GetToken()

  MixedContent   -- for elements which don't accept PCDATA
  Preformatted   -- white space preserved as is
  IgnoreMarkup   -- for CDATA elements such as script, style

  NOTE(review): IgnoreWhitespace and CdataContent are not described
  here; see the GetToken() implementation for their exact meaning.

  The enumerators are numbered implicitly from 0 in declaration order.
*/
typedef enum
{
  IgnoreWhitespace,
  MixedContent,
  Preformatted,
  IgnoreMarkup,
  CdataContent
} GetTokenMode;
531 
532 Node* TY_(GetToken)( TidyDocImpl* doc, GetTokenMode mode );
533 
534 void TY_(InitMap)(void);
535 
536 
537 /* create a new attribute */
538 AttVal* TY_(NewAttribute)( TidyDocImpl* doc );
539 
540 /* create a new attribute with given name and value */
541 AttVal* TY_(NewAttributeEx)( TidyDocImpl* doc, ctmbstr name, ctmbstr value,
542                              int delim );
543 
544 /* insert attribute at the end of attribute list of a node */
545 void TY_(InsertAttributeAtEnd)( Node *node, AttVal *av );
546 
547 /* insert attribute at the start of attribute list of a node */
548 void TY_(InsertAttributeAtStart)( Node *node, AttVal *av );
549 
550 /*************************************
551   In-line Stack functions
552 *************************************/
553 
554 
555 /* duplicate attributes */
556 AttVal* TY_(DupAttrs)( TidyDocImpl* doc, AttVal* attrs );
557 
558 /*
559   push a copy of an inline node onto stack
560   but don't push if implicit or OBJECT or APPLET
561   (implicit tags are ones generated from the istack)
562 
563   One issue arises with pushing inlines when
564   the tag is already pushed. For instance:
565 
566       <p><em>text
567       <p><em>more text
568 
569   Shouldn't be mapped to
570 
571       <p><em>text</em></p>
572       <p><em><em>more text</em></em>
573 */
574 void TY_(PushInline)( TidyDocImpl* doc, Node* node );
575 
576 /* pop inline stack */
577 void TY_(PopInline)( TidyDocImpl* doc, Node* node );
578 
579 Bool TY_(IsPushed)( TidyDocImpl* doc, Node* node );
580 Bool TY_(IsPushedLast)( TidyDocImpl* doc, Node *element, Node *node );
581 
582 /*
583   This has the effect of inserting "missing" inline
584   elements around the contents of blocklevel elements
585   such as P, TD, TH, DIV, PRE etc. This procedure is
586   called at the start of ParseBlock. when the inline
587   stack is not empty, as will be the case in:
588 
589     <i><h1>italic heading</h1></i>
590 
591   which is then treated as equivalent to
592 
593     <h1><i>italic heading</i></h1>
594 
595   This is implemented by setting the lexer into a mode
596   where it gets tokens from the inline stack rather than
597   from the input stream.
598 */
599 int TY_(InlineDup)( TidyDocImpl* doc, Node *node );
600 
601 /*
602  defer duplicates when entering a table or other
603  element where the inlines shouldn't be duplicated
604 */
605 void TY_(DeferDup)( TidyDocImpl* doc );
606 Node* TY_(InsertedToken)( TidyDocImpl* doc );
607 
608 /* stack manipulation for inline elements */
609 Bool TY_(SwitchInline)( TidyDocImpl* doc, Node* element, Node* node );
610 Bool TY_(InlineDup1)( TidyDocImpl* doc, Node* node, Node* element );
611 
612 #ifdef __cplusplus
613 }
614 #endif
615 
616 
617 #endif /* __LEXER_H__ */
618