1 #ifndef __LEXER_H__
2 #define __LEXER_H__
3 
/* lexer.h -- Lexer for html parser

   (c) 1998-2006 (W3C) MIT, ERCIM, Keio University
   See tidy.h for the copyright notice.

   CVS Info:
    $Author: arnaud02 $
    $Date: 2006/02/24 16:09:00 $
    $Revision: 1.34 $

*/

/*
  Given an input source, the lexer returns a sequence of tokens.

     GetToken(source) gets the next token
     UngetToken(source) provides one level of undo

  The tags include an attribute list:

    - linked list of attribute/value nodes
    - each node has 2 null-terminated strings
    - entities are replaced in attribute values

  White space is compacted if not in preformatted mode.
  If not in preformatted mode then leading white space
  is discarded and subsequent white space sequences are
  compacted to single space characters.

  If XmlTags is no then tag names are folded to upper
  case and attribute names to lower case.

 Not yet done:
    -   Doctype subset and marked sections
*/

40 #ifdef __cplusplus
41 extern "C" {
42 #endif
43 
44 #include "forward.h"
45 
/* Lexer character types.

   Each value is a distinct single-bit flag, so several character
   classes can be OR-ed together into one unsigned bit mask.
*/
#define digit       1u
#define letter      2u
#define namechar    4u
#define white       8u
#define newline     16u
#define lowercase   32u
#define uppercase   64u


/* node->type is one of these values.

   The enumerators carry no explicit initialisers, so they are numbered
   consecutively from 0 (RootNode) to 13 (XmlDecl); inserting or
   reordering entries changes every following numeric value.
*/
typedef enum
{
  RootNode,
  DocTypeTag,
  CommentTag,
  ProcInsTag,
  TextNode,
  StartTag,
  EndTag,
  StartEndTag,
  CDATATag,
  SectionTag,
  AspTag,
  JsteTag,
  PhpTag,
  XmlDecl
} NodeType;



/* Lexer GetToken states.

   Stored in the `state` member of struct _Lexer ("state of lexer's
   finite state machine").  Values are implicit: LEX_CONTENT is 0 and
   LEX_XMLDECL is 13.
*/
typedef enum
{
  LEX_CONTENT,
  LEX_GT,
  LEX_ENDTAG,
  LEX_STARTTAG,
  LEX_COMMENT,
  LEX_DOCTYPE,
  LEX_PROCINSTR,
  LEX_ENDCOMMENT,
  LEX_CDATA,
  LEX_SECTION,
  LEX_ASP,
  LEX_JSTE,
  LEX_PHP,
  LEX_XMLDECL
} LexerState;

/* ParseDocTypeDecl state constants.

   Implicitly numbered from 0 (DT_INTERMEDIATE) to 4 (DT_INTSUBSET).
*/
typedef enum
{
  DT_INTERMEDIATE,
  DT_DOCTYPENAME,
  DT_PUBLICSYSTEM,
  DT_QUOTEDSTRING,
  DT_INTSUBSET
} ParseDocTypeDeclState;

/* Content model shortcut encoding.

   Every CM_* value other than CM_UNKNOWN is a single bit, so several
   flags can be OR-ed together into one bit mask.

   Descriptions are tentative.
*/
#define CM_UNKNOWN      0
/* Elements with no content. Map to HTML specification. */
#define CM_EMPTY        (1 << 0)
/* Elements that appear outside of "BODY". */
#define CM_HTML         (1 << 1)
/* Elements that can appear within HEAD. */
#define CM_HEAD         (1 << 2)
/* HTML "block" elements. */
#define CM_BLOCK        (1 << 3)
/* HTML "inline" elements. */
#define CM_INLINE       (1 << 4)
/* Elements that mark list item ("LI"). */
#define CM_LIST         (1 << 5)
/* Elements that mark definition list item ("DL", "DT").
   NOTE(review): in HTML "DL" is the list container and the items are
   "DT"/"DD"; confirm against the tag dictionary which elements
   actually carry this flag. */
#define CM_DEFLIST      (1 << 6)
/* Elements that can appear inside TABLE. */
#define CM_TABLE        (1 << 7)
/* Used for "THEAD", "TFOOT" or "TBODY". */
#define CM_ROWGRP       (1 << 8)
/* Used for "TD", "TH" */
#define CM_ROW          (1 << 9)
/* Elements whose content must be protected against white space movement.
   Includes some elements that can be found in forms. */
#define CM_FIELD        (1 << 10)
/* Used to avoid propagating inline emphasis inside some elements
   such as OBJECT or APPLET. */
#define CM_OBJECT       (1 << 11)
/* Elements that allow "PARAM". */
#define CM_PARAM        (1 << 12)
/* "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */
#define CM_FRAMES       (1 << 13)
/* Heading elements (h1, h2, ...). */
#define CM_HEADING      (1 << 14)
/* Elements with an optional end tag. */
#define CM_OPT          (1 << 15)
/* Elements that use "align" attribute for vertical position. */
#define CM_IMG          (1 << 16)
/* Elements with inline and block model. Used to avoid calling InlineDup. */
#define CM_MIXED        (1 << 17)
/* Elements whose content needs to be indented only if containing one
   CM_BLOCK element. */
#define CM_NO_INDENT    (1 << 18)
/* Elements that are obsolete (such as "dir", "menu"). */
#define CM_OBSOLETE     (1 << 19)
/* User defined elements. Used to determine how attributes without value
   should be printed. */
#define CM_NEW          (1 << 20)
/* Elements that cannot be omitted. */
#define CM_OMITST       (1 << 21)

/* If the document uses just HTML 2.0 tags and attributes, describe
** it as HTML 2.0. Similarly for HTML 3.2 and the 3 flavors of HTML 4.0.
** If there are proprietary tags and attributes then describe it as
** HTML Proprietary. If it includes the xml-lang or xmlns attributes
** but is otherwise HTML 2.0, 3.2 or 4.0 then describe it as one of the
** flavors of Voyager (strict, loose or frameset).
**
** All version symbols below are unsigned bit flags; the composite
** symbols are bitwise ORs of the single-bit ones.
*/

/* unknown */
#define xxxx                   0u

/* W3C defined HTML/XHTML family document types */
#define HT20                   1u
#define HT32                   2u
#define H40S                   4u
#define H40T                   8u
#define H40F                  16u
#define H41S                  32u
#define H41T                  64u
#define H41F                 128u
#define X10S                 256u
#define X10T                 512u
#define X10F                1024u
#define XH11                2048u
#define XB10                4096u

/* proprietary stuff */
#define VERS_SUN            8192u
#define VERS_NETSCAPE      16384u
#define VERS_MICROSOFT     32768u

/* special flag */
#define VERS_XML           65536u

/* compatibility symbols */
#define VERS_UNKNOWN       (xxxx)
#define VERS_HTML20        (HT20)
#define VERS_HTML32        (HT32)
#define VERS_HTML40_STRICT (H40S|H41S|X10S)
#define VERS_HTML40_LOOSE  (H40T|H41T|X10T)
#define VERS_FRAMESET      (H40F|H41F|X10F)
#define VERS_XHTML11       (XH11)
#define VERS_BASIC         (XB10)

/* meta symbols */
#define VERS_HTML40        (VERS_HTML40_STRICT|VERS_HTML40_LOOSE|VERS_FRAMESET)
#define VERS_IFRAME        (VERS_HTML40_LOOSE|VERS_FRAMESET)
#define VERS_LOOSE         (VERS_HTML20|VERS_HTML32|VERS_IFRAME)
#define VERS_EVENTS        (VERS_HTML40|VERS_XHTML11)
#define VERS_FROM32        (VERS_HTML32|VERS_HTML40)
#define VERS_FROM40        (VERS_HTML40|VERS_XHTML11|VERS_BASIC)
#define VERS_XHTML         (X10S|X10T|X10F|XH11|XB10)

/* all W3C defined document types */
#define VERS_ALL           (VERS_HTML20|VERS_HTML32|VERS_FROM40)

/* all proprietary types */
#define VERS_PROPRIETARY   (VERS_NETSCAPE|VERS_MICROSOFT|VERS_SUN)

222 /* Linked list of class names and styles
223 */
224 struct _Style;
225 typedef struct _Style TagStyle;
226 
227 struct _Style
228 {
229     tmbstr tag;
230     tmbstr tag_class;
231     tmbstr properties;
232     TagStyle *next;
233 };
234 
235 
236 /* Linked list of style properties
237 */
238 struct _StyleProp;
239 typedef struct _StyleProp StyleProp;
240 
241 struct _StyleProp
242 {
243     tmbstr name;
244     tmbstr value;
245     StyleProp *next;
246 };
247 
248 
249 
250 
251 /* Attribute/Value linked list node
252 */
253 
254 struct _AttVal
255 {
256     AttVal*           next;
257     const Attribute*  dict;
258     Node*             asp;
259     Node*             php;
260     int               delim;
261     tmbstr            attribute;
262     tmbstr            value;
263 };
264 
265 
266 
267 /*
268   Mosaic handles inlines via a separate stack from other elements
269   We duplicate this to recover from inline markup errors such as:
270 
271      <i>italic text
272      <p>more italic text</b> normal text
273 
274   which for compatibility with Mosaic is mapped to:
275 
276      <i>italic text</i>
277      <p><i>more italic text</i> normal text
278 
279   Note that any inline end tag pop's the effect of the current
280   inline start tag, so that </b> pop's <i> in the above example.
281 */
282 struct _IStack
283 {
284     IStack*     next;
285     const Dict* tag;        /* tag's dictionary definition */
286     tmbstr      element;    /* name (NULL for text nodes) */
287     AttVal*     attributes;
288 };
289 
290 
291 /* HTML/XHTML/XML Element, Comment, PI, DOCTYPE, XML Decl,
292 ** etc. etc.
293 */
294 
295 struct _Node
296 {
297     Node*       parent;         /* tree structure */
298     Node*       prev;
299     Node*       next;
300     Node*       content;
301     Node*       last;
302 
303     AttVal*     attributes;
304     const Dict* was;            /* old tag when it was changed */
305     const Dict* tag;            /* tag's dictionary definition */
306 
307     tmbstr      element;        /* name (NULL for text nodes) */
308 
309     uint        start;          /* start of span onto text array */
310     uint        end;            /* end of span onto text array */
311     NodeType    type;           /* TextNode, StartTag, EndTag etc. */
312 
313     uint        line;           /* current line of document */
314     uint        column;         /* current column of document */
315 
316     Bool        closed;         /* true if closed by explicit end tag */
317     Bool        implicit;       /* true if inferred */
318     Bool        linebreak;      /* true if followed by a line break */
319 
320 #ifdef TIDY_STORE_ORIGINAL_TEXT
321     tmbstr      otext;
322 #endif
323 };
324 
325 
326 /*
327   The following are private to the lexer
328   Use NewLexer() to create a lexer, and
329   FreeLexer() to free it.
330 */
331 
332 struct _Lexer
333 {
334 #if 0  /* Move to TidyDocImpl */
335     StreamIn* in;           /* document content input */
336     StreamOut* errout;      /* error output stream */
337 
338     uint badAccess;         /* for accessibility errors */
339     uint badLayout;         /* for bad style errors */
340     uint badChars;          /* for bad character encodings */
341     uint badForm;           /* for mismatched/mispositioned form tags */
342     uint warnings;          /* count of warnings in this document */
343     uint errors;            /* count of errors */
344 #endif
345 
346     uint lines;             /* lines seen */
347     uint columns;           /* at start of current token */
348     Bool waswhite;          /* used to collapse contiguous white space */
349     Bool pushed;            /* true after token has been pushed back */
350     Bool insertspace;       /* when space is moved after end tag */
351     Bool excludeBlocks;     /* Netscape compatibility */
352     Bool exiled;            /* true if moved out of table */
353     Bool isvoyager;         /* true if xmlns attribute on html element */
354     uint versions;          /* bit vector of HTML versions */
355     uint doctype;           /* version as given by doctype (if any) */
356     uint versionEmitted;    /* version of doctype emitted */
357     Bool bad_doctype;       /* e.g. if html or PUBLIC is missing */
358     uint txtstart;          /* start of current node */
359     uint txtend;            /* end of current node */
360     LexerState state;       /* state of lexer's finite state machine */
361 
362     Node* token;            /* current parse point */
363     Node* root;             /* remember root node of the document */
364     Node* parent;           /* remember parent node for CDATA elements */
365 
366     Bool seenEndBody;       /* true if a </body> tag has been encountered */
367     Bool seenEndHtml;       /* true if a </html> tag has been encountered */
368 
369     /*
370       Lexer character buffer
371 
372       Parse tree nodes span onto this buffer
373       which contains the concatenated text
374       contents of all of the elements.
375 
376       lexsize must be reset for each file.
377     */
378     tmbstr lexbuf;          /* MB character buffer */
379     uint lexlength;         /* allocated */
380     uint lexsize;           /* used */
381 
382     /* Inline stack for compatibility with Mosaic */
383     Node* inode;            /* for deferring text node */
384     IStack* insert;         /* for inferring inline tags */
385     IStack* istack;
386     uint istacklength;      /* allocated */
387     uint istacksize;        /* used */
388     uint istackbase;        /* start of frame */
389 
390     TagStyle *styles;          /* used for cleaning up presentation markup */
391 
392 #if 0
393     TidyDocImpl* doc;       /* Pointer back to doc for error reporting */
394 #endif
395 };
396 
397 
398 /* Lexer Functions
399 */
400 Node *CommentToken( Lexer *lexer );
401 
402 /* choose what version to use for new doctype */
403 int HTMLVersion( TidyDocImpl* doc );
404 
405 ctmbstr GetFPIFromVers(uint vers);
406 
407 /* everything is allowed in proprietary version of HTML */
408 /* this is handled here rather than in the tag/attr dicts */
409 
410 void ConstrainVersion( TidyDocImpl* doc, uint vers );
411 
412 Bool IsWhite(uint c);
413 Bool IsDigit(uint c);
414 Bool IsLetter(uint c);
415 Bool IsNewline(uint c);
416 Bool IsNamechar(uint c);
417 Bool IsXMLLetter(uint c);
418 Bool IsXMLNamechar(uint c);
419 
420 Bool IsLower(uint c);
421 Bool IsUpper(uint c);
422 uint ToLower(uint c);
423 uint ToUpper(uint c);
424 
425 char FoldCase( TidyDocImpl* doc, tmbchar c, Bool tocaps );
426 
427 
428 Lexer* NewLexer( TidyDocImpl* doc );
429 Bool EndOfInput( TidyDocImpl* doc );
430 void FreeLexer( TidyDocImpl* doc );
431 
432 /* store character c as UTF-8 encoded byte stream */
433 void AddCharToLexer( Lexer *lexer, uint c );
434 
435 /*
436   Used for elements and text nodes
437   element name is NULL for text nodes
438   start and end are offsets into lexbuf
439   which contains the textual content of
440   all elements in the parse tree.
441 
442   parent and content allow traversal
443   of the parse tree in any direction.
444   attributes are represented as a linked
445   list of AttVal nodes which hold the
446   strings for attribute/value pairs.
447 */
448 Node* NewNode( Lexer* lexer );
449 
450 
451 /* used to clone heading nodes when split by an <HR> */
452 Node *CloneNode( TidyDocImpl* doc, Node *element );
453 
454 /* free node's attributes */
455 void FreeAttrs( TidyDocImpl* doc, Node *node );
456 
457 /* doesn't repair attribute list linkage */
458 void FreeAttribute( TidyDocImpl* doc, AttVal *av );
459 
460 /* detach attribute from node */
461 void DetachAttribute( Node *node, AttVal *attr );
462 
463 /* detach attribute from node then free it
464 */
465 void RemoveAttribute( TidyDocImpl* doc, Node *node, AttVal *attr );
466 
467 /*
468   Free document nodes by iterating through peers and recursing
469   through children. Set next to NULL before calling FreeNode()
470   to avoid freeing peer nodes. Doesn't patch up prev/next links.
471  */
472 void FreeNode( TidyDocImpl* doc, Node *node );
473 
474 Node* TextToken( Lexer *lexer );
475 
476 /* used for creating preformatted text from Word2000 */
477 Node *NewLineNode( Lexer *lexer );
478 
479 /* used for adding a &nbsp; for Word2000 */
480 Node *NewLiteralTextNode(Lexer *lexer, ctmbstr txt );
481 
482 Node* CommentToken(Lexer *lexer);
483 Node* GetCDATA( TidyDocImpl* doc, Node *container );
484 
485 void AddByte( Lexer *lexer, tmbchar c );
486 void AddStringLiteral( Lexer* lexer, ctmbstr str );
487 void AddStringLiteralLen( Lexer* lexer, ctmbstr str, int len );
488 
489 /* find element */
490 Node* FindDocType( TidyDocImpl* doc );
491 Node* FindHTML( TidyDocImpl* doc );
492 Node* FindHEAD( TidyDocImpl* doc );
493 Node* FindTITLE(TidyDocImpl* doc);
494 Node* FindBody( TidyDocImpl* doc );
495 Node* FindXmlDecl(TidyDocImpl* doc);
496 
497 /* Returns containing block element, if any */
498 Node* FindContainer( Node* node );
499 
500 /* add meta element for Tidy */
501 Bool AddGenerator( TidyDocImpl* doc );
502 
503 /* examine <!DOCTYPE> to identify version */
504 uint FindGivenVersion( TidyDocImpl* doc, Node* doctype );
505 uint ApparentVersion( TidyDocImpl* doc );
506 
507 
508 Bool CheckDocTypeKeyWords(Lexer *lexer, Node *doctype);
509 
510 ctmbstr HTMLVersionName( TidyDocImpl* doc );
511 ctmbstr HTMLVersionNameFromCode( uint vers, Bool isXhtml );
512 
513 Bool WarnMissingSIInEmittedDocType( TidyDocImpl* doc );
514 
515 Bool SetXHTMLDocType( TidyDocImpl* doc );
516 
517 
518 /* fixup doctype if missing */
519 Bool FixDocType( TidyDocImpl* doc );
520 
521 /* ensure XML document starts with <?xml version="1.0"?> */
522 /* add encoding attribute if not using ASCII or UTF-8 output */
523 Bool FixXmlDecl( TidyDocImpl* doc );
524 
525 Node* InferredTag(TidyDocImpl* doc, TidyTagId id);
526 
527 Bool ExpectsContent(Node *node);
528 
529 
530 void UngetToken( TidyDocImpl* doc );
531 
532 
/*
  modes for GetToken()

  MixedContent   -- for elements which don't accept PCDATA
  Preformatted   -- white space preserved as is
  IgnoreMarkup   -- for CDATA elements such as script, style

  NOTE(review): IgnoreWhitespace and CdataContent have no description
  here; document them once their behaviour is confirmed against the
  GetToken() implementation.

  Values are implicit: IgnoreWhitespace is 0, CdataContent is 4.
*/
typedef enum
{
  IgnoreWhitespace,
  MixedContent,
  Preformatted,
  IgnoreMarkup,
  CdataContent
} GetTokenMode;

549 Node* GetToken( TidyDocImpl* doc, GetTokenMode mode );
550 
551 void InitMap(void);
552 
553 Bool IsValidAttrName( ctmbstr attr );
554 
555 
556 /* create a new attribute */
557 AttVal *NewAttribute(void);
558 
559 /* create a new attribute with given name and value */
560 AttVal *NewAttributeEx( TidyDocImpl* doc, ctmbstr name, ctmbstr value,
561                         int delim );
562 
563 /* insert attribute at the end of attribute list of a node */
564 void InsertAttributeAtEnd( Node *node, AttVal *av );
565 
566 /* insert attribute at the start of attribute list of a node */
567 void InsertAttributeAtStart( Node *node, AttVal *av );
568 
569 /*************************************
570   In-line Stack functions
571 *************************************/
572 
573 
574 /* duplicate attributes */
575 AttVal* DupAttrs( TidyDocImpl* doc, AttVal* attrs );
576 
577 /*
578   push a copy of an inline node onto stack
579   but don't push if implicit or OBJECT or APPLET
580   (implicit tags are ones generated from the istack)
581 
582   One issue arises with pushing inlines when
583   the tag is already pushed. For instance:
584 
585       <p><em>text
586       <p><em>more text
587 
588   Shouldn't be mapped to
589 
590       <p><em>text</em></p>
591       <p><em><em>more text</em></em>
592 */
593 void PushInline( TidyDocImpl* doc, Node* node );
594 
595 /* pop inline stack */
596 void PopInline( TidyDocImpl* doc, Node* node );
597 
598 Bool IsPushed( TidyDocImpl* doc, Node* node );
599 Bool IsPushedLast( TidyDocImpl* doc, Node *element, Node *node );
600 
601 /*
602   This has the effect of inserting "missing" inline
603   elements around the contents of blocklevel elements
604   such as P, TD, TH, DIV, PRE etc. This procedure is
605   called at the start of ParseBlock. when the inline
606   stack is not empty, as will be the case in:
607 
608     <i><h1>italic heading</h1></i>
609 
610   which is then treated as equivalent to
611 
612     <h1><i>italic heading</i></h1>
613 
614   This is implemented by setting the lexer into a mode
615   where it gets tokens from the inline stack rather than
616   from the input stream.
617 */
618 int InlineDup( TidyDocImpl* doc, Node *node );
619 
620 /*
621  defer duplicates when entering a table or other
622  element where the inlines shouldn't be duplicated
623 */
624 void DeferDup( TidyDocImpl* doc );
625 Node *InsertedToken( TidyDocImpl* doc );
626 
627 #ifdef __cplusplus
628 }
629 #endif
630 
631 
632 #endif /* __LEXER_H__ */
633