1 #ifndef __LEXER_H__ 2 #define __LEXER_H__ 3 4 /* lexer.h -- Lexer for html parser 5 6 (c) 1998-2006 (W3C) MIT, ERCIM, Keio University 7 See tidy.h for the copyright notice. 8 9 CVS Info: 10 $Author: arnaud02 $ 11 $Date: 2006/02/24 16:09:00 $ 12 $Revision: 1.34 $ 13 14 */ 15 16 /* 17 Given an input source, it returns a sequence of tokens. 18 19 GetToken(source) gets the next token 20 UngetToken(source) provides one level undo 21 22 The tags include an attribute list: 23 24 - linked list of attribute/value nodes 25 - each node has 2 NULL-terminated strings. 26 - entities are replaced in attribute values 27 28 white space is compacted if not in preformatted mode 29 If not in preformatted mode then leading white space 30 is discarded and subsequent white space sequences 31 compacted to single space characters. 32 33 If XmlTags is no then Tag names are folded to upper 34 case and attribute names to lower case. 35 36 Not yet done: 37 - Doctype subset and marked sections 38 */ 39 40 #ifdef __cplusplus 41 extern "C" { 42 #endif 43 44 #include "forward.h" 45 46 /* lexer character types 47 */ 48 #define digit 1u 49 #define letter 2u 50 #define namechar 4u 51 #define white 8u 52 #define newline 16u 53 #define lowercase 32u 54 #define uppercase 64u 55 56 57 /* node->type is one of these values 58 */ 59 typedef enum 60 { 61 RootNode, 62 DocTypeTag, 63 CommentTag, 64 ProcInsTag, 65 TextNode, 66 StartTag, 67 EndTag, 68 StartEndTag, 69 CDATATag, 70 SectionTag, 71 AspTag, 72 JsteTag, 73 PhpTag, 74 XmlDecl 75 } NodeType; 76 77 78 79 /* lexer GetToken states 80 */ 81 typedef enum 82 { 83 LEX_CONTENT, 84 LEX_GT, 85 LEX_ENDTAG, 86 LEX_STARTTAG, 87 LEX_COMMENT, 88 LEX_DOCTYPE, 89 LEX_PROCINSTR, 90 LEX_ENDCOMMENT, 91 LEX_CDATA, 92 LEX_SECTION, 93 LEX_ASP, 94 LEX_JSTE, 95 LEX_PHP, 96 LEX_XMLDECL 97 } LexerState; 98 99 /* ParseDocTypeDecl state constants */ 100 typedef enum 101 { 102 DT_INTERMEDIATE, 103 DT_DOCTYPENAME, 104 DT_PUBLICSYSTEM, 105 DT_QUOTEDSTRING, 106 DT_INTSUBSET 107 } ParseDocTypeDeclState; 108 109 /* content model shortcut encoding 110 111 Descriptions are tentative. 112 */ 113 #define CM_UNKNOWN 0 114 /* Elements with no content. Map to HTML specification. */ 115 #define CM_EMPTY (1 << 0) 116 /* Elements that appear outside of "BODY". */ 117 #define CM_HTML (1 << 1) 118 /* Elements that can appear within HEAD. */ 119 #define CM_HEAD (1 << 2) 120 /* HTML "block" elements. */ 121 #define CM_BLOCK (1 << 3) 122 /* HTML "inline" elements. */ 123 #define CM_INLINE (1 << 4) 124 /* Elements that mark list item ("LI"). */ 125 #define CM_LIST (1 << 5) 126 /* Elements that mark definition list item ("DL", "DT"). */ 127 #define CM_DEFLIST (1 << 6) 128 /* Elements that can appear inside TABLE. */ 129 #define CM_TABLE (1 << 7) 130 /* Used for "THEAD", "TFOOT" or "TBODY". */ 131 #define CM_ROWGRP (1 << 8) 132 /* Used for "TD", "TH" */ 133 #define CM_ROW (1 << 9) 134 /* Elements whose content must be protected against white space movement. 135 Includes some elements that can found in forms. */ 136 #define CM_FIELD (1 << 10) 137 /* Used to avoid propagating inline emphasis inside some elements 138 such as OBJECT or APPLET. */ 139 #define CM_OBJECT (1 << 11) 140 /* Elements that allows "PARAM". */ 141 #define CM_PARAM (1 << 12) 142 /* "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */ 143 #define CM_FRAMES (1 << 13) 144 /* Heading elements (h1, h2, ...). */ 145 #define CM_HEADING (1 << 14) 146 /* Elements with an optional end tag. */ 147 #define CM_OPT (1 << 15) 148 /* Elements that use "align" attribute for vertical position. */ 149 #define CM_IMG (1 << 16) 150 /* Elements with inline and block model. Used to avoid calling InlineDup. */ 151 #define CM_MIXED (1 << 17) 152 /* Elements whose content needs to be indented only if containing one 153 CM_BLOCK element. */ 154 #define CM_NO_INDENT (1 << 18) 155 /* Elements that are obsolete (such as "dir", "menu"). */ 156 #define CM_OBSOLETE (1 << 19) 157 /* User defined elements. Used to determine how attributes wihout value 158 should be printed. */ 159 #define CM_NEW (1 << 20) 160 /* Elements that cannot be omitted. */ 161 #define CM_OMITST (1 << 21) 162 163 /* If the document uses just HTML 2.0 tags and attributes described 164 ** it as HTML 2.0 Similarly for HTML 3.2 and the 3 flavors of HTML 4.0. 165 ** If there are proprietary tags and attributes then describe it as 166 ** HTML Proprietary. If it includes the xml-lang or xmlns attributes 167 ** but is otherwise HTML 2.0, 3.2 or 4.0 then describe it as one of the 168 ** flavors of Voyager (strict, loose or frameset). 169 */ 170 171 /* unknown */ 172 #define xxxx 0u 173 174 /* W3C defined HTML/XHTML family document types */ 175 #define HT20 1u 176 #define HT32 2u 177 #define H40S 4u 178 #define H40T 8u 179 #define H40F 16u 180 #define H41S 32u 181 #define H41T 64u 182 #define H41F 128u 183 #define X10S 256u 184 #define X10T 512u 185 #define X10F 1024u 186 #define XH11 2048u 187 #define XB10 4096u 188 189 /* proprietary stuff */ 190 #define VERS_SUN 8192u 191 #define VERS_NETSCAPE 16384u 192 #define VERS_MICROSOFT 32768u 193 194 /* special flag */ 195 #define VERS_XML 65536u 196 197 /* compatibility symbols */ 198 #define VERS_UNKNOWN (xxxx) 199 #define VERS_HTML20 (HT20) 200 #define VERS_HTML32 (HT32) 201 #define VERS_HTML40_STRICT (H40S|H41S|X10S) 202 #define VERS_HTML40_LOOSE (H40T|H41T|X10T) 203 #define VERS_FRAMESET (H40F|H41F|X10F) 204 #define VERS_XHTML11 (XH11) 205 #define VERS_BASIC (XB10) 206 207 /* meta symbols */ 208 #define VERS_HTML40 (VERS_HTML40_STRICT|VERS_HTML40_LOOSE|VERS_FRAMESET) 209 #define VERS_IFRAME (VERS_HTML40_LOOSE|VERS_FRAMESET) 210 #define VERS_LOOSE (VERS_HTML20|VERS_HTML32|VERS_IFRAME) 211 #define VERS_EVENTS (VERS_HTML40|VERS_XHTML11) 212 #define VERS_FROM32 (VERS_HTML32|VERS_HTML40) 213 #define VERS_FROM40 (VERS_HTML40|VERS_XHTML11|VERS_BASIC) 214 #define VERS_XHTML (X10S|X10T|X10F|XH11|XB10) 215 216 /* all W3C defined document types */ 217 #define VERS_ALL (VERS_HTML20|VERS_HTML32|VERS_FROM40) 218 219 /* all proprietary types */ 220 #define VERS_PROPRIETARY (VERS_NETSCAPE|VERS_MICROSOFT|VERS_SUN) 221 222 /* Linked list of class names and styles 223 */ 224 struct _Style; 225 typedef struct _Style TagStyle; 226 227 struct _Style 228 { 229 tmbstr tag; 230 tmbstr tag_class; 231 tmbstr properties; 232 TagStyle *next; 233 }; 234 235 236 /* Linked list of style properties 237 */ 238 struct _StyleProp; 239 typedef struct _StyleProp StyleProp; 240 241 struct _StyleProp 242 { 243 tmbstr name; 244 tmbstr value; 245 StyleProp *next; 246 }; 247 248 249 250 251 /* Attribute/Value linked list node 252 */ 253 254 struct _AttVal 255 { 256 AttVal* next; 257 const Attribute* dict; 258 Node* asp; 259 Node* php; 260 int delim; 261 tmbstr attribute; 262 tmbstr value; 263 }; 264 265 266 267 /* 268 Mosaic handles inlines via a separate stack from other elements 269 We duplicate this to recover from inline markup errors such as: 270 271 <i>italic text 272 <p>more italic text</b> normal text 273 274 which for compatibility with Mosaic is mapped to: 275 276 <i>italic text</i> 277 <p><i>more italic text</i> normal text 278 279 Note that any inline end tag pop's the effect of the current 280 inline start tag, so that </b> pop's <i> in the above example. 281 */ 282 struct _IStack 283 { 284 IStack* next; 285 const Dict* tag; /* tag's dictionary definition */ 286 tmbstr element; /* name (NULL for text nodes) */ 287 AttVal* attributes; 288 }; 289 290 291 /* HTML/XHTML/XML Element, Comment, PI, DOCTYPE, XML Decl, 292 ** etc. etc. 293 */ 294 295 struct _Node 296 { 297 Node* parent; /* tree structure */ 298 Node* prev; 299 Node* next; 300 Node* content; 301 Node* last; 302 303 AttVal* attributes; 304 const Dict* was; /* old tag when it was changed */ 305 const Dict* tag; /* tag's dictionary definition */ 306 307 tmbstr element; /* name (NULL for text nodes) */ 308 309 uint start; /* start of span onto text array */ 310 uint end; /* end of span onto text array */ 311 NodeType type; /* TextNode, StartTag, EndTag etc. */ 312 313 uint line; /* current line of document */ 314 uint column; /* current column of document */ 315 316 Bool closed; /* true if closed by explicit end tag */ 317 Bool implicit; /* true if inferred */ 318 Bool linebreak; /* true if followed by a line break */ 319 320 #ifdef TIDY_STORE_ORIGINAL_TEXT 321 tmbstr otext; 322 #endif 323 }; 324 325 326 /* 327 The following are private to the lexer 328 Use NewLexer() to create a lexer, and 329 FreeLexer() to free it. 330 */ 331 332 struct _Lexer 333 { 334 #if 0 /* Move to TidyDocImpl */ 335 StreamIn* in; /* document content input */ 336 StreamOut* errout; /* error output stream */ 337 338 uint badAccess; /* for accessibility errors */ 339 uint badLayout; /* for bad style errors */ 340 uint badChars; /* for bad character encodings */ 341 uint badForm; /* for mismatched/mispositioned form tags */ 342 uint warnings; /* count of warnings in this document */ 343 uint errors; /* count of errors */ 344 #endif 345 346 uint lines; /* lines seen */ 347 uint columns; /* at start of current token */ 348 Bool waswhite; /* used to collapse contiguous white space */ 349 Bool pushed; /* true after token has been pushed back */ 350 Bool insertspace; /* when space is moved after end tag */ 351 Bool excludeBlocks; /* Netscape compatibility */ 352 Bool exiled; /* true if moved out of table */ 353 Bool isvoyager; /* true if xmlns attribute on html element */ 354 uint versions; /* bit vector of HTML versions */ 355 uint doctype; /* version as given by doctype (if any) */ 356 uint versionEmitted; /* version of doctype emitted */ 357 Bool bad_doctype; /* e.g. if html or PUBLIC is missing */ 358 uint txtstart; /* start of current node */ 359 uint txtend; /* end of current node */ 360 LexerState state; /* state of lexer's finite state machine */ 361 362 Node* token; /* current parse point */ 363 Node* root; /* remember root node of the document */ 364 Node* parent; /* remember parent node for CDATA elements */ 365 366 Bool seenEndBody; /* true if a </body> tag has been encountered */ 367 Bool seenEndHtml; /* true if a </html> tag has been encountered */ 368 369 /* 370 Lexer character buffer 371 372 Parse tree nodes span onto this buffer 373 which contains the concatenated text 374 contents of all of the elements. 375 376 lexsize must be reset for each file. 377 */ 378 tmbstr lexbuf; /* MB character buffer */ 379 uint lexlength; /* allocated */ 380 uint lexsize; /* used */ 381 382 /* Inline stack for compatibility with Mosaic */ 383 Node* inode; /* for deferring text node */ 384 IStack* insert; /* for inferring inline tags */ 385 IStack* istack; 386 uint istacklength; /* allocated */ 387 uint istacksize; /* used */ 388 uint istackbase; /* start of frame */ 389 390 TagStyle *styles; /* used for cleaning up presentation markup */ 391 392 #if 0 393 TidyDocImpl* doc; /* Pointer back to doc for error reporting */ 394 #endif 395 }; 396 397 398 /* Lexer Functions 399 */ 400 Node *CommentToken( Lexer *lexer ); 401 402 /* choose what version to use for new doctype */ 403 int HTMLVersion( TidyDocImpl* doc ); 404 405 ctmbstr GetFPIFromVers(uint vers); 406 407 /* everything is allowed in proprietary version of HTML */ 408 /* this is handled here rather than in the tag/attr dicts */ 409 410 void ConstrainVersion( TidyDocImpl* doc, uint vers ); 411 412 Bool IsWhite(uint c); 413 Bool IsDigit(uint c); 414 Bool IsLetter(uint c); 415 Bool IsNewline(uint c); 416 Bool IsNamechar(uint c); 417 Bool IsXMLLetter(uint c); 418 Bool IsXMLNamechar(uint c); 419 420 Bool IsLower(uint c); 421 Bool IsUpper(uint c); 422 uint ToLower(uint c); 423 uint ToUpper(uint c); 424 425 char FoldCase( TidyDocImpl* doc, tmbchar c, Bool tocaps ); 426 427 428 Lexer* NewLexer( TidyDocImpl* doc ); 429 Bool EndOfInput( TidyDocImpl* doc ); 430 void FreeLexer( TidyDocImpl* doc ); 431 432 /* store character c as UTF-8 encoded byte stream */ 433 void AddCharToLexer( Lexer *lexer, uint c ); 434 435 /* 436 Used for elements and text nodes 437 element name is NULL for text nodes 438 start and end are offsets into lexbuf 439 which contains the textual content of 440 all elements in the parse tree. 441 442 parent and content allow traversal 443 of the parse tree in any direction. 444 attributes are represented as a linked 445 list of AttVal nodes which hold the 446 strings for attribute/value pairs. 447 */ 448 Node* NewNode( Lexer* lexer ); 449 450 451 /* used to clone heading nodes when split by an <HR> */ 452 Node *CloneNode( TidyDocImpl* doc, Node *element ); 453 454 /* free node's attributes */ 455 void FreeAttrs( TidyDocImpl* doc, Node *node ); 456 457 /* doesn't repair attribute list linkage */ 458 void FreeAttribute( TidyDocImpl* doc, AttVal *av ); 459 460 /* detach attribute from node */ 461 void DetachAttribute( Node *node, AttVal *attr ); 462 463 /* detach attribute from node then free it 464 */ 465 void RemoveAttribute( TidyDocImpl* doc, Node *node, AttVal *attr ); 466 467 /* 468 Free document nodes by iterating through peers and recursing 469 through children. Set next to NULL before calling FreeNode() 470 to avoid freeing peer nodes. Doesn't patch up prev/next links. 471 */ 472 void FreeNode( TidyDocImpl* doc, Node *node ); 473 474 Node* TextToken( Lexer *lexer ); 475 476 /* used for creating preformatted text from Word2000 */ 477 Node *NewLineNode( Lexer *lexer ); 478 479 /* used for adding a for Word2000 */ 480 Node *NewLiteralTextNode(Lexer *lexer, ctmbstr txt ); 481 482 Node* CommentToken(Lexer *lexer); 483 Node* GetCDATA( TidyDocImpl* doc, Node *container ); 484 485 void AddByte( Lexer *lexer, tmbchar c ); 486 void AddStringLiteral( Lexer* lexer, ctmbstr str ); 487 void AddStringLiteralLen( Lexer* lexer, ctmbstr str, int len ); 488 489 /* find element */ 490 Node* FindDocType( TidyDocImpl* doc ); 491 Node* FindHTML( TidyDocImpl* doc ); 492 Node* FindHEAD( TidyDocImpl* doc ); 493 Node* FindTITLE(TidyDocImpl* doc); 494 Node* FindBody( TidyDocImpl* doc ); 495 Node* FindXmlDecl(TidyDocImpl* doc); 496 497 /* Returns containing block element, if any */ 498 Node* FindContainer( Node* node ); 499 500 /* add meta element for Tidy */ 501 Bool AddGenerator( TidyDocImpl* doc ); 502 503 /* examine <!DOCTYPE> to identify version */ 504 uint FindGivenVersion( TidyDocImpl* doc, Node* doctype ); 505 uint ApparentVersion( TidyDocImpl* doc ); 506 507 508 Bool CheckDocTypeKeyWords(Lexer *lexer, Node *doctype); 509 510 ctmbstr HTMLVersionName( TidyDocImpl* doc ); 511 ctmbstr HTMLVersionNameFromCode( uint vers, Bool isXhtml ); 512 513 Bool WarnMissingSIInEmittedDocType( TidyDocImpl* doc ); 514 515 Bool SetXHTMLDocType( TidyDocImpl* doc ); 516 517 518 /* fixup doctype if missing */ 519 Bool FixDocType( TidyDocImpl* doc ); 520 521 /* ensure XML document starts with <?xml version="1.0"?> */ 522 /* add encoding attribute if not using ASCII or UTF-8 output */ 523 Bool FixXmlDecl( TidyDocImpl* doc ); 524 525 Node* InferredTag(TidyDocImpl* doc, TidyTagId id); 526 527 Bool ExpectsContent(Node *node); 528 529 530 void UngetToken( TidyDocImpl* doc ); 531 532 533 /* 534 modes for GetToken() 535 536 MixedContent -- for elements which don't accept PCDATA 537 Preformatted -- white space preserved as is 538 IgnoreMarkup -- for CDATA elements such as script, style 539 */ 540 typedef enum 541 { 542 IgnoreWhitespace, 543 MixedContent, 544 Preformatted, 545 IgnoreMarkup, 546 CdataContent 547 } GetTokenMode; 548 549 Node* GetToken( TidyDocImpl* doc, GetTokenMode mode ); 550 551 void InitMap(void); 552 553 Bool IsValidAttrName( ctmbstr attr ); 554 555 556 /* create a new attribute */ 557 AttVal *NewAttribute(void); 558 559 /* create a new attribute with given name and value */ 560 AttVal *NewAttributeEx( TidyDocImpl* doc, ctmbstr name, ctmbstr value, 561 int delim ); 562 563 /* insert attribute at the end of attribute list of a node */ 564 void InsertAttributeAtEnd( Node *node, AttVal *av ); 565 566 /* insert attribute at the start of attribute list of a node */ 567 void InsertAttributeAtStart( Node *node, AttVal *av ); 568 569 /************************************* 570 In-line Stack functions 571 *************************************/ 572 573 574 /* duplicate attributes */ 575 AttVal* DupAttrs( TidyDocImpl* doc, AttVal* attrs ); 576 577 /* 578 push a copy of an inline node onto stack 579 but don't push if implicit or OBJECT or APPLET 580 (implicit tags are ones generated from the istack) 581 582 One issue arises with pushing inlines when 583 the tag is already pushed. For instance: 584 585 <p><em>text 586 <p><em>more text 587 588 Shouldn't be mapped to 589 590 <p><em>text</em></p> 591 <p><em><em>more text</em></em> 592 */ 593 void PushInline( TidyDocImpl* doc, Node* node ); 594 595 /* pop inline stack */ 596 void PopInline( TidyDocImpl* doc, Node* node ); 597 598 Bool IsPushed( TidyDocImpl* doc, Node* node ); 599 Bool IsPushedLast( TidyDocImpl* doc, Node *element, Node *node ); 600 601 /* 602 This has the effect of inserting "missing" inline 603 elements around the contents of blocklevel elements 604 such as P, TD, TH, DIV, PRE etc. This procedure is 605 called at the start of ParseBlock. when the inline 606 stack is not empty, as will be the case in: 607 608 <i><h1>italic heading</h1></i> 609 610 which is then treated as equivalent to 611 612 <h1><i>italic heading</i></h1> 613 614 This is implemented by setting the lexer into a mode 615 where it gets tokens from the inline stack rather than 616 from the input stream. 617 */ 618 int InlineDup( TidyDocImpl* doc, Node *node ); 619 620 /* 621 defer duplicates when entering a table or other 622 element where the inlines shouldn't be duplicated 623 */ 624 void DeferDup( TidyDocImpl* doc ); 625 Node *InsertedToken( TidyDocImpl* doc ); 626 627 #ifdef __cplusplus 628 } 629 #endif 630 631 632 #endif /* __LEXER_H__ */ 633