1 #ifndef __LEXER_H__ 2 #define __LEXER_H__ 3 4 /* lexer.h -- Lexer for html parser 5 6 (c) 1998-2008 (W3C) MIT, ERCIM, Keio University 7 See tidy.h for the copyright notice. 8 9 CVS Info: 10 $Author: arnaud02 $ 11 $Date: 2008/03/22 21:06:11 $ 12 $Revision: 1.41 $ 13 14 */ 15 16 /* 17 Given an input source, it returns a sequence of tokens. 18 19 GetToken(source) gets the next token 20 UngetToken(source) provides one level undo 21 22 The tags include an attribute list: 23 24 - linked list of attribute/value nodes 25 - each node has 2 NULL-terminated strings. 26 - entities are replaced in attribute values 27 28 white space is compacted if not in preformatted mode 29 If not in preformatted mode then leading white space 30 is discarded and subsequent white space sequences 31 compacted to single space characters. 32 33 If XmlTags is no then Tag names are folded to upper 34 case and attribute names to lower case. 35 36 Not yet done: 37 - Doctype subset and marked sections 38 */ 39 40 #ifdef __cplusplus 41 extern "C" { 42 #endif 43 44 #include "forward.h" 45 46 /* lexer character types 47 */ 48 #define digit 1u 49 #define letter 2u 50 #define namechar 4u 51 #define white 8u 52 #define newline 16u 53 #define lowercase 32u 54 #define uppercase 64u 55 #define digithex 128u 56 57 58 /* node->type is one of these values 59 */ 60 typedef enum 61 { 62 RootNode, 63 DocTypeTag, 64 CommentTag, 65 ProcInsTag, 66 TextNode, 67 StartTag, 68 EndTag, 69 StartEndTag, 70 CDATATag, 71 SectionTag, 72 AspTag, 73 JsteTag, 74 PhpTag, 75 XmlDecl 76 } NodeType; 77 78 79 80 /* lexer GetToken states 81 */ 82 typedef enum 83 { 84 LEX_CONTENT, 85 LEX_GT, 86 LEX_ENDTAG, 87 LEX_STARTTAG, 88 LEX_COMMENT, 89 LEX_DOCTYPE, 90 LEX_PROCINSTR, 91 LEX_CDATA, 92 LEX_SECTION, 93 LEX_ASP, 94 LEX_JSTE, 95 LEX_PHP, 96 LEX_XMLDECL 97 } LexerState; 98 99 /* ParseDocTypeDecl state constants */ 100 typedef enum 101 { 102 DT_INTERMEDIATE, 103 DT_DOCTYPENAME, 104 DT_PUBLICSYSTEM, 105 DT_QUOTEDSTRING, 106 DT_INTSUBSET 107 } ParseDocTypeDeclState; 108 109 /* content model shortcut encoding 110 111 Descriptions are tentative. 112 */ 113 #define CM_UNKNOWN 0 114 /* Elements with no content. Map to HTML specification. */ 115 #define CM_EMPTY (1 << 0) 116 /* Elements that appear outside of "BODY". */ 117 #define CM_HTML (1 << 1) 118 /* Elements that can appear within HEAD. */ 119 #define CM_HEAD (1 << 2) 120 /* HTML "block" elements. */ 121 #define CM_BLOCK (1 << 3) 122 /* HTML "inline" elements. */ 123 #define CM_INLINE (1 << 4) 124 /* Elements that mark list item ("LI"). */ 125 #define CM_LIST (1 << 5) 126 /* Elements that mark definition list item ("DL", "DT"). */ 127 #define CM_DEFLIST (1 << 6) 128 /* Elements that can appear inside TABLE. */ 129 #define CM_TABLE (1 << 7) 130 /* Used for "THEAD", "TFOOT" or "TBODY". */ 131 #define CM_ROWGRP (1 << 8) 132 /* Used for "TD", "TH" */ 133 #define CM_ROW (1 << 9) 134 /* Elements whose content must be protected against white space movement. 135 Includes some elements that can found in forms. */ 136 #define CM_FIELD (1 << 10) 137 /* Used to avoid propagating inline emphasis inside some elements 138 such as OBJECT or APPLET. */ 139 #define CM_OBJECT (1 << 11) 140 /* Elements that allows "PARAM". */ 141 #define CM_PARAM (1 << 12) 142 /* "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */ 143 #define CM_FRAMES (1 << 13) 144 /* Heading elements (h1, h2, ...). */ 145 #define CM_HEADING (1 << 14) 146 /* Elements with an optional end tag. */ 147 #define CM_OPT (1 << 15) 148 /* Elements that use "align" attribute for vertical position. */ 149 #define CM_IMG (1 << 16) 150 /* Elements with inline and block model. Used to avoid calling InlineDup. */ 151 #define CM_MIXED (1 << 17) 152 /* Elements whose content needs to be indented only if containing one 153 CM_BLOCK element. */ 154 #define CM_NO_INDENT (1 << 18) 155 /* Elements that are obsolete (such as "dir", "menu"). */ 156 #define CM_OBSOLETE (1 << 19) 157 /* User defined elements. Used to determine how attributes wihout value 158 should be printed. */ 159 #define CM_NEW (1 << 20) 160 /* Elements that cannot be omitted. */ 161 #define CM_OMITST (1 << 21) 162 163 /* If the document uses just HTML 2.0 tags and attributes described 164 ** it as HTML 2.0 Similarly for HTML 3.2 and the 3 flavors of HTML 4.0. 165 ** If there are proprietary tags and attributes then describe it as 166 ** HTML Proprietary. If it includes the xml-lang or xmlns attributes 167 ** but is otherwise HTML 2.0, 3.2 or 4.0 then describe it as one of the 168 ** flavors of Voyager (strict, loose or frameset). 169 */ 170 171 /* unknown */ 172 #define xxxx 0u 173 174 /* W3C defined HTML/XHTML family document types */ 175 #define HT20 1u 176 #define HT32 2u 177 #define H40S 4u 178 #define H40T 8u 179 #define H40F 16u 180 #define H41S 32u 181 #define H41T 64u 182 #define H41F 128u 183 #define X10S 256u 184 #define X10T 512u 185 #define X10F 1024u 186 #define XH11 2048u 187 #define XB10 4096u 188 189 /* proprietary stuff */ 190 #define VERS_SUN 8192u 191 #define VERS_NETSCAPE 16384u 192 #define VERS_MICROSOFT 32768u 193 194 /* special flag */ 195 #define VERS_XML 65536u 196 197 /* compatibility symbols */ 198 #define VERS_UNKNOWN (xxxx) 199 #define VERS_HTML20 (HT20) 200 #define VERS_HTML32 (HT32) 201 #define VERS_HTML40_STRICT (H40S|H41S|X10S) 202 #define VERS_HTML40_LOOSE (H40T|H41T|X10T) 203 #define VERS_FRAMESET (H40F|H41F|X10F) 204 #define VERS_XHTML11 (XH11) 205 #define VERS_BASIC (XB10) 206 207 /* meta symbols */ 208 #define VERS_HTML40 (VERS_HTML40_STRICT|VERS_HTML40_LOOSE|VERS_FRAMESET) 209 #define VERS_IFRAME (VERS_HTML40_LOOSE|VERS_FRAMESET) 210 #define VERS_LOOSE (VERS_HTML20|VERS_HTML32|VERS_IFRAME) 211 #define VERS_EVENTS (VERS_HTML40|VERS_XHTML11) 212 #define VERS_FROM32 (VERS_HTML32|VERS_HTML40) 213 #define VERS_FROM40 (VERS_HTML40|VERS_XHTML11|VERS_BASIC) 214 #define VERS_XHTML (X10S|X10T|X10F|XH11|XB10) 215 216 /* all W3C defined document types */ 217 #define VERS_ALL (VERS_HTML20|VERS_HTML32|VERS_FROM40) 218 219 /* all proprietary types */ 220 #define VERS_PROPRIETARY (VERS_NETSCAPE|VERS_MICROSOFT|VERS_SUN) 221 222 /* Linked list of class names and styles 223 */ 224 struct _Style; 225 typedef struct _Style TagStyle; 226 227 struct _Style 228 { 229 tmbstr tag; 230 tmbstr tag_class; 231 tmbstr properties; 232 TagStyle *next; 233 }; 234 235 236 /* Linked list of style properties 237 */ 238 struct _StyleProp; 239 typedef struct _StyleProp StyleProp; 240 241 struct _StyleProp 242 { 243 tmbstr name; 244 tmbstr value; 245 StyleProp *next; 246 }; 247 248 249 250 251 /* Attribute/Value linked list node 252 */ 253 254 struct _AttVal 255 { 256 AttVal* next; 257 const Attribute* dict; 258 Node* asp; 259 Node* php; 260 int delim; 261 tmbstr attribute; 262 tmbstr value; 263 }; 264 265 266 267 /* 268 Mosaic handles inlines via a separate stack from other elements 269 We duplicate this to recover from inline markup errors such as: 270 271 <i>italic text 272 <p>more italic text</b> normal text 273 274 which for compatibility with Mosaic is mapped to: 275 276 <i>italic text</i> 277 <p><i>more italic text</i> normal text 278 279 Note that any inline end tag pop's the effect of the current 280 inline start tag, so that </b> pop's <i> in the above example. 281 */ 282 struct _IStack 283 { 284 IStack* next; 285 const Dict* tag; /* tag's dictionary definition */ 286 tmbstr element; /* name (NULL for text nodes) */ 287 AttVal* attributes; 288 }; 289 290 291 /* HTML/XHTML/XML Element, Comment, PI, DOCTYPE, XML Decl, 292 ** etc. etc. 293 */ 294 295 struct _Node 296 { 297 Node* parent; /* tree structure */ 298 Node* prev; 299 Node* next; 300 Node* content; 301 Node* last; 302 303 AttVal* attributes; 304 const Dict* was; /* old tag when it was changed */ 305 const Dict* tag; /* tag's dictionary definition */ 306 307 tmbstr element; /* name (NULL for text nodes) */ 308 309 uint start; /* start of span onto text array */ 310 uint end; /* end of span onto text array */ 311 NodeType type; /* TextNode, StartTag, EndTag etc. */ 312 313 uint line; /* current line of document */ 314 uint column; /* current column of document */ 315 316 Bool closed; /* true if closed by explicit end tag */ 317 Bool implicit; /* true if inferred */ 318 Bool linebreak; /* true if followed by a line break */ 319 320 #ifdef TIDY_STORE_ORIGINAL_TEXT 321 tmbstr otext; 322 #endif 323 }; 324 325 326 /* 327 The following are private to the lexer 328 Use NewLexer() to create a lexer, and 329 FreeLexer() to free it. 330 */ 331 332 struct _Lexer 333 { 334 #if 0 /* Move to TidyDocImpl */ 335 StreamIn* in; /* document content input */ 336 StreamOut* errout; /* error output stream */ 337 338 uint badAccess; /* for accessibility errors */ 339 uint badLayout; /* for bad style errors */ 340 uint badChars; /* for bad character encodings */ 341 uint badForm; /* for mismatched/mispositioned form tags */ 342 uint warnings; /* count of warnings in this document */ 343 uint errors; /* count of errors */ 344 #endif 345 346 uint lines; /* lines seen */ 347 uint columns; /* at start of current token */ 348 Bool waswhite; /* used to collapse contiguous white space */ 349 Bool pushed; /* true after token has been pushed back */ 350 Bool insertspace; /* when space is moved after end tag */ 351 Bool excludeBlocks; /* Netscape compatibility */ 352 Bool exiled; /* true if moved out of table */ 353 Bool isvoyager; /* true if xmlns attribute on html element */ 354 uint versions; /* bit vector of HTML versions */ 355 uint doctype; /* version as given by doctype (if any) */ 356 uint versionEmitted; /* version of doctype emitted */ 357 Bool bad_doctype; /* e.g. if html or PUBLIC is missing */ 358 uint txtstart; /* start of current node */ 359 uint txtend; /* end of current node */ 360 LexerState state; /* state of lexer's finite state machine */ 361 362 Node* token; /* last token returned by GetToken() */ 363 Node* itoken; /* last duplicate inline returned by GetToken() */ 364 Node* root; /* remember root node of the document */ 365 Node* parent; /* remember parent node for CDATA elements */ 366 367 Bool seenEndBody; /* true if a </body> tag has been encountered */ 368 Bool seenEndHtml; /* true if a </html> tag has been encountered */ 369 370 /* 371 Lexer character buffer 372 373 Parse tree nodes span onto this buffer 374 which contains the concatenated text 375 contents of all of the elements. 376 377 lexsize must be reset for each file. 378 */ 379 tmbstr lexbuf; /* MB character buffer */ 380 uint lexlength; /* allocated */ 381 uint lexsize; /* used */ 382 383 /* Inline stack for compatibility with Mosaic */ 384 Node* inode; /* for deferring text node */ 385 IStack* insert; /* for inferring inline tags */ 386 IStack* istack; 387 uint istacklength; /* allocated */ 388 uint istacksize; /* used */ 389 uint istackbase; /* start of frame */ 390 391 TagStyle *styles; /* used for cleaning up presentation markup */ 392 393 TidyAllocator* allocator; /* allocator */ 394 395 #if 0 396 TidyDocImpl* doc; /* Pointer back to doc for error reporting */ 397 #endif 398 }; 399 400 401 /* Lexer Functions 402 */ 403 404 /* choose what version to use for new doctype */ 405 int TY_(HTMLVersion)( TidyDocImpl* doc ); 406 407 /* everything is allowed in proprietary version of HTML */ 408 /* this is handled here rather than in the tag/attr dicts */ 409 410 void TY_(ConstrainVersion)( TidyDocImpl* doc, uint vers ); 411 412 Bool TY_(IsWhite)(uint c); 413 Bool TY_(IsDigit)(uint c); 414 Bool TY_(IsLetter)(uint c); 415 Bool TY_(IsNewline)(uint c); 416 Bool TY_(IsNamechar)(uint c); 417 Bool TY_(IsXMLLetter)(uint c); 418 Bool TY_(IsXMLNamechar)(uint c); 419 420 /* Bool IsLower(uint c); */ 421 Bool TY_(IsUpper)(uint c); 422 uint TY_(ToLower)(uint c); 423 uint TY_(ToUpper)(uint c); 424 425 Lexer* TY_(NewLexer)( TidyDocImpl* doc ); 426 void TY_(FreeLexer)( TidyDocImpl* doc ); 427 428 /* store character c as UTF-8 encoded byte stream */ 429 void TY_(AddCharToLexer)( Lexer *lexer, uint c ); 430 431 /* 432 Used for elements and text nodes 433 element name is NULL for text nodes 434 start and end are offsets into lexbuf 435 which contains the textual content of 436 all elements in the parse tree. 437 438 parent and content allow traversal 439 of the parse tree in any direction. 440 attributes are represented as a linked 441 list of AttVal nodes which hold the 442 strings for attribute/value pairs. 443 */ 444 Node* TY_(NewNode)( TidyAllocator* allocator, Lexer* lexer ); 445 446 447 /* used to clone heading nodes when split by an <HR> */ 448 Node* TY_(CloneNode)( TidyDocImpl* doc, Node *element ); 449 450 /* free node's attributes */ 451 void TY_(FreeAttrs)( TidyDocImpl* doc, Node *node ); 452 453 /* doesn't repair attribute list linkage */ 454 void TY_(FreeAttribute)( TidyDocImpl* doc, AttVal *av ); 455 456 /* detach attribute from node */ 457 void TY_(DetachAttribute)( Node *node, AttVal *attr ); 458 459 /* detach attribute from node then free it 460 */ 461 void TY_(RemoveAttribute)( TidyDocImpl* doc, Node *node, AttVal *attr ); 462 463 /* 464 Free document nodes by iterating through peers and recursing 465 through children. Set next to NULL before calling FreeNode() 466 to avoid freeing peer nodes. Doesn't patch up prev/next links. 467 */ 468 void TY_(FreeNode)( TidyDocImpl* doc, Node *node ); 469 470 Node* TY_(TextToken)( Lexer *lexer ); 471 472 /* used for creating preformatted text from Word2000 */ 473 Node* TY_(NewLineNode)( Lexer *lexer ); 474 475 /* used for adding a for Word2000 */ 476 Node* TY_(NewLiteralTextNode)(Lexer *lexer, ctmbstr txt ); 477 478 void TY_(AddStringLiteral)( Lexer* lexer, ctmbstr str ); 479 /* void AddStringLiteralLen( Lexer* lexer, ctmbstr str, int len ); */ 480 481 /* find element */ 482 Node* TY_(FindDocType)( TidyDocImpl* doc ); 483 Node* TY_(FindHTML)( TidyDocImpl* doc ); 484 Node* TY_(FindHEAD)( TidyDocImpl* doc ); 485 Node* TY_(FindTITLE)(TidyDocImpl* doc); 486 Node* TY_(FindBody)( TidyDocImpl* doc ); 487 Node* TY_(FindXmlDecl)(TidyDocImpl* doc); 488 489 /* Returns containing block element, if any */ 490 Node* TY_(FindContainer)( Node* node ); 491 492 /* add meta element for Tidy */ 493 Bool TY_(AddGenerator)( TidyDocImpl* doc ); 494 495 uint TY_(ApparentVersion)( TidyDocImpl* doc ); 496 497 ctmbstr TY_(HTMLVersionNameFromCode)( uint vers, Bool isXhtml ); 498 499 Bool TY_(WarnMissingSIInEmittedDocType)( TidyDocImpl* doc ); 500 501 Bool TY_(SetXHTMLDocType)( TidyDocImpl* doc ); 502 503 504 /* fixup doctype if missing */ 505 Bool TY_(FixDocType)( TidyDocImpl* doc ); 506 507 /* ensure XML document starts with <?xml version="1.0"?> */ 508 /* add encoding attribute if not using ASCII or UTF-8 output */ 509 Bool TY_(FixXmlDecl)( TidyDocImpl* doc ); 510 511 Node* TY_(InferredTag)(TidyDocImpl* doc, TidyTagId id); 512 513 void TY_(UngetToken)( TidyDocImpl* doc ); 514 515 516 /* 517 modes for GetToken() 518 519 MixedContent -- for elements which don't accept PCDATA 520 Preformatted -- white space preserved as is 521 IgnoreMarkup -- for CDATA elements such as script, style 522 */ 523 typedef enum 524 { 525 IgnoreWhitespace, 526 MixedContent, 527 Preformatted, 528 IgnoreMarkup, 529 CdataContent 530 } GetTokenMode; 531 532 Node* TY_(GetToken)( TidyDocImpl* doc, GetTokenMode mode ); 533 534 void TY_(InitMap)(void); 535 536 537 /* create a new attribute */ 538 AttVal* TY_(NewAttribute)( TidyDocImpl* doc ); 539 540 /* create a new attribute with given name and value */ 541 AttVal* TY_(NewAttributeEx)( TidyDocImpl* doc, ctmbstr name, ctmbstr value, 542 int delim ); 543 544 /* insert attribute at the end of attribute list of a node */ 545 void TY_(InsertAttributeAtEnd)( Node *node, AttVal *av ); 546 547 /* insert attribute at the start of attribute list of a node */ 548 void TY_(InsertAttributeAtStart)( Node *node, AttVal *av ); 549 550 /************************************* 551 In-line Stack functions 552 *************************************/ 553 554 555 /* duplicate attributes */ 556 AttVal* TY_(DupAttrs)( TidyDocImpl* doc, AttVal* attrs ); 557 558 /* 559 push a copy of an inline node onto stack 560 but don't push if implicit or OBJECT or APPLET 561 (implicit tags are ones generated from the istack) 562 563 One issue arises with pushing inlines when 564 the tag is already pushed. For instance: 565 566 <p><em>text 567 <p><em>more text 568 569 Shouldn't be mapped to 570 571 <p><em>text</em></p> 572 <p><em><em>more text</em></em> 573 */ 574 void TY_(PushInline)( TidyDocImpl* doc, Node* node ); 575 576 /* pop inline stack */ 577 void TY_(PopInline)( TidyDocImpl* doc, Node* node ); 578 579 Bool TY_(IsPushed)( TidyDocImpl* doc, Node* node ); 580 Bool TY_(IsPushedLast)( TidyDocImpl* doc, Node *element, Node *node ); 581 582 /* 583 This has the effect of inserting "missing" inline 584 elements around the contents of blocklevel elements 585 such as P, TD, TH, DIV, PRE etc. This procedure is 586 called at the start of ParseBlock. when the inline 587 stack is not empty, as will be the case in: 588 589 <i><h1>italic heading</h1></i> 590 591 which is then treated as equivalent to 592 593 <h1><i>italic heading</i></h1> 594 595 This is implemented by setting the lexer into a mode 596 where it gets tokens from the inline stack rather than 597 from the input stream. 598 */ 599 int TY_(InlineDup)( TidyDocImpl* doc, Node *node ); 600 601 /* 602 defer duplicates when entering a table or other 603 element where the inlines shouldn't be duplicated 604 */ 605 void TY_(DeferDup)( TidyDocImpl* doc ); 606 Node* TY_(InsertedToken)( TidyDocImpl* doc ); 607 608 /* stack manipulation for inline elements */ 609 Bool TY_(SwitchInline)( TidyDocImpl* doc, Node* element, Node* node ); 610 Bool TY_(InlineDup1)( TidyDocImpl* doc, Node* node, Node* element ); 611 612 #ifdef __cplusplus 613 } 614 #endif 615 616 617 #endif /* __LEXER_H__ */ 618