1 #ifndef __TAGS_H__ 2 #define __TAGS_H__ 3 4 /**************************************************************************//** 5 * @file 6 * Recognize HTML Tags. 7 * 8 * The HTML tags are stored as 8 bit ASCII strings. 9 * Use lookupw() to find a tag given a wide char string. 10 * 11 * @author HTACG, et al (consult git log) 12 * 13 * @copyright 14 * Copyright (c) 1998-2017 World Wide Web Consortium (Massachusetts 15 * Institute of Technology, European Research Consortium for Informatics 16 * and Mathematics, Keio University) and HTACG. 17 * @par 18 * All Rights Reserved. 19 * @par 20 * See `tidy.h` for the complete license. 21 * 22 * @date Additional updates: consult git log 23 * 24 ******************************************************************************/ 25 26 #include "forward.h" 27 #include "attrdict.h" 28 29 /** @addtogroup internal_api */ 30 /** @{ */ 31 32 33 /***************************************************************************//** 34 ** @defgroup tags_h HTML Tags 35 ** 36 ** This module organizes all of Tidy's HTML tag operations, such as parsing 37 ** tags, defining tags, and user-defined tags. 38 ** 39 ** @{ 40 ******************************************************************************/ 41 42 43 /** @name Basic Structures and Tag Operations. 44 ** These structures form the backbone of Tidy tag processing, and the 45 ** functions in this group provide basic operations with tags and nodes. 46 */ 47 /** @{ */ 48 49 50 /** This enumeration defines the types of user-defined tags that can be 51 ** created. 52 */ 53 typedef enum 54 { 55 tagtype_null = 0, /**< First item marker. */ 56 tagtype_empty = 1, /**< Tag is an empty element. */ 57 tagtype_inline = 2, /**< Tag is an inline element. */ 58 tagtype_block = 4, /**< Tag is a block level element. */ 59 tagtype_pre = 8 /**< Tag is a preformatted tag. */ 60 } UserTagType; 61 62 63 /** This typedef describes a function to be used to parse HTML of a Tidy tag. 64 */ 65 typedef void (Parser)( TidyDocImpl* doc, Node *node, GetTokenMode mode ); 66 67 68 /** This typedef describes a function be be used to check the attributes 69 ** of a Tidy tag. 70 */ 71 typedef void (CheckAttribs)( TidyDocImpl* doc, Node *node ); 72 73 74 /** Defines a dictionary entry for a single Tidy tag, including all of the 75 ** relevant information that it requires. 76 */ 77 struct _Dict 78 { 79 TidyTagId id; /**< Identifier for this tag. */ 80 tmbstr name; /**< The tag name. */ 81 uint versions; /**< Accumulates potential HTML versions. See TY_(ConstrainVersion). */ 82 AttrVersion const * attrvers; /**< Accumulates potential HTML versions for attributes. */ 83 uint model; /**< Indicates the relevant content models for the tag. See lexer.h; there is no enum. */ 84 Parser* parser; /**< Specifies the parser to use for this tag. */ 85 CheckAttribs* chkattrs; /**< Specifies the function to check this tag's attributes. */ 86 Dict* next; /**< Link to next tag. */ 87 }; 88 89 90 /** This enum indicates the maximum size of the has table for tag hash lookup. 91 */ 92 enum 93 { 94 ELEMENT_HASH_SIZE=178u /**< Maximum number of tags in the hash table. */ 95 }; 96 97 98 /** This structure provide hash lookup for Tidy tags. 99 */ 100 typedef struct _DictHash 101 { 102 Dict const* tag; /**< The current tag. */ 103 struct _DictHash* next; /**< The next tag. */ 104 } DictHash; 105 106 107 /** This structure consists of the lists of all tags known to Tidy. 108 */ 109 typedef struct _TidyTagImpl 110 { 111 Dict* xml_tags; /**< Placeholder for all xml tags. */ 112 Dict* declared_tag_list; /**< User-declared tags. */ 113 DictHash* hashtab[ELEMENT_HASH_SIZE]; /**< All of Tidy's built-in tags. */ 114 } TidyTagImpl; 115 116 117 /** Coordinates Config update and Tags data. 118 ** @param doc The Tidy document. 119 ** @param opt The option the tag is intended for. 120 ** @param name The name of the new tag. 121 */ 122 TY_PRIVATE void TY_(DeclareUserTag)( TidyDocImpl* doc, const TidyOptionImpl* opt, ctmbstr name ); 123 124 125 /** Interface for finding a tag by TidyTagId. 126 ** @param tid The TidyTagId to search for. 127 ** @returns An instance of a Tidy tag. 128 */ 129 TY_PRIVATE const Dict* TY_(LookupTagDef)( TidyTagId tid ); 130 131 /** Assigns the node's tag. 132 ** @param doc The Tidy document. 133 ** @param node The node to assign the tag to. 134 ** @returns Returns a bool indicating whether or not the tag was assigned. 135 */ 136 TY_PRIVATE Bool TY_(FindTag)( TidyDocImpl* doc, Node *node ); 137 138 139 /** Finds the parser function for a given node. 140 ** @param doc The Tidy document. 141 ** @param node The node to lookup. 142 ** @returns The parser for the given node. 143 */ 144 TY_PRIVATE Parser* TY_(FindParser)( TidyDocImpl* doc, Node *node ); 145 146 147 /** Defines a new user-defined tag. 148 ** @param doc The Tidy document. 149 ** @param tagType The type of user-defined tag to define. 150 ** @param name The name of the new tag. 151 */ 152 TY_PRIVATE void TY_(DefineTag)( TidyDocImpl* doc, UserTagType tagType, ctmbstr name ); 153 154 155 /** Frees user-defined tags of the given type, or all user tags in given 156 ** `tagtype_null`. 157 ** @param doc The Tidy document. 158 ** @param tagType The type of tag to free, or `tagtype_null` to free all 159 ** user-defined tags. 160 */ 161 TY_PRIVATE void TY_(FreeDeclaredTags)( TidyDocImpl* doc, UserTagType tagType ); 162 163 164 /** Initiates an iterator for a list of user-declared tags, including autonomous 165 ** custom tags detected in the document if @ref TidyUseCustomTags is not set to 166 ** **no**. 167 ** @param doc An instance of a TidyDocImp to query. 168 ** @result Returns a TidyIterator, which is a token used to represent the 169 ** current position in a list within LibTidy. 170 */ 171 TY_PRIVATE TidyIterator TY_(GetDeclaredTagList)( TidyDocImpl* doc ); 172 173 174 /** Given a valid TidyIterator initiated with TY_(GetDeclaredTagList)(), 175 ** returns a string representing a user-declared or autonomous custom tag. 176 ** @remark Specifying tagType limits the scope of the tags to one of 177 ** @ref UserTagType types. Note that autonomous custom tags (if used) 178 ** are added to one of these option types, depending on the value of 179 ** @ref TidyUseCustomTags. 180 ** @param doc The Tidy document. 181 ** @param tagType The type of tag to iterate through. 182 ** @param iter The iterator token provided initially by 183 ** TY_(GetDeclaredTagList)(). 184 ** @result A string containing the next tag. 185 */ 186 TY_PRIVATE ctmbstr TY_(GetNextDeclaredTag)( TidyDocImpl* doc, UserTagType tagType, 187 TidyIterator* iter ); 188 189 190 /** Initializes tags and tag structures for the given Tidy document. 191 ** @param doc The Tidy document. 192 */ 193 TY_PRIVATE void TY_(InitTags)( TidyDocImpl* doc ); 194 195 196 /** Frees the tags and structures used by Tidy for tags. 197 ** @param doc The Tidy document. 198 */ 199 TY_PRIVATE void TY_(FreeTags)( TidyDocImpl* doc ); 200 201 202 /** Tidy defaults to HTML5 mode. If the <!DOCTYPE ...> is found to NOT be 203 ** HTML5, then adjust the tags table to HTML4 mode. 204 ** @param doc The Tidy document. 205 */ 206 TY_PRIVATE void TY_(AdjustTags)( TidyDocImpl *doc ); 207 208 209 /** Reset the tags table back to default HTML5 mode. 210 ** @param doc The Tidy document. 211 */ 212 TY_PRIVATE void TY_(ResetTags)( TidyDocImpl *doc ); 213 214 215 /** Indicates whether or not the Tidy is procesing in HTML5 mode. 216 ** @param doc The Tidy document. 217 ** @returns Returns `yes` if processing in HTML5 mode. 218 */ 219 TY_PRIVATE Bool TY_(IsHTML5Mode)( TidyDocImpl *doc ); 220 221 222 /** @} */ 223 /** @name Parser Methods And Attribute Checker Functions for Tags 224 ** These functions define the parsers and attribute checking functions for 225 ** each of Tidy's tags. 226 */ 227 /** @{ */ 228 229 230 TY_PRIVATE Parser TY_(ParseHTML); 231 TY_PRIVATE Parser TY_(ParseHead); 232 TY_PRIVATE Parser TY_(ParseTitle); 233 TY_PRIVATE Parser TY_(ParseScript); 234 TY_PRIVATE Parser TY_(ParseFrameSet); 235 TY_PRIVATE Parser TY_(ParseNoFrames); 236 TY_PRIVATE Parser TY_(ParseBody); 237 TY_PRIVATE Parser TY_(ParsePre); 238 TY_PRIVATE Parser TY_(ParseList); 239 TY_PRIVATE Parser TY_(ParseDefList); 240 TY_PRIVATE Parser TY_(ParseBlock); 241 TY_PRIVATE Parser TY_(ParseInline); 242 TY_PRIVATE Parser TY_(ParseEmpty); 243 TY_PRIVATE Parser TY_(ParseTableTag); 244 TY_PRIVATE Parser TY_(ParseColGroup); 245 TY_PRIVATE Parser TY_(ParseRowGroup); 246 TY_PRIVATE Parser TY_(ParseRow); 247 TY_PRIVATE Parser TY_(ParseSelect); 248 TY_PRIVATE Parser TY_(ParseOptGroup); 249 TY_PRIVATE Parser TY_(ParseText); 250 TY_PRIVATE Parser TY_(ParseDatalist); 251 TY_PRIVATE Parser TY_(ParseNamespace); 252 253 TY_PRIVATE CheckAttribs TY_(CheckAttributes); 254 255 256 /** @} */ 257 /** @name Other Tag and Node Lookup Functions 258 ** These functions perform additional lookup on tags and nodes. 259 */ 260 /** @{ */ 261 262 263 /** Gets the TidyTagId of the given node. 0 == TidyTag_UNKNOWN. 264 */ 265 #define TagId(node) ((node) && (node)->tag ? (node)->tag->id : TidyTag_UNKNOWN) 266 267 268 /** Determines if the given node is of the given tag id type. 269 */ 270 #define TagIsId(node, tid) ((node) && (node)->tag && (node)->tag->id == tid) 271 272 273 /** Inquires whether or not the given node is a text node. 274 ** @param node The node being interrogated. 275 ** @returns The status of the inquiry. 276 */ 277 TY_PRIVATE Bool TY_(nodeIsText)( Node* node ); 278 279 280 /** Inquires whether or not the given node is an element node. 281 ** @param node The node being interrogated. 282 ** @returns The status of the inquiry. 283 */ 284 TY_PRIVATE Bool TY_(nodeIsElement)( Node* node ); 285 286 287 /** Inquires whether or not the given node has any text. 288 ** @param doc The Tidy document. 289 ** @param node The node being interrogated. 290 ** @returns The status of the inquiry. 291 */ 292 TY_PRIVATE Bool TY_(nodeHasText)( TidyDocImpl* doc, Node* node ); 293 294 295 /** Inquires whether the given element looks like it's an autonomous custom 296 ** element tag. 297 ** @param element A string to be checked. 298 ** @returns The status of the inquiry. 299 */ 300 TY_PRIVATE Bool TY_(elementIsAutonomousCustomFormat)( ctmbstr element ); 301 302 303 /** Inquires whether the given node looks like it's an autonomous custom 304 ** element tag. 305 ** @param node The node being interrogated. 306 ** @returns The status of the inquiry. 307 */ 308 TY_PRIVATE Bool TY_(nodeIsAutonomousCustomFormat)( Node* node ); 309 310 311 /** True if the node looks like it's an autonomous custom element tag, and 312 ** TidyCustomTags is not disabled, and we're in HTML5 mode, which are all 313 ** requirements for valid autonomous custom tags. 314 ** @param doc The Tidy document. 315 ** @param node The node being interrogated. 316 ** @returns The status of the inquiry. 317 */ 318 TY_PRIVATE Bool TY_(nodeIsAutonomousCustomTag)( TidyDocImpl* doc, Node* node ); 319 320 321 /** Does the node have the indicated content model? True if any of the bits 322 ** requested are set. 323 ** @param node The node being interrogated. 324 ** @param contentModel The content model to check against. 325 ** @returns The status of the inquiry. 326 */ 327 TY_PRIVATE Bool TY_(nodeHasCM)( Node* node, uint contentModel ); 328 329 330 /** Does the content model of the node include block? 331 ** @param node The node being interrogated. 332 ** @returns The status of the inquiry. 333 */ 334 TY_PRIVATE Bool TY_(nodeCMIsBlock)( Node* node ); 335 336 337 /** Does the content model of the node include inline? 338 ** @param node The node being interrogated. 339 ** @returns The status of the inquiry. 340 */ 341 TY_PRIVATE Bool TY_(nodeCMIsInline)( Node* node ); 342 343 344 /** Does the content model of the node include empty? 345 ** @param node The node being interrogated. 346 ** @returns The status of the inquiry. 347 */ 348 TY_PRIVATE Bool TY_(nodeCMIsEmpty)( Node* node ); 349 350 351 /** Is the node a header, such as H1, H2, ..., H6? 352 ** @param node The node being interrogated. 353 ** @returns The status of the inquiry. 354 */ 355 TY_PRIVATE Bool TY_(nodeIsHeader)( Node* node ); 356 357 358 /** Inquires as to the header level of the given node: 1, 2, ..., 6. 359 ** @param node The node being interrogated. 360 ** @returns The header level. 361 */ 362 TY_PRIVATE uint TY_(nodeHeaderLevel)( Node* node ); 363 364 365 #define nodeIsHTML( node ) TagIsId( node, TidyTag_HTML ) 366 #define nodeIsHEAD( node ) TagIsId( node, TidyTag_HEAD ) 367 #define nodeIsTITLE( node ) TagIsId( node, TidyTag_TITLE ) 368 #define nodeIsBASE( node ) TagIsId( node, TidyTag_BASE ) 369 #define nodeIsMETA( node ) TagIsId( node, TidyTag_META ) 370 #define nodeIsBODY( node ) TagIsId( node, TidyTag_BODY ) 371 #define nodeIsFRAMESET( node ) TagIsId( node, TidyTag_FRAMESET ) 372 #define nodeIsFRAME( node ) TagIsId( node, TidyTag_FRAME ) 373 #define nodeIsIFRAME( node ) TagIsId( node, TidyTag_IFRAME ) 374 #define nodeIsNOFRAMES( node ) TagIsId( node, TidyTag_NOFRAMES ) 375 #define nodeIsHR( node ) TagIsId( node, TidyTag_HR ) 376 #define nodeIsH1( node ) TagIsId( node, TidyTag_H1 ) 377 #define nodeIsH2( node ) TagIsId( node, TidyTag_H2 ) 378 #define nodeIsPRE( node ) TagIsId( node, TidyTag_PRE ) 379 #define nodeIsLISTING( node ) TagIsId( node, TidyTag_LISTING ) 380 #define nodeIsP( node ) TagIsId( node, TidyTag_P ) 381 #define nodeIsUL( node ) TagIsId( node, TidyTag_UL ) 382 #define nodeIsOL( node ) TagIsId( node, TidyTag_OL ) 383 #define nodeIsDL( node ) TagIsId( node, TidyTag_DL ) 384 #define nodeIsDIR( node ) TagIsId( node, TidyTag_DIR ) 385 #define nodeIsLI( node ) TagIsId( node, TidyTag_LI ) 386 #define nodeIsDT( node ) TagIsId( node, TidyTag_DT ) 387 #define nodeIsDD( node ) TagIsId( node, TidyTag_DD ) 388 #define nodeIsTABLE( node ) TagIsId( node, TidyTag_TABLE ) 389 #define nodeIsCAPTION( node ) TagIsId( node, TidyTag_CAPTION ) 390 #define nodeIsTD( node ) TagIsId( node, TidyTag_TD ) 391 #define nodeIsTH( node ) TagIsId( node, TidyTag_TH ) 392 #define nodeIsTR( node ) TagIsId( node, TidyTag_TR ) 393 #define nodeIsCOL( node ) TagIsId( node, TidyTag_COL ) 394 #define nodeIsCOLGROUP( node ) TagIsId( node, TidyTag_COLGROUP ) 395 #define nodeIsBR( node ) TagIsId( node, TidyTag_BR ) 396 #define nodeIsA( node ) TagIsId( node, TidyTag_A ) 397 #define nodeIsLINK( node ) TagIsId( node, TidyTag_LINK ) 398 #define nodeIsB( node ) TagIsId( node, TidyTag_B ) 399 #define nodeIsI( node ) TagIsId( node, TidyTag_I ) 400 #define nodeIsSTRONG( node ) TagIsId( node, TidyTag_STRONG ) 401 #define nodeIsEM( node ) TagIsId( node, TidyTag_EM ) 402 #define nodeIsBIG( node ) TagIsId( node, TidyTag_BIG ) 403 #define nodeIsSMALL( node ) TagIsId( node, TidyTag_SMALL ) 404 #define nodeIsPARAM( node ) TagIsId( node, TidyTag_PARAM ) 405 #define nodeIsOPTION( node ) TagIsId( node, TidyTag_OPTION ) 406 #define nodeIsOPTGROUP( node ) TagIsId( node, TidyTag_OPTGROUP ) 407 #define nodeIsIMG( node ) TagIsId( node, TidyTag_IMG ) 408 #define nodeIsMAP( node ) TagIsId( node, TidyTag_MAP ) 409 #define nodeIsAREA( node ) TagIsId( node, TidyTag_AREA ) 410 #define nodeIsNOBR( node ) TagIsId( node, TidyTag_NOBR ) 411 #define nodeIsWBR( node ) TagIsId( node, TidyTag_WBR ) 412 #define nodeIsFONT( node ) TagIsId( node, TidyTag_FONT ) 413 #define nodeIsLAYER( node ) TagIsId( node, TidyTag_LAYER ) 414 #define nodeIsSPACER( node ) TagIsId( node, TidyTag_SPACER ) 415 #define nodeIsCENTER( node ) TagIsId( node, TidyTag_CENTER ) 416 #define nodeIsSTYLE( node ) TagIsId( node, TidyTag_STYLE ) 417 #define nodeIsSCRIPT( node ) TagIsId( node, TidyTag_SCRIPT ) 418 #define nodeIsNOSCRIPT( node ) TagIsId( node, TidyTag_NOSCRIPT ) 419 #define nodeIsFORM( node ) TagIsId( node, TidyTag_FORM ) 420 #define nodeIsTEXTAREA( node ) TagIsId( node, TidyTag_TEXTAREA ) 421 #define nodeIsBLOCKQUOTE( node ) TagIsId( node, TidyTag_BLOCKQUOTE ) 422 #define nodeIsAPPLET( node ) TagIsId( node, TidyTag_APPLET ) 423 #define nodeIsOBJECT( node ) TagIsId( node, TidyTag_OBJECT ) 424 #define nodeIsDIV( node ) TagIsId( node, TidyTag_DIV ) 425 #define nodeIsSPAN( node ) TagIsId( node, TidyTag_SPAN ) 426 #define nodeIsINPUT( node ) TagIsId( node, TidyTag_INPUT ) 427 #define nodeIsQ( node ) TagIsId( node, TidyTag_Q ) 428 #define nodeIsLABEL( node ) TagIsId( node, TidyTag_LABEL ) 429 #define nodeIsH3( node ) TagIsId( node, TidyTag_H3 ) 430 #define nodeIsH4( node ) TagIsId( node, TidyTag_H4 ) 431 #define nodeIsH5( node ) TagIsId( node, TidyTag_H5 ) 432 #define nodeIsH6( node ) TagIsId( node, TidyTag_H6 ) 433 #define nodeIsADDRESS( node ) TagIsId( node, TidyTag_ADDRESS ) 434 #define nodeIsXMP( node ) TagIsId( node, TidyTag_XMP ) 435 #define nodeIsSELECT( node ) TagIsId( node, TidyTag_SELECT ) 436 #define nodeIsBLINK( node ) TagIsId( node, TidyTag_BLINK ) 437 #define nodeIsMARQUEE( node ) TagIsId( node, TidyTag_MARQUEE ) 438 #define nodeIsEMBED( node ) TagIsId( node, TidyTag_EMBED ) 439 #define nodeIsBASEFONT( node ) TagIsId( node, TidyTag_BASEFONT ) 440 #define nodeIsISINDEX( node ) TagIsId( node, TidyTag_ISINDEX ) 441 #define nodeIsS( node ) TagIsId( node, TidyTag_S ) 442 #define nodeIsSTRIKE( node ) TagIsId( node, TidyTag_STRIKE ) 443 #define nodeIsSUB( node ) TagIsId( node, TidyTag_SUB ) 444 #define nodeIsSUP( node ) TagIsId( node, TidyTag_SUP ) 445 #define nodeIsU( node ) TagIsId( node, TidyTag_U ) 446 #define nodeIsMENU( node ) TagIsId( node, TidyTag_MENU ) 447 #define nodeIsMAIN( node ) TagIsId( node, TidyTag_MAIN ) 448 #define nodeIsBUTTON( node ) TagIsId( node, TidyTag_BUTTON ) 449 #define nodeIsCANVAS( node ) TagIsId( node, TidyTag_CANVAS ) 450 #define nodeIsPROGRESS( node ) TagIsId( node, TidyTag_PROGRESS ) 451 452 #define nodeIsINS( node ) TagIsId( node, TidyTag_INS ) 453 #define nodeIsDEL( node ) TagIsId( node, TidyTag_DEL ) 454 455 #define nodeIsSVG( node ) TagIsId( node, TidyTag_SVG ) 456 457 /* HTML5 */ 458 #define nodeIsDATALIST( node ) TagIsId( node, TidyTag_DATALIST ) 459 #define nodeIsDATA( node ) TagIsId( node, TidyTag_DATA ) 460 #define nodeIsMATHML( node ) TagIsId( node, TidyTag_MATHML ) /* #130 MathML attr and entity fix! */ 461 462 /* NOT in HTML 5 */ 463 #define nodeIsACRONYM( node ) TagIsId( node, TidyTag_ACRONYM ) 464 #define nodesIsFRAME( node ) TagIsId( node, TidyTag_FRAME ) 465 #define nodeIsTT( node ) TagIsId( node, TidyTag_TT ) 466 467 468 /** @} name */ 469 /** @} tags_h group */ 470 /** @} internal_api addtogroup */ 471 472 473 #endif /* __TAGS_H__ */ 474