<?php

/* This file is part of the Markdownify project, which is under LGPL license */

namespace Markdownify;

/**
 * Pull-style HTML tokenizer.
 *
 * Assign the markup to $html, then call nextNode() until it returns false.
 * Every successful call removes one node from the front of $html and
 * describes it through the public properties ($nodeType, $node, $tagName, ...).
 */
class Parser
{
    /**
     * when true, a text node that consists of a single space is skipped
     * by nextNode(); shared by all parser instances
     *
     * @var bool
     */
    public static $skipWhitespace = true;

    /**
     * ord('a'), initialised on the first call to parseTag()
     *
     * @var int|null
     */
    public static $a_ord;

    /**
     * ord('z'), initialised on the first call to parseTag()
     *
     * @var int|null
     */
    public static $z_ord;

    /**
     * ordinals of the non-letter characters allowed in attribute names
     * (':' and '-'), initialised on the first call to parseTag()
     *
     * @var array<int>|null
     */
    public static $special_ords;

    /**
     * tags which are always empty (<br /> etc.)
     *
     * @var array<string>
     */
    public $emptyTags = array(
        'br',
        'hr',
        'input',
        'img',
        'area',
        'link',
        'meta',
        'param',
    );

    /**
     * tags with preformatted text
     * whitespace won't be touched in them
     *
     * @var array<string>
     */
    public $preformattedTags = array(
        'script',
        'style',
        'pre',
        'code',
    );

    /**
     * suppress HTML tags inside <code> (they are then treated as invalid
     * tags and turned into text)
     *
     * @var bool
     */
    public $noTagsInCode = false;

    /**
     * html to be parsed; nextNode() consumes it from the front
     *
     * @var string
     */
    public $html = '';

    /**
     * node type:
     *
     * - tag (see isStartTag)
     * - text (includes cdata)
     * - comment
     * - doctype
     * - pi (processing instruction)
     *
     * @var string
     */
    public $nodeType = '';

    /**
     * current node content, i.e. either a
     * simple string (text node), or something like
     * <tag attrib="value"...>
     *
     * @var string
     */
    public $node = '';

    /**
     * whether current node is an opening tag (<a>) or not (</a>)
     * set to NULL if current node is not a tag
     * NOTE: empty tags (<br />) set this to true as well!
     *
     * @var bool | null
     */
    public $isStartTag = null;

    /**
     * whether current node is an empty tag (<br />) or not (<a></a>)
     *
     * @var bool | null
     */
    public $isEmptyTag = null;

    /**
     * tag name (lower case)
     *
     * @var string | null
     */
    public $tagName = '';

    /**
     * attributes of current tag
     *
     * @var array (attribName=>value) | null
     */
    public $tagAttributes = null;

    /**
     * whether or not the current context is an inline context:
     * true after a text node or an inline tag, false after a block tag
     *
     * @var bool | null
     */
    public $isInlineContext = null;

    /**
     * whether the current tag is a block element
     *
     * @var bool | null
     */
    public $isBlockElement = null;

    /**
     * whether the current tag is a start tag that was encountered while
     * the parser was in an inline context
     *
     * @var bool | null
     */
    public $isNextToInlineContext = null;

    /**
     * number of currently open preformatted tags; whitespace is kept
     * untouched while this is non-zero
     *
     * @var int
     */
    public $keepWhitespace = 0;

    /**
     * list of open tags
     * count this to get current depth
     *
     * @var array
     */
    public $openTags = array();

    /**
     * list of known elements; tags that are not listed here are treated
     * as invalid and end up as text
     *
     * @var array
     * TODO: what shall we do with <del> and <ins> ?! (currently inline)
     */
    public $blockElements = array(
        // tag name => <bool> is block
        // block elements
        'address' => true,
        'blockquote' => true,
        'center' => true,
        'dir' => true,
        'div' => true,
        'dl' => true,
        'fieldset' => true,
        'form' => true,
        'h1' => true,
        'h2' => true,
        'h3' => true,
        'h4' => true,
        'h5' => true,
        'h6' => true,
        'hr' => true,
        'isindex' => true,
        'menu' => true,
        'noframes' => true,
        'noscript' => true,
        'ol' => true,
        'p' => true,
        'pre' => true,
        'table' => true,
        'ul' => true,
        // set table elements and list items to block as well
        'thead' => true,
        'tbody' => true,
        'tfoot' => true,
        'td' => true,
        'tr' => true,
        'th' => true,
        'li' => true,
        'dd' => true,
        'dt' => true,
        // header items and html / body as well
        'html' => true,
        'body' => true,
        'head' => true,
        'meta' => true,
        'link' => true,
        'style' => true,
        'title' => true,
        // unfancy media tags, when indented should be rendered as block
        'map' => true,
        'object' => true,
        'param' => true,
        'embed' => true,
        'area' => true,
        // inline elements
        'a' => false,
        'abbr' => false,
        'acronym' => false,
        'applet' => false,
        'b' => false,
        'basefont' => false,
        'bdo' => false,
        'big' => false,
        'br' => false,
        'button' => false,
        'cite' => false,
        'code' => false,
        'del' => false,
        'dfn' => false,
        'em' => false,
        'font' => false,
        'i' => false,
        'img' => false,
        'ins' => false,
        'input' => false,
        'iframe' => false,
        'kbd' => false,
        'label' => false,
        'q' => false,
        'samp' => false,
        'script' => false,
        'select' => false,
        'small' => false,
        'span' => false,
        'strong' => false,
        'sub' => false,
        'sup' => false,
        'textarea' => false,
        'tt' => false,
        'var' => false,
    );

    /**
     * get next node, set $this->html prior!
     *
     * @return bool false once $this->html has been consumed completely
     */
    public function nextNode()
    {
        // compare against '' explicitly: empty() would also be true for
        // the remaining input "0" and silently drop it
        if ((string) $this->html === '') {
            // we are done with parsing the html string

            return false;
        }

        // the start tag returned by the previous call is registered as
        // open only now, when its content is about to be read
        if ($this->isStartTag && !$this->isEmptyTag) {
            array_push($this->openTags, $this->tagName);
            if (in_array($this->tagName, $this->preformattedTags, true)) {
                // dont truncate whitespaces for <code> or <pre> contents
                $this->keepWhitespace++;
            }
        }

        if ($this->html[0] === '<') {
            $token = substr($this->html, 0, 9);
            if (substr($token, 0, 2) === '<?') {
                // xml prolog or other pi's
                /** TODO **/
                $this->setNode('pi', $this->endOfConstruct(array('>')));

                return true;
            }
            if (substr($token, 0, 4) === '<!--') {
                // comment; if there is no closing -->, use the next gt
                // instead (this is firefox' behaviour)
                $this->setNode('comment', $this->endOfConstruct(array('-->', '>')));

                static::$skipWhitespace = true;

                return true;
            }
            if (strcasecmp($token, '<!DOCTYPE') === 0) {
                // doctype (the keyword is case-insensitive in HTML)
                $this->setNode('doctype', $this->endOfConstruct(array('>')));

                static::$skipWhitespace = true;

                return true;
            }
            if ($token === '<![CDATA[') {
                // cdata, use text node

                // remove leading <![CDATA[
                $this->html = (string) substr($this->html, 9);

                $end = strpos($this->html, ']]>');
                if ($end === false) {
                    // unterminated section: everything left is its content
                    $this->setNode('text', strlen($this->html));
                } else {
                    $this->setNode('text', $end + 3);
                    // remove trailing ]]>
                    $this->node = (string) substr($this->node, 0, -3);
                }
                $this->handleWhitespaces();

                static::$skipWhitespace = true;

                return true;
            }
            if ($this->parseTag()) {
                // seems to be a tag
                // whitespace after a block element is insignificant
                static::$skipWhitespace = (bool) $this->isBlockElement;

                return true;
            }
        }
        if ($this->keepWhitespace) {
            static::$skipWhitespace = false;
        }
        // when we get here it seems to be a text node
        $pos = strpos($this->html, '<');
        if ($pos === false) {
            $pos = strlen($this->html);
        }
        $this->setNode('text', $pos);
        $this->handleWhitespaces();
        if (static::$skipWhitespace && $this->node === ' ') {
            // insignificant whitespace, move on to the next node
            return $this->nextNode();
        }
        $this->isInlineContext = true;
        static::$skipWhitespace = false;

        return true;
    }

    /**
     * parse tag, set tag name and attributes, see if it's a closing tag and so forth...
     *
     * On failure the leading "<" of $this->html is escaped (see invalidTag())
     * so that the caller can treat the input as text.
     *
     * @return bool true if a valid tag was consumed from $this->html
     */
    protected function parseTag()
    {
        if (!isset(static::$a_ord)) {
            static::$a_ord = ord('a');
            static::$z_ord = ord('z');
            static::$special_ords = array(
                ord(':'), // for xml:lang
                ord('-'), // for http-equiv
            );
        }

        $length = strlen($this->html);
        $tagName = '';

        $pos = 1;
        // substr() instead of an offset read: the input may be a lone "<"
        $isStartTag = substr($this->html, $pos, 1) !== '/';
        if (!$isStartTag) {
            $pos++;
        }
        // get tagName: letters, and digits once the name has started (h1)
        while ($pos < $length) {
            $char = $this->html[$pos];
            $pos_ord = ord(strtolower($char));
            $isLetter = $pos_ord >= static::$a_ord && $pos_ord <= static::$z_ord;
            if (!$isLetter && !($tagName !== '' && is_numeric($char))) {
                break;
            }
            $tagName .= $char;
            $pos++;
        }
        // step back onto the last character that belongs to the name
        $pos--;

        $tagName = strtolower($tagName);
        if ($tagName === '' || !isset($this->blockElements[$tagName])) {
            // something went wrong => invalid tag
            $this->invalidTag();

            return false;
        }
        if ($this->noTagsInCode && end($this->openTags) === 'code' && !($tagName === 'code' && !$isStartTag)) {
            // we suppress all HTML tags inside code tags
            $this->invalidTag();

            return false;
        }

        // get tag attributes
        /** TODO: in html 4 attributes do not need to be quoted **/
        $isEmptyTag = false;
        $attributes = array();
        $currAttrib = '';
        // set once whitespace has been seen after an attribute name, so that
        // "<input disabled type=...>" is not read as one name "disabledtype"
        $nameComplete = false;
        while ($pos + 1 < $length) {
            $pos++;
            $char = $this->html[$pos];
            $pair = substr($this->html, $pos, 2);
            // close tag
            if ($char === '>' || $pair === '/>') {
                if ($char === '/') {
                    $isEmptyTag = true;
                    $pos++;
                }
                break;
            }

            $pos_ord = ord(strtolower($char));
            $isLetter = $pos_ord >= static::$a_ord && $pos_ord <= static::$z_ord;
            if ($isLetter || in_array($pos_ord, static::$special_ords, true)) {
                // attribute name
                if ($nameComplete && $currAttrib !== '') {
                    // the previous name had no value (<option selected ...>)
                    $attributes[$currAttrib] = $currAttrib;
                    $currAttrib = '';
                }
                $nameComplete = false;
                $currAttrib .= $char;
            } elseif ($char === ' ' || $char === "\t" || $char === "\n" || $char === "\r") {
                // drop whitespace
                $nameComplete = true;
            } elseif ($pair === '="' || $pair === "='") {
                // get attribute value
                $pos++;
                $await = $this->html[$pos]; // single or double quote
                $pos++;
                $value = '';
                while ($pos < $length && $this->html[$pos] !== $await) {
                    $value .= $this->html[$pos];
                    $pos++;
                }
                $attributes[$currAttrib] = $value;
                $currAttrib = '';
                $nameComplete = false;
            } else {
                $this->invalidTag();

                return false;
            }
        }
        if ($pos >= $length || $this->html[$pos] !== '>') {
            // input ended before the tag was closed
            $this->invalidTag();

            return false;
        }

        if ($currAttrib !== '') {
            // html 4 allows something like <option selected> instead of <option selected="selected">
            $attributes[$currAttrib] = $currAttrib;
        }
        if (!$isStartTag) {
            if (!empty($attributes) || $tagName !== end($this->openTags)) {
                // end tags must not contain any attributes
                // or maybe we did not expect a different tag to be closed
                $this->invalidTag();

                return false;
            }
            array_pop($this->openTags);
            if (in_array($tagName, $this->preformattedTags, true)) {
                $this->keepWhitespace--;
            }
        }
        $pos++;
        $this->node = (string) substr($this->html, 0, $pos);
        $this->html = (string) substr($this->html, $pos);
        $this->tagName = $tagName;
        $this->tagAttributes = $attributes;
        $this->isStartTag = $isStartTag;
        $this->isEmptyTag = $isEmptyTag || in_array($tagName, $this->emptyTags, true);
        if ($this->isEmptyTag) {
            // might be not well formed
            $this->node = preg_replace('# */? *>$#', ' />', $this->node);
        }
        $this->nodeType = 'tag';
        $this->isBlockElement = $this->blockElements[$tagName];
        // must be computed before isInlineContext is overwritten below
        $this->isNextToInlineContext = $isStartTag && $this->isInlineContext;
        $this->isInlineContext = !$this->isBlockElement;

        return true;
    }

    /**
     * handle invalid tags: escape the leading "<" of $this->html so that
     * the remaining input is picked up as text instead of being re-parsed
     * as a tag over and over again
     *
     * @return void
     */
    protected function invalidTag()
    {
        $this->html = substr_replace($this->html, '&lt;', 0, 1);
    }

    /**
     * update all vars and make $this->html shorter
     *
     * @param string $type see description for $this->nodeType
     * @param int $pos to which position shall we cut?
     * @return void
     */
    protected function setNode($type, $pos)
    {
        if ($this->nodeType === 'tag') {
            // set tag specific vars to null
            // $type == tag should not be called here
            // see this::parseTag() for more
            $this->tagName = null;
            $this->tagAttributes = null;
            $this->isStartTag = null;
            $this->isEmptyTag = null;
            $this->isBlockElement = null;
        }
        $this->nodeType = $type;
        // the casts keep both values strings when $pos is the end of input
        $this->node = (string) substr($this->html, 0, $pos);
        $this->html = (string) substr($this->html, $pos);
    }

    /**
     * check if $this->html begins with $str
     *
     * @param string $str
     * @return bool
     */
    protected function match($str)
    {
        // byte-wise comparison; a loose == would treat numeric strings
        // such as "1e1" and "10" as equal
        return strncmp($this->html, $str, strlen($str)) === 0;
    }

    /**
     * truncate whitespaces of the current node, unless a preformatted
     * tag is open
     *
     * @return void
     */
    protected function handleWhitespaces()
    {
        if ($this->keepWhitespace) {
            // <pre> or <code> before...

            return;
        }
        // truncate multiple whitespaces to a single one
        $this->node = preg_replace('#\s+#s', ' ', $this->node);
    }

    /**
     * normalize self::node: rebuild it from tagName, tagAttributes,
     * isStartTag and isEmptyTag
     *
     * @return void
     */
    protected function normalizeNode()
    {
        $this->node = '<';
        if (!$this->isStartTag) {
            $this->node .= '/' . $this->tagName . '>';

            return;
        }
        $this->node .= $this->tagName;
        if (is_array($this->tagAttributes)) {
            foreach ($this->tagAttributes as $name => $value) {
                // double quotes would end the attribute value prematurely
                $this->node .= ' ' . $name . '="' . str_replace('"', '&quot;', $value) . '"';
            }
        }
        if ($this->isEmptyTag) {
            $this->node .= ' /';
        }
        $this->node .= '>';
    }

    /**
     * offset just behind the first terminator that occurs in $this->html
     *
     * The terminators are tried in the given order; if none of them is
     * found the construct is unterminated and runs to the end of the input.
     *
     * @param array<string> $terminators
     * @return int
     */
    private function endOfConstruct(array $terminators)
    {
        foreach ($terminators as $terminator) {
            $pos = strpos($this->html, $terminator);
            if ($pos !== false) {
                return $pos + strlen($terminator);
            }
        }

        return strlen($this->html);
    }
}