<?php
/**
 *  base include file for SimpleTest
 *  @package    SimpleTest
 *  @subpackage MockObjects
 *  @version    $Id$
 */

/**#@+
 * Lexer mode stack constants. They are numbered 1 to 5 in the order
 * listed and are only defined when not already present.
 */
foreach (array('LEXER_ENTER', 'LEXER_MATCHED',
                'LEXER_UNMATCHED', 'LEXER_EXIT',
                'LEXER_SPECIAL') as $i => $constant) {
    if (! defined($constant)) {
        define($constant, $i + 1);
    }
}
/**#@-*/

/**
 *    Compounded regular expression. Any of
 *    the contained patterns could match and
 *    when one does, its label is returned.
 *    @package SimpleTest
 *    @subpackage WebTester
 */
class ParallelRegex {
    var $_patterns;
    var $_labels;
    var $_regex;
    var $_case;

    /**
     *    Constructor. Starts with no patterns.
     *    @param boolean $case    True for case sensitive, false
     *                            for insensitive.
     *    @access public
     */
    function __construct($case) {
        $this->_case = $case;
        $this->_patterns = array();
        $this->_labels = array();
        $this->_regex = null;
    }

    /**
     *    Legacy class-named constructor, kept so that code
     *    calling it explicitly keeps working. Delegates to
     *    __construct().
     *    @param boolean $case    True for case sensitive.
     *    @access public
     */
    function ParallelRegex($case) {
        $this->__construct($case);
    }

    /**
     *    Adds a pattern with an optional label. The cached
     *    compound expression is discarded so that it gets
     *    rebuilt on the next match.
     *    @param string $pattern      Perl style regex, but ( and )
     *                                lose the usual meaning.
     *    @param string $label        Label of regex to be returned
     *                                on a match.
     *    @access public
     */
    function addPattern($pattern, $label = true) {
        $count = count($this->_patterns);
        $this->_patterns[$count] = $pattern;
        $this->_labels[$count] = $label;
        $this->_regex = null;
    }

    /**
     *    Attempts to match all patterns at once against
     *    a string.
     *    @param string $subject      String to match against.
     *    @param string $match        First matched portion of
     *                                subject. Set to an empty
     *                                string when nothing matches.
     *    @return boolean/string      False if there are no patterns
     *                                or nothing matched. Otherwise the
     *                                label of the matching pattern, or
     *                                true if no group captured text.
     *    @access public
     */
    function match($subject, &$match) {
        $match = '';
        if (count($this->_patterns) == 0) {
            return false;
        }
        if (! preg_match($this->_getCompoundedRegex(), $subject, $matches)) {
            return false;
        }
        $match = $matches[0];
        for ($i = 1, $count = count($matches); $i < $count; $i++) {
            // Compare against '' rather than testing truthiness, so
            // that a captured "0" still selects its label.
            if ($matches[$i] !== '') {
                return $this->_labels[$i - 1];
            }
        }
        return true;
    }

    /**
     *    Compounds the patterns into a single
     *    regular expression separated with the
     *    "or" operator. Caches the regex.
     *    Will automatically escape (, ) and / tokens.
     *    The stored patterns are left untouched, so the
     *    expression can be rebuilt safely after more
     *    patterns have been added.
     *    @return string        Compound regular expression.
     *    @access private
     */
    function _getCompoundedRegex() {
        if ($this->_regex === null) {
            $groups = array();
            foreach ($this->_patterns as $pattern) {
                // One capturing group per pattern. match() relies on
                // group N belonging to label N - 1.
                $groups[] = '(' . str_replace(
                        array('/', '(', ')'),
                        array('\/', '\(', '\)'),
                        $pattern) . ')';
            }
            $this->_regex = "/" . implode("|", $groups) . "/" . $this->_getPerlMatchingFlags();
        }
        return $this->_regex;
    }

    /**
     *    Accessor for perl regex mode flags to use.
     *    @return string       Perl regex flags.
     *    @access private
     */
    function _getPerlMatchingFlags() {
        return ($this->_case ? "msS" : "msSi");
    }
}

/**
 *    States for a stack machine.
 *    @package SimpleTest
 *    @subpackage WebTester
 */
class SimpleStateStack {
    var $_stack;

    /**
     *    Constructor. Starts in named state.
     *    @param string $start        Starting state name.
     *    @access public
     */
    function __construct($start) {
        $this->_stack = array($start);
    }

    /**
     *    Legacy class-named constructor. Delegates to
     *    __construct().
     *    @param string $start        Starting state name.
     *    @access public
     */
    function SimpleStateStack($start) {
        $this->__construct($start);
    }

    /**
     *    Accessor for current state.
     *    @return string       State.
     *    @access public
     */
    function getCurrent() {
        return $this->_stack[count($this->_stack) - 1];
    }

    /**
     *    Adds a state to the stack and sets it
     *    to be the current state.
     *    @param string $state        New state.
     *    @access public
     */
    function enter($state) {
        array_push($this->_stack, $state);
    }

    /**
     *    Leaves the current state and reverts
     *    to the previous one. The starting state
     *    is never removed.
     *    @return boolean    False if we drop off
     *                       the bottom of the list.
     *    @access public
     */
    function leave() {
        if (count($this->_stack) == 1) {
            return false;
        }
        array_pop($this->_stack);
        return true;
    }
}

/**
 *    Accepts text and breaks it into tokens.
 *    Some optimisation to make sure the
 *    content is only scanned by the PHP regex
 *    parser once. Lexer modes must not start
 *    with leading underscores.
 *    @package SimpleTest
 *    @subpackage WebTester
 */
class SimpleLexer {
    var $_regexes;
    var $_parser;
    var $_mode;
    var $_mode_handlers;
    var $_case;

    /**
     *    Sets up the lexer in case insensitive matching
     *    by default.
     *    @param SimpleSaxParser $parser  Handling strategy by
     *                                    reference.
     *    @param string $start            Starting handler.
     *    @param boolean $case            True for case sensitive.
     *    @access public
     */
    function __construct(&$parser, $start = "accept", $case = false) {
        $this->_case = $case;
        $this->_regexes = array();
        $this->_parser = &$parser;
        $this->_mode = new SimpleStateStack($start);
        $this->_mode_handlers = array($start => $start);
    }

    /**
     *    Legacy class-named constructor, kept for code that
     *    calls $this->SimpleLexer(...). Delegates to
     *    __construct().
     *    @param SimpleSaxParser $parser  Handling strategy by
     *                                    reference.
     *    @param string $start            Starting handler.
     *    @param boolean $case            True for case sensitive.
     *    @access public
     */
    function SimpleLexer(&$parser, $start = "accept", $case = false) {
        $this->__construct($parser, $start, $case);
    }

    /**
     *    Adds a token search pattern for a particular
     *    parsing mode. The pattern does not change the
     *    current mode.
     *    @param string $pattern      Perl style regex, but ( and )
     *                                lose the usual meaning.
     *    @param string $mode         Should only apply this
     *                                pattern when dealing with
     *                                this type of input.
     *    @access public
     */
    function addPattern($pattern, $mode = "accept") {
        $this->_registerPattern($pattern, $mode, true, $mode);
    }

    /**
     *    Adds a pattern that will enter a new parsing
     *    mode. Useful for entering parenthesis, strings,
     *    tags, etc.
     *    @param string $pattern      Perl style regex, but ( and )
     *                                lose the usual meaning.
     *    @param string $mode         Should only apply this
     *                                pattern when dealing with
     *                                this type of input.
     *    @param string $new_mode     Change parsing to this new
     *                                nested mode.
     *    @access public
     */
    function addEntryPattern($pattern, $mode, $new_mode) {
        $this->_registerPattern($pattern, $mode, $new_mode, $new_mode);
    }

    /**
     *    Adds a pattern that will exit the current mode
     *    and re-enter the previous one.
     *    @param string $pattern      Perl style regex, but ( and )
     *                                lose the usual meaning.
     *    @param string $mode         Mode to leave.
     *    @access public
     */
    function addExitPattern($pattern, $mode) {
        $this->_registerPattern($pattern, $mode, "__exit", $mode);
    }

    /**
     *    Adds a pattern that has a special mode. Acts as an entry
     *    and exit pattern in one go, effectively calling a special
     *    parser handler for this token only.
     *    @param string $pattern      Perl style regex, but ( and )
     *                                lose the usual meaning.
     *    @param string $mode         Should only apply this
     *                                pattern when dealing with
     *                                this type of input.
     *    @param string $special      Use this mode for this one token.
     *    @access public
     */
    function addSpecialPattern($pattern, $mode, $special) {
        $this->_registerPattern($pattern, $mode, "_$special", $special);
    }

    /**
     *    Shared implementation of the add*Pattern() methods.
     *    Creates the regex collection for the mode on first use,
     *    stores the labelled pattern, and gives the handled mode
     *    a default handler of the same name unless one has
     *    already been mapped.
     *    @param string $pattern          Perl style regex.
     *    @param string $mode             Mode the pattern applies in.
     *    @param mixed $label             Label returned on a match.
     *    @param string $handled_mode     Mode needing a default handler.
     *    @access private
     */
    function _registerPattern($pattern, $mode, $label, $handled_mode) {
        if (! isset($this->_regexes[$mode])) {
            $this->_regexes[$mode] = new ParallelRegex($this->_case);
        }
        $this->_regexes[$mode]->addPattern($pattern, $label);
        if (! isset($this->_mode_handlers[$handled_mode])) {
            $this->_mode_handlers[$handled_mode] = $handled_mode;
        }
    }

    /**
     *    Adds a mapping from a mode to another handler.
     *    @param string $mode        Mode to be remapped.
     *    @param string $handler     New target handler.
     *    @access public
     */
    function mapHandler($mode, $handler) {
        $this->_mode_handlers[$mode] = $handler;
    }

    /**
     *    Splits the page text into tokens. Will fail
     *    if the handlers report an error or if no
     *    content is consumed. If successful then each
     *    unparsed and parsed token invokes a call to the
     *    held listener.
     *    @param string $raw        Raw HTML text.
     *    @return boolean           True on success, else false.
     *    @access public
     */
    function parse($raw) {
        if (! isset($this->_parser)) {
            return false;
        }
        $length = strlen($raw);
        while (is_array($parsed = $this->_reduce($raw))) {
            list($raw, $unmatched, $matched, $mode) = $parsed;
            if (! $this->_dispatchTokens($unmatched, $matched, $mode)) {
                return false;
            }
            if ($raw === '') {
                return true;
            }
            // A match that consumed nothing would loop forever.
            if (strlen($raw) == $length) {
                return false;
            }
            $length = strlen($raw);
        }
        if (! $parsed) {
            return false;
        }
        // Whatever is left over matched no pattern at all.
        return $this->_invokeParser($raw, LEXER_UNMATCHED);
    }

    /**
     *    Sends the matched token and any leading unmatched
     *    text to the parser changing the lexer to a new
     *    mode if one is listed.
     *    @param string $unmatched    Unmatched leading portion.
     *    @param string $matched      Actual token match.
     *    @param string $mode         Mode after match. A boolean
     *                                false mode causes no change.
     *    @return boolean             False if there was any error
     *                                from the parser.
     *    @access private
     */
    function _dispatchTokens($unmatched, $matched, $mode = false) {
        if (! $this->_invokeParser($unmatched, LEXER_UNMATCHED)) {
            return false;
        }
        if (is_bool($mode)) {
            return $this->_invokeParser($matched, LEXER_MATCHED);
        }
        if ($this->_isModeEnd($mode)) {
            if (! $this->_invokeParser($matched, LEXER_EXIT)) {
                return false;
            }
            return $this->_mode->leave();
        }
        if ($this->_isSpecialMode($mode)) {
            // Enter the one-token mode, hand over the token,
            // then drop straight back out again.
            $this->_mode->enter($this->_decodeSpecial($mode));
            if (! $this->_invokeParser($matched, LEXER_SPECIAL)) {
                return false;
            }
            return $this->_mode->leave();
        }
        $this->_mode->enter($mode);
        return $this->_invokeParser($matched, LEXER_ENTER);
    }

    /**
     *    Tests to see if the new mode is actually to leave
     *    the current mode and pop an item from the matching
     *    mode stack.
     *    @param string $mode    Mode to test.
     *    @return boolean        True if this is the exit mode.
     *    @access private
     */
    function _isModeEnd($mode) {
        return ($mode === "__exit");
    }

    /**
     *    Test to see if the mode is one where this mode
     *    is entered for this token only and automatically
     *    leaves immediately afterwards.
     *    @param string $mode    Mode to test.
     *    @return boolean        True if this is a single token
     *                           mode, that is, the name starts
     *                           with an underscore.
     *    @access private
     */
    function _isSpecialMode($mode) {
        return (strncmp($mode, "_", 1) == 0);
    }

    /**
     *    Strips the magic underscore marking single token
     *    modes.
     *    @param string $mode    Mode to decode.
     *    @return string         Underlying mode name.
     *    @access private
     */
    function _decodeSpecial($mode) {
        return substr($mode, 1);
    }

    /**
     *    Calls the parser method named after the current
     *    mode. Empty content will be ignored. The lexer
     *    has a parser handler for each mode in the lexer.
     *    @param string $content        Text parsed.
     *    @param integer $is_match      Lexer event type, one of
     *                                  the LEXER_* constants.
     *    @return boolean               Result of the handler, or
     *                                  true for empty content.
     *    @access private
     */
    function _invokeParser($content, $is_match) {
        // The false check covers substr() results that are
        // false rather than an empty string.
        if (($content === '') || ($content === false)) {
            return true;
        }
        $handler = $this->_mode_handlers[$this->_mode->getCurrent()];
        return $this->_parser->$handler($content, $is_match);
    }

    /**
     *    Tries to match a chunk of text and if successful
     *    removes the recognised chunk and any leading
     *    unparsed data. Empty strings will not be matched.
     *    A mode that has no patterns registered never
     *    matches.
     *    @param string $raw         The subject to parse. This is the
     *                               content that will be eaten.
     *    @return array/boolean      Four item list of the remaining
     *                               content, the unparsed leading
     *                               content, the recognised token and
     *                               finally the action the parser is
     *                               to take. True if no match.
     *    @access private
     */
    function _reduce($raw) {
        $current = $this->_mode->getCurrent();
        if (! isset($this->_regexes[$current])) {
            return true;
        }
        if ($action = $this->_regexes[$current]->match($raw, $match)) {
            $unparsed_character_count = strpos($raw, $match);
            $unparsed = substr($raw, 0, $unparsed_character_count);
            $raw = substr($raw, $unparsed_character_count + strlen($match));
            return array($raw, $unparsed, $match, $action);
        }
        return true;
    }
}

/**
 *    Breaks HTML into SAX events.
 *    @package SimpleTest
 *    @subpackage WebTester
 */
class SimpleHtmlLexer extends SimpleLexer {

    /**
     *    Sets up the lexer with case insensitive matching
     *    and adds the HTML handlers.
     *    @param SimpleSaxParser $parser  Handling strategy by
     *                                    reference.
     *    @access public
     */
    function __construct(&$parser) {
        parent::__construct($parser, 'text');
        $this->mapHandler('text', 'acceptTextToken');
        $this->_addSkipping();
        foreach ($this->_getParsedTags() as $tag) {
            $this->_addTag($tag);
        }
        $this->_addInTagTokens();
    }

    /**
     *    Legacy class-named constructor. Delegates to
     *    __construct().
     *    @param SimpleSaxParser $parser  Handling strategy by
     *                                    reference.
     *    @access public
     */
    function SimpleHtmlLexer(&$parser) {
        $this->__construct($parser);
    }

    /**
     *    List of parsed tags. Others are ignored.
     *    @return array        List of searched for tags.
     *    @access private
     */
    function _getParsedTags() {
        return array('a', 'base', 'title', 'form', 'input', 'button', 'textarea', 'select',
                'option', 'frameset', 'frame', 'label');
    }

    /**
     *    The lexer has to skip certain sections such
     *    as server code, client code and styles.
     *    @access private
     */
    function _addSkipping() {
        $this->mapHandler('css', 'ignore');
        $this->addEntryPattern('<style', 'text', 'css');
        $this->addExitPattern('</style>', 'css');
        $this->mapHandler('js', 'ignore');
        $this->addEntryPattern('<script', 'text', 'js');
        $this->addExitPattern('</script>', 'js');
        $this->mapHandler('comment', 'ignore');
        $this->addEntryPattern('<!--', 'text', 'comment');
        $this->addExitPattern('-->', 'comment');
    }

    /**
     *    Pattern matches to start and end a tag.
     *    @param string $tag          Name of tag to scan for.
     *    @access private
     */
    function _addTag($tag) {
        $this->addSpecialPattern("</$tag>", 'text', 'acceptEndToken');
        $this->addEntryPattern("<$tag", 'text', 'tag');
    }

    /**
     *    Pattern matches to parse the inside of a tag
     *    including the attributes and their quoting.
     *    @access private
     */
    function _addInTagTokens() {
        $this->mapHandler('tag', 'acceptStartToken');
        $this->addSpecialPattern('\s+', 'tag', 'ignore');
        $this->_addAttributeTokens();
        $this->addExitPattern('/>', 'tag');
        $this->addExitPattern('>', 'tag');
    }

    /**
     *    Matches attributes that are either single quoted,
     *    double quoted or unquoted.
     *    @access private
     */
    function _addAttributeTokens() {
        $this->mapHandler('dq_attribute', 'acceptAttributeToken');
        $this->addEntryPattern('=\s*"', 'tag', 'dq_attribute');
        $this->addPattern("\\\\\"", 'dq_attribute');
        $this->addExitPattern('"', 'dq_attribute');
        $this->mapHandler('sq_attribute', 'acceptAttributeToken');
        $this->addEntryPattern("=\s*'", 'tag', 'sq_attribute');
        $this->addPattern("\\\\'", 'sq_attribute');
        $this->addExitPattern("'", 'sq_attribute');
        $this->mapHandler('uq_attribute', 'acceptAttributeToken');
        $this->addSpecialPattern('=\s*[^>\s]*', 'tag', 'uq_attribute');
    }
}

/**
 *    Converts HTML tokens into selected SAX events.
 *    @package SimpleTest
 *    @subpackage WebTester
 */
class SimpleHtmlSaxParser {
    var $_lexer;
    var $_listener;
    var $_tag;
    var $_attributes;
    var $_current_attribute;

    /**
     *    Sets the listener.
     *    @param SimpleSaxListener $listener    SAX event handler.
     *    @access public
     */
    function __construct(&$listener) {
        $this->_listener = &$listener;
        $this->_lexer = &$this->createLexer($this);
        $this->_tag = '';
        $this->_attributes = array();
        $this->_current_attribute = '';
    }

    /**
     *    Legacy class-named constructor. Delegates to
     *    __construct().
     *    @param SimpleSaxListener $listener    SAX event handler.
     *    @access public
     */
    function SimpleHtmlSaxParser(&$listener) {
        $this->__construct($listener);
    }

    /**
     *    Runs the content through the lexer which
     *    should call back to the acceptors.
     *    @param string $raw      Page text to parse.
     *    @return boolean         False if parse error.
     *    @access public
     */
    function parse($raw) {
        return $this->_lexer->parse($raw);
    }

    /**
     *    Sets up the matching lexer. Starts in 'text' mode.
     *    Invoked through $this by the constructor, so it is
     *    deliberately left as an instance method.
     *    @param SimpleSaxParser $parser    Event generator, usually $self.
     *    @return SimpleLexer               Lexer suitable for this parser.
     *    @access public
     */
    function &createLexer(&$parser) {
        $lexer = new SimpleHtmlLexer($parser);
        return $lexer;
    }

    /**
     *    Accepts a token from the tag mode. If the
     *    starting element completes then the element
     *    is dispatched and the current attributes
     *    set back to empty. The element or attribute
     *    name is converted to lower case.
     *    @param string $token     Incoming characters.
     *    @param integer $event    Lexer event type.
     *    @return boolean          False if parse error.
     *    @access public
     */
    function acceptStartToken($token, $event) {
        if ($event == LEXER_ENTER) {
            // The token is the opening "<name"; drop the bracket.
            $this->_tag = strtolower(substr($token, 1));
            return true;
        }
        if ($event == LEXER_EXIT) {
            $success = $this->_listener->startElement(
                    $this->_tag,
                    $this->_attributes);
            $this->_tag = '';
            $this->_attributes = array();
            return $success;
        }
        if ($token !== '=') {
            $this->_current_attribute = strtolower(SimpleHtmlSaxParser::decodeHtml($token));
            $this->_attributes[$this->_current_attribute] = '';
        }
        return true;
    }

    /**
     *    Accepts a token from the end tag mode.
     *    The element name is converted to lower case.
     *    @param string $token     Incoming characters.
     *    @param integer $event    Lexer event type.
     *    @return boolean          False if parse error.
     *    @access public
     */
    function acceptEndToken($token, $event) {
        if (! preg_match('/<\/(.*)>/', $token, $matches)) {
            return false;
        }
        return $this->_listener->endElement(strtolower($matches[1]));
    }

    /**
     *    Part of the tag data. The decoded text is appended
     *    to the value of the attribute currently being read.
     *    @param string $token     Incoming characters.
     *    @param integer $event    Lexer event type.
     *    @return boolean          False if parse error.
     *    @access public
     */
    function acceptAttributeToken($token, $event) {
        // Compare against '' so that an attribute literally
        // named "0" still collects its value.
        if ($this->_current_attribute !== '') {
            if ($event == LEXER_UNMATCHED) {
                $this->_attributes[$this->_current_attribute] .=
                        SimpleHtmlSaxParser::decodeHtml($token);
            }
            if ($event == LEXER_SPECIAL) {
                // Unquoted values arrive with their leading "=".
                $this->_attributes[$this->_current_attribute] .=
                        preg_replace('/^=\s*/' , '', SimpleHtmlSaxParser::decodeHtml($token));
            }
        }
        return true;
    }

    /**
     *    A character entity. Currently an empty stub that
     *    returns nothing.
     *    @param string $token    Incoming characters.
     *    @param integer $event   Lexer event type.
     *    @access public
     */
    function acceptEntityToken($token, $event) {
    }

    /**
     *    Character data between tags regarded as
     *    important.
     *    @param string $token     Incoming characters.
     *    @param integer $event    Lexer event type.
     *    @return boolean          False if parse error.
     *    @access public
     */
    function acceptTextToken($token, $event) {
        return $this->_listener->addContent($token);
    }

    /**
     *    Incoming data to be ignored.
     *    @param string $token     Incoming characters.
     *    @param integer $event    Lexer event type.
     *    @return boolean          Always true.
     *    @access public
     */
    function ignore($token, $event) {
        return true;
    }

    /**
     *    Decodes any HTML entities, including both kinds
     *    of quote.
     *    @param string $html    Incoming HTML.
     *    @return string         Outgoing plain text.
     *    @access public
     *    @static
     */
    static function decodeHtml($html) {
        return html_entity_decode($html, ENT_QUOTES);
    }

    /**
     *    Turns HTML into text browser visible text. Images
     *    are converted to their alt text and tags are suppressed.
     *    Entities are converted to their visible representation.
     *    Comments and scripts are removed even when they span
     *    several lines, and tag names are matched in any case.
     *    @param string $html        HTML to convert.
     *    @return string             Plain text.
     *    @access public
     *    @static
     */
    static function normalise($html) {
        $text = preg_replace('|<!--.*?-->|s', '', $html);
        $text = preg_replace('|<script[^>]*>.*?</script>|si', '', $text);
        $text = preg_replace('|<img[^>]*alt\s*=\s*"([^"]*)"[^>]*>|i', ' \1 ', $text);
        $text = preg_replace('|<img[^>]*alt\s*=\s*\'([^\']*)\'[^>]*>|i', ' \1 ', $text);
        $text = preg_replace('|<img[^>]*alt\s*=\s*([a-zA-Z_]+)[^>]*>|i', ' \1 ', $text);
        $text = preg_replace('|<[^>]*>|', '', $text);
        $text = SimpleHtmlSaxParser::decodeHtml($text);
        $text = preg_replace('|\s+|', ' ', $text);
        // TODO: "\xA0" is the single byte non-breaking space (&nbsp;).
        // Add a test for this.
        // NOTE(review): this strips raw \xA0 bytes, which presumably
        // assumes a single byte encoding - confirm for UTF-8 input.
        return trim(trim($text), "\xA0");
    }
}

/**
 *    SAX event handler.
 *    @package SimpleTest
 *    @subpackage WebTester
 *    @abstract
 */
class SimpleSaxListener {

    /**
     *    Sets the document to write to.
     *    @access public
     */
    function __construct() {
    }

    /**
     *    Legacy class-named constructor. Delegates to
     *    __construct().
     *    @access public
     */
    function SimpleSaxListener() {
        $this->__construct();
    }

    /**
     *    Start of element event.
     *    @param string $name        Element name.
     *    @param hash $attributes    Name value pairs.
     *                               Attributes without content
     *                               are marked as true.
     *    @return boolean            False on parse error.
     *    @access public
     */
    function startElement($name, $attributes) {
    }

    /**
     *    End of element event.
     *    @param string $name        Element name.
     *    @return boolean            False on parse error.
     *    @access public
     */
    function endElement($name) {
    }

    /**
     *    Unparsed, but relevant data.
     *    @param string $text        May include unparsed tags.
     *    @return boolean            False on parse error.
     *    @access public
     */
    function addContent($text) {
    }
}