<?php
/* vim: set expandtab tabstop=4 shiftwidth=4: */
//
// +----------------------------------------------------------------------+
// | PHP Version 4                                                        |
// +----------------------------------------------------------------------+
// | Copyright (c) 1997-2002 The PHP Group                                |
// +----------------------------------------------------------------------+
// | This source file is subject to version 2.02 of the PHP license,      |
// | that is bundled with this package in the file LICENSE, and is        |
// | available at through the world-wide-web at                           |
// | http://www.php.net/license/3_0.txt.                                  |
// | If you did not receive a copy of the PHP license and are unable to   |
// | obtain it through the world-wide-web, please send a note to          |
// | license@php.net so we can mail you a copy immediately.               |
// +----------------------------------------------------------------------+
// | Authors: Alexander Zhukov <alex@veresk.ru> Original port from Python |
// | Authors: Harry Fuecks <hfuecks@phppatterns.com> Port to PEAR + more  |
// | Authors: Many @ Sitepointforums Advanced PHP Forums                  |
// +----------------------------------------------------------------------+
//
// $Id: XML_HTMLSax.php,v 1.15 2003/12/04 23:35:18 harryf Exp $
//
/**
* Main parser components
* @package XML_HTMLSax
* @version $Id: XML_HTMLSax.php,v 1.15 2003/12/04 23:35:18 harryf Exp $
*/
/**
* Required classes
*/
require_once('PEAR.php');
if (!defined('XML_HTMLSAX')) {
    define('XML_HTMLSAX', 'XML/');
}
require_once(XML_HTMLSAX . 'HTMLSax/XML_HTMLSax_States.php');
require_once(XML_HTMLSAX . 'HTMLSax/XML_HTMLSax_Decorators.php');
/**
* Base State Parser
* @package XML_HTMLSax
* @access protected
* @abstract
*/
class XML_HTMLSax_StateParser {
    /**
    * Instance of user front end class to be passed to callbacks
    * @var XML_HTMLSax
    * @access private
    */
    var $htmlsax;
    /**
    * User defined object for handling elements
    * @var object
    * @access private
    */
    var $handler_object_element;
    /**
    * User defined open tag handler method
    * @var string
    * @access private
    */
    var $handler_method_opening;
    /**
    * User defined close tag handler method
    * @var string
    * @access private
    */
    var $handler_method_closing;
    /**
    * User defined object for handling data in elements
    * @var object
    * @access private
    */
    var $handler_object_data;
    /**
    * User defined data handler method
    * @var string
    * @access private
    */
    var $handler_method_data;
    /**
    * User defined object for handling processing instructions
    * @var object
    * @access private
    */
    var $handler_object_pi;
    /**
    * User defined processing instruction handler method
    * @var string
    * @access private
    */
    var $handler_method_pi;
    /**
    * User defined object for handling JSP/ASP tags
    * @var object
    * @access private
    */
    var $handler_object_jasp;
    /**
    * User defined JSP/ASP handler method
    * @var string
    * @access private
    */
    var $handler_method_jasp;
    /**
    * User defined object for handling XML escapes
    * @var object
    * @access private
    */
    var $handler_object_escape;
    /**
    * User defined XML escape handler method
    * @var string
    * @access private
    */
    var $handler_method_escape;
    /**
    * User defined handler object or NullHandler
    * @var object
    * @access private
    */
    var $handler_default;
    /**
    * Parser options determining parsing behavior
    * @var array
    * @access private
    */
    var $parser_options = array();
    /**
    * XML document being parsed
    * @var string
    * @access private
    */
    var $rawtext;
    /**
    * Position in XML document relative to start (0)
    * @var int
    * @access private
    */
    var $position;
    /**
    * Length of the XML document as reported by strlen()
    * @var int
    * @access private
    */
    var $length;
    /**
    * Array of state objects, indexed by the XML_HTMLSAX_STATE_* constants
    * @var array
    * @access private
    */
    var $State = array();

    /**
    * Constructs XML_HTMLSax_StateParser setting up states
    * @param XML_HTMLSax instance of user front end class
    * @access protected
    */
    function XML_HTMLSax_StateParser(& $htmlsax) {
        $this->htmlsax = & $htmlsax;
        $this->State[XML_HTMLSAX_STATE_START] =& new XML_HTMLSax_StartingState();

        $this->State[XML_HTMLSAX_STATE_CLOSING_TAG] =& new XML_HTMLSax_ClosingTagState();
        $this->State[XML_HTMLSAX_STATE_TAG] =& new XML_HTMLSax_TagState();
        $this->State[XML_HTMLSAX_STATE_OPENING_TAG] =& new XML_HTMLSax_OpeningTagState();

        $this->State[XML_HTMLSAX_STATE_PI] =& new XML_HTMLSax_PiState();
        $this->State[XML_HTMLSAX_STATE_JASP] =& new XML_HTMLSax_JaspState();
        $this->State[XML_HTMLSAX_STATE_ESCAPE] =& new XML_HTMLSax_EscapeState();
    }

    /**
    * Moves the position back one character
    * @access protected
    * @return void
    */
    function unscanCharacter() {
        $this->position -= 1;
    }

    /**
    * Moves the position forward one character
    * @access protected
    * @return void
    */
    function ignoreCharacter() {
        $this->position += 1;
    }

    /**
    * Returns the next character from the XML document and advances the
    * position by one. Returns nothing (NULL) once the end of the document
    * has been reached.
    * @access protected
    * @return mixed
    */
    function scanCharacter() {
        if ($this->position < $this->length) {
            return $this->rawtext{$this->position++};
        }
    }

    /**
    * Returns a string from the current position to the next occurrence
    * of the supplied string. The position is left at the start of the
    * match, or at the end of the document if the string is not found.
    * @param string string to search until
    * @access protected
    * @return string
    */
    function scanUntilString($string) {
        $start = $this->position;
        $this->position = strpos($this->rawtext, $string, $start);
        // strpos() yields FALSE when there is no match: consume the rest
        if ($this->position === FALSE) {
            $this->position = $this->length;
        }
        return substr($this->rawtext, $start, $this->position - $start);
    }

    /**
    * Returns a string from the current position until the first instance of
    * one of the characters in the supplied string argument
    * @param string string to search until
    * @access protected
    * @return string
    * @abstract
    */
    function scanUntilCharacters($string) {}

    /**
    * Moves the position forward past any whitespace characters
    * @access protected
    * @return void
    * @abstract
    */
    function ignoreWhitespace() {}

    /**
    * Begins the parsing operation, setting up any decorators, depending on
    * parse options invoking _parse() to execute parsing
    *
    * NOTE(review): the decorators are installed on every call, so calling
    * parse() more than once without re-registering the handlers wraps the
    * already decorated handler again.
    * @param string XML document to parse
    * @access protected
    * @return void
    */
    function parse($data) {
        if ($this->parser_options['XML_OPTION_TRIM_DATA_NODES']==1) {
            $decorator =& new XML_HTMLSax_Trim(
                $this->handler_object_data,
                $this->handler_method_data);
            $this->handler_object_data =& $decorator;
            $this->handler_method_data = 'trimData';
        }
        if ($this->parser_options['XML_OPTION_CASE_FOLDING']==1) {
            $open_decor =& new XML_HTMLSax_CaseFolding(
                $this->handler_object_element,
                $this->handler_method_opening,
                $this->handler_method_closing);
            $this->handler_object_element =& $open_decor;
            $this->handler_method_opening ='foldOpen';
            $this->handler_method_closing ='foldClose';
        }
        // Each "=& new" below rebinds $decorator to a fresh object instead
        // of overwriting the decorator installed by an earlier option.
        if ($this->parser_options['XML_OPTION_LINEFEED_BREAK']==1) {
            $decorator =& new XML_HTMLSax_Linefeed(
                $this->handler_object_data,
                $this->handler_method_data);
            $this->handler_object_data =& $decorator;
            $this->handler_method_data = 'breakData';
        }
        if ($this->parser_options['XML_OPTION_TAB_BREAK']==1) {
            $decorator =& new XML_HTMLSax_Tab(
                $this->handler_object_data,
                $this->handler_method_data);
            $this->handler_object_data =& $decorator;
            $this->handler_method_data = 'breakData';
        }
        if ($this->parser_options['XML_OPTION_ENTITIES_UNPARSED']==1) {
            $decorator =& new XML_HTMLSax_Entities_Unparsed(
                $this->handler_object_data,
                $this->handler_method_data);
            $this->handler_object_data =& $decorator;
            $this->handler_method_data = 'breakData';
        }
        if ($this->parser_options['XML_OPTION_ENTITIES_PARSED']==1) {
            $decorator =& new XML_HTMLSax_Entities_Parsed(
                $this->handler_object_data,
                $this->handler_method_data);
            $this->handler_object_data =& $decorator;
            $this->handler_method_data = 'breakData';
        }
        $this->rawtext = $data;
        $this->length = strlen($data);
        $this->position = 0;
        $this->_parse();
    }

    /**
    * Performs the parsing itself, delegating calls to a specific parser
    * state. Each state's parse() returns the key of the next state; the
    * loop ends when XML_HTMLSAX_STATE_STOP is returned or the position
    * reaches the end of the document.
    * @param constant state object to parse with
    * @access protected
    * @return void
    */
    function _parse($state = XML_HTMLSAX_STATE_START) {
        do {
            $state = $this->State[$state]->parse($this);
        } while ($state != XML_HTMLSAX_STATE_STOP &&
                    $this->position < $this->length);
    }
}

/**
* Parser for PHP Versions below 4.3.0. Uses a slower parsing mechanism than
* the equivalent PHP 4.3.0+ subclass of StateParser
* @package XML_HTMLSax
* @access protected
* @see XML_HTMLSax_StateParser_Gtet430
*/
class XML_HTMLSax_StateParser_Lt430 extends XML_HTMLSax_StateParser {
    /**
    * Constructs XML_HTMLSax_StateParser_Lt430 defining available
    * parser options
    * @param XML_HTMLSax instance of user front end class
    * @access protected
    */
    function XML_HTMLSax_StateParser_Lt430(& $htmlsax) {
        parent::XML_HTMLSax_StateParser($htmlsax);
        $this->parser_options['XML_OPTION_TRIM_DATA_NODES'] = 0;
        $this->parser_options['XML_OPTION_CASE_FOLDING'] = 0;
        $this->parser_options['XML_OPTION_LINEFEED_BREAK'] = 0;
        $this->parser_options['XML_OPTION_TAB_BREAK'] = 0;
        $this->parser_options['XML_OPTION_ENTITIES_PARSED'] = 0;
        $this->parser_options['XML_OPTION_ENTITIES_UNPARSED'] = 0;
        $this->parser_options['XML_OPTION_FULL_ESCAPES'] = 0;
    }

    /**
    * Returns a string from the current position until the first instance of
    * one of the characters in the supplied string argument
    * @param string string to search until
    * @access protected
    * @return string
    */
    function scanUntilCharacters($string) {
        $startpos = $this->position;
        // Advance one character at a time until a stop character is found
        while ($this->position < $this->length && strpos($string, $this->rawtext{$this->position}) === FALSE) {
            $this->position++;
        }
        return substr($this->rawtext, $startpos, $this->position - $startpos);
    }

    /**
    * Moves the position forward past any whitespace characters
    * (space, linefeed, carriage return, tab)
    * @access protected
    * @return void
    */
    function ignoreWhitespace() {
        while ($this->position < $this->length &&
            strpos(" \n\r\t", $this->rawtext{$this->position}) !== FALSE) {
            $this->position++;
        }
    }

    /**
    * Begins the parsing operation by delegating directly to the parent
    * implementation
    * @param string XML document to parse
    * @access protected
    * @return void
    */
    function parse($data) {
        parent::parse($data);
    }
}

/**
* Parser for PHP Versions equal to or greater than 4.3.0. Uses a faster
* parsing mechanism than the equivalent PHP < 4.3.0 subclass of StateParser
* @package XML_HTMLSax
* @access protected
* @see XML_HTMLSax_StateParser_Lt430
*/
class XML_HTMLSax_StateParser_Gtet430 extends XML_HTMLSax_StateParser {
    /**
    * Constructs XML_HTMLSax_StateParser_Gtet430 defining available
    * parser options
    * @param XML_HTMLSax instance of user front end class
    * @access protected
    */
    function XML_HTMLSax_StateParser_Gtet430(& $htmlsax) {
        parent::XML_HTMLSax_StateParser($htmlsax);
        $this->parser_options['XML_OPTION_TRIM_DATA_NODES'] = 0;
        $this->parser_options['XML_OPTION_CASE_FOLDING'] = 0;
        $this->parser_options['XML_OPTION_LINEFEED_BREAK'] = 0;
        $this->parser_options['XML_OPTION_TAB_BREAK'] = 0;
        $this->parser_options['XML_OPTION_ENTITIES_PARSED'] = 0;
        $this->parser_options['XML_OPTION_ENTITIES_UNPARSED'] = 0;
        $this->parser_options['XML_OPTION_FULL_ESCAPES'] = 0;
    }

    /**
    * Returns a string from the current position until the first instance of
    * one of the characters in the supplied string argument.
    * Relies on the three argument form of strcspn().
    * @param string string to search until
    * @access protected
    * @return string
    */
    function scanUntilCharacters($string) {
        $startpos = $this->position;
        $length = strcspn($this->rawtext, $string, $startpos);
        $this->position += $length;
        return substr($this->rawtext, $startpos, $length);
    }

    /**
    * Moves the position forward past any whitespace characters
    * (space, linefeed, carriage return, tab)
    * @access protected
    * @return void
    */
    function ignoreWhitespace() {
        $this->position += strspn($this->rawtext, " \n\r\t", $this->position);
    }

    /**
    * Begins the parsing operation by delegating directly to the parent
    * implementation
    * @param string XML document to parse
    * @access protected
    * @return void
    */
    function parse($data) {
        parent::parse($data);
    }
}

/**
* Default NullHandler for methods which were not set by user
* @package XML_HTMLSax
* @access protected
*/
class XML_HTMLSax_NullHandler {
    /**
    * Generic handler method which does nothing
    * @access protected
    * @return void
    */
    function DoNothing() {
    }
}

/**
* User interface class. All user calls should only be made to this class
* @package XML_HTMLSax
* @access public
*/
class XML_HTMLSax extends PEAR {
    /**
    * Instance of concrete subclass of XML_HTMLSax_StateParser
    * @var XML_HTMLSax_StateParser
    * @access private
    */
    var $state_parser;

    /**
    * Constructs XML_HTMLSax selecting concrete StateParser subclass
    * depending on PHP version being used as well as setting the default
    * NullHandler for all callbacks<br />
    * <b>Example:</b>
    * <pre>
    * $myHandler = & new MyHandler();
    * $parser = new XML_HTMLSax();
    * $parser->set_object($myHandler);
    * $parser->set_option('XML_OPTION_CASE_FOLDING');
    * $parser->set_element_handler('myOpenHandler','myCloseHandler');
    * $parser->set_data_handler('myDataHandler');
    * $parser->parse($xml);
    * </pre>
    * @access public
    */
    function XML_HTMLSax() {
        if (version_compare(phpversion(), '4.3', 'ge')) {
            $this->state_parser =& new XML_HTMLSax_StateParser_Gtet430($this);
        } else {
            $this->state_parser =& new XML_HTMLSax_StateParser_Lt430($this);
        }
        $nullhandler =& new XML_HTMLSax_NullHandler();
        $this->set_object($nullhandler);
        $this->set_element_handler('DoNothing', 'DoNothing');
        $this->set_data_handler('DoNothing');
        $this->set_pi_handler('DoNothing');
        $this->set_jasp_handler('DoNothing');
        $this->set_escape_handler('DoNothing');
    }

    /**
    * Sets the user defined handler object. Returns a PEAR Error
    * if supplied argument is not an object.<br />
    * Call this before the set_*_handler() methods: each of them binds to
    * the handler object that is current at the time it is called.
    * @param object handler object containing SAX callback methods
    * @access public
    * @return mixed true on success, PEAR Error otherwise
    */
    function set_object(&$object) {
        if ( is_object($object) ) {
            $this->state_parser->handler_default =& $object;
            return true;
        } else {
            return $this->raiseError('XML_HTMLSax::set_object requires '.
                'an object instance');
        }
    }

    /**
    * Sets a parser option. Returns a PEAR Error if option is invalid<br />
    * <b>Available options:</b>
    * <ul>
    * <li>XML_OPTION_TRIM_DATA_NODES: trim whitespace off the beginning
    * and end of data passed to the data handler</li>
    * <li>XML_OPTION_CASE_FOLDING: element handler calls are routed
    * through the XML_HTMLSax_CaseFolding decorator</li>
    * <li>XML_OPTION_LINEFEED_BREAK: linefeeds result in additional data
    * handler calls</li>
    * <li>XML_OPTION_TAB_BREAK: tabs result in additional data handler
    * calls</li>
    * <li>XML_OPTION_ENTITIES_UNPARSED: XML entities are returned as
    * separate data handler calls in unparsed form</li>
    * <li>XML_OPTION_ENTITIES_PARSED: (PHP 4.3.0+ only) XML entities are
    * returned as separate data handler calls and are parsed with
    * PHP's html_entity_decode() function</li>
    * <li>XML_OPTION_FULL_ESCAPES: accepted here but not read in this
    * file (presumably consumed by the state classes)</li>
    * </ul>
    * @param string name of parser option
    * @param int (optional) 1 to switch on, 0 for off
    * @access public
    * @return mixed true on success, PEAR Error if the option is unknown
    */
    function set_option($name, $value=1) {
        if ( array_key_exists($name,$this->state_parser->parser_options) ) {
            $this->state_parser->parser_options[$name] = $value;
            return true;
        } else {
            return $this->raiseError('XML_HTMLSax::set_option('.$name.') illegal');
        }
    }

    /**
    * Sets the data handler method which deals with the contents of XML
    * elements.<br />
    * The handler method must accept two arguments, the first being an
    * instance of XML_HTMLSax and the second being the contents of an
    * XML element e.g.
    * <pre>
    * function myDataHandler(& $parser,$data){}
    * </pre>
    * @param string name of method
    * @access public
    * @return void
    * @see set_object
    */
    function set_data_handler($data_method) {
        $this->state_parser->handler_object_data =& $this->state_parser->handler_default;
        $this->state_parser->handler_method_data = $data_method;
    }

    /**
    * Sets the open and close tag handlers
    * <br />The open handler method must accept three arguments; the parser,
    * the tag name and an array of attributes e.g.
    * <pre>
    * function myOpenHandler(& $parser,$tagname,$attrs=array()){}
    * </pre>
    * The close handler method must accept two arguments; the parser and
    * the tag name e.g.
    * <pre>
    * function myCloseHandler(& $parser,$tagname){}
    * </pre>
    * @param string name of open method
    * @param string name of close method
    * @access public
    * @return void
    * @see set_object
    */
    function set_element_handler($opening_method, $closing_method) {
        $this->state_parser->handler_object_element =& $this->state_parser->handler_default;
        $this->state_parser->handler_method_opening = $opening_method;
        $this->state_parser->handler_method_closing = $closing_method;
    }

    /**
    * Sets the processing instruction handler method e.g. for PHP open
    * and close tags<br />
    * The handler method must accept three arguments; the parser, the
    * PI target and data inside the PI
    * <pre>
    * function myPIHandler(& $parser,$target, $data){}
    * </pre>
    * @param string name of method
    * @access public
    * @return void
    * @see set_object
    */
    function set_pi_handler($pi_method) {
        $this->state_parser->handler_object_pi =& $this->state_parser->handler_default;
        $this->state_parser->handler_method_pi = $pi_method;
    }

    /**
    * Sets the XML escape handler method e.g. for comments and doctype
    * declarations<br />
    * The handler method must accept two arguments; the parser and the
    * contents of the escaped section
    * <pre>
    * function myEscapeHandler(& $parser, $data){}
    * </pre>
    * @param string name of method
    * @access public
    * @return void
    * @see set_object
    */
    function set_escape_handler($escape_method) {
        $this->state_parser->handler_object_escape =& $this->state_parser->handler_default;
        $this->state_parser->handler_method_escape = $escape_method;
    }

    /**
    * Sets the JSP/ASP markup handler<br />
    * The handler method must accept two arguments; the parser and
    * body of the JASP tag
    * <pre>
    * function myJaspHandler(& $parser, $data){}
    * </pre>
    * @param string name of method
    * @access public
    * @return void
    * @see set_object
    */
    function set_jasp_handler($jasp_method) {
        $this->state_parser->handler_object_jasp =& $this->state_parser->handler_default;
        $this->state_parser->handler_method_jasp = $jasp_method;
    }

    /**
    * Returns the current string position of the "cursor" inside the XML
    * document
    * <br />Intended for use from within a user defined handler called
    * via the $parser reference e.g.
    * <pre>
    * function myDataHandler(& $parser,$data) {
    *     echo( 'Current position: '.$parser->get_current_position() );
    * }
    * </pre>
    * @access public
    * @return int
    * @see get_length
    */
    function get_current_position() {
        return $this->state_parser->position;
    }

    /**
    * Returns the string length of the XML document being parsed
    * @access public
    * @return int
    */
    function get_length() {
        return $this->state_parser->length;
    }

    /**
    * Start parsing some XML
    * @param string XML document
    * @access public
    * @return void
    */
    function parse($data) {
        $this->state_parser->parse($data);
    }
}
?>