1<?php
2/* vim: set expandtab tabstop=4 shiftwidth=4: */
3//
4// +----------------------------------------------------------------------+
5// | PHP Version 4                                                        |
6// +----------------------------------------------------------------------+
7// | Copyright (c) 1997-2002 The PHP Group                                |
8// +----------------------------------------------------------------------+
9// | This source file is subject to version 2.02 of the PHP license,      |
10// | that is bundled with this package in the file LICENSE, and is        |
11// | available at through the world-wide-web at                           |
12// | http://www.php.net/license/3_0.txt.                                  |
13// | If you did not receive a copy of the PHP license and are unable to   |
14// | obtain it through the world-wide-web, please send a note to          |
15// | license@php.net so we can mail you a copy immediately.               |
16// +----------------------------------------------------------------------+
17// | Authors: Alexander Zhukov <alex@veresk.ru> Original port from Python |
18// | Authors: Harry Fuecks <hfuecks@phppatterns.com> Port to PEAR + more  |
19// | Authors: Many @ Sitepointforums Advanced PHP Forums                  |
20// +----------------------------------------------------------------------+
21//
22// $Id: XML_HTMLSax.php,v 1.15 2003/12/04 23:35:18 harryf Exp $
23//
24/**
25* Main parser components
26* @package XML_HTMLSax
27* @version $Id: XML_HTMLSax.php,v 1.15 2003/12/04 23:35:18 harryf Exp $
28*/
29/**
30* Required classes
31*/
32require_once('PEAR.php');
33if (!defined('XML_HTMLSAX')) {
34    define('XML_HTMLSAX', 'XML/');
35}
36require_once(XML_HTMLSAX . 'HTMLSax/XML_HTMLSax_States.php');
37require_once(XML_HTMLSAX . 'HTMLSax/XML_HTMLSax_Decorators.php');
38/**
39* Base State Parser
40* @package XML_HTMLSax
41* @access protected
42* @abstract
43*/
class XML_HTMLSax_StateParser {
    /**
    * Instance of user front end class to be passed to callbacks
    * @var XML_HTMLSax
    * @access private
    */
    var $htmlsax;
    /**
    * User defined object for handling elements
    * @var object
    * @access private
    */
    var $handler_object_element;
    /**
    * User defined open tag handler method
    * @var string
    * @access private
    */
    var $handler_method_opening;
    /**
    * User defined close tag handler method
    * @var string
    * @access private
    */
    var $handler_method_closing;
    /**
    * User defined object for handling data in elements
    * @var object
    * @access private
    */
    var $handler_object_data;
    /**
    * User defined data handler method
    * @var string
    * @access private
    */
    var $handler_method_data;
    /**
    * User defined object for handling processing instructions
    * @var object
    * @access private
    */
    var $handler_object_pi;
    /**
    * User defined processing instruction handler method
    * @var string
    * @access private
    */
    var $handler_method_pi;
    /**
    * User defined object for handling JSP/ASP tags
    * @var object
    * @access private
    */
    var $handler_object_jasp;
    /**
    * User defined JSP/ASP handler method
    * @var string
    * @access private
    */
    var $handler_method_jasp;
    /**
    * User defined object for handling XML escapes
    * @var object
    * @access private
    */
    var $handler_object_escape;
    /**
    * User defined XML escape handler method
    * @var string
    * @access private
    */
    var $handler_method_escape;
    /**
    * User defined handler object or NullHandler
    * @var object
    * @access private
    */
    var $handler_default;
    /**
    * Parser options determining parsing behavior, keyed by option name.
    * The concrete subclasses register the recognised names.
    * @var array
    * @access private
    */
    var $parser_options = array();
    /**
    * XML document being parsed
    * @var string
    * @access private
    */
    var $rawtext;
    /**
    * Position in XML document relative to start (0)
    * @var int
    * @access private
    */
    var $position;
    /**
    * Length of the XML document as reported by strlen()
    * @var int
    * @access private
    */
    var $length;
    /**
    * Array of state objects, keyed by the XML_HTMLSAX_STATE_* constants
    * @var array
    * @access private
    */
    var $State = array();

    /**
    * Constructor entry point for PHP 5 and later, where a method named
    * after the class is no longer guaranteed to run on instantiation.
    * Forwards to the class-named constructor, which is kept because the
    * subclasses invoke it explicitly through parent::
    * @param XML_HTMLSax instance of user front end class
    * @access protected
    */
    function __construct(& $htmlsax) {
        $this->XML_HTMLSax_StateParser($htmlsax);
    }

    /**
    * Constructs XML_HTMLSax_StateParser setting up states
    * @param XML_HTMLSax instance of user front end class
    * @access protected
    */
    function XML_HTMLSax_StateParser (& $htmlsax) {
        $this->htmlsax = & $htmlsax;
        // Plain "= new" is used on purpose: "=& new" does not parse on
        // PHP 7 and later.
        $this->State[XML_HTMLSAX_STATE_START] = new XML_HTMLSax_StartingState();

        $this->State[XML_HTMLSAX_STATE_CLOSING_TAG] = new XML_HTMLSax_ClosingTagState();
        $this->State[XML_HTMLSAX_STATE_TAG] = new XML_HTMLSax_TagState();
        $this->State[XML_HTMLSAX_STATE_OPENING_TAG] = new XML_HTMLSax_OpeningTagState();

        $this->State[XML_HTMLSAX_STATE_PI] = new XML_HTMLSax_PiState();
        $this->State[XML_HTMLSAX_STATE_JASP] = new XML_HTMLSax_JaspState();
        $this->State[XML_HTMLSAX_STATE_ESCAPE] = new XML_HTMLSax_EscapeState();
    }

    /**
    * Moves the position back one character
    * @access protected
    * @return void
    */
    function unscanCharacter() {
        $this->position -= 1;
    }

    /**
    * Moves the position forward one character. No bounds check is made,
    * so the position may end up beyond the end of the document.
    * @access protected
    * @return void
    */
    function ignoreCharacter() {
        $this->position += 1;
    }

    /**
    * Returns the next character from the XML document and advances the
    * position, or returns null (without moving) if at the end
    * @access protected
    * @return mixed
    */
    function scanCharacter() {
        if ($this->position < $this->length) {
            // Square brackets: the curly brace offset syntax does not
            // parse on PHP 8.
            return $this->rawtext[$this->position++];
        }
        return null;
    }

    /**
    * Returns a string from the current position to the next occurrence
    * of the supplied string, leaving the position on the first character
    * of that occurrence. If the string is not found, everything up to the
    * end of the document is returned and the position is set to the
    * document length.
    * @param string string to search until
    * @access protected
    * @return string
    */
    function scanUntilString($string) {
        $start = $this->position;
        if ($start >= $this->length) {
            // ignoreCharacter() can push the position past the end; an
            // offset beyond the string makes strpos() warn (PHP 5/7) or
            // throw (PHP 8), so answer directly: nothing left to scan.
            $this->position = $this->length;
            return '';
        }
        $found = strpos($this->rawtext, $string, $start);
        $this->position = ($found === FALSE) ? $this->length : $found;
        return substr($this->rawtext, $start, $this->position - $start);
    }

    /**
    * Returns a string from the current position until the first instance of
    * one of the characters in the supplied string argument
    * @param string string to search until
    * @access protected
    * @return string
    * @abstract
    */
    function scanUntilCharacters($string) {}

    /**
    * Moves the position forward past any whitespace characters
    * @access protected
    * @return void
    * @abstract
    */
    function ignoreWhitespace() {}

    /**
    * Tells whether a parser option is present and switched on (== 1).
    * Unknown option names count as switched off instead of triggering an
    * undefined index notice.
    * @param string name of parser option
    * @access private
    * @return boolean
    */
    function _optionEnabled($name) {
        return isset($this->parser_options[$name])
            && $this->parser_options[$name] == 1;
    }

    /**
    * Wraps the current data handler in a decorator and makes the
    * decorator the active data handler
    * @param string name of the decorator class to instantiate
    * @param string name of the decorator method to use as data handler
    * @access private
    * @return void
    */
    function _decorateDataHandler($decorator_class, $decorator_method) {
        $decorator = new $decorator_class(
            $this->handler_object_data,
            $this->handler_method_data);
        // Rebind by reference rather than assign: handler_object_data may
        // share a reference with handler_default, and a plain assignment
        // would overwrite that shared value as well.
        $this->handler_object_data =& $decorator;
        $this->handler_method_data = $decorator_method;
    }

    /**
    * Begins the parsing operation, setting up any decorators, depending on
    * parse options invoking _parse() to execute parsing. Decorators are
    * applied in this order: trim, case folding, linefeed break, tab break,
    * unparsed entities, parsed entities. Once parsing has finished the
    * handlers that were active on entry are put back, so that calling
    * parse() again does not stack a second layer of decorators.
    * @param string XML document to parse
    * @access protected
    * @return void
    */
    function parse($data) {
        // Remember the undecorated handlers. The objects are captured by
        // reference so restoring them re-joins any existing reference.
        $saved_data_object =& $this->handler_object_data;
        $saved_data_method = $this->handler_method_data;
        $saved_element_object =& $this->handler_object_element;
        $saved_opening_method = $this->handler_method_opening;
        $saved_closing_method = $this->handler_method_closing;

        if ($this->_optionEnabled('XML_OPTION_TRIM_DATA_NODES')) {
            $this->_decorateDataHandler('XML_HTMLSax_Trim', 'trimData');
        }
        if ($this->_optionEnabled('XML_OPTION_CASE_FOLDING')) {
            $folding = new XML_HTMLSax_CaseFolding(
                $this->handler_object_element,
                $this->handler_method_opening,
                $this->handler_method_closing);
            $this->handler_object_element =& $folding;
            $this->handler_method_opening ='foldOpen';
            $this->handler_method_closing ='foldClose';
        }
        if ($this->_optionEnabled('XML_OPTION_LINEFEED_BREAK')) {
            $this->_decorateDataHandler('XML_HTMLSax_Linefeed', 'breakData');
        }
        if ($this->_optionEnabled('XML_OPTION_TAB_BREAK')) {
            $this->_decorateDataHandler('XML_HTMLSax_Tab', 'breakData');
        }
        if ($this->_optionEnabled('XML_OPTION_ENTITIES_UNPARSED')) {
            $this->_decorateDataHandler('XML_HTMLSax_Entities_Unparsed', 'breakData');
        }
        if ($this->_optionEnabled('XML_OPTION_ENTITIES_PARSED')) {
            $this->_decorateDataHandler('XML_HTMLSax_Entities_Parsed', 'breakData');
        }

        $this->rawtext = $data;
        $this->length = strlen($data);
        $this->position = 0;
        $this->_parse();

        // Drop the decorators again so the next parse() starts clean.
        $this->handler_object_data =& $saved_data_object;
        $this->handler_method_data = $saved_data_method;
        $this->handler_object_element =& $saved_element_object;
        $this->handler_method_opening = $saved_opening_method;
        $this->handler_method_closing = $saved_closing_method;
    }

    /**
    * Performs the parsing itself, delegating calls to a specific parser
    * state. Each state returns the identifier of the state to run next;
    * the loop ends when a state returns XML_HTMLSAX_STATE_STOP or the
    * position reaches the end of the document. The first state is always
    * run once, even for an empty document.
    * @param constant state object to parse with
    * @access protected
    * @return void
    */
    function _parse($state = XML_HTMLSAX_STATE_START) {
        do {
            $state = $this->State[$state]->parse($this);
        } while ($state != XML_HTMLSAX_STATE_STOP &&
                    $this->position < $this->length);
    }
}
307
308/**
309* Parser for PHP Versions below 4.3.0. Uses a slower parsing mechanism than
310* the equivalent PHP 4.3.0+  subclass of StateParser
311* @package XML_HTMLSax
312* @access protected
313* @see XML_HTMLSax_StateParser_Gtet430
314*/
class XML_HTMLSax_StateParser_Lt430 extends XML_HTMLSax_StateParser {
    /**
    * Constructor entry point for PHP 5 and later, where a method named
    * after the class is no longer guaranteed to run on instantiation.
    * Forwards to the class-named constructor.
    * @param XML_HTMLSax instance of user front end class
    * @access protected
    */
    function __construct(& $htmlsax) {
        $this->XML_HTMLSax_StateParser_Lt430($htmlsax);
    }

    /**
    * Constructs XML_HTMLSax_StateParser_Lt430 defining available
    * parser options, all of them switched off by default
    * @param XML_HTMLSax instance of user front end class
    * @access protected
    */
    function XML_HTMLSax_StateParser_Lt430(& $htmlsax) {
        parent::XML_HTMLSax_StateParser($htmlsax);
        $this->parser_options['XML_OPTION_TRIM_DATA_NODES'] = 0;
        $this->parser_options['XML_OPTION_CASE_FOLDING'] = 0;
        $this->parser_options['XML_OPTION_LINEFEED_BREAK'] = 0;
        $this->parser_options['XML_OPTION_TAB_BREAK'] = 0;
        $this->parser_options['XML_OPTION_ENTITIES_PARSED'] = 0;
        $this->parser_options['XML_OPTION_ENTITIES_UNPARSED'] = 0;
        $this->parser_options['XML_OPTION_FULL_ESCAPES'] = 0;
    }

    /**
    * Returns a string from the current position until the first instance of
    * one of the characters in the supplied string argument, walking the
    * document one character at a time. The position is left on the
    * matching character, or at the end of the document if none matches.
    * @param string string to search until
    * @access protected
    * @return string
    */
    function scanUntilCharacters($string) {
        $startpos = $this->position;
        // Square bracket offsets: the curly brace form does not parse on
        // PHP 8.
        while ($this->position < $this->length &&
            strpos($string, $this->rawtext[$this->position]) === FALSE) {
            $this->position++;
        }
        return substr($this->rawtext, $startpos, $this->position - $startpos);
    }

    /**
    * Moves the position forward past any whitespace characters
    * (space, linefeed, carriage return, tab)
    * @access protected
    * @return void
    */
    function ignoreWhitespace() {
        while ($this->position < $this->length &&
            strpos(" \n\r\t", $this->rawtext[$this->position]) !== FALSE) {
            $this->position++;
        }
    }

    /**
    * Begins the parsing operation. Delegates directly to the parent
    * implementation, which sets up any decorators.
    * @param string XML document to parse
    * @access protected
    * @return void
    */
    function parse($data) {
        parent::parse($data);
    }
}
371
372/**
373* Parser for PHP Versions equal to or greater than 4.3.0. Uses a faster
374* parsing mechanism than the equivalent PHP < 4.3.0 subclass of StateParser
375* @package XML_HTMLSax
376* @access protected
377* @see XML_HTMLSax_StateParser_Lt430
378*/
class XML_HTMLSax_StateParser_Gtet430 extends XML_HTMLSax_StateParser {
    /**
    * Constructor entry point for PHP 5 and later, where a method named
    * after the class is no longer guaranteed to run on instantiation
    * (without it the parser options below would never be registered).
    * Forwards to the class-named constructor.
    * @param XML_HTMLSax instance of user front end class
    * @access protected
    */
    function __construct(& $htmlsax) {
        $this->XML_HTMLSax_StateParser_Gtet430($htmlsax);
    }

    /**
    * Constructs XML_HTMLSax_StateParser_Gtet430 defining available
    * parser options, all of them switched off by default
    * @param XML_HTMLSax instance of user front end class
    * @access protected
    */
    function XML_HTMLSax_StateParser_Gtet430(& $htmlsax) {
        parent::XML_HTMLSax_StateParser($htmlsax);
        $this->parser_options['XML_OPTION_TRIM_DATA_NODES'] = 0;
        $this->parser_options['XML_OPTION_CASE_FOLDING'] = 0;
        $this->parser_options['XML_OPTION_LINEFEED_BREAK'] = 0;
        $this->parser_options['XML_OPTION_TAB_BREAK'] = 0;
        $this->parser_options['XML_OPTION_ENTITIES_PARSED'] = 0;
        $this->parser_options['XML_OPTION_ENTITIES_UNPARSED'] = 0;
        $this->parser_options['XML_OPTION_FULL_ESCAPES'] = 0;
    }

    /**
    * Returns a string from the current position until the first instance of
    * one of the characters in the supplied string argument. Uses strcspn()
    * with a start offset to measure the run in a single call, then
    * advances the position by that length.
    * @param string string to search until
    * @access protected
    * @return string
    */
    function scanUntilCharacters($string) {
        $startpos = $this->position;
        $length = strcspn($this->rawtext, $string, $startpos);
        $this->position += $length;
        return substr($this->rawtext, $startpos, $length);
    }

    /**
    * Moves the position forward past any whitespace characters
    * (space, linefeed, carriage return, tab)
    * @access protected
    * @return void
    */
    function ignoreWhitespace() {
        $this->position += strspn($this->rawtext, " \n\r\t", $this->position);
    }

    /**
    * Begins the parsing operation. Delegates directly to the parent
    * implementation, which sets up any decorators.
    * @param string XML document to parse
    * @access protected
    * @return void
    */
    function parse($data) {
        parent::parse($data);
    }
}
431
432/**
433* Default NullHandler for methods which were not set by user
434* @package XML_HTMLSax
435* @access protected
436*/
class XML_HTMLSax_NullHandler {
    /**
    * Placeholder callback: ignores whatever arguments it is given and
    * hands nothing back
    * @access protected
    * @return void
    */
    function DoNothing() {
        return;
    }
}
446
447/**
448* User interface class. All user calls should only be made to this class
449* @package XML_HTMLSax
450* @access public
451*/
class XML_HTMLSax extends Pear {
    /**
    * Instance of concrete subclass of XML_HTMLSax_StateParser
    * @var XML_HTMLSax_StateParser
    * @access private
    */
    var $state_parser;

    /**
    * Constructor entry point for PHP 5 and later, where a method named
    * after the class is no longer guaranteed to run on instantiation.
    * Forwards to the class-named constructor.
    * @access public
    */
    function __construct() {
        $this->XML_HTMLSax();
    }

    /**
    * Constructs XML_HTMLSax selecting concrete StateParser subclass
    * depending on PHP version being used as well as setting the default
    * NullHandler for all callbacks<br />
    * <b>Example:</b>
    * <pre>
    * $myHandler = new MyHandler();
    * $parser = new XML_HTMLSax();
    * $parser->set_object($myHandler);
    * $parser->set_option('XML_OPTION_CASE_FOLDING');
    * $parser->set_element_handler('myOpenHandler','myCloseHandler');
    * $parser->set_data_handler('myDataHandler');
    * $parser->parse($xml);
    * </pre>
    * @access public
    */
    function XML_HTMLSax() {
        // Plain "= new" is used on purpose: "=& new" does not parse on
        // PHP 7 and later.
        if (version_compare(phpversion(), '4.3', 'ge')) {
            $this->state_parser = new XML_HTMLSax_StateParser_Gtet430($this);
        } else {
            $this->state_parser = new XML_HTMLSax_StateParser_Lt430($this);
        }
        $nullhandler = new XML_HTMLSax_NullHandler();
        $this->set_object($nullhandler);
        $this->set_element_handler('DoNothing', 'DoNothing');
        $this->set_data_handler('DoNothing');
        $this->set_pi_handler('DoNothing');
        $this->set_jasp_handler('DoNothing');
        $this->set_escape_handler('DoNothing');
    }

    /**
    * Sets the user defined handler object. Returns a PEAR Error
    * if supplied argument is not an object.<br />
    * Call this before the set_*_handler() methods: each of those binds
    * its callback to the handler object that is current at the time it
    * is called.
    * @param object handler object containing SAX callback methods
    * @access public
    * @return mixed true on success, otherwise a PEAR Error
    */
    function set_object(&$object) {
        if ( is_object($object) ) {
            $this->state_parser->handler_default =& $object;
            return true;
        } else {
            return $this->raiseError('XML_HTMLSax::set_object requires '.
                'an object instance');
        }
    }

    /**
    * Sets a parser option. Returns a PEAR Error if the option name is
    * not one registered by the state parser<br />
    * <b>Available options:</b>
    * <ul>
    * <li>XML_OPTION_TRIM_DATA_NODES: trim whitespace off the beginning
    * and end of data passed to the data handler</li>
    * <li>XML_OPTION_CASE_FOLDING: element handler calls are routed
    * through the XML_HTMLSax_CaseFolding decorator</li>
    * <li>XML_OPTION_LINEFEED_BREAK: linefeeds result in additional data
    * handler calls</li>
    * <li>XML_OPTION_TAB_BREAK: tabs result in additional data handler
    * calls</li>
    * <li>XML_OPTION_ENTITIES_UNPARSED: XML entities are returned as
    * separate data handler calls in unparsed form</li>
    * <li>XML_OPTION_ENTITIES_PARSED: (PHP 4.3.0+ only) XML entities are
    * returned as separate data handler calls and are parsed with
    * PHP's html_entity_decode() function</li>
    * <li>XML_OPTION_FULL_ESCAPES: registered by the state parsers; its
    * effect is implemented outside this file</li>
    * </ul>
    * @param string name of parser option
    * @param int (optional) 1 to switch on, 0 for off
    * @access public
    * @return mixed true on success, otherwise a PEAR Error
    */
    function set_option($name, $value=1) {
        if ( array_key_exists($name,$this->state_parser->parser_options) ) {
            $this->state_parser->parser_options[$name] = $value;
            return true;
        } else {
            return $this->raiseError('XML_HTMLSax::set_option('.$name.') illegal');
        }
    }

    /**
    * Sets the data handler method which deals with the contents of XML
    * elements.<br />
    * The handler method must accept two arguments, the first being an
    * instance of XML_HTMLSax and the second being the contents of an
    * XML element e.g.
    * <pre>
    * function myDataHander(& $parser,$data){}
    * </pre>
    * @param string name of method
    * @access public
    * @return void
    * @see set_object
    */
    function set_data_handler($data_method) {
        $this->state_parser->handler_object_data =& $this->state_parser->handler_default;
        $this->state_parser->handler_method_data = $data_method;
    }

    /**
    * Sets the open and close tag handlers
    * <br />The open handler method must accept three arguments; the parser,
    * the tag name and an array of attributes e.g.
    * <pre>
    * function myOpenHander(& $parser,$tagname,$attrs=array()){}
    * </pre>
    * The close handler method must accept two arguments; the parser and
    * the tag name e.g.
    * <pre>
    * function myCloseHander(& $parser,$tagname){}
    * </pre>
    * @param string name of open method
    * @param string name of close method
    * @access public
    * @return void
    * @see set_object
    */
    function set_element_handler($opening_method, $closing_method) {
        $this->state_parser->handler_object_element =& $this->state_parser->handler_default;
        $this->state_parser->handler_method_opening = $opening_method;
        $this->state_parser->handler_method_closing = $closing_method;
    }

    /**
    * Sets the processing instruction handler method e.g. for PHP open
    * and close tags<br />
    * The handler method must accept three arguments; the parser, the
    * PI target and data inside the PI
    * <pre>
    * function myPIHander(& $parser,$target, $data){}
    * </pre>
    * @param string name of method
    * @access public
    * @return void
    * @see set_object
    */
    function set_pi_handler($pi_method) {
        $this->state_parser->handler_object_pi =& $this->state_parser->handler_default;
        $this->state_parser->handler_method_pi = $pi_method;
    }

    /**
    * Sets the XML escape handler method e.g. for comments and doctype
    * declarations<br />
    * The handler method must accept two arguments; the parser and the
    * contents of the escaped section
    * <pre>
    * function myEscapeHander(& $parser, $data){}
    * </pre>
    * @param string name of method
    * @access public
    * @return void
    * @see set_object
    */
    function set_escape_handler($escape_method) {
        $this->state_parser->handler_object_escape =& $this->state_parser->handler_default;
        $this->state_parser->handler_method_escape = $escape_method;
    }

    /**
    * Sets the JSP/ASP markup handler<br />
    * The handler method must accept two arguments; the parser and
    * body of the JASP tag
    * <pre>
    * function myJaspHander(& $parser, $data){}
    * </pre>
    * @param string name of method
    * @access public
    * @return void
    * @see set_object
    */
    function set_jasp_handler ($jasp_method) {
        $this->state_parser->handler_object_jasp =& $this->state_parser->handler_default;
        $this->state_parser->handler_method_jasp = $jasp_method;
    }

    /**
    * Returns the current string position of the "cursor" inside the XML
    * document
    * <br />Intended for use from within a user defined handler called
    * via the $parser reference e.g.
    * <pre>
    * function myDataHandler(& $parser,$data) {
    *     echo( 'Current position: '.$parser->get_current_position() );
    * }
    * </pre>
    * @access public
    * @return int
    * @see get_length
    */
    function get_current_position() {
        return $this->state_parser->position;
    }

    /**
    * Returns the string length of the XML document being parsed
    * @access public
    * @return int
    */
    function get_length() {
        return $this->state_parser->length;
    }

    /**
    * Start parsing some XML
    * @param string XML document
    * @access public
    * @return void
    */
    function parse($data) {
        $this->state_parser->parse($data);
    }
}
671?>