1<?php
2
3/// @cond CORE
4
5/**
6 * @file
7 * @brief The base scanner classes
8 *
9 * This file contains the base scanning classes. All language scanners should
10 * subclass one of these. They are all essentially abstract as far as Luminous
11 * is concerned, but it may occasionally be useful to instantiate one.
12 *
13 * The classes defined here follow a simple hierarchy: Scanner defines basic
14 * string scanning methods, LuminousScanner extends this with logic useful to
15 * highlighting. LuminousEmbeddedWebScript adds some helpers for web languages,
16 * which may be nested in other languages. LuminousSimpleScanner is a
17 * generic flat scanner which is driven by token rules.
18 * LuminousStatefulScanner is a generic transition table driven scanner.
19 */
20
21require_once(dirname(__FILE__) . '/strsearch.class.php');
22require_once(dirname(__FILE__) . '/utils.class.php');
23require_once(dirname(__FILE__) . '/filters.class.php');
24require_once(dirname(__FILE__) . '/tokenpresets.class.php');
25
26if (!defined('LUMINOUS_DEBUG'))
27  define('LUMINOUS_DEBUG', false);
28
29/**
30 * @brief Base string scanning class
31 *
32 * The Scanner class is the base class which handles traversing a string while
33 * searching for various different tokens.
34 * It is loosely based on Ruby's StringScanner.
35 *
36 * The rough idea is we keep track of the position (a string pointer) and
37 * use scan() to see what matches at the current position.
38 *
39 * It also provides some automation methods, but it's fairly low-level as
40 * regards string scanning.
41 *
42 * Scanner is abstract as far as Luminous is concerned. LuminousScanner extends
43 * Scanner significantly with some methods which are useful for recording
44 * highlighting related data.
45 *
46 * @see LuminousScanner
47 */
48class Scanner {
49  /// @brief Our local copy of the input string to be scanned.
50  private $src;
51
52  /// @brief Length of input string (cached for performance)
53  private $src_len;
54
55  /// @brief The current scan pointer (AKA the offset or index)
56  private $index;
57
58  /**
59   * @brief Match history
60   *
61   * History of matches. This is an array (queue), which should have at most
62   * two elements. Each element consists of an array:
63   *
64   *  0 => Scan pointer when the match was found,
65   *  1 => Match index (probably the same as scan pointer, but not necessarily),
66   *  2 => Match data (match groups, as map, as returned by PCRE)
67   *
68   * @note Numerical indices are used for performance.
69   */
70  private $match_history = array(null, null);
71
72  /// @brief LuminousStringSearch instance (caches preg_* results)
73  private $ss;
74
75  /// @brief Caller defined patterns used by next_match()
76  private $patterns = array();
77
78  /// constructor
79  function __construct($src=null) {
80    $this->string($src);
81  }
82
83  /**
84   * @brief Gets the remaining string
85   *
86   * @return The rest of the string, which has not yet been consumed
87   */
88  function rest() {
89    $rest = substr($this->src, $this->index);
90    if ($rest === false) $rest = '';
91    return $rest;
92  }
93
94  /**
95   * @brief Getter and setter for the current position (string pointer).
96   *
97   * @param $new_pos The new position (leave \c NULL to use as a getter), note
98   * that this will be clipped to a legal string index if you specify a
99   * negative number or an index greater than the string's length.
100   * @return the current string pointer
101   */
102  function pos($new_pos=null) {
103    if ($new_pos !== null) {
104      $new_pos = max(min((int)$new_pos, $this->src_len), 0);
105      $this->index = $new_pos;
106    }
107    return $this->index;
108  }
109
110  /**
111   * @brief Moves the string pointer by a given offset
112   *
113   * @param $offset the offset by which to move the pointer. This can be positve
114   * or negative, but using a negative offset is currently generally unsafe.
115   * You should use unscan() to revert the last operation.
116   * @see pos
117   * @see unscan
118   */
119  function pos_shift($offset) {
120    $this->pos( $this->pos() + $offset );
121  }
122  /**
123   * @brief Beginning of line?
124   *
125   * @return @c TRUE if the scan pointer is at the beginning of a line (i.e.
126   * immediately following a newline character), or at the beginning of the
127   * string, else @c FALSE
128   */
129  function bol() {
130    return $this->index === 0 || $this->src[$this->index-1] === "\n";
131  }
132
133  /**
134   * @brief End of line?
135   *
136   * @return @c TRUE if the scan pointer is at the end of a line (i.e.
137   * immediately preceding a newline character), or at the end of the
138   * string, else @c FALSE
139   */
140  function eol() {
141    return ($this->eos() || $this->src[$this->index] === "\n");
142  }
143
144  /**
145   * @brief End of string?
146   *
147   * @return @c TRUE if the scan pointer at the end of the string, else
148   * @c FALSE.
149   */
150  function eos() {
151    return $this->index >= $this->src_len;
152  }
153
154  /**
155   * @brief Reset the scanner
156   *
157   * Resets the scanner: sets the scan pointer to 0 and clears the match
158   * history.
159   */
160  function reset() {
161    $this->pos(0);
162    $this->match_history = array(null, null);
163    $this->ss = new LuminousStringSearch($this->src);
164  }
165
166  /**
167   * @brief Getter and setter for the source string
168   *
169   * @param $s The new source string (leave as @c NULL to use this method as a
170   * getter)
171   * @return The current source string
172   *
173   * @note This method triggers a reset()
174   * @note Any strings passed into this method are converted to Unix line
175   * endings, i.e. @c \\n
176   */
177  function string($s=null) {
178    if ($s !== null) {
179      $s = str_replace("\r\n", "\n", $s);
180      $s = str_replace("\r", "\n", $s);
181      $this->src = $s;
182      $this->src_len = strlen($s);
183      $this->reset();
184    }
185    return $this->src;
186  }
187
188  /**
189   * @brief Ends scanning of a string
190   *
191   * Moves the scan pointer to the end of the string, terminating the
192   * current scan.
193   */
194  function terminate() {
195    $this->reset();
196    $this->pos($this->src_len);
197  }
198
199  /**
200   * @brief Lookahead into the string a given number of bytes
201   *
202   * @param $n The number of bytes.
203   * @return The given number of bytes from the string from the current scan
204   * pointer onwards. The returned string will be at most n bytes long, it may
205   * be shorter or the empty string if the scanner is in the termination
206   * position.
207   *
208   * @note This method is identitical to get(), but it does not consume the
209   * string.
210   * @note neither get nor peek logs its matches into the match history.
211   */
212  function peek($n=1) {
213    if ($n === 0 || $this->eos()) return '';
214    elseif ($n === 1) return $this->src[$this->index];
215    else return substr($this->src, $this->index, $n);
216  }
217
218  /**
219   * @brief Consume a given number of bytes
220   *
221   * @param $n The number of bytes.
222   * @return The given number of bytes from the string from the current scan
223   * pointer onwards. The returned string will be at most n bytes long, it may
224   * be shorter or the empty string if the scanner is in the termination
225   * position.
226   *
227   * @note This method is identitical to peek(), but it does consume the
228   * string.
229   * @note neither get nor peek logs its matches into the match history.
230   */
231  function get($n=1) {
232    $p = $this->peek($n);
233    $this->index += strlen($p);
234    return $p;
235  }
236
237  /**
238   * @brief Get the result of the most recent match operation.
239   *
240   * @return The return value is either a string or \c NULL depending on
241   * whether or not the most recent scanning function matched anything.
242   *
243   * @throw Exception if no matches have been recorded.
244   *
245   */
246  function match() {
247//     $index = false;
248    if (isset($this->match_history[0])) {
249      return $this->match_history[0][2][0];
250    }
251    throw new Exception('match history empty');
252  }
253
254  /**
255   * @brief Get the match groups of the most recent match operation.
256   *
257   * @return The return value is either an array/map or \c NULL depending on
258   * whether or not the most recent scanning function was successful. The map
259   * is the same as PCRE returns, i.e. group_name => match_string, where
260   * group_name may be a string or numerical index.
261   *
262   * @throw Exception if no matches have been recorded.
263   */
264  function match_groups() {
265    if (isset($this->match_history[0]))
266      return $this->match_history[0][2];
267    throw new Exception('match history empty');
268  }
269
270  /**
271   *
272   * @brief Get a group from the most recent match operation
273   *
274   * @param $g the group's numerical index or name, in the case of named
275   * subpatterns.
276   * @return A string represeting the group's contents.
277   *
278   * @see match_groups()
279   *
280   * @throw Exception if no matches have been recorded.
281   * @throw Exception if matches have been recorded, but the group does not
282   * exist.
283   */
284  function match_group($g=0) {
285    if (isset($this->match_history[0])) {
286      if (isset($this->match_history[0][2])) {
287        if (isset($this->match_history[0][2][$g]))
288          return $this->match_history[0][2][$g];
289        throw new Exception("No such group '$g'");
290      }
291    }
292    throw new Exception('match history empty');
293  }
294
295  /**
296   * @brief Get the position (offset) of the most recent match
297   *
298   * @return The position, as integer. This is a standard zero-indexed offset
299   * into the string. It is independent of the scan pointer.
300   *
301   * @throw Exception if no matches have been recorded.
302   */
303  function match_pos() {
304    if (isset($this->match_history[0]))
305      return $this->match_history[0][1];
306
307    throw new Exception('match history empty');
308  }
309  /**
310   * @brief Helper function to log a match into the history
311   *
312   * @internal
313   */
314  private function __log_match($index, $match_pos, $match_data) {
315    if (isset($this->match_history[0])) {
316      $this->match_history[1] = $this->match_history[0];
317    }
318    $m = & $this->match_history[0];
319    $m[0] = $index;
320    $m[1] = $match_pos;
321    $m[2] = $match_data;
322  }
323
324  /**
325   *
326   * @brief Revert the most recent scanning operation.
327   *
328   * Unscans the most recent match. The match is removed from the history, and
329   * the scan pointer is moved to where it was before the match.
330   *
331   * Calls to get(), and peek() are not logged and are therefore not
332   * unscannable.
333   *
334   * @warning Do not call unscan more than once before calling a scanning
335   * function. This is not currently defined.
336   */
337  function unscan() {
338    if (isset($this->match_history[0])) {
339      $this->index = $this->match_history[0][0];
340      if (isset($this->match_history[1])) {
341        $this->match_history[0] = $this->match_history[1];
342        $this->match_history[1] = null;
343      } else
344        $this->match_history[0] = null;
345    }
346    else
347      throw new Exception('match history empty');
348  }
349
350  /**
351   * @brief Helper function to consume a match
352   *
353   * @param $pos (int) The match position
354   * @param $consume_match (bool) Whether or not to consume the actual matched
355   * text
356   * @param $match_data The matching groups, as returned by PCRE.
357   * @internal
358   */
359  private function __consume($pos, $consume_match, $match_data) {
360    $this->index = $pos;
361    if ($consume_match) $this->index += strlen($match_data[0]);
362  }
363
364  /**
365   * @brief The real scanning function
366   *
367   * @internal
368   * @param $pattern The pattern to scan for
369   * @param $instant Whether or not the only legal match is at the
370   * current scan pointer or whether one beyond the scan pointer is also
371   * legal.
372   * @param $consume Whether or not to consume string as a result of matching
373   * @param $consume_match Whether or not to consume the actual matched string.
374   * This only has effect if $consume is @c TRUE. If $instant is @c TRUE,
375   * $consume is true and $consume_match is @c FALSE, the intermediate
376   * substring is consumed and the scan pointer moved to the beginning of the
377   * match, and the substring is recorded as a single-group match.
378   * @param $log whether or not to log the matches into the match_register
379   * @return The matched string or null. This is subsequently
380   * equivalent to match() or match_groups()[0] or match_group(0).
381   */
382  private function __check($pattern, $instant=true, $consume=true,
383    $consume_match=true, $log=true) {
384      $matches = null;
385      $index = $this->index;
386      $pos = null;
387      if (($pos = $this->ss->match($pattern, $this->index, $matches)) !== false) {
388        if ($instant && $pos !== $index) {
389          $matches = null;
390        }
391        // don't consume match and not instant: the match we are interested in
392        // is actually the substring between the start and the match.
393        // this is used by scan_to
394        if (!$consume_match && !$instant) {
395          $matches = array(substr($this->src, $this->index, $pos-$this->index));
396        }
397      }
398      else $matches = null;
399
400      if ($log) {
401        $this->__log_match($index, $pos, $matches);
402      }
403      if ($matches !== null && $consume) {
404        $this->__consume($pos, $consume_match, $matches);
405      }
406      return ($matches === null)? null : $matches[0];
407  }
408
409  /**
410   * @brief Scans at the current pointer
411   *
412   * Looks for the given pattern at the current index and consumes and logs it
413   * if it is found.
414   * @param $pattern the pattern to search for
415   * @return @c null if not found, else the full match.
416   */
417  function scan($pattern) {
418    return $this->__check($pattern);
419  }
420
421  /**
422  * @brief Scans until the start of a pattern
423  *
424  * Looks for the given pattern anywhere beyond the current index and
425  * advances the scan pointer to the start of the pattern. The match is logged.
426  *
427  * The match itself is not consumed.
428  *
429  * @param $pattern the pattern to search for
430  * @return The substring between here and the given pattern, or @c null if it
431  * is not found.
432  */
433  function scan_until($pattern) {
434    return $this->__check($pattern, false, true, false, true);
435  }
436
437
438  /**
439   * @brief Non-consuming lookahead
440   *
441   * Looks for the given pattern at the current index and logs it
442   * if it is found, but does not consume it. This is a look-ahead.
443   * @param $pattern the pattern to search for
444   * @return @c null if not found, else the matched string.
445   */
446  function check($pattern) {
447    return $this->__check($pattern, true, false, false, true);
448  }
449
450  /**
451   * @brief Find the index of the next occurrence of a pattern
452   *
453   * @param $pattern the pattern to search for
454   * @return The next index of the pattern, or -1 if it is not found
455   */
456  function index($pattern) {
457    $ret = $this->ss->match($pattern, $this->index, $dontcare_ref);
458    return ($ret !== false)? $ret : -1;
459  }
460
461  /**
462   * @brief Find the index of the next occurrence of a named pattern
463   * @param $patterns A map of $name=>$pattern
464   * @return An array: ($name, $index, $matches). If there is no next match,
465   * name will be null, index will be -1 and matches will be null.
466   *
467   * @note consider using this method to build a transition table
468   *
469   */
470
471  function get_next_named($patterns) {
472    $next = -1;
473    $matches = null;
474    $name = null;
475    $m;
476    foreach($patterns as $name_=>$p) {
477      $index = $this->ss->match($p, $this->index, $m);
478      if ($index === false) continue;
479      if ($next === -1 || $index < $next) {
480        $next = $index;
481        $matches = $m;
482        assert($m !== null);
483        $name = $name_;
484        if ($index === $this->index) break;
485      }
486    }
487    return array($name, $next, $matches);
488  }
489
490  /**
491   * @brief Look for the next occurrence of a set of patterns
492   *
493   * Finds the next match of the given patterns and returns it. The
494   * string is not consumed or logged.
495   * Convenience function.
496   * @param $patterns an array of regular expressions
497   * @return an array of (0=>index, 1=>match_groups). The index may be -1 if
498   * no pattern is found.
499   */
500  function get_next($patterns) {
501    $next = -1;
502    $matches = null;
503    foreach($patterns as $p) {
504      $m;
505      $index = $this->ss->match($p, $this->index, $m);
506      if ($index === false) continue;
507      if ($next === -1 || $index < $next) {
508        $next = $index;
509        $matches = $m;
510        assert($m !== null);
511      }
512    }
513    return array($next, $matches);
514  }
515
516  /**
517   * @brief Look for the next occurrence of a set of substrings
518   *
519   * Like get_next() but uses strpos instead of preg_*
520   * @return An array: 0 => index 1 => substring. If the substring is not found,
521   *    index is -1 and substring is null
522   * @see get_next()
523   */
524  function get_next_strpos($patterns) {
525    $next = -1;
526    $match = null;
527    foreach($patterns as $p) {
528      $index = strpos($this->src, $p, $this->index);
529      if ($index === false) continue;
530      if ($next === -1 || $index < $next) {
531        $next = $index;
532        $match = $p;
533      }
534    }
535    return array($next, $match);
536  }
537
538
539  /**
540   * @brief Allows the caller to add a predefined named pattern
541   *
542   * Adds a predefined pattern which is visible to next_match.
543   *
544   * @param $name A name for the pattern. This does not have to be unique.
545   * @param $pattern A regular expression pattern.
546   */
547  function add_pattern($name, $pattern) {
548    $this->patterns[] = array($name, $pattern . 'S', -1, null);
549  }
550  /**
551   * @brief Allows the caller to remove a named pattern
552   *
553   * @param $name the name of the pattern to remove, this should be as it was
554   * supplied to add_pattern().
555   * @warning If there are multiple patterns with the same name, they will all
556   * be removed.
557   */
558  function remove_pattern($name) {
559    foreach($this->patterns as $k=>$p) {
560      if ($p[0] === $name) unset($this->patterns[$k]);
561    }
562  }
563
564  /**
565   * @brief Automation function: returns the next occurrence of any known patterns.
566   *
567   * Iterates over the predefined patterns array (add_pattern) and consumes/logs
568   * the nearest match, skipping unrecognised segments of string.
569   * @return An array:
570   *    0 => pattern name  (as given to add_pattern)
571   *    1 => match index (although the scan pointer will have progressed to the
572   *            end of the match if the pattern is consumed).
573   * When no more matches are found, return value is @c NULL and nothing is
574   * logged.
575   *
576   *
577   * @param $consume_and_log If this is @c FALSE, the pattern is not consumed
578   * or logged.
579   *
580   * @warning this method is not the same as get_next. This does not return
581   * the match groups, instead it returns a name. The ordering of the return
582   * array is also different, but the array does in fact hold different data.
583   */
584  function next_match($consume_and_log=true) {
585    $target = $this->index;
586
587    $nearest_index = -1;
588    $nearest_key = -1;
589    $nearest_name = null;
590    $nearest_match_data = null;
591
592    foreach($this->patterns as &$p_data) {
593      $name = $p_data[0];
594      $pattern = $p_data[1];
595      $index = &$p_data[2];
596      $match_data = &$p_data[3];
597
598      if ($index !== false && $index < $target) {
599        $index = $this->ss->match($pattern, $target, $match_data);
600      }
601
602      if ($index === false) {
603        unset($p_data);
604        continue;
605      }
606
607      if ($nearest_index === -1 || $index < $nearest_index) {
608        $nearest_index = $index;
609        $nearest_name = $name;
610        $nearest_match_data = $match_data;
611        if ($index === $target) break;
612      }
613    }
614
615    if ($nearest_index !== -1) {
616      if ($consume_and_log) {
617        $this->__log_match($nearest_index, $nearest_index, $nearest_match_data);
618        $this->__consume($nearest_index, true, $nearest_match_data);
619      }
620      return array($nearest_name, $nearest_index);
621    }
622    else return null;
623  }
624}
625
626
627
628
629
630
631/**
632 * @brief the base class for all scanners
633 *
634 * LuminousScanner is the base class for all language scanners. Here we
635 * provide a set of methods comprising a highlighting layer. This includes
636 * recording a token stream, and ultimately being responsible for
637 * producing some XML representing the token stream.
638 *
639 * We also define here some filters which rely on state information expected
640 * to be recorded into the instance variables.
641 *
642 * Highlighting a string at this level is a four-stage process:
643 *
644 *      @li string() - set the string
645 *      @li init() - set up the scanner
646 *      @li main() - perform tokenization
647 *      @li tagged() - build the XML
648 *
649 *
650 *
651 * A note on tokens: Tokens are stored as an array with the following indices:
652 *    @li 0:   Token name   (e.g. 'COMMENT'
653 *    @li 1:   Token string (e.g. '// foo')
654 *    @li 2:   escaped? (bool) Because it's often more convenient to embed nested
655 *              tokens by tagging token string, we need to escape it. This
656 *              index stores whether or nto it has been escaped.
657 *
658 */
659
660class LuminousScanner extends Scanner {
661
662  /// scanner version.
663  public $version = 'master';
664
665
666
667  /**
668   * @brief The token stream
669   *
670   * The token stream is recorded as a flat array of tokens. A token is
671   * made up of 3 parts, and stored as an array:
672   *  @li 0 => Token name
673   *  @li 1 => Token string (from input source code)
674   *  @li 2 => XML-Escaped?
675   */
676  protected $tokens = array();
677
678  /**
679   * @brief State stack
680   *
681   * A stack of the scanner's state, should the scanner wish to use a stack
682   * based state mechanism.
683   *
684   * The top element can be retrieved (but not popped) with stack()
685   *
686   * TODO More useful functions for manipulating the stack
687   */
688  protected $state_ = array();
689
690  /**
691   * @brief Individual token filters
692   *
693   * A list of lists, each filter is an array: (name, token_name, callback)
694   */
695  protected $filters = array();
696
697  /**
698   * @brief Token stream filters
699   *
700   * A list of lists, each filter is an array: (name, callback)
701   */
702  protected $stream_filters = array();
703
704  /**
705   * @brief Rule remappings
706   *
707   * A map to handle re-mapping of rules, in the form:
708   * OLD_TOKEN_NAME => NEW_TOKEN_NAME
709   *
710   * This is used by rule_mapper_filter()
711   */
712  protected $rule_tag_map = array();
713
714  /**
715   * @brief Identifier remappings based on definitions identified in the source code
716   *
717   * A map of remappings of user-defined types/functions. This is a map of
718   * identifier_string => TOKEN_NAME
719   *
720   * This is used by user_def_filter()
721   */
722  protected $user_defs;
723
724
725  /**
726   * @brief A map of identifiers and their corresponding token names
727   *
728   * A map of recognised identifiers, in the form
729   * identifier_string => TOKEN_NAME
730   *
731   * This is currently used by map_identifier_filter
732   */
733  protected $ident_map = array();
734
735  /**
736   * @brief Whether or not the language is case sensitive
737   *
738   * Whether or not the scanner is dealing with a case sensitive language.
739   * This currently affects map_identifier_filter
740   */
741  protected $case_sensitive = true;
742
743  /// constructor
744  function __construct($src=null) {
745    parent::__construct($src);
746
747    $this->add_filter('map-ident', 'IDENT', array($this,
748      'map_identifier_filter'));
749
750    $this->add_filter('comment-note', 'COMMENT', array('LuminousFilters', 'comment_note'));
751    $this->add_filter('comment-to-doc', 'COMMENT', array('LuminousFilters', 'generic_doc_comment'));
752    $this->add_filter('string-escape', 'STRING', array('LuminousFilters', 'string'));
753    $this->add_filter('char-escape', 'CHARACTER', array('LuminousFilters', 'string'));
754    $this->add_filter('pcre', 'REGEX', array('LuminousFilters', 'pcre'));
755    $this->add_filter('user-defs', 'IDENT', array($this, 'user_def_filter'));
756
757    $this->add_filter('constant', 'IDENT', array('LuminousFilters', 'upper_to_constant'));
758
759    $this->add_filter('clean-ident', 'IDENT', array('LuminousFilters', 'clean_ident'));
760
761    $this->add_stream_filter('rule-map', array($this, 'rule_mapper_filter'));
762    $this->add_stream_filter('oo-syntax', array('LuminousFilters', 'oo_stream_filter'));
763  }
764
765
766  /**
767   * @brief Language guessing
768   *
769   * Each real language scanner should override this method and implement a
770   * simple guessing function to estimate how likely the input source code
771   * is to be the language which they recognise.
772   *
773   * @param $src the input source code
774   * @return The estimated chance that the source code is in the same language
775   *  as the one the scanner tokenizes, as a real number between 0 (least
776   *  likely) and 1 (most likely), inclusive
777   */
778  public static function guess_language($src, $info) {
779    return 0.0;
780  }
781
782  /**
783   * @brief Filter to highlight identifiers whose definitions are in the source
784   *
785   * maps anything recorded in LuminousScanner::user_defs to the recorded type.
786   * This is called as the filter 'user-defs'
787   * @internal
788   */
789  protected function user_def_filter($token) {
790    if (isset($this->user_defs[$token[1]])) {
791      $token[0] = $this->user_defs[$token[1]];
792    }
793    return $token;
794  }
795
796  /**
797   * @brief Rule re-mapper filter
798   *
799   * Re-maps token rules according to the LuminousScanner::rule_tag_map
800   * map.
801   * This is called as the filter 'rule-map'
802   * @internal
803   */
804  protected function rule_mapper_filter($tokens) {
805    foreach($tokens as &$t) {
806      if (array_key_exists($t[0], $this->rule_tag_map))
807        $t[0] = $this->rule_tag_map[$t[0]];
808    }
809    return $tokens;
810  }
811
812
813
814
815  /**
816   * @brief Public convenience function for setting the string and highlighting it
817   *
818   * Alias for:
819   *   $s->string($src)
820   *   $s->init();
821   *   $s->main();
822   *   return $s->tagged();
823   *
824   * @returns the highlighted string, as an XML string
825   */
826  function highlight($src) {
827    $this->string($src);
828    $this->init();
829    $this->main();
830    return $this->tagged();
831  }
832
833  /**
834   * @brief Set up the scanner immediately prior to tokenization.
835   *
836   * The init method is always called prior to main(). At this stage, all
837   * configuration variables are assumed to have been set, and it's now time
838   * for the scanner to perform any last set-up information. This may include
839   * actually finalizing its rule patterns. Some scanners may not need to
840   * override this if they are in no way dynamic.
841   */
842  function init() {}
843
844
845  /**
846   * @brief the method responsible for tokenization
847   *
848   * The main method is fully responsible for tokenizing the string stored
849   * in string() at the time of its call. By the time main returns, it should
850   * have consumed the whole of the string and populated the token array.
851   *
852   */
853  function main() {}
854
855  /**
856   * @brief Add an individual token filter
857   *
858   * Adds an indivdual token filter. The filter is bound to the given
859   * token_name. The filter is a callback which should take a token and return
860   * a token.
861   *
862   * The arguments are: [name], token_name, filter
863   *
864   * Name is an optional argument.
865   */
866  public function add_filter($arg1, $arg2, $arg3=null) {
867    $filter = null;
868    $name = null;
869    $token = null;
870    if ($arg3 === null) {
871      $filter = $arg2;
872      $token = $arg1;
873    } else {
874      $filter = $arg3;
875      $token = $arg2;
876      $name = $arg1;
877    }
878    if (!isset($this->filters[$token])) $this->filters[$token] = array();
879    $this->filters[$token][] = array($name, $filter);
880  }
881
882  /**
883   * @brief Removes the individual filter(s) with the given name
884   */
885  public function remove_filter($name) {
886    foreach($this->filters as $token=>$filters) {
887      foreach($filters as $k=>$f) {
888        if ($f[0] === $name) unset($this->filters[$token][$k]);
889      }
890    }
891  }
892
893  /**
894   * @brief Removes the stream filter(s) with the given name
895   */
896  public function remove_stream_filter($name) {
897    foreach($this->stream_filters as $k=>$f) {
898      if ($f[0] === $name) unset($this->stream_filters[$k]);
899    }
900  }
901
902  /**
903   * @brief Adds a stream filter
904   *
905   * A stream filter receives the entire token stream and should return it.
906   *
907   * The parameters are: ([name], filter). Name is an optional argument.
908   *
909   */
910  public function add_stream_filter($arg1, $arg2=null) {
911    $filter = null;
912    $name = null;
913    if ($arg2 === null) {
914      $filter = $arg1;
915    } else {
916      $filter = $arg2;
917      $name = $arg1;
918    }
919    $this->stream_filters[] = array($name, $filter);
920  }
921
922  /**
923   * @brief Gets the top element on $state_ or null if it is empty
924   */
925  function state() {
926    if (!isset($this->state_[0])) return null;
927    return $this->state_[count($this->state_)-1];
928  }
929  /**
930   * @brief Pushes some data onto the stack
931   */
932  function push($state) {
933    $this->state_[] = $state;
934  }
935  /**
936   * @brief Pops the top element of the stack, and returns it
937   * @throw Exception if the state stack is empty
938   */
939  function pop() {
940    if (empty($this->state_))
941      throw new Exception('Cannot pop empty state stack');
942    return array_pop($this->state_);
943  }
944
945  /**
946   * @brief Flushes the token stream
947   */
948  function start() {
949    $this->tokens = array();
950  }
951
952  /**
953   * @brief Records a string as a given token type.
954   * @param $string The string to record
955   * @param $type The name of the token the string represents
956   * @param $pre_escaped Luminous works towards getting this in XML and
957   * therefore at some point, the $string has to be escaped. If you have
958   * already escaped it for some reason (or if you got it from another scanner),
959   * then you want to set this to @c TRUE
960   * @see LuminousUtils::escape_string
961   * @throw Exception if $string is @c NULL
962   */
963  function record($string, $type, $pre_escaped=false) {
964    if ($string === null) throw new Exception('Tagging null string');
965    $this->tokens[] = array($type, $string, $pre_escaped);
966  }
967
968  /**
969   * @brief Helper function to record a range of the string
970   * @param $from the start index
971   * @param $to the end index
972   * @param $type the type of the token
973   * This is shorthand for
974   * <code> $this->record(substr($this->string(), $from, $to-$from)</code>
975   *
976   * @throw RangeException if the range is invalid (i.e. $to < $from)
977   *
978   * An empty range (i.e. $to === $from) is allowed, but it is essentially a
979   * no-op.
980   */
981  function record_range($from, $to, $type) {
982    if ($to === $from)
983      return;
984    else if ($to > $from)
985      $this->record(substr($this->string(), $from, $to-$from), $type);
986    else
987      throw new RangeException("Invalid range supplied [$from, $to]");
988  }
989
990  /**
991   * @brief Returns the XML representation of the token stream
992   *
993   * This function triggers the generation of the XML output.
994   * @return An XML-string which represents the tokens recorded by the scanner.
995   */
996  function tagged() {
997    $out = '';
998
999    // call stream filters.
1000    foreach($this->stream_filters as $f) {
1001      $this->tokens = call_user_func($f[1], $this->tokens);
1002    }
1003    foreach($this->tokens as $t) {
1004      $type = $t[0];
1005
1006      // speed is roughly 10% faster if we process the filters inside this
1007      // loop instead of separately.
1008      if (isset($this->filters[$type])) {
1009        foreach($this->filters[$type] as $filter) {
1010          $t = call_user_func($filter[1], $t);
1011        }
1012      }
1013      list($type, $string, $esc) = $t;
1014
1015      if (!$esc) $string = LuminousUtils::escape_string($string);
1016      if ($type !== null)
1017        $out .= LuminousUtils::tag_block($type, $string);
1018      else $out .= $string;
1019    }
1020    return $out;
1021  }
1022
1023  /**
1024   * @brief Gets the token array
1025   * @return The token array
1026   */
1027  function token_array() {
1028    return $this->tokens;
1029  }
1030
1031  /**
1032   * @brief Identifier mapping filter
1033   *
1034   * Tries to map any 'IDENT' token to a TOKEN_NAME in
1035   * LuminousScanner::$ident_map
1036   * This is implemented as the filter 'map-ident'
1037   */
1038  function map_identifier_filter($token) {
1039    $ident = $token[1];
1040    if (!$this->case_sensitive) $ident = strtolower($ident);
1041    foreach($this->ident_map as $n=>$hits) {
1042      if (isset($hits[$ident])) {
1043        $token[0] = $n;
1044        break;
1045      }
1046    }
1047    return $token;
1048  }
1049
1050  /**
1051   * @brief Adds an identifier mapping which is later analysed by map_identifier_filter
1052   * @param $name The token name
1053   * @param $matches an array of identifiers which correspond to this token
1054   * name, i.e. add_identifier_mapping('KEYWORD', array('if', 'else', ...));
1055   *
1056   * This method observes LuminousScanner::$case_sensitive
1057   */
1058  function add_identifier_mapping($name, $matches) {
1059    $array = array();
1060    foreach($matches as $m) {
1061      if (!$this->case_sensitive) $m = strtolower($m);
1062      $array[$m] = true;
1063    }
1064    if (!isset($this->ident_map[$name]))
1065      $this->ident_map[$name] = array();
1066    $this->ident_map[$name] = array_merge($this->ident_map[$name], $array);
1067  }
1068
1069  /**
1070   * Convenience function
1071   * @brief Skips whitespace, and records it as a null token.
1072   */
1073  function skip_whitespace() {
1074    if (ctype_space($this->peek())) {
1075      $this->record($this->scan('/\s+/'), null);
1076    }
1077  }
1078
1079  /**
1080   * @brief Handles tokens that may nest inside themselves
1081   *
1082   * Convenience function. It's fairly common for many languages to allow
1083   * things like nestable comments. Handling these is easy but fairly
1084   * long winded, so this function will take an opening and closing delimiter
1085   * and consume the token until it is fully closed, or until the end of
1086   * the string in the case that it is unterminated.
1087   *
1088   * When the function returns, the token will have been consumed and appended
1089   * to the token stream.
1090   *
1091   * @param $token_name the name of the token
1092   * @param $open the opening delimiter pattern (regex), e.g. '% /\\* %x'
1093   * @param $close the closing delimiter pattern (regex), e.g. '% \\* /%x'
1094   *
1095   * @warning Although PCRE provides recursive regular expressions, this
1096   * function is far preferable. A recursive regex will easily crash PCRE
1097   * on garbage input due to it having a fairly small stack: this function
1098   * is much more resilient.
1099   *
1100   * @throws Exception if called at a non-matching point (i.e.
1101   * <code>$this->scan($open)</code> does not match)
1102   *
1103   */
1104  function nestable_token($token_name, $open, $close) {
1105    if ($this->check($open) === null) {
1106      throw new Exception('Nestable called at a non-matching point');
1107      return;
1108    }
1109    $patterns = array('open'=>$open, 'close'=>$close);
1110
1111    $stack = 0;
1112    $start = $this->pos();
1113    do {
1114      list($name, $index, $matches) = $this->get_next_named($patterns);
1115      if ($name === 'open') $stack++;
1116      elseif($name === 'close') $stack--;
1117      else {
1118        $this->terminate();
1119        break;
1120      }
1121      $this->pos($index + strlen($matches[0]));
1122    } while ($stack);
1123    $substr = substr($this->string(), $start, $this->pos()-$start);
1124    $this->record($substr, $token_name);
1125  }
1126}
1127
1128
1129
1130
1131
1132/**
1133 * @brief Superclass for languages which may nest, i.e. web languages
1134 *
1135 * Web languages get their own special class because they have to deal with
1136 * server-script code embedded inside them and the potential for languages
1137 * nested under them (PHP has HTML, HTML has CSS and JavaScript)
1138 *
1139 * The relationship is strictly hierarchical, not recursive descent
1140 * Meeting a '\<?' in CSS bubbles up to HTML and then up to PHP (or whatever).
1141 * The top-level scanner is ultimately what should have sub-scanner code
1142 * embedded in its own token stream.
1143 *
1144 * The scanners should be persistent, so only one JavaScript scanner exists
1145 * even if there are 20 javascript tags. This is so they can keep persistent
1146 * state, which might be necessary if they are interrupted by server-side tags.
1147 * For this reason, the main() method might be called multiple times, therefore
1148 * each web sub-scanner should
1149 *     \li Not rely on keeping state related data in main()'s function scope,
1150 *              make it a class variable
1151 *      \li flush its token stream every time main() is called
1152 *
1153 * The init method of the class should be used to set relevant rules based
1154 * on whether or not the embedded flags are set; and therefore the embedded
1155 * flags should be set before init is called.
1156 */
1157abstract class LuminousEmbeddedWebScript extends LuminousScanner {
1158
1159  /**
1160   * @brief Is the source embedded in HTML?
1161   *
1162   * Embedded in HTML? i.e. do we need to observe tag terminators like \</script\>
1163   */
1164  public $embedded_html = false;
1165
1166  /**
1167   * @brief Is the source embedded in a server-side script (e.g. PHP)?
1168   *
1169   * Embedded in a server side language? i.e. do we need to break at
1170   * (for example) \<? tags?
1171   */
1172  public $embedded_server = false;
1173
1174  /**
1175   * @brief Opening tag for server-side code. This is a regular expression.
1176   */
1177  public $server_tags = '/<\?/';
1178
1179  /// @brief closing HTML tag for our code, e.g \</script\>
1180  public $script_tags;
1181
1182
1183  /** @brief I think this is ignored and obsolete */
1184  public $interrupt = false;
1185
1186  /**
1187   * @brief Clean exit or inconvenient, mid-token forced exit
1188   *
1189   * Signifies whether the program exited due to inconvenient interruption by
1190   * a parent language (i.e. a server-side langauge), or whether it reached
1191   * a legitimate break. A server-side language isn't necessarily a dirty exit,
1192   * but if it comes in the middle of a token it is, because we need to resume
1193   * from it later. e.g.:
1194   *
1195   * var x = "this is \<?php echo 'a' ?\> string";
1196   */
1197  public $clean_exit = true;
1198
1199
1200
1201  /**
1202   * @brief Child scanners
1203   *
1204   * Persistent storage of child scanners, name => scanner (instance)
1205   */
1206  protected $child_scanners = array();
1207
1208  /**
1209   * @brief Name of interrupted token, in case of a dirty exit
1210   *
1211   * exit state logs our exit state in the case of a dirty exit: this is the
1212   * rule that was interrupted.
1213   */
1214  protected $exit_state = null;
1215
1216
1217  /**
1218   * @brief Recovery patterns for when we reach an untimely interrupt
1219   *
1220   * If we reach a dirty exit, when we resume we need to figure out how to
1221   * continue consuming the rule that was interrupted. So essentially, this
1222   * will be a regex which matches the rule without start delimiters.
1223   *
1224   * This is a map of rule => pattern
1225   */
1226  protected $dirty_exit_recovery = array();
1227
1228  /**
1229   * @brief adds a child scanner
1230   * Adds a child scanner and indexes it against a name, convenience function
1231   */
1232  public function add_child_scanner($name, $scanner) {
1233    $this->child_scanners[$name] = $scanner;
1234  }
1235
1236  // override string to hit the child scanners as well
1237  public function string($str=null) {
1238    if ($str !== null) {
1239      foreach($this->child_scanners as $s) {
1240        $s->string($str);
1241      }
1242    }
1243    return parent::string($str);
1244  }
1245
1246  /**
1247   * @brief Sets the exit data to signify the exit is dirty and will need recovering from
1248   *
1249   * @param $token_name the name of the token which is being interrupted
1250   *
1251   * @throw Exception if no recovery data is associated with the given token.
1252   */
1253  function dirty_exit($token_name) {
1254    if (!isset($this->dirty_exit_recovery[$token_name])) {
1255      throw new Exception('No dirty exit recovery data for '. $token_name);
1256      $this->clean_exit = true;
1257      return;
1258    }
1259    $this->exit_state = $token_name;
1260    $this->interrupt = true;
1261    $this->clean_exit = false;
1262  }
1263
1264  /**
1265   * @brief Attempts to recover from a dirty exit.
1266   *
1267   * This method should be called on @b every iteration of the main loop when
1268   * LuminousEmbeddedWebScript::$clean_exit is @b FALSE. It will attempt to
1269   * recover from an interruption which left the scanner in the middle of a
1270   * token. The remainder of the token will be in Scanner::match() as usual.
1271   *
1272   * @return the name of the token which was interrupted
1273   *
1274   * @note there is no reason why a scanner should fail to recover from this,
1275   * and failing is classed as an implementation error, therefore assertions
1276   * will be failed and errors will be spewed forth. A failure can either be
1277   * because no recovery regex is set, or that the recovery regex did not
1278   * match. The former should never have been tagged as a dirty exit and the
1279   * latter should be rewritten so it must definitely match, even if the match
1280   * is zero-length or the remainder of the string.
1281   *
1282   */
1283  function resume() {
1284    assert (!$this->clean_exit);
1285    $this->clean_exit = true;
1286    $this->interrupt = false;
1287    if (!isset($this->dirty_exit_recovery[$this->exit_state])) {
1288      throw new Exception("Not implemented error: The scanner was interrupted
1289mid-state (in state {$this->exit_state}), but there is no recovery associated
1290with this state");
1291      return null;
1292    }
1293    $pattern = $this->dirty_exit_recovery[$this->exit_state];
1294    $m = $this->scan($pattern);
1295    if ($m === null) throw new Exception('Implementation error: recovery
1296pattern for ' . $this->exit_state . ' failed to match');
1297    return $this->exit_state;
1298  }
1299
1300
1301  /**
1302   * @brief Checks for a server-side script inside a matched token
1303   *
1304   * @param $token_name The token name of the matched text
1305   * @param $match The string from the last match. If this is left @c NULL then
1306   * Scanner::match() is assumed to hold the match.
1307   * @param $pos The position of the last match. If this is left @c NULL then
1308   * Scanner::match_pos() is assumed to hold the offset.
1309   * @return @c TRUE if the scanner should break, else @c FALSE
1310   *
1311   *
1312   * This method checks whether an interruption by a server-side script tag,
1313   * LuminousEmbeddedWebScript::server_tags, occurs within a matched token.
1314   * If it does, this method records the substring up until that point as
1315   * the provided $token_name, and also sets up a 'dirty exit'. This means that
1316   * some type was interrupted and we expect to have to recover from it when
1317   * the server-side language's scanner has ended.
1318   *
1319   * Returning @c TRUE is a signal that the scanner should break immediately
1320   * and let its parent scanner take over.
1321   *
1322   */
1323  function server_break($token_name, $match=null, $pos=null) {
1324    if (!$this->embedded_server) {
1325      return false;
1326    }
1327    if ($match === null) $match = $this->match();
1328    if ($match === null) return false;
1329
1330    if (preg_match($this->server_tags, $match, $m_, PREG_OFFSET_CAPTURE)) {
1331      $pos_ = $m_[0][1];
1332      $this->record(substr($match, 0, $pos_), $token_name);
1333      if ($pos === null) $pos = $this->match_pos();
1334      $this->pos($pos + $pos_);
1335      $this->dirty_exit($token_name);
1336      return true;
1337    }
1338    else return false;
1339  }
1340
1341  /**
1342   * @brief Checks for a script terminator tag inside a matched token
1343   *
1344   * @param $token_name The token name of the matched text
1345   * @param $match The string from the last match. If this is left @c NULL then
1346   * Scanner::match() is assumed to hold the match.
1347   * @param $pos The position of the last match. If this is left @c NULL then
1348   * Scanner::match_pos() is assumed to hold the offset.
1349   * @return @c TRUE if the scanner should break, else @c FALSE
1350   *
1351   * This method checks whether the string provided as match contains the
1352   * string in LuminousEmbeddedWebScript::script_tags. If yes, then it records
1353   * the substring as $token_name, advances the scan pointer to immediately
1354   * before the script tags, and returns @c TRUE. Returning @c TRUE is a
1355   * signal that the scanner should break immediately and let its parent
1356   * scanner take over.
1357   *
1358   * This condition is a 'clean_exit'.
1359   */
1360  function script_break($token_name, $match=null, $pos=null) {
1361    if (!$this->embedded_html) return false;
1362    if ($match === null) $match = $this->match();
1363    if ($match === null) return false;
1364
1365    if (($pos_ = stripos($this->match(), $this->script_tags)) !== false) {
1366      $this->record(substr($match, 0, $pos_), $token_name);
1367      if ($pos === null) $pos = $this->match_pos();
1368      $this->pos($pos + $pos_);
1369      $this->clean_exit = true;
1370      return true;
1371    }
1372    else return false;
1373  }
1374}
1375
1376
1377
1378
1379
1380/**
1381 * @brief A largely automated scanner
1382 *
1383 * LuminousSimpleScanner implements a main() method and observes the
1384 * patterns added with Scanner::add_pattern()
1385 *
1386 * An overrides array allows the caller to override the handling of any token.
1387 * If an override is set for a token, the override is called when that token is
1388 * reached and the caller should consume it. If the callback fails to advance
1389 * the string pointer, an Exception is thrown.
1390 */
1391class LuminousSimpleScanner extends LuminousScanner {
1392
1393  /**
1394   * @brief Overrides array.
1395   *
1396   * A map of TOKEN_NAME => callback.
1397   *
1398   * The callbacks are fired by main() when the TOKEN_NAME rule matches.
1399   * The callback receives the match_groups array, but the scanner is
1400   * unscan()ed before the callback is fired, so that the pos() is directly
1401   * in front of the match. The callback is responsible for consuming the
1402   * token appropriately.
1403   */
1404  protected $overrides = array();
1405
1406
1407  function main() {
1408    while (!$this->eos()) {
1409      $index = $this->pos();
1410      if (($match = $this->next_match()) !== null) {
1411        $tok = $match[0];
1412        if ($match[1] > $index) {
1413          $this->record(substr($this->string(), $index, $match[1] - $index), null);
1414        }
1415        $match = $this->match();
1416        if (isset($this->overrides[$tok])) {
1417          $groups = $this->match_groups();
1418          $this->unscan();
1419          $p = $this->pos();
1420          $ret = call_user_func($this->overrides[$tok], $groups);
1421          if ($ret === true) break;
1422          if ($this->pos() <= $p)
1423            throw new Exception('Failed to consume any string in override for ' . $tok);
1424        } else
1425          $this->record($match, $tok);
1426      } else {
1427        $this->record(substr($this->string(), $index), null);
1428        $this->terminate();
1429        break;
1430      }
1431    }
1432  }
1433}
1434
1435
1436/**
1437 *
1438 * @brief Experimental transition table driven scanner
1439 *
1440 * The stateful scanner follows a transition table and generates a hierarchical
1441 * token tree. As such, the states follow a hierarchical parent->child
1442 * relationship rather than a strict from->to
1443 *
1444 * A node in the token tree looks like this:
1445 *
1446 * @code array('token_name' => 'name','children' => array(...)) @endcode
1447 *
1448 * Children is an ordered list and its elements may be either other token
1449 * nodes or just strings. We override tagged to try to collapse this into XML
1450 * while still applying filters.
1451 *
1452 *
1453 * We now store patterns as the following tuple:
1454 * @code ($name, $open_pattern, $teminate_pattern). @endcode
1455 * The termination pattern may be null, in which case the $open_pattern
1456 * is complete. No transitions can occur within a complete state because
1457 * the patterns' match is fixed.
1458 *
1459 * We have two stacks. One is LuminousStatefulScanner::$token_tree_stack,
1460 * which stores the token tree, and the other is a standard state stack which
1461 * stores the current state data. State data is currently a pattern, as the
1462 * above tuple.
1463 *
1464 *
1465 * @warning Currently 'stream filters' are not applied, because we at no point
1466 * end up with a flat stream of tokens. Although the rule name remapper is
1467 * applied.
1468 *
1469 */
1470class LuminousStatefulScanner extends LuminousSimpleScanner {
1471
1472
1473  /// @brief Transition table
1474  protected $transitions = array();
1475
1476  /**
1477   * @brief Legal transitions for the current state
1478   *
1479   * @see LuminousStatefulScanner::load_transitions()
1480   */
1481  protected $legal_transitions = array();
1482
1483  /**
1484   * @brief Pattern list
1485   *
1486   * Pattern array. Each pattern is a tuple of
1487   * @code ($name, $open_pattern, $teminate_pattern) @endcode
1488   */
1489  protected $patterns = array();
1490
1491  /**
1492   * @brief The token tree
1493   *
1494   * The tokens we end up with are a tree which we build as we go along. The
1495   * easiest way to build it is to keep track of the currently active node on
1496   * top of a stack. When the node is completed, we pop it and insert it as
1497   * a child of the element which is now at the top of the stack.
1498   *
1499   * At the end of the process we end up with one element in here which is
1500   * the root node.
1501   */
1502  protected $token_tree_stack = array();
1503
1504  /// Records whether or not the FSM has been set up for the first time.
1505  /// @see setup()
1506  private $setup = false;
1507  /// remembers the state on the last iteration so we know whether or not
1508  /// to load in a new transition-set
1509  private $last_state = null;
1510
1511  /// Cache of transition rules
1512  /// @see next_start_data()
1513  private $transition_rule_cache = array();
1514
1515
1516  /**
1517   * Pushes a new token onto the stack as a child of the currently active
1518   * token
1519   *
1520   * @see push_state
1521   * @internal
1522   */
1523  function push_child($child) {
1524    assert(!empty($this->token_tree_stack));
1525    $this->token_tree_stack[] = $child;
1526  }
1527
1528  /**
1529   * @brief Pushes a state
1530   *
1531   * @param $state_data A tuple of ($name, $open_pattern, $teminate_pattern).
1532   * This should be as it is stored in LuminousStatefulScanner::patterns
1533   *
1534   * This actually causes two push operations. One is onto the token_tree_stack,
1535   * and the other is onto the actual stack. The former creates a new token,
1536   * the latter is used for state information
1537   */
1538  function push_state($state_data) {
1539    $token_node = array('token_name' => $state_data[0], 'children'=>array());
1540    $this->push_child($token_node);
1541    $this->push($state_data);
1542  }
1543
1544
1545  /**
1546   * @brief Pops a state from the stack.
1547   *
1548   * The top token on the token_tree_stack is popped and appended as a child to
1549   * the new top token.
1550   *
1551   * The top state on the state stack is popped and discarded.
1552   * @throw Exception if there is only the initial state on the stack
1553   * (we cannot pop the initial state, because then we have no state at all)
1554   */
1555  function pop_state() {
1556    $c = count($this->token_tree_stack);
1557    if ($c <= 1) {
1558      throw new Exception('Attempted to pop the initial state');
1559    }
1560    $s = array_pop($this->token_tree_stack);
1561    // -2 because we popped once since counting
1562    $this->token_tree_stack[$c-2]['children'][] = $s;
1563    $this->pop();
1564  }
1565
1566  /**
1567   * @brief Adds a state transition
1568   *
1569   * This is a helper function for LuminousStatefulScanner::transitions, you
1570   * can specify it directly instead
1571   * @param $from The parent state
1572   * @param $to The child state
1573   */
1574  function add_transition($from, $to) {
1575    if (!isset($this->transitions[$from])) $this->transitions[$from] = array();
1576    $this->transitions[$from][] = $to;
1577  }
1578
1579  /**
1580   * @brief Gets the name of the current state
1581   *
1582   * @returns The name of the current state
1583   */
1584  function state_name() {
1585    $state_data = $this->state();
1586    if ($state_data === null) return 'initial';
1587    $state_name = $state_data[0];
1588    return $state_name;
1589  }
1590
1591  /**
1592   * @brief Adds a pattern
1593   *
1594   * @param $name the name of the pattern/state
1595   * @param $pattern Either the entire pattern, or just its opening delimiter
1596   * @param $end If $pattern was just the opening delimiter, $end is the closing
1597   * delimiter. Separating the two delimiters like this makes the state flexible
1598   * length, as state transitions can occur inside it.
1599   * @param $consume Not currently observed. Might never be. Don't specify this yet.
1600   */
1601  function add_pattern($name, $pattern, $end=null, $consume=true) {
1602    $this->patterns[] = array($name, $pattern, $end, $consume);
1603  }
1604
1605
1606  /**
1607   * @brief Loads legal state transitions for the current state
1608   *
1609   * Loads in legal state transitions into the legal_transitions array
1610   * according to the current state
1611   */
1612  function load_transitions() {
1613    $state_name = $this->state_name();
1614    if ($this->last_state === $state_name) return;
1615    $this->last_state = $state_name;
1616    if (isset($this->transitions[$state_name]))
1617      $this->legal_transitions = $this->transitions[$this->state_name()];
1618    else $this->legal_transitions = array();
1619  }
1620
1621  /**
1622   * @brief Looks for the next state-pop sequence (close/end) for the current state
1623   *
1624   * @returns Data in the same format as get_next: a tuple of (next, matches).
1625   * If no match is found, next is -1 and matches is null
1626   */
1627  function next_end_data() {
1628    $state_data = $this->state();
1629    if ($state_data === null) {
1630      return array(-1, null); // init/root state
1631    }
1632    $term_pattern = $state_data[2];
1633    assert($term_pattern !== null);
1634    $data = $this->get_next(array($term_pattern));
1635    return $data;
1636  }
1637
1638  /**
1639   * @brief Looks for the next legal state transition
1640   *
1641   * @returns A tuple of (pattern_data, next, matches).
1642   * If no match is found, next is -1 and pattern_data and matches is null
1643   */
1644  function next_start_data() {
1645    $patterns = array();
1646    $states = array();
1647    $sn = $this->state_name();
1648    // at the moment we are using get_next_named, so we have to convert
1649    // our patterns into key=>pattern so it can return to us a key. We use
1650    // numerical indices which also correspond with 'states' for full pattern
1651    // data. We are caching this.
1652    // TODO turns out get_next_named is pretty slow and we'd be better off
1653    // caching some results inside the pattern data
1654    if (isset($this->transition_rule_cache[$sn]))
1655      list($patterns,$states) = $this->transition_rule_cache[$sn];
1656    else {
1657      foreach($this->legal_transitions as $t) {
1658        foreach($this->patterns as $p) {
1659          if ($p[0] === $t) {
1660            $patterns[] = $p[1];
1661            $states[] = $p;
1662          }
1663        }
1664      }
1665      $this->transition_rule_cache[$sn] = array($patterns, $states);
1666    }
1667    $next = $this->get_next_named($patterns);
1668    // map to real state data
1669    if ($next[1] !== -1) {
1670      $next[0] = $states[$next[0]];
1671    }
1672    return $next;
1673  }
1674
1675  /**
1676   * @brief Sets up the FSM
1677   *
1678   * If the caller has omitted to specify an initial state then one is created,
1679   * with valid transitions to all other known states. We also push the
1680   * initial state onto the tree stack, and add a type mapping from the initial
1681   * type to @c NULL.
1682   */
1683  protected function setup() {
1684    if ($this->setup) return;
1685    $this->setup = true;
1686    if (!isset($this->transitions['initial'])) {
1687      $initial = array();
1688      foreach($this->patterns as $p) $initial[] = $p[0];
1689      $this->transitions['initial'] = $initial;
1690    }
1691    $this->token_tree_stack[] = array('token_name' => 'initial',
1692      'children'=>array());
1693    $this->rule_tag_map['initial'] = null;
1694  }
1695
1696  /**
1697   * Records a string as a child of the currently active token
1698   * @warning the second and third parameters are not applicable to this
1699   * method, they are only present to suppress PHP warnings. If you set them,
1700   * an exception is thrown.
1701   *
1702   */
1703  function record($str, $dummy1=null, $dummy2=null) {
1704    if ($dummy1 !== null || $dummy2 !== null) {
1705      throw new Exception('LuminousStatefulScanner::record does not currently'
1706        . ' observe its second and third parameters');
1707    }
1708    // NOTE to self: if ever this needs to change, don't call count on $c.
1709    // Dereference it first: http://bugs.php.net/bug.php?id=34540
1710    $c = &$this->token_tree_stack[count($this->token_tree_stack)-1]['children'];
1711    $c[] = $str;
1712  }
1713  /**
1714   * @brief Records a complete token
1715   * This is shorthand for pushing a new node onto the stack, recording its
1716   * text, and then popping it
1717   *
1718   * @param $str the string
1719   * @param $type the token type
1720   */
1721  function record_token($str, $type) {
1722    $state_data = array($type);
1723    $this->push_state($state_data);
1724    $this->record($str);
1725    $this->pop_state();
1726  }
1727
1728  /**
1729   * @brief Helper function to record a range of the string
1730   * @param $from the start index
1731   * @param $to the end index
1732   * @param $type dummy argument
1733   * This is shorthand for
1734   * <code> $this->record(substr($this->string(), $from, $to-$from)</code>
1735   *
1736   * @throw RangeException if the range is invalid (i.e. $to < $from)
1737   *
1738   * An empty range (i.e. $to === $from) is allowed, but it is essentially a
1739   * no-op.
1740   */
1741  function record_range($from, $to, $type=null) {
1742    if ($type !== null) throw new Exception('type argument not supported in '
1743      . ' LuminousStatefulScanner::record_range');
1744    if ($to === $from)
1745      return;
1746    else if ($to > $from)
1747      $this->record(substr($this->string(), $from, $to-$from), $type);
1748    else
1749      throw new RangeException("Invalid range supplied [$from, $to]");
1750  }
1751
1752
1753  /**
1754   * Generic main function which observes the transition table
1755   */
1756  function main() {
1757    $this->setup();
1758    while (!$this->eos()) {
1759      $p = $this->pos();
1760      $state = $this->state_name();
1761
1762      $this->load_transitions();
1763      list($next_pattern_data,
1764           $next_pattern_index,
1765           $next_pattern_matches) = $this->next_start_data();
1766      list($end_index, $end_matches) = $this->next_end_data();
1767
1768
1769      if( ($next_pattern_index <= $end_index || $end_index === -1) && $next_pattern_index !== -1) {
1770        // we're pushing a new state
1771        if ($p < $next_pattern_index)
1772          $this->record_range($p, $next_pattern_index);
1773        $new_pos = $next_pattern_index;
1774        $this->pos($new_pos);
1775        $tok = $next_pattern_data[0];
1776        if (isset($this->overrides[$tok])) {
1777          // call override
1778          $ret = call_user_func($this->overrides[$tok], $next_pattern_matches);
1779          if ($ret === true) break;
1780          if ($this->state_name() === $state && $this->pos() <= $new_pos) {
1781            throw new Exception('Override failed to either advance the pointer'
1782              . ' or change the state');
1783          }
1784        } else {
1785          // no override
1786          $this->pos_shift(strlen($next_pattern_matches[0]));
1787          $this->push_state($next_pattern_data);
1788          $this->record($next_pattern_matches[0]);
1789          if ($next_pattern_data[2] === null) {
1790            // state was a full pattern, so pop now
1791            $this->pop_state();
1792          }
1793        }
1794      } elseif($end_index !== -1) {
1795        // we're at the end of a state, record what's left and pop it
1796        $to = $end_index + strlen($end_matches[0]);
1797        $this->record_range($this->pos(), $to);
1798        $this->pos($to);
1799        $this->pop_state();
1800      }
1801      else {
1802        // no more matches, consume the rest of the stirng and break
1803        $this->record($this->rest());
1804        $this->terminate();
1805        break;
1806      }
1807      if ($this->state_name() === $state && $this->pos() <= $p) {
1808        throw new Exception('Failed to advance pointer in state'
1809          . $this->state_name());
1810      }
1811    }
1812
1813    // unterminated states will have left some tokens open, we need to
1814    // close these so there's just the root node on the stack
1815    assert(count($this->token_tree_stack) >= 1);
1816    while(count($this->token_tree_stack) > 1)
1817      $this->pop_state();
1818
1819    return $this->token_tree_stack[0];
1820  }
1821
1822  /**
1823   * Recursive function to collapse the token tree into XML
1824   * @internal
1825   */
1826  protected function collapse_token_tree($node) {
1827    $text = '';
1828    foreach($node['children'] as $c) {
1829      if (is_string($c)) $text .= LuminousUtils::escape_string($c);
1830      else $text .= $this->collapse_token_tree($c);
1831    }
1832    $token_name = $node['token_name'];
1833    $token = array($node['token_name'], $text, true);
1834
1835    $token_ = $this->rule_mapper_filter(array($token));
1836    $token = $token_[0];
1837
1838    if (isset($this->filters[$token_name])) {
1839      foreach($this->filters[$token_name] as $filter) {
1840        $token = call_user_func($filter[1], $token);
1841      }
1842    }
1843    list($token_name, $text,) = $token;
1844    return ($token_name === null)?
1845      $text : LuminousUtils::tag_block($token_name, $text);
1846  }
1847
1848  function tagged() {
1849    $stream = $this->collapse_token_tree($this->token_tree_stack[0]);
1850    return $stream;
1851  }
1852}
1853
1854/// @endcond
1855