1<?php 2 3/// @cond CORE 4 5/** 6 * @file 7 * @brief The base scanner classes 8 * 9 * This file contains the base scanning classes. All language scanners should 10 * subclass one of these. They are all essentially abstract as far as Luminous 11 * is concerned, but it may occasionally be useful to instantiate one. 12 * 13 * The classes defined here follow a simple hierarchy: Scanner defines basic 14 * string scanning methods, LuminousScanner extends this with logic useful to 15 * highlighting. LuminousEmbeddedWebScript adds some helpers for web languages, 16 * which may be nested in other languages. LuminousSimpleScanner is a 17 * generic flat scanner which is driven by token rules. 18 * LuminousStatefulScanner is a generic transition table driven scanner. 19 */ 20 21require_once(dirname(__FILE__) . '/strsearch.class.php'); 22require_once(dirname(__FILE__) . '/utils.class.php'); 23require_once(dirname(__FILE__) . '/filters.class.php'); 24require_once(dirname(__FILE__) . '/tokenpresets.class.php'); 25 26if (!defined('LUMINOUS_DEBUG')) 27 define('LUMINOUS_DEBUG', false); 28 29/** 30 * @brief Base string scanning class 31 * 32 * The Scanner class is the base class which handles traversing a string while 33 * searching for various different tokens. 34 * It is loosely based on Ruby's StringScanner. 35 * 36 * The rough idea is we keep track of the position (a string pointer) and 37 * use scan() to see what matches at the current position. 38 * 39 * It also provides some automation methods, but it's fairly low-level as 40 * regards string scanning. 41 * 42 * Scanner is abstract as far as Luminous is concerned. LuminousScanner extends 43 * Scanner significantly with some methods which are useful for recording 44 * highlighting related data. 45 * 46 * @see LuminousScanner 47 */ 48class Scanner { 49 /// @brief Our local copy of the input string to be scanned. 50 private $src; 51 52 /// @brief Length of input string (cached for performance) 53 private $src_len; 54 55 /// @brief The current scan pointer (AKA the offset or index) 56 private $index; 57 58 /** 59 * @brief Match history 60 * 61 * History of matches. This is an array (queue), which should have at most 62 * two elements. Each element consists of an array: 63 * 64 * 0 => Scan pointer when the match was found, 65 * 1 => Match index (probably the same as scan pointer, but not necessarily), 66 * 2 => Match data (match groups, as map, as returned by PCRE) 67 * 68 * @note Numerical indices are used for performance. 69 */ 70 private $match_history = array(null, null); 71 72 /// @brief LuminousStringSearch instance (caches preg_* results) 73 private $ss; 74 75 /// @brief Caller defined patterns used by next_match() 76 private $patterns = array(); 77 78 /// constructor 79 function __construct($src=null) { 80 $this->string($src); 81 } 82 83 /** 84 * @brief Gets the remaining string 85 * 86 * @return The rest of the string, which has not yet been consumed 87 */ 88 function rest() { 89 $rest = substr($this->src, $this->index); 90 if ($rest === false) $rest = ''; 91 return $rest; 92 } 93 94 /** 95 * @brief Getter and setter for the current position (string pointer). 96 * 97 * @param $new_pos The new position (leave \c NULL to use as a getter), note 98 * that this will be clipped to a legal string index if you specify a 99 * negative number or an index greater than the string's length. 100 * @return the current string pointer 101 */ 102 function pos($new_pos=null) { 103 if ($new_pos !== null) { 104 $new_pos = max(min((int)$new_pos, $this->src_len), 0); 105 $this->index = $new_pos; 106 } 107 return $this->index; 108 } 109 110 /** 111 * @brief Moves the string pointer by a given offset 112 * 113 * @param $offset the offset by which to move the pointer. This can be positve 114 * or negative, but using a negative offset is currently generally unsafe. 115 * You should use unscan() to revert the last operation. 116 * @see pos 117 * @see unscan 118 */ 119 function pos_shift($offset) { 120 $this->pos( $this->pos() + $offset ); 121 } 122 /** 123 * @brief Beginning of line? 124 * 125 * @return @c TRUE if the scan pointer is at the beginning of a line (i.e. 126 * immediately following a newline character), or at the beginning of the 127 * string, else @c FALSE 128 */ 129 function bol() { 130 return $this->index === 0 || $this->src[$this->index-1] === "\n"; 131 } 132 133 /** 134 * @brief End of line? 135 * 136 * @return @c TRUE if the scan pointer is at the end of a line (i.e. 137 * immediately preceding a newline character), or at the end of the 138 * string, else @c FALSE 139 */ 140 function eol() { 141 return ($this->eos() || $this->src[$this->index] === "\n"); 142 } 143 144 /** 145 * @brief End of string? 146 * 147 * @return @c TRUE if the scan pointer at the end of the string, else 148 * @c FALSE. 149 */ 150 function eos() { 151 return $this->index >= $this->src_len; 152 } 153 154 /** 155 * @brief Reset the scanner 156 * 157 * Resets the scanner: sets the scan pointer to 0 and clears the match 158 * history. 159 */ 160 function reset() { 161 $this->pos(0); 162 $this->match_history = array(null, null); 163 $this->ss = new LuminousStringSearch($this->src); 164 } 165 166 /** 167 * @brief Getter and setter for the source string 168 * 169 * @param $s The new source string (leave as @c NULL to use this method as a 170 * getter) 171 * @return The current source string 172 * 173 * @note This method triggers a reset() 174 * @note Any strings passed into this method are converted to Unix line 175 * endings, i.e. @c \\n 176 */ 177 function string($s=null) { 178 if ($s !== null) { 179 $s = str_replace("\r\n", "\n", $s); 180 $s = str_replace("\r", "\n", $s); 181 $this->src = $s; 182 $this->src_len = strlen($s); 183 $this->reset(); 184 } 185 return $this->src; 186 } 187 188 /** 189 * @brief Ends scanning of a string 190 * 191 * Moves the scan pointer to the end of the string, terminating the 192 * current scan. 193 */ 194 function terminate() { 195 $this->reset(); 196 $this->pos($this->src_len); 197 } 198 199 /** 200 * @brief Lookahead into the string a given number of bytes 201 * 202 * @param $n The number of bytes. 203 * @return The given number of bytes from the string from the current scan 204 * pointer onwards. The returned string will be at most n bytes long, it may 205 * be shorter or the empty string if the scanner is in the termination 206 * position. 207 * 208 * @note This method is identitical to get(), but it does not consume the 209 * string. 210 * @note neither get nor peek logs its matches into the match history. 211 */ 212 function peek($n=1) { 213 if ($n === 0 || $this->eos()) return ''; 214 elseif ($n === 1) return $this->src[$this->index]; 215 else return substr($this->src, $this->index, $n); 216 } 217 218 /** 219 * @brief Consume a given number of bytes 220 * 221 * @param $n The number of bytes. 222 * @return The given number of bytes from the string from the current scan 223 * pointer onwards. The returned string will be at most n bytes long, it may 224 * be shorter or the empty string if the scanner is in the termination 225 * position. 226 * 227 * @note This method is identitical to peek(), but it does consume the 228 * string. 229 * @note neither get nor peek logs its matches into the match history. 230 */ 231 function get($n=1) { 232 $p = $this->peek($n); 233 $this->index += strlen($p); 234 return $p; 235 } 236 237 /** 238 * @brief Get the result of the most recent match operation. 239 * 240 * @return The return value is either a string or \c NULL depending on 241 * whether or not the most recent scanning function matched anything. 242 * 243 * @throw Exception if no matches have been recorded. 244 * 245 */ 246 function match() { 247// $index = false; 248 if (isset($this->match_history[0])) { 249 return $this->match_history[0][2][0]; 250 } 251 throw new Exception('match history empty'); 252 } 253 254 /** 255 * @brief Get the match groups of the most recent match operation. 256 * 257 * @return The return value is either an array/map or \c NULL depending on 258 * whether or not the most recent scanning function was successful. The map 259 * is the same as PCRE returns, i.e. group_name => match_string, where 260 * group_name may be a string or numerical index. 261 * 262 * @throw Exception if no matches have been recorded. 263 */ 264 function match_groups() { 265 if (isset($this->match_history[0])) 266 return $this->match_history[0][2]; 267 throw new Exception('match history empty'); 268 } 269 270 /** 271 * 272 * @brief Get a group from the most recent match operation 273 * 274 * @param $g the group's numerical index or name, in the case of named 275 * subpatterns. 276 * @return A string represeting the group's contents. 277 * 278 * @see match_groups() 279 * 280 * @throw Exception if no matches have been recorded. 281 * @throw Exception if matches have been recorded, but the group does not 282 * exist. 283 */ 284 function match_group($g=0) { 285 if (isset($this->match_history[0])) { 286 if (isset($this->match_history[0][2])) { 287 if (isset($this->match_history[0][2][$g])) 288 return $this->match_history[0][2][$g]; 289 throw new Exception("No such group '$g'"); 290 } 291 } 292 throw new Exception('match history empty'); 293 } 294 295 /** 296 * @brief Get the position (offset) of the most recent match 297 * 298 * @return The position, as integer. This is a standard zero-indexed offset 299 * into the string. It is independent of the scan pointer. 300 * 301 * @throw Exception if no matches have been recorded. 302 */ 303 function match_pos() { 304 if (isset($this->match_history[0])) 305 return $this->match_history[0][1]; 306 307 throw new Exception('match history empty'); 308 } 309 /** 310 * @brief Helper function to log a match into the history 311 * 312 * @internal 313 */ 314 private function __log_match($index, $match_pos, $match_data) { 315 if (isset($this->match_history[0])) { 316 $this->match_history[1] = $this->match_history[0]; 317 } 318 $m = & $this->match_history[0]; 319 $m[0] = $index; 320 $m[1] = $match_pos; 321 $m[2] = $match_data; 322 } 323 324 /** 325 * 326 * @brief Revert the most recent scanning operation. 327 * 328 * Unscans the most recent match. The match is removed from the history, and 329 * the scan pointer is moved to where it was before the match. 330 * 331 * Calls to get(), and peek() are not logged and are therefore not 332 * unscannable. 333 * 334 * @warning Do not call unscan more than once before calling a scanning 335 * function. This is not currently defined. 336 */ 337 function unscan() { 338 if (isset($this->match_history[0])) { 339 $this->index = $this->match_history[0][0]; 340 if (isset($this->match_history[1])) { 341 $this->match_history[0] = $this->match_history[1]; 342 $this->match_history[1] = null; 343 } else 344 $this->match_history[0] = null; 345 } 346 else 347 throw new Exception('match history empty'); 348 } 349 350 /** 351 * @brief Helper function to consume a match 352 * 353 * @param $pos (int) The match position 354 * @param $consume_match (bool) Whether or not to consume the actual matched 355 * text 356 * @param $match_data The matching groups, as returned by PCRE. 357 * @internal 358 */ 359 private function __consume($pos, $consume_match, $match_data) { 360 $this->index = $pos; 361 if ($consume_match) $this->index += strlen($match_data[0]); 362 } 363 364 /** 365 * @brief The real scanning function 366 * 367 * @internal 368 * @param $pattern The pattern to scan for 369 * @param $instant Whether or not the only legal match is at the 370 * current scan pointer or whether one beyond the scan pointer is also 371 * legal. 372 * @param $consume Whether or not to consume string as a result of matching 373 * @param $consume_match Whether or not to consume the actual matched string. 374 * This only has effect if $consume is @c TRUE. If $instant is @c TRUE, 375 * $consume is true and $consume_match is @c FALSE, the intermediate 376 * substring is consumed and the scan pointer moved to the beginning of the 377 * match, and the substring is recorded as a single-group match. 378 * @param $log whether or not to log the matches into the match_register 379 * @return The matched string or null. This is subsequently 380 * equivalent to match() or match_groups()[0] or match_group(0). 381 */ 382 private function __check($pattern, $instant=true, $consume=true, 383 $consume_match=true, $log=true) { 384 $matches = null; 385 $index = $this->index; 386 $pos = null; 387 if (($pos = $this->ss->match($pattern, $this->index, $matches)) !== false) { 388 if ($instant && $pos !== $index) { 389 $matches = null; 390 } 391 // don't consume match and not instant: the match we are interested in 392 // is actually the substring between the start and the match. 393 // this is used by scan_to 394 if (!$consume_match && !$instant) { 395 $matches = array(substr($this->src, $this->index, $pos-$this->index)); 396 } 397 } 398 else $matches = null; 399 400 if ($log) { 401 $this->__log_match($index, $pos, $matches); 402 } 403 if ($matches !== null && $consume) { 404 $this->__consume($pos, $consume_match, $matches); 405 } 406 return ($matches === null)? null : $matches[0]; 407 } 408 409 /** 410 * @brief Scans at the current pointer 411 * 412 * Looks for the given pattern at the current index and consumes and logs it 413 * if it is found. 414 * @param $pattern the pattern to search for 415 * @return @c null if not found, else the full match. 416 */ 417 function scan($pattern) { 418 return $this->__check($pattern); 419 } 420 421 /** 422 * @brief Scans until the start of a pattern 423 * 424 * Looks for the given pattern anywhere beyond the current index and 425 * advances the scan pointer to the start of the pattern. The match is logged. 426 * 427 * The match itself is not consumed. 428 * 429 * @param $pattern the pattern to search for 430 * @return The substring between here and the given pattern, or @c null if it 431 * is not found. 432 */ 433 function scan_until($pattern) { 434 return $this->__check($pattern, false, true, false, true); 435 } 436 437 438 /** 439 * @brief Non-consuming lookahead 440 * 441 * Looks for the given pattern at the current index and logs it 442 * if it is found, but does not consume it. This is a look-ahead. 443 * @param $pattern the pattern to search for 444 * @return @c null if not found, else the matched string. 445 */ 446 function check($pattern) { 447 return $this->__check($pattern, true, false, false, true); 448 } 449 450 /** 451 * @brief Find the index of the next occurrence of a pattern 452 * 453 * @param $pattern the pattern to search for 454 * @return The next index of the pattern, or -1 if it is not found 455 */ 456 function index($pattern) { 457 $ret = $this->ss->match($pattern, $this->index, $dontcare_ref); 458 return ($ret !== false)? $ret : -1; 459 } 460 461 /** 462 * @brief Find the index of the next occurrence of a named pattern 463 * @param $patterns A map of $name=>$pattern 464 * @return An array: ($name, $index, $matches). If there is no next match, 465 * name will be null, index will be -1 and matches will be null. 466 * 467 * @note consider using this method to build a transition table 468 * 469 */ 470 471 function get_next_named($patterns) { 472 $next = -1; 473 $matches = null; 474 $name = null; 475 $m; 476 foreach($patterns as $name_=>$p) { 477 $index = $this->ss->match($p, $this->index, $m); 478 if ($index === false) continue; 479 if ($next === -1 || $index < $next) { 480 $next = $index; 481 $matches = $m; 482 assert($m !== null); 483 $name = $name_; 484 if ($index === $this->index) break; 485 } 486 } 487 return array($name, $next, $matches); 488 } 489 490 /** 491 * @brief Look for the next occurrence of a set of patterns 492 * 493 * Finds the next match of the given patterns and returns it. The 494 * string is not consumed or logged. 495 * Convenience function. 496 * @param $patterns an array of regular expressions 497 * @return an array of (0=>index, 1=>match_groups). The index may be -1 if 498 * no pattern is found. 499 */ 500 function get_next($patterns) { 501 $next = -1; 502 $matches = null; 503 foreach($patterns as $p) { 504 $m; 505 $index = $this->ss->match($p, $this->index, $m); 506 if ($index === false) continue; 507 if ($next === -1 || $index < $next) { 508 $next = $index; 509 $matches = $m; 510 assert($m !== null); 511 } 512 } 513 return array($next, $matches); 514 } 515 516 /** 517 * @brief Look for the next occurrence of a set of substrings 518 * 519 * Like get_next() but uses strpos instead of preg_* 520 * @return An array: 0 => index 1 => substring. If the substring is not found, 521 * index is -1 and substring is null 522 * @see get_next() 523 */ 524 function get_next_strpos($patterns) { 525 $next = -1; 526 $match = null; 527 foreach($patterns as $p) { 528 $index = strpos($this->src, $p, $this->index); 529 if ($index === false) continue; 530 if ($next === -1 || $index < $next) { 531 $next = $index; 532 $match = $p; 533 } 534 } 535 return array($next, $match); 536 } 537 538 539 /** 540 * @brief Allows the caller to add a predefined named pattern 541 * 542 * Adds a predefined pattern which is visible to next_match. 543 * 544 * @param $name A name for the pattern. This does not have to be unique. 545 * @param $pattern A regular expression pattern. 546 */ 547 function add_pattern($name, $pattern) { 548 $this->patterns[] = array($name, $pattern . 'S', -1, null); 549 } 550 /** 551 * @brief Allows the caller to remove a named pattern 552 * 553 * @param $name the name of the pattern to remove, this should be as it was 554 * supplied to add_pattern(). 555 * @warning If there are multiple patterns with the same name, they will all 556 * be removed. 557 */ 558 function remove_pattern($name) { 559 foreach($this->patterns as $k=>$p) { 560 if ($p[0] === $name) unset($this->patterns[$k]); 561 } 562 } 563 564 /** 565 * @brief Automation function: returns the next occurrence of any known patterns. 566 * 567 * Iterates over the predefined patterns array (add_pattern) and consumes/logs 568 * the nearest match, skipping unrecognised segments of string. 569 * @return An array: 570 * 0 => pattern name (as given to add_pattern) 571 * 1 => match index (although the scan pointer will have progressed to the 572 * end of the match if the pattern is consumed). 573 * When no more matches are found, return value is @c NULL and nothing is 574 * logged. 575 * 576 * 577 * @param $consume_and_log If this is @c FALSE, the pattern is not consumed 578 * or logged. 579 * 580 * @warning this method is not the same as get_next. This does not return 581 * the match groups, instead it returns a name. The ordering of the return 582 * array is also different, but the array does in fact hold different data. 583 */ 584 function next_match($consume_and_log=true) { 585 $target = $this->index; 586 587 $nearest_index = -1; 588 $nearest_key = -1; 589 $nearest_name = null; 590 $nearest_match_data = null; 591 592 foreach($this->patterns as &$p_data) { 593 $name = $p_data[0]; 594 $pattern = $p_data[1]; 595 $index = &$p_data[2]; 596 $match_data = &$p_data[3]; 597 598 if ($index !== false && $index < $target) { 599 $index = $this->ss->match($pattern, $target, $match_data); 600 } 601 602 if ($index === false) { 603 unset($p_data); 604 continue; 605 } 606 607 if ($nearest_index === -1 || $index < $nearest_index) { 608 $nearest_index = $index; 609 $nearest_name = $name; 610 $nearest_match_data = $match_data; 611 if ($index === $target) break; 612 } 613 } 614 615 if ($nearest_index !== -1) { 616 if ($consume_and_log) { 617 $this->__log_match($nearest_index, $nearest_index, $nearest_match_data); 618 $this->__consume($nearest_index, true, $nearest_match_data); 619 } 620 return array($nearest_name, $nearest_index); 621 } 622 else return null; 623 } 624} 625 626 627 628 629 630 631/** 632 * @brief the base class for all scanners 633 * 634 * LuminousScanner is the base class for all language scanners. Here we 635 * provide a set of methods comprising a highlighting layer. This includes 636 * recording a token stream, and ultimately being responsible for 637 * producing some XML representing the token stream. 638 * 639 * We also define here some filters which rely on state information expected 640 * to be recorded into the instance variables. 641 * 642 * Highlighting a string at this level is a four-stage process: 643 * 644 * @li string() - set the string 645 * @li init() - set up the scanner 646 * @li main() - perform tokenization 647 * @li tagged() - build the XML 648 * 649 * 650 * 651 * A note on tokens: Tokens are stored as an array with the following indices: 652 * @li 0: Token name (e.g. 'COMMENT' 653 * @li 1: Token string (e.g. '// foo') 654 * @li 2: escaped? (bool) Because it's often more convenient to embed nested 655 * tokens by tagging token string, we need to escape it. This 656 * index stores whether or nto it has been escaped. 657 * 658 */ 659 660class LuminousScanner extends Scanner { 661 662 /// scanner version. 663 public $version = 'master'; 664 665 666 667 /** 668 * @brief The token stream 669 * 670 * The token stream is recorded as a flat array of tokens. A token is 671 * made up of 3 parts, and stored as an array: 672 * @li 0 => Token name 673 * @li 1 => Token string (from input source code) 674 * @li 2 => XML-Escaped? 675 */ 676 protected $tokens = array(); 677 678 /** 679 * @brief State stack 680 * 681 * A stack of the scanner's state, should the scanner wish to use a stack 682 * based state mechanism. 683 * 684 * The top element can be retrieved (but not popped) with stack() 685 * 686 * TODO More useful functions for manipulating the stack 687 */ 688 protected $state_ = array(); 689 690 /** 691 * @brief Individual token filters 692 * 693 * A list of lists, each filter is an array: (name, token_name, callback) 694 */ 695 protected $filters = array(); 696 697 /** 698 * @brief Token stream filters 699 * 700 * A list of lists, each filter is an array: (name, callback) 701 */ 702 protected $stream_filters = array(); 703 704 /** 705 * @brief Rule remappings 706 * 707 * A map to handle re-mapping of rules, in the form: 708 * OLD_TOKEN_NAME => NEW_TOKEN_NAME 709 * 710 * This is used by rule_mapper_filter() 711 */ 712 protected $rule_tag_map = array(); 713 714 /** 715 * @brief Identifier remappings based on definitions identified in the source code 716 * 717 * A map of remappings of user-defined types/functions. This is a map of 718 * identifier_string => TOKEN_NAME 719 * 720 * This is used by user_def_filter() 721 */ 722 protected $user_defs; 723 724 725 /** 726 * @brief A map of identifiers and their corresponding token names 727 * 728 * A map of recognised identifiers, in the form 729 * identifier_string => TOKEN_NAME 730 * 731 * This is currently used by map_identifier_filter 732 */ 733 protected $ident_map = array(); 734 735 /** 736 * @brief Whether or not the language is case sensitive 737 * 738 * Whether or not the scanner is dealing with a case sensitive language. 739 * This currently affects map_identifier_filter 740 */ 741 protected $case_sensitive = true; 742 743 /// constructor 744 function __construct($src=null) { 745 parent::__construct($src); 746 747 $this->add_filter('map-ident', 'IDENT', array($this, 748 'map_identifier_filter')); 749 750 $this->add_filter('comment-note', 'COMMENT', array('LuminousFilters', 'comment_note')); 751 $this->add_filter('comment-to-doc', 'COMMENT', array('LuminousFilters', 'generic_doc_comment')); 752 $this->add_filter('string-escape', 'STRING', array('LuminousFilters', 'string')); 753 $this->add_filter('char-escape', 'CHARACTER', array('LuminousFilters', 'string')); 754 $this->add_filter('pcre', 'REGEX', array('LuminousFilters', 'pcre')); 755 $this->add_filter('user-defs', 'IDENT', array($this, 'user_def_filter')); 756 757 $this->add_filter('constant', 'IDENT', array('LuminousFilters', 'upper_to_constant')); 758 759 $this->add_filter('clean-ident', 'IDENT', array('LuminousFilters', 'clean_ident')); 760 761 $this->add_stream_filter('rule-map', array($this, 'rule_mapper_filter')); 762 $this->add_stream_filter('oo-syntax', array('LuminousFilters', 'oo_stream_filter')); 763 } 764 765 766 /** 767 * @brief Language guessing 768 * 769 * Each real language scanner should override this method and implement a 770 * simple guessing function to estimate how likely the input source code 771 * is to be the language which they recognise. 772 * 773 * @param $src the input source code 774 * @return The estimated chance that the source code is in the same language 775 * as the one the scanner tokenizes, as a real number between 0 (least 776 * likely) and 1 (most likely), inclusive 777 */ 778 public static function guess_language($src, $info) { 779 return 0.0; 780 } 781 782 /** 783 * @brief Filter to highlight identifiers whose definitions are in the source 784 * 785 * maps anything recorded in LuminousScanner::user_defs to the recorded type. 786 * This is called as the filter 'user-defs' 787 * @internal 788 */ 789 protected function user_def_filter($token) { 790 if (isset($this->user_defs[$token[1]])) { 791 $token[0] = $this->user_defs[$token[1]]; 792 } 793 return $token; 794 } 795 796 /** 797 * @brief Rule re-mapper filter 798 * 799 * Re-maps token rules according to the LuminousScanner::rule_tag_map 800 * map. 801 * This is called as the filter 'rule-map' 802 * @internal 803 */ 804 protected function rule_mapper_filter($tokens) { 805 foreach($tokens as &$t) { 806 if (array_key_exists($t[0], $this->rule_tag_map)) 807 $t[0] = $this->rule_tag_map[$t[0]]; 808 } 809 return $tokens; 810 } 811 812 813 814 815 /** 816 * @brief Public convenience function for setting the string and highlighting it 817 * 818 * Alias for: 819 * $s->string($src) 820 * $s->init(); 821 * $s->main(); 822 * return $s->tagged(); 823 * 824 * @returns the highlighted string, as an XML string 825 */ 826 function highlight($src) { 827 $this->string($src); 828 $this->init(); 829 $this->main(); 830 return $this->tagged(); 831 } 832 833 /** 834 * @brief Set up the scanner immediately prior to tokenization. 835 * 836 * The init method is always called prior to main(). At this stage, all 837 * configuration variables are assumed to have been set, and it's now time 838 * for the scanner to perform any last set-up information. This may include 839 * actually finalizing its rule patterns. Some scanners may not need to 840 * override this if they are in no way dynamic. 841 */ 842 function init() {} 843 844 845 /** 846 * @brief the method responsible for tokenization 847 * 848 * The main method is fully responsible for tokenizing the string stored 849 * in string() at the time of its call. By the time main returns, it should 850 * have consumed the whole of the string and populated the token array. 851 * 852 */ 853 function main() {} 854 855 /** 856 * @brief Add an individual token filter 857 * 858 * Adds an indivdual token filter. The filter is bound to the given 859 * token_name. The filter is a callback which should take a token and return 860 * a token. 861 * 862 * The arguments are: [name], token_name, filter 863 * 864 * Name is an optional argument. 865 */ 866 public function add_filter($arg1, $arg2, $arg3=null) { 867 $filter = null; 868 $name = null; 869 $token = null; 870 if ($arg3 === null) { 871 $filter = $arg2; 872 $token = $arg1; 873 } else { 874 $filter = $arg3; 875 $token = $arg2; 876 $name = $arg1; 877 } 878 if (!isset($this->filters[$token])) $this->filters[$token] = array(); 879 $this->filters[$token][] = array($name, $filter); 880 } 881 882 /** 883 * @brief Removes the individual filter(s) with the given name 884 */ 885 public function remove_filter($name) { 886 foreach($this->filters as $token=>$filters) { 887 foreach($filters as $k=>$f) { 888 if ($f[0] === $name) unset($this->filters[$token][$k]); 889 } 890 } 891 } 892 893 /** 894 * @brief Removes the stream filter(s) with the given name 895 */ 896 public function remove_stream_filter($name) { 897 foreach($this->stream_filters as $k=>$f) { 898 if ($f[0] === $name) unset($this->stream_filters[$k]); 899 } 900 } 901 902 /** 903 * @brief Adds a stream filter 904 * 905 * A stream filter receives the entire token stream and should return it. 906 * 907 * The parameters are: ([name], filter). Name is an optional argument. 908 * 909 */ 910 public function add_stream_filter($arg1, $arg2=null) { 911 $filter = null; 912 $name = null; 913 if ($arg2 === null) { 914 $filter = $arg1; 915 } else { 916 $filter = $arg2; 917 $name = $arg1; 918 } 919 $this->stream_filters[] = array($name, $filter); 920 } 921 922 /** 923 * @brief Gets the top element on $state_ or null if it is empty 924 */ 925 function state() { 926 if (!isset($this->state_[0])) return null; 927 return $this->state_[count($this->state_)-1]; 928 } 929 /** 930 * @brief Pushes some data onto the stack 931 */ 932 function push($state) { 933 $this->state_[] = $state; 934 } 935 /** 936 * @brief Pops the top element of the stack, and returns it 937 * @throw Exception if the state stack is empty 938 */ 939 function pop() { 940 if (empty($this->state_)) 941 throw new Exception('Cannot pop empty state stack'); 942 return array_pop($this->state_); 943 } 944 945 /** 946 * @brief Flushes the token stream 947 */ 948 function start() { 949 $this->tokens = array(); 950 } 951 952 /** 953 * @brief Records a string as a given token type. 954 * @param $string The string to record 955 * @param $type The name of the token the string represents 956 * @param $pre_escaped Luminous works towards getting this in XML and 957 * therefore at some point, the $string has to be escaped. If you have 958 * already escaped it for some reason (or if you got it from another scanner), 959 * then you want to set this to @c TRUE 960 * @see LuminousUtils::escape_string 961 * @throw Exception if $string is @c NULL 962 */ 963 function record($string, $type, $pre_escaped=false) { 964 if ($string === null) throw new Exception('Tagging null string'); 965 $this->tokens[] = array($type, $string, $pre_escaped); 966 } 967 968 /** 969 * @brief Helper function to record a range of the string 970 * @param $from the start index 971 * @param $to the end index 972 * @param $type the type of the token 973 * This is shorthand for 974 * <code> $this->record(substr($this->string(), $from, $to-$from)</code> 975 * 976 * @throw RangeException if the range is invalid (i.e. $to < $from) 977 * 978 * An empty range (i.e. $to === $from) is allowed, but it is essentially a 979 * no-op. 980 */ 981 function record_range($from, $to, $type) { 982 if ($to === $from) 983 return; 984 else if ($to > $from) 985 $this->record(substr($this->string(), $from, $to-$from), $type); 986 else 987 throw new RangeException("Invalid range supplied [$from, $to]"); 988 } 989 990 /** 991 * @brief Returns the XML representation of the token stream 992 * 993 * This function triggers the generation of the XML output. 994 * @return An XML-string which represents the tokens recorded by the scanner. 995 */ 996 function tagged() { 997 $out = ''; 998 999 // call stream filters. 1000 foreach($this->stream_filters as $f) { 1001 $this->tokens = call_user_func($f[1], $this->tokens); 1002 } 1003 foreach($this->tokens as $t) { 1004 $type = $t[0]; 1005 1006 // speed is roughly 10% faster if we process the filters inside this 1007 // loop instead of separately. 1008 if (isset($this->filters[$type])) { 1009 foreach($this->filters[$type] as $filter) { 1010 $t = call_user_func($filter[1], $t); 1011 } 1012 } 1013 list($type, $string, $esc) = $t; 1014 1015 if (!$esc) $string = LuminousUtils::escape_string($string); 1016 if ($type !== null) 1017 $out .= LuminousUtils::tag_block($type, $string); 1018 else $out .= $string; 1019 } 1020 return $out; 1021 } 1022 1023 /** 1024 * @brief Gets the token array 1025 * @return The token array 1026 */ 1027 function token_array() { 1028 return $this->tokens; 1029 } 1030 1031 /** 1032 * @brief Identifier mapping filter 1033 * 1034 * Tries to map any 'IDENT' token to a TOKEN_NAME in 1035 * LuminousScanner::$ident_map 1036 * This is implemented as the filter 'map-ident' 1037 */ 1038 function map_identifier_filter($token) { 1039 $ident = $token[1]; 1040 if (!$this->case_sensitive) $ident = strtolower($ident); 1041 foreach($this->ident_map as $n=>$hits) { 1042 if (isset($hits[$ident])) { 1043 $token[0] = $n; 1044 break; 1045 } 1046 } 1047 return $token; 1048 } 1049 1050 /** 1051 * @brief Adds an identifier mapping which is later analysed by map_identifier_filter 1052 * @param $name The token name 1053 * @param $matches an array of identifiers which correspond to this token 1054 * name, i.e. add_identifier_mapping('KEYWORD', array('if', 'else', ...)); 1055 * 1056 * This method observes LuminousScanner::$case_sensitive 1057 */ 1058 function add_identifier_mapping($name, $matches) { 1059 $array = array(); 1060 foreach($matches as $m) { 1061 if (!$this->case_sensitive) $m = strtolower($m); 1062 $array[$m] = true; 1063 } 1064 if (!isset($this->ident_map[$name])) 1065 $this->ident_map[$name] = array(); 1066 $this->ident_map[$name] = array_merge($this->ident_map[$name], $array); 1067 } 1068 1069 /** 1070 * Convenience function 1071 * @brief Skips whitespace, and records it as a null token. 1072 */ 1073 function skip_whitespace() { 1074 if (ctype_space($this->peek())) { 1075 $this->record($this->scan('/\s+/'), null); 1076 } 1077 } 1078 1079 /** 1080 * @brief Handles tokens that may nest inside themselves 1081 * 1082 * Convenience function. It's fairly common for many languages to allow 1083 * things like nestable comments. Handling these is easy but fairly 1084 * long winded, so this function will take an opening and closing delimiter 1085 * and consume the token until it is fully closed, or until the end of 1086 * the string in the case that it is unterminated. 1087 * 1088 * When the function returns, the token will have been consumed and appended 1089 * to the token stream. 1090 * 1091 * @param $token_name the name of the token 1092 * @param $open the opening delimiter pattern (regex), e.g. '% /\\* %x' 1093 * @param $close the closing delimiter pattern (regex), e.g. '% \\* /%x' 1094 * 1095 * @warning Although PCRE provides recursive regular expressions, this 1096 * function is far preferable. A recursive regex will easily crash PCRE 1097 * on garbage input due to it having a fairly small stack: this function 1098 * is much more resilient. 1099 * 1100 * @throws Exception if called at a non-matching point (i.e. 1101 * <code>$this->scan($open)</code> does not match) 1102 * 1103 */ 1104 function nestable_token($token_name, $open, $close) { 1105 if ($this->check($open) === null) { 1106 throw new Exception('Nestable called at a non-matching point'); 1107 return; 1108 } 1109 $patterns = array('open'=>$open, 'close'=>$close); 1110 1111 $stack = 0; 1112 $start = $this->pos(); 1113 do { 1114 list($name, $index, $matches) = $this->get_next_named($patterns); 1115 if ($name === 'open') $stack++; 1116 elseif($name === 'close') $stack--; 1117 else { 1118 $this->terminate(); 1119 break; 1120 } 1121 $this->pos($index + strlen($matches[0])); 1122 } while ($stack); 1123 $substr = substr($this->string(), $start, $this->pos()-$start); 1124 $this->record($substr, $token_name); 1125 } 1126} 1127 1128 1129 1130 1131 1132/** 1133 * @brief Superclass for languages which may nest, i.e. web languages 1134 * 1135 * Web languages get their own special class because they have to deal with 1136 * server-script code embedded inside them and the potential for languages 1137 * nested under them (PHP has HTML, HTML has CSS and JavaScript) 1138 * 1139 * The relationship is strictly hierarchical, not recursive descent 1140 * Meeting a '\<?' in CSS bubbles up to HTML and then up to PHP (or whatever). 1141 * The top-level scanner is ultimately what should have sub-scanner code 1142 * embedded in its own token stream. 1143 * 1144 * The scanners should be persistent, so only one JavaScript scanner exists 1145 * even if there are 20 javascript tags. This is so they can keep persistent 1146 * state, which might be necessary if they are interrupted by server-side tags. 1147 * For this reason, the main() method might be called multiple times, therefore 1148 * each web sub-scanner should 1149 * \li Not rely on keeping state related data in main()'s function scope, 1150 * make it a class variable 1151 * \li flush its token stream every time main() is called 1152 * 1153 * The init method of the class should be used to set relevant rules based 1154 * on whether or not the embedded flags are set; and therefore the embedded 1155 * flags should be set before init is called. 1156 */ 1157abstract class LuminousEmbeddedWebScript extends LuminousScanner { 1158 1159 /** 1160 * @brief Is the source embedded in HTML? 1161 * 1162 * Embedded in HTML? i.e. do we need to observe tag terminators like \</script\> 1163 */ 1164 public $embedded_html = false; 1165 1166 /** 1167 * @brief Is the source embedded in a server-side script (e.g. PHP)? 1168 * 1169 * Embedded in a server side language? i.e. do we need to break at 1170 * (for example) \<? tags? 1171 */ 1172 public $embedded_server = false; 1173 1174 /** 1175 * @brief Opening tag for server-side code. This is a regular expression. 1176 */ 1177 public $server_tags = '/<\?/'; 1178 1179 /// @brief closing HTML tag for our code, e.g \</script\> 1180 public $script_tags; 1181 1182 1183 /** @brief I think this is ignored and obsolete */ 1184 public $interrupt = false; 1185 1186 /** 1187 * @brief Clean exit or inconvenient, mid-token forced exit 1188 * 1189 * Signifies whether the program exited due to inconvenient interruption by 1190 * a parent language (i.e. a server-side langauge), or whether it reached 1191 * a legitimate break. A server-side language isn't necessarily a dirty exit, 1192 * but if it comes in the middle of a token it is, because we need to resume 1193 * from it later. e.g.: 1194 * 1195 * var x = "this is \<?php echo 'a' ?\> string"; 1196 */ 1197 public $clean_exit = true; 1198 1199 1200 1201 /** 1202 * @brief Child scanners 1203 * 1204 * Persistent storage of child scanners, name => scanner (instance) 1205 */ 1206 protected $child_scanners = array(); 1207 1208 /** 1209 * @brief Name of interrupted token, in case of a dirty exit 1210 * 1211 * exit state logs our exit state in the case of a dirty exit: this is the 1212 * rule that was interrupted. 1213 */ 1214 protected $exit_state = null; 1215 1216 1217 /** 1218 * @brief Recovery patterns for when we reach an untimely interrupt 1219 * 1220 * If we reach a dirty exit, when we resume we need to figure out how to 1221 * continue consuming the rule that was interrupted. So essentially, this 1222 * will be a regex which matches the rule without start delimiters. 1223 * 1224 * This is a map of rule => pattern 1225 */ 1226 protected $dirty_exit_recovery = array(); 1227 1228 /** 1229 * @brief adds a child scanner 1230 * Adds a child scanner and indexes it against a name, convenience function 1231 */ 1232 public function add_child_scanner($name, $scanner) { 1233 $this->child_scanners[$name] = $scanner; 1234 } 1235 1236 // override string to hit the child scanners as well 1237 public function string($str=null) { 1238 if ($str !== null) { 1239 foreach($this->child_scanners as $s) { 1240 $s->string($str); 1241 } 1242 } 1243 return parent::string($str); 1244 } 1245 1246 /** 1247 * @brief Sets the exit data to signify the exit is dirty and will need recovering from 1248 * 1249 * @param $token_name the name of the token which is being interrupted 1250 * 1251 * @throw Exception if no recovery data is associated with the given token. 1252 */ 1253 function dirty_exit($token_name) { 1254 if (!isset($this->dirty_exit_recovery[$token_name])) { 1255 throw new Exception('No dirty exit recovery data for '. $token_name); 1256 $this->clean_exit = true; 1257 return; 1258 } 1259 $this->exit_state = $token_name; 1260 $this->interrupt = true; 1261 $this->clean_exit = false; 1262 } 1263 1264 /** 1265 * @brief Attempts to recover from a dirty exit. 1266 * 1267 * This method should be called on @b every iteration of the main loop when 1268 * LuminousEmbeddedWebScript::$clean_exit is @b FALSE. It will attempt to 1269 * recover from an interruption which left the scanner in the middle of a 1270 * token. The remainder of the token will be in Scanner::match() as usual. 1271 * 1272 * @return the name of the token which was interrupted 1273 * 1274 * @note there is no reason why a scanner should fail to recover from this, 1275 * and failing is classed as an implementation error, therefore assertions 1276 * will be failed and errors will be spewed forth. A failure can either be 1277 * because no recovery regex is set, or that the recovery regex did not 1278 * match. The former should never have been tagged as a dirty exit and the 1279 * latter should be rewritten so it must definitely match, even if the match 1280 * is zero-length or the remainder of the string. 1281 * 1282 */ 1283 function resume() { 1284 assert (!$this->clean_exit); 1285 $this->clean_exit = true; 1286 $this->interrupt = false; 1287 if (!isset($this->dirty_exit_recovery[$this->exit_state])) { 1288 throw new Exception("Not implemented error: The scanner was interrupted 1289mid-state (in state {$this->exit_state}), but there is no recovery associated 1290with this state"); 1291 return null; 1292 } 1293 $pattern = $this->dirty_exit_recovery[$this->exit_state]; 1294 $m = $this->scan($pattern); 1295 if ($m === null) throw new Exception('Implementation error: recovery 1296pattern for ' . $this->exit_state . ' failed to match'); 1297 return $this->exit_state; 1298 } 1299 1300 1301 /** 1302 * @brief Checks for a server-side script inside a matched token 1303 * 1304 * @param $token_name The token name of the matched text 1305 * @param $match The string from the last match. If this is left @c NULL then 1306 * Scanner::match() is assumed to hold the match. 1307 * @param $pos The position of the last match. If this is left @c NULL then 1308 * Scanner::match_pos() is assumed to hold the offset. 1309 * @return @c TRUE if the scanner should break, else @c FALSE 1310 * 1311 * 1312 * This method checks whether an interruption by a server-side script tag, 1313 * LuminousEmbeddedWebScript::server_tags, occurs within a matched token. 1314 * If it does, this method records the substring up until that point as 1315 * the provided $token_name, and also sets up a 'dirty exit'. This means that 1316 * some type was interrupted and we expect to have to recover from it when 1317 * the server-side language's scanner has ended. 1318 * 1319 * Returning @c TRUE is a signal that the scanner should break immediately 1320 * and let its parent scanner take over. 1321 * 1322 */ 1323 function server_break($token_name, $match=null, $pos=null) { 1324 if (!$this->embedded_server) { 1325 return false; 1326 } 1327 if ($match === null) $match = $this->match(); 1328 if ($match === null) return false; 1329 1330 if (preg_match($this->server_tags, $match, $m_, PREG_OFFSET_CAPTURE)) { 1331 $pos_ = $m_[0][1]; 1332 $this->record(substr($match, 0, $pos_), $token_name); 1333 if ($pos === null) $pos = $this->match_pos(); 1334 $this->pos($pos + $pos_); 1335 $this->dirty_exit($token_name); 1336 return true; 1337 } 1338 else return false; 1339 } 1340 1341 /** 1342 * @brief Checks for a script terminator tag inside a matched token 1343 * 1344 * @param $token_name The token name of the matched text 1345 * @param $match The string from the last match. If this is left @c NULL then 1346 * Scanner::match() is assumed to hold the match. 1347 * @param $pos The position of the last match. If this is left @c NULL then 1348 * Scanner::match_pos() is assumed to hold the offset. 1349 * @return @c TRUE if the scanner should break, else @c FALSE 1350 * 1351 * This method checks whether the string provided as match contains the 1352 * string in LuminousEmbeddedWebScript::script_tags. If yes, then it records 1353 * the substring as $token_name, advances the scan pointer to immediately 1354 * before the script tags, and returns @c TRUE. Returning @c TRUE is a 1355 * signal that the scanner should break immediately and let its parent 1356 * scanner take over. 1357 * 1358 * This condition is a 'clean_exit'. 1359 */ 1360 function script_break($token_name, $match=null, $pos=null) { 1361 if (!$this->embedded_html) return false; 1362 if ($match === null) $match = $this->match(); 1363 if ($match === null) return false; 1364 1365 if (($pos_ = stripos($this->match(), $this->script_tags)) !== false) { 1366 $this->record(substr($match, 0, $pos_), $token_name); 1367 if ($pos === null) $pos = $this->match_pos(); 1368 $this->pos($pos + $pos_); 1369 $this->clean_exit = true; 1370 return true; 1371 } 1372 else return false; 1373 } 1374} 1375 1376 1377 1378 1379 1380/** 1381 * @brief A largely automated scanner 1382 * 1383 * LuminousSimpleScanner implements a main() method and observes the 1384 * patterns added with Scanner::add_pattern() 1385 * 1386 * An overrides array allows the caller to override the handling of any token. 1387 * If an override is set for a token, the override is called when that token is 1388 * reached and the caller should consume it. If the callback fails to advance 1389 * the string pointer, an Exception is thrown. 1390 */ 1391class LuminousSimpleScanner extends LuminousScanner { 1392 1393 /** 1394 * @brief Overrides array. 1395 * 1396 * A map of TOKEN_NAME => callback. 1397 * 1398 * The callbacks are fired by main() when the TOKEN_NAME rule matches. 1399 * The callback receives the match_groups array, but the scanner is 1400 * unscan()ed before the callback is fired, so that the pos() is directly 1401 * in front of the match. The callback is responsible for consuming the 1402 * token appropriately. 1403 */ 1404 protected $overrides = array(); 1405 1406 1407 function main() { 1408 while (!$this->eos()) { 1409 $index = $this->pos(); 1410 if (($match = $this->next_match()) !== null) { 1411 $tok = $match[0]; 1412 if ($match[1] > $index) { 1413 $this->record(substr($this->string(), $index, $match[1] - $index), null); 1414 } 1415 $match = $this->match(); 1416 if (isset($this->overrides[$tok])) { 1417 $groups = $this->match_groups(); 1418 $this->unscan(); 1419 $p = $this->pos(); 1420 $ret = call_user_func($this->overrides[$tok], $groups); 1421 if ($ret === true) break; 1422 if ($this->pos() <= $p) 1423 throw new Exception('Failed to consume any string in override for ' . $tok); 1424 } else 1425 $this->record($match, $tok); 1426 } else { 1427 $this->record(substr($this->string(), $index), null); 1428 $this->terminate(); 1429 break; 1430 } 1431 } 1432 } 1433} 1434 1435 1436/** 1437 * 1438 * @brief Experimental transition table driven scanner 1439 * 1440 * The stateful scanner follows a transition table and generates a hierarchical 1441 * token tree. As such, the states follow a hierarchical parent->child 1442 * relationship rather than a strict from->to 1443 * 1444 * A node in the token tree looks like this: 1445 * 1446 * @code array('token_name' => 'name','children' => array(...)) @endcode 1447 * 1448 * Children is an ordered list and its elements may be either other token 1449 * nodes or just strings. We override tagged to try to collapse this into XML 1450 * while still applying filters. 1451 * 1452 * 1453 * We now store patterns as the following tuple: 1454 * @code ($name, $open_pattern, $teminate_pattern). @endcode 1455 * The termination pattern may be null, in which case the $open_pattern 1456 * is complete. No transitions can occur within a complete state because 1457 * the patterns' match is fixed. 1458 * 1459 * We have two stacks. One is LuminousStatefulScanner::$token_tree_stack, 1460 * which stores the token tree, and the other is a standard state stack which 1461 * stores the current state data. State data is currently a pattern, as the 1462 * above tuple. 1463 * 1464 * 1465 * @warning Currently 'stream filters' are not applied, because we at no point 1466 * end up with a flat stream of tokens. Although the rule name remapper is 1467 * applied. 1468 * 1469 */ 1470class LuminousStatefulScanner extends LuminousSimpleScanner { 1471 1472 1473 /// @brief Transition table 1474 protected $transitions = array(); 1475 1476 /** 1477 * @brief Legal transitions for the current state 1478 * 1479 * @see LuminousStatefulScanner::load_transitions() 1480 */ 1481 protected $legal_transitions = array(); 1482 1483 /** 1484 * @brief Pattern list 1485 * 1486 * Pattern array. Each pattern is a tuple of 1487 * @code ($name, $open_pattern, $teminate_pattern) @endcode 1488 */ 1489 protected $patterns = array(); 1490 1491 /** 1492 * @brief The token tree 1493 * 1494 * The tokens we end up with are a tree which we build as we go along. The 1495 * easiest way to build it is to keep track of the currently active node on 1496 * top of a stack. When the node is completed, we pop it and insert it as 1497 * a child of the element which is now at the top of the stack. 1498 * 1499 * At the end of the process we end up with one element in here which is 1500 * the root node. 1501 */ 1502 protected $token_tree_stack = array(); 1503 1504 /// Records whether or not the FSM has been set up for the first time. 1505 /// @see setup() 1506 private $setup = false; 1507 /// remembers the state on the last iteration so we know whether or not 1508 /// to load in a new transition-set 1509 private $last_state = null; 1510 1511 /// Cache of transition rules 1512 /// @see next_start_data() 1513 private $transition_rule_cache = array(); 1514 1515 1516 /** 1517 * Pushes a new token onto the stack as a child of the currently active 1518 * token 1519 * 1520 * @see push_state 1521 * @internal 1522 */ 1523 function push_child($child) { 1524 assert(!empty($this->token_tree_stack)); 1525 $this->token_tree_stack[] = $child; 1526 } 1527 1528 /** 1529 * @brief Pushes a state 1530 * 1531 * @param $state_data A tuple of ($name, $open_pattern, $teminate_pattern). 1532 * This should be as it is stored in LuminousStatefulScanner::patterns 1533 * 1534 * This actually causes two push operations. One is onto the token_tree_stack, 1535 * and the other is onto the actual stack. The former creates a new token, 1536 * the latter is used for state information 1537 */ 1538 function push_state($state_data) { 1539 $token_node = array('token_name' => $state_data[0], 'children'=>array()); 1540 $this->push_child($token_node); 1541 $this->push($state_data); 1542 } 1543 1544 1545 /** 1546 * @brief Pops a state from the stack. 1547 * 1548 * The top token on the token_tree_stack is popped and appended as a child to 1549 * the new top token. 1550 * 1551 * The top state on the state stack is popped and discarded. 1552 * @throw Exception if there is only the initial state on the stack 1553 * (we cannot pop the initial state, because then we have no state at all) 1554 */ 1555 function pop_state() { 1556 $c = count($this->token_tree_stack); 1557 if ($c <= 1) { 1558 throw new Exception('Attempted to pop the initial state'); 1559 } 1560 $s = array_pop($this->token_tree_stack); 1561 // -2 because we popped once since counting 1562 $this->token_tree_stack[$c-2]['children'][] = $s; 1563 $this->pop(); 1564 } 1565 1566 /** 1567 * @brief Adds a state transition 1568 * 1569 * This is a helper function for LuminousStatefulScanner::transitions, you 1570 * can specify it directly instead 1571 * @param $from The parent state 1572 * @param $to The child state 1573 */ 1574 function add_transition($from, $to) { 1575 if (!isset($this->transitions[$from])) $this->transitions[$from] = array(); 1576 $this->transitions[$from][] = $to; 1577 } 1578 1579 /** 1580 * @brief Gets the name of the current state 1581 * 1582 * @returns The name of the current state 1583 */ 1584 function state_name() { 1585 $state_data = $this->state(); 1586 if ($state_data === null) return 'initial'; 1587 $state_name = $state_data[0]; 1588 return $state_name; 1589 } 1590 1591 /** 1592 * @brief Adds a pattern 1593 * 1594 * @param $name the name of the pattern/state 1595 * @param $pattern Either the entire pattern, or just its opening delimiter 1596 * @param $end If $pattern was just the opening delimiter, $end is the closing 1597 * delimiter. Separating the two delimiters like this makes the state flexible 1598 * length, as state transitions can occur inside it. 1599 * @param $consume Not currently observed. Might never be. Don't specify this yet. 1600 */ 1601 function add_pattern($name, $pattern, $end=null, $consume=true) { 1602 $this->patterns[] = array($name, $pattern, $end, $consume); 1603 } 1604 1605 1606 /** 1607 * @brief Loads legal state transitions for the current state 1608 * 1609 * Loads in legal state transitions into the legal_transitions array 1610 * according to the current state 1611 */ 1612 function load_transitions() { 1613 $state_name = $this->state_name(); 1614 if ($this->last_state === $state_name) return; 1615 $this->last_state = $state_name; 1616 if (isset($this->transitions[$state_name])) 1617 $this->legal_transitions = $this->transitions[$this->state_name()]; 1618 else $this->legal_transitions = array(); 1619 } 1620 1621 /** 1622 * @brief Looks for the next state-pop sequence (close/end) for the current state 1623 * 1624 * @returns Data in the same format as get_next: a tuple of (next, matches). 1625 * If no match is found, next is -1 and matches is null 1626 */ 1627 function next_end_data() { 1628 $state_data = $this->state(); 1629 if ($state_data === null) { 1630 return array(-1, null); // init/root state 1631 } 1632 $term_pattern = $state_data[2]; 1633 assert($term_pattern !== null); 1634 $data = $this->get_next(array($term_pattern)); 1635 return $data; 1636 } 1637 1638 /** 1639 * @brief Looks for the next legal state transition 1640 * 1641 * @returns A tuple of (pattern_data, next, matches). 1642 * If no match is found, next is -1 and pattern_data and matches is null 1643 */ 1644 function next_start_data() { 1645 $patterns = array(); 1646 $states = array(); 1647 $sn = $this->state_name(); 1648 // at the moment we are using get_next_named, so we have to convert 1649 // our patterns into key=>pattern so it can return to us a key. We use 1650 // numerical indices which also correspond with 'states' for full pattern 1651 // data. We are caching this. 1652 // TODO turns out get_next_named is pretty slow and we'd be better off 1653 // caching some results inside the pattern data 1654 if (isset($this->transition_rule_cache[$sn])) 1655 list($patterns,$states) = $this->transition_rule_cache[$sn]; 1656 else { 1657 foreach($this->legal_transitions as $t) { 1658 foreach($this->patterns as $p) { 1659 if ($p[0] === $t) { 1660 $patterns[] = $p[1]; 1661 $states[] = $p; 1662 } 1663 } 1664 } 1665 $this->transition_rule_cache[$sn] = array($patterns, $states); 1666 } 1667 $next = $this->get_next_named($patterns); 1668 // map to real state data 1669 if ($next[1] !== -1) { 1670 $next[0] = $states[$next[0]]; 1671 } 1672 return $next; 1673 } 1674 1675 /** 1676 * @brief Sets up the FSM 1677 * 1678 * If the caller has omitted to specify an initial state then one is created, 1679 * with valid transitions to all other known states. We also push the 1680 * initial state onto the tree stack, and add a type mapping from the initial 1681 * type to @c NULL. 1682 */ 1683 protected function setup() { 1684 if ($this->setup) return; 1685 $this->setup = true; 1686 if (!isset($this->transitions['initial'])) { 1687 $initial = array(); 1688 foreach($this->patterns as $p) $initial[] = $p[0]; 1689 $this->transitions['initial'] = $initial; 1690 } 1691 $this->token_tree_stack[] = array('token_name' => 'initial', 1692 'children'=>array()); 1693 $this->rule_tag_map['initial'] = null; 1694 } 1695 1696 /** 1697 * Records a string as a child of the currently active token 1698 * @warning the second and third parameters are not applicable to this 1699 * method, they are only present to suppress PHP warnings. If you set them, 1700 * an exception is thrown. 1701 * 1702 */ 1703 function record($str, $dummy1=null, $dummy2=null) { 1704 if ($dummy1 !== null || $dummy2 !== null) { 1705 throw new Exception('LuminousStatefulScanner::record does not currently' 1706 . ' observe its second and third parameters'); 1707 } 1708 // NOTE to self: if ever this needs to change, don't call count on $c. 1709 // Dereference it first: http://bugs.php.net/bug.php?id=34540 1710 $c = &$this->token_tree_stack[count($this->token_tree_stack)-1]['children']; 1711 $c[] = $str; 1712 } 1713 /** 1714 * @brief Records a complete token 1715 * This is shorthand for pushing a new node onto the stack, recording its 1716 * text, and then popping it 1717 * 1718 * @param $str the string 1719 * @param $type the token type 1720 */ 1721 function record_token($str, $type) { 1722 $state_data = array($type); 1723 $this->push_state($state_data); 1724 $this->record($str); 1725 $this->pop_state(); 1726 } 1727 1728 /** 1729 * @brief Helper function to record a range of the string 1730 * @param $from the start index 1731 * @param $to the end index 1732 * @param $type dummy argument 1733 * This is shorthand for 1734 * <code> $this->record(substr($this->string(), $from, $to-$from)</code> 1735 * 1736 * @throw RangeException if the range is invalid (i.e. $to < $from) 1737 * 1738 * An empty range (i.e. $to === $from) is allowed, but it is essentially a 1739 * no-op. 1740 */ 1741 function record_range($from, $to, $type=null) { 1742 if ($type !== null) throw new Exception('type argument not supported in ' 1743 . ' LuminousStatefulScanner::record_range'); 1744 if ($to === $from) 1745 return; 1746 else if ($to > $from) 1747 $this->record(substr($this->string(), $from, $to-$from), $type); 1748 else 1749 throw new RangeException("Invalid range supplied [$from, $to]"); 1750 } 1751 1752 1753 /** 1754 * Generic main function which observes the transition table 1755 */ 1756 function main() { 1757 $this->setup(); 1758 while (!$this->eos()) { 1759 $p = $this->pos(); 1760 $state = $this->state_name(); 1761 1762 $this->load_transitions(); 1763 list($next_pattern_data, 1764 $next_pattern_index, 1765 $next_pattern_matches) = $this->next_start_data(); 1766 list($end_index, $end_matches) = $this->next_end_data(); 1767 1768 1769 if( ($next_pattern_index <= $end_index || $end_index === -1) && $next_pattern_index !== -1) { 1770 // we're pushing a new state 1771 if ($p < $next_pattern_index) 1772 $this->record_range($p, $next_pattern_index); 1773 $new_pos = $next_pattern_index; 1774 $this->pos($new_pos); 1775 $tok = $next_pattern_data[0]; 1776 if (isset($this->overrides[$tok])) { 1777 // call override 1778 $ret = call_user_func($this->overrides[$tok], $next_pattern_matches); 1779 if ($ret === true) break; 1780 if ($this->state_name() === $state && $this->pos() <= $new_pos) { 1781 throw new Exception('Override failed to either advance the pointer' 1782 . ' or change the state'); 1783 } 1784 } else { 1785 // no override 1786 $this->pos_shift(strlen($next_pattern_matches[0])); 1787 $this->push_state($next_pattern_data); 1788 $this->record($next_pattern_matches[0]); 1789 if ($next_pattern_data[2] === null) { 1790 // state was a full pattern, so pop now 1791 $this->pop_state(); 1792 } 1793 } 1794 } elseif($end_index !== -1) { 1795 // we're at the end of a state, record what's left and pop it 1796 $to = $end_index + strlen($end_matches[0]); 1797 $this->record_range($this->pos(), $to); 1798 $this->pos($to); 1799 $this->pop_state(); 1800 } 1801 else { 1802 // no more matches, consume the rest of the stirng and break 1803 $this->record($this->rest()); 1804 $this->terminate(); 1805 break; 1806 } 1807 if ($this->state_name() === $state && $this->pos() <= $p) { 1808 throw new Exception('Failed to advance pointer in state' 1809 . $this->state_name()); 1810 } 1811 } 1812 1813 // unterminated states will have left some tokens open, we need to 1814 // close these so there's just the root node on the stack 1815 assert(count($this->token_tree_stack) >= 1); 1816 while(count($this->token_tree_stack) > 1) 1817 $this->pop_state(); 1818 1819 return $this->token_tree_stack[0]; 1820 } 1821 1822 /** 1823 * Recursive function to collapse the token tree into XML 1824 * @internal 1825 */ 1826 protected function collapse_token_tree($node) { 1827 $text = ''; 1828 foreach($node['children'] as $c) { 1829 if (is_string($c)) $text .= LuminousUtils::escape_string($c); 1830 else $text .= $this->collapse_token_tree($c); 1831 } 1832 $token_name = $node['token_name']; 1833 $token = array($node['token_name'], $text, true); 1834 1835 $token_ = $this->rule_mapper_filter(array($token)); 1836 $token = $token_[0]; 1837 1838 if (isset($this->filters[$token_name])) { 1839 foreach($this->filters[$token_name] as $filter) { 1840 $token = call_user_func($filter[1], $token); 1841 } 1842 } 1843 list($token_name, $text,) = $token; 1844 return ($token_name === null)? 1845 $text : LuminousUtils::tag_block($token_name, $text); 1846 } 1847 1848 function tagged() { 1849 $stream = $this->collapse_token_tree($this->token_tree_stack[0]); 1850 return $stream; 1851 } 1852} 1853 1854/// @endcond 1855