<?php
/**
 * Loads a string to be parsed.
 */

namespace Masterminds\HTML5\Parser;

/*
 *
* Based on code from html5lib:

Copyright 2009 Geoffrey Sneddon <http://gsnedders.com/>

Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

*/

// Some conventions:
// - /* */ indicates verbatim text from the HTML 5 specification
//   MPB: Not sure which version of the spec. Moving from HTML5lib to
//   HTML5-PHP, I have been using this version:
//   http://www.w3.org/TR/2012/CR-html5-20121217/Overview.html#contents
//
// - // indicates regular comments

/**
 * Input stream over an in-memory string, addressed by byte position.
 *
 * The constructor converts the data with UTF8Utils::convertToUTF8() and
 * normalizes line endings; every cursor operation afterwards works on bytes
 * of that normalized string.
 *
 * @deprecated since 2.4, to remove in 3.0. Use a string in the scanner instead.
 */
class StringInputStream implements InputStream
{
    /**
     * The string data we're parsing.
     */
    private $data;

    /**
     * The current integer byte position we are in $data.
     */
    private $char;

    /**
     * Length of $data in bytes; when $char === $EOF, we are at the end-of-file.
     */
    private $EOF;

    /**
     * Parse errors.
     */
    public $errors = array();

    /**
     * Create a new InputStream wrapper.
     *
     * @param string $data     Data to parse.
     * @param string $encoding The encoding to use for the data.
     * @param string $debug    A fprintf format to use to echo the data on stdout.
     *                         It receives the converted data and its byte length
     *                         as arguments. Nothing is printed when empty.
     */
    public function __construct($data, $encoding = 'UTF-8', $debug = '')
    {
        $data = UTF8Utils::convertToUTF8($data, $encoding);
        if ($debug) {
            fprintf(STDOUT, $debug, $data, strlen($data));
        }

        // There is good reason to question whether it makes sense to
        // do this here, since most of these checks are done during
        // parsing, and since this check doesn't actually *do* anything.
        $this->errors = UTF8Utils::checkForIllegalCodepoints($data);

        // Normalize before measuring: replaceLinefeeds() can change the
        // byte length, and $EOF must describe the string we actually scan.
        $data = $this->replaceLinefeeds($data);

        $this->data = $data;
        $this->char = 0;
        $this->EOF = strlen($data);
    }

    /**
     * Returns the complete (already normalized) data, regardless of the
     * current cursor position.
     *
     * @return string
     */
    public function __toString()
    {
        return $this->data;
    }

    /**
     * Replace linefeed characters according to the spec.
     *
     * Besides CR/CRLF normalization, every NUL byte is replaced with the
     * UTF-8 encoding of U+FFFD (REPLACEMENT CHARACTER).
     *
     * @param string $data The data to normalize.
     *
     * @return string The normalized data.
     */
    protected function replaceLinefeeds($data)
    {
        /*
         * U+000D CARRIAGE RETURN (CR) characters and U+000A LINE FEED (LF) characters are treated specially.
         * Any CR characters that are followed by LF characters must be removed, and any CR characters not
         * followed by LF characters must be converted to LF characters. Thus, newlines in HTML DOMs are
         * represented by LF characters, and there are never any CR characters in the input to the tokenization
         * stage.
         */
        // strtr() with an array tries the longest keys first, so "\r\n" is
        // collapsed to a single "\n" before a lone "\r" is considered.
        $crlfTable = array(
            "\0" => "\xEF\xBF\xBD",
            "\r\n" => "\n",
            "\r" => "\n",
        );

        return strtr($data, $crlfTable);
    }

    /**
     * Returns the current line that the tokenizer is at.
     *
     * @return int The 1-based line number.
     */
    public function currentLine()
    {
        if (empty($this->EOF) || 0 === $this->char) {
            return 1;
        }

        // Count the newlines among the bytes before the cursor. The length is
        // clamped to $EOF so a cursor that has run past the end cannot ask
        // substr_count() for a range outside the string.
        return substr_count($this->data, "\n", 0, min($this->char, $this->EOF)) + 1;
    }

    /**
     * @deprecated
     */
    public function getCurrentLine()
    {
        return $this->currentLine();
    }

    /**
     * Returns the current column of the current line that the tokenizer is at.
     * Newlines are column 0. The first char after a newline is column 1.
     *
     * @return int The column number.
     */
    public function columnOffset()
    {
        // Short circuit for the first char.
        if (0 === $this->char) {
            return 0;
        }

        // strrpos() with a negative offset starts that many bytes from the
        // end of the string and searches backwards. This offset makes the
        // search begin at byte ($this->char - 1), so it finds the last "\n"
        // at or before the most recently consumed byte.
        $backwardFrom = $this->char - 1 - strlen($this->data);
        $lastLine = strrpos($this->data, "\n", $backwardFrom);

        if (false !== $lastLine) {
            // Bytes between that newline (exclusive) and the cursor (exclusive).
            $findLengthOf = substr($this->data, $lastLine + 1, $this->char - 1 - $lastLine);
        } else {
            // No newline before the cursor: we are still on the first line.
            $findLengthOf = substr($this->data, 0, $this->char);
        }

        // The column is whatever UTF8Utils::countChars() reports for that
        // fragment (presumably characters rather than bytes -- see UTF8Utils).
        return UTF8Utils::countChars($findLengthOf);
    }

    /**
     * @deprecated
     */
    public function getColumnOffset()
    {
        return $this->columnOffset();
    }

    /**
     * Get the current character.
     *
     * No bounds check is performed here; check valid() before calling.
     *
     * @return string The byte at the current position.
     */
    public function current()
    {
        return $this->data[$this->char];
    }

    /**
     * Advance the pointer.
     * This is part of the Iterator interface.
     */
    public function next()
    {
        ++$this->char;
    }

    /**
     * Rewind to the start of the string.
     */
    public function rewind()
    {
        $this->char = 0;
    }

    /**
     * Is the current pointer location valid.
     *
     * @return bool Whether the current pointer location is valid.
     */
    public function valid()
    {
        return $this->char < $this->EOF;
    }

    /**
     * Get all characters until EOF.
     *
     * This reads to the end of the file, and sets the read marker at the
     * end of the file.
     *
     * Note this performs bounds checking.
     *
     * @return string Returns the remaining text. If called when the InputStream is
     *                already exhausted, it returns an empty string.
     */
    public function remainingChars()
    {
        if ($this->char < $this->EOF) {
            $data = substr($this->data, $this->char);
            $this->char = $this->EOF;

            return $data;
        }

        return '';
    }

    /**
     * Read to a particular match (or until $max bytes are consumed).
     *
     * This operates on byte sequences, not characters.
     *
     * Matches as far as possible until we reach a certain set of bytes
     * and returns the matched substring. The cursor is advanced past the
     * returned bytes.
     *
     * @param string $bytes Bytes to match.
     * @param int    $max   Maximum number of bytes to scan.
     *
     * @return string|false The consumed bytes (an empty string when the byte at
     *                      the cursor is already one of $bytes), or false when
     *                      the stream is at EOF. Use strict comparison to tell
     *                      the two apart.
     */
    public function charsUntil($bytes, $max = null)
    {
        if ($this->char >= $this->EOF) {
            return false;
        }

        // An explicit 0 is a real limit; only null (or another falsy
        // non-zero value) means "no limit".
        if (0 === $max || $max) {
            $len = strcspn($this->data, $bytes, $this->char, $max);
        } else {
            $len = strcspn($this->data, $bytes, $this->char);
        }

        $string = (string) substr($this->data, $this->char, $len);
        $this->char += $len;

        return $string;
    }

    /**
     * Returns the string so long as $bytes matches.
     *
     * Matches as far as possible with a certain set of bytes
     * and returns the matched substring. The cursor is advanced past the
     * returned bytes.
     *
     * @param string $bytes A mask of bytes to match. If ANY byte in this mask matches the
     *                      current char, the pointer advances and the char is part of the
     *                      substring.
     * @param int    $max   The max number of chars to read.
     *
     * @return string|false The consumed bytes (possibly an empty string), or
     *                      false when the stream is at EOF.
     */
    public function charsWhile($bytes, $max = null)
    {
        if ($this->char >= $this->EOF) {
            return false;
        }

        // An explicit 0 is a real limit; only null (or another falsy
        // non-zero value) means "no limit".
        if (0 === $max || $max) {
            $len = strspn($this->data, $bytes, $this->char, $max);
        } else {
            $len = strspn($this->data, $bytes, $this->char);
        }
        $string = (string) substr($this->data, $this->char, $len);
        $this->char += $len;

        return $string;
    }

    /**
     * Unconsume characters.
     *
     * If moving back $howMany bytes would put the cursor before the start of
     * the data, the request is ignored and the cursor stays where it is.
     *
     * @param int $howMany The number of characters to unconsume.
     */
    public function unconsume($howMany = 1)
    {
        if (($this->char - $howMany) >= 0) {
            $this->char -= $howMany;
        }
    }

    /**
     * Look ahead without moving cursor.
     *
     * @return string|false The byte immediately after the current position, or
     *                      false when there is no such byte.
     */
    public function peek()
    {
        // Valid indexes are 0 .. $EOF - 1, so the comparison must be strict:
        // with "<=" the last position would read $data[$EOF], one byte past
        // the end of the string.
        if (($this->char + 1) < $this->EOF) {
            return $this->data[$this->char + 1];
        }

        return false;
    }

    /**
     * Returns the current byte position of the cursor.
     * This is part of the Iterator interface.
     *
     * @return int
     */
    public function key()
    {
        return $this->char;
    }
}