1<?php
2/**
3 * Loads a string to be parsed.
4 */
5
6namespace Masterminds\HTML5\Parser;
7
8/*
9 *
10* Based on code from html5lib:
11
12Copyright 2009 Geoffrey Sneddon <http://gsnedders.com/>
13
14Permission is hereby granted, free of charge, to any person obtaining a
15copy of this software and associated documentation files (the
16    "Software"), to deal in the Software without restriction, including
17without limitation the rights to use, copy, modify, merge, publish,
18distribute, sublicense, and/or sell copies of the Software, and to
19permit persons to whom the Software is furnished to do so, subject to
20the following conditions:
21
22The above copyright notice and this permission notice shall be included
23in all copies or substantial portions of the Software.
24
25THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
26OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
27MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
28IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
29CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
30TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
31SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
32
33*/
34
35// Some conventions:
36// - /* */ indicates verbatim text from the HTML 5 specification
37//   MPB: Not sure which version of the spec. Moving from HTML5lib to
38//   HTML5-PHP, I have been using this version:
39//   http://www.w3.org/TR/2012/CR-html5-20121217/Overview.html#contents
40//
41// - // indicates regular comments
42
/**
 * InputStream implementation backed by a single in-memory string.
 *
 * The constructor passes the input through UTF8Utils::convertToUTF8(),
 * records the result of UTF8Utils::checkForIllegalCodepoints() in $errors and
 * normalizes newlines. After that the stream is walked with a byte offset
 * into the normalized string.
 *
 * @deprecated since 2.4, to remove in 3.0. Use a string in the scanner instead.
 */
class StringInputStream implements InputStream
{
    /**
     * The string data we're parsing (after encoding conversion and newline
     * normalization).
     */
    private $data;

    /**
     * The current integer byte position we are in $data.
     */
    private $char;

    /**
     * Length of $data in bytes; when $char === $EOF, we are at the end-of-file.
     */
    private $EOF;

    /**
     * Parse errors, as returned by UTF8Utils::checkForIllegalCodepoints().
     */
    public $errors = array();

    /**
     * Create a new InputStream wrapper.
     *
     * @param string $data     Data to parse.
     * @param string $encoding The encoding to use for the data.
     * @param string $debug    A fprintf format to use to echo the data on stdout.
     *                         It receives the converted data and its byte length
     *                         as arguments. Empty (the default) disables the echo.
     */
    public function __construct($data, $encoding = 'UTF-8', $debug = '')
    {
        $data = UTF8Utils::convertToUTF8($data, $encoding);
        if ($debug) {
            fprintf(STDOUT, $debug, $data, strlen($data));
        }

        // There is good reason to question whether it makes sense to
        // do this here, since most of these checks are done during
        // parsing, and since this check doesn't actually *do* anything.
        $this->errors = UTF8Utils::checkForIllegalCodepoints($data);

        $data = $this->replaceLinefeeds($data);

        $this->data = $data;
        $this->char = 0;
        $this->EOF = strlen($data);
    }

    /**
     * Returns the complete normalized data, regardless of the cursor position.
     *
     * @return string
     */
    public function __toString()
    {
        return $this->data;
    }

    /**
     * Replace linefeed characters according to the spec.
     *
     * Besides the CR/LF normalization, NUL bytes are replaced with the UTF-8
     * encoding of U+FFFD REPLACEMENT CHARACTER.
     *
     * @param string $data The data to normalize.
     *
     * @return string The normalized data.
     */
    protected function replaceLinefeeds($data)
    {
        /*
         * U+000D CARRIAGE RETURN (CR) characters and U+000A LINE FEED (LF) characters are treated specially.
         * Any CR characters that are followed by LF characters must be removed, and any CR characters not
         * followed by LF characters must be converted to LF characters. Thus, newlines in HTML DOMs are
         * represented by LF characters, and there are never any CR characters in the input to the tokenization
         * stage.
         */
        // strtr() tries the longest keys first, so "\r\n" is collapsed to a
        // single "\n" before the lone "\r" rule can apply.
        $crlfTable = array(
            "\0" => "\xEF\xBF\xBD",
            "\r\n" => "\n",
            "\r" => "\n",
        );

        return strtr($data, $crlfTable);
    }

    /**
     * Returns the current line that the tokenizer is at.
     *
     * @return int The 1-based line number.
     */
    public function currentLine()
    {
        if (empty($this->EOF) || 0 === $this->char) {
            return 1;
        }
        // Count the newlines in the bytes before the cursor. next() does not
        // stop at the end of the data, so the cursor is clamped to EOF.
        return substr_count($this->data, "\n", 0, min($this->char, $this->EOF)) + 1;
    }

    /**
     * @deprecated
     */
    public function getCurrentLine()
    {
        return $this->currentLine();
    }

    /**
     * Returns the current column of the current line that the tokenizer is at.
     * Newlines are column 0. The first char after a newline is column 1.
     *
     * @return int The column number.
     */
    public function columnOffset()
    {
        // next() does not stop at the end of the data, so clamp the cursor the
        // same way currentLine() does. An unclamped cursor would turn the
        // strrpos() offset below non-negative (wrong search range) or push it
        // beyond the string length.
        $position = min($this->char, $this->EOF);

        // Short circuit for the first char (and for empty data).
        if (0 === $position) {
            return 0;
        }

        // A negative strrpos() offset starts the backward search that many
        // bytes from the end of the string, so this finds the last "\n" at or
        // before byte $position - 1, i.e. the last consumed byte.
        $backwardFrom = $position - 1 - $this->EOF;
        $lastLine = strrpos($this->data, "\n", $backwardFrom);

        if (false !== $lastLine) {
            // Everything after that newline, up to and including the last
            // consumed byte.
            $findLengthOf = substr($this->data, $lastLine + 1, $position - 1 - $lastLine);
        } else {
            // No newline before the cursor: still on the first line.
            $findLengthOf = substr($this->data, 0, $position);
        }

        // The column is counted in characters, not bytes.
        return UTF8Utils::countChars($findLengthOf);
    }

    /**
     * @deprecated
     */
    public function getColumnOffset()
    {
        return $this->columnOffset();
    }

    /**
     * Get the current character.
     *
     * Note that the cursor is a byte offset, so this returns a single byte.
     *
     * @return string|false The byte at the cursor, or false when the cursor is
     *                      at or past the end of the data.
     */
    public function current()
    {
        if ($this->char < $this->EOF) {
            return $this->data[$this->char];
        }

        return false;
    }

    /**
     * Advance the pointer.
     * This is part of the Iterator interface.
     *
     * No bounds check is performed; use valid() to detect the end of the data.
     */
    public function next()
    {
        ++$this->char;
    }

    /**
     * Rewind to the start of the string.
     */
    public function rewind()
    {
        $this->char = 0;
    }

    /**
     * Is the current pointer location valid.
     *
     * @return bool Whether the current pointer location is valid.
     */
    public function valid()
    {
        return $this->char < $this->EOF;
    }

    /**
     * Get all characters until EOF.
     *
     * This reads to the end of the file, and sets the read marker at the
     * end of the file.
     *
     * Note this performs bounds checking.
     *
     * @return string Returns the remaining text. If called when the InputStream is
     *                already exhausted, it returns an empty string.
     */
    public function remainingChars()
    {
        if ($this->char < $this->EOF) {
            $data = substr($this->data, $this->char);
            $this->char = $this->EOF;

            return $data;
        }

        return '';
    }

    /**
     * Read to a particular match (or until $max bytes are consumed).
     *
     * This operates on byte sequences, not characters.
     *
     * Matches as far as possible until we reach a certain set of bytes
     * and returns the matched substring. The cursor is advanced past the
     * returned bytes.
     *
     * @param string $bytes Bytes to match.
     * @param int    $max   Maximum number of bytes to scan.
     *
     * @return string|false The consumed bytes (possibly an empty string when the
     *                      byte at the cursor is already in $bytes), or false if
     *                      the stream is exhausted. Use strict comparison to tell
     *                      the two apart.
     */
    public function charsUntil($bytes, $max = null)
    {
        if ($this->char >= $this->EOF) {
            return false;
        }

        // An explicit integer 0 is passed through as a length; any other
        // falsy value (including the default null) means "no limit".
        if (0 === $max || $max) {
            $len = strcspn($this->data, $bytes, $this->char, $max);
        } else {
            $len = strcspn($this->data, $bytes, $this->char);
        }

        $string = (string) substr($this->data, $this->char, $len);
        $this->char += $len;

        return $string;
    }

    /**
     * Returns the string so long as $bytes matches.
     *
     * Matches as far as possible with a certain set of bytes
     * and returns the matched substring. The cursor is advanced past the
     * returned bytes.
     *
     * @param string $bytes A mask of bytes to match. If ANY byte in this mask matches the
     *                      current char, the pointer advances and the char is part of the
     *                      substring.
     * @param int    $max   The max number of bytes to read.
     *
     * @return string|false The consumed bytes (possibly an empty string when the
     *                      byte at the cursor is not in $bytes), or false if the
     *                      stream is exhausted.
     */
    public function charsWhile($bytes, $max = null)
    {
        if ($this->char >= $this->EOF) {
            return false;
        }

        // An explicit integer 0 is passed through as a length; any other
        // falsy value (including the default null) means "no limit".
        if (0 === $max || $max) {
            $len = strspn($this->data, $bytes, $this->char, $max);
        } else {
            $len = strspn($this->data, $bytes, $this->char);
        }
        $string = (string) substr($this->data, $this->char, $len);
        $this->char += $len;

        return $string;
    }

    /**
     * Unconsume characters.
     *
     * The request is ignored entirely if it would move the cursor before the
     * start of the data.
     *
     * @param int $howMany The number of bytes to unconsume.
     */
    public function unconsume($howMany = 1)
    {
        if (($this->char - $howMany) >= 0) {
            $this->char -= $howMany;
        }
    }

    /**
     * Look ahead without moving cursor.
     *
     * @return string|false The byte following the current one, or false when
     *                      there is no such byte.
     */
    public function peek()
    {
        // Strictly less than: index EOF itself is one past the last byte.
        if (($this->char + 1) < $this->EOF) {
            return $this->data[$this->char + 1];
        }

        return false;
    }

    /**
     * Returns the current byte position of the cursor.
     * This is part of the Iterator interface.
     *
     * @return int
     */
    public function key()
    {
        return $this->char;
    }
}
332