1<?php
2/**
3 * Zend Framework (http://framework.zend.com/)
4 *
5 * @link      http://github.com/zendframework/zf2 for the canonical source repository
6 * @copyright Copyright (c) 2005-2015 Zend Technologies USA Inc. (http://www.zend.com)
7 * @license   http://framework.zend.com/license/new-bsd New BSD License
8 */
9
10namespace Zend\Json;
11
12use stdClass;
13use Zend\Json\Exception\InvalidArgumentException;
14use Zend\Json\Exception\RuntimeException;
15
16/**
17 * Decode JSON encoded string to PHP variable constructs
18 */
19class Decoder
20{
21    /**
22     * Parse tokens used to decode the JSON object. These are not
23     * for public consumption, they are just used internally to the
24     * class.
25     */
26    const EOF       = 0;
27    const DATUM     = 1;
28    const LBRACE    = 2;
29    const LBRACKET  = 3;
30    const RBRACE    = 4;
31    const RBRACKET  = 5;
32    const COMMA     = 6;
33    const COLON     = 7;
34
35    /**
36     * Use to maintain a "pointer" to the source being decoded
37     *
38     * @var string
39     */
40    protected $source;
41
42    /**
43     * Caches the source length
44     *
45     * @var int
46     */
47    protected $sourceLength;
48
49    /**
50     * The offset within the source being decoded
51     *
52     * @var int
53     *
54     */
55    protected $offset;
56
57    /**
58     * The current token being considered in the parser cycle
59     *
60     * @var int
61     */
62    protected $token;
63
64    /**
65     * Flag indicating how objects should be decoded
66     *
67     * @var int
68     * @access protected
69     */
70    protected $decodeType;
71
    /**
     * The value carried by the current token, if it has one
     *
     * @var mixed
     */
75    protected $tokenValue;
76
77    /**
78     * Decode Unicode Characters from \u0000 ASCII syntax.
79     *
80     * This algorithm was originally developed for the
81     * Solar Framework by Paul M. Jones
82     *
83     * @link   http://solarphp.com/
84     * @link   https://github.com/solarphp/core/blob/master/Solar/Json.php
85     * @param  string $chrs
86     * @return string
87     */
88    public static function decodeUnicodeString($chrs)
89    {
90        $chrs       = (string) $chrs;
91        $utf8       = '';
92        $strlenChrs = strlen($chrs);
93
94        for ($i = 0; $i < $strlenChrs; $i++) {
95            $ordChrsC = ord($chrs[$i]);
96
97            switch (true) {
98                case preg_match('/\\\u[0-9A-F]{4}/i', substr($chrs, $i, 6)):
99                    // single, escaped unicode character
100                    $utf16 = chr(hexdec(substr($chrs, ($i + 2), 2)))
101                           . chr(hexdec(substr($chrs, ($i + 4), 2)));
102                    $utf8char = self::_utf162utf8($utf16);
103                    $search  = array('\\', "\n", "\t", "\r", chr(0x08), chr(0x0C), '"', '\'', '/');
104                    if (in_array($utf8char, $search)) {
105                        $replace = array('\\\\', '\\n', '\\t', '\\r', '\\b', '\\f', '\\"', '\\\'', '\\/');
106                        $utf8char  = str_replace($search, $replace, $utf8char);
107                    }
108                    $utf8 .= $utf8char;
109                    $i += 5;
110                    break;
111                case ($ordChrsC >= 0x20) && ($ordChrsC <= 0x7F):
112                    $utf8 .= $chrs{$i};
113                    break;
114                case ($ordChrsC & 0xE0) == 0xC0:
115                    // characters U-00000080 - U-000007FF, mask 110XXXXX
116                    //see http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
117                    $utf8 .= substr($chrs, $i, 2);
118                    ++$i;
119                    break;
120                case ($ordChrsC & 0xF0) == 0xE0:
121                    // characters U-00000800 - U-0000FFFF, mask 1110XXXX
122                    // see http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
123                    $utf8 .= substr($chrs, $i, 3);
124                    $i += 2;
125                    break;
126                case ($ordChrsC & 0xF8) == 0xF0:
127                    // characters U-00010000 - U-001FFFFF, mask 11110XXX
128                    // see http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
129                    $utf8 .= substr($chrs, $i, 4);
130                    $i += 3;
131                    break;
132                case ($ordChrsC & 0xFC) == 0xF8:
133                    // characters U-00200000 - U-03FFFFFF, mask 111110XX
134                    // see http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
135                    $utf8 .= substr($chrs, $i, 5);
136                    $i += 4;
137                    break;
138                case ($ordChrsC & 0xFE) == 0xFC:
139                    // characters U-04000000 - U-7FFFFFFF, mask 1111110X
140                    // see http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
141                    $utf8 .= substr($chrs, $i, 6);
142                    $i += 5;
143                    break;
144            }
145        }
146
147        return $utf8;
148    }
149
150    /**
151     * Constructor
152     *
153     * @param string $source     String source to decode
154     * @param int    $decodeType How objects should be decoded -- see
155     * {@link Zend\Json\Json::TYPE_ARRAY} and {@link Zend\Json\Json::TYPE_OBJECT} for
156     * valid values
157     * @throws InvalidArgumentException
158     */
159    protected function __construct($source, $decodeType)
160    {
161        // Set defaults
162        $this->source       = self::decodeUnicodeString($source);
163        $this->sourceLength = strlen($this->source);
164        $this->token        = self::EOF;
165        $this->offset       = 0;
166
167        switch ($decodeType) {
168            case Json::TYPE_ARRAY:
169            case Json::TYPE_OBJECT:
170                $this->decodeType = $decodeType;
171                break;
172            default:
173                throw new InvalidArgumentException("Unknown decode type '{$decodeType}', please use one of the constants Json::TYPE_*");
174        }
175
176        // Set pointer at first token
177        $this->_getNextToken();
178    }
179
180    /**
181     * Decode a JSON source string
182     *
183     * Decodes a JSON encoded string. The value returned will be one of the
184     * following:
185     *        - integer
186     *        - float
187     *        - boolean
188     *        - null
189     *      - stdClass
190     *      - array
191     *         - array of one or more of the above types
192     *
193     * By default, decoded objects will be returned as associative arrays; to
194     * return a stdClass object instead, pass {@link Zend\Json\Json::TYPE_OBJECT} to
195     * the $objectDecodeType parameter.
196     *
197     * @static
198     * @access public
199     * @param string $source String to be decoded
200     * @param int $objectDecodeType How objects should be decoded; should be
201     * either or {@link Zend\Json\Json::TYPE_ARRAY} or
202     * {@link Zend\Json\Json::TYPE_OBJECT}; defaults to TYPE_ARRAY
203     * @return mixed
204     */
205    public static function decode($source, $objectDecodeType = Json::TYPE_OBJECT)
206    {
207        $decoder = new static($source, $objectDecodeType);
208        return $decoder->_decodeValue();
209    }
210
211    /**
212     * Recursive driving routine for supported toplevel tops
213     *
214     * @return mixed
215     */
216    protected function _decodeValue()
217    {
218        switch ($this->token) {
219            case self::DATUM:
220                $result  = $this->tokenValue;
221                $this->_getNextToken();
222                return($result);
223            case self::LBRACE:
224                return($this->_decodeObject());
225            case self::LBRACKET:
226                return($this->_decodeArray());
227            default:
228                return;
229        }
230    }
231
232    /**
233     * Decodes an object of the form:
234     *  { "attribute: value, "attribute2" : value,...}
235     *
236     * If Zend\Json\Encoder was used to encode the original object then
237     * a special attribute called __className which specifies a class
238     * name that should wrap the data contained within the encoded source.
239     *
240     * Decodes to either an array or stdClass object, based on the value of
241     * {@link $decodeType}. If invalid $decodeType present, returns as an
242     * array.
243     *
244     * @return array|stdClass
245     * @throws RuntimeException
246     */
247    protected function _decodeObject()
248    {
249        $members = array();
250        $tok = $this->_getNextToken();
251
252        while ($tok && $tok != self::RBRACE) {
253            if ($tok != self::DATUM || ! is_string($this->tokenValue)) {
254                throw new RuntimeException('Missing key in object encoding: ' . $this->source);
255            }
256
257            $key = $this->tokenValue;
258            $tok = $this->_getNextToken();
259
260            if ($tok != self::COLON) {
261                throw new RuntimeException('Missing ":" in object encoding: ' . $this->source);
262            }
263
264            $this->_getNextToken();
265            $members[$key] = $this->_decodeValue();
266            $tok = $this->token;
267
268            if ($tok == self::RBRACE) {
269                break;
270            }
271
272            if ($tok != self::COMMA) {
273                throw new RuntimeException('Missing "," in object encoding: ' . $this->source);
274            }
275
276            $tok = $this->_getNextToken();
277        }
278
279        switch ($this->decodeType) {
280            case Json::TYPE_OBJECT:
281                // Create new stdClass and populate with $members
282                $result = new stdClass();
283                foreach ($members as $key => $value) {
284                    if ($key === '') {
285                        $key = '_empty_';
286                    }
287                    $result->$key = $value;
288                }
289                break;
290            case Json::TYPE_ARRAY:
291            default:
292                $result = $members;
293                break;
294        }
295
296        $this->_getNextToken();
297        return $result;
298    }
299
300    /**
301     * Decodes a JSON array format:
302     *    [element, element2,...,elementN]
303     *
304     * @return array
305     * @throws RuntimeException
306     */
307    protected function _decodeArray()
308    {
309        $result = array();
310        $tok = $this->_getNextToken(); // Move past the '['
311        $index  = 0;
312
313        while ($tok && $tok != self::RBRACKET) {
314            $result[$index++] = $this->_decodeValue();
315
316            $tok = $this->token;
317
318            if ($tok == self::RBRACKET || !$tok) {
319                break;
320            }
321
322            if ($tok != self::COMMA) {
323                throw new RuntimeException('Missing "," in array encoding: ' . $this->source);
324            }
325
326            $tok = $this->_getNextToken();
327        }
328
329        $this->_getNextToken();
330        return $result;
331    }
332
333    /**
334     * Removes whitespace characters from the source input
335     */
336    protected function _eatWhitespace()
337    {
338        if (preg_match('/([\t\b\f\n\r ])*/s', $this->source, $matches, PREG_OFFSET_CAPTURE, $this->offset)
339            && $matches[0][1] == $this->offset) {
340            $this->offset += strlen($matches[0][0]);
341        }
342    }
343
344    /**
345     * Retrieves the next token from the source stream
346     *
347     * @return int Token constant value specified in class definition
348     * @throws RuntimeException
349     */
350    protected function _getNextToken()
351    {
352        $this->token      = self::EOF;
353        $this->tokenValue = null;
354        $this->_eatWhitespace();
355
356        if ($this->offset >= $this->sourceLength) {
357            return(self::EOF);
358        }
359
360        $str       = $this->source;
361        $strLength = $this->sourceLength;
362        $i         = $this->offset;
363        $start     = $i;
364
365        switch ($str{$i}) {
366            case '{':
367                $this->token = self::LBRACE;
368                break;
369            case '}':
370                $this->token = self::RBRACE;
371                break;
372            case '[':
373                $this->token = self::LBRACKET;
374                break;
375            case ']':
376                $this->token = self::RBRACKET;
377                break;
378            case ',':
379                $this->token = self::COMMA;
380                break;
381            case ':':
382                $this->token = self::COLON;
383                break;
384            case '"':
385                $result = '';
386                do {
387                    $i++;
388                    if ($i >= $strLength) {
389                        break;
390                    }
391
392                    $chr = $str{$i};
393
394                    if ($chr == '\\') {
395                        $i++;
396                        if ($i >= $strLength) {
397                            break;
398                        }
399                        $chr = $str{$i};
400                        switch ($chr) {
401                            case '"':
402                                $result .= '"';
403                                break;
404                            case '\\':
405                                $result .= '\\';
406                                break;
407                            case '/':
408                                $result .= '/';
409                                break;
410                            case 'b':
411                                $result .= "\x08";
412                                break;
413                            case 'f':
414                                $result .= "\x0c";
415                                break;
416                            case 'n':
417                                $result .= "\x0a";
418                                break;
419                            case 'r':
420                                $result .= "\x0d";
421                                break;
422                            case 't':
423                                $result .= "\x09";
424                                break;
425                            case '\'':
426                                $result .= '\'';
427                                break;
428                            default:
429                                throw new RuntimeException("Illegal escape sequence '{$chr}'");
430                        }
431                    } elseif ($chr == '"') {
432                        break;
433                    } else {
434                        $result .= $chr;
435                    }
436                } while ($i < $strLength);
437
438                $this->token = self::DATUM;
439                //$this->tokenValue = substr($str, $start + 1, $i - $start - 1);
440                $this->tokenValue = $result;
441                break;
442            case 't':
443                if (($i+ 3) < $strLength && substr($str, $start, 4) == "true") {
444                    $this->token = self::DATUM;
445                }
446                $this->tokenValue = true;
447                $i += 3;
448                break;
449            case 'f':
450                if (($i+ 4) < $strLength && substr($str, $start, 5) == "false") {
451                    $this->token = self::DATUM;
452                }
453                $this->tokenValue = false;
454                $i += 4;
455                break;
456            case 'n':
457                if (($i+ 3) < $strLength && substr($str, $start, 4) == "null") {
458                    $this->token = self::DATUM;
459                }
460                $this->tokenValue = null;
461                $i += 3;
462                break;
463        }
464
465        if ($this->token != self::EOF) {
466            $this->offset = $i + 1; // Consume the last token character
467            return($this->token);
468        }
469
470        $chr = $str{$i};
471        if ($chr == '-' || $chr == '.' || ($chr >= '0' && $chr <= '9')) {
472            if (preg_match('/-?([0-9])*(\.[0-9]*)?((e|E)((-|\+)?)[0-9]+)?/s', $str, $matches, PREG_OFFSET_CAPTURE, $start) && $matches[0][1] == $start) {
473                $datum = $matches[0][0];
474
475                if (is_numeric($datum)) {
476                    if (preg_match('/^0\d+$/', $datum)) {
477                        throw new RuntimeException("Octal notation not supported by JSON (value: {$datum})");
478                    } else {
479                        $val  = intval($datum);
480                        $fVal = floatval($datum);
481                        $this->tokenValue = ($val == $fVal ? $val : $fVal);
482                    }
483                } else {
484                    throw new RuntimeException("Illegal number format: {$datum}");
485                }
486
487                $this->token = self::DATUM;
488                $this->offset = $start + strlen($datum);
489            }
490        } else {
491            throw new RuntimeException('Illegal Token');
492        }
493
494        return $this->token;
495    }
496
497    /**
498     * Convert a string from one UTF-16 char to one UTF-8 char.
499     *
500     * Normally should be handled by mb_convert_encoding, but
501     * provides a slower PHP-only method for installations
502     * that lack the multibyte string extension.
503     *
504     * This method is from the Solar Framework by Paul M. Jones
505     *
506     * @link   http://solarphp.com
507     * @param  string $utf16 UTF-16 character
508     * @return string UTF-8 character
509     */
510    protected static function _utf162utf8($utf16)
511    {
512        // Check for mb extension otherwise do by hand.
513        if (function_exists('mb_convert_encoding')) {
514            return mb_convert_encoding($utf16, 'UTF-8', 'UTF-16');
515        }
516
517        $bytes = (ord($utf16{0}) << 8) | ord($utf16{1});
518
519        switch (true) {
520            case ((0x7F & $bytes) == $bytes):
521                // this case should never be reached, because we are in ASCII range
522                // see: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
523                return chr(0x7F & $bytes);
524
525            case (0x07FF & $bytes) == $bytes:
526                // return a 2-byte UTF-8 character
527                // see: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
528                return chr(0xC0 | (($bytes >> 6) & 0x1F))
529                     . chr(0x80 | ($bytes & 0x3F));
530
531            case (0xFFFF & $bytes) == $bytes:
532                // return a 3-byte UTF-8 character
533                // see: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
534                return chr(0xE0 | (($bytes >> 12) & 0x0F))
535                     . chr(0x80 | (($bytes >> 6) & 0x3F))
536                     . chr(0x80 | ($bytes & 0x3F));
537        }
538
539        // ignoring UTF-32 for now, sorry
540        return '';
541    }
542}
543