1<?php
2
3/**
4 * League.Uri (https://uri.thephpleague.com)
5 *
6 * (c) Ignace Nyamagana Butera <nyamsprod@gmail.com>
7 *
8 * For the full copyright and license information, please view the LICENSE
9 * file that was distributed with this source code.
10 */
11
12declare(strict_types=1);
13
14namespace League\Uri;
15
16use League\Uri\Exceptions\IdnSupportMissing;
17use League\Uri\Exceptions\SyntaxError;
18use function array_merge;
19use function defined;
20use function explode;
21use function filter_var;
22use function function_exists;
23use function gettype;
24use function idn_to_ascii;
25use function implode;
26use function inet_pton;
27use function is_object;
28use function is_scalar;
29use function method_exists;
30use function preg_match;
31use function rawurldecode;
32use function sprintf;
33use function strpos;
34use function substr;
35use const FILTER_FLAG_IPV6;
36use const FILTER_VALIDATE_IP;
37use const IDNA_ERROR_BIDI;
38use const IDNA_ERROR_CONTEXTJ;
39use const IDNA_ERROR_DISALLOWED;
40use const IDNA_ERROR_DOMAIN_NAME_TOO_LONG;
41use const IDNA_ERROR_EMPTY_LABEL;
42use const IDNA_ERROR_HYPHEN_3_4;
43use const IDNA_ERROR_INVALID_ACE_LABEL;
44use const IDNA_ERROR_LABEL_HAS_DOT;
45use const IDNA_ERROR_LABEL_TOO_LONG;
46use const IDNA_ERROR_LEADING_COMBINING_MARK;
47use const IDNA_ERROR_LEADING_HYPHEN;
48use const IDNA_ERROR_PUNYCODE;
49use const IDNA_ERROR_TRAILING_HYPHEN;
50use const INTL_IDNA_VARIANT_UTS46;
51
52/**
53 * A class to parse a URI string according to RFC3986.
54 *
55 * @link    https://tools.ietf.org/html/rfc3986
56 * @package League\Uri
57 * @author  Ignace Nyamagana Butera <nyamsprod@gmail.com>
58 * @since   6.0.0
59 */
final class UriString
{
    /**
     * Default URI component values.
     *
     * Every component is undefined (null) except the path, which is
     * always defined and defaults to the empty string.
     */
    private const URI_COMPONENTS = [
        'scheme' => null, 'user' => null, 'pass' => null, 'host' => null,
        'port' => null, 'path' => '', 'query' => null, 'fragment' => null,
    ];

    /**
     * Simple URIs which do not need any parsing.
     *
     * Each entry only lists the components that differ from URI_COMPONENTS.
     */
    private const URI_SHORTCUTS = [
        '' => [],
        '#' => ['fragment' => ''],
        '?' => ['query' => ''],
        '?#' => ['query' => '', 'fragment' => ''],
        '/' => ['path' => '/'],
        '//' => ['host' => ''],
    ];

    /**
     * Range of invalid characters in URI string (ASCII control characters).
     */
    private const REGEXP_INVALID_URI_CHARS = '/[\x00-\x1f\x7f]/';

    /**
     * RFC3986 regular expression URI splitter.
     *
     * @link https://tools.ietf.org/html/rfc3986#appendix-B
     */
    private const REGEXP_URI_PARTS = ',^
        (?<scheme>(?<scontent>[^:/?\#]+):)?    # URI scheme component
        (?<authority>//(?<acontent>[^/?\#]*))? # URI authority part
        (?<path>[^?\#]*)                       # URI path component
        (?<query>\?(?<qcontent>[^\#]*))?       # URI query component
        (?<fragment>\#(?<fcontent>.*))?        # URI fragment component
    ,x';

    /**
     * URI scheme regular expression.
     *
     * @link https://tools.ietf.org/html/rfc3986#section-3.1
     */
    private const REGEXP_URI_SCHEME = '/^([a-z][a-z\d\+\.\-]*)?$/i';

    /**
     * IPvFuture regular expression.
     *
     * The `version` group captures the complete run of hexadecimal digits
     * so that multi-digit versions (e.g. `v14`) are reported in full.
     *
     * @link https://tools.ietf.org/html/rfc3986#section-3.2.2
     */
    private const REGEXP_IP_FUTURE = '/^
        v(?<version>[A-F0-9]+)\.
        (?:
            (?<unreserved>[a-z0-9_~\-\.])|
            (?<sub_delims>[!$&\'()*+,;=:])  # also include the : character
        )+
    $/ix';

    /**
     * General registered name regular expression.
     *
     * @link https://tools.ietf.org/html/rfc3986#section-3.2.2
     */
    private const REGEXP_REGISTERED_NAME = '/(?(DEFINE)
        (?<unreserved>[a-z0-9_~\-])   # . is missing as it is used to separate labels
        (?<sub_delims>[!$&\'()*+,;=])
        (?<encoded>%[A-F0-9]{2})
        (?<reg_name>(?:(?&unreserved)|(?&sub_delims)|(?&encoded))*)
    )
    ^(?:(?&reg_name)\.)*(?&reg_name)\.?$/ix';

    /**
     * Invalid characters in host regular expression.
     *
     * @link https://tools.ietf.org/html/rfc3986#section-3.2.2
     */
    private const REGEXP_INVALID_HOST_CHARS = '/
        [:\/?#\[\]@ ]  # gen-delims characters as well as the space character
    /ix';

    /**
     * Invalid path for URI without scheme and authority regular expression.
     *
     * @link https://tools.ietf.org/html/rfc3986#section-3.3
     */
    private const REGEXP_INVALID_PATH = ',^(([^/]*):)(.*)?/,';

    /**
     * Host and Port splitter regular expression.
     */
    private const REGEXP_HOST_PORT = ',^(?<host>\[.*\]|[^:]*)(:(?<port>.*))?$,';

    /**
     * IDN Host detector regular expression.
     */
    private const REGEXP_IDN_PATTERN = '/[^\x20-\x7f]/';

    /**
     * Leading bytes of the packed (inet_pton) form of a link-local address.
     *
     * A Zone ID is only accepted when the packed address starts with these
     * two bytes, i.e. when the textual address belongs to the fe80:: block.
     */
    private const ZONE_ID_ADDRESS_BLOCK = "\xfe\x80";

    /**
     * Generate an URI string representation from its parsed representation
     * returned by UriString::parse() or PHP's parse_url.
     *
     * If you supply your own array, you are responsible for providing
     * valid components without their URI delimiters. Components which are
     * missing or null are treated as undefined and left out of the result.
     *
     * @link https://tools.ietf.org/html/rfc3986#section-5.3
     * @link https://tools.ietf.org/html/rfc3986#section-7.5
     *
     * @param array{
     *  scheme:?string,
     *  user:?string,
     *  pass:?string,
     *  host:?string,
     *  port:?int,
     *  path:string,
     *  query:?string,
     *  fragment:?string
     * } $components
     *
     * @return string the recomposed URI
     */
    public static function build(array $components): string
    {
        $result = $components['path'] ?? '';
        if (isset($components['query'])) {
            $result .= '?'.$components['query'];
        }

        if (isset($components['fragment'])) {
            $result .= '#'.$components['fragment'];
        }

        $scheme = isset($components['scheme']) ? $components['scheme'].':' : '';

        // Without a host there is no authority: port and user info are ignored.
        if (!isset($components['host'])) {
            return $scheme.$result;
        }

        $authority = $components['host'];
        if (isset($components['port'])) {
            $authority .= ':'.$components['port'];
        }

        // The password is only emitted when a user is defined.
        if (isset($components['user'])) {
            $user_info = $components['user'];
            if (isset($components['pass'])) {
                $user_info .= ':'.$components['pass'];
            }

            $authority = $user_info.'@'.$authority;
        }

        return $scheme.'//'.$authority.$result;
    }

    /**
     * Parse an URI string into its components.
     *
     * This method parses a URI and returns an associative array containing any
     * of the various components of the URI that are present.
     *
     * <code>
     * $components = UriString::parse('http://foo@test.example.com:42?query#');
     * var_export($components);
     * //will display
     * array(
     *   'scheme' => 'http',           // the URI scheme component
     *   'user' => 'foo',              // the URI user component
     *   'pass' => null,               // the URI pass component
     *   'host' => 'test.example.com', // the URI host component
     *   'port' => 42,                 // the URI port component
     *   'path' => '',                 // the URI path component
     *   'query' => 'query',           // the URI query component
     *   'fragment' => '',             // the URI fragment component
     * );
     * </code>
     *
     * The returned array is similar to PHP's parse_url return value with the following
     * differences:
     *
     * <ul>
     * <li>All components are always present in the returned array</li>
     * <li>Empty and undefined components are treated differently. An empty component is
     *   set to the empty string while an undefined component is set to the `null` value.</li>
     * <li>The path component is never undefined</li>
     * <li>The method parses the URI following the RFC3986 rules but you are still
     *   required to validate the returned components against its related scheme specific rules.</li>
     * </ul>
     *
     * @link https://tools.ietf.org/html/rfc3986
     *
     * @param mixed $uri any scalar or stringable object
     *
     * @throws \TypeError  if the URI is neither a scalar nor a stringable object
     * @throws SyntaxError if the URI contains invalid characters
     * @throws SyntaxError if the URI contains an invalid scheme
     * @throws SyntaxError if the URI contains an invalid path
     * @throws SyntaxError if the URI authority contains an invalid host or port
     *
     * @return array{
     *                scheme:?string,
     *                user:?string,
     *                pass:?string,
     *                host:?string,
     *                port:?int,
     *                path:string,
     *                query:?string,
     *                fragment:?string
     *                }
     */
    public static function parse($uri): array
    {
        if (is_object($uri) && method_exists($uri, '__toString')) {
            $uri = (string) $uri;
        }

        if (!is_scalar($uri)) {
            throw new \TypeError(sprintf('The uri must be a scalar or a stringable object `%s` given', gettype($uri)));
        }

        $uri = (string) $uri;

        // Trivial URIs (including the empty string) are resolved from a lookup table.
        if (isset(self::URI_SHORTCUTS[$uri])) {
            /** @var array{scheme:?string, user:?string, pass:?string, host:?string, port:?int, path:string, query:?string, fragment:?string} $components */
            $components = array_merge(self::URI_COMPONENTS, self::URI_SHORTCUTS[$uri]);

            return $components;
        }

        if (1 === preg_match(self::REGEXP_INVALID_URI_CHARS, $uri)) {
            throw new SyntaxError(sprintf('The uri `%s` contains invalid characters', $uri));
        }

        //if the first character is a known URI delimiter parsing can be simplified
        //the empty string was handled by the shortcuts above so index 0 always exists
        $first_char = $uri[0];

        //The URI is made of the fragment only
        if ('#' === $first_char) {
            [, $fragment] = explode('#', $uri, 2);
            $components = self::URI_COMPONENTS;
            $components['fragment'] = $fragment;

            return $components;
        }

        //The URI is made of the query and fragment
        if ('?' === $first_char) {
            [, $partial] = explode('?', $uri, 2);
            [$query, $fragment] = explode('#', $partial, 2) + [1 => null];
            $components = self::URI_COMPONENTS;
            $components['query'] = $query;
            $components['fragment'] = $fragment;

            return $components;
        }

        //use RFC3986 URI regexp to split the URI
        preg_match(self::REGEXP_URI_PARTS, $uri, $parts);
        //trailing groups which did not take part in the match are absent from $parts
        $parts += ['query' => '', 'fragment' => ''];

        if (':' === $parts['scheme'] || 1 !== preg_match(self::REGEXP_URI_SCHEME, $parts['scontent'])) {
            throw new SyntaxError(sprintf('The uri `%s` contains an invalid scheme', $uri));
        }

        if ('' === $parts['scheme'].$parts['authority'] && 1 === preg_match(self::REGEXP_INVALID_PATH, $parts['path'])) {
            throw new SyntaxError(sprintf('The uri `%s` contains an invalid path.', $uri));
        }

        /** @var array{scheme:?string, user:?string, pass:?string, host:?string, port:?int, path:string, query:?string, fragment:?string} $components */
        $components = array_merge(
            self::URI_COMPONENTS,
            '' === $parts['authority'] ? [] : self::parseAuthority($parts['acontent']),
            [
                'path' => $parts['path'],
                'scheme' => '' === $parts['scheme'] ? null : $parts['scontent'],
                'query' => '' === $parts['query'] ? null : $parts['qcontent'],
                'fragment' => '' === $parts['fragment'] ? null : $parts['fcontent'],
            ]
        );

        return $components;
    }

    /**
     * Parses the URI authority part.
     *
     * The user info is split from the host on the first `@` and the user is
     * split from the password on the first `:`.
     *
     * @link https://tools.ietf.org/html/rfc3986#section-3.2
     *
     * @throws SyntaxError If the port component is invalid
     * @throws SyntaxError If the host component is invalid
     *
     * @return array{user:?string, pass:?string, host:?string, port:?int}
     */
    private static function parseAuthority(string $authority): array
    {
        $components = ['user' => null, 'pass' => null, 'host' => '', 'port' => null];
        if ('' === $authority) {
            return $components;
        }

        $parts = explode('@', $authority, 2);
        if (isset($parts[1])) {
            [$components['user'], $components['pass']] = explode(':', $parts[0], 2) + [1 => null];
        }

        preg_match(self::REGEXP_HOST_PORT, $parts[1] ?? $parts[0], $matches);
        //the port group is absent from $matches when no port delimiter is present
        $matches += ['port' => ''];

        $components['port'] = self::filterPort($matches['port']);
        $components['host'] = self::filterHost($matches['host']);

        return $components;
    }

    /**
     * Filter and format the port component.
     *
     * An empty port is treated as undefined and returned as null.
     *
     * @link https://tools.ietf.org/html/rfc3986#section-3.2.3
     *
     * @throws SyntaxError if the port is not exclusively made of digits
     */
    private static function filterPort(string $port): ?int
    {
        if ('' === $port) {
            return null;
        }

        if (1 === preg_match('/^\d*$/', $port)) {
            return (int) $port;
        }

        throw new SyntaxError(sprintf('The port `%s` is invalid', $port));
    }

    /**
     * Validates the host component and returns it unchanged.
     *
     * A host enclosed in square brackets is validated as an IP literal,
     * any other non-empty host is validated as a registered name.
     *
     * @link https://tools.ietf.org/html/rfc3986#section-3.2.2
     *
     * @throws SyntaxError if the IP literal or the registered name is invalid
     */
    private static function filterHost(string $host): string
    {
        if ('' === $host) {
            return $host;
        }

        if ('[' !== $host[0] || ']' !== substr($host, -1)) {
            return self::filterRegisteredName($host);
        }

        if (!self::isIpHost(substr($host, 1, -1))) {
            throw new SyntaxError(sprintf('Host `%s` is invalid : the IP host is malformed', $host));
        }

        return $host;
    }

    /**
     * Validates a host which is either an IPv4 address or a registered name
     * and returns it unchanged.
     *
     * Hosts containing a `xn--` label or non-ASCII characters are additionally
     * checked with the intl IDN functions.
     *
     * @link https://tools.ietf.org/html/rfc3986#section-3.2.2
     *
     * @throws SyntaxError       if the registered name is invalid
     * @throws IdnSupportMissing if IDN support or ICU requirement are not available or met.
     */
    private static function filterRegisteredName(string $host): string
    {
        // @codeCoverageIgnoreStart
        // added because it is not possible in travis to disabled the ext/intl extension
        // see travis issue https://github.com/travis-ci/travis-ci/issues/4701
        static $idn_support = null;
        $idn_support = $idn_support ?? function_exists('idn_to_ascii') && defined('INTL_IDNA_VARIANT_UTS46');
        // @codeCoverageIgnoreEnd

        $formatted_host = rawurldecode($host);
        if (1 === preg_match(self::REGEXP_REGISTERED_NAME, $formatted_host)) {
            if (false === strpos($formatted_host, 'xn--')) {
                return $host;
            }

            // @codeCoverageIgnoreStart
            if (!$idn_support) {
                throw new IdnSupportMissing(sprintf('the host `%s` could not be processed for IDN. Verify that ext/intl is installed for IDN support and that ICU is at least version 4.6.', $host));
            }
            // @codeCoverageIgnoreEnd

            $unicode = idn_to_utf8($host, 0, INTL_IDNA_VARIANT_UTS46, $arr);

            // the info array stays empty when the conversion aborts before reporting
            // its status; guard it like the idn_to_ascii branch below does
            if ([] === $arr) {
                throw new SyntaxError(sprintf('Host `%s` is not a valid IDN host', $host));
            }

            if (0 !== $arr['errors']) {
                throw new SyntaxError(sprintf('The host `%s` is invalid : %s', $host, self::getIDNAErrors($arr['errors'])));
            }

            // @codeCoverageIgnoreStart
            if (false === $unicode) {
                throw new IdnSupportMissing(sprintf('The Intl extension is misconfigured for %s, please correct this issue before proceeding.', PHP_OS));
            }
            // @codeCoverageIgnoreEnd

            return $host;
        }

        //to test IDN host non-ascii characters must be present in the host
        if (1 !== preg_match(self::REGEXP_IDN_PATTERN, $formatted_host)) {
            throw new SyntaxError(sprintf('Host `%s` is invalid : the host is not a valid registered name', $host));
        }

        // @codeCoverageIgnoreStart
        if (!$idn_support) {
            throw new IdnSupportMissing(sprintf('the host `%s` could not be processed for IDN. Verify that ext/intl is installed for IDN support and that ICU is at least version 4.6.', $host));
        }
        // @codeCoverageIgnoreEnd

        $retval = idn_to_ascii($formatted_host, 0, INTL_IDNA_VARIANT_UTS46, $arr);

        if ([] === $arr) {
            throw new SyntaxError(sprintf('Host `%s` is not a valid IDN host', $host));
        }

        if (0 !== $arr['errors']) {
            throw new SyntaxError(sprintf('Host `%s` is not a valid IDN host : %s', $host, self::getIDNAErrors($arr['errors'])));
        }

        // @codeCoverageIgnoreStart
        if (false === $retval) {
            throw new IdnSupportMissing(sprintf('The Intl extension is misconfigured for %s, please correct this issue before proceeding.', PHP_OS));
        }
        // @codeCoverageIgnoreEnd

        if (false !== strpos($retval, '%')) {
            throw new SyntaxError(sprintf('Host `%s` is invalid : the host is not a valid registered name', $host));
        }

        return $host;
    }

    /**
     * Retrieves and format IDNA conversion error message.
     *
     * Every error flag set in the submitted bitmask contributes its own
     * description; descriptions are joined with a comma.
     *
     * @link http://icu-project.org/apiref/icu4j/com/ibm/icu/text/IDNA.Error.html
     *
     * @param int $error_byte bitmask of IDNA_ERROR_* flags
     */
    private static function getIDNAErrors(int $error_byte): string
    {
        /**
         * IDNA errors.
         */
        static $idn_errors = [
            IDNA_ERROR_EMPTY_LABEL => 'a non-final domain name label (or the whole domain name) is empty',
            IDNA_ERROR_LABEL_TOO_LONG => 'a domain name label is longer than 63 bytes',
            IDNA_ERROR_DOMAIN_NAME_TOO_LONG => 'a domain name is longer than 255 bytes in its storage form',
            IDNA_ERROR_LEADING_HYPHEN => 'a label starts with a hyphen-minus ("-")',
            IDNA_ERROR_TRAILING_HYPHEN => 'a label ends with a hyphen-minus ("-")',
            IDNA_ERROR_HYPHEN_3_4 => 'a label contains hyphen-minus ("-") in the third and fourth positions',
            IDNA_ERROR_LEADING_COMBINING_MARK => 'a label starts with a combining mark',
            IDNA_ERROR_DISALLOWED => 'a label or domain name contains disallowed characters',
            IDNA_ERROR_PUNYCODE => 'a label starts with "xn--" but does not contain valid Punycode',
            IDNA_ERROR_LABEL_HAS_DOT => 'a label contains a dot=full stop',
            IDNA_ERROR_INVALID_ACE_LABEL => 'An ACE label does not contain a valid label string',
            IDNA_ERROR_BIDI => 'a label does not meet the IDNA BiDi requirements (for right-to-left characters)',
            IDNA_ERROR_CONTEXTJ => 'a label does not meet the IDNA CONTEXTJ requirements',
        ];

        $res = [];
        foreach ($idn_errors as $error => $reason) {
            if ($error === ($error_byte & $error)) {
                $res[] = $reason;
            }
        }

        return [] === $res ? 'Unknown IDNA conversion error.' : implode(', ', $res).'.';
    }

    /**
     * Validates a IPv6/IPvfuture host.
     *
     * The submitted value is the IP literal without its enclosing brackets.
     * It is accepted when it is, in that order of evaluation:
     *
     * <ul>
     * <li>a valid IPv6 address;</li>
     * <li>an IPvFuture address whose version is neither 4 nor 6;</li>
     * <li>a valid IPv6 address starting with the fe80 block followed by a Zone ID
     *   introduced by `%` which, once decoded, contains no invalid host character.</li>
     * </ul>
     *
     * @link https://tools.ietf.org/html/rfc3986#section-3.2.2
     * @link https://tools.ietf.org/html/rfc6874#section-2
     * @link https://tools.ietf.org/html/rfc6874#section-4
     */
    private static function isIpHost(string $ip_host): bool
    {
        if (false !== filter_var($ip_host, FILTER_VALIDATE_IP, FILTER_FLAG_IPV6)) {
            return true;
        }

        if (1 === preg_match(self::REGEXP_IP_FUTURE, $ip_host, $matches)) {
            // leading zeros do not change the version number: `v04` is still version 4
            return !in_array(ltrim($matches['version'], '0'), ['4', '6'], true);
        }

        $pos = strpos($ip_host, '%');
        if (false === $pos || 1 === preg_match(
            self::REGEXP_INVALID_HOST_CHARS,
            rawurldecode(substr($ip_host, $pos))
        )) {
            return false;
        }

        $ip_host = substr($ip_host, 0, $pos);

        return false !== filter_var($ip_host, FILTER_VALIDATE_IP, FILTER_FLAG_IPV6)
            && 0 === strpos((string) inet_pton($ip_host), self::ZONE_ID_ADDRESS_BLOCK);
    }
}
568