<?php

/**
 * League.Uri (https://uri.thephpleague.com)
 *
 * (c) Ignace Nyamagana Butera <nyamsprod@gmail.com>
 *
 * For the full copyright and license information, please view the LICENSE
 * file that was distributed with this source code.
 */

declare(strict_types=1);

namespace League\Uri;

use League\Uri\Exceptions\IdnSupportMissing;
use League\Uri\Exceptions\SyntaxError;
use function array_merge;
use function defined;
use function explode;
use function filter_var;
use function function_exists;
use function gettype;
use function idn_to_ascii;
use function idn_to_utf8;
use function implode;
use function in_array;
use function inet_pton;
use function is_object;
use function is_scalar;
use function method_exists;
use function preg_match;
use function rawurldecode;
use function sprintf;
use function strpos;
use function substr;
use const FILTER_FLAG_IPV6;
use const FILTER_VALIDATE_IP;
use const IDNA_ERROR_BIDI;
use const IDNA_ERROR_CONTEXTJ;
use const IDNA_ERROR_DISALLOWED;
use const IDNA_ERROR_DOMAIN_NAME_TOO_LONG;
use const IDNA_ERROR_EMPTY_LABEL;
use const IDNA_ERROR_HYPHEN_3_4;
use const IDNA_ERROR_INVALID_ACE_LABEL;
use const IDNA_ERROR_LABEL_HAS_DOT;
use const IDNA_ERROR_LABEL_TOO_LONG;
use const IDNA_ERROR_LEADING_COMBINING_MARK;
use const IDNA_ERROR_LEADING_HYPHEN;
use const IDNA_ERROR_PUNYCODE;
use const IDNA_ERROR_TRAILING_HYPHEN;
use const INTL_IDNA_VARIANT_UTS46;
use const PHP_OS;

/**
 * A class to parse a URI string according to RFC3986.
 *
 * @link    https://tools.ietf.org/html/rfc3986
 * @package League\Uri
 * @author  Ignace Nyamagana Butera <nyamsprod@gmail.com>
 * @since   6.0.0
 */
final class UriString
{
    /**
     * Default URI component values.
     */
    private const URI_COMPONENTS = [
        'scheme' => null, 'user' => null, 'pass' => null, 'host' => null,
        'port' => null, 'path' => '', 'query' => null, 'fragment' => null,
    ];

    /**
     * Simple URIs which do not need any parsing.
     */
    private const URI_SCHORTCUTS = [
        '' => [],
        '#' => ['fragment' => ''],
        '?' => ['query' => ''],
        '?#' => ['query' => '', 'fragment' => ''],
        '/' => ['path' => '/'],
        '//' => ['host' => ''],
    ];

    /**
     * Range of invalid characters in URI string.
     */
    private const REGEXP_INVALID_URI_CHARS = '/[\x00-\x1f\x7f]/';

    /**
     * RFC3986 regular expression URI splitter.
     *
     * @link https://tools.ietf.org/html/rfc3986#appendix-B
     */
    private const REGEXP_URI_PARTS = ',^
        (?<scheme>(?<scontent>[^:/?\#]+):)?    # URI scheme component
        (?<authority>//(?<acontent>[^/?\#]*))? # URI authority part
        (?<path>[^?\#]*)                       # URI path component
        (?<query>\?(?<qcontent>[^\#]*))?       # URI query component
        (?<fragment>\#(?<fcontent>.*))?        # URI fragment component
    ,x';

    /**
     * URI scheme regular expression.
     *
     * @link https://tools.ietf.org/html/rfc3986#section-3.1
     */
    private const REGEXP_URI_SCHEME = '/^([a-z][a-z\d\+\.\-]*)?$/i';

    /**
     * IPvFuture regular expression.
     *
     * The quantifier lives inside the `version` group so that the whole
     * hexadecimal version (and not only its last digit) is captured.
     *
     * @link https://tools.ietf.org/html/rfc3986#section-3.2.2
     */
    private const REGEXP_IP_FUTURE = '/^
        v(?<version>[A-F0-9]+)\.
        (?:
            (?<unreserved>[a-z0-9_~\-\.])|
            (?<sub_delims>[!$&\'()*+,;=:])  # also include the : character
        )+
    $/ix';

    /**
     * General registered name regular expression.
     *
     * @link https://tools.ietf.org/html/rfc3986#section-3.2.2
     */
    private const REGEXP_REGISTERED_NAME = '/(?(DEFINE)
        (?<unreserved>[a-z0-9_~\-])   # . is missing as it is used to separate labels
        (?<sub_delims>[!$&\'()*+,;=])
        (?<encoded>%[A-F0-9]{2})
        (?<reg_name>(?:(?&unreserved)|(?&sub_delims)|(?&encoded))*)
    )
    ^(?:(?&reg_name)\.)*(?&reg_name)\.?$/ix';

    /**
     * Invalid characters in host regular expression.
     *
     * @link https://tools.ietf.org/html/rfc3986#section-3.2.2
     */
    private const REGEXP_INVALID_HOST_CHARS = '/
        [:\/?#\[\]@ ]  # gen-delims characters as well as the space character
    /ix';

    /**
     * Invalid path for URI without scheme and authority regular expression.
     *
     * @link https://tools.ietf.org/html/rfc3986#section-3.3
     */
    private const REGEXP_INVALID_PATH = ',^(([^/]*):)(.*)?/,';

    /**
     * Host and Port splitter regular expression.
     */
    private const REGEXP_HOST_PORT = ',^(?<host>\[.*\]|[^:]*)(:(?<port>.*))?$,';

    /**
     * IDN Host detector regular expression.
     */
    private const REGEXP_IDN_PATTERN = '/[^\x20-\x7f]/';

    /**
     * Only the address block fe80::/10 can have a Zone ID attached to it;
     * this prefix is compared against the packed form of the address.
     */
    private const ZONE_ID_ADDRESS_BLOCK = "\xfe\x80";

    /**
     * Generate an URI string representation from its parsed representation
     * returned by UriString::parse() or PHP's parse_url.
     *
     * If you supply your own array, you are responsible for providing
     * valid components without their URI delimiters.
     *
     * @link https://tools.ietf.org/html/rfc3986#section-5.3
     * @link https://tools.ietf.org/html/rfc3986#section-7.5
     *
     * @param array{
     *  scheme:?string,
     *  user:?string,
     *  pass:?string,
     *  host:?string,
     *  port:?int,
     *  path:string,
     *  query:?string,
     *  fragment:?string
     * } $components
     */
    public static function build(array $components): string
    {
        // path, query and fragment are assembled first: they are emitted
        // identically whether or not an authority is present.
        $result = $components['path'] ?? '';
        if (isset($components['query'])) {
            $result .= '?'.$components['query'];
        }

        if (isset($components['fragment'])) {
            $result .= '#'.$components['fragment'];
        }

        $scheme = null;
        if (isset($components['scheme'])) {
            $scheme = $components['scheme'].':';
        }

        // without a host there is no authority part at all
        if (!isset($components['host'])) {
            return $scheme.$result;
        }

        $scheme .= '//';
        $authority = $components['host'];
        if (isset($components['port'])) {
            $authority .= ':'.$components['port'];
        }

        if (!isset($components['user'])) {
            return $scheme.$authority.$result;
        }

        $authority = '@'.$authority;
        if (!isset($components['pass'])) {
            return $scheme.$components['user'].$authority.$result;
        }

        return $scheme.$components['user'].':'.$components['pass'].$authority.$result;
    }

    /**
     * Parse an URI string into its components.
     *
     * This method parses a URI and returns an associative array containing any
     * of the various components of the URI that are present.
     *
     * <code>
     * $components = UriString::parse('http://foo@test.example.com:42?query#');
     * var_export($components);
     * //will display
     * array(
     *   'scheme' => 'http',           // the URI scheme component
     *   'user' => 'foo',              // the URI user component
     *   'pass' => null,               // the URI pass component
     *   'host' => 'test.example.com', // the URI host component
     *   'port' => 42,                 // the URI port component
     *   'path' => '',                 // the URI path component
     *   'query' => 'query',           // the URI query component
     *   'fragment' => '',             // the URI fragment component
     * );
     * </code>
     *
     * The returned array is similar to PHP's parse_url return value with the following
     * differences:
     *
     * <ul>
     * <li>All components are always present in the returned array</li>
     * <li>Empty and undefined component are treated differently. An empty component is
     * set to the empty string while an undefined component is set to the `null` value.</li>
     * <li>The path component is never undefined</li>
     * <li>The method parses the URI following the RFC3986 rules but you are still
     * required to validate the returned components against its related scheme specific rules.</li>
     * </ul>
     *
     * @link https://tools.ietf.org/html/rfc3986
     *
     * @param mixed $uri any scalar or stringable object
     *
     * @throws \TypeError  if the URI is neither a scalar nor a stringable object
     * @throws SyntaxError if the URI contains invalid characters
     * @throws SyntaxError if the URI contains an invalid scheme
     * @throws SyntaxError if the URI contains an invalid path
     *
     * @return array{
     *  scheme:?string,
     *  user:?string,
     *  pass:?string,
     *  host:?string,
     *  port:?int,
     *  path:string,
     *  query:?string,
     *  fragment:?string
     * }
     */
    public static function parse($uri): array
    {
        if (is_object($uri) && method_exists($uri, '__toString')) {
            $uri = (string) $uri;
        }

        if (!is_scalar($uri)) {
            throw new \TypeError(sprintf('The uri must be a scalar or a stringable object `%s` given', gettype($uri)));
        }

        $uri = (string) $uri;

        // trivial URIs (including the empty string) are resolved from a lookup table
        if (isset(self::URI_SCHORTCUTS[$uri])) {
            /** @var array{scheme:?string, user:?string, pass:?string, host:?string, port:?int, path:string, query:?string, fragment:?string} $components */
            $components = array_merge(self::URI_COMPONENTS, self::URI_SCHORTCUTS[$uri]);

            return $components;
        }

        if (1 === preg_match(self::REGEXP_INVALID_URI_CHARS, $uri)) {
            throw new SyntaxError(sprintf('The uri `%s` contains invalid characters', $uri));
        }

        //if the first character is a known URI delimiter parsing can be simplified
        //(the empty string was already handled by the shortcut table above)
        $first_char = $uri[0];

        //The URI is made of the fragment only
        if ('#' === $first_char) {
            [, $fragment] = explode('#', $uri, 2);
            $components = self::URI_COMPONENTS;
            $components['fragment'] = $fragment;

            return $components;
        }

        //The URI is made of the query and fragment
        if ('?' === $first_char) {
            [, $partial] = explode('?', $uri, 2);
            [$query, $fragment] = explode('#', $partial, 2) + [1 => null];
            $components = self::URI_COMPONENTS;
            $components['query'] = $query;
            $components['fragment'] = $fragment;

            return $components;
        }

        //use RFC3986 URI regexp to split the URI
        preg_match(self::REGEXP_URI_PARTS, $uri, $parts);
        //trailing optional groups are absent from $parts when they did not match
        $parts += ['query' => '', 'fragment' => ''];

        if (':' === $parts['scheme'] || 1 !== preg_match(self::REGEXP_URI_SCHEME, $parts['scontent'])) {
            throw new SyntaxError(sprintf('The uri `%s` contains an invalid scheme', $uri));
        }

        if ('' === $parts['scheme'].$parts['authority'] && 1 === preg_match(self::REGEXP_INVALID_PATH, $parts['path'])) {
            throw new SyntaxError(sprintf('The uri `%s` contains an invalid path.', $uri));
        }

        /** @var array{scheme:?string, user:?string, pass:?string, host:?string, port:?int, path:string, query:?string, fragment:?string} $components */
        $components = array_merge(
            self::URI_COMPONENTS,
            '' === $parts['authority'] ? [] : self::parseAuthority($parts['acontent']),
            [
                'path' => $parts['path'],
                'scheme' => '' === $parts['scheme'] ? null : $parts['scontent'],
                'query' => '' === $parts['query'] ? null : $parts['qcontent'],
                'fragment' => '' === $parts['fragment'] ? null : $parts['fcontent'],
            ]
        );

        return $components;
    }

    /**
     * Parses the URI authority part.
     *
     * @link https://tools.ietf.org/html/rfc3986#section-3.2
     *
     * @throws SyntaxError If the port component is invalid
     * @throws SyntaxError If the host component is invalid
     *
     * @return array{user:?string, pass:?string, host:?string, port:?int}
     */
    private static function parseAuthority(string $authority): array
    {
        $components = ['user' => null, 'pass' => null, 'host' => '', 'port' => null];
        if ('' === $authority) {
            return $components;
        }

        // everything before the first "@" is the user information
        $parts = explode('@', $authority, 2);
        if (isset($parts[1])) {
            [$components['user'], $components['pass']] = explode(':', $parts[0], 2) + [1 => null];
        }

        preg_match(self::REGEXP_HOST_PORT, $parts[1] ?? $parts[0], $matches);
        $matches += ['port' => ''];

        $components['port'] = self::filterPort($matches['port']);
        $components['host'] = self::filterHost($matches['host']);

        return $components;
    }

    /**
     * Filter and format the port component.
     *
     * An empty port is reported as `null`; a port made of digits only is
     * returned as an integer.
     *
     * @link https://tools.ietf.org/html/rfc3986#section-3.2.3
     *
     * @throws SyntaxError if the port is invalid
     */
    private static function filterPort(string $port): ?int
    {
        if ('' === $port) {
            return null;
        }

        if (1 === preg_match('/^\d*$/', $port)) {
            return (int) $port;
        }

        throw new SyntaxError(sprintf('The port `%s` is invalid', $port));
    }

    /**
     * Validates the host component and returns it unchanged.
     *
     * A host enclosed in square brackets is validated as an IP literal,
     * any other non-empty host is validated as a registered name.
     *
     * @link https://tools.ietf.org/html/rfc3986#section-3.2.2
     *
     * @throws SyntaxError if the IP literal or the registered name is invalid
     */
    private static function filterHost(string $host): string
    {
        if ('' === $host) {
            return $host;
        }

        if ('[' !== $host[0] || ']' !== substr($host, -1)) {
            return self::filterRegisteredName($host);
        }

        if (!self::isIpHost(substr($host, 1, -1))) {
            throw new SyntaxError(sprintf('Host `%s` is invalid : the IP host is malformed', $host));
        }

        return $host;
    }

    /**
     * Validates a host which is an IPv4 or a registered name and returns it unchanged.
     *
     * @link https://tools.ietf.org/html/rfc3986#section-3.2.2
     *
     * @throws SyntaxError       if the registered name is invalid
     * @throws IdnSupportMissing if IDN support or ICU requirement are not available or met.
     */
    private static function filterRegisteredName(string $host): string
    {
        // @codeCoverageIgnoreStart
        // added because it is not possible in travis to disabled the ext/intl extension
        // see travis issue https://github.com/travis-ci/travis-ci/issues/4701
        static $idn_support = null;
        $idn_support = $idn_support ?? function_exists('idn_to_ascii') && defined('INTL_IDNA_VARIANT_UTS46');
        // @codeCoverageIgnoreEnd

        $formatted_host = rawurldecode($host);
        if (1 === preg_match(self::REGEXP_REGISTERED_NAME, $formatted_host)) {
            // a pure ASCII registered name without ACE label needs no IDN processing
            if (false === strpos($formatted_host, 'xn--')) {
                return $host;
            }

            // @codeCoverageIgnoreStart
            if (!$idn_support) {
                throw new IdnSupportMissing(sprintf('the host `%s` could not be processed for IDN. Verify that ext/intl is installed for IDN support and that ICU is at least version 4.6.', $host));
            }
            // @codeCoverageIgnoreEnd

            $unicode = idn_to_utf8($host, 0, INTL_IDNA_VARIANT_UTS46, $arr);
            if (0 !== $arr['errors']) {
                throw new SyntaxError(sprintf('The host `%s` is invalid : %s', $host, self::getIDNAErrors($arr['errors'])));
            }

            // @codeCoverageIgnoreStart
            if (false === $unicode) {
                throw new IdnSupportMissing(sprintf('The Intl extension is misconfigured for %s, please correct this issue before proceeding.', PHP_OS));
            }
            // @codeCoverageIgnoreEnd

            return $host;
        }

        //to test IDN host non-ascii characters must be present in the host
        if (1 !== preg_match(self::REGEXP_IDN_PATTERN, $formatted_host)) {
            throw new SyntaxError(sprintf('Host `%s` is invalid : the host is not a valid registered name', $host));
        }

        // @codeCoverageIgnoreStart
        if (!$idn_support) {
            throw new IdnSupportMissing(sprintf('the host `%s` could not be processed for IDN. Verify that ext/intl is installed for IDN support and that ICU is at least version 4.6.', $host));
        }
        // @codeCoverageIgnoreEnd

        $retval = idn_to_ascii($formatted_host, 0, INTL_IDNA_VARIANT_UTS46, $arr);

        if ([] === $arr) {
            throw new SyntaxError(sprintf('Host `%s` is not a valid IDN host', $host));
        }

        if (0 !== $arr['errors']) {
            throw new SyntaxError(sprintf('Host `%s` is not a valid IDN host : %s', $host, self::getIDNAErrors($arr['errors'])));
        }

        // @codeCoverageIgnoreStart
        if (false === $retval) {
            throw new IdnSupportMissing(sprintf('The Intl extension is misconfigured for %s, please correct this issue before proceeding.', PHP_OS));
        }
        // @codeCoverageIgnoreEnd

        if (false !== strpos($retval, '%')) {
            throw new SyntaxError(sprintf('Host `%s` is invalid : the host is not a valid registered name', $host));
        }

        return $host;
    }

    /**
     * Retrieves and format IDNA conversion error message.
     *
     * Every known error flag set in the given bitmask contributes its
     * description to the returned comma separated message.
     *
     * @link http://icu-project.org/apiref/icu4j/com/ibm/icu/text/IDNA.Error.html
     */
    private static function getIDNAErrors(int $error_byte): string
    {
        /**
         * IDNA errors.
         */
        static $idn_errors = [
            IDNA_ERROR_EMPTY_LABEL => 'a non-final domain name label (or the whole domain name) is empty',
            IDNA_ERROR_LABEL_TOO_LONG => 'a domain name label is longer than 63 bytes',
            IDNA_ERROR_DOMAIN_NAME_TOO_LONG => 'a domain name is longer than 255 bytes in its storage form',
            IDNA_ERROR_LEADING_HYPHEN => 'a label starts with a hyphen-minus ("-")',
            IDNA_ERROR_TRAILING_HYPHEN => 'a label ends with a hyphen-minus ("-")',
            IDNA_ERROR_HYPHEN_3_4 => 'a label contains hyphen-minus ("-") in the third and fourth positions',
            IDNA_ERROR_LEADING_COMBINING_MARK => 'a label starts with a combining mark',
            IDNA_ERROR_DISALLOWED => 'a label or domain name contains disallowed characters',
            IDNA_ERROR_PUNYCODE => 'a label starts with "xn--" but does not contain valid Punycode',
            IDNA_ERROR_LABEL_HAS_DOT => 'a label contains a dot=full stop',
            IDNA_ERROR_INVALID_ACE_LABEL => 'An ACE label does not contain a valid label string',
            IDNA_ERROR_BIDI => 'a label does not meet the IDNA BiDi requirements (for right-to-left characters)',
            IDNA_ERROR_CONTEXTJ => 'a label does not meet the IDNA CONTEXTJ requirements',
        ];

        $res = [];
        foreach ($idn_errors as $error => $reason) {
            if ($error === ($error_byte & $error)) {
                $res[] = $reason;
            }
        }

        return [] === $res ? 'Unknown IDNA conversion error.' : implode(', ', $res).'.';
    }

    /**
     * Validates a IPv6/IPvfuture host.
     *
     * The given string is the IP literal without its enclosing square brackets.
     *
     * @link https://tools.ietf.org/html/rfc3986#section-3.2.2
     * @link https://tools.ietf.org/html/rfc6874#section-2
     * @link https://tools.ietf.org/html/rfc6874#section-4
     */
    private static function isIpHost(string $ip_host): bool
    {
        if (false !== filter_var($ip_host, FILTER_VALIDATE_IP, FILTER_FLAG_IPV6)) {
            return true;
        }

        // IPvFuture: versions 4 and 6 are refused
        if (1 === preg_match(self::REGEXP_IP_FUTURE, $ip_host, $matches)) {
            return !in_array($matches['version'], ['4', '6'], true);
        }

        // what remains is an IPv6 address with a zone identifier introduced by "%"
        $pos = strpos($ip_host, '%');
        if (false === $pos || 1 === preg_match(
            self::REGEXP_INVALID_HOST_CHARS,
            rawurldecode(substr($ip_host, $pos))
        )) {
            return false;
        }

        $ip_host = substr($ip_host, 0, $pos);

        // the address itself must be a valid IPv6 whose packed form starts with the zone ID address block
        return false !== filter_var($ip_host, FILTER_VALIDATE_IP, FILTER_FLAG_IPV6)
            && 0 === strpos((string) inet_pton($ip_host), self::ZONE_ID_ADDRESS_BLOCK);
    }
}