<?php

declare( strict_types = 1 );

namespace Wikimedia\Parsoid\Utils;

/**
 * Utilities for manipulating URLs
 * @see https://tools.ietf.org/html/rfc3986
 */
class UrlUtils {

	/**
	 * Parse a possibly-relative URL into components
	 *
	 * Note no percent-decoding is performed, and only minimal syntax validation.
	 *
	 * @param string $url
	 * @return (string|null)[] All five keys are always present:
	 *  - 'scheme': Scheme of the url, if any (without the trailing ":").
	 *  - 'authority': Authority part of the url, if any.
	 *    This is the part in between the "//" and the path. For http, this is the "user@host:port".
	 *  - 'path': Path part of the URL. Never null, but may be the empty string.
	 *  - 'query': Query part of the URL, if any (without the leading "?").
	 *  - 'fragment': Fragment part of the URL, if any (without the leading "#").
	 * @phan-return array{scheme:?string,authority:?string,path:string,query:?string,fragment:?string}
	 */
	public static function parseUrl( string $url ): array {
		$ret = [
			'scheme' => null,
			'authority' => null,
			'path' => '',
			'query' => null,
			'fragment' => null,
		];

		// Scheme: a letter followed by letters, digits, "+", "." or "-", then ":".
		if ( preg_match( '!^([a-z][a-z0-9+.-]*):!i', $url, $m ) ) {
			$ret['scheme'] = $m[1];
			$url = substr( $url, strlen( $m[0] ) );
		}

		// Fragment: everything after the first "#". This is split off before
		// the query so that a "?" inside the fragment stays part of the fragment.
		$i = strpos( $url, '#' );
		if ( $i !== false ) {
			$ret['fragment'] = substr( $url, $i + 1 );
			$url = substr( $url, 0, $i );
		}

		// Query: everything after the first remaining "?".
		$i = strpos( $url, '?' );
		if ( $i !== false ) {
			$ret['query'] = substr( $url, $i + 1 );
			$url = substr( $url, 0, $i );
		}

		// What is left is "[//authority]path".
		if ( substr( $url, 0, 2 ) === '//' ) {
			$i = strpos( $url, '/', 2 );
			if ( $i === false ) {
				// Authority only; 'path' keeps its '' default.
				$ret['authority'] = substr( $url, 2 );
			} else {
				$ret['authority'] = substr( $url, 2, $i - 2 );
				$ret['path'] = substr( $url, $i );
			}
		} else {
			$ret['path'] = $url;
		}

		return $ret;
	}

	/**
	 * This function will reassemble a URL parsed with self::parseUrl().
	 *
	 * Note no percent-encoding or syntax validation is performed.
	 * Components that are missing or null are skipped entirely; components
	 * that are the empty string still emit their delimiter (":", "//", "?", "#").
	 *
	 * @param array $urlParts URL parts, as output from self::parseUrl
	 * @return string URL assembled from its component parts
	 */
	public static function assembleUrl( array $urlParts ): string {
		$ret = '';

		if ( isset( $urlParts['scheme'] ) ) {
			$ret .= $urlParts['scheme'] . ':';
		}

		if ( isset( $urlParts['authority'] ) ) {
			$ret .= '//' . $urlParts['authority'];
		}

		if ( isset( $urlParts['path'] ) ) {
			$ret .= $urlParts['path'];
		}

		if ( isset( $urlParts['query'] ) ) {
			$ret .= '?' . $urlParts['query'];
		}

		if ( isset( $urlParts['fragment'] ) ) {
			$ret .= '#' . $urlParts['fragment'];
		}

		return $ret;
	}

	/**
	 * Remove all dot-segments in the provided URL path. For example,
	 * '/a/./b/../c/' becomes '/a/c/'.
	 *
	 * The input is consumed left to right through $inputOffset; each pass of
	 * the loop applies exactly one of steps A-E of the RFC algorithm to the
	 * not-yet-consumed remainder.
	 *
	 * @see https://tools.ietf.org/html/rfc3986#section-5.2.4
	 * @note Copied from MediaWiki's wfRemoveDotSegments
	 * @param string $urlPath URL path, potentially containing dot-segments
	 * @return string URL path with all dot-segments removed
	 */
	public static function removeDotSegments( string $urlPath ): string {
		$output = '';
		$inputOffset = 0;
		$inputLength = strlen( $urlPath );

		while ( $inputOffset < $inputLength ) {
			// Look-ahead windows on the remaining input. Near the end of the
			// string the longer windows are simply truncated.
			$prefixLengthOne = substr( $urlPath, $inputOffset, 1 );
			$prefixLengthTwo = substr( $urlPath, $inputOffset, 2 );
			$prefixLengthThree = substr( $urlPath, $inputOffset, 3 );
			$prefixLengthFour = substr( $urlPath, $inputOffset, 4 );
			$trimOutput = false;

			if ( $prefixLengthTwo === './' ) {
				// Step A, remove leading "./"
				$inputOffset += 2;
			} elseif ( $prefixLengthThree === '../' ) {
				// Step A, remove leading "../"
				$inputOffset += 3;
			} elseif ( $prefixLengthTwo === '/.' && $inputOffset + 2 === $inputLength ) {
				// Step B, replace leading "/.$" with "/".
				// The "." is overwritten in place so the next pass sees a bare "/".
				$inputOffset += 1;
				$urlPath[$inputOffset] = '/';
			} elseif ( $prefixLengthThree === '/./' ) {
				// Step B, replace leading "/./" with "/"
				$inputOffset += 2;
			} elseif ( $prefixLengthThree === '/..' && $inputOffset + 3 === $inputLength ) {
				// Step C, replace leading "/..$" with "/" and
				// remove last path component in output
				$inputOffset += 2;
				$urlPath[$inputOffset] = '/';
				$trimOutput = true;
			} elseif ( $prefixLengthFour === '/../' ) {
				// Step C, replace leading "/../" with "/" and
				// remove last path component in output
				$inputOffset += 3;
				$trimOutput = true;
			} elseif ( $prefixLengthOne === '.' && $inputOffset + 1 === $inputLength ) {
				// Step D, remove "^.$"
				$inputOffset += 1;
			} elseif ( $prefixLengthTwo === '..' && $inputOffset + 2 === $inputLength ) {
				// Step D, remove "^..$"
				$inputOffset += 2;
			} else {
				// Step E, move leading path segment (including its initial
				// "/", if any) to output
				$searchFrom = $prefixLengthOne === '/' ? $inputOffset + 1 : $inputOffset;
				$slashPos = strpos( $urlPath, '/', $searchFrom );
				if ( $slashPos === false ) {
					$output .= substr( $urlPath, $inputOffset );
					$inputOffset = $inputLength;
				} else {
					$output .= substr( $urlPath, $inputOffset, $slashPos - $inputOffset );
					$inputOffset = $slashPos;
				}
			}

			if ( $trimOutput ) {
				// Drop the last segment of the output together with its
				// preceding "/"; with no "/" left the output becomes empty.
				$slashPos = strrpos( $output, '/' );
				$output = $slashPos === false ? '' : substr( $output, 0, $slashPos );
			}
		}

		return $output;
	}

	/**
	 * Expand a relative URL using a base URL
	 *
	 * Implements the "Transform References" algorithm: the most specific
	 * component present in $url (scheme, then authority, then path, then
	 * query) decides how much is inherited from $base. The fragment always
	 * comes from $url.
	 *
	 * @see https://tools.ietf.org/html/rfc3986#section-5.2.2
	 * @param string $url Relative URL to expand
	 * @param string $base Base URL to expand relative to
	 * @return string Expanded URL
	 */
	public static function expandUrl( string $url, string $base ): string {
		$b = self::parseUrl( $base );
		$r = self::parseUrl( $url );

		// parseUrl() always sets every key, so components can be compared to
		// null directly.
		$t = [];
		if ( $r['scheme'] !== null ) {
			// Already absolute: nothing is inherited from the base.
			$t['scheme'] = $r['scheme'];
			$t['authority'] = $r['authority'];
			$t['path'] = self::removeDotSegments( $r['path'] );
			$t['query'] = $r['query'];
		} else {
			$t['scheme'] = $b['scheme'];
			if ( $r['authority'] !== null ) {
				// Network-path reference ("//host/..."): inherit the scheme only.
				$t['authority'] = $r['authority'];
				$t['path'] = self::removeDotSegments( $r['path'] );
				$t['query'] = $r['query'];
			} elseif ( $r['path'] === '' ) {
				// Same document: keep the base path, and the base query too
				// unless the reference supplies its own.
				$t['authority'] = $b['authority'];
				$t['path'] = $b['path'];
				$t['query'] = $r['query'] ?? $b['query'];
			} else {
				$t['authority'] = $b['authority'];
				$path = $r['path'][0] === '/'
					? $r['path']
					: self::mergePaths( $b, $r['path'] );
				$t['path'] = self::removeDotSegments( $path );
				$t['query'] = $r['query'];
			}
		}
		$t['fragment'] = $r['fragment'];

		return self::assembleUrl( $t );
	}

	/**
	 * Merge a relative-path reference with the path of a base URL.
	 *
	 * @see https://tools.ietf.org/html/rfc3986#section-5.2.3
	 * @param array $base Parsed base URL, as output from self::parseUrl
	 * @param string $relPath Non-empty reference path that does not start with "/"
	 * @return string Merged path; dot-segments are not yet removed
	 */
	private static function mergePaths( array $base, string $relPath ): string {
		if ( $base['authority'] !== null && $base['path'] === '' ) {
			return '/' . $relPath;
		}

		// Replace everything after the right-most "/" of the base path; a base
		// path without any "/" is discarded entirely.
		$i = strrpos( $base['path'], '/' );
		return $i === false ? $relPath : substr( $base['path'], 0, $i + 1 ) . $relPath;
	}

}