1<?php
2
3declare( strict_types = 1 );
4
5namespace Wikimedia\Parsoid\Utils;
6
7/**
8 * Utilities for manipulating URLs
9 * @see https://tools.ietf.org/html/rfc3986
10 */
class UrlUtils {

	/**
	 * Split an absolute or relative URL into its RFC 3986 components.
	 *
	 * The pieces are returned verbatim: nothing is percent-decoded, and the
	 * syntax is only inspected as far as needed to locate the delimiters.
	 *
	 * @param string $url
	 * @return (string|null)[] Map with the following keys:
	 *  - 'scheme': Text before the first ":" when it looks like a scheme, else null.
	 *  - 'authority': Text between a leading "//" and the path (for http this
	 *    is "user@host:port"), or null when there is no leading "//".
	 *  - 'path': The path. Always a string, possibly empty.
	 *  - 'query': Text between the first "?" and the fragment, or null if there is no "?".
	 *  - 'fragment': Text after the first "#", or null if there is no "#".
	 */
	public static function parseUrl( string $url ): array {
		$scheme = null;
		$authority = null;
		$query = null;
		$fragment = null;

		// A leading "name:" is the scheme; the rest is parsed below.
		if ( preg_match( '!^([a-z][a-z0-9+.-]*):!i', $url, $match ) ) {
			$scheme = $match[1];
			$url = substr( $url, strlen( $match[0] ) );
		}

		// The fragment is cut off first, so a "?" inside it is not
		// mistaken for the start of the query.
		$hashPos = strpos( $url, '#' );
		if ( $hashPos !== false ) {
			$fragment = substr( $url, $hashPos + 1 );
			$url = substr( $url, 0, $hashPos );
		}

		$questionPos = strpos( $url, '?' );
		if ( $questionPos !== false ) {
			$query = substr( $url, $questionPos + 1 );
			$url = substr( $url, 0, $questionPos );
		}

		// What remains is "[//authority]path".
		$path = $url;
		if ( strncmp( $url, '//', 2 ) === 0 ) {
			$pathStart = strpos( $url, '/', 2 );
			if ( $pathStart === false ) {
				// Nothing follows the authority
				$authority = substr( $url, 2 );
				$path = '';
			} else {
				$authority = substr( $url, 2, $pathStart - 2 );
				$path = substr( $url, $pathStart );
			}
		}

		return [
			'scheme' => $scheme,
			'authority' => $authority,
			'path' => $path,
			'query' => $query,
			'fragment' => $fragment,
		];
	}

	/**
	 * Build a URL string from components as produced by self::parseUrl().
	 *
	 * Components that are missing or null are skipped together with their
	 * delimiter. The parts are concatenated as-is: no percent-encoding and
	 * no syntax validation takes place.
	 *
	 * @param array $urlParts URL parts, as output from self::parseUrl
	 * @return string URL assembled from its component parts
	 */
	public static function assembleUrl( array $urlParts ): string {
		$scheme = isset( $urlParts['scheme'] ) ? $urlParts['scheme'] . ':' : '';
		$authority = isset( $urlParts['authority'] ) ? '//' . $urlParts['authority'] : '';
		$path = $urlParts['path'] ?? '';
		$query = isset( $urlParts['query'] ) ? '?' . $urlParts['query'] : '';
		$fragment = isset( $urlParts['fragment'] ) ? '#' . $urlParts['fragment'] : '';

		return $scheme . $authority . $path . $query . $fragment;
	}

	/**
	 * Resolve and drop the "." and ".." segments of a URL path, so that
	 * e.g. '/a/./b/../c/' turns into '/a/c/'.
	 *
	 * @see https://tools.ietf.org/html/rfc3986#section-5.2.4
	 * @note Copied from MediaWiki's wfRemoveDotSegments
	 * @param string $urlPath URL path, potentially containing dot-segments
	 * @return string URL path with all dot-segments removed
	 */
	public static function removeDotSegments( string $urlPath ): string {
		$result = '';
		$pos = 0;
		$end = strlen( $urlPath );

		while ( $pos < $end ) {
			// Look ahead at up to four bytes of the unconsumed input
			$ahead1 = substr( $urlPath, $pos, 1 );
			$ahead2 = substr( $urlPath, $pos, 2 );
			$ahead3 = substr( $urlPath, $pos, 3 );
			$ahead4 = substr( $urlPath, $pos, 4 );
			$remaining = $end - $pos;
			$dropLastSegment = false;

			if ( $ahead2 === './' ) {
				# Step A: a leading "./" is discarded
				$pos += 2;
			} elseif ( $ahead3 === '../' ) {
				# Step A: a leading "../" is discarded
				$pos += 3;
			} elseif ( $ahead2 === '/.' && $remaining === 2 ) {
				# Step B: a trailing "/." collapses to "/". The "." is
				# overwritten in place so the next pass sees a bare "/".
				$pos++;
				$urlPath[$pos] = '/';
			} elseif ( $ahead3 === '/./' ) {
				# Step B: "/./" collapses to "/"
				$pos += 2;
			} elseif ( $ahead3 === '/..' && $remaining === 3 ) {
				# Step C: a trailing "/.." collapses to "/" and takes the
				# last segment already written to the output with it
				$pos += 2;
				$urlPath[$pos] = '/';
				$dropLastSegment = true;
			} elseif ( $ahead4 === '/../' ) {
				# Step C: "/../" collapses to "/" and takes the last
				# segment already written to the output with it
				$pos += 3;
				$dropLastSegment = true;
			} elseif ( $ahead1 === '.' && $remaining === 1 ) {
				# Step D: the input is just "."
				$pos++;
			} elseif ( $ahead2 === '..' && $remaining === 2 ) {
				# Step D: the input is just ".."
				$pos += 2;
			} else {
				# Step E: copy one segment, including its leading "/" if
				# there is one, up to but excluding the next "/"
				$searchFrom = ( $ahead1 === '/' ) ? $pos + 1 : $pos;
				$nextSlash = strpos( $urlPath, '/', $searchFrom );
				if ( $nextSlash === false ) {
					$nextSlash = $end;
				}
				$result .= substr( $urlPath, $pos, $nextSlash - $pos );
				$pos = $nextSlash;
			}

			if ( $dropLastSegment ) {
				$lastSlash = strrpos( $result, '/' );
				$result = ( $lastSlash === false ) ? '' : substr( $result, 0, $lastSlash );
			}
		}

		return $result;
	}

	/**
	 * Resolve a URL reference against a base URL.
	 *
	 * Each component is taken from the reference when it is present there
	 * and inherited from the base otherwise; the fragment always comes from
	 * the reference.
	 *
	 * @see https://tools.ietf.org/html/rfc3986#section-5.2.2
	 * @param string $url Relative URL to expand
	 * @param string $base Base URL to expand relative to
	 * @return string Expanded URL
	 */
	public static function expandUrl( string $url, string $base ): string {
		$baseParts = self::parseUrl( $base );
		$ref = self::parseUrl( $url );

		$target = [ 'fragment' => $ref['fragment'] ];

		if ( $ref['scheme'] !== null ) {
			// The reference has its own scheme: the base contributes nothing.
			$target['scheme'] = $ref['scheme'];
			$target['authority'] = $ref['authority'];
			$target['path'] = self::removeDotSegments( $ref['path'] );
			$target['query'] = $ref['query'];
			return self::assembleUrl( $target );
		}
		$target['scheme'] = $baseParts['scheme'];

		if ( $ref['authority'] !== null ) {
			// "//host/..." reference: only the scheme is inherited.
			$target['authority'] = $ref['authority'];
			$target['path'] = self::removeDotSegments( $ref['path'] );
			$target['query'] = $ref['query'];
			return self::assembleUrl( $target );
		}
		$target['authority'] = $baseParts['authority'];

		if ( $ref['path'] === '' ) {
			// No path in the reference: keep the base path, and keep the
			// base query too unless the reference supplies one.
			$target['path'] = $baseParts['path'];
			$target['query'] = $ref['query'] ?? $baseParts['query'];
			return self::assembleUrl( $target );
		}

		$path = ( $ref['path'][0] === '/' )
			? $ref['path']
			: self::mergePaths( $baseParts, $ref['path'] );
		$target['path'] = self::removeDotSegments( $path );
		$target['query'] = $ref['query'];

		return self::assembleUrl( $target );
	}

	/**
	 * Merge a relative reference path onto the path of a parsed base URL.
	 *
	 * @see https://tools.ietf.org/html/rfc3986#section-5.2.3
	 * @param array $baseParts Base URL parts, as output from self::parseUrl
	 * @param string $refPath Path of the reference, not starting with "/"
	 * @return string Merged path, dot-segments not yet removed
	 */
	private static function mergePaths( array $baseParts, string $refPath ): string {
		if ( $baseParts['authority'] !== null && $baseParts['path'] === '' ) {
			return '/' . $refPath;
		}

		// Keep the base path up to and including its last "/", if any
		$lastSlash = strrpos( $baseParts['path'], '/' );
		$directory = ( $lastSlash === false )
			? ''
			: substr( $baseParts['path'], 0, $lastSlash + 1 );

		return $directory . $refPath;
	}

}
244