1<?php
2/**
3 * This file is part of RSS-Bridge, a PHP project capable of generating RSS and
4 * Atom feeds for websites that don't have one.
5 *
6 * For the full license information, please view the UNLICENSE file distributed
7 * with this source code.
8 *
9 * @package	Core
10 * @license	http://unlicense.org/ UNLICENSE
11 * @link	https://github.com/rss-bridge/rss-bridge
12 */
13
/**
 * Gets contents from the Internet.
 *
 * **Content caching** (disabled in debug mode)
 *
 * A copy of the received content is stored in the cache scope `server`. The
 * `If-Modified-Since` header is added to the request, if the provided URL has
 * been cached before.
 *
 * When the server responds with `304 Not Modified`, the cached data is returned.
 * This will improve response times and reduce bandwidth for servers that support
 * the `If-Modified-Since` header.
 *
 * Responses carrying `Cache-Control: no-cache` or `Cache-Control: no-store`
 * are returned to the caller but not written to the cache.
 *
 * Cached files are forcefully removed after 24 hours.
 *
 * _Remarks_: When running in CLI mode without `curl.cainfo` configured, the
 * request is performed with `file_get_contents` instead of cURL. In that case
 * `$opts` is not applied and no `If-Modified-Since` header is sent.
 *
 * Any response code other than 200 or 304 is reported through `returnError`
 * (or `returnServerError` if the response was served by Cloudflare).
 *
 * @link https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/If-Modified-Since
 * If-Modified-Since
 *
 * @param string $url The URL.
 * @param array $header (optional) A list of cURL header.
 * For more information follow the links below.
 * * https://php.net/manual/en/function.curl-setopt.php
 * * https://curl.haxx.se/libcurl/c/CURLOPT_HTTPHEADER.html
 * @param array $opts (optional) A list of cURL options as associative array in
 * the format `$opts[$option] = $value;`, where `$option` is any `CURLOPT_XXX`
 * option and `$value` the corresponding value.
 *
 * For more information see http://php.net/manual/en/function.curl-setopt.php
 * @param bool $returnHeader Returns an array of two elements 'header' and
 * 'content' if enabled.
 * @return string|array The contents.
 */
function getContents($url, $header = array(), $opts = array(), $returnHeader = false){
	Debug::log('Reading contents from "' . $url . '"');

	// Initialize cache
	$cacheFac = new CacheFactory();
	$cacheFac->setWorkingDir(PATH_LIB_CACHES);
	$cache = $cacheFac->create(Configuration::getConfig('cache', 'type'));
	$cache->setScope('server');
	$cache->purgeCache(86400); // 24 hours (forced)

	$params = array($url);
	$cache->setKey($params);

	$retVal = array(
		'header' => '',
		'content' => '',
	);

	// Use file_get_contents if in CLI mode with no root certificates defined
	if(php_sapi_name() === 'cli' && empty(ini_get('curl.cainfo'))) {

		$httpHeaders = '';

		foreach ($header as $headerL) {
			$httpHeaders .= $headerL . "\r\n";
		}

		$ctx = stream_context_create(array(
			'http' => array(
				'header' => $httpHeaders
			)
		));

		$data = @file_get_contents($url, 0, $ctx);

		if($data === false) {
			$errorCode = 500;
		} else {
			$errorCode = 200;
			// $http_response_header is populated by file_get_contents
			$retVal['header'] = implode("\r\n", $http_response_header);
		}

		// No cURL involved: provide neutral values for the shared code below
		$curlError = '';
		$curlErrno = '';
		$headerSize = 0;
		$finalHeader = array();
	} else {
		$ch = curl_init($url);
		curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
		curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);

		if(is_array($header) && count($header) !== 0) {

			Debug::log('Setting headers: ' . json_encode($header));
			curl_setopt($ch, CURLOPT_HTTPHEADER, $header);

		}

		curl_setopt($ch, CURLOPT_USERAGENT, ini_get('user_agent'));
		curl_setopt($ch, CURLOPT_ENCODING, '');
		curl_setopt($ch, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);

		if(is_array($opts) && count($opts) !== 0) {

			Debug::log('Setting options: ' . json_encode($opts));

			foreach($opts as $key => $value) {
				curl_setopt($ch, $key, $value);
			}

		}

		if(defined('PROXY_URL') && !defined('NOPROXY')) {

			Debug::log('Setting proxy url: ' . PROXY_URL);
			curl_setopt($ch, CURLOPT_PROXY, PROXY_URL);

		}

		// We always want the response header as part of the data!
		curl_setopt($ch, CURLOPT_HEADER, true);

		// Build "If-Modified-Since" header
		if(!Debug::isEnabled() && ($time = $cache->getTime())) { // Skip if cache file doesn't exist
			Debug::log('Adding If-Modified-Since');
			curl_setopt($ch, CURLOPT_TIMEVALUE, $time);
			curl_setopt($ch, CURLOPT_TIMECONDITION, CURL_TIMECOND_IFMODSINCE);
		}

		// Enables logging for the outgoing header
		curl_setopt($ch, CURLINFO_HEADER_OUT, true);

		$data = curl_exec($ch);
		$errorCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);

		$curlError = curl_error($ch);
		$curlErrno = curl_errno($ch);
		$curlInfo = curl_getinfo($ch);

		Debug::log('Outgoing header: ' . json_encode($curlInfo));

		if($data === false)
			Debug::log('Can\'t download ' . $url . ' cUrl error: ' . $curlError . ' (' . $curlErrno . ')');

		// The response starts with the header block(s) of every hop (redirects
		// included), followed by the body. Keep the request headers in $header
		// untouched and use a dedicated variable for the response header.
		$headerSize = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
		$responseHeader = substr($data, 0, $headerSize);
		$retVal['header'] = $responseHeader;

		Debug::log('Response header: ' . $responseHeader);

		// Only the header of the final response is relevant for caching
		$headers = parseResponseHeader($responseHeader);
		$finalHeader = end($headers);

		curl_close($ch);
	}

	// HTTP header field names are case-insensitive (HTTP/2 delivers them in
	// lower case), so normalize the keys once for all lookups below.
	$finalHeader = is_array($finalHeader)
		? array_change_key_case($finalHeader, CASE_LOWER)
		: array();

	switch($errorCode) {
		case 200: // Contents received
			Debug::log('New contents received');
			$data = substr($data, $headerSize);
			$retVal['content'] = $data;

			// Disable caching if the server responds with "Cache-Control: no-cache"
			// or "Cache-Control: no-store" (directive names are case-insensitive)
			$skipCache = false;
			if(array_key_exists('cache-control', $finalHeader)) {
				Debug::log('Server responded with "Cache-Control" header');
				$directives = explode(',', strtolower($finalHeader['cache-control']));
				$directives = array_map('trim', $directives);
				$skipCache = in_array('no-cache', $directives, true)
					|| in_array('no-store', $directives, true);
			}

			if($skipCache) {
				Debug::log('Skip server side caching');
			} else {
				Debug::log('Store response to cache');
				$cache->saveData($data);
			}
			break;
		case 304: // Not modified, use cached data
			Debug::log('Contents not modified on host, returning cached data');
			$retVal['content'] = $cache->loadData();
			break;
		default:
			if(array_key_exists('server', $finalHeader)
			&& stripos($finalHeader['server'], 'cloudflare') !== false) {
			returnServerError(<<<EOD
The server responded with a Cloudflare challenge, which is not supported by RSS-Bridge!
If this error persists longer than a week, please consider opening an issue on GitHub!
EOD
				);
			}

			$lastError = error_get_last();
			if($lastError !== null)
				$lastError = $lastError['message'];
			returnError(<<<EOD
Unexpected response from upstream.
cUrl error: $curlError ($curlErrno)
PHP error: $lastError
EOD
			, $errorCode);
	}

	return ($returnHeader === true) ? $retVal : $retVal['content'];
}
210
/**
 * Fetches a URL and returns the response parsed as a simplehtmldom object.
 *
 * The contents are retrieved with {@see getContents()} and handed to
 * `str_get_html` together with the parser settings given below.
 *
 * @param string $url The URL.
 * @param array $header (optional) A list of cURL header.
 * For more information follow the links below.
 * * https://php.net/manual/en/function.curl-setopt.php
 * * https://curl.haxx.se/libcurl/c/CURLOPT_HTTPHEADER.html
 * @param array $opts (optional) A list of cURL options as associative array in
 * the format `$opts[$option] = $value;`, where `$option` is any `CURLOPT_XXX`
 * option and `$value` the corresponding value.
 *
 * For more information see http://php.net/manual/en/function.curl-setopt.php
 * @param bool $lowercase Force all selectors to lowercase.
 * @param bool $forceTagsClosed Forcefully close tags in malformed HTML.
 *
 * _Remarks_: Forcefully closing tags is great for malformed HTML, but it can
 * lead to parsing errors.
 * @param string $target_charset Defines the target charset.
 * @param bool $stripRN Replace all occurrences of `"\r"` and `"\n"` by `" "`.
 * @param string $defaultBRText Specifies the replacement text for `<br>` tags
 * when returning plaintext.
 * @param string $defaultSpanText Specifies the replacement text for `<span />`
 * tags when returning plaintext.
 * @return false|simple_html_dom Contents as simplehtmldom object.
 */
function getSimpleHTMLDOM($url,
	$header = array(),
	$opts = array(),
	$lowercase = true,
	$forceTagsClosed = true,
	$target_charset = DEFAULT_TARGET_CHARSET,
	$stripRN = true,
	$defaultBRText = DEFAULT_BR_TEXT,
	$defaultSpanText = DEFAULT_SPAN_TEXT){

	// Download the raw document first ...
	$html = getContents($url, $header, $opts);

	// ... then let simplehtmldom parse it with the requested settings
	return str_get_html(
		$html,
		$lowercase,
		$forceTagsClosed,
		$target_charset,
		$stripRN,
		$defaultBRText,
		$defaultSpanText
	);
}
256
/**
 * Fetches a URL and returns the response parsed as a simplehtmldom object,
 * re-using a previously stored copy of the contents while it is younger than
 * the requested cache duration.
 *
 * The copy is kept in the cache scope `pages`. It is ignored (and the URL is
 * fetched again) when no copy exists, when the copy is older than `$duration`
 * seconds, or when debug mode is enabled.
 *
 * _Notice_: Cached contents are forcefully removed after 24 hours (86400 seconds).
 *
 * @param string $url The URL.
 * @param int $duration Cache duration in seconds.
 * @param array $header (optional) A list of cURL header.
 * For more information follow the links below.
 * * https://php.net/manual/en/function.curl-setopt.php
 * * https://curl.haxx.se/libcurl/c/CURLOPT_HTTPHEADER.html
 * @param array $opts (optional) A list of cURL options as associative array in
 * the format `$opts[$option] = $value;`, where `$option` is any `CURLOPT_XXX`
 * option and `$value` the corresponding value.
 *
 * For more information see http://php.net/manual/en/function.curl-setopt.php
 * @param bool $lowercase Force all selectors to lowercase.
 * @param bool $forceTagsClosed Forcefully close tags in malformed HTML.
 *
 * _Remarks_: Forcefully closing tags is great for malformed HTML, but it can
 * lead to parsing errors.
 * @param string $target_charset Defines the target charset.
 * @param bool $stripRN Replace all occurrences of `"\r"` and `"\n"` by `" "`.
 * @param string $defaultBRText Specifies the replacement text for `<br>` tags
 * when returning plaintext.
 * @param string $defaultSpanText Specifies the replacement text for `<span />`
 * tags when returning plaintext.
 * @return false|simple_html_dom Contents as simplehtmldom object.
 */
function getSimpleHTMLDOMCached($url,
	$duration = 86400,
	$header = array(),
	$opts = array(),
	$lowercase = true,
	$forceTagsClosed = true,
	$target_charset = DEFAULT_TARGET_CHARSET,
	$stripRN = true,
	$defaultBRText = DEFAULT_BR_TEXT,
	$defaultSpanText = DEFAULT_SPAN_TEXT){

	Debug::log('Caching url ' . $url . ', duration ' . $duration);

	// Set up the page cache for this URL
	$factory = new CacheFactory();
	$factory->setWorkingDir(PATH_LIB_CACHES);
	$pageCache = $factory->create(Configuration::getConfig('cache', 'type'));
	$pageCache->setScope('pages');
	$pageCache->purgeCache(86400); // 24 hours (forced)
	$pageCache->setKey(array($url));

	// A stored copy is usable only if it exists, is younger than $duration
	// seconds and debug mode is off
	$storedAt = $pageCache->getTime();
	$isFresh = $storedAt !== false
		&& (time() - $duration < $storedAt)
		&& !Debug::isEnabled();

	if($isFresh) {
		$content = $pageCache->loadData();
	} else {
		// Fetch anew and remember the result unless the download failed
		$content = getContents($url, $header, $opts);
		if($content !== false) {
			$pageCache->saveData($content);
		}
	}

	return str_get_html(
		$content,
		$lowercase,
		$forceTagsClosed,
		$target_charset,
		$stripRN,
		$defaultBRText,
		$defaultSpanText
	);
}
331
/**
 * Parses the cURL response header into an associative array
 *
 * The raw header may contain several header blocks (one per response, e.g.
 * when redirects were followed), separated by an empty line. Each block is
 * turned into one associative array: the status line is stored under the key
 * `http_code`, every following `Name: value` line under its field name (as
 * received, case preserved). If a field name occurs more than once within a
 * block, the last value wins. Lines without a colon are ignored.
 *
 * Based on https://stackoverflow.com/a/18682872
 *
 * @param string $header The cURL response header.
 * @return array A list of associative arrays of response headers, one entry
 * per header block in order of appearance.
 */
function parseResponseHeader($header) {

	$headers = array();
	$requests = explode("\r\n\r\n", trim($header));

	foreach ($requests as $request) {

		$fields = array();

		foreach (explode("\r\n", $request) as $i => $line) {

			if($i === 0) {
				// First line of a block is the status line
				$fields['http_code'] = $line;
				continue;
			}

			// Split at the first colon only: values such as dates or URLs
			// legitimately contain further colons.
			$parts = explode(':', $line, 2);

			if(count($parts) !== 2) {
				continue; // Not a "Name: value" line
			}

			$fields[$parts[0]] = trim($parts[1]);

		}

		$headers[] = $fields;

	}

	return $headers;

}
369
/**
 * Guesses the MIME type of a file from the extension found in its URL/path.
 *
 * The extension-to-type table is built once per process: it starts with a few
 * built-in image types and is extended (or overridden) by the entries of
 * `/etc/mime.types` if that file is readable.
 *
 * _Remarks_:
 *
 * * The built-in functions `mime_content_type` and `fileinfo` require fetching
 * remote contents.
 * * A caller can hint for a MIME type by appending `#.ext` to the URL (i.e. `#.image`).
 * * A query string is removed before the extension is determined; the anchor
 * is kept so the hint above keeps working.
 *
 * Based on https://stackoverflow.com/a/1147952
 *
 * @param string $url The URL or path to the file.
 * @return string The MIME type of the file, or `application/octet-stream` if
 * the extension is unknown.
 */
function getMimeType($url) {
	static $mime = null;

	if ($mime === null) {
		// Default values, overridden by /etc/mime.types when present
		$mime = array(
			'jpg' => 'image/jpeg',
			'gif' => 'image/gif',
			'png' => 'image/png',
			'image' => 'image/*'
		);
		// '@' is used to mute open_basedir warning, see issue #818
		if (@is_readable('/etc/mime.types')) {
			$handle = fopen('/etc/mime.types', 'r');
			for ($row = fgets($handle); $row !== false; $row = fgets($handle)) {
				// Drop comments and surrounding whitespace
				$row = trim(preg_replace('/#.*/', '', $row));
				if ($row) {
					$tokens = preg_split('/\s+/', $row);
					// A type without any extension carries no mapping
					if (count($tokens) != 1) {
						$type = array_shift($tokens);
						foreach ($tokens as $extension) {
							$mime[$extension] = $type;
						}
					}
				}
			}
			fclose($handle);
		}
	}

	// Cut off the query string, but keep the anchor (it may carry a hint)
	$queryPos = strpos($url, '?');
	if ($queryPos !== false) {
		$stripped = substr($url, 0, $queryPos);
		$anchorPos = strpos($url, '#');
		if ($anchorPos !== false) {
			$stripped .= substr($url, $anchorPos);
		}
		$url = $stripped;
	}

	$ext = strtolower(pathinfo($url, PATHINFO_EXTENSION));

	return empty($mime[$ext]) ? 'application/octet-stream' : $mime[$ext];
}
429