<?php
/**
 * This file is part of RSS-Bridge, a PHP project capable of generating RSS and
 * Atom feeds for websites that don't have one.
 *
 * For the full license information, please view the UNLICENSE file distributed
 * with this source code.
 *
 * @package Core
 * @license http://unlicense.org/ UNLICENSE
 * @link    https://github.com/rss-bridge/rss-bridge
 */

/**
 * Gets contents from the Internet.
 *
 * **Content caching** (disabled in debug mode)
 *
 * A copy of the received content is stored in a local cache folder `server/` at
 * {@see PATH_CACHE}. The `If-Modified-Since` header is added to the request, if
 * the provided URL has been cached before.
 *
 * When the server responds with `304 Not Modified`, the cached data is returned.
 * This will improve response times and reduce bandwidth for servers that support
 * the `If-Modified-Since` header.
 *
 * Cached files are forcefully removed after 24 hours.
 *
 * **Transport**
 *
 * cURL is used by default. When running from the CLI without `curl.cainfo`
 * configured, `file_get_contents` is used instead; in that mode `$opts`, the
 * proxy setting and the `If-Modified-Since` handling are not applied, and any
 * failure is reported as status 500.
 *
 * @link https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/If-Modified-Since
 * If-Modified-Since
 *
 * @param string $url The URL.
 * @param array $header (optional) A list of cURL header.
 * For more information follow the links below.
 * * https://php.net/manual/en/function.curl-setopt.php
 * * https://curl.haxx.se/libcurl/c/CURLOPT_HTTPHEADER.html
 * @param array $opts (optional) A list of cURL options as associative array in
 * the format `$opts[$option] = $value;`, where `$option` is any `CURLOPT_XXX`
 * option and `$value` the corresponding value.
 *
 * For more information see http://php.net/manual/en/function.curl-setopt.php
 * @param bool $returnHeader Returns an array of two elements 'header' and
 * 'content' if enabled.
 * @return string|array The contents.
 */
function getContents($url, $header = array(), $opts = array(), $returnHeader = false){
    Debug::log('Reading contents from "' . $url . '"');

    // Initialize cache
    $cacheFac = new CacheFactory();
    $cacheFac->setWorkingDir(PATH_LIB_CACHES);
    $cache = $cacheFac->create(Configuration::getConfig('cache', 'type'));
    $cache->setScope('server');
    $cache->purgeCache(86400); // 24 hours (forced)

    $params = array($url);
    $cache->setKey($params);

    $retVal = array(
        'header' => '',
        'content' => '',
    );

    // Use file_get_contents if in CLI mode with no root certificates defined
    if(php_sapi_name() === 'cli' && empty(ini_get('curl.cainfo'))) {

        $httpHeaders = '';

        foreach ($header as $headerL) {
            $httpHeaders .= $headerL . "\r\n";
        }

        $ctx = stream_context_create(array(
            'http' => array(
                'header' => $httpHeaders
            )
        ));

        $data = @file_get_contents($url, false, $ctx);

        if($data === false) {
            $errorCode = 500;
        } else {
            $errorCode = 200;
            // $http_response_header is populated by file_get_contents() in this scope
            $retVal['header'] = implode("\r\n", $http_response_header);
        }

        // No cURL state in this mode; the values below keep the shared code path working
        $curlError = '';
        $curlErrno = '';
        $headerSize = 0; // $data holds the body only, nothing to strip
        $finalHeader = array();

    } else {

        $ch = curl_init($url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);

        if(is_array($header) && count($header) !== 0) {

            Debug::log('Setting headers: ' . json_encode($header));
            curl_setopt($ch, CURLOPT_HTTPHEADER, $header);

        }

        curl_setopt($ch, CURLOPT_USERAGENT, ini_get('user_agent'));
        curl_setopt($ch, CURLOPT_ENCODING, '');
        curl_setopt($ch, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);

        // Caller options are applied after the defaults above, so they can override them
        if(is_array($opts) && count($opts) !== 0) {

            Debug::log('Setting options: ' . json_encode($opts));

            foreach($opts as $key => $value) {
                curl_setopt($ch, $key, $value);
            }

        }

        if(defined('PROXY_URL') && !defined('NOPROXY')) {

            Debug::log('Setting proxy url: ' . PROXY_URL);
            curl_setopt($ch, CURLOPT_PROXY, PROXY_URL);

        }

        // We always want the response header as part of the data!
        curl_setopt($ch, CURLOPT_HEADER, true);

        // Build "If-Modified-Since" header (skipped in debug mode or if cache file doesn't exist)
        $time = Debug::isEnabled() ? false : $cache->getTime();
        if($time) {
            Debug::log('Adding If-Modified-Since');
            curl_setopt($ch, CURLOPT_TIMEVALUE, $time);
            curl_setopt($ch, CURLOPT_TIMECONDITION, CURL_TIMECOND_IFMODSINCE);
        }

        // Enables logging for the outgoing header
        curl_setopt($ch, CURLINFO_HEADER_OUT, true);

        $data = curl_exec($ch);
        $errorCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);

        $curlError = curl_error($ch);
        $curlErrno = curl_errno($ch);
        $curlInfo = curl_getinfo($ch);

        Debug::log('Outgoing header: ' . json_encode($curlInfo));

        if($data === false) {
            Debug::log('Cant\'t download ' . $url . ' cUrl error: ' . $curlError . ' (' . $curlErrno . ')');
        }

        // The response header is the first $headerSize bytes of the transfer.
        // Kept in its own variable so the request headers in $header stay intact,
        // and guarded so substr() is never applied to a failed transfer.
        $headerSize = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
        $responseHeader = ($data === false) ? '' : (string)substr($data, 0, $headerSize);
        $retVal['header'] = $responseHeader;

        Debug::log('Response header: ' . $responseHeader);

        // With redirects there is one header block per hop; the last one is the final response
        $headers = parseResponseHeader($responseHeader);
        $finalHeader = end($headers);

        curl_close($ch);

    }

    // Header names are case-insensitive (HTTP/2 sends them in lowercase), so
    // normalize once and only use lowercase keys below.
    $finalHeader = is_array($finalHeader)
        ? array_change_key_case($finalHeader, CASE_LOWER)
        : array();

    switch($errorCode) {
        case 200: // Contents received
            Debug::log('New contents received');
            $data = substr($data, $headerSize);
            // Disable caching if the server responds with "Cache-Control: no-cache"
            // or "Cache-Control: no-store"
            if(array_key_exists('cache-control', $finalHeader)) {
                Debug::log('Server responded with "Cache-Control" header');
                $directives = explode(',', $finalHeader['cache-control']);
                $directives = array_map('trim', $directives);
                if(in_array('no-cache', $directives)
                || in_array('no-store', $directives)) { // Skip caching
                    Debug::log('Skip server side caching');
                    $retVal['content'] = $data;
                    break;
                }
            }
            Debug::log('Store response to cache');
            $cache->saveData($data);
            $retVal['content'] = $data;
            break;
        case 304: // Not modified, use cached data
            Debug::log('Contents not modified on host, returning cached data');
            $retVal['content'] = $cache->loadData();
            break;
        default:
            if(array_key_exists('server', $finalHeader)
            && strpos($finalHeader['server'], 'cloudflare') !== false) {
                returnServerError(<<< EOD
The server responded with a Cloudflare challenge, which is not supported by RSS-Bridge!
If this error persists longer than a week, please consider opening an issue on GitHub!
EOD
                );
            }

            $lastError = error_get_last();
            if($lastError !== null) {
                $lastError = $lastError['message'];
            }
            returnError(<<<EOD
Unexpected response from upstream.
cUrl error: $curlError ($curlErrno)
PHP error: $lastError
EOD
            , $errorCode);
    }

    return ($returnHeader === true) ? $retVal : $retVal['content'];
}

/**
 * Gets contents from the Internet as simplehtmldom object.
 *
 * @param string $url The URL.
 * @param array $header (optional) A list of cURL header.
 * For more information follow the links below.
 * * https://php.net/manual/en/function.curl-setopt.php
 * * https://curl.haxx.se/libcurl/c/CURLOPT_HTTPHEADER.html
 * @param array $opts (optional) A list of cURL options as associative array in
 * the format `$opts[$option] = $value;`, where `$option` is any `CURLOPT_XXX`
 * option and `$value` the corresponding value.
 *
 * For more information see http://php.net/manual/en/function.curl-setopt.php
 * @param bool $lowercase Force all selectors to lowercase.
 * @param bool $forceTagsClosed Forcefully close tags in malformed HTML.
 *
 * _Remarks_: Forcefully closing tags is great for malformed HTML, but it can
 * lead to parsing errors.
 * @param string $target_charset Defines the target charset.
 * @param bool $stripRN Replace all occurrences of `"\r"` and `"\n"` by `" "`.
 * @param string $defaultBRText Specifies the replacement text for `<br>` tags
 * when returning plaintext.
 * @param string $defaultSpanText Specifies the replacement text for `<span />`
 * tags when returning plaintext.
 * @return false|simple_html_dom Contents as simplehtmldom object.
 */
function getSimpleHTMLDOM($url,
    $header = array(),
    $opts = array(),
    $lowercase = true,
    $forceTagsClosed = true,
    $target_charset = DEFAULT_TARGET_CHARSET,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT){

    $content = getContents($url, $header, $opts);

    return str_get_html($content,
        $lowercase,
        $forceTagsClosed,
        $target_charset,
        $stripRN,
        $defaultBRText,
        $defaultSpanText);
}

/**
 * Gets contents from the Internet as simplehtmldom object. Contents are cached
 * and re-used for subsequent calls until the cache duration elapsed.
 *
 * _Notice_: Cached contents are forcefully removed after 24 hours (86400 seconds).
 *
 * @param string $url The URL.
 * @param int $duration Cache duration in seconds.
 * @param array $header (optional) A list of cURL header.
 * For more information follow the links below.
 * * https://php.net/manual/en/function.curl-setopt.php
 * * https://curl.haxx.se/libcurl/c/CURLOPT_HTTPHEADER.html
 * @param array $opts (optional) A list of cURL options as associative array in
 * the format `$opts[$option] = $value;`, where `$option` is any `CURLOPT_XXX`
 * option and `$value` the corresponding value.
 *
 * For more information see http://php.net/manual/en/function.curl-setopt.php
 * @param bool $lowercase Force all selectors to lowercase.
 * @param bool $forceTagsClosed Forcefully close tags in malformed HTML.
 *
 * _Remarks_: Forcefully closing tags is great for malformed HTML, but it can
 * lead to parsing errors.
 * @param string $target_charset Defines the target charset.
 * @param bool $stripRN Replace all occurrences of `"\r"` and `"\n"` by `" "`.
 * @param string $defaultBRText Specifies the replacement text for `<br>` tags
 * when returning plaintext.
 * @param string $defaultSpanText Specifies the replacement text for `<span />`
 * tags when returning plaintext.
 * @return false|simple_html_dom Contents as simplehtmldom object.
 */
function getSimpleHTMLDOMCached($url,
    $duration = 86400,
    $header = array(),
    $opts = array(),
    $lowercase = true,
    $forceTagsClosed = true,
    $target_charset = DEFAULT_TARGET_CHARSET,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT){

    Debug::log('Caching url ' . $url . ', duration ' . $duration);

    // Initialize cache
    $cacheFac = new CacheFactory();
    $cacheFac->setWorkingDir(PATH_LIB_CACHES);
    $cache = $cacheFac->create(Configuration::getConfig('cache', 'type'));
    $cache->setScope('pages');
    $cache->purgeCache(86400); // 24 hours (forced)

    $params = array($url);
    $cache->setKey($params);

    // Determine if cached file is within duration (the cache is bypassed in debug mode)
    $time = $cache->getTime();
    if($time !== false
    && (time() - $duration < $time)
    && !Debug::isEnabled()) { // Contents within duration
        $content = $cache->loadData();
    } else { // Content not within duration
        $content = getContents($url, $header, $opts);
        if($content !== false) {
            $cache->saveData($content);
        }
    }

    return str_get_html($content,
        $lowercase,
        $forceTagsClosed,
        $target_charset,
        $stripRN,
        $defaultBRText,
        $defaultSpanText);
}

/**
 * Parses the cURL response header into an associative array
 *
 * The input may contain several header blocks (one per redirect hop), separated
 * by an empty line. Each block becomes one array whose `http_code` element holds
 * the status line and whose remaining elements map header names (as received) to
 * their trimmed values.
 *
 * Only the first colon of a line separates name and value, so values that contain
 * colons themselves (URLs, timestamps) are preserved. Lines without a colon are
 * ignored.
 *
 * Based on https://stackoverflow.com/a/18682872
 *
 * @param string $header The cURL response header.
 * @return array An associative array of response headers.
 */
function parseResponseHeader($header) {

    $headers = array();
    $requests = explode("\r\n\r\n", trim($header));

    foreach ($requests as $request) {

        $fields = array();

        foreach (explode("\r\n", $request) as $i => $line) {

            if($i === 0) {
                // First line of every block is the status line
                $fields['http_code'] = $line;
                continue;
            }

            // Limit of 2: split at the first colon only
            $parts = explode(':', $line, 2);

            if(count($parts) !== 2) {
                continue; // Not a "name: value" line
            }

            $fields[$parts[0]] = trim($parts[1]);

        }

        $headers[] = $fields;

    }

    return $headers;

}

/**
 * Determines the MIME type from a URL/Path file extension.
 *
 * _Remarks_:
 *
 * * The built-in functions `mime_content_type` and `fileinfo` require fetching
 * remote contents.
 * * A caller can hint for a MIME type by appending `#.ext` to the URL (i.e. `#.image`).
 * * The query string is ignored. A fragment is only taken into account if it
 * contains a `.` (i.e. if it can carry such a hint).
 *
 * Based on https://stackoverflow.com/a/1147952
 *
 * @param string $url The URL or path to the file.
 * @return string The MIME type of the file, `application/octet-stream` if the
 * extension is unknown.
 */
function getMimeType($url) {
    // Extension => MIME type map, built once per process
    static $mime = null;

    if (is_null($mime)) {
        // Default values, overridden by /etc/mime.types when present
        $mime = array(
            'jpg' => 'image/jpeg',
            'gif' => 'image/gif',
            'png' => 'image/png',
            'image' => 'image/*'
        );
        // '@' is used to mute open_basedir warning, see issue #818
        if (@is_readable('/etc/mime.types')) {
            $file = fopen('/etc/mime.types', 'r');
            if ($file !== false) {
                while(($line = fgets($file)) !== false) {
                    // Drop comments and surrounding whitespace
                    $line = trim(preg_replace('/#.*/', '', $line));
                    if(!$line) {
                        continue;
                    }
                    // Format: "<type> <ext> [<ext> ...]"; skip types without extensions
                    $parts = preg_split('/\s+/', $line);
                    if(count($parts) < 2) {
                        continue;
                    }
                    $type = array_shift($parts);
                    foreach($parts as $part) {
                        $mime[$part] = $type;
                    }
                }
                fclose($file);
            }
        }
    }

    // Separate the fragment first, it may carry the "#.ext" hint
    $fragment = '';
    $fragmentPos = strpos($url, '#');
    if ($fragmentPos !== false) {
        $fragment = substr($url, $fragmentPos);
        $url = substr($url, 0, $fragmentPos);
    }

    // The query string never contributes to the extension
    $queryPos = strpos($url, '?');
    if ($queryPos !== false) {
        $url = substr($url, 0, $queryPos);
    }

    // Re-append the fragment only if it can act as a hint; a plain anchor
    // (i.e. "#section") would otherwise end up as part of the extension
    if (strpos($fragment, '.') !== false) {
        $url .= $fragment;
    }

    $ext = strtolower(pathinfo($url, PATHINFO_EXTENSION));
    if (!empty($mime[$ext])) {
        return $mime[$ext];
    }

    return 'application/octet-stream';
}