1<?php
2/* Copyright (C) 2008-2020	Laurent Destailleur			<eldy@users.sourceforge.net>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <https://www.gnu.org/licenses/>.
16 * or see https://www.gnu.org/
17 */
18
19/**
20 *	\file			htdocs/core/lib/geturl.lib.php
21 *	\brief			This file contains functions dedicated to get URLs.
22 */
23
/**
 * Function to get a content from an URL (use proxy if proxy defined).
 * Support Dolibarr setup for timeout and proxy.
 * Enhancement of CURL to add an anti SSRF protection:
 * - you can set MAIN_SECURITY_ANTI_SSRF_SERVER_IP to set static ip of server (comma separated list)
 * - common local lookup ips like 127.*.*.* are automatically added
 *
 * Redirections are never delegated to curl: each hop is validated by this function before being requested,
 * with a maximum of 5 hops. When a target is refused, no request is sent for it and the returned array contains
 * 'http_code'=>400 and an explanation into 'content'.
 *
 * @param	string	  $url 				    URL to call. Must be an absolute URL containing a hostname.
 * @param	string    $postorget		    'POST', 'GET', 'HEAD', 'PUT', 'PUTALREADYFORMATED', 'POSTALREADYFORMATED', 'DELETE'
 * @param	string    $param			    Parameters of URL (x=value1&y=value2) or may be a formated content with $postorget='PUTALREADYFORMATED'
 * @param	integer   $followlocation		0=Do not follow, 1=Follow location.
 * @param	string[]  $addheaders			Array of string to add into header. Example: ('Accept: application/xrds+xml', ....)
 * @param	string[]  $allowedschemes		List of schemes that are allowed ('http' + 'https' only by default)
 * @param	int		  $localurl				0=Only external URL are possible, 1=Only local URL, 2=Both external and local URL are allowed.
 * @return	array						    Returns an associative array containing the response from the server array('content'=>response, 'curl_error_no'=>errno, 'curl_error_msg'=>errmsg...)
 */
function getURLContent($url, $postorget = 'GET', $param = '', $followlocation = 1, $addheaders = array(), $allowedschemes = array('http', 'https'), $localurl = 0)
{
	global $conf;

	// Proxy setup read from the configuration (0 when the option is not defined)
	$USE_PROXY = empty($conf->global->MAIN_PROXY_USE) ? 0 : $conf->global->MAIN_PROXY_USE;
	$PROXY_HOST = empty($conf->global->MAIN_PROXY_HOST) ? 0 : $conf->global->MAIN_PROXY_HOST;
	$PROXY_PORT = empty($conf->global->MAIN_PROXY_PORT) ? 0 : $conf->global->MAIN_PROXY_PORT;
	$PROXY_USER = empty($conf->global->MAIN_PROXY_USER) ? 0 : $conf->global->MAIN_PROXY_USER;
	$PROXY_PASS = empty($conf->global->MAIN_PROXY_PASS) ? 0 : $conf->global->MAIN_PROXY_PASS;

	// $param is documented as a string but an array is tolerated for 'PUT', so we flatten it for the log
	dol_syslog("getURLContent postorget=".$postorget." URL=".$url." param=".(is_array($param) ? http_build_query($param) : $param));

	// Setting the curl parameters.
	$ch = curl_init();

	curl_setopt($ch, CURLOPT_VERBOSE, 1);
	curl_setopt($ch, CURLOPT_USERAGENT, 'Dolibarr geturl function');

	// We use @ here because this may return warning if safe mode is on or open_basedir is on (following location is forbidden when safe mode is on).
	// We force value to false so we will manage redirection ourself later.
	@curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);

	if (is_array($addheaders) && count($addheaders)) {
		curl_setopt($ch, CURLOPT_HTTPHEADER, $addheaders);
	}
	curl_setopt($ch, CURLINFO_HEADER_OUT, true); // To be able to retrieve request header and log it

	// By default use the TLS version decided by PHP.
	// You can force, if supported, a version like TLSv1 or TLSv1.2 (for example 6 for TLS 1.2)
	if (!empty($conf->global->MAIN_CURL_SSLVERSION)) {
		curl_setopt($ch, CURLOPT_SSLVERSION, $conf->global->MAIN_CURL_SSLVERSION);
	}

	// Turning off the server and peer verification (TrustManager Concept).
	// NOTE(review): this disables the validation of the TLS certificate for every call. Kept as is for backward compatibility.
	curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
	curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);

	// Restrict use to some protocols only
	$protocols = 0;
	if (is_array($allowedschemes)) {
		foreach ($allowedschemes as $allowedscheme) {
			if ($allowedscheme == 'http') {
				$protocols |= CURLPROTO_HTTP;
			}
			if ($allowedscheme == 'https') {
				$protocols |= CURLPROTO_HTTPS;
			}
		}
		curl_setopt($ch, CURLOPT_PROTOCOLS, $protocols);
		curl_setopt($ch, CURLOPT_REDIR_PROTOCOLS, $protocols);
	}

	curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, empty($conf->global->MAIN_USE_CONNECT_TIMEOUT) ? 5 : $conf->global->MAIN_USE_CONNECT_TIMEOUT);
	curl_setopt($ch, CURLOPT_TIMEOUT, empty($conf->global->MAIN_USE_RESPONSE_TIMEOUT) ? 30 : $conf->global->MAIN_USE_RESPONSE_TIMEOUT);

	curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); // We want response
	if ($postorget == 'POST') {
		curl_setopt($ch, CURLOPT_POST, 1); // POST
		curl_setopt($ch, CURLOPT_POSTFIELDS, $param); // Setting param x=a&y=z as POST fields
	} elseif ($postorget == 'POSTALREADYFORMATED') {
		curl_setopt($ch, CURLOPT_CUSTOMREQUEST, 'POST'); // HTTP request is 'POST' but param string is taken as it is
		curl_setopt($ch, CURLOPT_POSTFIELDS, $param); // param = content of post, like a xml string
	} elseif ($postorget == 'PUT') {
		$array_param = null;
		curl_setopt($ch, CURLOPT_CUSTOMREQUEST, 'PUT'); // HTTP request is 'PUT'
		if (!is_array($param)) {
			parse_str($param, $array_param);
		} else {
			dol_syslog("parameter param must be a string", LOG_WARNING);
			$array_param = $param;
		}
		curl_setopt($ch, CURLOPT_POSTFIELDS, http_build_query($array_param)); // Setting param x=a&y=z as PUT fields
	} elseif ($postorget == 'PUTALREADYFORMATED') {
		curl_setopt($ch, CURLOPT_CUSTOMREQUEST, 'PUT'); // HTTP request is 'PUT'
		curl_setopt($ch, CURLOPT_POSTFIELDS, $param); // param = content of put, like a xml string
	} elseif ($postorget == 'HEAD') {
		curl_setopt($ch, CURLOPT_CUSTOMREQUEST, 'HEAD'); // HTTP request is 'HEAD'
		curl_setopt($ch, CURLOPT_NOBODY, true);
	} elseif ($postorget == 'DELETE') {
		curl_setopt($ch, CURLOPT_CUSTOMREQUEST, 'DELETE'); // HTTP request is 'DELETE'
	} else {
		curl_setopt($ch, CURLOPT_POST, 0); // GET
	}

	// If a proxy is enabled into setup
	if ($USE_PROXY) {
		// The proxy password must never be written into the log, so it is masked
		dol_syslog("getURLContent set proxy to ".$PROXY_HOST.":".$PROXY_PORT." - ".$PROXY_USER.":".($PROXY_PASS ? '*****' : ''));
		curl_setopt($ch, CURLOPT_PROXY, $PROXY_HOST.":".$PROXY_PORT);
		if ($PROXY_USER) {
			curl_setopt($ch, CURLOPT_PROXYUSERPWD, $PROXY_USER.":".$PROXY_PASS);
		}
	}

	// IPs declared into setup as being the server itself (spaces around the commas are tolerated)
	$serveriplist = array();
	if (!empty($conf->global->MAIN_SECURITY_ANTI_SSRF_SERVER_IP)) {
		$serveriplist = array_map('trim', explode(',', $conf->global->MAIN_SECURITY_ANTI_SSRF_SERVER_IP));
	}

	$newUrl = $url;
	$maxRedirection = 5;
	$info = array();
	$response = '';

	do {
		if ($maxRedirection < 1) {
			break;
		}

		// Forget the body of the previous hop: if the new target is refused below, the error message set
		// into $info['content'] must not be overwritten by the body of the redirect response.
		$response = '';

		curl_setopt($ch, CURLOPT_URL, $newUrl);

		// Parse $newUrl. A URL without hostname can't be validated (curl would guess a scheme and a host by itself), so it is refused.
		$newUrlArray = parse_url($newUrl);
		if (!is_array($newUrlArray) || !isset($newUrlArray['host']) || $newUrlArray['host'] === '') {
			$info['http_code'] = 400;
			$info['content'] = 'Error bad URL (no hostname found). Must be an absolute URL.';
			break;
		}
		$hosttocheck = str_replace(array('[', ']'), '', $newUrlArray['host']); // Remove brackets of IPv6
		// Host names are case insensitive and may end with a dot, so we compare reserved names on a normalized value
		$hostnormalized = strtolower(rtrim($hosttocheck, '.'));

		// Deny some reserved host names
		if (in_array($hostnormalized, array('metadata.google.internal'), true)) {
			$info['http_code'] = 400;
			$info['content'] = 'Error bad hostname '.$hosttocheck.' (Used by Google metadata). This value for hostname is not allowed.';
			break;
		}

		// Clean host name $hosttocheck to convert it into an IP $iptocheck
		if (in_array($hostnormalized, array('localhost', 'localhost.domain'), true)) {
			$iptocheck = '127.0.0.1';
		} elseif (in_array($hostnormalized, array('ip6-localhost', 'ip6-loopback'), true)) {
			$iptocheck = '::1';
		} else {
			// Resolve $hosttocheck to get the IP $iptocheck (gethostbyname returns its input unchanged on failure)
			if (function_exists('gethostbyname')) {
				$iptocheck = gethostbyname($hosttocheck);
			} else {
				$iptocheck = $hosttocheck;
			}
			// TODO Resolve ip v6
		}

		// Check $iptocheck is an IP (v4 or v6), if not clear value (the checks on IP below are then not possible).
		if (!filter_var($iptocheck, FILTER_VALIDATE_IP, FILTER_FLAG_IPV4 | FILTER_FLAG_IPV6)) {
			$iptocheck = '0';
		}

		if ($iptocheck) {
			if ($localurl == 0) {	// Only external url allowed (dangerous, may allow to get malware)
				if (!filter_var($iptocheck, FILTER_VALIDATE_IP, FILTER_FLAG_NO_PRIV_RANGE | FILTER_FLAG_NO_RES_RANGE)) {
					// Deny ips like 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16, 0.0.0.0/8, 169.254.0.0/16, 127.0.0.0/8 and 240.0.0.0/4, ::1/128, ::/128, ::ffff:0:0/96, fe80::/10...
					$info['http_code'] = 400;
					$info['content'] = 'Error bad hostname IP (private or reserved range). Must be an external URL.';
					break;
				}
				if (!empty($_SERVER["SERVER_ADDR"]) && $iptocheck === $_SERVER["SERVER_ADDR"]) {
					$info['http_code'] = 400;
					$info['content'] = 'Error bad hostname IP (IP is a local IP). Must be an external URL.';
					break;
				}
				if (in_array($iptocheck, $serveriplist, true)) {
					$info['http_code'] = 400;
					$info['content'] = 'Error bad hostname IP (IP is a local IP defined into MAIN_SECURITY_SERVER_IP). Must be an external URL.';
					break;
				}
			}
			if ($localurl == 1) {	// Only local url allowed (dangerous, may allow to get metadata on server or make internal port scanning)
				// Deny ips NOT like 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16, 0.0.0.0/8, 169.254.0.0/16, 127.0.0.0/8 and 240.0.0.0/4, ::1/128, ::/128, ::ffff:0:0/96, fe80::/10...
				if (filter_var($iptocheck, FILTER_VALIDATE_IP, FILTER_FLAG_NO_PRIV_RANGE | FILTER_FLAG_NO_RES_RANGE)) {
					$info['http_code'] = 400;
					$info['content'] = 'Error bad hostname '.$iptocheck.'. Must be a local URL.';
					break;
				}
				if (!empty($serveriplist) && !in_array($iptocheck, $serveriplist, true)) {
					$info['http_code'] = 400;
					$info['content'] = 'Error bad hostname IP (IP is not a local IP defined into list MAIN_SECURITY_SERVER_IP). Must be a local URL in allowed list.';
					break;
				}
			}

			// Common check (local and external)
			if (in_array($iptocheck, array('100.100.100.200'), true)) {
				$info['http_code'] = 400;
				$info['content'] = 'Error bad hostname IP (Used by Alibaba metadata). Must be an external URL.';
				break;
			}

			// Set CURLOPT_CONNECT_TO so curl will not try another resolution that may give a different result. Possible only on PHP v7+
			if (defined('CURLOPT_CONNECT_TO')) {
				// Format is HOST:PORT:CONNECT-TO-HOST:CONNECT-TO-PORT. When the URL has no explicit port, both ports must be
				// left empty (empty = any port / same port): a port formatted as 0 never matches the request, so the
				// validated IP would not be enforced. An IPv6 target must be surrounded by brackets.
				$porttouse = empty($newUrlArray['port']) ? '' : (int) $newUrlArray['port'];
				$iptoconnect = filter_var($iptocheck, FILTER_VALIDATE_IP, FILTER_FLAG_IPV6) ? '['.$iptocheck.']' : $iptocheck;
				$connect_to = array($newUrlArray['host'].':'.$porttouse.':'.$iptoconnect.':'.$porttouse);
				curl_setopt($ch, CURLOPT_CONNECT_TO, $connect_to);
			}
		}

		// Getting response from server
		$response = curl_exec($ch);

		$info = curl_getinfo($ch); // Reading of request must be done after sending request
		$http_code = $info['http_code'];

		// We follow a redirection only if the server gave a target. If not, the 3xx answer is returned as it is.
		if ($followlocation && in_array((int) $http_code, array(301, 302, 303, 307), true) && !empty($info['redirect_url'])) {
			$newUrl = $info['redirect_url'];
			$maxRedirection--;
			// TODO Use $info['local_ip'] and $info['primary_ip'] ?
			continue;
		}

		$http_code = 0; // No more hop to do, this ends the loop
	} while ($http_code);

	$request = curl_getinfo($ch, CURLINFO_HEADER_OUT); // Reading of request must be done after sending request

	dol_syslog("getURLContent request=".$request);
	// The response may contain binary data, so we log only its size (curl_exec returns false on failure)
	dol_syslog("getURLContent response size=".strlen((string) $response));

	$rep = array();
	if (curl_errno($ch)) {
		// The transfer failed: return what we got with the curl error code and message
		$rep['content'] = $response;
		$rep['curl_error_no'] = curl_errno($ch);
		$rep['curl_error_msg'] = curl_error($ch);

		dol_syslog("getURLContent response array is ".join(',', $rep));
	} else {
		// Start from the information on the last transfer (or from the denial set by the checks above)
		$rep = $info;
		dol_syslog("getURLContent http_code=".$rep['http_code']);

		// Add more keys to $rep
		if ($response) {
			$rep['content'] = $response;
		}
		$rep['curl_error_no'] = '';
		$rep['curl_error_msg'] = '';
	}

	// Closing the curl handle
	curl_close($ch);

	return $rep;
}
287
288
/**
 * Function get second level domain name.
 * For example: https://www.abc.mydomain.com/dir/page.html return 'mydomain'
 * The scheme, the port, the path, the query string and the fragment of the URL are ignored.
 *
 * @param	string	  $url 				    Full URL.
 * @param	int	 	  $mode					0=return 'mydomain', 1=return 'mydomain.com', 2=return 'abc.mydomain.com'
 * @return	string						    Returns domain name
 */
function getDomainFromURL($url, $mode = 0)
{
	$tmpdomain = preg_replace('/^https?:\/\//i', '', (string) $url); // Remove http(s)://
	$tmpdomain = preg_replace('/[\/?#].*$/s', '', $tmpdomain); // Remove part after domain (path, but also query string or fragment when there is no path)
	// Remove the port. The host must be a bracketed IPv6 or contain no other ':' so a bare IPv6 like '::1' is left untouched.
	$tmpdomain = preg_replace('/^(\[[^\]]*\]|[^:]*):\d+$/', '\1', $tmpdomain);
	if ($mode == 2) {
		$tmpdomain = preg_replace('/^.*\.([^\.]+)\.([^\.]+)\.([^\.]+)$/', '\1.\2.\3', $tmpdomain); // Remove part 'www.' before 'abc.mydomain.com'
	} else {
		$tmpdomain = preg_replace('/^.*\.([^\.]+)\.([^\.]+)$/', '\1.\2', $tmpdomain); // Remove part 'www.abc.' before 'mydomain.com'
	}
	if (empty($mode)) {
		$tmpdomain = preg_replace('/\.[^\.]+$/', '', $tmpdomain); // Remove first level domain (.com, .net, ...)
	}

	return $tmpdomain;
}
312
/**
 * Function root url from a long url
 * For example: https://www.abc.mydomain.com/dir/page.html return 'https://www.abc.mydomain.com'
 * For example: http://www.abc.mydomain.com/ return 'http://www.abc.mydomain.com'
 * The scheme (when it is http or https) and the port are kept. The path, the query string and the fragment are removed.
 *
 * @param	string	  $url 				    Full URL.
 * @return	string						    Returns root url
 */
function getRootURLFromURL($url)
{
	$reg = array();
	// Group 1 is the optional http(s):// prefix, group 2 is everything up to the first '/', '?' or '#'
	if (preg_match('/^(https?:\/\/)?([^\/?#]*)/i', (string) $url, $reg)) {
		return $reg[1].$reg[2];
	}

	return '';
}
334
/**
 * Function to remove comments into HTML content.
 * Every sequence starting with <!-- and ending with the next --> is removed, even if it is empty,
 * spread over several lines or if it contains the character '-'.
 *
 * @param	string	  $content 				Text content
 * @return	string						    Returns text without HTML comments
 */
function removeHtmlComment($content)
{
	// Non greedy match so two comments on the same line are removed separately, modifier s so a comment can contain new lines
	$cleaned = preg_replace('/<!--.*?-->/s', '', (string) $content);

	// preg_replace returns null on PCRE error (for example backtrack limit reached): keep the content instead of losing it
	return ($cleaned === null) ? $content : $cleaned;
}
346