1"""Utilities using NDG HTTPS Client, including a main module that can be used to
2fetch from a URL.
3"""
4__author__ = "R B Wilkinson"
5__date__ = "09/12/11"
6__copyright__ = "(C) 2011 Science and Technology Facilities Council"
7__license__ = "BSD - see LICENSE file in top-level directory"
8__contact__ = "Philip.Kershaw@stfc.ac.uk"
9__revision__ = '$Id$'
10
11import logging
12from optparse import OptionParser
13import os
14import sys
15
16if sys.version_info[0] > 2:
17    import http.cookiejar as cookiejar_
18    import http.client as http_client_
19    from urllib.request import Request as Request_
20    from urllib.request import HTTPHandler as HTTPHandler_
21    from urllib.request import HTTPCookieProcessor as HTTPCookieProcessor_
22    from urllib.request import HTTPBasicAuthHandler as HTTPBasicAuthHandler_
23    from urllib.request import HTTPPasswordMgrWithDefaultRealm as \
24                                            HTTPPasswordMgrWithDefaultRealm_
25    from urllib.request import ProxyHandler as ProxyHandler_
26    from urllib.error import HTTPError as HTTPError_
27    import urllib.parse as urlparse_
28else:
29    import cookielib as cookiejar_
30    import httplib as http_client_
31    from urllib2 import Request as Request_
32    from urllib2 import HTTPHandler as HTTPHandler_
33    from urllib2 import HTTPCookieProcessor as HTTPCookieProcessor_
34    from urllib2 import HTTPBasicAuthHandler as HTTPBasicAuthHandler_
35    from urllib2 import HTTPPasswordMgrWithDefaultRealm as \
36                                            HTTPPasswordMgrWithDefaultRealm_
37    from urllib2 import ProxyHandler as ProxyHandler_
38    from urllib2 import HTTPError as HTTPError_
39    import urlparse as urlparse_
40
41from ndg.httpsclient.urllib2_build_opener import build_opener
42from ndg.httpsclient.https import HTTPSContextHandler
43from ndg.httpsclient import ssl_context_util
44
# Module-level logger named after this module.  This module only installs a
# logging handler itself when it is run as a script.
log = logging.getLogger(__name__)
46
class AccumulatingHTTPCookieProcessor(HTTPCookieProcessor_):
    """Cookie processor that appends the cookie jar's cookies to any Cookie
    header a request already carries (instead of replacing the existing ones
    as HTTPCookieProcessor does)
    """
    def http_request(self, request):
        """Adds the applicable cookies from the cookie jar to a HTTP request,
        keeping any cookies that are already set on it.
        @param request: request to process
        @type request: urllib2.Request
        @return: the request that was passed in
        @rtype: urllib2.Request
        """
        header_name = "Cookie"

        # Let the cookie jar write its Cookie header onto a throwaway request
        # for the same URL, so the header on the real request is left alone.
        scratch = Request_(request.get_full_url(), request.data, {},
                           request.origin_req_host, request.unverifiable)
        self.cookiejar.add_cookie_header(scratch)
        jar_cookies = scratch.get_header(header_name)

        if not jar_cookies:
            # The jar produced no cookies: nothing to add.
            return request

        if request.has_header(header_name):
            # Existing cookies come first, followed by those from the jar.
            cookie_value = '; '.join(
                (request.get_header(header_name), jar_cookies))
        else:
            cookie_value = jar_cookies

        request.add_unredirected_header(header_name, cookie_value)
        return request

    # HTTPS requests get exactly the same cookie treatment.
    https_request = http_request
79
80
class URLFetchError(Exception):
    """Error fetching content from URL.

    Raised by the fetch functions in this module with the status or error
    message of the failed request as the exception message.
    """
83
84
def fetch_from_url(url, config, data=None, handlers=None):
    """Returns data retrieved from a URL.
    @param url: URL to attempt to open
    @type url: basestring
    @param config: SSL context configuration
    @type config: Configuration
    @param data: HTTP POST data
    @type data: str
    @param handlers: list of custom urllib2 handlers to add to the request
    @type handlers: iterable
    @return: data retrieved from URL
    @raise URLFetchError: if the URL could not be opened or the response
    status is not 200 (OK)
    """
    return_code, return_message, response = open_url(url, config, data=data,
                                                     handlers=handlers)
    if return_code != http_client_.OK:
        # A response object can still be present here (for instance for a
        # success status other than 200); close it instead of leaking it.
        if response is not None:
            response.close()
        raise URLFetchError(return_message)

    # Close the response even if reading the body fails part-way through.
    try:
        return response.read()
    finally:
        response.close()
101
def fetch_from_url_to_file(url, config, output_file, data=None, handlers=None):
    """Writes data retrieved from a URL to a file.

    The file is only written when the response status is 200 (OK).
    @param url: URL to attempt to open
    @type url: basestring
    @param config: SSL context configuration
    @type config: Configuration
    @param output_file: output file
    @type output_file: basestring
    @param data: HTTP POST data
    @type data: str
    @param handlers: list of custom urllib2 handlers to add to the request
    @type handlers: iterable
    @return: tuple (
        returned HTTP status code or 0 if an error occurred
        returned message
        boolean indicating whether access was successful)
    """
    return_code, return_message, response = open_url(url, config, data=data,
                                                     handlers=handlers)
    success = return_code == http_client_.OK
    if success:
        # Release the response whether or not the body could be read.
        try:
            return_data = response.read()
        finally:
            response.close()

        # The response body is bytes under Python 3, so the file has to be
        # opened in binary mode; this also stops any newline translation from
        # altering the downloaded content.
        with open(output_file, "wb") as outfile:
            outfile.write(return_data)
    elif response is not None:
        # Unused response for a status other than 200: do not leak it.
        response.close()

    return return_code, return_message, success
125
126
def fetch_stream_from_url(url, config, data=None, handlers=None):
    """Opens a URL and returns the response object so that the caller can
    read the content from it as a stream.
    @param url: URL to attempt to open
    @type url: basestring
    @param config: SSL context configuration
    @type config: Configuration
    @param data: HTTP POST data
    @type data: str
    @param handlers: list of custom urllib2 handlers to add to the request
    @type handlers: iterable
    @return: response for the URL; the caller reads it and closes it
    @rtype: file derived type
    @raise URLFetchError: if the status returned for the URL is not 200 (OK)
    """
    status, message, stream = open_url(url, config, data=data,
                                       handlers=handlers)

    # Anything other than a 200 status is reported as a fetch error.
    if not (status and status == http_client_.OK):
        raise URLFetchError(message)

    return stream
146
147
def open_url(url, config, data=None, handlers=None):
    """Attempts to open a connection to a specified URL.

    Failures are reported through the returned tuple rather than raised.
    @param url: URL to attempt to open
    @type url: basestring
    @param config: SSL context configuration
    @type config: Configuration
    @param data: HTTP POST data
    @type data: str
    @param handlers: custom urllib2 handlers to add to the request; the
    iterable passed in is not modified
    @type handlers: iterable
    @return: tuple (
        returned HTTP status code or 0 if an error occurred
        returned message or error description
        response object, or None if the request failed)
    """
    debuglevel = 1 if config.debug else 0

    # Use the caller's cookie jar whenever one was supplied.  The comparison
    # is with None on purpose: an empty CookieJar is falsy (it defines
    # __len__), so a truthiness test would swap it for a private jar and the
    # caller would never see the cookies set by the server.
    cj = config.cookie
    if cj is None:
        cj = cookiejar_.CookieJar()

    # Use a cookie processor that accumulates cookies when redirects occur so
    # that an application can redirect for authentication and retain both any
    # cookies for the application and the security system (c.f.,
    # urllib2.HTTPCookieProcessor which replaces cookies).
    cookie_handler = AccumulatingHTTPCookieProcessor(cj)

    # Work on a copy: appending to the caller's own list would make it grow
    # on every call, and a tuple or other iterable could not be appended to.
    handlers = list(handlers) if handlers else []
    handlers.append(cookie_handler)

    if config.debug:
        http_handler = HTTPHandler_(debuglevel=debuglevel)
        https_handler = HTTPSContextHandler(config.ssl_context,
                                            debuglevel=debuglevel)
        handlers.extend([http_handler, https_handler])

    if config.http_basicauth:
        # currently only supports http basic auth
        auth_handler = HTTPBasicAuthHandler_(HTTPPasswordMgrWithDefaultRealm_())
        auth_handler.add_password(realm=None, uri=url,
                                  user=config.http_basicauth[0],
                                  passwd=config.http_basicauth[1])
        handlers.append(auth_handler)

    # Explicitly remove proxy handling if the host is one listed in the value of
    # the no_proxy environment variable because urllib2 does use proxy settings
    # set via http_proxy and https_proxy, but does not take the no_proxy value
    # into account.
    if not _should_use_proxy(url, config.no_proxy):
        handlers.append(ProxyHandler_({}))
        log.debug("Not using proxy")
    elif config.proxies:
        handlers.append(ProxyHandler_(config.proxies))
        log.debug("Configuring proxies: %s", config.proxies)

    opener = build_opener(*handlers, ssl_context=config.ssl_context)

    headers = config.headers
    if headers is None:
        headers = {}

    request = Request_(url, data, headers)

    # Open the URL and check the response.
    return_code = 0
    return_message = ''
    response = None

    try:
        response = opener.open(request)
        return_message = response.msg
        return_code = response.code
        if log.isEnabledFor(logging.DEBUG):
            for index, cookie in enumerate(cj):
                log.debug("%s  :  %s", index, cookie)

    except HTTPError_ as exc:
        # The server answered with an error status: report its code.
        return_code = exc.code
        return_message = "Error: %s" % exc.msg
        log.debug("%s %s", exc.code, exc.msg)

    except Exception as exc:
        # Deliberately broad: any other failure is reported to the caller as
        # status code 0 plus a description instead of being propagated.
        return_message = "Error: %s" % str(exc)
        if log.isEnabledFor(logging.DEBUG):
            import traceback
            log.debug(traceback.format_exc())

    return (return_code, return_message, response)
241
242
def _should_use_proxy(url, no_proxy=None):
    """Determines whether a proxy should be used to open a connection to the
    specified URL.  A proxy is not used if the URL's host name is equal
    (ignoring case) to one of the comma-separated entries of no_proxy.
    @param url: URL
    @type url: basestring or urllib2.Request
    @param no_proxy: comma-separated host names for which no proxy should be
    used; if None, the value of the no_proxy environment variable is used
    @type no_proxy: basestring
    @return: False if the URL's host is listed in no_proxy, otherwise True
    @rtype: bool
    """
    if no_proxy is None:
        no_proxy_effective = os.environ.get('no_proxy', '')
    else:
        no_proxy_effective = no_proxy

    hostname = urlparse_.urlparse(_url_as_string(url)).hostname
    if hostname is None:
        # No host in the URL, so there is nothing that could be excluded.
        return True

    # urlparse returns the host name in lower case, so the entries have to be
    # lower-cased as well or an entry such as "Example.COM" could never match.
    excluded_hosts = set(entry.strip().lower()
                         for entry in no_proxy_effective.split(','))
    return hostname.lower() not in excluded_hosts
260
def _url_as_string(url):
    """Returns the URL string for a value that is either already a string or
    is a urllib2.Request.
    @param url: URL
    @type url: basestring or urllib2.Request
    @return: URL string
    @rtype: basestring
    @raise TypeError: if url is neither a string nor a request object
    """
    # A request object supplies its own full URL.
    if isinstance(url, Request_):
        return url.get_full_url()

    # Anything else has to be a plain string, which is returned as it is.
    if not isinstance(url, str):
        raise TypeError("Expected type %r or %r" %
                        (str, Request_))
    return url
276
277
class Configuration(object):
    """Settings used when opening a connection to a URL.
    """
    def __init__(self, ssl_context, debug=False, proxies=None, no_proxy=None,
                 cookie=None, http_basicauth=None, headers=None):
        """Stores each setting as an attribute of the same name.
        @param ssl_context: SSL context to use with this configuration
        @type ssl_context: OpenSSL.SSL.Context
        @param debug: if True, output debugging information
        @type debug: bool
        @param proxies: proxies to use for the connection
        @type proxies: dict with basestring keys and values
        @param no_proxy: hosts for which a proxy should not be used
        @type no_proxy: basestring
        @param cookie: cookies to set for request
        @type cookie: cookielib.CookieJar (python 3 - http.cookiejar)
        @param http_basicauth: http authentication, or None
        @type http_basicauth: tuple of (username,password)
        @param headers: http headers
        @type headers: dict
        """
        # SSL and diagnostics.
        self.ssl_context, self.debug = ssl_context, debug
        # Proxy selection.
        self.proxies, self.no_proxy = proxies, no_proxy
        # Extras applied to each request.
        self.cookie, self.http_basicauth = cookie, http_basicauth
        self.headers = headers
306
307
def main():
    '''Utility to fetch data using HTTP or HTTPS from a specified URL.

    The URL is taken from the command line.  The content is written to the
    file given with --fetch or, without that option, to standard output.  If
    a POST data file is given its content is sent as the request body.

    With --fetch, SystemExit is raised when the request has completed: with
    status 0 on success, otherwise with a message holding the returned status
    code and message.  Without --fetch, a failed request raises URLFetchError.
    '''
    parser = OptionParser(usage="%prog [options] url")
    parser.add_option("-c", "--certificate", dest="cert_file", metavar="FILE",
                      default=os.path.expanduser("~/credentials.pem"),
                      help="Certificate file - defaults to $HOME/credentials.pem")
    parser.add_option("-k", "--private-key", dest="key_file", metavar="FILE",
                      default=None,
                      help="Private key file - defaults to the certificate file")
    parser.add_option("-t", "--ca-certificate-dir", dest="ca_dir",
                      metavar="PATH",
                      default=None,
                      help="Trusted CA certificate file directory")
    parser.add_option("-d", "--debug", action="store_true", dest="debug",
                      default=False,
                      help="Print debug information.")
    parser.add_option("-p", "--post-data-file", dest="data_file",
                      metavar="FILE", default=None,
                      help="POST data file")
    parser.add_option("-f", "--fetch", dest="output_file", metavar="FILE",
                      default=None, help="Output file")
    parser.add_option("-n", "--no-verify-peer", action="store_true",
                      dest="no_verify_peer", default=False,
                      help="Skip verification of peer certificate.")
    parser.add_option("-a", "--basicauth", dest="basicauth",
                      metavar="USER:PASSWD",
                      default=None,
                      help="HTTP authentication credentials")
    parser.add_option("--header", action="append", dest="headers",
                      metavar="HEADER: VALUE",
                      help="Add HTTP header to request")
    (options, args) = parser.parse_args()
    if len(args) != 1:
        parser.error("Incorrect number of arguments")

    url = args[0]

    if options.debug:
        logging.getLogger().setLevel(logging.DEBUG)

    def existing_path(path):
        # Paths that are unset or do not exist are treated as not given.
        return path if path and os.path.exists(path) else None

    key_file = existing_path(options.key_file)
    cert_file = existing_path(options.cert_file)
    ca_dir = existing_path(options.ca_dir)

    verify_peer = not options.no_verify_peer

    data = None
    if options.data_file and os.path.exists(options.data_file):
        # Read as bytes: under Python 3 a request body has to be bytes, not
        # str.
        with open(options.data_file, "rb") as data_file:
            data = data_file.read()

    http_basicauth = None
    if options.basicauth:
        http_basicauth = options.basicauth.split(':', 1)
        if len(http_basicauth) != 2:
            # Without a password part the credentials cannot be used; report
            # this as a usage error instead of failing later on.
            parser.error("Credentials must be given as USER:PASSWD")

    headers = {}
    for header in options.headers or []:
        key, separator, val = header.partition(':')
        if not separator:
            # A header without ':' is a usage error, not a traceback.
            parser.error("Invalid header %r - expected HEADER: VALUE" % header)
        headers[key.strip()] = val.lstrip()

    # If a private key file is not specified, the key is assumed to be stored in
    # the certificate file.
    ssl_context = ssl_context_util.make_ssl_context(key_file,
                                                    cert_file,
                                                    None,
                                                    ca_dir,
                                                    verify_peer,
                                                    url)

    config = Configuration(ssl_context,
                           options.debug,
                           http_basicauth=http_basicauth,
                           headers=headers)
    if options.output_file:
        return_code, return_message, success = fetch_from_url_to_file(
                                                      url,
                                                      config,
                                                      options.output_file,
                                                      data)
        # SystemExit called with two arguments always gives exit status 1,
        # even after a successful fetch.  Exit with 0 on success and with a
        # message (which gives status 1) on failure.
        if success:
            raise SystemExit(0)
        raise SystemExit("%s %s" % (return_code, return_message))
    else:
        # Pass the POST data on here as well; otherwise the POST data file
        # option would have no effect unless an output file is also given.
        data = fetch_from_url(url, config, data)
        if isinstance(data, bytes) and hasattr(sys.stdout, "buffer"):
            # Python 3: write the raw bytes, since print() would output the
            # bytes repr (b'...') rather than the content.
            sys.stdout.buffer.write(data + b"\n")
        else:
            print(data)
407
408
# Script entry point: set up logging with logging.basicConfig() and then run
# the command line utility.
if __name__=='__main__':
    logging.basicConfig()
    main()
412