1#!/usr/bin/python
2# -*- coding: utf-8 -*-
3
4# Copyright: (c) 2012, Jan-Piet Mens <jpmens () gmail.com>
5# GNU General Public License v3.0+ (see COPYING or https://www.gnu.org/licenses/gpl-3.0.txt)
6
7from __future__ import absolute_import, division, print_function
8__metaclass__ = type
9
10
11DOCUMENTATION = r'''
12---
13module: get_url
14short_description: Downloads files from HTTP, HTTPS, or FTP to node
15description:
16     - Downloads files from HTTP, HTTPS, or FTP to the remote server. The remote
17       server I(must) have direct access to the remote resource.
18     - By default, if an environment variable C(<protocol>_proxy) is set on
19       the target host, requests will be sent through that proxy. This
20       behaviour can be overridden by setting a variable for this task
21       (see `setting the environment
22       <https://docs.ansible.com/playbooks_environment.html>`_),
23       or by using the use_proxy option.
24     - HTTP redirects can redirect from HTTP to HTTPS so you should be sure that
25       your proxy environment for both protocols is correct.
26     - From Ansible 2.4 when run with C(--check), it will do a HEAD request to validate the URL but
27       will not download the entire file or verify it against hashes.
28     - For Windows targets, use the M(ansible.windows.win_get_url) module instead.
29version_added: '0.6'
30options:
31  url:
32    description:
33      - HTTP, HTTPS, or FTP URL in the form (http|https|ftp)://[user[:pass]]@host.domain[:port]/path
34    type: str
35    required: true
36  dest:
37    description:
38      - Absolute path of where to download the file to.
39      - If C(dest) is a directory, either the server provided filename or, if
40        none provided, the base name of the URL on the remote server will be
41        used. If a directory, C(force) has no effect.
      - If C(dest) is a directory, the file will always be downloaded
        (regardless of the C(force) option), but replaced only if the contents changed.
44    type: path
45    required: true
46  tmp_dest:
47    description:
48      - Absolute path of where temporary file is downloaded to.
49      - When run on Ansible 2.5 or greater, path defaults to ansible's remote_tmp setting
50      - When run on Ansible prior to 2.5, it defaults to C(TMPDIR), C(TEMP) or C(TMP) env variables or a platform specific value.
51      - U(https://docs.python.org/2/library/tempfile.html#tempfile.tempdir)
52    type: path
53    version_added: '2.1'
54  force:
55    description:
56      - If C(yes) and C(dest) is not a directory, will download the file every
57        time and replace the file if the contents change. If C(no), the file
58        will only be downloaded if the destination does not exist. Generally
59        should be C(yes) only for small local files.
60      - Prior to 0.6, this module behaved as if C(yes) was the default.
61      - Alias C(thirsty) has been deprecated and will be removed in 2.13.
62    type: bool
63    default: no
64    aliases: [ thirsty ]
65    version_added: '0.7'
66  backup:
67    description:
68      - Create a backup file including the timestamp information so you can get
69        the original file back if you somehow clobbered it incorrectly.
70    type: bool
71    default: no
72    version_added: '2.1'
73  sha256sum:
74    description:
75      - If a SHA-256 checksum is passed to this parameter, the digest of the
76        destination file will be calculated after it is downloaded to ensure
77        its integrity and verify that the transfer completed successfully.
78        This option is deprecated and will be removed in version 2.14. Use
79        option C(checksum) instead.
80    default: ''
81    version_added: "1.3"
82  checksum:
83    description:
84      - 'If a checksum is passed to this parameter, the digest of the
85        destination file will be calculated after it is downloaded to ensure
86        its integrity and verify that the transfer completed successfully.
87        Format: <algorithm>:<checksum|url>, e.g. checksum="sha256:D98291AC[...]B6DC7B97",
88        checksum="sha256:http://example.com/path/sha256sum.txt"'
89      - If you worry about portability, only the sha1 algorithm is available
90        on all platforms and python versions.
91      - The third party hashlib library can be installed for access to additional algorithms.
      - Additionally, if a checksum is passed to this parameter, and the file exists under
93        the C(dest) location, the I(destination_checksum) would be calculated, and if
94        checksum equals I(destination_checksum), the file download would be skipped
95        (unless C(force) is true). If the checksum does not equal I(destination_checksum),
96        the destination file is deleted.
97    type: str
98    default: ''
99    version_added: "2.0"
100  use_proxy:
101    description:
      - If C(no), it will not use a proxy, even if one is defined in
103        an environment variable on the target hosts.
104    type: bool
105    default: yes
106  validate_certs:
107    description:
108      - If C(no), SSL certificates will not be validated.
109      - This should only be used on personally controlled sites using self-signed certificates.
110    type: bool
111    default: yes
112  timeout:
113    description:
114      - Timeout in seconds for URL request.
115    type: int
116    default: 10
117    version_added: '1.8'
118  headers:
119    description:
120        - Add custom HTTP headers to a request in hash/dict format.
121        - The hash/dict format was added in Ansible 2.6.
122        - Previous versions used a C("key:value,key:value") string format.
123        - The C("key:value,key:value") string format is deprecated and has been removed in version 2.10.
124    type: dict
125    version_added: '2.0'
126  url_username:
127    description:
128      - The username for use in HTTP basic authentication.
129      - This parameter can be used without C(url_password) for sites that allow empty passwords.
130      - Since version 2.8 you can also use the C(username) alias for this option.
131    type: str
132    aliases: ['username']
133    version_added: '1.6'
134  url_password:
135    description:
136        - The password for use in HTTP basic authentication.
137        - If the C(url_username) parameter is not specified, the C(url_password) parameter will not be used.
138        - Since version 2.8 you can also use the 'password' alias for this option.
139    type: str
140    aliases: ['password']
141    version_added: '1.6'
142  force_basic_auth:
143    description:
144      - Force the sending of the Basic authentication header upon initial request.
145      - httplib2, the library used by the uri module only sends authentication information when a webservice
146        responds to an initial request with a 401 status. Since some basic auth services do not properly
147        send a 401, logins will fail.
148    type: bool
149    default: no
150    version_added: '2.0'
151  client_cert:
152    description:
153      - PEM formatted certificate chain file to be used for SSL client authentication.
154      - This file can also include the key as well, and if the key is included, C(client_key) is not required.
155    type: path
156    version_added: '2.4'
157  client_key:
158    description:
159      - PEM formatted file that contains your private key to be used for SSL client authentication.
160      - If C(client_cert) contains both the certificate and key, this option is not required.
161    type: path
162    version_added: '2.4'
163  http_agent:
164    description:
165      - Header to identify as, generally appears in web server logs.
166    type: str
167    default: ansible-httpget
168# informational: requirements for nodes
169extends_documentation_fragment:
170    - files
171notes:
172     - For Windows targets, use the M(ansible.windows.win_get_url) module instead.
173seealso:
174- module: ansible.builtin.uri
175- module: ansible.windows.win_get_url
176author:
177- Jan-Piet Mens (@jpmens)
178'''
179
180EXAMPLES = r'''
181- name: Download foo.conf
182  get_url:
183    url: http://example.com/path/file.conf
184    dest: /etc/foo.conf
185    mode: '0440'
186
187- name: Download file and force basic auth
188  get_url:
189    url: http://example.com/path/file.conf
190    dest: /etc/foo.conf
191    force_basic_auth: yes
192
193- name: Download file with custom HTTP headers
194  get_url:
195    url: http://example.com/path/file.conf
196    dest: /etc/foo.conf
197    headers:
198      key1: one
199      key2: two
200
201- name: Download file with check (sha256)
202  get_url:
203    url: http://example.com/path/file.conf
204    dest: /etc/foo.conf
205    checksum: sha256:b5bb9d8014a0f9b1d61e21e796d78dccdf1352f23cd32812f4850b878ae4944c
206
207- name: Download file with check (md5)
208  get_url:
209    url: http://example.com/path/file.conf
210    dest: /etc/foo.conf
211    checksum: md5:66dffb5228a211e61d6d7ef4a86f5758
212
213- name: Download file with checksum url (sha256)
214  get_url:
215    url: http://example.com/path/file.conf
216    dest: /etc/foo.conf
217    checksum: sha256:http://example.com/path/sha256sum.txt
218
219- name: Download file from a file path
220  get_url:
221    url: file:///tmp/afile.txt
222    dest: /tmp/afilecopy.txt
223
224- name: < Fetch file that requires authentication.
225        username/password only available since 2.8, in older versions you need to use url_username/url_password
226  get_url:
227    url: http://example.com/path/file.conf
228    dest: /etc/foo.conf
229    username: bar
230    password: '{{ mysecret }}'
231'''
232
233RETURN = r'''
234backup_file:
235    description: name of backup file created after download
236    returned: changed and if backup=yes
237    type: str
238    sample: /path/to/file.txt.2015-02-12@22:09~
239checksum_dest:
240    description: sha1 checksum of the file after copy
241    returned: success
242    type: str
243    sample: 6e642bb8dd5c2e027bf21dd923337cbb4214f827
244checksum_src:
245    description: sha1 checksum of the file
246    returned: success
247    type: str
248    sample: 6e642bb8dd5c2e027bf21dd923337cbb4214f827
249dest:
250    description: destination file/path
251    returned: success
252    type: str
253    sample: /path/to/file.txt
254elapsed:
255    description: The number of seconds that elapsed while performing the download
256    returned: always
257    type: int
258    sample: 23
259gid:
260    description: group id of the file
261    returned: success
262    type: int
263    sample: 100
264group:
265    description: group of the file
266    returned: success
267    type: str
268    sample: "httpd"
269md5sum:
270    description: md5 checksum of the file after download
271    returned: when supported
272    type: str
273    sample: "2a5aeecc61dc98c4d780b14b330e3282"
274mode:
275    description: permissions of the target
276    returned: success
277    type: str
278    sample: "0644"
279msg:
280    description: the HTTP message from the request
281    returned: always
282    type: str
283    sample: OK (unknown bytes)
284owner:
285    description: owner of the file
286    returned: success
287    type: str
288    sample: httpd
289secontext:
290    description: the SELinux security context of the file
291    returned: success
292    type: str
293    sample: unconfined_u:object_r:user_tmp_t:s0
294size:
295    description: size of the target
296    returned: success
297    type: int
298    sample: 1220
299src:
300    description: source file used after download
301    returned: always
302    type: str
303    sample: /tmp/tmpAdFLdV
304state:
305    description: state of the target
306    returned: success
307    type: str
308    sample: file
309status_code:
310    description: the HTTP status code from the request
311    returned: always
312    type: int
313    sample: 200
314uid:
315    description: owner id of the file, after execution
316    returned: success
317    type: int
318    sample: 100
319url:
320    description: the actual URL used for the request
321    returned: always
322    type: str
323    sample: https://www.ansible.com/
324'''
325
326import datetime
327import os
328import re
329import shutil
330import tempfile
331import traceback
332
333from ansible.module_utils.basic import AnsibleModule
334from ansible.module_utils.six.moves.urllib.parse import urlsplit
335from ansible.module_utils._text import to_native
336from ansible.module_utils.urls import fetch_url, url_argument_spec
337
338# ==============================================================
339# url handling
340
341
def url_filename(url):
    """
    Derive a file name from the path component of the url.

    Returns the last path segment, or 'index.html' when the path has no
    final segment (empty path or a path ending in '/')."""
    path = urlsplit(url).path
    name = os.path.basename(path)
    return name if name != '' else 'index.html'
347
348
def url_get(module, url, dest, use_proxy, last_mod_time, force, timeout=10, headers=None, tmp_dest=''):
    """
    Download data from the url and store in a temporary file.

    A HEAD request is issued in check mode, a GET request otherwise.

    The module exits (not changed) on a 304 response and fails on status -1
    or on any status other than 200, except for file: urls and for ftp: urls
    whose message starts with 'OK'.

    tmp_dest, when given, must be an existing directory; otherwise the
    module's tmpdir is used for the temporary file.

    Return (tempfile, info about the request)
    """
    method = 'HEAD' if module.check_mode else 'GET'

    start = datetime.datetime.utcnow()
    rsp, info = fetch_url(module, url, use_proxy=use_proxy, force=force, last_mod_time=last_mod_time, timeout=timeout, headers=headers, method=method)
    elapsed = (datetime.datetime.utcnow() - start).seconds

    if info['status'] == 304:
        module.exit_json(url=url, dest=dest, changed=False, msg=info.get('msg', ''), status_code=info['status'], elapsed=elapsed)

    # Exceptions in fetch_url may result in a status -1, this ensures a proper error to the user in all cases
    if info['status'] == -1:
        module.fail_json(msg=info['msg'], url=url, dest=dest, elapsed=elapsed)

    if info['status'] != 200 and not url.startswith('file:/') and not (url.startswith('ftp:/') and info.get('msg', '').startswith('OK')):
        module.fail_json(msg="Request failed", status_code=info['status'], response=info['msg'], url=url, dest=dest, elapsed=elapsed)

    # create a temporary file and copy content to do checksum-based replacement
    if tmp_dest:
        # tmp_dest should be an existing dir
        if not os.path.isdir(tmp_dest):
            if os.path.exists(tmp_dest):
                module.fail_json(msg="%s is a file but should be a directory." % tmp_dest, elapsed=elapsed)
            else:
                module.fail_json(msg="%s directory does not exist." % tmp_dest, elapsed=elapsed)
    else:
        tmp_dest = module.tmpdir

    fd, tempname = tempfile.mkstemp(dir=tmp_dest)

    try:
        # Closing the file flushes buffered data; doing it inside the try block
        # means write errors that only surface on close are reported as well,
        # and the handle is released before the temporary file is removed.
        with os.fdopen(fd, 'wb') as f:
            shutil.copyfileobj(rsp, f)
    except Exception as e:
        os.remove(tempname)
        module.fail_json(msg="failed to create temporary content file: %s" % to_native(e), elapsed=elapsed, exception=traceback.format_exc())
    finally:
        # Release the response on both the success and the failure path.
        if rsp is not None:
            rsp.close()
    return tempname, info
397
398
def extract_filename_from_headers(headers):
    """
    Extracts a filename from the given dict of HTTP headers.

    Looks for the content-disposition header and applies a regex that
    accepts ``attachment; filename=...`` with a quoted or unquoted value.
    The disposition type and parameter name are matched case-insensitively.
    An unquoted value ends at the next ';' so trailing parameters are not
    taken as part of the name.

    Only the base name of the value is kept, and the special names '.' and
    '..' are rejected, so the result cannot point outside the directory it
    is joined to.

    Returns the filename if successful, else None."""
    # group(1): quoted value (may contain ';'), group(2): unquoted value.
    cont_disp_regex = r'\s*attachment\s*;\s*filename\s*=\s*(?:"([^"]+)|([^";]+))'
    res = None

    if 'content-disposition' in headers:
        cont_disp = headers['content-disposition']
        match = re.match(cont_disp_regex, cont_disp, re.IGNORECASE)
        if match:
            if match.group(1) is not None:
                res = match.group(1)
            else:
                res = match.group(2).strip()
            # Try preventing any funny business.
            res = os.path.basename(res)
            if not res or res in ('.', '..'):
                res = None

    return res
417
418
def is_url(checksum):
    """
    Tell whether the checksum value is a URL with a supported scheme.

    Returns True for the http, https, ftp and file schemes, else False."""
    scheme = urlsplit(checksum).scheme
    if scheme in ('http', 'https', 'ftp', 'file'):
        return True
    return False
425
426
427# ==============================================================
428# main
429
def _find_checksum_for_file(lines, filename):
    """Return the first digest listed for filename in checksum-file lines, else None.

    Each usable line has the form '<digest> <type char><path>', where the type
    char is ' ' (whitespace) for text and '*' for binary. Lines without a
    space separator are ignored.
    """
    for line in lines:
        # Split by one whitespace to keep the leading type char attached to the path
        parts = line.split(" ", 1)
        if len(parts) != 2:
            continue
        digest, path = parts

        # Remove the leading type char
        if path.startswith((" ", "*",)):
            path = path[1:]

        # Remove literal leading './' prefixes only. str.lstrip("./") strips a
        # set of characters and would also eat the dot of names like '.bashrc'.
        while path.startswith("./"):
            path = path[2:]

        if path == filename:
            return digest
    return None


def main():
    """
    Module entry point.

    Parses the module arguments, resolves the expected checksum (given
    literally or looked up in a checksum file fetched from a URL), skips the
    download when an existing destination already matches the checksum and
    force is off, otherwise downloads to a temporary file, moves it over the
    destination when the sha1 differs, verifies the checksum, applies file
    attributes and reports the result through exit_json/fail_json.
    """
    argument_spec = url_argument_spec()

    # setup aliases
    argument_spec['url_username']['aliases'] = ['username']
    argument_spec['url_password']['aliases'] = ['password']

    argument_spec.update(
        url=dict(type='str', required=True),
        dest=dict(type='path', required=True),
        backup=dict(type='bool', default=False),
        sha256sum=dict(type='str', default=''),
        checksum=dict(type='str', default=''),
        timeout=dict(type='int', default=10),
        headers=dict(type='dict'),
        tmp_dest=dict(type='path'),
    )

    module = AnsibleModule(
        # not checking because of daisy chain to file module
        argument_spec=argument_spec,
        add_file_common_args=True,
        supports_check_mode=True,
        mutually_exclusive=[['checksum', 'sha256sum']],
    )

    if module.params.get('thirsty'):
        module.deprecate('The alias "thirsty" has been deprecated and will be removed, use "force" instead',
                         version='2.13', collection_name='ansible.builtin')

    if module.params.get('sha256sum'):
        module.deprecate('The parameter "sha256sum" has been deprecated and will be removed, use "checksum" instead',
                         version='2.14', collection_name='ansible.builtin')

    url = module.params['url']
    dest = module.params['dest']
    backup = module.params['backup']
    force = module.params['force']
    sha256sum = module.params['sha256sum']
    checksum = module.params['checksum']
    use_proxy = module.params['use_proxy']
    timeout = module.params['timeout']
    headers = module.params['headers']
    tmp_dest = module.params['tmp_dest']

    result = dict(
        changed=False,
        checksum_dest=None,
        checksum_src=None,
        dest=dest,
        elapsed=0,
        url=url,
    )

    dest_is_dir = os.path.isdir(dest)
    last_mod_time = None

    # workaround for usage of deprecated sha256sum parameter
    if sha256sum:
        checksum = 'sha256:%s' % (sha256sum)

    # checksum specified, parse for algorithm and checksum
    if checksum:
        try:
            algorithm, checksum = checksum.split(':', 1)
        except ValueError:
            module.fail_json(msg="The checksum parameter has to be in format <algorithm>:<checksum>", **result)

        if is_url(checksum):
            checksum_url = checksum
            # download checksum file to checksum_tmpsrc
            checksum_tmpsrc, _checksum_info = url_get(module, checksum_url, dest, use_proxy, last_mod_time, force, timeout, headers, tmp_dest)
            try:
                with open(checksum_tmpsrc) as f:
                    lines = [line.rstrip('\n') for line in f]
            finally:
                # The checksum file is only needed for parsing; drop it even
                # if reading fails.
                os.remove(checksum_tmpsrc)

            filename = url_filename(url)

            # Look through each line in the checksum file for a hash corresponding to
            # the filename in the url, returning the first hash that is found.
            checksum = _find_checksum_for_file(lines, filename)

            if checksum is None:
                module.fail_json(msg="Unable to find a checksum for file '%s' in '%s'" % (filename, checksum_url), **result)
        # Remove any non-alphanumeric characters, including the infamous
        # Unicode zero-width space
        checksum = re.sub(r'\W+', '', checksum).lower()
        # Ensure the checksum portion is a hexdigest
        try:
            int(checksum, 16)
        except ValueError:
            module.fail_json(msg='The checksum format is invalid', **result)

    if not dest_is_dir and os.path.exists(dest):
        checksum_mismatch = False

        # If the download is not forced and there is a checksum, allow
        # checksum match to skip the download. Truthiness is used so that an
        # unset (empty or None) checksum never reaches the digest comparison,
        # where 'algorithm' would be undefined.
        if not force and checksum:
            destination_checksum = module.digest_from_file(dest, algorithm)

            if checksum != destination_checksum:
                checksum_mismatch = True

        # Not forcing redownload, unless checksum does not match
        if not force and checksum and not checksum_mismatch:
            # allow file attribute changes
            file_args = module.load_file_common_arguments(module.params, path=dest)
            result['changed'] = module.set_fs_attributes_if_different(file_args, False)
            if result['changed']:
                module.exit_json(msg="file already exists but file attributes changed", **result)
            module.exit_json(msg="file already exists", **result)

        # If the file already exists, prepare the last modified time for the
        # request.
        mtime = os.path.getmtime(dest)
        last_mod_time = datetime.datetime.utcfromtimestamp(mtime)

        # If the checksum does not match we have to force the download
        # because last_mod_time may be newer than on remote
        if checksum_mismatch:
            force = True

    # download to tmpsrc
    start = datetime.datetime.utcnow()
    tmpsrc, info = url_get(module, url, dest, use_proxy, last_mod_time, force, timeout, headers, tmp_dest)
    result['elapsed'] = (datetime.datetime.utcnow() - start).seconds
    result['src'] = tmpsrc

    # Now the request has completed, we can finally generate the final
    # destination file name from the info dict.

    if dest_is_dir:
        filename = extract_filename_from_headers(info)
        if not filename:
            # Fall back to extracting the filename from the URL.
            # Pluck the URL from the info, since a redirect could have changed
            # it.
            filename = url_filename(info['url'])
        dest = os.path.join(dest, filename)
        result['dest'] = dest

    # raise an error if there is no tmpsrc file
    if not os.path.exists(tmpsrc):
        # Nothing to remove here: the temporary file is already missing, and
        # calling os.remove() on it would raise instead of reporting the error.
        module.fail_json(msg="Request failed", status_code=info['status'], response=info['msg'], **result)
    if not os.access(tmpsrc, os.R_OK):
        os.remove(tmpsrc)
        module.fail_json(msg="Source %s is not readable" % (tmpsrc), **result)
    result['checksum_src'] = module.sha1(tmpsrc)

    # check if there is no dest file
    if os.path.exists(dest):
        # raise an error if copy has no permission on dest
        if not os.access(dest, os.W_OK):
            os.remove(tmpsrc)
            module.fail_json(msg="Destination %s is not writable" % (dest), **result)
        if not os.access(dest, os.R_OK):
            os.remove(tmpsrc)
            module.fail_json(msg="Destination %s is not readable" % (dest), **result)
        result['checksum_dest'] = module.sha1(dest)
    else:
        dest_dir = os.path.dirname(dest)
        if not os.path.exists(dest_dir):
            os.remove(tmpsrc)
            module.fail_json(msg="Destination %s does not exist" % (dest_dir), **result)
        if not os.access(dest_dir, os.W_OK):
            os.remove(tmpsrc)
            module.fail_json(msg="Destination %s is not writable" % (dest_dir), **result)

    if module.check_mode:
        if os.path.exists(tmpsrc):
            os.remove(tmpsrc)
        # 'checksum_dest' is always present in result (None when dest does not
        # exist), so comparing the two digests covers both cases.
        result['changed'] = result['checksum_src'] != result['checksum_dest']
        module.exit_json(msg=info.get('msg', ''), **result)

    backup_file = None
    if result['checksum_src'] != result['checksum_dest']:
        try:
            if backup:
                if os.path.exists(dest):
                    backup_file = module.backup_local(dest)
            module.atomic_move(tmpsrc, dest, unsafe_writes=module.params['unsafe_writes'])
        except Exception as e:
            if os.path.exists(tmpsrc):
                os.remove(tmpsrc)
            module.fail_json(msg="failed to copy %s to %s: %s" % (tmpsrc, dest, to_native(e)),
                             exception=traceback.format_exc(), **result)
        result['changed'] = True
    else:
        result['changed'] = False
        if os.path.exists(tmpsrc):
            os.remove(tmpsrc)

    if checksum:
        destination_checksum = module.digest_from_file(dest, algorithm)

        if checksum != destination_checksum:
            # A destination that fails verification is removed rather than kept.
            os.remove(dest)
            module.fail_json(msg="The checksum for %s did not match %s; it was %s." % (dest, checksum, destination_checksum), **result)

    # allow file attribute changes
    file_args = module.load_file_common_arguments(module.params, path=dest)
    result['changed'] = module.set_fs_attributes_if_different(file_args, result['changed'])

    # Backwards compat only.  We'll return None on FIPS enabled systems
    try:
        result['md5sum'] = module.md5(dest)
    except ValueError:
        result['md5sum'] = None

    if backup_file:
        result['backup_file'] = backup_file

    # Mission complete
    module.exit_json(msg=info.get('msg', ''), status_code=info.get('status', ''), **result)
662
663
# Run the module only when this file is executed as a script.
if __name__ == '__main__':
    main()
666