# -*- coding: utf-8 -*-
#
# The internetarchive module is a Python/CLI interface to Archive.org.
#
# Copyright (C) 2012-2021 Internet Archive
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""
internetarchive.session
~~~~~~~~~~~~~~~~~~~~~~~

This module provides an ArchiveSession object to manage and persist
settings across the internetarchive package.

:copyright: (C) 2012-2021 by Internet Archive.
:license: AGPL 3, see LICENSE for more details.
"""

from __future__ import absolute_import, unicode_literals

import os
import locale
import sys
import logging
import platform
import warnings
try:
    import ujson as json
except ImportError:
    import json

import requests.sessions
from requests.utils import default_headers
from requests.adapters import HTTPAdapter
from requests.packages.urllib3 import Retry
from six.moves.urllib.parse import urlparse, unquote
from requests.cookies import create_cookie

from internetarchive import __version__, auth
from internetarchive.config import get_config
from internetarchive.item import Item, Collection
from internetarchive.search import Search
from internetarchive.catalog import Catalog, CatalogTask
from internetarchive.utils import reraise_modify, parse_dict_cookies


logger = logging.getLogger(__name__)


class ArchiveSession(requests.sessions.Session):
    """The :class:`ArchiveSession <internetarchive.ArchiveSession>`
    object collects together useful functionality from `internetarchive`
    as well as important data such as configuration information and
    credentials.  It is subclassed from
    :class:`requests.Session <requests.Session>`.

    Usage::

        >>> from internetarchive import ArchiveSession
        >>> s = ArchiveSession()
        >>> item = s.get_item('nasa')
        >>> item
        Collection(identifier='nasa', exists=True)
    """

    ITEM_MEDIATYPE_TABLE = {
        'collection': Collection,
    }

    def __init__(self,
                 config=None,
                 config_file=None,
                 debug=None,
                 http_adapter_kwargs=None):
        """Initialize :class:`ArchiveSession <ArchiveSession>` object with config.

        :type config: dict
        :param config: (optional) A config dict used for initializing the
                       :class:`ArchiveSession <ArchiveSession>` object.

        :type config_file: str
        :param config_file: (optional) Path to config file used for initializing the
                            :class:`ArchiveSession <ArchiveSession>` object.

        :type debug: bool
        :param debug: (optional) If ``True``, also log ``urllib3`` output to
                      the configured log file.

        :type http_adapter_kwargs: dict
        :param http_adapter_kwargs: (optional) Keyword arguments used to initialize the
                                    :class:`requests.adapters.HTTPAdapter <HTTPAdapter>`
                                    object.

        :returns: :class:`ArchiveSession` object.
        """
        super(ArchiveSession, self).__init__()
        http_adapter_kwargs = {} if not http_adapter_kwargs else http_adapter_kwargs
        debug = False if not debug else True

        self.config = get_config(config, config_file)
        self.config_file = config_file
        for ck, cv in self.config.get('cookies', {}).items():
            raw_cookie = '{}={}'.format(ck, cv)
            cookie_dict = parse_dict_cookies(raw_cookie)
            if not cookie_dict.get(ck):
                continue
            cookie = create_cookie(ck, cookie_dict[ck],
                                   domain=cookie_dict.get('domain'),
                                   path=cookie_dict.get('path'))
            self.cookies.set_cookie(cookie)

        self.secure = self.config.get('general', {}).get('secure', True)
        self.host = self.config.get('general', {}).get('host', 'archive.org')
        if 'archive.org' not in self.host:
            self.host += '.archive.org'
        self.protocol = 'https:' if self.secure else 'http:'
        user_email = self.config.get('cookies', dict()).get('logged-in-user')
        if user_email:
            user_email = user_email.split(';')[0]
            user_email = unquote(user_email)
        self.user_email = user_email
        self.access_key = self.config.get('s3', {}).get('access')
        self.secret_key = self.config.get('s3', {}).get('secret')
        self.http_adapter_kwargs = http_adapter_kwargs

        self.headers = default_headers()
        self.headers.update({'User-Agent': self._get_user_agent_string()})
        self.headers.update({'Connection': 'close'})

        self.mount_http_adapter()

        logging_config = self.config.get('logging', {})
        if logging_config.get('level'):
            self.set_file_logger(logging_config.get('level', 'NOTSET'),
                                 logging_config.get('file', 'internetarchive.log'))
            if debug or (logger.level <= 10):
                self.set_file_logger(logging_config.get('level', 'NOTSET'),
                                     logging_config.get('file', 'internetarchive.log'),
                                     'urllib3')

    def _get_user_agent_string(self):
        """Generate a User-Agent string to be sent with every request."""
        uname = platform.uname()
        try:
            lang = locale.getlocale()[0][:2]
        except Exception:
            lang = ''
        py_version = '{0}.{1}.{2}'.format(*sys.version_info)
        return 'internetarchive/{0} ({1} {2}; N; {3}; {4}) Python/{5}'.format(
            __version__, uname[0], uname[-1], lang, self.access_key, py_version)

    def rebuild_auth(self, prepared_request, response):
        """Never rebuild auth for archive.org URLs.
        """
        u = urlparse(prepared_request.url)
        if u.netloc.endswith('archive.org'):
            return
        super(ArchiveSession, self).rebuild_auth(prepared_request, response)

    def mount_http_adapter(self, protocol=None, max_retries=None,
                           status_forcelist=None, host=None):
        """Mount an HTTP adapter to the
        :class:`ArchiveSession <ArchiveSession>` object.

        :type protocol: str
        :param protocol: HTTP protocol to mount your adapter to (e.g. 'https:').

        :type max_retries: int, object
        :param max_retries: The number of times to retry a failed request.
                            This can also be an `urllib3.Retry` object.

        :type status_forcelist: list
        :param status_forcelist: A list of status codes (as ints) to retry on.

        :type host: str
        :param host: The host to mount your adapter to.
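
        Example (an illustrative sketch, assuming ``s`` is an
        :class:`ArchiveSession`; the retry count and status code shown
        here are arbitrary)::

            >>> s.mount_http_adapter(max_retries=5, status_forcelist=[503])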
        """
        protocol = protocol if protocol else self.protocol
        host = host if host else 'archive.org'
        if max_retries is None:
            max_retries = self.http_adapter_kwargs.get('max_retries', 3)

        if not status_forcelist:
            status_forcelist = [500, 501, 502, 503, 504]
        if max_retries and isinstance(max_retries, (int, float)):
            max_retries = Retry(total=max_retries,
                                connect=max_retries,
                                read=max_retries,
                                redirect=False,
                                allowed_methods=Retry.DEFAULT_ALLOWED_METHODS,
                                status_forcelist=status_forcelist,
                                backoff_factor=1)
        self.http_adapter_kwargs['max_retries'] = max_retries
        max_retries_adapter = HTTPAdapter(**self.http_adapter_kwargs)
        # Don't mount on s3.us.archive.org, only archive.org!
        # IA-S3 requires a more complicated retry workflow.
        self.mount('{0}//{1}'.format(protocol, host), max_retries_adapter)

    def set_file_logger(self, log_level, path, logger_name='internetarchive'):
        """Convenience function to quickly configure any level of
        logging to a file.

        :type log_level: str
        :param log_level: A log level as specified in the `logging` module.

        :type path: str
        :param path: Path to the log file. The file will be created if it doesn't already
                     exist.

        :type logger_name: str
        :param logger_name: (optional) The name of the logger.
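
        Example (an illustrative sketch, assuming ``s`` is an
        :class:`ArchiveSession`; the path is arbitrary and the file is
        created relative to the current working directory)::

            >>> s.set_file_logger('DEBUG', 'internetarchive.log')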
        """
        _log_level = {
            'CRITICAL': 50,
            'ERROR': 40,
            'WARNING': 30,
            'INFO': 20,
            'DEBUG': 10,
            'NOTSET': 0,
        }

        log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'

        _log = logging.getLogger(logger_name)
        _log.setLevel(logging.DEBUG)

        fh = logging.FileHandler(path, encoding='utf-8')
        fh.setLevel(_log_level[log_level])

        formatter = logging.Formatter(log_format)
        fh.setFormatter(formatter)

        _log.addHandler(fh)

    def get_item(self, identifier, item_metadata=None, request_kwargs=None):
        """A method for creating :class:`internetarchive.Item <Item>` and
        :class:`internetarchive.Collection <Collection>` objects.

        :type identifier: str
        :param identifier: A globally unique Archive.org identifier.

        :type item_metadata: dict
        :param item_metadata: (optional) A metadata dict used to initialize the Item or
                              Collection object. Metadata will automatically be retrieved
                              from Archive.org if nothing is provided.

        :type request_kwargs: dict
        :param request_kwargs: (optional) Keyword arguments to be used in
                               :meth:`requests.sessions.Session.get` request.
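
        Example (an illustrative sketch; ``'nasa'`` is just a sample
        identifier, as in the class-level usage example, and the call
        retrieves metadata over the network)::

            >>> from internetarchive import ArchiveSession
            >>> s = ArchiveSession()
            >>> item = s.get_item('nasa')
            >>> item.exists
            True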
        """
        request_kwargs = {} if not request_kwargs else request_kwargs
        if not item_metadata:
            logger.debug('no metadata provided for "{0}", '
                         'retrieving now.'.format(identifier))
            item_metadata = self.get_metadata(identifier, request_kwargs)
        mediatype = item_metadata.get('metadata', {}).get('mediatype')
        try:
            item_class = self.ITEM_MEDIATYPE_TABLE.get(mediatype, Item)
        except TypeError:
            item_class = Item
        return item_class(self, identifier, item_metadata)

    def get_metadata(self, identifier, request_kwargs=None):
        """Get an item's metadata from the `Metadata API
        <http://blog.archive.org/2013/07/04/metadata-api/>`__

        :type identifier: str
        :param identifier: Globally unique Archive.org identifier.

        :type request_kwargs: dict
        :param request_kwargs: (optional) Keyword arguments to be used in
                               :meth:`requests.sessions.Session.get` request.

        :rtype: dict
        :returns: Metadata API response.
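
        Example (an illustrative sketch, assuming ``s`` is an
        :class:`ArchiveSession`; ``'nasa'`` is just a sample identifier)::

            >>> md = s.get_metadata('nasa')
            >>> md['metadata']['identifier']
            'nasa'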
        """
        request_kwargs = {} if not request_kwargs else request_kwargs
        url = '{0}//{1}/metadata/{2}'.format(self.protocol, self.host, identifier)
        if 'timeout' not in request_kwargs:
            request_kwargs['timeout'] = 12
        try:
            if self.access_key and self.secret_key:
                s3_auth = auth.S3Auth(self.access_key, self.secret_key)
            else:
                s3_auth = None
            resp = self.get(url, auth=s3_auth, **request_kwargs)
            resp.raise_for_status()
        except Exception as exc:
            error_msg = 'Error retrieving metadata from {0}, {1}'.format(url, exc)
            logger.error(error_msg)
            raise type(exc)(error_msg)
        return resp.json()

    def search_items(self, query,
                     fields=None,
                     sorts=None,
                     params=None,
                     full_text_search=None,
                     dsl_fts=None,
                     request_kwargs=None,
                     max_retries=None):
        """Search for items on Archive.org.

        :type query: str
        :param query: The Archive.org search query to yield results for. Refer to
                      https://archive.org/advancedsearch.php#raw for help formatting your
                      query.

        :type fields: list
        :param fields: (optional) The metadata fields to return in the search results.

        :type sorts: list
        :param sorts: (optional) Fields to sort the search results by.

        :type params: dict
        :param params: (optional) The URL parameters to send with each request sent to the
                       Archive.org Advanced Search API.

        :type full_text_search: bool
        :param full_text_search: (optional) Beta support for querying the archive.org
                                 Full Text Search API [default: False].

        :type dsl_fts: bool
        :param dsl_fts: (optional) Beta support for querying the archive.org Full Text
                        Search API in dsl (i.e. do not prepend ``!L`` to the
                        ``full_text_search`` query) [default: False].

        :type request_kwargs: dict
        :param request_kwargs: (optional) Keyword arguments to be used in
                               :meth:`requests.sessions.Session.get` request.

        :type max_retries: int, object
        :param max_retries: (optional) The number of times to retry a failed request.
                            This can also be an `urllib3.Retry` object.

        :returns: A :class:`Search` object, yielding search results.
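
        Example (an illustrative sketch, assuming ``s`` is an
        :class:`ArchiveSession`; the query and field list are arbitrary,
        and results are fetched lazily over the network)::

            >>> for result in s.search_items('nasa', fields=['identifier']):
            ...     print(result['identifier'])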
        """
        request_kwargs = {} if not request_kwargs else request_kwargs
        return Search(self, query,
                      fields=fields,
                      sorts=sorts,
                      params=params,
                      full_text_search=full_text_search,
                      dsl_fts=dsl_fts,
                      request_kwargs=request_kwargs,
                      max_retries=max_retries)

    def s3_is_overloaded(self, identifier=None, access_key=None, request_kwargs=None):
        """Check whether IA-S3 (s3.us.archive.org) is overloaded.
        Returns ``True`` if the ``over_limit`` flag is set for the given
        bucket and access key, or if the status request fails.

        :rtype: bool
        """
        request_kwargs = {} if not request_kwargs else request_kwargs
        if 'timeout' not in request_kwargs:
            request_kwargs['timeout'] = 12

        u = '{protocol}//s3.us.archive.org'.format(protocol=self.protocol)
        p = dict(
            check_limit=1,
            accesskey=access_key,
            bucket=identifier,
        )
        try:
            r = self.get(u, params=p, **request_kwargs)
        except Exception:
            return True
        try:
            j = r.json()
        except ValueError:
            return True
        if j.get('over_limit') == 0:
            return False
        else:
            return True

    def get_tasks_api_rate_limit(self, cmd='derive.php', request_kwargs=None):
        """Get the current Tasks API rate limit for the given task command.

        :type cmd: str
        :param cmd: Task command to check the rate limit for
                    (default: ``'derive.php'``).
        """
        c = Catalog(self, request_kwargs)
        r = c.get_rate_limit(cmd=cmd)
        return r

    def submit_task(self, identifier, cmd, comment=None, priority=None, data=None,
                    headers=None, reduced_priority=None, request_kwargs=None):
        """Submit an archive.org task.

        :type identifier: str
        :param identifier: Item identifier.

        :type cmd: str
        :param cmd: Task command to submit, see
                    `supported task commands
                    <https://archive.org/services/docs/api/tasks.html#supported-tasks>`_.

        :type comment: str
        :param comment: (optional) A reasonable explanation for why the
                        task is being submitted.

        :type priority: int
        :param priority: (optional) Task priority from 10 to -10
                         (default: 0).

        :type data: dict
        :param data: (optional) Extra POST data to submit with
                     the request. Refer to `Tasks API Request Entity
                     <https://archive.org/services/docs/api/tasks.html#request-entity>`_.

        :type headers: dict
        :param headers: (optional) Add additional headers to request.

        :type reduced_priority: bool
        :param reduced_priority: (optional) Submit your derive at a lower priority.
                                 This option is helpful to get around rate-limiting.
                                 Your task will more likely be accepted, but it might
                                 not run for a long time. Note that you still may be
                                 subject to rate-limiting. This differs from
                                 ``priority`` in that it may allow you to
                                 avoid rate-limiting altogether.

        :type request_kwargs: dict
        :param request_kwargs: (optional) Keyword arguments to be used in
                               :meth:`requests.sessions.Session.post` request.

        :rtype: :class:`requests.Response`
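
        Example (an illustrative sketch, assuming ``s`` is an
        :class:`ArchiveSession` with sufficient privileges on the item;
        the identifier and command are arbitrary)::

            >>> r = s.submit_task('nasa', 'derive.php',
            ...                   comment='triggering a new derive')
            >>> r.raise_for_status()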
        """
        headers = dict() if not headers else headers
        if reduced_priority:
            headers.update({'X-Accept-Reduced-Priority': '1'})

        c = Catalog(self, request_kwargs)
        r = c.submit_task(identifier, cmd,
                          comment=comment,
                          priority=priority,
                          data=data,
                          headers=headers)
        return r

    def iter_history(self, identifier, params=None, request_kwargs=None):
        """A generator that yields completed tasks.

        :type identifier: str
        :param identifier: Item identifier.

        :type params: dict
        :param params: (optional) Query parameters, refer to
                       `Tasks API
                       <https://archive.org/services/docs/api/tasks.html>`_
                       for available parameters.

        :type request_kwargs: dict
        :param request_kwargs: (optional) Keyword arguments to be used in
                               :meth:`requests.sessions.Session.get` request.

        :rtype: collections.abc.Iterable[CatalogTask]
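
        Example (an illustrative sketch, assuming ``s`` is an
        :class:`ArchiveSession`; ``'nasa'`` is just a sample identifier)::

            >>> for task in s.iter_history('nasa'):
            ...     print(task)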
        """
        params = dict() if not params else params
        params.update(dict(identifier=identifier, catalog=0, summary=0, history=1))
        c = Catalog(self, request_kwargs)
        for j in c.iter_tasks(params):
            yield j

    def iter_catalog(self, identifier=None, params=None, request_kwargs=None):
        """A generator that yields queued or running tasks.

        :type identifier: str
        :param identifier: (optional) Item identifier.

        :type params: dict
        :param params: (optional) Query parameters, refer to
                       `Tasks API
                       <https://archive.org/services/docs/api/tasks.html>`_
                       for available parameters.

        :type request_kwargs: dict
        :param request_kwargs: (optional) Keyword arguments to be used in
                               :meth:`requests.sessions.Session.get` request.

        :rtype: collections.abc.Iterable[CatalogTask]
        """
        params = dict() if not params else params
        params.update(dict(identifier=identifier, catalog=1, summary=0, history=0))
        c = Catalog(self, request_kwargs)
        for j in c.iter_tasks(params):
            yield j

    def get_tasks_summary(self, identifier=None, params=None, request_kwargs=None):
        """Get the total counts of catalog tasks meeting all criteria,
        organized by run status (queued, running, error, and paused).

        :type identifier: str
        :param identifier: (optional) Item identifier.

        :type params: dict
        :param params: (optional) Query parameters, refer to
                       `Tasks API
                       <https://archive.org/services/docs/api/tasks.html>`_
                       for available parameters.

        :type request_kwargs: dict
        :param request_kwargs: (optional) Keyword arguments to be used in
                               :meth:`requests.sessions.Session.get` request.

        :rtype: dict
        """
        c = Catalog(self, request_kwargs)
        return c.get_summary(identifier=identifier, params=params)

    def get_tasks(self, identifier=None, params=None, request_kwargs=None):
        """Get a list of all tasks meeting all criteria.
        The list is ordered by submission time.

        :type identifier: str
        :param identifier: (optional) The item identifier. If provided,
                           only tasks for this item are returned,
                           filtered by the other criteria in ``params``.

        :type params: dict
        :param params: (optional) Query parameters, refer to
                       `Tasks API
                       <https://archive.org/services/docs/api/tasks.html>`_
                       for available parameters.

        :type request_kwargs: dict
        :param request_kwargs: (optional) Keyword arguments to be used in
                               :meth:`requests.sessions.Session.get` request.

        :rtype: List[CatalogTask]
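
        Example (an illustrative sketch, assuming ``s`` is an
        :class:`ArchiveSession`; returns both catalog and history tasks
        for the sample identifier)::

            >>> tasks = s.get_tasks('nasa')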
        """
        params = dict() if not params else params
        c = Catalog(self, request_kwargs)
        if 'history' not in params:
            params['history'] = 1
        if 'catalog' not in params:
            params['catalog'] = 1
        return c.get_tasks(identifier=identifier, params=params)

    def get_my_catalog(self, params=None, request_kwargs=None):
        """Get all of your queued or running tasks (i.e. tasks submitted
        by the user associated with this session).

        :type params: dict
        :param params: (optional) Query parameters, refer to
                       `Tasks API
                       <https://archive.org/services/docs/api/tasks.html>`_
                       for available parameters.

        :type request_kwargs: dict
        :param request_kwargs: (optional) Keyword arguments to be used in
                               :meth:`requests.sessions.Session.get` request.

        :rtype: List[CatalogTask]
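
        Example (an illustrative sketch, assuming ``s`` is an
        :class:`ArchiveSession` configured with a ``logged-in-user``
        cookie)::

            >>> my_tasks = s.get_my_catalog()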
        """
        params = dict() if not params else params
        _params = dict(submitter=self.user_email, catalog=1, history=0, summary=0)
        params.update(_params)
        return self.get_tasks(params=params, request_kwargs=request_kwargs)

    def get_task_log(self, task_id, request_kwargs=None):
        """Get a task log.

        :type task_id: str or int
        :param task_id: The task id for the task log you'd like to fetch.

        :type request_kwargs: dict
        :param request_kwargs: (optional) Keyword arguments that
                               :py:class:`requests.Request` takes.

        :rtype: str
        :returns: The task log as a string.
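
        Example (an illustrative sketch, assuming ``s`` is an
        :class:`ArchiveSession`; ``1234567`` is a made-up task id)::

            >>> log = s.get_task_log(1234567)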
        """
        return CatalogTask.get_task_log(task_id, self, request_kwargs)

    def send(self, request, **kwargs):
        # Catch urllib3 warnings for HTTPS related errors.
        insecure = False
        with warnings.catch_warnings(record=True) as w:
            warnings.filterwarnings('always')
            try:
                r = super(ArchiveSession, self).send(request, **kwargs)
            except Exception as e:
                try:
                    reraise_modify(e, e.request.url, prepend=False)
                except Exception:
                    logger.error(e)
                    raise e
            if self.protocol == 'http:':
                return r
            insecure_warnings = ['SNIMissingWarning', 'InsecurePlatformWarning']
            if w:
                for e in w:
                    if any(x in str(e) for x in insecure_warnings):
                        insecure = True
                        break
        if insecure:
            from requests.exceptions import RequestException
            msg = ('You are attempting to make an HTTPS request on an insecure platform,'
                   ' please see:\n\n\thttps://archive.org/services/docs/api'
                   '/internetarchive/troubleshooting.html#https-issues\n')
            raise RequestException(msg)
        return r