1# -*- coding: utf-8 -*- 2# 3# The internetarchive module is a Python/CLI interface to Archive.org. 4# 5# Copyright (C) 2012-2021 Internet Archive 6# 7# This program is free software: you can redistribute it and/or modify 8# it under the terms of the GNU Affero General Public License as 9# published by the Free Software Foundation, either version 3 of the 10# License, or (at your option) any later version. 11# 12# This program is distributed in the hope that it will be useful, 13# but WITHOUT ANY WARRANTY; without even the implied warranty of 14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15# GNU Affero General Public License for more details. 16# 17# You should have received a copy of the GNU Affero General Public License 18# along with this program. If not, see <http://www.gnu.org/licenses/>. 19 20""" 21internetarchive.session 22~~~~~~~~~~~~~~~~~~~~~~~ 23 24This module provides an ArchiveSession object to manage and persist 25settings across the internetarchive package. 26 27:copyright: (C) 2012-2021 by Internet Archive. 28:license: AGPL 3, see LICENSE for more details. 29""" 30 31from __future__ import absolute_import, unicode_literals 32 33import os 34import locale 35import sys 36import logging 37import platform 38import warnings 39try: 40 import ujson as json 41except ImportError: 42 import json 43 44import requests.sessions 45from requests.utils import default_headers 46from requests.adapters import HTTPAdapter 47from requests.packages.urllib3 import Retry 48from six.moves.urllib.parse import urlparse, unquote 49from requests.cookies import create_cookie 50 51from internetarchive import __version__, auth 52from internetarchive.config import get_config 53from internetarchive.item import Item, Collection 54from internetarchive.search import Search 55from internetarchive.catalog import Catalog, CatalogTask 56from internetarchive.utils import reraise_modify, parse_dict_cookies 57 58 59logger = logging.getLogger(__name__) 60 61 62class ArchiveSession(requests.sessions.Session): 63 """The :class:`ArchiveSession <internetarchive.ArchiveSession>` 64 object collects together useful functionality from `internetarchive` 65 as well as important data such as configuration information and 66 credentials. It is subclassed from 67 :class:`requests.Session <requests.Session>`. 68 69 Usage:: 70 71 >>> from internetarchive import ArchiveSession 72 >>> s = ArchiveSession() 73 >>> item = s.get_item('nasa') 74 Collection(identifier='nasa', exists=True) 75 """ 76 77 ITEM_MEDIATYPE_TABLE = { 78 'collection': Collection, 79 } 80 81 def __init__(self, 82 config=None, 83 config_file=None, 84 debug=None, 85 http_adapter_kwargs=None): 86 """Initialize :class:`ArchiveSession <ArchiveSession>` object with config. 87 88 :type config: dict 89 :param config: (optional) A config dict used for initializing the 90 :class:`ArchiveSession <ArchiveSession>` object. 91 92 :type config_file: str 93 :param config_file: (optional) Path to config file used for initializing the 94 :class:`ArchiveSession <ArchiveSession>` object. 95 96 :type http_adapter_kwargs: dict 97 :param http_adapter_kwargs: (optional) Keyword arguments used to initialize the 98 :class:`requests.adapters.HTTPAdapter <HTTPAdapter>` 99 object. 100 101 :returns: :class:`ArchiveSession` object. 102 """ 103 super(ArchiveSession, self).__init__() 104 http_adapter_kwargs = {} if not http_adapter_kwargs else http_adapter_kwargs 105 debug = False if not debug else True 106 107 self.config = get_config(config, config_file) 108 self.config_file = config_file 109 for ck, cv in self.config.get('cookies', {}).items(): 110 raw_cookie = '{}={}'.format(ck, cv) 111 cookie_dict = parse_dict_cookies(raw_cookie) 112 if not cookie_dict.get(ck): 113 continue 114 cookie = create_cookie(ck, cookie_dict[ck], 115 domain=cookie_dict.get('domain'), 116 path=cookie_dict.get('path')) 117 self.cookies.set_cookie(cookie) 118 119 self.secure = self.config.get('general', {}).get('secure', True) 120 self.host = self.config.get('general', {}).get('host', 'archive.org') 121 if 'archive.org' not in self.host: 122 self.host += '.archive.org' 123 self.protocol = 'https:' if self.secure else 'http:' 124 user_email = self.config.get('cookies', dict()).get('logged-in-user') 125 if user_email: 126 user_email = user_email.split(';')[0] 127 user_email = unquote(user_email) 128 self.user_email = user_email 129 self.access_key = self.config.get('s3', {}).get('access') 130 self.secret_key = self.config.get('s3', {}).get('secret') 131 self.http_adapter_kwargs = http_adapter_kwargs 132 133 self.headers = default_headers() 134 self.headers.update({'User-Agent': self._get_user_agent_string()}) 135 self.headers.update({'Connection': 'close'}) 136 137 self.mount_http_adapter() 138 139 logging_config = self.config.get('logging', {}) 140 if logging_config.get('level'): 141 self.set_file_logger(logging_config.get('level', 'NOTSET'), 142 logging_config.get('file', 'internetarchive.log')) 143 if debug or (logger.level <= 10): 144 self.set_file_logger(logging_config.get('level', 'NOTSET'), 145 logging_config.get('file', 'internetarchive.log'), 146 'urllib3') 147 148 def _get_user_agent_string(self): 149 """Generate a User-Agent string to be sent with every request.""" 150 uname = platform.uname() 151 try: 152 lang = locale.getlocale()[0][:2] 153 except: 154 lang = '' 155 py_version = '{0}.{1}.{2}'.format(*sys.version_info) 156 return 'internetarchive/{0} ({1} {2}; N; {3}; {4}) Python/{5}'.format( 157 __version__, uname[0], uname[-1], lang, self.access_key, py_version) 158 159 def rebuild_auth(self, prepared_request, response): 160 """Never rebuild auth for archive.org URLs. 161 """ 162 u = urlparse(prepared_request.url) 163 if u.netloc.endswith('archive.org'): 164 return 165 super(ArchiveSession, self).rebuild_auth(prepared_request, response) 166 167 def mount_http_adapter(self, protocol=None, max_retries=None, 168 status_forcelist=None, host=None): 169 """Mount an HTTP adapter to the 170 :class:`ArchiveSession <ArchiveSession>` object. 171 172 :type protocol: str 173 :param protocol: HTTP protocol to mount your adapter to (e.g. 'https://'). 174 175 :type max_retries: int, object 176 :param max_retries: The number of times to retry a failed request. 177 This can also be an `urllib3.Retry` object. 178 179 :type status_forcelist: list 180 :param status_forcelist: A list of status codes (as int's) to retry on. 181 182 :type host: str 183 :param host: The host to mount your adapter to. 184 """ 185 protocol = protocol if protocol else self.protocol 186 host = host if host else 'archive.org' 187 if max_retries is None: 188 max_retries = self.http_adapter_kwargs.get('max_retries', 3) 189 190 if not status_forcelist: 191 status_forcelist = [500, 501, 502, 503, 504] 192 if max_retries and isinstance(max_retries, (int, float)): 193 max_retries = Retry(total=max_retries, 194 connect=max_retries, 195 read=max_retries, 196 redirect=False, 197 allowed_methods=Retry.DEFAULT_ALLOWED_METHODS, 198 status_forcelist=status_forcelist, 199 backoff_factor=1) 200 self.http_adapter_kwargs['max_retries'] = max_retries 201 max_retries_adapter = HTTPAdapter(**self.http_adapter_kwargs) 202 # Don't mount on s3.us.archive.org, only archive.org! 203 # IA-S3 requires a more complicated retry workflow. 204 self.mount('{0}//{1}'.format(protocol, host), max_retries_adapter) 205 206 def set_file_logger(self, log_level, path, logger_name='internetarchive'): 207 """Convenience function to quickly configure any level of 208 logging to a file. 209 210 :type log_level: str 211 :param log_level: A log level as specified in the `logging` module. 212 213 :type path: string 214 :param path: Path to the log file. The file will be created if it doesn't already 215 exist. 216 217 :type logger_name: str 218 :param logger_name: (optional) The name of the logger. 219 """ 220 _log_level = { 221 'CRITICAL': 50, 222 'ERROR': 40, 223 'WARNING': 30, 224 'INFO': 20, 225 'DEBUG': 10, 226 'NOTSET': 0, 227 } 228 229 log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' 230 231 _log = logging.getLogger(logger_name) 232 _log.setLevel(logging.DEBUG) 233 234 fh = logging.FileHandler(path, encoding='utf-8') 235 fh.setLevel(_log_level[log_level]) 236 237 formatter = logging.Formatter(log_format) 238 fh.setFormatter(formatter) 239 240 _log.addHandler(fh) 241 242 def get_item(self, identifier, item_metadata=None, request_kwargs=None): 243 """A method for creating :class:`internetarchive.Item <Item>` and 244 :class:`internetarchive.Collection <Collection>` objects. 245 246 :type identifier: str 247 :param identifier: A globally unique Archive.org identifier. 248 249 :type item_metadata: dict 250 :param item_metadata: (optional) A metadata dict used to initialize the Item or 251 Collection object. Metadata will automatically be retrieved 252 from Archive.org if nothing is provided. 253 254 :type request_kwargs: dict 255 :param request_kwargs: (optional) Keyword arguments to be used in 256 :meth:`requests.sessions.Session.get` request. 257 """ 258 request_kwargs = {} if not request_kwargs else request_kwargs 259 if not item_metadata: 260 logger.debug('no metadata provided for "{0}", ' 261 'retrieving now.'.format(identifier)) 262 item_metadata = self.get_metadata(identifier, request_kwargs) 263 mediatype = item_metadata.get('metadata', {}).get('mediatype') 264 try: 265 item_class = self.ITEM_MEDIATYPE_TABLE.get(mediatype, Item) 266 except TypeError: 267 item_class = Item 268 return item_class(self, identifier, item_metadata) 269 270 def get_metadata(self, identifier, request_kwargs=None): 271 """Get an item's metadata from the `Metadata API 272 <http://blog.archive.org/2013/07/04/metadata-api/>`__ 273 274 :type identifier: str 275 :param identifier: Globally unique Archive.org identifier. 276 277 :rtype: dict 278 :returns: Metadat API response. 279 """ 280 request_kwargs = {} if not request_kwargs else request_kwargs 281 url = '{0}//{1}/metadata/{2}'.format(self.protocol, self.host, identifier) 282 if 'timeout' not in request_kwargs: 283 request_kwargs['timeout'] = 12 284 try: 285 if self.access_key and self.secret_key: 286 s3_auth = auth.S3Auth(self.access_key, self.secret_key) 287 else: 288 s3_auth = None 289 resp = self.get(url, auth=s3_auth, **request_kwargs) 290 resp.raise_for_status() 291 except Exception as exc: 292 error_msg = 'Error retrieving metadata from {0}, {1}'.format(url, exc) 293 logger.error(error_msg) 294 raise type(exc)(error_msg) 295 return resp.json() 296 297 def search_items(self, query, 298 fields=None, 299 sorts=None, 300 params=None, 301 full_text_search=None, 302 dsl_fts=None, 303 request_kwargs=None, 304 max_retries=None): 305 """Search for items on Archive.org. 306 307 :type query: str 308 :param query: The Archive.org search query to yield results for. Refer to 309 https://archive.org/advancedsearch.php#raw for help formatting your 310 query. 311 312 :type fields: bool 313 :param fields: (optional) The metadata fields to return in the search results. 314 315 :type params: dict 316 :param params: (optional) The URL parameters to send with each request sent to the 317 Archive.org Advancedsearch Api. 318 319 :type full_text_search: bool 320 :param full_text_search: (optional) Beta support for querying the archive.org 321 Full Text Search API [default: False]. 322 323 :type dsl_fts: bool 324 :param dsl_fts: (optional) Beta support for querying the archive.org Full Text 325 Search API in dsl (i.e. do not prepend ``!L `` to the 326 ``full_text_search`` query [default: False]. 327 328 :returns: A :class:`Search` object, yielding search results. 329 """ 330 request_kwargs = {} if not request_kwargs else request_kwargs 331 return Search(self, query, 332 fields=fields, 333 sorts=sorts, 334 params=params, 335 full_text_search=full_text_search, 336 dsl_fts=dsl_fts, 337 request_kwargs=request_kwargs, 338 max_retries=max_retries) 339 340 def s3_is_overloaded(self, identifier=None, access_key=None, request_kwargs=None): 341 request_kwargs = {} if not request_kwargs else request_kwargs 342 if 'timeout' not in request_kwargs: 343 request_kwargs['timeout'] = 12 344 345 u = '{protocol}//s3.us.archive.org'.format(protocol=self.protocol) 346 p = dict( 347 check_limit=1, 348 accesskey=access_key, 349 bucket=identifier, 350 ) 351 try: 352 r = self.get(u, params=p, **request_kwargs) 353 except: 354 return True 355 try: 356 j = r.json() 357 except ValueError: 358 return True 359 if j.get('over_limit') == 0: 360 return False 361 else: 362 return True 363 364 def get_tasks_api_rate_limit(self, cmd='derive.php', request_kwargs=None): 365 c = Catalog(self, request_kwargs) 366 r = c.get_rate_limit(cmd=cmd) 367 return r 368 369 def submit_task(self, identifier, cmd, comment=None, priority=None, data=None, 370 headers=None, reduced_priority=None, request_kwargs=None): 371 """Submit an archive.org task. 372 373 :type identifier: str 374 :param identifier: Item identifier. 375 376 :type cmd: str 377 :param cmd: Task command to submit, see 378 `supported task commands 379 <https://archive.org/services/docs/api/tasks.html#supported-tasks>`_. 380 381 :type comment: str 382 :param comment: (optional) A reasonable explanation for why the 383 task is being submitted. 384 385 :type priority: int 386 :param priority: (optional) Task priority from 10 to -10 387 (default: 0). 388 389 :type data: dict 390 :param data: (optional) Extra POST data to submit with 391 the request. Refer to `Tasks API Request Entity 392 <https://archive.org/services/docs/api/tasks.html#request-entity>`_. 393 394 :type headers: dict 395 :param headers: (optional) Add additional headers to request. 396 397 :type reduced_priority: bool 398 :param reduced_priority: (optional) Submit your derive at a lower priority. 399 This option is helpful to get around rate-limiting. 400 Your task will more likey be accepted, but it might 401 not run for a long time. Note that you still may be 402 subject to rate-limiting. This is different than 403 ``priority`` in that it will allow you to possibly 404 avoid rate-limiting. 405 406 :type request_kwargs: dict 407 :param request_kwargs: (optional) Keyword arguments to be used in 408 :meth:`requests.sessions.Session.post` request. 409 410 :rtype: :class:`requests.Response` 411 """ 412 headers = dict() if not headers else headers 413 if reduced_priority is not None: 414 headers.update({'X-Accept-Reduced-Priority': '1'}) 415 416 c = Catalog(self, request_kwargs) 417 r = c.submit_task(identifier, cmd, 418 comment=comment, 419 priority=priority, 420 data=data, 421 headers=headers) 422 return r 423 424 def iter_history(self, identifier, params=None, request_kwargs=None): 425 """A generator that returns completed tasks. 426 427 :type identifier: str 428 :param identifier: (optional) Item identifier. 429 430 :type params: dict 431 :param params: (optional) Query parameters, refer to 432 `Tasks API 433 <https://archive.org/services/docs/api/tasks.html>`_ 434 for available parameters. 435 436 :type request_kwargs: dict 437 :param request_kwargs: (optional) Keyword arguments to be used in 438 :meth:`requests.sessions.Session.get` request. 439 440 :rtype: collections.Iterable[CatalogTask] 441 """ 442 params = dict() if not params else params 443 params.update(dict(identifier=identifier, catalog=0, summary=0, history=1)) 444 c = Catalog(self, request_kwargs) 445 for j in c.iter_tasks(params): 446 yield j 447 448 def iter_catalog(self, identifier=None, params=None, request_kwargs=None): 449 """A generator that returns queued or running tasks. 450 451 :type identifier: str 452 :param identifier: (optional) Item identifier. 453 454 :type params: dict 455 :param params: (optional) Query parameters, refer to 456 `Tasks API 457 <https://archive.org/services/docs/api/tasks.html>`_ 458 for available parameters. 459 460 :type request_kwargs: dict 461 :param request_kwargs: (optional) Keyword arguments to be used in 462 :meth:`requests.sessions.Session.get` request. 463 464 :rtype: collections.Iterable[CatalogTask] 465 """ 466 params = dict() if not params else params 467 params.update(dict(identifier=identifier, catalog=1, summary=0, history=0)) 468 c = Catalog(self, request_kwargs) 469 for j in c.iter_tasks(params): 470 yield j 471 472 def get_tasks_summary(self, identifier=None, params=None, request_kwargs=None): 473 """Get the total counts of catalog tasks meeting all criteria, 474 organized by run status (queued, running, error, and paused). 475 476 :type identifier: str 477 :param identifier: (optional) Item identifier. 478 479 :type params: dict 480 :param params: (optional) Query parameters, refer to 481 `Tasks API 482 <https://archive.org/services/docs/api/tasks.html>`_ 483 for available parameters. 484 485 :type request_kwargs: dict 486 :param request_kwargs: (optional) Keyword arguments to be used in 487 :meth:`requests.sessions.Session.get` request. 488 489 :rtype: dict 490 """ 491 c = Catalog(self, request_kwargs) 492 return c.get_summary(identifier=identifier, params=params) 493 494 def get_tasks(self, identifier=None, params=None, request_kwargs=None): 495 """Get a list of all tasks meeting all criteria. 496 The list is ordered by submission time. 497 498 :type identifier: str 499 :param identifier: (optional) The item identifier, if provided 500 will return tasks for only this item filtered by 501 other criteria provided in params. 502 503 :type params: dict 504 :param params: (optional) Query parameters, refer to 505 `Tasks API 506 <https://archive.org/services/docs/api/tasks.html>`_ 507 for available parameters. 508 509 :type request_kwargs: dict 510 :param request_kwargs: (optional) Keyword arguments to be used in 511 :meth:`requests.sessions.Session.get` request. 512 513 :rtype: List[CatalogTask] 514 """ 515 params = dict() if not params else params 516 c = Catalog(self, request_kwargs) 517 if 'history' not in params: 518 params['history'] = 1 519 if 'catalog' not in params: 520 params['catalog'] = 1 521 return c.get_tasks(identifier=identifier, params=params) 522 523 def get_my_catalog(self, params=None, request_kwargs=None): 524 """Get all queued or running tasks. 525 526 :type params: dict 527 :param params: (optional) Query parameters, refer to 528 `Tasks API 529 <https://archive.org/services/docs/api/tasks.html>`_ 530 for available parameters. 531 532 :type request_kwargs: dict 533 :param request_kwargs: (optional) Keyword arguments to be used in 534 :meth:`requests.sessions.Session.get` request. 535 536 :rtype: List[CatalogTask] 537 """ 538 params = dict() if not params else params 539 _params = dict(submitter=self.user_email, catalog=1, history=0, summary=0) 540 params.update(_params) 541 return self.get_tasks(params=params, request_kwargs=request_kwargs) 542 543 def get_task_log(self, task_id, request_kwargs=None): 544 """Get a task log. 545 546 :type task_id: str or int 547 :param task_id: The task id for the task log you'd like to fetch. 548 549 :type request_kwargs: dict 550 :param request_kwargs: (optional) Keyword arguments that 551 :py:class:`requests.Request` takes. 552 553 :rtype: str 554 :returns: The task log as a string. 555 """ 556 return CatalogTask.get_task_log(task_id, self, request_kwargs) 557 558 def send(self, request, **kwargs): 559 # Catch urllib3 warnings for HTTPS related errors. 560 insecure = False 561 with warnings.catch_warnings(record=True) as w: 562 warnings.filterwarnings('always') 563 try: 564 r = super(ArchiveSession, self).send(request, **kwargs) 565 except Exception as e: 566 try: 567 reraise_modify(e, e.request.url, prepend=False) 568 except: 569 logger.error(e) 570 raise e 571 if self.protocol == 'http:': 572 return r 573 insecure_warnings = ['SNIMissingWarning', 'InsecurePlatformWarning'] 574 if w: 575 for e in w: 576 if any(x in str(e) for x in insecure_warnings): 577 insecure = True 578 break 579 if insecure: 580 from requests.exceptions import RequestException 581 msg = ('You are attempting to make an HTTPS request on an insecure platform,' 582 ' please see:\n\n\thttps://archive.org/services/docs/api' 583 '/internetarchive/troubleshooting.html#https-issues\n') 584 raise RequestException(msg) 585 return r 586