1"""Interface to Mediawiki's api.php.""" 2# 3# (C) Pywikibot team, 2007-2021 4# 5# Distributed under the terms of the MIT license. 6# 7import datetime 8import hashlib 9import inspect 10import os 11import pickle 12import pprint 13import re 14import traceback 15from collections.abc import Container, MutableMapping, Sized 16from contextlib import suppress 17from email.generator import BytesGenerator 18from email.mime.multipart import MIMEMultipart as MIMEMultipartOrig 19from email.mime.nonmultipart import MIMENonMultipart 20from inspect import getfullargspec 21from io import BytesIO 22from typing import Optional, Union 23from urllib.parse import unquote, urlencode 24from warnings import warn 25 26import pywikibot 27from pywikibot import config, login 28from pywikibot.backports import Dict, Tuple, removeprefix 29from pywikibot.comms import http 30from pywikibot.exceptions import ( 31 Error, 32 FatalServerError, 33 InvalidTitleError, 34 MaxlagTimeoutError, 35 NoUsernameError, 36 Server414Error, 37 Server504Error, 38 SiteDefinitionError, 39 TimeoutError, 40 UnsupportedPageError, 41) 42from pywikibot.family import SubdomainFamily 43from pywikibot.login import LoginStatus 44from pywikibot.textlib import removeHTMLParts 45from pywikibot.tools import PYTHON_VERSION, ModuleDeprecationWrapper, itergroup 46from pywikibot.tools.formatter import color_format 47 48 49_logger = 'data.api' 50 51lagpattern = re.compile( 52 r'Waiting for [\w.: ]+: (?P<lag>\d+(?:\.\d+)?) seconds? lagged') 53 54 55def _invalidate_superior_cookies(family): 56 """ 57 Clear cookies for site's second level domain. 58 59 get_login_token() will generate new cookies needed. 60 This is a workaround for requests bug, see T224712 61 and https://github.com/psf/requests/issues/5411 62 for more details. 63 """ 64 if isinstance(family, SubdomainFamily): 65 for cookie in http.cookie_jar: 66 if family.domain == cookie.domain: 67 http.cookie_jar.clear(cookie.domain, cookie.path, cookie.name) 68 69 70# Bug: T113120, T228841 71# Subclassing necessary to fix bug of the email package in Python 3: 72# see https://bugs.python.org/issue19003 73# see https://bugs.python.org/issue18886 74# The following solution might be removed if the bug is fixed for 75# Python versions which are supported by PWB, probably with Python 3.5 76 77class CTEBinaryBytesGenerator(BytesGenerator): 78 79 """Workaround for bug in python 3 email handling of CTE binary.""" 80 81 def __init__(self, *args, **kwargs): 82 """Initializer.""" 83 super().__init__(*args, **kwargs) 84 self._writeBody = self._write_body 85 86 def _write_body(self, msg): 87 if msg['content-transfer-encoding'] == 'binary': 88 self._fp.write(msg.get_payload(decode=True)) 89 else: 90 super()._handle_text(msg) 91 92 93class CTEBinaryMIMEMultipart(MIMEMultipartOrig): 94 95 """Workaround for bug in python 3 email handling of CTE binary.""" 96 97 def as_bytes(self, unixfrom=False, policy=None): 98 """Return unmodified binary payload.""" 99 policy = self.policy if policy is None else policy 100 fp = BytesIO() 101 g = CTEBinaryBytesGenerator(fp, mangle_from_=False, policy=policy) 102 g.flatten(self, unixfrom=unixfrom) 103 return fp.getvalue() 104 105 106MIMEMultipart = CTEBinaryMIMEMultipart 107 108 109class ParamInfo(Sized, Container): 110 111 """ 112 API parameter information data object. 113 114 Provides cache aware fetching of parameter information. 115 116 It does not support the format modules. 117 """ 118 119 paraminfo_keys = frozenset(['modules', 'querymodules', 'formatmodules', 120 'mainmodule', 'pagesetmodule']) 121 122 root_modules = frozenset(['main', 'pageset']) 123 124 init_modules = frozenset(['main', 'paraminfo']) 125 126 def __init__(self, site, preloaded_modules=None, modules_only_mode=None): 127 """ 128 Initializer. 129 130 :param preloaded_modules: API modules to preload 131 :type preloaded_modules: set of string 132 :param modules_only_mode: use the 'modules' only syntax for API request 133 :type modules_only_mode: bool or None to only use default, which True 134 if the site is 1.25wmf4+ 135 """ 136 self.site = site 137 138 # Keys are module names, values are the raw responses from the server. 139 self._paraminfo = {} 140 141 # Cached data. 142 self._prefixes = {} 143 self._prefix_map = {} 144 self._with_limits = None 145 146 self._action_modules = frozenset() # top level modules 147 self._modules = {} # filled in _init() (and enlarged in fetch) 148 self._limit = None 149 150 self.preloaded_modules = self.init_modules 151 if preloaded_modules: 152 self.preloaded_modules |= set(preloaded_modules) 153 154 self.modules_only_mode = modules_only_mode 155 if self.modules_only_mode: 156 self.paraminfo_keys = frozenset(['modules']) 157 158 def _add_submodules(self, name, modules): 159 """Add the modules to the internal cache or check if equal.""" 160 # The current implementation here doesn't support submodules inside of 161 # submodules, because that would require to fetch all modules when only 162 # the names of them were requested 163 assert '+' not in name 164 modules = frozenset(modules) 165 if name == 'main': 166 # The main module behaves differently as it has no prefix 167 if self._action_modules: 168 assert modules == self._action_modules 169 else: 170 self._action_modules = modules 171 elif name in self._modules: 172 assert modules == self._modules[name] 173 else: 174 self._modules[name] = modules 175 176 def _init(self): 177 assert ('query' in self._modules) is ('main' in self._paraminfo) 178 if 'query' in self._modules: 179 return 180 mw_ver = self.site.mw_version 181 182 # The paraminfo api deprecated the old request syntax of 183 # querymodules='info'; to avoid warnings sites with 1.25wmf4+ 184 # must only use 'modules' parameter. 185 if self.modules_only_mode is None: 186 self.modules_only_mode = mw_ver >= '1.25wmf4' 187 if self.modules_only_mode: 188 self.paraminfo_keys = frozenset(['modules']) 189 190 # Assume that by v1.26, it will be desirable to prefetch 'query' 191 if mw_ver > '1.26': 192 self.preloaded_modules |= {'query'} 193 194 self._fetch(self.preloaded_modules) 195 196 main_modules_param = self.parameter('main', 'action') 197 assert main_modules_param 198 assert 'type' in main_modules_param 199 assert isinstance(main_modules_param['type'], list) 200 assert self._action_modules == set(main_modules_param['type']) 201 202 # While deprecated with warning in 1.25, paraminfo param 'querymodules' 203 # provides a list of all query modules. This will likely be removed 204 # from the API in the future, in which case the fallback is the use 205 # the same data available in the paraminfo for query. 206 query_modules_param = self.parameter('paraminfo', 'querymodules') 207 208 if 'limit' not in query_modules_param: 209 raise RuntimeError('"limit" not found in query modules') 210 self._limit = query_modules_param['limit'] 211 212 if query_modules_param and 'type' in query_modules_param: 213 # 'type' is the list of modules 214 self._add_submodules('query', query_modules_param['type']) 215 216 if 'query' not in self._modules: 217 assert 'query' not in self._paraminfo 218 self._fetch({'query'}) 219 assert 'query' in self._modules 220 221 def _emulate_pageset(self): 222 """Emulate the pageset module, which existed until MW 1.24.""" 223 # pageset isn't a module in the new system, so it is emulated, with 224 # the paraminfo from the query module. 225 assert('query' in self._paraminfo) 226 227 self._paraminfo['pageset'] = { 228 'name': 'pageset', 229 'path': 'pageset', 230 'classname': 'ApiPageSet', 231 'prefix': '', 232 'readrights': '', 233 'helpurls': [], 234 'parameters': self._paraminfo['query']['parameters'] 235 } 236 237 @staticmethod 238 def _modules_to_set(modules) -> set: 239 """Return modules as a set. 240 241 :type modules: iterable or str 242 """ 243 if isinstance(modules, str): 244 return set(modules.split('|')) 245 return set(modules) 246 247 def fetch(self, modules) -> None: 248 """ 249 Fetch paraminfo for multiple modules. 250 251 No exception is raised when paraminfo for a module does not exist. 252 Use __getitem__ to cause an exception if a module does not exist. 253 254 :param modules: API modules to load 255 :type modules: iterable or str 256 """ 257 if 'main' not in self._paraminfo: 258 # The first request should be 'paraminfo', so that 259 # query modules can be prefixed with 'query+' 260 self._init() 261 262 modules = self._modules_to_set(modules) 263 264 if self._action_modules: 265 # The query module may be added before the action modules have been 266 if 'query' in self._modules: 267 # It does fetch() while initializing, and this method can't be 268 # called before it's initialized. 269 modules = self._normalize_modules(modules) 270 else: 271 # We do know the valid action modules and require a subset 272 assert not modules - self._action_modules - self.root_modules 273 274 self._fetch(modules) 275 276 def _fetch(self, modules: Union[set, frozenset]) -> None: 277 """ 278 Fetch paraminfo for multiple modules without initializing beforehand. 279 280 :param modules: API modules to load and which haven't been loaded yet. 281 """ 282 def module_generator(): 283 """A generator yielding batches of modules.""" 284 i = itergroup(sorted(modules), self._limit) 285 for batch in i: 286 for failed_module in failed_modules: 287 yield [failed_module] 288 del failed_modules[:] 289 yield batch 290 291 modules = modules - set(self._paraminfo.keys()) 292 if not modules: 293 return 294 295 assert 'query' in self._modules or 'paraminfo' not in self._paraminfo 296 297 # If something went wrong in a batch it can add each module to the 298 # batch and the generator will on the next iteration yield each module 299 # separately 300 failed_modules = [] 301 302 # This can be further optimised, by grouping them in more stable 303 # subsets, which are unlikely to change. i.e. first request core 304 # modules which have been a stable part of the API for a long time. 305 # Also detecting extension based modules may help. 306 # Also, when self.modules_only_mode is disabled, both modules and 307 # querymodules may each be filled with self._limit items, doubling the 308 # number of modules that may be processed in a single batch. 309 for module_batch in module_generator(): 310 if self.modules_only_mode and 'pageset' in module_batch: 311 pywikibot.debug('paraminfo fetch: removed pageset', _logger) 312 module_batch.remove('pageset') 313 # If this occurred during initialisation, 314 # also record it in the preloaded_modules. 315 # (at least so tests know an extra load was intentional) 316 if 'query' not in self._paraminfo: 317 pywikibot.debug('paraminfo batch: added query', _logger) 318 module_batch.append('query') 319 self.preloaded_modules |= {'query'} 320 321 params = { 322 'action': 'paraminfo', 323 } 324 325 if self.modules_only_mode: 326 params['modules'] = module_batch 327 else: 328 params['modules'] = [mod for mod in module_batch 329 if not mod.startswith('query+') 330 and mod not in self.root_modules] 331 params['querymodules'] = [mod[6:] for mod in module_batch 332 if mod.startswith('query+')] 333 334 for mod in set(module_batch) & self.root_modules: 335 params[mod + 'module'] = 1 336 337 # Request need ParamInfo to determine use_get 338 request = self.site._request(expiry=config.API_config_expiry, 339 use_get=True, 340 parameters=params) 341 result = request.submit() 342 343 normalized_result = self.normalize_paraminfo(result) 344 for path in list(normalized_result): 345 if normalized_result[path] is False: 346 del normalized_result[path] 347 348 # Sometimes the name/path of the module is not actually the name 349 # which was requested, so we need to manually determine which 350 # (wrongly named) module uses which actual name. See also T105478 351 missing_modules = [m for m in module_batch 352 if m not in normalized_result] 353 if len(missing_modules) == 1 and len(normalized_result) == 1: 354 # Okay it's possible to recover 355 normalized_result = next(iter(normalized_result.values())) 356 pywikibot.warning('The module "{0[name]}" ("{0[path]}") ' 357 'was returned as path even though "{1}" ' 358 'was requested'.format(normalized_result, 359 missing_modules[0])) 360 normalized_result['path'] = missing_modules[0] 361 normalized_result['name'] = missing_modules[0].rsplit('+')[0] 362 normalized_result = {missing_modules[0]: normalized_result} 363 elif len(module_batch) > 1 and missing_modules: 364 # Rerequest the missing ones separately 365 pywikibot.log('Inconsistency in batch "{}"; rerequest ' 366 'separately'.format(missing_modules)) 367 failed_modules.extend(missing_modules) 368 369 # Remove all modules which weren't requested, we can't be sure that 370 # they are valid 371 for path in list(normalized_result): 372 if path not in module_batch: 373 del normalized_result[path] 374 375 self._paraminfo.update(normalized_result) 376 self._generate_submodules(mod['path'] 377 for mod in normalized_result.values()) 378 379 if 'pageset' in modules and 'pageset' not in self._paraminfo: 380 self._emulate_pageset() 381 382 def _generate_submodules(self, modules): 383 """Check and generate submodules for the given modules.""" 384 for module in modules: 385 parameters = self._paraminfo[module].get('parameters', []) 386 submodules = set() 387 # Advanced submodule into added to MW API in df80f1ea 388 if self.site.mw_version >= '1.26wmf9': 389 # This is supplying submodules even if they aren't submodules 390 # of the given module so skip those 391 for param in parameters: 392 if ((module == 'main' and param['name'] == 'format') 393 or 'submodules' not in param): 394 continue 395 for submodule in param['submodules'].values(): 396 if '+' in submodule: 397 parent, child = submodule.rsplit('+', 1) 398 else: 399 parent = 'main' 400 child = submodule 401 if parent == module: 402 submodules.add(child) 403 else: 404 # Boolean submodule info added to MW API in afa153ae 405 if self.site.mw_version < '1.24wmf18': 406 if module == 'main': 407 params = {'action'} 408 elif module == 'query': 409 params = {'prop', 'list', 'meta'} 410 else: 411 params = set() 412 for param in parameters: 413 if param['name'] in params: 414 param['submodules'] = '' 415 416 for param in parameters: 417 # Do not add format modules 418 if ('submodules' in param 419 and (module != 'main' 420 or param['name'] != 'format')): 421 submodules |= set(param['type']) 422 423 if submodules: 424 self._add_submodules(module, submodules) 425 if module == 'query': 426 # Previously also modules from generator were used as query 427 # modules, but verify that those are just a subset of the 428 # prop/list/meta modules. There is no sanity check as this 429 # needs to be revisited if query has no generator parameter 430 for param in parameters: 431 if param['name'] == 'generator': 432 break 433 else: 434 param = {} 435 assert param['name'] == 'generator' \ 436 and submodules >= set(param['type']) 437 438 def _normalize_modules(self, modules) -> set: 439 """Add query+ to any query module name not also in action modules.""" 440 # Users will supply the wrong type, and expect it to work. 441 modules = self._modules_to_set(modules) 442 443 assert self._action_modules 444 445 return {'query+' + mod 446 if '+' not in mod and mod in self.query_modules 447 and mod not in self._action_modules 448 else mod 449 for mod in modules} 450 451 def normalize_modules(self, modules) -> set: 452 """ 453 Convert the modules into module paths. 454 455 Add query+ to any query module name not also in action modules. 456 457 :return: The modules converted into a module paths 458 """ 459 self._init() 460 return self._normalize_modules(modules) 461 462 @classmethod 463 def normalize_paraminfo(cls, data): 464 """ 465 Convert both old and new API JSON into a new-ish data structure. 466 467 For duplicate paths, the value will be False. 468 """ 469 result_data = {} 470 for paraminfo_key, modules_data in data['paraminfo'].items(): 471 if not modules_data: 472 continue 473 474 if paraminfo_key[:-len('module')] in cls.root_modules: 475 modules_data = [modules_data] 476 elif not paraminfo_key.endswith('modules'): 477 continue 478 479 for mod_data in modules_data: 480 if 'missing' in mod_data: 481 continue 482 483 name = mod_data.get('name') 484 php_class = mod_data.get('classname') 485 486 if not name and php_class: 487 if php_class == 'ApiMain': 488 name = 'main' 489 elif php_class == 'ApiPageSet': 490 name = 'pageset' 491 else: 492 pywikibot.warning('Unknown paraminfo module "{}"' 493 .format(php_class)) 494 name = '<unknown>:' + php_class 495 496 mod_data['name'] = name 497 498 if 'path' not in mod_data: 499 # query modules often contain 'ApiQuery' and have a suffix. 500 # 'ApiQuery' alone is the action 'query' 501 if ('querytype' in mod_data 502 or php_class and len(php_class) > 8 503 and 'ApiQuery' in php_class): 504 mod_data['path'] = 'query+' + name 505 else: 506 mod_data['path'] = name 507 508 path = mod_data['path'] 509 510 if path in result_data: 511 # Only warn first time 512 if result_data[path] is not False: 513 pywikibot.warning('Path "{}" is ambiguous.' 514 .format(path)) 515 else: 516 pywikibot.log('Found another path "{}"'.format(path)) 517 result_data[path] = False 518 else: 519 result_data[path] = mod_data 520 521 return result_data 522 523 def __getitem__(self, key): 524 """ 525 Return a paraminfo module for the module path, caching it. 526 527 Use the module path, such as 'query+x', to obtain the paraminfo for 528 submodule 'x' in the query module. 529 530 If the key does not include a '+' and is not present in the top level 531 of the API, it will fallback to looking for the key 'query+x'. 532 """ 533 self.fetch({key}) 534 if key in self._paraminfo: 535 return self._paraminfo[key] 536 if '+' not in key: 537 return self._paraminfo['query+' + key] 538 raise KeyError(key) 539 540 def __contains__(self, key) -> bool: 541 """Return whether the key is valid.""" 542 try: 543 self[key] 544 return True 545 except KeyError: 546 return False 547 548 def __len__(self) -> int: 549 """Return number of cached modules.""" 550 return len(self._paraminfo) 551 552 def parameter(self, module: str, param_name: str) -> Optional[dict]: 553 """ 554 Get details about one modules parameter. 555 556 Returns None if the parameter does not exist. 557 558 :param module: API module name 559 :param param_name: parameter name in the module 560 :return: metadata that describes how the parameter may be used 561 """ 562 # TODO: the 'description' field of each parameter is not in the default 563 # output of v1.25, and can't removed from previous API versions. 564 # There should be an option to remove this verbose data from the cached 565 # version, for earlier versions of the API, and/or extract any useful 566 # data and discard the entire received paraminfo structure. There are 567 # also params which are common to many modules, such as those provided 568 # by the ApiPageSet php class: titles, pageids, redirects, etc. 569 try: 570 module = self[module] 571 except KeyError: 572 raise ValueError("paraminfo for '{}' not loaded".format(module)) 573 574 try: 575 params = module['parameters'] 576 except KeyError: 577 pywikibot.warning("module '{}' has no parameters".format(module)) 578 return None 579 580 param_data = [param for param in params 581 if param['name'] == param_name] 582 583 if not param_data: 584 return None 585 586 if len(param_data) != 1: 587 raise RuntimeError( 588 'parameter data length is eiter empty or not unique.\n{}' 589 .format(param_data)) 590 return param_data[0] 591 592 @property 593 def module_paths(self): 594 """Set of all modules using their paths.""" 595 return self._module_set(True) 596 597 # As soon as modules() is removed, module_paths and _module_set can be 598 # combined, so don't add any code between these two methods. 599 def _module_set(self, path): 600 # Load the submodules of all action modules available 601 self.fetch(self.action_modules) 602 modules = set(self.action_modules) 603 for parent_module in self._modules: 604 submodules = self.submodules(parent_module, path) 605 assert not submodules & modules or not path 606 modules |= submodules 607 return modules 608 609 @property 610 def action_modules(self): 611 """Set of all action modules.""" 612 self._init() 613 return self._action_modules 614 615 @property 616 def query_modules(self): 617 """Set of all query module names without query+ path prefix.""" 618 return self.submodules('query') 619 620 def submodules(self, name: str, path: bool = False) -> set: 621 """ 622 Set of all submodules. 623 624 :param name: The name of the parent module. 625 :param path: Whether the path and not the name is returned. 626 :return: The names or paths of the submodules. 627 """ 628 if name not in self._modules: 629 self.fetch([name]) 630 submodules = self._modules[name] 631 if path: 632 submodules = self._prefix_submodules(submodules, name) 633 return submodules 634 635 @staticmethod 636 def _prefix_submodules(modules, prefix): 637 """Prefix submodules with path.""" 638 return {'{}+{}'.format(prefix, mod) for mod in modules} 639 640 @property 641 def prefix_map(self): 642 """ 643 Mapping of module to its prefix for all modules with a prefix. 644 645 This loads paraminfo for all modules. 646 """ 647 if not self._prefix_map: 648 self._prefix_map = {module: prefix 649 for module, prefix 650 in self.attributes('prefix').items() 651 if prefix} 652 return self._prefix_map.copy() 653 654 def attributes(self, attribute: str, modules: Optional[set] = None): 655 """ 656 Mapping of modules with an attribute to the attribute value. 657 658 It will include all modules which have that attribute set, also if that 659 attribute is empty or set to False. 660 661 :param attribute: attribute name 662 :param modules: modules to include. If None (default), it'll load all 663 modules including all submodules using the paths. 664 :rtype: dict using modules as keys 665 """ 666 if modules is None: 667 modules = self.module_paths 668 self.fetch(modules) 669 670 return {mod: self[mod][attribute] 671 for mod in modules if attribute in self[mod]} 672 673 674class OptionSet(MutableMapping): 675 676 """ 677 A class to store a set of options which can be either enabled or not. 678 679 If it is instantiated with the associated site, module and parameter it 680 will only allow valid names as options. If instantiated 'lazy loaded' it 681 won't checks if the names are valid until the site has been set (which 682 isn't required, but recommended). The site can only be set once if it's not 683 None and after setting it, any site (even None) will fail. 684 """ 685 686 def __init__(self, site=None, 687 module: Optional[str] = None, 688 param: Optional[str] = None, 689 dict: Optional[dict] = None): 690 """ 691 Initializer. 692 693 If a site is given, the module and param must be given too. 694 695 :param site: The associated site 696 :type site: pywikibot.site.APISite or None 697 :param module: The module name which is used by paraminfo. (Ignored 698 when site is None) 699 :param param: The parameter name inside the module. That parameter must 700 have a 'type' entry. (Ignored when site is None) 701 :param dict: The initializing dict which is used for 702 :py:obj:`from_dict` 703 """ 704 self._site_set = False 705 self._enabled = set() 706 self._disabled = set() 707 self._set_site(site, module, param) 708 if dict: 709 self.from_dict(dict) 710 711 def _set_site(self, site, module: str, param: str, 712 clear_invalid: bool = False): 713 """Set the site and valid names. 714 715 As soon as the site has been not None, any subsequent calls will fail, 716 unless there had been invalid names and a KeyError was thrown. 717 718 :param site: The associated site 719 :type site: pywikibot.site.APISite 720 :param module: The module name which is used by paraminfo. 721 :param param: The parameter name inside the module. That parameter must 722 have a 'type' entry. 723 :param clear_invalid: Instead of throwing a KeyError, invalid names are 724 silently removed from the options (disabled by default). 725 """ 726 if self._site_set: 727 raise TypeError('The site cannot be set multiple times.') 728 # If the entries written to this are valid, it will never be 729 # overwritten 730 self._valid_enable = set() 731 self._valid_disable = set() 732 if site is None: 733 return 734 for type_value in site._paraminfo.parameter(module, param)['type']: 735 if type_value[0] == '!': 736 self._valid_disable.add(type_value[1:]) 737 else: 738 self._valid_enable.add(type_value) 739 if clear_invalid: 740 self._enabled &= self._valid_enable 741 self._disabled &= self._valid_disable 742 else: 743 invalid_names = ((self._enabled - self._valid_enable) 744 | (self._disabled - self._valid_disable)) 745 if invalid_names: 746 raise KeyError('OptionSet already contains invalid name(s) ' 747 '"{}"'.format('", "'.join(invalid_names))) 748 self._site_set = True 749 750 def from_dict(self, dictionary): 751 """ 752 Load options from the dict. 753 754 The options are not cleared before. If changes have been made 755 previously, but only the dict values should be applied it needs to be 756 cleared first. 757 758 :param dictionary: 759 a dictionary containing for each entry either the value 760 False, True or None. The names must be valid depending on whether 761 they enable or disable the option. All names with the value None 762 can be in either of the list. 763 :type dictionary: dict (keys are strings, values are bool/None) 764 """ 765 enabled = set() 766 disabled = set() 767 removed = set() 768 for name, value in dictionary.items(): 769 if value is True: 770 enabled.add(name) 771 elif value is False: 772 disabled.add(name) 773 elif value is None: 774 removed.add(name) 775 else: 776 raise ValueError('Dict contains invalid value "{}"'.format( 777 value)) 778 invalid_names = ( 779 (enabled - self._valid_enable) | (disabled - self._valid_disable) 780 | (removed - self._valid_enable - self._valid_disable) 781 ) 782 if invalid_names and self._site_set: 783 raise ValueError('Dict contains invalid name(s) "{}"'.format( 784 '", "'.join(invalid_names))) 785 self._enabled = enabled | (self._enabled - disabled - removed) 786 self._disabled = disabled | (self._disabled - enabled - removed) 787 788 def clear(self): 789 """Clear all enabled and disabled options.""" 790 self._enabled.clear() 791 self._disabled.clear() 792 793 def __setitem__(self, name, value): 794 """Set option to enabled, disabled or neither.""" 795 if value is True: 796 if self._site_set and name not in self._valid_enable: 797 raise KeyError('Invalid name "{}"'.format(name)) 798 self._enabled.add(name) 799 self._disabled.discard(name) 800 elif value is False: 801 if self._site_set and name not in self._valid_disable: 802 raise KeyError('Invalid name "{}"'.format(name)) 803 self._disabled.add(name) 804 self._enabled.discard(name) 805 elif value is None: 806 if self._site_set and (name not in self._valid_enable 807 or name not in self._valid_disable): 808 raise KeyError('Invalid name "{}"'.format(name)) 809 self._enabled.discard(name) 810 self._disabled.discard(name) 811 else: 812 raise ValueError('Invalid value "{}"'.format(value)) 813 814 def __getitem__(self, name) -> Optional[bool]: 815 """ 816 Return whether the option is enabled. 817 818 :return: If the name has been set it returns whether it is enabled. 819 Otherwise it returns None. If the site has been set it raises a 820 KeyError if the name is invalid. Otherwise it might return a value 821 even though the name might be invalid. 822 """ 823 if name in self._enabled: 824 return True 825 if name in self._disabled: 826 return False 827 if (self._site_set or name in self._valid_enable 828 or name in self._valid_disable): 829 return None 830 raise KeyError('Invalid name "{}"'.format(name)) 831 832 def __delitem__(self, name): 833 """Remove the item by setting it to None.""" 834 self[name] = None 835 836 def __contains__(self, name): 837 """Return True if option has been set.""" 838 return name in self._enabled or name in self._disabled 839 840 def __iter__(self): 841 """Iterate over each enabled and disabled option.""" 842 yield from self._enabled 843 yield from self._disabled 844 845 def api_iter(self): 846 """Iterate over each option as they appear in the URL.""" 847 yield from self._enabled 848 for disabled in self._disabled: 849 yield '!{}'.format(disabled) 850 851 def __len__(self): 852 """Return the number of enabled and disabled options.""" 853 return len(self._enabled) + len(self._disabled) 854 855 856class Request(MutableMapping): 857 858 """A request to a Site's api.php interface. 859 860 Attributes of this object (except for the special parameters listed 861 below) get passed as commands to api.php, and can be get or set 862 using the dict interface. All attributes must be strings. Use an 863 empty string for parameters that don't require a value. For example, 864 Request(action="query", titles="Foo bar", prop="info", redirects="") 865 corresponds to the API request 866 "api.php?action=query&titles=Foo%20bar&prop=info&redirects" 867 868 This is the lowest-level interface to the API, and can be used for any 869 request that a particular site's API supports. See the API documentation 870 (https://www.mediawiki.org/wiki/API) and site-specific settings for 871 details on what parameters are accepted for each request type. 872 873 Uploading files is a special case: to upload, the parameter "mime" must 874 contain a dict, and the parameter "file" must be set equal to a valid 875 filename on the local computer, _not_ to the content of the file. 876 877 Returns a dict containing the JSON data returned by the wiki. Normally, 878 one of the dict keys will be equal to the value of the 'action' 879 parameter. Errors are caught and raise an APIError exception. 880 881 Example: 882 883 >>> r = Request(parameters={'action': 'query', 'meta': 'userinfo'}) 884 >>> # This is equivalent to 885 >>> # https://{path}/api.php?action=query&meta=userinfo&format=json 886 >>> # change a parameter 887 >>> r['meta'] = "userinfo|siteinfo" 888 >>> # add a new parameter 889 >>> r['siprop'] = "namespaces" 890 >>> # note that "uiprop" param gets added automatically 891 >>> str(r.action) 892 'query' 893 >>> sorted(str(key) for key in r._params.keys()) 894 ['action', 'meta', 'siprop'] 895 >>> [str(key) for key in r._params['action']] 896 ['query'] 897 >>> [str(key) for key in r._params['meta']] 898 ['userinfo', 'siteinfo'] 899 >>> [str(key) for key in r._params['siprop']] 900 ['namespaces'] 901 >>> data = r.submit() 902 >>> isinstance(data, dict) 903 True 904 >>> set(['query', 'batchcomplete', 'warnings']).issuperset(data.keys()) 905 True 906 >>> 'query' in data 907 True 908 >>> sorted(str(key) for key in data['query'].keys()) 909 ['namespaces', 'userinfo'] 910 911 """ 912 913 # To make sure the default value of 'parameters' can be identified. 914 _PARAM_DEFAULT = object() 915 916 def __init__(self, site=None, 917 mime: Optional[dict] = None, 918 throttle: bool = True, 919 max_retries: Optional[int] = None, 920 retry_wait: Optional[int] = None, 921 use_get: Optional[bool] = None, 922 parameters=_PARAM_DEFAULT, **kwargs): 923 """ 924 Create a new Request instance with the given parameters. 925 926 The parameters for the request can be defined via either the 927 'parameters' parameter or the keyword arguments. The keyword arguments 928 were the previous implementation but could cause problems when there 929 are arguments to the API named the same as normal arguments to this 930 class. So the second parameter 'parameters' was added which just 931 contains all parameters. When a Request instance is created it must use 932 either one of them and not both at the same time. To have backwards 933 compatibility it adds a parameter named 'parameters' to kwargs when 934 both parameters are set as that indicates an old call and 'parameters' 935 was originally supplied as a keyword parameter. 936 937 If undefined keyword arguments were given AND the 'parameters' 938 parameter was supplied as a positional parameter it still assumes 939 'parameters' were part of the keyword arguments. 940 941 If a class is using Request and is directly forwarding the parameters, 942 :py:obj:`Request.clean_kwargs` can be used to automatically 943 convert the old kwargs mode into the new parameter mode. This 944 normalizes the arguments so that when the API parameters are 945 modified the changes can always be applied to the 'parameters' 946 parameter. 947 948 :param site: The Site to which the request will be submitted. If not 949 supplied, uses the user's configured default Site. 950 :param mime: If not None, send in "multipart/form-data" format (default 951 None). Parameters which should only be transferred via mime 952 mode are defined via this parameter (even an empty dict means 953 mime shall be used). 954 :param max_retries: Maximum number of times to retry after 955 errors, defaults to config.max_retries. 956 :param retry_wait: Minimum time in seconds to wait after an 957 error, defaults to config.retry_wait seconds (doubles each retry 958 until config.retry_max seconds is reached). 959 :param use_get: Use HTTP GET request if possible. If False it 960 uses a POST request. If None, it'll try to determine via 961 action=paraminfo if the action requires a POST. 962 :param parameters: The parameters used for the request to the API. 963 :type parameters: dict 964 :param kwargs: The parameters used for the request to the API. 965 """ 966 if site is None: 967 self.site = pywikibot.Site() 968 warn('Request() invoked without a site; setting to {}' 969 .format(self.site), RuntimeWarning, 2) 970 else: 971 self.site = site 972 973 self.mime = mime 974 if isinstance(mime, bool): 975 raise TypeError('mime param in api.Request() must not be boolean') 976 977 self.throttle = throttle 978 self.use_get = use_get 979 if max_retries is None: 980 self.max_retries = pywikibot.config.max_retries 981 else: 982 self.max_retries = max_retries 983 self.current_retries = 0 984 if retry_wait is None: 985 self.retry_wait = pywikibot.config.retry_wait 986 else: 987 self.retry_wait = retry_wait 988 # The only problem with that system is that it won't detect when 989 # 'parameters' is actually the only parameter for the request as it 990 # then assumes it's using the new mode (and the parameters are actually 991 # in the parameter 'parameters' not that the parameter 'parameters' is 992 # actually a parameter for the request). But that is invalid anyway as 993 # it MUST have at least an action parameter for the request which would 994 # be in kwargs if it's using the old mode. 995 if kwargs: 996 if parameters is not self._PARAM_DEFAULT: 997 # 'parameters' AND kwargs is set. In that case think of 998 # 'parameters' being an old kwarg which is now filled in an 999 # actual parameter 1000 self._warn_both() 1001 kwargs['parameters'] = parameters 1002 # When parameters wasn't set it's likely that kwargs-mode was used 1003 self._warn_kwargs() 1004 parameters = kwargs 1005 elif parameters is self._PARAM_DEFAULT: 1006 parameters = {} 1007 self._params = {} 1008 if 'action' not in parameters: 1009 raise ValueError("'action' specification missing from Request.") 1010 self.action = parameters['action'] 1011 self.update(parameters) # also convert all parameter values to lists 1012 self._warning_handler = None 1013 # Actions that imply database updates on the server, used for various 1014 # things like throttling or skipping actions when we're in simulation 1015 # mode 1016 self.write = self.action in { 1017 'block', 'clearhasmsg', 'createaccount', 'delete', 'edit', 1018 'emailuser', 'filerevert', 'flowthank', 'imagerotate', 'import', 1019 'managetags', 'mergehistory', 'move', 'options', 'patrol', 1020 'protect', 'purge', 'resetpassword', 'revisiondelete', 'rollback', 1021 'setnotificationtimestamp', 'setpagelanguage', 'tag', 'thank', 1022 'unblock', 'undelete', 'upload', 'userrights', 'watch', 1023 'wbcreateclaim', 'wbcreateredirect', 'wbeditentity', 1024 'wblinktitles', 'wbmergeitems', 'wbremoveclaims', 1025 'wbremovequalifiers', 'wbremovereferences', 'wbsetaliases', 1026 'wbsetclaim', 'wbsetclaimvalue', 'wbsetdescription', 'wbsetlabel', 1027 'wbsetqualifier', 'wbsetreference', 'wbsetsitelink', 1028 } 1029 # Client side verification that the request is being performed 1030 # by a logged in user, and warn if it isn't a config username. 1031 if self.write: 1032 try: 1033 username = self.site.userinfo['name'] 1034 except KeyError: 1035 raise Error('API write action attempted without user name') 1036 1037 if 'anon' in self.site.userinfo: 1038 raise Error("API write action attempted as IP '{}'" 1039 .format(username)) 1040 1041 if not self.site.user() or self.site.username() != username: 1042 pywikibot.warning( 1043 'API write action by unexpected username {} commenced.\n' 1044 'userinfo: {!r}'.format(username, self.site.userinfo)) 1045 1046 # Make sure user is logged in 1047 if self.write: 1048 pywikibot.debug('Adding user assertion', _logger) 1049 self['assert'] = 'user' 1050 1051 @classmethod 1052 def create_simple(cls, req_site, **kwargs): 1053 """Create a new instance using all args except site for the API.""" 1054 # This ONLY support site so that any caller can be sure there will be 1055 # no conflict with PWB parameters 1056 # req_site is needed to avoid conflicts with possible site keyword in 1057 # kwarg until positional-only parameters are supported, see T262926 1058 # TODO: Use ParamInfo request to determine valid parameters 1059 if isinstance(kwargs.get('parameters'), dict): 1060 warn('The request contains already a "parameters" entry which is ' 1061 'a dict.') 1062 return cls(site=req_site, parameters=kwargs) 1063 1064 @classmethod 1065 def _warn_both(cls): 1066 """Warn that kwargs mode was used but parameters was set too.""" 1067 warn('Both kwargs and parameters are set in Request.__init__. It ' 1068 'assumes that "parameters" is actually a parameter of the ' 1069 'Request and is added to kwargs.', DeprecationWarning, 3) 1070 1071 @classmethod 1072 def _warn_kwargs(cls): 1073 """Warn that kwargs was used instead of parameters.""" 1074 warn('Instead of using kwargs from Request.__init__, parameters ' 1075 'for the request to the API should be added via the ' 1076 '"parameters" parameter.', DeprecationWarning, 3) 1077 1078 @classmethod 1079 def clean_kwargs(cls, kwargs: dict) -> dict: 1080 """ 1081 Convert keyword arguments into new parameters mode. 1082 1083 If there are no other arguments in kwargs apart from the used arguments 1084 by the class' initializer it'll just return kwargs and otherwise remove 1085 those which aren't in the initializer and put them in a dict which is 1086 added as a 'parameters' keyword. It will always create a shallow copy. 1087 1088 :param kwargs: The original keyword arguments which is not modified. 1089 :return: The normalized keyword arguments. 1090 """ 1091 if 'expiry' in kwargs and kwargs['expiry'] is None: 1092 del kwargs['expiry'] 1093 1094 args = set() 1095 for super_cls in inspect.getmro(cls): 1096 if not super_cls.__name__.endswith('Request'): 1097 break 1098 args |= set(getfullargspec(super_cls.__init__).args) 1099 else: 1100 raise ValueError('Request was not a super class of ' 1101 '{0!r}'.format(cls)) 1102 args -= {'self'} 1103 old_kwargs = set(kwargs) 1104 # all kwargs defined above but not in args indicate 'kwargs' mode 1105 if old_kwargs - args: 1106 # Move all kwargs into parameters 1107 parameters = {name: value for name, value in kwargs.items() 1108 if name not in args or name == 'parameters'} 1109 if 'parameters' in parameters: 1110 cls._warn_both() 1111 # Copy only arguments and not the parameters 1112 kwargs = {name: value for name, value in kwargs.items() 1113 if name in args or name == 'self'} 1114 kwargs['parameters'] = parameters 1115 # Make sure that all arguments have remained 1116 assert(old_kwargs | {'parameters'} 1117 == set(kwargs) | set(kwargs['parameters'])) 1118 assert(('parameters' in old_kwargs) 1119 is ('parameters' in kwargs['parameters'])) 1120 cls._warn_kwargs() 1121 else: 1122 kwargs = dict(kwargs) 1123 kwargs.setdefault('parameters', {}) 1124 return kwargs 1125 1126 def _format_value(self, value): 1127 """ 1128 Format the MediaWiki API request parameter. 1129 1130 Converts from Python datatypes to MediaWiki API parameter values. 1131 1132 Supports: 1133 * datetime.datetime (using strftime and ISO8601 format) 1134 * pywikibot.page.BasePage (using title (+namespace; -section)) 1135 1136 All other datatypes are converted to string. 1137 """ 1138 if isinstance(value, datetime.datetime): 1139 return value.strftime(pywikibot.Timestamp.ISO8601Format) 1140 if isinstance(value, pywikibot.page.BasePage): 1141 if value.site != self.site: 1142 raise RuntimeError( 1143 'value.site {!r} is different from Request.site {!r}' 1144 .format(value.site, self.site)) 1145 return value.title(with_section=False) 1146 return str(value) 1147 1148 def __getitem__(self, key): 1149 """Implement dict interface.""" 1150 return self._params[key] 1151 1152 def __setitem__(self, key: str, value): 1153 """Set MediaWiki API request parameter. 1154 1155 :param value: param value(s) 1156 :type value: str in site encoding 1157 (string types may be a `|`-separated list) 1158 iterable, where items are converted to string 1159 with special handling for datetime.datetime to convert it to a 1160 string using the ISO 8601 format accepted by the MediaWiki API. 1161 """ 1162 if isinstance(value, bytes): 1163 value = value.decode(self.site.encoding()) 1164 1165 if isinstance(value, str): 1166 value = value.split('|') 1167 1168 if hasattr(value, 'api_iter'): 1169 self._params[key] = value 1170 else: 1171 try: 1172 iter(value) 1173 except TypeError: 1174 # convert any non-iterable value into a single-element list 1175 self._params[key] = [value] 1176 else: 1177 self._params[key] = list(value) 1178 1179 def __delitem__(self, key): 1180 """Implement dict interface.""" 1181 del self._params[key] 1182 1183 def keys(self): 1184 """Implement dict interface.""" 1185 return list(self._params.keys()) 1186 1187 def __contains__(self, key): 1188 """Implement dict interface.""" 1189 return key in self._params 1190 1191 def __iter__(self): 1192 """Implement dict interface.""" 1193 return iter(self._params) 1194 1195 def __len__(self): 1196 """Implement dict interface.""" 1197 return len(self._params) 1198 1199 def iteritems(self): 1200 """Implement dict interface.""" 1201 return iter(self._params.items()) 1202 1203 def items(self): 1204 """Return a list of tuples containing the parameters in any order.""" 1205 return list(self._params.items()) 1206 1207 def _add_defaults(self): 1208 """ 1209 Add default parameters to the API request. 1210 1211 This method will only add them once. 1212 """ 1213 if hasattr(self, '__defaulted'): 1214 return 1215 1216 if self.mime is not None \ 1217 and set(self._params.keys()) & set(self.mime.keys()): 1218 raise ValueError('The mime and params shall not share the ' 1219 'same keys.') 1220 1221 if self.action == 'query': 1222 meta = self._params.get('meta', []) 1223 # Special logic for private wikis (T153903). 1224 # If the wiki requires login privileges to read articles, pywikibot 1225 # will be blocked from accessing the userinfo. 1226 # Work around this by requiring userinfo only if 'tokens' and 1227 # 'login' are not both set. 1228 typep = self._params.get('type', []) 1229 if not ('tokens' in meta and 'login' in typep): 1230 if 'userinfo' not in meta: 1231 meta = set(meta + ['userinfo']) 1232 self['meta'] = sorted(meta) 1233 uiprop = self._params.get('uiprop', []) 1234 uiprop = set(uiprop + ['blockinfo', 'hasmsg']) 1235 self['uiprop'] = sorted(uiprop) 1236 if 'prop' in self._params: 1237 if self.site.has_extension('ProofreadPage'): 1238 prop = set(self['prop'] + ['proofread']) 1239 self['prop'] = sorted(prop) 1240 # When neither 'continue' nor 'rawcontinue' is present and the 1241 # version number is at least 1.25wmf5 we add a dummy rawcontinue 1242 # parameter. Querying siteinfo is save as it adds 'continue' 1243 # except for 'tokens' (T284577) 1244 if ('tokens' not in meta and 'continue' not in self._params 1245 and self.site.mw_version >= '1.25wmf5'): 1246 self._params.setdefault('rawcontinue', ['']) 1247 elif self.action == 'help' and self.site.mw_version > '1.24': 1248 self['wrap'] = '' 1249 1250 if config.maxlag: 1251 self._params.setdefault('maxlag', [str(config.maxlag)]) 1252 self._params.setdefault('format', ['json']) 1253 if self['format'] != ['json']: 1254 raise TypeError( 1255 "Query format '{}' cannot be parsed.".format(self['format'])) 1256 1257 self.__defaulted = True 1258 1259 def _encoded_items(self): 1260 """ 1261 Build a dict of params with minimal encoding needed for the site. 1262 1263 This helper method only prepares params for serialisation or 1264 transmission, so it only encodes values which are not ASCII, 1265 requiring callers to consider how to handle ASCII vs other values, 1266 however the output is designed to enable __str__ and __repr__ to 1267 do the right thing in most circumstances. 1268 1269 Servers which use an encoding that is not a superset of ASCII 1270 are not supported. 1271 1272 :return: Parameters either in the site encoding, or ASCII strings 1273 :rtype: dict with values of either str or bytes 1274 """ 1275 params = {} 1276 for key, values in self._params.items(): 1277 try: 1278 iterator = values.api_iter() 1279 except AttributeError: 1280 if len(values) == 1: 1281 value = values[0] 1282 if value is True: 1283 values = [''] 1284 elif value is False or value is None: 1285 # False and None are not included in the http URI 1286 continue 1287 iterator = iter(values) 1288 value = '|'.join(self._format_value(value) for value in iterator) 1289 # If the value is encodable as ascii, do not encode it. 1290 # This means that any value which can be encoded as ascii 1291 # is presumed to be ascii, and servers using a site encoding 1292 # which is not a superset of ascii may be problematic. 1293 try: 1294 value.encode('ascii') 1295 except UnicodeError: 1296 try: 1297 value = value.encode(self.site.encoding()) 1298 except Exception: 1299 pywikibot.error( 1300 "_encoded_items: '{}' could not be encoded as '{}':" 1301 ' {!r}'.format(key, self.site.encoding(), value)) 1302 assert key.encode('ascii') 1303 assert isinstance(key, str) 1304 params[key] = value 1305 return params 1306 1307 def _http_param_string(self): 1308 """ 1309 Return the parameters as a HTTP URL query fragment. 1310 1311 URL encodes the parameters provided by _encoded_items() 1312 1313 :note: Not all parameters are sorted, therefore for two given 1314 CachedRequest objects with equal _params, the result of 1315 _http_param_string() is not necessarily equal. 1316 """ 1317 return encode_url(self._encoded_items()) 1318 1319 def __str__(self): 1320 """Return a string representation.""" 1321 return unquote(self.site.scriptpath() 1322 + '/api.php?' 1323 + self._http_param_string()) 1324 1325 def __repr__(self): 1326 """Return internal representation.""" 1327 return '{}.{}<{}->{!r}>'.format(self.__class__.__module__, 1328 self.__class__.__name__, 1329 self.site, str(self)) 1330 1331 def _simulate(self, action): 1332 """Simulate action.""" 1333 if action and config.simulate and ( 1334 self.write or action in config.actions_to_block): 1335 pywikibot.output(color_format( 1336 '{black;yellow}SIMULATION: {} action blocked.{default}', 1337 action)) 1338 # for more realistic simulation 1339 if config.simulate is not True: 1340 pywikibot.sleep(float(config.simulate)) 1341 return { 1342 action: {'result': 'Success', 'nochange': ''}, 1343 1344 # wikibase results 1345 'pageinfo': {'lastrevid': -1}, 1346 'entity': {'lastrevid': -1}, 1347 } 1348 return None 1349 1350 def _is_wikibase_error_retryable(self, error): 1351 # dict of error message and current action. 1352 # Value is True if action type is to be ignored 1353 err_msg = { 1354 'edit-already-exists': 'wbeditentity', 1355 'actionthrottledtext': True, # T192912, T268645 1356 } 1357 messages = error.get('messages') 1358 message = None 1359 # bug T68619; after Wikibase breaking change 1ca9cee we have a 1360 # list of messages 1361 if isinstance(messages, list): 1362 for item in messages: 1363 message = item['name'] 1364 action = err_msg.get(message) 1365 if action is True or action == self.action: 1366 return True 1367 1368 return False 1369 1370 if isinstance(messages, dict): 1371 try: # behaviour before gerrit 124323 breaking change 1372 message = messages['0']['name'] 1373 except KeyError: # unsure the new output is always a list 1374 message = messages['name'] 1375 action = err_msg.get(message) 1376 return action is True or action == self.action 1377 1378 @staticmethod 1379 def _generate_mime_part(key, content, keytype=None, headers=None): 1380 if not keytype: 1381 try: 1382 content.encode('ascii') 1383 keytype = ('text', 'plain') 1384 except (UnicodeError, AttributeError): 1385 keytype = ('application', 'octet-stream') 1386 submsg = MIMENonMultipart(*keytype) 1387 content_headers = {'name': key} 1388 if headers: 1389 content_headers.update(headers) 1390 submsg.add_header('Content-disposition', 'form-data', 1391 **content_headers) 1392 1393 if keytype != ('text', 'plain'): 1394 submsg['Content-Transfer-Encoding'] = 'binary' 1395 1396 submsg.set_payload(content) 1397 return submsg 1398 1399 def _use_get(self): 1400 """Verify whether 'get' is to be used.""" 1401 if (not config.enable_GET_without_SSL 1402 and self.site.protocol() != 'https' 1403 or self.site.is_oauth_token_available()): # T108182 workaround 1404 use_get = False 1405 elif self.use_get is None: 1406 if self.action == 'query': 1407 # for queries check the query module 1408 modules = set() 1409 for mod_type_name in ('list', 'prop', 'generator'): 1410 modules.update(self._params.get(mod_type_name, [])) 1411 else: 1412 modules = {self.action} 1413 if modules: 1414 self.site._paraminfo.fetch(modules) 1415 use_get = all('mustbeposted' not in self.site._paraminfo[mod] 1416 for mod in modules) 1417 else: 1418 # If modules is empty, just 'meta' was given, which doesn't 1419 # require POSTs, and is required for ParamInfo 1420 use_get = True 1421 else: 1422 use_get = self.use_get 1423 return use_get 1424 1425 @classmethod 1426 def _build_mime_request(cls, params: dict, 1427 mime_params: dict) -> Tuple[dict, bytes]: 1428 """ 1429 Construct a MIME multipart form post. 1430 1431 :param params: HTTP request params 1432 :param mime_params: HTTP request parts which must be sent in the body 1433 :type mime_params: dict of (content, keytype, headers) 1434 :return: HTTP request headers and body 1435 """ 1436 # construct a MIME message containing all API key/values 1437 container = MIMEMultipart(_subtype='form-data') 1438 for key, value in params.items(): 1439 submsg = cls._generate_mime_part(key, value) 1440 container.attach(submsg) 1441 for key, value in mime_params.items(): 1442 submsg = cls._generate_mime_part(key, *value) 1443 container.attach(submsg) 1444 1445 # strip the headers to get the HTTP message body 1446 body = container.as_bytes() 1447 marker = b'\n\n' # separates headers from body 1448 eoh = body.find(marker) 1449 body = body[eoh + len(marker):] 1450 # retrieve the headers from the MIME object 1451 headers = dict(container.items()) 1452 return headers, body 1453 1454 def _get_request_params(self, use_get, paramstring): 1455 """Get request parameters.""" 1456 uri = self.site.apipath() 1457 if self.mime is not None: 1458 (headers, body) = Request._build_mime_request( 1459 self._encoded_items(), self.mime) 1460 use_get = False # MIME requests require HTTP POST 1461 else: 1462 headers = {'Content-Type': 'application/x-www-form-urlencoded'} 1463 if (not self.site.maximum_GET_length() 1464 or self.site.maximum_GET_length() < len(paramstring)): 1465 use_get = False 1466 if use_get: 1467 uri = '{}?{}'.format(uri, paramstring) 1468 body = None 1469 else: 1470 body = paramstring 1471 1472 pywikibot.debug('API request to {} (uses get: {}):\n' 1473 'Headers: {!r}\nURI: {!r}\nBody: {!r}' 1474 .format(self.site, use_get, headers, uri, body), 1475 _logger) 1476 return use_get, uri, body, headers 1477 1478 def _http_request(self, use_get: bool, uri: str, data, headers, 1479 paramstring) -> tuple: 1480 """Get or post a http request with exception handling. 1481 1482 :return: a tuple containing requests.Response object from 1483 http.request and use_get value 1484 """ 1485 try: 1486 response = http.request(self.site, uri=uri, 1487 method='GET' if use_get else 'POST', 1488 data=data, headers=headers) 1489 except Server504Error: 1490 pywikibot.log('Caught HTTP 504 error; retrying') 1491 except Server414Error: 1492 if use_get: 1493 pywikibot.log('Caught HTTP 414 error; retrying') 1494 use_get = False 1495 else: 1496 pywikibot.warning('Caught HTTP 414 error, although not ' 1497 'using GET.') 1498 raise 1499 except FatalServerError: 1500 # This error is not going to be fixed by just waiting 1501 pywikibot.error(traceback.format_exc()) 1502 raise 1503 # TODO: what other exceptions can occur here? 1504 except Exception: 1505 # for any other error on the http request, wait and retry 1506 pywikibot.error(traceback.format_exc()) 1507 pywikibot.log('{}, {}'.format(uri, paramstring)) 1508 else: 1509 return response, use_get 1510 self.wait() 1511 return None, use_get 1512 1513 def _json_loads(self, response) -> Optional[dict]: 1514 """Return a dict from requests.Response. 1515 1516 :param response: a requests.Response object 1517 :type response: requests.Response 1518 :return: a data dict 1519 :raises pywikibot.exceptions.APIError: unknown action found 1520 :raises pywikibot.exceptions.APIError: unknown query result type 1521 """ 1522 try: 1523 result = response.json() 1524 except ValueError: 1525 # if the result isn't valid JSON, there may be a server 1526 # problem. Wait a few seconds and try again 1527 # Show 20 lines of bare text 1528 text = '\n'.join(removeHTMLParts(response.text).splitlines()[:20]) 1529 msg = """\ 1530Non-JSON response received from server {site} for url 1531{resp.url} 1532The server may be down. 1533Status code: {resp.status_code} 1534 1535The text message is: 1536{text} 1537""".format(site=self.site, resp=response, text=text) 1538 1539 # Do not retry for AutoFamily but raise a SiteDefinitionError 1540 # Note: family.AutoFamily is a function to create that class 1541 if self.site.family.__class__.__name__ == 'AutoFamily': 1542 pywikibot.debug(msg, _logger) 1543 raise SiteDefinitionError('Invalid AutoFamily({!r})' 1544 .format(self.site.family.domain)) 1545 1546 pywikibot.warning(msg) 1547 1548 # there might also be an overflow, so try a smaller limit 1549 for param in self._params: 1550 if param.endswith('limit'): 1551 # param values are stored a list of str 1552 value = self[param][0] 1553 if value.isdigit(): 1554 self[param] = [str(int(value) // 2)] 1555 pywikibot.output('Set {} = {}' 1556 .format(param, self[param])) 1557 else: 1558 return result or {} 1559 self.wait() 1560 return None 1561 1562 def _relogin(self, message=''): 1563 """Force re-login and inform user.""" 1564 pywikibot.error('{}{}Forcing re-login.'.format(message, 1565 ' ' if message else '')) 1566 self.site._relogin() 1567 1568 def _userinfo_query(self, result): 1569 """Handle userinfo query.""" 1570 if self.action == 'query' and 'userinfo' in result.get('query', ()): 1571 # if we get passed userinfo in the query result, we can confirm 1572 # that we are logged in as the correct user. If this is not the 1573 # case, force a re-login. 1574 username = result['query']['userinfo']['name'] 1575 if (self.site.user() is not None and self.site.user() != username 1576 and self.site._loginstatus != LoginStatus.IN_PROGRESS): 1577 message = ("Logged in as '{actual}' instead of '{expected}'." 1578 .format(actual=username, expected=self.site.user())) 1579 self._relogin(message) 1580 return True 1581 return False 1582 1583 def _handle_warnings(self, result): 1584 if 'warnings' in result: 1585 for mod, warning in result['warnings'].items(): 1586 if mod == 'info': 1587 continue 1588 if '*' in warning: 1589 text = warning['*'] 1590 elif 'html' in warning: 1591 # bug T51978 1592 text = warning['html']['*'] 1593 else: 1594 pywikibot.warning( 1595 'API warning ({}) of unknown format: {}'. 1596 format(mod, warning)) 1597 continue 1598 # multiple warnings are in text separated by a newline 1599 for single_warning in text.splitlines(): 1600 if (not callable(self._warning_handler) 1601 or not self._warning_handler(mod, single_warning)): 1602 pywikibot.warning('API warning ({}): {}' 1603 .format(mod, single_warning)) 1604 1605 def _logged_in(self, code): 1606 """Check whether user is logged in. 1607 1608 Older wikis returned an error instead of a warning when the request 1609 asked for too many values. If we get this error, assume we are not 1610 logged in (we can't check this because the userinfo data is not 1611 present) and force a re-login 1612 """ 1613 if code.endswith('limit'): 1614 message = 'Received API limit error.' 1615 1616 # If the user assertion failed, we're probably logged out as well. 1617 elif code == 'assertuserfailed': 1618 message = 'User assertion failed.' 1619 1620 # Lastly, the purge module requires a POST if used as anonymous user, 1621 # but we normally send a GET request. If the API tells us the request 1622 # has to be POSTed, we're probably logged out. 1623 elif code == 'mustbeposted' and self.action == 'purge': 1624 message = "Received unexpected 'mustbeposted' error." 1625 1626 else: 1627 return True 1628 1629 self._relogin(message) 1630 return False 1631 1632 def _internal_api_error(self, code, error, result): 1633 """Check for internal_api_error_ or readonly and retry. 1634 1635 :raises pywikibot.exceptions.APIMWError: internal_api_error or readonly 1636 """ 1637 iae = 'internal_api_error_' 1638 if not (code.startswith(iae) or code == 'readonly'): 1639 return False 1640 1641 # T154011 1642 class_name = code if code == 'readonly' else removeprefix(code, iae) 1643 1644 del error['code'] # is added via class_name 1645 e = pywikibot.exceptions.APIMWError(class_name, **error) 1646 1647 # If the error key is in this table, it is probably a temporary 1648 # problem, so we will retry the edit. 1649 # TODO: T154011: 'ReadOnlyError' seems replaced by 'readonly' 1650 retry = class_name in ['DBConnectionError', # T64974 1651 'DBQueryError', # T60158 1652 'ReadOnlyError', # T61227 1653 'readonly', # T154011 1654 ] 1655 1656 pywikibot.error('Detected MediaWiki API exception {}{}' 1657 .format(e, '; retrying' if retry else '; raising')) 1658 param_repr = str(self._params) 1659 pywikibot.log('MediaWiki exception {} details:\n' 1660 ' query=\n{}\n' 1661 ' response=\n{}' 1662 .format(class_name, 1663 pprint.pformat(param_repr), 1664 result)) 1665 if not retry: 1666 raise e 1667 1668 self.wait() 1669 return True 1670 1671 def _ratelimited(self): 1672 """Handle ratelimited warning.""" 1673 ratelimits = self.site.userinfo['ratelimits'] 1674 delay = None 1675 1676 ratelimit = ratelimits.get(self.action, {}) 1677 # find the lowest wait time for the given action 1678 for limit in ratelimit.values(): 1679 seconds = limit['seconds'] 1680 hits = limit['hits'] 1681 delay = min(delay or seconds, seconds / hits) 1682 1683 if not delay: 1684 pywikibot.warning( 1685 'No rate limit found for action {}'.format(self.action)) 1686 self.wait(delay) 1687 1688 def _bad_token(self, code) -> bool: 1689 """Check for bad token.""" 1690 if code != 'badtoken': # Other code not handled here 1691 return False 1692 1693 if self.site._loginstatus == LoginStatus.IN_PROGRESS: 1694 pywikibot.log('Login status: {}' 1695 .format(self.site._loginstatus.name)) 1696 return False 1697 1698 user_tokens = self.site.tokens._tokens[self.site.user()] 1699 # all token values mapped to their type 1700 tokens = {token: t_type for t_type, token in user_tokens.items()} 1701 # determine which tokens are bad 1702 invalid_param = {name: tokens[param[0]] 1703 for name, param in self._params.items() 1704 if len(param) == 1 and param[0] in tokens} 1705 # doesn't care about the cache so can directly load them 1706 if invalid_param: 1707 pywikibot.log( 1708 'Bad token error for {}. Tokens for "{}" used in request; ' 1709 'invalidated them.' 1710 .format(self.site.user(), 1711 '", "'.join(sorted(set(invalid_param.values()))))) 1712 # invalidate superior wiki cookies (T224712) 1713 _invalidate_superior_cookies(self.site.family) 1714 # request new token(s) instead of invalid 1715 self.site.tokens.load_tokens(set(invalid_param.values())) 1716 # fix parameters; lets hope that it doesn't mistake actual 1717 # parameters as tokens 1718 for name, t_type in invalid_param.items(): 1719 self[name] = self.site.tokens[t_type] 1720 return True 1721 1722 # otherwise couldn't find any … weird there is nothing what 1723 # can be done here because it doesn't know which parameters 1724 # to fix 1725 pywikibot.log( 1726 'Bad token error for {} but no parameter is using a ' 1727 'token. Current tokens: {}' 1728 .format(self.site.user(), 1729 ', '.join('{}: {}'.format(*e) 1730 for e in user_tokens.items()))) 1731 return False 1732 1733 def submit(self) -> dict: 1734 """ 1735 Submit a query and parse the response. 1736 1737 :return: a dict containing data retrieved from api.php 1738 """ 1739 self._add_defaults() 1740 use_get = self._use_get() 1741 retries = 0 1742 while True: 1743 paramstring = self._http_param_string() 1744 1745 simulate = self._simulate(self.action) 1746 if simulate: 1747 return simulate 1748 1749 if self.throttle: 1750 self.site.throttle(write=self.write) 1751 else: 1752 pywikibot.log( 1753 "Submitting unthrottled action '{}'.".format(self.action)) 1754 1755 use_get, uri, body, headers = self._get_request_params(use_get, 1756 paramstring) 1757 response, use_get = self._http_request(use_get, uri, body, headers, 1758 paramstring) 1759 if response is None: 1760 continue 1761 1762 result = self._json_loads(response) 1763 if result is None: 1764 continue 1765 1766 if self._userinfo_query(result): 1767 continue 1768 1769 self._handle_warnings(result) 1770 1771 if 'error' not in result: 1772 return result 1773 1774 error = result['error'].copy() 1775 for key in result: 1776 if key in ('error', 'warnings'): 1777 continue 1778 assert key not in error 1779 assert isinstance(result[key], str), \ 1780 'Unexpected {}: {!r}'.format(key, result[key]) 1781 error[key] = result[key] 1782 1783 if '*' in result['error']: 1784 # help text returned 1785 result['error']['help'] = result['error'].pop('*') 1786 code = result['error'].setdefault('code', 'Unknown') 1787 info = result['error'].setdefault('info', None) 1788 1789 if not self._logged_in(code): 1790 continue 1791 1792 if code == 'maxlag': 1793 retries += 1 1794 if retries > max(5, pywikibot.config.max_retries): 1795 break 1796 pywikibot.log('Pausing due to database lag: ' + info) 1797 1798 try: 1799 lag = result['error']['lag'] 1800 except KeyError: 1801 lag = lagpattern.search(info) 1802 lag = float(lag.group('lag')) if lag else 0.0 1803 1804 self.site.throttle.lag(lag * retries) 1805 continue 1806 1807 if code == 'help' and self.action == 'help': 1808 # The help module returns an error result with the complete 1809 # API information. As this data was requested, return the 1810 # data instead of raising an exception. 1811 return {'help': {'mime': 'text/plain', 1812 'help': result['error']['help']}} 1813 1814 pywikibot.warning('API error {}: {}'.format(code, info)) 1815 pywikibot.log(' headers=\n{}'.format(response.headers)) 1816 1817 if self._internal_api_error(code, error, result): 1818 continue 1819 1820 # Phab. tickets T48535, T64126, T68494, T68619 1821 if code == 'failed-save' \ 1822 and self._is_wikibase_error_retryable(result['error']): 1823 self.wait() 1824 continue 1825 1826 if code == 'ratelimited': 1827 self._ratelimited() 1828 continue 1829 1830 # If readapidenied is returned try to login 1831 if code == 'readapidenied' \ 1832 and self.site._loginstatus in (LoginStatus.NOT_ATTEMPTED, 1833 LoginStatus.NOT_LOGGED_IN): 1834 self.site.login() 1835 continue 1836 1837 if self._bad_token(code): 1838 continue 1839 1840 if 'mwoauth-invalid-authorization' in code: 1841 if 'Nonce already used' in info: 1842 pywikibot.error( 1843 'Retrying failed OAuth authentication for {}: {}' 1844 .format(self.site, info)) 1845 continue 1846 raise NoUsernameError('Failed OAuth authentication for {}: {}' 1847 .format(self.site, info)) 1848 if code == 'cirrussearch-too-busy-error': # T170647 1849 self.wait() 1850 continue 1851 1852 if code == 'urlshortener-blocked': # T244062 1853 # add additional informations to result['error'] 1854 result['error']['current site'] = self.site 1855 if self.site.user(): 1856 result['error']['current user'] = self.site.user() 1857 else: # not logged in; show the IP 1858 uinfo = self.site.userinfo 1859 result['error']['current user'] = uinfo['name'] 1860 1861 # raise error 1862 try: 1863 param_repr = str(self._params) 1864 pywikibot.log('API Error: query=\n{}' 1865 .format(pprint.pformat(param_repr))) 1866 pywikibot.log(' response=\n{}'.format(result)) 1867 1868 raise pywikibot.exceptions.APIError(**result['error']) 1869 except TypeError: 1870 raise RuntimeError(result) 1871 1872 msg = 'Maximum retries attempted due to maxlag without success.' 1873 if os.environ.get('PYWIKIBOT_TESTS_RUNNING', '0') == '1': 1874 import unittest 1875 raise unittest.SkipTest(msg) 1876 1877 raise MaxlagTimeoutError(msg) 1878 1879 def wait(self, delay=None): 1880 """Determine how long to wait after a failed request.""" 1881 self.current_retries += 1 1882 if self.current_retries > self.max_retries: 1883 raise TimeoutError('Maximum retries attempted without success.') 1884 1885 # double the next wait, but do not exceed config.retry_max seconds 1886 delay = delay or self.retry_wait 1887 delay *= 2 ** (self.current_retries - 1) 1888 delay = min(delay, config.retry_max) 1889 1890 pywikibot.warning('Waiting {:.1f} seconds before retrying.' 1891 .format(delay)) 1892 pywikibot.sleep(delay) 1893 1894 1895class CachedRequest(Request): 1896 1897 """Cached request.""" 1898 1899 def __init__(self, expiry, *args, **kwargs): 1900 """Initialize a CachedRequest object. 1901 1902 :param expiry: either a number of days or a datetime.timedelta object 1903 """ 1904 assert expiry is not None 1905 super().__init__(*args, **kwargs) 1906 if not isinstance(expiry, datetime.timedelta): 1907 expiry = datetime.timedelta(expiry) 1908 self.expiry = min(expiry, datetime.timedelta(config.API_config_expiry)) 1909 self._data = None 1910 self._cachetime = None 1911 1912 @classmethod 1913 def create_simple(cls, req_site, **kwargs): 1914 """Unsupported as it requires at least two parameters.""" 1915 raise NotImplementedError('CachedRequest cannot be created simply.') 1916 1917 @classmethod 1918 def _get_cache_dir(cls) -> str: 1919 """ 1920 Return the base directory path for cache entries. 1921 1922 The directory will be created if it does not already exist. 1923 1924 :return: base directory path for cache entries 1925 """ 1926 path = os.path.join(config.base_dir, 1927 'apicache-py{0:d}'.format(PYTHON_VERSION[0])) 1928 cls._make_dir(path) 1929 cls._get_cache_dir = classmethod(lambda c: path) # cache the result 1930 return path 1931 1932 @staticmethod 1933 def _make_dir(dir_name: str) -> str: 1934 """Create directory if it does not exist already. 1935 1936 The directory name (dir_name) is returned unmodified. 1937 1938 :param dir_name: directory path 1939 :return: directory name 1940 """ 1941 with suppress(OSError): # directory already exists 1942 os.makedirs(dir_name) 1943 return dir_name 1944 1945 def _uniquedescriptionstr(self) -> str: 1946 """Return unique description for the cache entry. 1947 1948 If this is modified, please also update 1949 scripts/maintenance/cache.py to support 1950 the new key and all previous keys. 1951 """ 1952 login_status = self.site._loginstatus 1953 1954 if login_status >= LoginStatus.AS_USER: 1955 # This uses the format of Page.__repr__, without performing 1956 # config.console_encoding as done by Page.__repr__. 1957 # The returned value can't be encoded to anything other than 1958 # ascii otherwise it creates an exception when _create_file_name() 1959 # tries to encode it as utf-8. 1960 user_key = 'User(User:{})'.format(self.site.userinfo['name']) 1961 else: 1962 user_key = repr(LoginStatus(LoginStatus.NOT_LOGGED_IN)) 1963 1964 request_key = repr(sorted(self._encoded_items().items())) 1965 return '{!r}{}{}'.format(self.site, user_key, request_key) 1966 1967 def _create_file_name(self): 1968 """ 1969 Return a unique ascii identifier for the cache entry. 1970 1971 :rtype: str (hexadecimal; i.e. characters 0-9 and a-f only) 1972 """ 1973 return hashlib.sha256( 1974 self._uniquedescriptionstr().encode('utf-8') 1975 ).hexdigest() 1976 1977 def _cachefile_path(self): 1978 return os.path.join(CachedRequest._get_cache_dir(), 1979 self._create_file_name()) 1980 1981 def _expired(self, dt): 1982 return dt + self.expiry < datetime.datetime.utcnow() 1983 1984 def _load_cache(self) -> bool: 1985 """Load cache entry for request, if available. 1986 1987 :return: Whether the request was loaded from the cache 1988 """ 1989 self._add_defaults() 1990 try: 1991 filename = self._cachefile_path() 1992 with open(filename, 'rb') as f: 1993 uniquedescr, self._data, self._cachetime = pickle.load(f) 1994 if uniquedescr != self._uniquedescriptionstr(): 1995 raise RuntimeError('Expected unique description for the cache ' 1996 'entry is different from file entry.') 1997 if self._expired(self._cachetime): 1998 self._data = None 1999 return False 2000 pywikibot.debug('{}: cache hit ({}) for API request: {}' 2001 .format(self.__class__.__name__, filename, 2002 uniquedescr), _logger) 2003 return True 2004 except IOError: 2005 # file not found 2006 return False 2007 except Exception as e: 2008 pywikibot.output('Could not load cache: {!r}'.format(e)) 2009 return False 2010 2011 def _write_cache(self, data): 2012 """Write data to self._cachefile_path().""" 2013 data = (self._uniquedescriptionstr(), data, datetime.datetime.utcnow()) 2014 with open(self._cachefile_path(), 'wb') as f: 2015 pickle.dump(data, f, protocol=config.pickle_protocol) 2016 2017 def submit(self): 2018 """Submit cached request.""" 2019 cached_available = self._load_cache() 2020 if not cached_available: 2021 self._data = super().submit() 2022 self._write_cache(self._data) 2023 else: 2024 self._handle_warnings(self._data) 2025 return self._data 2026 2027 2028class _RequestWrapper: 2029 2030 """A wrapper class to handle the usage of the ``parameters`` parameter.""" 2031 2032 def _clean_kwargs(self, kwargs, **mw_api_args): 2033 """Clean kwargs, define site and request class.""" 2034 if 'site' not in kwargs: 2035 warn('{} invoked without a site'.format(self.__class__.__name__), 2036 RuntimeWarning, 3) 2037 kwargs['site'] = pywikibot.Site() 2038 assert(not hasattr(self, 'site') or self.site == kwargs['site']) 2039 self.site = kwargs['site'] 2040 self.request_class = kwargs['site']._request_class(kwargs) 2041 kwargs = self.request_class.clean_kwargs(kwargs) 2042 kwargs['parameters'].update(mw_api_args) 2043 return kwargs 2044 2045 2046class APIGenerator(_RequestWrapper): 2047 2048 """ 2049 Iterator that handle API responses containing lists. 2050 2051 The iterator will iterate each item in the query response and use the 2052 continue request parameter to retrieve the next portion of items 2053 automatically. If the limit attribute is set, the iterator will stop 2054 after iterating that many values. 2055 """ 2056 2057 def __init__(self, action: str, continue_name: str = 'continue', 2058 limit_name: str = 'limit', data_name: str = 'data', **kwargs): 2059 """ 2060 Initialize an APIGenerator object. 2061 2062 kwargs are used to create a Request object; see that object's 2063 documentation for values. 2064 2065 :param action: API action name. 2066 :param continue_name: Name of the continue API parameter. 2067 :param limit_name: Name of the limit API parameter. 2068 :param data_name: Name of the data in API response. 2069 """ 2070 kwargs = self._clean_kwargs(kwargs, action=action) 2071 2072 self.continue_name = continue_name 2073 self.limit_name = limit_name 2074 self.data_name = data_name 2075 2076 if config.step > 0: 2077 self.query_increment = config.step 2078 else: 2079 self.query_increment = None 2080 self.limit = None 2081 self.starting_offset = kwargs['parameters'].pop(self.continue_name, 0) 2082 self.request = self.request_class(**kwargs) 2083 self.request[self.limit_name] = self.query_increment 2084 2085 def set_query_increment(self, value: int): 2086 """ 2087 Set the maximum number of items to be retrieved per API query. 2088 2089 If not called, the default is config.step. 2090 2091 :param value: The value of maximum number of items to be retrieved 2092 per API request to set. 2093 """ 2094 self.query_increment = int(value) 2095 self.request[self.limit_name] = self.query_increment 2096 pywikibot.debug('{}: Set query_increment to {}.' 2097 .format(self.__class__.__name__, 2098 self.query_increment), _logger) 2099 2100 def set_maximum_items(self, value: Union[int, str, None]): 2101 """ 2102 Set the maximum number of items to be retrieved from the wiki. 2103 2104 If not called, most queries will continue as long as there is 2105 more data to be retrieved from the API. 2106 2107 :param value: The value of maximum number of items to be retrieved 2108 in total to set. Ignores None value. 2109 """ 2110 if value is not None and int(value) > 0: 2111 self.limit = int(value) 2112 if self.query_increment and self.limit < self.query_increment: 2113 self.request[self.limit_name] = self.limit 2114 pywikibot.debug('{}: Set request item limit to {}' 2115 .format(self.__class__.__name__, self.limit), 2116 _logger) 2117 pywikibot.debug('{}: Set limit (maximum_items) to {}.' 2118 .format(self.__class__.__name__, self.limit), 2119 _logger) 2120 2121 def __iter__(self): 2122 """ 2123 Submit request and iterate the response. 2124 2125 Continues response as needed until limit (if defined) is reached. 2126 """ 2127 offset = self.starting_offset 2128 n = 0 2129 while True: 2130 self.request[self.continue_name] = offset 2131 pywikibot.debug('{}: Request: {}' 2132 .format(self.__class__.__name__, self.request), 2133 _logger) 2134 data = self.request.submit() 2135 2136 n_items = len(data[self.data_name]) 2137 pywikibot.debug('{}: Retrieved {} items' 2138 .format(self.__class__.__name__, n_items), 2139 _logger) 2140 if n_items > 0: 2141 for item in data[self.data_name]: 2142 yield item 2143 n += 1 2144 if self.limit is not None and n >= self.limit: 2145 pywikibot.debug('%s: Stopped iterating due to ' 2146 'exceeding item limit.' % 2147 self.__class__.__name__, _logger) 2148 return 2149 offset += n_items 2150 else: 2151 pywikibot.debug('{}: Stopped iterating due to empty list in ' 2152 'response.'.format(self.__class__.__name__), 2153 _logger) 2154 break 2155 2156 2157class QueryGenerator(_RequestWrapper): 2158 2159 """ 2160 Base class for iterators that handle responses to API action=query. 2161 2162 By default, the iterator will iterate each item in the query response, 2163 and use the (query-)continue element, if present, to continue iterating as 2164 long as the wiki returns additional values. However, if the iterator's 2165 limit attribute is set to a positive int, the iterator will stop after 2166 iterating that many values. If limit is negative, the limit parameter 2167 will not be passed to the API at all. 2168 2169 Most common query types are more efficiently handled by subclasses, but 2170 this class can be used directly for custom queries and miscellaneous 2171 types (such as "meta=...") that don't return the usual list of pages or 2172 links. See the API documentation for specific query options. 2173 2174 """ 2175 2176 # Should results be filtered during iteration according to set_namespace? 2177 # Used if the API module does not support multiple namespaces. 2178 # Override in subclasses by defining a function that returns True if 2179 # the result's namespace is in self._namespaces. 2180 _check_result_namespace = NotImplemented 2181 2182 # Set of allowed namespaces will be assigned to _namespaces during 2183 # set_namespace call. Only to be used by _check_result_namespace. 2184 _namespaces = None 2185 2186 def __init__(self, **kwargs): 2187 """ 2188 Initialize a QueryGenerator object. 2189 2190 kwargs are used to create a Request object; see that object's 2191 documentation for values. 'action'='query' is assumed. 2192 2193 """ 2194 if not hasattr(self, 'site'): 2195 kwargs = self._clean_kwargs(kwargs) # hasn't been called yet 2196 parameters = kwargs['parameters'] 2197 if 'action' in parameters and parameters['action'] != 'query': 2198 raise Error("{}: 'action' must be 'query', not {}" 2199 .format(self.__class__.__name__, kwargs['action'])) 2200 parameters['action'] = 'query' 2201 # make sure request type is valid, and get limit key if any 2202 for modtype in ('generator', 'list', 'prop', 'meta'): 2203 if modtype in parameters: 2204 self.modules = parameters[modtype].split('|') 2205 break 2206 else: 2207 raise Error('{}: No query module name found in arguments.' 2208 .format(self.__class__.__name__)) 2209 2210 parameters['indexpageids'] = True # always ask for list of pageids 2211 self.continue_name = 'continue' 2212 self.continue_update = self._continue 2213 # Explicitly enable the simplified continuation 2214 parameters['continue'] = True 2215 self.request = self.request_class(**kwargs) 2216 2217 self.site._paraminfo.fetch('query+' + mod for mod in self.modules) 2218 2219 limited_modules = {mod for mod in self.modules 2220 if self.site._paraminfo.parameter('query+' + mod, 2221 'limit')} 2222 2223 if not limited_modules: 2224 self.limited_module = None 2225 elif len(limited_modules) == 1: 2226 self.limited_module = limited_modules.pop() 2227 else: 2228 # Select the first limited module in the request. 2229 # Query will continue as needed until limit (if any) for this 2230 # module is reached. 2231 for module in self.modules: 2232 if module in limited_modules: 2233 self.limited_module = module 2234 limited_modules.remove(module) 2235 break 2236 pywikibot.log('{}: multiple requested query modules support limits' 2237 "; using the first such module '{}' of {!r}" 2238 .format(self.__class__.__name__, self.limited_module, 2239 self.modules)) 2240 2241 # Set limits for all remaining limited modules to max value. 2242 # Default values will only cause more requests and make the query 2243 # slower. 2244 for module in limited_modules: 2245 param = self.site._paraminfo.parameter('query+' + module, 2246 'limit') 2247 prefix = self.site._paraminfo['query+' + module]['prefix'] 2248 if self.site.logged_in() \ 2249 and self.site.has_right('apihighlimits'): 2250 self.request[prefix + 'limit'] = int(param['highmax']) 2251 else: 2252 self.request[prefix + 'limit'] = int(param['max']) 2253 2254 if config.step > 0: 2255 self.api_limit = config.step 2256 else: 2257 self.api_limit = None 2258 2259 if self.limited_module: 2260 self.prefix = self.site._paraminfo['query+' 2261 + self.limited_module]['prefix'] 2262 self._update_limit() 2263 2264 if self.api_limit is not None and 'generator' in parameters: 2265 self.prefix = 'g' + self.prefix 2266 2267 self.limit = None 2268 self.query_limit = self.api_limit 2269 if 'generator' in parameters: 2270 # name of the "query" subelement key to look for when iterating 2271 self.resultkey = 'pages' 2272 else: 2273 self.resultkey = self.modules[0] 2274 2275 # usually the (query-)continue key is the same as the querymodule, 2276 # but not always 2277 # API can return more than one query-continue key, if multiple 2278 # properties are requested by the query, e.g. 2279 # "query-continue":{ 2280 # "langlinks":{"llcontinue":"12188973|pt"}, 2281 # "templates":{"tlcontinue":"310820|828|Namespace_detect"}} 2282 # self.continuekey is a list 2283 self.continuekey = self.modules 2284 self._add_slots() 2285 2286 def _add_slots(self): 2287 """Add slots to params if the site supports multi-content revisions. 2288 2289 On MW 1.32+ the following query parameters require slots to be given 2290 when content or contentmodel is requested. 2291 2292 * prop=revisions 2293 * prop=deletedrevisions or 2294 * list=allrevisions 2295 * list=alldeletedrevisions 2296 2297 More info: 2298 https://lists.wikimedia.org/hyperkitty/list/mediawiki-api-announce@lists.wikimedia.org/message/AXO4G4OOMTG7CEUU5TGAWXBI2LD4G3BC/ 2299 """ 2300 if self.site.mw_version < '1.32': 2301 return 2302 request = self.request 2303 # If using any deprecated_params, do not add slots. Usage of 2304 # these parameters together with slots is forbidden and the user will 2305 # get an API warning anyway. 2306 props = request.get('prop') 2307 if props: 2308 if 'revisions' in props: 2309 deprecated_params = { 2310 'rvexpandtemplates', 'rvparse', 'rvdiffto', 'rvdifftotext', 2311 'rvdifftotextpst', 'rvcontentformat', 'parsetree'} 2312 if not set(request) & deprecated_params: 2313 request['rvslots'] = '*' 2314 if 'deletedrevisions' in props: 2315 deprecated_params = { 2316 'drvexpandtemplates', 'drvparse', 'drvdiffto', 2317 'drvdifftotext', 'drvdifftotextpst', 'drvcontentformat', 2318 'parsetree'} 2319 if not set(request) & deprecated_params: 2320 request['drvslots'] = '*' 2321 lists = request.get('list') 2322 if lists: 2323 if 'allrevisions' in lists: 2324 deprecated_params = { 2325 'arvexpandtemplates', 'arvparse', 'arvdiffto', 2326 'arvdifftotext', 'arvdifftotextpst', 'arvcontentformat', 2327 'parsetree'} 2328 if not set(request) & deprecated_params: 2329 request['arvslots'] = '*' 2330 if 'alldeletedrevisions' in lists: 2331 deprecated_params = { 2332 'adrexpandtemplates', 'adrparse', 'adrdiffto', 2333 'adrdifftotext', 'adrdifftotextpst', 'adrcontentformat', 2334 'parsetree'} 2335 if not set(request) & deprecated_params: 2336 request['adrslots'] = '*' 2337 2338 def set_query_increment(self, value): 2339 """Set the maximum number of items to be retrieved per API query. 2340 2341 If not called, the default is to ask for "max" items and let the 2342 API decide how many to send. 2343 """ 2344 limit = int(value) 2345 2346 # don't update if limit is greater than maximum allowed by API 2347 if self.api_limit is None: 2348 self.query_limit = limit 2349 else: 2350 self.query_limit = min(self.api_limit, limit) 2351 pywikibot.debug('{}: Set query_limit to {}.' 2352 .format(self.__class__.__name__, 2353 self.query_limit), _logger) 2354 2355 def set_maximum_items(self, value: Union[int, str, None]): 2356 """Set the maximum number of items to be retrieved from the wiki. 2357 2358 If not called, most queries will continue as long as there is 2359 more data to be retrieved from the API. 2360 2361 If set to -1 (or any negative value), the "limit" parameter will be 2362 omitted from the request. For some request types (such as 2363 prop=revisions), this is necessary to signal that only current 2364 revision is to be returned. 2365 2366 :param value: The value of maximum number of items to be retrieved 2367 in total to set. Ignores None value. 2368 """ 2369 if value is not None: 2370 self.limit = int(value) 2371 2372 def _update_limit(self): 2373 """Set query limit for self.module based on api response.""" 2374 param = self.site._paraminfo.parameter('query+' + self.limited_module, 2375 'limit') 2376 if self.site.logged_in() and self.site.has_right('apihighlimits'): 2377 limit = int(param['highmax']) 2378 else: 2379 limit = int(param['max']) 2380 if self.api_limit is None or limit < self.api_limit: 2381 self.api_limit = limit 2382 pywikibot.debug( 2383 '{}: Set query_limit to {}.'.format(self.__class__.__name__, 2384 self.api_limit), 2385 _logger) 2386 2387 def support_namespace(self) -> bool: 2388 """Check if namespace is a supported parameter on this query. 2389 2390 Note: this function will be removed when self.set_namespace() will 2391 throw TypeError() instead of just giving a warning. 2392 See T196619. 2393 2394 :return: True if yes, False otherwise 2395 """ 2396 assert self.limited_module # some modules do not have a prefix 2397 return bool( 2398 self.site._paraminfo.parameter('query+' + self.limited_module, 2399 'namespace')) 2400 2401 def set_namespace(self, namespaces): 2402 """Set a namespace filter on this query. 2403 2404 :param namespaces: namespace identifiers to limit query results 2405 :type namespaces: iterable of str or Namespace key, or a single 2406 instance of those types. May be a '|' separated list of 2407 namespace identifiers. An empty iterator clears any 2408 namespace restriction. 2409 :raises KeyError: a namespace identifier was not resolved 2410 2411 # TODO: T196619 2412 # @raises TypeError: module does not support a namespace parameter 2413 # or a namespace identifier has an inappropriate 2414 # type such as NoneType or bool, or more than one namespace 2415 # if the API module does not support multiple namespaces 2416 """ 2417 assert self.limited_module # some modules do not have a prefix 2418 param = self.site._paraminfo.parameter('query+' + self.limited_module, 2419 'namespace') 2420 if not param: 2421 pywikibot.warning('{} module does not support a namespace ' 2422 'parameter'.format(self.limited_module)) 2423 warn('set_namespace() will be modified to raise TypeError ' 2424 'when namespace parameter is not supported. ' 2425 'It will be a Breaking Change, please update your code ' 2426 'ASAP, due date July, 31st 2019.', FutureWarning, 2) 2427 2428 # TODO: T196619 2429 # raise TypeError('{} module does not support a namespace ' 2430 # 'parameter'.format(self.limited_module)) 2431 2432 return False 2433 2434 if isinstance(namespaces, str): 2435 namespaces = namespaces.split('|') 2436 2437 # Use Namespace id (int) here; Request will cast int to str 2438 namespaces = [ns.id for ns in 2439 self.site.namespaces.resolve(namespaces)] 2440 2441 if 'multi' not in param and len(namespaces) != 1: 2442 if self._check_result_namespace is NotImplemented: 2443 raise TypeError('{} module does not support multiple ' 2444 'namespaces'.format(self.limited_module)) 2445 self._namespaces = set(namespaces) 2446 namespaces = None 2447 2448 if namespaces: 2449 self.request[self.prefix + 'namespace'] = namespaces 2450 elif self.prefix + 'namespace' in self.request: 2451 del self.request[self.prefix + 'namespace'] 2452 2453 return None 2454 2455 def _query_continue(self): 2456 if all(key not in self.data[self.continue_name] 2457 for key in self.continuekey): 2458 pywikibot.log( 2459 "Missing '{}' key(s) in ['{}'] value." 2460 .format(self.continuekey, self.continue_name)) 2461 return True 2462 2463 for query_continue_pair in self.data['query-continue'].values(): 2464 self._add_continues(query_continue_pair) 2465 return False # a new request with query-continue is needed 2466 2467 def _continue(self): 2468 self._add_continues(self.data['continue']) 2469 return False # a new request with continue is needed 2470 2471 def _add_continues(self, continue_pair): 2472 for key, value in continue_pair.items(): 2473 # query-continue can return ints (continue too?) 2474 if isinstance(value, int): 2475 value = str(value) 2476 self.request[key] = value 2477 2478 def _handle_query_limit(self, prev_limit, new_limit, had_data): 2479 """Handle query limit.""" 2480 if self.query_limit is None: 2481 return prev_limit, new_limit 2482 2483 prev_limit = new_limit 2484 if self.limit is None: 2485 new_limit = self.query_limit 2486 elif self.limit > 0: 2487 if had_data: 2488 # self.resultkey in data in last request.submit() 2489 new_limit = min(self.query_limit, self.limit - self._count) 2490 else: 2491 # only "(query-)continue" returned. See Bug T74209. 2492 # increase new_limit to advance faster until new 2493 # useful data are found again. 2494 new_limit = min(new_limit * 2, self.query_limit) 2495 else: 2496 new_limit = None 2497 2498 if new_limit and 'rvprop' in self.request \ 2499 and 'content' in self.request['rvprop']: 2500 # queries that retrieve page content have lower limits 2501 # Note: although API allows up to 500 pages for content 2502 # queries, these sometimes result in server-side errors 2503 # so use 250 as a safer limit 2504 new_limit = min(new_limit, self.api_limit // 10, 250) 2505 2506 if new_limit is not None: 2507 self.request[self.prefix + 'limit'] = str(new_limit) 2508 2509 if prev_limit != new_limit: 2510 pywikibot.debug( 2511 '{name}: query_limit: {query}, api_limit: {api}, ' 2512 'limit: {limit}, new_limit: {new}, count: {count}\n' 2513 '{name}: {prefix}limit: {value}' 2514 .format(name=self.__class__.__name__, 2515 query=self.query_limit, 2516 api=self.api_limit, 2517 limit=self.limit, 2518 new=new_limit, 2519 count=self._count, 2520 prefix=self.prefix, 2521 value=self.request[self.prefix + 'limit']), 2522 _logger) 2523 return prev_limit, new_limit 2524 2525 def _get_resultdata(self): 2526 """Get resultdata and verify result.""" 2527 resultdata = keys = self.data['query'][self.resultkey] 2528 if isinstance(resultdata, dict): 2529 keys = list(resultdata.keys()) 2530 if 'results' in resultdata: 2531 resultdata = resultdata['results'] 2532 elif 'pageids' in self.data['query']: 2533 # this ensures that page data will be iterated 2534 # in the same order as received from server 2535 resultdata = [resultdata[k] 2536 for k in self.data['query']['pageids']] 2537 else: 2538 resultdata = [resultdata[k] 2539 for k in sorted(resultdata.keys())] 2540 pywikibot.debug('{name} received {keys}; limit={limit}' 2541 .format(name=self.__class__.__name__, 2542 keys=keys, limit=self.limit), 2543 _logger) 2544 return resultdata 2545 2546 def _extract_results(self, resultdata): 2547 """Extract results from resultdata.""" 2548 for item in resultdata: 2549 result = self.result(item) 2550 if self._namespaces: 2551 if not self._check_result_namespace(result): 2552 continue 2553 yield result 2554 if isinstance(item, dict) \ 2555 and set(self.continuekey) & set(item.keys()): 2556 # if we need to count elements contained in items in 2557 # self.data["query"]["pages"], we want to count 2558 # item[self.continuekey] (e.g. 'revisions') and not 2559 # self.resultkey (i.e. 'pages') 2560 for key in set(self.continuekey) & set(item.keys()): 2561 self._count += len(item[key]) 2562 # otherwise we proceed as usual 2563 else: 2564 self._count += 1 2565 # note: self.limit could be -1 2566 if self.limit and 0 < self.limit <= self._count: 2567 raise RuntimeError( 2568 'QueryGenerator._extract_results reached the limit') 2569 2570 def __iter__(self): 2571 """Submit request and iterate the response based on self.resultkey. 2572 2573 Continues response as needed until limit (if any) is reached. 2574 2575 """ 2576 previous_result_had_data = True 2577 prev_limit = new_limit = None 2578 2579 self._count = 0 2580 while True: 2581 prev_limit, new_limit = self._handle_query_limit( 2582 prev_limit, new_limit, previous_result_had_data) 2583 if not hasattr(self, 'data'): 2584 self.data = self.request.submit() 2585 if not self.data or not isinstance(self.data, dict): 2586 pywikibot.debug( 2587 '{}: stopped iteration because no dict retrieved from api.' 2588 .format(self.__class__.__name__), 2589 _logger) 2590 return 2591 if 'query' in self.data and self.resultkey in self.data['query']: 2592 resultdata = self._get_resultdata() 2593 if 'normalized' in self.data['query']: 2594 self.normalized = { 2595 item['to']: item['from'] 2596 for item in self.data['query']['normalized']} 2597 else: 2598 self.normalized = {} 2599 try: 2600 yield from self._extract_results(resultdata) 2601 except RuntimeError: 2602 return 2603 # self.resultkey in data in last request.submit() 2604 previous_result_had_data = True 2605 else: 2606 if 'query' not in self.data: 2607 pywikibot.log("%s: 'query' not found in api response." % 2608 self.__class__.__name__) 2609 pywikibot.log(str(self.data)) 2610 # if (query-)continue is present, self.resultkey might not have 2611 # been fetched yet 2612 if self.continue_name not in self.data: 2613 # No results. 2614 return 2615 # self.resultkey not in data in last request.submit() 2616 # only "(query-)continue" was retrieved. 2617 previous_result_had_data = False 2618 if self.modules[0] == 'random': 2619 # "random" module does not return "(query-)continue" 2620 # now we loop for a new random query 2621 del self.data # a new request is needed 2622 continue 2623 if self.continue_name not in self.data: 2624 return 2625 if self.continue_update(): 2626 return 2627 2628 del self.data # a new request with (query-)continue is needed 2629 2630 def result(self, data): 2631 """Process result data as needed for particular subclass.""" 2632 return data 2633 2634 2635class PageGenerator(QueryGenerator): 2636 2637 """Iterator for response to a request of type action=query&generator=foo. 2638 2639 This class can be used for any of the query types that are listed in the 2640 API documentation as being able to be used as a generator. Instances of 2641 this class iterate Page objects. 2642 2643 """ 2644 2645 def __init__(self, generator: str, g_content=False, **kwargs): 2646 """ 2647 Initializer. 2648 2649 Required and optional parameters are as for ``Request``, except that 2650 action=query is assumed and generator is required. 2651 2652 :param generator: the "generator=" type from api.php 2653 :param g_content: if True, retrieve the contents of the current 2654 version of each Page (default False) 2655 2656 """ 2657 # If possible, use self.request after __init__ instead of appendParams 2658 def append_params(params, key, value): 2659 if key in params: 2660 params[key] += '|' + value 2661 else: 2662 params[key] = value 2663 kwargs = self._clean_kwargs(kwargs) 2664 parameters = kwargs['parameters'] 2665 # get some basic information about every page generated 2666 append_params(parameters, 'prop', 'info|imageinfo|categoryinfo') 2667 if g_content: 2668 # retrieve the current revision 2669 append_params(parameters, 'prop', 'revisions') 2670 append_params(parameters, 'rvprop', 2671 'ids|timestamp|flags|comment|user|content') 2672 if not ('inprop' in parameters 2673 and 'protection' in parameters['inprop']): 2674 append_params(parameters, 'inprop', 'protection') 2675 append_params(parameters, 'iiprop', 2676 'timestamp|user|comment|url|size|sha1|metadata') 2677 append_params(parameters, 'iilimit', 'max') # T194233 2678 parameters['generator'] = generator 2679 super().__init__(**kwargs) 2680 self.resultkey = 'pages' # element to look for in result 2681 self.props = self.request['prop'] 2682 2683 def result(self, pagedata): 2684 """Convert page dict entry from api to Page object. 2685 2686 This can be overridden in subclasses to return a different type 2687 of object. 2688 2689 """ 2690 p = pywikibot.Page(self.site, pagedata['title'], pagedata['ns']) 2691 ns = pagedata['ns'] 2692 # Upcast to proper Page subclass. 2693 if ns == 2: 2694 p = pywikibot.User(p) 2695 elif ns == 6: 2696 p = pywikibot.FilePage(p) 2697 elif ns == 14: 2698 p = pywikibot.Category(p) 2699 update_page(p, pagedata, self.props) 2700 return p 2701 2702 2703class PropertyGenerator(QueryGenerator): 2704 2705 """Iterator for queries of type action=query&prop=foo. 2706 2707 See the API documentation for types of page properties that can be 2708 queried. 2709 2710 This iterator yields one or more dict object(s) corresponding 2711 to each "page" item(s) from the API response; the calling module has to 2712 decide what to do with the contents of the dict. There will be one 2713 dict for each page queried via a titles= or ids= parameter (which must 2714 be supplied when instantiating this class). 2715 2716 """ 2717 2718 def __init__(self, prop: str, **kwargs): 2719 """ 2720 Initializer. 2721 2722 Required and optional parameters are as for ``Request``, except that 2723 action=query is assumed and prop is required. 2724 2725 :param prop: the "prop=" type from api.php 2726 """ 2727 kwargs = self._clean_kwargs(kwargs, prop=prop) 2728 super().__init__(**kwargs) 2729 self._props = frozenset(prop.split('|')) 2730 self.resultkey = 'pages' 2731 2732 @property 2733 def props(self): 2734 """The requested property names.""" 2735 return self._props 2736 2737 def __iter__(self): 2738 """Yield results.""" 2739 self._previous_dicts = {} 2740 yield from super().__iter__() 2741 yield from self._previous_dicts.values() 2742 2743 def _extract_results(self, resultdata): 2744 """Yield completed page_data of consecutive API requests.""" 2745 yield from self._fully_retrieved_data_dicts(resultdata) 2746 for data_dict in super()._extract_results(resultdata): 2747 if 'title' in data_dict: 2748 d = self._previous_dicts.setdefault(data_dict['title'], 2749 data_dict) 2750 if d is not data_dict: 2751 self._update_old_result_dict(d, data_dict) 2752 else: 2753 pywikibot.warn('Skipping result without title: ' 2754 + str(data_dict)) 2755 2756 def _fully_retrieved_data_dicts(self, resultdata): 2757 """Yield items of self._previous_dicts that are not in resultdata.""" 2758 resultdata_titles = {d['title'] for d in resultdata if 'title' in d} 2759 for prev_title, prev_dict in self._previous_dicts.copy().items(): 2760 if prev_title not in resultdata_titles: 2761 yield prev_dict 2762 del self._previous_dicts[prev_title] 2763 2764 @staticmethod 2765 def _update_old_result_dict(old_dict, new_dict): 2766 """Update old result dict with new_dict.""" 2767 for k, v in new_dict.items(): 2768 if k not in old_dict: 2769 old_dict[k] = v 2770 continue 2771 if isinstance(v, list): 2772 old_dict[k].extend(v) 2773 continue 2774 assert isinstance(v, (str, int)), ( 2775 'continued API result had an unexpected type: {}'.format(v)) 2776 2777 2778class ListGenerator(QueryGenerator): 2779 2780 """Iterator for queries of type action=query&list=foo. 2781 2782 See the API documentation for types of lists that can be queried. Lists 2783 include both site-wide information (such as 'allpages') and page-specific 2784 information (such as 'backlinks'). 2785 2786 This iterator yields a dict object for each member of the list returned 2787 by the API, with the format of the dict depending on the particular list 2788 command used. For those lists that contain page information, it may be 2789 easier to use the PageGenerator class instead, as that will convert the 2790 returned information into a Page object. 2791 2792 """ 2793 2794 def __init__(self, listaction: str, **kwargs): 2795 """ 2796 Initializer. 2797 2798 Required and optional parameters are as for ``Request``, except that 2799 action=query is assumed and listaction is required. 2800 2801 :param listaction: the "list=" type from api.php 2802 """ 2803 kwargs = self._clean_kwargs(kwargs, list=listaction) 2804 super().__init__(**kwargs) 2805 2806 2807class LogEntryListGenerator(ListGenerator): 2808 2809 """ 2810 Iterator for queries of list 'logevents'. 2811 2812 Yields LogEntry objects instead of dicts. 2813 """ 2814 2815 def __init__(self, logtype=None, **kwargs): 2816 """Initializer.""" 2817 super().__init__('logevents', **kwargs) 2818 2819 from pywikibot import logentries 2820 self.entryFactory = logentries.LogEntryFactory(self.site, logtype) 2821 2822 def result(self, pagedata): 2823 """Instantiate LogEntry from data from api.""" 2824 return self.entryFactory.create(pagedata) 2825 2826 def _check_result_namespace(self, result): 2827 """Return True if result.ns() is in self._namespaces.""" 2828 return result.ns() in self._namespaces 2829 2830 2831class LoginManager(login.LoginManager): 2832 2833 """Supply login_to_site method to use API interface.""" 2834 2835 # API login parameters mapping 2836 mapping = { 2837 'user': ('lgname', 'username'), 2838 'password': ('lgpassword', 'password'), 2839 'ldap': ('lgdomain', 'domain'), 2840 'token': ('lgtoken', 'logintoken'), 2841 'result': ('result', 'status'), 2842 'success': ('Success', 'PASS'), 2843 'fail': ('Failed', 'FAIL'), 2844 'reason': ('reason', 'message') 2845 } 2846 2847 def keyword(self, key): 2848 """Get API keyword from mapping.""" 2849 return self.mapping[key][self.action != 'login'] 2850 2851 def _login_parameters(self, *, botpassword: bool = False 2852 ) -> Dict[str, str]: 2853 """Return login parameters.""" 2854 # Since MW 1.27 only for bot passwords. 2855 self.action = 'login' 2856 if not botpassword: 2857 # get token using meta=tokens if supported 2858 token = self.get_login_token() 2859 if token: 2860 # Standard login request since MW 1.27 2861 self.action = 'clientlogin' 2862 2863 # prepare default login parameters 2864 parameters = {'action': self.action, 2865 self.keyword('user'): self.login_name, 2866 self.keyword('password'): self.password} 2867 2868 if self.action == 'clientlogin': 2869 # clientlogin requires non-empty loginreturnurl 2870 parameters['loginreturnurl'] = 'https://example.com' 2871 parameters['rememberMe'] = '1' 2872 parameters['logintoken'] = token 2873 2874 if self.site.family.ldapDomain: 2875 parameters[self.keyword('ldap')] = self.site.family.ldapDomain 2876 2877 return parameters 2878 2879 def login_to_site(self) -> None: 2880 """Login to the site. 2881 2882 Note, this doesn't do anything with cookies. The http module 2883 takes care of all the cookie stuff. Throws exception on failure. 2884 """ 2885 self.below_mw_1_27 = False 2886 if hasattr(self, '_waituntil'): 2887 if datetime.datetime.now() < self._waituntil: 2888 diff = self._waituntil - datetime.datetime.now() 2889 pywikibot.warning( 2890 'Too many tries, waiting {} seconds before retrying.' 2891 .format(diff.seconds)) 2892 pywikibot.sleep(diff.seconds) 2893 2894 self.site._loginstatus = LoginStatus.IN_PROGRESS 2895 2896 # Bot passwords username contains @, 2897 # otherwise @ is not allowed in usernames. 2898 # @ in bot password is deprecated, 2899 # but we don't want to break bots using it. 2900 parameters = self._login_parameters( 2901 botpassword='@' in self.login_name or '@' in self.password) 2902 2903 # base login request 2904 login_request = self.site._request(use_get=False, 2905 parameters=parameters) 2906 while True: 2907 # try to login 2908 try: 2909 login_result = login_request.submit() 2910 except pywikibot.exceptions.APIError as e: 2911 login_result = {'error': e.__dict__} 2912 2913 # clientlogin response can be clientlogin or error 2914 if self.action in login_result: 2915 response = login_result[self.action] 2916 result_key = self.keyword('result') 2917 elif 'error' in login_result: 2918 response = login_result['error'] 2919 result_key = 'code' 2920 else: 2921 raise RuntimeError('Unexpected API login response key.') 2922 2923 status = response[result_key] 2924 fail_reason = response.get(self.keyword('reason'), '') 2925 if status == self.keyword('success'): 2926 return 2927 2928 if status in ('NeedToken', 'WrongToken', 'badtoken'): 2929 token = response.get('token') 2930 if token and self.below_mw_1_27: 2931 # fetched token using action=login 2932 login_request['lgtoken'] = token 2933 pywikibot.log('Received login token, proceed with login.') 2934 else: 2935 # if incorrect login token was used, 2936 # force relogin and generate fresh one 2937 pywikibot.error('Received incorrect login token. ' 2938 'Forcing re-login.') 2939 # invalidate superior wiki cookies (T224712) 2940 _invalidate_superior_cookies(self.site.family) 2941 login_request[ 2942 self.keyword('token')] = self.get_login_token() 2943 continue 2944 2945 # messagecode was introduced with 1.29.0-wmf.14 2946 # but older wikis are still supported 2947 login_throttled = response.get('messagecode') == 'login-throttled' 2948 2949 if (status == 'Throttled' or status == self.keyword('fail') 2950 and (login_throttled or 'wait' in fail_reason)): 2951 wait = response.get('wait') 2952 if wait: 2953 delta = datetime.timedelta(seconds=int(wait)) 2954 else: 2955 match = re.search(r'(\d+) (seconds|minutes)', fail_reason) 2956 if match: 2957 delta = datetime.timedelta( 2958 **{match.group(2): int(match.group(1))}) 2959 else: 2960 delta = datetime.timedelta() 2961 self._waituntil = datetime.datetime.now() + delta 2962 2963 break 2964 2965 if 'error' in login_result: 2966 raise pywikibot.exceptions.APIError(**response) 2967 2968 raise pywikibot.exceptions.APIError(code=status, info=fail_reason) 2969 2970 def get_login_token(self) -> Optional[str]: 2971 """Fetch login token for MediaWiki 1.27+. 2972 2973 :return: login token 2974 """ 2975 login_token_request = self.site._request( 2976 use_get=False, 2977 parameters={'action': 'query', 'meta': 'tokens', 'type': 'login'}, 2978 ) 2979 login_token_result = login_token_request.submit() 2980 # check if we have to use old implementation of mw < 1.27 2981 if 'query' in login_token_result: 2982 return login_token_result['query']['tokens'].get('logintoken') 2983 2984 self.below_mw_1_27 = True 2985 return None 2986 2987 2988def encode_url(query) -> str: 2989 """ 2990 Encode parameters to pass with a url. 2991 2992 Reorder parameters so that token parameters go last and call wraps 2993 :py:obj:`urlencode`. Return an HTTP URL query fragment which complies with 2994 https://www.mediawiki.org/wiki/API:Edit#Parameters 2995 (See the 'token' bullet.) 2996 2997 :param query: keys and values to be uncoded for passing with a url 2998 :type query: mapping object or a sequence of two-element tuples 2999 :return: encoded parameters with token parameters at the end 3000 """ 3001 if hasattr(query, 'items'): 3002 query = list(query.items()) 3003 3004 # parameters ending on 'token' should go last 3005 # wpEditToken should go very last 3006 query.sort(key=lambda x: x[0].lower().endswith('token') 3007 + (x[0] == 'wpEditToken')) 3008 return urlencode(query) 3009 3010 3011def _update_pageid(page, pagedict: dict): 3012 """Update pageid.""" 3013 if 'pageid' in pagedict: 3014 page._pageid = int(pagedict['pageid']) 3015 elif 'missing' in pagedict: 3016 page._pageid = 0 # Non-existent page 3017 else: 3018 # Something is wrong. 3019 if page.site.sametitle(page.title(), pagedict['title']): 3020 if 'invalid' in pagedict: 3021 raise InvalidTitleError('{}: {}' 3022 .format(page, 3023 pagedict['invalidreason'])) 3024 if int(pagedict['ns']) < 0: 3025 raise UnsupportedPageError(page) 3026 raise RuntimeError( 3027 "Page {} has neither 'pageid' nor 'missing' attribute" 3028 .format(pagedict['title'])) 3029 3030 3031def _update_contentmodel(page, pagedict: dict): 3032 """Update page content model.""" 3033 page._contentmodel = pagedict.get('contentmodel') # can be None 3034 3035 if (page._contentmodel 3036 and page._contentmodel == 'proofread-page' 3037 and 'proofread' in pagedict): 3038 page._quality = pagedict['proofread']['quality'] 3039 page._quality_text = pagedict['proofread']['quality_text'] 3040 3041 3042def _update_protection(page, pagedict: dict): 3043 """Update page protection.""" 3044 if 'restrictiontypes' in pagedict: 3045 page._applicable_protections = set(pagedict['restrictiontypes']) 3046 else: 3047 page._applicable_protections = None 3048 page._protection = {item['type']: (item['level'], item['expiry']) 3049 for item in pagedict['protection']} 3050 3051 3052def _update_revisions(page, revisions): 3053 """Update page revisions.""" 3054 for rev in revisions: 3055 page._revisions[rev['revid']] = pywikibot.page.Revision(**rev) 3056 3057 3058def _update_templates(page, templates): 3059 """Update page templates.""" 3060 templ_pages = [pywikibot.Page(page.site, tl['title']) for tl in templates] 3061 if hasattr(page, '_templates'): 3062 page._templates.extend(templ_pages) 3063 else: 3064 page._templates = templ_pages 3065 3066 3067def _update_langlinks(page, langlinks): 3068 """Update page langlinks.""" 3069 links = [pywikibot.Link.langlinkUnsafe(link['lang'], link['*'], 3070 source=page.site) 3071 for link in langlinks] 3072 3073 if hasattr(page, '_langlinks'): 3074 page._langlinks.extend(links) 3075 else: 3076 page._langlinks = links 3077 3078 3079def _update_coordinates(page, coordinates): 3080 """Update page coordinates.""" 3081 coords = [] 3082 for co in coordinates: 3083 coord = pywikibot.Coordinate(lat=co['lat'], 3084 lon=co['lon'], 3085 typ=co.get('type', ''), 3086 name=co.get('name', ''), 3087 dim=int(co.get('dim', 0)) or None, 3088 globe=co['globe'], # See [[gerrit:67886]] 3089 primary='primary' in co 3090 ) 3091 coords.append(coord) 3092 page._coords = coords 3093 3094 3095def update_page(page, pagedict: dict, props=None): 3096 """Update attributes of Page object page, based on query data in pagedict. 3097 3098 :param page: object to be updated 3099 :type page: pywikibot.page.Page 3100 :param pagedict: the contents of a "page" element of a query response 3101 :param props: the property names which resulted in pagedict. If a missing 3102 value in pagedict can indicate both 'false' and 'not present' the 3103 property which would make the value present must be in the props 3104 parameter. 3105 :type props: iterable of string 3106 :raises pywikibot.exceptions.InvalidTitleError: Page title is invalid 3107 :raises pywikibot.exceptions.UnsupportedPageError: Page with namespace < 0 3108 is not supported yet 3109 """ 3110 _update_pageid(page, pagedict) 3111 _update_contentmodel(page, pagedict) 3112 3113 props = props or [] 3114 if 'info' in props: 3115 page._isredir = 'redirect' in pagedict 3116 3117 if 'touched' in pagedict: 3118 page._timestamp = pagedict['touched'] 3119 3120 if 'protection' in pagedict: 3121 _update_protection(page, pagedict) 3122 3123 if 'revisions' in pagedict: 3124 _update_revisions(page, pagedict['revisions']) 3125 3126 if 'lastrevid' in pagedict: 3127 page.latest_revision_id = pagedict['lastrevid'] 3128 3129 if 'imageinfo' in pagedict: 3130 if not isinstance(page, pywikibot.FilePage): 3131 raise RuntimeError( 3132 '"imageinfo" found but {} is not a FilePage object' 3133 .format(page)) 3134 page._load_file_revisions(pagedict['imageinfo']) 3135 3136 if 'categoryinfo' in pagedict: 3137 page._catinfo = pagedict['categoryinfo'] 3138 3139 if 'templates' in pagedict: 3140 _update_templates(page, pagedict['templates']) 3141 elif 'templates' in props: 3142 page._templates = [] 3143 3144 if 'langlinks' in pagedict: 3145 _update_langlinks(page, pagedict['langlinks']) 3146 elif 'langlinks' in props: 3147 page._langlinks = [] 3148 3149 if 'coordinates' in pagedict: 3150 _update_coordinates(page, pagedict['coordinates']) 3151 3152 if 'pageimage' in pagedict: 3153 page._pageimage = pywikibot.FilePage(page.site, pagedict['pageimage']) 3154 3155 if 'pageprops' in pagedict: 3156 page._pageprops = pagedict['pageprops'] 3157 elif 'pageprops' in props: 3158 page._pageprops = {} 3159 3160 if 'preload' in pagedict: 3161 page._preloadedtext = pagedict['preload'] 3162 3163 if 'flowinfo' in pagedict: 3164 page._flowinfo = pagedict['flowinfo']['flow'] 3165 3166 if 'lintId' in pagedict: 3167 page._lintinfo = pagedict 3168 page._lintinfo.pop('pageid') 3169 page._lintinfo.pop('title') 3170 page._lintinfo.pop('ns') 3171 3172 3173wrapper = ModuleDeprecationWrapper(__name__) 3174wrapper.add_deprecated_attr( 3175 'APIError', replacement_name='pywikibot.exceptions.APIError', 3176 since='20210423') 3177wrapper.add_deprecated_attr( 3178 'UploadWarning', replacement_name='pywikibot.exceptions.UploadError', 3179 since='20210423') 3180wrapper.add_deprecated_attr( 3181 'APIMWException', replacement_name='pywikibot.exceptions.APIMWError', 3182 since='20210423') 3183