"""
This module offers a wide variety of page generators.

A page generator is an
object that is iterable (see https://legacy.python.org/dev/peps/pep-0255/ ) and
that yields page objects on which other scripts can then work.

Pagegenerators.py cannot be run as script. For testing purposes listpages.py
can be used instead, to print page titles to standard output.

These parameters are supported to specify which pages titles to print:

&params;
"""
#
# (C) Pywikibot team, 2008-2021
#
# Distributed under the terms of the MIT license.
#
import calendar
import codecs
import datetime
import io
import itertools
import json
import re
import sys
from collections import namedtuple
from collections.abc import Iterator
from datetime import timedelta
from functools import partial
from http import HTTPStatus
from itertools import zip_longest
from typing import Optional, Union
from urllib.parse import urlparse

from requests.exceptions import ReadTimeout

import pywikibot
from pywikibot import config, date, i18n, xmlreader
from pywikibot.backports import Iterable, List
from pywikibot.bot import ShowingListOption
from pywikibot.comms import http
from pywikibot.data import api
from pywikibot.exceptions import (
    NoPageError,
    ServerError,
    UnknownExtensionError,
)
from pywikibot.proofreadpage import ProofreadPage
from pywikibot.tools import (
    DequeGenerator,
    deprecated,
    deprecated_args,
    filter_unique,
    intersect_generators,
    itergroup,
    redirect_func,
)


_logger = 'pagegenerators'

# ported from version 1 for backwards-compatibility
# most of these functions just wrap a Site or Page method that returns
# a generator

parameterHelp = """\
GENERATOR OPTIONS
=================

-cat                Work on all pages which are in a specific category.
                    Argument can also be given as "-cat:categoryname" or
                    as "-cat:categoryname|fromtitle" (using # instead of |
                    is also allowed in this one and the following)

-catr               Like -cat, but also recursively includes pages in
                    subcategories, sub-subcategories etc. of the
                    given category.
                    Argument can also be given as "-catr:categoryname" or
                    as "-catr:categoryname|fromtitle".

-subcats            Work on all subcategories of a specific category.
                    Argument can also be given as "-subcats:categoryname" or
                    as "-subcats:categoryname|fromtitle".

-subcatsr           Like -subcats, but also includes sub-subcategories etc. of
                    the given category.
                    Argument can also be given as "-subcatsr:categoryname" or
                    as "-subcatsr:categoryname|fromtitle".

-uncat              Work on all pages which are not categorised.

-uncatcat           Work on all categories which are not categorised.

-uncatfiles         Work on all files which are not categorised.

-file               Read a list of pages to treat from the named text file.
                    Page titles in the file may be either enclosed with
                    [[brackets]], or be separated by new lines.
                    Argument can also be given as "-file:filename".

-filelinks          Work on all pages that use a certain image/media file.
                    Argument can also be given as "-filelinks:filename".

-search             Work on all pages that are found in a MediaWiki search
                    across all namespaces.

-logevents          Work on articles that were on a specified Special:Log.
                    The value may be a comma separated list of these values:

                        logevent,username,start,end

                    or for backward compatibility:

                        logevent,username,total

                    Note: 'start' is the most recent date and log events are
                    iterated from present to past. If 'start' is not provided,
                    it means 'now'; if 'end' is not provided, it means 'since
                    the beginning'.

                    To use the default value, use an empty string.
                    You have options for every type of logs given by the
                    log event parameter which could be one of the following:

                        spamblacklist, titleblacklist, gblblock, renameuser,
                        globalauth, gblrights, gblrename, abusefilter,
                        massmessage, thanks, usermerge, block, protect, rights,
                        delete, upload, move, import, patrol, merge, suppress,
                        tag, managetags, contentmodel, review, stable,
                        timedmediahandler, newusers

                    It uses the default number of pages 10.

                    Examples:

                    -logevents:move gives pages from move log (usually
                    redirects)
                    -logevents:delete,,20 gives 20 pages from deletion log
                    -logevents:protect,Usr gives pages from protect log by user
                    Usr
                    -logevents:patrol,Usr,20 gives 20 patrolled pages by Usr
                    -logevents:upload,,20121231,20100101 gives upload pages
                    in the 2010s, 2011s, and 2012s
                    -logevents:review,,20121231 gives review pages since the
                    beginning till the 31 Dec 2012
                    -logevents:review,Usr,20121231 gives review pages by user
                    Usr since the beginning till the 31 Dec 2012

                    In some cases it must be given as -logevents:"move,Usr,20"

-interwiki          Work on the given page and all equivalent pages in other
                    languages. This can, for example, be used to fight
                    multi-site spamming.
                    Attention: this will cause the bot to modify
                    pages on several wiki sites, this is not well tested,
                    so check your edits!

-links              Work on all pages that are linked from a certain page.
                    Argument can also be given as "-links:linkingpagetitle".

-liverecentchanges  Work on pages from the live recent changes feed. If used as
                    -liverecentchanges:x, work on x recent changes.

-imagesused         Work on all images that contained on a certain page.
                    Can also be given as "-imagesused:linkingpagetitle".

-newimages          Work on the most recent new images. If given as
                    -newimages:x, will work on x newest images.

-newpages           Work on the most recent new pages. If given as -newpages:x,
                    will work on x newest pages.

-recentchanges      Work on the pages with the most recent changes. If
                    given as -recentchanges:x, will work on the x most recently
                    changed pages. If given as -recentchanges:offset,duration
                    it will work on pages changed from 'offset' minutes with
                    'duration' minutes of timespan. rctags are supported too.
                    The rctag must be the very first parameter part.

                    Examples:

                    -recentchanges:20 gives the 20 most recently changed pages
                    -recentchanges:120,70 will give pages with 120 offset
                    minutes and 70 minutes of timespan
                    -recentchanges:visualeditor,10 gives the 10 most recently
                    changed pages marked with 'visualeditor'
                    -recentchanges:"mobile edit,60,35" will retrieve pages
                    marked with 'mobile edit' for the given offset and timespan

-unconnectedpages   Work on the most recent unconnected pages to the Wikibase
                    repository. Given as -unconnectedpages:x, will work on the
                    x most recent unconnected pages.

-ref                Work on all pages that link to a certain page.
                    Argument can also be given as "-ref:referredpagetitle".

-start              Specifies that the robot should go alphabetically through
                    all pages on the home wiki, starting at the named page.
                    Argument can also be given as "-start:pagetitle".

                    You can also include a namespace. For example,
                    "-start:Template:!" will make the bot work on all pages
                    in the template namespace.

                    default value is start:!

-prefixindex        Work on pages commencing with a common prefix.

-transcludes        Work on all pages that use a certain template.
                    Argument can also be given as "-transcludes:Title".

-unusedfiles        Work on all description pages of images/media files that
                    are not used anywhere.
                    Argument can be given as "-unusedfiles:n" where
                    n is the maximum number of articles to work on.

-lonelypages        Work on all articles that are not linked from any other
                    article.
                    Argument can be given as "-lonelypages:n" where
                    n is the maximum number of articles to work on.

-unwatched          Work on all articles that are not watched by anyone.
                    Argument can be given as "-unwatched:n" where
                    n is the maximum number of articles to work on.

-property:name      Work on all pages with a given property name from
                    Special:PagesWithProp.

-usercontribs       Work on all articles that were edited by a certain user.
                    (Example : -usercontribs:DumZiBoT)

-weblink            Work on all articles that contain an external link to
                    a given URL; may be given as "-weblink:url"

-withoutinterwiki   Work on all pages that don't have interlanguage links.
                    Argument can be given as "-withoutinterwiki:n" where
                    n is the total to fetch.

-mysqlquery         Takes a MySQL query string like
                    "SELECT page_namespace, page_title FROM page
                    WHERE page_namespace = 0" and treats
                    the resulting pages. See
                    https://www.mediawiki.org/wiki/Manual:Pywikibot/MySQL
                    for more details.

-sparql             Takes a SPARQL SELECT query string including ?item
                    and works on the resulting pages.

-sparqlendpoint     Specify SPARQL endpoint URL (optional).
                    (Example : -sparqlendpoint:http://myserver.com/sparql)

-searchitem         Takes a search string and works on Wikibase pages that
                    contain it.
                    Argument can be given as "-searchitem:text", where text
                    is the string to look for, or "-searchitem:lang:text",
                    where lang is the language to search items in.

-wantedpages        Work on pages that are linked, but do not exist;
                    may be given as "-wantedpages:n" where n is the maximum
                    number of articles to work on.

-wantedcategories   Work on categories that are used, but do not exist;
                    may be given as "-wantedcategories:n" where n is the
                    maximum number of categories to work on.

-wantedfiles        Work on files that are used, but do not exist;
                    may be given as "-wantedfiles:n" where n is the maximum
                    number of files to work on.

-wantedtemplates    Work on templates that are used, but do not exist;
                    may be given as "-wantedtemplates:n" where n is the
                    maximum number of templates to work on.

-random             Work on random pages returned by [[Special:Random]].
                    Can also be given as "-random:n" where n is the number
                    of pages to be returned.

-randomredirect     Work on random redirect pages returned by
                    [[Special:RandomRedirect]]. Can also be given as
                    "-randomredirect:n" where n is the number of pages to be
                    returned.

-google             Work on all pages that are found in a Google search.
                    You need a Google Web API license key. Note that Google
                    doesn't give out license keys anymore. See google_key in
                    config.py for instructions.
                    Argument can also be given as "-google:searchstring".

-page               Work on a single page. Argument can also be given as
                    "-page:pagetitle", and supplied multiple times for
                    multiple pages.

-pageid             Work on a single pageid. Argument can also be given as
                    "-pageid:pageid1,pageid2,." or
                    "-pageid:'pageid1|pageid2|..'"
                    and supplied multiple times for multiple pages.

-linter             Work on pages that contain lint errors. Extension Linter
                    must be available on the site.
                    -linter select all categories.
                    -linter:high, -linter:medium or -linter:low select all
                    categories for that prio.
                    Single categories can be selected with commas as in
                    -linter:cat1,cat2,cat3

                    Adding '/int' identifies Lint ID to start querying from:
                    e.g. -linter:high/10000

                    -linter:show just shows available categories.

-querypage:name     Work on pages provided by a QueryPage-based special page,
                    see https://www.mediawiki.org/wiki/API:Querypage.
                    (tip: use -limit:n to fetch only n pages).

                    -querypage shows special pages available.

-url                Read a list of pages to treat from the provided URL.
                    The URL must return text in the same format as expected for
                    the -file argument, e.g. page titles separated by newlines
                    or enclosed in brackets.


FILTER OPTIONS
==============

-catfilter          Filter the page generator to only yield pages in the
                    specified category. See -cat generator for argument format.

-grep               A regular expression that needs to match the article
                    otherwise the page won't be returned.
                    Multiple -grep:regexpr can be provided and the page will
                    be returned if content is matched by any of the regexpr
                    provided.
                    Case insensitive regular expressions will be used and
                    dot matches any character, including a newline.

-grepnot            Like -grep, but return the page only if the regular
                    expression does not match.

-intersect          Work on the intersection of all the provided generators.

-limit              When used with any other argument -limit:n specifies a set
                    of pages, work on no more than n pages in total.

-namespaces         Filter the page generator to only yield pages in the
-namespace          specified namespaces. Separate multiple namespace
-ns                 numbers or names with commas.

                    Examples:

                    -ns:0,2,4
                    -ns:Help,MediaWiki

                    You may use a preleading "not" to exclude the namespace.

                    Examples:

                    -ns:not:2,3
                    -ns:not:Help,File

                    If used with -newpages/-random/-randomredirect/linter
                    generators, -namespace/ns must be provided before
                    -newpages/-random/-randomredirect/linter.
                    If used with -recentchanges generator, efficiency is
                    improved if -namespace is provided before -recentchanges.

                    If used with -start generator, -namespace/ns shall contain
                    only one value.

-onlyif             A claim the page needs to contain, otherwise the item won't
                    be returned.
                    The format is property=value,qualifier=value. Multiple (or
                    none) qualifiers can be passed, separated by commas.

                    Examples:

                    P1=Q2 (property P1 must contain value Q2),
                    P3=Q4,P5=Q6,P6=Q7 (property P3 with value Q4 and
                    qualifiers: P5 with value Q6 and P6 with value Q7).
                    Value can be page ID, coordinate in format:
                    latitude,longitude[,precision] (all values are in decimal
                    degrees), year, or plain string.
                    The argument can be provided multiple times and the item
                    page will be returned only if all claims are present.
                    Argument can be also given as "-onlyif:expression".

-onlyifnot          A claim the page must not contain, otherwise the item won't
                    be returned.
                    For usage and examples, see -onlyif above.

-ql                 Filter pages based on page quality.
                    This is only applicable if contentmodel equals
                    'proofread-page', otherwise has no effects.
                    Valid values are in range 0-4.
                    Multiple values can be comma-separated.

-subpage            -subpage:n filters pages to only those that have depth n
                    i.e. a depth of 0 filters out all pages that are subpages,
                    and a depth of 1 filters out all pages that are subpages of
                    subpages.


-titleregex         A regular expression that needs to match the article title
                    otherwise the page won't be returned.
                    Multiple -titleregex:regexpr can be provided and the page
                    will be returned if title is matched by any of the regexpr
                    provided.
                    Case insensitive regular expressions will be used and
                    dot matches any character.

-titleregexnot      Like -titleregex, but return the page only if the regular
                    expression does not match.
"""

docuReplacements = {'&params;': parameterHelp}  # noqa: N816

# if a bot uses GeneratorFactory, the module should include the line
#     docuReplacements = {'&params;': pywikibot.pagegenerators.parameterHelp}
# and include the marker &params; in the module's docstring
#
# We manually include it so the parameters show up in the auto-generated
# module documentation:

__doc__ = __doc__.replace('&params;', parameterHelp)


# This is the function that will be used to de-duplicate page iterators.
_filter_unique_pages = partial(
    filter_unique, key=lambda page: '{}:{}:{}'.format(*page._cmpkey()))


def _output_if(predicate, msg):
    """Output msg only when predicate is truthy."""
    if predicate:
        pywikibot.output(msg)


class GeneratorFactory:

    """Process command line arguments and return appropriate page generator.

    This factory is responsible for processing command line arguments
    that are used by many scripts and that determine which pages to work on.

    :Note: GeneratorFactory must be instantiated after global arguments are
        parsed except if site parameter is given.
    """

    def __init__(self, site=None,
                 positional_arg_name: Optional[str] = None,
                 enabled_options: Optional[Iterable[str]] = None,
                 disabled_options: Optional[Iterable[str]] = None):
        """
        Initializer.

        :param site: Site for generator results
        :type site: :py:obj:`pywikibot.site.BaseSite`
        :param positional_arg_name: generator to use for positional args,
            which do not begin with a hyphen
        :param enabled_options: only enable options given by this Iterable.
            This is priorized over disabled_options
        :param disabled_options: disable these given options and let them
            be handled by scripts options handler
        """
        self.gens = []
        self._namespaces = []
        self.limit = None
        self.qualityfilter_list = []
        self.articlefilter_list = []
        self.articlenotfilter_list = []
        self.titlefilter_list = []
        self.titlenotfilter_list = []
        self.claimfilter_list = []
        self.catfilter_list = []
        self.intersect = False
        self.subpage_max_depth = None
        self._site = site
        self._positional_arg_name = positional_arg_name
        self._sparql = None
        self.nopreload = False
        self._validate_options(enabled_options, disabled_options)

    def _validate_options(self, enable, disable):
        """Validate option restrictions."""
        msg = '{!r} is not a valid pagegenerators option to be '
        enable = enable or []
        disable = disable or []
        self.enabled_options = set(enable)
        self.disabled_options = set(disable)
        for opt in enable:
            if not hasattr(self, '_handle_' + opt):
                pywikibot.warning((msg + 'enabled').format(opt))
                self.enabled_options.remove(opt)
        for opt in disable:
            if not hasattr(self, '_handle_' + opt):
                pywikibot.warning((msg + 'disabled').format(opt))
                self.disabled_options.remove(opt)
        if self.enabled_options and self.disabled_options:
            pywikibot.warning('Ignoring disabled option because enabled '
                              'options are set.')
            self.disabled_options = []

    @property
    def site(self):
        """
        Generator site.

        The generator site should not be accessed until after the global
        arguments have been handled, otherwise the default Site may be changed
        by global arguments, which will cause this cached value to be stale.

        :return: Site given to initializer, otherwise the default Site at the
            time this property is first accessed.
        :rtype: :py:obj:`pywikibot.site.BaseSite`
        """
        if not self._site:
            self._site = pywikibot.Site()
        return self._site

    @property
    def namespaces(self):
        """
        List of Namespace parameters.

        Converts int or string namespaces to Namespace objects and
        change the storage to immutable once it has been accessed.

        The resolving and validation of namespace command line arguments
        is performed in this method, as it depends on the site property
        which is lazy loaded to avoid being cached before the global
        arguments are handled.

        :return: namespaces selected using arguments
        :rtype: list of Namespace
        :raises KeyError: a namespace identifier was not resolved
        :raises TypeError: a namespace identifier has an inappropriate
            type such as NoneType or bool
        """
        if isinstance(self._namespaces, list):
            self._namespaces = frozenset(
                self.site.namespaces.resolve(self._namespaces))
        return self._namespaces

    def getCombinedGenerator(self, gen=None, preload=False):
        """Return the combination of all accumulated generators.

        Only call this after all arguments have been parsed.

        :param gen: Another generator to be combined with
        :type gen: iterator
        :param preload: preload pages using PreloadingGenerator
            unless self.nopreload is True
        :type preload: bool
        """
        if gen:
            self.gens.insert(0, gen)

        for i in range(len(self.gens)):
            if self.namespaces:
                if (isinstance(self.gens[i], api.QueryGenerator)
                        and self.gens[i].support_namespace()):
                    self.gens[i].set_namespace(self.namespaces)
                # QueryGenerator does not support namespace param.
                else:
                    self.gens[i] = NamespaceFilterPageGenerator(
                        self.gens[i], self.namespaces, self.site)

            if self.limit:
                try:
                    self.gens[i].set_maximum_items(self.limit)
                except AttributeError:
                    self.gens[i] = itertools.islice(self.gens[i], self.limit)

        if not self.gens:
            if any((self.titlefilter_list,
                    self.titlenotfilter_list,
                    self.articlefilter_list,
                    self.articlenotfilter_list,
                    self.claimfilter_list,
                    self.catfilter_list,
                    self.qualityfilter_list,
                    self.subpage_max_depth is not None)):
                pywikibot.warning('filter(s) specified but no generators.')
            return None

        if len(self.gens) == 1:
            dupfiltergen = self.gens[0]
            if hasattr(self, '_single_gen_filter_unique'):
                dupfiltergen = _filter_unique_pages(dupfiltergen)
            if self.intersect:
                pywikibot.warning(
                    '"-intersect" ignored as only one generator is specified.')
        elif self.intersect:
            # By definition no duplicates are possible.
            dupfiltergen = intersect_generators(*self.gens)
        else:
            dupfiltergen = _filter_unique_pages(itertools.chain(*self.gens))

        # Add on subpage filter generator
        if self.subpage_max_depth is not None:
            dupfiltergen = SubpageFilterGenerator(
                dupfiltergen, self.subpage_max_depth)

        if self.claimfilter_list:
            for claim in self.claimfilter_list:
                dupfiltergen = ItemClaimFilterPageGenerator(dupfiltergen,
                                                            claim[0], claim[1],
                                                            claim[2], claim[3])

        if self.qualityfilter_list:
            dupfiltergen = QualityFilterPageGenerator(
                dupfiltergen, self.qualityfilter_list)

        if self.titlefilter_list:
            dupfiltergen = RegexFilterPageGenerator(
                dupfiltergen, self.titlefilter_list)

        if self.titlenotfilter_list:
            dupfiltergen = RegexFilterPageGenerator(
                dupfiltergen, self.titlenotfilter_list, 'none')

        if self.catfilter_list:
            dupfiltergen = CategoryFilterPageGenerator(
                dupfiltergen, self.catfilter_list)

        if (preload or self.articlefilter_list) and not self.nopreload:
            if isinstance(dupfiltergen, DequeGenerator):
                dupfiltergen = DequePreloadingGenerator(dupfiltergen)
            else:
                dupfiltergen = PreloadingGenerator(dupfiltergen)

        if self.articlefilter_list:
            dupfiltergen = RegexBodyFilterPageGenerator(
                dupfiltergen, self.articlefilter_list)

        if self.articlenotfilter_list:
            dupfiltergen = RegexBodyFilterPageGenerator(
                dupfiltergen, self.articlenotfilter_list, 'none')

        return dupfiltergen

    @deprecated_args(arg='category')
    def getCategory(self, category: str) -> tuple:
        """
        Return Category and start as defined by category.

        :param category: category name with start parameter
        """
        if not category:
            category = i18n.input(
                'pywikibot-enter-category-name',
                fallback_prompt='Please enter the category name:')
        category = category.replace('#', '|')

        category, _, startfrom = category.partition('|')
        if not startfrom:
            startfrom = None

        # Insert "Category:" before category name to avoid parsing problems in
        # Link.parse() when categoryname contains ":";
        # Part before ":" might be interpreted as an interwiki prefix
        prefix = category.split(':', 1)[0]  # whole word if ":" not present
        if prefix not in self.site.namespaces[14]:
            category = '{}:{}'.format(
                self.site.namespace(14), category)
        cat = pywikibot.Category(pywikibot.Link(category,
                                                source=self.site,
                                                default_namespace=14))
        return cat, startfrom

    @deprecated_args(arg='category')
    def getCategoryGen(self, category: str, recurse: bool = False,
                       content: bool = False, gen_func=None):
        """
        Return generator based on Category defined by category and gen_func.

        :param category: category name with start parameter
        :rtype: generator
        """
        cat, startfrom = self.getCategory(category)

        return gen_func(cat,
                        start=startfrom,
                        recurse=recurse,
                        content=content)

    @staticmethod
    def _parse_log_events(logtype: str, user: Optional[str] = None,
                          start=None, end=None):
        """
        Parse the -logevent argument information.

        :param logtype: A valid logtype
        :param user: A username associated to the log events. Ignored if
            empty string or None.
        :param start: Timestamp to start listing from. For backward
            compatibility, this can also be the total amount of pages
            that should be returned. It is taken as 'total' if the value does
            not have 8 digits.
        :type start: str convertible to Timestamp matching '%Y%m%d%H%M%S'.
            If the length is not 8: for backward compatibility to use this as
            'total', it can also be a str (castable to int).
        :param end: Timestamp to end listing at
        :type end: str convertible to Timestamp matching '%Y%m%d%H%M%S'
        :return: The generator or None if invalid 'start/total' or 'end' value.
        :rtype: LogeventsPageGenerator
        """
        def parse_start(start):
            """Parse start and return (start, total)."""
            if start is None:
                return None, None

            if len(start) >= 8:
                return pywikibot.Timestamp.fromtimestampformat(start), None

            return None, int(start)

        start = start or None  # because start might be an empty string
        try:
            start, total = parse_start(start)
            assert total is None or total > 0
        except ValueError as err:
            pywikibot.error(
                '{}. Start parameter has wrong format!'.format(err))
            return None
        except AssertionError:
            pywikibot.error('Total number of log ({}) events must be a '
                            'positive int.'.format(start))
            return None

        try:
            end = pywikibot.Timestamp.fromtimestampformat(end)
        except ValueError as err:
            pywikibot.error(
                '{}. End parameter has wrong format!'.format(err))
            return None
        except TypeError:  # end is None
            pass

        if start or end:
            pywikibot.output('Fetching log events in range: {} - {}.'
                             .format(end or 'beginning of time',
                                     start or 'now'))

        # 'user or None', because user might be an empty string when
        # 'foo,,bar' was used.
        return LogeventsPageGenerator(logtype, user or None, total=total,
                                      start=start, end=end)

    def _handle_filelinks(self, value):
        """Handle `-filelinks` argument."""
        if not value:
            value = i18n.input(
                'pywikibot-enter-file-links-processing',
                fallback_prompt='Links to which file page should be '
                                'processed?')
        if not value.startswith(self.site.namespace(6) + ':'):
            value = 'Image:' + value
        file_page = pywikibot.FilePage(self.site, value)
        return file_page.usingPages()

    def _handle_linter(self, value):
        """Handle `-linter` argument."""
        if not self.site.has_extension('Linter'):
            raise UnknownExtensionError(
                '-linter needs a site with Linter extension.')
        cats = self.site.siteinfo.get('linter')  # Get linter categories.
        valid_cats = [c for _list in cats.values() for c in _list]

        value = value or ''
        cat, _, lint_from = value.partition('/')
        lint_from = lint_from or None

        def show_available_categories(cats):
            _i = ' ' * 4
            _2i = 2 * _i
            txt = 'Available categories of lint errors:\n'
            for prio, _list in cats.items():
                txt += '{indent}{prio}\n'.format(indent=_i, prio=prio)
                txt += ''.join(
                    '{indent}{cat}\n'.format(indent=_2i, cat=c) for c in _list)
            pywikibot.output(txt)

        if cat == 'show':  # Display categories of lint errors.
            show_available_categories(cats)
            sys.exit(0)

        if not cat:
            lint_cats = valid_cats
        elif cat in ['low', 'medium', 'high']:
            lint_cats = cats[cat]
        else:
            lint_cats = cat.split(',')
            assert set(lint_cats) <= set(valid_cats), \
                'Invalid category of lint errors: {}'.format(cat)

        return self.site.linter_pages(
            lint_categories='|'.join(lint_cats), namespaces=self.namespaces,
            lint_from=lint_from)

    def _handle_querypage(self, value):
        """Handle `-querypage` argument."""
        if value is None:  # Display special pages.
            pages = self.site._paraminfo.parameter('query+querypage',
                                                   'page')
            pages = sorted(pages['type'])
            limit = self.site._paraminfo.parameter('query+querypage',
                                                   'limit')

            max_w = max(len(p) for p in pages[::2]) + 4
            txt = 'Available special pages:\n'
            for a, b in zip_longest(pages[::2], pages[1::2], fillvalue=''):
                txt += '    {a:<{max_w}}{b}\n'.format(a=a, b=b, max_w=max_w)
            txt += ('\nMaximum number of pages to return is {max} '
                    '({highmax} for bots).\n'.format_map(limit))
            pywikibot.output(txt)
            sys.exit(0)

        return self.site.querypage(value)

    def _handle_url(self, value):
        """Handle `-url` argument."""
        if not value:
            value = pywikibot.input('Please enter the URL:')
        return TextIOPageGenerator(value, site=self.site)

    def _handle_unusedfiles(self, value):
        """Handle `-unusedfiles` argument."""
        return self.site.unusedfiles(total=_int_none(value))

    def _handle_lonelypages(self, value):
        """Handle `-lonelypages` argument."""
        return self.site.lonelypages(total=_int_none(value))

    def _handle_unwatched(self, value):
        """Handle `-unwatched` argument."""
        return self.site.unwatchedpage(total=_int_none(value))

    def _handle_wantedpages(self, value):
        """Handle `-wantedpages` argument."""
        return self.site.wantedpages(total=_int_none(value))

    def _handle_wantedfiles(self, value):
        """Handle `-wantedfiles` argument."""
        return self.site.wantedfiles(total=_int_none(value))

    def _handle_wantedtemplates(self, value):
        """Handle `-wantedtemplates` argument."""
        return self.site.wantedtemplates(total=_int_none(value))

    def _handle_wantedcategories(self, value):
        """Handle `-wantedcategories` argument."""
        return self.site.wantedcategories(total=_int_none(value))

    def _handle_property(self, value):
        """Handle `-property` argument."""
        if not value:
            question = 'Which property name to be used?'
            value = pywikibot.input(question + ' (List [?])')
            pnames = self.site.get_property_names()
            # also use the default by <enter> key
            if value in '?' or value not in pnames:
                prefix, value = pywikibot.input_choice(
                    question, ShowingListOption(pnames))
        return self.site.pages_with_property(value)

    def _handle_usercontribs(self, value):
        """Handle `-usercontribs` argument."""
        self._single_gen_filter_unique = True
        return UserContributionsGenerator(
            value, site=self.site, _filter_unique=None)

    def _handle_withoutinterwiki(self, value):
        """Handle `-withoutinterwiki` argument."""
        return self.site.withoutinterwiki(total=_int_none(value))

    def _handle_interwiki(self, value):
        """Handle `-interwiki` argument."""
        if not value:
            value = i18n.input(
                'pywikibot-enter-page-processing',
                fallback_prompt='Which page should be processed?')
        page = pywikibot.Page(pywikibot.Link(value, self.site))
        return InterwikiPageGenerator(page)

    def _handle_randomredirect(self, value):
        """Handle `-randomredirect` argument."""
        # partial workaround for bug T119940
        # to use -namespace/ns with -randomredirect, -ns must be given
        # before -randomredirect
        # otherwise default namespace is 0
        namespaces = self.namespaces or 0
        return self.site.randompages(total=_int_none(value),
                                     namespaces=namespaces, redirects=True)

    def _handle_random(self, value):
        """Handle `-random` argument."""
        # partial workaround for bug T119940
        # to use -namespace/ns with -random, -ns must be given
        # before -random
        # otherwise default namespace is 0
        namespaces = self.namespaces or 0
        return self.site.randompages(total=_int_none(value),
                                     namespaces=namespaces)

    def _handle_recentchanges(self, value):
        """Handle `-recentchanges` argument."""
        rcstart = None
        rcend = None
        rctag = None
        total = None
        params = value.split(',') if value else []
        if params and not params[0].isdigit():
            rctag = params.pop(0)
        if len(params) > 2:
            raise ValueError('More than two parameters passed.')
        if len(params) == 2:
            offset = float(params[0])
            duration = float(params[1])
            if offset < 0 or duration < 0:
                raise ValueError('Negative valued parameters passed.')
            ts_time = self.site.server_time()
            rcstart = ts_time - timedelta(minutes=offset)
            rcend = rcstart - timedelta(minutes=duration)
        elif len(params) == 1:
            total = int(params[0])
        self._single_gen_filter_unique = True
        return RecentChangesPageGenerator(
            namespaces=self.namespaces, total=total, start=rcstart, end=rcend,
            site=self.site, tag=rctag)

    def _handle_liverecentchanges(self, value):
        """Handle `-liverecentchanges` argument."""
        self.nopreload = True
        return LiveRCPageGenerator(site=self.site, total=_int_none(value))

    def _handle_file(self, value):
        """Handle `-file` argument."""
        if not value:
            value = pywikibot.input('Please enter the local file name:')
        return TextIOPageGenerator(value, site=self.site)

    def _handle_namespaces(self, value):
        """Handle `-namespaces` argument."""
        if isinstance(self._namespaces, frozenset):
            raise RuntimeError('-namespace/ns option must be provided before '
                               '-newpages/-random/-randomredirect/-linter')
        if not value:
            value = pywikibot.input('What namespace are you filtering on?')
        NOT_KEY = 'not:'
        if value.startswith(NOT_KEY):
            value = value[len(NOT_KEY):]
            resolve = self.site.namespaces.resolve
            not_ns = set(resolve(value.split(',')))
            if not self._namespaces:
                self._namespaces = list(
                    set(self.site.namespaces.values()) - not_ns)
            else:
                self._namespaces = list(
                    set(resolve(self._namespaces)) - not_ns)
        else:
            self._namespaces += value.split(',')
        return True

    _handle_ns = _handle_namespaces
    _handle_namespace = _handle_namespaces

    def _handle_limit(self, value):
        """Handle `-limit` argument."""
        if not value:
            value = pywikibot.input('What is the limit value?')
        self.limit = _int_none(value)
        return True

    def _handle_category(self, value):
        """Handle `-category` argument."""
        return self.getCategoryGen(
            value, recurse=False, gen_func=CategorizedPageGenerator)

    _handle_cat = _handle_category

    def _handle_catr(self, value):
        """Handle `-catr` argument."""
        return self.getCategoryGen(
            value, recurse=True, gen_func=CategorizedPageGenerator)

    def _handle_subcats(self, value):
        """Handle `-subcats` argument."""
        return self.getCategoryGen(
            value, recurse=False, gen_func=SubCategoriesPageGenerator)

    def _handle_subcatsr(self, value):
        """Handle `-subcatsr` argument."""
        return self.getCategoryGen(
            value, recurse=True, gen_func=SubCategoriesPageGenerator)

    def _handle_catfilter(self, value):
        """Handle `-catfilter` argument."""
        cat, _ = self.getCategory(value)
        self.catfilter_list.append(cat)
        return True

    def _handle_page(self, value):
        """Handle `-page` argument."""
        if not value:
            value = pywikibot.input('What page do you want to use?')
        return [pywikibot.Page(pywikibot.Link(value, self.site))]

    def _handle_pageid(self, value):
        """Handle `-pageid` argument."""
        if not value:
            value = pywikibot.input('What pageid do you want to use?')
        return self.site.load_pages_from_pageids(value)

    def _handle_uncatfiles(self, value):
        """Handle `-uncatfiles` argument."""
        return self.site.uncategorizedimages()

    def _handle_uncatcat(self, value):
        """Handle `-uncatcat` argument."""
        return self.site.uncategorizedcategories()

    def _handle_uncat(self, value):
        """Handle `-uncat` argument."""
        return self.site.uncategorizedpages()

    def _handle_ref(self, value):
        """Handle `-ref` argument."""
        if not value:
            value = pywikibot.input(
                'Links to which page should be processed?')
        page = pywikibot.Page(pywikibot.Link(value, self.site))
        return page.getReferences()

    def _handle_links(self, value):
        """Handle `-links` argument."""
        if not value:
            value = pywikibot.input(
                'Links from which page should be processed?')
        page = pywikibot.Page(pywikibot.Link(value, self.site))
        return page.linkedPages()

    def _handle_weblink(self, value):
        """Handle `-weblink` argument."""
        if not value:
            value = pywikibot.input(
                'Pages with which weblink should be processed?')
        return self.site.exturlusage(value)

    def _handle_transcludes(self, value):
        """Handle `-transcludes` argument."""
        if not value:
            value = pywikibot.input(
                'Pages that transclude which page should be processed?')
        page = pywikibot.Page(pywikibot.Link(value,
                                             default_namespace=10,
                                             source=self.site))
        return page.getReferences(only_template_inclusion=True)

    def _handle_start(self, value):
        """Handle `-start` argument."""
        if not value:
            value = '!'
1065 firstpagelink = pywikibot.Link(value, self.site) 1066 return self.site.allpages( 1067 start=firstpagelink.title, namespace=firstpagelink.namespace, 1068 filterredir=False) 1069 1070 def _handle_prefixindex(self, value): 1071 """Handle `-prefixindex` argument.""" 1072 if not value: 1073 value = pywikibot.input('What page names are you looking for?') 1074 return PrefixingPageGenerator(prefix=value, site=self.site) 1075 1076 def _handle_newimages(self, value): 1077 """Handle `-newimages` argument.""" 1078 return NewimagesPageGenerator(total=_int_none(value), site=self.site) 1079 1080 def _handle_newpages(self, value): 1081 """Handle `-newpages` argument.""" 1082 # partial workaround for bug T69249 1083 # to use -namespace/ns with -newpages, -ns must be given 1084 # before -newpages 1085 # otherwise default namespace is 0 1086 namespaces = self.namespaces or 0 1087 return NewpagesPageGenerator( 1088 namespaces=namespaces, total=_int_none(value), site=self.site) 1089 1090 def _handle_unconnectedpages(self, value): 1091 """Handle `-unconnectedpages` argument.""" 1092 return self.site.unconnected_pages(total=_int_none(value)) 1093 1094 def _handle_imagesused(self, value): 1095 """Handle `-imagesused` argument.""" 1096 if not value: 1097 value = pywikibot.input( 1098 'Images on which page should be processed?') 1099 page = pywikibot.Page(pywikibot.Link(value, self.site)) 1100 return page.imagelinks() 1101 1102 def _handle_searchitem(self, value): 1103 """Handle `-searchitem` argument.""" 1104 if not value: 1105 value = pywikibot.input('Text to look for:') 1106 params = value.split(':') 1107 value = params[-1] 1108 lang = params[0] if len(params) == 2 else None 1109 return WikibaseSearchItemPageGenerator( 1110 value, language=lang, site=self.site) 1111 1112 def _handle_search(self, value): 1113 """Handle `-search` argument.""" 1114 if not value: 1115 value = pywikibot.input('What do you want to search for?') 1116 # In order to be useful, all namespaces are required 
1117 return self.site.search(value, namespaces=[]) 1118 1119 @staticmethod 1120 def _handle_google(value): 1121 """Handle `-google` argument.""" 1122 return GoogleSearchPageGenerator(value) 1123 1124 def _handle_titleregex(self, value): 1125 """Handle `-titleregex` argument.""" 1126 if not value: 1127 value = pywikibot.input( 1128 'What page names are you looking for?') 1129 self.titlefilter_list.append(value) 1130 return True 1131 1132 def _handle_titleregexnot(self, value): 1133 """Handle `-titleregexnot` argument.""" 1134 if not value: 1135 value = pywikibot.input( 1136 'All pages except which ones?') 1137 self.titlenotfilter_list.append(value) 1138 return True 1139 1140 def _handle_grep(self, value): 1141 """Handle `-grep` argument.""" 1142 if not value: 1143 value = pywikibot.input('Which pattern do you want to grep?') 1144 self.articlefilter_list.append(value) 1145 return True 1146 1147 def _handle_grepnot(self, value): 1148 """Handle `-grepnot` argument.""" 1149 if not value: 1150 value = pywikibot.input('Which pattern do you want to skip?') 1151 self.articlenotfilter_list.append(value) 1152 return True 1153 1154 def _handle_ql(self, value): 1155 """Handle `-ql` argument.""" 1156 if not self.site.has_extension('ProofreadPage'): 1157 raise UnknownExtensionError( 1158 'Ql filtering needs a site with ProofreadPage extension.') 1159 value = [int(_) for _ in value.split(',')] 1160 if min(value) < 0 or max(value) > 4: # Invalid input ql. 
1161 valid_ql = [ 1162 '{}: {}'.format(*i) 1163 for i in self.site.proofread_levels.items()] 1164 valid_ql = ', '.join(valid_ql) 1165 pywikibot.warning('Acceptable values for -ql are:\n {}' 1166 .format(valid_ql)) 1167 self.qualityfilter_list = value 1168 return True 1169 1170 def _handle_onlyif(self, value): 1171 """Handle `-onlyif` argument.""" 1172 return self._onlyif_onlyifnot_handler(value, False) 1173 1174 def _handle_onlyifnot(self, value): 1175 """Handle `-onlyifnot` argument.""" 1176 return self._onlyif_onlyifnot_handler(value, True) 1177 1178 def _onlyif_onlyifnot_handler(self, value, ifnot): 1179 """Handle `-onlyif` and `-onlyifnot` arguments.""" 1180 if not value: 1181 value = pywikibot.input('Which claim do you want to filter?') 1182 p = re.compile(r'(?<!\\),') # Match "," only if there no "\" before 1183 temp = [] # Array to store split argument 1184 for arg in p.split(value): 1185 temp.append(arg.replace(r'\,', ',').split('=')) 1186 self.claimfilter_list.append( 1187 (temp[0][0], temp[0][1], dict(temp[1:]), ifnot)) 1188 return True 1189 1190 def _handle_sparqlendpoint(self, value): 1191 """Handle `-sparqlendpoint` argument.""" 1192 if not value: 1193 value = pywikibot.input('SPARQL endpoint:') 1194 self._sparql = value 1195 1196 def _handle_sparql(self, value): 1197 """Handle `-sparql` argument.""" 1198 if not value: 1199 value = pywikibot.input('SPARQL query:') 1200 return WikidataSPARQLPageGenerator( 1201 value, site=self.site, endpoint=self._sparql) 1202 1203 def _handle_mysqlquery(self, value): 1204 """Handle `-mysqlquery` argument.""" 1205 if not value: 1206 value = pywikibot.input('Mysql query string:') 1207 return MySQLPageGenerator(value, site=self.site) 1208 1209 def _handle_intersect(self, value): 1210 """Handle `-intersect` argument.""" 1211 self.intersect = True 1212 return True 1213 1214 def _handle_subpage(self, value): 1215 """Handle `-subpage` argument.""" 1216 if not value: 1217 value = pywikibot.input( 1218 'Maximum subpage depth:') 
1219 self.subpage_max_depth = int(value) 1220 return True 1221 1222 def _handle_logevents(self, value): 1223 """Handle `-logevents` argument.""" 1224 params = value.split(',') 1225 if params[0] not in self.site.logtypes: 1226 raise NotImplementedError( 1227 'Invalid -logevents parameter "{}"'.format(params[0])) 1228 return self._parse_log_events(*params) 1229 1230 def handle_args(self, args: Iterable[str]) -> List[str]: 1231 """Handle command line arguments and return the rest as a list. 1232 1233 *New in version 6.0.* 1234 """ 1235 return [arg for arg in args if not self.handle_arg(arg)] 1236 1237 def handle_arg(self, arg: str) -> bool: 1238 """Parse one argument at a time. 1239 1240 If it is recognized as an argument that specifies a generator, a 1241 generator is created and added to the accumulation list, and the 1242 function returns true. Otherwise, it returns false, so that caller 1243 can try parsing the argument. Call getCombinedGenerator() after all 1244 arguments have been parsed to get the final output generator. 
def _int_none(v):
    """Return None if v is None or '' else return int(v)."""
    return v if (v is None or v == '') else int(v)


@deprecated('Site.allpages()', since='20180512')
@deprecated_args(step=True)
def AllpagesPageGenerator(start: str = '!', namespace=0,
                          includeredirects=True, site=None,
                          total: Optional[int] = None, content: bool = False
                          ):  # pragma: no cover
    """
    Iterate Page objects for all titles in a single namespace.

    If includeredirects is False, redirects are not included. If
    includeredirects equals the string 'only', only redirects are added.

    :param total: Maximum number of pages to retrieve in total
    :param content: If True, load current version of each page (default False)
    :param site: Site for generator results.
    :type site: :py:obj:`pywikibot.site.BaseSite`

    """
    if site is None:
        site = pywikibot.Site()
    # map the includeredirects tri-state onto the allpages filterredir flag
    if includeredirects:
        filterredir = True if includeredirects == 'only' else None
    else:
        filterredir = False
    return site.allpages(start=start, namespace=namespace,
                         filterredir=filterredir, total=total, content=content)


@deprecated_args(step=True)
def PrefixingPageGenerator(prefix: str, namespace=None,
                           includeredirects: Union[None, bool, str] = True,
                           site=None, total: int = None,
                           content: bool = False):
    """
    Prefixed Page generator.

    :param prefix: The prefix of the pages.
    :param namespace: Namespace to retrieve pages from
    :type namespace: Namespace or int
    :param includeredirects: If includeredirects is None, False or an empty
        string, redirects will not be found. If includeredirects equals the
        string 'only', only redirects will be found. Otherwise redirects will
        be included.
    :param site: Site for generator results.
    :type site: :py:obj:`pywikibot.site.BaseSite`
    :param total: Maximum number of pages to retrieve in total
    :param content: If True, load current version of each page (default False)
    :return: a generator that yields Page objects
    :rtype: generator
    """
    if site is None:
        site = pywikibot.Site()
    prefixlink = pywikibot.Link(prefix, site)
    if namespace is None:
        # derive the namespace from the prefix itself
        namespace = prefixlink.namespace
    title = prefixlink.title
    if includeredirects:
        filterredir = True if includeredirects == 'only' else None
    else:
        filterredir = False
    return site.allpages(prefix=title, namespace=namespace,
                         filterredir=filterredir, total=total, content=content)
@deprecated_args(number='total', mode='logtype', repeat=True)
def LogeventsPageGenerator(logtype: Optional[str] = None,
                           user: Optional[str] = None, site=None,
                           namespace: Optional[int] = None,
                           total: Optional[int] = None, start=None,
                           end=None, reverse: bool = False):
    """
    Generate Pages for specified modes of logevents.

    :param logtype: Mode of logs to retrieve
    :param user: User of logs retrieved
    :param site: Site for generator results
    :type site: :py:obj:`pywikibot.site.BaseSite`
    :param namespace: Namespace to retrieve logs from
    :param total: Maximum number of pages to retrieve in total
    :param start: Timestamp to start listing from
    :type start: pywikibot.Timestamp
    :param end: Timestamp to end listing at
    :type end: pywikibot.Timestamp
    :param reverse: if True, start with oldest changes (default: newest)
    """
    if site is None:
        site = pywikibot.Site()
    for entry in site.logevents(total=total, logtype=logtype, user=user,
                                namespace=namespace, start=start, end=end,
                                reverse=reverse):
        try:
            yield entry.page()
        except KeyError as e:
            # some log entries carry no usable page title
            pywikibot.warning('LogeventsPageGenerator: '
                              'failed to load page for {!r}; skipping'
                              .format(entry.data))
            pywikibot.exception(e)


@deprecated_args(number='total', step=True, namespace='namespaces',
                 repeat=True, get_redirect=True)
def NewpagesPageGenerator(site=None, namespaces=(0, ),
                          total: Optional[int] = None):
    """
    Iterate Page objects for all new titles in a single namespace.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    :type site: :py:obj:`pywikibot.site.BaseSite`
    """
    # API does not (yet) have a newpages function, so this tries to duplicate
    # it by filtering the recentchanges output
    # defaults to namespace 0 because that's how Special:Newpages defaults
    if site is None:
        site = pywikibot.Site()
    return (page for page, _ in site.newpages(namespaces=namespaces,
                                              total=total, returndict=True))


def RecentChangesPageGenerator(site=None, _filter_unique=None, **kwargs):
    """
    Generate pages that are in the recent changes list, including duplicates.

    For parameters refer pywikibot.site.recentchanges

    :param site: Site for generator results.
    :type site: :py:obj:`pywikibot.site.BaseSite`
    """
    if site is None:
        site = pywikibot.Site()

    gen = site.recentchanges(**kwargs)
    gen.request['rcprop'] = 'title'
    # skip log entries without a title; build Page objects from the rest
    gen = (pywikibot.Page(site, rc['title'])
           for rc in gen if rc['type'] != 'log' or 'title' in rc)

    if _filter_unique:
        gen = _filter_unique(gen)
    return gen


@deprecated('site.unconnected_pages()', since='20180512')
@deprecated_args(step=True)
def UnconnectedPageGenerator(site=None, total: Optional[int] = None):
    """
    Iterate Page objects for all unconnected pages to a Wikibase repository.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    :type site: :py:obj:`pywikibot.site.APISite`
    """
    if site is None:
        site = pywikibot.Site()
    if not site.data_repository():
        raise ValueError('The given site does not have Wikibase repository.')
    return site.unconnected_pages(total=total)


@deprecated('File.usingPages()', since='20200515')
@deprecated_args(referredImagePage='referredFilePage', step=True)
def FileLinksGenerator(referredFilePage, total=None, content=False):
    """DEPRECATED. Yield Pages on which referredFilePage file is displayed."""
    return referredFilePage.usingPages(total=total,
                                       content=content)  # pragma: no cover


@deprecated('Page.imagelinks()', since='20200515')
@deprecated_args(step=True)
def ImagesPageGenerator(pageWithImages, total=None, content=False):
    """DEPRECATED. Yield FilePages displayed on pageWithImages."""
    return pageWithImages.imagelinks(total=total,
                                     content=content)  # pragma: no cover
def InterwikiPageGenerator(page):
    """Iterate over all interwiki (non-language) links on a page."""
    return (pywikibot.Page(link) for link in page.interwiki())


@deprecated_args(step=True)
def LanguageLinksPageGenerator(page, total=None):
    """Iterate over all interwiki language links on a page."""
    return (pywikibot.Page(link) for link in page.iterlanglinks(total=total))


@deprecated_args(step=True)
def CategorizedPageGenerator(category, recurse=False, start=None,
                             total=None, content=False,
                             namespaces=None):
    """Yield all pages in a specific category.

    If recurse is True, pages in subcategories are included as well; if
    recurse is an int, only subcategories to that depth will be included
    (e.g., recurse=2 will get pages in subcats and sub-subcats, but will
    not go any further).

    If start is a string value, only pages whose sortkey comes after start
    alphabetically are included.

    If content is True (default is False), the current page text of each
    retrieved page will be downloaded.

    """
    kwargs = {
        'content': content,
        'namespaces': namespaces,
        'recurse': recurse,
        'startprefix': start,
        'total': total,
    }
    yield from category.articles(**kwargs)


@deprecated_args(step=True)
def SubCategoriesPageGenerator(category, recurse=False, start=None,
                               total=None, content=False):
    """Yield all subcategories in a specific category.

    If recurse is True, pages in subcategories are included as well; if
    recurse is an int, only subcategories to that depth will be included
    (e.g., recurse=2 will get pages in subcats and sub-subcats, but will
    not go any further).

    If start is a string value, only categories whose sortkey comes after
    start alphabetically are included.

    If content is True (default is False), the current page text of each
    category description page will be downloaded.

    """
    # TODO: page generator could be modified to use cmstartsortkey ...
    for s in category.subcategories(recurse=recurse,
                                    total=total, content=content):
        if start is None or s.title(with_ns=False) >= start:
            yield s


@deprecated('Page.linkedPages()', since='20200515')
@deprecated_args(step=True)
def LinkedPageGenerator(linkingPage, total: int = None, content: bool = False):
    """DEPRECATED. Yield all pages linked from a specific page.

    See :py:obj:`pywikibot.page.BasePage.linkedPages` for details.

    :param linkingPage: the page that links to the pages we want
    :type linkingPage: :py:obj:`pywikibot.Page`
    :param total: the total number of pages to iterate
    :param content: if True, retrieve the current content of each linked page
    :return: a generator that yields Page objects of pages linked to
        linkingPage
    :rtype: generator
    """
    return linkingPage.linkedPages(total=total,
                                   content=content)  # pragma: no cover


def _yield_titles(f: Union[codecs.StreamReaderWriter, io.StringIO],
                  site: pywikibot.Site):
    """Yield page titles from a text stream.

    :param f: text stream object
    :type f: codecs.StreamReaderWriter, io.StringIO, or any other stream-like
        object
    :param site: Site for generator results.
    :type site: :py:obj:`pywikibot.site.BaseSite`
    :return: a generator that yields Page objects of pages with titles in text
        stream
    :rtype: generator
    """
    linkmatch = None
    for linkmatch in pywikibot.link_regex.finditer(f.read()):
        # If the link is in interwiki format, the Page object may reside
        # on a different Site than the default.
        # This makes it possible to work on different wikis using a single
        # text file, but also could be dangerous because you might
        # inadvertently change pages on another wiki!
        yield pywikibot.Page(pywikibot.Link(linkmatch.group('title'),
                                            site))
    if linkmatch is not None:
        # [[bracketed]] titles were found; ignore the newline format
        return

    f.seek(0)
    for title in f:
        title = title.strip()
        if '|' in title:
            # strip a trailing "|label" part
            title = title[:title.index('|')]
        if title:
            yield pywikibot.Page(site, title)


def TextIOPageGenerator(source: Optional[str] = None,
                        site: Optional[pywikibot.site.BaseSite] = None):
    """Iterate pages from a list in a text file or on a webpage.

    The text source must contain page links between double-square-brackets or,
    alternatively, separated by newlines. The generator will yield each
    corresponding Page object.

    :param source: the file path or URL that should be read. If no name is
        given, the generator prompts the user.
    :param site: Site for generator results.
    :type site: :py:obj:`pywikibot.site.BaseSite`

    """
    if source is None:
        source = pywikibot.input('Please enter the filename / URL:')
    if site is None:
        site = pywikibot.Site()
    # If source cannot be parsed as an HTTP URL, treat as local file
    if not urlparse(source).netloc:
        with codecs.open(source, 'r', config.textfile_encoding) as f:
            yield from _yield_titles(f, site)
    # Else, fetch page (page should return text in same format as that
    # expected in filename, i.e. pages separated by newlines or pages
    # enclosed in double brackets
    else:
        with io.StringIO(http.fetch(source).text) as f:
            yield from _yield_titles(f, site)


def PagesFromTitlesGenerator(iterable, site=None):
    """
    Generate pages from the titles (strings) yielded by iterable.

    :param site: Site for generator results.
    :type site: :py:obj:`pywikibot.site.BaseSite`
    """
    if site is None:
        site = pywikibot.Site()
    for title in iterable:
        if not isinstance(title, str):
            break
        yield pywikibot.Page(pywikibot.Link(title, site))
@deprecated('site.load_pages_from_pageids()', since='20200515')
def PagesFromPageidGenerator(pageids, site=None):
    """
    DEPRECATED. Return a page generator from pageids.

    Pages are iterated in the same order than in the underlying pageids.
    Pageids are filtered and only one page is returned in case of
    duplicate pageid.

    :param pageids: an iterable that returns pageids, or a comma-separated
        string of pageids (e.g. '945097,1483753,956608')
    :param site: Site for generator results.
    :type site: :py:obj:`pywikibot.site.BaseSite`
    """
    if site is None:
        site = pywikibot.Site()

    return site.load_pages_from_pageids(pageids)


@deprecated_args(number='total', step=True)
def UserContributionsGenerator(username, namespaces: List[int] = None,
                               site=None, total: Optional[int] = None,
                               _filter_unique=_filter_unique_pages):
    """Yield unique pages edited by user:username.

    :param total: Maximum number of pages to retrieve in total
    :param namespaces: list of namespace numbers to fetch contribs from
    :param site: Site for generator results.
    :type site: :py:obj:`pywikibot.site.BaseSite`
    """
    if site is None:
        site = pywikibot.Site()

    user = pywikibot.User(site, username)
    if not (user.isAnonymous() or user.isRegistered()):
        pywikibot.warning('User "{}" does not exist on site "{}".'
                          .format(user.username, site))

    gen = (contrib[0] for contrib in user.contributions(
        namespaces=namespaces, total=total))
    if _filter_unique:
        return _filter_unique(gen)
    return gen


def NamespaceFilterPageGenerator(generator, namespaces, site=None):
    """
    A generator yielding pages from another generator in given namespaces.

    If a site is provided, the namespaces are validated using the namespaces
    of that site, otherwise the namespaces are validated using the default
    site.

    NOTE: API-based generators that have a "namespaces" parameter perform
    namespace filtering more efficiently than this generator.

    :param namespaces: list of namespace identifiers to limit results
    :type namespaces: iterable of str or Namespace key,
        or a single instance of those types.
    :param site: Site for generator results; mandatory if
        namespaces contains namespace names. Defaults to the default site.
    :type site: :py:obj:`pywikibot.site.BaseSite`
    :raises KeyError: a namespace identifier was not resolved
    :raises TypeError: a namespace identifier has an inappropriate
        type such as NoneType or bool, or more than one namespace
        if the API module does not support multiple namespaces
    """
    # As site was only required if the namespaces contain strings, don't
    # attempt to use the config selected site unless the initial attempt
    # at resolving the namespaces fails.
    if not site:
        site = pywikibot.Site()
    try:
        namespaces = site.namespaces.resolve(namespaces)
    except KeyError as e:
        pywikibot.log('Failed resolving namespaces:')
        pywikibot.exception(e)
        raise

    return (page for page in generator if page.namespace() in namespaces)


@deprecated_args(ignoreList='ignore_list')
def PageTitleFilterPageGenerator(generator, ignore_list: dict):
    """
    Yield only those pages are not listed in the ignore list.

    :param ignore_list: family names are mapped to dictionaries in which
        language codes are mapped to lists of page titles. Each title must
        be a valid regex as they are compared using :py:obj:`re.search`.

    """
    def is_ignored(page):
        try:
            site_ig_list = ignore_list[page.site.family.name][page.site.code]
        except KeyError:
            # no ignore entries for this family/language
            return False
        return any(re.search(ig, page.title()) for ig in site_ig_list)

    for page in generator:
        if not is_ignored(page):
            yield page
            continue

        if config.verbose_output:
            pywikibot.output('Ignoring page {}'.format(page.title()))


def RedirectFilterPageGenerator(generator, no_redirects: bool = True,
                                show_filtered: bool = False):
    """
    Yield pages from another generator that are redirects or not.

    :param no_redirects: Exclude redirects if True, else only include
        redirects.
    :param show_filtered: Output a message for each page not yielded
    """
    fmt = '{page} is {what} redirect page. Skipping.'
    what = 'a' if no_redirects else 'not a'

    for page in generator or []:
        is_redirect = page.isRedirectPage()
        if bool(no_redirects) != bool(is_redirect):  # xor
            yield page
            continue

        if show_filtered:
            pywikibot.output(fmt.format(what=what, page=page))
class ItemClaimFilter:

    """Item claim filter."""

    # namespace check result -> page class used to re-fetch as Wikibase page
    page_classes = {
        True: pywikibot.PropertyPage,
        False: pywikibot.ItemPage,
    }

    @classmethod
    def __filter_match(cls, page, prop, claim, qualifiers):
        """
        Return true if the page contains the claim given.

        :param page: the page to check
        :return: true if page contains the claim, false otherwise
        :rtype: bool
        """
        if not isinstance(page, pywikibot.page.WikibasePage):  # T175151
            try:
                assert page.site.property_namespace
                assert page.site.item_namespace
                key = page.namespace() == page.site.property_namespace
                page_cls = cls.page_classes[key]
                page = page_cls(page.site, page.title(with_ns=False))
            except (AttributeError, AssertionError):
                try:
                    page = pywikibot.ItemPage.fromPage(page)
                except NoPageError:
                    return False

        def match_qualifiers(page_claim, qualifiers):
            return all(page_claim.has_qualifier(prop, val)
                       for prop, val in qualifiers.items())

        page_claims = page.get()['claims'].get(prop, [])
        return any(
            p_cl.target_equals(claim) and match_qualifiers(p_cl, qualifiers)
            for p_cl in page_claims)

    @classmethod
    def filter(cls, generator, prop: str, claim,
               qualifiers: Optional[dict] = None,
               negate: bool = False):
        """
        Yield all ItemPages which contain certain claim in a property.

        :param prop: property id to check
        :param claim: value of the property to check. Can be exact value (for
            instance, ItemPage instance) or a string (e.g. 'Q37470').
        :param qualifiers: dict of qualifiers that must be present, or None if
            qualifiers are irrelevant
        :param negate: true if pages that do *not* contain specified claim
            should be yielded, false otherwise
        """
        qualifiers = qualifiers or {}
        for page in generator:
            if cls.__filter_match(page, prop, claim, qualifiers) is not negate:
                yield page


# name the generator methods
ItemClaimFilterPageGenerator = ItemClaimFilter.filter


def SubpageFilterGenerator(generator, max_depth: int = 0,
                           show_filtered: bool = False):
    """
    Generator which filters out subpages based on depth.

    It looks at the namespace of each page and checks if that namespace has
    subpages enabled. If so, pages with forward slashes ('/') are excluded.

    :param generator: A generator object
    :type generator: any generator or iterator
    :param max_depth: Max depth of subpages to yield, at least zero
    :param show_filtered: Output a message for each page not yielded
    """
    assert max_depth >= 0, 'Max subpage depth must be at least 0'

    for page in generator:
        if page.depth <= max_depth:
            yield page
        else:
            if show_filtered:
                pywikibot.output(
                    'Page {} is a subpage that is too deep. Skipping.'
                    .format(page))


class RegexFilter:

    """Regex filter."""

    @classmethod
    def __filter_match(cls, regex, string, quantifier):
        """Return True if string matches precompiled regex list.

        :param quantifier: a qualifier
        :type quantifier: str of 'all', 'any' or 'none'
        :rtype: bool
        """
        if quantifier == 'all':
            match = all(r.search(string) for r in regex)
        else:
            match = any(r.search(string) for r in regex)
        # 'none' inverts the match result
        return (quantifier == 'none') ^ match

    @classmethod
    def __precompile(cls, regex, flag):
        """Precompile the regex list if needed."""
        # Enable multiple regexes
        if not isinstance(regex, (list, tuple)):
            regex = [regex]
        # Test if regex is already compiled.
        # We assume that all list components have the same type
        if isinstance(regex[0], str):
            regex = [re.compile(r, flag) for r in regex]
        return regex

    @classmethod
    @deprecated_args(inverse='quantifier')
    def titlefilter(cls, generator, regex, quantifier='any',
                    ignore_namespace=True):
        """Yield pages from another generator whose title matches regex.

        Uses regex option re.IGNORECASE depending on the quantifier parameter.

        If ignore_namespace is False, the whole page title is compared.
        NOTE: if you want to check for a match at the beginning of the title,
        you have to start the regex with "^"

        :param generator: another generator
        :type generator: any generator or iterator
        :param regex: a regex which should match the page title
        :type regex: a single regex string or a list of regex strings or a
            compiled regex or a list of compiled regexes
        :param quantifier: must be one of the following values:
            'all' - yields page if title is matched by all regexes
            'any' - yields page if title is matched by any regexes
            'none' - yields page if title is NOT matched by any regexes
        :type quantifier: str of ('all', 'any', 'none')
        :param ignore_namespace: ignore the namespace when matching the title
        :type ignore_namespace: bool
        :return: return a page depending on the matching parameters

        """
        # for backwards compatibility with compat for inverse parameter
        if quantifier is False:
            quantifier = 'any'
        elif quantifier is True:
            quantifier = 'none'
        reg = cls.__precompile(regex, re.I)
        for page in generator:
            title = page.title(with_ns=not ignore_namespace)
            if cls.__filter_match(reg, title, quantifier):
                yield page

    @classmethod
    def contentfilter(cls, generator, regex, quantifier='any'):
        """Yield pages from another generator whose body matches regex.

        Uses regex option re.IGNORECASE depending on the quantifier parameter.

        For parameters see titlefilter above.

        """
        reg = cls.__precompile(regex, re.IGNORECASE | re.DOTALL)
        return (page for page in generator
                if cls.__filter_match(reg, page.text, quantifier))
1887 NOTE: if you want to check for a match at the beginning of the title, 1888 you have to start the regex with "^" 1889 1890 :param generator: another generator 1891 :type generator: any generator or iterator 1892 :param regex: a regex which should match the page title 1893 :type regex: a single regex string or a list of regex strings or a 1894 compiled regex or a list of compiled regexes 1895 :param quantifier: must be one of the following values: 1896 'all' - yields page if title is matched by all regexes 1897 'any' - yields page if title is matched by any regexes 1898 'none' - yields page if title is NOT matched by any regexes 1899 :type quantifier: str of ('all', 'any', 'none') 1900 :param ignore_namespace: ignore the namespace when matching the title 1901 :type ignore_namespace: bool 1902 :return: return a page depending on the matching parameters 1903 1904 """ 1905 # for backwards compatibility with compat for inverse parameter 1906 if quantifier is False: 1907 quantifier = 'any' 1908 elif quantifier is True: 1909 quantifier = 'none' 1910 reg = cls.__precompile(regex, re.I) 1911 for page in generator: 1912 title = page.title(with_ns=not ignore_namespace) 1913 if cls.__filter_match(reg, title, quantifier): 1914 yield page 1915 1916 @classmethod 1917 def contentfilter(cls, generator, regex, quantifier='any'): 1918 """Yield pages from another generator whose body matches regex. 1919 1920 Uses regex option re.IGNORECASE depending on the quantifier parameter. 1921 1922 For parameters see titlefilter above. 1923 1924 """ 1925 reg = cls.__precompile(regex, re.IGNORECASE | re.DOTALL) 1926 return (page for page in generator 1927 if cls.__filter_match(reg, page.text, quantifier)) 1928 1929 1930def QualityFilterPageGenerator(generator, quality: List[int]): 1931 """ 1932 Wrap a generator to filter pages according to quality levels. 1933 1934 This is possible only for pages with content_model 'proofread-page'. 1935 In all the other cases, no filter is applied. 

    :param generator: A generator object
    :param quality: proofread-page quality levels (valid range 0-4)

    """
    for page in generator:
        if page.namespace() == page.site.proofread_page_ns:
            page = ProofreadPage(page)
            if page.quality_level in quality:
                yield page
        else:
            # not a proofread page: passed through unfiltered
            yield page


@deprecated_args(site=True)
def CategoryFilterPageGenerator(generator, category_list):
    """
    Wrap a generator to filter pages by categories specified.

    :param generator: A generator object
    :param category_list: categories used to filter generated pages
    :type category_list: list of category objects

    """
    for page in generator:
        # page must be in *all* listed categories to be yielded
        if all(x in page.categories() for x in category_list):
            yield page


# name the generator methods
RegexFilterPageGenerator = RegexFilter.titlefilter
RegexBodyFilterPageGenerator = RegexFilter.contentfilter


@deprecated_args(begintime='last_edit_start', endtime='last_edit_end')
def EdittimeFilterPageGenerator(generator,
                                last_edit_start=None,
                                last_edit_end=None,
                                first_edit_start=None,
                                first_edit_end=None,
                                show_filtered=False):
    """
    Wrap a generator to filter pages outside last or first edit range.

    :param generator: A generator object
    :param last_edit_start: Only yield pages last edited after this time
    :type last_edit_start: datetime
    :param last_edit_end: Only yield pages last edited before this time
    :type last_edit_end: datetime
    :param first_edit_start: Only yield pages first edited after this time
    :type first_edit_start: datetime
    :param first_edit_end: Only yield pages first edited before this time
    :type first_edit_end: datetime
    :param show_filtered: Output a message for each page not yielded
    :type show_filtered: bool

    """
    def to_be_yielded(edit, page, show_filtered):
        # do_edit is falsy when neither bound of this range was given,
        # so the range imposes no restriction
        if not edit.do_edit:
            return True

        if isinstance(edit, Latest):
            edit_time = page.latest_revision.timestamp
        else:
            edit_time = page.oldest_revision.timestamp

        msg = '{prefix} edit on {page} was on {time}.\n' \
              'Too {{when}}. Skipping.' \
              .format(prefix=edit.__class__.__name__,  # prefix = Class name.
                      page=page,
                      time=edit_time.isoformat())

        if edit_time < edit.edit_start:
            _output_if(show_filtered, msg.format(when='old'))
            return False

        if edit_time > edit.edit_end:
            _output_if(show_filtered, msg.format(when='recent'))
            return False

        return True

    # Latest shares First's fields; isinstance() above distinguishes which
    # revision (latest vs oldest) to check
    First = namedtuple('First', ['do_edit', 'edit_start', 'edit_end'])
    Latest = namedtuple('Latest', First._fields)

    latest_edit = Latest(do_edit=last_edit_start or last_edit_end,
                         edit_start=last_edit_start or datetime.datetime.min,
                         edit_end=last_edit_end or datetime.datetime.max)

    first_edit = First(do_edit=first_edit_start or first_edit_end,
                       edit_start=first_edit_start or datetime.datetime.min,
                       edit_end=first_edit_end or datetime.datetime.max)

    # 'generator or []' tolerates a None generator argument
    for page in generator or []:
        if (to_be_yielded(latest_edit, page, show_filtered)
                and to_be_yielded(first_edit, page, show_filtered)):
            yield page


def UserEditFilterGenerator(generator, username: str, timestamp=None,
                            skip: bool = False,
                            max_revision_depth: Optional[int] = None,
                            show_filtered: bool = False):
    """
    Generator which will yield Pages modified by username.

    It only looks at the last editors given by max_revision_depth.
    If timestamp is set in MediaWiki format JJJJMMDDhhmmss, older edits are
    ignored.
    If skip is set, pages edited by the given user are ignored otherwise only
    pages edited by this user are given back.

    :param generator: A generator object
    :param username: user name which edited the page
    :param timestamp: ignore edits which are older than this timestamp
    :type timestamp: datetime or str (MediaWiki format JJJJMMDDhhmmss) or None
    :param skip: Ignore pages edited by the given user
    :param max_revision_depth: It only looks at the last editors given by
        max_revision_depth
    :param show_filtered: Output a message for each page not yielded
    """
    if isinstance(timestamp, str):
        ts = pywikibot.Timestamp.fromtimestampformat(timestamp)
    else:
        ts = timestamp

    for page in generator:
        contribs = page.contributors(total=max_revision_depth, endtime=ts)
        # NOTE(review): subscripting relies on contributors() returning a
        # Counter-like mapping where a missing user gives a falsy 0 — confirm
        if bool(contribs[username]) is not bool(skip):  # xor operation
            yield page
        elif show_filtered:
            pywikibot.output('Skipping {}'.format(page.title(as_link=True)))


@deprecated('itertools.chain(*iterables)', since='20180513')
def CombinedPageGenerator(generators):
    """Yield from each iterable until exhausted, then proceed with the next."""
    return itertools.chain(*generators)  # pragma: no cover


def PageClassGenerator(generator):
    """
    Yield pages from another generator as Page subclass objects.

    The page class type depends on the page namespace.
    Objects may be Category, FilePage, Userpage or Page.
    """
    for page in generator:
        if page.namespace() == page.site.namespaces.USER:
            yield pywikibot.User(page)
        elif page.namespace() == page.site.namespaces.FILE:
            yield pywikibot.FilePage(page)
        elif page.namespace() == page.site.namespaces.CATEGORY:
            yield pywikibot.Category(page)
        else:
            yield page


def PageWithTalkPageGenerator(generator, return_talk_only=False):
    """Yield pages and associated talk pages from another generator.

    Only yields talk pages if the original generator yields a non-talk page,
    and does not check if the talk page in fact exists.

    """
    for page in generator:
        if not return_talk_only or page.isTalkPage():
            yield page
        if not page.isTalkPage():
            # always follow a subject page with its talk page
            yield page.toggleTalkPage()


@deprecated('LiveRCPageGenerator or EventStreams', since='20180415')
def RepeatingGenerator(generator, key_func=lambda x: x, sleep_duration=60,
                       total: Optional[int] = None, **kwargs):
    """Yield items in live time.

    The provided generator must support parameter 'start', 'end',
    'reverse', and 'total' such as site.recentchanges(), site.logevents().

    To fetch revisions in recentchanges in live time::

        gen = RepeatingGenerator(site.recentchanges, lambda x: x['revid'])

    To fetch new pages in live time::

        gen = RepeatingGenerator(site.newpages, lambda x: x[0])

    Note that other parameters not listed below will be passed
    to the generator function. Parameter 'reverse', 'start', 'end'
    will always be discarded to prevent the generator yielding items
    in wrong order.

    :param generator: a function returning a generator that will be queried
    :param key_func: a function returning key that will be used to detect
        duplicate entry
    :param sleep_duration: duration between each query
    :param total: if it is a positive number, iterate no more than this
        number of items in total.
        Otherwise, iterate forever
    :return: a generator yielding items in ascending order by time
    """
    kwargs.pop('reverse', None)  # always get newest item first
    kwargs.pop('start', None)  # don't set start time
    kwargs.pop('end', None)  # don't set stop time

    seen = set()
    while total is None or len(seen) < total:
        def filtered_generator():
            # first poll fetches a single item; later polls are unbounded
            for item in generator(total=None if seen else 1, **kwargs):
                key = key_func(item)
                if key not in seen:
                    seen.add(key)
                    yield item
                    if len(seen) == total:
                        return
                else:
                    # reached an item already seen in a previous poll:
                    # everything older was processed, stop this query
                    break
            pywikibot.sleep(sleep_duration)

        # newest-first results are reversed to yield in ascending time order
        yield from reversed(list(filtered_generator()))


@deprecated_args(pageNumber='groupsize', step='groupsize', lookahead=True)
def PreloadingGenerator(generator, groupsize: int = 50):
    """
    Yield preloaded pages taken from another generator.

    :param generator: pages to iterate over
    :param groupsize: how many pages to preload at once
    """
    # pages may be on more than one site, for example if an interwiki
    # generator is used, so use a separate preloader for each site
    sites = {}
    # build a list of pages for each site found in the iterator
    for page in generator:
        site = page.site
        sites.setdefault(site, []).append(page)
        if len(sites[site]) >= groupsize:
            # if this site is at the groupsize, process it
            group = sites.pop(site)
            yield from site.preloadpages(group, groupsize=groupsize)

    for site, pages in sites.items():
        # process any leftover sites that never reached the groupsize
        yield from site.preloadpages(pages, groupsize=groupsize)


@deprecated_args(step='groupsize')
def DequePreloadingGenerator(generator, groupsize=50):
    """Preload generator of type DequeGenerator."""
    assert isinstance(generator, DequeGenerator), \
        'generator must be a DequeGenerator object'

    # a DequeGenerator has a len(); drain it in batches until empty
    while True:
        page_count = min(len(generator), groupsize)
        if not page_count:
            return

        yield from PreloadingGenerator(generator, page_count)


@deprecated_args(step='groupsize')
def PreloadingEntityGenerator(generator, groupsize: int = 50):
    """
    Yield preloaded pages taken from another generator.

    Function basically is copied from above, but for Wikibase entities.

    :param generator: pages to iterate over
    :type generator: Iterable
    :param groupsize: how many pages to preload at once
    """
    sites = {}
    for page in generator:
        site = page.site
        sites.setdefault(site, []).append(page)
        if len(sites[site]) >= groupsize:
            # if this site is at the groupsize, process it
            group = sites.pop(site)
            repo = site.data_repository()
            yield from repo.preload_entities(group, groupsize)

    for site, pages in sites.items():
        # process any leftover sites that never reached the groupsize
        repo = site.data_repository()
        yield from repo.preload_entities(pages, groupsize)


@deprecated_args(number='total', step=True, repeat=True)
def NewimagesPageGenerator(total: Optional[int] = None, site=None):
    """
    New file generator.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    :type site: :py:obj:`pywikibot.site.BaseSite`
    """
    if site is None:
        site = pywikibot.Site()
    return (entry.page()
            for entry in site.logevents(logtype='upload', total=total))


def WikibaseItemGenerator(gen):
    """
    A wrapper generator used to yield Wikibase items of another generator.

    :param gen: Generator to wrap.
    :type gen: generator
    :return: Wrapped generator
    :rtype: generator
    """
    for page in gen:
        if isinstance(page, pywikibot.ItemPage):
            yield page
        elif page.site.data_repository() == page.site:
            # These are already items, as they have a DataSite in page.site.
            # However generator is yielding Page, so convert to ItemPage.
            # FIXME: If we've already fetched content, we should retain it
            yield pywikibot.ItemPage(page.site, page.title())
        else:
            yield pywikibot.ItemPage.fromPage(page)


def WikibaseItemFilterPageGenerator(generator, has_item: bool = True,
                                    show_filtered: bool = False):
    """
    A wrapper generator used to exclude if page has a Wikibase item or not.

    :param generator: Generator to wrap.
    :type generator: generator
    :param has_item: Exclude pages without an item if True, or only
        include pages without an item if False
    :param show_filtered: Output a message for each page not yielded
    :return: Wrapped generator
    :rtype: generator
    """
    # double braces keep {page} as a placeholder for the per-page format below
    why = "doesn't" if has_item else 'has'
    msg = '{{page}} {why} a wikidata item. Skipping.'.format(why=why)

    for page in generator or []:
        try:
            page_item = pywikibot.ItemPage.fromPage(page, lazy_load=False)
        except NoPageError:
            page_item = None

        to_be_skipped = bool(page_item) != has_item
        if to_be_skipped:
            _output_if(show_filtered, msg.format(page=page))
            continue

        yield page


@deprecated('Site.unusedfiles()', since='20200515')
@deprecated_args(extension=True, number='total', repeat=True)
def UnusedFilesGenerator(total: Optional[int] = None,
                         site=None):  # pragma: no cover
    """
    DEPRECATED. Unused files generator.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    :type site: :py:obj:`pywikibot.site.BaseSite`
    """
    if site is None:
        site = pywikibot.Site()
    return site.unusedfiles(total=total)


# The generators below are thin deprecated wrappers around the corresponding
# Site methods; each only defaults the site and forwards its arguments.

@deprecated('Site.withoutinterwiki()', since='20200515')
@deprecated_args(number='total', repeat=True)
def WithoutInterwikiPageGenerator(total=None, site=None):  # pragma: no cover
    """
    DEPRECATED. Page lacking interwikis generator.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    :type site: :py:obj:`pywikibot.site.BaseSite`
    """
    if site is None:
        site = pywikibot.Site()
    return site.withoutinterwiki(total=total)


@deprecated('Site.uncategorizedcategories()', since='20200515')
@deprecated_args(number='total', repeat=True)
def UnCategorizedCategoryGenerator(total: Optional[int] = 100,
                                   site=None):  # pragma: no cover
    """
    DEPRECATED. Uncategorized category generator.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    :type site: :py:obj:`pywikibot.site.BaseSite`
    """
    if site is None:
        site = pywikibot.Site()
    return site.uncategorizedcategories(total=total)


@deprecated('Site.uncategorizedimages()', since='20200515')
@deprecated_args(number='total', repeat=True)
def UnCategorizedImageGenerator(total: int = 100,
                                site=None):  # pragma: no cover
    """
    DEPRECATED. Uncategorized file generator.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    :type site: :py:obj:`pywikibot.site.BaseSite`
    """
    if site is None:
        site = pywikibot.Site()
    return site.uncategorizedimages(total=total)


@deprecated('Site.uncategorizedpages()', since='20200515')
@deprecated_args(number='total', repeat=True)
def UnCategorizedPageGenerator(total: int = 100,
                               site=None):  # pragma: no cover
    """
    DEPRECATED. Uncategorized page generator.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    :type site: :py:obj:`pywikibot.site.BaseSite`
    """
    if site is None:
        site = pywikibot.Site()
    return site.uncategorizedpages(total=total)


@deprecated('Site.uncategorizedtemplates()', since='20200515')
@deprecated_args(number='total', repeat=True)
def UnCategorizedTemplateGenerator(total: int = 100,
                                   site=None):  # pragma: no cover
    """
    DEPRECATED. Uncategorized template generator.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    :type site: :py:obj:`pywikibot.site.BaseSite`
    """
    if site is None:
        site = pywikibot.Site()
    return site.uncategorizedtemplates(total=total)


@deprecated('Site.lonelypages()', since='20200515')
@deprecated_args(number='total', repeat=True)
def LonelyPagesPageGenerator(total: Optional[int] = None,
                             site=None):  # pragma: no cover
    """
    DEPRECATED. Lonely page generator.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    :type site: :py:obj:`pywikibot.site.BaseSite`
    """
    if site is None:
        site = pywikibot.Site()
    return site.lonelypages(total=total)


@deprecated('Site.unwatchedpages()', since='20200515')
@deprecated_args(number='total', repeat=True)
def UnwatchedPagesPageGenerator(total: Optional[int] = None,
                                site=None):  # pragma: no cover
    """
    DEPRECATED. Unwatched page generator.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    :type site: :py:obj:`pywikibot.site.BaseSite`
    """
    if site is None:
        site = pywikibot.Site()
    return site.unwatchedpages(total=total)


@deprecated('Site.pages_with_property()', since='20200515')
def page_with_property_generator(name: str, total: Optional[int] = None,
                                 site=None):  # pragma: no cover
    """
    Special:PagesWithProperty page generator.

    :param name: Property name of pages to be retrieved
    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    :type site: :py:obj:`pywikibot.site.BaseSite`
    """
    if site is None:
        site = pywikibot.Site()
    return site.pages_with_property(name, total=total)


@deprecated('Site.wantedpages', since='20180803')
def WantedPagesPageGenerator(total: int = 100, site=None):  # pragma: no cover
    """
    Wanted page generator.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    :type site: :py:obj:`pywikibot.site.BaseSite`
    """
    if site is None:
        site = pywikibot.Site()
    return site.wantedpages(total=total)


@deprecated_args(number='total', repeat=True)
def AncientPagesPageGenerator(total: int = 100, site=None):  # pragma: no cover
    """
    Ancient page generator.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    :type site: :py:obj:`pywikibot.site.BaseSite`
    """
    if site is None:
        site = pywikibot.Site()
    # ancientpages() yields (page, timestamp) pairs; only the page is wanted
    return (page for page, _ in site.ancientpages(total=total))


@deprecated('Site.deadendpages()', since='20200515')
@deprecated_args(number='total', repeat=True)
def DeadendPagesPageGenerator(total: int = 100, site=None):  # pragma: no cover
    """
    DEPRECATED. Dead-end page generator.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    :type site: :py:obj:`pywikibot.site.BaseSite`
    """
    if site is None:
        site = pywikibot.Site()
    return site.deadendpages(total=total)


@deprecated_args(number='total', repeat=True)
def LongPagesPageGenerator(total: int = 100, site=None):
    """
    Long page generator.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    :type site: :py:obj:`pywikibot.site.BaseSite`
    """
    if site is None:
        site = pywikibot.Site()
    # longpages() yields (page, length) pairs; only the page is wanted
    return (page for page, _ in site.longpages(total=total))


@deprecated_args(number='total', repeat=True)
def ShortPagesPageGenerator(total: int = 100, site=None):
    """
    Short page generator.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    :type site: :py:obj:`pywikibot.site.BaseSite`
    """
    if site is None:
        site = pywikibot.Site()
    # shortpages() yields (page, length) pairs; only the page is wanted
    return (page for page, _ in site.shortpages(total=total))


@deprecated('Site.randompages()', since='20200515')
@deprecated_args(number='total')
def RandomPageGenerator(total: Optional[int] = None, site=None,
                        namespaces=None):  # pragma: no cover
    """
    DEPRECATED. Random page generator.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    :type site: :py:obj:`pywikibot.site.BaseSite`
    """
    if site is None:
        site = pywikibot.Site()
    return site.randompages(total=total, namespaces=namespaces)


@deprecated('Site.randompages()', since='20200515')
@deprecated_args(number='total')
def RandomRedirectPageGenerator(total: Optional[int] = None, site=None,
                                namespaces=None):  # pragma: no cover
    """
    DEPRECATED. Random redirect generator.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    :type site: :py:obj:`pywikibot.site.BaseSite`
    """
    if site is None:
        site = pywikibot.Site()
    return site.randompages(total=total, namespaces=namespaces,
                            redirects=True)


@deprecated('Site.exturlusage()', since='20200515')
@deprecated_args(link='url', euprotocol='protocol', step=True)
def LinksearchPageGenerator(url: str, namespaces: List[int] = None,
                            total: Optional[int] = None, site=None,
                            protocol: Optional[str] = None):
    """DEPRECATED. Yield all pages that link to a certain URL.

    :param url: The URL to search for (with ot without the protocol prefix);
        this may include a '*' as a wildcard, only at the start of the
        hostname
    :param namespaces: list of namespace numbers to fetch contribs from
    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results
    :type site: :py:obj:`pywikibot.site.BaseSite`
    :param protocol: Protocol to search for, likely http or https, http by
        default. Full list shown on Special:LinkSearch wikipage
    """
    if site is None:
        site = pywikibot.Site()
    return site.exturlusage(url, namespaces=namespaces, protocol=protocol,
                            total=total, content=False)


@deprecated('Site.search()', since='20200515')
@deprecated_args(number='total', step=True)
def SearchPageGenerator(query, total: Optional[int] = None, namespaces=None,
                        site=None):  # pragma: no cover
    """
    DEPRECATED. Yield pages from the MediaWiki internal search engine.

    :param total: Maximum number of pages to retrieve in total
    :param site: Site for generator results.
    :type site: :py:obj:`pywikibot.site.BaseSite`
    """
    if site is None:
        site = pywikibot.Site()
    return site.search(query, total=total, namespaces=namespaces)


def LiveRCPageGenerator(site=None, total: Optional[int] = None):
    """
    Yield pages from a socket.io RC stream.

    Generates pages based on the EventStreams Server-Sent-Event (SSE) recent
    changes stream.
    The Page objects will have an extra property ._rcinfo containing the
    literal rc data. This can be used to e.g. filter only new pages. See
    `pywikibot.comms.eventstreams.rc_listener` for details on the .rcinfo
    format.

    :param site: site to return recent changes for
    :type site: pywikibot.BaseSite
    :param total: the maximum number of changes to return
    """
    if site is None:
        site = pywikibot.Site()

    from pywikibot.comms.eventstreams import site_rc_listener

    for entry in site_rc_listener(site, total=total):
        # The title in a log entry may have been suppressed
        if 'title' not in entry and entry['type'] == 'log':
            continue
        page = pywikibot.Page(site, entry['title'], entry['namespace'])
        page._rcinfo = entry
        yield page


# following classes just ported from version 1 without revision; not tested


class GoogleSearchPageGenerator:

    """
    Page generator using Google search results.

    To use this generator, you need to install the package 'google':

    :py:obj:`https://pypi.org/project/google`

    This package has been available since 2010, hosted on GitHub
    since 2012, and provided by PyPI since 2013.

    As there are concerns about Google's Terms of Service, this
    generator prints a warning for each query.
    """

    def __init__(self, query=None, site=None):
        """
        Initializer.

        :param query: search string; prompted interactively when omitted
        :param site: Site for generator results.
        :type site: :py:obj:`pywikibot.site.BaseSite`
        """
        self.query = query or pywikibot.input('Please enter the search query:')
        if site is None:
            site = pywikibot.Site()
        self.site = site

    def queryGoogle(self, query):
        """
        Perform a query using python package 'google'.

        The terms of service as at June 2014 give two conditions that
        may apply to use of search:

        1. Don't access [Google Services] using a method other than
           the interface and the instructions that [they] provide.
        2. Don't remove, obscure, or alter any legal notices
           displayed in or along with [Google] Services.

        Both of those issues should be managed by the package 'google',
        however Pywikibot will at least ensure the user sees the TOS
        in order to comply with the second condition.
        """
        try:
            import google
        except ImportError:
            pywikibot.error('ERROR: generator GoogleSearchPageGenerator '
                            "depends on package 'google'.\n"
                            'To install, please run: pip install google.')
            sys.exit(1)
        pywikibot.warning('Please read http://www.google.com/accounts/TOS')
        yield from google.search(query)

    def __iter__(self):
        """Iterate results."""
        # restrict query to local site
        localQuery = '{} site:{}'.format(self.query, self.site.hostname())
        base = 'http://{}{}'.format(self.site.hostname(),
                                    self.site.article_path)
        for url in self.queryGoogle(localQuery):
            if url[:len(base)] == base:
                title = url[len(base):]
                page = pywikibot.Page(pywikibot.Link(title, self.site))
                # Google contains links in the format
                # https://de.wikipedia.org/wiki/en:Foobar
                if page.site == self.site:
                    yield page


def MySQLPageGenerator(query, site=None, verbose=None):
    """
    Yield a list of pages based on a MySQL query.

    The query should return two columns, page namespace and page title pairs
    from some table. An example query that yields all ns0 pages might look
    like::

        SELECT
         page_namespace,
         page_title
        FROM page
        WHERE page_namespace = 0;

    See https://www.mediawiki.org/wiki/Manual:Pywikibot/MySQL

    :param query: MySQL query to execute
    :param site: Site object
    :type site: :py:obj:`pywikibot.site.BaseSite`
    :param verbose: if True, print query to be executed;
        if None, config.verbose_output will be used.
    :type verbose: None or bool
    :return: generator which yields pywikibot.Page
    """
    from pywikibot.data import mysql

    if site is None:
        site = pywikibot.Site()

    row_gen = mysql.mysql_query(query,
                                dbname=site.dbName(),
                                verbose=verbose)

    for row in row_gen:
        namespace_number, page_name = row
        # NOTE(review): assumes titles come back as bytes in the site's
        # encoding — confirm against the mysql connector in use
        page_name = page_name.decode(site.encoding())
        page = pywikibot.Page(site, page_name, ns=int(namespace_number))
        yield page


class XMLDumpOldPageGenerator(Iterator):

    """
    Xml generator that yields Page objects with old text loaded.

    :param filename: filename of XML dump
    :type filename: str
    :param start: skip entries below that value
    :type start: str or None
    :param namespaces: namespace filter
    :type namespaces: iterable of str or Namespace key,
        or a single instance of those types
    :param site: current site for the generator
    :type site: pywikibot.Site or None
    :param text_predicate: a callable with entry.text as parameter and boolean
        as result to indicate the generator should return the page or not
    :type text_predicate: function identifier or None

    :ivar text_predicate: holds text_predicate function
    :ivar skipping: True if start parameter is given, else False
    :ivar start: holds start parameter
    :ivar namespaces: holds namespaces filter
    :ivar parser: holds the xmlreader.XmlDump parse method
    """

    @deprecated_args(xmlFilename='filename')
    def __init__(self, filename: str, start: Optional[str] = None,
                 namespaces=None, site=None,
                 text_predicate=None):
        """Initializer."""
        self.text_predicate = text_predicate

        self.skipping = bool(start)
        if self.skipping:
            # dump titles use spaces, not underscores
            self.start = start.replace('_', ' ')
        else:
            self.start = None

        self.site = site or pywikibot.Site()
        if not namespaces:
            self.namespaces = self.site.namespaces
        else:
            self.namespaces = self.site.namespaces.resolve(namespaces)

        dump = xmlreader.XmlDump(filename)
        self.parser = dump.parse()

    def __next__(self):
        """Get next Page."""
        while True:
            entry = next(self.parser)
            if self.skipping:
                # skip until the start title is reached, then stop skipping
                if entry.title < self.start:
                    continue
                self.skipping = False
            page = pywikibot.Page(self.site, entry.title)
            if page.namespace() not in self.namespaces:
                continue
            if not self.text_predicate or self.text_predicate(entry.text):
                page.text = entry.text
                return page


class XMLDumpPageGenerator(XMLDumpOldPageGenerator):

    """Xml generator that yields Page objects without text loaded."""

    def __next__(self):
        """Get next Page from dump and remove the text."""
        page = super().__next__()
        del page.text
        return page


def YearPageGenerator(start=1, end=2050, site=None):
    """
    Year page generator.

    :param site: Site for generator results.
    :type site: :py:obj:`pywikibot.site.BaseSite`
    """
    if site is None:
        site = pywikibot.Site()
    pywikibot.output('Starting with year {}'.format(start))
    for i in range(start, end + 1):
        if i % 100 == 0:
            pywikibot.output('Preparing {}...'.format(i))
        # There is no year 0
        if i != 0:
            current_year = date.formatYear(site.lang, i)
            yield pywikibot.Page(pywikibot.Link(current_year, site))


@deprecated_args(startMonth='start_month', endMonth='end_month')
def DayPageGenerator(start_month: int = 1, end_month: int = 12,
                     site=None, year: int = 2000):
    """
    Day page generator.

    :param site: Site for generator results.
    :type site: :py:obj:`pywikibot.site.BaseSite`
    :param year: considering leap year.
2820 """ 2821 if site is None: 2822 site = pywikibot.Site() 2823 lang = site.lang 2824 firstPage = pywikibot.Page(site, date.format_date(start_month, 1, lang)) 2825 pywikibot.output('Starting with {}'.format(firstPage.title(as_link=True))) 2826 for month in range(start_month, end_month + 1): 2827 for day in range(1, calendar.monthrange(year, month)[1] + 1): 2828 yield pywikibot.Page( 2829 pywikibot.Link(date.format_date(month, day, lang), site)) 2830 2831 2832def WikidataPageFromItemGenerator(gen, site): 2833 """Generate pages from site based on sitelinks of item pages. 2834 2835 :param gen: generator of :py:obj:`pywikibot.ItemPage` 2836 :param site: Site for generator results. 2837 :type site: :py:obj:`pywikibot.site.BaseSite` 2838 2839 """ 2840 repo = site.data_repository() 2841 for sublist in itergroup(gen, 50): 2842 req = {'ids': [item.id for item in sublist], 2843 'sitefilter': site.dbName(), 2844 'action': 'wbgetentities', 2845 'props': 'sitelinks'} 2846 2847 wbrequest = repo._simple_request(**req) 2848 wbdata = wbrequest.submit() 2849 entities = (item for item in wbdata['entities'].values() if 2850 'sitelinks' in item and site.dbName() in item['sitelinks']) 2851 sitelinks = (item['sitelinks'][site.dbName()]['title'] 2852 for item in entities) 2853 for sitelink in sitelinks: 2854 yield pywikibot.Page(site, sitelink) 2855 2856 2857def WikidataSPARQLPageGenerator(query, 2858 site=None, item_name: str = 'item', 2859 endpoint: Optional[str] = None, 2860 entity_url: Optional[str] = None, 2861 result_type=set): 2862 """Generate pages that result from the given SPARQL query. 2863 2864 :param query: the SPARQL query string. 2865 :param site: Site for generator results. 2866 :type site: :py:obj:`pywikibot.site.BaseSite` 2867 :param item_name: name of the item in the SPARQL query 2868 :param endpoint: SPARQL endpoint URL 2869 :param entity_url: URL prefix for any entities returned in a query. 
2870 :param result_type: type of the iterable in which 2871 SPARQL results are stored (default set) 2872 :type result_type: iterable 2873 2874 """ 2875 from pywikibot.data import sparql 2876 2877 if site is None: 2878 site = pywikibot.Site() 2879 repo = site.data_repository() 2880 dependencies = {'endpoint': endpoint, 'entity_url': entity_url} 2881 if not endpoint or not entity_url: 2882 dependencies['repo'] = repo 2883 query_object = sparql.SparqlQuery(**dependencies) 2884 data = query_object.get_items(query, 2885 item_name=item_name, 2886 result_type=result_type) 2887 entities = (repo.get_entity_for_entity_id(entity) for entity in data) 2888 if isinstance(site, pywikibot.site.DataSite): 2889 return entities 2890 2891 return WikidataPageFromItemGenerator(entities, site) 2892 2893 2894def WikibaseSearchItemPageGenerator(text: str, 2895 language: Optional[str] = None, 2896 total: Optional[int] = None, site=None): 2897 """ 2898 Generate pages that contain the provided text. 2899 2900 :param text: Text to look for. 2901 :param language: Code of the language to search in. If not specified, 2902 value from pywikibot.config.data_lang is used. 2903 :param total: Maximum number of pages to retrieve in total, or None in 2904 case of no limit. 2905 :param site: Site for generator results. 2906 :type site: :py:obj:`pywikibot.site.BaseSite` 2907 """ 2908 if site is None: 2909 site = pywikibot.Site() 2910 if language is None: 2911 language = site.lang 2912 repo = site.data_repository() 2913 2914 data = repo.search_entities(text, language, total=total) 2915 return (pywikibot.ItemPage(repo, item['id']) for item in data) 2916 2917 2918class PetScanPageGenerator: 2919 """Queries PetScan (https://petscan.wmflabs.org/) to generate pages.""" 2920 2921 def __init__(self, categories, subset_combination=True, namespaces=None, 2922 site=None, extra_options=None): 2923 """ 2924 Initializer. 
2925 2926 :param categories: List of categories to retrieve pages from 2927 (as strings) 2928 :param subset_combination: Combination mode. 2929 If True, returns the intersection of the results of the categories, 2930 else returns the union of the results of the categories 2931 :param namespaces: List of namespaces to search in 2932 (default is None, meaning all namespaces) 2933 :param site: Site to operate on 2934 (default is the default site from the user config) 2935 :param extra_options: Dictionary of extra options to use (optional) 2936 """ 2937 if site is None: 2938 site = pywikibot.Site() 2939 2940 self.site = site 2941 self.opts = self.buildQuery(categories, subset_combination, 2942 namespaces, extra_options) 2943 2944 def buildQuery(self, categories, subset_combination, namespaces, 2945 extra_options): 2946 """ 2947 Get the querystring options to query PetScan. 2948 2949 :param categories: List of categories (as strings) 2950 :param subset_combination: Combination mode. 2951 If True, returns the intersection of the results of the categories, 2952 else returns the union of the results of the categories 2953 :param namespaces: List of namespaces to search in 2954 :param extra_options: Dictionary of extra options to use 2955 :return: Dictionary of querystring parameters to use in the query 2956 """ 2957 extra_options = extra_options or {} 2958 2959 query = { 2960 'language': self.site.code, 2961 'project': self.site.hostname().split('.')[-2], 2962 'combination': 'subset' if subset_combination else 'union', 2963 'categories': '\r\n'.join(categories), 2964 'format': 'json', 2965 'doit': '' 2966 } 2967 2968 if namespaces: 2969 for namespace in namespaces: 2970 query['ns[{}]'.format(int(namespace))] = 1 2971 2972 query_final = query.copy() 2973 query_final.update(extra_options) 2974 2975 return query_final 2976 2977 def query(self): 2978 """Query PetScan.""" 2979 url = 'https://petscan.wmflabs.org' 2980 2981 try: 2982 req = http.fetch(url, params=self.opts) 2983 
except ReadTimeout: 2984 raise ServerError('received ReadTimeout from {}'.format(url)) 2985 2986 server_err = HTTPStatus.INTERNAL_SERVER_ERROR 2987 if server_err <= req.status_code < server_err + 100: 2988 raise ServerError( 2989 'received {} status from {}'.format(req.status_code, req.url)) 2990 2991 j = json.loads(req.text) 2992 raw_pages = j['*'][0]['a']['*'] 2993 yield from raw_pages 2994 2995 def __iter__(self): 2996 for raw_page in self.query(): 2997 page = pywikibot.Page(self.site, raw_page['title'], 2998 int(raw_page['namespace'])) 2999 yield page 3000 3001 3002DuplicateFilterPageGenerator = redirect_func( 3003 filter_unique, old_name='DuplicateFilterPageGenerator', since='20180715') 3004PreloadingItemGenerator = redirect_func(PreloadingEntityGenerator, 3005 old_name='PreloadingItemGenerator', 3006 since='20170314') 3007TextfilePageGenerator = redirect_func( 3008 TextIOPageGenerator, old_name='TextfilePageGenerator', since='20210611') 3009 3010if __name__ == '__main__': # pragma: no cover 3011 pywikibot.output('Pagegenerators cannot be run as script - are you ' 3012 'looking for listpages.py?') 3013