1"""
2This module offers a wide variety of page generators.
3
4A page generator is an
5object that is iterable (see https://legacy.python.org/dev/peps/pep-0255/ ) and
6that yields page objects on which other scripts can then work.
7
8Pagegenerators.py cannot be run as script. For testing purposes listpages.py
9can be used instead, to print page titles to standard output.
10
11These parameters are supported to specify which pages titles to print:
12
13&params;
14"""
#
# (C) Pywikibot team, 2008-2021
#
# Distributed under the terms of the MIT license.
#
import calendar
import codecs
import datetime
import io
import itertools
import json
import re
import sys
from collections import namedtuple
from collections.abc import Iterator
from datetime import timedelta
from functools import partial
from http import HTTPStatus
from itertools import zip_longest
from typing import Optional, Union
from urllib.parse import urlparse

from requests.exceptions import ReadTimeout

import pywikibot
from pywikibot import config, date, i18n, xmlreader
from pywikibot.backports import Iterable, List
from pywikibot.bot import ShowingListOption
from pywikibot.comms import http
from pywikibot.data import api
from pywikibot.exceptions import (
    NoPageError,
    ServerError,
    UnknownExtensionError,
)
from pywikibot.proofreadpage import ProofreadPage
from pywikibot.tools import (
    DequeGenerator,
    deprecated,
    deprecated_args,
    filter_unique,
    intersect_generators,
    itergroup,
    redirect_func,
)


_logger = 'pagegenerators'

# ported from version 1 for backwards-compatibility
# most of these functions just wrap a Site or Page method that returns
# a generator

parameterHelp = """\
GENERATOR OPTIONS
=================

-cat                Work on all pages which are in a specific category.
                    Argument can also be given as "-cat:categoryname" or
                    as "-cat:categoryname|fromtitle" (using # instead of |
                    is also allowed in this one and the following)

-catr               Like -cat, but also recursively includes pages in
                    subcategories, sub-subcategories etc. of the
                    given category.
                    Argument can also be given as "-catr:categoryname" or
                    as "-catr:categoryname|fromtitle".

-subcats            Work on all subcategories of a specific category.
                    Argument can also be given as "-subcats:categoryname" or
                    as "-subcats:categoryname|fromtitle".

-subcatsr           Like -subcats, but also includes sub-subcategories etc. of
                    the given category.
                    Argument can also be given as "-subcatsr:categoryname" or
                    as "-subcatsr:categoryname|fromtitle".

-uncat              Work on all pages which are not categorised.

-uncatcat           Work on all categories which are not categorised.

-uncatfiles         Work on all files which are not categorised.

-file               Read a list of pages to treat from the named text file.
                    Page titles in the file may be either enclosed with
                    [[brackets]], or be separated by new lines.
                    Argument can also be given as "-file:filename".

-filelinks          Work on all pages that use a certain image/media file.
                    Argument can also be given as "-filelinks:filename".

-search             Work on all pages that are found in a MediaWiki search
                    across all namespaces.

-logevents          Work on articles that were on a specified Special:Log.
                    The value may be a comma-separated list of these values:

                        logevent,username,start,end

                    or for backward compatibility:

                        logevent,username,total

                    Note: 'start' is the most recent date and log events are
                    iterated from present to past. If 'start' is not provided,
                    it means 'now'; if 'end' is not provided, it means 'since
                    the beginning'.

                    To use the default value, use an empty string.
                    Options are available for every log type given by the
                    log event parameter, which can be one of the following:

                        spamblacklist, titleblacklist, gblblock, renameuser,
                        globalauth, gblrights, gblrename, abusefilter,
                        massmessage, thanks, usermerge, block, protect, rights,
                        delete, upload, move, import, patrol, merge, suppress,
                        tag, managetags, contentmodel, review, stable,
                        timedmediahandler, newusers

                    The default number of pages to work on is 10.

                    Examples:

                    -logevents:move gives pages from move log (usually
                    redirects)
                    -logevents:delete,,20 gives 20 pages from deletion log
                    -logevents:protect,Usr gives pages from protect log by user
                    Usr
                    -logevents:patrol,Usr,20 gives 20 patrolled pages by Usr
                    -logevents:upload,,20121231,20100101 gives upload pages
                    in the years 2010-2012
                    -logevents:review,,20121231 gives review pages from the
                    beginning until 31 Dec 2012
                    -logevents:review,Usr,20121231 gives review pages by user
                    Usr from the beginning until 31 Dec 2012

                    In some cases it must be given as -logevents:"move,Usr,20"

-interwiki          Work on the given page and all equivalent pages in other
                    languages. This can, for example, be used to fight
                    multi-site spamming.
                    Attention: this will cause the bot to modify pages on
                    several wiki sites; this is not well tested, so check
                    your edits!

-links              Work on all pages that are linked from a certain page.
                    Argument can also be given as "-links:linkingpagetitle".

-liverecentchanges  Work on pages from the live recent changes feed. If used as
                    -liverecentchanges:x, work on x recent changes.

-imagesused         Work on all images that are contained on a certain page.
                    Can also be given as "-imagesused:linkingpagetitle".

-newimages          Work on the most recent new images. If given as
                    -newimages:x, will work on x newest images.

-newpages           Work on the most recent new pages. If given as -newpages:x,
                    will work on x newest pages.

-recentchanges      Work on the pages with the most recent changes. If
                    given as -recentchanges:x, will work on the x most recently
                    changed pages. If given as -recentchanges:offset,duration
                    it will work on pages changed from 'offset' minutes with
                    'duration' minutes of timespan. rctags are supported too.
                    The rctag must be the very first parameter part.

                    Examples:

                    -recentchanges:20 gives the 20 most recently changed pages
                    -recentchanges:120,70 will give pages with 120 offset
                    minutes and 70 minutes of timespan
                    -recentchanges:visualeditor,10 gives the 10 most recently
                    changed pages marked with 'visualeditor'
                    -recentchanges:"mobile edit,60,35" will retrieve pages
                    marked with 'mobile edit' for the given offset and timespan

-unconnectedpages   Work on the most recent pages that are not connected to
                    the Wikibase repository. If given as -unconnectedpages:x,
                    will work on the x most recent unconnected pages.

-ref                Work on all pages that link to a certain page.
                    Argument can also be given as "-ref:referredpagetitle".

-start              Specifies that the robot should go alphabetically through
                    all pages on the home wiki, starting at the named page.
                    Argument can also be given as "-start:pagetitle".

                    You can also include a namespace. For example,
                    "-start:Template:!" will make the bot work on all pages
                    in the template namespace.

                    Default value is "-start:!".

-prefixindex        Work on pages commencing with a common prefix.

-transcludes        Work on all pages that use a certain template.
                    Argument can also be given as "-transcludes:Title".

-unusedfiles        Work on all description pages of images/media files that
                    are not used anywhere.
                    Argument can be given as "-unusedfiles:n" where
                    n is the maximum number of articles to work on.

-lonelypages        Work on all articles that are not linked from any other
                    article.
                    Argument can be given as "-lonelypages:n" where
                    n is the maximum number of articles to work on.

-unwatched          Work on all articles that are not watched by anyone.
                    Argument can be given as "-unwatched:n" where
                    n is the maximum number of articles to work on.

-property:name      Work on all pages with a given property name from
                    Special:PagesWithProp.

-usercontribs       Work on all articles that were edited by a certain user.
                    (Example: -usercontribs:DumZiBoT)

-weblink            Work on all articles that contain an external link to
                    a given URL; may be given as "-weblink:url"

-withoutinterwiki   Work on all pages that don't have interlanguage links.
                    Argument can be given as "-withoutinterwiki:n" where
                    n is the total to fetch.

-mysqlquery         Takes a MySQL query string like
                    "SELECT page_namespace, page_title FROM page
                    WHERE page_namespace = 0" and treats
                    the resulting pages. See
                    https://www.mediawiki.org/wiki/Manual:Pywikibot/MySQL
                    for more details.

-sparql             Takes a SPARQL SELECT query string including ?item
                    and works on the resulting pages.

-sparqlendpoint     Specify SPARQL endpoint URL (optional).
                    (Example: -sparqlendpoint:http://myserver.com/sparql)

-searchitem         Takes a search string and works on Wikibase pages that
                    contain it.
                    Argument can be given as "-searchitem:text", where text
                    is the string to look for, or "-searchitem:lang:text",
                    where lang is the language to search items in.

-wantedpages        Work on pages that are linked, but do not exist;
                    may be given as "-wantedpages:n" where n is the maximum
                    number of articles to work on.

-wantedcategories   Work on categories that are used, but do not exist;
                    may be given as "-wantedcategories:n" where n is the
                    maximum number of categories to work on.

-wantedfiles        Work on files that are used, but do not exist;
                    may be given as "-wantedfiles:n" where n is the maximum
                    number of files to work on.

-wantedtemplates    Work on templates that are used, but do not exist;
                    may be given as "-wantedtemplates:n" where n is the
                    maximum number of templates to work on.

-random             Work on random pages returned by [[Special:Random]].
                    Can also be given as "-random:n" where n is the number
                    of pages to be returned.

-randomredirect     Work on random redirect pages returned by
                    [[Special:RandomRedirect]]. Can also be given as
                    "-randomredirect:n" where n is the number of pages to be
                    returned.

-google             Work on all pages that are found in a Google search.
                    You need a Google Web API license key. Note that Google
                    doesn't give out license keys anymore. See google_key in
                    config.py for instructions.
                    Argument can also be given as "-google:searchstring".

-page               Work on a single page. Argument can also be given as
                    "-page:pagetitle", and supplied multiple times for
                    multiple pages.

-pageid             Work on a single pageid. Argument can also be given as
                    "-pageid:pageid1,pageid2,.." or
                    "-pageid:'pageid1|pageid2|..'"
                    and supplied multiple times for multiple pages.

-linter             Work on pages that contain lint errors. The Linter
                    extension must be available on the site.
                    -linter selects all categories.
                    -linter:high, -linter:medium or -linter:low selects all
                    categories for that priority.
                    Single categories can be selected with commas as in
                    -linter:cat1,cat2,cat3

                    Adding '/int' identifies the Lint ID to start querying
                    from: e.g. -linter:high/10000

                    -linter:show just shows available categories.

-querypage:name     Work on pages provided by a QueryPage-based special page,
                    see https://www.mediawiki.org/wiki/API:Querypage.
                    (tip: use -limit:n to fetch only n pages).

                    -querypage shows special pages available.

-url                Read a list of pages to treat from the provided URL.
                    The URL must return text in the same format as expected for
                    the -file argument, e.g. page titles separated by newlines
                    or enclosed in brackets.


FILTER OPTIONS
==============

-catfilter          Filter the page generator to only yield pages in the
                    specified category. See -cat generator for argument format.

-grep               A regular expression that needs to match the article,
                    otherwise the page won't be returned.
                    Multiple -grep:regexpr can be provided and the page will
                    be returned if its content is matched by any of the
                    regexprs provided.
                    Case-insensitive regular expressions will be used and
                    dot matches any character, including a newline.

-grepnot            Like -grep, but return the page only if the regular
                    expression does not match.

-intersect          Work on the intersection of all the provided generators.

-limit              When used with any other argument, -limit:n specifies
                    that no more than n pages in total will be worked on.

-namespaces         Filter the page generator to only yield pages in the
-namespace          specified namespaces. Separate multiple namespace
-ns                 numbers or names with commas.

                    Examples:

                    -ns:0,2,4
                    -ns:Help,MediaWiki

                    You may prepend "not" to exclude the namespace.

                    Examples:

                    -ns:not:2,3
                    -ns:not:Help,File

                    If used with the -newpages/-random/-randomredirect/-linter
                    generators, -namespace/-ns must be provided before
                    -newpages/-random/-randomredirect/-linter.
                    If used with the -recentchanges generator, efficiency is
                    improved if -namespace is provided before -recentchanges.

                    If used with the -start generator, -namespace/-ns shall
                    contain only one value.

-onlyif             A claim the page needs to contain, otherwise the item
                    won't be returned.
                    The format is property=value,qualifier=value. Multiple (or
                    none) qualifiers can be passed, separated by commas.

                    Examples:

                    P1=Q2 (property P1 must contain value Q2),
                    P3=Q4,P5=Q6,P6=Q7 (property P3 with value Q4 and
                    qualifiers: P5 with value Q6 and P6 with value Q7).
                    Value can be page ID, coordinate in format:
                    latitude,longitude[,precision] (all values are in decimal
                    degrees), year, or plain string.
                    The argument can be provided multiple times and the item
                    page will be returned only if all claims are present.
                    Argument can be also given as "-onlyif:expression".

-onlyifnot          A claim the page must not contain, otherwise the item
                    won't be returned.
                    For usage and examples, see -onlyif above.

-ql                 Filter pages based on page quality.
                    This is only applicable if contentmodel equals
                    'proofread-page'; otherwise it has no effect.
                    Valid values are in the range 0-4.
                    Multiple values can be comma-separated.

-subpage            -subpage:n filters pages to only those that have depth n,
                    i.e. a depth of 0 filters out all pages that are subpages,
                    and a depth of 1 filters out all pages that are subpages of
                    subpages.

-titleregex         A regular expression that needs to match the article title,
                    otherwise the page won't be returned.
                    Multiple -titleregex:regexpr can be provided and the page
                    will be returned if its title is matched by any of the
                    regexprs provided.
                    Case-insensitive regular expressions will be used and
                    dot matches any character.

-titleregexnot      Like -titleregex, but return the page only if the regular
                    expression does not match.
"""

docuReplacements = {'&params;': parameterHelp}  # noqa: N816

# if a bot uses GeneratorFactory, the module should include the line
#   docuReplacements = {'&params;': pywikibot.pagegenerators.parameterHelp}
# and include the marker &params; in the module's docstring
#
# We manually include it so the parameters show up in the auto-generated
# module documentation:

__doc__ = __doc__.replace('&params;', parameterHelp)
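
# A minimal usage sketch (illustrative only, not executed) of the pattern
# described in the comment above; the script body and option values are
# hypothetical:
#
#   """Print titles of selected pages.
#
#   &params;
#   """
#   import pywikibot
#   from pywikibot import pagegenerators
#
#   docuReplacements = {'&params;': pagegenerators.parameterHelp}
#
#   def main(*args):
#       factory = pagegenerators.GeneratorFactory()
#       local_args = factory.handle_args(pywikibot.handle_args(args))
#       gen = factory.getCombinedGenerator(preload=True)
#       for page in gen or []:
#           pywikibot.output(page.title())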


# This is the function that will be used to de-duplicate page iterators.
_filter_unique_pages = partial(
    filter_unique, key=lambda page: '{}:{}:{}'.format(*page._cmpkey()))
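# The key joins the page's comparison tuple (site, namespace id and title),
# so two Page objects pointing at the same wiki page collapse into a single
# key such as 'wikipedia:en:0:Main Page' (illustrative value).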


def _output_if(predicate, msg):
    if predicate:
        pywikibot.output(msg)


class GeneratorFactory:

    """Process command line arguments and return appropriate page generator.

    This factory is responsible for processing command line arguments
    that are used by many scripts and that determine which pages to work on.

    :Note: GeneratorFactory must be instantiated after global arguments are
        parsed except if site parameter is given.
    """

    def __init__(self, site=None,
                 positional_arg_name: Optional[str] = None,
                 enabled_options: Optional[Iterable[str]] = None,
                 disabled_options: Optional[Iterable[str]] = None):
        """
        Initializer.

        :param site: Site for generator results
        :type site: :py:obj:`pywikibot.site.BaseSite`
        :param positional_arg_name: generator to use for positional args,
            which do not begin with a hyphen
        :param enabled_options: only enable options given by this Iterable.
            This is prioritized over disabled_options
        :param disabled_options: disable these given options and let them
            be handled by the script's options handler
        """
        self.gens = []
        self._namespaces = []
        self.limit = None
        self.qualityfilter_list = []
        self.articlefilter_list = []
        self.articlenotfilter_list = []
        self.titlefilter_list = []
        self.titlenotfilter_list = []
        self.claimfilter_list = []
        self.catfilter_list = []
        self.intersect = False
        self.subpage_max_depth = None
        self._site = site
        self._positional_arg_name = positional_arg_name
        self._sparql = None
        self.nopreload = False
        self._validate_options(enabled_options, disabled_options)

    def _validate_options(self, enable, disable):
        """Validate option restrictions."""
        msg = '{!r} is not a valid pagegenerators option to be '
        enable = enable or []
        disable = disable or []
        self.enabled_options = set(enable)
        self.disabled_options = set(disable)
        for opt in enable:
            if not hasattr(self, '_handle_' + opt):
                pywikibot.warning((msg + 'enabled').format(opt))
                self.enabled_options.remove(opt)
        for opt in disable:
            if not hasattr(self, '_handle_' + opt):
                pywikibot.warning((msg + 'disabled').format(opt))
                self.disabled_options.remove(opt)
        if self.enabled_options and self.disabled_options:
            pywikibot.warning('Ignoring disabled option because enabled '
                              'options are set.')
            self.disabled_options = []

    @property
    def site(self):
        """
        Generator site.

        The generator site should not be accessed until after the global
        arguments have been handled, otherwise the default Site may be changed
        by global arguments, which will cause this cached value to be stale.

        :return: Site given to initializer, otherwise the default Site at the
            time this property is first accessed.
        :rtype: :py:obj:`pywikibot.site.BaseSite`
        """
        if not self._site:
            self._site = pywikibot.Site()
        return self._site

    @property
    def namespaces(self):
        """
        List of Namespace parameters.

        Converts int or string namespaces to Namespace objects and
        changes the storage to immutable once it has been accessed.

        The resolving and validation of namespace command line arguments
        is performed in this method, as it depends on the site property
        which is lazy loaded to avoid being cached before the global
        arguments are handled.

        :return: namespaces selected using arguments
        :rtype: list of Namespace
        :raises KeyError: a namespace identifier was not resolved
        :raises TypeError: a namespace identifier has an inappropriate
            type such as NoneType or bool
        """
        if isinstance(self._namespaces, list):
            self._namespaces = frozenset(
                self.site.namespaces.resolve(self._namespaces))
        return self._namespaces

    def getCombinedGenerator(self, gen=None, preload=False):
        """Return the combination of all accumulated generators.

        Only call this after all arguments have been parsed.

        :param gen: Another generator to be combined with
        :type gen: iterator
        :param preload: preload pages using PreloadingGenerator
            unless self.nopreload is True
        :type preload: bool
        """
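        # Filters are layered from inner to outer in this order: namespace
        # and limit on each source generator, duplicate removal, subpage
        # depth, claim, quality, title regex, category, preloading and
        # finally body regex (-grep/-grepnot). Body regexes force
        # preloading because they need the page text.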
        if gen:
            self.gens.insert(0, gen)

        for i in range(len(self.gens)):
            if self.namespaces:
                if (isinstance(self.gens[i], api.QueryGenerator)
                        and self.gens[i].support_namespace()):
                    self.gens[i].set_namespace(self.namespaces)
                else:
                    # QueryGenerator does not support namespace param.
                    self.gens[i] = NamespaceFilterPageGenerator(
                        self.gens[i], self.namespaces, self.site)

            if self.limit:
                try:
                    self.gens[i].set_maximum_items(self.limit)
                except AttributeError:
                    self.gens[i] = itertools.islice(self.gens[i], self.limit)

        if not self.gens:
            if any((self.titlefilter_list,
                    self.titlenotfilter_list,
                    self.articlefilter_list,
                    self.articlenotfilter_list,
                    self.claimfilter_list,
                    self.catfilter_list,
                    self.qualityfilter_list,
                    self.subpage_max_depth is not None)):
                pywikibot.warning('filter(s) specified but no generators.')
            return None

        if len(self.gens) == 1:
            dupfiltergen = self.gens[0]
            if hasattr(self, '_single_gen_filter_unique'):
                dupfiltergen = _filter_unique_pages(dupfiltergen)
            if self.intersect:
                pywikibot.warning(
                    '"-intersect" ignored as only one generator is specified.')
        elif self.intersect:
            # By definition no duplicates are possible.
            dupfiltergen = intersect_generators(*self.gens)
        else:
            dupfiltergen = _filter_unique_pages(itertools.chain(*self.gens))

        # Add on subpage filter generator
        if self.subpage_max_depth is not None:
            dupfiltergen = SubpageFilterGenerator(
                dupfiltergen, self.subpage_max_depth)

        if self.claimfilter_list:
            for claim in self.claimfilter_list:
                dupfiltergen = ItemClaimFilterPageGenerator(dupfiltergen,
                                                            claim[0], claim[1],
                                                            claim[2], claim[3])

        if self.qualityfilter_list:
            dupfiltergen = QualityFilterPageGenerator(
                dupfiltergen, self.qualityfilter_list)

        if self.titlefilter_list:
            dupfiltergen = RegexFilterPageGenerator(
                dupfiltergen, self.titlefilter_list)

        if self.titlenotfilter_list:
            dupfiltergen = RegexFilterPageGenerator(
                dupfiltergen, self.titlenotfilter_list, 'none')

        if self.catfilter_list:
            dupfiltergen = CategoryFilterPageGenerator(
                dupfiltergen, self.catfilter_list)

        if (preload or self.articlefilter_list) and not self.nopreload:
            if isinstance(dupfiltergen, DequeGenerator):
                dupfiltergen = DequePreloadingGenerator(dupfiltergen)
            else:
                dupfiltergen = PreloadingGenerator(dupfiltergen)

        if self.articlefilter_list:
            dupfiltergen = RegexBodyFilterPageGenerator(
                dupfiltergen, self.articlefilter_list)

        if self.articlenotfilter_list:
            dupfiltergen = RegexBodyFilterPageGenerator(
                dupfiltergen, self.articlenotfilter_list, 'none')

        return dupfiltergen

    @deprecated_args(arg='category')
    def getCategory(self, category: str) -> tuple:
        """
        Return Category and start as defined by category.

        :param category: category name with start parameter
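
        Example (illustrative): 'Foo|Bar' yields the category
        'Category:Foo' with start title 'Bar', while a bare 'Foo'
        yields (Category('Category:Foo'), None).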
        """
        if not category:
            category = i18n.input(
                'pywikibot-enter-category-name',
                fallback_prompt='Please enter the category name:')
        category = category.replace('#', '|')

        category, _, startfrom = category.partition('|')
        if not startfrom:
            startfrom = None

        # Insert "Category:" before category name to avoid parsing problems in
        # Link.parse() when categoryname contains ":";
        # Part before ":" might be interpreted as an interwiki prefix
        prefix = category.split(':', 1)[0]  # whole word if ":" not present
        if prefix not in self.site.namespaces[14]:
            category = '{}:{}'.format(
                self.site.namespace(14), category)
        cat = pywikibot.Category(pywikibot.Link(category,
                                                source=self.site,
                                                default_namespace=14))
        return cat, startfrom

    @deprecated_args(arg='category')
    def getCategoryGen(self, category: str, recurse: bool = False,
                       content: bool = False, gen_func=None):
        """
        Return generator based on Category defined by category and gen_func.

        :param category: category name with start parameter
        :rtype: generator
        """
        cat, startfrom = self.getCategory(category)

        return gen_func(cat,
                        start=startfrom,
                        recurse=recurse,
                        content=content)

    @staticmethod
    def _parse_log_events(logtype: str, user: Optional[str] = None,
                          start=None, end=None):
        """
        Parse the -logevents argument information.

        :param logtype: A valid logtype
        :param user: A username associated to the log events. Ignored if
            empty string or None.
        :param start: Timestamp to start listing from. For backward
            compatibility, this can also be the total amount of pages
            that should be returned. It is taken as 'total' if the value does
            not have 8 digits.
        :type start: str convertible to Timestamp matching '%Y%m%d%H%M%S'.
            If the length is not 8: for backward compatibility to use this as
            'total', it can also be a str (castable to int).
        :param end: Timestamp to end listing at
        :type end: str convertible to Timestamp matching '%Y%m%d%H%M%S'
        :return: The generator or None if invalid 'start/total' or 'end' value.
        :rtype: LogeventsPageGenerator
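
        Example (illustrative): ('delete', '', '20121231', '20100101')
        iterates deletion log entries from 2012-12-31 back to 2010-01-01,
        while ('patrol', 'Usr', '20') yields the 20 most recent patrol
        entries by user Usr.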
        """
        def parse_start(start):
            """Parse start and return (start, total)."""
            if start is None:
                return None, None

            if len(start) >= 8:
                return pywikibot.Timestamp.fromtimestampformat(start), None

            return None, int(start)

        start = start or None  # because start might be an empty string
        try:
            start, total = parse_start(start)
            assert total is None or total > 0
        except ValueError as err:
            pywikibot.error(
                '{}. Start parameter has wrong format!'.format(err))
            return None
        except AssertionError:
            pywikibot.error('Total number of log events ({}) must be a '
                            'positive int.'.format(total))
            return None

        try:
            end = pywikibot.Timestamp.fromtimestampformat(end)
        except ValueError as err:
            pywikibot.error(
                '{}. End parameter has wrong format!'.format(err))
            return None
        except TypeError:  # end is None
            pass

        if start or end:
            pywikibot.output('Fetching log events in range: {} - {}.'
                             .format(end or 'beginning of time',
                                     start or 'now'))

        # 'user or None', because user might be an empty string when
        # 'foo,,bar' was used.
        return LogeventsPageGenerator(logtype, user or None, total=total,
                                      start=start, end=end)

    def _handle_filelinks(self, value):
        """Handle `-filelinks` argument."""
        if not value:
            value = i18n.input(
                'pywikibot-enter-file-links-processing',
                fallback_prompt='Links to which file page should be '
                                'processed?')
        if not value.startswith(self.site.namespace(6) + ':'):
            value = 'Image:' + value
        file_page = pywikibot.FilePage(self.site, value)
        return file_page.usingPages()

    def _handle_linter(self, value):
        """Handle `-linter` argument."""
        if not self.site.has_extension('Linter'):
            raise UnknownExtensionError(
                '-linter needs a site with Linter extension.')
        cats = self.site.siteinfo.get('linter')  # Get linter categories.
        valid_cats = [c for _list in cats.values() for c in _list]

        value = value or ''
        cat, _, lint_from = value.partition('/')
        lint_from = lint_from or None

        def show_available_categories(cats):
            _i = ' ' * 4
            _2i = 2 * _i
            txt = 'Available categories of lint errors:\n'
            for prio, _list in cats.items():
                txt += '{indent}{prio}\n'.format(indent=_i, prio=prio)
                txt += ''.join(
                    '{indent}{cat}\n'.format(indent=_2i, cat=c) for c in _list)
            pywikibot.output(txt)

        if cat == 'show':  # Display categories of lint errors.
            show_available_categories(cats)
            sys.exit(0)

        if not cat:
            lint_cats = valid_cats
        elif cat in ['low', 'medium', 'high']:
            lint_cats = cats[cat]
        else:
            lint_cats = cat.split(',')
            assert set(lint_cats) <= set(valid_cats), \
                'Invalid category of lint errors: {}'.format(cat)

        return self.site.linter_pages(
            lint_categories='|'.join(lint_cats), namespaces=self.namespaces,
            lint_from=lint_from)

    def _handle_querypage(self, value):
        """Handle `-querypage` argument."""
        if value is None:  # Display special pages.
            pages = self.site._paraminfo.parameter('query+querypage',
                                                   'page')
            pages = sorted(pages['type'])
            limit = self.site._paraminfo.parameter('query+querypage',
                                                   'limit')

            max_w = max(len(p) for p in pages[::2]) + 4
            txt = 'Available special pages:\n'
            for a, b in zip_longest(pages[::2], pages[1::2], fillvalue=''):
                txt += '    {a:<{max_w}}{b}\n'.format(a=a, b=b, max_w=max_w)
            txt += ('\nMaximum number of pages to return is {max} '
                    '({highmax} for bots).\n'.format_map(limit))
            pywikibot.output(txt)
            sys.exit(0)

        return self.site.querypage(value)

    def _handle_url(self, value):
        """Handle `-url` argument."""
        if not value:
            value = pywikibot.input('Please enter the URL:')
        return TextIOPageGenerator(value, site=self.site)

    def _handle_unusedfiles(self, value):
        """Handle `-unusedfiles` argument."""
        return self.site.unusedfiles(total=_int_none(value))

    def _handle_lonelypages(self, value):
        """Handle `-lonelypages` argument."""
        return self.site.lonelypages(total=_int_none(value))

    def _handle_unwatched(self, value):
        """Handle `-unwatched` argument."""
        return self.site.unwatchedpage(total=_int_none(value))

    def _handle_wantedpages(self, value):
        """Handle `-wantedpages` argument."""
        return self.site.wantedpages(total=_int_none(value))

    def _handle_wantedfiles(self, value):
        """Handle `-wantedfiles` argument."""
        return self.site.wantedfiles(total=_int_none(value))

    def _handle_wantedtemplates(self, value):
        """Handle `-wantedtemplates` argument."""
        return self.site.wantedtemplates(total=_int_none(value))

    def _handle_wantedcategories(self, value):
        """Handle `-wantedcategories` argument."""
        return self.site.wantedcategories(total=_int_none(value))

    def _handle_property(self, value):
        """Handle `-property` argument."""
        if not value:
            question = 'Which property name should be used?'
            value = pywikibot.input(question + ' (List [?])')
            pnames = self.site.get_property_names()
            # also use the default by <enter> key
            if value in '?' or value not in pnames:
                prefix, value = pywikibot.input_choice(
                    question, ShowingListOption(pnames))
        return self.site.pages_with_property(value)

    def _handle_usercontribs(self, value):
        """Handle `-usercontribs` argument."""
        self._single_gen_filter_unique = True
        return UserContributionsGenerator(
            value, site=self.site, _filter_unique=None)

    def _handle_withoutinterwiki(self, value):
        """Handle `-withoutinterwiki` argument."""
        return self.site.withoutinterwiki(total=_int_none(value))

    def _handle_interwiki(self, value):
        """Handle `-interwiki` argument."""
        if not value:
            value = i18n.input(
                'pywikibot-enter-page-processing',
                fallback_prompt='Which page should be processed?')
        page = pywikibot.Page(pywikibot.Link(value, self.site))
        return InterwikiPageGenerator(page)

    def _handle_randomredirect(self, value):
        """Handle `-randomredirect` argument."""
        # partial workaround for bug T119940
        # to use -namespace/ns with -randomredirect, -ns must be given
        # before -randomredirect
        # otherwise default namespace is 0
        namespaces = self.namespaces or 0
        return self.site.randompages(total=_int_none(value),
                                     namespaces=namespaces, redirects=True)

    def _handle_random(self, value):
        """Handle `-random` argument."""
        # partial workaround for bug T119940
        # to use -namespace/ns with -random, -ns must be given
        # before -random
        # otherwise default namespace is 0
        namespaces = self.namespaces or 0
        return self.site.randompages(total=_int_none(value),
                                     namespaces=namespaces)

    def _handle_recentchanges(self, value):
        """Handle `-recentchanges` argument."""
        rcstart = None
        rcend = None
        rctag = None
        total = None
        params = value.split(',') if value else []
        if params and not params[0].isdigit():
            rctag = params.pop(0)
        if len(params) > 2:
            raise ValueError('More than two parameters passed.')
        if len(params) == 2:
            offset = float(params[0])
            duration = float(params[1])
            if offset < 0 or duration < 0:
                raise ValueError('Negative valued parameters passed.')
            ts_time = self.site.server_time()
            rcstart = ts_time - timedelta(minutes=offset)
            rcend = rcstart - timedelta(minutes=duration)
        elif len(params) == 1:
            total = int(params[0])
        self._single_gen_filter_unique = True
        return RecentChangesPageGenerator(
            namespaces=self.namespaces, total=total, start=rcstart, end=rcend,
            site=self.site, tag=rctag)

    def _handle_liverecentchanges(self, value):
        """Handle `-liverecentchanges` argument."""
        self.nopreload = True
        return LiveRCPageGenerator(site=self.site, total=_int_none(value))

    def _handle_file(self, value):
        """Handle `-file` argument."""
        if not value:
            value = pywikibot.input('Please enter the local file name:')
        return TextIOPageGenerator(value, site=self.site)

    def _handle_namespaces(self, value):
        """Handle `-namespaces` argument."""
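        # Accepts e.g. '0,2,4' or 'Help,MediaWiki', or a negated form such
        # as 'not:2,3' (illustrative values). Negation removes the listed
        # namespaces from the current selection or, if none has been made
        # yet, from the full set of site namespaces.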
        if isinstance(self._namespaces, frozenset):
            raise RuntimeError('-namespace/ns option must be provided before '
                               '-newpages/-random/-randomredirect/-linter')
        if not value:
            value = pywikibot.input('What namespace are you filtering on?')
        NOT_KEY = 'not:'
        if value.startswith(NOT_KEY):
            value = value[len(NOT_KEY):]
            resolve = self.site.namespaces.resolve
            not_ns = set(resolve(value.split(',')))
            if not self._namespaces:
                self._namespaces = list(
                    set(self.site.namespaces.values()) - not_ns)
            else:
                self._namespaces = list(
                    set(resolve(self._namespaces)) - not_ns)
        else:
            self._namespaces += value.split(',')
        return True

    _handle_ns = _handle_namespaces
    _handle_namespace = _handle_namespaces

    def _handle_limit(self, value):
        """Handle `-limit` argument."""
        if not value:
            value = pywikibot.input('What is the limit value?')
        self.limit = _int_none(value)
        return True

    def _handle_category(self, value):
        """Handle `-category` argument."""
        return self.getCategoryGen(
            value, recurse=False, gen_func=CategorizedPageGenerator)

    _handle_cat = _handle_category

    def _handle_catr(self, value):
        """Handle `-catr` argument."""
        return self.getCategoryGen(
            value, recurse=True, gen_func=CategorizedPageGenerator)

    def _handle_subcats(self, value):
        """Handle `-subcats` argument."""
        return self.getCategoryGen(
            value, recurse=False, gen_func=SubCategoriesPageGenerator)

    def _handle_subcatsr(self, value):
        """Handle `-subcatsr` argument."""
        return self.getCategoryGen(
            value, recurse=True, gen_func=SubCategoriesPageGenerator)

    def _handle_catfilter(self, value):
        """Handle `-catfilter` argument."""
        cat, _ = self.getCategory(value)
        self.catfilter_list.append(cat)
        return True

    def _handle_page(self, value):
        """Handle `-page` argument."""
        if not value:
            value = pywikibot.input('What page do you want to use?')
        return [pywikibot.Page(pywikibot.Link(value, self.site))]

    def _handle_pageid(self, value):
        """Handle `-pageid` argument."""
        if not value:
            value = pywikibot.input('What pageid do you want to use?')
        return self.site.load_pages_from_pageids(value)

    def _handle_uncatfiles(self, value):
        """Handle `-uncatfiles` argument."""
        return self.site.uncategorizedimages()

    def _handle_uncatcat(self, value):
        """Handle `-uncatcat` argument."""
        return self.site.uncategorizedcategories()

    def _handle_uncat(self, value):
        """Handle `-uncat` argument."""
        return self.site.uncategorizedpages()

    def _handle_ref(self, value):
        """Handle `-ref` argument."""
        if not value:
            value = pywikibot.input(
                'Links to which page should be processed?')
        page = pywikibot.Page(pywikibot.Link(value, self.site))
        return page.getReferences()

    def _handle_links(self, value):
        """Handle `-links` argument."""
        if not value:
            value = pywikibot.input(
                'Links from which page should be processed?')
        page = pywikibot.Page(pywikibot.Link(value, self.site))
        return page.linkedPages()

    def _handle_weblink(self, value):
        """Handle `-weblink` argument."""
        if not value:
            value = pywikibot.input(
                'Pages with which weblink should be processed?')
        return self.site.exturlusage(value)

    def _handle_transcludes(self, value):
        """Handle `-transcludes` argument."""
        if not value:
            value = pywikibot.input(
                'Pages that transclude which page should be processed?')
        page = pywikibot.Page(pywikibot.Link(value,
                                             default_namespace=10,
                                             source=self.site))
        return page.getReferences(only_template_inclusion=True)

    def _handle_start(self, value):
        """Handle `-start` argument."""
        if not value:
            value = '!'
        firstpagelink = pywikibot.Link(value, self.site)
        return self.site.allpages(
            start=firstpagelink.title, namespace=firstpagelink.namespace,
            filterredir=False)

    def _handle_prefixindex(self, value):
        """Handle `-prefixindex` argument."""
        if not value:
            value = pywikibot.input('What page names are you looking for?')
        return PrefixingPageGenerator(prefix=value, site=self.site)

    def _handle_newimages(self, value):
        """Handle `-newimages` argument."""
        return NewimagesPageGenerator(total=_int_none(value), site=self.site)

    def _handle_newpages(self, value):
        """Handle `-newpages` argument."""
        # partial workaround for bug T69249
        # to use -namespace/ns with -newpages, -ns must be given
        # before -newpages
        # otherwise default namespace is 0
        namespaces = self.namespaces or 0
        return NewpagesPageGenerator(
            namespaces=namespaces, total=_int_none(value), site=self.site)

    def _handle_unconnectedpages(self, value):
        """Handle `-unconnectedpages` argument."""
        return self.site.unconnected_pages(total=_int_none(value))

    def _handle_imagesused(self, value):
        """Handle `-imagesused` argument."""
        if not value:
            value = pywikibot.input(
                'Images on which page should be processed?')
        page = pywikibot.Page(pywikibot.Link(value, self.site))
        return page.imagelinks()

    def _handle_searchitem(self, value):
        """Handle `-searchitem` argument."""
        if not value:
            value = pywikibot.input('Text to look for:')
        params = value.split(':')
        value = params[-1]
        lang = params[0] if len(params) == 2 else None
        return WikibaseSearchItemPageGenerator(
            value, language=lang, site=self.site)

    def _handle_search(self, value):
        """Handle `-search` argument."""
        if not value:
            value = pywikibot.input('What do you want to search for?')
        # In order to be useful, all namespaces are required
        return self.site.search(value, namespaces=[])

    @staticmethod
    def _handle_google(value):
        """Handle `-google` argument."""
        return GoogleSearchPageGenerator(value)

    def _handle_titleregex(self, value):
        """Handle `-titleregex` argument."""
        if not value:
            value = pywikibot.input(
                'What page names are you looking for?')
        self.titlefilter_list.append(value)
        return True

    def _handle_titleregexnot(self, value):
        """Handle `-titleregexnot` argument."""
        if not value:
            value = pywikibot.input(
                'All pages except which ones?')
        self.titlenotfilter_list.append(value)
        return True

    def _handle_grep(self, value):
        """Handle `-grep` argument."""
        if not value:
            value = pywikibot.input('Which pattern do you want to grep?')
        self.articlefilter_list.append(value)
        return True

    def _handle_grepnot(self, value):
        """Handle `-grepnot` argument."""
        if not value:
            value = pywikibot.input('Which pattern do you want to skip?')
        self.articlenotfilter_list.append(value)
        return True

    def _handle_ql(self, value):
        """Handle `-ql` argument."""
        if not self.site.has_extension('ProofreadPage'):
            raise UnknownExtensionError(
                'Ql filtering needs a site with ProofreadPage extension.')
        value = [int(_) for _ in value.split(',')]
        if min(value) < 0 or max(value) > 4:  # Invalid input ql.
            valid_ql = [
                '{}: {}'.format(*i)
                for i in self.site.proofread_levels.items()]
            valid_ql = ', '.join(valid_ql)
            pywikibot.warning('Acceptable values for -ql are:\n    {}'
                              .format(valid_ql))
        self.qualityfilter_list = value
        return True

    def _handle_onlyif(self, value):
        """Handle `-onlyif` argument."""
        return self._onlyif_onlyifnot_handler(value, False)

    def _handle_onlyifnot(self, value):
        """Handle `-onlyifnot` argument."""
        return self._onlyif_onlyifnot_handler(value, True)

    def _onlyif_onlyifnot_handler(self, value, ifnot):
        """Handle `-onlyif` and `-onlyifnot` arguments."""
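        # Illustrative parse: 'P1=Q2,P5=Q6' becomes the claim filter tuple
        # ('P1', 'Q2', {'P5': 'Q6'}, ifnot); a literal comma inside a value
        # can be escaped as '\,'.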
        if not value:
            value = pywikibot.input('Which claim do you want to filter?')
        p = re.compile(r'(?<!\\),')  # Match "," only if there is no "\" before
        temp = []  # Array to store split argument
        for arg in p.split(value):
            temp.append(arg.replace(r'\,', ',').split('='))
        self.claimfilter_list.append(
            (temp[0][0], temp[0][1], dict(temp[1:]), ifnot))
        return True

    def _handle_sparqlendpoint(self, value):
        """Handle `-sparqlendpoint` argument."""
        if not value:
            value = pywikibot.input('SPARQL endpoint:')
        self._sparql = value
        return True

    def _handle_sparql(self, value):
        """Handle `-sparql` argument."""
        if not value:
            value = pywikibot.input('SPARQL query:')
        return WikidataSPARQLPageGenerator(
            value, site=self.site, endpoint=self._sparql)

    def _handle_mysqlquery(self, value):
        """Handle `-mysqlquery` argument."""
        if not value:
            value = pywikibot.input('MySQL query string:')
        return MySQLPageGenerator(value, site=self.site)

    def _handle_intersect(self, value):
        """Handle `-intersect` argument."""
        self.intersect = True
        return True

    def _handle_subpage(self, value):
        """Handle `-subpage` argument."""
        if not value:
            value = pywikibot.input(
                'Maximum subpage depth:')
        self.subpage_max_depth = int(value)
        return True

    def _handle_logevents(self, value):
        """Handle `-logevents` argument."""
        params = value.split(',')
        if params[0] not in self.site.logtypes:
            raise NotImplementedError(
                'Invalid -logevents parameter "{}"'.format(params[0]))
        return self._parse_log_events(*params)

    def handle_args(self, args: Iterable[str]) -> List[str]:
        """Handle command line arguments and return the rest as a list.

        *New in version 6.0.*
        """
        return [arg for arg in args if not self.handle_arg(arg)]

    def handle_arg(self, arg: str) -> bool:
        """Parse one argument at a time.

        If it is recognized as an argument that specifies a generator, a
        generator is created and added to the accumulation list, and the
        function returns true. Otherwise, it returns false, so that the
        caller can try parsing the argument. Call getCombinedGenerator()
        after all arguments have been parsed to get the final output
        generator.

        *Renamed in version 6.0.*

        :param arg: Pywikibot argument consisting of -name:value
        :return: True if the argument supplied was recognised by the factory
        """
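        # Usage sketch (illustrative):
        #
        #   factory = GeneratorFactory()
        #   factory.handle_arg('-cat:Example')  # recognized -> True
        #   factory.handle_arg('-always')       # not a generator arg -> False
        #   gen = factory.getCombinedGenerator()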
        if not arg.startswith('-') and self._positional_arg_name:
            value = arg
            arg = '-' + self._positional_arg_name
        else:
            arg, _, value = arg.partition(':')

        if value == '':
            value = None

        opt = arg[1:]
        if opt in self.disabled_options:
            return False

        if self.enabled_options and opt not in self.enabled_options:
            return False

        handler = getattr(self, '_handle_' + opt, None)
        if not handler:
            return False

        handler_result = handler(value)
        if isinstance(handler_result, bool):
            return handler_result
        if handler_result:
            self.gens.append(handler_result)
            return True

        return False


def _int_none(v):
    """Return None if v is None or '' else return int(v)."""
    return v if (v is None or v == '') else int(v)
1284
1285
1286@deprecated('Site.allpages()', since='20180512')
1287@deprecated_args(step=True)
1288def AllpagesPageGenerator(start: str = '!', namespace=0,
1289                          includeredirects=True, site=None,
1290                          total: Optional[int] = None, content: bool = False
1291                          ):  # pragma: no cover
1292    """
1293    Iterate Page objects for all titles in a single namespace.
1294
1295    If includeredirects is False, redirects are not included. If
1296    includeredirects equals the string 'only', only redirects are added.
1297
1298    :param total: Maximum number of pages to retrieve in total
1299    :param content: If True, load current version of each page (default False)
1300    :param site: Site for generator results.
1301    :type site: :py:obj:`pywikibot.site.BaseSite`
1302
1303    """
1304    if site is None:
1305        site = pywikibot.Site()
1306    if includeredirects:
1307        if includeredirects == 'only':
1308            filterredir = True
1309        else:
1310            filterredir = None
1311    else:
1312        filterredir = False
1313    return site.allpages(start=start, namespace=namespace,
1314                         filterredir=filterredir, total=total, content=content)
1315
1316
1317@deprecated_args(step=True)
1318def PrefixingPageGenerator(prefix: str, namespace=None,
1319                           includeredirects: Union[None, bool, str] = True,
1320                           site=None, total: Optional[int] = None,
1321                           content: bool = False):
1322    """
1323    Prefixed Page generator.
1324
1325    :param prefix: The prefix of the pages.
1326    :param namespace: Namespace to retrieve pages from
1327    :type namespace: Namespace or int
1328    :param includeredirects: If includeredirects is None, False or an empty
1329        string, redirects will not be found. If includeredirects equals the
1330        string 'only', only redirects will be found. Otherwise redirects will
1331        be included.
1332    :param site: Site for generator results.
1333    :type site: :py:obj:`pywikibot.site.BaseSite`
1334    :param total: Maximum number of pages to retrieve in total
1335    :param content: If True, load current version of each page (default False)
1336    :return: a generator that yields Page objects
1337    :rtype: generator
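
    A short sketch (the prefix is hypothetical)::

        for page in PrefixingPageGenerator('User:Example/'):
            pywikibot.output(page.title())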
1338    """
1339    if site is None:
1340        site = pywikibot.Site()
1341    prefixlink = pywikibot.Link(prefix, site)
1342    if namespace is None:
1343        namespace = prefixlink.namespace
1344    title = prefixlink.title
1345    if includeredirects:
1346        if includeredirects == 'only':
1347            filterredir = True
1348        else:
1349            filterredir = None
1350    else:
1351        filterredir = False
1352    return site.allpages(prefix=title, namespace=namespace,
1353                         filterredir=filterredir, total=total, content=content)
1354
1355
1356@deprecated_args(number='total', mode='logtype', repeat=True)
1357def LogeventsPageGenerator(logtype: Optional[str] = None,
1358                           user: Optional[str] = None, site=None,
1359                           namespace: Optional[int] = None,
1360                           total: Optional[int] = None, start=None,
1361                           end=None, reverse: bool = False):
1362    """
1363    Generate Pages for specified modes of logevents.
1364
1365    :param logtype: Mode of logs to retrieve
1366    :param user: User of logs retrieved
1367    :param site: Site for generator results
1368    :type site: :py:obj:`pywikibot.site.BaseSite`
1369    :param namespace: Namespace to retrieve logs from
1370    :param total: Maximum number of pages to retrieve in total
1371    :param start: Timestamp to start listing from
1372    :type start: pywikibot.Timestamp
1373    :param end: Timestamp to end listing at
1374    :type end: pywikibot.Timestamp
1375    :param reverse: if True, start with oldest changes (default: newest)
1376    """
1377    if site is None:
1378        site = pywikibot.Site()
1379    for entry in site.logevents(total=total, logtype=logtype, user=user,
1380                                namespace=namespace, start=start, end=end,
1381                                reverse=reverse):
1382        try:
1383            yield entry.page()
1384        except KeyError as e:
1385            pywikibot.warning('LogeventsPageGenerator: '
1386                              'failed to load page for {!r}; skipping'
1387                              .format(entry.data))
1388            pywikibot.exception(e)
1389
1390
1391@deprecated_args(number='total', step=True, namespace='namespaces',
1392                 repeat=True, get_redirect=True)
1393def NewpagesPageGenerator(site=None, namespaces=(0, ),
1394                          total: Optional[int] = None):
1395    """
1396    Iterate Page objects for all new titles in a single namespace.
1397
1398    :param total: Maximum number of pages to retrieve in total
1399    :param site: Site for generator results.
1400    :type site: :py:obj:`pywikibot.site.BaseSite`
1401    """
1402    # API does not (yet) have a newpages function, so this tries to duplicate
1403    # it by filtering the recentchanges output
1404    # defaults to namespace 0 because that's how Special:Newpages defaults
1405    if site is None:
1406        site = pywikibot.Site()
1407    return (page for page, _ in site.newpages(namespaces=namespaces,
1408                                              total=total, returndict=True))
1409
1410
1411def RecentChangesPageGenerator(site=None, _filter_unique=None, **kwargs):
1412    """
1413    Generate pages that are in the recent changes list, including duplicates.
1414
1415    For parameters refer to pywikibot.site.recentchanges.
1416
1417    :param site: Site for generator results.
1418    :type site: :py:obj:`pywikibot.site.BaseSite`
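
    A sketch, forwarding keyword arguments to site.recentchanges (the
    values are hypothetical)::

        gen = RecentChangesPageGenerator(site=site, namespaces=[0],
                                         total=50)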
1419    """
1420    if site is None:
1421        site = pywikibot.Site()
1422
1423    gen = site.recentchanges(**kwargs)
1424    gen.request['rcprop'] = 'title'
1425    gen = (pywikibot.Page(site, rc['title'])
1426           for rc in gen if rc['type'] != 'log' or 'title' in rc)
1427
1428    if _filter_unique:
1429        gen = _filter_unique(gen)
1430    return gen
1431
1432
1433@deprecated('site.unconnected_pages()', since='20180512')
1434@deprecated_args(step=True)
1435def UnconnectedPageGenerator(site=None, total: Optional[int] = None):
1436    """
1437    Iterate Page objects for all unconnected pages to a Wikibase repository.
1438
1439    :param total: Maximum number of pages to retrieve in total
1440    :param site: Site for generator results.
1441    :type site: :py:obj:`pywikibot.site.APISite`
1442    """
1443    if site is None:
1444        site = pywikibot.Site()
1445    if not site.data_repository():
1446        raise ValueError('The given site has no Wikibase repository.')
1447    return site.unconnected_pages(total=total)
1448
1449
1450@deprecated('File.usingPages()', since='20200515')
1451@deprecated_args(referredImagePage='referredFilePage', step=True)
1452def FileLinksGenerator(referredFilePage, total=None, content=False):
1453    """DEPRECATED. Yield Pages on which referredFilePage file is displayed."""
1454    return referredFilePage.usingPages(total=total,
1455                                       content=content)  # pragma: no cover
1456
1457
1458@deprecated('Page.imagelinks()', since='20200515')
1459@deprecated_args(step=True)
1460def ImagesPageGenerator(pageWithImages, total=None, content=False):
1461    """DEPRECATED. Yield FilePages displayed on pageWithImages."""
1462    return pageWithImages.imagelinks(total=total,
1463                                     content=content)  # pragma: no cover
1464
1465
1466def InterwikiPageGenerator(page):
1467    """Iterate over all interwiki (non-language) links on a page."""
1468    return (pywikibot.Page(link) for link in page.interwiki())
1469
1470
1471@deprecated_args(step=True)
1472def LanguageLinksPageGenerator(page, total=None):
1473    """Iterate over all interwiki language links on a page."""
1474    return (pywikibot.Page(link) for link in page.iterlanglinks(total=total))
1475
1476
1477@deprecated_args(step=True)
1478def CategorizedPageGenerator(category, recurse=False, start=None,
1479                             total=None, content=False,
1480                             namespaces=None):
1481    """Yield all pages in a specific category.
1482
1483    If recurse is True, pages in subcategories are included as well; if
1484    recurse is an int, only subcategories to that depth will be included
1485    (e.g., recurse=2 will get pages in subcats and sub-subcats, but will
1486    not go any further).
1487
1488    If start is a string value, only pages whose sortkey comes after start
1489    alphabetically are included.
1490
1491    If content is True (default is False), the current page text of each
1492    retrieved page will be downloaded.
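
    A short sketch (the category name is hypothetical)::

        site = pywikibot.Site()
        cat = pywikibot.Category(site, 'Category:Example')
        for page in CategorizedPageGenerator(cat, recurse=1):
            pywikibot.output(page.title())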
1493
1494    """
1495    kwargs = {
1496        'content': content,
1497        'namespaces': namespaces,
1498        'recurse': recurse,
1499        'startprefix': start,
1500        'total': total,
1501    }
1502    yield from category.articles(**kwargs)
1503
1504
1505@deprecated_args(step=True)
1506def SubCategoriesPageGenerator(category, recurse=False, start=None,
1507                               total=None, content=False):
1508    """Yield all subcategories in a specific category.
1509
1510    If recurse is True, subcategories of subcategories are included as
1511    well; if recurse is an int, only subcategories to that depth will be
1512    included (e.g., recurse=2 will get subcats and sub-subcats, but will
1513    not go any further).
1514
1515    If start is a string value, only categories whose sortkey comes after
1516    start alphabetically are included.
1517
1518    If content is True (default is False), the current page text of each
1519    category description page will be downloaded.
1520
1521    """
1522    # TODO: page generator could be modified to use cmstartsortkey ...
1523    for s in category.subcategories(recurse=recurse,
1524                                    total=total, content=content):
1525        if start is None or s.title(with_ns=False) >= start:
1526            yield s
1527
1528
1529@deprecated('Page.linkedPages()', since='20200515')
1530@deprecated_args(step=True)
1531def LinkedPageGenerator(linkingPage, total: Optional[int] = None,
                        content: bool = False):
1532    """DEPRECATED. Yield all pages linked from a specific page.
1533
1534    See :py:obj:`pywikibot.page.BasePage.linkedPages` for details.
1535
1536    :param linkingPage: the page that links to the pages we want
1537    :type linkingPage: :py:obj:`pywikibot.Page`
1538    :param total: the total number of pages to iterate
1539    :param content: if True, retrieve the current content of each linked page
1540    :return: a generator that yields Page objects of pages linked to
1541        linkingPage
1542    :rtype: generator
1543    """
1544    return linkingPage.linkedPages(total=total,
1545                                   content=content)  # pragma: no cover
1546
1547
1548def _yield_titles(f: Union[codecs.StreamReaderWriter, io.StringIO],
1549                  site: pywikibot.Site):
1550    """Yield page titles from a text stream.
1551
1552    :param f: text stream object
1553    :type f: codecs.StreamReaderWriter, io.StringIO, or any other stream-like
1554        object
1555    :param site: Site for generator results.
1556    :type site: :py:obj:`pywikibot.site.BaseSite`
1557    :return: a generator that yields Page objects of pages with titles in text
1558        stream
1559    :rtype: generator
1560    """
1561    linkmatch = None
1562    for linkmatch in pywikibot.link_regex.finditer(f.read()):
1563        # If the link is in interwiki format, the Page object may reside
1564        # on a different Site than the default.
1565        # This makes it possible to work on different wikis using a single
1566        # text file, but also could be dangerous because you might
1567        # inadvertently change pages on another wiki!
1568        yield pywikibot.Page(pywikibot.Link(linkmatch.group('title'),
1569                                            site))
1570    if linkmatch is not None:
1571        return
1572
1573    f.seek(0)
1574    for title in f:
1575        title = title.strip()
1576        if '|' in title:
1577            title = title[:title.index('|')]
1578        if title:
1579            yield pywikibot.Page(site, title)
1580
1581
1582def TextIOPageGenerator(source: Optional[str] = None,
1583                        site: Optional[pywikibot.site.BaseSite] = None):
1584    """Iterate pages from a list in a text file or on a webpage.
1585
1586    The text source must contain page links between double-square-brackets or,
1587    alternatively, separated by newlines. The generator will yield each
1588    corresponding Page object.
1589
1590    :param source: the file path or URL that should be read. If no name is
1591                     given, the generator prompts the user.
1592    :param site: Site for generator results.
1593    :type site: :py:obj:`pywikibot.site.BaseSite`
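
    Two sketches, one local file and one URL (both paths hypothetical)::

        gen = TextIOPageGenerator('titles.txt')
        gen = TextIOPageGenerator('https://example.org/titles.txt')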
1594
1595    """
1596    if source is None:
1597        source = pywikibot.input('Please enter the filename / URL:')
1598    if site is None:
1599        site = pywikibot.Site()
1600    # If source cannot be parsed as an HTTP URL, treat as local file
1601    if not urlparse(source).netloc:
1602        with codecs.open(source, 'r', config.textfile_encoding) as f:
1603            yield from _yield_titles(f, site)
1604    # Else, fetch the page; it should contain text in the same format as
1605    # expected from a file, i.e. pages separated by newlines or enclosed
1606    # in double brackets.
1607    else:
1608        with io.StringIO(http.fetch(source).text) as f:
1609            yield from _yield_titles(f, site)
1610
1611
1612def PagesFromTitlesGenerator(iterable, site=None):
1613    """
1614    Generate pages from the titles (strings) yielded by iterable.
1615
1616    :param site: Site for generator results.
1617    :type site: :py:obj:`pywikibot.site.BaseSite`
1618    """
1619    if site is None:
1620        site = pywikibot.Site()
1621    for title in iterable:
1622        if not isinstance(title, str):
1623            break
1624        yield pywikibot.Page(pywikibot.Link(title, site))
1625
1626
1627@deprecated('site.load_pages_from_pageids()', since='20200515')
1628def PagesFromPageidGenerator(pageids, site=None):
1629    """
1630    DEPRECATED. Return a page generator from pageids.
1631
1632    Pages are iterated in the same order as the underlying pageids.
1633    Pageids are filtered and only one page is returned in case of
1634    duplicate pageid.
1635
1636    :param pageids: an iterable that returns pageids, or a comma-separated
1637                    string of pageids (e.g. '945097,1483753,956608')
1638    :param site: Site for generator results.
1639    :type site: :py:obj:`pywikibot.site.BaseSite`
1640    """
1641    if site is None:
1642        site = pywikibot.Site()
1643
1644    return site.load_pages_from_pageids(pageids)
1645
1646
1647@deprecated_args(number='total', step=True)
1648def UserContributionsGenerator(username,
                               namespaces: Optional[List[int]] = None,
1649                               site=None, total: Optional[int] = None,
1650                               _filter_unique=_filter_unique_pages):
1651    """Yield unique pages edited by user:username.
1652
1653    :param total: Maximum number of pages to retrieve in total
1654    :param namespaces: list of namespace numbers to fetch contribs from
1655    :param site: Site for generator results.
1656    :type site: :py:obj:`pywikibot.site.BaseSite`
1657    """
1658    if site is None:
1659        site = pywikibot.Site()
1660
1661    user = pywikibot.User(site, username)
1662    if not (user.isAnonymous() or user.isRegistered()):
1663        pywikibot.warning('User "{}" does not exist on site "{}".'
1664                          .format(user.username, site))
1665
1666    gen = (contrib[0] for contrib in user.contributions(
1667        namespaces=namespaces, total=total))
1668    if _filter_unique:
1669        return _filter_unique(gen)
1670    return gen
1671
1672
1673def NamespaceFilterPageGenerator(generator, namespaces, site=None):
1674    """
1675    A generator yielding pages from another generator in given namespaces.
1676
1677    If a site is provided, the namespaces are validated using the namespaces
1678    of that site, otherwise the namespaces are validated using the default
1679    site.
1680
1681    NOTE: API-based generators that have a "namespaces" parameter perform
1682    namespace filtering more efficiently than this generator.
1683
1684    :param namespaces: list of namespace identifiers to limit results
1685    :type namespaces: iterable of str or Namespace key,
1686        or a single instance of those types.
1687    :param site: Site for generator results; mandatory if
1688        namespaces contains namespace names. Defaults to the default site.
1689    :type site: :py:obj:`pywikibot.site.BaseSite`
1690    :raises KeyError: a namespace identifier was not resolved
1691    :raises TypeError: a namespace identifier has an inappropriate
1692        type such as NoneType or bool, or more than one namespace
1693        if the API module does not support multiple namespaces
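
    For instance, to keep only main and talk namespace pages from an
    existing generator gen::

        gen = NamespaceFilterPageGenerator(gen, [0, 1], site)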
1694    """
1695    # As site was only required if the namespaces contain strings, don't
1696    # attempt to use the config selected site unless the initial attempt
1697    # at resolving the namespaces fails.
1698    if not site:
1699        site = pywikibot.Site()
1700    try:
1701        namespaces = site.namespaces.resolve(namespaces)
1702    except KeyError as e:
1703        pywikibot.log('Failed resolving namespaces:')
1704        pywikibot.exception(e)
1705        raise
1706
1707    return (page for page in generator if page.namespace() in namespaces)
1708
1709
1710@deprecated_args(ignoreList='ignore_list')
1711def PageTitleFilterPageGenerator(generator, ignore_list: dict):
1712    """
1713    Yield only those pages that are not listed in the ignore list.
1714
1715    :param ignore_list: family names are mapped to dictionaries in which
1716        language codes are mapped to lists of page titles. Each title must
1717        be a valid regex as they are compared using :py:obj:`re.search`.
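
    A sketch of the expected structure (names are hypothetical)::

        ignore_list = {'wikipedia': {'en': [r'^Draft:', 'Sandbox']}}
        gen = PageTitleFilterPageGenerator(gen, ignore_list)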
1718
1719    """
1720    def is_ignored(page):
1721        try:
1722            site_ig_list = ignore_list[page.site.family.name][page.site.code]
1723        except KeyError:
1724            return False
1725        return any(re.search(ig, page.title()) for ig in site_ig_list)
1726
1727    for page in generator:
1728        if not is_ignored(page):
1729            yield page
1730            continue
1731
1732        if config.verbose_output:
1733            pywikibot.output('Ignoring page {}'.format(page.title()))
1734
1735
1736def RedirectFilterPageGenerator(generator, no_redirects: bool = True,
1737                                show_filtered: bool = False):
1738    """
1739    Yield pages from another generator that are redirects or not.
1740
1741    :param no_redirects: Exclude redirects if True, else only include
1742        redirects.
1743    :param show_filtered: Output a message for each page not yielded
1744    """
1745    fmt = '{page} is {what} redirect page. Skipping.'
1746    what = 'a' if no_redirects else 'not a'
1747
1748    for page in generator or []:
1749        is_redirect = page.isRedirectPage()
1750        if bool(no_redirects) != bool(is_redirect):  # xor
1751            yield page
1752            continue
1753
1754        if show_filtered:
1755            pywikibot.output(fmt.format(what=what, page=page))
1756
1757
1758class ItemClaimFilter:
1759
1760    """Item claim filter."""
1761
1762    page_classes = {
1763        True: pywikibot.PropertyPage,
1764        False: pywikibot.ItemPage,
1765    }
1766
1767    @classmethod
1768    def __filter_match(cls, page, prop, claim, qualifiers):
1769        """
1770        Return true if the page contains the claim given.
1771
1772        :param page: the page to check
1773        :return: true if page contains the claim, false otherwise
1774        :rtype: bool
1775        """
1776        if not isinstance(page, pywikibot.page.WikibasePage):  # T175151
1777            try:
1778                assert page.site.property_namespace
1779                assert page.site.item_namespace
1780                key = page.namespace() == page.site.property_namespace
1781                page_cls = cls.page_classes[key]
1782                page = page_cls(page.site, page.title(with_ns=False))
1783            except (AttributeError, AssertionError):
1784                try:
1785                    page = pywikibot.ItemPage.fromPage(page)
1786                except NoPageError:
1787                    return False
1788
1789        def match_qualifiers(page_claim, qualifiers):
1790            return all(page_claim.has_qualifier(prop, val)
1791                       for prop, val in qualifiers.items())
1792
1793        page_claims = page.get()['claims'].get(prop, [])
1794        return any(
1795            p_cl.target_equals(claim) and match_qualifiers(p_cl, qualifiers)
1796            for p_cl in page_claims)
1797
1798    @classmethod
1799    def filter(cls, generator, prop: str, claim,
1800               qualifiers: Optional[dict] = None,
1801               negate: bool = False):
1802        """
1803        Yield all ItemPages which contain a certain claim in a property.
1804
1805        :param prop: property id to check
1806        :param claim: value of the property to check. Can be exact value (for
1807            instance, ItemPage instance) or a string (e.g. 'Q37470').
1808        :param qualifiers: dict of qualifiers that must be present, or None if
1809            qualifiers are irrelevant
1810        :param negate: true if pages that do *not* contain specified claim
1811            should be yielded, false otherwise
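
        A minimal sketch (property and target ids are hypothetical)::

            gen = ItemClaimFilterPageGenerator(gen, 'P31', 'Q5')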
1812        """
1813        qualifiers = qualifiers or {}
1814        for page in generator:
1815            if cls.__filter_match(page, prop, claim, qualifiers) is not negate:
1816                yield page
1817
1818
1819# name the generator methods
1820ItemClaimFilterPageGenerator = ItemClaimFilter.filter
1821
1822
1823def SubpageFilterGenerator(generator, max_depth: int = 0,
1824                           show_filtered: bool = False):
1825    """
1826    Generator which filters out subpages based on depth.
1827
1828    It looks at the namespace of each page and checks if that namespace has
1829    subpages enabled. If so, pages with forward slashes ('/') are excluded.
1830
1831    :param generator: A generator object
1832    :type generator: any generator or iterator
1833    :param max_depth: Max depth of subpages to yield, at least zero
1834    :param show_filtered: Output a message for each page not yielded
1835    """
1836    assert max_depth >= 0, 'Max subpage depth must be at least 0'
1837
1838    for page in generator:
1839        if page.depth <= max_depth:
1840            yield page
1841        else:
1842            if show_filtered:
1843                pywikibot.output(
1844                    'Page {} is a subpage that is too deep. Skipping.'
1845                    .format(page))
1846
1847
1848class RegexFilter:
1849
1850    """Regex filter."""
1851
1852    @classmethod
1853    def __filter_match(cls, regex, string, quantifier):
1854        """Return True if string matches precompiled regex list.
1855
1856        :param quantifier: a quantifier
1857        :type quantifier: str of 'all', 'any' or 'none'
1858        :rtype: bool
1859        """
1860        if quantifier == 'all':
1861            match = all(r.search(string) for r in regex)
1862        else:
1863            match = any(r.search(string) for r in regex)
1864        return (quantifier == 'none') ^ match
1865
1866    @classmethod
1867    def __precompile(cls, regex, flag):
1868        """Precompile the regex list if needed."""
1869        # Enable multiple regexes
1870        if not isinstance(regex, (list, tuple)):
1871            regex = [regex]
1872        # Test if regex is already compiled.
1873        # We assume that all list components have the same type
1874        if isinstance(regex[0], str):
1875            regex = [re.compile(r, flag) for r in regex]
1876        return regex
1877
1878    @classmethod
1879    @deprecated_args(inverse='quantifier')
1880    def titlefilter(cls, generator, regex, quantifier='any',
1881                    ignore_namespace=True):
1882        """Yield pages from another generator whose title matches regex.
1883
1884        String regexes are compiled using re.IGNORECASE.
1885
1886        If ignore_namespace is False, the whole page title is compared.
1887        NOTE: if you want to check for a match at the beginning of the title,
1888        you have to start the regex with "^"
1889
1890        :param generator: another generator
1891        :type generator: any generator or iterator
1892        :param regex: a regex which should match the page title
1893        :type regex: a single regex string or a list of regex strings or a
1894            compiled regex or a list of compiled regexes
1895        :param quantifier: must be one of the following values:
1896            'all' - yields page if title is matched by all regexes
1897            'any' - yields page if title is matched by any regex
1898            'none' - yields page if title is NOT matched by any regex
1899        :type quantifier: str of ('all', 'any', 'none')
1900        :param ignore_namespace: ignore the namespace when matching the title
1901        :type ignore_namespace: bool
1902        :return: return a page depending on the matching parameters
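
        A sketch keeping only titles that start with 'List of' (the
        pattern is hypothetical)::

            gen = RegexFilter.titlefilter(gen, '^List of')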
1903
1904        """
1905        # for backwards compatibility with compat for inverse parameter
1906        if quantifier is False:
1907            quantifier = 'any'
1908        elif quantifier is True:
1909            quantifier = 'none'
1910        reg = cls.__precompile(regex, re.I)
1911        for page in generator:
1912            title = page.title(with_ns=not ignore_namespace)
1913            if cls.__filter_match(reg, title, quantifier):
1914                yield page
1915
1916    @classmethod
1917    def contentfilter(cls, generator, regex, quantifier='any'):
1918        """Yield pages from another generator whose body matches regex.
1919
1920        String regexes are compiled using re.IGNORECASE and re.DOTALL.
1921
1922        For parameters see titlefilter above.
1923
1924        """
1925        reg = cls.__precompile(regex, re.IGNORECASE | re.DOTALL)
1926        return (page for page in generator
1927                if cls.__filter_match(reg, page.text, quantifier))
1928
1929
1930def QualityFilterPageGenerator(generator, quality: List[int]):
1931    """
1932    Wrap a generator to filter pages according to quality levels.
1933
1934    This is possible only for pages with content_model 'proofread-page'.
1935    In all other cases, no filter is applied.
1936
1937    :param generator: A generator object
1938    :param quality: proofread-page quality levels (valid range 0-4)
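
    For instance, to keep only pages marked proofread or validated
    (levels 3 and 4)::

        gen = QualityFilterPageGenerator(gen, [3, 4])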
1939
1940    """
1941    for page in generator:
1942        if page.namespace() == page.site.proofread_page_ns:
1943            page = ProofreadPage(page)
1944            if page.quality_level in quality:
1945                yield page
1946        else:
1947            yield page
1948
1949
1950@deprecated_args(site=True)
1951def CategoryFilterPageGenerator(generator, category_list):
1952    """
1953    Wrap a generator to filter pages by categories specified.
1954
1955    :param generator: A generator object
1956    :param category_list: categories used to filter generated pages
1957    :type category_list: list of category objects
1958
1959    """
1960    for page in generator:
1961        if all(x in page.categories() for x in category_list):
1962            yield page
1963
1964
1965# name the generator methods
1966RegexFilterPageGenerator = RegexFilter.titlefilter
1967RegexBodyFilterPageGenerator = RegexFilter.contentfilter
1968
1969
1970@deprecated_args(begintime='last_edit_start', endtime='last_edit_end')
1971def EdittimeFilterPageGenerator(generator,
1972                                last_edit_start=None,
1973                                last_edit_end=None,
1974                                first_edit_start=None,
1975                                first_edit_end=None,
1976                                show_filtered=False):
1977    """
1978    Wrap a generator to filter pages outside last or first edit range.
1979
1980    :param generator: A generator object
1981    :param last_edit_start: Only yield pages last edited after this time
1982    :type last_edit_start: datetime
1983    :param last_edit_end: Only yield pages last edited before this time
1984    :type last_edit_end: datetime
1985    :param first_edit_start: Only yield pages first edited after this time
1986    :type first_edit_start: datetime
1987    :param first_edit_end: Only yield pages first edited before this time
1988    :type first_edit_end: datetime
1989    :param show_filtered: Output a message for each page not yielded
1990    :type show_filtered: bool
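
    For example, to yield only pages whose last edit is at least a year
    old (a sketch using the standard datetime module)::

        import datetime
        cutoff = datetime.datetime.utcnow() - datetime.timedelta(days=365)
        gen = EdittimeFilterPageGenerator(gen, last_edit_end=cutoff)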
1991
1992    """
1993    def to_be_yielded(edit, page, show_filtered):
1994        if not edit.do_edit:
1995            return True
1996
1997        if isinstance(edit, Latest):
1998            edit_time = page.latest_revision.timestamp
1999        else:
2000            edit_time = page.oldest_revision.timestamp
2001
2002        msg = '{prefix} edit on {page} was on {time}.\n' \
2003              'Too {{when}}. Skipping.' \
2004              .format(prefix=edit.__class__.__name__,  # prefix = Class name.
2005                      page=page,
2006                      time=edit_time.isoformat())
2007
2008        if edit_time < edit.edit_start:
2009            _output_if(show_filtered, msg.format(when='old'))
2010            return False
2011
2012        if edit_time > edit.edit_end:
2013            _output_if(show_filtered, msg.format(when='recent'))
2014            return False
2015
2016        return True
2017
2018    First = namedtuple('First', ['do_edit', 'edit_start', 'edit_end'])
2019    Latest = namedtuple('Latest', First._fields)
2020
2021    latest_edit = Latest(do_edit=last_edit_start or last_edit_end,
2022                         edit_start=last_edit_start or datetime.datetime.min,
2023                         edit_end=last_edit_end or datetime.datetime.max)
2024
2025    first_edit = First(do_edit=first_edit_start or first_edit_end,
2026                       edit_start=first_edit_start or datetime.datetime.min,
2027                       edit_end=first_edit_end or datetime.datetime.max)
2028
2029    for page in generator or []:
2030        if (to_be_yielded(latest_edit, page, show_filtered)
2031                and to_be_yielded(first_edit, page, show_filtered)):
2032            yield page
2033
2034
2035def UserEditFilterGenerator(generator, username: str, timestamp=None,
2036                            skip: bool = False,
2037                            max_revision_depth: Optional[int] = None,
2038                            show_filtered: bool = False):
2039    """
2040    Generator which will yield Pages modified by username.
2041
2042    It only looks at the last editors given by max_revision_depth.
2043    If timestamp is set in MediaWiki format YYYYMMDDhhmmss, older edits
2044    are ignored.
2045    If skip is set, pages edited by the given user are ignored; otherwise
2046    only pages edited by this user are yielded.
2047
2048    :param generator: A generator object
2049    :param username: user name which edited the page
2050    :param timestamp: ignore edits which are older than this timestamp
2051    :type timestamp: datetime or str (MediaWiki format YYYYMMDDhhmmss) or None
2052    :param skip: Ignore pages edited by the given user
2053    :param max_revision_depth: It only looks at the last editors given by
2054        max_revision_depth
2055    :param show_filtered: Output a message for each page not yielded
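
    E.g. to skip pages recently edited by a bot account (the account
    name is hypothetical)::

        gen = UserEditFilterGenerator(gen, 'ExampleBot', skip=True,
                                      max_revision_depth=5)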
2056    """
2057    if isinstance(timestamp, str):
2058        ts = pywikibot.Timestamp.fromtimestampformat(timestamp)
2059    else:
2060        ts = timestamp
2061
2062    for page in generator:
2063        contribs = page.contributors(total=max_revision_depth, endtime=ts)
2064        if bool(contribs[username]) is not bool(skip):  # xor operation
2065            yield page
2066        elif show_filtered:
2067            pywikibot.output('Skipping {}'.format(page.title(as_link=True)))
2068
2069
2070@deprecated('itertools.chain(*iterables)', since='20180513')
2071def CombinedPageGenerator(generators):
2072    """Yield from each iterable until exhausted, then proceed with the next."""
2073    return itertools.chain(*generators)  # pragma: no cover
2074
2075
2076def PageClassGenerator(generator):
2077    """
2078    Yield pages from another generator as Page subclass objects.
2079
2080    The page class type depends on the page namespace.
2081    Objects may be Category, FilePage, User or Page.
2082    """
2083    for page in generator:
2084        if page.namespace() == page.site.namespaces.USER:
2085            yield pywikibot.User(page)
2086        elif page.namespace() == page.site.namespaces.FILE:
2087            yield pywikibot.FilePage(page)
2088        elif page.namespace() == page.site.namespaces.CATEGORY:
2089            yield pywikibot.Category(page)
2090        else:
2091            yield page
2092
2093
2094def PageWithTalkPageGenerator(generator, return_talk_only=False):
2095    """Yield pages and associated talk pages from another generator.
2096
2097    Only yields talk pages if the original generator yields a non-talk page,
2098    and does not check if the talk page in fact exists.
2099
2100    """
2101    for page in generator:
2102        if not return_talk_only or page.isTalkPage():
2103            yield page
2104        if not page.isTalkPage():
2105            yield page.toggleTalkPage()
2106
2107
2108@deprecated('LiveRCPageGenerator or EventStreams', since='20180415')
2109def RepeatingGenerator(generator, key_func=lambda x: x, sleep_duration=60,
2110                       total: Optional[int] = None, **kwargs):
2111    """Yield items in live time.
2112
2113    The provided generator must support the parameters 'start', 'end',
2114    'reverse' and 'total', as site.recentchanges() and site.logevents() do.
2115
2116    To fetch revisions in recentchanges in live time::
2117
2118        gen = RepeatingGenerator(site.recentchanges, lambda x: x['revid'])
2119
2120    To fetch new pages in live time::
2121
2122        gen = RepeatingGenerator(site.newpages, lambda x: x[0])
2123
2124    Note that other parameters not listed below will be passed
2125    to the generator function. The parameters 'reverse', 'start' and
2126    'end' are always discarded to prevent the generator from yielding
2127    items in the wrong order.
2128
2129    :param generator: a function returning a generator that will be queried
2130    :param key_func: a function returning key that will be used to detect
2131        duplicate entry
2132    :param sleep_duration: duration between each query
2133    :param total: if it is a positive number, iterate no more than this
2134        number of items in total. Otherwise, iterate forever
2135    :return: a generator yielding items in ascending order by time
2136    """
2137    kwargs.pop('reverse', None)  # always get newest item first
2138    kwargs.pop('start', None)  # don't set start time
2139    kwargs.pop('end', None)  # don't set stop time
2140
2141    seen = set()
2142    while total is None or len(seen) < total:
2143        def filtered_generator():
2144            for item in generator(total=None if seen else 1, **kwargs):
2145                key = key_func(item)
2146                if key not in seen:
2147                    seen.add(key)
2148                    yield item
2149                    if len(seen) == total:
2150                        return
2151                else:
2152                    break
2153            pywikibot.sleep(sleep_duration)
2154
2155        yield from reversed(list(filtered_generator()))
2156
2157
2158@deprecated_args(pageNumber='groupsize', step='groupsize', lookahead=True)
2159def PreloadingGenerator(generator, groupsize: int = 50):
2160    """
2161    Yield preloaded pages taken from another generator.
2162
2163    :param generator: pages to iterate over
2164    :param groupsize: how many pages to preload at once
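
    A typical use is wrapping another generator so that page text is
    fetched in bulk before it is accessed (a sketch)::

        for page in PreloadingGenerator(gen, groupsize=100):
            pywikibot.output(page.title())  # page.text is preloaded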
2165    """
2166    # pages may be on more than one site, for example if an interwiki
2167    # generator is used, so use a separate preloader for each site
2168    sites = {}
2169    # build a list of pages for each site found in the iterator
2170    for page in generator:
2171        site = page.site
2172        sites.setdefault(site, []).append(page)
2173        if len(sites[site]) >= groupsize:
2174            # if this site is at the groupsize, process it
2175            group = sites.pop(site)
2176            yield from site.preloadpages(group, groupsize=groupsize)
2177
2178    for site, pages in sites.items():
2179        # process any leftover sites that never reached the groupsize
2180        yield from site.preloadpages(pages, groupsize=groupsize)
2181
2182
2183@deprecated_args(step='groupsize')
2184def DequePreloadingGenerator(generator, groupsize=50):
2185    """Preload generator of type DequeGenerator."""
2186    assert isinstance(generator, DequeGenerator), \
2187        'generator must be a DequeGenerator object'
2188
2189    while True:
2190        page_count = min(len(generator), groupsize)
2191        if not page_count:
2192            return
2193
2194        yield from PreloadingGenerator(generator, page_count)
2195
2196
2197@deprecated_args(step='groupsize')
2198def PreloadingEntityGenerator(generator, groupsize: int = 50):
2199    """
2200    Yield preloaded pages taken from another generator.
2201
2202    Like PreloadingGenerator above, but for Wikibase entities.
2203
2204    :param generator: pages to iterate over
2205    :type generator: Iterable
2206    :param groupsize: how many pages to preload at once
2207    """
2208    sites = {}
2209    for page in generator:
2210        site = page.site
2211        sites.setdefault(site, []).append(page)
2212        if len(sites[site]) >= groupsize:
2213            # if this site is at the groupsize, process it
2214            group = sites.pop(site)
2215            repo = site.data_repository()
2216            yield from repo.preload_entities(group, groupsize)
2217
2218    for site, pages in sites.items():
2219        # process any leftover sites that never reached the groupsize
2220        repo = site.data_repository()
2221        yield from repo.preload_entities(pages, groupsize)
2222
2223
2224@deprecated_args(number='total', step=True, repeat=True)
2225def NewimagesPageGenerator(total: Optional[int] = None, site=None):
2226    """
2227    New file generator.
2228
2229    :param total: Maximum number of pages to retrieve in total
2230    :param site: Site for generator results.
2231    :type site: :py:obj:`pywikibot.site.BaseSite`
2232    """
2233    if site is None:
2234        site = pywikibot.Site()
2235    return (entry.page()
2236            for entry in site.logevents(logtype='upload', total=total))
2237
2238
2239def WikibaseItemGenerator(gen):
2240    """
2241    A wrapper generator used to yield Wikibase items of another generator.
2242
2243    :param gen: Generator to wrap.
2244    :type gen: generator
2245    :return: Wrapped generator
2246    :rtype: generator
2247    """
2248    for page in gen:
2249        if isinstance(page, pywikibot.ItemPage):
2250            yield page
2251        elif page.site.data_repository() == page.site:
2252            # These are already items, as they have a DataSite in page.site.
2253            # However generator is yielding Page, so convert to ItemPage.
2254            # FIXME: If we've already fetched content, we should retain it
2255            yield pywikibot.ItemPage(page.site, page.title())
2256        else:
2257            yield pywikibot.ItemPage.fromPage(page)
2258
2259
2260def WikibaseItemFilterPageGenerator(generator, has_item: bool = True,
2261                                    show_filtered: bool = False):
2262    """
2263    A wrapper generator filtering pages by whether they have a Wikibase item.
2264
2265    :param generator: Generator to wrap.
2266    :type generator: generator
2267    :param has_item: Exclude pages without an item if True, or only
2268        include pages without an item if False
2269    :param show_filtered: Output a message for each page not yielded
2270    :return: Wrapped generator
2271    :rtype: generator
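
    For example, to work only on pages that do not have an item yet::

        gen = WikibaseItemFilterPageGenerator(gen, has_item=False)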
2272    """
2273    why = "doesn't" if has_item else 'has'
2274    msg = '{{page}} {why} a wikidata item. Skipping.'.format(why=why)
2275
2276    for page in generator or []:
2277        try:
2278            page_item = pywikibot.ItemPage.fromPage(page, lazy_load=False)
2279        except NoPageError:
2280            page_item = None
2281
2282        to_be_skipped = bool(page_item) != has_item
2283        if to_be_skipped:
2284            _output_if(show_filtered, msg.format(page=page))
2285            continue
2286
2287        yield page
2288
2289
2290@deprecated('Site.unusedfiles()', since='20200515')
2291@deprecated_args(extension=True, number='total', repeat=True)
2292def UnusedFilesGenerator(total: Optional[int] = None,
2293                         site=None):  # pragma: no cover
2294    """
2295    DEPRECATED. Unused files generator.
2296
2297    :param total: Maximum number of pages to retrieve in total
2298    :param site: Site for generator results.
2299    :type site: :py:obj:`pywikibot.site.BaseSite`
2300    """
2301    if site is None:
2302        site = pywikibot.Site()
2303    return site.unusedfiles(total=total)
2304
2305
2306@deprecated('Site.withoutinterwiki()', since='20200515')
2307@deprecated_args(number='total', repeat=True)
2308def WithoutInterwikiPageGenerator(total=None, site=None):  # pragma: no cover
2309    """
2310    DEPRECATED. Page lacking interwikis generator.
2311
2312    :param total: Maximum number of pages to retrieve in total
2313    :param site: Site for generator results.
2314    :type site: :py:obj:`pywikibot.site.BaseSite`
2315    """
2316    if site is None:
2317        site = pywikibot.Site()
2318    return site.withoutinterwiki(total=total)
2319
2320
2321@deprecated('Site.uncategorizedcategories()', since='20200515')
2322@deprecated_args(number='total', repeat=True)
2323def UnCategorizedCategoryGenerator(total: Optional[int] = 100,
2324                                   site=None):  # pragma: no cover
2325    """
2326    DEPRECATED. Uncategorized category generator.
2327
2328    :param total: Maximum number of pages to retrieve in total
2329    :param site: Site for generator results.
2330    :type site: :py:obj:`pywikibot.site.BaseSite`
2331    """
2332    if site is None:
2333        site = pywikibot.Site()
2334    return site.uncategorizedcategories(total=total)
2335
2336
2337@deprecated('Site.uncategorizedimages()', since='20200515')
2338@deprecated_args(number='total', repeat=True)
2339def UnCategorizedImageGenerator(total: int = 100,
2340                                site=None):  # pragma: no cover
2341    """
2342    DEPRECATED. Uncategorized file generator.
2343
2344    :param total: Maximum number of pages to retrieve in total
2345    :param site: Site for generator results.
2346    :type site: :py:obj:`pywikibot.site.BaseSite`
2347    """
2348    if site is None:
2349        site = pywikibot.Site()
2350    return site.uncategorizedimages(total=total)
2351
2352
2353@deprecated('Site.uncategorizedpages()', since='20200515')
2354@deprecated_args(number='total', repeat=True)
2355def UnCategorizedPageGenerator(total: int = 100,
2356                               site=None):  # pragma: no cover
2357    """
2358    DEPRECATED. Uncategorized page generator.
2359
2360    :param total: Maximum number of pages to retrieve in total
2361    :param site: Site for generator results.
2362    :type site: :py:obj:`pywikibot.site.BaseSite`
2363    """
2364    if site is None:
2365        site = pywikibot.Site()
2366    return site.uncategorizedpages(total=total)
2367
2368
2369@deprecated('Site.uncategorizedtemplates()', since='20200515')
2370@deprecated_args(number='total', repeat=True)
2371def UnCategorizedTemplateGenerator(total: int = 100,
2372                                   site=None):  # pragma: no cover
2373    """
2374    DEPRECATED. Uncategorized template generator.
2375
2376    :param total: Maximum number of pages to retrieve in total
2377    :param site: Site for generator results.
2378    :type site: :py:obj:`pywikibot.site.BaseSite`
2379    """
2380    if site is None:
2381        site = pywikibot.Site()
2382    return site.uncategorizedtemplates(total=total)
2383
2384
2385@deprecated('Site.lonelypages()', since='20200515')
2386@deprecated_args(number='total', repeat=True)
2387def LonelyPagesPageGenerator(total: Optional[int] = None,
2388                             site=None):  # pragma: no cover
2389    """
2390    DEPRECATED. Lonely page generator.
2391
2392    :param total: Maximum number of pages to retrieve in total
2393    :param site: Site for generator results.
2394    :type site: :py:obj:`pywikibot.site.BaseSite`
2395    """
2396    if site is None:
2397        site = pywikibot.Site()
2398    return site.lonelypages(total=total)
2399
2400
2401@deprecated('Site.unwatchedpages()', since='20200515')
2402@deprecated_args(number='total', repeat=True)
2403def UnwatchedPagesPageGenerator(total: Optional[int] = None,
2404                                site=None):  # pragma: no cover
2405    """
2406    DEPRECATED. Unwatched page generator.
2407
2408    :param total: Maximum number of pages to retrieve in total
2409    :param site: Site for generator results.
2410    :type site: :py:obj:`pywikibot.site.BaseSite`
2411    """
2412    if site is None:
2413        site = pywikibot.Site()
2414    return site.unwatchedpages(total=total)
2415
2416
2417@deprecated('Site.pages_with_property()', since='20200515')
2418def page_with_property_generator(name: str, total: Optional[int] = None,
2419                                 site=None):  # pragma: no cover
2420    """
2421    Special:PagesWithProperty page generator.
2422
2423    :param name: Property name of pages to be retrieved
2424    :param total: Maximum number of pages to retrieve in total
2425    :param site: Site for generator results.
2426    :type site: :py:obj:`pywikibot.site.BaseSite`
2427    """
2428    if site is None:
2429        site = pywikibot.Site()
2430    return site.pages_with_property(name, total=total)
2431
2432
2433@deprecated('Site.wantedpages', since='20180803')
2434def WantedPagesPageGenerator(total: int = 100, site=None):  # pragma: no cover
2435    """
2436    Wanted page generator.
2437
2438    :param total: Maximum number of pages to retrieve in total
2439    :param site: Site for generator results.
2440    :type site: :py:obj:`pywikibot.site.BaseSite`
2441    """
2442    if site is None:
2443        site = pywikibot.Site()
2444    return site.wantedpages(total=total)
2445
2446
2447@deprecated_args(number='total', repeat=True)
2448def AncientPagesPageGenerator(total: int = 100, site=None):  # pragma: no cover
2449    """
2450    Ancient page generator.
2451
2452    :param total: Maximum number of pages to retrieve in total
2453    :param site: Site for generator results.
2454    :type site: :py:obj:`pywikibot.site.BaseSite`
2455    """
2456    if site is None:
2457        site = pywikibot.Site()
2458    return (page for page, _ in site.ancientpages(total=total))
2459
2460
2461@deprecated('Site.deadendpages()', since='20200515')
2462@deprecated_args(number='total', repeat=True)
2463def DeadendPagesPageGenerator(total: int = 100, site=None):  # pragma: no cover
2464    """
2465    DEPRECATED. Dead-end page generator.
2466
2467    :param total: Maximum number of pages to retrieve in total
2468    :param site: Site for generator results.
2469    :type site: :py:obj:`pywikibot.site.BaseSite`
2470    """
2471    if site is None:
2472        site = pywikibot.Site()
2473    return site.deadendpages(total=total)
2474
2475
2476@deprecated_args(number='total', repeat=True)
2477def LongPagesPageGenerator(total: int = 100, site=None):
2478    """
2479    Long page generator.
2480
2481    :param total: Maximum number of pages to retrieve in total
2482    :param site: Site for generator results.
2483    :type site: :py:obj:`pywikibot.site.BaseSite`
2484    """
2485    if site is None:
2486        site = pywikibot.Site()
2487    return (page for page, _ in site.longpages(total=total))
2488
2489
2490@deprecated_args(number='total', repeat=True)
2491def ShortPagesPageGenerator(total: int = 100, site=None):
2492    """
2493    Short page generator.
2494
2495    :param total: Maximum number of pages to retrieve in total
2496    :param site: Site for generator results.
2497    :type site: :py:obj:`pywikibot.site.BaseSite`
2498    """
2499    if site is None:
2500        site = pywikibot.Site()
2501    return (page for page, _ in site.shortpages(total=total))
2502
2503
2504@deprecated('Site.randompages()', since='20200515')
2505@deprecated_args(number='total')
2506def RandomPageGenerator(total: Optional[int] = None, site=None,
2507                        namespaces=None):  # pragma: no cover
2508    """
2509    DEPRECATED. Random page generator.
2510
2511    :param total: Maximum number of pages to retrieve in total
2512    :param site: Site for generator results.
2513    :type site: :py:obj:`pywikibot.site.BaseSite`
2514    """
2515    if site is None:
2516        site = pywikibot.Site()
2517    return site.randompages(total=total, namespaces=namespaces)
2518
2519
2520@deprecated('Site.randompages()', since='20200515')
2521@deprecated_args(number='total')
2522def RandomRedirectPageGenerator(total: Optional[int] = None, site=None,
2523                                namespaces=None):  # pragma: no cover
2524    """
2525    DEPRECATED. Random redirect generator.
2526
2527    :param total: Maximum number of pages to retrieve in total
2528    :param site: Site for generator results.
2529    :type site: :py:obj:`pywikibot.site.BaseSite`
2530    """
2531    if site is None:
2532        site = pywikibot.Site()
2533    return site.randompages(total=total, namespaces=namespaces,
2534                            redirects=True)
2535
2536
2537@deprecated('Site.exturlusage()', since='20200515')
2538@deprecated_args(link='url', euprotocol='protocol', step=True)
2539def LinksearchPageGenerator(url: str, namespaces: Optional[List[int]] = None,
2540                            total: Optional[int] = None, site=None,
2541                            protocol: Optional[str] = None):
2542    """DEPRECATED. Yield all pages that link to a certain URL.
2543
2544    :param url: The URL to search for (with or without the protocol prefix);
2545            this may include a '*' as a wildcard, only at the start of the
2546            hostname.
2547    :param namespaces: list of namespace numbers to fetch contribs from
2548    :param total: Maximum number of pages to retrieve in total
2549    :param site: Site for generator results
2550    :type site: :py:obj:`pywikibot.site.BaseSite`
2551    :param protocol: Protocol to search for, typically http or https; http
2552            by default. The full list is shown on Special:LinkSearch.
2553    """
2554    if site is None:
2555        site = pywikibot.Site()
2556    return site.exturlusage(url, namespaces=namespaces, protocol=protocol,
2557                            total=total, content=False)
2558
2559
2560@deprecated('Site.search()', since='20200515')
2561@deprecated_args(number='total', step=True)
2562def SearchPageGenerator(query, total: Optional[int] = None, namespaces=None,
2563                        site=None):  # pragma: no cover
2564    """
2565    DEPRECATED. Yield pages from the MediaWiki internal search engine.
2566
2567    :param total: Maximum number of pages to retrieve in total
2568    :param site: Site for generator results.
2569    :type site: :py:obj:`pywikibot.site.BaseSite`
2570    """
2571    if site is None:
2572        site = pywikibot.Site()
2573    return site.search(query, total=total, namespaces=namespaces)
2574
2575
2576def LiveRCPageGenerator(site=None, total: Optional[int] = None):
2577    """
2578    Yield pages from a socket.io RC stream.
2579
2580    Generates pages based on the EventStreams Server-Sent-Event (SSE) recent
2581    changes stream.
2582    The Page objects will have an extra property ._rcinfo containing the
2583    literal rc data. This can be used, e.g., to filter only new pages. See
2584    `pywikibot.comms.eventstreams.rc_listener` for details on the .rcinfo
2585    format.
2586
2587    :param site: site to return recent changes for
2588    :type site: pywikibot.BaseSite
2589    :param total: the maximum number of changes to return
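
    For example, to act only on page creations (a sketch using the
    ._rcinfo property described above)::

        for page in LiveRCPageGenerator(site, total=100):
            if page._rcinfo['type'] == 'new':
                pywikibot.output(page.title())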
2590    """
2591    if site is None:
2592        site = pywikibot.Site()
2593
2594    from pywikibot.comms.eventstreams import site_rc_listener
2595
2596    for entry in site_rc_listener(site, total=total):
2597        # The title in a log entry may have been suppressed
2598        if 'title' not in entry and entry['type'] == 'log':
2599            continue
2600        page = pywikibot.Page(site, entry['title'], entry['namespace'])
2601        page._rcinfo = entry
2602        yield page
2603
2604
2605# following classes just ported from version 1 without revision; not tested
2606
2607
2608class GoogleSearchPageGenerator:
2609
2610    """
2611    Page generator using Google search results.
2612
2613    To use this generator, you need to install the package 'google':
2614
2615        https://pypi.org/project/google
2616
2617    This package has been available since 2010, hosted on GitHub
2618    since 2012, and provided by PyPI since 2013.
2619
2620    As there are concerns about Google's Terms of Service, this
2621    generator prints a warning for each query.
2622    """
2623
2624    def __init__(self, query=None, site=None):
2625        """
2626        Initializer.
2627
2628        :param site: Site for generator results.
2629        :type site: :py:obj:`pywikibot.site.BaseSite`
2630        """
2631        self.query = query or pywikibot.input('Please enter the search query:')
2632        if site is None:
2633            site = pywikibot.Site()
2634        self.site = site
2635
2636    def queryGoogle(self, query):
2637        """
2638        Perform a query using the python package 'google'.
2639
2640        The terms of service, as of June 2014, give two conditions that
2641        may apply to use of search:
2642
2643            1. Don't access [Google Services] using a method other than
2644               the interface and the instructions that [they] provide.
2645            2. Don't remove, obscure, or alter any legal notices
2646               displayed in or along with [Google] Services.
2647
2648        Both of those issues should be managed by the package 'google',
2649        however Pywikibot will at least ensure the user sees the TOS
2650        in order to comply with the second condition.
2651        """
2652        try:
2653            import google
2654        except ImportError:
2655            pywikibot.error('ERROR: generator GoogleSearchPageGenerator '
2656                            "depends on package 'google'.\n"
2657                            'To install, please run: pip install google.')
2658            sys.exit(1)
2659        pywikibot.warning('Please read http://www.google.com/accounts/TOS')
2660        yield from google.search(query)
2661
2662    def __iter__(self):
2663        """Iterate results."""
2664        # restrict query to local site
2665        localQuery = '{} site:{}'.format(self.query, self.site.hostname())
2666        base = 'http://{}{}'.format(self.site.hostname(),
2667                                    self.site.article_path)
2668        for url in self.queryGoogle(localQuery):
2669            if url[:len(base)] == base:
2670                title = url[len(base):]
2671                page = pywikibot.Page(pywikibot.Link(title, self.site))
2672                # Google contains links in the format
2673                # https://de.wikipedia.org/wiki/en:Foobar
2674                if page.site == self.site:
2675                    yield page
2676
2677
2678def MySQLPageGenerator(query, site=None, verbose=None):
2679    """
2680    Yield pages based on a MySQL query.
2681
2682    The query should return two columns, page namespace and page title pairs
2683    from some table. An example query that yields all ns0 pages might look
2684    like::
2685
2686        SELECT
2687         page_namespace,
2688         page_title
2689        FROM page
2690        WHERE page_namespace = 0;
2691
2692    See https://www.mediawiki.org/wiki/Manual:Pywikibot/MySQL
2693
2694    :param query: MySQL query to execute
2695    :param site: Site object
2696    :type site: :py:obj:`pywikibot.site.BaseSite`
2697    :param verbose: if True, print query to be executed;
2698        if None, config.verbose_output will be used.
2699    :type verbose: None or bool
2700    :return: generator which yields pywikibot.Page
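
    A usage sketch, assuming database access is configured (the query is
    hypothetical)::

        query = 'SELECT page_namespace, page_title FROM page LIMIT 10'
        for page in MySQLPageGenerator(query):
            pywikibot.output(page.title())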
2701    """
    from pywikibot.data import mysql

    if site is None:
        site = pywikibot.Site()

    row_gen = mysql.mysql_query(query,
                                dbname=site.dbName(),
                                verbose=verbose)

    for row in row_gen:
        namespace_number, page_name = row
        # titles are returned as bytes; decode with the site's encoding
        page_name = page_name.decode(site.encoding())
        page = pywikibot.Page(site, page_name, ns=int(namespace_number))
        yield page


class XMLDumpOldPageGenerator(Iterator):

    """
    XML generator that yields Page objects with the old text loaded.

    :param filename: filename of the XML dump
    :type filename: str
    :param start: skip entries with a title lexicographically below
        this value
    :type start: str or None
    :param namespaces: namespace filter
    :type namespaces: iterable of str or Namespace key,
        or a single instance of those types
    :param site: current site for the generator
    :type site: pywikibot.Site or None
    :param text_predicate: a callable which takes entry.text as
        parameter and returns a boolean indicating whether the
        generator should yield the page
    :type text_predicate: callable or None

    :ivar text_predicate: holds the text_predicate function
    :ivar skipping: True if the start parameter is given, else False
    :ivar start: holds the start parameter
    :ivar namespaces: holds the namespaces filter
    :ivar parser: holds the xmlreader.XmlDump parse method
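
    A minimal usage sketch (the dump filename is illustrative)::

        gen = XMLDumpOldPageGenerator('enwiki-latest-pages.xml',
                                      namespaces=[0])
        for page in gen:
            ...  # page.text holds the text taken from the dump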
    """

    @deprecated_args(xmlFilename='filename')
    def __init__(self, filename: str, start: Optional[str] = None,
                 namespaces=None, site=None,
                 text_predicate=None):
        """Initializer."""
        self.text_predicate = text_predicate

        self.skipping = bool(start)
        if self.skipping:
            self.start = start.replace('_', ' ')
        else:
            self.start = None

        self.site = site or pywikibot.Site()
        if not namespaces:
            self.namespaces = self.site.namespaces
        else:
            self.namespaces = self.site.namespaces.resolve(namespaces)

        dump = xmlreader.XmlDump(filename)
        self.parser = dump.parse()

    def __next__(self):
        """Get next Page."""
        while True:
            entry = next(self.parser)
            if self.skipping:
                # skip entries until the start title is reached
                if entry.title < self.start:
                    continue
                self.skipping = False
            page = pywikibot.Page(self.site, entry.title)
            if page.namespace() not in self.namespaces:
                continue
            if not self.text_predicate or self.text_predicate(entry.text):
                page.text = entry.text
                return page


class XMLDumpPageGenerator(XMLDumpOldPageGenerator):

    """XML generator that yields Page objects without text loaded."""

    def __next__(self):
        """Get next Page from the dump and remove the text."""
        page = super().__next__()
        del page.text
        return page


def YearPageGenerator(start=1, end=2050, site=None):
    """
    Year page generator.

    :param site: Site for generator results.
    :type site: :py:obj:`pywikibot.site.BaseSite`
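
    A minimal usage sketch (titles are formatted for the site's
    language by pywikibot.date.formatYear)::

        for page in YearPageGenerator(start=1990, end=1995):
            print(page.title())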
    """
    if site is None:
        site = pywikibot.Site()
    pywikibot.output('Starting with year {}'.format(start))
    for i in range(start, end + 1):
        if i % 100 == 0:
            pywikibot.output('Preparing {}...'.format(i))
        # There is no year 0
        if i != 0:
            current_year = date.formatYear(site.lang, i)
            yield pywikibot.Page(pywikibot.Link(current_year, site))


@deprecated_args(startMonth='start_month', endMonth='end_month')
def DayPageGenerator(start_month: int = 1, end_month: int = 12,
                     site=None, year: int = 2000):
    """
    Day page generator.

    :param site: Site for generator results.
    :type site: :py:obj:`pywikibot.site.BaseSite`
    :param year: the year used to determine the number of days in each
        month; relevant for leap years
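
    A minimal usage sketch (titles such as 'January 1' depend on the
    site's language)::

        for page in DayPageGenerator(start_month=2, end_month=2):
            print(page.title())  # 29 pages, as 2000 is a leap year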
    """
    if site is None:
        site = pywikibot.Site()
    lang = site.lang
    firstPage = pywikibot.Page(site, date.format_date(start_month, 1, lang))
    pywikibot.output('Starting with {}'.format(firstPage.title(as_link=True)))
    for month in range(start_month, end_month + 1):
        for day in range(1, calendar.monthrange(year, month)[1] + 1):
            yield pywikibot.Page(
                pywikibot.Link(date.format_date(month, day, lang), site))


def WikidataPageFromItemGenerator(gen, site):
    """Generate pages from site based on sitelinks of item pages.

    :param gen: generator of :py:obj:`pywikibot.ItemPage`
    :param site: Site for generator results.
    :type site: :py:obj:`pywikibot.site.BaseSite`

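    A minimal usage sketch (the item is illustrative)::

        repo = site.data_repository()
        items = [pywikibot.ItemPage(repo, 'Q42')]
        for page in WikidataPageFromItemGenerator(items, site):
            print(page.title())
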
    """
    repo = site.data_repository()
    # request sitelinks in batches of 50 items
    for sublist in itergroup(gen, 50):
        req = {'ids': [item.id for item in sublist],
               'sitefilter': site.dbName(),
               'action': 'wbgetentities',
               'props': 'sitelinks'}

        wbrequest = repo._simple_request(**req)
        wbdata = wbrequest.submit()
        entities = (item for item in wbdata['entities'].values() if
                    'sitelinks' in item and site.dbName() in item['sitelinks'])
        sitelinks = (item['sitelinks'][site.dbName()]['title']
                     for item in entities)
        for sitelink in sitelinks:
            yield pywikibot.Page(site, sitelink)


def WikidataSPARQLPageGenerator(query,
                                site=None, item_name: str = 'item',
                                endpoint: Optional[str] = None,
                                entity_url: Optional[str] = None,
                                result_type=set):
    """Generate pages that result from the given SPARQL query.

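    A minimal usage sketch (run against the Wikidata repository; the
    query is illustrative)::

        query = 'SELECT ?item WHERE { ?item wdt:P31 wd:Q146 } LIMIT 5'
        site = pywikibot.Site('wikidata', 'wikidata')
        for item in WikidataSPARQLPageGenerator(query, site=site):
            print(item.id)
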
    :param query: the SPARQL query string.
    :param site: Site for generator results.
    :type site: :py:obj:`pywikibot.site.BaseSite`
    :param item_name: name of the item in the SPARQL query
    :param endpoint: SPARQL endpoint URL
    :param entity_url: URL prefix for any entities returned in a query.
    :param result_type: type of the iterable in which
        SPARQL results are stored (default set)
    :type result_type: iterable

    """
    from pywikibot.data import sparql

    if site is None:
        site = pywikibot.Site()
    repo = site.data_repository()
    dependencies = {'endpoint': endpoint, 'entity_url': entity_url}
    if not endpoint or not entity_url:
        dependencies['repo'] = repo
    query_object = sparql.SparqlQuery(**dependencies)
    data = query_object.get_items(query,
                                  item_name=item_name,
                                  result_type=result_type)
    entities = (repo.get_entity_for_entity_id(entity) for entity in data)
    # on a data repository, yield the entities themselves
    if isinstance(site, pywikibot.site.DataSite):
        return entities

    return WikidataPageFromItemGenerator(entities, site)


def WikibaseSearchItemPageGenerator(text: str,
                                    language: Optional[str] = None,
                                    total: Optional[int] = None, site=None):
    """
    Generate item pages that match the provided text.

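    A minimal usage sketch (the search term is illustrative)::

        for item in WikibaseSearchItemPageGenerator('Douglas Adams',
                                                    language='en',
                                                    total=5):
            print(item.id)
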
    :param text: Text to look for.
    :param language: Code of the language to search in. If not specified,
        value from pywikibot.config.data_lang is used.
    :param total: Maximum number of pages to retrieve in total, or None in
        case of no limit.
    :param site: Site for generator results.
    :type site: :py:obj:`pywikibot.site.BaseSite`
    """
    if site is None:
        site = pywikibot.Site()
    if language is None:
        language = site.lang
    repo = site.data_repository()

    data = repo.search_entities(text, language, total=total)
    return (pywikibot.ItemPage(repo, item['id']) for item in data)


class PetScanPageGenerator:
    """Queries PetScan (https://petscan.wmflabs.org/) to generate pages."""

    def __init__(self, categories, subset_combination=True, namespaces=None,
                 site=None, extra_options=None):
        """
        Initializer.

        :param categories: List of categories to retrieve pages from
            (as strings)
        :param subset_combination: Combination mode.
            If True, returns the intersection of the results of the
            categories, else returns the union of the results of the
            categories
        :param namespaces: List of namespaces to search in
            (default is None, meaning all namespaces)
        :param site: Site to operate on
            (default is the default site from the user config)
        :param extra_options: Dictionary of extra options to use (optional)
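
        A minimal usage sketch (hypothetical category names)::

            gen = PetScanPageGenerator(['Physics', 'Mathematics'],
                                       subset_combination=False)
            for page in gen:
                print(page.title())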
        """
        if site is None:
            site = pywikibot.Site()

        self.site = site
        self.opts = self.buildQuery(categories, subset_combination,
                                    namespaces, extra_options)

    def buildQuery(self, categories, subset_combination, namespaces,
                   extra_options):
        """
        Get the querystring options to query PetScan.

        :param categories: List of categories (as strings)
        :param subset_combination: Combination mode.
            If True, returns the intersection of the results of the
            categories, else returns the union of the results of the
            categories
        :param namespaces: List of namespaces to search in
        :param extra_options: Dictionary of extra options to use
        :return: Dictionary of querystring parameters to use in the query
        """
        extra_options = extra_options or {}

        query = {
            'language': self.site.code,
            # project name, e.g. 'wikipedia' from 'en.wikipedia.org'
            'project': self.site.hostname().split('.')[-2],
            'combination': 'subset' if subset_combination else 'union',
            'categories': '\r\n'.join(categories),
            'format': 'json',
            'doit': ''
        }

        if namespaces:
            for namespace in namespaces:
                query['ns[{}]'.format(int(namespace))] = 1

        query_final = query.copy()
        query_final.update(extra_options)

        return query_final

    def query(self):
        """Query PetScan."""
        url = 'https://petscan.wmflabs.org'

        try:
            req = http.fetch(url, params=self.opts)
        except ReadTimeout:
            raise ServerError('received ReadTimeout from {}'.format(url))

        # treat any 5xx response as a server error
        server_err = HTTPStatus.INTERNAL_SERVER_ERROR
        if server_err <= req.status_code < server_err + 100:
            raise ServerError(
                'received {} status from {}'.format(req.status_code, req.url))

        j = json.loads(req.text)
        raw_pages = j['*'][0]['a']['*']
        yield from raw_pages

    def __iter__(self):
        """Iterate the PetScan results as Page objects."""
        for raw_page in self.query():
            page = pywikibot.Page(self.site, raw_page['title'],
                                  int(raw_page['namespace']))
            yield page


DuplicateFilterPageGenerator = redirect_func(
    filter_unique, old_name='DuplicateFilterPageGenerator', since='20180715')
PreloadingItemGenerator = redirect_func(PreloadingEntityGenerator,
                                        old_name='PreloadingItemGenerator',
                                        since='20170314')
TextfilePageGenerator = redirect_func(
    TextIOPageGenerator, old_name='TextfilePageGenerator', since='20210611')

if __name__ == '__main__':  # pragma: no cover
    pywikibot.output('Pagegenerators cannot be run as script - are you '
                     'looking for listpages.py?')