1#!/usr/bin/python
2"""
3This module can do slight modifications to tidy a wiki page's source code.
4
5The changes are not supposed to change the look of the rendered wiki page.
6
If you wish to run this as a stand-alone script, use::
8
9    scripts/cosmetic_changes.py
10
11For regular use, it is recommended to put this line into your user-config.py::
12
13    cosmetic_changes = True
14
15You may enable cosmetic changes for additional languages by adding the
16dictionary cosmetic_changes_enable to your user-config.py. It should contain
17a tuple of languages for each site where you wish to enable in addition to
your own language if cosmetic_changes_mylang_only is True (see below).
19Please set your dictionary by adding such lines to your user-config.py::
20
21    cosmetic_changes_enable['wikipedia'] = ('de', 'en', 'fr')
22
23There is another config variable: You can set::
24
25    cosmetic_changes_mylang_only = False
26
27if you're running a bot on multiple sites and want to do cosmetic changes on
28all of them, but be careful if you do.
29
You may disable cosmetic changes by adding all unwanted languages to the
31dictionary cosmetic_changes_disable in your user-config.py. It should contain
32a tuple of languages for each site where you wish to disable cosmetic changes.
You may use it when cosmetic_changes_mylang_only is False, but you can also
34disable your own language. This also overrides the settings in the dictionary
35cosmetic_changes_enable. Please set this dictionary by adding such lines to
36your user-config.py::
37
38    cosmetic_changes_disable['wikipedia'] = ('de', 'en', 'fr')
39
You may disable cosmetic changes for a given script by appending all
41unwanted scripts to the list cosmetic_changes_deny_script in your
42user-config.py. By default it contains cosmetic_changes.py itself and touch.py.
43This overrides all other enabling settings for cosmetic changes. Please modify
44the given list by adding such lines to your user-config.py::
45
46    cosmetic_changes_deny_script.append('your_script_name_1')
47
48or by adding a list to the given one::
49
50    cosmetic_changes_deny_script += ['your_script_name_1',
51                                     'your_script_name_2']
52"""
53#
54# (C) Pywikibot team, 2006-2021
55#
56# Distributed under the terms of the MIT license.
57#
58import re
59
60from enum import IntEnum
61from typing import Any, Optional, Union
62
63import pywikibot
64from pywikibot import textlib
65from pywikibot.backports import Callable, Dict, List, Match, Pattern
66from pywikibot.exceptions import InvalidTitleError
67from pywikibot.textlib import (
68    FILE_LINK_REGEX,
69    MultiTemplateMatchBuilder,
70    _get_regexes,
71)
72from pywikibot.tools import (
73    deprecated,
74    deprecated_args,
75    first_lower,
76    first_upper,
77    issue_deprecation_warning,
78    ModuleDeprecationWrapper,
79)
80from pywikibot.tools.chars import url2string
81
82
83try:
84    import stdnum.isbn as stdnum_isbn
85except ImportError:
86    stdnum_isbn = None
87
88
# Documentation subpage templates, keyed by language code.
# Each value is a 2-tuple:
#   1. the template name, either a single string or a list of strings;
#      template names must be in lower case,
#   2. the subpage suffix (or None where no suffix is known, see 'fi');
#      the subpage itself is case sensitive.
# This is also used by interwiki.py
# TODO: Maybe move it to family file and implement global instances
moved_links = {
    'ar': (['documentation', 'template documentation', 'شرح', 'توثيق'],
           '/doc'),
    'bn': ('documentation', '/doc'),
    'ca': ('ús de la plantilla', '/ús'),
    'cs': ('dokumentace', '/doc'),
    'da': ('dokumentation', '/doc'),
    'de': ('dokumentation', '/Meta'),
    'dsb': (['dokumentacija', 'doc'], '/Dokumentacija'),
    'en': (['documentation', 'template documentation', 'template doc',
            'doc', 'documentation, template'], '/doc'),
    'es': (['documentación', 'documentación de plantilla'], '/doc'),
    'eu': ('txantiloi dokumentazioa', '/dok'),
    'fa': (['documentation', 'template documentation', 'template doc',
            'doc', 'توضیحات', 'زیرصفحه توضیحات'], '/doc'),
    # fi: no idea how to handle this type of subpage at :Metasivu:
    'fi': ('mallineohje', None),
    'fr': (['/documentation', 'documentation', 'doc_modèle',
            'documentation modèle', 'documentation modèle compliqué',
            'documentation modèle en sous-page',
            'documentation modèle compliqué en sous-page',
            'documentation modèle utilisant les parserfunctions en sous-page',
            ],
           '/Documentation'),
    'hsb': (['dokumentacija', 'doc'], '/Dokumentacija'),
    'hu': ('sablondokumentáció', '/doc'),
    'id': ('template doc', '/doc'),
    'ilo': ('documentation', '/doc'),
    'ja': ('documentation', '/doc'),
    'ka': ('თარგის ინფო', '/ინფო'),
    'ko': ('documentation', '/설명문서'),
    'ms': ('documentation', '/doc'),
    'no': ('dokumentasjon', '/dok'),
    'nn': ('dokumentasjon', '/dok'),
    'pl': ('dokumentacja', '/opis'),
    'pt': (['documentação', '/doc'], '/doc'),
    'ro': ('documentaţie', '/doc'),
    'ru': ('doc', '/doc'),
    'simple': (['documentation',
                'template documentation',
                'template doc',
                'doc',
                'documentation, template'], '/doc'),
    'sk': ('dokumentácia', '/Dokumentácia'),
    'sv': ('dokumentation', '/dok'),
    'uk': (['документація', 'doc', 'documentation'], '/Документація'),
    'ur': (['دستاویز', 'توثيق', 'شرح', 'توضیحات',
            'documentation', 'template doc', 'doc',
            'documentation, template'], '/doc'),
    'vi': ('documentation', '/doc'),
    'zh': (['documentation', 'doc'], '/doc'),
}
145
# Templates which should be replaced or removed.
# Use a tuple with two entries. The first entry will be replaced by the
# second.
# Examples:
# For removing {{Foo}}, the tuple must be:
#           ('Foo', None),
#
# The following also works:
#           ('Foo', ''),
#
# For replacing {{Foo}} with {{Bar}} the tuple must be:
#           ('Foo', 'Bar'),
#
# This also removes all template parameters of {{Foo}}
# For replacing {{Foo}} with {{Bar}} but keeping the template
# parameters in their original order, please use:
#           ('Foo', 'Bar\\g<parameters>'),
#
# The dictionary is nested two levels deep; each innermost value is a list
# of such tuples.
# NOTE(review): the outer key looks like a family name and the inner key
# like a language code - confirm against the code consuming this table.

deprecatedTemplates = {
    'wikipedia': {
        'de': [
            ('Belege', 'Belege fehlen\\g<parameters>'),
            ('Quelle', 'Belege fehlen\\g<parameters>'),
            ('Quellen', 'Belege fehlen\\g<parameters>'),
            ('Quellen fehlen', 'Belege fehlen\\g<parameters>'),
        ],
        'ur': [
            ('Infobox former country',
             'خانہ معلومات سابقہ ملک\\g<parameters>'),
            ('Infobox Former Country',
             'خانہ معلومات سابقہ ملک\\g<parameters>'),
        ],
    }
}
179
180
class CANCEL(IntEnum):

    """Cancel level to ignore exceptions.

    Selects how much work is given up when an error occurs: the whole
    page, only the failing method, or only a single match. With ALL
    nothing is ignored and the exception is raised.

    .. versionadded:: 6.3
    """

    ALL = 0  # ignore nothing: the exception is raised
    PAGE = 1  # on error skip the page
    METHOD = 2  # on error skip only the failing method
    MATCH = 3  # on error skip only a single match
195
196
def _format_isbn_match(match: Match[str], strict: bool = True) -> str:
    """Validate one matched ISBN and return it in formatted form.

    :param match: regex match providing the ISBN in its 'code' group
    :param strict: if True, a validation error is re-raised; otherwise
        it is logged and the ISBN is returned unchanged
    :raises NotImplementedError: the stdnum package is not available
    """
    if not stdnum_isbn:
        raise NotImplementedError(
            'ISBN functionality not available. Install stdnum package.')

    code = match.group('code')
    try:
        stdnum_isbn.validate(code)
    except stdnum_isbn.ValidationError as error:
        if not strict:
            # lenient mode: report the problem and keep the ISBN as it is
            pywikibot.log(
                'ISBN "{}" validation error: {}'.format(code, error))
            return code
        raise

    return stdnum_isbn.format(code)
213
214
def _reformat_ISBNs(text: str, strict: bool = True) -> str:
    """Normalise the ISBNs found in text.

    Each ISBN match is handed to _format_isbn_match with the given
    strict setting.

    :param text: text to process
    :param strict: passed on to _format_isbn_match
    :raises Exception: Invalid ISBN encountered when strict enabled
    """
    def format_match(match):
        return _format_isbn_match(match, strict=strict)

    return textlib.reformat_ISBNs(text, format_match)
222
223
224class CosmeticChangesToolkit:
225
226    """Cosmetic changes toolkit."""
227
228    @deprecated_args(redirect=True, diff='show_diff', site='page')
229    def __init__(self, page: 'pywikibot.page.BasePage', *,
230                 show_diff: bool = False,
231                 namespace: Optional[int] = None,
232                 pageTitle: Optional[str] = None,
233                 ignore: IntEnum = CANCEL.ALL) -> None:
234        """Initializer.
235
236        :param page: the Page object containing the text to be modified
237        :param show_diff: show difference after replacements
238        :param namespace: DEPRECATED namespace parameter
239        :param pageTitle: DEPRECATED page title parameter
240        :param ignore: ignores if an error occurred and either skips the page
241            or only that method. It can be set one of the CANCEL constants
242        """
243        if isinstance(page, pywikibot.BaseSite):
244            self.site = page
245            self.title = pageTitle
246
247            class_name = type(self).__name__
248            if self.title is None:
249                raise ValueError('Page title required for ' + class_name)
250
251            try:
252                self.namespace = self.site.namespaces.resolve(namespace).pop(0)
253            except (KeyError, TypeError, IndexError):
254                raise ValueError('{} needs a valid namespace'
255                                 .format(class_name))
256            issue_deprecation_warning(
257                'site parameter of ' + class_name,
258                'a pywikibot.Page object as first parameter',
259                since='20201102')
260        else:
261            if namespace is not None or pageTitle is not None:
262                raise TypeError(
263                    "'namespace' and 'pageTitle' arguments are invalid with "
264                    'a given Page object')
265            self.site = page.site
266            self.title = page.title()
267            self.namespace = page.namespace()
268
269        self.show_diff = show_diff
270        self.template = (self.namespace == 10)
271        self.talkpage = self.namespace >= 0 and self.namespace % 2 == 1
272        self.ignore = ignore
273
274        self.common_methods = [
275            self.commonsfiledesc,
276            self.fixSelfInterwiki,
277            self.standardizePageFooter,
278            self.fixSyntaxSave,
279            self.cleanUpLinks,
280            self.cleanUpSectionHeaders,
281            self.putSpacesInLists,
282            self.translateAndCapitalizeNamespaces,
283            self.translateMagicWords,
284            self.replaceDeprecatedTemplates,
285            self.resolveHtmlEntities,
286            self.removeEmptySections,
287            self.removeUselessSpaces,
288            self.removeNonBreakingSpaceBeforePercent,
289
290            self.fixHtml,
291            self.fixReferences,
292            self.fixStyle,
293            self.fixTypo,
294
295            self.fixArabicLetters,
296        ]
297        if stdnum_isbn:
298            self.common_methods.append(self.fix_ISBN)
299
    @property  # type: ignore[misc]
    @deprecated('show_diff', since='20200415')
    def diff(self) -> bool:
        """CosmeticChangesToolkit.diff attribute getter.

        Deprecated alias of the show_diff attribute.

        :return: the current value of show_diff
        """
        return self.show_diff

    @diff.setter  # type: ignore[misc]
    @deprecated('show_diff', since='20200415')
    def diff(self, value: bool) -> None:
        """CosmeticChangesToolkit.diff attribute setter.

        Deprecated alias of the show_diff attribute.

        :param value: new setting; it is converted to bool before it is
            stored in show_diff
        """
        self.show_diff = bool(value)
311
    @classmethod
    @deprecated('CosmeticChangesToolkit with pywikibot.Page object',
                since='20200415')
    @deprecated_args(diff='show_diff')
    def from_page(cls, page: 'pywikibot.page.BasePage',
                  show_diff: bool = False,
                  ignore: IntEnum = CANCEL.ALL) -> 'CosmeticChangesToolkit':
        """Create toolkit based on the page.

        Deprecated: this only forwards its arguments to the class
        initializer.

        :param page: the Page object containing the text to be modified
        :param show_diff: show difference after replacements
        :param ignore: one of the CANCEL constants, passed on unchanged
        :return: a new toolkit instance for the page
        """
        return cls(page, show_diff=show_diff, ignore=ignore)
321
322    def safe_execute(self, method: Callable[[str], str], text: str) -> str:
323        """Execute the method and catch exceptions if enabled."""
324        result = None
325        try:
326            result = method(text)
327        except Exception as e:
328            if self.ignore == CANCEL.METHOD:
329                pywikibot.warning('Unable to perform "{}" on "{}"!'
330                                  .format(method.__name__, self.title))
331                pywikibot.exception(e)
332            else:
333                raise
334        return text if result is None else result
335
336    def _change(self, text: str) -> str:
337        """Execute all clean up methods."""
338        for method in self.common_methods:
339            text = self.safe_execute(method, text)
340        return text
341
342    def change(self, text: str) -> Union[bool, str]:
343        """Execute all clean up methods and catch errors if activated."""
344        try:
345            new_text = self._change(text)
346        except Exception as e:
347            if self.ignore == CANCEL.PAGE:
348                pywikibot.warning('Skipped "{}", because an error occurred.'
349                                  .format(self.title))
350                pywikibot.exception(e)
351                return False
352            raise
353        else:
354            if self.show_diff:
355                pywikibot.showDiff(text, new_text)
356            return new_text
357
358    def fixSelfInterwiki(self, text: str) -> str:
359        """
360        Interwiki links to the site itself are displayed like local links.
361
362        Remove their language code prefix.
363        """
364        if not self.talkpage and pywikibot.calledModuleName() != 'interwiki':
365            interwikiR = re.compile(r'\[\[(?: *:)? *{} *: *([^\[\]\n]*)\]\]'
366                                    .format(self.site.code))
367            text = interwikiR.sub(r'[[\1]]', text)
368        return text
369
370    def standardizePageFooter(self, text: str) -> str:
371        """
372        Standardize page footer.
373
374        Makes sure that interwiki links and categories are put
375        into the correct position and into the right order. This
376        combines the old instances of standardizeInterwiki
377        and standardizeCategories.
378
379        The page footer consists of the following parts
380        in that sequence:
381        1. categories
382        2. additional information depending on the local site policy
383        3. interwiki
384        """
385        assert self.title is not None
386
387        categories = []
388        interwiki_links = {}
389
390        # get categories
391        if not self.template:
392            categories = textlib.getCategoryLinks(text, site=self.site)
393
394        if not self.talkpage:
395            subpage = False
396            if self.template:
397                try:
398                    tmpl, loc = moved_links[self.site.code]
399                    del tmpl
400                except KeyError:
401                    loc = None
402                if loc is not None and loc in self.title:
403                    subpage = True
404
405            # get interwiki
406            interwiki_links = textlib.getLanguageLinks(
407                text, insite=self.site, template_subpage=subpage)
408
409            # remove interwiki
410            text = textlib.removeLanguageLinks(text, site=self.site)
411
412        # add categories, main to top
413        if categories:
414            # TODO: Sort categories in alphabetic order, e.g. using
415            # categories.sort()? (T100265)
416            # TODO: Get main categories from Wikidata?
417            main = pywikibot.Category(self.site, 'Category:' + self.title,
418                                      sort_key=' ')
419            if main in categories:
420                categories.pop(categories.index(main))
421                categories.insert(0, main)
422            text = textlib.replaceCategoryLinks(text, categories,
423                                                site=self.site)
424
425        # add interwiki
426        if interwiki_links:
427            text = textlib.replaceLanguageLinks(text, interwiki_links,
428                                                site=self.site,
429                                                template=self.template,
430                                                template_subpage=subpage)
431
432        return text
433
434    def translateAndCapitalizeNamespaces(self, text: str) -> str:
435        """Use localized namespace names."""
436        # arz uses English stylish codes
437        if self.site.sitename == 'wikipedia:arz':
438            return text
439        # wiki links aren't parsed here.
440        exceptions = ['nowiki', 'comment', 'math', 'pre']
441
442        for namespace in self.site.namespaces.values():
443            if namespace == 0:
444                # skip main (article) namespace
445                continue
446            # a clone is needed. Won't change the namespace dict
447            namespaces = list(namespace)
448            if namespace == 6 and self.site.family.name == 'wikipedia':
449                if self.site.code in ('en', 'fr'):
450                    # do not change "Image" on en-wiki and fr-wiki
451                    assert 'Image' in namespaces
452                    namespaces.remove('Image')
453                if self.site.code == 'hu':
454                    # do not change "Kép" on hu-wiki
455                    assert 'Kép' in namespaces
456                    namespaces.remove('Kép')
457                elif self.site.code == 'pt':
458                    # use "Imagem" by default on pt-wiki (per T57242)
459                    assert 'Imagem' in namespaces
460                    namespaces.insert(
461                        0, namespaces.pop(namespaces.index('Imagem')))
462            # final namespace variant
463            final_ns = namespaces.pop(0)
464            if namespace in (2, 3):
465                # skip localized user namespace, maybe gender is used
466                namespaces = ['User' if namespace == 2 else 'User talk']
467            # lowerspaced and underscored namespaces
468            for i, item in enumerate(namespaces):
469                item = item.replace(' ', '[ _]')
470                item = '[{}{}]'.format(item[0], item[0].lower()) + item[1:]
471                namespaces[i] = item
472            namespaces.append(first_lower(final_ns))
473            if final_ns and namespaces:
474                if self.site.sitename == 'wikipedia:pt' and namespace == 6:
475                    # only change on these file extensions (per T57242)
476                    extensions = ('png', 'gif', 'jpg', 'jpeg', 'svg', 'tiff',
477                                  'tif')
478                    text = textlib.replaceExcept(
479                        text,
480                        r'\[\[\s*({}) *:(?P<name>[^\|\]]*?\.({}))'
481                        r'(?P<label>.*?)\]\]'
482                        .format('|'.join(namespaces), '|'.join(extensions)),
483                        r'[[{}:\g<name>\g<label>]]'.format(final_ns),
484                        exceptions)
485                else:
486                    text = textlib.replaceExcept(
487                        text,
488                        r'\[\[\s*({}) *:(?P<nameAndLabel>.*?)\]\]'
489                        .format('|'.join(namespaces)),
490                        r'[[{}:\g<nameAndLabel>]]'.format(final_ns),
491                        exceptions)
492        return text
493
494    def translateMagicWords(self, text: str) -> str:
495        """Use localized magic words."""
496        # not wanted at ru
497        # arz uses English stylish codes
498        # no need to run on English wikis
499        if self.site.code in ['arz', 'en', 'ru']:
500            return text
501
502        def init_cache() -> None:
503            for magicword in ('img_thumbnail', 'img_left', 'img_center',
504                              'img_right', 'img_none', 'img_framed',
505                              'img_frameless', 'img_border', 'img_upright',
506                              'img_baseline', 'img_sub', 'img_super',
507                              'img_top', 'img_text_top', 'img_middle',
508                              'img_bottom', 'img_text_bottom'):
509                aliases = self.site.getmagicwords(magicword)
510                if len(aliases) > 1:
511                    cache.update((alias, aliases[0]) for alias in aliases[1:]
512                                 if '$1' not in alias)
513            if not cache:
514                cache[False] = True  # signal there is nothing to replace
515
516        def replace_magicword(match: Match[str]) -> str:
517            if cache.get(False):
518                return match.group()
519            split = match.group().split('|')
520            if len(split) == 1:
521                return match.group()
522
523            if not cache:
524                init_cache()
525
526            # push ']]' out and re-add below
527            split[-1] = split[-1][:-2]
528            return '{}|{}]]'.format(
529                split[0], '|'.join(cache.get(x.strip(), x) for x in split[1:]))
530
531        cache = {}  # type: Dict[Union[bool, str], Any]
532        exceptions = ['comment', 'nowiki', 'pre', 'syntaxhighlight']
533        regex = re.compile(
534            FILE_LINK_REGEX % '|'.join(self.site.namespaces[6]),
535            flags=re.X)
536        return textlib.replaceExcept(
537            text, regex, replace_magicword, exceptions)
538
    def cleanUpLinks(self, text: str) -> str:
        """Tidy up wikilinks found in a string.

        This function will:
        * Replace underscores with spaces

        * Move leading and trailing spaces out of the wikilink and into the
          surrounding text

        * Convert URL-encoded characters into Unicode-encoded characters

        * Move trailing characters out of the link and make the link without
          using a pipe, if possible

        * Capitalize the article title of the link, if appropriate

        Interwiki links and links to pages outside the main namespace are
        returned unchanged. Links inside comment, math, nowiki, pre and
        startspace are not processed.

        :param text: string to perform the clean-up on
        :return: text with tidied wikilinks
        """
        # helper function which works on one link and either returns it
        # unmodified, or returns a replacement.
        def handleOneLink(match: Match[str]) -> str:
            titleWithSection = match.group('titleWithSection')
            label = match.group('label')
            trailingChars = match.group('linktrail')
            newline = match.group('newline')

            # a title which cannot be checked is treated like an interwiki
            # link, i.e. it is left alone
            try:
                is_interwiki = self.site.isInterwikiLink(titleWithSection)
            except ValueError:  # T111513
                is_interwiki = True

            if is_interwiki:
                return match.group()

            # The link looks like this:
            # [[page_title|link_text]]trailing_chars
            # We only work on namespace 0 because pipes and linktrails work
            # differently for images and categories.
            page = pywikibot.Page(pywikibot.Link(titleWithSection, self.site))
            try:
                in_main_namespace = page.namespace() == 0
            except InvalidTitleError:
                in_main_namespace = False
            if not in_main_namespace:
                return match.group()

            # Replace underlines by spaces, also multiple underlines
            titleWithSection = re.sub('_+', ' ', titleWithSection)
            # Remove double spaces
            titleWithSection = re.sub('  +', ' ', titleWithSection)
            # Remove unnecessary leading spaces from title,
            # but remember if we did this because we eventually want
            # to re-add it outside of the link later.
            titleLength = len(titleWithSection)
            titleWithSection = titleWithSection.lstrip()
            hadLeadingSpaces = len(titleWithSection) != titleLength
            hadTrailingSpaces = False
            # Remove unnecessary trailing spaces from title,
            # but remember if we did this because it may affect
            # the linktrail and because we eventually want to
            # re-add it outside of the link later.
            if not trailingChars:
                titleLength = len(titleWithSection)
                titleWithSection = titleWithSection.rstrip()
                hadTrailingSpaces = len(titleWithSection) != titleLength

            # Convert URL-encoded characters to str
            titleWithSection = url2string(titleWithSection,
                                          encodings=self.site.encodings())

            if not titleWithSection:
                # just skip empty links.
                return match.group()

            # Remove unnecessary initial and final spaces from label.
            # Please note that some editors prefer spaces around pipes.
            # (See [[en:Wikipedia:Semi-bots]]). We remove them anyway.
            if label is not None:
                # Remove unnecessary leading spaces from label,
                # but remember if we did this because we want
                # to re-add it outside of the link later.
                # Note that this overrides the value found for the title.
                labelLength = len(label)
                label = label.lstrip()
                hadLeadingSpaces = len(label) != labelLength
                # Remove unnecessary trailing spaces from label,
                # but remember if we did this because it affects
                # the linktrail.
                if not trailingChars:
                    labelLength = len(label)
                    label = label.rstrip()
                    hadTrailingSpaces = len(label) != labelLength
            else:
                label = titleWithSection
            # the linktrail is displayed as part of the label
            if trailingChars:
                label += trailingChars

            # If the site treats the first letter case-insensitively,
            # compare title and label with a lowercased first letter.
            if self.site.siteinfo['case'] == 'first-letter':
                firstcase_title = first_lower(titleWithSection)
                firstcase_label = first_lower(label)
            else:
                firstcase_title = titleWithSection
                firstcase_label = label

            # label and title are the same: no pipe is needed
            if firstcase_label == firstcase_title:
                newLink = '[[{}]]'.format(label)
            # Check if we can create a link with trailing characters
            # instead of a pipelink
            elif (firstcase_label.startswith(firstcase_title)
                  and trailR.sub('', label[len(titleWithSection):]) == ''):
                newLink = '[[{}]]{}'.format(label[:len(titleWithSection)],
                                            label[len(titleWithSection):])

            else:
                # Try to capitalize the first letter of the title.
                # Not useful for languages that don't capitalize nouns.
                # TODO: Add a configuration variable for each site,
                # which determines if the link target is written in
                # uppercase
                if self.site.sitename == 'wikipedia:de':
                    titleWithSection = first_upper(titleWithSection)
                newLink = '[[{}|{}]]'.format(titleWithSection, label)
            # re-add spaces that were pulled out of the link.
            # Examples:
            #   text[[ title ]]text        -> text [[title]] text
            #   text[[ title | name ]]text -> text [[title|name]] text
            #   text[[ title |name]]text   -> text[[title|name]]text
            #   text[[title| name]]text    -> text [[title|name]]text
            if hadLeadingSpaces and not newline:
                newLink = ' ' + newLink
            if hadTrailingSpaces:
                newLink = newLink + ' '
            # the matched leading newlines are put back in front of the link
            if newline:
                newLink = newline + newLink
            return newLink

        # used by handleOneLink to test whether text is a pure linktrail
        trailR = re.compile(self.site.linktrail())
        # The regular expression which finds links. Results consist of four
        # groups:
        # group <newline> depends whether the links starts with a new line.
        # group <titleWithSection> is the page title and section, that is,
        # everything before | or ]. It'll include the # to make life easier
        # for us.
        # group <label> is the alternative link title between | and ].
        # group <linktrail> is the link trail after ]] which are part of the
        # word.
        # note that the definition of 'letter' varies from language to
        # language.
        linkR = re.compile(
            r'(?P<newline>[\n]*)\[\[(?P<titleWithSection>[^\]\|]+)'
            r'(\|(?P<label>[^\]\|]*))?\]\](?P<linktrail>'
            + self.site.linktrail() + ')')

        text = textlib.replaceExcept(text, linkR, handleOneLink,
                                     ['comment', 'math', 'nowiki', 'pre',
                                      'startspace'])
        return text
692
693    def resolveHtmlEntities(self, text: str) -> str:
694        """Replace HTML entities with string."""
695        ignore = [
696            38,     # Ampersand (&amp;)
697            39,     # Single quotation mark (&quot;) per T26093
698            60,     # Less than (&lt;)
699            62,     # Greater than (&gt;)
700            91,     # Opening square bracket ([)
701                    # - sometimes used intentionally inside links
702            93,     # Closing square bracket (])
703                    # - used intentionally inside links
704            124,    # Vertical bar (|)
705                    # - used intentionally in navigation bar templates on w:de
706            160,    # Non-breaking space (&nbsp;)
707                    # - not supported by Firefox textareas
708            173,    # Soft-hypen (&shy;) - enable editing
709            8206,   # Left-to-right mark (&ltr;)
710            8207,   # Right-to-left mark (&rtl;)
711        ]
712        if self.template:
713            ignore += [32]  # Space ( )
714            ignore += [58]  # Colon (:)
715        # TODO: T254350 - what other extension tags should be avoided?
716        # (graph, math, score, timeline, etc.)
717        text = pywikibot.html2unicode(
718            text, ignore=ignore, exceptions=['comment', 'syntaxhighlight'])
719        return text
720
721    def removeEmptySections(self, text: str) -> str:
722        """Cleanup empty sections."""
723        # userspace contains article stubs without nobots/in use templates
724        if self.namespace == 2:
725            return text
726
727        skippings = ['comment', 'category']
728        skip_regexes = _get_regexes(skippings, self.site)
729        # site defined templates
730        skip_templates = {
731            'cs': ('Pahýl[ _]část',),  # stub section
732        }
733        if self.site.code in skip_templates:
734            for template in skip_templates[self.site.code]:
735                skip_regexes.append(
736                    re.compile(r'\{\{\s*%s\s*\}\}' % template, re.I))
737        # empty lists
738        skip_regexes.append(re.compile(r'(?m)^[\*#] *$'))
739
740        # get stripped sections
741        stripped_text = textlib.removeLanguageLinks(text, self.site, '\n')
742        for reg in skip_regexes:
743            stripped_text = reg.sub(r'', stripped_text)
744        strip_sections = textlib.extract_sections(
745            stripped_text, self.site)[1]
746
747        # get proper sections
748        header, sections, footer = textlib.extract_sections(text, self.site)
749
750        # iterate stripped sections and create a new page body
751        new_body = []
752        for i, strip_section in enumerate(strip_sections):
753            current_heading = sections[i][0]
754            try:
755                next_heading = sections[i + 1][0]
756            except IndexError:
757                next_heading = ''
758            current_dep = (len(current_heading)
759                           - len(current_heading.lstrip('=')))
760            next_dep = len(next_heading) - len(next_heading.lstrip('='))
761            if strip_section[1].strip() or current_dep < next_dep:
762                new_body.extend(sections[i])
763        return header + ''.join(new_body) + footer
764
765    def removeUselessSpaces(self, text: str) -> str:
766        """Cleanup multiple or trailing spaces."""
767        exceptions = ['comment', 'math', 'nowiki', 'pre', 'syntaxhighlight',
768                      'startspace', 'table']
769        if self.site.sitename != 'wikipedia:cs':
770            exceptions.append('template')
771        text = textlib.replaceExcept(text, r'(?m)[\t ]+( |$)', r'\1',
772                                     exceptions, site=self.site)
773        return text
774
775    def removeNonBreakingSpaceBeforePercent(self, text: str) -> str:
776        """
777        Remove a non-breaking space between number and percent sign.
778
779        Newer MediaWiki versions automatically place a non-breaking space in
780        front of a percent sign, so it is no longer required to place it
781        manually.
782        """
783        text = textlib.replaceExcept(
784            text, r'(\d)&(?:nbsp|#160|#x[Aa]0);%', r'\1 %', ['timeline'])
785        return text
786
787    def cleanUpSectionHeaders(self, text: str) -> str:
788        """
789        Add a space between the equal signs and the section title.
790
791        Example::
792
793            ==Section title==
794
795        becomes::
796
797        == Section title ==
798
799        :NOTE: This space is recommended in the syntax help on the
800            English and German Wikipedias. It is not wanted on Lojban and
801            English Wiktionaries (T168399, T169064) and it might be that
802            it is not wanted on other wikis. If there are any complaints,
803            please file a bug report.
804        """
805        if self.site.sitename in ['wiktionary:jbo', 'wiktionary:en']:
806            return text
807        return textlib.replaceExcept(
808            text,
809            r'(?m)^(={1,6})[ \t]*(?P<title>.*[^\s=])[ \t]*\1[ \t]*\r?\n',
810            r'\1 \g<title> \1\n',
811            ['comment', 'math', 'nowiki', 'pre'])
812
813    def putSpacesInLists(self, text: str) -> str:
814        """
815        Add a space between the * or # and the text.
816
817        :NOTE: This space is recommended in the syntax help on the
818            English, German and French Wikipedias. It might be that it
819            is not wanted on other wikis. If there are any complaints,
820            please file a bug report.
821        """
822        if not self.template:
823            exceptions = ['comment', 'math', 'nowiki', 'pre',
824                          'syntaxhighlight', 'template', 'timeline',
825                          self.site.redirect_regex]
826            text = textlib.replaceExcept(
827                text,
828                r'(?m)'
829                r'^(?P<bullet>[:;]*(\*+|#+)[:;\*#]*)(?P<char>[^\s\*#:;].+?)',
830                r'\g<bullet> \g<char>',
831                exceptions)
832        return text
833
834    def replaceDeprecatedTemplates(self, text: str) -> str:
835        """Replace deprecated templates."""
836        exceptions = ['comment', 'math', 'nowiki', 'pre']
837        builder = MultiTemplateMatchBuilder(self.site)
838
839        if self.site.family.name in deprecatedTemplates \
840           and self.site.code in deprecatedTemplates[self.site.family.name]:
841            for template in deprecatedTemplates[
842                    self.site.family.name][self.site.code]:
843                old, new = template
844                if new is None:
845                    new = ''
846                else:
847                    new = '{{%s}}' % new
848
849                text = textlib.replaceExcept(
850                    text,
851                    builder.pattern(old),
852                    new, exceptions)
853
854        return text
855
856    # from fixes.py
857    def fixSyntaxSave(self, text: str) -> str:
858        """Convert weblinks to wikilink, fix link syntax."""
859        def replace_link(match: Match[str]) -> str:
860            """Create a string to replace a single link."""
861            replacement = '[['
862            if re.match(r'(?:' + '|'.join(list(self.site.namespaces[6])
863                        + list(self.site.namespaces[14])) + '):',
864                        match.group('link')):
865                replacement += ':'
866            replacement += match.group('link')
867            if match.group('title'):
868                replacement += '|' + match.group('title')
869            return replacement + ']]'
870
871        exceptions = ['comment', 'math', 'nowiki', 'pre', 'startspace',
872                      'syntaxhighlight']
873        # link to the wiki working on
874        # Only use suffixes for article paths
875        for suffix in self.site._interwiki_urls(True):
876            http_url = self.site.base_url(suffix, 'http')
877            if self.site.protocol() == 'http':
878                https_url = None
879            else:
880                https_url = self.site.base_url(suffix, 'https')
881            # compare strings without the protocol, if they are empty support
882            # also no prefix (//en.wikipedia.org/…)
883            if https_url is not None and http_url[4:] == https_url[5:]:
884                urls = ['(?:https?:)?' + re.escape(http_url[5:])]
885            else:
886                urls = [re.escape(url) for url in (http_url, https_url)
887                        if url is not None]
888            for url in urls:
889                # Only include links which don't include the separator as
890                # the wikilink won't support additional parameters
891                separator = '?'
892                if '?' in suffix:
893                    separator += '&'
894                # Match first a non space in the title to prevent that multiple
895                # spaces at the end without title will be matched by it
896                text = textlib.replaceExcept(
897                    text,
898                    r'\[\[?' + url + r'(?P<link>[^' + separator + r']+?)'
899                    r'(\s+(?P<title>[^\s].*?))?\s*\]\]?',
900                    replace_link, exceptions, site=self.site)
901        # external link in/starting with double brackets
902        text = textlib.replaceExcept(
903            text,
904            r'\[\[(?P<url>https?://[^\]]+?)\]\]?',
905            r'[\g<url>]', exceptions, site=self.site)
906        # external link and description separated by a pipe, with
907        # whitespace in front of the pipe, so that it is clear that
908        # the dash is not a legitimate part of the URL.
909        text = textlib.replaceExcept(
910            text,
911            r'\[(?P<url>https?://[^\|\] \r\n]+?) +\| *(?P<label>[^\|\]]+?)\]',
912            r'[\g<url> \g<label>]', exceptions)
913        # dash in external link, where the correct end of the URL can
914        # be detected from the file extension. It is very unlikely that
915        # this will cause mistakes.
916        extensions = [r'\.{}'.format(ext)
917                      for ext in ['pdf', 'html?', 'php', 'aspx?', 'jsp']]
918        text = textlib.replaceExcept(
919            text,
920            r'\[(?P<url>https?://[^\|\] ]+?(' + '|'.join(extensions) + r')) *'
921            r'\| *(?P<label>[^\|\]]+?)\]',
922            r'[\g<url> \g<label>]', exceptions)
923        return text
924
925    def fixHtml(self, text: str) -> str:
926        """Relace html markups with wikitext markups."""
927        def replace_header(match: Match[str]) -> str:
928            """Create a header string for replacing."""
929            depth = int(match.group(1))
930            return r'{0} {1} {0}'.format('=' * depth, match.group(2))
931
932        # Everything case-insensitive (?i)
933        # Keep in mind that MediaWiki automatically converts <br> to <br />
934        exceptions = ['comment', 'math', 'nowiki', 'pre', 'startspace',
935                      'syntaxhighlight']
936        text = textlib.replaceExcept(text, r'(?i)<(b|strong)>(.*?)</\1>',
937                                     r"'''\2'''", exceptions, site=self.site)
938        text = textlib.replaceExcept(text, r'(?i)<(i|em)>(.*?)</\1>',
939                                     r"''\2''", exceptions, site=self.site)
940        # horizontal line without attributes in a single line
941        text = textlib.replaceExcept(text, r'(?i)([\r\n])<hr[ /]*>([\r\n])',
942                                     r'\1----\2', exceptions)
943        # horizontal line with attributes; can't be done with wiki syntax
944        # so we only make it XHTML compliant
945        text = textlib.replaceExcept(text, r'(?i)<hr ([^>/]+?)>',
946                                     r'<hr \1 />',
947                                     exceptions)
948        # a header where only spaces are in the same line
949        text = textlib.replaceExcept(
950            text,
951            r'(?i)(?<=[\r\n]) *<h([1-7])> *([^<]+?) *</h\1> *(?=[\r\n])',
952            replace_header,
953            exceptions)
954        # TODO: maybe we can make the bot replace <p> tags with \r\n's.
955        return text
956
957    def fixReferences(self, text: str) -> str:
958        """Fix references tags."""
959        # See also
960        # https://en.wikipedia.org/wiki/User:AnomieBOT/source/tasks/OrphanReferenceFixer.pm
961        exceptions = ['comment', 'math', 'nowiki', 'pre', 'syntaxhighlight',
962                      'startspace']
963
964        # it should be name = " or name=" NOT name   ="
965        text = re.sub(r'(?i)<ref +name(= *| *=)"', r'<ref name="', text)
966        # remove empty <ref/>-tag
967        text = textlib.replaceExcept(text,
968                                     r'(?i)(<ref\s*/>|<ref *>\s*</ref>)',
969                                     r'', exceptions)
970        text = textlib.replaceExcept(text,
971                                     r'(?i)<ref\s+([^>]+?)\s*>\s*</ref>',
972                                     r'<ref \1/>', exceptions)
973        return text
974
975    def fixStyle(self, text: str) -> str:
976        """Convert prettytable to wikitable class."""
977        exceptions = ['comment', 'math', 'nowiki', 'pre', 'startspace',
978                      'syntaxhighlight']
979        if self.site.code in ('de', 'en'):
980            text = textlib.replaceExcept(text,
981                                         r'(class="[^"]*)prettytable([^"]*")',
982                                         r'\1wikitable\2', exceptions)
983        return text
984
985    def fixTypo(self, text: str) -> str:
986        """Fix units."""
987        exceptions = [
988            'comment',
989            'gallery',
990            'hyperlink',
991            'interwiki',
992            'link',
993            'nowiki',
994            'math',
995            'pre',
996            'startspace',
997            'syntaxhighlight',
998        ]  # type: List[Union[str, Pattern[str]]]
999
1000        # change <number> ccm -> <number> cm³
1001        text = textlib.replaceExcept(text, r'(\d)\s*(?:&nbsp;)?ccm',
1002                                     r'\1&nbsp;cm³', exceptions,
1003                                     site=self.site)
1004        # Solve wrong Nº sign with °C or °F
1005        # additional exception requested on fr-wiki for this stuff
1006        pattern = re.compile('«.*?»')
1007        exceptions.append(pattern)
1008        text = textlib.replaceExcept(text, r'(\d)\s*(?:&nbsp;)?[º°]([CF])',
1009                                     r'\1&nbsp;°\2', exceptions,
1010                                     site=self.site)
1011        text = textlib.replaceExcept(text, 'º([CF])', '°' + r'\1',
1012                                     exceptions,
1013                                     site=self.site)
1014        return text
1015
1016    def fixArabicLetters(self, text: str) -> str:
1017        """Fix Arabic and Persian letters."""
1018        if self.site.code not in ['ckb', 'fa']:
1019            return text
1020
1021        exceptions = [
1022            'file',
1023            'gallery',
1024            'hyperlink',
1025            'interwiki',
1026            'inputbox',
1027            # FIXME: but changes letters inside wikilinks
1028            # 'link',
1029            'math',
1030            'pre',
1031            'template',
1032            'timeline',
1033            'ref',
1034            'startspace',
1035            'syntaxhighlight',
1036        ]  # type: List[Union[str, Pattern[str]]]
1037
1038        digits = textlib.NON_LATIN_DIGITS
1039        faChrs = 'ءاآأإئؤبپتثجچحخدذرزژسشصضطظعغفقکگلمنوهیةيك' + digits['fa']
1040
1041        # not to let bot edits in latin content
1042        exceptions.append(re.compile('[^{fa}] *?"*? *?, *?[^{fa}]'
1043                                     .format(fa=faChrs)))
1044        text = textlib.replaceExcept(text, ',', '،', exceptions,
1045                                     site=self.site)
1046        if self.site.code == 'ckb':
1047            text = textlib.replaceExcept(text,
1048                                         '\u0647([.\u060c_<\\]\\s])',
1049                                         '\u06d5\\1', exceptions,
1050                                         site=self.site)
1051            text = textlib.replaceExcept(text, 'ه\u200c', 'ە', exceptions,
1052                                         site=self.site)
1053            text = textlib.replaceExcept(text, 'ه', 'ھ', exceptions,
1054                                         site=self.site)
1055        text = textlib.replaceExcept(text, 'ك', 'ک', exceptions,
1056                                     site=self.site)
1057        text = textlib.replaceExcept(text, '[ىي]', 'ی', exceptions,
1058                                     site=self.site)
1059
1060        return text
1061
1062    def commonsfiledesc(self, text: str) -> str:
1063        """
1064        Clean up file descriptions on Wikimedia Commons.
1065
1066        It works according to [1] and works only on pages in the file
1067        namespace on Wikimedia Commons.
1068
1069        [1]:
1070        https://commons.wikimedia.org/wiki/Commons:Tools/pywiki_file_description_cleanup
1071        """
1072        if self.site.sitename != 'commons:commons' or self.namespace == 6:
1073            return text
1074
1075        # section headers to {{int:}} versions
1076        exceptions = ['comment', 'includeonly', 'math', 'noinclude', 'nowiki',
1077                      'pre', 'syntaxhighlight', 'ref', 'timeline']
1078        text = textlib.replaceExcept(text,
1079                                     r'([\r\n]|^)\=\= *Summary *\=\=',
1080                                     r'\1== {{int:filedesc}} ==',
1081                                     exceptions, True)
1082        text = textlib.replaceExcept(
1083            text,
1084            r'([\r\n])\=\= *\[\[Commons:Copyright tags\|Licensing\]\]: *\=\=',
1085            r'\1== {{int:license-header}} ==', exceptions, True)
1086        text = textlib.replaceExcept(
1087            text,
1088            r'([\r\n])'
1089            r'\=\= *(Licensing|License information|{{int:license}}) *\=\=',
1090            r'\1== {{int:license-header}} ==', exceptions, True)
1091
1092        # frequent field values to {{int:}} versions
1093        text = textlib.replaceExcept(
1094            text,
1095            r'([\r\n]\|[Ss]ource *\= *)'
1096            r'(?:[Oo]wn work by uploader|[Oo]wn work|[Ee]igene [Aa]rbeit) *'
1097            r'([\r\n])',
1098            r'\1{{own}}\2', exceptions, True)
1099        text = textlib.replaceExcept(
1100            text,
1101            r'(\| *Permission *\=) *(?:[Ss]ee below|[Ss]iehe unten) *([\r\n])',
1102            r'\1\2', exceptions, True)
1103
1104        # added to transwikied pages
1105        text = textlib.replaceExcept(text, r'__NOTOC__', '', exceptions, True)
1106
1107        # tracker element for js upload form
1108        text = textlib.replaceExcept(
1109            text,
1110            r'<!-- *{{ImageUpload\|(?:full|basic)}} *-->',
1111            '', exceptions[1:], True)
1112        text = textlib.replaceExcept(text, r'{{ImageUpload\|(?:basic|full)}}',
1113                                     '', exceptions, True)
1114
1115        # duplicated section headers
1116        text = textlib.replaceExcept(
1117            text,
1118            r'([\r\n]|^)\=\= *{{int:filedesc}} *\=\=(?:[\r\n ]*)\=\= *'
1119            r'{{int:filedesc}} *\=\=',
1120            r'\1== {{int:filedesc}} ==', exceptions, True)
1121        text = textlib.replaceExcept(
1122            text,
1123            r'([\r\n]|^)\=\= *{{int:license-header}} *\=\=(?:[\r\n ]*)'
1124            r'\=\= *{{int:license-header}} *\=\=',
1125            r'\1== {{int:license-header}} ==', exceptions, True)
1126        return text
1127
1128    def fix_ISBN(self, text: str) -> str:
1129        """Hyphenate ISBN numbers."""
1130        return _reformat_ISBNs(text, strict=self.ignore != CANCEL.MATCH)
1131
1132
# Private aliases of the CANCEL members; they are the values handed out
# for the deprecated module level CANCEL_* names.
_CANCEL_ALL = CANCEL.ALL
_CANCEL_PAGE = CANCEL.PAGE
_CANCEL_METHOD = CANCEL.METHOD
_CANCEL_MATCH = CANCEL.MATCH

# Register the old CANCEL_* constants as deprecated module attributes
# which point to the corresponding CANCEL member.
wrapper = ModuleDeprecationWrapper(__name__)
for _suffix, _member in (
    ('ALL', _CANCEL_ALL),
    ('PAGE', _CANCEL_PAGE),
    ('METHOD', _CANCEL_METHOD),
    ('MATCH', _CANCEL_MATCH),
):
    wrapper.add_deprecated_attr('CANCEL_' + _suffix, _member,
                                replacement_name='CANCEL.' + _suffix,
                                since='20210528')
# do not leave the loop helpers behind in the module namespace
del _suffix, _member
1151