#!/usr/bin/python
"""
This module can do slight modifications to tidy a wiki page's source code.

The changes are not supposed to change the look of the rendered wiki page.

If you wish to run this as a stand-alone script, use::

    scripts/cosmetic_changes.py

For regular use, it is recommended to put this line into your user-config.py::

    cosmetic_changes = True

You may enable cosmetic changes for additional languages by adding the
dictionary cosmetic_changes_enable to your user-config.py. It should contain
a tuple of languages for each site where you wish to enable them in addition
to your own language if cosmetic_changes_mylang_only is True (see below).
Please set your dictionary by adding such lines to your user-config.py::

    cosmetic_changes_enable['wikipedia'] = ('de', 'en', 'fr')

There is another config variable: You can set::

    cosmetic_changes_mylang_only = False

if you're running a bot on multiple sites and want to do cosmetic changes on
all of them, but be careful if you do.

You may disable cosmetic changes by adding all unwanted languages to the
dictionary cosmetic_changes_disable in your user-config.py. It should contain
a tuple of languages for each site where you wish to disable cosmetic changes.
You may use it with cosmetic_changes_mylang_only set to False, but you can
also disable your own language. This also overrides the settings in the
dictionary cosmetic_changes_enable. Please set this dictionary by adding such
lines to your user-config.py::

    cosmetic_changes_disable['wikipedia'] = ('de', 'en', 'fr')

You may disable cosmetic changes for a given script by appending all
unwanted scripts to the list cosmetic_changes_deny_script in your
user-config.py. By default it contains cosmetic_changes.py itself and touch.py.
This overrides all other enabling settings for cosmetic changes.
Please modify 44the given list by adding such lines to your user-config.py:: 45 46 cosmetic_changes_deny_script.append('your_script_name_1') 47 48or by adding a list to the given one:: 49 50 cosmetic_changes_deny_script += ['your_script_name_1', 51 'your_script_name_2'] 52""" 53# 54# (C) Pywikibot team, 2006-2021 55# 56# Distributed under the terms of the MIT license. 57# 58import re 59 60from enum import IntEnum 61from typing import Any, Optional, Union 62 63import pywikibot 64from pywikibot import textlib 65from pywikibot.backports import Callable, Dict, List, Match, Pattern 66from pywikibot.exceptions import InvalidTitleError 67from pywikibot.textlib import ( 68 FILE_LINK_REGEX, 69 MultiTemplateMatchBuilder, 70 _get_regexes, 71) 72from pywikibot.tools import ( 73 deprecated, 74 deprecated_args, 75 first_lower, 76 first_upper, 77 issue_deprecation_warning, 78 ModuleDeprecationWrapper, 79) 80from pywikibot.tools.chars import url2string 81 82 83try: 84 import stdnum.isbn as stdnum_isbn 85except ImportError: 86 stdnum_isbn = None 87 88 89# Subpage templates. 
# Must be in lower case,
# whereas subpage itself must be case sensitive
# This is also used by interwiki.py
# TODO: Maybe move it to family file and implement global instances
# Mapping: site code -> (documentation template name, or list of such
# names, all lower-cased; title of the documentation subpage or None).
moved_links = {
    'ar': (['documentation', 'template documentation', 'شرح', 'توثيق'],
           '/doc'),
    'bn': ('documentation', '/doc'),
    'ca': ('ús de la plantilla', '/ús'),
    'cs': ('dokumentace', '/doc'),
    'da': ('dokumentation', '/doc'),
    'de': ('dokumentation', '/Meta'),
    'dsb': (['dokumentacija', 'doc'], '/Dokumentacija'),
    'en': (['documentation', 'template documentation', 'template doc',
            'doc', 'documentation, template'], '/doc'),
    'es': (['documentación', 'documentación de plantilla'], '/doc'),
    'eu': ('txantiloi dokumentazioa', '/dok'),
    'fa': (['documentation', 'template documentation', 'template doc',
            'doc', 'توضیحات', 'زیرصفحه توضیحات'], '/doc'),
    # fi: no idea how to handle this type of subpage at :Metasivu:
    'fi': ('mallineohje', None),
    'fr': (['/documentation', 'documentation', 'doc_modèle',
            'documentation modèle', 'documentation modèle compliqué',
            'documentation modèle en sous-page',
            'documentation modèle compliqué en sous-page',
            'documentation modèle utilisant les parserfunctions en sous-page',
            ],
           '/Documentation'),
    'hsb': (['dokumentacija', 'doc'], '/Dokumentacija'),
    'hu': ('sablondokumentáció', '/doc'),
    'id': ('template doc', '/doc'),
    'ilo': ('documentation', '/doc'),
    'ja': ('documentation', '/doc'),
    'ka': ('თარგის ინფო', '/ინფო'),
    'ko': ('documentation', '/설명문서'),
    'ms': ('documentation', '/doc'),
    'no': ('dokumentasjon', '/dok'),
    'nn': ('dokumentasjon', '/dok'),
    'pl': ('dokumentacja', '/opis'),
    'pt': (['documentação', '/doc'], '/doc'),
    'ro': ('documentaţie', '/doc'),
    'ru': ('doc', '/doc'),
    'simple': (['documentation',
                'template documentation',
                'template doc',
                'doc',
                'documentation, template'], '/doc'),
    'sk': ('dokumentácia', '/Dokumentácia'),
    'sv': ('dokumentation', '/dok'),
    'uk': (['документація', 'doc', 'documentation'], '/Документація'),
    'ur': (['دستاویز', 'توثيق', 'شرح', 'توضیحات',
            'documentation', 'template doc', 'doc',
            'documentation, template'], '/doc'),
    'vi': ('documentation', '/doc'),
    'zh': (['documentation', 'doc'], '/doc'),
}

# Template which should be replaced or removed.
# Use a list with two entries. The first entry will be replaced by the second.
# Examples:
# For removing {{Foo}}, the list must be:
#           ('Foo', None),
#
# The following also works:
#           ('Foo', ''),
#
# For replacing {{Foo}} with {{Bar}} the list must be:
#           ('Foo', 'Bar'),
#
# This also removes all template parameters of {{Foo}}
# For replacing {{Foo}} with {{Bar}} but keep the template
# parameters in its original order, please use:
#           ('Foo', 'Bar\\g<parameters>'),

# Mapping: family name -> site code -> list of (old, new) template pairs.
deprecatedTemplates = {
    'wikipedia': {
        'de': [
            ('Belege', 'Belege fehlen\\g<parameters>'),
            ('Quelle', 'Belege fehlen\\g<parameters>'),
            ('Quellen', 'Belege fehlen\\g<parameters>'),
            ('Quellen fehlen', 'Belege fehlen\\g<parameters>'),
        ],
        'ur': [
            ('Infobox former country',
             'خانہ معلومات سابقہ ملک\\g<parameters>'),
            ('Infobox Former Country',
             'خانہ معلومات سابقہ ملک\\g<parameters>'),
        ],
    }
}


class CANCEL(IntEnum):

    """Cancel level to ignore exceptions.

    Determines what is skipped when an error occurs: the whole page,
    the failing cleanup method, or a single match. ALL raises the
    exception instead of skipping anything.

    .. versionadded:: 6.3
    """

    ALL = 0     # never ignore: always re-raise
    PAGE = 1    # skip the whole page on error
    METHOD = 2  # skip the failing cleanup method on error
    MATCH = 3   # skip a single match on error
def _format_isbn_match(match: Match[str], strict: bool = True) -> str:
    """Helper function to validate and format a single matched ISBN.

    :param match: regex match whose ``code`` group holds the ISBN
    :param strict: if True, re-raise the validation error; otherwise log
        it and return the ISBN unchanged
    :raises NotImplementedError: python-stdnum is not installed
    """
    if not stdnum_isbn:
        raise NotImplementedError(
            'ISBN functionality not available. Install stdnum package.')

    isbn = match.group('code')
    try:
        stdnum_isbn.validate(isbn)
    except stdnum_isbn.ValidationError as e:
        if strict:
            raise
        pywikibot.log('ISBN "{}" validation error: {}'.format(isbn, e))
        return isbn

    return stdnum_isbn.format(isbn)


def _reformat_ISBNs(text: str, strict: bool = True) -> str:
    """Helper function to normalise ISBNs in text.

    :param text: wikitext possibly containing ISBNs
    :param strict: passed through to :func:`_format_isbn_match`
    :raises Exception: Invalid ISBN encountered when strict enabled
    """
    return textlib.reformat_ISBNs(
        text, lambda match: _format_isbn_match(match, strict=strict))


class CosmeticChangesToolkit:

    """Cosmetic changes toolkit."""

    @deprecated_args(redirect=True, diff='show_diff', site='page')
    def __init__(self, page: 'pywikibot.page.BasePage', *,
                 show_diff: bool = False,
                 namespace: Optional[int] = None,
                 pageTitle: Optional[str] = None,
                 ignore: IntEnum = CANCEL.ALL) -> None:
        """Initializer.

        :param page: the Page object containing the text to be modified
        :param show_diff: show difference after replacements
        :param namespace: DEPRECATED namespace parameter
        :param pageTitle: DEPRECATED page title parameter
        :param ignore: ignores if an error occurred and either skips the page
            or only that method. It can be set one of the CANCEL constants
        """
        # Deprecated call style: first argument was a BaseSite, with the
        # title and namespace given separately.
        if isinstance(page, pywikibot.BaseSite):
            self.site = page
            self.title = pageTitle

            class_name = type(self).__name__
            if self.title is None:
                raise ValueError('Page title required for ' + class_name)

            try:
                self.namespace = self.site.namespaces.resolve(namespace).pop(0)
            except (KeyError, TypeError, IndexError):
                raise ValueError('{} needs a valid namespace'
                                 .format(class_name))
            issue_deprecation_warning(
                'site parameter of ' + class_name,
                'a pywikibot.Page object as first parameter',
                since='20201102')
        else:
            if namespace is not None or pageTitle is not None:
                raise TypeError(
                    "'namespace' and 'pageTitle' arguments are invalid with "
                    'a given Page object')
            self.site = page.site
            self.title = page.title()
            self.namespace = page.namespace()

        self.show_diff = show_diff
        # namespace 10 is the Template namespace
        self.template = (self.namespace == 10)
        # odd-numbered (non-special) namespaces are talk namespaces
        self.talkpage = self.namespace >= 0 and self.namespace % 2 == 1
        self.ignore = ignore

        # Ordered list of cleanup methods applied by _change().
        self.common_methods = [
            self.commonsfiledesc,
            self.fixSelfInterwiki,
            self.standardizePageFooter,
            self.fixSyntaxSave,
            self.cleanUpLinks,
            self.cleanUpSectionHeaders,
            self.putSpacesInLists,
            self.translateAndCapitalizeNamespaces,
            self.translateMagicWords,
            self.replaceDeprecatedTemplates,
            self.resolveHtmlEntities,
            self.removeEmptySections,
            self.removeUselessSpaces,
            self.removeNonBreakingSpaceBeforePercent,

            self.fixHtml,
            self.fixReferences,
            self.fixStyle,
            self.fixTypo,

            self.fixArabicLetters,
        ]
        # ISBN cleanup is only possible when python-stdnum is installed.
        if stdnum_isbn:
            self.common_methods.append(self.fix_ISBN)

    @property  # type: ignore[misc]
    @deprecated('show_diff', since='20200415')
    def diff(self) -> bool:
        """CosmeticChangesToolkit.diff attribute getter."""
        return self.show_diff

    @diff.setter  # type: ignore[misc]
    @deprecated('show_diff', since='20200415')
    def diff(self, value: bool) -> None:
        """CosmeticChangesToolkit.diff attribute setter."""
        self.show_diff = bool(value)
308 def diff(self, value: bool) -> None: 309 """CosmeticChangesToolkit.diff attribute setter.""" 310 self.show_diff = bool(value) 311 312 @classmethod 313 @deprecated('CosmeticChangesToolkit with pywikibot.Page object', 314 since='20200415') 315 @deprecated_args(diff='show_diff') 316 def from_page(cls, page: 'pywikibot.page.BasePage', 317 show_diff: bool = False, 318 ignore: IntEnum = CANCEL.ALL) -> 'CosmeticChangesToolkit': 319 """Create toolkit based on the page.""" 320 return cls(page, show_diff=show_diff, ignore=ignore) 321 322 def safe_execute(self, method: Callable[[str], str], text: str) -> str: 323 """Execute the method and catch exceptions if enabled.""" 324 result = None 325 try: 326 result = method(text) 327 except Exception as e: 328 if self.ignore == CANCEL.METHOD: 329 pywikibot.warning('Unable to perform "{}" on "{}"!' 330 .format(method.__name__, self.title)) 331 pywikibot.exception(e) 332 else: 333 raise 334 return text if result is None else result 335 336 def _change(self, text: str) -> str: 337 """Execute all clean up methods.""" 338 for method in self.common_methods: 339 text = self.safe_execute(method, text) 340 return text 341 342 def change(self, text: str) -> Union[bool, str]: 343 """Execute all clean up methods and catch errors if activated.""" 344 try: 345 new_text = self._change(text) 346 except Exception as e: 347 if self.ignore == CANCEL.PAGE: 348 pywikibot.warning('Skipped "{}", because an error occurred.' 349 .format(self.title)) 350 pywikibot.exception(e) 351 return False 352 raise 353 else: 354 if self.show_diff: 355 pywikibot.showDiff(text, new_text) 356 return new_text 357 358 def fixSelfInterwiki(self, text: str) -> str: 359 """ 360 Interwiki links to the site itself are displayed like local links. 361 362 Remove their language code prefix. 363 """ 364 if not self.talkpage and pywikibot.calledModuleName() != 'interwiki': 365 interwikiR = re.compile(r'\[\[(?: *:)? 
    def standardizePageFooter(self, text: str) -> str:
        """
        Standardize page footer.

        Makes sure that interwiki links and categories are put
        into the correct position and into the right order. This
        combines the old instances of standardizeInterwiki
        and standardizeCategories.

        The page footer consists of the following parts
        in that sequence:
        1. categories
        2. additional information depending on the local site policy
        3. interwiki

        :param text: page text to rearrange
        :return: text with normalized footer
        """
        assert self.title is not None

        categories = []
        interwiki_links = {}

        # get categories
        if not self.template:
            categories = textlib.getCategoryLinks(text, site=self.site)

        if not self.talkpage:
            # On template pages the language links may live on the
            # documentation subpage configured in moved_links.
            subpage = False
            if self.template:
                try:
                    tmpl, loc = moved_links[self.site.code]
                    del tmpl
                except KeyError:
                    loc = None
                if loc is not None and loc in self.title:
                    subpage = True

            # get interwiki
            interwiki_links = textlib.getLanguageLinks(
                text, insite=self.site, template_subpage=subpage)

            # remove interwiki
            text = textlib.removeLanguageLinks(text, site=self.site)

        # add categories, main to top
        if categories:
            # TODO: Sort categories in alphabetic order, e.g. using
            # categories.sort()? (T100265)
            # TODO: Get main categories from Wikidata?
            main = pywikibot.Category(self.site, 'Category:' + self.title,
                                      sort_key=' ')
            if main in categories:
                categories.pop(categories.index(main))
                categories.insert(0, main)
            text = textlib.replaceCategoryLinks(text, categories,
                                                site=self.site)

        # add interwiki
        # interwiki_links is only non-empty when the branch above ran,
        # so 'subpage' is always defined here.
        if interwiki_links:
            text = textlib.replaceLanguageLinks(text, interwiki_links,
                                                site=self.site,
                                                template=self.template,
                                                template_subpage=subpage)

        return text

    def translateAndCapitalizeNamespaces(self, text: str) -> str:
        """Use localized namespace names.

        :param text: page text
        :return: text with namespace prefixes in canonical local form
        """
        # arz uses English stylish codes
        if self.site.sitename == 'wikipedia:arz':
            return text
        # wiki links aren't parsed here.
        exceptions = ['nowiki', 'comment', 'math', 'pre']

        for namespace in self.site.namespaces.values():
            if namespace == 0:
                # skip main (article) namespace
                continue
            # a clone is needed. Won't change the namespace dict
            namespaces = list(namespace)
            # namespace 6 is the File namespace
            if namespace == 6 and self.site.family.name == 'wikipedia':
                if self.site.code in ('en', 'fr'):
                    # do not change "Image" on en-wiki and fr-wiki
                    assert 'Image' in namespaces
                    namespaces.remove('Image')
                if self.site.code == 'hu':
                    # do not change "Kép" on hu-wiki
                    assert 'Kép' in namespaces
                    namespaces.remove('Kép')
                elif self.site.code == 'pt':
                    # use "Imagem" by default on pt-wiki (per T57242)
                    assert 'Imagem' in namespaces
                    namespaces.insert(
                        0, namespaces.pop(namespaces.index('Imagem')))
            # final namespace variant
            final_ns = namespaces.pop(0)
            # namespaces 2/3 are User and User talk
            if namespace in (2, 3):
                # skip localized user namespace, maybe gender is used
                namespaces = ['User' if namespace == 2 else 'User talk']
            # lowerspaced and underscored namespaces
            for i, item in enumerate(namespaces):
                item = item.replace(' ', '[ _]')
                item = '[{}{}]'.format(item[0], item[0].lower()) + item[1:]
                namespaces[i] = item
            namespaces.append(first_lower(final_ns))
            if final_ns and namespaces:
                if self.site.sitename == 'wikipedia:pt' and namespace == 6:
                    # only change on these file extensions (per T57242)
                    extensions = ('png', 'gif', 'jpg', 'jpeg', 'svg', 'tiff',
                                  'tif')
                    text = textlib.replaceExcept(
                        text,
                        r'\[\[\s*({}) *:(?P<name>[^\|\]]*?\.({}))'
                        r'(?P<label>.*?)\]\]'
                        .format('|'.join(namespaces), '|'.join(extensions)),
                        r'[[{}:\g<name>\g<label>]]'.format(final_ns),
                        exceptions)
                else:
                    text = textlib.replaceExcept(
                        text,
                        r'\[\[\s*({}) *:(?P<nameAndLabel>.*?)\]\]'
                        .format('|'.join(namespaces)),
                        r'[[{}:\g<nameAndLabel>]]'.format(final_ns),
                        exceptions)
        return text
    def translateMagicWords(self, text: str) -> str:
        """Use localized magic words.

        Replaces image parameter aliases (thumb, left, ...) inside file
        links with the site's preferred alias. The alias table is built
        lazily on the first link that actually has parameters.
        """
        # not wanted at ru
        # arz uses English stylish codes
        # no need to run on English wikis
        if self.site.code in ['arz', 'en', 'ru']:
            return text

        def init_cache() -> None:
            # Map every secondary alias to the preferred (first) alias,
            # skipping parametrized aliases containing '$1'.
            for magicword in ('img_thumbnail', 'img_left', 'img_center',
                              'img_right', 'img_none', 'img_framed',
                              'img_frameless', 'img_border', 'img_upright',
                              'img_baseline', 'img_sub', 'img_super',
                              'img_top', 'img_text_top', 'img_middle',
                              'img_bottom', 'img_text_bottom'):
                aliases = self.site.getmagicwords(magicword)
                if len(aliases) > 1:
                    cache.update((alias, aliases[0]) for alias in aliases[1:]
                                 if '$1' not in alias)
            if not cache:
                cache[False] = True  # signal there is nothing to replace

        def replace_magicword(match: Match[str]) -> str:
            if cache.get(False):
                return match.group()
            split = match.group().split('|')
            # links without parameters need no translation
            if len(split) == 1:
                return match.group()

            if not cache:
                init_cache()

            # push ']]' out and re-add below
            split[-1] = split[-1][:-2]
            return '{}|{}]]'.format(
                split[0], '|'.join(cache.get(x.strip(), x) for x in split[1:]))

        cache = {}  # type: Dict[Union[bool, str], Any]
        exceptions = ['comment', 'nowiki', 'pre', 'syntaxhighlight']
        regex = re.compile(
            FILE_LINK_REGEX % '|'.join(self.site.namespaces[6]),
            flags=re.X)
        return textlib.replaceExcept(
            text, regex, replace_magicword, exceptions)
    def cleanUpLinks(self, text: str) -> str:
        """Tidy up wikilinks found in a string.

        This function will:
        * Replace underscores with spaces

        * Move leading and trailing spaces out of the wikilink and into the
          surrounding text

        * Convert URL-encoded characters into Unicode-encoded characters

        * Move trailing characters out of the link and make the link without
          using a pipe, if possible

        * Capitalize the article title of the link, if appropriate

        :param text: string to perform the clean-up on
        :return: text with tidied wikilinks
        """
        # helper function which works on one link and either returns it
        # unmodified, or returns a replacement.
        def handleOneLink(match: Match[str]) -> str:
            titleWithSection = match.group('titleWithSection')
            label = match.group('label')
            trailingChars = match.group('linktrail')
            newline = match.group('newline')

            try:
                is_interwiki = self.site.isInterwikiLink(titleWithSection)
            except ValueError:  # T111513
                is_interwiki = True

            if is_interwiki:
                return match.group()

            # The link looks like this:
            # [[page_title|link_text]]trailing_chars
            # We only work on namespace 0 because pipes and linktrails work
            # differently for images and categories.
            page = pywikibot.Page(pywikibot.Link(titleWithSection, self.site))
            try:
                in_main_namespace = page.namespace() == 0
            except InvalidTitleError:
                in_main_namespace = False
            if not in_main_namespace:
                return match.group()

            # Replace underlines by spaces, also multiple underlines
            titleWithSection = re.sub('_+', ' ', titleWithSection)
            # Remove double spaces
            titleWithSection = re.sub(' +', ' ', titleWithSection)
            # Remove unnecessary leading spaces from title,
            # but remember if we did this because we eventually want
            # to re-add it outside of the link later.
            titleLength = len(titleWithSection)
            titleWithSection = titleWithSection.lstrip()
            hadLeadingSpaces = len(titleWithSection) != titleLength
            hadTrailingSpaces = False
            # Remove unnecessary trailing spaces from title,
            # but remember if we did this because it may affect
            # the linktrail and because we eventually want to
            # re-add it outside of the link later.
            if not trailingChars:
                titleLength = len(titleWithSection)
                titleWithSection = titleWithSection.rstrip()
                hadTrailingSpaces = len(titleWithSection) != titleLength

            # Convert URL-encoded characters to str
            titleWithSection = url2string(titleWithSection,
                                          encodings=self.site.encodings())

            if not titleWithSection:
                # just skip empty links.
                return match.group()

            # Remove unnecessary initial and final spaces from label.
            # Please note that some editors prefer spaces around pipes.
            # (See [[en:Wikipedia:Semi-bots]]). We remove them anyway.
            if label is not None:
                # Remove unnecessary leading spaces from label,
                # but remember if we did this because we want
                # to re-add it outside of the link later.
                labelLength = len(label)
                label = label.lstrip()
                hadLeadingSpaces = len(label) != labelLength
                # Remove unnecessary trailing spaces from label,
                # but remember if we did this because it affects
                # the linktrail.
                if not trailingChars:
                    labelLength = len(label)
                    label = label.rstrip()
                    hadTrailingSpaces = len(label) != labelLength
            else:
                label = titleWithSection
            if trailingChars:
                label += trailingChars

            # On first-letter sites the initial letter is case-insensitive.
            if self.site.siteinfo['case'] == 'first-letter':
                firstcase_title = first_lower(titleWithSection)
                firstcase_label = first_lower(label)
            else:
                firstcase_title = titleWithSection
                firstcase_label = label

            if firstcase_label == firstcase_title:
                newLink = '[[{}]]'.format(label)
            # Check if we can create a link with trailing characters
            # instead of a pipelink
            elif (firstcase_label.startswith(firstcase_title)
                  and trailR.sub('', label[len(titleWithSection):]) == ''):
                newLink = '[[{}]]{}'.format(label[:len(titleWithSection)],
                                            label[len(titleWithSection):])

            else:
                # Try to capitalize the first letter of the title.
                # Not useful for languages that don't capitalize nouns.
                # TODO: Add a configuration variable for each site,
                # which determines if the link target is written in
                # uppercase
                if self.site.sitename == 'wikipedia:de':
                    titleWithSection = first_upper(titleWithSection)
                newLink = '[[{}|{}]]'.format(titleWithSection, label)
            # re-add spaces that were pulled out of the link.
            # Examples:
            #   text[[ title ]]text        -> text [[title]] text
            #   text[[ title | name ]]text -> text [[title|name]] text
            #   text[[ title |name]]text   -> text[[title|name]]text
            #   text[[title| name]]text    -> text [[title|name]]text
            if hadLeadingSpaces and not newline:
                newLink = ' ' + newLink
            if hadTrailingSpaces:
                newLink = newLink + ' '
            if newline:
                newLink = newline + newLink
            return newLink

        trailR = re.compile(self.site.linktrail())
    # The regular expression which finds links. Results consist of four groups:
    # group <newline> depends whether the links starts with a new line.
    # group <titleWithSection> is the page title and section, that is,
    # everything before | or ]. It'll include the # to make life easier for us.
    # group <label> is the alternative link title between | and ].
    # group <linktrail> is the link trail after ]] which are part of the word.
    # note that the definition of 'letter' varies from language to language.
        linkR = re.compile(
            r'(?P<newline>[\n]*)\[\[(?P<titleWithSection>[^\]\|]+)'
            r'(\|(?P<label>[^\]\|]*))?\]\](?P<linktrail>'
            + self.site.linktrail() + ')')

        text = textlib.replaceExcept(text, linkR, handleOneLink,
                                     ['comment', 'math', 'nowiki', 'pre',
                                      'startspace'])
        return text

    def resolveHtmlEntities(self, text: str) -> str:
        """Replace HTML entities with string.

        :param text: page text
        :return: text with decodable HTML entities resolved
        """
        # code points whose entities must be kept as-is
        ignore = [
            38,     # Ampersand (&)
            39,     # Single quotation mark (') per T26093
            60,     # Less than (<)
            62,     # Greater than (>)
            91,     # Opening square bracket ([)
                    #  - sometimes used intentionally inside links
            93,     # Closing square bracket (])
                    #  - used intentionally inside links
            124,    # Vertical bar (|)
                    #  - used intentionally in navigation bar templates on w:de
            160,    # Non-breaking space (&nbsp;)
                    #  - not supported by Firefox textareas
            173,    # Soft-hyphen (&shy;) - enable editing
            8206,   # Left-to-right mark (&lrm;)
            8207,   # Right-to-left mark (&rlm;)
        ]
        if self.template:
            # inside templates spaces and colons may be syntactically
            # significant, so keep their entities too
            ignore += [32]  # Space ( )
            ignore += [58]  # Colon (:)
        # TODO: T254350 - what other extension tags should be avoided?
        # (graph, math, score, timeline, etc.)
        text = pywikibot.html2unicode(
            text, ignore=ignore, exceptions=['comment', 'syntaxhighlight'])
        return text
    def removeEmptySections(self, text: str) -> str:
        """Cleanup empty sections.

        A section counts as empty when, after stripping comments,
        categories, known stub templates and empty list markers, no body
        text remains and it has no deeper subsections.
        """
        # userspace contains article stubs without nobots/in use templates
        if self.namespace == 2:
            return text

        skippings = ['comment', 'category']
        skip_regexes = _get_regexes(skippings, self.site)
        # site defined templates
        skip_templates = {
            'cs': ('Pahýl[ _]část',),  # stub section
        }
        if self.site.code in skip_templates:
            for template in skip_templates[self.site.code]:
                skip_regexes.append(
                    re.compile(r'\{\{\s*%s\s*\}\}' % template, re.I))
        # empty lists
        skip_regexes.append(re.compile(r'(?m)^[\*#] *$'))

        # get stripped sections
        stripped_text = textlib.removeLanguageLinks(text, self.site, '\n')
        for reg in skip_regexes:
            stripped_text = reg.sub(r'', stripped_text)
        strip_sections = textlib.extract_sections(
            stripped_text, self.site)[1]

        # get proper sections
        header, sections, footer = textlib.extract_sections(text, self.site)

        # iterate stripped sections and create a new page body
        new_body = []
        for i, strip_section in enumerate(strip_sections):
            current_heading = sections[i][0]
            try:
                next_heading = sections[i + 1][0]
            except IndexError:
                next_heading = ''
            # heading depth == number of leading '=' characters
            current_dep = (len(current_heading)
                           - len(current_heading.lstrip('=')))
            next_dep = len(next_heading) - len(next_heading.lstrip('='))
            # keep a section if its stripped body is non-empty or if it
            # has subsections (next heading is deeper)
            if strip_section[1].strip() or current_dep < next_dep:
                new_body.extend(sections[i])
        return header + ''.join(new_body) + footer

    def removeUselessSpaces(self, text: str) -> str:
        """Cleanup multiple or trailing spaces.

        :param text: page text
        :return: text with runs of trailing whitespace collapsed
        """
        exceptions = ['comment', 'math', 'nowiki', 'pre', 'syntaxhighlight',
                      'startspace', 'table']
        # cs-wiki relies on spacing inside templates, all other sites
        # may have template content cleaned as well
        if self.site.sitename != 'wikipedia:cs':
            exceptions.append('template')
        text = textlib.replaceExcept(text, r'(?m)[\t ]+( |$)', r'\1',
                                     exceptions, site=self.site)
        return text
textlib.replaceExcept(text, r'(?m)[\t ]+( |$)', r'\1', 772 exceptions, site=self.site) 773 return text 774 775 def removeNonBreakingSpaceBeforePercent(self, text: str) -> str: 776 """ 777 Remove a non-breaking space between number and percent sign. 778 779 Newer MediaWiki versions automatically place a non-breaking space in 780 front of a percent sign, so it is no longer required to place it 781 manually. 782 """ 783 text = textlib.replaceExcept( 784 text, r'(\d)&(?:nbsp|#160|#x[Aa]0);%', r'\1 %', ['timeline']) 785 return text 786 787 def cleanUpSectionHeaders(self, text: str) -> str: 788 """ 789 Add a space between the equal signs and the section title. 790 791 Example:: 792 793 ==Section title== 794 795 becomes:: 796 797 == Section title == 798 799 :NOTE: This space is recommended in the syntax help on the 800 English and German Wikipedias. It is not wanted on Lojban and 801 English Wiktionaries (T168399, T169064) and it might be that 802 it is not wanted on other wikis. If there are any complaints, 803 please file a bug report. 804 """ 805 if self.site.sitename in ['wiktionary:jbo', 'wiktionary:en']: 806 return text 807 return textlib.replaceExcept( 808 text, 809 r'(?m)^(={1,6})[ \t]*(?P<title>.*[^\s=])[ \t]*\1[ \t]*\r?\n', 810 r'\1 \g<title> \1\n', 811 ['comment', 'math', 'nowiki', 'pre']) 812 813 def putSpacesInLists(self, text: str) -> str: 814 """ 815 Add a space between the * or # and the text. 816 817 :NOTE: This space is recommended in the syntax help on the 818 English, German and French Wikipedias. It might be that it 819 is not wanted on other wikis. If there are any complaints, 820 please file a bug report. 
821 """ 822 if not self.template: 823 exceptions = ['comment', 'math', 'nowiki', 'pre', 824 'syntaxhighlight', 'template', 'timeline', 825 self.site.redirect_regex] 826 text = textlib.replaceExcept( 827 text, 828 r'(?m)' 829 r'^(?P<bullet>[:;]*(\*+|#+)[:;\*#]*)(?P<char>[^\s\*#:;].+?)', 830 r'\g<bullet> \g<char>', 831 exceptions) 832 return text 833 834 def replaceDeprecatedTemplates(self, text: str) -> str: 835 """Replace deprecated templates.""" 836 exceptions = ['comment', 'math', 'nowiki', 'pre'] 837 builder = MultiTemplateMatchBuilder(self.site) 838 839 if self.site.family.name in deprecatedTemplates \ 840 and self.site.code in deprecatedTemplates[self.site.family.name]: 841 for template in deprecatedTemplates[ 842 self.site.family.name][self.site.code]: 843 old, new = template 844 if new is None: 845 new = '' 846 else: 847 new = '{{%s}}' % new 848 849 text = textlib.replaceExcept( 850 text, 851 builder.pattern(old), 852 new, exceptions) 853 854 return text 855 856 # from fixes.py 857 def fixSyntaxSave(self, text: str) -> str: 858 """Convert weblinks to wikilink, fix link syntax.""" 859 def replace_link(match: Match[str]) -> str: 860 """Create a string to replace a single link.""" 861 replacement = '[[' 862 if re.match(r'(?:' + '|'.join(list(self.site.namespaces[6]) 863 + list(self.site.namespaces[14])) + '):', 864 match.group('link')): 865 replacement += ':' 866 replacement += match.group('link') 867 if match.group('title'): 868 replacement += '|' + match.group('title') 869 return replacement + ']]' 870 871 exceptions = ['comment', 'math', 'nowiki', 'pre', 'startspace', 872 'syntaxhighlight'] 873 # link to the wiki working on 874 # Only use suffixes for article paths 875 for suffix in self.site._interwiki_urls(True): 876 http_url = self.site.base_url(suffix, 'http') 877 if self.site.protocol() == 'http': 878 https_url = None 879 else: 880 https_url = self.site.base_url(suffix, 'https') 881 # compare strings without the protocol, if they are empty support 882 # 
also no prefix (//en.wikipedia.org/…) 883 if https_url is not None and http_url[4:] == https_url[5:]: 884 urls = ['(?:https?:)?' + re.escape(http_url[5:])] 885 else: 886 urls = [re.escape(url) for url in (http_url, https_url) 887 if url is not None] 888 for url in urls: 889 # Only include links which don't include the separator as 890 # the wikilink won't support additional parameters 891 separator = '?' 892 if '?' in suffix: 893 separator += '&' 894 # Match first a non space in the title to prevent that multiple 895 # spaces at the end without title will be matched by it 896 text = textlib.replaceExcept( 897 text, 898 r'\[\[?' + url + r'(?P<link>[^' + separator + r']+?)' 899 r'(\s+(?P<title>[^\s].*?))?\s*\]\]?', 900 replace_link, exceptions, site=self.site) 901 # external link in/starting with double brackets 902 text = textlib.replaceExcept( 903 text, 904 r'\[\[(?P<url>https?://[^\]]+?)\]\]?', 905 r'[\g<url>]', exceptions, site=self.site) 906 # external link and description separated by a pipe, with 907 # whitespace in front of the pipe, so that it is clear that 908 # the dash is not a legitimate part of the URL. 909 text = textlib.replaceExcept( 910 text, 911 r'\[(?P<url>https?://[^\|\] \r\n]+?) +\| *(?P<label>[^\|\]]+?)\]', 912 r'[\g<url> \g<label>]', exceptions) 913 # dash in external link, where the correct end of the URL can 914 # be detected from the file extension. It is very unlikely that 915 # this will cause mistakes. 
916 extensions = [r'\.{}'.format(ext) 917 for ext in ['pdf', 'html?', 'php', 'aspx?', 'jsp']] 918 text = textlib.replaceExcept( 919 text, 920 r'\[(?P<url>https?://[^\|\] ]+?(' + '|'.join(extensions) + r')) *' 921 r'\| *(?P<label>[^\|\]]+?)\]', 922 r'[\g<url> \g<label>]', exceptions) 923 return text 924 925 def fixHtml(self, text: str) -> str: 926 """Relace html markups with wikitext markups.""" 927 def replace_header(match: Match[str]) -> str: 928 """Create a header string for replacing.""" 929 depth = int(match.group(1)) 930 return r'{0} {1} {0}'.format('=' * depth, match.group(2)) 931 932 # Everything case-insensitive (?i) 933 # Keep in mind that MediaWiki automatically converts <br> to <br /> 934 exceptions = ['comment', 'math', 'nowiki', 'pre', 'startspace', 935 'syntaxhighlight'] 936 text = textlib.replaceExcept(text, r'(?i)<(b|strong)>(.*?)</\1>', 937 r"'''\2'''", exceptions, site=self.site) 938 text = textlib.replaceExcept(text, r'(?i)<(i|em)>(.*?)</\1>', 939 r"''\2''", exceptions, site=self.site) 940 # horizontal line without attributes in a single line 941 text = textlib.replaceExcept(text, r'(?i)([\r\n])<hr[ /]*>([\r\n])', 942 r'\1----\2', exceptions) 943 # horizontal line with attributes; can't be done with wiki syntax 944 # so we only make it XHTML compliant 945 text = textlib.replaceExcept(text, r'(?i)<hr ([^>/]+?)>', 946 r'<hr \1 />', 947 exceptions) 948 # a header where only spaces are in the same line 949 text = textlib.replaceExcept( 950 text, 951 r'(?i)(?<=[\r\n]) *<h([1-7])> *([^<]+?) *</h\1> *(?=[\r\n])', 952 replace_header, 953 exceptions) 954 # TODO: maybe we can make the bot replace <p> tags with \r\n's. 
955 return text 956 957 def fixReferences(self, text: str) -> str: 958 """Fix references tags.""" 959 # See also 960 # https://en.wikipedia.org/wiki/User:AnomieBOT/source/tasks/OrphanReferenceFixer.pm 961 exceptions = ['comment', 'math', 'nowiki', 'pre', 'syntaxhighlight', 962 'startspace'] 963 964 # it should be name = " or name=" NOT name =" 965 text = re.sub(r'(?i)<ref +name(= *| *=)"', r'<ref name="', text) 966 # remove empty <ref/>-tag 967 text = textlib.replaceExcept(text, 968 r'(?i)(<ref\s*/>|<ref *>\s*</ref>)', 969 r'', exceptions) 970 text = textlib.replaceExcept(text, 971 r'(?i)<ref\s+([^>]+?)\s*>\s*</ref>', 972 r'<ref \1/>', exceptions) 973 return text 974 975 def fixStyle(self, text: str) -> str: 976 """Convert prettytable to wikitable class.""" 977 exceptions = ['comment', 'math', 'nowiki', 'pre', 'startspace', 978 'syntaxhighlight'] 979 if self.site.code in ('de', 'en'): 980 text = textlib.replaceExcept(text, 981 r'(class="[^"]*)prettytable([^"]*")', 982 r'\1wikitable\2', exceptions) 983 return text 984 985 def fixTypo(self, text: str) -> str: 986 """Fix units.""" 987 exceptions = [ 988 'comment', 989 'gallery', 990 'hyperlink', 991 'interwiki', 992 'link', 993 'nowiki', 994 'math', 995 'pre', 996 'startspace', 997 'syntaxhighlight', 998 ] # type: List[Union[str, Pattern[str]]] 999 1000 # change <number> ccm -> <number> cm³ 1001 text = textlib.replaceExcept(text, r'(\d)\s*(?: )?ccm', 1002 r'\1 cm³', exceptions, 1003 site=self.site) 1004 # Solve wrong Nº sign with °C or °F 1005 # additional exception requested on fr-wiki for this stuff 1006 pattern = re.compile('«.*?»') 1007 exceptions.append(pattern) 1008 text = textlib.replaceExcept(text, r'(\d)\s*(?: )?[º°]([CF])', 1009 r'\1 °\2', exceptions, 1010 site=self.site) 1011 text = textlib.replaceExcept(text, 'º([CF])', '°' + r'\1', 1012 exceptions, 1013 site=self.site) 1014 return text 1015 1016 def fixArabicLetters(self, text: str) -> str: 1017 """Fix Arabic and Persian letters.""" 1018 if self.site.code 
not in ['ckb', 'fa']: 1019 return text 1020 1021 exceptions = [ 1022 'file', 1023 'gallery', 1024 'hyperlink', 1025 'interwiki', 1026 'inputbox', 1027 # FIXME: but changes letters inside wikilinks 1028 # 'link', 1029 'math', 1030 'pre', 1031 'template', 1032 'timeline', 1033 'ref', 1034 'startspace', 1035 'syntaxhighlight', 1036 ] # type: List[Union[str, Pattern[str]]] 1037 1038 digits = textlib.NON_LATIN_DIGITS 1039 faChrs = 'ءاآأإئؤبپتثجچحخدذرزژسشصضطظعغفقکگلمنوهیةيك' + digits['fa'] 1040 1041 # not to let bot edits in latin content 1042 exceptions.append(re.compile('[^{fa}] *?"*? *?, *?[^{fa}]' 1043 .format(fa=faChrs))) 1044 text = textlib.replaceExcept(text, ',', '،', exceptions, 1045 site=self.site) 1046 if self.site.code == 'ckb': 1047 text = textlib.replaceExcept(text, 1048 '\u0647([.\u060c_<\\]\\s])', 1049 '\u06d5\\1', exceptions, 1050 site=self.site) 1051 text = textlib.replaceExcept(text, 'ه\u200c', 'ە', exceptions, 1052 site=self.site) 1053 text = textlib.replaceExcept(text, 'ه', 'ھ', exceptions, 1054 site=self.site) 1055 text = textlib.replaceExcept(text, 'ك', 'ک', exceptions, 1056 site=self.site) 1057 text = textlib.replaceExcept(text, '[ىي]', 'ی', exceptions, 1058 site=self.site) 1059 1060 return text 1061 1062 def commonsfiledesc(self, text: str) -> str: 1063 """ 1064 Clean up file descriptions on Wikimedia Commons. 1065 1066 It works according to [1] and works only on pages in the file 1067 namespace on Wikimedia Commons. 
1068 1069 [1]: 1070 https://commons.wikimedia.org/wiki/Commons:Tools/pywiki_file_description_cleanup 1071 """ 1072 if self.site.sitename != 'commons:commons' or self.namespace == 6: 1073 return text 1074 1075 # section headers to {{int:}} versions 1076 exceptions = ['comment', 'includeonly', 'math', 'noinclude', 'nowiki', 1077 'pre', 'syntaxhighlight', 'ref', 'timeline'] 1078 text = textlib.replaceExcept(text, 1079 r'([\r\n]|^)\=\= *Summary *\=\=', 1080 r'\1== {{int:filedesc}} ==', 1081 exceptions, True) 1082 text = textlib.replaceExcept( 1083 text, 1084 r'([\r\n])\=\= *\[\[Commons:Copyright tags\|Licensing\]\]: *\=\=', 1085 r'\1== {{int:license-header}} ==', exceptions, True) 1086 text = textlib.replaceExcept( 1087 text, 1088 r'([\r\n])' 1089 r'\=\= *(Licensing|License information|{{int:license}}) *\=\=', 1090 r'\1== {{int:license-header}} ==', exceptions, True) 1091 1092 # frequent field values to {{int:}} versions 1093 text = textlib.replaceExcept( 1094 text, 1095 r'([\r\n]\|[Ss]ource *\= *)' 1096 r'(?:[Oo]wn work by uploader|[Oo]wn work|[Ee]igene [Aa]rbeit) *' 1097 r'([\r\n])', 1098 r'\1{{own}}\2', exceptions, True) 1099 text = textlib.replaceExcept( 1100 text, 1101 r'(\| *Permission *\=) *(?:[Ss]ee below|[Ss]iehe unten) *([\r\n])', 1102 r'\1\2', exceptions, True) 1103 1104 # added to transwikied pages 1105 text = textlib.replaceExcept(text, r'__NOTOC__', '', exceptions, True) 1106 1107 # tracker element for js upload form 1108 text = textlib.replaceExcept( 1109 text, 1110 r'<!-- *{{ImageUpload\|(?:full|basic)}} *-->', 1111 '', exceptions[1:], True) 1112 text = textlib.replaceExcept(text, r'{{ImageUpload\|(?:basic|full)}}', 1113 '', exceptions, True) 1114 1115 # duplicated section headers 1116 text = textlib.replaceExcept( 1117 text, 1118 r'([\r\n]|^)\=\= *{{int:filedesc}} *\=\=(?:[\r\n ]*)\=\= *' 1119 r'{{int:filedesc}} *\=\=', 1120 r'\1== {{int:filedesc}} ==', exceptions, True) 1121 text = textlib.replaceExcept( 1122 text, 1123 r'([\r\n]|^)\=\= 
*{{int:license-header}} *\=\=(?:[\r\n ]*)' 1124 r'\=\= *{{int:license-header}} *\=\=', 1125 r'\1== {{int:license-header}} ==', exceptions, True) 1126 return text 1127 1128 def fix_ISBN(self, text: str) -> str: 1129 """Hyphenate ISBN numbers.""" 1130 return _reformat_ISBNs(text, strict=self.ignore != CANCEL.MATCH) 1131 1132 1133_CANCEL_ALL = CANCEL.ALL 1134_CANCEL_PAGE = CANCEL.PAGE 1135_CANCEL_METHOD = CANCEL.METHOD 1136_CANCEL_MATCH = CANCEL.MATCH 1137 1138wrapper = ModuleDeprecationWrapper(__name__) 1139wrapper.add_deprecated_attr('CANCEL_ALL', _CANCEL_ALL, 1140 replacement_name='CANCEL.ALL', 1141 since='20210528') 1142wrapper.add_deprecated_attr('CANCEL_PAGE', _CANCEL_PAGE, 1143 replacement_name='CANCEL.PAGE', 1144 since='20210528') 1145wrapper.add_deprecated_attr('CANCEL_METHOD', _CANCEL_METHOD, 1146 replacement_name='CANCEL.METHOD', 1147 since='20210528') 1148wrapper.add_deprecated_attr('CANCEL_MATCH', _CANCEL_MATCH, 1149 replacement_name='CANCEL.MATCH', 1150 since='20210528') 1151