<?php
/**
 * Check for articles to fix after adding/deleting namespaces
 *
 * Copyright © 2005-2007 Brion Vibber <brion@pobox.com>
 * https://www.mediawiki.org/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 * @ingroup Maintenance
 */

require_once __DIR__ . '/Maintenance.php';

use MediaWiki\Linker\LinkTarget;
use MediaWiki\MediaWikiServices;
use Wikimedia\Rdbms\IDatabase;
use Wikimedia\Rdbms\IMaintainableDatabase;
use Wikimedia\Rdbms\IResultWrapper;

/**
 * Maintenance script that checks for articles to fix after
 * adding/deleting namespaces.
 *
 * Pages whose title in the main (or, with --move-talk, the talk) namespace
 * starts with "<prefix>:" are reported and, with --fix, moved or merged into
 * the namespace that now owns that prefix. Link tables are then scanned for
 * destinations that carry the same stale prefix.
 *
 * @ingroup Maintenance
 */
class NamespaceDupes extends Maintenance {

	/**
	 * @var IMaintainableDatabase
	 */
	protected $db;

	/**
	 * Total number of pages that need fixing that are automatically resolvable
	 * @var int
	 */
	private $resolvablePages = 0;

	/**
	 * Total number of pages that need fixing
	 * @var int
	 */
	private $totalPages = 0;

	/**
	 * Total number of links that need fixing that are automatically resolvable
	 * @var int
	 */
	private $resolvableLinks = 0;

	/**
	 * Total number of erroneous links
	 * @var int
	 */
	private $totalLinks = 0;

	/**
	 * Total number of links deleted because they weren't automatically resolvable due to the
	 * target already existing
	 * @var int
	 */
	private $deletedLinks = 0;

	public function __construct() {
		parent::__construct();
		$this->addDescription( 'Find and fix pages affected by namespace addition/removal' );
		$this->addOption( 'fix', 'Attempt to automatically fix errors and delete broken links' );
		$this->addOption( 'merge', "Instead of renaming conflicts, do a history merge with " .
			"the correct title" );
		$this->addOption( 'add-suffix', "Dupes will be renamed with correct namespace with " .
			"<text> appended after the article name", false, true );
		$this->addOption( 'add-prefix', "Dupes will be renamed with correct namespace with " .
			"<text> prepended before the article name", false, true );
		$this->addOption( 'source-pseudo-namespace', "Move all pages with the given source " .
			"prefix (with an implied colon following it). If --dest-namespace is not specified, " .
			"the colon will be replaced with a hyphen.",
			false, true );
		$this->addOption( 'dest-namespace', "In combination with --source-pseudo-namespace, " .
			"specify the namespace ID of the destination.", false, true );
		$this->addOption( 'move-talk', "If this is specified, pages in the Talk namespace that " .
			"begin with a conflicting prefix will be renamed, for example " .
			"Talk:File:Foo -> File_Talk:Foo" );
	}

	/**
	 * Collect the command-line options, run either the single-prefix check or
	 * the check of every known namespace name, and print a one-line verdict.
	 */
	public function execute() {
		$options = [
			'fix' => $this->hasOption( 'fix' ),
			'merge' => $this->hasOption( 'merge' ),
			'add-suffix' => $this->getOption( 'add-suffix', '' ),
			'add-prefix' => $this->getOption( 'add-prefix', '' ),
			'move-talk' => $this->hasOption( 'move-talk' ),
			'source-pseudo-namespace' => $this->getOption( 'source-pseudo-namespace', '' ),
			'dest-namespace' => intval( $this->getOption( 'dest-namespace', 0 ) )
		];

		if ( $options['source-pseudo-namespace'] !== '' ) {
			$retval = $this->checkPrefix( $options );
		} else {
			$retval = $this->checkAll( $options );
		}

		if ( $retval ) {
			$this->output( "\nLooks good!\n" );
		} else {
			$this->output( "\nOh noeees\n" );
		}
	}

	/**
	 * Check all namespaces
	 *
	 * Builds a map of every prefix that could shadow a page title (interwiki
	 * prefixes, canonical names, localised names, aliases and their case
	 * variants), checks pages for each prefix, then checks the link tables
	 * for each prefix that maps to a non-zero namespace.
	 *
	 * @param array $options Associative array of validated command-line options
	 *
	 * @return bool True if every affected page could be handled
	 */
	private function checkAll( $options ) {
		$contLang = MediaWikiServices::getInstance()->getContentLanguage();
		$spaces = [];

		// List interwikis first, so they'll be overridden
		// by any conflicting local namespaces.
		foreach ( $this->getInterwikiList() as $prefix ) {
			$name = $contLang->ucfirst( $prefix );
			$spaces[$name] = 0;
		}

		// Now pull in all canonical and alias namespaces...
		foreach (
			MediaWikiServices::getInstance()->getNamespaceInfo()->getCanonicalNamespaces()
			as $ns => $name
		) {
			// This includes $wgExtraNamespaces
			if ( $name !== '' ) {
				$spaces[$name] = $ns;
			}
		}
		foreach ( $contLang->getNamespaces() as $ns => $name ) {
			if ( $name !== '' ) {
				$spaces[$name] = $ns;
			}
		}
		foreach ( $contLang->getNamespaceAliases() as $name => $ns ) {
			$spaces[$name] = $ns;
		}

		// We'll need to check for lowercase keys as well,
		// since we're doing case-sensitive searches in the db.
		$capitalLinks = $this->getConfig()->get( 'CapitalLinks' );
		foreach ( $spaces as $name => $ns ) {
			$moreNames = [];
			$moreNames[] = $contLang->uc( $name );
			$moreNames[] = $contLang->ucfirst( $contLang->lc( $name ) );
			$moreNames[] = $contLang->ucwords( $name );
			$moreNames[] = $contLang->ucwords( $contLang->lc( $name ) );
			$moreNames[] = $contLang->ucwordbreaks( $name );
			$moreNames[] = $contLang->ucwordbreaks( $contLang->lc( $name ) );
			if ( !$capitalLinks ) {
				// foreach iterates over a copy, so appending here is safe and
				// only the variants collected above are visited.
				foreach ( $moreNames as $altName ) {
					$moreNames[] = $contLang->lcfirst( $altName );
				}
				$moreNames[] = $contLang->lcfirst( $name );
			}
			foreach ( array_unique( $moreNames ) as $altName ) {
				if ( $altName !== $name ) {
					$spaces[$altName] = $ns;
				}
			}
		}

		// Sort by namespace index, and if there are two with the same index,
		// break the tie by sorting by name
		$origSpaces = $spaces;
		uksort( $spaces, static function ( $a, $b ) use ( $origSpaces ) {
			return $origSpaces[$a] <=> $origSpaces[$b]
				?: $a <=> $b;
		} );

		$ok = true;
		foreach ( $spaces as $name => $ns ) {
			// Call first, then AND: every namespace is checked even after a failure.
			$ok = $this->checkNamespace( $ns, $name, $options ) && $ok;
		}

		$this->output(
			"{$this->totalPages} pages to fix, " .
			"{$this->resolvablePages} were resolvable.\n\n"
		);

		foreach ( $spaces as $name => $ns ) {
			if ( $ns != 0 ) {
				/* Fix up link destinations for non-interwiki links only.
				 *
				 * For example if a page has [[Foo:Bar]] and then a Foo namespace
				 * is introduced, pagelinks needs to be updated to have
				 * page_namespace = NS_FOO.
				 *
				 * If instead an interwiki prefix was introduced called "Foo",
				 * the link should instead be moved to the iwlinks table. If a new
				 * language is introduced called "Foo", or if there is a pagelink
				 * [[fr:Bar]] when interlanguage magic links are turned on, the
				 * link would have to be moved to the langlinks table. Let's put
				 * those cases in the too-hard basket for now. The consequences are
				 * not especially severe.
				 * @fixme Handle interwiki links, and pagelinks to Category:, File:
				 * which probably need reparsing.
				 */

				$this->checkLinkTable( 'pagelinks', 'pl', $ns, $name, $options );
				$this->checkLinkTable( 'templatelinks', 'tl', $ns, $name, $options );

				// The redirect table has interwiki links randomly mixed in, we
				// need to filter those out. For example [[w:Foo:Bar]] would
				// have rd_interwiki=w and rd_namespace=0, which would match the
				// query for a conflicting namespace "Foo" if filtering wasn't done.
				$this->checkLinkTable( 'redirect', 'rd', $ns, $name, $options,
					[ 'rd_interwiki' => null ] );
				$this->checkLinkTable( 'redirect', 'rd', $ns, $name, $options,
					[ 'rd_interwiki' => '' ] );
			}
		}

		$this->output(
			"{$this->totalLinks} links to fix, " .
			"{$this->resolvableLinks} were resolvable, " .
			"{$this->deletedLinks} were deleted.\n"
		);

		return $ok;
	}

	/**
	 * Get the list of interwiki prefixes known to the interwiki lookup.
	 *
	 * @return string[]
	 */
	private function getInterwikiList() {
		$result = MediaWikiServices::getInstance()->getInterwikiLookup()->getAllPrefixes();
		return array_column( $result, 'iw_prefix' );
	}

	/**
	 * Check a given prefix and try to move it into the given destination namespace
	 *
	 * For every page returned by getTargetList() an action is chosen (abort,
	 * move or merge), logged, and - only when the 'fix' option is set -
	 * carried out.
	 *
	 * @param int $ns Destination namespace id
	 * @param string $name
	 * @param array $options Associative array of validated command-line options
	 * @return bool True if no page had to be aborted
	 */
	private function checkNamespace( $ns, $name, $options ) {
		$targets = $this->getTargetList( $ns, $name, $options );
		$count = $targets->numRows();
		$this->totalPages += $count;
		if ( $count == 0 ) {
			return true;
		}

		$dryRunNote = $options['fix'] ? '' : ' DRY RUN ONLY';

		$ok = true;
		foreach ( $targets as $row ) {
			// Find the new title and determine the action to take

			$newTitle = $this->getDestinationTitle(
				$ns, $name, $row->page_namespace, $row->page_title );
			$logStatus = false;
			if ( !$newTitle ) {
				$logStatus = 'invalid title';
				$action = 'abort';
			} elseif ( $newTitle->exists() ) {
				if ( $options['merge'] ) {
					// canMerge() fills in $logStatus when it refuses
					if ( $this->canMerge( $row->page_id, $newTitle, $logStatus ) ) {
						$action = 'merge';
					} else {
						$action = 'abort';
					}
				} elseif ( $options['add-prefix'] == '' && $options['add-suffix'] == '' ) {
					$action = 'abort';
					$logStatus = 'dest title exists and --add-prefix not specified';
				} else {
					$newTitle = $this->getAlternateTitle( $newTitle, $options );
					if ( !$newTitle ) {
						$action = 'abort';
						$logStatus = 'alternate title is invalid';
					} elseif ( $newTitle->exists() ) {
						// Defensive: getAlternateTitle() only returns free titles
						$action = 'abort';
						$logStatus = 'title conflict';
					} else {
						$action = 'move';
						$logStatus = 'alternate';
					}
				}
			} else {
				$action = 'move';
				$logStatus = 'no conflict';
			}

			// Take the action or log a dry run message

			$logTitle = "id={$row->page_id} ns={$row->page_namespace} dbk={$row->page_title}";
			$pageOK = true;

			switch ( $action ) {
				case 'abort':
					$this->output( "$logTitle *** $logStatus\n" );
					$pageOK = false;
					break;
				case 'move':
					$this->output( "$logTitle -> " .
						$newTitle->getPrefixedDBkey() . " ($logStatus)$dryRunNote\n" );

					if ( $options['fix'] ) {
						$pageOK = $this->movePage( $row->page_id, $newTitle );
					}
					break;
				case 'merge':
					$this->output( "$logTitle => " .
						$newTitle->getPrefixedDBkey() . " (merge)$dryRunNote\n" );

					if ( $options['fix'] ) {
						$pageOK = $this->mergePage( $row, $newTitle );
					}
					break;
			}

			if ( $pageOK ) {
				$this->resolvablePages++;
			} else {
				$ok = false;
			}
		}

		return $ok;
	}

	/**
	 * Check and repair the destination fields in a link table
	 *
	 * Rows are read in batches ordered by (title, from). With the 'fix'
	 * option each row is updated to the new namespace/title using IGNORE;
	 * rows that still carry the old destination after that are deleted and
	 * counted in $deletedLinks instead of $resolvableLinks.
	 *
	 * @param string $table The link table name
	 * @param string $fieldPrefix The field prefix in the link table
	 * @param int $ns Destination namespace id
	 * @param string $name
	 * @param array $options Associative array of validated command-line options
	 * @param array $extraConds Extra conditions for the SQL query
	 */
	private function checkLinkTable( $table, $fieldPrefix, $ns, $name, $options,
		$extraConds = []
	) {
		$dbw = $this->getDB( DB_PRIMARY );

		$batchConds = [];
		$fromField = "{$fieldPrefix}_from";
		$namespaceField = "{$fieldPrefix}_namespace";
		$titleField = "{$fieldPrefix}_title";
		$batchSize = 500;
		$lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory();
		while ( true ) {
			$res = $dbw->select(
				$table,
				[ $fromField, $namespaceField, $titleField ],
				array_merge(
					$batchConds,
					$extraConds,
					[
						$namespaceField => 0,
						$titleField . $dbw->buildLike( "$name:", $dbw->anyString() )
					]
				),
				__METHOD__,
				[
					'ORDER BY' => [ $titleField, $fromField ],
					'LIMIT' => $batchSize
				]
			);

			if ( $res->numRows() == 0 ) {
				break;
			}

			$rowsToDeleteIfStillExists = [];

			foreach ( $res as $row ) {
				$logTitle = "from={$row->$fromField} ns={$row->$namespaceField} " .
					"dbk={$row->$titleField}";
				$destTitle = $this->getDestinationTitle(
					$ns, $name, $row->$namespaceField, $row->$titleField );
				$this->totalLinks++;
				if ( !$destTitle ) {
					$this->output( "$table $logTitle *** INVALID\n" );
					continue;
				}
				$this->resolvableLinks++;
				if ( !$options['fix'] ) {
					$this->output( "$table $logTitle -> " .
						$destTitle->getPrefixedDBkey() . " DRY RUN\n" );
					continue;
				}

				$dbw->update( $table,
					// SET
					[
						$namespaceField => $destTitle->getNamespace(),
						$titleField => $destTitle->getDBkey()
					],
					// WHERE
					[
						$namespaceField => 0,
						$titleField => $row->$titleField,
						$fromField => $row->$fromField
					],
					__METHOD__,
					[ 'IGNORE' ]
				);

				// Remember the old row; if the IGNOREd update left it in place
				// it is removed by the batch delete below.
				$rowsToDeleteIfStillExists[] = $dbw->makeList(
					[
						$fromField => $row->$fromField,
						$namespaceField => $row->$namespaceField,
						$titleField => $row->$titleField,
					],
					IDatabase::LIST_AND
				);

				$this->output( "$table $logTitle -> " .
					$destTitle->getPrefixedDBkey() . "\n"
				);
			}

			if ( $options['fix'] && count( $rowsToDeleteIfStillExists ) > 0 ) {
				$dbw->delete(
					$table,
					$dbw->makeList( $rowsToDeleteIfStillExists, IDatabase::LIST_OR ),
					__METHOD__
				);

				// Deleted rows were counted as resolvable above; move them over.
				$deleted = $dbw->affectedRows();
				$this->deletedLinks += $deleted;
				$this->resolvableLinks -= $deleted;
			}

			// $row is the last row of this (non-empty) batch; continue after it.
			$encLastTitle = $dbw->addQuotes( $row->$titleField );
			$encLastFrom = $dbw->addQuotes( $row->$fromField );

			$batchConds = [
				"$titleField > $encLastTitle " .
				"OR ($titleField = $encLastTitle AND $fromField > $encLastFrom)"
			];

			$lbFactory->waitForReplication();
		}
	}

	/**
	 * Move the given pseudo-namespace, either replacing the colon with a hyphen
	 * (useful for pseudo-namespaces that conflict with interwiki links) or move
	 * them to another namespace if specified.
	 * @param array $options Associative array of validated command-line options
	 * @return bool
	 */
	private function checkPrefix( $options ) {
		$prefix = $options['source-pseudo-namespace'];
		$ns = $options['dest-namespace'];
		$this->output( "Checking prefix \"$prefix\" vs namespace $ns\n" );

		return $this->checkNamespace( $ns, $prefix, $options );
	}

	/**
	 * Find pages in main and talk namespaces that have a prefix of the new
	 * namespace so we know titles that will need migrating
	 *
	 * The talk namespace is only searched when the 'move-talk' option is set
	 * and $ns is a subject namespace.
	 *
	 * @param int $ns Destination namespace id
	 * @param string $name Prefix that is being made a namespace
	 * @param array $options Associative array of validated command-line options
	 *
	 * @return IResultWrapper
	 */
	private function getTargetList( $ns, $name, $options ) {
		$dbw = $this->getDB( DB_PRIMARY );

		if (
			$options['move-talk'] &&
			MediaWikiServices::getInstance()->getNamespaceInfo()->isSubject( $ns )
		) {
			$checkNamespaces = [ NS_MAIN, NS_TALK ];
		} else {
			$checkNamespaces = NS_MAIN;
		}

		return $dbw->select( 'page',
			[
				'page_id',
				'page_title',
				'page_namespace',
			],
			[
				'page_namespace' => $checkNamespaces,
				'page_title' . $dbw->buildLike( "$name:", $dbw->anyString() ),
			],
			__METHOD__
		);
	}

	/**
	 * Get the preferred destination title for a given target page.
	 * @param int $ns The destination namespace ID
	 * @param string $name The conflicting prefix
	 * @param int $sourceNs The source namespace
	 * @param string $sourceDbk The source DB key (i.e. page_title)
	 * @return Title|false False if the title is invalid or cannot exist
	 */
	private function getDestinationTitle( $ns, $name, $sourceNs, $sourceDbk ) {
		// Strip the leading "<name>:" from the source key
		$dbk = substr( $sourceDbk, strlen( "$name:" ) );
		if ( $ns == 0 ) {
			// An interwiki; try an alternate encoding with '-' for ':'
			$dbk = "$name-" . $dbk;
		}
		$destNS = $ns;
		$nsInfo = MediaWikiServices::getInstance()->getNamespaceInfo();
		if ( $sourceNs == NS_TALK && $nsInfo->isSubject( $ns ) ) {
			// This is an associated talk page moved with the --move-talk feature.
			$destNS = $nsInfo->getTalk( $destNS );
		}
		$newTitle = Title::makeTitleSafe( $destNS, $dbk );
		if ( !$newTitle || !$newTitle->canExist() ) {
			return false;
		}
		return $newTitle;
	}

	/**
	 * Get an alternative title to move a page to. This is used if the
	 * preferred destination title already exists.
	 *
	 * The configured prefix/suffix is applied to the DB key. If the resulting
	 * title is already taken, the prefix/suffix is applied again to that
	 * candidate, and so on, until a free title is found or no valid title can
	 * be built any more.
	 *
	 * @param LinkTarget $linkTarget
	 * @param array $options Associative array of validated command-line options
	 * @return Title|bool The first free alternate title, or false if there is none
	 */
	private function getAlternateTitle( LinkTarget $linkTarget, $options ) {
		$prefix = $options['add-prefix'];
		$suffix = $options['add-suffix'];
		if ( $prefix == '' && $suffix == '' ) {
			return false;
		}
		while ( true ) {
			$dbk = $prefix . $linkTarget->getDBkey() . $suffix;
			$title = Title::makeTitleSafe( $linkTarget->getNamespace(), $dbk );
			if ( !$title ) {
				// Not a valid title - presumably this also covers a key that
				// has grown too long after repeated rounds (TODO confirm).
				return false;
			}
			if ( !$title->exists() ) {
				return $title;
			}
			if ( $title->getDBkey() === $linkTarget->getDBkey() ) {
				// Normalisation swallowed the prefix/suffix, so another round
				// would produce the same taken title again: give up.
				return false;
			}
			// Candidate is taken: build the next one on top of it. Without this
			// the loop would rebuild the same title forever.
			$linkTarget = $title;
		}
	}

	/**
	 * Move a page
	 *
	 * Rewrites the page row in place and updates the *_from_namespace field
	 * of the pagelinks, templatelinks and imagelinks rows that originate from
	 * the page.
	 *
	 * @param int $id The page_id
	 * @param LinkTarget $newLinkTarget The new title link target
	 * @return bool
	 */
	private function movePage( $id, LinkTarget $newLinkTarget ) {
		$dbw = $this->getDB( DB_PRIMARY );

		$dbw->update( 'page',
			[
				"page_namespace" => $newLinkTarget->getNamespace(),
				"page_title" => $newLinkTarget->getDBkey(),
			],
			[
				"page_id" => $id,
			],
			__METHOD__
		);

		// Update *_from_namespace in links tables
		$fromNamespaceTables = [
			[ 'pagelinks', 'pl' ],
			[ 'templatelinks', 'tl' ],
			[ 'imagelinks', 'il' ]
		];
		foreach ( $fromNamespaceTables as [ $table, $fieldPrefix ] ) {
			$dbw->update( $table,
				// SET
				[ "{$fieldPrefix}_from_namespace" => $newLinkTarget->getNamespace() ],
				// WHERE
				[ "{$fieldPrefix}_from" => $id ],
				__METHOD__
			);
		}

		return true;
	}

	/**
	 * Determine if we can merge a page.
	 * We check if an inaccessible revision would become the latest and
	 * deny the merge if so -- it's theoretically possible to update the
	 * latest revision, but opens a can of worms -- search engine updates,
	 * recentchanges review, etc.
	 *
	 * The merge is also denied when the latest revision of either page
	 * cannot be loaded, since the timestamps cannot be compared then.
	 *
	 * @param int $id The page_id
	 * @param LinkTarget $linkTarget The new link target
	 * @param string &$logStatus This is set to the log status message on failure
	 * @return bool
	 */
	private function canMerge( $id, LinkTarget $linkTarget, &$logStatus ) {
		$revisionLookup = MediaWikiServices::getInstance()->getRevisionLookup();
		$latestDest = $revisionLookup->getRevisionByTitle( $linkTarget, 0,
			IDBAccessObject::READ_LATEST );
		$latestSource = $revisionLookup->getRevisionByPageId( $id, 0,
			IDBAccessObject::READ_LATEST );
		if ( !$latestDest || !$latestSource ) {
			// Avoid a fatal error on a page row without a loadable revision
			$logStatus = 'cannot merge since latest revision could not be loaded';
			return false;
		}
		if ( $latestSource->getTimestamp() > $latestDest->getTimestamp() ) {
			$logStatus = 'cannot merge since source is later';
			return false;
		}
		return true;
	}

	/**
	 * Merge page histories
	 *
	 * Re-points all revisions of the source page at the destination page and
	 * deletes the source page row in one transaction, then runs a
	 * LinksDeletionUpdate for the removed source page.
	 *
	 * @param stdClass $row Page row
	 * @param Title $newTitle The new title
	 * @return bool
	 */
	private function mergePage( $row, Title $newTitle ) {
		$dbw = $this->getDB( DB_PRIMARY );

		$id = $row->page_id;

		// Construct the WikiPage object we will need later, while the
		// page_id still exists. Note that this cannot use makeTitleSafe(),
		// we are deliberately constructing an invalid title.
		$sourceTitle = Title::makeTitle( $row->page_namespace, $row->page_title );
		$sourceTitle->resetArticleID( $id );
		$wikiPage = new WikiPage( $sourceTitle );
		$wikiPage->loadPageData( 'fromdbmaster' );

		$destId = $newTitle->getArticleID();
		$this->beginTransaction( $dbw, __METHOD__ );
		$dbw->update( 'revision',
			// SET
			[ 'rev_page' => $destId ],
			// WHERE
			[ 'rev_page' => $id ],
			__METHOD__
		);

		$dbw->delete( 'page', [ 'page_id' => $id ], __METHOD__ );

		$this->commitTransaction( $dbw, __METHOD__ );

		/* Call LinksDeletionUpdate to delete outgoing links from the old title,
		 * and update category counts.
		 *
		 * Calling external code with a fake broken Title is a fairly dubious
		 * idea. It's necessary because it's quite a lot of code to duplicate,
		 * but that also makes it fragile since it would be easy for someone to
		 * accidentally introduce an assumption of title validity to the code we
		 * are calling.
		 */
		DeferredUpdates::addUpdate( new LinksDeletionUpdate( $wikiPage ) );
		DeferredUpdates::doUpdates();

		return true;
	}
}

$maintClass = NamespaceDupes::class;
require_once RUN_MAINTENANCE_IF_MAIN;