<?php
/**
 * Refresh link tables.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 * @ingroup Maintenance
 */

use MediaWiki\MediaWikiServices;
use MediaWiki\Revision\RevisionRecord;
use Wikimedia\Rdbms\IDatabase;

require_once __DIR__ . '/Maintenance.php';

/**
 * Maintenance script to refresh link tables.
 *
 * @ingroup Maintenance
 */
class RefreshLinks extends Maintenance {
	/** Number of processed pages between progress reports and replication waits. */
	private const REPORTING_INTERVAL = 100;

	/** @var int|bool Namespace to restrict processing to, or false for no restriction */
	protected $namespace = false;

	/**
	 * Register the script description, its options, the optional "start"
	 * argument and a default batch size of 100.
	 */
	public function __construct() {
		parent::__construct();
		$this->addDescription( 'Refresh link tables' );
		$this->addOption( 'dfn-only', 'Delete links from nonexistent articles only' );
		$this->addOption( 'new-only', 'Only affect articles with just a single edit' );
		$this->addOption( 'redirects-only', 'Only fix redirects, not all links' );
		$this->addOption( 'old-redirects-only', 'Only fix redirects with no redirect table entry' );
		$this->addOption( 'e', 'Last page id to refresh', false, true );
		$this->addOption( 'dfn-chunk-size', 'Maximum number of existent IDs to check per ' .
			'query, default 100000', false, true );
		$this->addOption( 'namespace', 'Only fix pages in this namespace', false, true );
		$this->addOption( 'category', 'Only fix pages in this category', false, true );
		$this->addOption( 'tracking-category', 'Only fix pages in this tracking category', false, true );
		$this->addArg( 'start', 'Page_id to start from, default 1', false );
		$this->setBatchSize( 100 );
	}

	/**
	 * Script entry point.
	 *
	 * Dispatches, in this order of precedence, to:
	 *  - refreshCategory() when --category is given,
	 *  - refreshTrackingCategory() when --tracking-category is given,
	 *  - doRefreshLinks() followed by an unbounded deleteLinksFromNonexistent()
	 *    when --dfn-only is absent,
	 *  - deleteLinksFromNonexistent() limited to the start/end range otherwise.
	 */
	public function execute() {
		// Note that there is a difference between not specifying the start
		// and end IDs and using the minimum and maximum values from the page
		// table. In the latter case, deleteLinksFromNonexistent() will not
		// delete entries for nonexistent IDs that fall outside the range.
		$start = (int)$this->getArg( 0 ) ?: null;
		$end = (int)$this->getOption( 'e' ) ?: null;
		$dfnChunkSize = (int)$this->getOption( 'dfn-chunk-size', 100000 );
		if ( $dfnChunkSize < 1 ) {
			// The chunk size is used as an SQL OFFSET when looking for the start
			// of the next chunk. With an offset of zero that query keeps returning
			// the same page_id, so deleteLinksFromNonexistent() would never
			// terminate; a negative offset is not valid at all.
			$this->fatalError( "dfn-chunk-size must be a positive integer\n" );
		}

		$ns = $this->getOption( 'namespace' );
		if ( $ns === null ) {
			$this->namespace = false;
		} else {
			$this->namespace = (int)$ns;
		}

		if ( $this->hasOption( 'category' ) ) {
			$category = $this->getOption( 'category' );
			$title = Title::makeTitleSafe( NS_CATEGORY, $category );
			if ( !$title ) {
				$this->fatalError( "'$category' is an invalid category name!\n" );
			}
			$this->refreshCategory( $title );
		} elseif ( $this->hasOption( 'tracking-category' ) ) {
			// The option must be read back under the exact name it was
			// registered with in the constructor ('tracking-category').
			$this->refreshTrackingCategory( $this->getOption( 'tracking-category' ) );
		} elseif ( !$this->hasOption( 'dfn-only' ) ) {
			$new = $this->hasOption( 'new-only' );
			$redir = $this->hasOption( 'redirects-only' );
			$oldRedir = $this->hasOption( 'old-redirects-only' );
			$this->doRefreshLinks( $start, $new, $end, $redir, $oldRedir );
			$this->deleteLinksFromNonexistent( null, null, $this->getBatchSize(), $dfnChunkSize );
		} else {
			$this->deleteLinksFromNonexistent( $start, $end, $this->getBatchSize(), $dfnChunkSize );
		}
	}

	/**
	 * Build the query condition for the namespace restriction.
	 *
	 * @return array [ 'page_namespace' => int ] when a namespace was selected,
	 *  an empty array otherwise
	 */
	private function namespaceCond() {
		return $this->namespace !== false
			? [ 'page_namespace' => $this->namespace ]
			: [];
	}

	/**
	 * Do the actual link refreshing.
	 * @param int|null $start Page_id to start from
	 * @param bool $newOnly Only do pages with 1 edit
	 * @param int|null $end Page_id to stop at
	 * @param bool $redirectsOnly Only fix redirects
	 * @param bool $oldRedirectsOnly Only fix redirects without redirect entries
	 */
	private function doRefreshLinks( $start, $newOnly = false,
		$end = null, $redirectsOnly = false, $oldRedirectsOnly = false
	) {
		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

		if ( $start === null ) {
			$start = 1;
		}

		// Give extensions a chance to optimize settings
		$this->getHookRunner()->onMaintenanceRefreshLinksInit( $this );

		$what = $redirectsOnly ? "redirects" : "links";
		$lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory();

		if ( $oldRedirectsOnly ) {
			# This entire code path is cut-and-pasted from below. Hurrah.

			// Pages flagged as redirects that have no row in the redirect table
			$conds = [
				"page_is_redirect=1",
				"rd_from IS NULL",
				self::intervalCond( $dbr, 'page_id', $start, $end ),
			] + $this->namespaceCond();

			$res = $dbr->select(
				[ 'page', 'redirect' ],
				'page_id',
				$conds,
				__METHOD__,
				[],
				[ 'redirect' => [ "LEFT JOIN", "page_id=rd_from" ] ]
			);
			$num = $res->numRows();
			$this->output( "Refreshing $num old redirects from $start...\n" );

			$i = 0;

			foreach ( $res as $row ) {
				if ( !( ++$i % self::REPORTING_INTERVAL ) ) {
					$this->output( "$i\n" );
					$lbFactory->waitForReplication();
				}
				$this->fixRedirect( $row->page_id );
			}
		} elseif ( $newOnly ) {
			$this->output( "Refreshing $what from " );
			$res = $dbr->select( 'page',
				[ 'page_id' ],
				[
					'page_is_new' => 1,
					self::intervalCond( $dbr, 'page_id', $start, $end ),
				] + $this->namespaceCond(),
				__METHOD__
			);
			$num = $res->numRows();
			$this->output( "$num new articles...\n" );

			$i = 0;
			foreach ( $res as $row ) {
				if ( !( ++$i % self::REPORTING_INTERVAL ) ) {
					$this->output( "$i\n" );
					$lbFactory->waitForReplication();
				}
				if ( $redirectsOnly ) {
					$this->fixRedirect( $row->page_id );
				} else {
					self::fixLinksFromArticle( $row->page_id, $this->namespace );
				}
			}
		} else {
			if ( !$end ) {
				// No explicit end: cover every ID known to either table
				$maxPage = $dbr->selectField( 'page', 'max(page_id)', '', __METHOD__ );
				$maxRD = $dbr->selectField( 'redirect', 'max(rd_from)', '', __METHOD__ );
				$end = max( $maxPage, $maxRD );
			}
			$this->output( "Refreshing redirects table.\n" );
			$this->output( "Starting from page_id $start of $end.\n" );

			for ( $id = $start; $id <= $end; $id++ ) {
				if ( !( $id % self::REPORTING_INTERVAL ) ) {
					$this->output( "$id\n" );
					$lbFactory->waitForReplication();
				}
				$this->fixRedirect( $id );
			}

			if ( !$redirectsOnly ) {
				$this->output( "Refreshing links tables.\n" );
				$this->output( "Starting from page_id $start of $end.\n" );

				for ( $id = $start; $id <= $end; $id++ ) {
					if ( !( $id % self::REPORTING_INTERVAL ) ) {
						$this->output( "$id\n" );
						$lbFactory->waitForReplication();
					}
					self::fixLinksFromArticle( $id, $this->namespace );
				}
			}
		}
	}

	/**
	 * Update the redirect entry for a given page.
	 *
	 * This method bypasses the "redirect" table to get the redirect target,
	 * and parses the page's content to fetch it. This allows to be sure that
	 * the redirect target is up to date and valid.
	 * This is particularly useful when modifying namespaces to be sure the
	 * entry in the "redirect" table points to the correct page and not to an
	 * invalid one.
	 *
	 * @param int $id The page ID to check
	 */
	private function fixRedirect( $id ) {
		$page = MediaWikiServices::getInstance()->getWikiPageFactory()->newFromID( $id );
		$dbw = $this->getDB( DB_PRIMARY );

		if ( $page === null ) {
			// This page doesn't exist (any more)
			// Delete any redirect table entry for it
			$dbw->delete( 'redirect', [ 'rd_from' => $id ],
				__METHOD__ );

			return;
		} elseif ( $this->namespace !== false
			&& !$page->getTitle()->inNamespace( $this->namespace )
		) {
			// Page exists but is outside the selected namespace: leave it alone
			return;
		}

		$rt = null;
		$content = $page->getContent( RevisionRecord::RAW );
		if ( $content !== null ) {
			$rt = $content->getUltimateRedirectTarget();
		}

		if ( $rt === null ) {
			// The page is not a redirect
			// Delete any redirect table entry for it
			$dbw->delete( 'redirect', [ 'rd_from' => $id ], __METHOD__ );
			$fieldValue = 0;
		} else {
			$page->insertRedirectEntry( $rt );
			$fieldValue = 1;
		}

		// Update the page table to be sure it is in a consistent state
		$dbw->update( 'page', [ 'page_is_redirect' => $fieldValue ],
			[ 'page_id' => $id ], __METHOD__ );
	}

	/**
	 * Run LinksUpdate for all links on a given page_id
	 * @param int $id The page_id
	 * @param int|bool $ns Only fix links if it is in this namespace
	 */
	public static function fixLinksFromArticle( $id, $ns = false ) {
		$services = MediaWikiServices::getInstance();
		$page = $services->getWikiPageFactory()->newFromID( $id );

		$services->getLinkCache()->clear();

		if ( $page === null ) {
			return;
		} elseif ( $ns !== false
			&& !$page->getTitle()->inNamespace( $ns ) ) {
			return;
		}

		// Defer updates to post-send but then immediately execute deferred updates;
		// this is the simplest way to run all updates immediately (including updates
		// scheduled by other updates).
		$page->doSecondaryDataUpdates( [
			'defer' => DeferredUpdates::POSTSEND,
			'recursive' => false,
		] );
		DeferredUpdates::doUpdates();
	}

	/**
	 * Removes non-existing links from pages from pagelinks, imagelinks,
	 * categorylinks, templatelinks, externallinks, interwikilinks, langlinks and redirect tables.
	 *
	 * @param int|null $start Page_id to start from
	 * @param int|null $end Page_id to stop at
	 * @param int $batchSize The size of deletion batches
	 * @param int $chunkSize Maximum number of existent IDs to check per query;
	 *  must be at least 1, because it is used as the OFFSET that locates the
	 *  start of the next chunk
	 *
	 * @author Merlijn van Deen <valhallasw@arctus.nl>
	 */
	private function deleteLinksFromNonexistent( $start = null, $end = null, $batchSize = 100,
		$chunkSize = 100000
	) {
		MediaWikiServices::getInstance()->getDBLoadBalancerFactory()->waitForReplication();
		$this->output( "Deleting illegal entries from the links tables...\n" );
		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
		do {
			// Find the start of the next chunk. This is based only
			// on existent page_ids.
			$nextStart = $dbr->selectField(
				'page',
				'page_id',
				[ self::intervalCond( $dbr, 'page_id', $start, $end ) ]
				+ $this->namespaceCond(),
				__METHOD__,
				[ 'ORDER BY' => 'page_id', 'OFFSET' => $chunkSize ]
			);

			if ( $nextStart !== false ) {
				// To find the end of the current chunk, subtract one.
				// This will serve to limit the number of rows scanned in
				// dfnCheckInterval(), per query, to at most the sum of
				// the chunk size and deletion batch size.
				$chunkEnd = $nextStart - 1;
			} else {
				// This is the last chunk. Check all page_ids up to $end.
				$chunkEnd = $end;
			}

			$fmtStart = $start !== null ? "[$start" : '(-INF';
			$fmtChunkEnd = $chunkEnd !== null ? "$chunkEnd]" : 'INF)';
			$this->output( " Checking interval $fmtStart, $fmtChunkEnd\n" );
			$this->dfnCheckInterval( $start, $chunkEnd, $batchSize );

			$start = $nextStart;

		} while ( $nextStart !== false );
	}

	/**
	 * @see RefreshLinks::deleteLinksFromNonexistent()
	 * @param int|null $start Page_id to start from
	 * @param int|null $end Page_id to stop at
	 * @param int $batchSize The size of deletion batches
	 */
	private function dfnCheckInterval( $start = null, $end = null, $batchSize = 100 ) {
		$dbw = $this->getDB( DB_PRIMARY );
		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

		$linksTables = [
			// table name => page_id field
			'pagelinks' => 'pl_from',
			'imagelinks' => 'il_from',
			'categorylinks' => 'cl_from',
			'templatelinks' => 'tl_from',
			'externallinks' => 'el_from',
			'iwlinks' => 'iwl_from',
			'langlinks' => 'll_from',
			'redirect' => 'rd_from',
			'page_props' => 'pp_page',
		];

		$lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory();
		foreach ( $linksTables as $table => $field ) {
			$this->output( " $table: 0" );
			$tableStart = $start;
			$counter = 0;
			do {
				// Source-page IDs in this table that have no matching page row
				$ids = $dbr->selectFieldValues(
					$table,
					$field,
					[
						self::intervalCond( $dbr, $field, $tableStart, $end ),
						"$field NOT IN ({$dbr->selectSQLText( 'page', 'page_id', [], __METHOD__ )})",
					],
					__METHOD__,
					[ 'DISTINCT', 'ORDER BY' => $field, 'LIMIT' => $batchSize ]
				);

				$numIds = count( $ids );
				if ( $numIds ) {
					$counter += $numIds;
					$dbw->delete( $table, [ $field => $ids ], __METHOD__ );
					$this->output( ", $counter" );
					// IDs are ordered, so resume just past the largest one deleted
					$tableStart = $ids[$numIds - 1] + 1;
					$lbFactory->waitForReplication();
				}

			} while ( $numIds >= $batchSize && ( $end === null || $tableStart <= $end ) );

			$this->output( " deleted.\n" );
		}
	}

	/**
	 * Build a SQL expression for a closed interval (i.e. BETWEEN).
	 *
	 * By specifying a null $start or $end, it is also possible to create
	 * half-bounded or unbounded intervals using this function.
	 *
	 * @param IDatabase $db
	 * @param string $var Field name
	 * @param mixed $start First value to include or null
	 * @param mixed $end Last value to include or null
	 * @return string
	 */
	private static function intervalCond( IDatabase $db, $var, $start, $end ) {
		if ( $start === null && $end === null ) {
			return "$var IS NOT NULL";
		} elseif ( $end === null ) {
			return "$var >= {$db->addQuotes( $start )}";
		} elseif ( $start === null ) {
			return "$var <= {$db->addQuotes( $end )}";
		} else {
			return "$var BETWEEN {$db->addQuotes( $start )} AND {$db->addQuotes( $end )}";
		}
	}

	/**
	 * Refreshes links for pages in a tracking category
	 *
	 * @param string $category Category key
	 */
	private function refreshTrackingCategory( $category ) {
		$cats = $this->getPossibleCategories( $category );

		if ( !$cats ) {
			$this->error( "Tracking category '$category' is disabled\n" );
			// Output to stderr but don't bail out.
		}

		foreach ( $cats as $cat ) {
			$this->refreshCategory( $cat );
		}
	}

	/**
	 * Refreshes links for the pages in a category
	 *
	 * @param Title $category
	 */
	private function refreshCategory( Title $category ) {
		$this->output( "Refreshing pages in category '{$category->getText()}'...\n" );

		$dbr = $this->getDB( DB_REPLICA );
		$conds = [
			'page_id=cl_from',
			'cl_to' => $category->getDBkey(),
		];
		if ( $this->namespace !== false ) {
			$conds['page_namespace'] = $this->namespace;
		}

		$i = 0;
		$timestamp = '';
		$lastId = 0;
		$lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory();
		do {
			// Each batch resumes strictly after the last (cl_timestamp, cl_from)
			// pair seen in the previous batch.
			$finalConds = $conds;
			$timestamp = $dbr->addQuotes( $timestamp );
			$finalConds[] =
				"(cl_timestamp > $timestamp OR (cl_timestamp = $timestamp AND cl_from > $lastId))";
			$res = $dbr->select( [ 'page', 'categorylinks' ],
				[ 'page_id', 'cl_timestamp' ],
				$finalConds,
				__METHOD__,
				[
					'ORDER BY' => [ 'cl_timestamp', 'cl_from' ],
					'LIMIT' => $this->getBatchSize(),
				]
			);

			foreach ( $res as $row ) {
				if ( !( ++$i % self::REPORTING_INTERVAL ) ) {
					$this->output( "$i\n" );
					$lbFactory->waitForReplication();
				}
				$lastId = $row->page_id;
				$timestamp = $row->cl_timestamp;
				self::fixLinksFromArticle( $row->page_id );
			}

			// A short batch means there are no further rows to fetch
		} while ( $res->numRows() == $this->getBatchSize() );
	}

	/**
	 * Returns a list of possible categories for a given tracking category key
	 *
	 * Aborts the script via fatalError() when the key is not a known
	 * tracking category.
	 *
	 * @param string $categoryKey
	 * @return Title[]
	 */
	private function getPossibleCategories( $categoryKey ) {
		$trackingCategories = new TrackingCategories( $this->getConfig() );
		$cats = $trackingCategories->getTrackingCategories();
		if ( isset( $cats[$categoryKey] ) ) {
			return $cats[$categoryKey]['cats'];
		}
		$this->fatalError( "Unknown tracking category {$categoryKey}\n" );
	}
}

$maintClass = RefreshLinks::class;
require_once RUN_MAINTENANCE_IF_MAIN;