1<?php
2/**
3 * Refresh link tables.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License along
16 * with this program; if not, write to the Free Software Foundation, Inc.,
17 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 * http://www.gnu.org/copyleft/gpl.html
19 *
20 * @file
21 * @ingroup Maintenance
22 */
23
24use MediaWiki\MediaWikiServices;
25use MediaWiki\Revision\RevisionRecord;
26use Wikimedia\Rdbms\IDatabase;
27
28require_once __DIR__ . '/Maintenance.php';
29
30/**
31 * Maintenance script to refresh link tables.
32 *
33 * @ingroup Maintenance
34 */
35class RefreshLinks extends Maintenance {
36	private const REPORTING_INTERVAL = 100;
37
38	/** @var int|bool */
39	protected $namespace = false;
40
41	public function __construct() {
42		parent::__construct();
43		$this->addDescription( 'Refresh link tables' );
44		$this->addOption( 'dfn-only', 'Delete links from nonexistent articles only' );
45		$this->addOption( 'new-only', 'Only affect articles with just a single edit' );
46		$this->addOption( 'redirects-only', 'Only fix redirects, not all links' );
47		$this->addOption( 'old-redirects-only', 'Only fix redirects with no redirect table entry' );
48		$this->addOption( 'e', 'Last page id to refresh', false, true );
49		$this->addOption( 'dfn-chunk-size', 'Maximum number of existent IDs to check per ' .
50			'query, default 100000', false, true );
51		$this->addOption( 'namespace', 'Only fix pages in this namespace', false, true );
52		$this->addOption( 'category', 'Only fix pages in this category', false, true );
53		$this->addOption( 'tracking-category', 'Only fix pages in this tracking category', false, true );
54		$this->addArg( 'start', 'Page_id to start from, default 1', false );
55		$this->setBatchSize( 100 );
56	}
57
58	public function execute() {
59		// Note that there is a difference between not specifying the start
60		// and end IDs and using the minimum and maximum values from the page
61		// table. In the latter case, deleteLinksFromNonexistent() will not
62		// delete entries for nonexistent IDs that fall outside the range.
63		$start = (int)$this->getArg( 0 ) ?: null;
64		$end = (int)$this->getOption( 'e' ) ?: null;
65		$dfnChunkSize = (int)$this->getOption( 'dfn-chunk-size', 100000 );
66
67		$ns = $this->getOption( 'namespace' );
68		if ( $ns === null ) {
69			$this->namespace = false;
70		} else {
71			$this->namespace = (int)$ns;
72		}
73
74		if ( $this->hasOption( 'category' ) ) {
75			$category = $this->getOption( 'category' );
76			$title = Title::makeTitleSafe( NS_CATEGORY, $category );
77			if ( !$title ) {
78				$this->fatalError( "'$category' is an invalid category name!\n" );
79			}
80			$this->refreshCategory( $title );
81		} elseif ( $this->hasOption( 'tracking-category' ) ) {
82			$this->refreshTrackingCategory( $this->getOption( 'trackingcategory' ) );
83		} elseif ( !$this->hasOption( 'dfn-only' ) ) {
84			$new = $this->hasOption( 'new-only' );
85			$redir = $this->hasOption( 'redirects-only' );
86			$oldRedir = $this->hasOption( 'old-redirects-only' );
87			$this->doRefreshLinks( $start, $new, $end, $redir, $oldRedir );
88			$this->deleteLinksFromNonexistent( null, null, $this->getBatchSize(), $dfnChunkSize );
89		} else {
90			$this->deleteLinksFromNonexistent( $start, $end, $this->getBatchSize(), $dfnChunkSize );
91		}
92	}
93
94	private function namespaceCond() {
95		return $this->namespace !== false
96			? [ 'page_namespace' => $this->namespace ]
97			: [];
98	}
99
100	/**
101	 * Do the actual link refreshing.
102	 * @param int|null $start Page_id to start from
103	 * @param bool $newOnly Only do pages with 1 edit
104	 * @param int|null $end Page_id to stop at
105	 * @param bool $redirectsOnly Only fix redirects
106	 * @param bool $oldRedirectsOnly Only fix redirects without redirect entries
107	 */
108	private function doRefreshLinks( $start, $newOnly = false,
109		$end = null, $redirectsOnly = false, $oldRedirectsOnly = false
110	) {
111		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
112
113		if ( $start === null ) {
114			$start = 1;
115		}
116
117		// Give extensions a chance to optimize settings
118		$this->getHookRunner()->onMaintenanceRefreshLinksInit( $this );
119
120		$what = $redirectsOnly ? "redirects" : "links";
121		$lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory();
122
123		if ( $oldRedirectsOnly ) {
124			# This entire code path is cut-and-pasted from below.  Hurrah.
125
126			$conds = [
127				"page_is_redirect=1",
128				"rd_from IS NULL",
129				self::intervalCond( $dbr, 'page_id', $start, $end ),
130			] + $this->namespaceCond();
131
132			$res = $dbr->select(
133				[ 'page', 'redirect' ],
134				'page_id',
135				$conds,
136				__METHOD__,
137				[],
138				[ 'redirect' => [ "LEFT JOIN", "page_id=rd_from" ] ]
139			);
140			$num = $res->numRows();
141			$this->output( "Refreshing $num old redirects from $start...\n" );
142
143			$i = 0;
144
145			foreach ( $res as $row ) {
146				if ( !( ++$i % self::REPORTING_INTERVAL ) ) {
147					$this->output( "$i\n" );
148					$lbFactory->waitForReplication();
149				}
150				$this->fixRedirect( $row->page_id );
151			}
152		} elseif ( $newOnly ) {
153			$this->output( "Refreshing $what from " );
154			$res = $dbr->select( 'page',
155				[ 'page_id' ],
156				[
157					'page_is_new' => 1,
158					self::intervalCond( $dbr, 'page_id', $start, $end ),
159				] + $this->namespaceCond(),
160				__METHOD__
161			);
162			$num = $res->numRows();
163			$this->output( "$num new articles...\n" );
164
165			$i = 0;
166			foreach ( $res as $row ) {
167				if ( !( ++$i % self::REPORTING_INTERVAL ) ) {
168					$this->output( "$i\n" );
169					$lbFactory->waitForReplication();
170				}
171				if ( $redirectsOnly ) {
172					$this->fixRedirect( $row->page_id );
173				} else {
174					self::fixLinksFromArticle( $row->page_id, $this->namespace );
175				}
176			}
177		} else {
178			if ( !$end ) {
179				$maxPage = $dbr->selectField( 'page', 'max(page_id)', '', __METHOD__ );
180				$maxRD = $dbr->selectField( 'redirect', 'max(rd_from)', '', __METHOD__ );
181				$end = max( $maxPage, $maxRD );
182			}
183			$this->output( "Refreshing redirects table.\n" );
184			$this->output( "Starting from page_id $start of $end.\n" );
185
186			for ( $id = $start; $id <= $end; $id++ ) {
187				if ( !( $id % self::REPORTING_INTERVAL ) ) {
188					$this->output( "$id\n" );
189					$lbFactory->waitForReplication();
190				}
191				$this->fixRedirect( $id );
192			}
193
194			if ( !$redirectsOnly ) {
195				$this->output( "Refreshing links tables.\n" );
196				$this->output( "Starting from page_id $start of $end.\n" );
197
198				for ( $id = $start; $id <= $end; $id++ ) {
199					if ( !( $id % self::REPORTING_INTERVAL ) ) {
200						$this->output( "$id\n" );
201						$lbFactory->waitForReplication();
202					}
203					self::fixLinksFromArticle( $id, $this->namespace );
204				}
205			}
206		}
207	}
208
209	/**
210	 * Update the redirect entry for a given page.
211	 *
212	 * This methods bypasses the "redirect" table to get the redirect target,
213	 * and parses the page's content to fetch it. This allows to be sure that
214	 * the redirect target is up to date and valid.
215	 * This is particularly useful when modifying namespaces to be sure the
216	 * entry in the "redirect" table points to the correct page and not to an
217	 * invalid one.
218	 *
219	 * @param int $id The page ID to check
220	 */
221	private function fixRedirect( $id ) {
222		$page = MediaWikiServices::getInstance()->getWikiPageFactory()->newFromID( $id );
223		$dbw = $this->getDB( DB_PRIMARY );
224
225		if ( $page === null ) {
226			// This page doesn't exist (any more)
227			// Delete any redirect table entry for it
228			$dbw->delete( 'redirect', [ 'rd_from' => $id ],
229				__METHOD__ );
230
231			return;
232		} elseif ( $this->namespace !== false
233			&& !$page->getTitle()->inNamespace( $this->namespace )
234		) {
235			return;
236		}
237
238		$rt = null;
239		$content = $page->getContent( RevisionRecord::RAW );
240		if ( $content !== null ) {
241			$rt = $content->getUltimateRedirectTarget();
242		}
243
244		if ( $rt === null ) {
245			// The page is not a redirect
246			// Delete any redirect table entry for it
247			$dbw->delete( 'redirect', [ 'rd_from' => $id ], __METHOD__ );
248			$fieldValue = 0;
249		} else {
250			$page->insertRedirectEntry( $rt );
251			$fieldValue = 1;
252		}
253
254		// Update the page table to be sure it is an a consistent state
255		$dbw->update( 'page', [ 'page_is_redirect' => $fieldValue ],
256			[ 'page_id' => $id ], __METHOD__ );
257	}
258
259	/**
260	 * Run LinksUpdate for all links on a given page_id
261	 * @param int $id The page_id
262	 * @param int|bool $ns Only fix links if it is in this namespace
263	 */
264	public static function fixLinksFromArticle( $id, $ns = false ) {
265		$services = MediaWikiServices::getInstance();
266		$page = $services->getWikiPageFactory()->newFromID( $id );
267
268		$services->getLinkCache()->clear();
269
270		if ( $page === null ) {
271			return;
272		} elseif ( $ns !== false
273			&& !$page->getTitle()->inNamespace( $ns ) ) {
274			return;
275		}
276
277		// Defer updates to post-send but then immediately execute deferred updates;
278		// this is the simplest way to run all updates immediately (including updates
279		// scheduled by other updates).
280		$page->doSecondaryDataUpdates( [
281			'defer' => DeferredUpdates::POSTSEND,
282			'recursive' => false,
283		] );
284		DeferredUpdates::doUpdates();
285	}
286
287	/**
288	 * Removes non-existing links from pages from pagelinks, imagelinks,
289	 * categorylinks, templatelinks, externallinks, interwikilinks, langlinks and redirect tables.
290	 *
291	 * @param int|null $start Page_id to start from
292	 * @param int|null $end Page_id to stop at
293	 * @param int $batchSize The size of deletion batches
294	 * @param int $chunkSize Maximum number of existent IDs to check per query
295	 *
296	 * @author Merlijn van Deen <valhallasw@arctus.nl>
297	 */
298	private function deleteLinksFromNonexistent( $start = null, $end = null, $batchSize = 100,
299		$chunkSize = 100000
300	) {
301		MediaWikiServices::getInstance()->getDBLoadBalancerFactory()->waitForReplication();
302		$this->output( "Deleting illegal entries from the links tables...\n" );
303		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
304		do {
305			// Find the start of the next chunk. This is based only
306			// on existent page_ids.
307			$nextStart = $dbr->selectField(
308				'page',
309				'page_id',
310				[ self::intervalCond( $dbr, 'page_id', $start, $end ) ]
311				+ $this->namespaceCond(),
312				__METHOD__,
313				[ 'ORDER BY' => 'page_id', 'OFFSET' => $chunkSize ]
314			);
315
316			if ( $nextStart !== false ) {
317				// To find the end of the current chunk, subtract one.
318				// This will serve to limit the number of rows scanned in
319				// dfnCheckInterval(), per query, to at most the sum of
320				// the chunk size and deletion batch size.
321				$chunkEnd = $nextStart - 1;
322			} else {
323				// This is the last chunk. Check all page_ids up to $end.
324				$chunkEnd = $end;
325			}
326
327			$fmtStart = $start !== null ? "[$start" : '(-INF';
328			$fmtChunkEnd = $chunkEnd !== null ? "$chunkEnd]" : 'INF)';
329			$this->output( "  Checking interval $fmtStart, $fmtChunkEnd\n" );
330			$this->dfnCheckInterval( $start, $chunkEnd, $batchSize );
331
332			$start = $nextStart;
333
334		} while ( $nextStart !== false );
335	}
336
337	/**
338	 * @see RefreshLinks::deleteLinksFromNonexistent()
339	 * @param int|null $start Page_id to start from
340	 * @param int|null $end Page_id to stop at
341	 * @param int $batchSize The size of deletion batches
342	 */
343	private function dfnCheckInterval( $start = null, $end = null, $batchSize = 100 ) {
344		$dbw = $this->getDB( DB_PRIMARY );
345		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
346
347		$linksTables = [
348			// table name => page_id field
349			'pagelinks' => 'pl_from',
350			'imagelinks' => 'il_from',
351			'categorylinks' => 'cl_from',
352			'templatelinks' => 'tl_from',
353			'externallinks' => 'el_from',
354			'iwlinks' => 'iwl_from',
355			'langlinks' => 'll_from',
356			'redirect' => 'rd_from',
357			'page_props' => 'pp_page',
358		];
359
360		$lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory();
361		foreach ( $linksTables as $table => $field ) {
362			$this->output( "    $table: 0" );
363			$tableStart = $start;
364			$counter = 0;
365			do {
366				$ids = $dbr->selectFieldValues(
367					$table,
368					$field,
369					[
370						self::intervalCond( $dbr, $field, $tableStart, $end ),
371						"$field NOT IN ({$dbr->selectSQLText( 'page', 'page_id', [], __METHOD__ )})",
372					],
373					__METHOD__,
374					[ 'DISTINCT', 'ORDER BY' => $field, 'LIMIT' => $batchSize ]
375				);
376
377				$numIds = count( $ids );
378				if ( $numIds ) {
379					$counter += $numIds;
380					$dbw->delete( $table, [ $field => $ids ], __METHOD__ );
381					$this->output( ", $counter" );
382					$tableStart = $ids[$numIds - 1] + 1;
383					$lbFactory->waitForReplication();
384				}
385
386			} while ( $numIds >= $batchSize && ( $end === null || $tableStart <= $end ) );
387
388			$this->output( " deleted.\n" );
389		}
390	}
391
392	/**
393	 * Build a SQL expression for a closed interval (i.e. BETWEEN).
394	 *
395	 * By specifying a null $start or $end, it is also possible to create
396	 * half-bounded or unbounded intervals using this function.
397	 *
398	 * @param IDatabase $db
399	 * @param string $var Field name
400	 * @param mixed $start First value to include or null
401	 * @param mixed $end Last value to include or null
402	 * @return string
403	 */
404	private static function intervalCond( IDatabase $db, $var, $start, $end ) {
405		if ( $start === null && $end === null ) {
406			return "$var IS NOT NULL";
407		} elseif ( $end === null ) {
408			return "$var >= {$db->addQuotes( $start )}";
409		} elseif ( $start === null ) {
410			return "$var <= {$db->addQuotes( $end )}";
411		} else {
412			return "$var BETWEEN {$db->addQuotes( $start )} AND {$db->addQuotes( $end )}";
413		}
414	}
415
416	/**
417	 * Refershes links for pages in a tracking category
418	 *
419	 * @param string $category Category key
420	 */
421	private function refreshTrackingCategory( $category ) {
422		$cats = $this->getPossibleCategories( $category );
423
424		if ( !$cats ) {
425			$this->error( "Tracking category '$category' is disabled\n" );
426			// Output to stderr but don't bail out,
427		}
428
429		foreach ( $cats as $cat ) {
430			$this->refreshCategory( $cat );
431		}
432	}
433
434	/**
435	 * Refreshes links to a category
436	 *
437	 * @param Title $category
438	 */
439	private function refreshCategory( Title $category ) {
440		$this->output( "Refreshing pages in category '{$category->getText()}'...\n" );
441
442		$dbr = $this->getDB( DB_REPLICA );
443		$conds = [
444			'page_id=cl_from',
445			'cl_to' => $category->getDBkey(),
446		];
447		if ( $this->namespace !== false ) {
448			$conds['page_namespace'] = $this->namespace;
449		}
450
451		$i = 0;
452		$timestamp = '';
453		$lastId = 0;
454		$lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory();
455		do {
456			$finalConds = $conds;
457			$timestamp = $dbr->addQuotes( $timestamp );
458			$finalConds[] =
459				"(cl_timestamp > $timestamp OR (cl_timestamp = $timestamp AND cl_from > $lastId))";
460			$res = $dbr->select( [ 'page', 'categorylinks' ],
461				[ 'page_id', 'cl_timestamp' ],
462				$finalConds,
463				__METHOD__,
464				[
465					'ORDER BY' => [ 'cl_timestamp', 'cl_from' ],
466					'LIMIT' => $this->getBatchSize(),
467				]
468			);
469
470			foreach ( $res as $row ) {
471				if ( !( ++$i % self::REPORTING_INTERVAL ) ) {
472					$this->output( "$i\n" );
473					$lbFactory->waitForReplication();
474				}
475				$lastId = $row->page_id;
476				$timestamp = $row->cl_timestamp;
477				self::fixLinksFromArticle( $row->page_id );
478			}
479
480		} while ( $res->numRows() == $this->getBatchSize() );
481	}
482
483	/**
484	 * Returns a list of possible categories for a given tracking category key
485	 *
486	 * @param string $categoryKey
487	 * @return Title[]
488	 */
489	private function getPossibleCategories( $categoryKey ) {
490		$trackingCategories = new TrackingCategories( $this->getConfig() );
491		$cats = $trackingCategories->getTrackingCategories();
492		if ( isset( $cats[$categoryKey] ) ) {
493			return $cats[$categoryKey]['cats'];
494		}
495		$this->fatalError( "Unknown tracking category {$categoryKey}\n" );
496	}
497}
498
// Name the maintenance class defined in this file, then pull in the file named by
// RUN_MAINTENANCE_IF_MAIN. NOTE(review): presumably that file reads $maintClass
// to instantiate and run the script — confirm against Maintenance.php.
$maintClass = RefreshLinks::class;
require_once RUN_MAINTENANCE_IF_MAIN;
501