1<?php
2/**
3 * Check for articles to fix after adding/deleting namespaces
4 *
5 * Copyright © 2005-2007 Brion Vibber <brion@pobox.com>
6 * https://www.mediawiki.org/
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License along
19 * with this program; if not, write to the Free Software Foundation, Inc.,
20 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21 * http://www.gnu.org/copyleft/gpl.html
22 *
23 * @file
24 * @ingroup Maintenance
25 */
26
27require_once __DIR__ . '/Maintenance.php';
28
29use MediaWiki\Linker\LinkTarget;
30use MediaWiki\MediaWikiServices;
31use Wikimedia\Rdbms\IDatabase;
32use Wikimedia\Rdbms\IMaintainableDatabase;
33use Wikimedia\Rdbms\IResultWrapper;
34
35/**
36 * Maintenance script that checks for articles to fix after
37 * adding/deleting namespaces.
38 *
39 * @ingroup Maintenance
40 */
41class NamespaceDupes extends Maintenance {
42
43	/**
44	 * @var IMaintainableDatabase
45	 */
46	protected $db;
47
48	/**
	 * Total number of pages that need fixing that are automatically resolvable
50	 * @var int
51	 */
52	private $resolvablePages = 0;
53
54	/**
55	 * Total number of pages that need fixing
56	 * @var int
57	 */
58	private $totalPages = 0;
59
60	/**
	 * Total number of links that need fixing that are automatically resolvable
62	 * @var int
63	 */
64	private $resolvableLinks = 0;
65
66	/**
67	 * Total number of erroneous links
68	 * @var int
69	 */
70	private $totalLinks = 0;
71
72	/**
	 * Total number of links deleted because they weren't automatically resolvable due to the
	 * target already existing
75	 * @var int
76	 */
77	private $deletedLinks = 0;
78
79	public function __construct() {
80		parent::__construct();
81		$this->addDescription( 'Find and fix pages affected by namespace addition/removal' );
82		$this->addOption( 'fix', 'Attempt to automatically fix errors and delete broken links' );
83		$this->addOption( 'merge', "Instead of renaming conflicts, do a history merge with " .
84			"the correct title" );
85		$this->addOption( 'add-suffix', "Dupes will be renamed with correct namespace with " .
86			"<text> appended after the article name", false, true );
87		$this->addOption( 'add-prefix', "Dupes will be renamed with correct namespace with " .
88			"<text> prepended before the article name", false, true );
89		$this->addOption( 'source-pseudo-namespace', "Move all pages with the given source " .
90			"prefix (with an implied colon following it). If --dest-namespace is not specified, " .
91			"the colon will be replaced with a hyphen.",
92			false, true );
93		$this->addOption( 'dest-namespace', "In combination with --source-pseudo-namespace, " .
94			"specify the namespace ID of the destination.", false, true );
95		$this->addOption( 'move-talk', "If this is specified, pages in the Talk namespace that " .
96			"begin with a conflicting prefix will be renamed, for example " .
97			"Talk:File:Foo -> File_Talk:Foo" );
98	}
99
100	public function execute() {
101		$options = [
102			'fix' => $this->hasOption( 'fix' ),
103			'merge' => $this->hasOption( 'merge' ),
104			'add-suffix' => $this->getOption( 'add-suffix', '' ),
105			'add-prefix' => $this->getOption( 'add-prefix', '' ),
106			'move-talk' => $this->hasOption( 'move-talk' ),
107			'source-pseudo-namespace' => $this->getOption( 'source-pseudo-namespace', '' ),
108			'dest-namespace' => intval( $this->getOption( 'dest-namespace', 0 ) )
109		];
110
111		if ( $options['source-pseudo-namespace'] !== '' ) {
112			$retval = $this->checkPrefix( $options );
113		} else {
114			$retval = $this->checkAll( $options );
115		}
116
117		if ( $retval ) {
118			$this->output( "\nLooks good!\n" );
119		} else {
120			$this->output( "\nOh noeees\n" );
121		}
122	}
123
124	/**
125	 * Check all namespaces
126	 *
127	 * @param array $options Associative array of validated command-line options
128	 *
129	 * @return bool
130	 */
131	private function checkAll( $options ) {
132		$contLang = MediaWikiServices::getInstance()->getContentLanguage();
133		$spaces = [];
134
135		// List interwikis first, so they'll be overridden
136		// by any conflicting local namespaces.
137		foreach ( $this->getInterwikiList() as $prefix ) {
138			$name = $contLang->ucfirst( $prefix );
139			$spaces[$name] = 0;
140		}
141
142		// Now pull in all canonical and alias namespaces...
143		foreach (
144			MediaWikiServices::getInstance()->getNamespaceInfo()->getCanonicalNamespaces()
145			as $ns => $name
146		) {
147			// This includes $wgExtraNamespaces
148			if ( $name !== '' ) {
149				$spaces[$name] = $ns;
150			}
151		}
152		foreach ( $contLang->getNamespaces() as $ns => $name ) {
153			if ( $name !== '' ) {
154				$spaces[$name] = $ns;
155			}
156		}
157		foreach ( $contLang->getNamespaceAliases() as $name => $ns ) {
158			$spaces[$name] = $ns;
159		}
160
161		// We'll need to check for lowercase keys as well,
162		// since we're doing case-sensitive searches in the db.
163		$capitalLinks = $this->getConfig()->get( 'CapitalLinks' );
164		foreach ( $spaces as $name => $ns ) {
165			$moreNames = [];
166			$moreNames[] = $contLang->uc( $name );
167			$moreNames[] = $contLang->ucfirst( $contLang->lc( $name ) );
168			$moreNames[] = $contLang->ucwords( $name );
169			$moreNames[] = $contLang->ucwords( $contLang->lc( $name ) );
170			$moreNames[] = $contLang->ucwordbreaks( $name );
171			$moreNames[] = $contLang->ucwordbreaks( $contLang->lc( $name ) );
172			if ( !$capitalLinks ) {
173				foreach ( $moreNames as $altName ) {
174					$moreNames[] = $contLang->lcfirst( $altName );
175				}
176				$moreNames[] = $contLang->lcfirst( $name );
177			}
178			foreach ( array_unique( $moreNames ) as $altName ) {
179				if ( $altName !== $name ) {
180					$spaces[$altName] = $ns;
181				}
182			}
183		}
184
185		// Sort by namespace index, and if there are two with the same index,
186		// break the tie by sorting by name
187		$origSpaces = $spaces;
188		uksort( $spaces, static function ( $a, $b ) use ( $origSpaces ) {
189			return $origSpaces[$a] <=> $origSpaces[$b]
190				?: $a <=> $b;
191		} );
192
193		$ok = true;
194		foreach ( $spaces as $name => $ns ) {
195			$ok = $this->checkNamespace( $ns, $name, $options ) && $ok;
196		}
197
198		$this->output(
199			"{$this->totalPages} pages to fix, " .
200			"{$this->resolvablePages} were resolvable.\n\n"
201		);
202
203		foreach ( $spaces as $name => $ns ) {
204			if ( $ns != 0 ) {
205				/* Fix up link destinations for non-interwiki links only.
206				 *
207				 * For example if a page has [[Foo:Bar]] and then a Foo namespace
208				 * is introduced, pagelinks needs to be updated to have
209				 * page_namespace = NS_FOO.
210				 *
211				 * If instead an interwiki prefix was introduced called "Foo",
212				 * the link should instead be moved to the iwlinks table. If a new
213				 * language is introduced called "Foo", or if there is a pagelink
214				 * [[fr:Bar]] when interlanguage magic links are turned on, the
215				 * link would have to be moved to the langlinks table. Let's put
216				 * those cases in the too-hard basket for now. The consequences are
217				 * not especially severe.
218				 * @fixme Handle interwiki links, and pagelinks to Category:, File:
219				 * which probably need reparsing.
220				 */
221
222				$this->checkLinkTable( 'pagelinks', 'pl', $ns, $name, $options );
223				$this->checkLinkTable( 'templatelinks', 'tl', $ns, $name, $options );
224
225				// The redirect table has interwiki links randomly mixed in, we
226				// need to filter those out. For example [[w:Foo:Bar]] would
227				// have rd_interwiki=w and rd_namespace=0, which would match the
228				// query for a conflicting namespace "Foo" if filtering wasn't done.
229				$this->checkLinkTable( 'redirect', 'rd', $ns, $name, $options,
230					[ 'rd_interwiki' => null ] );
231				$this->checkLinkTable( 'redirect', 'rd', $ns, $name, $options,
232					[ 'rd_interwiki' => '' ] );
233			}
234		}
235
236		$this->output(
237			"{$this->totalLinks} links to fix, " .
238			"{$this->resolvableLinks} were resolvable, " .
239			"{$this->deletedLinks} were deleted.\n"
240		);
241
242		return $ok;
243	}
244
245	/**
246	 * @return string[]
247	 */
248	private function getInterwikiList() {
249		$result = MediaWikiServices::getInstance()->getInterwikiLookup()->getAllPrefixes();
250		return array_column( $result, 'iw_prefix' );
251	}
252
253	/**
254	 * Check a given prefix and try to move it into the given destination namespace
255	 *
256	 * @param int $ns Destination namespace id
257	 * @param string $name
258	 * @param array $options Associative array of validated command-line options
259	 * @return bool
260	 */
261	private function checkNamespace( $ns, $name, $options ) {
262		$targets = $this->getTargetList( $ns, $name, $options );
263		$count = $targets->numRows();
264		$this->totalPages += $count;
265		if ( $count == 0 ) {
266			return true;
267		}
268
269		$dryRunNote = $options['fix'] ? '' : ' DRY RUN ONLY';
270
271		$ok = true;
272		foreach ( $targets as $row ) {
273			// Find the new title and determine the action to take
274
275			$newTitle = $this->getDestinationTitle(
276				$ns, $name, $row->page_namespace, $row->page_title );
277			$logStatus = false;
278			if ( !$newTitle ) {
279				$logStatus = 'invalid title';
280				$action = 'abort';
281			} elseif ( $newTitle->exists() ) {
282				if ( $options['merge'] ) {
283					if ( $this->canMerge( $row->page_id, $newTitle, $logStatus ) ) {
284						$action = 'merge';
285					} else {
286						$action = 'abort';
287					}
288				} elseif ( $options['add-prefix'] == '' && $options['add-suffix'] == '' ) {
289					$action = 'abort';
290					$logStatus = 'dest title exists and --add-prefix not specified';
291				} else {
292					$newTitle = $this->getAlternateTitle( $newTitle, $options );
293					if ( !$newTitle ) {
294						$action = 'abort';
295						$logStatus = 'alternate title is invalid';
296					} elseif ( $newTitle->exists() ) {
297						$action = 'abort';
298						$logStatus = 'title conflict';
299					} else {
300						$action = 'move';
301						$logStatus = 'alternate';
302					}
303				}
304			} else {
305				$action = 'move';
306				$logStatus = 'no conflict';
307			}
308
309			// Take the action or log a dry run message
310
311			$logTitle = "id={$row->page_id} ns={$row->page_namespace} dbk={$row->page_title}";
312			$pageOK = true;
313
314			switch ( $action ) {
315				case 'abort':
316					$this->output( "$logTitle *** $logStatus\n" );
317					$pageOK = false;
318					break;
319				case 'move':
320					$this->output( "$logTitle -> " .
321						$newTitle->getPrefixedDBkey() . " ($logStatus)$dryRunNote\n" );
322
323					if ( $options['fix'] ) {
324						$pageOK = $this->movePage( $row->page_id, $newTitle );
325					}
326					break;
327				case 'merge':
328					$this->output( "$logTitle => " .
329						$newTitle->getPrefixedDBkey() . " (merge)$dryRunNote\n" );
330
331					if ( $options['fix'] ) {
332						$pageOK = $this->mergePage( $row, $newTitle );
333					}
334					break;
335			}
336
337			if ( $pageOK ) {
338				$this->resolvablePages++;
339			} else {
340				$ok = false;
341			}
342		}
343
344		return $ok;
345	}
346
347	/**
348	 * Check and repair the destination fields in a link table
349	 * @param string $table The link table name
350	 * @param string $fieldPrefix The field prefix in the link table
351	 * @param int $ns Destination namespace id
352	 * @param string $name
353	 * @param array $options Associative array of validated command-line options
354	 * @param array $extraConds Extra conditions for the SQL query
355	 */
356	private function checkLinkTable( $table, $fieldPrefix, $ns, $name, $options,
357		$extraConds = []
358	) {
359		$dbw = $this->getDB( DB_PRIMARY );
360
361		$batchConds = [];
362		$fromField = "{$fieldPrefix}_from";
363		$namespaceField = "{$fieldPrefix}_namespace";
364		$titleField = "{$fieldPrefix}_title";
365		$batchSize = 500;
366		$lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory();
367		while ( true ) {
368			$res = $dbw->select(
369				$table,
370				[ $fromField, $namespaceField, $titleField ],
371				array_merge(
372					$batchConds,
373					$extraConds,
374					[
375						$namespaceField => 0,
376						$titleField . $dbw->buildLike( "$name:", $dbw->anyString() )
377					]
378				),
379				__METHOD__,
380				[
381					'ORDER BY' => [ $titleField, $fromField ],
382					'LIMIT' => $batchSize
383				]
384			);
385
386			if ( $res->numRows() == 0 ) {
387				break;
388			}
389
390			$rowsToDeleteIfStillExists = [];
391
392			foreach ( $res as $row ) {
393				$logTitle = "from={$row->$fromField} ns={$row->$namespaceField} " .
394					"dbk={$row->$titleField}";
395				$destTitle = $this->getDestinationTitle(
396					$ns, $name, $row->$namespaceField, $row->$titleField );
397				$this->totalLinks++;
398				if ( !$destTitle ) {
399					$this->output( "$table $logTitle *** INVALID\n" );
400					continue;
401				}
402				$this->resolvableLinks++;
403				if ( !$options['fix'] ) {
404					$this->output( "$table $logTitle -> " .
405						$destTitle->getPrefixedDBkey() . " DRY RUN\n" );
406					continue;
407				}
408
409				$dbw->update( $table,
410					// SET
411					[
412						$namespaceField => $destTitle->getNamespace(),
413						$titleField => $destTitle->getDBkey()
414					],
415					// WHERE
416					[
417						$namespaceField => 0,
418						$titleField => $row->$titleField,
419						$fromField => $row->$fromField
420					],
421					__METHOD__,
422					[ 'IGNORE' ]
423				);
424
425				$rowsToDeleteIfStillExists[] = $dbw->makeList(
426					[
427						$fromField => $row->$fromField,
428						$namespaceField => $row->$namespaceField,
429						$titleField => $row->$titleField,
430					],
431					IDatabase::LIST_AND
432				);
433
434				$this->output( "$table $logTitle -> " .
435					$destTitle->getPrefixedDBkey() . "\n"
436				);
437			}
438
439			if ( $options['fix'] && count( $rowsToDeleteIfStillExists ) > 0 ) {
440				$dbw->delete(
441					$table,
442					$dbw->makeList( $rowsToDeleteIfStillExists, IDatabase::LIST_OR ),
443					__METHOD__
444				);
445
446				$this->deletedLinks += $dbw->affectedRows();
447				$this->resolvableLinks -= $dbw->affectedRows();
448			}
449
450			$encLastTitle = $dbw->addQuotes( $row->$titleField );
451			$encLastFrom = $dbw->addQuotes( $row->$fromField );
452
453			$batchConds = [
454				"$titleField > $encLastTitle " .
455				"OR ($titleField = $encLastTitle AND $fromField > $encLastFrom)"
456			];
457
458			$lbFactory->waitForReplication();
459		}
460	}
461
462	/**
463	 * Move the given pseudo-namespace, either replacing the colon with a hyphen
464	 * (useful for pseudo-namespaces that conflict with interwiki links) or move
465	 * them to another namespace if specified.
466	 * @param array $options Associative array of validated command-line options
467	 * @return bool
468	 */
469	private function checkPrefix( $options ) {
470		$prefix = $options['source-pseudo-namespace'];
471		$ns = $options['dest-namespace'];
472		$this->output( "Checking prefix \"$prefix\" vs namespace $ns\n" );
473
474		return $this->checkNamespace( $ns, $prefix, $options );
475	}
476
477	/**
478	 * Find pages in main and talk namespaces that have a prefix of the new
479	 * namespace so we know titles that will need migrating
480	 *
481	 * @param int $ns Destination namespace id
482	 * @param string $name Prefix that is being made a namespace
483	 * @param array $options Associative array of validated command-line options
484	 *
485	 * @return IResultWrapper
486	 */
487	private function getTargetList( $ns, $name, $options ) {
488		$dbw = $this->getDB( DB_PRIMARY );
489
490		if (
491			$options['move-talk'] &&
492			MediaWikiServices::getInstance()->getNamespaceInfo()->isSubject( $ns )
493		) {
494			$checkNamespaces = [ NS_MAIN, NS_TALK ];
495		} else {
496			$checkNamespaces = NS_MAIN;
497		}
498
499		return $dbw->select( 'page',
500			[
501				'page_id',
502				'page_title',
503				'page_namespace',
504			],
505			[
506				'page_namespace' => $checkNamespaces,
507				'page_title' . $dbw->buildLike( "$name:", $dbw->anyString() ),
508			],
509			__METHOD__
510		);
511	}
512
513	/**
514	 * Get the preferred destination title for a given target page.
515	 * @param int $ns The destination namespace ID
516	 * @param string $name The conflicting prefix
517	 * @param int $sourceNs The source namespace
518	 * @param int $sourceDbk The source DB key (i.e. page_title)
519	 * @return Title|false
520	 */
521	private function getDestinationTitle( $ns, $name, $sourceNs, $sourceDbk ) {
522		$dbk = substr( $sourceDbk, strlen( "$name:" ) );
523		if ( $ns == 0 ) {
524			// An interwiki; try an alternate encoding with '-' for ':'
525			$dbk = "$name-" . $dbk;
526		}
527		$destNS = $ns;
528		$nsInfo = MediaWikiServices::getInstance()->getNamespaceInfo();
529		if ( $sourceNs == NS_TALK && $nsInfo->isSubject( $ns ) ) {
530			// This is an associated talk page moved with the --move-talk feature.
531			$destNS = $nsInfo->getTalk( $destNS );
532		}
533		$newTitle = Title::makeTitleSafe( $destNS, $dbk );
534		if ( !$newTitle || !$newTitle->canExist() ) {
535			return false;
536		}
537		return $newTitle;
538	}
539
540	/**
541	 * Get an alternative title to move a page to. This is used if the
542	 * preferred destination title already exists.
543	 *
544	 * @param LinkTarget $linkTarget
545	 * @param array $options Associative array of validated command-line options
546	 * @return Title|bool
547	 */
548	private function getAlternateTitle( LinkTarget $linkTarget, $options ) {
549		$prefix = $options['add-prefix'];
550		$suffix = $options['add-suffix'];
551		if ( $prefix == '' && $suffix == '' ) {
552			return false;
553		}
554		while ( true ) {
555			$dbk = $prefix . $linkTarget->getDBkey() . $suffix;
556			$title = Title::makeTitleSafe( $linkTarget->getNamespace(), $dbk );
557			if ( !$title ) {
558				return false;
559			}
560			if ( !$title->exists() ) {
561				return $title;
562			}
563		}
564	}
565
566	/**
567	 * Move a page
568	 *
569	 * @param int $id The page_id
570	 * @param LinkTarget $newLinkTarget The new title link target
571	 * @return bool
572	 */
573	private function movePage( $id, LinkTarget $newLinkTarget ) {
574		$dbw = $this->getDB( DB_PRIMARY );
575
576		$dbw->update( 'page',
577			[
578				"page_namespace" => $newLinkTarget->getNamespace(),
579				"page_title" => $newLinkTarget->getDBkey(),
580			],
581			[
582				"page_id" => $id,
583			],
584			__METHOD__
585		);
586
587		// Update *_from_namespace in links tables
588		$fromNamespaceTables = [
589			[ 'pagelinks', 'pl' ],
590			[ 'templatelinks', 'tl' ],
591			[ 'imagelinks', 'il' ]
592		];
593		foreach ( $fromNamespaceTables as [ $table, $fieldPrefix ] ) {
594			$dbw->update( $table,
595				// SET
596				[ "{$fieldPrefix}_from_namespace" => $newLinkTarget->getNamespace() ],
597				// WHERE
598				[ "{$fieldPrefix}_from" => $id ],
599				__METHOD__
600			);
601		}
602
603		return true;
604	}
605
606	/**
607	 * Determine if we can merge a page.
608	 * We check if an inaccessible revision would become the latest and
609	 * deny the merge if so -- it's theoretically possible to update the
610	 * latest revision, but opens a can of worms -- search engine updates,
611	 * recentchanges review, etc.
612	 *
613	 * @param int $id The page_id
614	 * @param LinkTarget $linkTarget The new link target
615	 * @param string &$logStatus This is set to the log status message on failure
616	 * @return bool
617	 */
618	private function canMerge( $id, LinkTarget $linkTarget, &$logStatus ) {
619		$revisionLookup = MediaWikiServices::getInstance()->getRevisionLookup();
620		$latestDest = $revisionLookup->getRevisionByTitle( $linkTarget, 0,
621			IDBAccessObject::READ_LATEST );
622		$latestSource = $revisionLookup->getRevisionByPageId( $id, 0,
623			IDBAccessObject::READ_LATEST );
624		if ( $latestSource->getTimestamp() > $latestDest->getTimestamp() ) {
625			$logStatus = 'cannot merge since source is later';
626			return false;
627		} else {
628			return true;
629		}
630	}
631
632	/**
633	 * Merge page histories
634	 *
635	 * @param stdClass $row Page row
636	 * @param Title $newTitle The new title
637	 * @return bool
638	 */
639	private function mergePage( $row, Title $newTitle ) {
640		$dbw = $this->getDB( DB_PRIMARY );
641
642		$id = $row->page_id;
643
644		// Construct the WikiPage object we will need later, while the
645		// page_id still exists. Note that this cannot use makeTitleSafe(),
646		// we are deliberately constructing an invalid title.
647		$sourceTitle = Title::makeTitle( $row->page_namespace, $row->page_title );
648		$sourceTitle->resetArticleID( $id );
649		$wikiPage = new WikiPage( $sourceTitle );
650		$wikiPage->loadPageData( 'fromdbmaster' );
651
652		$destId = $newTitle->getArticleID();
653		$this->beginTransaction( $dbw, __METHOD__ );
654		$dbw->update( 'revision',
655			// SET
656			[ 'rev_page' => $destId ],
657			// WHERE
658			[ 'rev_page' => $id ],
659			__METHOD__
660		);
661
662		$dbw->delete( 'page', [ 'page_id' => $id ], __METHOD__ );
663
664		$this->commitTransaction( $dbw, __METHOD__ );
665
666		/* Call LinksDeletionUpdate to delete outgoing links from the old title,
667		 * and update category counts.
668		 *
669		 * Calling external code with a fake broken Title is a fairly dubious
670		 * idea. It's necessary because it's quite a lot of code to duplicate,
671		 * but that also makes it fragile since it would be easy for someone to
672		 * accidentally introduce an assumption of title validity to the code we
673		 * are calling.
674		 */
675		DeferredUpdates::addUpdate( new LinksDeletionUpdate( $wikiPage ) );
676		DeferredUpdates::doUpdates();
677
678		return true;
679	}
680}
681
682$maintClass = NamespaceDupes::class;
683require_once RUN_MAINTENANCE_IF_MAIN;
684