1<?php
2/**
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License as published by
5 * the Free Software Foundation; either version 2 of the License, or
6 * (at your option) any later version.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License along
14 * with this program; if not, write to the Free Software Foundation, Inc.,
15 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
16 * http://www.gnu.org/copyleft/gpl.html
17 *
18 */
19use Wikimedia\Purtle\RdfWriter;
20use Wikimedia\Purtle\TurtleRdfWriter;
21use Wikimedia\Rdbms\IDatabase;
22
23require_once __DIR__ . '/Maintenance.php';
24
25/**
26 * Maintenance script to provide RDF representation of the recent changes in category tree.
27 *
28 * @ingroup Maintenance
29 * @since 1.30
30 */
31class CategoryChangesAsRdf extends Maintenance {
	/**
	 * Template for a SPARQL "INSERT DATA" statement.
	 * The single %s placeholder receives the RDF text to insert
	 * (filled in by getInsertRdf()).
	 */
	private const SPARQL_INSERT = <<<SPARQL
INSERT DATA {
%s
};

SPARQL;

	/**
	 * Template for a SPARQL statement deleting every triple whose subject
	 * is one of the listed categories.
	 * The single %s placeholder receives the space-separated list of
	 * category URIs (filled in by getCategoriesUpdate()).
	 */
	private const SPARQL_DELETE = <<<SPARQLD
DELETE {
?category ?x ?y
} WHERE {
   ?category ?x ?y
   VALUES ?category {
     %s
   }
};

SPARQLD;

	/**
	 * Template for a combined SPARQL DELETE/INSERT statement.
	 * Takes two %s placeholders: first the text for the INSERT clause,
	 * then the list of values for ?category.
	 * NOTE(review): no method in this class references this constant.
	 */
	private const SPARQL_DELETE_INSERT = <<<SPARQLDI
DELETE {
?category ?x ?y
} INSERT {
%s
} WHERE {
  ?category ?x ?y
   VALUES ?category {
     %s
   }
};

SPARQLDI;
73
	/**
	 * Writer accumulating the RDF output; created in initialize()
	 * and drained by getRdf().
	 * @var RdfWriter
	 */
	private $rdfWriter;
	/**
	 * Categories RDF helper.
	 * @var CategoriesRdf
	 */
	private $categoriesRdf;

	/** Start of the processed interval (inclusive); set in execute() from the 'start' option. */
	private $startTS;
	/** End of the processed interval (exclusive); set in execute() from the 'end' option. */
	private $endTS;

	/**
	 * Page IDs that were already handled,
	 * so we don't try to process same thing twice.
	 * Keys are page IDs, values are always true.
	 * @var bool[]
	 */
	protected $processed = [];
93
94	public function __construct() {
95		parent::__construct();
96
97		$this->addDescription( "Generate RDF dump of category changes in a wiki." );
98
99		$this->setBatchSize( 200 );
100		$this->addOption( 'output', "Output file (default is stdout). Will be overwritten.", false,
101			true, 'o' );
102		$this->addOption( 'start', 'Starting timestamp (inclusive), in ISO or Mediawiki format.',
103			true, true, 's' );
104		$this->addOption( 'end', 'Ending timestamp (exclusive), in ISO or Mediawiki format.', true,
105			true, 'e' );
106	}
107
108	/**
109	 * Initialize external service classes.
110	 */
111	public function initialize() {
112		// SPARQL Update syntax is close to Turtle format, so we can use Turtle writer.
113		$this->rdfWriter = new TurtleRdfWriter();
114		$this->categoriesRdf = new CategoriesRdf( $this->rdfWriter );
115	}
116
117	public function execute() {
118		$this->initialize();
119		$startTS = new MWTimestamp( $this->getOption( "start" ) );
120
121		$endTS = new MWTimestamp( $this->getOption( "end" ) );
122		$now = new MWTimestamp();
123		$rcMaxAge = $this->getConfig()->get( 'RCMaxAge' );
124
125		if ( $now->getTimestamp() - $startTS->getTimestamp() > $rcMaxAge ) {
126			$this->error( "Start timestamp too old, maximum RC age is $rcMaxAge!" );
127		}
128		if ( $now->getTimestamp() - $endTS->getTimestamp() > $rcMaxAge ) {
129			$this->error( "End timestamp too old, maximum RC age is $rcMaxAge!" );
130		}
131
132		$this->startTS = $startTS->getTimestamp();
133		$this->endTS = $endTS->getTimestamp();
134
135		$outFile = $this->getOption( 'output', 'php://stdout' );
136		if ( $outFile === '-' ) {
137			$outFile = 'php://stdout';
138		}
139
140		$output = fopen( $outFile, 'wb' );
141
142		$this->categoriesRdf->setupPrefixes();
143		$this->rdfWriter->start();
144
145		$prefixes = $this->getRdf();
146		// We have to strip @ from prefix, since SPARQL UPDATE doesn't use them
147		// Also strip dot at the end.
148		$prefixes = preg_replace( [ '/^@/m', '/\s*[.]$/m' ], '', $prefixes );
149		fwrite( $output, $prefixes );
150
151		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
152
153		// Deletes go first because if the page was deleted, other changes
154		// do not matter. This only gets true deletes, i.e. not pages that were restored.
155		$this->handleDeletes( $dbr, $output );
156		// Moves go before additions because if category is moved, we should not process creation
157		// as it would produce wrong data - because create row has old title
158		$this->handleMoves( $dbr, $output );
159		// We need to handle restores too since delete may have happened in previous update.
160		$this->handleRestores( $dbr, $output );
161		// Process newly added pages
162		$this->handleAdds( $dbr, $output );
163		// Process page edits
164		$this->handleEdits( $dbr, $output );
165		// Process categorization changes
166		$this->handleCategorization( $dbr, $output );
167
168		// Update timestamp
169		fwrite( $output, $this->updateTS( $this->endTS ) );
170	}
171
172	/**
173	 * Get the text of SPARQL INSERT DATA clause
174	 * @return string
175	 */
176	private function getInsertRdf() {
177		$rdfText = $this->getRdf();
178		if ( !$rdfText ) {
179			return "";
180		}
181		return sprintf( self::SPARQL_INSERT, $rdfText );
182	}
183
184	/**
185	 * Get SPARQL for updating set of categories
186	 * @param IDatabase $dbr
187	 * @param string[] $deleteUrls List of URIs to be deleted, with <>
188	 * @param string[] $pages List of categories: id => title
189	 * @param string $mark Marks which operation requests the query
190	 * @return string SPARQL query
191	 */
192	private function getCategoriesUpdate( IDatabase $dbr, $deleteUrls, $pages, $mark ) {
193		if ( empty( $deleteUrls ) ) {
194			return "";
195		}
196
197		if ( !empty( $pages ) ) {
198			$this->writeParentCategories( $dbr, $pages );
199		}
200
201		return "# $mark\n" . sprintf( self::SPARQL_DELETE, implode( ' ', $deleteUrls ) ) .
202			$this->getInsertRdf();
203	}
204
205	/**
206	 * Write parent data for a set of categories.
207	 * The list has the child categories.
208	 * @param IDatabase $dbr
209	 * @param string[] $pages List of child categories: id => title
210	 */
211	private function writeParentCategories( IDatabase $dbr, $pages ) {
212		foreach ( $this->getCategoryLinksIterator( $dbr, array_keys( $pages ) ) as $row ) {
213			$this->categoriesRdf->writeCategoryLinkData( $pages[$row->cl_from], $row->cl_to );
214		}
215	}
216
217	/**
218	 * Generate SPARQL Update code for updating dump timestamp
219	 * @param string|int $timestamp Timestamp for last change
220	 * @return string SPARQL Update query for timestamp.
221	 */
222	public function updateTS( $timestamp ) {
223		$dumpUrl = '<' . $this->categoriesRdf->getDumpURI() . '>';
224		$ts = wfTimestamp( TS_ISO_8601, $timestamp );
225		$tsQuery = <<<SPARQL
226DELETE {
227  $dumpUrl schema:dateModified ?o .
228}
229WHERE {
230  $dumpUrl schema:dateModified ?o .
231};
232INSERT DATA {
233  $dumpUrl schema:dateModified "$ts"^^xsd:dateTime .
234}
235
236SPARQL;
237		return $tsQuery;
238	}
239
240	/**
241	 * Set up standard iterator for retrieving category changes.
242	 * @param IDatabase $dbr
243	 * @param string[] $columns List of additional fields to get
244	 * @param string[] $extra_tables List of additional tables to join
245	 * @return BatchRowIterator
246	 */
247	private function setupChangesIterator(
248		IDatabase $dbr,
249		array $columns = [],
250		array $extra_tables = []
251	) {
252		$tables = [ 'recentchanges', 'page_props', 'category' ];
253		if ( $extra_tables ) {
254			$tables = array_merge( $tables, $extra_tables );
255		}
256		$it = new BatchRowIterator( $dbr,
257			$tables,
258			[ 'rc_timestamp' ],
259			$this->mBatchSize
260		);
261		$this->addTimestampConditions( $it, $dbr );
262		$it->addJoinConditions(
263			[
264				'page_props' => [
265					'LEFT JOIN', [ 'pp_propname' => 'hiddencat', 'pp_page = rc_cur_id' ]
266				],
267				'category' => [
268					'LEFT JOIN', [ 'cat_title = rc_title' ]
269				]
270			]
271		);
272		$it->setFetchColumns( array_merge( $columns, [
273			'rc_title',
274			'rc_cur_id',
275			'pp_propname',
276			'cat_pages',
277			'cat_subcats',
278			'cat_files'
279		] ) );
280		return $it;
281	}
282
283	/**
284	 * Fetch newly created categories
285	 * @param IDatabase $dbr
286	 * @return BatchRowIterator
287	 */
288	protected function getNewCatsIterator( IDatabase $dbr ) {
289		$it = $this->setupChangesIterator( $dbr );
290		$it->addConditions( [
291			'rc_namespace' => NS_CATEGORY,
292			'rc_new' => 1,
293		] );
294		return $it;
295	}
296
297	/**
298	 * Fetch moved categories
299	 * @param IDatabase $dbr
300	 * @return BatchRowIterator
301	 */
302	protected function getMovedCatsIterator( IDatabase $dbr ) {
303		$it = $this->setupChangesIterator( $dbr, [ 'page_title', 'page_namespace' ], [ 'page' ] );
304		$it->addConditions( [
305			'rc_namespace' => NS_CATEGORY,
306			'rc_new' => 0,
307			'rc_log_type' => 'move',
308			'rc_type' => RC_LOG,
309		] );
310		$it->addJoinConditions( [
311			'page' => [ 'JOIN', 'rc_cur_id = page_id' ],
312		] );
313		$this->addIndex( $it );
314		return $it;
315	}
316
317	/**
318	 * Fetch deleted categories
319	 * @param IDatabase $dbr
320	 * @return BatchRowIterator
321	 */
322	protected function getDeletedCatsIterator( IDatabase $dbr ) {
323		$it = new BatchRowIterator( $dbr,
324			'recentchanges',
325			[ 'rc_timestamp' ],
326			$this->mBatchSize
327		);
328		$this->addTimestampConditions( $it, $dbr );
329		$it->addConditions( [
330			'rc_namespace' => NS_CATEGORY,
331			'rc_new' => 0,
332			'rc_log_type' => 'delete',
333			'rc_log_action' => 'delete',
334			'rc_type' => RC_LOG,
335			// We will fetch ones that do not have page record. If they do,
336			// this means they were restored, thus restoring handler will pick it up.
337			'NOT EXISTS (SELECT * FROM page WHERE page_id = rc_cur_id)',
338		] );
339		$this->addIndex( $it );
340		$it->setFetchColumns( [ 'rc_cur_id', 'rc_title' ] );
341		return $it;
342	}
343
344	/**
345	 * Fetch restored categories
346	 * @param IDatabase $dbr
347	 * @return BatchRowIterator
348	 */
349	protected function getRestoredCatsIterator( IDatabase $dbr ) {
350		$it = $this->setupChangesIterator( $dbr );
351		$it->addConditions( [
352			'rc_namespace' => NS_CATEGORY,
353			'rc_new' => 0,
354			'rc_log_type' => 'delete',
355			'rc_log_action' => 'restore',
356			'rc_type' => RC_LOG,
357			// We will only fetch ones that have page record
358			'EXISTS (SELECT page_id FROM page WHERE page_id = rc_cur_id)',
359		] );
360		$this->addIndex( $it );
361		return $it;
362	}
363
364	/**
365	 * Fetch categorization changes or edits
366	 * @param IDatabase $dbr
367	 * @param int $type
368	 * @return BatchRowIterator
369	 */
370	protected function getChangedCatsIterator( IDatabase $dbr, $type ) {
371		$it =
372			$this->setupChangesIterator( $dbr );
373		$it->addConditions( [
374			'rc_namespace' => NS_CATEGORY,
375			'rc_new' => 0,
376			'rc_type' => $type,
377		] );
378		$this->addIndex( $it );
379		return $it;
380	}
381
382	/**
383	 * Add timestamp limits to iterator
384	 * @param BatchRowIterator $it Iterator
385	 * @param IDatabase $dbr
386	 */
387	private function addTimestampConditions( BatchRowIterator $it, IDatabase $dbr ) {
388		$it->addConditions( [
389			'rc_timestamp >= ' . $dbr->addQuotes( $dbr->timestamp( $this->startTS ) ),
390			'rc_timestamp < ' . $dbr->addQuotes( $dbr->timestamp( $this->endTS ) ),
391		] );
392	}
393
394	/**
395	 * Need to force index, somehow on terbium the optimizer chooses wrong one
396	 * @param BatchRowIterator $it
397	 */
398	private function addIndex( BatchRowIterator $it ) {
399		$it->addOptions( [
400			'USE INDEX' => [ 'recentchanges' => 'new_name_timestamp' ]
401		] );
402	}
403
404	/**
405	 * Get iterator for links for categories.
406	 * @param IDatabase $dbr
407	 * @param int[] $ids List of page IDs
408	 * @return Traversable
409	 */
410	protected function getCategoryLinksIterator( IDatabase $dbr, array $ids ) {
411		$it = new BatchRowIterator(
412			$dbr,
413			'categorylinks',
414			[ 'cl_from', 'cl_to' ],
415			$this->mBatchSize
416		);
417		$it->addConditions( [
418			'cl_type' => 'subcat',
419			'cl_from' => $ids
420		] );
421		$it->setFetchColumns( [ 'cl_from', 'cl_to' ] );
422		return new RecursiveIteratorIterator( $it );
423	}
424
425	/**
426	 * Get accumulated RDF.
427	 * @return string
428	 */
429	public function getRdf() {
430		return $this->rdfWriter->drain();
431	}
432
433	/**
434	 * Handle category deletes.
435	 * @param IDatabase $dbr
436	 * @param resource $output File to write the output
437	 */
438	public function handleDeletes( IDatabase $dbr, $output ) {
439		// This only does "true" deletes - i.e. those that the page stays deleted
440		foreach ( $this->getDeletedCatsIterator( $dbr ) as $batch ) {
441			$deleteUrls = [];
442			foreach ( $batch as $row ) {
443				// This can produce duplicates, we don't care
444				$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
445				$this->processed[$row->rc_cur_id] = true;
446			}
447			fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, [], "Deletes" ) );
448		}
449	}
450
451	/**
452	 * Write category data to RDF.
453	 * @param stdclass $row Database row
454	 */
455	private function writeCategoryData( $row ) {
456		$this->categoriesRdf->writeCategoryData(
457			$row->rc_title,
458			$row->pp_propname === 'hiddencat',
459			(int)$row->cat_pages - (int)$row->cat_subcats - (int)$row->cat_files,
460			(int)$row->cat_subcats
461		);
462	}
463
464	/**
465	 * @param IDatabase $dbr
466	 * @param resource $output
467	 */
468	public function handleMoves( IDatabase $dbr, $output ) {
469		foreach ( $this->getMovedCatsIterator( $dbr ) as $batch ) {
470			$pages = [];
471			$deleteUrls = [];
472			foreach ( $batch as $row ) {
473				$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
474
475				if ( isset( $this->processed[$row->rc_cur_id] ) ) {
476					// We already captured this one before
477					continue;
478				}
479
480				if ( $row->page_namespace != NS_CATEGORY ) {
481					// If page was moved out of Category:, we'll just delete
482					continue;
483				}
484				$row->rc_title = $row->page_title;
485				$this->writeCategoryData( $row );
486				$pages[$row->rc_cur_id] = $row->page_title;
487				$this->processed[$row->rc_cur_id] = true;
488			}
489
490			fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Moves" ) );
491		}
492	}
493
494	/**
495	 * @param IDatabase $dbr
496	 * @param resource $output
497	 */
498	public function handleRestores( IDatabase $dbr, $output ) {
499		fwrite( $output, "# Restores\n" );
500		// This will only find those restores that were not deleted later.
501		foreach ( $this->getRestoredCatsIterator( $dbr ) as $batch ) {
502			$pages = [];
503			foreach ( $batch as $row ) {
504				if ( isset( $this->processed[$row->rc_cur_id] ) ) {
505					// We already captured this one before
506					continue;
507				}
508				$this->writeCategoryData( $row );
509				$pages[$row->rc_cur_id] = $row->rc_title;
510				$this->processed[$row->rc_cur_id] = true;
511			}
512
513			if ( empty( $pages ) ) {
514				continue;
515			}
516
517			$this->writeParentCategories( $dbr, $pages );
518
519			fwrite( $output, $this->getInsertRdf() );
520		}
521	}
522
523	/**
524	 * @param IDatabase $dbr
525	 * @param resource $output
526	 */
527	public function handleAdds( IDatabase $dbr, $output ) {
528		fwrite( $output, "# Additions\n" );
529		foreach ( $this->getNewCatsIterator( $dbr ) as $batch ) {
530			$pages = [];
531			foreach ( $batch as $row ) {
532				if ( isset( $this->processed[$row->rc_cur_id] ) ) {
533					// We already captured this one before
534					continue;
535				}
536				$this->writeCategoryData( $row );
537				$pages[$row->rc_cur_id] = $row->rc_title;
538				$this->processed[$row->rc_cur_id] = true;
539			}
540
541			if ( empty( $pages ) ) {
542				continue;
543			}
544
545			$this->writeParentCategories( $dbr, $pages );
546			fwrite( $output, $this->getInsertRdf() );
547		}
548	}
549
550	/**
551	 * Handle edits for category texts
552	 * @param IDatabase $dbr
553	 * @param resource $output
554	 */
555	public function handleEdits( IDatabase $dbr, $output ) {
556		// Editing category can change hidden flag and add new parents.
557		// TODO: it's pretty expensive to update all edited categories, and most edits
558		// aren't actually interesting for us. Some way to know which are interesting?
559		// We can capture recategorization on the next step, but not change in hidden status.
560		foreach ( $this->getChangedCatsIterator( $dbr, RC_EDIT ) as $batch ) {
561			$pages = [];
562			$deleteUrls = [];
563			foreach ( $batch as $row ) {
564				// Note that on categorization event, cur_id points to
565				// the child page, not the parent category!
566				if ( isset( $this->processed[$row->rc_cur_id] ) ) {
567					// We already captured this one before
568					continue;
569				}
570				$this->writeCategoryData( $row );
571				$pages[$row->rc_cur_id] = $row->rc_title;
572				$this->processed[$row->rc_cur_id] = true;
573				$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
574			}
575
576			fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Edits" ) );
577		}
578	}
579
580	/**
581	 * Handles categorization changes
582	 * @param IDatabase $dbr
583	 * @param resource $output
584	 */
585	public function handleCategorization( IDatabase $dbr, $output ) {
586		$processedTitle = [];
587		// Categorization change can add new parents and change counts
588		// for the parent category.
589		foreach ( $this->getChangedCatsIterator( $dbr, RC_CATEGORIZE ) as $batch ) {
590			/*
591			 * Note that on categorization event, cur_id points to
592			 * the child page, not the parent category!
593			 * So we need to have a two-stage process, since we have ID from one
594			 * category and title from another, and we need both for proper updates.
595			 * TODO: For now, we do full update even though some data hasn't changed,
596			 * e.g. parents for parent cat and counts for child cat.
597			 */
598			$childPages = [];
599			$parentCats = [];
600			foreach ( $batch as $row ) {
601				$childPages[$row->rc_cur_id] = true;
602				$parentCats[$row->rc_title] = true;
603			}
604
605			$joinConditions = [
606				'page_props' => [
607					'LEFT JOIN',
608					[ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ],
609				],
610				'category' => [
611					'LEFT JOIN',
612					[ 'cat_title = page_title' ],
613				],
614			];
615
616			$pages = [];
617			$deleteUrls = [];
618
619			if ( $childPages ) {
620				// Load child rows by ID
621				$childRows = $dbr->select(
622					[ 'page', 'page_props', 'category' ],
623					[
624						'page_id',
625						'rc_title' => 'page_title',
626						'pp_propname',
627						'cat_pages',
628						'cat_subcats',
629						'cat_files',
630					],
631					[ 'page_namespace' => NS_CATEGORY, 'page_id' => array_keys( $childPages ) ],
632					__METHOD__,
633					[],
634					$joinConditions
635				);
636				foreach ( $childRows as $row ) {
637					if ( isset( $this->processed[$row->page_id] ) ) {
638						// We already captured this one before
639						continue;
640					}
641					$this->writeCategoryData( $row );
642					$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
643					$this->processed[$row->page_id] = true;
644				}
645			}
646
647			if ( $parentCats ) {
648				// Load parent rows by title
649				$joinConditions = [
650					'page' => [
651						'LEFT JOIN',
652						[ 'page_title = cat_title', 'page_namespace' => NS_CATEGORY ],
653					],
654					'page_props' => [
655						'LEFT JOIN',
656						[ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ],
657					],
658				];
659
660				$parentRows = $dbr->select(
661					[ 'category', 'page', 'page_props' ],
662					[
663						'page_id',
664						'rc_title' => 'cat_title',
665						'pp_propname',
666						'cat_pages',
667						'cat_subcats',
668						'cat_files',
669					],
670					[ 'cat_title' => array_map( 'strval', array_keys( $parentCats ) ) ],
671					__METHOD__,
672					[],
673					$joinConditions
674				);
675				foreach ( $parentRows as $row ) {
676					if ( $row->page_id && isset( $this->processed[$row->page_id] ) ) {
677						// We already captured this one before
678						continue;
679					}
680					if ( isset( $processedTitle[$row->rc_title] ) ) {
681						// We already captured this one before
682						continue;
683					}
684					$this->writeCategoryData( $row );
685					$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
686					if ( $row->page_id ) {
687						$this->processed[$row->page_id] = true;
688					}
689					$processedTitle[$row->rc_title] = true;
690				}
691			}
692
693			fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Changes" ) );
694		}
695	}
696}
697
// Name the class implementing this script, then include the file referenced by
// RUN_MAINTENANCE_IF_MAIN. NOTE(review): that constant is defined outside this
// file; presumably it runs $maintClass when the script is invoked directly.
$maintClass = CategoryChangesAsRdf::class;
require_once RUN_MAINTENANCE_IF_MAIN;
700