<?php
/**
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 */
use Wikimedia\Purtle\RdfWriter;
use Wikimedia\Purtle\TurtleRdfWriter;
use Wikimedia\Rdbms\IDatabase;

require_once __DIR__ . '/Maintenance.php';

/**
 * Maintenance script to provide RDF representation of the recent changes in category tree.
 *
 * The output is a sequence of SPARQL Update statements (DELETE / INSERT DATA) built from
 * the recentchanges rows that fall into the [start, end) timestamp range.
 *
 * @ingroup Maintenance
 * @since 1.30
 */
class CategoryChangesAsRdf extends Maintenance {
	/**
	 * Insert query
	 */
	private const SPARQL_INSERT = <<<SPARQL
INSERT DATA {
%s
};

SPARQL;

	/**
	 * Delete query
	 */
	private const SPARQL_DELETE = <<<SPARQLD
DELETE {
?category ?x ?y
} WHERE {
   ?category ?x ?y
   VALUES ?category {
     %s
   }
};

SPARQLD;

	/**
	 * Delete/Insert query
	 */
	private const SPARQL_DELETE_INSERT = <<<SPARQLDI
DELETE {
?category ?x ?y
} INSERT {
%s
} WHERE {
  ?category ?x ?y
   VALUES ?category {
     %s
   }
};

SPARQLDI;

	/**
	 * @var RdfWriter
	 */
	private $rdfWriter;
	/**
	 * Categories RDF helper.
	 * @var CategoriesRdf
	 */
	private $categoriesRdf;

	/**
	 * Start of the processed range (inclusive), as returned by
	 * MWTimestamp::getTimestamp(). Set in execute().
	 */
	private $startTS;
	/**
	 * End of the processed range (exclusive), as returned by
	 * MWTimestamp::getTimestamp(). Set in execute().
	 */
	private $endTS;

	/**
	 * Set of processed page IDs (page ID => true),
	 * so we don't try to process same thing twice
	 * @var bool[]
	 */
	protected $processed = [];

	public function __construct() {
		parent::__construct();

		$this->addDescription( "Generate RDF dump of category changes in a wiki." );

		$this->setBatchSize( 200 );
		$this->addOption( 'output', "Output file (default is stdout). Will be overwritten.", false,
			true, 'o' );
		$this->addOption( 'start', 'Starting timestamp (inclusive), in ISO or Mediawiki format.',
			true, true, 's' );
		$this->addOption( 'end', 'Ending timestamp (exclusive), in ISO or Mediawiki format.', true,
			true, 'e' );
	}

	/**
	 * Initialize external service classes.
	 */
	public function initialize() {
		// SPARQL Update syntax is close to Turtle format, so we can use Turtle writer.
		$this->rdfWriter = new TurtleRdfWriter();
		$this->categoriesRdf = new CategoriesRdf( $this->rdfWriter );
	}

	/**
	 * Script entry point: writes the prefix header, then the updates for deletes,
	 * moves, restores, additions, edits and categorization changes (in that order),
	 * and finally the dump timestamp update.
	 */
	public function execute() {
		$this->initialize();
		$startTS = new MWTimestamp( $this->getOption( "start" ) );

		$endTS = new MWTimestamp( $this->getOption( "end" ) );
		$now = new MWTimestamp();
		$rcMaxAge = $this->getConfig()->get( 'RCMaxAge' );

		// These checks only report the problem; processing continues regardless.
		if ( $now->getTimestamp() - $startTS->getTimestamp() > $rcMaxAge ) {
			$this->error( "Start timestamp too old, maximum RC age is $rcMaxAge!" );
		}
		if ( $now->getTimestamp() - $endTS->getTimestamp() > $rcMaxAge ) {
			$this->error( "End timestamp too old, maximum RC age is $rcMaxAge!" );
		}

		$this->startTS = $startTS->getTimestamp();
		$this->endTS = $endTS->getTimestamp();

		$outFile = $this->getOption( 'output', 'php://stdout' );
		if ( $outFile === '-' ) {
			$outFile = 'php://stdout';
		}

		$output = fopen( $outFile, 'wb' );
		if ( $output === false ) {
			// Without a valid handle every fwrite() below would fail,
			// so stop here with a clear message instead.
			$this->fatalError( "Could not open output file '$outFile' for writing." );
		}

		$this->categoriesRdf->setupPrefixes();
		$this->rdfWriter->start();

		$prefixes = $this->getRdf();
		// We have to strip @ from prefix, since SPARQL UPDATE doesn't use them
		// Also strip dot at the end.
		$prefixes = preg_replace( [ '/^@/m', '/\s*[.]$/m' ], '', $prefixes );
		fwrite( $output, $prefixes );

		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

		// Deletes go first because if the page was deleted, other changes
		// do not matter. This only gets true deletes, i.e. not pages that were restored.
		$this->handleDeletes( $dbr, $output );
		// Moves go before additions because if category is moved, we should not process creation
		// as it would produce wrong data - because create row has old title
		$this->handleMoves( $dbr, $output );
		// We need to handle restores too since delete may have happened in previous update.
		$this->handleRestores( $dbr, $output );
		// Process newly added pages
		$this->handleAdds( $dbr, $output );
		// Process page edits
		$this->handleEdits( $dbr, $output );
		// Process categorization changes
		$this->handleCategorization( $dbr, $output );

		// Update timestamp
		fwrite( $output, $this->updateTS( $this->endTS ) );

		fclose( $output );
	}

	/**
	 * Get the text of SPARQL INSERT DATA clause
	 * @return string Empty string if the RDF writer had nothing accumulated.
	 */
	private function getInsertRdf() {
		$rdfText = $this->getRdf();
		if ( !$rdfText ) {
			return "";
		}
		return sprintf( self::SPARQL_INSERT, $rdfText );
	}

	/**
	 * Get SPARQL for updating set of categories
	 * @param IDatabase $dbr
	 * @param string[] $deleteUrls List of URIs to be deleted, with <>
	 * @param string[] $pages List of categories: id => title
	 * @param string $mark Marks which operation requests the query
	 * @return string SPARQL query, or empty string if there is nothing to delete
	 */
	private function getCategoriesUpdate( IDatabase $dbr, $deleteUrls, $pages, $mark ) {
		if ( empty( $deleteUrls ) ) {
			return "";
		}

		if ( !empty( $pages ) ) {
			$this->writeParentCategories( $dbr, $pages );
		}

		return "# $mark\n" . sprintf( self::SPARQL_DELETE, implode( ' ', $deleteUrls ) ) .
			$this->getInsertRdf();
	}

	/**
	 * Write parent data for a set of categories.
	 * The list has the child categories.
	 * @param IDatabase $dbr
	 * @param string[] $pages List of child categories: id => title
	 */
	private function writeParentCategories( IDatabase $dbr, $pages ) {
		foreach ( $this->getCategoryLinksIterator( $dbr, array_keys( $pages ) ) as $row ) {
			$this->categoriesRdf->writeCategoryLinkData( $pages[$row->cl_from], $row->cl_to );
		}
	}

	/**
	 * Generate SPARQL Update code for updating dump timestamp
	 * @param string|int $timestamp Timestamp for last change
	 * @return string SPARQL Update query for timestamp.
	 */
	public function updateTS( $timestamp ) {
		$dumpUrl = '<' . $this->categoriesRdf->getDumpURI() . '>';
		$ts = wfTimestamp( TS_ISO_8601, $timestamp );
		$tsQuery = <<<SPARQL
DELETE {
  $dumpUrl schema:dateModified ?o .
}
WHERE {
  $dumpUrl schema:dateModified ?o .
};
INSERT DATA {
  $dumpUrl schema:dateModified "$ts"^^xsd:dateTime .
}

SPARQL;
		return $tsQuery;
	}

	/**
	 * Set up standard iterator for retrieving category changes.
	 * @param IDatabase $dbr
	 * @param string[] $columns List of additional fields to get
	 * @param string[] $extra_tables List of additional tables to join
	 * @return BatchRowIterator
	 */
	private function setupChangesIterator(
		IDatabase $dbr,
		array $columns = [],
		array $extra_tables = []
	) {
		$tables = [ 'recentchanges', 'page_props', 'category' ];
		if ( $extra_tables ) {
			$tables = array_merge( $tables, $extra_tables );
		}
		$it = new BatchRowIterator( $dbr,
			$tables,
			[ 'rc_timestamp' ],
			$this->mBatchSize
		);
		$this->addTimestampConditions( $it, $dbr );
		$it->addJoinConditions(
			[
				'page_props' => [
					'LEFT JOIN', [ 'pp_propname' => 'hiddencat', 'pp_page = rc_cur_id' ]
				],
				'category' => [
					'LEFT JOIN', [ 'cat_title = rc_title' ]
				]
			]
		);
		$it->setFetchColumns( array_merge( $columns, [
			'rc_title',
			'rc_cur_id',
			'pp_propname',
			'cat_pages',
			'cat_subcats',
			'cat_files'
		] ) );
		return $it;
	}

	/**
	 * Fetch newly created categories
	 * @param IDatabase $dbr
	 * @return BatchRowIterator
	 */
	protected function getNewCatsIterator( IDatabase $dbr ) {
		$it = $this->setupChangesIterator( $dbr );
		$it->addConditions( [
			'rc_namespace' => NS_CATEGORY,
			'rc_new' => 1,
		] );
		return $it;
	}

	/**
	 * Fetch moved categories
	 * @param IDatabase $dbr
	 * @return BatchRowIterator
	 */
	protected function getMovedCatsIterator( IDatabase $dbr ) {
		$it = $this->setupChangesIterator( $dbr, [ 'page_title', 'page_namespace' ], [ 'page' ] );
		$it->addConditions( [
			'rc_namespace' => NS_CATEGORY,
			'rc_new' => 0,
			'rc_log_type' => 'move',
			'rc_type' => RC_LOG,
		] );
		$it->addJoinConditions( [
			'page' => [ 'JOIN', 'rc_cur_id = page_id' ],
		] );
		$this->addIndex( $it );
		return $it;
	}

	/**
	 * Fetch deleted categories
	 * @param IDatabase $dbr
	 * @return BatchRowIterator
	 */
	protected function getDeletedCatsIterator( IDatabase $dbr ) {
		$it = new BatchRowIterator( $dbr,
			'recentchanges',
			[ 'rc_timestamp' ],
			$this->mBatchSize
		);
		$this->addTimestampConditions( $it, $dbr );
		$it->addConditions( [
			'rc_namespace' => NS_CATEGORY,
			'rc_new' => 0,
			'rc_log_type' => 'delete',
			'rc_log_action' => 'delete',
			'rc_type' => RC_LOG,
			// We will fetch ones that do not have page record. If they do,
			// this means they were restored, thus restoring handler will pick it up.
			'NOT EXISTS (SELECT * FROM page WHERE page_id = rc_cur_id)',
		] );
		$this->addIndex( $it );
		$it->setFetchColumns( [ 'rc_cur_id', 'rc_title' ] );
		return $it;
	}

	/**
	 * Fetch restored categories
	 * @param IDatabase $dbr
	 * @return BatchRowIterator
	 */
	protected function getRestoredCatsIterator( IDatabase $dbr ) {
		$it = $this->setupChangesIterator( $dbr );
		$it->addConditions( [
			'rc_namespace' => NS_CATEGORY,
			'rc_new' => 0,
			'rc_log_type' => 'delete',
			'rc_log_action' => 'restore',
			'rc_type' => RC_LOG,
			// We will only fetch ones that have page record
			'EXISTS (SELECT page_id FROM page WHERE page_id = rc_cur_id)',
		] );
		$this->addIndex( $it );
		return $it;
	}

	/**
	 * Fetch categorization changes or edits
	 * @param IDatabase $dbr
	 * @param int $type Value matched against rc_type (callers pass RC_EDIT or RC_CATEGORIZE)
	 * @return BatchRowIterator
	 */
	protected function getChangedCatsIterator( IDatabase $dbr, $type ) {
		$it = $this->setupChangesIterator( $dbr );
		$it->addConditions( [
			'rc_namespace' => NS_CATEGORY,
			'rc_new' => 0,
			'rc_type' => $type,
		] );
		$this->addIndex( $it );
		return $it;
	}

	/**
	 * Add timestamp limits to iterator
	 * @param BatchRowIterator $it Iterator
	 * @param IDatabase $dbr
	 */
	private function addTimestampConditions( BatchRowIterator $it, IDatabase $dbr ) {
		$it->addConditions( [
			'rc_timestamp >= ' . $dbr->addQuotes( $dbr->timestamp( $this->startTS ) ),
			'rc_timestamp < ' . $dbr->addQuotes( $dbr->timestamp( $this->endTS ) ),
		] );
	}

	/**
	 * Need to force index, somehow on terbium the optimizer chooses wrong one
	 * @param BatchRowIterator $it
	 */
	private function addIndex( BatchRowIterator $it ) {
		$it->addOptions( [
			'USE INDEX' => [ 'recentchanges' => 'new_name_timestamp' ]
		] );
	}

	/**
	 * Get iterator for links for categories.
	 * @param IDatabase $dbr
	 * @param int[] $ids List of page IDs
	 * @return Traversable
	 */
	protected function getCategoryLinksIterator( IDatabase $dbr, array $ids ) {
		$it = new BatchRowIterator(
			$dbr,
			'categorylinks',
			[ 'cl_from', 'cl_to' ],
			$this->mBatchSize
		);
		$it->addConditions( [
			'cl_type' => 'subcat',
			'cl_from' => $ids
		] );
		$it->setFetchColumns( [ 'cl_from', 'cl_to' ] );
		// Wrapped so that callers iterate over rows rather than over batches.
		return new RecursiveIteratorIterator( $it );
	}

	/**
	 * Get accumulated RDF.
	 * @return string
	 */
	public function getRdf() {
		return $this->rdfWriter->drain();
	}

	/**
	 * Handle category deletes.
	 * @param IDatabase $dbr
	 * @param resource $output File to write the output
	 */
	public function handleDeletes( IDatabase $dbr, $output ) {
		// This only does "true" deletes - i.e. those that the page stays deleted
		foreach ( $this->getDeletedCatsIterator( $dbr ) as $batch ) {
			$deleteUrls = [];
			foreach ( $batch as $row ) {
				// This can produce duplicates, we don't care
				$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
				$this->processed[$row->rc_cur_id] = true;
			}
			fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, [], "Deletes" ) );
		}
	}

	/**
	 * Write category data to RDF.
	 *
	 * The page count passed on is cat_pages minus cat_subcats minus cat_files.
	 *
	 * @param stdClass $row Database row with rc_title, pp_propname, cat_pages,
	 *  cat_subcats and cat_files fields
	 */
	private function writeCategoryData( $row ) {
		$this->categoriesRdf->writeCategoryData(
			$row->rc_title,
			$row->pp_propname === 'hiddencat',
			(int)$row->cat_pages - (int)$row->cat_subcats - (int)$row->cat_files,
			(int)$row->cat_subcats
		);
	}

	/**
	 * Handle category moves: the old title is always deleted, and data for the
	 * current title is written if the page is still in the Category namespace.
	 * @param IDatabase $dbr
	 * @param resource $output
	 */
	public function handleMoves( IDatabase $dbr, $output ) {
		foreach ( $this->getMovedCatsIterator( $dbr ) as $batch ) {
			$pages = [];
			$deleteUrls = [];
			foreach ( $batch as $row ) {
				// rc_title is the title recorded in the move log row; it is deleted
				// regardless of what happens to the page below.
				$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';

				if ( isset( $this->processed[$row->rc_cur_id] ) ) {
					// We already captured this one before
					continue;
				}

				if ( $row->page_namespace != NS_CATEGORY ) {
					// If page was moved out of Category:, we'll just delete
					continue;
				}
				// Use the current page title for the data we write
				$row->rc_title = $row->page_title;
				$this->writeCategoryData( $row );
				$pages[$row->rc_cur_id] = $row->page_title;
				$this->processed[$row->rc_cur_id] = true;
			}

			fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Moves" ) );
		}
	}

	/**
	 * Handle restored categories: writes category data and parent links as inserts.
	 * @param IDatabase $dbr
	 * @param resource $output
	 */
	public function handleRestores( IDatabase $dbr, $output ) {
		fwrite( $output, "# Restores\n" );
		// This will only find those restores that were not deleted later.
		foreach ( $this->getRestoredCatsIterator( $dbr ) as $batch ) {
			$pages = [];
			foreach ( $batch as $row ) {
				if ( isset( $this->processed[$row->rc_cur_id] ) ) {
					// We already captured this one before
					continue;
				}
				$this->writeCategoryData( $row );
				$pages[$row->rc_cur_id] = $row->rc_title;
				$this->processed[$row->rc_cur_id] = true;
			}

			if ( empty( $pages ) ) {
				continue;
			}

			$this->writeParentCategories( $dbr, $pages );

			fwrite( $output, $this->getInsertRdf() );
		}
	}

	/**
	 * Handle newly created categories: writes category data and parent links as inserts.
	 * @param IDatabase $dbr
	 * @param resource $output
	 */
	public function handleAdds( IDatabase $dbr, $output ) {
		fwrite( $output, "# Additions\n" );
		foreach ( $this->getNewCatsIterator( $dbr ) as $batch ) {
			$pages = [];
			foreach ( $batch as $row ) {
				if ( isset( $this->processed[$row->rc_cur_id] ) ) {
					// We already captured this one before
					continue;
				}
				$this->writeCategoryData( $row );
				$pages[$row->rc_cur_id] = $row->rc_title;
				$this->processed[$row->rc_cur_id] = true;
			}

			if ( empty( $pages ) ) {
				continue;
			}

			$this->writeParentCategories( $dbr, $pages );
			fwrite( $output, $this->getInsertRdf() );
		}
	}

	/**
	 * Handle edits for category texts
	 * @param IDatabase $dbr
	 * @param resource $output
	 */
	public function handleEdits( IDatabase $dbr, $output ) {
		// Editing category can change hidden flag and add new parents.
		// TODO: it's pretty expensive to update all edited categories, and most edits
		// aren't actually interesting for us. Some way to know which are interesting?
		// We can capture recategorization on the next step, but not change in hidden status.
		foreach ( $this->getChangedCatsIterator( $dbr, RC_EDIT ) as $batch ) {
			$pages = [];
			$deleteUrls = [];
			foreach ( $batch as $row ) {
				// Note that on categorization event, cur_id points to
				// the child page, not the parent category!
				if ( isset( $this->processed[$row->rc_cur_id] ) ) {
					// We already captured this one before
					continue;
				}
				$this->writeCategoryData( $row );
				$pages[$row->rc_cur_id] = $row->rc_title;
				$this->processed[$row->rc_cur_id] = true;
				$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
			}

			fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Edits" ) );
		}
	}

	/**
	 * Handles categorization changes
	 * @param IDatabase $dbr
	 * @param resource $output
	 */
	public function handleCategorization( IDatabase $dbr, $output ) {
		$processedTitle = [];
		// Categorization change can add new parents and change counts
		// for the parent category.
		foreach ( $this->getChangedCatsIterator( $dbr, RC_CATEGORIZE ) as $batch ) {
			/*
			 * Note that on categorization event, cur_id points to
			 * the child page, not the parent category!
			 * So we need to have a two-stage process, since we have ID from one
			 * category and title from another, and we need both for proper updates.
			 * TODO: For now, we do full update even though some data hasn't changed,
			 * e.g. parents for parent cat and counts for child cat.
			 */
			$childPages = [];
			$parentCats = [];
			foreach ( $batch as $row ) {
				$childPages[$row->rc_cur_id] = true;
				$parentCats[$row->rc_title] = true;
			}

			$joinConditions = [
				'page_props' => [
					'LEFT JOIN',
					[ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ],
				],
				'category' => [
					'LEFT JOIN',
					[ 'cat_title = page_title' ],
				],
			];

			// Categories (id => title) whose parent links must be re-written after
			// the delete below, which removes every triple of each listed category.
			$pages = [];
			$deleteUrls = [];

			if ( $childPages ) {
				// Load child rows by ID
				$childRows = $dbr->select(
					[ 'page', 'page_props', 'category' ],
					[
						'page_id',
						'rc_title' => 'page_title',
						'pp_propname',
						'cat_pages',
						'cat_subcats',
						'cat_files',
					],
					[ 'page_namespace' => NS_CATEGORY, 'page_id' => array_keys( $childPages ) ],
					__METHOD__,
					[],
					$joinConditions
				);
				foreach ( $childRows as $row ) {
					if ( isset( $this->processed[$row->page_id] ) ) {
						// We already captured this one before
						continue;
					}
					$this->writeCategoryData( $row );
					$pages[$row->page_id] = $row->rc_title;
					$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
					$this->processed[$row->page_id] = true;
				}
			}

			if ( $parentCats ) {
				// Load parent rows by title
				$joinConditions = [
					'page' => [
						'LEFT JOIN',
						[ 'page_title = cat_title', 'page_namespace' => NS_CATEGORY ],
					],
					'page_props' => [
						'LEFT JOIN',
						[ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ],
					],
				];

				$parentRows = $dbr->select(
					[ 'category', 'page', 'page_props' ],
					[
						'page_id',
						'rc_title' => 'cat_title',
						'pp_propname',
						'cat_pages',
						'cat_subcats',
						'cat_files',
					],
					[ 'cat_title' => array_map( 'strval', array_keys( $parentCats ) ) ],
					__METHOD__,
					[],
					$joinConditions
				);
				foreach ( $parentRows as $row ) {
					if ( $row->page_id && isset( $this->processed[$row->page_id] ) ) {
						// We already captured this one before
						continue;
					}
					if ( isset( $processedTitle[$row->rc_title] ) ) {
						// We already captured this one before
						continue;
					}
					$this->writeCategoryData( $row );
					$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
					// page is LEFT JOINed, so a category row may have no page record;
					// parent links can only be looked up when there is a page ID.
					if ( $row->page_id ) {
						$pages[$row->page_id] = $row->rc_title;
						$this->processed[$row->page_id] = true;
					}
					$processedTitle[$row->rc_title] = true;
				}
			}

			fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Changes" ) );
		}
	}
}

$maintClass = CategoryChangesAsRdf::class;
require_once RUN_MAINTENANCE_IF_MAIN;