<?php
/**
 * Base class for exporting
 *
 * Copyright © 2003, 2005, 2006 Brion Vibber <brion@pobox.com>
 * https://www.mediawiki.org/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 */

/**
 * @defgroup Dump Dump
 */

use MediaWiki\HookContainer\HookRunner;
use MediaWiki\MediaWikiServices;
use MediaWiki\Page\PageIdentity;
use MediaWiki\Revision\RevisionAccessException;
use MediaWiki\Revision\RevisionRecord;
use MediaWiki\Revision\RevisionStore;
use Wikimedia\Rdbms\IDatabase;
use Wikimedia\Rdbms\IResultWrapper;

/**
 * Reads page/revision/slot rows or log rows from the database in batches,
 * renders them with an XmlDumpWriter and hands the rendered output to a
 * DumpOutput (or DumpFilter) sink.
 *
 * @ingroup SpecialPage Dump
 */
class WikiExporter {
	/** @var bool Return distinct author list (when not returning full history) */
	public $list_authors = false;

	/** @var bool Whether upload records are written before each page is closed */
	public $dumpUploads = false;

	/** @var bool Passed through to XmlDumpWriter::writeUploads() */
	public $dumpUploadFileContents = false;

	/** @var string XML fragment built by do_list_authors(), emitted before the final page close */
	public $author_list = "";

	public const FULL = 1;
	public const CURRENT = 2;
	public const STABLE = 4; // extension defined
	public const LOGS = 8;
	public const RANGE = 16;

	public const TEXT = XmlDumpWriter::WRITE_CONTENT;
	public const STUB = XmlDumpWriter::WRITE_STUB;

	/** Row limit applied to each individual database query */
	protected const BATCH_SIZE = 50000;

	/** @var int One of self::TEXT or self::STUB */
	public $text;

	/** @var DumpOutput */
	public $sink;

	/** @var XmlDumpWriter */
	private $writer;

	/** @var IDatabase */
	protected $db;

	/** @var array|int */
	protected $history;

	/** @var array|null */
	protected $limitNamespaces;

	/** @var RevisionStore */
	private $revisionStore;

	/** @var TitleParser */
	private $titleParser;

	/** @var HookRunner */
	private $hookRunner;

	/**
	 * Returns the default export schema version, as defined by $wgXmlDumpSchemaVersion.
	 * @return string
	 */
	public static function schemaVersion() {
		global $wgXmlDumpSchemaVersion;
		return $wgXmlDumpSchemaVersion;
	}

	/**
	 * @param IDatabase $db
	 * @param int|array $history One of WikiExporter::FULL, WikiExporter::CURRENT,
	 *   WikiExporter::RANGE or WikiExporter::STABLE, or an associative array:
	 *   - offset: non-inclusive offset at which to start the query
	 *   - limit: maximum number of rows to return
	 *   - dir: "asc" or "desc" timestamp order
	 * @param int $text One of WikiExporter::TEXT or WikiExporter::STUB
	 * @param null|array $limitNamespaces List of namespace numbers to limit
	 *   results to, or null for no namespace restriction
	 */
	public function __construct(
		$db,
		$history = self::CURRENT,
		$text = self::TEXT,
		$limitNamespaces = null
	) {
		$this->db = $db;
		$this->history = $history;
		$this->writer = new XmlDumpWriter( $text, self::schemaVersion() );
		$this->sink = new DumpOutput();
		$this->text = $text;
		$this->limitNamespaces = $limitNamespaces;
		$services = MediaWikiServices::getInstance();
		$this->hookRunner = new HookRunner( $services->getHookContainer() );
		$this->revisionStore = $services->getRevisionStore();
		$this->titleParser = $services->getTitleParser();
	}

	/**
	 * Replaces the writer with a new XmlDumpWriter for the given schema version,
	 * keeping the text mode chosen at construction time.
	 *
	 * @param string $schemaVersion which schema version the generated XML should comply to.
	 *   One of the XML_DUMP_SCHEMA_VERSION_XX constants. The value is passed straight to
	 *   XmlDumpWriter; presumably it is validated there, since this class declares no
	 *   list of supported schemas of its own.
	 */
	public function setSchemaVersion( $schemaVersion ) {
		$this->writer = new XmlDumpWriter( $this->text, $schemaVersion );
	}

	/**
	 * Set the DumpOutput or DumpFilter object which will receive
	 * various row objects and XML output for filtering. Filters
	 * can be chained or used as callbacks.
	 *
	 * @param DumpOutput|DumpFilter &$sink
	 */
	public function setOutputSink( &$sink ) {
		$this->sink =& $sink;
	}

	/**
	 * Writes the stream opening produced by the writer to the sink.
	 */
	public function openStream() {
		$output = $this->writer->openStream();
		$this->sink->writeOpenStream( $output );
	}

	/**
	 * Writes the stream closing produced by the writer to the sink.
	 */
	public function closeStream() {
		$output = $this->writer->closeStream();
		$this->sink->writeCloseStream( $output );
	}

	/**
	 * Dumps a series of page and revision records for all pages
	 * in the database, either including complete history or only
	 * the most recent version.
	 */
	public function allPages() {
		$this->dumpFrom( '' );
	}

	/**
	 * Dumps a series of page and revision records for those pages
	 * in the database falling within the page_id range given.
	 * @param int $start Inclusive lower limit (this id is included)
	 * @param int $end Exclusive upper limit (this id is not included)
	 *   If 0, no upper limit.
	 * @param bool $orderRevs order revisions within pages in ascending order
	 */
	public function pagesByRange( $start, $end, $orderRevs ) {
		// The range is expressed on rev_page when revision ordering is requested,
		// and on page_id otherwise.
		if ( $orderRevs ) {
			$condition = 'rev_page >= ' . intval( $start );
			if ( $end ) {
				$condition .= ' AND rev_page < ' . intval( $end );
			}
		} else {
			$condition = 'page_id >= ' . intval( $start );
			if ( $end ) {
				$condition .= ' AND page_id < ' . intval( $end );
			}
		}
		$this->dumpFrom( $condition, $orderRevs );
	}

	/**
	 * Dumps a series of page and revision records for those pages
	 * in the database with revisions falling within the rev_id range given.
	 * @param int $start Inclusive lower limit (this id is included)
	 * @param int $end Exclusive upper limit (this id is not included)
	 *   If 0, no upper limit.
	 */
	public function revsByRange( $start, $end ) {
		$condition = 'rev_id >= ' . intval( $start );
		if ( $end ) {
			$condition .= ' AND rev_id < ' . intval( $end );
		}
		$this->dumpFrom( $condition );
	}

	/**
	 * Dumps the page identified by the given namespace and DB key.
	 *
	 * @param PageIdentity $page
	 */
	public function pageByTitle( PageIdentity $page ) {
		$this->dumpFrom(
			'page_namespace=' . $page->getNamespace() .
			' AND page_title=' . $this->db->addQuotes( $page->getDBkey() ) );
	}

	/**
	 * Parses a title string and dumps the page it names.
	 *
	 * @param string $name
	 * @throws MWException If the name cannot be parsed as a title
	 */
	public function pageByName( $name ) {
		try {
			$link = $this->titleParser->parseTitle( $name );
			$this->dumpFrom(
				'page_namespace=' . $link->getNamespace() .
				' AND page_title=' . $this->db->addQuotes( $link->getDBkey() ) );
		} catch ( MalformedTitleException $ex ) {
			throw new MWException( "Can't export invalid title" );
		}
	}

	/**
	 * Dumps each named page in turn; see pageByName().
	 *
	 * @param string[] $names
	 */
	public function pagesByName( $names ) {
		foreach ( $names as $name ) {
			$this->pageByName( $name );
		}
	}

	/**
	 * Runs an unconditioned dump. Log items are only produced when the exporter
	 * was constructed with the self::LOGS history flag; see dumpFrom().
	 */
	public function allLogs() {
		$this->dumpFrom( '' );
	}

	/**
	 * Runs a dump restricted to a log_id range; see dumpFrom() for how the
	 * history setting selects between log and page output.
	 *
	 * @param int $start Inclusive lower limit
	 * @param int $end Exclusive upper limit; if 0, no upper limit
	 */
	public function logsByRange( $start, $end ) {
		$condition = 'log_id >= ' . intval( $start );
		if ( $end ) {
			$condition .= ' AND log_id < ' . intval( $end );
		}
		$this->dumpFrom( $condition );
	}

	/**
	 * Generates the distinct list of authors of an article
	 * Not called by default (depends on $this->list_authors)
	 * Can be set by Special:Export when not exporting whole history
	 *
	 * The result is stored in $this->author_list as a <contributors> fragment.
	 *
	 * @param string $cond
	 */
	protected function do_list_authors( $cond ) {
		$this->author_list = "<contributors>";

		$revQuery = $this->revisionStore->getQueryInfo( [ 'page' ] );
		$res = $this->db->select(
			$revQuery['tables'],
			[
				'rev_user_text' => $revQuery['fields']['rev_user_text'],
				'rev_user' => $revQuery['fields']['rev_user'],
			],
			[
				// Leave out revisions whose user has been hidden via rev_deleted
				$this->db->bitAnd( 'rev_deleted', RevisionRecord::DELETED_USER ) . ' = 0',
				$cond,
			],
			__METHOD__,
			[ 'DISTINCT' ],
			$revQuery['joins']
		);

		foreach ( $res as $row ) {
			$this->author_list .= "<contributor>" .
				"<username>" .
				htmlspecialchars( $row->rev_user_text ) .
				"</username>" .
				"<id>" .
				( (int)$row->rev_user ) .
				"</id>" .
				"</contributor>";
		}
		$this->author_list .= "</contributors>";
	}

	/**
	 * Routes a dump request to dumpLogs() when the history setting has the
	 * self::LOGS bit, and to dumpPages() otherwise.
	 *
	 * @param string $cond
	 * @param bool $orderRevs
	 * @throws MWException
	 * @throws Exception
	 */
	protected function dumpFrom( $cond = '', $orderRevs = false ) {
		// $this->history may be an offset/limit/dir array. Applying a bitwise
		// operator to an array throws a TypeError on PHP 8, so the LOGS bit is
		// only tested for non-array values; array specs always mean a page dump.
		if ( !is_array( $this->history ) && ( $this->history & self::LOGS ) ) {
			$this->dumpLogs( $cond );
		} else {
			$this->dumpPages( $cond, $orderRevs );
		}
	}

	/**
	 * Streams log items in batches of self::BATCH_SIZE, ordered by log_id.
	 *
	 * @param string $cond
	 * @throws Exception
	 */
	protected function dumpLogs( $cond ) {
		$where = [];
		# Hide private logs
		$hideLogs = LogEventsList::getExcludeClause( $this->db );
		if ( $hideLogs ) {
			$where[] = $hideLogs;
		}
		# Add on any caller specified conditions
		if ( $cond ) {
			$where[] = $cond;
		}

		$commentQuery = CommentStore::getStore()->getJoin( 'log_comment' );

		$tables = array_merge(
			[ 'logging', 'actor' ], $commentQuery['tables']
		);
		$fields = [
			'log_id', 'log_type', 'log_action', 'log_timestamp', 'log_namespace',
			'log_title', 'log_params', 'log_deleted', 'actor_user', 'actor_name'
		] + $commentQuery['fields'];
		$options = [
			'ORDER BY' => 'log_id',
			'USE INDEX' => [ 'logging' => 'PRIMARY' ],
			'LIMIT' => self::BATCH_SIZE,
		];
		$joins = [
			'actor' => [ 'JOIN', 'actor_id=log_actor' ]
		] + $commentQuery['joins'];

		// Each batch resumes after the last log_id that was output.
		$lastLogId = 0;
		while ( true ) {
			$result = $this->db->select(
				$tables,
				$fields,
				array_merge( $where, [ 'log_id > ' . intval( $lastLogId ) ] ),
				__METHOD__,
				$options,
				$joins
			);

			if ( !$result->numRows() ) {
				break;
			}

			$lastLogId = $this->outputLogStream( $result );
		}
	}

	/**
	 * Streams page, revision and slot rows in batches of self::BATCH_SIZE.
	 * The shape of the query depends on $this->history.
	 *
	 * @param string $cond Extra SQL condition, or '' for none
	 * @param bool $orderRevs Not referenced by this implementation
	 * @throws MWException
	 * @throws Exception
	 */
	protected function dumpPages( $cond, $orderRevs ) {
		$revQuery = $this->revisionStore->getQueryInfo( [ 'page' ] );
		$slotQuery = $this->revisionStore->getSlotsQueryInfo( [ 'content' ] );

		// We want page primary rather than revision.
		// We also want to join in the slots and content tables.
		// NOTE: This means we may get multiple rows per revision, and more rows
		// than the batch size! Should be ok, since the max number of slots is
		// fixed and low (dozens at worst).
		$tables = array_merge( [ 'page' ], array_diff( $revQuery['tables'], [ 'page' ] ) );
		$tables = array_merge( $tables, array_diff( $slotQuery['tables'], $tables ) );
		$join = $revQuery['joins'] + [
			'revision' => $revQuery['joins']['page'],
			'slots' => [ 'JOIN', [ 'slot_revision_id = rev_id' ] ],
			'content' => [ 'JOIN', [ 'content_id = slot_content_id' ] ],
		];
		unset( $join['page'] );

		$fields = array_merge( $revQuery['fields'], $slotQuery['fields'] );
		$fields[] = 'page_restrictions';

		if ( $this->text != self::STUB ) {
			$fields['_load_content'] = '1';
		}

		$conds = [];
		if ( $cond !== '' ) {
			$conds[] = $cond;
		}
		$opts = [ 'ORDER BY' => [ 'rev_page ASC', 'rev_id ASC' ] ];
		$opts['USE INDEX'] = [];

		// Overall row cap; 0 means "no cap". Only set by array-style history specs.
		$maxRowCount = 0;

		$op = '>';
		if ( is_array( $this->history ) ) {
			# Time offset/limit for all pages/history...
			# Set time order
			if ( $this->history['dir'] == 'asc' ) {
				$opts['ORDER BY'] = 'rev_timestamp ASC';
			} else {
				$op = '<';
				$opts['ORDER BY'] = 'rev_timestamp DESC';
			}
			# Set offset
			if ( !empty( $this->history['offset'] ) ) {
				$conds[] = "rev_timestamp $op " .
					$this->db->addQuotes( $this->db->timestamp( $this->history['offset'] ) );
			}
			# Set query limit
			if ( !empty( $this->history['limit'] ) ) {
				$maxRowCount = intval( $this->history['limit'] );
			}
		} elseif ( $this->history & self::FULL ) {
			# Full history dumps...
			# query optimization for history stub dumps
			if ( $this->text == self::STUB ) {
				$opts[] = 'STRAIGHT_JOIN';
				$opts['USE INDEX']['revision'] = 'rev_page_id';
				unset( $join['revision'] );
				$join['page'] = [ 'JOIN', 'rev_page=page_id' ];
			}
		} elseif ( $this->history & self::CURRENT ) {
			# Latest revision dumps...
			if ( $this->list_authors && $cond != '' ) { // List authors, if so desired
				$this->do_list_authors( $cond );
			}
			$join['revision'] = [ 'JOIN', 'page_id=rev_page AND page_latest=rev_id' ];
			$opts[ 'ORDER BY' ] = [ 'page_id ASC' ];
		} elseif ( $this->history & self::STABLE ) {
			# "Stable" revision dumps...
			# Default JOIN, to be overridden...
			$join['revision'] = [ 'JOIN', 'page_id=rev_page AND page_latest=rev_id' ];
			# One, and only one hook should set this, and return false
			if ( $this->hookRunner->onWikiExporter__dumpStableQuery( $tables, $opts, $join ) ) {
				throw new MWException( __METHOD__ . " given invalid history dump type." );
			}
		} elseif ( $this->history & self::RANGE ) {
			# Dump of revisions within a specified range. Condition already set in revsByRange().
		} else {
			# Unknown history specification parameter?
			throw new MWException( __METHOD__ . " given invalid history dump type." );
		}

		$done = false;
		$lastRow = null;
		$revPage = 0;
		$revId = 0;
		$rowCount = 0;

		$opts['LIMIT'] = self::BATCH_SIZE;

		$this->hookRunner->onModifyExportQuery(
			$this->db, $tables, $cond, $opts, $join, $conds );

		while ( !$done ) {
			// If necessary, impose the overall maximum and stop looping after this iteration.
			if ( !empty( $maxRowCount ) && $rowCount + self::BATCH_SIZE > $maxRowCount ) {
				$opts['LIMIT'] = $maxRowCount - $rowCount;
				$done = true;
			}

			// Resume after the last (rev_page, rev_id) pair seen in the previous batch.
			$queryConds = $conds;
			$queryConds[] = 'rev_page>' . intval( $revPage ) . ' OR (rev_page=' .
				intval( $revPage ) . ' AND rev_id' . $op . intval( $revId ) . ')';

			# Do the query and process any results, remembering max ids for the next iteration.
			$result = $this->db->select(
				$tables,
				$fields,
				$queryConds,
				__METHOD__,
				$opts,
				$join
			);
			if ( $result->numRows() > 0 ) {
				$lastRow = $this->outputPageStreamBatch( $result, $lastRow );
				$rowCount += $result->numRows();
				$revPage = $lastRow->rev_page;
				$revId = $lastRow->rev_id;
			} else {
				$done = true;
			}

			// If we are finished, close off final page element (if any).
			if ( $done && $lastRow ) {
				$this->finishPageStreamOutput( $lastRow );
			}
		}
	}

	/**
	 * Runs through a query result set dumping page, revision, and slot records.
	 * The result set should join the page, revision, slots, and content tables,
	 * and be sorted/grouped by page and revision to avoid duplicate page records in the output.
	 *
	 * @param IResultWrapper $results
	 * @param stdClass $lastRow the last row output from the previous call (or null if none)
	 * @return stdClass the last row processed
	 * @throws LogicException If a carried-over slot row is left unprocessed
	 */
	protected function outputPageStreamBatch( $results, $lastRow ) {
		$rowCarry = null;
		while ( true ) {
			$slotRows = $this->getSlotRowBatch( $results, $rowCarry );

			if ( !$slotRows ) {
				break;
			}

			// All revision info is present in all slot rows.
			// Use the first slot row as the revision row.
			$revRow = $slotRows[0];

			// NOTE(review): a filtered-out row still becomes $lastRow (the caller
			// uses it to resume paging), so the next accepted page triggers a
			// close-page write for a page that never had an open-page written.
			// Confirm whether sinks depend on this before changing it.
			if ( $this->limitNamespaces &&
				!in_array( $revRow->page_namespace, $this->limitNamespaces ) ) {
				$lastRow = $revRow;
				continue;
			}

			// A change of namespace/title marks a page boundary: close the
			// previous page (if any) and open the new one.
			if ( $lastRow === null ||
				$lastRow->page_namespace !== $revRow->page_namespace ||
				$lastRow->page_title !== $revRow->page_title ) {
				if ( $lastRow !== null ) {
					$output = '';
					if ( $this->dumpUploads ) {
						$output .= $this->writer->writeUploads( $lastRow, $this->dumpUploadFileContents );
					}
					$output .= $this->writer->closePage();
					$this->sink->writeClosePage( $output );
				}
				$output = $this->writer->openPage( $revRow );
				$this->sink->writeOpenPage( $revRow, $output );
			}
			try {
				$output = $this->writer->writeRevision( $revRow, $slotRows );
				$this->sink->writeRevision( $revRow, $output );
			} catch ( RevisionAccessException $ex ) {
				// Skip the revision but keep dumping; the problem is only logged.
				MWDebug::warning( 'Problem encountered retrieving rev and slot metadata for'
					. ' revision ' . $revRow->rev_id . ': ' . $ex->getMessage() );
			}
			$lastRow = $revRow;
		}

		if ( $rowCarry ) {
			throw new LogicException( 'Error while processing a stream of slot rows' );
		}

		return $lastRow;
	}

	/**
	 * Returns all slot rows for a revision.
	 * Takes and returns a carry row from the last batch;
	 *
	 * Rows are read with fetchObject() until the rev_id changes; the first row
	 * of the following revision is stored in $carry for the next call.
	 *
	 * @param IResultWrapper $results Result set to read from (must provide fetchObject())
	 * @param null|stdClass &$carry A row carried over from the last call to getSlotRowBatch()
	 *
	 * @return stdClass[]
	 */
	protected function getSlotRowBatch( $results, &$carry = null ) {
		$slotRows = [];
		$prev = null;

		if ( $carry ) {
			$slotRows[] = $carry;
			$prev = $carry;
			$carry = null;
		}

		while ( $row = $results->fetchObject() ) {
			if ( $prev && $prev->rev_id !== $row->rev_id ) {
				$carry = $row;
				break;
			}
			$slotRows[] = $row;
			$prev = $row;
		}

		return $slotRows;
	}

	/**
	 * Final page stream output, after all batches are complete
	 *
	 * Writes uploads (if enabled), the collected author list and the page
	 * closing to the sink.
	 *
	 * @param stdClass $lastRow the last row output from the last batch (or null if none)
	 */
	protected function finishPageStreamOutput( $lastRow ) {
		$output = '';
		if ( $this->dumpUploads ) {
			$output .= $this->writer->writeUploads( $lastRow, $this->dumpUploadFileContents );
		}
		$output .= $this->author_list;
		$output .= $this->writer->closePage();
		$this->sink->writeClosePage( $output );
	}

	/**
	 * Writes every log row in the result set to the sink.
	 *
	 * @param IResultWrapper $resultset
	 * @return int|null the log_id value of the last item output, or null if none
	 */
	protected function outputLogStream( $resultset ) {
		foreach ( $resultset as $row ) {
			$output = $this->writer->writeLogItem( $row );
			$this->sink->writeLogItem( $row, $output );
		}
		// $row is unset when the result set was empty; ?? turns that into null.
		return $row->log_id ?? null;
	}
}