<?php
/**
 * Base class for exporting
 *
 * Copyright © 2003, 2005, 2006 Brion Vibber <brion@pobox.com>
 * https://www.mediawiki.org/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 */

/**
 * @defgroup Dump Dump
 */

use MediaWiki\HookContainer\HookRunner;
use MediaWiki\MediaWikiServices;
use MediaWiki\Revision\RevisionRecord;
use MediaWiki\Revision\RevisionStore;
use Wikimedia\Rdbms\IDatabase;
use Wikimedia\Rdbms\IResultWrapper;

/**
 * Streams page/revision records or log records from the database, in batches,
 * through an XmlDumpWriter into a DumpOutput (or DumpFilter) sink.
 *
 * @ingroup SpecialPage Dump
 */
class WikiExporter {
	/** @var bool Return distinct author list (when not returning full history) */
	public $list_authors = false;

	/** @var bool Whether upload records are written before each page is closed */
	public $dumpUploads = false;

	/** @var bool Passed through to XmlDumpWriter::writeUploads() */
	public $dumpUploadFileContents = false;

	/** @var string XML fragment built by do_list_authors(), emitted when the final page is closed */
	public $author_list = "";

	public const FULL = 1;
	public const CURRENT = 2;
	public const STABLE = 4; // extension defined
	public const LOGS = 8;
	public const RANGE = 16;

	public const TEXT = XmlDumpWriter::WRITE_CONTENT;
	public const STUB = XmlDumpWriter::WRITE_STUB;

	/** Row limit applied to each individual query issued by dumpLogs() and dumpPages() */
	protected const BATCH_SIZE = 50000;

	/** @var int One of WikiExporter::TEXT or WikiExporter::STUB */
	public $text;

	/** @var DumpOutput */
	public $sink;

	/** @var XmlDumpWriter */
	private $writer;

	/** @var IDatabase */
	protected $db;

	/** @var array|int */
	protected $history;

	/** @var array|null */
	protected $limitNamespaces;

	/** @var RevisionStore */
	private $revisionStore;

	/** @var HookRunner */
	private $hookRunner;

	/**
	 * Returns the default export schema version, as defined by $wgXmlDumpSchemaVersion.
	 * @return string
	 */
	public static function schemaVersion() {
		global $wgXmlDumpSchemaVersion;
		return $wgXmlDumpSchemaVersion;
	}

	/**
	 * @param IDatabase $db
	 * @param int|array $history One of WikiExporter::FULL, WikiExporter::CURRENT,
	 *   WikiExporter::RANGE or WikiExporter::STABLE, or an associative array:
	 *   - offset: non-inclusive offset at which to start the query
	 *   - limit: maximum number of rows to return
	 *   - dir: "asc" or "desc" timestamp order
	 * @param int $text One of WikiExporter::TEXT or WikiExporter::STUB
	 * @param null|array $limitNamespaces Array of namespace numbers to limit
	 *   results to, or null for no restriction
	 */
	public function __construct(
		$db,
		$history = self::CURRENT,
		$text = self::TEXT,
		$limitNamespaces = null
	) {
		$this->db = $db;
		$this->history = $history;
		$this->writer = new XmlDumpWriter( $text, self::schemaVersion() );
		$this->sink = new DumpOutput();
		$this->text = $text;
		$this->limitNamespaces = $limitNamespaces;
		$services = MediaWikiServices::getInstance();
		$this->hookRunner = new HookRunner( $services->getHookContainer() );
		$this->revisionStore = $services->getRevisionStore();
	}

	/**
	 * Replaces the XML writer with one producing the given schema version.
	 *
	 * @param string $schemaVersion which schema version the generated XML should comply with.
	 *   One of the supported schema versions (see the XML_DUMP_SCHEMA_VERSION_XX constants).
	 */
	public function setSchemaVersion( $schemaVersion ) {
		$this->writer = new XmlDumpWriter( $this->text, $schemaVersion );
	}

	/**
	 * Set the DumpOutput or DumpFilter object which will receive
	 * various row objects and XML output for filtering. Filters
	 * can be chained or used as callbacks.
	 *
	 * @param DumpOutput|DumpFilter &$sink
	 */
	public function setOutputSink( &$sink ) {
		$this->sink =& $sink;
	}

	/**
	 * Sends the writer's stream-opening output to the sink.
	 */
	public function openStream() {
		$output = $this->writer->openStream();
		$this->sink->writeOpenStream( $output );
	}

	/**
	 * Sends the writer's stream-closing output to the sink.
	 */
	public function closeStream() {
		$output = $this->writer->closeStream();
		$this->sink->writeCloseStream( $output );
	}

	/**
	 * Dumps a series of page and revision records for all pages
	 * in the database, either including complete history or only
	 * the most recent version.
	 */
	public function allPages() {
		$this->dumpFrom( '' );
	}

	/**
	 * Dumps a series of page and revision records for those pages
	 * in the database falling within the page_id range given.
	 * @param int $start Inclusive lower limit (this id is included)
	 * @param int $end Exclusive upper limit (this id is not included)
	 *   If 0, no upper limit.
	 * @param bool $orderRevs order revisions within pages in ascending order.
	 *   When true the range condition is expressed on rev_page instead of page_id.
	 */
	public function pagesByRange( $start, $end, $orderRevs ) {
		// Both ids are forced to integers before being placed in the SQL fragment.
		if ( $orderRevs ) {
			$condition = 'rev_page >= ' . intval( $start );
			if ( $end ) {
				$condition .= ' AND rev_page < ' . intval( $end );
			}
		} else {
			$condition = 'page_id >= ' . intval( $start );
			if ( $end ) {
				$condition .= ' AND page_id < ' . intval( $end );
			}
		}
		$this->dumpFrom( $condition, $orderRevs );
	}

	/**
	 * Dumps a series of page and revision records for those pages
	 * in the database with revisions falling within the rev_id range given.
	 * @param int $start Inclusive lower limit (this id is included)
	 * @param int $end Exclusive upper limit (this id is not included)
	 *   If 0, no upper limit.
	 */
	public function revsByRange( $start, $end ) {
		$condition = 'rev_id >= ' . intval( $start );
		if ( $end ) {
			$condition .= ' AND rev_id < ' . intval( $end );
		}
		$this->dumpFrom( $condition );
	}

	/**
	 * Dumps the page identified by the given title.
	 *
	 * @param Title $title
	 */
	public function pageByTitle( $title ) {
		$this->dumpFrom(
			'page_namespace=' . $title->getNamespace() .
			' AND page_title=' . $this->db->addQuotes( $title->getDBkey() ) );
	}

	/**
	 * Dumps the page with the given name.
	 *
	 * @param string $name
	 * @throws MWException If the name cannot be turned into a Title
	 */
	public function pageByName( $name ) {
		$title = Title::newFromText( $name );
		if ( $title === null ) {
			throw new MWException( "Can't export invalid title" );
		}
		$this->pageByTitle( $title );
	}

	/**
	 * Dumps each of the named pages in turn.
	 *
	 * @param array $names
	 */
	public function pagesByName( $names ) {
		foreach ( $names as $name ) {
			$this->pageByName( $name );
		}
	}

	/**
	 * Dumps without any caller-supplied condition. Log entries are produced
	 * only when the exporter was constructed with WikiExporter::LOGS
	 * (see dumpFrom()).
	 */
	public function allLogs() {
		$this->dumpFrom( '' );
	}

	/**
	 * Dumps the records whose log_id falls within the range given.
	 *
	 * @param int $start Inclusive lower limit (this id is included)
	 * @param int $end Exclusive upper limit (this id is not included)
	 *   If 0, no upper limit.
	 */
	public function logsByRange( $start, $end ) {
		$condition = 'log_id >= ' . intval( $start );
		if ( $end ) {
			$condition .= ' AND log_id < ' . intval( $end );
		}
		$this->dumpFrom( $condition );
	}

	/**
	 * Generates the distinct list of authors of an article
	 * Not called by default (depends on $this->list_authors)
	 * Can be set by Special:Export when not exporting whole history
	 *
	 * The result is stored in $this->author_list as a <contributors> XML fragment.
	 *
	 * @param string $cond
	 */
	protected function do_list_authors( $cond ) {
		$this->author_list = "<contributors>";
		// rev_deleted

		$revQuery = $this->revisionStore->getQueryInfo( [ 'page' ] );
		$res = $this->db->select(
			$revQuery['tables'],
			[
				'rev_user_text' => $revQuery['fields']['rev_user_text'],
				'rev_user' => $revQuery['fields']['rev_user'],
			],
			[
				// Skip revisions whose user has been suppressed
				$this->db->bitAnd( 'rev_deleted', RevisionRecord::DELETED_USER ) . ' = 0',
				$cond,
			],
			__METHOD__,
			[ 'DISTINCT' ],
			$revQuery['joins']
		);

		foreach ( $res as $row ) {
			$this->author_list .= "<contributor>" .
				"<username>" .
				htmlspecialchars( $row->rev_user_text ) .
				"</username>" .
				"<id>" .
				( (int)$row->rev_user ) .
				"</id>" .
				"</contributor>";
		}
		$this->author_list .= "</contributors>";
	}

	/**
	 * Dispatches to dumpLogs() or dumpPages() depending on the history setting.
	 *
	 * @param string $cond
	 * @param bool $orderRevs
	 * @throws MWException
	 * @throws Exception
	 */
	protected function dumpFrom( $cond = '', $orderRevs = false ) {
		// $this->history may be the associative-array form (offset/limit/dir).
		// Applying a bitwise operator to an array throws a TypeError on PHP 8,
		// so only test the LOGS flag when the value really is an integer.
		if ( is_int( $this->history ) && ( $this->history & self::LOGS ) ) {
			$this->dumpLogs( $cond );
		} else {
			$this->dumpPages( $cond, $orderRevs );
		}
	}

	/**
	 * Dumps log entries in batches of BATCH_SIZE, ordered by log_id.
	 *
	 * @param string $cond
	 * @throws Exception
	 */
	protected function dumpLogs( $cond ) {
		$where = [];
		# Hide private logs
		$hideLogs = LogEventsList::getExcludeClause( $this->db );
		if ( $hideLogs ) {
			$where[] = $hideLogs;
		}
		# Add on any caller specified conditions
		if ( $cond ) {
			$where[] = $cond;
		}

		$commentQuery = CommentStore::getStore()->getJoin( 'log_comment' );
		$actorQuery = ActorMigration::newMigration()->getJoin( 'log_user' );

		$tables = array_merge(
			[ 'logging' ], $commentQuery['tables'], $actorQuery['tables'], [ 'user' ]
		);
		$fields = [
			'log_id', 'log_type', 'log_action', 'log_timestamp', 'log_namespace',
			'log_title', 'log_params', 'log_deleted', 'user_name'
		] + $commentQuery['fields'] + $actorQuery['fields'];
		$options = [
			'ORDER BY' => 'log_id',
			'USE INDEX' => [ 'logging' => 'PRIMARY' ],
			'LIMIT' => self::BATCH_SIZE,
		];
		$joins = [
			'user' => [ 'JOIN', 'user_id = ' . $actorQuery['fields']['log_user'] ]
		] + $commentQuery['joins'] + $actorQuery['joins'];

		// Page through the table: each batch starts after the last log_id already output.
		$lastLogId = 0;
		while ( true ) {
			$result = $this->db->select(
				$tables,
				$fields,
				array_merge( $where, [ 'log_id > ' . intval( $lastLogId ) ] ),
				__METHOD__,
				$options,
				$joins
			);

			if ( !$result->numRows() ) {
				break;
			}

			$lastLogId = $this->outputLogStream( $result );
		}
	}

	/**
	 * Dumps page, revision and slot records in batches of BATCH_SIZE.
	 *
	 * @param string $cond
	 * @param bool $orderRevs Not used by this implementation
	 * @throws MWException
	 * @throws Exception
	 */
	protected function dumpPages( $cond, $orderRevs ) {
		$revQuery = $this->revisionStore->getQueryInfo( [ 'page' ] );
		$slotQuery = $this->revisionStore->getSlotsQueryInfo( [ 'content' ] );

		// We want page primary rather than revision.
		// We also want to join in the slots and content tables.
		// NOTE: This means we may get multiple rows per revision, and more rows
		// than the batch size! Should be ok, since the max number of slots is
		// fixed and low (dozens at worst).
		$tables = array_merge( [ 'page' ], array_diff( $revQuery['tables'], [ 'page' ] ) );
		$tables = array_merge( $tables, array_diff( $slotQuery['tables'], $tables ) );
		$join = $revQuery['joins'] + [
			'revision' => $revQuery['joins']['page'],
			'slots' => [ 'JOIN', [ 'slot_revision_id = rev_id' ] ],
			'content' => [ 'JOIN', [ 'content_id = slot_content_id' ] ],
		];
		unset( $join['page'] );

		$fields = array_merge( $revQuery['fields'], $slotQuery['fields'] );
		$fields[] = 'page_restrictions';

		if ( $this->text != self::STUB ) {
			$fields['_load_content'] = '1';
		}

		$conds = [];
		if ( $cond !== '' ) {
			$conds[] = $cond;
		}
		$opts = [ 'ORDER BY' => [ 'rev_page ASC', 'rev_id ASC' ] ];
		$opts['USE INDEX'] = [];

		// Overall row cap; only set for the array form of $this->history.
		$maxRowCount = null;

		$op = '>';
		if ( is_array( $this->history ) ) {
			# Time offset/limit for all pages/history...
			# Set time order
			# (a missing 'dir' key is treated like any non-"asc" value: descending)
			if ( ( $this->history['dir'] ?? null ) == 'asc' ) {
				$opts['ORDER BY'] = 'rev_timestamp ASC';
			} else {
				$op = '<';
				$opts['ORDER BY'] = 'rev_timestamp DESC';
			}
			# Set offset
			if ( !empty( $this->history['offset'] ) ) {
				$conds[] = "rev_timestamp $op " .
					$this->db->addQuotes( $this->db->timestamp( $this->history['offset'] ) );
			}
			# Set query limit
			if ( !empty( $this->history['limit'] ) ) {
				$maxRowCount = intval( $this->history['limit'] );
			}
		} elseif ( $this->history & self::FULL ) {
			# Full history dumps...
			# query optimization for history stub dumps
			if ( $this->text == self::STUB ) {
				$opts[] = 'STRAIGHT_JOIN';
				$opts['USE INDEX']['revision'] = 'rev_page_id';
				unset( $join['revision'] );
				$join['page'] = [ 'JOIN', 'rev_page=page_id' ];
			}
		} elseif ( $this->history & self::CURRENT ) {
			# Latest revision dumps...
			if ( $this->list_authors && $cond != '' ) { // List authors, if so desired
				$this->do_list_authors( $cond );
			}
			$join['revision'] = [ 'JOIN', 'page_id=rev_page AND page_latest=rev_id' ];
			$opts[ 'ORDER BY' ] = [ 'page_id ASC' ];
		} elseif ( $this->history & self::STABLE ) {
			# "Stable" revision dumps...
			# Default JOIN, to be overridden...
			$join['revision'] = [ 'JOIN', 'page_id=rev_page AND page_latest=rev_id' ];
			# One, and only one hook should set this, and return false
			if ( $this->hookRunner->onWikiExporter__dumpStableQuery( $tables, $opts, $join ) ) {
				throw new MWException( __METHOD__ . " given invalid history dump type." );
			}
		} elseif ( $this->history & self::RANGE ) {
			# Dump of revisions within a specified range. Condition already set in revsByRange().
		} else {
			# Unknown history specification parameter?
			throw new MWException( __METHOD__ . " given invalid history dump type." );
		}

		$done = false;
		$lastRow = null;
		$revPage = 0;
		$revId = 0;
		$rowCount = 0;

		$opts['LIMIT'] = self::BATCH_SIZE;

		$this->hookRunner->onModifyExportQuery(
			$this->db, $tables, $cond, $opts, $join, $conds );

		while ( !$done ) {
			// If necessary, impose the overall maximum and stop looping after this iteration.
			if ( !empty( $maxRowCount ) && $rowCount + self::BATCH_SIZE > $maxRowCount ) {
				$opts['LIMIT'] = $maxRowCount - $rowCount;
				$done = true;
			}

			// Resume after the last (rev_page, rev_id) pair seen in the previous batch.
			$queryConds = $conds;
			$queryConds[] = 'rev_page>' . intval( $revPage ) . ' OR (rev_page=' .
				intval( $revPage ) . ' AND rev_id' . $op . intval( $revId ) . ')';

			# Do the query and process any results, remembering max ids for the next iteration.
			$result = $this->db->select(
				$tables,
				$fields,
				$queryConds,
				__METHOD__,
				$opts,
				$join
			);
			if ( $result->numRows() > 0 ) {
				$lastRow = $this->outputPageStreamBatch( $result, $lastRow );
				$rowCount += $result->numRows();
				$revPage = $lastRow->rev_page;
				$revId = $lastRow->rev_id;
			} else {
				$done = true;
			}

			// If we are finished, close off final page element (if any).
			if ( $done && $lastRow ) {
				$this->finishPageStreamOutput( $lastRow );
			}
		}
	}

	/**
	 * Runs through a query result set dumping page, revision, and slot records.
	 * The result set should join the page, revision, slots, and content tables,
	 * and be sorted/grouped by page and revision to avoid duplicate page records in the output.
	 *
	 * @param IResultWrapper $results
	 * @param object|null $lastRow the last row output from the previous call (or null if none)
	 * @return object|null the last row processed
	 */
	protected function outputPageStreamBatch( $results, $lastRow ) {
		$rowCarry = null;
		while ( true ) {
			$slotRows = $this->getSlotRowBatch( $results, $rowCarry );

			if ( !$slotRows ) {
				break;
			}

			// All revision info is present in all slot rows.
			// Use the first slot row as the revision row.
			$revRow = $slotRows[0];

			if ( $this->limitNamespaces &&
				!in_array( $revRow->page_namespace, $this->limitNamespaces ) ) {
				$lastRow = $revRow;
				continue;
			}

			// A change of namespace or title means a new page starts here.
			if ( $lastRow === null ||
				$lastRow->page_namespace !== $revRow->page_namespace ||
				$lastRow->page_title !== $revRow->page_title ) {
				if ( $lastRow !== null ) {
					$output = '';
					if ( $this->dumpUploads ) {
						$output .= $this->writer->writeUploads( $lastRow, $this->dumpUploadFileContents );
					}
					$output .= $this->writer->closePage();
					$this->sink->writeClosePage( $output );
				}
				$output = $this->writer->openPage( $revRow );
				$this->sink->writeOpenPage( $revRow, $output );
			}
			$output = $this->writer->writeRevision( $revRow, $slotRows );
			$this->sink->writeRevision( $revRow, $output );
			$lastRow = $revRow;
		}

		if ( $rowCarry ) {
			throw new LogicException( 'Error while processing a stream of slot rows' );
		}

		return $lastRow;
	}

	/**
	 * Returns all slot rows for a revision.
	 * Takes and returns a carry row from the last batch;
	 *
	 * Rows are read until one with a different rev_id is fetched; that row is
	 * handed back through $carry so the next call can start with it.
	 *
	 * @param IResultWrapper $results Must provide fetchObject()
	 * @param null|object &$carry A row carried over from the last call to getSlotRowBatch()
	 *
	 * @return object[]
	 */
	protected function getSlotRowBatch( $results, &$carry = null ) {
		$slotRows = [];
		$prev = null;

		if ( $carry ) {
			$slotRows[] = $carry;
			$prev = $carry;
			$carry = null;
		}

		while ( $row = $results->fetchObject() ) {
			if ( $prev && $prev->rev_id !== $row->rev_id ) {
				$carry = $row;
				break;
			}
			$slotRows[] = $row;
			$prev = $row;
		}

		return $slotRows;
	}

	/**
	 * Final page stream output, after all batches are complete
	 *
	 * Writes upload records (if enabled) and the collected author list,
	 * then closes the page element.
	 *
	 * @param object $lastRow the last row output from the last batch
	 */
	protected function finishPageStreamOutput( $lastRow ) {
		$output = '';
		if ( $this->dumpUploads ) {
			$output .= $this->writer->writeUploads( $lastRow, $this->dumpUploadFileContents );
		}
		$output .= $this->author_list;
		$output .= $this->writer->closePage();
		$this->sink->writeClosePage( $output );
	}

	/**
	 * Writes every log row of the result set to the sink.
	 *
	 * @param IResultWrapper $resultset
	 * @return int|null the log_id value of the last item output, or null if none
	 */
	protected function outputLogStream( $resultset ) {
		// Tracked explicitly rather than relying on the loop variable
		// still being in scope after the foreach.
		$lastLogId = null;
		foreach ( $resultset as $row ) {
			$output = $this->writer->writeLogItem( $row );
			$this->sink->writeLogItem( $row, $output );
			$lastLogId = $row->log_id ?? null;
		}
		return $lastLogId;
	}
}