1<?php
2/**
3 * Base class for exporting
4 *
5 * Copyright © 2003, 2005, 2006 Brion Vibber <brion@pobox.com>
6 * https://www.mediawiki.org/
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License along
19 * with this program; if not, write to the Free Software Foundation, Inc.,
20 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21 * http://www.gnu.org/copyleft/gpl.html
22 *
23 * @file
24 */
25
26/**
27 * @defgroup Dump Dump
28 */
29
30use MediaWiki\HookContainer\HookRunner;
31use MediaWiki\MediaWikiServices;
32use MediaWiki\Revision\RevisionRecord;
33use MediaWiki\Revision\RevisionStore;
34use Wikimedia\Rdbms\IDatabase;
35use Wikimedia\Rdbms\IResultWrapper;
36
/**
 * Streams wiki pages with their revisions, or log entries, from the database
 * as XML.
 *
 * Rows are selected in batches of at most self::BATCH_SIZE, serialized by an
 * XmlDumpWriter and handed to the configured output sink.
 *
 * @ingroup SpecialPage Dump
 */
class WikiExporter {
	/** @var bool Return distinct author list (when not returning full history) */
	public $list_authors = false;

	/** @var bool Whether upload records are written before each page is closed */
	public $dumpUploads = false;

	/** @var bool Passed through to XmlDumpWriter::writeUploads() when $dumpUploads is set */
	public $dumpUploadFileContents = false;

	/** @var string XML fragment built by do_list_authors(); appended when the final page is closed */
	public $author_list = "";

	public const FULL = 1;
	public const CURRENT = 2;
	public const STABLE = 4; // extension defined
	public const LOGS = 8;
	public const RANGE = 16;

	public const TEXT = XmlDumpWriter::WRITE_CONTENT;
	public const STUB = XmlDumpWriter::WRITE_STUB;

	/** Maximum number of rows requested from the database per query */
	protected const BATCH_SIZE = 50000;

	/** @var int One of self::TEXT or self::STUB */
	public $text;

	/** @var DumpOutput */
	public $sink;

	/** @var XmlDumpWriter */
	private $writer;

	/** @var IDatabase */
	protected $db;

	/** @var array|int History mode bit field, or an offset/limit/dir array (see __construct()) */
	protected $history;

	/** @var array|null Namespace numbers to restrict page output to, or null for no restriction */
	protected $limitNamespaces;

	/** @var RevisionStore */
	private $revisionStore;

	/** @var HookRunner */
	private $hookRunner;

	/**
	 * Returns the default export schema version, as defined by $wgXmlDumpSchemaVersion.
	 * @return string
	 */
	public static function schemaVersion() {
		global $wgXmlDumpSchemaVersion;
		return $wgXmlDumpSchemaVersion;
	}

	/**
	 * @param IDatabase $db
	 * @param int|array $history One of WikiExporter::FULL, WikiExporter::CURRENT,
	 *   WikiExporter::RANGE or WikiExporter::STABLE, or an associative array:
	 *   - offset: non-inclusive offset at which to start the query
	 *   - limit: maximum number of rows to return
	 *   - dir: "asc" or "desc" timestamp order
	 *   WikiExporter::LOGS selects log entries instead of pages (see dumpFrom()).
	 * @param int $text One of WikiExporter::TEXT or WikiExporter::STUB
	 * @param null|array $limitNamespaces Array of namespace numbers to limit
	 *   results to, or null for no restriction
	 */
	public function __construct(
		$db,
		$history = self::CURRENT,
		$text = self::TEXT,
		$limitNamespaces = null
	) {
		$this->db = $db;
		$this->history = $history;
		$this->text = $text;
		$this->limitNamespaces = $limitNamespaces;

		$this->writer = new XmlDumpWriter( $text, self::schemaVersion() );
		$this->sink = new DumpOutput();

		$services = MediaWikiServices::getInstance();
		$this->hookRunner = new HookRunner( $services->getHookContainer() );
		$this->revisionStore = $services->getRevisionStore();
	}

	/**
	 * Replaces the XML writer with one that targets the given schema version.
	 *
	 * @param string $schemaVersion which schema version the generated XML should comply to.
	 * Expected to be one of the XML_DUMP_SCHEMA_VERSION_XX constants; the value is
	 * handed to XmlDumpWriter unchanged and is not validated here.
	 */
	public function setSchemaVersion( $schemaVersion ) {
		$this->writer = new XmlDumpWriter( $this->text, $schemaVersion );
	}

	/**
	 * Set the DumpOutput or DumpFilter object which will receive
	 * various row objects and XML output for filtering. Filters
	 * can be chained or used as callbacks.
	 *
	 * The sink is stored by reference, so the caller's variable and
	 * $this->sink stay bound to each other.
	 *
	 * @param DumpOutput|DumpFilter &$sink
	 */
	public function setOutputSink( &$sink ) {
		$this->sink =& $sink;
	}

	/**
	 * Writes the stream header produced by the writer to the sink.
	 */
	public function openStream() {
		$output = $this->writer->openStream();
		$this->sink->writeOpenStream( $output );
	}

	/**
	 * Writes the stream footer produced by the writer to the sink.
	 */
	public function closeStream() {
		$output = $this->writer->closeStream();
		$this->sink->writeCloseStream( $output );
	}

	/**
	 * Dumps a series of page and revision records for all pages
	 * in the database, either including complete history or only
	 * the most recent version.
	 */
	public function allPages() {
		$this->dumpFrom( '' );
	}

	/**
	 * Dumps a series of page and revision records for those pages
	 * in the database falling within the page_id range given.
	 * @param int $start Inclusive lower limit (this id is included)
	 * @param int $end Exclusive upper limit (this id is not included)
	 *   If 0, no upper limit.
	 * @param bool $orderRevs order revisions within pages in ascending order
	 */
	public function pagesByRange( $start, $end, $orderRevs ) {
		// The range is expressed on rev_page when revision ordering is requested,
		// and on page_id otherwise.
		$field = $orderRevs ? 'rev_page' : 'page_id';
		$this->dumpFrom( $this->makeIdRangeCondition( $field, $start, $end ), $orderRevs );
	}

	/**
	 * Dumps a series of page and revision records for those pages
	 * in the database with revisions falling within the rev_id range given.
	 * @param int $start Inclusive lower limit (this id is included)
	 * @param int $end Exclusive upper limit (this id is not included)
	 *   If 0, no upper limit.
	 */
	public function revsByRange( $start, $end ) {
		$this->dumpFrom( $this->makeIdRangeCondition( 'rev_id', $start, $end ) );
	}

	/**
	 * Dumps the page identified by the given title.
	 *
	 * @param Title $title
	 */
	public function pageByTitle( $title ) {
		$this->dumpFrom(
			'page_namespace=' . intval( $title->getNamespace() ) .
			' AND page_title=' . $this->db->addQuotes( $title->getDBkey() ) );
	}

	/**
	 * Dumps the page with the given name.
	 *
	 * @param string $name
	 * @throws MWException if Title::newFromText() returns null for $name
	 */
	public function pageByName( $name ) {
		$title = Title::newFromText( $name );
		if ( $title === null ) {
			throw new MWException( "Can't export invalid title" );
		}
		$this->pageByTitle( $title );
	}

	/**
	 * Dumps each named page in turn; stops at the first invalid name because
	 * pageByName() throws.
	 *
	 * @param array $names
	 */
	public function pagesByName( $names ) {
		foreach ( $names as $name ) {
			$this->pageByName( $name );
		}
	}

	/**
	 * Dumps without any caller condition. Log entries are only produced when
	 * the history mode includes self::LOGS (see dumpFrom()).
	 */
	public function allLogs() {
		$this->dumpFrom( '' );
	}

	/**
	 * Dumps entries within the given log_id range.
	 *
	 * @param int $start Inclusive lower limit
	 * @param int $end Exclusive upper limit. If 0, no upper limit.
	 */
	public function logsByRange( $start, $end ) {
		$this->dumpFrom( $this->makeIdRangeCondition( 'log_id', $start, $end ) );
	}

	/**
	 * Builds the SQL fragment "<field> >= <start> [AND <field> < <end>]".
	 *
	 * Both bounds go through intval(), so only integers reach the query text.
	 *
	 * @param string $field Column name; must be a trusted identifier, it is not escaped
	 * @param int $start Inclusive lower limit
	 * @param int $end Exclusive upper limit; any falsy value means no upper limit
	 * @return string
	 */
	private function makeIdRangeCondition( $field, $start, $end ) {
		$condition = $field . ' >= ' . intval( $start );
		if ( $end ) {
			$condition .= ' AND ' . $field . ' < ' . intval( $end );
		}
		return $condition;
	}

	/**
	 * Generates the distinct list of authors of an article
	 * Not called by default (depends on $this->list_authors)
	 * Can be set by Special:Export when not exporting whole history
	 *
	 * The result is stored in $this->author_list as a <contributors> element.
	 *
	 * @param string $cond
	 */
	protected function do_list_authors( $cond ) {
		$this->author_list = "<contributors>";

		$revQuery = $this->revisionStore->getQueryInfo( [ 'page' ] );
		$res = $this->db->select(
			$revQuery['tables'],
			[
				'rev_user_text' => $revQuery['fields']['rev_user_text'],
				'rev_user' => $revQuery['fields']['rev_user'],
			],
			[
				// Leave out revisions whose user name has been hidden (rev_deleted)
				$this->db->bitAnd( 'rev_deleted', RevisionRecord::DELETED_USER ) . ' = 0',
				$cond,
			],
			__METHOD__,
			[ 'DISTINCT' ],
			$revQuery['joins']
		);

		foreach ( $res as $row ) {
			$this->author_list .= "<contributor>" .
				"<username>" .
				htmlspecialchars( $row->rev_user_text ) .
				"</username>" .
				"<id>" .
				( (int)$row->rev_user ) .
				"</id>" .
				"</contributor>";
		}
		$this->author_list .= "</contributors>";
	}

	/**
	 * Dispatches to dumpLogs() when the history mode has the self::LOGS bit
	 * set, and to dumpPages() otherwise.
	 *
	 * @param string $cond
	 * @param bool $orderRevs
	 * @throws MWException
	 * @throws Exception
	 */
	protected function dumpFrom( $cond = '', $orderRevs = false ) {
		if ( $this->history & self::LOGS ) {
			$this->dumpLogs( $cond );
		} else {
			$this->dumpPages( $cond, $orderRevs );
		}
	}

	/**
	 * Dumps log entries in ascending log_id order, one batch of at most
	 * self::BATCH_SIZE rows per query.
	 *
	 * @param string $cond Extra SQL condition, or an empty string for none
	 * @throws LogicException if outputLogStream() yields no log_id to continue from
	 * @throws Exception
	 */
	protected function dumpLogs( $cond ) {
		$where = [];
		# Hide private logs
		$hideLogs = LogEventsList::getExcludeClause( $this->db );
		if ( $hideLogs ) {
			$where[] = $hideLogs;
		}
		# Add on any caller specified conditions
		if ( $cond ) {
			$where[] = $cond;
		}

		$commentQuery = CommentStore::getStore()->getJoin( 'log_comment' );
		$actorQuery = ActorMigration::newMigration()->getJoin( 'log_user' );

		$tables = array_merge(
			[ 'logging' ], $commentQuery['tables'], $actorQuery['tables'], [ 'user' ]
		);
		$fields = [
			'log_id', 'log_type', 'log_action', 'log_timestamp', 'log_namespace',
			'log_title', 'log_params', 'log_deleted', 'user_name'
		] + $commentQuery['fields'] + $actorQuery['fields'];
		$options = [
			'ORDER BY' => 'log_id',
			'USE INDEX' => [ 'logging' => 'PRIMARY' ],
			'LIMIT' => self::BATCH_SIZE,
		];
		$joins = [
			'user' => [ 'JOIN', 'user_id = ' . $actorQuery['fields']['log_user'] ]
		] + $commentQuery['joins'] + $actorQuery['joins'];

		$lastLogId = 0;
		while ( true ) {
			// Each batch resumes strictly after the last log_id already written.
			$result = $this->db->select(
				$tables,
				$fields,
				array_merge( $where, [ 'log_id > ' . intval( $lastLogId ) ] ),
				__METHOD__,
				$options,
				$joins
			);

			if ( !$result->numRows() ) {
				break;
			}

			$lastLogId = $this->outputLogStream( $result );
			if ( $lastLogId === null ) {
				// Without a log_id the next query would restart at "log_id > 0"
				// and write the same batch again, forever.
				throw new LogicException(
					'Log pagination cannot continue: outputLogStream() returned no log_id'
				);
			}
		}
	}

	/**
	 * Dumps page, revision and slot rows in batches of at most
	 * self::BATCH_SIZE rows, according to the history mode given to the
	 * constructor.
	 *
	 * @param string $cond Extra SQL condition, or an empty string for none
	 * @param bool $orderRevs Not read by this method
	 * @throws MWException on an unknown history mode, or when no hook handles
	 *   the self::STABLE query
	 * @throws Exception
	 */
	protected function dumpPages( $cond, $orderRevs ) {
		$revQuery = $this->revisionStore->getQueryInfo( [ 'page' ] );
		$slotQuery = $this->revisionStore->getSlotsQueryInfo( [ 'content' ] );

		// We want page primary rather than revision.
		// We also want to join in the slots and content tables.
		// NOTE: This means we may get multiple rows per revision, and more rows
		// than the batch size! Should be ok, since the max number of slots is
		// fixed and low (dozens at worst).
		$tables = array_merge( [ 'page' ], array_diff( $revQuery['tables'], [ 'page' ] ) );
		$tables = array_merge( $tables, array_diff( $slotQuery['tables'], $tables ) );
		$join = $revQuery['joins'] + [
				'revision' => $revQuery['joins']['page'],
				'slots' => [ 'JOIN', [ 'slot_revision_id = rev_id' ] ],
				'content' => [ 'JOIN', [ 'content_id = slot_content_id' ] ],
			];
		unset( $join['page'] );

		$fields = array_merge( $revQuery['fields'], $slotQuery['fields'] );
		$fields[] = 'page_restrictions';

		if ( $this->text != self::STUB ) {
			$fields['_load_content'] = '1';
		}

		$conds = [];
		if ( $cond !== '' ) {
			$conds[] = $cond;
		}
		$opts = [ 'ORDER BY' => [ 'rev_page ASC', 'rev_id ASC' ] ];
		$opts['USE INDEX'] = [];

		// Comparison operator applied to rev_id when resuming after a batch.
		$op = '>';
		// Overall row cap requested through $this->history['limit']; 0 means none.
		$maxRowCount = 0;

		if ( is_array( $this->history ) ) {
			# Time offset/limit for all pages/history...
			# Set time order
			if ( $this->history['dir'] == 'asc' ) {
				$opts['ORDER BY'] = 'rev_timestamp ASC';
			} else {
				$op = '<';
				$opts['ORDER BY'] = 'rev_timestamp DESC';
			}
			# Set offset
			if ( !empty( $this->history['offset'] ) ) {
				$conds[] = "rev_timestamp $op " .
					$this->db->addQuotes( $this->db->timestamp( $this->history['offset'] ) );
			}
			# Set query limit
			if ( !empty( $this->history['limit'] ) ) {
				$maxRowCount = intval( $this->history['limit'] );
			}
		} elseif ( $this->history & self::FULL ) {
			# Full history dumps...
			# query optimization for history stub dumps
			if ( $this->text == self::STUB ) {
				$opts[] = 'STRAIGHT_JOIN';
				$opts['USE INDEX']['revision'] = 'rev_page_id';
				unset( $join['revision'] );
				$join['page'] = [ 'JOIN', 'rev_page=page_id' ];
			}
		} elseif ( $this->history & self::CURRENT ) {
			# Latest revision dumps...
			if ( $this->list_authors && $cond !== '' ) { // List authors, if so desired
				$this->do_list_authors( $cond );
			}
			$join['revision'] = [ 'JOIN', 'page_id=rev_page AND page_latest=rev_id' ];
			$opts[ 'ORDER BY' ] = [ 'page_id ASC' ];
		} elseif ( $this->history & self::STABLE ) {
			# "Stable" revision dumps...
			# Default JOIN, to be overridden...
			$join['revision'] = [ 'JOIN', 'page_id=rev_page AND page_latest=rev_id' ];
			# One, and only one hook should set this, and return false
			if ( $this->hookRunner->onWikiExporter__dumpStableQuery( $tables, $opts, $join ) ) {
				throw new MWException( __METHOD__ . " given invalid history dump type." );
			}
		} elseif ( $this->history & self::RANGE ) {
			# Dump of revisions within a specified range.  Condition already set in revsByRange().
		} else {
			# Unknown history specification parameter?
			throw new MWException( __METHOD__ . " given invalid history dump type." );
		}

		$done = false;
		$lastRow = null;
		$revPage = 0;
		$revId = 0;
		$rowCount = 0;

		$opts['LIMIT'] = self::BATCH_SIZE;

		$this->hookRunner->onModifyExportQuery(
			$this->db, $tables, $cond, $opts, $join, $conds );

		while ( !$done ) {
			// If necessary, impose the overall maximum and stop looping after this iteration.
			if ( $maxRowCount !== 0 && $rowCount + self::BATCH_SIZE > $maxRowCount ) {
				$opts['LIMIT'] = $maxRowCount - $rowCount;
				$done = true;
			}

			// Resume after the last (rev_page, rev_id) pair written by the previous batch.
			$queryConds = $conds;
			$queryConds[] = 'rev_page>' . intval( $revPage ) . ' OR (rev_page=' .
				intval( $revPage ) . ' AND rev_id' . $op . intval( $revId ) . ')';

			# Do the query and process any results, remembering max ids for the next iteration.
			$result = $this->db->select(
				$tables,
				$fields,
				$queryConds,
				__METHOD__,
				$opts,
				$join
			);
			if ( $result->numRows() > 0 ) {
				$lastRow = $this->outputPageStreamBatch( $result, $lastRow );
				$rowCount += $result->numRows();
				$revPage = $lastRow->rev_page;
				$revId = $lastRow->rev_id;
			} else {
				$done = true;
			}

			// If we are finished, close off final page element (if any).
			if ( $done && $lastRow ) {
				$this->finishPageStreamOutput( $lastRow );
			}
		}
	}

	/**
	 * Runs through a query result set dumping page, revision, and slot records.
	 * The result set should join the page, revision, slots, and content tables,
	 * and be sorted/grouped by page and revision to avoid duplicate page records in the output.
	 *
	 * The page that is still open when the result set ends is left open, so that
	 * the next batch can continue it; finishPageStreamOutput() closes it.
	 *
	 * @param IResultWrapper $results
	 * @param object|null $lastRow the last row output from the previous call (or null if none)
	 * @return object|null the last row processed; $lastRow unchanged if $results had no rows
	 * @throws LogicException if a slot row is left over after the result set is exhausted
	 */
	protected function outputPageStreamBatch( $results, $lastRow ) {
		$rowCarry = null;
		while ( true ) {
			$slotRows = $this->getSlotRowBatch( $results, $rowCarry );

			if ( !$slotRows ) {
				break;
			}

			// All revision info is present in all slot rows.
			// Use the first slot row as the revision row.
			$revRow = $slotRows[0];

			// NOTE(review): loose in_array() comparison; rows from the database
			// presumably carry page_namespace as a string - confirm before making it strict.
			if ( $this->limitNamespaces &&
				!in_array( $revRow->page_namespace, $this->limitNamespaces ) ) {
				// Skipped revisions still become $lastRow: the caller reads its
				// rev_page and rev_id to build the next batch query.
				// NOTE(review): a skipped row therefore also looks like an open page to
				// the page-change check below - confirm that is intended.
				$lastRow = $revRow;
				continue;
			}

			$pageChanged = $lastRow === null ||
				$lastRow->page_namespace !== $revRow->page_namespace ||
				$lastRow->page_title !== $revRow->page_title;
			if ( $pageChanged ) {
				if ( $lastRow !== null ) {
					$this->closePageElement( $lastRow );
				}
				$output = $this->writer->openPage( $revRow );
				$this->sink->writeOpenPage( $revRow, $output );
			}

			$output = $this->writer->writeRevision( $revRow, $slotRows );
			$this->sink->writeRevision( $revRow, $output );
			$lastRow = $revRow;
		}

		if ( $rowCarry ) {
			throw new LogicException( 'Error while processing a stream of slot rows' );
		}

		return $lastRow;
	}

	/**
	 * Returns all slot rows for a revision.
	 * Takes and returns a carry row from the last batch;
	 *
	 * Rows are read until one with a different rev_id appears; that row is
	 * stored in $carry and becomes the first row of the next call's result.
	 *
	 * @param IResultWrapper $results Anything else must provide a fetchObject()
	 *   method that returns a falsy value once exhausted
	 * @param null|object &$carry A row carried over from the last call to getSlotRowBatch()
	 *
	 * @return object[] Empty when there is neither a carry row nor any row left
	 */
	protected function getSlotRowBatch( $results, &$carry = null ) {
		$slotRows = [];
		$prev = null;

		if ( $carry ) {
			$slotRows[] = $carry;
			$prev = $carry;
			$carry = null;
		}

		while ( $row = $results->fetchObject() ) {
			if ( $prev && $prev->rev_id !== $row->rev_id ) {
				$carry = $row;
				break;
			}
			$slotRows[] = $row;
			$prev = $row;
		}

		return $slotRows;
	}

	/**
	 * Final page stream output, after all batches are complete
	 *
	 * Unlike the page closing done between pages, this also emits
	 * $this->author_list before the closing page markup.
	 *
	 * @param object $lastRow the last row output from the last batch
	 */
	protected function finishPageStreamOutput( $lastRow ) {
		$this->closePageElement( $lastRow, $this->author_list );
	}

	/**
	 * Sends the closing output for the currently open page to the sink:
	 * upload records (when $dumpUploads is set), then $extraXml, then the
	 * writer's closing page markup.
	 *
	 * @param object $lastRow Row of the page being closed
	 * @param string $extraXml XML emitted just before the closing page markup
	 */
	private function closePageElement( $lastRow, $extraXml = '' ) {
		$output = '';
		if ( $this->dumpUploads ) {
			$output .= $this->writer->writeUploads( $lastRow, $this->dumpUploadFileContents );
		}
		$output .= $extraXml;
		$output .= $this->writer->closePage();
		$this->sink->writeClosePage( $output );
	}

	/**
	 * Writes every log row of the result set to the sink.
	 *
	 * @param IResultWrapper $resultset
	 * @return int|null the log_id value of the last item output, or null if none
	 */
	protected function outputLogStream( $resultset ) {
		$lastLogId = null;
		foreach ( $resultset as $row ) {
			$output = $this->writer->writeLogItem( $row );
			$this->sink->writeLogItem( $row, $output );
			$lastLogId = $row->log_id ?? null;
		}
		return $lastLogId;
	}
}
606