1<?php
2/**
3 * Moves blobs indexed by trackBlobs.php to a specified list of destination
4 * clusters, and recompresses them in the process.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License along
17 * with this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 * http://www.gnu.org/copyleft/gpl.html
20 *
21 * @file
22 * @ingroup Maintenance ExternalStorage
23 */
24
25use MediaWiki\Logger\LegacyLogger;
26use MediaWiki\MediaWikiServices;
27use MediaWiki\Shell\Shell;
28use MediaWiki\Storage\SqlBlobStore;
29use Wikimedia\Rdbms\IMaintainableDatabase;
30
// commandLine.inc reads $optionsWithArgs to learn which options consume a
// value, so it has to be set before the include.
$optionsWithArgs = RecompressTracked::getOptionsWithArgs();
require __DIR__ . '/../commandLine.inc';

// At least one destination cluster is required; otherwise print usage and fail.
if ( count( $args ) < 1 ) {
	echo "Usage: php recompressTracked.php [options] <cluster> [... <cluster>...]
Moves blobs indexed by trackBlobs.php to a specified list of destination clusters,
and recompresses them in the process. Restartable.

Options:
	--procs <procs>       Set the number of child processes (default 1)
	--copy-only           Copy only, do not update the text table. Restart
	                      without this option to complete.
	--no-count            Do not count the pages/orphans up front; progress
	                      totals are then shown as [unknown]
	--debug-log <file>    Log debugging data to the specified file
	--info-log <file>     Log progress messages to the specified file
	--critical-log <file> Log error messages to the specified file
";
	exit( 1 );
}

// Positional arguments are the destination clusters; named options are mapped
// onto job properties by newFromCommandLine().
$job = RecompressTracked::newFromCommandLine( $args, $options );
$job->execute();
52
53/**
54 * Maintenance script that moves blobs indexed by trackBlobs.php to a specified
55 * list of destination clusters, and recompresses them in the process.
56 *
57 * @ingroup Maintenance ExternalStorage
58 */
59class RecompressTracked {
60	public $destClusters;
61	public $batchSize = 1000;
62	public $orphanBatchSize = 1000;
63	public $reportingInterval = 10;
64	public $numProcs = 1;
65	public $numBatches = 0;
66	public $pageBlobClass, $orphanBlobClass;
67	public $childPipes, $childProcs, $prevChildId;
68	public $copyOnly = false;
69	public $isChild = false;
70	public $childId = false;
71	public $noCount = false;
72	public $debugLog, $infoLog, $criticalLog;
73	/** @var ExternalStoreDB */
74	public $store;
75	/** @var SqlBlobStore */
76	private $blobStore;
77
78	private static $optionsWithArgs = [
79		'procs',
80		'child-id',
81		'debug-log',
82		'info-log',
83		'critical-log'
84	];
85
86	private static $cmdLineOptionMap = [
87		'no-count' => 'noCount',
88		'procs' => 'numProcs',
89		'copy-only' => 'copyOnly',
90		'child' => 'isChild',
91		'child-id' => 'childId',
92		'debug-log' => 'debugLog',
93		'info-log' => 'infoLog',
94		'critical-log' => 'criticalLog',
95	];
96
97	public static function getOptionsWithArgs() {
98		return self::$optionsWithArgs;
99	}
100
101	public static function newFromCommandLine( $args, $options ) {
102		$jobOptions = [ 'destClusters' => $args ];
103		foreach ( self::$cmdLineOptionMap as $cmdOption => $classOption ) {
104			if ( isset( $options[$cmdOption] ) ) {
105				$jobOptions[$classOption] = $options[$cmdOption];
106			}
107		}
108
109		return new self( $jobOptions );
110	}
111
112	public function __construct( $options ) {
113		foreach ( $options as $name => $value ) {
114			$this->$name = $value;
115		}
116		$esFactory = MediaWikiServices::getInstance()->getExternalStoreFactory();
117		$this->store = $esFactory->getStore( 'DB' );
118		if ( !$this->isChild ) {
119			$GLOBALS['wgDebugLogPrefix'] = "RCT M: ";
120		} elseif ( $this->childId !== false ) {
121			$GLOBALS['wgDebugLogPrefix'] = "RCT {$this->childId}: ";
122		}
123		$this->pageBlobClass = function_exists( 'xdiff_string_bdiff' ) ?
124			DiffHistoryBlob::class : ConcatenatedGzipHistoryBlob::class;
125		$this->orphanBlobClass = ConcatenatedGzipHistoryBlob::class;
126
127		$this->blobStore = MediaWikiServices::getInstance()
128			->getBlobStoreFactory()
129			->newSqlBlobStore();
130	}
131
132	public function debug( $msg ) {
133		wfDebug( "$msg" );
134		if ( $this->debugLog ) {
135			$this->logToFile( $msg, $this->debugLog );
136		}
137	}
138
139	public function info( $msg ) {
140		echo "$msg\n";
141		if ( $this->infoLog ) {
142			$this->logToFile( $msg, $this->infoLog );
143		}
144	}
145
146	public function critical( $msg ) {
147		echo "$msg\n";
148		if ( $this->criticalLog ) {
149			$this->logToFile( $msg, $this->criticalLog );
150		}
151	}
152
153	private function logToFile( $msg, $file ) {
154		$header = '[' . date( 'd\TH:i:s' ) . '] ' . wfHostname() . ' ' . posix_getpid();
155		if ( $this->childId !== false ) {
156			$header .= "({$this->childId})";
157		}
158		$header .= ' ' . WikiMap::getCurrentWikiDbDomain()->getId();
159		LegacyLogger::emit( sprintf( "%-50s %s\n", $header, $msg ), $file );
160	}
161
162	/**
163	 * Wait until the selected replica DB has caught up to the master.
164	 * This allows us to use the replica DB for things that were committed in a
165	 * previous part of this batch process.
166	 */
167	private function syncDBs() {
168		$dbw = wfGetDB( DB_MASTER );
169		$dbr = wfGetDB( DB_REPLICA );
170		$pos = $dbw->getMasterPos();
171		$dbr->masterPosWait( $pos, 100000 );
172	}
173
174	/**
175	 * Execute parent or child depending on the isChild option
176	 */
177	public function execute() {
178		if ( $this->isChild ) {
179			$this->executeChild();
180		} else {
181			$this->executeParent();
182		}
183	}
184
185	/**
186	 * Execute the parent process
187	 */
188	public function executeParent() {
189		if ( !$this->checkTrackingTable() ) {
190			return;
191		}
192
193		$this->syncDBs();
194		$this->startChildProcs();
195		$this->doAllPages();
196		$this->doAllOrphans();
197		$this->killChildProcs();
198	}
199
200	/**
201	 * Make sure the tracking table exists and isn't empty
202	 * @return bool
203	 */
204	private function checkTrackingTable() {
205		$dbr = wfGetDB( DB_REPLICA );
206		if ( !$dbr->tableExists( 'blob_tracking', __METHOD__ ) ) {
207			$this->critical( "Error: blob_tracking table does not exist" );
208
209			return false;
210		}
211		$row = $dbr->selectRow( 'blob_tracking', '*', '', __METHOD__ );
212		if ( !$row ) {
213			$this->info( "Warning: blob_tracking table contains no rows, skipping this wiki." );
214
215			return false;
216		}
217
218		return true;
219	}
220
221	/**
222	 * Start the worker processes.
223	 * These processes will listen on stdin for commands.
	 * This is necessary because text recompression is slow: loading, compressing and
225	 * writing are all slow.
226	 */
227	private function startChildProcs() {
228		$wiki = WikiMap::getCurrentWikiId();
229
230		$cmd = 'php ' . Shell::escape( __FILE__ );
231		foreach ( self::$cmdLineOptionMap as $cmdOption => $classOption ) {
232			if ( $cmdOption == 'child-id' ) {
233				continue;
234			} elseif ( in_array( $cmdOption, self::$optionsWithArgs ) && isset( $this->$classOption ) ) {
235				$cmd .= " --$cmdOption " . Shell::escape( $this->$classOption );
236			} elseif ( $this->$classOption ) {
237				$cmd .= " --$cmdOption";
238			}
239		}
240		$cmd .= ' --child' .
241			' --wiki ' . Shell::escape( $wiki ) .
242			' ' . Shell::escape( ...$this->destClusters );
243
244		$this->childPipes = $this->childProcs = [];
245		for ( $i = 0; $i < $this->numProcs; $i++ ) {
246			$pipes = [];
247			$spec = [
248				[ 'pipe', 'r' ],
249				[ 'file', 'php://stdout', 'w' ],
250				[ 'file', 'php://stderr', 'w' ]
251			];
252			Wikimedia\suppressWarnings();
253			$proc = proc_open( "$cmd --child-id $i", $spec, $pipes );
254			Wikimedia\restoreWarnings();
255			if ( !$proc ) {
256				$this->critical( "Error opening child process: $cmd" );
257				exit( 1 );
258			}
259			$this->childProcs[$i] = $proc;
260			$this->childPipes[$i] = $pipes[0];
261		}
262		$this->prevChildId = -1;
263	}
264
265	/**
266	 * Gracefully terminate the child processes
267	 */
268	private function killChildProcs() {
269		$this->info( "Waiting for child processes to finish..." );
270		for ( $i = 0; $i < $this->numProcs; $i++ ) {
271			$this->dispatchToChild( $i, 'quit' );
272		}
273		for ( $i = 0; $i < $this->numProcs; $i++ ) {
274			$status = proc_close( $this->childProcs[$i] );
275			if ( $status ) {
276				$this->critical( "Warning: child #$i exited with status $status" );
277			}
278		}
279		$this->info( "Done." );
280	}
281
282	/**
283	 * Dispatch a command to the next available child process.
284	 * This may block until a child process finishes its work and becomes available.
285	 * @param array|string ...$args
286	 */
287	private function dispatch( ...$args ) {
288		$pipes = $this->childPipes;
289		$x = [];
290		$y = [];
291		$numPipes = stream_select( $x, $pipes, $y, 3600 );
292		if ( !$numPipes ) {
293			$this->critical( "Error waiting to write to child process. Aborting" );
294			exit( 1 );
295		}
296		for ( $i = 0; $i < $this->numProcs; $i++ ) {
297			$childId = ( $i + $this->prevChildId + 1 ) % $this->numProcs;
298			if ( isset( $pipes[$childId] ) ) {
299				$this->prevChildId = $childId;
300				$this->dispatchToChild( $childId, $args );
301
302				return;
303			}
304		}
305		$this->critical( "Unreachable" );
306		exit( 1 );
307	}
308
309	/**
310	 * Dispatch a command to a specified child process
311	 * @param int $childId
312	 * @param array|string $args
313	 */
314	private function dispatchToChild( $childId, $args ) {
315		$args = (array)$args;
316		$cmd = implode( ' ', $args );
317		fwrite( $this->childPipes[$childId], "$cmd\n" );
318	}
319
320	/**
321	 * Move all tracked pages to the new clusters
322	 */
323	private function doAllPages() {
324		$dbr = wfGetDB( DB_REPLICA );
325		$i = 0;
326		$startId = 0;
327		if ( $this->noCount ) {
328			$numPages = '[unknown]';
329		} else {
330			$numPages = $dbr->selectField( 'blob_tracking',
331				'COUNT(DISTINCT bt_page)',
332				# A condition is required so that this query uses the index
333				[ 'bt_moved' => 0 ],
334				__METHOD__
335			);
336		}
337		if ( $this->copyOnly ) {
338			$this->info( "Copying pages..." );
339		} else {
340			$this->info( "Moving pages..." );
341		}
342		while ( true ) {
343			$res = $dbr->select( 'blob_tracking',
344				[ 'bt_page' ],
345				[
346					'bt_moved' => 0,
347					'bt_page > ' . $dbr->addQuotes( $startId )
348				],
349				__METHOD__,
350				[
351					'DISTINCT',
352					'ORDER BY' => 'bt_page',
353					'LIMIT' => $this->batchSize,
354				]
355			);
356			if ( !$res->numRows() ) {
357				break;
358			}
359			foreach ( $res as $row ) {
360				$startId = $row->bt_page;
361				$this->dispatch( 'doPage', $row->bt_page );
362				$i++;
363			}
364			$this->report( 'pages', $i, $numPages );
365		}
366		$this->report( 'pages', $i, $numPages );
367		if ( $this->copyOnly ) {
368			$this->info( "All page copies queued." );
369		} else {
370			$this->info( "All page moves queued." );
371		}
372	}
373
374	/**
375	 * Display a progress report
376	 * @param string $label
377	 * @param int $current
	 * @param int|string $end Total count, or '[unknown]' if counting was skipped
379	 */
380	private function report( $label, $current, $end ) {
381		$this->numBatches++;
382		if ( $current == $end || $this->numBatches >= $this->reportingInterval ) {
383			$this->numBatches = 0;
384			$this->info( "$label: $current / $end" );
385			MediaWikiServices::getInstance()->getDBLoadBalancerFactory()->waitForReplication();
386		}
387	}
388
389	/**
390	 * Move all orphan text to the new clusters
391	 */
392	private function doAllOrphans() {
393		$dbr = wfGetDB( DB_REPLICA );
394		$startId = 0;
395		$i = 0;
396		if ( $this->noCount ) {
397			$numOrphans = '[unknown]';
398		} else {
399			$numOrphans = $dbr->selectField( 'blob_tracking',
400				'COUNT(DISTINCT bt_text_id)',
401				[ 'bt_moved' => 0, 'bt_page' => 0 ],
402				__METHOD__ );
403			if ( !$numOrphans ) {
404				return;
405			}
406		}
407		if ( $this->copyOnly ) {
408			$this->info( "Copying orphans..." );
409		} else {
410			$this->info( "Moving orphans..." );
411		}
412
413		while ( true ) {
414			$res = $dbr->select( 'blob_tracking',
415				[ 'bt_text_id' ],
416				[
417					'bt_moved' => 0,
418					'bt_page' => 0,
419					'bt_text_id > ' . $dbr->addQuotes( $startId )
420				],
421				__METHOD__,
422				[
423					'DISTINCT',
424					'ORDER BY' => 'bt_text_id',
425					'LIMIT' => $this->batchSize
426				]
427			);
428			if ( !$res->numRows() ) {
429				break;
430			}
431			$ids = [];
432			foreach ( $res as $row ) {
433				$startId = $row->bt_text_id;
434				$ids[] = $row->bt_text_id;
435				$i++;
436			}
437			// Need to send enough orphan IDs to the child at a time to fill a blob,
438			// so orphanBatchSize needs to be at least ~100.
439			// batchSize can be smaller or larger.
440			while ( count( $ids ) > $this->orphanBatchSize ) {
441				$args = array_slice( $ids, 0, $this->orphanBatchSize );
442				$ids = array_slice( $ids, $this->orphanBatchSize );
443				array_unshift( $args, 'doOrphanList' );
444				$this->dispatch( ...$args );
445			}
446			if ( count( $ids ) ) {
447				$args = $ids;
448				array_unshift( $args, 'doOrphanList' );
449				$this->dispatch( ...$args );
450			}
451
452			$this->report( 'orphans', $i, $numOrphans );
453		}
454		$this->report( 'orphans', $i, $numOrphans );
455		$this->info( "All orphans queued." );
456	}
457
458	/**
459	 * Main entry point for worker processes
460	 */
461	public function executeChild() {
462		$this->debug( 'starting' );
463		$this->syncDBs();
464
465		while ( !feof( STDIN ) ) {
466			$line = rtrim( fgets( STDIN ) );
467			if ( $line == '' ) {
468				continue;
469			}
470			$this->debug( $line );
471			$args = explode( ' ', $line );
472			$cmd = array_shift( $args );
473			switch ( $cmd ) {
474				case 'doPage':
475					$this->doPage( intval( $args[0] ) );
476					break;
477				case 'doOrphanList':
478					$this->doOrphanList( array_map( 'intval', $args ) );
479					break;
480				case 'quit':
481					return;
482			}
483			MediaWikiServices::getInstance()->getDBLoadBalancerFactory()->waitForReplication();
484		}
485	}
486
487	/**
488	 * Move tracked text in a given page
489	 *
490	 * @param int $pageId
491	 */
492	private function doPage( $pageId ) {
493		$title = Title::newFromID( $pageId );
494		if ( $title ) {
495			$titleText = $title->getPrefixedText();
496		} else {
497			$titleText = '[deleted]';
498		}
499		$dbr = wfGetDB( DB_REPLICA );
500
501		// Finish any incomplete transactions
502		if ( !$this->copyOnly ) {
503			$this->finishIncompleteMoves( [ 'bt_page' => $pageId ] );
504			$this->syncDBs();
505		}
506
507		$startId = 0;
508		$trx = new CgzCopyTransaction( $this, $this->pageBlobClass );
509
510		$lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory();
511		while ( true ) {
512			$res = $dbr->select(
513				[ 'blob_tracking', 'text' ],
514				'*',
515				[
516					'bt_page' => $pageId,
517					'bt_text_id > ' . $dbr->addQuotes( $startId ),
518					'bt_moved' => 0,
519					'bt_new_url IS NULL',
520					'bt_text_id=old_id',
521				],
522				__METHOD__,
523				[
524					'ORDER BY' => 'bt_text_id',
525					'LIMIT' => $this->batchSize
526				]
527			);
528			if ( !$res->numRows() ) {
529				break;
530			}
531
532			$lastTextId = 0;
533			foreach ( $res as $row ) {
534				$startId = $row->bt_text_id;
535				if ( $lastTextId == $row->bt_text_id ) {
536					// Duplicate (null edit)
537					continue;
538				}
539				$lastTextId = $row->bt_text_id;
540				// Load the text
541				$text = $this->blobStore->expandBlob( $row->old_text, $row->old_flags );
542				if ( $text === false ) {
543					$this->critical( "Error loading {$row->bt_rev_id}/{$row->bt_text_id}" );
544					continue;
545				}
546
547				// Queue it
548				if ( !$trx->addItem( $text, $row->bt_text_id ) ) {
549					$this->debug( "$titleText: committing blob with " . $trx->getSize() . " items" );
550					$trx->commit();
551					$trx = new CgzCopyTransaction( $this, $this->pageBlobClass );
552					$lbFactory->waitForReplication();
553				}
554			}
555		}
556
557		$this->debug( "$titleText: committing blob with " . $trx->getSize() . " items" );
558		$trx->commit();
559	}
560
561	/**
562	 * Atomic move operation.
563	 *
564	 * Write the new URL to the text table and set the bt_moved flag.
565	 *
566	 * This is done in a single transaction to provide restartable behavior
567	 * without data loss.
568	 *
569	 * The transaction is kept short to reduce locking.
570	 *
571	 * @param int $textId
572	 * @param string $url
573	 */
574	public function moveTextRow( $textId, $url ) {
575		if ( $this->copyOnly ) {
576			$this->critical( "Internal error: can't call moveTextRow() in --copy-only mode" );
577			exit( 1 );
578		}
579		$dbw = wfGetDB( DB_MASTER );
580		$dbw->begin( __METHOD__ );
581		$dbw->update( 'text',
582			[ // set
583				'old_text' => $url,
584				'old_flags' => 'external,utf-8',
585			],
586			[ // where
587				'old_id' => $textId
588			],
589			__METHOD__
590		);
591		$dbw->update( 'blob_tracking',
592			[ 'bt_moved' => 1 ],
593			[ 'bt_text_id' => $textId ],
594			__METHOD__
595		);
596		$dbw->commit( __METHOD__ );
597	}
598
599	/**
600	 * Moves are done in two phases: bt_new_url and then bt_moved.
601	 *  - bt_new_url indicates that the text has been copied to the new cluster.
602	 *  - bt_moved indicates that the text table has been updated.
603	 *
604	 * This function completes any moves that only have done bt_new_url. This
605	 * can happen when the script is interrupted, or when --copy-only is used.
606	 *
607	 * @param array $conds
608	 */
609	private function finishIncompleteMoves( $conds ) {
610		$dbr = wfGetDB( DB_REPLICA );
611		$lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory();
612
613		$startId = 0;
614		$conds = array_merge( $conds, [
615			'bt_moved' => 0,
616			'bt_new_url IS NOT NULL'
617		] );
618		while ( true ) {
619			$res = $dbr->select( 'blob_tracking',
620				'*',
621				array_merge( $conds, [ 'bt_text_id > ' . $dbr->addQuotes( $startId ) ] ),
622				__METHOD__,
623				[
624					'ORDER BY' => 'bt_text_id',
625					'LIMIT' => $this->batchSize,
626				]
627			);
628			if ( !$res->numRows() ) {
629				break;
630			}
631			$this->debug( 'Incomplete: ' . $res->numRows() . ' rows' );
632			foreach ( $res as $row ) {
633				$startId = $row->bt_text_id;
634				$this->moveTextRow( $row->bt_text_id, $row->bt_new_url );
635				if ( $row->bt_text_id % 10 == 0 ) {
636					$lbFactory->waitForReplication();
637				}
638			}
639		}
640	}
641
642	/**
643	 * Returns the name of the next target cluster
644	 * @return string
645	 */
646	public function getTargetCluster() {
647		$cluster = next( $this->destClusters );
648		if ( $cluster === false ) {
649			$cluster = reset( $this->destClusters );
650		}
651
652		return $cluster;
653	}
654
655	/**
656	 * Gets a DB master connection for the given external cluster name
657	 * @param string $cluster
658	 * @return IMaintainableDatabase
659	 */
660	private function getExtDB( $cluster ) {
661		$lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory();
662		$lb = $lbFactory->getExternalLB( $cluster );
663
664		return $lb->getMaintenanceConnectionRef( DB_MASTER );
665	}
666
667	/**
	 * Move a list of orphan text_ids to the new cluster
669	 *
670	 * @param array $textIds
671	 */
672	private function doOrphanList( $textIds ) {
673		// Finish incomplete moves
674		if ( !$this->copyOnly ) {
675			$this->finishIncompleteMoves( [ 'bt_text_id' => $textIds ] );
676			$this->syncDBs();
677		}
678
679		$trx = new CgzCopyTransaction( $this, $this->orphanBlobClass );
680
681		$lbFactory = MediaWikiServices::getInstance()->getDBLoadBalancerFactory();
682		$res = wfGetDB( DB_REPLICA )->select(
683			[ 'text', 'blob_tracking' ],
684			[ 'old_id', 'old_text', 'old_flags' ],
685			[
686				'old_id' => $textIds,
687				'bt_text_id=old_id',
688				'bt_moved' => 0,
689			],
690			__METHOD__,
691			[ 'DISTINCT' ]
692		);
693
694		foreach ( $res as $row ) {
695			$text = $this->blobStore->expandBlob( $row->old_text, $row->old_flags );
696			if ( $text === false ) {
697				$this->critical( "Error: cannot load revision text for old_id={$row->old_id}" );
698				continue;
699			}
700
701			if ( !$trx->addItem( $text, $row->old_id ) ) {
702				$this->debug( "[orphan]: committing blob with " . $trx->getSize() . " rows" );
703				$trx->commit();
704				$trx = new CgzCopyTransaction( $this, $this->orphanBlobClass );
705				$lbFactory->waitForReplication();
706			}
707		}
708		$this->debug( "[orphan]: committing blob with " . $trx->getSize() . " rows" );
709		$trx->commit();
710	}
711}
712
713/**
714 * Class to represent a recompression operation for a single CGZ blob
715 */
class CgzCopyTransaction {
	/** @var RecompressTracked */
	public $parent;
	/** @var string Name of the blob class instantiated for this transaction */
	public $blobClass;
	/** @var ConcatenatedGzipHistoryBlob|false */
	public $cgz;
	/** @var array Map of text ID => hash returned by the blob's addItem() */
	public $referrers;
	/** @var array Map of text ID => text, kept so the blob can be rebuilt */
	private $texts;

	/**
	 * Create a transaction from a RecompressTracked object
	 * @param RecompressTracked $parent
	 * @param string $blobClass
	 */
	public function __construct( $parent, $blobClass ) {
		$this->blobClass = $blobClass;
		$this->cgz = false;
		// Start as an empty array, matching what recompress() resets it to,
		// so that it is never null.
		$this->referrers = [];
		$this->texts = [];
		$this->parent = $parent;
	}

	/**
	 * Add text.
	 * Returns false if it's ready to commit.
	 * @param string $text
	 * @param int $textId
	 * @return bool
	 */
	public function addItem( $text, $textId ) {
		// The blob object is created lazily on the first item
		if ( !$this->cgz ) {
			$class = $this->blobClass;
			$this->cgz = new $class;
		}
		$hash = $this->cgz->addItem( $text );
		$this->referrers[$textId] = $hash;
		$this->texts[$textId] = $text;

		return $this->cgz->isHappy();
	}

	/**
	 * Get the number of text items currently held.
	 * @return int
	 */
	public function getSize() {
		return count( $this->texts );
	}

	/**
	 * Recompress text after some aberrant modification
	 */
	public function recompress() {
		// Rebuild the blob and the hash map from the texts that remain
		$class = $this->blobClass;
		$this->cgz = new $class;
		$this->referrers = [];
		foreach ( $this->texts as $textId => $text ) {
			$hash = $this->cgz->addItem( $text );
			$this->referrers[$textId] = $hash;
		}
	}

	/**
	 * Commit the blob.
	 * Does nothing if no text items have been added.
	 * May skip the move if --copy-only is set.
	 */
	public function commit() {
		$originalCount = count( $this->texts );
		if ( !$originalCount ) {
			return;
		}

		/* Check to see if the target text_ids have been moved already.
		 *
		 * We originally read from the replica DB, so this can happen when a single
		 * text_id is shared between multiple pages. It's rare, but possible
		 * if a delete/move/undelete cycle splits up a null edit.
		 *
		 * We do a locking read to prevent closer-run race conditions.
		 */
		$dbw = wfGetDB( DB_MASTER );
		$dbw->begin( __METHOD__ );
		$res = $dbw->select( 'blob_tracking',
			[ 'bt_text_id', 'bt_moved' ],
			[ 'bt_text_id' => array_keys( $this->referrers ) ],
			__METHOD__, [ 'FOR UPDATE' ] );
		$dirty = false;
		foreach ( $res as $row ) {
			if ( $row->bt_moved ) {
				# This row has already been moved, remove it
				$this->parent->debug( "TRX: conflict detected in old_id={$row->bt_text_id}" );
				unset( $this->texts[$row->bt_text_id] );
				$dirty = true;
			}
		}

		// Recompress the blob if necessary
		if ( $dirty ) {
			if ( !count( $this->texts ) ) {
				// All have been moved already
				if ( $originalCount > 1 ) {
					// This is suspicious, make noise
					$this->parent->critical(
						"Warning: concurrent operation detected, are there two conflicting " .
						"processes running, doing the same job?" );
				}

				// Nothing was written. Close the transaction opened above so
				// the row locks are released and the connection is not left
				// inside a transaction for the next begin().
				$dbw->rollback( __METHOD__ );

				return;
			}
			$this->recompress();
		}

		// Insert the data into the destination cluster
		$targetCluster = $this->parent->getTargetCluster();
		$store = $this->parent->store;
		$targetDB = $store->getMaster( $targetCluster );
		$targetDB->clearFlag( DBO_TRX ); // we manage the transactions
		$targetDB->begin( __METHOD__ );
		$baseUrl = $this->parent->store->store( $targetCluster, serialize( $this->cgz ) );

		// Write the new URLs to the blob_tracking table
		foreach ( $this->referrers as $textId => $hash ) {
			$url = $baseUrl . '/' . $hash;
			$dbw->update( 'blob_tracking',
				[ 'bt_new_url' => $url ],
				[
					'bt_text_id' => $textId,
					'bt_moved' => 0, # Check for concurrent conflicting update
				],
				__METHOD__
			);
		}

		$targetDB->commit( __METHOD__ );
		// Critical section here: interruption at this point causes blob duplication
		// Reversing the order of the commits would cause data loss instead
		$dbw->commit( __METHOD__ );

		// Write the new URLs to the text table and set the moved flag
		if ( !$this->parent->copyOnly ) {
			foreach ( $this->referrers as $textId => $hash ) {
				$url = $baseUrl . '/' . $hash;
				$this->parent->moveTextRow( $textId, $url );
			}
		}
	}
}
860