1<?php
2/**
3 * Job to update links for a given title.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License along
16 * with this program; if not, write to the Free Software Foundation, Inc.,
17 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 * http://www.gnu.org/copyleft/gpl.html
19 *
20 * @file
21 * @ingroup JobQueue
22 */
23
24use MediaWiki\MediaWikiServices;
25
26/**
27 * Class with Backlink related Job helper methods
28 *
29 * When an asset changes, a base job can be inserted to update all assets that depend on it.
30 * The base job splits into per-title "leaf" jobs and a "remnant" job to handle the remaining
31 * range of backlinks. This recurs until the remnant job's backlink range is small enough that
32 * only leaf jobs are created from it.
33 *
34 * For example, if templates A and B are edited (at the same time) the queue will have:
35 *     (A base, B base)
36 * When these jobs run, the queue will have per-title and remnant partition jobs:
37 *     (titleX,titleY,titleZ,...,A remnant,titleM,titleN,titleO,...,B remnant)
38 *
39 * This works best when the queue is FIFO, for several reasons:
40 *   - a) Since the remnant jobs are enqueued after the leaf jobs, the slower leaf jobs have to
41 *        get popped prior to the fast remnant jobs. This avoids flooding the queue with leaf jobs
42 *        for every single backlink of widely used assets (which can be millions).
43 *   - b) Other jobs going in the queue still get a chance to run after a widely used asset changes.
44 *        This is due to the large remnant job pushing to the end of the queue with each division.
45 *
 * The size of the queues used in this manner depends on the number of asset changes and the
47 * number of workers. Also, with FIFO-per-partition queues, the queue size can be somewhat larger,
48 * depending on the number of queue partitions.
49 *
50 * @ingroup JobQueue
51 * @since 1.23
52 */
class BacklinkJobUtils {
	/**
	 * Break down $job into approximately ($bSize/$cSize) leaf jobs and a single partition
	 * job that covers the remaining backlink range (if needed). Jobs for the first $bSize
	 * titles are collated ($cSize per job) into leaf jobs to do actual work. All the
	 * resulting jobs are of the same class as $job. No partition job is returned if the
	 * range covered by $job was less than $bSize, as the leaf jobs have full coverage.
	 *
	 * The leaf jobs have the 'pages' param set to a (<page ID>:(<namespace>,<DB key>),...)
	 * map so that the run() function knows what pages to act on. The leaf jobs will keep
	 * the same job title as the parent job (e.g. $job).
	 *
	 * The partition jobs have the 'range' parameter set to a map of the format
	 * (start:<integer>, end:<integer>, batchSize:<integer>, subranges:((<start>,<end>),...)),
	 * the 'table' parameter set to that of $job, and the 'recursive' parameter set to true.
	 * This method can be called on the resulting job to repeat the process again.
	 *
	 * The job provided ($job) must have the 'recursive' parameter set to true and the 'table'
	 * parameter must be set to a backlink table. The job title will be used as the title to
	 * find backlinks for. Any 'range' parameter must follow the same format as mentioned above.
	 * This should be managed by recursive calls to this method.
	 *
	 * The first jobs returned are always the leaf jobs. This lets the caller use push() to
	 * put them directly into the queue and works well if the queue is FIFO. In such a queue,
	 * the leaf jobs have to get finished first before anything can resolve the next partition
	 * job, which keeps the queue very small.
	 *
	 * $opts includes:
	 *   - params : extra job parameters to include in each job
	 *
	 * @param Job $job
	 * @param int $bSize BacklinkCache partition size; usually $wgUpdateRowsPerJob
	 * @param int $cSize Max titles per leaf job; Usually 1 or a modest value
	 * @param array $opts Optional parameter map
	 * @return Job[] Leaf jobs first, then (possibly) one remnant partition job
	 */
	public static function partitionBacklinkJob( Job $job, $bSize, $cSize, array $opts = [] ) {
		$class = get_class( $job );
		$title = $job->getTitle();
		$params = $job->getParams();

		$backlinkCache = MediaWikiServices::getInstance()->getBacklinkCacheFactory()
			->getBacklinkCache( $title );
		if ( isset( $params['pages'] ) || empty( $params['recursive'] ) ) {
			// Sanity: this is a leaf node; partitioning it again would cause
			// unbounded (explosive) job recursion, so produce no subranges
			$ranges = [];
			$realBSize = 0;
			wfWarn( __METHOD__ . " called on {$job->getType()} leaf job (explosive recursion)." );
		} elseif ( isset( $params['range'] ) ) {
			// This is a range job to trigger the insertion of partitioned/title jobs...
			$ranges = $params['range']['subranges'];
			$realBSize = $params['range']['batchSize'];
		} else {
			// This is a base job to trigger the insertion of partitioned jobs...
			$ranges = $backlinkCache->partition( $params['table'], $bSize );
			$realBSize = $bSize;
		}

		$extraParams = $opts['params'] ?? [];

		$jobs = [];
		// Combine the first range (of size $bSize) backlinks into leaf jobs
		if ( isset( $ranges[0] ) ) {
			[ $start, $end ] = $ranges[0];
			$iter = $backlinkCache->getLinks( $params['table'], $start, $end );
			$titles = iterator_to_array( $iter );
			/** @var Title[] $titleBatch */
			foreach ( array_chunk( $titles, $cSize ) as $titleBatch ) {
				// Map each backlinking page to (<namespace>, <DB key>) keyed by page ID
				$pages = [];
				foreach ( $titleBatch as $tl ) {
					$pages[$tl->getArticleID()] = [ $tl->getNamespace(), $tl->getDBkey() ];
				}
				$jobs[] = new $class(
					$title, // maintain parent job title
					[ 'pages' => $pages ] + $extraParams
				);
			}
		}
		// Take all of the remaining ranges and build a partition job from it
		if ( isset( $ranges[1] ) ) {
			$jobs[] = new $class(
				$title, // maintain parent job title
				[
					'recursive'     => true,
					'table'         => $params['table'],
					'range'         => [
						'start'     => $ranges[1][0],
						'end'       => $ranges[count( $ranges ) - 1][1],
						'batchSize' => $realBSize,
						'subranges' => array_slice( $ranges, 1 )
					],
					// Track how many times the base job divided for debugging
					'division'      => isset( $params['division'] )
						? ( $params['division'] + 1 )
						: 1
				] + $extraParams
			);
		}

		return $jobs;
	}
}
154