1<?php
2
3namespace MediaWiki\Extension\PdfHandler;
4
5use File;
6use IContextSource;
7use ImageHandler;
8use MediaTransformError;
9use MediaTransformOutput;
10use MediaWiki\MediaWikiServices;
11use PoolCounterWorkViaCallback;
12use ResourceLoader;
13use ThumbnailImage;
14use TransformParameterError;
15use Wikimedia\AtEase\AtEase;
16
17/**
18 * Copyright © 2007 Martin Seidel (Xarax) <jodeldi@gmx.de>
19 *
20 * Inspired by djvuhandler from Tim Starling
21 * Modified and written by Xarax
22 *
23 * This program is free software; you can redistribute it and/or modify
24 * it under the terms of the GNU General Public License as published by
25 * the Free Software Foundation; either version 2 of the License, or
26 * (at your option) any later version.
27 *
28 * This program is distributed in the hope that it will be useful,
29 * but WITHOUT ANY WARRANTY; without even the implied warranty of
30 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
31 * GNU General Public License for more details.
32 *
33 * You should have received a copy of the GNU General Public License along
34 * with this program; if not, write to the Free Software Foundation, Inc.,
35 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
36 * http://www.gnu.org/copyleft/gpl.html
37 */
38
39class PdfHandler extends ImageHandler {
40	private const MESSAGES = [
41		'main' => 'pdf-file-page-warning',
42		'header' => 'pdf-file-page-warning-header',
43		'info' => 'pdf-file-page-warning-info',
44		'footer' => 'pdf-file-page-warning-footer',
45	];
46
47	/**
48	 * 10MB is considered a large file
49	 */
50	private const LARGE_FILE = 1e7;
51
52	/**
53	 * @return bool
54	 */
55	public function isEnabled() {
56		global $wgPdfProcessor, $wgPdfPostProcessor, $wgPdfInfo;
57
58		if ( !isset( $wgPdfProcessor ) || !isset( $wgPdfPostProcessor ) || !isset( $wgPdfInfo ) ) {
59			wfDebug( "PdfHandler is disabled, please set the following\n" );
60			wfDebug( "variables in LocalSettings.php:\n" );
61			wfDebug( "\$wgPdfProcessor, \$wgPdfPostProcessor, \$wgPdfInfo\n" );
62			return false;
63		}
64		return true;
65	}
66
67	/**
68	 * @param File $file
69	 * @return bool
70	 */
71	public function mustRender( $file ) {
72		return true;
73	}
74
75	/**
76	 * @param File $file
77	 * @return bool
78	 */
79	public function isMultiPage( $file ) {
80		return true;
81	}
82
83	/**
84	 * @param string $name
85	 * @param string $value
86	 * @return bool
87	 */
88	public function validateParam( $name, $value ) {
89		if ( $name === 'page' && trim( $value ) !== (string)intval( $value ) ) {
90			// Extra junk on the end of page, probably actually a caption
91			// e.g. [[File:Foo.pdf|thumb|Page 3 of the document shows foo]]
92			return false;
93		}
94		if ( in_array( $name, [ 'width', 'height', 'page' ] ) ) {
95			return ( $value > 0 );
96		}
97		return false;
98	}
99
100	/**
101	 * @param array $params
102	 * @return bool|string
103	 */
104	public function makeParamString( $params ) {
105		$page = $params['page'] ?? 1;
106		if ( !isset( $params['width'] ) ) {
107			return false;
108		}
109		return "page{$page}-{$params['width']}px";
110	}
111
112	/**
113	 * @param string $str
114	 * @return array|bool
115	 */
116	public function parseParamString( $str ) {
117		$m = [];
118
119		if ( preg_match( '/^page(\d+)-(\d+)px$/', $str, $m ) ) {
120			return [ 'width' => $m[2], 'page' => $m[1] ];
121		}
122
123		return false;
124	}
125
126	/**
127	 * @param array $params
128	 * @return array
129	 */
130	public function getScriptParams( $params ) {
131		return [
132			'width' => $params['width'],
133			'page' => $params['page'],
134		];
135	}
136
137	/**
138	 * @return array
139	 */
140	public function getParamMap() {
141		return [
142			'img_width' => 'width',
143			'img_page' => 'page',
144		];
145	}
146
147	/**
148	 * @param int $width
149	 * @param int $height
150	 * @param string $msg
151	 * @return MediaTransformError
152	 */
153	protected function doThumbError( $width, $height, $msg ) {
154		return new MediaTransformError( 'thumbnail_error',
155			$width, $height, wfMessage( $msg )->inContentLanguage()->text() );
156	}
157
158	/**
159	 * @param File $image
160	 * @param string $dstPath
161	 * @param string $dstUrl
162	 * @param array $params
163	 * @param int $flags
164	 * @return MediaTransformError|MediaTransformOutput|ThumbnailImage|TransformParameterError
165	 */
166	public function doTransform( $image, $dstPath, $dstUrl, $params, $flags = 0 ) {
167		global $wgPdfProcessor, $wgPdfPostProcessor, $wgPdfHandlerDpi, $wgPdfHandlerJpegQuality;
168
169		if ( !$this->normaliseParams( $image, $params ) ) {
170			return new TransformParameterError( $params );
171		}
172
173		// @phan-suppress-next-line PhanTypePossiblyInvalidDimOffset
174		$width = (int)$params['width'];
175		$height = (int)$params['height'];
176		$page = (int)$params['page'];
177
178		if ( $page > $this->pageCount( $image ) ) {
179			return $this->doThumbError( $width, $height, 'pdf_page_error' );
180		}
181
182		if ( $flags & self::TRANSFORM_LATER ) {
183			return new ThumbnailImage( $image, $dstUrl, false, [
184				'width' => $width,
185				'height' => $height,
186				'page' => $page,
187			] );
188		}
189
190		if ( !wfMkdirParents( dirname( $dstPath ), null, __METHOD__ ) ) {
191			return $this->doThumbError( $width, $height, 'thumbnail_dest_directory' );
192		}
193
194		// Thumbnail extraction is very inefficient for large files.
195		// Provide a way to pool count limit the number of downloaders.
196		if ( $image->getSize() >= self::LARGE_FILE ) {
197			$work = new PoolCounterWorkViaCallback( 'GetLocalFileCopy', sha1( $image->getName() ),
198				[
199					'doWork' => function () use ( $image ) {
200						return $image->getLocalRefPath();
201					}
202				]
203			);
204			$srcPath = $work->execute();
205		} else {
206			$srcPath = $image->getLocalRefPath();
207		}
208
209		if ( $srcPath === false ) {
210			// could not download original
211			return $this->doThumbError( $width, $height, 'filemissing' );
212		}
213
214		$cmd = '(' . wfEscapeShellArg(
215			$wgPdfProcessor,
216			"-sDEVICE=jpeg",
217			"-sOutputFile=-",
218			"-sstdout=%stderr",
219			"-dFirstPage={$page}",
220			"-dLastPage={$page}",
221			"-dSAFER",
222			"-r{$wgPdfHandlerDpi}",
223			"-dBATCH",
224			"-dNOPAUSE",
225			"-q",
226			$srcPath
227		);
228		$cmd .= " | " . wfEscapeShellArg(
229			$wgPdfPostProcessor,
230			"-depth",
231			"8",
232			"-quality",
233			$wgPdfHandlerJpegQuality,
234			"-resize",
235			$width,
236			"-",
237			$dstPath
238		);
239		$cmd .= ")";
240
241		wfDebug( __METHOD__ . ": $cmd\n" );
242		$retval = '';
243		$err = wfShellExecWithStderr( $cmd, $retval );
244
245		$removed = $this->removeBadFile( $dstPath, $retval );
246
247		if ( $retval != 0 || $removed ) {
248			wfDebugLog( 'thumbnail',
249				sprintf( 'thumbnail failed on %s: error %d "%s" from "%s"',
250				wfHostname(), $retval, trim( $err ), $cmd ) );
251			return new MediaTransformError( 'thumbnail_error', $width, $height, $err );
252		} else {
253			return new ThumbnailImage( $image, $dstUrl, $dstPath, [
254				'width' => $width,
255				'height' => $height,
256				'page' => $page,
257			] );
258		}
259	}
260
261	/**
262	 * @param File $image
263	 * @param string $path
264	 * @return PdfImage
265	 * @suppress PhanUndeclaredProperty
266	 */
267	private function getPdfImage( $image, $path ) {
268		if ( !$image ) {
269			$pdfimg = new PdfImage( $path );
270		} elseif ( !isset( $image->pdfImage ) ) {
271			$pdfimg = $image->pdfImage = new PdfImage( $path );
272		} else {
273			$pdfimg = $image->pdfImage;
274		}
275
276		return $pdfimg;
277	}
278
279	/**
280	 * @param File $image
281	 * @return bool|array
282	 */
283	private function getMetaArray( $image ) {
284		if ( isset( $image->pdfMetaArray ) ) {
285			return $image->pdfMetaArray;
286		}
287
288		$metadata = $image->getMetadata();
289
290		if ( !$this->isMetadataValid( $image, $metadata ) ) {
291			wfDebug( "Pdf metadata is invalid or missing, should have been fixed in upgradeRow\n" );
292			return false;
293		}
294
295		$work = new PoolCounterWorkViaCallback(
296			'PdfHandler-unserialize-metadata',
297			$image->getName(),
298			[
299				/**
300				 * @suppress PhanUndeclaredProperty
301				 */
302				'doWork' => function () use ( $image, $metadata ) {
303					AtEase::suppressWarnings();
304					$image->pdfMetaArray = unserialize( $metadata );
305					AtEase::restoreWarnings();
306				},
307			]
308		);
309		$work->execute();
310
311		return $image->pdfMetaArray;
312	}
313
314	/**
315	 * @param File $image
316	 * @param string $path
317	 * @return array|bool
318	 */
319	public function getImageSize( $image, $path ) {
320		return $this->getPdfImage( $image, $path )->getImageSize();
321	}
322
323	/**
324	 * @param string $ext
325	 * @param string $mime
326	 * @param null $params
327	 * @return array
328	 */
329	public function getThumbType( $ext, $mime, $params = null ) {
330		global $wgPdfOutputExtension;
331		static $mime;
332
333		if ( !isset( $mime ) ) {
334			$magic = MediaWikiServices::getInstance()->getMimeAnalyzer();
335			$mime = $magic->guessTypesForExtension( $wgPdfOutputExtension );
336		}
337		return [ $wgPdfOutputExtension, $mime ];
338	}
339
340	/**
341	 * @param File $image
342	 * @param string $path
343	 * @return string
344	 */
345	public function getMetadata( $image, $path ) {
346		return serialize( $this->getPdfImage( $image, $path )->retrieveMetaData() );
347	}
348
349	/**
350	 * @param File $image
351	 * @param string $metadata
352	 * @return bool
353	 */
354	public function isMetadataValid( $image, $metadata ) {
355		if ( !$metadata || $metadata === serialize( [] ) ) {
356			return self::METADATA_BAD;
357		} elseif ( strpos( $metadata, 'mergedMetadata' ) === false ) {
358			return self::METADATA_COMPATIBLE;
359		}
360		return self::METADATA_GOOD;
361	}
362
363	/**
364	 * @param File $image
365	 * @param bool|IContextSource $context Context to use (optional)
366	 * @return bool|array
367	 */
368	public function formatMetadata( $image, $context = false ) {
369		$meta = $image->getMetadata();
370
371		if ( !$meta ) {
372			return false;
373		}
374		AtEase::suppressWarnings();
375		$meta = unserialize( $meta );
376		AtEase::restoreWarnings();
377
378		if ( !isset( $meta['mergedMetadata'] )
379			|| !is_array( $meta['mergedMetadata'] )
380			|| count( $meta['mergedMetadata'] ) < 1
381		) {
382			return false;
383		}
384
385		// Inherited from MediaHandler.
386		return $this->formatMetadataHelper( $meta['mergedMetadata'], $context );
387	}
388
389	/** @inheritDoc */
390	protected function formatTag( string $key, $vals, $context = false ) {
391		switch ( $key ) {
392			case 'pdf-Producer':
393			case 'pdf-Version':
394				return htmlspecialchars( $vals );
395			case 'pdf-PageSize':
396				foreach ( $vals as &$val ) {
397					$val = htmlspecialchars( $val );
398				}
399				return $vals;
400			case 'pdf-Encrypted':
401				// @todo: The value isn't i18n-ised; should be done here.
402				// For reference, if encrypted this field's value looks like:
403				// "yes (print:yes copy:no change:no addNotes:no)"
404				return htmlspecialchars( $vals );
405			default:
406				break;
407		}
408		// Use default formatting
409		return false;
410	}
411
412	/**
413	 * @param File $image
414	 * @return bool|int
415	 */
416	public function pageCount( File $image ) {
417		$info = $this->getDimensionInfo( $image );
418
419		return $info ? $info['pageCount'] : false;
420	}
421
422	/**
423	 * @param File $image
424	 * @param int $page
425	 * @return array|bool
426	 */
427	public function getPageDimensions( File $image, $page ) {
428		// MW starts pages at 1, as they are stored here
429		$index = $page;
430
431		$info = $this->getDimensionInfo( $image );
432		if ( $info && isset( $info['dimensionsByPage'][$index] ) ) {
433			return $info['dimensionsByPage'][$index];
434		}
435
436		return false;
437	}
438
439	protected function getDimensionInfo( File $file ) {
440		$cache = MediaWikiServices::getInstance()->getMainWANObjectCache();
441		return $cache->getWithSetCallback(
442			$cache->makeKey( 'file-pdf', 'dimensions', $file->getSha1() ),
443			$cache::TTL_INDEFINITE,
444			function () use ( $file ) {
445				$data = $this->getMetaArray( $file );
446				if ( !$data || !isset( $data['Pages'] ) ) {
447					return false;
448				}
449
450				// lower peak RAM
451				unset( $data['text'] );
452
453				$dimsByPage = [];
454				$count = intval( $data['Pages'] );
455				for ( $i = 1; $i <= $count; $i++ ) {
456					$dimsByPage[$i] = PdfImage::getPageSize( $data, $i );
457				}
458
459				return [ 'pageCount' => $count, 'dimensionsByPage' => $dimsByPage ];
460			},
461			[ 'pcTTL' => $cache::TTL_INDEFINITE ]
462		);
463	}
464
465	/**
466	 * @param File $image
467	 * @param int $page
468	 * @return bool
469	 */
470	public function getPageText( File $image, $page ) {
471		$data = $this->getMetaArray( $image );
472		if ( !$data || !isset( $data['text'] ) || !isset( $data['text'][$page - 1] ) ) {
473			return false;
474		}
475		return $data['text'][$page - 1];
476	}
477
478	/**
479	 * Adds a warning about PDFs being potentially dangerous to the file
480	 * page. Multiple messages with this base will be used.
481	 * @param File $file
482	 * @return array
483	 */
484	public function getWarningConfig( $file ) {
485		return [
486			'messages' => self::MESSAGES,
487			'link' => '//www.mediawiki.org/wiki/Special:MyLanguage/Help:Security/PDF_files',
488			'module' => 'pdfhandler.messages',
489		];
490	}
491
492	/**
493	 * Register a module with the warning messages in it.
494	 * @param ResourceLoader &$resourceLoader
495	 */
496	public static function registerWarningModule( &$resourceLoader ) {
497		$resourceLoader->register( 'pdfhandler.messages', [
498			'messages' => array_values( self::MESSAGES ),
499		] );
500	}
501}
502