<?php

namespace MediaWiki\Extension\PdfHandler;

use File;
use IContextSource;
use ImageHandler;
use MediaTransformError;
use MediaTransformOutput;
use MediaWiki\MediaWikiServices;
use PoolCounterWorkViaCallback;
use ResourceLoader;
use ThumbnailImage;
use TransformParameterError;
use Wikimedia\AtEase\AtEase;

/**
 * Copyright © 2007 Martin Seidel (Xarax) <jodeldi@gmx.de>
 *
 * Inspired by djvuhandler from Tim Starling
 * Modified and written by Xarax
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 */

class PdfHandler extends ImageHandler {
	/**
	 * Message keys used for the file page warning, keyed by their role.
	 */
	private const MESSAGES = [
		'main' => 'pdf-file-page-warning',
		'header' => 'pdf-file-page-warning-header',
		'info' => 'pdf-file-page-warning-info',
		'footer' => 'pdf-file-page-warning-footer',
	];

	/**
	 * 10MB is considered a large file
	 */
	private const LARGE_FILE = 1e7;

	/**
	 * Check that the three external tool globals are configured.
	 *
	 * @return bool False (after logging a debug hint) when any of
	 *  $wgPdfProcessor, $wgPdfPostProcessor or $wgPdfInfo is unset or null
	 */
	public function isEnabled() {
		global $wgPdfProcessor, $wgPdfPostProcessor, $wgPdfInfo;

		// isset() with several arguments is true only if every one of them is set.
		if ( isset( $wgPdfProcessor, $wgPdfPostProcessor, $wgPdfInfo ) ) {
			return true;
		}

		wfDebug( "PdfHandler is disabled, please set the following\n" );
		wfDebug( "variables in LocalSettings.php:\n" );
		wfDebug( "\$wgPdfProcessor, \$wgPdfPostProcessor, \$wgPdfInfo\n" );
		return false;
	}

	/**
	 * @param File $file
	 * @return bool Always true
	 */
	public function mustRender( $file ) {
		return true;
	}

	/**
	 * @param File $file
	 * @return bool Always true
	 */
	public function isMultiPage( $file ) {
		return true;
	}

	/**
	 * Validate a single transform parameter.
	 *
	 * Only 'width', 'height' and 'page' are accepted, and only with a
	 * value greater than zero.
	 *
	 * @param string $name
	 * @param string $value
	 * @return bool
	 */
	public function validateParam( $name, $value ) {
		if ( $name === 'page' && trim( $value ) !== (string)intval( $value ) ) {
			// Extra junk on the end of page, probably actually a caption
			// e.g. [[File:Foo.pdf|thumb|Page 3 of the document shows foo]]
			return false;
		}

		// Strict comparison so that only the exact parameter names match.
		if ( in_array( $name, [ 'width', 'height', 'page' ], true ) ) {
			return ( $value > 0 );
		}

		return false;
	}

	/**
	 * Build the "page<N>-<W>px" parameter string; the page defaults to 1.
	 *
	 * @param array $params
	 * @return bool|string False when no width is given
	 */
	public function makeParamString( $params ) {
		if ( !isset( $params['width'] ) ) {
			return false;
		}

		$page = $params['page'] ?? 1;
		return "page{$page}-{$params['width']}px";
	}

	/**
	 * Inverse of makeParamString().
	 *
	 * @param string $str
	 * @return array|bool Array with 'width' and 'page' (as digit strings),
	 *  or false when the string does not have the expected form
	 */
	public function parseParamString( $str ) {
		$m = [];

		if ( !preg_match( '/^page(\d+)-(\d+)px$/', $str, $m ) ) {
			return false;
		}

		return [ 'width' => $m[2], 'page' => $m[1] ];
	}

	/**
	 * Reduce the transform parameters to 'width' and 'page'.
	 *
	 * @param array $params Must contain 'width' and 'page'
	 * @return array
	 */
	public function getScriptParams( $params ) {
		return [
			'width' => $params['width'],
			'page' => $params['page'],
		];
	}

	/**
	 * @return array Map of magic word ID to parameter name
	 */
	public function getParamMap() {
		return [
			'img_width' => 'width',
			'img_page' => 'page',
		];
	}

	/**
	 * Build a 'thumbnail_error' transform error from a message key.
	 *
	 * @param int $width
	 * @param int $height
	 * @param string $msg Message key, rendered in the content language
	 * @return MediaTransformError
	 */
	protected function doThumbError( $width, $height, $msg ) {
		$text = wfMessage( $msg )->inContentLanguage()->text();

		return new MediaTransformError( 'thumbnail_error', $width, $height, $text );
	}

	/**
	 * Render one page of the PDF to a thumbnail.
	 *
	 * The page is rasterised by $wgPdfProcessor and piped through
	 * $wgPdfPostProcessor, which resizes it and writes it to $dstPath.
	 *
	 * @param File $image
	 * @param string $dstPath
	 * @param string $dstUrl
	 * @param array $params
	 * @param int $flags Bitfield; with self::TRANSFORM_LATER no rendering is done
	 * @return MediaTransformError|MediaTransformOutput|ThumbnailImage|TransformParameterError
	 */
	public function doTransform( $image, $dstPath, $dstUrl, $params, $flags = 0 ) {
		global $wgPdfProcessor, $wgPdfPostProcessor, $wgPdfHandlerDpi, $wgPdfHandlerJpegQuality;

		if ( !$this->normaliseParams( $image, $params ) ) {
			return new TransformParameterError( $params );
		}

		// @phan-suppress-next-line PhanTypePossiblyInvalidDimOffset
		$width = (int)$params['width'];
		$height = (int)$params['height'];
		$page = (int)$params['page'];

		if ( $page > $this->pageCount( $image ) ) {
			return $this->doThumbError( $width, $height, 'pdf_page_error' );
		}

		// Shared by the deferred and the rendered thumbnail objects.
		$thumbParams = [
			'width' => $width,
			'height' => $height,
			'page' => $page,
		];

		if ( $flags & self::TRANSFORM_LATER ) {
			return new ThumbnailImage( $image, $dstUrl, false, $thumbParams );
		}

		if ( !wfMkdirParents( dirname( $dstPath ), null, __METHOD__ ) ) {
			return $this->doThumbError( $width, $height, 'thumbnail_dest_directory' );
		}

		// Thumbnail extraction is very inefficient for large files.
		// Provide a way to pool count limit the number of downloaders.
		if ( $image->getSize() >= self::LARGE_FILE ) {
			$work = new PoolCounterWorkViaCallback( 'GetLocalFileCopy', sha1( $image->getName() ),
				[
					'doWork' => function () use ( $image ) {
						return $image->getLocalRefPath();
					}
				]
			);
			$srcPath = $work->execute();
		} else {
			$srcPath = $image->getLocalRefPath();
		}

		if ( $srcPath === false ) {
			// could not download original
			return $this->doThumbError( $width, $height, 'filemissing' );
		}

		// Rasterise the requested page to JPEG on stdout ...
		$cmd = '(' . wfEscapeShellArg(
			$wgPdfProcessor,
			"-sDEVICE=jpeg",
			"-sOutputFile=-",
			"-sstdout=%stderr",
			"-dFirstPage={$page}",
			"-dLastPage={$page}",
			"-dSAFER",
			"-r{$wgPdfHandlerDpi}",
			"-dBATCH",
			"-dNOPAUSE",
			"-q",
			$srcPath
		);
		// ... and pipe it into the post-processor, which writes $dstPath.
		$cmd .= " | " . wfEscapeShellArg(
			$wgPdfPostProcessor,
			"-depth",
			"8",
			"-quality",
			$wgPdfHandlerJpegQuality,
			"-resize",
			$width,
			"-",
			$dstPath
		);
		$cmd .= ")";

		wfDebug( __METHOD__ . ": $cmd\n" );
		$retval = '';
		$err = wfShellExecWithStderr( $cmd, $retval );

		$removed = $this->removeBadFile( $dstPath, $retval );

		if ( $retval != 0 || $removed ) {
			wfDebugLog( 'thumbnail',
				sprintf( 'thumbnail failed on %s: error %d "%s" from "%s"',
				wfHostname(), $retval, trim( $err ), $cmd ) );
			return new MediaTransformError( 'thumbnail_error', $width, $height, $err );
		}

		return new ThumbnailImage( $image, $dstUrl, $dstPath, $thumbParams );
	}

	/**
	 * Get a PdfImage for the given path, memoised on the File object
	 * (as a dynamic "pdfImage" property) when a File is available.
	 *
	 * @param File|false|null $image A falsy value skips the memoisation
	 * @param string $path
	 * @return PdfImage
	 * @suppress PhanUndeclaredProperty
	 */
	private function getPdfImage( $image, $path ) {
		if ( !$image ) {
			return new PdfImage( $path );
		}

		if ( !isset( $image->pdfImage ) ) {
			$image->pdfImage = new PdfImage( $path );
		}

		return $image->pdfImage;
	}

	/**
	 * Get the unserialized metadata of a file, memoised on the File
	 * object (as a dynamic "pdfMetaArray" property).
	 *
	 * @param File $image
	 * @return bool|array False when the metadata is missing, invalid or
	 *  could not be unserialized
	 * @suppress PhanUndeclaredProperty
	 */
	private function getMetaArray( $image ) {
		if ( isset( $image->pdfMetaArray ) ) {
			return $image->pdfMetaArray;
		}

		$metadata = $image->getMetadata();

		if ( !$this->isMetadataValid( $image, $metadata ) ) {
			wfDebug( "Pdf metadata is invalid or missing, should have been fixed in upgradeRow\n" );
			return false;
		}

		$work = new PoolCounterWorkViaCallback(
			'PdfHandler-unserialize-metadata',
			$image->getName(),
			[
				/**
				 * @suppress PhanUndeclaredProperty
				 */
				'doWork' => function () use ( $image, $metadata ) {
					AtEase::suppressWarnings();
					$image->pdfMetaArray = unserialize( $metadata );
					AtEase::restoreWarnings();
				},
			]
		);
		$work->execute();

		// The property is only assigned inside the callback. If the callback
		// did not run (NOTE(review): presumably on a pool counter error or
		// timeout - confirm), fall back to false rather than reading an
		// undefined property.
		return $image->pdfMetaArray ?? false;
	}

	/**
	 * @param File $image
	 * @param string $path
	 * @return array|bool
	 */
	public function getImageSize( $image, $path ) {
		$pdfImage = $this->getPdfImage( $image, $path );

		return $pdfImage->getImageSize();
	}

	/**
	 * Get the extension and MIME type of generated thumbnails.
	 *
	 * Both are derived from $wgPdfOutputExtension only; the $ext and $mime
	 * arguments are not used. The MIME lookup is remembered for the rest
	 * of the request.
	 *
	 * @param string $ext Unused
	 * @param string $mime Unused
	 * @param array|null $params Unused
	 * @return array [ extension, MIME type ]
	 */
	public function getThumbType( $ext, $mime, $params = null ) {
		global $wgPdfOutputExtension;
		// Deliberately not named $mime: a static of that name would
		// silently rebind the parameter.
		static $thumbMime;

		if ( !isset( $thumbMime ) ) {
			$magic = MediaWikiServices::getInstance()->getMimeAnalyzer();
			$thumbMime = $magic->guessTypesForExtension( $wgPdfOutputExtension );
		}

		return [ $wgPdfOutputExtension, $thumbMime ];
	}

	/**
	 * Extract the metadata of the file at $path in serialized form.
	 *
	 * @param File $image
	 * @param string $path
	 * @return string
	 */
	public function getMetadata( $image, $path ) {
		$pdfImage = $this->getPdfImage( $image, $path );

		return serialize( $pdfImage->retrieveMetaData() );
	}

	/**
	 * Classify a serialized metadata blob.
	 *
	 * @param File $image
	 * @param string $metadata
	 * @return bool|int One of self::METADATA_BAD (empty or an empty
	 *  serialized array), self::METADATA_COMPATIBLE (no 'mergedMetadata'
	 *  in the blob) or self::METADATA_GOOD
	 */
	public function isMetadataValid( $image, $metadata ) {
		if ( !$metadata || $metadata === serialize( [] ) ) {
			return self::METADATA_BAD;
		}

		if ( strpos( $metadata, 'mergedMetadata' ) === false ) {
			return self::METADATA_COMPATIBLE;
		}

		return self::METADATA_GOOD;
	}

	/**
	 * @param File $image
	 * @param bool|IContextSource $context Context to use (optional)
	 * @return bool|array False when there is no non-empty 'mergedMetadata'
	 *  array in the stored metadata
	 */
	public function formatMetadata( $image, $context = false ) {
		$meta = $image->getMetadata();

		if ( !$meta ) {
			return false;
		}
		AtEase::suppressWarnings();
		$meta = unserialize( $meta );
		AtEase::restoreWarnings();

		if ( !isset( $meta['mergedMetadata'] )
			|| !is_array( $meta['mergedMetadata'] )
			|| count( $meta['mergedMetadata'] ) < 1
		) {
			return false;
		}

		// Inherited from MediaHandler.
		return $this->formatMetadataHelper( $meta['mergedMetadata'], $context );
	}

	/** @inheritDoc */
	protected function formatTag( string $key, $vals, $context = false ) {
		switch ( $key ) {
			case 'pdf-Producer':
			case 'pdf-Version':
				return htmlspecialchars( $vals );
			case 'pdf-PageSize':
				// Write back by key rather than by reference, so that no
				// dangling reference to the last element is left behind.
				foreach ( $vals as $i => $val ) {
					$vals[$i] = htmlspecialchars( $val );
				}
				return $vals;
			case 'pdf-Encrypted':
				// @todo: The value isn't i18n-ised; should be done here.
				// For reference, if encrypted this field's value looks like:
				// "yes (print:yes copy:no change:no addNotes:no)"
				return htmlspecialchars( $vals );
			default:
				break;
		}
		// Use default formatting
		return false;
	}

	/**
	 * @param File $image
	 * @return bool|int False when no dimension info is available
	 */
	public function pageCount( File $image ) {
		$info = $this->getDimensionInfo( $image );

		return $info ? $info['pageCount'] : false;
	}

	/**
	 * @param File $image
	 * @param int $page
	 * @return array|bool False when the page is unknown
	 */
	public function getPageDimensions( File $image, $page ) {
		// MW starts pages at 1, as they are stored here
		$index = $page;

		$info = $this->getDimensionInfo( $image );
		if ( $info && isset( $info['dimensionsByPage'][$index] ) ) {
			return $info['dimensionsByPage'][$index];
		}

		return false;
	}

	/**
	 * Get the page count and per-page dimensions of a file, going through
	 * the main WAN object cache with a key derived from the file's SHA-1.
	 *
	 * @param File $file
	 * @return array|bool Array with 'pageCount' (int) and 'dimensionsByPage'
	 *  (keyed by page number, starting at 1), or false when the metadata is
	 *  unavailable or has no 'Pages' entry
	 */
	protected function getDimensionInfo( File $file ) {
		$cache = MediaWikiServices::getInstance()->getMainWANObjectCache();
		return $cache->getWithSetCallback(
			$cache->makeKey( 'file-pdf', 'dimensions', $file->getSha1() ),
			$cache::TTL_INDEFINITE,
			function () use ( $file ) {
				$data = $this->getMetaArray( $file );
				if ( !$data || !isset( $data['Pages'] ) ) {
					return false;
				}

				// lower peak RAM
				unset( $data['text'] );

				$dimsByPage = [];
				$count = intval( $data['Pages'] );
				for ( $i = 1; $i <= $count; $i++ ) {
					$dimsByPage[$i] = PdfImage::getPageSize( $data, $i );
				}

				return [ 'pageCount' => $count, 'dimensionsByPage' => $dimsByPage ];
			},
			[ 'pcTTL' => $cache::TTL_INDEFINITE ]
		);
	}

	/**
	 * Get the stored text of a page.
	 *
	 * @param File $image
	 * @param int $page Page number, starting at 1
	 * @return string|bool The entry of the metadata 'text' list for that
	 *  page, or false when there is none
	 */
	public function getPageText( File $image, $page ) {
		$data = $this->getMetaArray( $image );

		// The 'text' list is indexed from 0, pages are counted from 1.
		// isset() on the nested key also covers a missing 'text' entry.
		$index = $page - 1;
		if ( !$data || !isset( $data['text'][$index] ) ) {
			return false;
		}

		return $data['text'][$index];
	}

	/**
	 * Adds a warning about PDFs being potentially dangerous to the file
	 * page. Multiple messages with this base will be used.
	 * @param File $file
	 * @return array
	 */
	public function getWarningConfig( $file ) {
		return [
			'messages' => self::MESSAGES,
			'link' => '//www.mediawiki.org/wiki/Special:MyLanguage/Help:Security/PDF_files',
			'module' => 'pdfhandler.messages',
		];
	}

	/**
	 * Register a module with the warning messages in it.
	 * @param ResourceLoader &$resourceLoader
	 */
	public static function registerWarningModule( &$resourceLoader ) {
		$resourceLoader->register( 'pdfhandler.messages', [
			'messages' => array_values( self::MESSAGES ),
		] );
	}
}