<?php
/**
 * Extraction of metadata from different bitmap image types.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 * @ingroup Media
 */

use MediaWiki\Logger\LoggerFactory;
use Wikimedia\XMPReader\Reader as XMPReader;

/**
 * Class to deal with reconciling and extracting metadata from bitmap images.
 * This is meant to comply with http://www.metadataworkinggroup.org/pdf/mwg_guidance.pdf
 *
 * This sort of acts as an intermediary between MediaHandler::getMetadata
 * and the various metadata extractors.
 *
 * @todo Other image formats.
 * @newable
 * @note marked as newable in 1.35 for lack of a better alternative,
 *       but should become a stateless service, or a handler managed
 *       registry for metadata handlers for different file types.
 * @ingroup Media
 */
class BitmapMetadataHandler {
	/** @var array Collected metadata arrays, keyed by metadata type (e.g. 'exif', 'native') */
	private $metadata = [];

	/**
	 * @var array Metadata priority. Maps a numeric priority to the list of
	 *  metadata types that share it. getMetadataArray() merges the highest
	 *  number first, so on a key conflict the higher number wins.
	 */
	private $metaPriority = [
		20 => [ 'other' ],
		40 => [ 'native' ],
		60 => [ 'iptc-good-hash', 'iptc-no-hash' ],
		70 => [ 'xmp-deprecated' ],
		80 => [ 'xmp-general' ],
		90 => [ 'xmp-exif' ],
		100 => [ 'iptc-bad-hash' ],
		120 => [ 'exif' ],
	];

	/** @var string Metadata type under which parsed IPTC data is stored */
	private $iptcType = 'iptc-no-hash';

	/**
	 * This does the photoshop image resource app13 block
	 * of interest, IPTC-IIM metadata is stored here.
	 *
	 * Mostly just calls doPSIR and doIPTC
	 *
	 * @param string $app13 String containing app13 block from jpeg file
	 */
	private function doApp13( $app13 ) {
		try {
			// The value returned by doPSIR is used as the metadata type
			// the IPTC data is filed under.
			$this->iptcType = JpegMetadataExtractor::doPSIR( $app13 );
		} catch ( Exception $e ) {
			// Error reading the iptc hash information.
			// This probably means the App13 segment is something other than what we expect.
			// However, still try to read it, and treat it as if the hash didn't exist.
			wfDebug( "Error parsing iptc data of file: " . $e->getMessage() );
			$this->iptcType = 'iptc-no-hash';
		}

		$iptc = IPTC::parse( $app13 );
		$this->addMetadata( $iptc, $this->iptcType );
	}

	/**
	 * Get exif info using exif class.
	 * Basically what used to be in BitmapHandler::getMetadata().
	 * Just calls stuff in the Exif class.
	 *
	 * Nothing is added when the file does not exist, when $wgShowEXIF is
	 * falsy, or when the Exif class yields no filtered data.
	 *
	 * Parameters are passed to the Exif class.
	 *
	 * @param string $filename
	 * @param string $byteOrder
	 */
	public function getExif( $filename, $byteOrder ) {
		global $wgShowEXIF;
		if ( file_exists( $filename ) && $wgShowEXIF ) {
			$exif = new Exif( $filename, $byteOrder );
			$data = $exif->getFilteredData();
			if ( $data ) {
				$this->addMetadata( $data, 'exif' );
			}
		}
	}

	/**
	 * Add misc metadata. Warning: atm if the metadata category
	 * doesn't have a priority, it will be silently discarded.
	 *
	 * @param array $metaArray Array of metadata values
	 * @param string $type Type. defaults to other. if two things have the same type they're merged
	 */
	public function addMetadata( $metaArray, $type = 'other' ) {
		if ( isset( $this->metadata[$type] ) ) {
			// Merge with old data. Array union keeps the left operand's
			// value on a key conflict, so the newly added values win.
			$metaArray += $this->metadata[$type];
		}

		$this->metadata[$type] = $metaArray;
	}

	/**
	 * Feed every result section of an XMP reader into addMetadata(),
	 * using the section's key as the metadata type.
	 *
	 * @param XMPReader $xmp Reader that has already parsed its input
	 */
	private function addXMPResults( XMPReader $xmp ) {
		foreach ( $xmp->getResults() as $type => $section ) {
			$this->addMetadata( $section, $type );
		}
	}

	/**
	 * Merge together the various types of metadata
	 * the different types have different priorities,
	 * and are merged in order.
	 *
	 * This function is generally called by the media handlers' getMetadata()
	 *
	 * This method does not modify the collected metadata, so calling it
	 * repeatedly on the same instance yields the same result.
	 *
	 * @return array
	 */
	public function getMetadataArray() {
		// Merge in the right order based on the MWG recommendation:
		// highest priority first, because array union (+) never replaces
		// a key that is already present.
		$priorities = $this->metaPriority;
		krsort( $priorities );

		$merged = [];
		foreach ( $priorities as $types ) {
			foreach ( $types as $type ) {
				if ( !isset( $this->metadata[$type] ) ) {
					continue;
				}

				// Do some special casing for multilingual values.
				// Don't discard translations if also as a simple value:
				// the higher-priority simple value becomes the 'x-default'
				// entry of the lower-priority language array.
				foreach ( $this->metadata[$type] as $itemName => $item ) {
					if ( is_array( $item ) && isset( $item['_type'] ) && $item['_type'] === 'lang'
						&& isset( $merged[$itemName] ) && !is_array( $merged[$itemName] )
					) {
						$default = $merged[$itemName];
						$merged[$itemName] = $item;
						$merged[$itemName]['x-default'] = $default;
					}
				}

				// Keys handled above already exist in $merged, so the union
				// leaves them alone; there is no need to remove them from
				// the stored metadata first.
				$merged += $this->metadata[$type];
			}
		}

		return $merged;
	}

	/**
	 * Main entry point for jpegs.
	 *
	 * @param string $filename Filename (with full path)
	 * @return array Metadata result array.
	 * @throws MWException On invalid file.
	 */
	public static function Jpeg( $filename ) {
		$showXMP = XMPReader::isSupported();
		$meta = new self();

		$seg = JpegMetadataExtractor::segmentSplitter( $filename );

		if ( isset( $seg['SOF'] ) ) {
			$meta->addMetadata( [ 'SOF' => $seg['SOF'] ] );
		}
		if ( isset( $seg['COM'][0] ) ) {
			$meta->addMetadata( [ 'JPEGFileComment' => $seg['COM'] ], 'native' );
		}
		foreach ( $seg['PSIR'] ?? [] as $curPSIRValue ) {
			$meta->doApp13( $curPSIRValue );
		}
		if ( isset( $seg['XMP'] ) && $showXMP ) {
			$xmp = new XMPReader( LoggerFactory::getInstance( 'XMP' ), $filename );
			$xmp->parse( $seg['XMP'] );
			// Guarded like the other segment keys, so a missing 'XMP_ext'
			// entry does not raise a warning.
			foreach ( $seg['XMP_ext'] ?? [] as $xmpExt ) {
				/* Support for extended xmp in jpeg files
				 * is not well tested and a bit fragile.
				 */
				$xmp->parseExtended( $xmpExt );
			}
			$meta->addXMPResults( $xmp );
		}

		$meta->getExif( $filename, $seg['byteOrder'] ?? 'BE' );

		return $meta->getMetadataArray();
	}

	/**
	 * Entry point for png
	 * At some point in the future this might
	 * merge the png various tEXt chunks to that
	 * are interesting, but for now it only does XMP
	 *
	 * @param string $filename Full path to file
	 * @return array Array for storage in img_metadata.
	 */
	public static function PNG( $filename ) {
		$showXMP = XMPReader::isSupported();

		$meta = new self();
		$array = PNGMetadataExtractor::getMetadata( $filename );
		if ( isset( $array['text']['xmp']['x-default'] )
			&& $array['text']['xmp']['x-default'] !== '' && $showXMP
		) {
			$xmp = new XMPReader( LoggerFactory::getInstance( 'XMP' ), $filename );
			$xmp->parse( $array['text']['xmp']['x-default'] );
			$meta->addXMPResults( $xmp );
		}
		// The raw XMP packet is not kept among the native text values.
		unset( $array['text']['xmp'] );
		$meta->addMetadata( $array['text'], 'native' );
		unset( $array['text'] );
		$array['metadata'] = $meta->getMetadataArray();
		$array['metadata']['_MW_PNG_VERSION'] = PNGMetadataExtractor::VERSION;

		return $array;
	}

	/**
	 * Function for gif images.
	 *
	 * They don't really have native metadata, so just merges together
	 * XMP and image comment.
	 *
	 * @param string $filename Full path to file
	 * @return array Metadata array
	 */
	public static function GIF( $filename ) {
		$meta = new self();
		$baseArray = GIFMetadataExtractor::getMetadata( $filename );

		if ( count( $baseArray['comment'] ) > 0 ) {
			$meta->addMetadata( [ 'GIFFileComment' => $baseArray['comment'] ], 'native' );
		}

		if ( $baseArray['xmp'] !== '' && XMPReader::isSupported() ) {
			$xmp = new XMPReader( LoggerFactory::getInstance( 'XMP' ), $filename );
			$xmp->parse( $baseArray['xmp'] );
			$meta->addXMPResults( $xmp );
		}

		// Both values now live (in processed form) under 'metadata'.
		unset( $baseArray['comment'] );
		unset( $baseArray['xmp'] );

		$baseArray['metadata'] = $meta->getMetadataArray();
		$baseArray['metadata']['_MW_GIF_VERSION'] = GIFMetadataExtractor::VERSION;

		return $baseArray;
	}

	/**
	 * This doesn't do much yet, but eventually I plan to add
	 * XMP support for Tiff. (PHP's exif support already extracts
	 * but needs some further processing because PHP's exif support
	 * is stupid...)
	 *
	 * @todo Add XMP support, so this function actually makes sense to put here.
	 *
	 * The various exceptions this throws are caught later.
	 * @param string $filename
	 * @throws MWException If the file does not exist, its byte order cannot
	 *  be determined, or no data could be extracted from it.
	 * @return array The metadata.
	 */
	public static function Tiff( $filename ) {
		if ( !file_exists( $filename ) ) {
			throw new MWException( "File doesn't exist - $filename" );
		}

		$byteOrder = self::getTiffByteOrder( $filename );
		if ( !$byteOrder ) {
			throw new MWException( "Error determining byte order of $filename" );
		}

		$exif = new Exif( $filename, $byteOrder );
		$data = $exif->getFilteredData();
		if ( !$data ) {
			throw new MWException( "Could not extract data from tiff file $filename" );
		}

		$data['MEDIAWIKI_EXIF_VERSION'] = Exif::version();

		return $data;
	}

	/**
	 * Read the first 2 bytes of a tiff file to figure out
	 * Little Endian or Big Endian. Needed for exif stuff.
	 *
	 * @param string $filename
	 * @return string|false 'BE' or 'LE', or false if the file could not be
	 *  opened or does not start with a known byte order mark
	 */
	public static function getTiffByteOrder( $filename ) {
		$fh = fopen( $filename, 'rb' );
		if ( !$fh ) {
			return false;
		}
		$head = fread( $fh, 2 );
		fclose( $fh );

		switch ( $head ) {
			case 'II':
				return 'LE'; // II for Intel.
			case 'MM':
				return 'BE'; // MM for Motorola.
			default:
				return false; // Something went wrong.
		}
	}
}