<?php
// (c) Copyright by authors of the Tiki Wiki CMS Groupware Project
//
// All Rights Reserved. See copyright.txt for details and a complete list of authors.
// Licensed under the GNU LESSER GENERAL PUBLIC LICENSE. See license.txt for details.
// $Id$

use thiagoalessio\TesseractOCR\TesseractOCR;
use thiagoalessio\TesseractOCR\FriendlyErrors;
use Tiki\Lib\Alchemy;
use Symfony\Component\Filesystem\Filesystem;

/**
 *
 * A group of functions related to OCR processing, indexing & accounting
 *
 * Class ocrLib
 */
class ocrLib extends TikiLib
{

    /**
     * @var int|null the fileid of the file currently being OCR'd, null when no file is being worked on
     */
    private $ocrIngNow;
    /**
     * @var int fileid of the next file flagged to be processed by the OCR engine.
     */
    public $nextOCRFile;

    /** @var int An attempt to OCR the file has been made, but was not successful */
    public const OCR_STATUS_STALLED = 4;
    /** @var int The file has been placed in a queue to be OCR'd */
    public const OCR_STATUS_PENDING = 3;
    /** @var int The file is marked as currently being OCR'd */
    public const OCR_STATUS_PROCESSING = 2;
    /** @var int The file has been OCR'd and no further action is required */
    public const OCR_STATUS_FINISHED = 1;
    /** @var null This file will not be OCR'd */
    public const OCR_STATUS_SKIP = null;

    /** @var array The mime types natively supported by Tesseract */
    public const OCR_MIME_NATIVE = ['image/jpeg', 'image/png', 'image/bmp', 'image/tiff', 'image/x-portable-anymap'];

    /** @var array image types that can be handled with Tiki image handling */
    public const OCR_MIME_CONVERT = ['image/gif'];

    /** @var array Extra file types that can be processed when pdfimages is available */
    public const PDF_MIME = ['application/pdf'];

    /** @var array All file types that will be available for OCRing */
    public $ocrMime = [];

    /** @var string The minimum version requirement of Tesseract that needs to be installed on the OS */
    private const TESSERACT_BINARY_VERSION = '3.5.1';

    /**
     * Populates $ocrMime with every mime type that can currently be OCR'd on this system.
     *
     * Native Tesseract types are always included, GD convertible types are added when
     * imagepng() is available and PDF is added when the configured pdfimages binary runs.
     *
     * @return array|void An empty array when the OCR feature is disabled, nothing otherwise.
     */
    public function setMimeTypes()
    {
        global $prefs;
        if (empty($prefs['ocr_enable']) || $prefs['ocr_enable'] === 'n') {
            return [];
        }
        $this->ocrMime = self::OCR_MIME_NATIVE;
        if (is_callable('imagepng')) {
            $this->ocrMime = array_merge(self::OCR_MIME_CONVERT, $this->ocrMime);
        }
        // Probing without exec() or without a configured path can never succeed, so we skip it.
        if (is_callable('exec') && ! empty($prefs['ocr_pdfimages_path'])) {
            $output = [];
            $return = 1;
            // the path is escaped so that directories containing spaces still resolve
            exec(escapeshellarg($prefs['ocr_pdfimages_path']) . ' -v', $output, $return);
            if ($return === 0) {
                $this->ocrMime = array_merge(self::PDF_MIME, $this->ocrMime);
            }
        }
    }

    /**
     * Produces the absolute file path of any command. Unix and Windows safe.
     *
     * On unix "type -p" is tried first and "which" is used when "type -p" is not usable in the
     * current shell. On Windows "where" is used, as "type" prints file contents there.
     *
     * @param $executable string The file name you want to find the absolute path of
     *
     * @return string|null The absolute file path or null on no command found
     * @throws Exception If exec() is disabled or no suitable lookup command was found
     */

    public function whereIsExecutable(string $executable): ?string
    {
        if (! is_callable('exec')) {
            throw new Exception('exec() is not enabled. Could not execute command.');
        }
        $executable = escapeshellarg($executable);

        $lookups = DIRECTORY_SEPARATOR === '\\' ? ['where'] : ['type -p', 'which'];

        foreach ($lookups as $lookup) {
            // exec() appends to an existing array, so we start fresh for every attempt
            $output = [];
            $return = 1;
            exec($lookup . ' ' . $executable . ' 2>&1', $output, $return);

            if ($return === 0) {
                // if for some reason there was no output, return null
                return empty($output[0]) ? null : $output[0];
            }
            if ($return === 1) { // the lookup ran, but did not find the command on the system
                return null;
            }
            // any other exit status means the lookup command itself was unusable, so we try the next one
        }

        throw new Exception('There was no suitable system command found. Could not execute command');
    }

    /**
     * Checks if a file id can be processed or not.
     *
     * @throws Exception If the file set in $nextOCRFile does not exist in tiki_files
     */
    public function checkFileGalID()
    {
        if (! $this->table('tiki_files')->fetchBool(['fileId' => $this->nextOCRFile])) {
            throw new Exception('The File ID specified does not exist.');
        }
    }

    /**
     * Checks if all the dependencies for OCR have been satisfied.
     *
     * @throws Exception if one of the dependencies are not satisfied;
     */

    public function checkOCRDependencies()
    {
        global $prefs;

        if (($prefs['ocr_enable'] ?? 'n') !== 'y') {
            throw new Exception('Feature Disabled');
        }
        if (! class_exists(TesseractOCR::class)) {
            throw new Exception('Tesseract not installed in Packages.');
        }
        if (! $this->checkTesseractInstalled()) {
            throw new Exception('Tesseract binary not found.');
        }
        // a binary that is present but too old is reported separately from a missing binary
        if (! $this->checkTesseractVersion()) {
            throw new Exception(
                'Tesseract binary version is insufficient. Version '
                . self::TESSERACT_BINARY_VERSION . ' or newer is required.'
            );
        }
    }

    /**
     * Check if Tesseract binary is installed.
     *
     * @return bool false if Tesseract not installed or true otherwise
     */

    private function checkTesseractInstalled(): bool
    {
        if (! class_exists(TesseractOCR::class)) {
            return false;
        }

        try {
            FriendlyErrors::checkTesseractPresence($this->newTesseract()->command->executable);
        } catch (Exception $e) {
            return false;
        }
        return true;
    }

    /**
     * Gets the binary tesseract version.
     *
     * @return string version number upon success, or empty string otherwise.
     */
    public function getTesseractVersion(): string
    {
        if (! $this->checkTesseractInstalled()) {
            return '';
        }
        return $this->newTesseract()->command->getTesseractVersion();
    }

    /**
     * Checks if the binary tesseract version is sufficient.
     *
     * @return bool True if version is sufficient, false otherwise
     */
    public function checkTesseractVersion(): bool
    {
        return version_compare($this->getTesseractVersion(), self::TESSERACT_BINARY_VERSION, '>=');
    }


    /**
     * @return array language codes reported as available by the Tesseract binary, empty if not installed
     */

    public function getTesseractLangs(): array
    {
        if (! $this->checkTesseractInstalled()) {
            return [];
        }
        return $this->newTesseract()->command->getAvailableLanguages();
    }

    /**
     * Change processing flags back to pending.
     *
     * @return int Number of files changed from processing to pending.
     */

    public function releaseAllProcessing(): int
    {
        $changes = $this->table('tiki_files')->updateMultiple(
            ['ocr_state' => self::OCR_STATUS_PENDING],
            ['ocr_state' => self::OCR_STATUS_PROCESSING]
        );

        return (int) $changes->numrows;
    }

    /**
     * Change stalled flags back to pending.
     *
     * @return int Number of files changed from stalled to pending.
     */

    public function releaseAllStalled(): int
    {
        $changes = $this->table('tiki_files')->updateMultiple(
            ['ocr_state' => self::OCR_STATUS_PENDING],
            ['ocr_state' => self::OCR_STATUS_STALLED]
        );

        return (int) $changes->numrows;
    }

    /**
     * Set $nextOCRFile with the fileId of the next file scheduled to be processed by the OCR engine.
     */

    public function setNextOCRFile()
    {
        $db = $this->table('tiki_files');
        $conditions = ['ocr_state' => self::OCR_STATUS_PENDING];
        if ($this->nextOCRFile) { // we always take a greater file id to avoid infinite loops
            $conditions['fileId'] = $db->GreaterThan($this->nextOCRFile);
        }

        $this->nextOCRFile = $db->fetchOne('fileId', $conditions, ['fileId' => 'ASC']);
    }

    /**
     * Creates a new tesseract instance.
     *
     * @param null|string $fileName File path of file to OCR. Null if no file.
     *
     * @return TesseractOCR An instance with the configured Tesseract binary path applied.
     */

    private function newTesseract(?string $fileName = null): TesseractOCR
    {
        global $prefs;

        $tesseract = new TesseractOCR($fileName);
        if (! empty($prefs['ocr_tesseract_path'])) {
            $tesseract->executable($prefs['ocr_tesseract_path']);
        }
        return $tesseract;
    }

    /**
     * Decodes a JSON encoded list of languages as stored in an ocr_lang column.
     *
     * @param mixed $json The raw column value, which may be empty or missing.
     *
     * @return array A plain list of languages, empty when nothing usable was stored.
     */

    private function decodeLanguages($json): array
    {
        if (! is_string($json) || $json === '') {
            return [];
        }
        $decoded = json_decode($json, true);

        // re-indexing guarantees a list that can safely be unpacked as arguments
        return is_array($decoded) ? array_values($decoded) : [];
    }

    /**
     * Finds the languages that a file will/has been processed with.
     *
     * File level languages are used when the ocr_file_level preference is enabled, then the
     * languages of the file's gallery, then the languages of gallery 1 and finally 'osd'.
     *
     * @param null|int $fileId null defaults to the current file being worked on, otherwise it uses the passed fileid.
     *
     * @return array List of file specific languages
     */

    public function listFileLanguages(?int $fileId = null): array
    {
        global $prefs;
        if (! $fileId) {
            $fileId = $this->ocrIngNow;
        }
        $files = $this->table('tiki_files');
        $langs = [];
        // first set file level languages if they exist
        if (($prefs['ocr_file_level'] ?? 'n') === 'y') {
            $langs = $this->decodeLanguages($files->fetchOne('ocr_lang', ['fileId' => $fileId]));
        }
        // if no file level languages we look for gallery level language preferences
        if (empty($langs)) {
            $galId = $files->fetchOne('galleryId', ['fileId' => $fileId]);
            $galleries = $this->table('tiki_file_galleries');
            $langs = $this->decodeLanguages($galleries->fetchOne('ocr_lang', ['galleryId' => $galId]));
            // if gallery does not have preferences, we take a look at the master gallery for direction.
            // The id is cast as the database may hand it back as a string.
            if (empty($langs) && (int) $galId !== 1) {
                $langs = $this->decodeLanguages($galleries->fetchOne('ocr_lang', ['galleryId' => 1]));
            }
        }
        // we fall back on Auto Detect if there are no preferences set
        if (empty($langs)) {
            $langs = ['osd'];
        }
        return $langs;
    }

    /**
     * Removes a temporary file or directory without ever interrupting the caller.
     *
     * @param Filesystem $filesystem The filesystem helper used to remove the path.
     * @param mixed      $path       The path to remove. Anything that is not a non-empty string is ignored.
     */

    private function removeTempPath(Filesystem $filesystem, $path): void
    {
        if (! is_string($path) || $path === '') {
            return;
        }
        try {
            $filesystem->remove($path);
        } catch (Exception $e) {
            // Deliberately best effort: a left over temp file must not change the OCR outcome.
        }
    }

    /**
     *
     * OCR's the file set by $nextOCRFile and advances $nextOCRFile to the following pending file.
     * Intended to be used by a CLI command, as OCRing a large file may cause timeouts.
     *
     * No value is returned. On success the file is flagged as finished, on failure it is
     * flagged as stalled before the exception is thrown.
     *
     * @throws Exception If there is no file to process or a problem occurs while processing a file
     */

    public function OCRfile()
    {
        global $prefs;

        if (! $this->nextOCRFile) {
            throw new Exception('No files to OCR');
        }

        $files = $this->table('tiki_files');

        // We remember the file we claim here, as other rows may also be flagged as processing.
        $this->ocrIngNow = $this->nextOCRFile;
        // Set the database state to reflect that the next file in the queue has begun
        $files->update(
            ['ocr_state' => self::OCR_STATUS_PROCESSING],
            ['fileId' => $this->ocrIngNow]
        );
        $this->setNextOCRFile();

        $file = TikiLib::lib('filegal')->get_file($this->ocrIngNow);

        // These are defined before the try block so the catch block can always clean up.
        $filesystem = new Filesystem();
        /** @var string|null $tempFile The file path of a temp file for processing */
        $tempFile = null;
        /** @var string|null $fileName The path that the file can be read on the server in a format readable to Tesseract. */
        $fileName = null;

        try {
            if ($file['data']) {
                $tempFile = writeTempFile($file['data']);
            } else {
                $directory = $prefs['fgal_use_dir']; // lets make sure there is a slash following the directory name
                if (substr($directory, -1) !== '/') {
                    $directory .= '/';
                }
                $fileContent = @file_get_contents($directory . $file['path']);
                if ($fileContent === false) {
                    throw new Exception('Reading ' . $file['path'] . ' failed');
                }
                $tempFile = writeTempFile($fileContent);
                unset($fileContent);
            }

            // now that we have a temp file written to file, lets start processing it

            if (in_array($file['filetype'], self::OCR_MIME_CONVERT, true)) {
                if (! is_callable('imagepng')) {
                    throw new Exception('Install GD to convert.');
                }
                $image = imagecreatefromstring((string) file_get_contents($tempFile));
                if ($image === false) {
                    throw new Exception('GD was unable to read the image.');
                }
                $fileName = writeTempFile('');
                unlink($fileName);
                if (! imagepng($image, $fileName)) {
                    throw new Exception('GD was unable to convert the image.');
                }
                unset($image);
            } elseif (in_array($file['filetype'], self::OCR_MIME_NATIVE, true)) {
                $fileName = $tempFile;
                $tempFile = null; // we zero this out so the file is not deleted later.
            } elseif (in_array($file['filetype'], self::PDF_MIME, true)) {
                TikiLib::lib('pdfimages');
                $image = new PdfImagesLib();
                $image->setBinaryPath();
                $image->setArgument('tiff');
                $fileName = writeTempFile(null, 'random'); // in this case we create a directory for writing files to.
                $image->setFilePaths($tempFile, $fileName);
                $image->run();
                unset($image);
            } else { // fall back onto media alchemist if the file type is not otherwise convertible.
                if (! class_exists('MediaAlchemyst\Alchemyst')) {
                    throw new Exception('Install Media Alchemist to convert.');
                }
                $alchemy = new Alchemy\AlchemyLib();
                // We create a empty temp file and then delete it, so we know its writable before passing to alchemy
                $fileName = writeTempFile('');
                unlink($fileName);
                if ($alchemy->convertToImage($tempFile, $fileName) === null) {
                    throw new Exception('Media Alchemist unable to convert file');
                }
            }
            // now that we are done with the temp file, lets delete it.
            $this->removeTempPath($filesystem, $tempFile);
            $tempFile = null;

            $langs = $this->listFileLanguages();

            if (is_dir($fileName)) {
                $OCRText = '';
                // glob() may return false on error, which we treat the same as no images found
                foreach (glob($fileName . '*.tif') ?: [] as $tiffFile) {
                    $OCRText .= $this->newTesseract($tiffFile)->lang(...$langs)->run();
                }
            } else {
                $OCRText = $this->newTesseract($fileName)->lang(...$langs)->run();
            }
            $OCRText = TikiFilter::get('striptags')->filter($OCRText);
            $files->update(
                ['ocr_data' => $OCRText],
                ['fileId' => $this->ocrIngNow]
            );
            $unifiedsearchlib = TikiLib::lib('unifiedsearch');
            $unifiedsearchlib->invalidateObject('file', $this->ocrIngNow);
            $unifiedsearchlib->processUpdateQueue();
            // change the ocr state from processing to finished OCR'ing
            $files->update(
                ['ocr_state' => self::OCR_STATUS_FINISHED],
                ['fileId' => $this->ocrIngNow]
            );
        } catch (Exception $e) {
            // Set the database flag to reflect that it is no longer processing but, still needs to be OCR'd
            $files->update(
                ['ocr_state' => self::OCR_STATUS_STALLED],
                ['fileId' => $this->ocrIngNow]
            );
            $this->removeTempPath($filesystem, $fileName);
            $this->removeTempPath($filesystem, $tempFile);
            // the original exception is chained so its trace is not lost
            throw new Exception($e->getMessage(), (int) $e->getCode(), $e);
        }
        // if we had to create temp files, lets remove them.
        $this->removeTempPath($filesystem, $fileName);
        // nothing is being OCR'd any more
        $this->ocrIngNow = null;
    }
}