1<?php
2// (c) Copyright by authors of the Tiki Wiki CMS Groupware Project
3//
4// All Rights Reserved. See copyright.txt for details and a complete list of authors.
5// Licensed under the GNU LESSER GENERAL PUBLIC LICENSE. See license.txt for details.
6// $Id$
7
8use thiagoalessio\TesseractOCR\TesseractOCR;
9use thiagoalessio\TesseractOCR\FriendlyErrors;
10use Tiki\Lib\Alchemy;
11use Symfony\Component\Filesystem\Filesystem;
12
/**
 * A group of functions related to OCR processing, indexing and accounting.
 *
 * Class ocrLib
 */
class ocrLib extends TikiLib
{

	/**
	 * @var int|null the fileid of the file currently being OCR'd, null when no file is being processed
	 */
	private $ocrIngNow;
	/**
	 * @var int fileid of the next file flagged to be processed by the OCR engine.
	 */
	public $nextOCRFile;

	/** @var int An attempt to OCR the file has been made, but was not successful */
	public const OCR_STATUS_STALLED = 4;
	/** @var int The file has been placed in a queue to be OCR'd */
	public const OCR_STATUS_PENDING = 3;
	/** @var int The file is marked as currently being OCR'd */
	public const OCR_STATUS_PROCESSING = 2;
	/** @var int The file has been OCR'd and no further action is required */
	public const OCR_STATUS_FINISHED = 1;
	/** @var null This file will not be OCR'd */
	public const OCR_STATUS_SKIP = null;

	/** @var array The mime types natively supported by Tesseract */
	public const OCR_MIME_NATIVE = ['image/jpeg', 'image/png', 'image/bmp', 'image/tiff', 'image/x-portable-anymap'];

	/** @var array image types that can be handled with Tiki image handling */
	public const OCR_MIME_CONVERT = ['image/gif'];

	/** @var array Extra file types that can be processed when the pdfimages binary is available */
	public const PDF_MIME = ['application/pdf'];

	/** @var array All file types that will be available for OCRing */
	public $ocrMime = [];

	/** @var string The minimum version requirement of Tesseract that needs to be installed on the OS */
	private const TESSERACT_BINARY_VERSION = '3.5.1';

	/**
	 * Populates $ocrMime with every mime type that can currently be OCR'd.
	 *
	 * Native Tesseract types are always included when OCR is enabled, GIF is added when GD's
	 * imagepng() is callable, and PDF is added when the configured pdfimages binary runs successfully.
	 *
	 * @return array The resulting list of mime types (empty when the OCR feature is disabled)
	 */
	public function setMimeTypes()
	{
		global $prefs;

		if (empty($prefs['ocr_enable']) || $prefs['ocr_enable'] === 'n') {
			$this->ocrMime = [];
			return [];
		}

		$mimeTypes = self::OCR_MIME_NATIVE;
		if (is_callable('imagepng')) {
			$mimeTypes = array_merge(self::OCR_MIME_CONVERT, $mimeTypes);
		}

		// Only probe for pdfimages when a path is configured; an empty path would run ' -v' as a command.
		if (! empty($prefs['ocr_pdfimages_path']) && is_callable('exec')) {
			$output = [];
			$return = 1;
			exec(escapeshellarg($prefs['ocr_pdfimages_path']) . ' -v 2>&1', $output, $return);
			if ($return === 0) {
				$mimeTypes = array_merge(self::PDF_MIME, $mimeTypes);
			}
		}

		$this->ocrMime = $mimeTypes;
		return $this->ocrMime;
	}

	/**
	 * Produces the absolute file path of any command. Unix and Windows safe.
	 *
	 * On Windows the "where" command is used. Elsewhere "type -p" is tried first and, when the
	 * shell does not support it (any exit code other than 0 or 1), "which" is used as a fallback.
	 *
	 * @param $executable string	The file name you want to find the absolute path of
	 *
	 * @return string|null		The absolute file path or null on no command found
	 * @throws Exception		If exec() is unavailable or no suitable lookup command was found
	 */
	public function whereIsExecutable(string $executable) : ?string
	{
		if (! is_callable('exec')) {
			throw new Exception('exec() is not enabled. Could not execute command.');
		}
		$executable = escapeshellarg($executable);
		$output = [];
		$return = 1;

		if (DIRECTORY_SEPARATOR === '\\') {
			// "where" exits with 0 when the command was found and non-zero otherwise.
			exec('where ' . $executable . ' 2>&1', $output, $return);
			if ($return !== 0 || empty($output[0])) {
				return null;
			}
			return $output[0];
		}

		exec('type -p ' . $executable . ' 2>&1', $output, $return);
		if ($return === 1) {				// "type" ran but did not find the command on the system
			return null;
		}
		if ($return !== 0) {
			// "type -p" is not supported by this shell; discard its error output and try "which" (relies on $PATH)
			$output = [];
			exec('which ' . $executable . ' 2>&1', $output, $return);
			if ($return === 1) {			// "which" ran but did not find the command on the system
				return null;
			}
		}
		if ($return !== 0) {
			throw new Exception('There was no suitable system command found. Could not execute command');
		}
		if (empty($output[0])) {			// if for some reason there was no output, return null
			return null;
		}
		return $output[0];
	}

	/**
	 * Checks that the file id stored in $nextOCRFile exists in tiki_files.
	 *
	 * @throws Exception If the file does not exist
	 */
	public function checkFileGalID()
	{
		if (! $this->table('tiki_files')->fetchBool(['fileId' => $this->nextOCRFile])) {
			throw new Exception('The File ID specified does not exist.');
		}
	}

	/**
	 * Checks if all the dependencies for OCR have been satisfied.
	 *
	 * @throws Exception if the feature is disabled, the PHP package is missing, the binary is
	 *                   missing, or the binary is older than the minimum supported version
	 */
	public function checkOCRDependencies()
	{
		global $prefs;

		if (($prefs['ocr_enable'] ?? 'n') !== 'y') {
			throw new Exception('Feature Disabled');
		}
		if (! class_exists(TesseractOCR::class)) {
			throw new Exception('Tesseract not installed in Packages.');
		}

		$version = $this->getTesseractVersion();
		if ($version === '') {
			throw new Exception('Tesseract binary not found.');
		}
		// Report an outdated binary separately so it is not mistaken for a missing one.
		if (! version_compare($version, self::TESSERACT_BINARY_VERSION, '>=')) {
			throw new Exception(
				'Tesseract binary version ' . $version . ' is older than the required '
				. self::TESSERACT_BINARY_VERSION . '.'
			);
		}
	}

	/**
	 * Check if Tesseract binary is installed.
	 *
	 * @return bool false if Tesseract not installed or true otherwise
	 */
	private function checkTesseractInstalled(): bool
	{
		if (! class_exists(TesseractOCR::class)) {
			return false;
		}

		$tesseract = $this->newTesseract();

		try {
			FriendlyErrors::checkTesseractPresence($tesseract->command->executable);
		} catch (Exception $e) {
			return false;
		}
		return true;
	}

	/**
	 * Gets the binary tesseract version.
	 *
	 * @return string version number upon success, or empty string otherwise.
	 */
	public function getTesseractVersion(): string
	{
		// checkTesseractInstalled() also verifies that the PHP wrapper class is available
		if (! $this->checkTesseractInstalled()) {
			return '';
		}
		return $this->newTesseract()->command->getTesseractVersion();
	}

	/**
	 * Checks if the binary tesseract version is sufficient.
	 *
	 * @return bool True if version is sufficient, false otherwise
	 */
	public function checkTesseractVersion(): bool
	{
		return version_compare($this->getTesseractVersion(), self::TESSERACT_BINARY_VERSION, '>=');
	}

	/**
	 * @return array Language codes reported as available by the Tesseract binary, empty if not installed
	 */
	public function getTesseractLangs(): array
	{
		// checkTesseractInstalled() also verifies that the PHP wrapper class is available
		if (! $this->checkTesseractInstalled()) {
			return [];
		}
		return $this->newTesseract()->command->getAvailableLanguages();
	}

	/**
	 * Change processing flags back to pending.
	 *
	 * @return int Number of files changed from processing to pending.
	 */
	public function releaseAllProcessing(): int
	{
		$changes = $this->table('tiki_files')->updateMultiple(
			['ocr_state' => self::OCR_STATUS_PENDING],
			['ocr_state' => self::OCR_STATUS_PROCESSING]
		);

		return (int) $changes->numrows;
	}

	/**
	 * Change stalled flags back to pending.
	 *
	 * @return int Number of files changed from stalled to pending.
	 */
	public function releaseAllStalled(): int
	{
		$changes = $this->table('tiki_files')->updateMultiple(
			['ocr_state' => self::OCR_STATUS_PENDING],
			['ocr_state' => self::OCR_STATUS_STALLED]
		);

		return (int) $changes->numrows;
	}

	/**
	 * Set $nextOCRFile with the fileId of the next file scheduled to be processed by the OCR engine.
	 */
	public function setNextOCRFile()
	{
		$db = $this->table('tiki_files');
		$conditions = ['ocr_state' => self::OCR_STATUS_PENDING];
		if ($this->nextOCRFile) {
			// we always take a greater file id to avoid infinite loops
			$conditions['fileId'] = $db->greaterThan($this->nextOCRFile);
		}

		$this->nextOCRFile = $db->fetchOne('fileId', $conditions, ['fileId' => 'ASC']);
	}

	/**
	 * Creates a new tesseract instance.
	 *
	 * @param null|string $fileName File path of file to OCR. Null if no file.
	 *
	 * @return TesseractOCR		An instance using the executable from the ocr_tesseract_path preference, if set.
	 */
	private function newTesseract(?string $fileName = null)
	{
		global $prefs;

		$tesseract = new TesseractOCR($fileName);
		if (! empty($prefs['ocr_tesseract_path'])) {
			$tesseract->executable($prefs['ocr_tesseract_path']);
		}
		return $tesseract;
	}

	/**
	 * Decodes a JSON encoded language list as stored in the ocr_lang columns.
	 *
	 * @param mixed $json Raw column value; anything that is not a non-empty JSON string yields an empty list
	 *
	 * @return array List of non-empty language code strings
	 */
	private function decodeLanguages($json): array
	{
		if (! is_string($json) || $json === '') {
			return [];
		}
		$decoded = json_decode($json, true);
		if (! is_array($decoded)) {
			return [];
		}
		// Re-index so the result can safely be spread into a variadic call.
		return array_values(array_filter($decoded, static function ($lang) {
			return is_string($lang) && $lang !== '';
		}));
	}

	/**
	 * Removes a temporary file or directory, ignoring paths that were never set.
	 *
	 * @param Filesystem  $filesystem Filesystem helper used for the removal
	 * @param null|string $path       Path to remove, or null/empty when there is nothing to clean up
	 */
	private function removeTempPath(Filesystem $filesystem, ?string $path): void
	{
		if ($path !== null && $path !== '') {
			$filesystem->remove($path);
		}
	}

	/**
	 * Finds the languages that a file will/has been processed with.
	 *
	 * Lookup order: file level languages (only when the ocr_file_level preference is 'y'), the
	 * languages of the file's gallery, the languages of gallery 1, and finally 'osd'.
	 *
	 * @param null|int $fileId null defaults to the current file being worked on, otherwise it uses the passed fileid.
	 *
	 * @return array List of file specific languages
	 */
	public function listFileLanguages(?int $fileId = null): array
	{
		global $prefs;

		if (! $fileId) {
			$fileId = $this->ocrIngNow;
		}
		$files = $this->table('tiki_files');
		$langs = [];

		// first set file level languages if they exist
		if (! empty($prefs['ocr_file_level']) && $prefs['ocr_file_level'] === 'y') {
			$langs = $this->decodeLanguages($files->fetchOne('ocr_lang', ['fileId' => $fileId]));
		}
		// if no file level languages we look for gallery level language preferences
		if (empty($langs)) {
			// cast so the comparison against gallery 1 below works with string values from the database
			$galId = (int) $files->fetchOne('galleryId', ['fileId' => $fileId]);
			$galleries = $this->table('tiki_file_galleries');
			$langs = $this->decodeLanguages($galleries->fetchOne('ocr_lang', ['galleryId' => $galId]));
			// if gallery does not have preferences, we take a look at the master gallery for direction.
			if (empty($langs) && $galId !== 1) {
				$langs = $this->decodeLanguages($galleries->fetchOne('ocr_lang', ['galleryId' => 1]));
			}
		}
		// we fall back on Auto Detect if there are no preferences set
		if (empty($langs)) {
			$langs = ['osd'];
		}
		return $langs;
	}

	/**
	 * OCR's the file set by $nextOCRFile. Intended to be used by a CLI command, as OCRing a large file may cause timeouts.
	 *
	 * The file is flagged as processing, $nextOCRFile is advanced to the following pending file, and
	 * the extracted text is stored in ocr_data. On success the file is flagged as finished; on failure
	 * it is flagged as stalled. Temporary files are removed in both cases.
	 *
	 * @return void
	 * @throws Exception If there is no file to process or a problem occurs while processing a file
	 */
	public function OCRfile()
	{
		global $prefs;

		if (! $this->nextOCRFile) {
			throw new Exception('No files to OCR');
		}

		$files = $this->table('tiki_files');

		// Remember which file we are working on before the queue pointer is advanced, rather than
		// asking the database for "any file in processing state", which may be a different file.
		$this->ocrIngNow = $this->nextOCRFile;
		$files->update(
			['ocr_state' => self::OCR_STATUS_PROCESSING],
			['fileId' => $this->ocrIngNow]
		);
		$this->setNextOCRFile();

		// Initialised before the try block so that error handling and cleanup never touch undefined variables.
		$filesystem = new Filesystem();
		/** @var null|string $tempFile The file path of a temp copy of the original file */
		$tempFile = null;
		/** @var null|string $fileName Path of the file (or directory of files) in a format readable by Tesseract */
		$fileName = null;

		try {
			$file = TikiLib::lib('filegal')->get_file($this->ocrIngNow);
			if (empty($file)) {
				throw new Exception('The File ID specified does not exist.');
			}

			if (! empty($file['data'])) {
				$tempFile = writeTempFile($file['data']);
			} else {
				$directory = $prefs['fgal_use_dir'];
				// lets make sure there is a slash following the directory name
				if (substr($directory, -1) !== '/') {
					$directory .= '/';
				}
				$fileContent = @file_get_contents($directory . $file['path']);
				if ($fileContent === false) {
					throw new Exception('Reading ' . $file['path'] . ' failed');
				}
				$tempFile = writeTempFile($fileContent);
				unset($fileContent);
			}

			// now that we have a temp file written to disk, lets start processing it

			if (in_array($file['filetype'], self::OCR_MIME_CONVERT, true)) {
				if (! is_callable('imagepng')) {
					throw new Exception('Install GD to convert.');
				}
				$imageData = file_get_contents($tempFile);
				$image = $imageData === false ? false : imagecreatefromstring($imageData);
				unset($imageData);
				if ($image === false) {
					throw new Exception('Unable to read the image for conversion.');
				}
				// We create an empty temp file and then delete it, to reserve a writable path for the converted image
				$fileName = writeTempFile('');
				unlink($fileName);
				$converted = imagepng($image, $fileName);
				unset($image);
				if (! $converted) {
					throw new Exception('Unable to convert the image.');
				}
			} elseif (in_array($file['filetype'], self::OCR_MIME_NATIVE, true)) {
				$fileName = $tempFile;
				$tempFile = null;				// the temp file is the OCR input, so it must not be deleted yet
			} elseif (in_array($file['filetype'], self::PDF_MIME, true)) {
				TikiLib::lib('pdfimages');
				$image = new PdfImagesLib();
				$image->setBinaryPath();
				$image->setArgument('tiff');
				$fileName = writeTempFile(null, 'random'); // in this case we create a directory for writing files to.
				$image->setFilePaths($tempFile, $fileName);
				$image->run();
				unset($image);
			} else {
				// fall back onto media alchemist if the file type is not otherwise convertible.
				if (! class_exists('MediaAlchemyst\Alchemyst')) {
					throw new Exception('Install Media Alchemist to convert.');
				}
				$alchemy = new Alchemy\AlchemyLib();
				// We create an empty temp file and then delete it, so we know it is writable before passing to alchemy
				$fileName = writeTempFile('');
				unlink($fileName);
				if ($alchemy->convertToImage($tempFile, $fileName) === null) {
					throw new Exception('Media Alchemist unable to convert file');
				}
			}

			// now that we are done with the temp file, lets delete it.
			$this->removeTempPath($filesystem, $tempFile);
			$tempFile = null;

			$langs = $this->listFileLanguages();

			if (is_dir($fileName)) {
				$OCRText = '';
				// glob() returns false on error, which must not be iterated
				foreach ((glob($fileName . '*.tif') ?: []) as $tiffFile) {
					$OCRText .= ($this->newTesseract($tiffFile))->lang(...$langs)->run();
				}
			} else {
				$OCRText = ($this->newTesseract($fileName))->lang(...$langs)->run();
			}
			$OCRText = TikiFilter::get('striptags')->filter($OCRText);
			$files->update(
				['ocr_data' => $OCRText],
				['fileId' => $this->ocrIngNow]
			);
			$unifiedsearchlib = TikiLib::lib('unifiedsearch');
			$unifiedsearchlib->invalidateObject('file', $this->ocrIngNow);
			$unifiedsearchlib->processUpdateQueue();
			// change the ocr state from processing to finished OCR'ing
			$files->update(
				['ocr_state' => self::OCR_STATUS_FINISHED],
				['fileId' => $this->ocrIngNow]
			);
			// nothing is being processed any more
			$this->ocrIngNow = null;
		} catch (Exception $e) {
			// Set the database flag to reflect that it is no longer processing but, still needs to be OCR'd
			$files->update(
				['ocr_state' => self::OCR_STATUS_STALLED],
				['fileId' => $this->ocrIngNow]
			);
			// keep the original exception attached so the root cause is not lost
			throw new Exception($e->getMessage(), 0, $e);
		} finally {
			// if we had to create temp files, lets remove them.
			$this->removeTempPath($filesystem, $tempFile);
			$this->removeTempPath($filesystem, $fileName);
		}
	}
}
439