#!/usr/local/bin/python3.8
#################################
# Copyright (C) 2020 J.F.Dockes
#   This program is free software; you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation; either version 2 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this program; if not, write to the
#   Free Software Foundation, Inc.,
#   51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
########################################################

# Running OCR programs for Recoll. This is executed from,
# e.g. rclpdf.py if pdftotext returns no data.
#
# The script tries to retrieve the data from the ocr cache, else it
# runs the configured OCR program and updates the cache. In both cases it
# writes the resulting text to stdout.
#
# Exit status: 0 when text was written (or when the OCR program list is
# explicitly empty), 1 on usage error, write error, when no OCR module
# accepts the file, or when the OCR run fails.

import os
import sys
import importlib.util

import rclconfig
import rclocrcache
import rclexecm


def _deb(s):
    """Log the message s through rclexecm.logmsg()."""
    rclexecm.logmsg(s)


def Usage():
    """Log the command line syntax and exit with status 1."""
    _deb("Usage: rclocr.py <imagefilename>")
    sys.exit(1)


def breakwrite(f, data):
    """Write data to the file object f in chunks of at most 4 KiB.

    On Windows, writing big chunks can fail with a "not enough space"
    error. Seems a combined windows/python bug, depending on versions.
    See https://bugs.python.org/issue11395
    In any case, just break it up.

    Nothing is written when data is empty. Exceptions raised by
    f.write() propagate to the caller.
    """
    bs = 4 * 1024
    for offset in range(0, len(data), bs):
        f.write(data[offset:offset + bs])


def _writeout(data):
    """Write data to the binary stdout stream, exiting with 1 on failure.

    Shared by the cache-hit and fresh-OCR paths so that both get the
    chunked write and the same error reporting.
    """
    try:
        breakwrite(sys.stdout.buffer, data)
    except Exception as e:
        _deb("RCLOCR error writing: %s" % e)
        sys.exit(1)


if len(sys.argv) != 2:
    Usage()

path = sys.argv[1]

config = rclconfig.RclConfig()
# Configuration lookups below are made relative to the directory
# holding the input file.
config.setKeyDir(os.path.dirname(path))

cache = rclocrcache.OCRCache(config)

incache, data = cache.get(path)
if incache:
    _writeout(data)
    sys.exit(0)

#### Data not in cache

# Retrieve configured OCR program names and try to load the
# corresponding module
ocrprogs = config.getConfParam("ocrprogs")
if ocrprogs is None:
    # Compat: the previous version has no ocrprogs variable, but would do
    # tesseract by default. Use "ocrprogs = " for a really empty list
    ocrprogs = "tesseract"
if not ocrprogs:
    _deb("No ocrprogs variable in recoll configuration")
    sys.exit(0)

#_deb("ocrprogs: %s" % ocrprogs)

# Split on any run of whitespace: repeated, leading or trailing blanks
# must not produce empty program names (an empty name would yield the
# module name "rclocr").
ocr = None
for ocrprog in ocrprogs.split():
    modulename = "rclocr" + ocrprog
    try:
        candidate = importlib.import_module(modulename)
        if candidate.ocrpossible(config, path):
            ocr = candidate
            break
    except Exception as err:
        # Best effort: a module which is missing or broken is logged and
        # skipped, and the next configured program is tried.
        _deb("While loading %s: got: %s" % (modulename, err))

if ocr is None:
    _deb("No OCR module could be loaded")
    sys.exit(1)

#_deb("Using ocr module %s" % modulename)

# The OCR module will retrieve its specific parameters from the
# configuration
status, data = ocr.runocr(config, path)

if not status:
    _deb("runocr failed")
    sys.exit(1)

cache.store(path, data)
_writeout(data)
sys.exit(0)