1#!/usr/local/bin/python3.8
2#################################
3# Copyright (C) 2020 J.F.Dockes
4#   This program is free software; you can redistribute it and/or modify
5#   it under the terms of the GNU General Public License as published by
6#   the Free Software Foundation; either version 2 of the License, or
7#   (at your option) any later version.
8#
9#   This program is distributed in the hope that it will be useful,
10#   but WITHOUT ANY WARRANTY; without even the implied warranty of
11#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12#   GNU General Public License for more details.
13#
14#   You should have received a copy of the GNU General Public License
15#   along with this program; if not, write to the
16#   Free Software Foundation, Inc.,
17# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
18########################################################
19
20# Running OCR programs for Recoll. This is executed from,
21# e.g. rclpdf.py if pdftotext returns no data.
22#
23# The script tries to retrieve the data from the ocr cache, else it
24# runs the configured OCR program and updates the cache. In both cases it writes
25# the resulting text to stdout.
26
27import os
28import sys
29import importlib.util
30
31import rclconfig
32import rclocrcache
33import rclexecm
34
def _deb(s):
    """Log the message *s* by handing it to rclexecm.logmsg()."""
    rclexecm.logmsg(s)
37
def Usage():
    """Log the command line synopsis, then terminate with exit status 1."""
    _deb("Usage: rclocr.py <imagefilename>")
    raise SystemExit(1)
41
def breakwrite(f, data):
    """Write *data* to *f* in successive pieces of at most 4 KiB.

    Writing big chunks in one call can fail on Windows with a "not
    enough space" error (apparently a combined Windows/Python bug which
    depends on versions, see https://bugs.python.org/issue11395), so the
    data is simply broken up into small slices.

    f: object with a write() method accepting slices of *data*.
    data: sliceable sequence (e.g. bytes) to write out. Nothing is
        written if it is empty.
    """
    chunk_size = 4 * 1024
    for start in range(0, len(data), chunk_size):
        # The slice end is clamped to len(data), so the last piece is
        # just the remainder.
        f.write(data[start:start + chunk_size])
58
59
# Exactly one argument is expected: the path of the image file to process.
if len(sys.argv) != 2:
    Usage()

path = sys.argv[1]

# The configuration is keyed on the directory holding the file
# (presumably so that directory-specific parameter values apply;
# see rclconfig for the exact semantics).
config = rclconfig.RclConfig()
config.setKeyDir(os.path.dirname(path))

cache = rclocrcache.OCRCache(config)

# Cache hit: output the stored text and stop here.
incache, data = cache.get(path)
if incache:
    try:
        breakwrite(sys.stdout.buffer, data)
    except Exception as e:
        _deb("RCLOCR error writing: %s" % e)
        sys.exit(1)
    else:
        sys.exit(0)
78
#### Data not in cache

# Retrieve the configured OCR program names and try to load the
# corresponding module. The first module which imports successfully and
# reports that OCR is possible for this file is retained as 'ocr'.
ocrprogs = config.getConfParam("ocrprogs")
if ocrprogs is None:
    # Compat: the previous version has no ocrprogs variable, but would do
    # tesseract by default. Use "ocrprogs = " for a really empty list
    ocrprogs = "tesseract"
if not ocrprogs:
    _deb("No ocrprogs variable in recoll configuration")
    sys.exit(0)

#_deb("ocrprogs: %s" % ocrprogs)

# split() without an argument splits on any run of whitespace and never
# yields empty names. Splitting on a single " " would produce empty
# entries for repeated/leading/trailing blanks, each causing a bogus
# attempt to import a module named just "rclocr".
proglist = ocrprogs.split()
ok = False
for ocrprog in proglist:
    # The module for OCR program 'xxx' is named 'rclocrxxx'
    modulename = "rclocr" + ocrprog
    try:
        ocr = importlib.import_module(modulename)
        if ocr.ocrpossible(config, path):
            ok = True
            break
    except Exception as err:
        # Best effort: log the problem and go on to the next candidate.
        _deb("While loading %s: got: %s" % (modulename, err))

if not ok:
    _deb("No OCR module could be loaded")
    sys.exit(1)
110
#_deb("Using ocr module %s" % modulename)

# The OCR module will retrieve its specific parameters from the
# configuration
status, data = ocr.runocr(config, path)

if not status:
    _deb("runocr failed")
    sys.exit(1)

# Remember the result so that the next request for this file is served
# from the cache.
cache.store(path, data)

# Output the text the same way as for a cache hit: in small chunks
# (a single big write can fail on Windows, see breakwrite()), and with
# write errors logged instead of ending in an uncaught traceback.
try:
    breakwrite(sys.stdout.buffer, data)
except Exception as e:
    _deb("RCLOCR error writing: %s" % e)
    sys.exit(1)
sys.exit(0)
124
125