1#!/usr/bin/env python
2import re
3import shlex
4import string
5import subprocess
6import sys
7from contextlib import contextmanager
8from csv import QUOTE_NONE
9from errno import ENOENT
10from functools import wraps
11from glob import iglob
12from io import BytesIO
13from os import environ
14from os import extsep
15from os import linesep
16from os import remove
17from os.path import normcase
18from os.path import normpath
19from os.path import realpath
20from pkgutil import find_loader
21from tempfile import NamedTemporaryFile
22from time import sleep
23
24from packaging.version import InvalidVersion
25from packaging.version import parse
26from packaging.version import Version
27from PIL import Image
28
29
# Name (or path) of the Tesseract executable. The functions in this module
# read it at call time, so reassigning it changes which binary is launched.
tesseract_cmd = 'tesseract'

# importlib.util.find_spec replaces pkgutil.find_loader, which is deprecated
# since Python 3.12 and removed in Python 3.14.
from importlib.util import find_spec  # noqa: E402

# numpy is optional: when present, ndarray inputs are accepted by prepare().
numpy_installed = find_spec('numpy') is not None
if numpy_installed:
    from numpy import ndarray

# pandas is optional: it is only needed for the data-frame output type.
pandas_installed = find_spec('pandas') is not None
if pandas_installed:
    import pandas as pd
39
# Encoding used when decoding the bytes produced by the tesseract process.
DEFAULT_ENCODING = 'utf-8'

# A language name is accepted only if it is lowercase letters/underscores.
LANG_PATTERN = re.compile('^[a-z_]+$')

# Mode of the white background image used to flatten an alpha channel.
RGB_MODE = 'RGB'

# Image formats that may be written to disk and handed to tesseract.
SUPPORTED_FORMATS = {
    'JPEG', 'PNG', 'PBM', 'PGM', 'PPM',
    'TIFF', 'BMP', 'GIF', 'WEBP',
}

# Maps each label of the OSD report to (result key, converter).
OSD_KEYS = {
    'Page number': ('page_num', int),
    'Orientation in degrees': ('orientation', int),
    'Rotate': ('rotate', int),
    'Orientation confidence': ('orientation_conf', float),
    'Script': ('script', str),
    'Script confidence': ('script_conf', float),
}
63
# Version thresholds compared against get_tesseract_version():
# ALTO XML output needs at least 4.1.0; anything below 3.05 is rejected
# (it is also the threshold for TSV output).
TESSERACT_ALTO_VERSION = Version('4.1.0')
TESSERACT_MIN_VERSION = Version('3.05')
66
67
class Output:
    """Names of the result formats selectable through ``output_type``."""

    STRING = 'string'
    BYTES = 'bytes'
    DICT = 'dict'
    DATAFRAME = 'data.frame'
73
74
class PandasNotSupported(EnvironmentError):
    """Raised when data-frame output is requested but pandas is missing."""

    def __init__(self):
        message = 'Missing pandas package'
        super().__init__(message)
78
79
class TesseractError(RuntimeError):
    """
    Failure reported by the tesseract process.

    ``status`` holds the process exit status and ``message`` the text of
    the error; both are also exposed through ``args``.
    """

    def __init__(self, status, message):
        super().__init__(status, message)
        self.status = status
        self.message = message
85
86
class TesseractNotFoundError(EnvironmentError):
    """Raised when the configured tesseract executable cannot be run."""

    def __init__(self):
        # The message names whatever ``tesseract_cmd`` is set to right now.
        message = (
            f"{tesseract_cmd} is not installed or it's not in your PATH."
            ' See README file for more information.'
        )
        super().__init__(message)
93
94
class TSVNotSupported(EnvironmentError):
    """Raised when the installed Tesseract is too old for TSV output."""

    def __init__(self):
        message = 'TSV output not supported. Tesseract >= 3.05 required'
        super().__init__(message)
100
101
class ALTONotSupported(EnvironmentError):
    """Raised when the installed Tesseract is too old for ALTO output."""

    def __init__(self):
        message = 'ALTO output not supported. Tesseract >= 4.1.0 required'
        super().__init__(message)
107
108
def kill(process, code):
    """
    Stops ``process`` and forces its ``returncode`` to ``code``.

    The process is asked to terminate and given up to one second to exit;
    it is then killed unconditionally and ``returncode`` is overwritten.
    """
    process.terminate()
    try:
        process.wait(1)
    except Exception as error:
        # A TypeError means wait() did not accept a timeout argument, so
        # the grace period is slept instead. Every other ordinary failure
        # (notably the wait timing out) is deliberately ignored.
        if isinstance(error, TypeError):
            sleep(1)
    finally:
        process.kill()
        process.returncode = code
120
121
@contextmanager
def timeout_manager(proc, seconds=None):
    """
    Waits for ``proc`` to finish and yields what it wrote to stderr.

    With a falsy ``seconds`` the wait is unbounded. Otherwise the process
    is killed once ``seconds`` have elapsed and RuntimeError is raised.
    The three standard pipes of ``proc`` are closed on exit in all cases.
    """
    try:
        if seconds:
            try:
                stderr_data = proc.communicate(timeout=seconds)[1]
                yield stderr_data
            except subprocess.TimeoutExpired:
                kill(proc, -1)
                raise RuntimeError('Tesseract process timeout')
        else:
            yield proc.communicate()[1]
    finally:
        for pipe in (proc.stdin, proc.stdout, proc.stderr):
            pipe.close()
139
140
def run_once(func):
    """
    Decorator that calls ``func`` a single time and replays its result.

    The result of the first successful call is stored on the wrapper as
    ``_result`` and returned by every later call, whatever arguments that
    later call receives. The wrapper itself marks the "not called" state.
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        if wrapper._result is not wrapper:
            return wrapper._result
        wrapper._result = func(*args, **kwargs)
        return wrapper._result

    wrapper._result = wrapper
    return wrapper
150
151
def get_errors(error_string):
    """
    Returns the bytes captured from stderr as a single-line message.

    Undecodable bytes are replaced rather than raising UnicodeDecodeError,
    so a stderr stream that is not valid UTF-8 cannot hide the Tesseract
    failure this message is meant to describe.
    """
    text = error_string.decode(DEFAULT_ENCODING, errors='replace')
    return ' '.join(text.splitlines()).strip()
156
157
def cleanup(temp_name):
    """
    Tries to remove temp files by filename wildcard path.

    Every file whose path starts with ``temp_name`` is removed. A falsy
    ``temp_name`` is a no-op. Files that vanish before they can be removed
    are ignored; any other OSError propagates.
    """
    from glob import escape

    if not temp_name:
        return

    # Escape the base path so glob metacharacters in it (for example a
    # temp directory containing '[' or '?') are matched literally; only
    # the trailing '*' acts as a wildcard.
    for filename in iglob(escape(temp_name) + '*'):
        try:
            remove(filename)
        except OSError as e:
            if e.errno != ENOENT:
                raise
166
167
def prepare(image):
    """
    Normalizes ``image`` and returns it together with its format name.

    numpy arrays (when numpy is installed) are converted to PIL images.
    An image without a format is treated as PNG. An alpha channel is
    dropped by compositing the image onto a white background.

    Raises TypeError for non-image objects and unsupported formats.
    """
    if numpy_installed and isinstance(image, ndarray):
        image = Image.fromarray(image)

    if not isinstance(image, Image.Image):
        raise TypeError('Unsupported image object')

    extension = image.format or 'PNG'
    if extension not in SUPPORTED_FORMATS:
        raise TypeError('Unsupported image format/type')

    if 'A' in image.getbands():
        # discard and replace the alpha channel with white background
        alpha = image.getchannel('A')
        flattened = Image.new(RGB_MODE, image.size, (255, 255, 255))
        flattened.paste(image, (0, 0), alpha)
        image = flattened

    image.format = extension
    return image, extension
187
188
@contextmanager
def save(image):
    """
    Yields ``(temp_name, input_filename)`` for one Tesseract run.

    ``temp_name`` is the path of a fresh temporary file used as the base
    name for output files. If ``image`` is a string it is treated as a
    path and yielded normalized; otherwise the image is written next to
    the temporary file. All files starting with ``temp_name`` are removed
    on exit.
    """
    # Stays None if the temporary file could not be created, so that the
    # cleanup below does not fail on an unbound name and hide that error.
    temp_name = None
    try:
        with NamedTemporaryFile(prefix='tess_', delete=False) as f:
            temp_name = f.name
            if isinstance(image, str):
                yield temp_name, realpath(normpath(normcase(image)))
                return
            image, extension = prepare(image)
            input_file_name = temp_name + extsep + extension
            image.save(input_file_name, format=image.format)
            yield temp_name, input_file_name
    finally:
        if temp_name is not None:
            cleanup(temp_name)
202
203
def subprocess_args(include_stdout=True):
    """
    Builds the keyword arguments passed to ``subprocess.Popen``.

    stdin and stderr are always piped. stdout is piped when
    ``include_stdout`` is true and discarded otherwise.
    """
    # See https://github.com/pyinstaller/pyinstaller/wiki/Recipe-subprocess
    # for reference and comments.

    startupinfo = None
    if hasattr(subprocess, 'STARTUPINFO'):
        # Only present on Windows: keep the console window hidden.
        startupinfo = subprocess.STARTUPINFO()
        startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
        startupinfo.wShowWindow = subprocess.SW_HIDE

    return {
        'stdin': subprocess.PIPE,
        'stderr': subprocess.PIPE,
        'startupinfo': startupinfo,
        'env': environ,
        'stdout': subprocess.PIPE if include_stdout else subprocess.DEVNULL,
    }
226
227
def run_tesseract(
    input_filename,
    output_filename_base,
    extension,
    lang,
    config='',
    nice=0,
    timeout=0,
):
    """
    Runs the tesseract executable on ``input_filename``.

    Raises TesseractNotFoundError when the executable does not exist and
    TesseractError when the process exits with a non-zero status.
    """
    command = []

    # Outside Windows a non-zero ``nice`` prefixes the command with nice.
    if nice != 0 and not sys.platform.startswith('win32'):
        command.extend(('nice', '-n', str(nice)))

    command.extend((tesseract_cmd, input_filename, output_filename_base))

    if lang is not None:
        command.extend(('-l', lang))

    if config:
        command.extend(shlex.split(config))

    # These extensions are not passed to tesseract as a config name.
    if extension and extension not in {'box', 'osd', 'tsv', 'xml'}:
        command.append(extension)

    try:
        proc = subprocess.Popen(command, **subprocess_args())
    except OSError as e:
        if e.errno == ENOENT:
            raise TesseractNotFoundError()
        raise

    with timeout_manager(proc, timeout) as error_string:
        status = proc.returncode
        if status:
            raise TesseractError(status, get_errors(error_string))
263
264
def run_and_get_output(
    image,
    extension='',
    lang=None,
    config='',
    nice=0,
    timeout=0,
    return_bytes=False,
):
    """
    Runs Tesseract on ``image`` and returns the content of its output file.

    The output file is ``<temp base name>.<extension>``. Its content is
    returned as bytes when ``return_bytes`` is true, otherwise decoded
    with DEFAULT_ENCODING.
    """
    with save(image) as (temp_name, input_filename):
        run_tesseract(
            input_filename=input_filename,
            output_filename_base=temp_name,
            extension=extension,
            lang=lang,
            config=config,
            nice=nice,
            timeout=timeout,
        )

        output_path = temp_name + extsep + extension
        with open(output_path, 'rb') as output_file:
            raw = output_file.read()

        return raw if return_bytes else raw.decode(DEFAULT_ENCODING)
292
293
def file_to_dict(tsv, cell_delimiter, str_col_idx):
    """
    Converts delimited text with a header row into a dict of columns.

    Each header cell becomes a key whose value is the list of that
    column's cells. Cells outside column ``str_col_idx`` (negative values
    count from the end) become ints when they parse as numbers and stay
    strings otherwise. Fewer than two lines yields an empty dict.
    """
    rows = [line.split(cell_delimiter) for line in tsv.strip().split('\n')]
    if len(rows) < 2:
        return {}

    header, records = rows[0], rows[1:]
    width = len(header)
    if len(records[-1]) < width:
        # Fixes bug that occurs when last text string in TSV is null, and
        # last row is missing a final cell in TSV file
        records[-1].append('')

    text_column = str_col_idx + width if str_col_idx < 0 else str_col_idx

    def convert(cell, column):
        # The text column is kept verbatim; other cells are truncated to
        # int when numeric.
        if column == text_column:
            return cell
        try:
            return int(float(cell))
        except ValueError:
            return cell

    # Rows too short to have a given column are skipped for that column.
    return {
        name: [
            convert(row[column], column)
            for row in records
            if len(row) > column
        ]
        for column, name in enumerate(header)
    }
327
328
def is_valid(val, _type):
    """
    Tells whether the string ``val`` can be converted to ``_type``.

    For ``int`` the string must consist of digits only, for ``float`` it
    must be accepted by ``float()``; any other type is always valid.
    """
    if _type is float:
        try:
            float(val)
        except ValueError:
            return False
        return True

    return val.isdigit() if _type is int else True
341
342
def osd_to_dict(osd):
    """
    Parses the ``label: value`` lines of an OSD report into a dict.

    Labels are mapped to result keys and converters through OSD_KEYS.
    Lines that do not split into exactly one label and one value, lines
    with a label missing from OSD_KEYS, and values that fail validation
    are skipped.
    """
    result = {}
    for line in osd.split('\n'):
        parts = line.split(': ')
        if len(parts) != 2:
            continue

        label, raw_value = parts
        # An unknown label must not abort parsing with a KeyError.
        spec = OSD_KEYS.get(label)
        if spec is None:
            continue

        key, convert = spec
        if is_valid(raw_value, convert):
            result[key] = convert(raw_value)

    return result
349
350
@run_once
def get_languages(config=''):
    """
    Returns the language names printed by ``tesseract --list-langs``.

    Only output lines matching LANG_PATTERN are kept. Raises
    TesseractNotFoundError when the executable cannot be started or exits
    with a status other than 0 or 1.
    """
    extra_args = shlex.split(config) if config else []
    cmd_args = [tesseract_cmd, '--list-langs'] + extra_args

    try:
        result = subprocess.run(
            cmd_args,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
        )
    except OSError:
        raise TesseractNotFoundError()

    # tesseract 3.x
    if result.returncode not in (0, 1):
        raise TesseractNotFoundError()

    if not result.stdout:
        return []

    output = result.stdout.decode(DEFAULT_ENCODING)
    candidates = (line.strip() for line in output.split(linesep))
    return [lang for lang in candidates if LANG_PATTERN.match(lang)]
378
379
@run_once
def get_tesseract_version():
    """
    Returns Version object of the Tesseract version

    Runs ``tesseract --version`` and parses the first version number in
    its output. Raises TesseractNotFoundError when the executable cannot
    be started, and SystemExit when the version cannot be parsed or is
    lower than TESSERACT_MIN_VERSION.
    """
    try:
        output = subprocess.check_output(
            [tesseract_cmd, '--version'],
            stderr=subprocess.STDOUT,
            env=environ,
            stdin=subprocess.DEVNULL,
        )
    except OSError:
        raise TesseractNotFoundError()

    raw_version = output.decode(DEFAULT_ENCODING)
    # string.printable[10:] is every printable character except the
    # digits, so this drops everything before the first digit.
    str_version, *_ = raw_version.lstrip(string.printable[10:]).partition(' ')
    # Drop any suffix that follows a dash.
    str_version, *_ = str_version.partition('-')

    try:
        version = parse(str_version)
    except InvalidVersion:
        version = None

    # Checked explicitly rather than with ``assert`` so the minimum
    # version is still enforced when Python runs with -O.
    if version is None or version < TESSERACT_MIN_VERSION:
        raise SystemExit(f'Invalid tesseract version: "{raw_version}"')

    return version
406
407
def image_to_string(
    image,
    lang=None,
    config='',
    nice=0,
    output_type=Output.STRING,
    timeout=0,
):
    """
    Returns the result of a Tesseract OCR run on the provided image to string

    ``output_type`` selects a str, bytes, or a dict with a single
    ``'text'`` key; any other value raises KeyError.
    """
    args = [image, 'txt', lang, config, nice, timeout]

    def as_text():
        return run_and_get_output(*args)

    def as_bytes():
        return run_and_get_output(*args, True)

    def as_dict():
        return {'text': as_text()}

    handlers = {
        Output.BYTES: as_bytes,
        Output.DICT: as_dict,
        Output.STRING: as_text,
    }
    return handlers[output_type]()
426
427
def image_to_pdf_or_hocr(
    image,
    lang=None,
    config='',
    nice=0,
    extension='pdf',
    timeout=0,
):
    """
    Returns the result of a Tesseract OCR run on the provided image to pdf/hocr

    The result is returned as bytes. Raises ValueError when ``extension``
    is neither ``'pdf'`` nor ``'hocr'``.
    """
    if extension not in {'pdf', 'hocr'}:
        raise ValueError(f'Unsupported extension: {extension}')

    return run_and_get_output(
        image, extension, lang, config, nice, timeout, True,
    )
445
446
def image_to_alto_xml(
    image,
    lang=None,
    config='',
    nice=0,
    timeout=0,
):
    """
    Returns the result of a Tesseract OCR run on the provided image to ALTO XML

    The result is returned as bytes. Raises ALTONotSupported when the
    installed Tesseract is older than TESSERACT_ALTO_VERSION.
    """
    installed = get_tesseract_version()
    if installed < TESSERACT_ALTO_VERSION:
        raise ALTONotSupported()

    alto_config = f'-c tessedit_create_alto=1 {config.strip()}'
    return run_and_get_output(
        image, 'xml', lang, alto_config, nice, timeout, True,
    )
465
466
def image_to_boxes(
    image,
    lang=None,
    config='',
    nice=0,
    output_type=Output.STRING,
    timeout=0,
):
    """
    Returns string containing recognized characters and their box boundaries

    ``output_type`` selects a str, bytes, or a dict of columns (char, left,
    bottom, right, top, page); any other value raises KeyError.
    """
    config = f'{config.strip()} batch.nochop makebox'
    args = [image, 'box', lang, config, nice, timeout]

    def as_text():
        return run_and_get_output(*args)

    def as_bytes():
        return run_and_get_output(*args, True)

    def as_dict():
        # The box output has no header row, so one is prepended.
        table = f'char left bottom right top page\n{as_text()}'
        return file_to_dict(table, ' ', 0)

    handlers = {
        Output.BYTES: as_bytes,
        Output.DICT: as_dict,
        Output.STRING: as_text,
    }
    return handlers[output_type]()
490
491
def get_pandas_output(args, config=None):
    """
    Runs Tesseract with ``args`` and loads its output with pandas.

    ``config`` may hold extra keyword arguments for ``pd.read_csv``; a
    value that cannot be merged into a dict is ignored. Raises
    PandasNotSupported when pandas is not installed.
    """
    if not pandas_installed:
        raise PandasNotSupported()

    read_options = dict(quoting=QUOTE_NONE, sep='\t')
    try:
        read_options.update(config)
    except (TypeError, ValueError):
        # Covers the default ``None`` and other non-mapping values.
        pass

    raw_output = run_and_get_output(*args)
    return pd.read_csv(BytesIO(raw_output), **read_options)
503
504
def image_to_data(
    image,
    lang=None,
    config='',
    nice=0,
    output_type=Output.STRING,
    timeout=0,
    pandas_config=None,
):
    """
    Returns string containing box boundaries, confidences,
    and other information. Requires Tesseract 3.05+

    ``output_type`` selects a str, bytes, a dict of columns, or a pandas
    data frame (``pandas_config`` is passed on to the pandas loader); any
    other value raises KeyError.
    """
    installed = get_tesseract_version()
    if installed < TESSERACT_MIN_VERSION:
        raise TSVNotSupported()

    config = f'-c tessedit_create_tsv=1 {config.strip()}'
    args = [image, 'tsv', lang, config, nice, timeout]

    def as_text():
        return run_and_get_output(*args)

    def as_bytes():
        return run_and_get_output(*args, True)

    def as_dict():
        # The last TSV column is kept as text.
        return file_to_dict(as_text(), '\t', -1)

    def as_dataframe():
        return get_pandas_output(args + [True], pandas_config)

    handlers = {
        Output.BYTES: as_bytes,
        Output.DATAFRAME: as_dataframe,
        Output.DICT: as_dict,
        Output.STRING: as_text,
    }
    return handlers[output_type]()
534
535
def image_to_osd(
    image,
    lang='osd',
    config='',
    nice=0,
    output_type=Output.STRING,
    timeout=0,
):
    """
    Returns string containing the orientation and script detection (OSD)

    ``output_type`` selects a str, bytes, or a dict parsed from the
    report; any other value raises KeyError.
    """
    config = f'--psm 0 {config.strip()}'
    args = [image, 'osd', lang, config, nice, timeout]

    def as_text():
        return run_and_get_output(*args)

    def as_bytes():
        return run_and_get_output(*args, True)

    def as_dict():
        return osd_to_dict(as_text())

    handlers = {
        Output.BYTES: as_bytes,
        Output.DICT: as_dict,
        Output.STRING: as_text,
    }
    return handlers[output_type]()
555
556
def main():
    """
    Command-line entry point: prints the text recognized in an image.

    Accepts ``input_file`` or ``-l lang input_file``. Returns 2 on wrong
    usage, 1 when Tesseract is missing or the file cannot be read, and
    None on success.
    """
    cli_args = sys.argv[1:]

    if len(cli_args) == 1:
        lang = None
        filename = cli_args[0]
    elif len(cli_args) == 3 and cli_args[0] == '-l':
        lang, filename = cli_args[1], cli_args[2]
    else:
        print('Usage: pytesseract [-l lang] input_file\n', file=sys.stderr)
        return 2

    try:
        with Image.open(filename) as img:
            print(image_to_string(img, lang=lang))
    except TesseractNotFoundError as e:
        # Must precede OSError: TesseractNotFoundError is a subclass of it.
        print(f'{str(e)}\n', file=sys.stderr)
        return 1
    except OSError as e:
        print(f'{type(e).__name__}: {e}', file=sys.stderr)
        return 1
575
576
if __name__ == '__main__':
    # sys.exit is always available; the bare ``exit`` builtin is injected
    # by the site module and is missing under ``python -S`` or when frozen.
    sys.exit(main())
579