#!/usr/bin/env python
"""
Helpers that run the Tesseract OCR command-line program on images and
return its output as text, bytes, dicts or pandas data frames.
"""
import re
import shlex
import string
import subprocess
import sys
from contextlib import contextmanager
from csv import QUOTE_NONE
from errno import ENOENT
from functools import wraps
from glob import iglob
from importlib.util import find_spec
from io import BytesIO
from os import environ
from os import extsep
from os import linesep
from os import remove
from os.path import normcase
from os.path import normpath
from os.path import realpath
from tempfile import NamedTemporaryFile
from time import sleep

from packaging.version import InvalidVersion
from packaging.version import parse
from packaging.version import Version
from PIL import Image


# Executable that gets launched; callers may overwrite this module attribute.
tesseract_cmd = 'tesseract'

# Optional dependencies are only imported when they can be located.
# importlib.util.find_spec replaces pkgutil.find_loader, which is deprecated
# since Python 3.12 and removed in Python 3.14.
numpy_installed = find_spec('numpy') is not None
if numpy_installed:
    from numpy import ndarray

pandas_installed = find_spec('pandas') is not None
if pandas_installed:
    import pandas as pd

DEFAULT_ENCODING = 'utf-8'
LANG_PATTERN = re.compile('^[a-z_]+$')
RGB_MODE = 'RGB'
SUPPORTED_FORMATS = {
    'JPEG',
    'PNG',
    'PBM',
    'PGM',
    'PPM',
    'TIFF',
    'BMP',
    'GIF',
    'WEBP',
}

# Maps the key text of an OSD output line to the result-dict key and the
# callable used to convert the value.
OSD_KEYS = {
    'Page number': ('page_num', int),
    'Orientation in degrees': ('orientation', int),
    'Rotate': ('rotate', int),
    'Orientation confidence': ('orientation_conf', float),
    'Script': ('script', str),
    'Script confidence': ('script_conf', float),
}

TESSERACT_MIN_VERSION = Version('3.05')
TESSERACT_ALTO_VERSION = Version('4.1.0')


class Output:
    """
    Names of the output types accepted by the ``output_type`` parameters
    """
    BYTES = 'bytes'
    DATAFRAME = 'data.frame'
    DICT = 'dict'
    STRING = 'string'


class PandasNotSupported(EnvironmentError):
    """
    Raised when data frame output is requested but pandas was not found
    """

    def __init__(self):
        super().__init__('Missing pandas package')


class TesseractError(RuntimeError):
    """
    Raised when the tesseract process finishes with a non-zero return code

    ``status`` holds the return code and ``message`` the collected stderr
    text; both are also exposed through ``args``.
    """

    def __init__(self, status, message):
        self.status = status
        self.message = message
        self.args = (status, message)


class TesseractNotFoundError(EnvironmentError):
    """
    Raised when the configured tesseract executable cannot be launched
    """

    def __init__(self):
        super().__init__(
            f"{tesseract_cmd} is not installed or it's not in your PATH."
            + ' See README file for more information.',
        )


class TSVNotSupported(EnvironmentError):
    """
    Raised when TSV output is requested from a too old Tesseract
    """

    def __init__(self):
        super().__init__(
            'TSV output not supported. Tesseract >= 3.05 required',
        )


class ALTONotSupported(EnvironmentError):
    """
    Raised when ALTO output is requested from a too old Tesseract
    """

    def __init__(self):
        super().__init__(
            'ALTO output not supported. Tesseract >= 4.1.0 required',
        )


def kill(process, code):
    """
    Terminates the process, waits up to one second for it to finish, then
    kills it and overwrites its ``returncode`` with ``code``
    """
    process.terminate()
    try:
        process.wait(1)
    except TypeError:  # python2 Popen.wait(1) fallback
        sleep(1)
    except Exception:  # python3 subprocess.TimeoutExpired
        # Deliberately best-effort: the process is killed below regardless.
        pass
    finally:
        process.kill()
        process.returncode = code


@contextmanager
def timeout_manager(proc, seconds=None):
    """
    Context manager that waits for ``proc`` and yields its stderr output

    A falsy ``seconds`` waits without a time limit. Otherwise, when the
    process does not finish within ``seconds``, it is killed (return code
    set to -1) and RuntimeError is raised. The standard streams of the
    process are closed on exit.
    """
    try:
        if not seconds:
            yield proc.communicate()[1]
            return

        try:
            _, error_string = proc.communicate(timeout=seconds)
            yield error_string
        except subprocess.TimeoutExpired:
            kill(proc, -1)
            raise RuntimeError('Tesseract process timeout')
    finally:
        # A stream is None when it was not opened as a pipe (for example
        # stdout redirected to DEVNULL), so only close the ones that exist.
        for stream in (proc.stdin, proc.stdout, proc.stderr):
            if stream is not None:
                stream.close()


def run_once(func):
    """
    Decorator that calls ``func`` only on the first invocation and returns
    that first result for every later call, whatever arguments are passed
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        # The wrapper itself is the "not computed yet" sentinel, so that
        # results such as None or an empty list are cached as well.
        if wrapper._result is wrapper:
            wrapper._result = func(*args, **kwargs)
        return wrapper._result

    wrapper._result = wrapper
    return wrapper


def get_errors(error_string):
    """
    Decodes the stderr bytes and joins their lines into one stripped string
    """
    return ' '.join(
        error_string.decode(DEFAULT_ENCODING).splitlines(),
    ).strip()


def cleanup(temp_name):
    """Tries to remove temp files by filename wildcard path."""
    for filename in iglob(temp_name + '*' if temp_name else temp_name):
        try:
            remove(filename)
        except OSError as e:
            # A file that is already gone is fine; anything else is not.
            if e.errno != ENOENT:
                raise


def prepare(image):
    """
    Converts the input to a PIL image that can be saved for tesseract

    numpy arrays (when numpy is available) are converted with
    ``Image.fromarray``. Returns a tuple of the image and the format name
    used as file extension; images without a format default to PNG.
    Raises TypeError for unsupported objects or formats.
    """
    if numpy_installed and isinstance(image, ndarray):
        image = Image.fromarray(image)

    if not isinstance(image, Image.Image):
        raise TypeError('Unsupported image object')

    extension = 'PNG' if not image.format else image.format
    if extension not in SUPPORTED_FORMATS:
        raise TypeError('Unsupported image format/type')

    if 'A' in image.getbands():
        # discard and replace the alpha channel with white background
        background = Image.new(RGB_MODE, image.size, (255, 255, 255))
        background.paste(image, (0, 0), image.getchannel('A'))
        image = background

    image.format = extension
    return image, extension


@contextmanager
def save(image):
    """
    Context manager yielding ``(temp_name, input_filename)`` for a run

    ``temp_name`` is the name of a freshly created temporary file.
    When ``image`` is a string it is treated as a path and its normalized
    real path is yielded as the input file; otherwise the image is prepared
    and saved next to the temporary file. On exit every file whose name
    starts with ``temp_name`` is removed.
    """
    # Stays empty when the temporary file could not be created, so that the
    # cleanup below does not hide the original error behind a NameError.
    temp_name = ''
    try:
        with NamedTemporaryFile(prefix='tess_', delete=False) as f:
            temp_name = f.name
            if isinstance(image, str):
                yield temp_name, realpath(normpath(normcase(image)))
                return
            image, extension = prepare(image)
            input_file_name = temp_name + extsep + extension
            image.save(input_file_name, format=image.format)
            yield temp_name, input_file_name
    finally:
        cleanup(temp_name)


def subprocess_args(include_stdout=True):
    """
    Builds the keyword arguments used to launch the tesseract process

    stdin and stderr are always pipes; stdout is a pipe when
    ``include_stdout`` is true and DEVNULL otherwise.
    """
    # See https://github.com/pyinstaller/pyinstaller/wiki/Recipe-subprocess
    # for reference and comments.

    kwargs = {
        'stdin': subprocess.PIPE,
        'stderr': subprocess.PIPE,
        'startupinfo': None,
        'env': environ,
    }

    # STARTUPINFO only exists on Windows; it is used here to request a
    # hidden window for the child process.
    if hasattr(subprocess, 'STARTUPINFO'):
        kwargs['startupinfo'] = subprocess.STARTUPINFO()
        kwargs['startupinfo'].dwFlags |= subprocess.STARTF_USESHOWWINDOW
        kwargs['startupinfo'].wShowWindow = subprocess.SW_HIDE

    if include_stdout:
        kwargs['stdout'] = subprocess.PIPE
    else:
        kwargs['stdout'] = subprocess.DEVNULL

    return kwargs


def run_tesseract(
    input_filename,
    output_filename_base,
    extension,
    lang,
    config='',
    nice=0,
    timeout=0,
):
    """
    Builds the tesseract command line and runs it to completion

    Raises TesseractNotFoundError when the executable does not exist,
    TesseractError when the process returns a non-zero code and
    RuntimeError when ``timeout`` (seconds, 0 disables it) is exceeded.
    """
    cmd_args = []

    # Outside of Windows a non-zero nice value prefixes the command with
    # "nice -n <value>".
    if not sys.platform.startswith('win32') and nice != 0:
        cmd_args += ('nice', '-n', str(nice))

    cmd_args += (tesseract_cmd, input_filename, output_filename_base)

    if lang is not None:
        cmd_args += ('-l', lang)

    if config:
        cmd_args += shlex.split(config)

    # These extensions are not passed as a trailing argument; the callers
    # in this module request them through the config string instead.
    if extension and extension not in {'box', 'osd', 'tsv', 'xml'}:
        cmd_args.append(extension)

    try:
        proc = subprocess.Popen(cmd_args, **subprocess_args())
    except OSError as e:
        if e.errno != ENOENT:
            raise
        raise TesseractNotFoundError()

    with timeout_manager(proc, timeout) as error_string:
        if proc.returncode:
            raise TesseractError(proc.returncode, get_errors(error_string))


def run_and_get_output(
    image,
    extension='',
    lang=None,
    config='',
    nice=0,
    timeout=0,
    return_bytes=False,
):
    """
    Runs tesseract on the image and returns the content of the output file

    The output file is ``<temp name>.<extension>``. Its content is returned
    as bytes when ``return_bytes`` is true, otherwise decoded as UTF-8.
    """
    with save(image) as (temp_name, input_filename):
        kwargs = {
            'input_filename': input_filename,
            'output_filename_base': temp_name,
            'extension': extension,
            'lang': lang,
            'config': config,
            'nice': nice,
            'timeout': timeout,
        }

        run_tesseract(**kwargs)
        filename = kwargs['output_filename_base'] + extsep + extension
        with open(filename, 'rb') as output_file:
            if return_bytes:
                return output_file.read()
            return output_file.read().decode(DEFAULT_ENCODING)


def file_to_dict(tsv, cell_delimiter, str_col_idx):
    """
    Converts delimited text with a header row into a dict of columns

    Each header cell becomes a key whose value is the list of that column's
    cells. Cells are converted with ``int(float(cell))`` where possible,
    except in the column at ``str_col_idx`` (negative values count from the
    end), which is always kept as text. Returns an empty dict when there is
    no data row.
    """
    result = {}
    rows = [row.split(cell_delimiter) for row in tsv.strip().split('\n')]
    if len(rows) < 2:
        return result

    header = rows.pop(0)
    length = len(header)
    if len(rows[-1]) < length:
        # Fixes bug that occurs when last text string in TSV is null, and
        # last row is missing a final cell in TSV file
        rows[-1].append('')

    if str_col_idx < 0:
        str_col_idx += length

    for i, head in enumerate(header):
        result[head] = []
        for row in rows:
            # Rows shorter than the header contribute nothing to the
            # columns they lack.
            if len(row) <= i:
                continue

            if i != str_col_idx:
                try:
                    val = int(float(row[i]))
                except ValueError:
                    val = row[i]
            else:
                val = row[i]

            result[head].append(val)

    return result


def is_valid(val, _type):
    """
    Tells whether the string ``val`` may be converted with ``_type``

    int requires a string of digits only, float requires ``float(val)`` to
    succeed; every other type is accepted.
    """
    if _type is int:
        return val.isdigit()

    if _type is float:
        try:
            float(val)
            return True
        except ValueError:
            return False

    return True


def osd_to_dict(osd):
    """
    Parses ``key: value`` lines of OSD output into a dict

    Only the keys listed in OSD_KEYS are kept, renamed and converted as
    described there. Lines that do not split into exactly two parts, lines
    with an unknown key and lines with an invalid value are skipped.
    """
    result = {}
    for line in osd.split('\n'):
        kv = line.split(': ')
        # Unknown keys are skipped instead of raising KeyError.
        if len(kv) != 2 or kv[0] not in OSD_KEYS:
            continue

        name, convert = OSD_KEYS[kv[0]]
        if is_valid(kv[1], convert):
            result[name] = convert(kv[1])

    return result


@run_once
def get_languages(config=''):
    """
    Returns the list of language names printed by ``--list-langs``

    Output lines that do not consist of lowercase letters and underscores
    only are ignored. Because of ``run_once`` the first result is reused by
    every later call, so a different ``config`` passed later has no effect.
    Raises TesseractNotFoundError when the executable cannot be started or
    returns a code other than 0 or 1.
    """
    cmd_args = [tesseract_cmd, '--list-langs']
    if config:
        cmd_args += shlex.split(config)

    try:
        result = subprocess.run(
            cmd_args,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
        )
    except OSError:
        raise TesseractNotFoundError()

    # tesseract 3.x
    if result.returncode not in (0, 1):
        raise TesseractNotFoundError()

    languages = []
    if result.stdout:
        for line in result.stdout.decode(DEFAULT_ENCODING).split(linesep):
            lang = line.strip()
            if LANG_PATTERN.match(lang):
                languages.append(lang)

    return languages


@run_once
def get_tesseract_version():
    """
    Returns Version object of the Tesseract version

    Raises TesseractNotFoundError when the executable cannot be started and
    SystemExit when the version cannot be parsed or is lower than
    TESSERACT_MIN_VERSION. The result of the first call is reused.
    """
    try:
        output = subprocess.check_output(
            [tesseract_cmd, '--version'],
            stderr=subprocess.STDOUT,
            env=environ,
            stdin=subprocess.DEVNULL,
        )
    except OSError:
        raise TesseractNotFoundError()

    raw_version = output.decode(DEFAULT_ENCODING)
    # string.printable[10:] is every printable character except the digits,
    # so this drops everything before the first digit, then keeps the text
    # up to the first space and up to the first dash.
    str_version, *_ = raw_version.lstrip(string.printable[10:]).partition(' ')
    str_version, *_ = str_version.partition('-')

    try:
        version = parse(str_version)
    except InvalidVersion:
        version = None

    # Explicit comparison instead of an assert, which would be stripped
    # when Python runs with -O.
    if version is None or version < TESSERACT_MIN_VERSION:
        raise SystemExit(f'Invalid tesseract version: "{raw_version}"')

    return version


def image_to_string(
    image,
    lang=None,
    config='',
    nice=0,
    output_type=Output.STRING,
    timeout=0,
):
    """
    Returns the result of a Tesseract OCR run on the provided image to string

    ``output_type`` selects bytes, a ``{'text': ...}`` dict or a string;
    any other value raises KeyError.
    """
    args = [image, 'txt', lang, config, nice, timeout]

    return {
        Output.BYTES: lambda: run_and_get_output(*(args + [True])),
        Output.DICT: lambda: {'text': run_and_get_output(*args)},
        Output.STRING: lambda: run_and_get_output(*args),
    }[output_type]()


def image_to_pdf_or_hocr(
    image,
    lang=None,
    config='',
    nice=0,
    extension='pdf',
    timeout=0,
):
    """
    Returns the result of a Tesseract OCR run on the provided image to pdf/hocr

    The result is returned as bytes. Raises ValueError when ``extension``
    is neither 'pdf' nor 'hocr'.
    """

    if extension not in {'pdf', 'hocr'}:
        raise ValueError(f'Unsupported extension: {extension}')
    args = [image, extension, lang, config, nice, timeout, True]

    return run_and_get_output(*args)


def image_to_alto_xml(
    image,
    lang=None,
    config='',
    nice=0,
    timeout=0,
):
    """
    Returns the result of a Tesseract OCR run on the provided image to ALTO XML

    The result is returned as bytes. Raises ALTONotSupported when the
    Tesseract version is lower than TESSERACT_ALTO_VERSION.
    """

    if get_tesseract_version() < TESSERACT_ALTO_VERSION:
        raise ALTONotSupported()

    config = f'-c tessedit_create_alto=1 {config.strip()}'
    args = [image, 'xml', lang, config, nice, timeout, True]

    return run_and_get_output(*args)


def image_to_boxes(
    image,
    lang=None,
    config='',
    nice=0,
    output_type=Output.STRING,
    timeout=0,
):
    """
    Returns string containing recognized characters and their box boundaries

    ``output_type`` selects bytes, a dict of columns (char, left, bottom,
    right, top, page) or a string; any other value raises KeyError.
    """
    config = f'{config.strip()} batch.nochop makebox'
    args = [image, 'box', lang, config, nice, timeout]

    return {
        Output.BYTES: lambda: run_and_get_output(*(args + [True])),
        Output.DICT: lambda: file_to_dict(
            f'char left bottom right top page\n{run_and_get_output(*args)}',
            ' ',
            0,
        ),
        Output.STRING: lambda: run_and_get_output(*args),
    }[output_type]()


def get_pandas_output(args, config=None):
    """
    Runs tesseract with ``args`` and reads the bytes output with pandas

    ``config`` may hold extra keyword arguments for ``pd.read_csv``; they
    are merged over the defaults (no quoting, tab separator). Raises
    PandasNotSupported when pandas was not found.
    """
    if not pandas_installed:
        raise PandasNotSupported()

    kwargs = {'quoting': QUOTE_NONE, 'sep': '\t'}
    try:
        kwargs.update(config)
    except (TypeError, ValueError):
        # config is None or not usable as a mapping: keep the defaults.
        pass

    return pd.read_csv(BytesIO(run_and_get_output(*args)), **kwargs)


def image_to_data(
    image,
    lang=None,
    config='',
    nice=0,
    output_type=Output.STRING,
    timeout=0,
    pandas_config=None,
):
    """
    Returns string containing box boundaries, confidences,
    and other information. Requires Tesseract 3.05+

    ``output_type`` selects bytes, a pandas data frame, a dict of columns
    or a string; any other value raises KeyError.
    """

    if get_tesseract_version() < TESSERACT_MIN_VERSION:
        raise TSVNotSupported()

    config = f'-c tessedit_create_tsv=1 {config.strip()}'
    args = [image, 'tsv', lang, config, nice, timeout]

    return {
        Output.BYTES: lambda: run_and_get_output(*(args + [True])),
        Output.DATAFRAME: lambda: get_pandas_output(
            args + [True],
            pandas_config,
        ),
        Output.DICT: lambda: file_to_dict(run_and_get_output(*args), '\t', -1),
        Output.STRING: lambda: run_and_get_output(*args),
    }[output_type]()


def image_to_osd(
    image,
    lang='osd',
    config='',
    nice=0,
    output_type=Output.STRING,
    timeout=0,
):
    """
    Returns string containing the orientation and script detection (OSD)

    ``output_type`` selects bytes, a dict (see ``osd_to_dict``) or a
    string; any other value raises KeyError.
    """
    config = f'--psm 0 {config.strip()}'
    args = [image, 'osd', lang, config, nice, timeout]

    return {
        Output.BYTES: lambda: run_and_get_output(*(args + [True])),
        Output.DICT: lambda: osd_to_dict(run_and_get_output(*args)),
        Output.STRING: lambda: run_and_get_output(*args),
    }[output_type]()


def main():
    """
    Command-line entry point: prints the text recognized in an image file

    Returns 2 on wrong usage and 1 when tesseract is missing or an OSError
    occurs; returns None (exit status 0) on success.
    """
    if len(sys.argv) == 2:
        filename, lang = sys.argv[1], None
    elif len(sys.argv) == 4 and sys.argv[1] == '-l':
        filename, lang = sys.argv[3], sys.argv[2]
    else:
        print('Usage: pytesseract [-l lang] input_file\n', file=sys.stderr)
        return 2

    try:
        with Image.open(filename) as img:
            print(image_to_string(img, lang=lang))
    except TesseractNotFoundError as e:
        print(f'{str(e)}\n', file=sys.stderr)
        return 1
    except OSError as e:
        print(f'{type(e).__name__}: {e}', file=sys.stderr)
        return 1


if __name__ == '__main__':
    # sys.exit instead of the builtin exit(), which is only injected by the
    # site module and is missing with "python -S" or in frozen builds.
    sys.exit(main())