# ...
#
# Copyright (c) 2019 Philippe Mathieu-Daudé <f4bug@amsat.org>
#
# This work is licensed under the terms of the GNU GPL, version 2 or
# later. See the COPYING file in the top-level directory.

import re
import logging

from avocado.utils import process
from avocado.utils.path import find_command, CmdNotFoundError


def _tesseract_major_version(banner):
    """Return the major version found in a version banner, or None.

    :param banner: text printed by ``tesseract --version``
    :returns: the major version as an int, or None when *banner* does not
              contain a ``tesseract <version>`` marker
    """
    # Accept an optional 'v' prefix ("tesseract v5.0.0") in addition to
    # the plain form ("tesseract 4.1.1").
    found = re.search(r'tesseract\s+v?(\d+)', banner)
    if found is None:
        return None
    return int(found.group(1))


def tesseract_available(expected_version):
    """Tell whether a recent enough tesseract binary is installed.

    :param expected_version: minimum acceptable major version (integer)
    :returns: True when ``tesseract`` is found in the PATH and the major
              version it reports is at least *expected_version*; False
              when the command is missing or no version can be parsed
              from its output
    """
    try:
        find_command('tesseract')
    except CmdNotFoundError:
        return False

    res = process.run('tesseract --version')
    # The banner is looked up on stdout first, then on stderr, since the
    # stream used is not the same for every tesseract release.
    for banner in (res.stdout_text, res.stderr_text):
        major = _tesseract_major_version(banner)
        if major is not None:
            return major >= expected_version
    return False


def tesseract_ocr(image_path, tesseract_args='', tesseract_version=3):
    """Run tesseract on an image and return the recognized text lines.

    :param image_path: path of the image handed to tesseract
    :param tesseract_args: extra command line arguments placed before
                           the image path
    :param tesseract_version: major version of the installed tesseract;
                              when it is exactly 4, ``--oem 1`` is
                              appended to *tesseract_args*
    :returns: list of the non-empty output lines, stripped of leading and
              trailing whitespace
    """
    console_logger = logging.getLogger('tesseract')
    console_logger.debug(image_path)
    if tesseract_version == 4:
        tesseract_args += ' --oem 1'
    proc = process.run("tesseract {} {} stdout".format(tesseract_args,
                                                       image_path))
    lines = []
    for line in proc.stdout_text.split('\n'):
        sline = line.strip()
        if sline:
            console_logger.debug(sline)
            lines.append(sline)
    return lines