xref: /qemu/tests/avocado/tesseract_utils.py (revision 645198d5)
1bbbd9b6eSWillian Rampazzo# ...
2bbbd9b6eSWillian Rampazzo#
3bbbd9b6eSWillian Rampazzo# Copyright (c) 2019 Philippe Mathieu-Daudé <f4bug@amsat.org>
4bbbd9b6eSWillian Rampazzo#
5bbbd9b6eSWillian Rampazzo# This work is licensed under the terms of the GNU GPL, version 2 or
6bbbd9b6eSWillian Rampazzo# later. See the COPYING file in the top-level directory.
7bbbd9b6eSWillian Rampazzo
8bbbd9b6eSWillian Rampazzoimport re
9bbbd9b6eSWillian Rampazzoimport logging
10bbbd9b6eSWillian Rampazzo
11bbbd9b6eSWillian Rampazzofrom avocado.utils import process
12bbbd9b6eSWillian Rampazzofrom avocado.utils.path import find_command, CmdNotFoundError
13bbbd9b6eSWillian Rampazzo
def tesseract_available(expected_version):
    """
    Tell whether a usable tesseract binary is installed.

    :param expected_version: minimum acceptable major version, as an integer
    :returns: True when ``tesseract --version`` reports a major version
              greater than or equal to ``expected_version``; False when the
              ``tesseract`` command cannot be found or when no major version
              number can be parsed from its output
    """
    try:
        find_command('tesseract')
    except CmdNotFoundError:
        return False
    res = process.run('tesseract --version')
    # The version is the second whitespace-separated token of the banner.
    # Look at stdout first and fall back to stderr when stdout does not
    # carry at least two tokens.
    for output in (res.stdout_text, res.stderr_text):
        tokens = output.split()
        if len(tokens) < 2:
            continue
        # Only the leading integer (major version) matters; tolerate an
        # optional 'v' prefix and any trailing '.minor' or suffix.
        match = re.match(r'v?(\d+)', tokens[1])
        if match is None:
            return False
        # now this is guaranteed to be made of digits only
        return int(match.group(1)) >= expected_version
    return False
31bbbd9b6eSWillian Rampazzo
32bbbd9b6eSWillian Rampazzo
def tesseract_ocr(image_path, tesseract_args='', tesseract_version=3):
    """
    Run tesseract on an image and collect the text it prints.

    :param image_path: path of the image handed to tesseract
    :param tesseract_args: extra command-line arguments, placed before the
                           image path on the command line
    :param tesseract_version: tesseract major version; when it equals 4,
                              ' --oem 1' is appended to the arguments
    :returns: list of the non-empty, whitespace-stripped lines that
              tesseract wrote to its standard output
    """
    log = logging.getLogger('tesseract')
    log.debug(image_path)
    if tesseract_version == 4:
        tesseract_args += ' --oem 1'
    command = "tesseract {} {} stdout".format(tesseract_args, image_path)
    result = process.run(command)
    # Strip every output line and keep only those that still have content.
    stripped = (raw.strip() for raw in result.stdout_text.split('\n'))
    lines = [text for text in stripped if text]
    for text in lines:
        log.debug(text)
    return lines
47