1# © 2019 James R. Barlow: github.com/jbarlow83
2#
3# This Source Code Form is subject to the terms of the Mozilla Public
4# License, v. 2.0. If a copy of the MPL was not distributed with this
5# file, You can obtain one at http://mozilla.org/MPL/2.0/.
6
7
8import logging
9import os
10import sys
11import threading
12from enum import IntEnum
13from io import IOBase
14from pathlib import Path
15from typing import AnyStr, BinaryIO, Iterable, Optional, Union
16from warnings import warn
17
18from ocrmypdf._logging import PageNumberFilter, TqdmConsole
19from ocrmypdf._plugin_manager import get_plugin_manager
20from ocrmypdf._sync import run_pipeline
21from ocrmypdf._validation import check_options
22from ocrmypdf.cli import ArgumentParser, get_parser
23from ocrmypdf.helpers import is_iterable_notstr
24
25try:
26    import coloredlogs
27except ModuleNotFoundError:
28    coloredlogs = None
29
30
# A filesystem path given either as a pathlib.Path or as a string type (AnyStr).
StrPath = Union[Path, AnyStr]
# Either an open binary stream or a filesystem path.
PathOrIO = Union[BinaryIO, StrPath]

# Held by ocr() while it builds options and runs the pipeline, so that
# concurrent ocr() calls within one process execute one at a time.
_api_lock = threading.Lock()
35
36
class Verbosity(IntEnum):
    """Logging detail levels accepted by :func:`configure_logging`.

    Members are integers, so they can be compared numerically; a larger
    value requests more output.
    """

    #: Suppress most messages
    quiet = -1
    #: Default level of logging
    default = 0
    #: Output ocrmypdf debug messages
    debug = 1
    #: More detailed debugging from ocrmypdf and dependent modules
    debug_all = 2
44
45
def configure_logging(
    verbosity: Verbosity,
    *,
    progress_bar_friendly: bool = True,
    manage_root_logger: bool = False,
    plugin_manager=None,
):
    """Install console logging that mimics the ocrmypdf command line tool.

    Call this before :func:`ocrmypdf.ocr()` when you are writing a wrapper
    script and want its output to resemble the ocrmypdf command line
    interface. A console handler with a page-number filter and a formatter
    (colored when possible) is attached to the chosen logger, and the log
    levels of some third party libraries are adjusted. The exact details are
    fine-tuned and subject to change. ``verbosity`` corresponds to the
    ``--verbose`` command line argument.

    Applications that manage their own logging probably should not call this
    function. When it is not called, ocrmypdf configures no logging at all and
    the caller of ``ocrmypdf.ocr()`` is free to set up the standard library's
    logging module however it likes. Callers that do use this function may
    still adjust logging further afterwards.

    Whether or not this function is called, ocrmypdf logs everything under the
    ``"ocrmypdf"`` namespace. ocrmypdf also imports pdfminer, which logs under
    ``"pdfminer"`` and is extremely chatty at ``logging.INFO``; library users
    may want to configure both.

    The ``debug.log`` file that the command line interface creates at certain
    verbosity levels is *not* set up here. Applications should configure their
    own debug logging.

    Args:
        verbosity: Verbosity level.
        progress_bar_friendly: If True (the default), ask the plugin manager
            (when one is given) for a log handler that is compatible with
            progress bars, and allow colored output.
        manage_root_logger: Configure the process's root logger instead of
            the ``"ocrmypdf"`` logger, and capture warnings into logging.
        plugin_manager: The plugin manager, used for obtaining the custom log
            handler.

    Returns:
        The toplevel logger for ocrmypdf (or the root logger, if we are
        managing it).
    """

    logger_name = '' if manage_root_logger else 'ocrmypdf'
    log = logging.getLogger(logger_name)
    # The logger itself passes everything; the console handler does the
    # filtering by level.
    log.setLevel(logging.DEBUG)

    # Prefer a handler supplied by a plugin; fall back to plain stderr.
    console = (
        plugin_manager.hook.get_logging_console()
        if plugin_manager and progress_bar_friendly
        else None
    )
    if not console:
        console = logging.StreamHandler(stream=sys.stderr)

    if verbosity >= 1:
        console_level = logging.DEBUG
    elif verbosity < 0:
        console_level = logging.ERROR
    else:
        console_level = logging.INFO
    console.setLevel(console_level)
    console.addFilter(PageNumberFilter())

    fmt = (
        '%(levelname)7s %(name)s -%(pageno)s %(message)s'
        if verbosity >= 2
        else '%(pageno)s%(message)s'
    )

    # Color needs all of: the caller allowing it, coloredlogs being installed,
    # ANSI support being enabled (checked on Windows only), and a terminal
    # that supports colors.
    colorize = progress_bar_friendly and coloredlogs
    if colorize and os.name == 'nt':
        colorize = coloredlogs.enable_ansi_support()
    if colorize:
        colorize = coloredlogs.terminal_supports_colors()

    formatter_factory = coloredlogs.ColoredFormatter if colorize else logging.Formatter
    console.setFormatter(formatter_factory(fmt=fmt))
    log.addHandler(console)

    if verbosity <= 1:
        # Below the most detailed verbosity, quieten third party loggers.
        for library, level in (('pdfminer', logging.ERROR), ('PIL', logging.INFO)):
            logging.getLogger(library).setLevel(level)

    if manage_root_logger:
        logging.captureWarnings(True)

    return log
144
145
def create_options(
    *, input_file: PathOrIO, output_file: PathOrIO, parser: ArgumentParser, **kwargs
):
    """Convert API keyword arguments into a parsed options namespace.

    The keyword arguments are rewritten as an equivalent command line and
    parsed with ``parser``, so the API and the command line interface share one
    set of option definitions.

    Args:
        input_file: Input path, or a binary stream. A stream cannot be placed
            on a command line, so a placeholder token is parsed in its place
            and the stream object is put back on the namespace afterwards.
        output_file: Output path or binary stream, handled like ``input_file``.
        parser: The argument parser to use. Its ``_api_mode`` attribute is set
            to ``True`` before parsing.
        **kwargs: Options, named as the command line options are but with
            underscores instead of dashes. Values are handled as follows:

            * ``None``: the option is omitted.
            * ``progress_bar`` and ``plugins``: not parsed; assigned directly
              onto the resulting namespace.
            * ``bool``: the flag is emitted only when the value is ``True``.
            * ``int``, ``float``, ``str`` or any :class:`os.PathLike`: emitted
              as the flag followed by the value as a string.
            * any other iterable: the flag is repeated once per element.

    Returns:
        The namespace produced by ``parser.parse_args()``.

    Raises:
        TypeError: If a keyword argument has a value of an unsupported type.
    """
    cmdline = []
    deferred = []

    for arg, val in kwargs.items():
        if val is None:
            continue

        # These arguments get special handling for which we bypass argparse
        if arg in {'progress_bar', 'plugins'}:
            deferred.append((arg, val))
            continue

        flag = f"--{arg.replace('_', '-')}"

        # Booleans are special: add only if True, omit for False. This check
        # must precede the int check because bool is a subclass of int.
        if isinstance(val, bool):
            if val:
                cmdline.append(flag)
            continue

        if isinstance(val, (int, float)):
            cmdline.extend((flag, str(val)))
        elif isinstance(val, str):
            cmdline.extend((flag, val))
        elif isinstance(val, os.PathLike):
            # Accept every path-like object, not only pathlib.Path;
            # os.fsdecode() always yields a str.
            cmdline.extend((flag, os.fsdecode(val)))
        elif is_iterable_notstr(val):
            for elem in val:
                cmdline.extend((flag, elem))
        else:
            raise TypeError(f"{arg}: {val} ({type(val)})")

    if isinstance(input_file, (BinaryIO, IOBase)):
        cmdline.append('stream://input_file')
    else:
        cmdline.append(os.fspath(input_file))
    if isinstance(output_file, (BinaryIO, IOBase)):
        cmdline.append('stream://output_file')
    else:
        cmdline.append(os.fspath(output_file))

    parser._api_mode = True
    options = parser.parse_args(cmdline)
    for keyword, val in deferred:
        setattr(options, keyword, val)

    # Swap the placeholder tokens back for the actual stream objects.
    if options.input_file == 'stream://input_file':
        options.input_file = input_file
    if options.output_file == 'stream://output_file':
        options.output_file = output_file

    return options
207
208
def ocr(  # pylint: disable=unused-argument
    input_file: PathOrIO,
    output_file: PathOrIO,
    *,
    language: Iterable[str] = None,
    image_dpi: int = None,
    output_type=None,
    sidecar: Optional[StrPath] = None,
    jobs: int = None,
    use_threads: bool = None,
    title: str = None,
    author: str = None,
    subject: str = None,
    keywords: str = None,
    rotate_pages: bool = None,
    remove_background: bool = None,
    deskew: bool = None,
    clean: bool = None,
    clean_final: bool = None,
    unpaper_args: str = None,
    oversample: int = None,
    remove_vectors: bool = None,
    threshold: bool = None,
    force_ocr: bool = None,
    skip_text: bool = None,
    redo_ocr: bool = None,
    skip_big: float = None,
    optimize: int = None,
    jpg_quality: int = None,
    png_quality: int = None,
    jbig2_lossy: bool = None,
    jbig2_page_group_size: int = None,
    pages: str = None,
    max_image_mpixels: float = None,
    tesseract_config: Iterable[str] = None,
    tesseract_pagesegmode: int = None,
    tesseract_oem: int = None,
    pdf_renderer=None,
    tesseract_timeout: float = None,
    rotate_pages_threshold: float = None,
    pdfa_image_compression=None,
    user_words: os.PathLike = None,
    user_patterns: os.PathLike = None,
    fast_web_view: float = None,
    plugins: Iterable[StrPath] = None,
    plugin_manager=None,
    keep_temporary_files: bool = None,
    progress_bar: bool = None,
    **kwargs,
):
    """Run OCRmyPDF on one PDF or image.

    For most arguments, see documentation for the equivalent command line parameter.
    A few specific arguments are discussed here:

    Args:
        use_threads: Use worker threads instead of processes. This reduces
            performance but may make debugging easier since it is easier to set
            breakpoints.
        input_file: If a :class:`pathlib.Path`, ``str`` or ``bytes``, this is
            interpreted as file system path to the input file. If the object
            appears to be a readable stream (with methods such as ``.read()``
            and ``.seek()``), the object will be read in its entirety and saved to
            a temporary file. If ``input_file`` is  ``"-"``, standard input will be
            read.
        output_file: If a :class:`pathlib.Path`, ``str`` or ``bytes``, this is
            interpreted as file system path to the output file. If the object
            appears to be a writable stream (with methods such as ``.write()`` and
            ``.seek()``), the output will be written to this stream. If
            ``output_file`` is ``"-"``, the output will be written to ``sys.stdout``
            (provided that standard output does not seem to be a terminal device).
            When a stream is used as output, whether via a writable object or
            ``"-"``, some final validation steps are not performed (we do not read
            back the stream after it is written).
        plugins: A single plugin (``str`` or :class:`pathlib.Path`) or an
            iterable of plugins, used to create a plugin manager. Mutually
            exclusive with ``plugin_manager``.
        plugin_manager: An existing plugin manager to use instead of creating
            one from ``plugins``.
        **kwargs: Additional options, forwarded as if they were command line
            parameters. ``verbose`` is ignored with a warning; use
            :func:`ocrmypdf.configure_logging` instead.

    Raises:
        ValueError: If both ``plugins`` and ``plugin_manager`` are given.
        ocrmypdf.PdfMergeFailedError: If the input PDF is malformed, preventing merging
            with the OCR layer.
        ocrmypdf.MissingDependencyError: If a required dependency program is missing or
            was not found on PATH.
        ocrmypdf.UnsupportedImageFormatError: If the input file type was an image that
            could not be read, or some other file type that is not a PDF.
        ocrmypdf.DpiError: If the input file is an image, but the resolution of the
            image is not credible (allowing it to proceed would cause poor OCR).
        ocrmypdf.OutputFileAccessError: If an attempt to write to the intended output
            file failed.
        ocrmypdf.PriorOcrFoundError: If the input PDF seems to have OCR or digital
            text already, and settings did not tell us to proceed.
        ocrmypdf.InputFileError: Any other problem with the input file.
        ocrmypdf.SubprocessOutputError: Any error related to executing a subprocess.
        ocrmypdf.EncryptedPdfError: If the input PDF is encrypted (password protected).
            OCRmyPDF does not remove passwords.
        ocrmypdf.TesseractConfigError: If Tesseract reported its configuration was not
            valid.

    Returns:
        :class:`ocrmypdf.ExitCode`
    """
    if plugins and plugin_manager:
        raise ValueError("plugins= and plugin_manager are mutually exclusive")

    if not plugins:
        plugins = []
    elif isinstance(plugins, (str, Path)):
        plugins = [plugins]
    else:
        plugins = list(plugins)

    # No new variable names should be assigned until these two steps are run,
    # because locals() is used to capture this function's arguments.
    # plugin_manager is left out: it is consumed by this function and is not a
    # command line option, so create_options() would reject it with TypeError.
    create_options_kwargs = {
        k: v for k, v in locals().items() if k not in ('kwargs', 'plugin_manager')
    }
    create_options_kwargs.update(kwargs)

    parser = get_parser()
    create_options_kwargs['parser'] = parser

    with _api_lock:
        # We can't allow multiple ocrmypdf.ocr() threads to run in parallel, because
        # they might install different plugins, and generally speaking we have areas
        # of code that use global state.

        if not plugin_manager:
            plugin_manager = get_plugin_manager(plugins)
        # Let plugins register their own options before the arguments are parsed.
        plugin_manager.hook.add_options(parser=parser)  # pylint: disable=no-member

        if 'verbose' in kwargs:
            warn("ocrmypdf.ocr(verbose=) is ignored. Use ocrmypdf.configure_logging().")

        options = create_options(**create_options_kwargs)
        check_options(options, plugin_manager)
        return run_pipeline(options=options, plugin_manager=plugin_manager, api=True)
338
339
# Public names of this module; this list determines what a star-import of
# the module exports. Besides the definitions in this file, it re-exports
# several names that are imported at the top of the file.
__all__ = [
    'PageNumberFilter',
    'TqdmConsole',
    'Verbosity',
    'check_options',
    'configure_logging',
    'create_options',
    'get_parser',
    'get_plugin_manager',
    'ocr',
    'run_pipeline',
]
352