# © 2019 James R. Barlow: github.com/jbarlow83
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.


import logging
import os
import sys
import threading
from enum import IntEnum
from io import IOBase
from pathlib import Path
from typing import AnyStr, BinaryIO, Iterable, Optional, Union
from warnings import warn

from ocrmypdf._logging import PageNumberFilter, TqdmConsole
from ocrmypdf._plugin_manager import get_plugin_manager
from ocrmypdf._sync import run_pipeline
from ocrmypdf._validation import check_options
from ocrmypdf.cli import ArgumentParser, get_parser
from ocrmypdf.helpers import is_iterable_notstr

try:
    import coloredlogs
except ModuleNotFoundError:
    coloredlogs = None


StrPath = Union[Path, AnyStr]
PathOrIO = Union[BinaryIO, StrPath]

# Serializes calls to ocr(); see the comment inside ocr() for the rationale.
_api_lock = threading.Lock()


class Verbosity(IntEnum):
    """Verbosity level for configure_logging."""

    quiet = -1  #: Suppress most messages
    default = 0  #: Default level of logging
    debug = 1  #: Output ocrmypdf debug messages
    debug_all = 2  #: More detailed debugging from ocrmypdf and dependent modules


def configure_logging(
    verbosity: Verbosity,
    *,
    progress_bar_friendly: bool = True,
    manage_root_logger: bool = False,
    plugin_manager=None,
) -> logging.Logger:
    """Set up logging.

    Before calling :func:`ocrmypdf.ocr()`, you can use this function to
    configure logging if you want ocrmypdf's output to look like the ocrmypdf
    command line interface. It will register log handlers, log filters, and
    formatters, configure color logging to standard error, and adjust the log
    levels of third party libraries. Details of this are fine-tuned and subject
    to change. The ``verbosity`` argument is equivalent to the argument
    ``--verbose`` and applies those settings. If you have a wrapper
    script for ocrmypdf and you want it to be very similar to ocrmypdf, use this
    function; if you are using ocrmypdf as part of an application that manages
    its own logging, you probably do not want this function.

    If this function is not called, ocrmypdf will not configure logging, and it
    is up to the caller of ``ocrmypdf.ocr()`` to set up logging as it wishes using
    the Python standard library's logging module. If this function is called,
    the caller may of course make further adjustments to logging.

    Regardless of whether this function is called, ocrmypdf will perform all of
    its logging under the ``"ocrmypdf"`` logging namespace. In addition,
    ocrmypdf imports pdfminer, which logs under ``"pdfminer"``. A library user
    may wish to configure both; note that pdfminer is extremely chatty at the
    log level ``logging.INFO``.

    This function does not set up the ``debug.log`` log file that the command
    line interface does at certain verbosity levels. Applications should configure
    their own debug logging.

    Args:
        verbosity: Verbosity level.
        progress_bar_friendly: If True (the default), install a custom log handler
            that is compatible with progress bars and colored output.
        manage_root_logger: Configure the process's root logger.
        plugin_manager: The plugin manager, used for obtaining the custom log handler.

    Returns:
        The toplevel logger for ocrmypdf (or the root logger, if we are managing it).
    """

    # The empty name selects the root logger.
    prefix = '' if manage_root_logger else 'ocrmypdf'

    # The logger itself passes everything through; filtering by level is done
    # on the console handler below.
    log = logging.getLogger(prefix)
    log.setLevel(logging.DEBUG)

    console = None
    if plugin_manager and progress_bar_friendly:
        console = plugin_manager.hook.get_logging_console()

    # Fall back to a plain stderr handler when no plugin supplied a console.
    if not console:
        console = logging.StreamHandler(stream=sys.stderr)

    if verbosity < 0:
        console.setLevel(logging.ERROR)
    elif verbosity >= 1:
        console.setLevel(logging.DEBUG)
    else:
        console.setLevel(logging.INFO)

    console.addFilter(PageNumberFilter())

    if verbosity >= 2:
        fmt = '%(levelname)7s %(name)s -%(pageno)s %(message)s'
    else:
        fmt = '%(pageno)s%(message)s'

    # Colors are used only when requested, coloredlogs is installed, and the
    # terminal reports that it supports them.
    use_colors = progress_bar_friendly
    if not coloredlogs:
        use_colors = False
    if use_colors:
        if os.name == 'nt':
            use_colors = coloredlogs.enable_ansi_support()
    if use_colors:
        use_colors = coloredlogs.terminal_supports_colors()
    if use_colors:
        formatter = coloredlogs.ColoredFormatter(fmt=fmt)
    else:
        formatter = logging.Formatter(fmt=fmt)

    console.setFormatter(formatter)
    log.addHandler(console)

    # Quieten third party loggers except at the highest verbosity level.
    if verbosity <= 1:
        pdfminer_log = logging.getLogger('pdfminer')
        pdfminer_log.setLevel(logging.ERROR)
        pil_log = logging.getLogger('PIL')
        pil_log.setLevel(logging.INFO)

    if manage_root_logger:
        logging.captureWarnings(True)

    return log


def create_options(
    *, input_file: PathOrIO, output_file: PathOrIO, parser: ArgumentParser, **kwargs
):
    """Translate keyword arguments into parsed command line options.

    Each keyword argument is converted to its ``--command-line`` spelling
    (underscores become hyphens) and the resulting argument list is run
    through ``parser``.

    Args:
        input_file: Path to the input file, or a stream object.
        output_file: Path to the output file, or a stream object.
        parser: The argument parser used to parse the generated command line.
            Its ``_api_mode`` attribute is set to True.
        **kwargs: Options to translate. ``None`` values are skipped. Booleans
            add a bare flag when True and nothing when False. Iterables (other
            than strings) repeat the flag once per element. ``progress_bar``
            and ``plugins`` bypass the parser and are set directly on the
            returned options object.

    Returns:
        The options namespace produced by ``parser.parse_args``.

    Raises:
        TypeError: If a keyword argument has a value of an unsupported type.
    """
    cmdline = []
    deferred = []

    for arg, val in kwargs.items():
        if val is None:
            continue

        # These arguments with special handling for which we bypass
        # argparse
        if arg in {'progress_bar', 'plugins'}:
            deferred.append((arg, val))
            continue

        cmd_style_arg = arg.replace('_', '-')

        # Booleans are special: add only if True, omit for False
        # (this check must precede the int check, since bool is an int)
        if isinstance(val, bool):
            if val:
                cmdline.append(f"--{cmd_style_arg}")
            continue

        if is_iterable_notstr(val):
            for elem in val:
                cmdline.append(f"--{cmd_style_arg}")
                cmdline.append(elem)
            continue

        # We have a parameter
        cmdline.append(f"--{cmd_style_arg}")
        if isinstance(val, (int, float)):
            cmdline.append(str(val))
        elif isinstance(val, str):
            cmdline.append(val)
        elif isinstance(val, Path):
            cmdline.append(str(val))
        else:
            raise TypeError(f"{arg}: {val} ({type(val)})")

    # Streams cannot be placed on a command line; use a placeholder string
    # that is swapped back for the real object after parsing.
    if isinstance(input_file, (BinaryIO, IOBase)):
        cmdline.append('stream://input_file')
    else:
        cmdline.append(os.fspath(input_file))
    if isinstance(output_file, (BinaryIO, IOBase)):
        cmdline.append('stream://output_file')
    else:
        cmdline.append(os.fspath(output_file))

    parser._api_mode = True
    options = parser.parse_args(cmdline)
    for keyword, val in deferred:
        setattr(options, keyword, val)

    if options.input_file == 'stream://input_file':
        options.input_file = input_file
    if options.output_file == 'stream://output_file':
        options.output_file = output_file

    return options


def ocr(  # pylint: disable=unused-argument
    input_file: PathOrIO,
    output_file: PathOrIO,
    *,
    language: Iterable[str] = None,
    image_dpi: int = None,
    output_type=None,
    sidecar: Optional[StrPath] = None,
    jobs: int = None,
    use_threads: bool = None,
    title: str = None,
    author: str = None,
    subject: str = None,
    keywords: str = None,
    rotate_pages: bool = None,
    remove_background: bool = None,
    deskew: bool = None,
    clean: bool = None,
    clean_final: bool = None,
    unpaper_args: str = None,
    oversample: int = None,
    remove_vectors: bool = None,
    threshold: bool = None,
    force_ocr: bool = None,
    skip_text: bool = None,
    redo_ocr: bool = None,
    skip_big: float = None,
    optimize: int = None,
    jpg_quality: int = None,
    png_quality: int = None,
    jbig2_lossy: bool = None,
    jbig2_page_group_size: int = None,
    pages: str = None,
    max_image_mpixels: float = None,
    tesseract_config: Iterable[str] = None,
    tesseract_pagesegmode: int = None,
    tesseract_oem: int = None,
    pdf_renderer=None,
    tesseract_timeout: float = None,
    rotate_pages_threshold: float = None,
    pdfa_image_compression=None,
    user_words: os.PathLike = None,
    user_patterns: os.PathLike = None,
    fast_web_view: float = None,
    plugins: Iterable[StrPath] = None,
    plugin_manager=None,
    keep_temporary_files: bool = None,
    progress_bar: bool = None,
    **kwargs,
):
    """Run OCRmyPDF on one PDF or image.

    For most arguments, see documentation for the equivalent command line parameter.
    A few specific arguments are discussed here:

    Args:
        use_threads: Use worker threads instead of processes. This reduces
            performance but may make debugging easier since it is easier to set
            breakpoints.
        input_file: If a :class:`pathlib.Path`, ``str`` or ``bytes``, this is
            interpreted as file system path to the input file. If the object
            appears to be a readable stream (with methods such as ``.read()``
            and ``.seek()``), the object will be read in its entirety and saved to
            a temporary file. If ``input_file`` is  ``"-"``, standard input will be
            read.
        output_file: If a :class:`pathlib.Path`, ``str`` or ``bytes``, this is
            interpreted as file system path to the output file. If the object
            appears to be a writable stream (with methods such as ``.write()`` and
            ``.seek()``), the output will be written to this stream. If
            ``output_file`` is ``"-"``, the output will be written to ``sys.stdout``
            (provided that standard output does not seem to be a terminal device).
            When a stream is used as output, whether via a writable object or
            ``"-"``, some final validation steps are not performed (we do not read
            back the stream after it is written).
        plugins: A single plugin (``str`` or :class:`pathlib.Path`) or an
            iterable of plugins to load. Mutually exclusive with
            ``plugin_manager``.
        plugin_manager: An already constructed plugin manager to use instead
            of creating one from ``plugins``. Mutually exclusive with
            ``plugins``.

    Raises:
        ValueError: If both ``plugins`` and ``plugin_manager`` are given.
        ocrmypdf.PdfMergeFailedError: If the input PDF is malformed, preventing merging
            with the OCR layer.
        ocrmypdf.MissingDependencyError: If a required dependency program is missing or
            was not found on PATH.
        ocrmypdf.UnsupportedImageFormatError: If the input file type was an image that
            could not be read, or some other file type that is not a PDF.
        ocrmypdf.DpiError: If the input file is an image, but the resolution of the
            image is not credible (allowing it to proceed would cause poor OCR).
        ocrmypdf.OutputFileAccessError: If an attempt to write to the intended output
            file failed.
        ocrmypdf.PriorOcrFoundError: If the input PDF seems to have OCR or digital
            text already, and settings did not tell us to proceed.
        ocrmypdf.InputFileError: Any other problem with the input file.
        ocrmypdf.SubprocessOutputError: Any error related to executing a subprocess.
        ocrmypdf.EncryptedPdfError: If the input PDF is encrypted (password protected).
            OCRmyPDF does not remove passwords.
        ocrmypdf.TesseractConfigError: If Tesseract reported its configuration was not
            valid.

    Returns:
        :class:`ocrmypdf.ExitCode`
    """
    if plugins and plugin_manager:
        raise ValueError("plugins= and plugin_manager are mutually exclusive")

    if not plugins:
        plugins = []
    elif isinstance(plugins, (str, Path)):
        plugins = [plugins]
    else:
        plugins = list(plugins)

    # No new variable names should be assigned until these two steps are run.
    # ``plugin_manager`` is not a command line option: forwarding it would make
    # create_options() raise TypeError, so it is excluded along with ``kwargs``.
    create_options_kwargs = {
        k: v for k, v in locals().items() if k not in ('kwargs', 'plugin_manager')
    }
    create_options_kwargs.update(kwargs)

    parser = get_parser()
    create_options_kwargs['parser'] = parser

    with _api_lock:
        # We can't allow multiple ocrmypdf.ocr() threads to run in parallel, because
        # they might install different plugins, and generally speaking we have areas
        # of code that use global state.

        if not plugin_manager:
            plugin_manager = get_plugin_manager(plugins)
        plugin_manager.hook.add_options(parser=parser)  # pylint: disable=no-member

        if 'verbose' in kwargs:
            warn("ocrmypdf.ocr(verbose=) is ignored. Use ocrmypdf.configure_logging().")

        options = create_options(**create_options_kwargs)
        check_options(options, plugin_manager)
        return run_pipeline(options=options, plugin_manager=plugin_manager, api=True)


__all__ = [
    'PageNumberFilter',
    'TqdmConsole',
    'Verbosity',
    'check_options',
    'configure_logging',
    'create_options',
    'get_parser',
    'get_plugin_manager',
    'ocr',
    'run_pipeline',
]