# © 2018 James R. Barlow: github.com/jbarlow83
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.


import re
from math import copysign
from pathlib import Path
from unittest.mock import patch

import pdfminer
import pdfminer.encodingdb
import pdfminer.pdfdevice
import pdfminer.pdfinterp
from pdfminer.converter import PDFLayoutAnalyzer
from pdfminer.layout import LAParams, LTChar, LTPage, LTTextBox
from pdfminer.pdfdocument import PDFTextExtractionNotAllowed
from pdfminer.pdffont import PDFSimpleFont, PDFUnicodeNotDefined
from pdfminer.pdfpage import PDFPage
from pdfminer.utils import bbox2str, matrix2str

from ocrmypdf.exceptions import EncryptedPdfError, InputFileError

STRIP_NAME = re.compile(r'[0-9]+')


# Keep a reference to the unpatched constructor so the replacement can
# delegate to it.
original_PDFSimpleFont_init = PDFSimpleFont.__init__


def PDFSimpleFont__init__(self, descriptor, widths, spec):
    """Replacement for ``PDFSimpleFont.__init__`` that limits Unicode guessing.

    Runs the original constructor, then clears ``cid2unicode`` when the font
    has neither a Unicode map nor an ``'Encoding'`` entry in ``spec``.
    """
    # Font encoding is specified either by a name of
    # built-in encoding or a dictionary that describes
    # the differences.
    original_PDFSimpleFont_init(self, descriptor, widths, spec)
    # pdfminer is incorrect. If there is no ToUnicode and no Encoding, do not
    # assume Unicode conversion is possible. RM 9.10.2
    if not self.unicode_map and 'Encoding' not in spec:
        self.cid2unicode = {}


# Module-level side effect: every PDFSimpleFont created after import uses the
# patched constructor.
PDFSimpleFont.__init__ = PDFSimpleFont__init__

#
# pdfminer patches when creator is PScript5.dll
#


def PDFType3Font__PScript5_get_height(self):
    """Return the font height, carrying the sign of ``self.vscale``.

    The height is taken from the bounding box; if that is zero, it falls back
    to ``ascent - descent``.
    """
    h = self.bbox[3] - self.bbox[1]
    if h == 0:
        h = self.ascent - self.descent
    return h * copysign(1.0, self.vscale)


def PDFType3Font__PScript5_get_descent(self):
    """Return ``self.descent`` carrying the sign of ``self.vscale``."""
    return self.descent * copysign(1.0, self.vscale)


def PDFType3Font__PScript5_get_ascent(self):
    """Return ``self.ascent`` carrying the sign of ``self.vscale``."""
    return self.ascent * copysign(1.0, self.vscale)


class LTStateAwareChar(LTChar):
    """A subclass of LTChar that tracks text render mode at time of drawing"""

    __slots__ = (
        'rendermode',
        '_text',
        'matrix',
        'fontname',
        'adv',
        'upright',
        'size',
        'width',
        'height',
        'bbox',
        'x0',
        'x1',
        'y0',
        'y1',
    )

    def __init__(
        self,
        matrix,
        font,
        fontsize,
        scaling,
        rise,
        text,
        textwidth,
        textdisp,
        ncs,
        graphicstate,
        textstate,
    ):
        """Build the character as LTChar does and record the render mode.

        All arguments except ``textstate`` are forwarded unchanged to
        ``LTChar.__init__``; ``textstate.render`` is stored as
        ``self.rendermode``.
        """
        super().__init__(
            matrix,
            font,
            fontsize,
            scaling,
            rise,
            text,
            textwidth,
            textdisp,
            ncs,
            graphicstate,
        )
        self.rendermode = textstate.render

    def is_compatible(self, obj):
        """Check if characters can be combined into a textline

        We consider characters compatible if:
            - the Unicode mapping is known, and both have the same render mode
            - the Unicode mapping is unknown but both are part of the same font

        Returns False (rather than raising) when ``obj`` lacks the ``_text``
        or ``rendermode`` attributes, or when exactly one of the two
        characters has a known Unicode mapping.
        """
        # pylint: disable=protected-access
        try:
            other_text = obj._text
            other_mode = obj.rendermode
        except AttributeError:
            # Not a state-aware character; never merge with it.
            return False

        if self.rendermode != other_mode:
            return False

        self_mapped = isinstance(self._text, str)
        other_mapped = isinstance(other_text, str)
        if self_mapped and other_mapped:
            return True
        if self_mapped or other_mapped:
            # One mapped, one unmapped. Decide explicitly instead of relying
            # on tuple-unpacking a string to fail (a two-character string
            # would unpack successfully).
            return False

        # Both unmapped: _text is the (fontname, cid) pair produced by
        # TextPositionTracker.handle_undefined_char.
        try:
            font0, _ = self._text
            font1, _ = other_text
        except ValueError:
            return False
        return font0 == font1

    def get_text(self):
        """Return the character's text, or U+FFFD when it has no mapping."""
        if isinstance(self._text, tuple):
            return '\ufffd'  # standard 'Unknown symbol'
        return self._text

    def __repr__(self):
        return '<{} {} matrix={} rendermode={!r} font={!r} adv={} text={!r}>'.format(
            self.__class__.__name__,
            bbox2str(self.bbox),
            matrix2str(self.matrix),
            self.rendermode,
            self.fontname,
            self.adv,
            self.get_text(),
        )


class TextPositionTracker(PDFLayoutAnalyzer):
    """A page layout analyzer that pays attention to text visibility"""

    def __init__(self, rsrcmgr, pageno=1, laparams=None):
        super().__init__(rsrcmgr, pageno, laparams)
        self.textstate = None  # copy of the text state for the current string
        self.result = None  # last LTPage handed to receive_layout()
        self.cur_item = None  # not defined in pdfminer code as it should be

    def begin_page(self, page, ctm):
        """Start a page, then replace cur_item with an LTPage on the mediabox."""
        super().begin_page(page, ctm)
        self.cur_item = LTPage(self.pageno, page.mediabox)

    def end_page(self, page):
        """Finish the page and pass the resulting LTPage to receive_layout().

        Layout analysis runs only when ``laparams`` was supplied. ``pageno``
        is incremented before the page is handed off.
        """
        assert not self._stack, str(len(self._stack))
        assert isinstance(self.cur_item, LTPage), str(type(self.cur_item))
        if self.laparams is not None:
            self.cur_item.analyze(self.laparams)
        self.pageno += 1
        self.receive_layout(self.cur_item)

    def render_string(self, textstate, seq, ncs, graphicstate):
        """Keep a copy of the text state so render_char() can read it."""
        self.textstate = textstate.copy()
        super().render_string(self.textstate, seq, ncs, graphicstate)

    def render_char(
        self, matrix, font, fontsize, scaling, rise, cid, ncs, graphicstate
    ):
        """Add one LTStateAwareChar to the current page and return its advance.

        When the font cannot map ``cid`` to Unicode, the text is whatever
        handle_undefined_char() returns instead of a string.
        """
        try:
            text = font.to_unichr(cid)
            assert isinstance(text, str), str(type(text))
        except PDFUnicodeNotDefined:
            text = self.handle_undefined_char(font, cid)
        textwidth = font.char_width(cid)
        textdisp = font.char_disp(cid)
        item = LTStateAwareChar(
            matrix,
            font,
            fontsize,
            scaling,
            rise,
            text,
            textwidth,
            textdisp,
            ncs,
            graphicstate,
            self.textstate,
        )
        self.cur_item.add(item)
        return item.adv

    def handle_undefined_char(self, font, cid):
        """Return a ``(fontname, cid)`` pair standing in for an unmapped glyph."""
        # log.info('undefined: %r, %r', font, cid)
        return (font.fontname, cid)

    def receive_layout(self, ltpage):
        """Store the finished page so get_result() can return it."""
        self.result = ltpage

    def get_result(self):
        """Return the most recently stored page, or None if there is none."""
        return self.result


def get_page_analysis(infile, pageno, pscript5_mode):
    """Run pdfminer layout analysis on one page of a PDF.

    Args:
        infile: Path of the PDF; opened in binary mode via ``Path(infile)``.
        pageno: Page number to analyze, counting from 0.
        pscript5_mode: When true, the PScript5 variants of
            ``PDFType3Font.get_ascent``, ``get_descent`` and ``get_height``
            are patched in while the page is processed.

    Returns:
        The value of ``TextPositionTracker.get_result()`` after the page has
        been processed.

    Raises:
        InputFileError: pdfminer yielded no page for ``pageno``.
        EncryptedPdfError: pdfminer raised ``PDFTextExtractionNotAllowed``.
    """
    rman = pdfminer.pdfinterp.PDFResourceManager(caching=True)
    # String comparison of the version; presumably relies on pdfminer.six
    # using date-stamped version strings - TODO confirm.
    if pdfminer.__version__ < '20200402':
        # Workaround for https://github.com/pdfminer/pdfminer.six/issues/395
        disable_boxes_flow = 2
    else:
        disable_boxes_flow = None
    dev = TextPositionTracker(
        rman,
        laparams=LAParams(
            all_texts=True, detect_vertical=True, boxes_flow=disable_boxes_flow
        ),
    )
    interp = pdfminer.pdfinterp.PDFPageInterpreter(rman, dev)

    patcher = None
    if pscript5_mode:
        patcher = patch.multiple(
            'pdfminer.pdffont.PDFType3Font',
            spec=True,
            get_ascent=PDFType3Font__PScript5_get_ascent,
            get_descent=PDFType3Font__PScript5_get_descent,
            get_height=PDFType3Font__PScript5_get_height,
        )
        patcher.start()

    try:
        with Path(infile).open('rb') as f:
            page_iter = PDFPage.get_pages(f, pagenos=[pageno], maxpages=0)
            page = next(page_iter, None)
            if page is None:
                raise InputFileError(
                    f"pdfminer could not process page {pageno} (counting from 0)."
                )
            interp.process_page(page)
    except PDFTextExtractionNotAllowed as e:
        raise EncryptedPdfError() from e
    finally:
        # Always undo the PDFType3Font patch, even when processing fails.
        if patcher is not None:
            patcher.stop()

    return dev.get_result()


def get_text_boxes(obj):
    """Yield every LTTextBox found while recursively iterating ``obj``.

    Children that are not text boxes are descended into when they are
    iterable and skipped otherwise. Only the iterability probe is guarded, so
    a TypeError raised deeper in the traversal propagates to the caller
    instead of silently cutting the results short.
    """
    for child in obj:
        if isinstance(child, LTTextBox):
            yield child
            continue
        try:
            grandchildren = iter(child)
        except TypeError:
            # Leaf object with no children.
            continue
        yield from get_text_boxes(grandchildren)