1# © 2018 James R. Barlow: github.com/jbarlow83
2#
3# This Source Code Form is subject to the terms of the Mozilla Public
4# License, v. 2.0. If a copy of the MPL was not distributed with this
5# file, You can obtain one at http://mozilla.org/MPL/2.0/.
6
7
8import re
9from math import copysign
10from pathlib import Path
11from unittest.mock import patch
12
13import pdfminer
14import pdfminer.encodingdb
15import pdfminer.pdfdevice
16import pdfminer.pdfinterp
17from pdfminer.converter import PDFLayoutAnalyzer
18from pdfminer.layout import LAParams, LTChar, LTPage, LTTextBox
19from pdfminer.pdfdocument import PDFTextExtractionNotAllowed
20from pdfminer.pdffont import PDFSimpleFont, PDFUnicodeNotDefined
21from pdfminer.pdfpage import PDFPage
22from pdfminer.utils import bbox2str, matrix2str
23
24from ocrmypdf.exceptions import EncryptedPdfError, InputFileError
25
# Compiled pattern matching a run of one or more ASCII digits (0-9).
# NOTE(review): not referenced in the visible part of this module; presumably
# used by code that imports it -- confirm before removing.
STRIP_NAME = re.compile(r'[0-9]+')
27
28
# Keep a handle on pdfminer's own initializer so the wrapper below can call it.
original_PDFSimpleFont_init = PDFSimpleFont.__init__


def PDFSimpleFont__init__(self, descriptor, widths, spec):
    """Initialize the font via pdfminer, then correct its Unicode fallback.

    The stock ``PDFSimpleFont.__init__`` runs first with the same arguments.
    Font encoding is specified either by the name of a built-in encoding or
    by a dictionary that describes the differences; when the font ends up
    with no ``unicode_map`` and ``spec`` has no ``'Encoding'`` entry, the
    ``cid2unicode`` table is replaced with an empty dict.
    """
    original_PDFSimpleFont_init(self, descriptor, widths, spec)
    if self.unicode_map or 'Encoding' in spec:
        return
    # pdfminer is incorrect. If there is no ToUnicode and no Encoding, do not
    # assume Unicode conversion is possible. RM 9.10.2
    self.cid2unicode = {}


# Install the wrapper in place of pdfminer's initializer (module import side effect).
PDFSimpleFont.__init__ = PDFSimpleFont__init__
45
#
# Replacement PDFType3Font methods; get_page_analysis() patches them into
# pdfminer when the PDF creator is PScript5.dll (pscript5_mode)
#
49
50
def PDFType3Font__PScript5_get_height(self):
    """Return the font height, carrying the sign of ``self.vscale``.

    The height is the vertical extent of ``self.bbox`` (index 3 minus
    index 1). When that extent is zero, ``self.ascent - self.descent`` is
    used instead. The magnitude is then multiplied by +1.0 or -1.0 according
    to the sign of ``self.vscale``.
    """
    bbox_height = self.bbox[3] - self.bbox[1]
    extent = bbox_height if bbox_height != 0 else self.ascent - self.descent
    direction = copysign(1.0, self.vscale)
    return extent * direction
56
57
def PDFType3Font__PScript5_get_descent(self):
    """Return ``self.descent`` multiplied by the sign (+1.0/-1.0) of ``self.vscale``."""
    descent = self.descent
    direction = copysign(1.0, self.vscale)
    return descent * direction
60
61
def PDFType3Font__PScript5_get_ascent(self):
    """Return ``self.ascent`` multiplied by the sign (+1.0/-1.0) of ``self.vscale``."""
    ascent = self.ascent
    direction = copysign(1.0, self.vscale)
    return ascent * direction
64
65
class LTStateAwareChar(LTChar):
    """A subclass of LTChar that tracks text render mode at time of drawing

    ``_text`` is either a ``str`` (the character's Unicode mapping is known) or
    a ``(fontname, cid)`` tuple, as returned by
    ``TextPositionTracker.handle_undefined_char``, when no mapping is known.
    """

    # 'rendermode' is assigned in __init__ below.
    # NOTE(review): the remaining names are presumably the attributes assigned
    # by the pdfminer base classes -- confirm against the pdfminer version used.
    __slots__ = (
        'rendermode',
        '_text',
        'matrix',
        'fontname',
        'adv',
        'upright',
        'size',
        'width',
        'height',
        'bbox',
        'x0',
        'x1',
        'y0',
        'y1',
    )

    def __init__(
        self,
        matrix,
        font,
        fontsize,
        scaling,
        rise,
        text,
        textwidth,
        textdisp,
        ncs,
        graphicstate,
        textstate,
    ):
        """Build the character and remember the render mode it was drawn with.

        Every argument except ``textstate`` is forwarded unchanged, in the
        same order, to ``LTChar.__init__``. Only ``textstate.render`` is read
        from ``textstate``; it is stored as ``self.rendermode``.
        """
        super().__init__(
            matrix,
            font,
            fontsize,
            scaling,
            rise,
            text,
            textwidth,
            textdisp,
            ncs,
            graphicstate,
        )
        self.rendermode = textstate.render

    def is_compatible(self, obj):
        """Check if characters can be combined into a textline

        We consider characters compatible if both have the same render mode
        and either:
            - the Unicode mapping of both is known, or
            - the Unicode mapping of both is unknown but both are part of the
              same font

        A character with a known mapping is never compatible with one whose
        mapping is unknown. An object that lacks ``_text`` or ``rendermode``
        is never compatible; this returns False rather than raising.
        """
        # pylint: disable=protected-access
        try:
            other_text = obj._text
            other_rendermode = obj.rendermode
        except AttributeError:
            return False
        if self.rendermode != other_rendermode:
            return False

        own_text = self._text
        if isinstance(own_text, str) and isinstance(other_text, str):
            return True
        # Require real (fontname, cid) tuples on both sides. Unpacking alone is
        # not a safe test because a two-character str unpacks as well.
        if not (isinstance(own_text, tuple) and isinstance(other_text, tuple)):
            return False
        if len(own_text) != 2 or len(other_text) != 2:
            return False
        return own_text[0] == other_text[0]

    def get_text(self):
        """Return the character's text, or U+FFFD when ``_text`` is a tuple."""
        if isinstance(self._text, tuple):
            return '\ufffd'  # standard 'Unknown symbol'
        return self._text

    def __repr__(self):
        """Return a debug string with bbox, matrix, render mode, font, adv and text."""
        return (
            f'<{self.__class__.__name__} {bbox2str(self.bbox)} '
            f'matrix={matrix2str(self.matrix)} rendermode={self.rendermode!r} '
            f'font={self.fontname!r} adv={self.adv} text={self.get_text()!r}>'
        )
147
148
class TextPositionTracker(PDFLayoutAnalyzer):
    """A page layout analyzer that pays attention to text visibility

    Each string's text state is snapshotted when it is rendered, and every
    character is emitted as an ``LTStateAwareChar`` carrying that snapshot.
    The finished page layout is kept and exposed through ``get_result()``.
    """

    def __init__(self, rsrcmgr, pageno=1, laparams=None):
        """Set up the base analyzer and clear this tracker's own state."""
        super().__init__(rsrcmgr, pageno, laparams)
        self.cur_item = None  # not defined in pdfminer code as it should be
        self.result = None
        self.textstate = None

    def begin_page(self, page, ctm):
        """Start a page, then make ``cur_item`` an LTPage built from ``page.mediabox``."""
        super().begin_page(page, ctm)
        page_layout = LTPage(self.pageno, page.mediabox)
        self.cur_item = page_layout

    def end_page(self, page):
        """Finish the page: analyze it if ``laparams`` is set, then publish it.

        Increments ``pageno`` and passes the page layout to ``receive_layout``.
        """
        assert not self._stack, str(len(self._stack))
        page_layout = self.cur_item
        assert isinstance(page_layout, LTPage), str(type(page_layout))
        params = self.laparams
        if params is not None:
            page_layout.analyze(params)
        self.pageno += 1
        self.receive_layout(page_layout)

    def render_string(self, textstate, seq, ncs, graphicstate):
        """Store a copy of ``textstate`` and render the string using that copy."""
        state_snapshot = textstate.copy()
        self.textstate = state_snapshot
        super().render_string(state_snapshot, seq, ncs, graphicstate)

    def render_char(
        self, matrix, font, fontsize, scaling, rise, cid, ncs, graphicstate
    ):
        """Add one character to the current page item and return its ``adv``.

        When the font raises ``PDFUnicodeNotDefined`` for ``cid``, the value
        from ``handle_undefined_char`` is used as the character's text.
        """
        try:
            text = font.to_unichr(cid)
        except PDFUnicodeNotDefined:
            text = self.handle_undefined_char(font, cid)
        else:
            assert isinstance(text, str), str(type(text))
        char = LTStateAwareChar(
            matrix,
            font,
            fontsize,
            scaling,
            rise,
            text,
            font.char_width(cid),
            font.char_disp(cid),
            ncs,
            graphicstate,
            self.textstate,
        )
        self.cur_item.add(char)
        return char.adv

    def handle_undefined_char(self, font, cid):
        """Return ``(font.fontname, cid)`` as a stand-in for an unmapped character."""
        fontname = font.fontname
        return (fontname, cid)

    def receive_layout(self, ltpage):
        """Remember ``ltpage`` as the result."""
        self.result = ltpage

    def get_result(self):
        """Return the layout stored by ``receive_layout`` (None until then)."""
        return self.result
209
210
def get_page_analysis(infile, pageno, pscript5_mode):
    """Interpret one page of a PDF with pdfminer and return its layout.

    Args:
        infile: Path of the PDF file; opened in binary mode.
        pageno: Page to process (counting from 0, per the error message).
        pscript5_mode: When truthy, ``PDFType3Font``'s ``get_ascent``,
            ``get_descent`` and ``get_height`` are replaced by the PScript5
            variants for the duration of the call.

    Returns:
        Whatever the ``TextPositionTracker`` device reports via
        ``get_result()`` after the page has been processed.

    Raises:
        InputFileError: pdfminer yielded no page for ``pageno``.
        EncryptedPdfError: pdfminer raised ``PDFTextExtractionNotAllowed``.
    """
    resource_manager = pdfminer.pdfinterp.PDFResourceManager(caching=True)
    # Workaround for https://github.com/pdfminer/pdfminer.six/issues/395
    boxes_flow = 2 if pdfminer.__version__ < '20200402' else None
    layout_params = LAParams(
        all_texts=True, detect_vertical=True, boxes_flow=boxes_flow
    )
    tracker = TextPositionTracker(resource_manager, laparams=layout_params)
    interpreter = pdfminer.pdfinterp.PDFPageInterpreter(resource_manager, tracker)

    type3_patch = None
    if pscript5_mode:
        replacements = {
            'get_ascent': PDFType3Font__PScript5_get_ascent,
            'get_descent': PDFType3Font__PScript5_get_descent,
            'get_height': PDFType3Font__PScript5_get_height,
        }
        type3_patch = patch.multiple(
            'pdfminer.pdffont.PDFType3Font', spec=True, **replacements
        )
        type3_patch.start()

    try:
        with Path(infile).open('rb') as stream:
            pages = PDFPage.get_pages(stream, pagenos=[pageno], maxpages=0)
            page = next(pages, None)
            if page is None:
                raise InputFileError(
                    f"pdfminer could not process page {pageno} (counting from 0)."
                )
            interpreter.process_page(page)
    except PDFTextExtractionNotAllowed as err:
        raise EncryptedPdfError() from err
    finally:
        # Always undo the PDFType3Font patch, whether or not processing failed.
        if type3_patch is not None:
            type3_patch.stop()

    return tracker.get_result()
253
254
def get_text_boxes(obj):
    """Yield every ``LTTextBox`` found while walking ``obj`` recursively.

    Items of ``obj`` that are ``LTTextBox`` instances are yielded as-is and
    not descended into. Any other item is walked recursively; a ``TypeError``
    raised while walking it (for example because it is not iterable) ends the
    walk of that item and the loop moves on to the next one.
    """
    for element in obj:
        if isinstance(element, LTTextBox):
            yield element
            continue
        try:
            yield from get_text_boxes(element)
        except TypeError:
            pass
264