1# © 2015 James R. Barlow: github.com/jbarlow83
2#
3# This Source Code Form is subject to the terms of the Mozilla Public
4# License, v. 2.0. If a copy of the MPL was not distributed with this
5# file, You can obtain one at http://mozilla.org/MPL/2.0/.
6
7import pickle
8from io import BytesIO
9from math import isclose
10
11import img2pdf
12import pikepdf
13import pytest
14from PIL import Image
15from reportlab.lib.units import inch
16from reportlab.pdfgen.canvas import Canvas
17
18from ocrmypdf import pdfinfo
19from ocrmypdf.exceptions import InputFileError
20from ocrmypdf.helpers import Resolution
21from ocrmypdf.pdfinfo import Colorspace, Encoding
22from ocrmypdf.pdfinfo.layout import PDFPage
23
24# pylint: disable=protected-access
25
26
def test_single_page_text(outdir):
    """A generated page holding only drawn text reports text and no images."""
    pdf_path = outdir / 'text.pdf'

    # Draw a single line of 12pt Helvetica on an 8in x 6in page.
    canvas = Canvas(str(pdf_path), pagesize=(8 * inch, 6 * inch))
    text_object = canvas.beginText()
    text_object.setFont('Helvetica', 12)
    text_object.setTextOrigin(1 * inch, 3 * inch)
    text_object.textLine(
        "Methink'st thou art a general offence and every man should beat thee."
    )
    canvas.drawText(text_object)
    canvas.showPage()
    canvas.save()

    info = pdfinfo.PdfInfo(pdf_path)
    assert len(info) == 1

    first_page = info[0]
    assert first_page.has_text
    assert len(first_page.images) == 0
47
48
@pytest.fixture(scope='session')
def eight_by_eight():
    """Return a mode '1' 8x8 image filled with 0 whose main diagonal is set to 1."""
    side = 8
    bitmap = Image.new('1', (side, side), 0)
    for offset in range(side):
        # Same offset on both axes, i.e. a pixel on the diagonal.
        bitmap.putpixel((offset, offset), 1)
    return bitmap
55
56
def test_single_page_image(eight_by_eight, outpdf):
    """An img2pdf page built from the 8x8 image reports one 8 px, 8 DPI image."""
    # Serialize the fixture image to an in-memory PNG for img2pdf to consume.
    png_buffer = BytesIO()
    eight_by_eight.save(png_buffer, format='PNG')
    png_buffer.seek(0)

    # Request an image size of 8 dpi along both axes.
    eight_dpi = (img2pdf.ImgSize.dpi, 8)
    layout_fun = img2pdf.get_layout_fun(
        None, (eight_dpi, eight_dpi), None, None, None
    )

    with outpdf.open('wb') as stream:
        img2pdf.convert(
            png_buffer,
            producer="img2pdf",
            with_pdfrw=False,
            layout_fun=layout_fun,
            outputstream=stream,
        )

    info = pdfinfo.PdfInfo(outpdf)
    assert len(info) == 1

    first_page = info[0]
    assert not first_page.has_text
    assert len(first_page.images) == 1

    image = first_page.images[0]
    assert image.width == 8
    assert image.color == Colorspace.gray

    # DPI in a 1"x1" is the image width
    for measured in (image.dpi.x, image.dpi.y):
        assert isclose(measured, 8)
89
90
def test_single_page_inline_image(eight_by_eight, outdir):
    """An inline image drawn by reportlab is reported with its DPI, color and width."""
    pdf_path = outdir / 'image-mono-inline.pdf'
    canvas = Canvas(str(pdf_path), pagesize=(8 * 72, 6 * 72))

    # The image occupies a 72x72 pt (1"x1") area at the page origin.
    canvas.drawInlineImage(eight_by_eight, 0, 0, width=72, height=72)
    canvas.showPage()
    canvas.save()

    info = pdfinfo.PdfInfo(pdf_path)
    print(info)

    inline_image = info[0].images[0]
    assert isclose(inline_image.dpi.x, 8)
    assert inline_image.color == Colorspace.gray
    assert inline_image.width == 8
106
107
def test_jpeg(resources):
    """The first image of c02-22.pdf is reported as JPEG-encoded at about 150 DPI."""
    info = pdfinfo.PdfInfo(resources / 'c02-22.pdf')

    first_image = info[0].images[0]
    assert first_image.enc == Encoding.jpeg
    assert isclose(first_image.dpi.x, 150)
116
117
def test_form_xobject(resources):
    """The first image reported for formxobject.pdf is 50 pixels wide."""
    info = pdfinfo.PdfInfo(resources / 'formxobject.pdf')
    first_image = info[0].images[0]
    assert first_image.width == 50
124
125
def test_no_contents(resources):
    """The first page of no_contents.pdf reports neither images nor text."""
    info = pdfinfo.PdfInfo(resources / 'no_contents.pdf')
    first_page = info[0]
    assert len(first_page.images) == 0
    assert not first_page.has_text
132
133
def test_oversized_page(resources):
    """The first image of poster.pdf must pass the 'oversized' size check."""
    info = pdfinfo.PdfInfo(resources / 'poster.pdf')
    first_image = info[0].images[0]
    # NOTE(review): this multiplies pixel width by horizontal DPI; confirm
    # whether a physical size (width / dpi) was the intended quantity.
    size_metric = first_image.width * first_image.dpi.x
    assert size_metric > 200, "this is supposed to be oversized"
138
139
def test_pickle(resources):
    """PdfInfo for graph_ocred.pdf can be serialized with pickle."""
    # Multiprocessing requires that our information be picklable. A failure
    # here probably means some unpicklable pikepdf or other external data
    # is being kept around.
    info = pdfinfo.PdfInfo(resources / 'graph_ocred.pdf')
    pickle.dumps(info)
147
148
def test_vector(resources):
    """The first page of vector.pdf reports vector content and no text."""
    info = pdfinfo.PdfInfo(resources / 'vector.pdf')
    first_page = info[0]
    assert first_page.has_vector
    assert not first_page.has_text
154
155
def test_ocr_detection(resources):
    """The first page of graph_ocred.pdf reports text and no vector content."""
    info = pdfinfo.PdfInfo(resources / 'graph_ocred.pdf')
    first_page = info[0]
    assert not first_page.has_vector
    assert first_page.has_text
161
162
@pytest.mark.parametrize(
    'testfile', ('truetype_font_nomapping.pdf', 'type3_font_nomapping.pdf')
)
def test_corrupt_font_detection(resources, testfile):
    """Detailed analysis flags corrupt text on the first page of each test file."""
    info = pdfinfo.PdfInfo(resources / testfile, detailed_analysis=True)
    first_page = info[0]
    assert first_page.has_corrupt_text
170
171
def test_stack_abuse():
    """Check how _interpret_contents reacts to unbalanced 'q'/'Q' operators.

    Three content streams are interpreted:

    * 35 unmatched ``q`` operators must produce an 'overflowed' warning;
    * more ``Q`` than ``q`` operators must produce an 'underflowed' warning;
    * 135 unmatched ``q`` operators must raise ``RuntimeError``.

    ``pytest.warns(None)`` is not used because it was deprecated in pytest 7
    and raises ``TypeError`` in pytest 8.
    """
    # Local import: this is the only test in the module that needs it.
    import warnings

    doc = pikepdf.Pdf.new()

    stream = pikepdf.Stream(doc, b'q ' * 35)
    with pytest.warns(Warning, match='overflowed'):
        pdfinfo.info._interpret_contents(stream)

    stream = pikepdf.Stream(doc, b'q Q Q Q Q')
    with pytest.warns(Warning, match='underflowed'):
        pdfinfo.info._interpret_contents(stream)

    stream = pikepdf.Stream(doc, b'q ' * 135)
    # Only the exception matters here; any warnings emitted on the way are
    # tolerated but not asserted, as with the previous pytest.warns(None).
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        with pytest.raises(RuntimeError):
            pdfinfo.info._interpret_contents(stream)
189
190
def test_pages_issue700(monkeypatch, resources):
    """PdfInfo raises InputFileError mentioning pdfminer when no pages are yielded."""
    # Replace PDFPage.get_pages with a stub that yields nothing, whatever
    # arguments it is given.
    monkeypatch.setattr(PDFPage, 'get_pages', lambda *args, **kwargs: iter([]))

    source = resources / 'cardinal.pdf'
    with pytest.raises(InputFileError, match="pdfminer"):
        pdfinfo.PdfInfo(source, detailed_analysis=True, progbar=False, max_workers=1)
204
205
def test_image_scale0(resources, outpdf):
    """A form XObject drawn with an all-zero matrix yields non-finite image DPI.

    The first page of cmyk.pdf is converted to a form XObject, copied into a
    new 72x72 pt page and drawn through ``0 0 0 0 0 0 cm``. The resulting
    image DPI must not be finite and the page DPI must equal Resolution(0, 0).
    """
    with pikepdf.open(resources / 'cmyk.pdf') as source_pdf:
        form_xobject = pikepdf.Page(source_pdf.pages[0]).as_form_xobject()

        target = pikepdf.Pdf.new()
        target.add_blank_page(page_size=(72, 72))
        target_page = pikepdf.Page(target.pages[0])
        objname = target_page.add_resource(
            target.copy_foreign(form_xobject), pikepdf.Name.XObject, pikepdf.Name.Im0
        )
        print(objname)

        # Draw the XObject under an all-zero transformation matrix.
        content = b"q 0 0 0 0 0 0 cm %s Do Q" % bytes(objname)
        target.pages[0].Contents = pikepdf.Stream(target, content)
        target.save(outpdf)

    info = pdfinfo.PdfInfo(
        outpdf, detailed_analysis=True, progbar=False, max_workers=1
    )
    first_page = info.pages[0]
    assert not first_page._images[0].dpi.is_finite
    assert first_page.dpi == Resolution(0, 0)
224