# © 2015 James R. Barlow: github.com/jbarlow83
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

import pickle
from io import BytesIO
from math import isclose

import img2pdf
import pikepdf
import pytest
from PIL import Image
from reportlab.lib.units import inch
from reportlab.pdfgen.canvas import Canvas

from ocrmypdf import pdfinfo
from ocrmypdf.exceptions import InputFileError
from ocrmypdf.helpers import Resolution
from ocrmypdf.pdfinfo import Colorspace, Encoding
from ocrmypdf.pdfinfo.layout import PDFPage

# pylint: disable=protected-access


def test_single_page_text(outdir):
    """A generated one-page PDF with only text reports text and no images."""
    filename = outdir / 'text.pdf'
    pdf = Canvas(str(filename), pagesize=(8 * inch, 6 * inch))
    text = pdf.beginText()
    text.setFont('Helvetica', 12)
    text.setTextOrigin(1 * inch, 3 * inch)
    text.textLine(
        "Methink'st thou art a general offence and every man should beat thee."
    )
    pdf.drawText(text)
    pdf.showPage()
    pdf.save()

    info = pdfinfo.PdfInfo(filename)

    assert len(info) == 1
    page = info[0]

    assert page.has_text
    assert len(page.images) == 0


@pytest.fixture(scope='session')
def eight_by_eight():
    """Return an 8x8 1-bit image with the main diagonal set to 1."""
    im = Image.new('1', (8, 8), 0)
    for n in range(8):
        im.putpixel((n, n), 1)
    return im


def test_single_page_image(eight_by_eight, outpdf):
    """An 8x8 image laid out by img2pdf at 8 DPI is reported as one 8 DPI image."""
    im = eight_by_eight
    bio = BytesIO()
    im.save(bio, format='PNG')
    bio.seek(0)

    imgsize = ((img2pdf.ImgSize.dpi, 8), (img2pdf.ImgSize.dpi, 8))
    layout_fun = img2pdf.get_layout_fun(None, imgsize, None, None, None)

    with outpdf.open('wb') as f:
        img2pdf.convert(
            bio,
            producer="img2pdf",
            with_pdfrw=False,
            layout_fun=layout_fun,
            outputstream=f,
        )
    info = pdfinfo.PdfInfo(outpdf)

    assert len(info) == 1
    page = info[0]

    assert not page.has_text
    assert len(page.images) == 1

    pdfimage = page.images[0]
    assert pdfimage.width == 8
    assert pdfimage.color == Colorspace.gray

    # DPI in a 1"x1" is the image width
    assert isclose(pdfimage.dpi.x, 8)
    assert isclose(pdfimage.dpi.y, 8)


def test_single_page_inline_image(eight_by_eight, outdir):
    """An inline image drawn into a 72x72 pt area is reported at 8 DPI."""
    filename = outdir / 'image-mono-inline.pdf'
    pdf = Canvas(str(filename), pagesize=(8 * 72, 6 * 72))

    # Draw image in a 72x72 pt or 1"x1" area
    pdf.drawInlineImage(eight_by_eight, 0, 0, width=72, height=72)
    pdf.showPage()
    pdf.save()

    info = pdfinfo.PdfInfo(filename)
    print(info)
    pdfimage = info[0].images[0]
    assert isclose(pdfimage.dpi.x, 8)
    assert pdfimage.color == Colorspace.gray
    assert pdfimage.width == 8


def test_jpeg(resources):
    """The first image of c02-22.pdf is reported as JPEG-encoded at 150 DPI."""
    filename = resources / 'c02-22.pdf'

    pdf = pdfinfo.PdfInfo(filename)

    pdfimage = pdf[0].images[0]
    assert pdfimage.enc == Encoding.jpeg
    assert isclose(pdfimage.dpi.x, 150)


def test_form_xobject(resources):
    """The first image found on page 1 of formxobject.pdf is 50 pixels wide."""
    filename = resources / 'formxobject.pdf'

    pdf = pdfinfo.PdfInfo(filename)
    pdfimage = pdf[0].images[0]
    assert pdfimage.width == 50


def test_no_contents(resources):
    """The first page of no_contents.pdf reports neither images nor text."""
    filename = resources / 'no_contents.pdf'

    pdf = pdfinfo.PdfInfo(filename)
    assert len(pdf[0].images) == 0
    assert not pdf[0].has_text


def test_oversized_page(resources):
    """Sanity-check that the poster.pdf fixture really holds an oversized image."""
    pdf = pdfinfo.PdfInfo(resources / 'poster.pdf')
    image = pdf[0].images[0]
    assert image.width * image.dpi.x > 200, "this is supposed to be oversized"


def test_pickle(resources):
    """PdfInfo objects must survive pickle.dumps()."""
    # For multiprocessing we must be able to pickle our information - if
    # this fails then we are probably storing some unpickleable pikepdf or
    # other external data around
    filename = resources / 'graph_ocred.pdf'
    pdf = pdfinfo.PdfInfo(filename)
    pickle.dumps(pdf)


def test_vector(resources):
    """vector.pdf is reported as having vector content and no text."""
    filename = resources / 'vector.pdf'
    pdf = pdfinfo.PdfInfo(filename)
    assert pdf[0].has_vector
    assert not pdf[0].has_text


def test_ocr_detection(resources):
    """graph_ocred.pdf is reported as having text and no vector content."""
    filename = resources / 'graph_ocred.pdf'
    pdf = pdfinfo.PdfInfo(filename)
    assert not pdf[0].has_vector
    assert pdf[0].has_text


@pytest.mark.parametrize(
    'testfile', ('truetype_font_nomapping.pdf', 'type3_font_nomapping.pdf')
)
def test_corrupt_font_detection(resources, testfile):
    """With detailed analysis, both fixture files are flagged as corrupt text."""
    filename = resources / testfile
    pdf = pdfinfo.PdfInfo(filename, detailed_analysis=True)
    assert pdf[0].has_corrupt_text


def test_stack_abuse():
    """Unbalanced q/Q operators produce warnings, and extreme nesting an error.

    Three content streams are interpreted:

    * 35 consecutive ``q`` operators: a warning mentioning "overflowed".
    * one ``q`` followed by four ``Q``: a warning mentioning "underflowed".
    * 135 consecutive ``q`` operators: a ``RuntimeError``.
    """
    p = pikepdf.Pdf.new()

    # pytest.warns(None) was deprecated in pytest 7 and is an error in
    # pytest 8, so name a category and let ``match`` check the message.
    # ``Warning`` is used because the concrete category is not fixed here.
    stream = pikepdf.Stream(p, b'q ' * 35)
    with pytest.warns(Warning, match='overflowed'):
        pdfinfo.info._interpret_contents(stream)

    stream = pikepdf.Stream(p, b'q Q Q Q Q')
    with pytest.warns(Warning, match='underflowed'):
        pdfinfo.info._interpret_contents(stream)

    # The 35-push case above already warns, so this deeper stream is expected
    # to warn as well before it finally raises.
    stream = pikepdf.Stream(p, b'q ' * 135)
    with pytest.warns(Warning):
        with pytest.raises(RuntimeError):
            pdfinfo.info._interpret_contents(stream)


def test_pages_issue700(monkeypatch, resources):
    """If PDFPage.get_pages yields nothing, PdfInfo raises InputFileError."""

    def get_no_pages(*args, **kwargs):
        return iter([])

    monkeypatch.setattr(PDFPage, 'get_pages', get_no_pages)

    with pytest.raises(InputFileError, match="pdfminer"):
        pdfinfo.PdfInfo(
            resources / 'cardinal.pdf',
            detailed_analysis=True,
            progbar=False,
            max_workers=1,
        )


def test_image_scale0(resources, outpdf):
    """An XObject drawn through an all-zero matrix yields a non-finite image DPI.

    The page DPI is still expected to be ``Resolution(0, 0)``.
    """
    with pikepdf.open(resources / 'cmyk.pdf') as cmyk:
        xobj = pikepdf.Page(cmyk.pages[0]).as_form_xobject()

        p = pikepdf.Pdf.new()
        p.add_blank_page(page_size=(72, 72))
        objname = pikepdf.Page(p.pages[0]).add_resource(
            p.copy_foreign(xobj), pikepdf.Name.XObject, pikepdf.Name.Im0
        )
        print(objname)
        # "0 0 0 0 0 0 cm" installs an all-zero transformation matrix before
        # the XObject is drawn.
        p.pages[0].Contents = pikepdf.Stream(
            p, b"q 0 0 0 0 0 0 cm %s Do Q" % bytes(objname)
        )
        p.save(outpdf)

    pi = pdfinfo.PdfInfo(outpdf, detailed_analysis=True, progbar=False, max_workers=1)
    assert not pi.pages[0]._images[0].dpi.is_finite
    assert pi.pages[0].dpi == Resolution(0, 0)