# © 2018 James R. Barlow: github.com/jbarlow83
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.


from os import fspath
from pathlib import Path
from unittest.mock import patch

import img2pdf
import pikepdf
import pytest
from PIL import Image, ImageDraw

from ocrmypdf import optimize as opt
from ocrmypdf._exec import jbig2enc, pngquant
from ocrmypdf._exec.ghostscript import rasterize_pdf
from ocrmypdf.helpers import Resolution

from .conftest import check_ocrmypdf

# Skip markers for tests that depend on optional external executables.
needs_pngquant = pytest.mark.skipif(
    not pngquant.available(), reason="pngquant not installed"
)
needs_jbig2enc = pytest.mark.skipif(
    not jbig2enc.available(), reason="jbig2enc not installed"
)


@needs_pngquant
@pytest.mark.parametrize('pdf', ['multipage.pdf', 'palette.pdf'])
def test_basic(resources, pdf, outpdf):
    """Optimizing at level 3 must not noticeably grow the file.

    The output is allowed to exceed the input size by roughly 2% at most.
    """
    infile = resources / pdf
    opt.main(infile, outpdf, level=3)

    assert 0.98 * Path(outpdf).stat().st_size <= Path(infile).stat().st_size


@needs_pngquant
def test_mono_not_inverted(resources, outdir):
    """Optimizing '2400dpi.pdf' must leave the page background white.

    The optimized PDF is rasterized to a low-resolution grayscale PNG and
    the top-left pixel is checked.
    """
    infile = resources / '2400dpi.pdf'
    opt.main(infile, outdir / 'out.pdf', level=3)

    rasterize_pdf(
        outdir / 'out.pdf',
        outdir / 'im.png',
        raster_device='pnggray',
        raster_dpi=Resolution(10, 10),
    )

    with Image.open(fspath(outdir / 'im.png')) as im:
        assert im.getpixel((0, 0)) == 255, "Expected white background"


@needs_pngquant
def test_jpg_png_params(resources, outpdf):
    """Run with explicit --jpg-quality and --png-quality at --optimize 3."""
    check_ocrmypdf(
        resources / 'crom.png',
        outpdf,
        '--image-dpi',
        '200',
        '--optimize',
        '3',
        '--jpg-quality',
        '50',
        '--png-quality',
        '20',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    )


@needs_jbig2enc
@pytest.mark.parametrize('lossy', [False, True])
def test_jbig2_lossy(lossy, resources, outpdf):
    """Check the JBIG2 output with and without --jbig2-lossy.

    The first image on the first page must use /JBIG2Decode. With
    --jbig2-lossy its decode parameters must contain /JBIG2Globals; without
    it there must be no decode parameters at all.
    """
    args = [
        resources / 'ccitt.pdf',
        outpdf,
        '--image-dpi',
        '200',
        '--optimize',
        '3',  # a string, like every other command-line argument in this file
        '--jpg-quality',
        '50',
        '--png-quality',
        '20',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    if lossy:
        args.append('--jbig2-lossy')

    check_ocrmypdf(*args)

    # Keep every access to the image inside the ``with`` block: the PdfImage
    # is backed by the open Pdf, which is closed on exit.
    with pikepdf.open(outpdf) as pdf:
        pim = pikepdf.PdfImage(next(iter(pdf.pages[0].images.values())))
        assert pim.filters[0] == '/JBIG2Decode'

        if lossy:
            assert '/JBIG2Globals' in pim.decode_parms[0]
        else:
            assert len(pim.decode_parms) == 0


@needs_pngquant
@needs_jbig2enc
def test_flate_to_jbig2(resources, outdir):
    """An 8-bit grayscale PNG of 1bpp content must end up JBIG2-encoded."""
    # This test requires an image that pngquant is capable of converting
    # to 1bpp - so use an existing 1bpp image, convert up, confirm it can
    # convert down
    with Image.open(fspath(resources / 'typewriter.png')) as im:
        assert im.mode in ('1', 'P')
        gray = im.convert('L')
        gray.save(fspath(outdir / 'type8.png'))

    check_ocrmypdf(
        outdir / 'type8.png',
        outdir / 'out.pdf',
        '--image-dpi',
        '100',
        '--png-quality',
        '50',
        '--optimize',
        '3',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    )

    with pikepdf.open(outdir / 'out.pdf') as pdf:
        pim = pikepdf.PdfImage(next(iter(pdf.pages[0].images.values())))
        assert pim.filters[0] == '/JBIG2Decode'


@needs_pngquant
def test_multiple_pngs(resources, outdir):
    """Every page image of a two-PNG PDF must shrink after optimization.

    ``pngquant.quantize`` is patched with a stand-in that paints the whole
    image a single gray level, so the real pngquant output is not what is
    being measured here.
    """
    with (outdir / 'in.pdf').open('wb') as inpdf:
        img2pdf.convert(
            fspath(resources / 'baiona_colormapped.png'),
            fspath(resources / 'baiona_gray.png'),
            with_pdfrw=False,
            outputstream=inpdf,
        )

    def mockquant(input_file, output_file, *_args):
        # Replace the image content with a flat fill and save it as the
        # "quantized" result.
        with Image.open(input_file) as im:
            draw = ImageDraw.Draw(im)
            draw.rectangle((0, 0, im.width, im.height), fill=128)
            im.save(output_file)

    with patch('ocrmypdf.optimize.pngquant.quantize') as mock:
        mock.side_effect = mockquant
        check_ocrmypdf(
            outdir / 'in.pdf',
            outdir / 'out.pdf',
            '--optimize',
            '3',
            '--jobs',
            '1',
            '--use-threads',
            '--output-type',
            'pdf',
            '--plugin',
            'tests/plugins/tesseract_noop.py',
        )
        mock.assert_called()

    with pikepdf.open(outdir / 'in.pdf') as inpdf, pikepdf.open(
        outdir / 'out.pdf'
    ) as outpdf:
        # zip() would silently stop at the shorter document, so require the
        # page counts to match before comparing page by page.
        assert len(inpdf.pages) == len(outpdf.pages)
        for n, (inpage, outpage) in enumerate(zip(inpdf.pages, outpdf.pages)):
            inim = next(iter(inpage.images.values()))
            outim = next(iter(outpage.images.values()))
            assert len(outim.read_raw_bytes()) < len(inim.read_raw_bytes()), n


def test_optimize_off(resources, outpdf):
    """Run with --optimize=0 and PDF output type."""
    check_ocrmypdf(
        resources / 'trivial.pdf',
        outpdf,
        '--optimize=0',
        '--output-type',
        'pdf',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    )


def test_group3(resources, outdir):
    """extract_image_filter accepts the CCITT image as-is, rejects it at K=0.

    The image from 'ccitt.pdf' is first checked unchanged (expected to be
    accepted as Group 4), then its /K decode parameter is set to 0 and it
    must be rejected as Group 3.
    """
    with pikepdf.open(resources / 'ccitt.pdf') as pdf:
        im = pdf.pages[0].Resources.XObject['/Im1']
        assert (
            opt.extract_image_filter(pdf, outdir, im, im.objgen[0]) is not None
        ), "Group 4 should be allowed"

        im.DecodeParms['/K'] = 0
        assert (
            opt.extract_image_filter(pdf, outdir, im, im.objgen[0]) is None
        ), "Group 3 should be disallowed"