""" Handle PDF

Strip metadata from PDF files by re-rendering every page through
Poppler/cairo into a brand new document.
"""

import os
import re
import logging
import tempfile
import io
from typing import Dict, Union
from distutils.version import LooseVersion

import cairo
import gi
gi.require_version('Poppler', '0.18')
from gi.repository import Poppler, GLib

from . import abstract

poppler_version = Poppler.get_version()
if LooseVersion(poppler_version) < LooseVersion('0.46'):  # pragma: no cover
    raise ValueError("mat2 needs at least Poppler version 0.46 to work. \
The installed version is %s." % poppler_version)  # pragma: no cover


class PDFParser(abstract.AbstractParser):
    """ Parser for `application/pdf` files.

    Cleaning is done by rendering the pages of the input document onto a
    fresh cairo PDF surface, then blanking the few fields that cairo and
    Poppler add on their own.
    """
    mimetypes = {'application/pdf', }
    # GObject properties of a Poppler.Document that are reported by get_meta
    meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords',
                 'metadata', 'mod-date', 'producer', 'subject', 'title',
                 'viewer-preferences'}

    def __init__(self, filename):
        """ Open `filename` and make sure Poppler can load it.

        :raises ValueError: if Poppler refuses to load the file.
        """
        super().__init__(filename)
        self.uri = 'file://' + os.path.abspath(self.filename)
        self.__scale = 200 / 72.0  # how much precision do we want for the render
        try:  # Check now that the file is valid, to avoid surprises later
            Poppler.Document.new_from_file(self.uri, None)
        except GLib.GError as e:  # Invalid PDF
            raise ValueError("%s is not a valid PDF file" % self.filename) from e

    def remove_all(self) -> bool:
        """ Write a cleaned copy of the document to `self.output_filename`.

        The rendering strategy depends on `self.lightweight_cleaning`.

        :return: True on success, False if a page could not be loaded.
        """
        if self.lightweight_cleaning is True:
            return self.__remove_all_lightweight()
        return self.__remove_all_thorough()

    def __remove_all_lightweight(self) -> bool:
        """
            Load the document into Poppler, render pages on a new PDFSurface.
        """
        document = Poppler.Document.new_from_file(self.uri, None)
        pages_count = document.get_n_pages()

        tmp_fd, tmp_path = tempfile.mkstemp()
        # cairo opens the path by itself: close our descriptor right away
        # instead of leaking it.
        os.close(tmp_fd)
        try:
            pdf_surface = cairo.PDFSurface(tmp_path, 10, 10)  # resized later anyway
            pdf_context = cairo.Context(pdf_surface)  # context draws on the surface
            try:
                for pagenum in range(pages_count):
                    logging.info("Rendering page %d/%d", pagenum + 1, pages_count)
                    page = document.get_page(pagenum)
                    if page is None:  # pragma: no cover
                        logging.error("Unable to get PDF pages")
                        return False
                    page_width, page_height = page.get_size()
                    pdf_surface.set_size(page_width, page_height)
                    pdf_context.save()
                    page.render_for_printing(pdf_context)
                    pdf_context.restore()
                    pdf_context.show_page()  # draw pdf_context on pdf_surface
            finally:
                # Flush the surface to disk, even when bailing out early.
                pdf_surface.finish()

            self.__remove_superficial_meta(tmp_path, self.output_filename)
        finally:
            # Never leave the intermediate file behind, whatever happened.
            os.remove(tmp_path)

        return True

    def __remove_all_thorough(self) -> bool:
        """
            Load the document into Poppler, render pages on PNG,
            and shove those PNG into a new PDF.
        """
        document = Poppler.Document.new_from_file(self.uri, None)
        pages_count = document.get_n_pages()

        tmp_fd, tmp_path = tempfile.mkstemp()
        # cairo opens the path by itself: close our descriptor right away
        # instead of leaking it.
        os.close(tmp_fd)
        try:
            pdf_surface = cairo.PDFSurface(tmp_path, 32, 32)  # resized later anyway
            pdf_context = cairo.Context(pdf_surface)
            try:
                for pagenum in range(pages_count):
                    page = document.get_page(pagenum)
                    if page is None:  # pragma: no cover
                        logging.error("Unable to get PDF pages")
                        return False
                    page_width, page_height = page.get_size()
                    logging.info("Rendering page %d/%d", pagenum + 1, pages_count)

                    # Size of the raster image, in pixels
                    width = int(page_width * self.__scale)
                    height = int(page_height * self.__scale)
                    img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, width, height)
                    img_context = cairo.Context(img_surface)

                    img_context.scale(self.__scale, self.__scale)
                    page.render_for_printing(img_context)
                    img_context.show_page()

                    # Round-trip the rendered page through an in-memory PNG
                    buf = io.BytesIO()
                    img_surface.write_to_png(buf)
                    img_surface.finish()
                    buf.seek(0)

                    img = cairo.ImageSurface.create_from_png(buf)
                    if cairo.version_info < (1, 12, 0):
                        pdf_surface.set_size(width, height)
                    else:
                        # Keep the original page size, and scale the image
                        # down so that it fits on it.
                        pdf_surface.set_size(page_width, page_height)
                        pdf_surface.set_device_scale(1 / self.__scale, 1 / self.__scale)
                    pdf_context.set_source_surface(img, 0, 0)
                    pdf_context.paint()
                    pdf_context.show_page()  # draw pdf_context on pdf_surface
            finally:
                # Flush the surface to disk, even when bailing out early.
                pdf_surface.finish()

            # Removes metadata added by Poppler
            self.__remove_superficial_meta(tmp_path, self.output_filename)
        finally:
            # Never leave the intermediate file behind, whatever happened.
            os.remove(tmp_path)

        return True

    @staticmethod
    def __remove_superficial_meta(in_file: str, out_file: str) -> bool:
        """ Copy `in_file` to `out_file`, without the producer, creator
        and creation date fields.

        :param in_file: absolute path of the PDF to read.
        :param out_file: path of the PDF to write.
        :return: always True.
        """
        document = Poppler.Document.new_from_file('file://' + in_file, None)
        document.set_producer('')
        document.set_creator('')
        document.set_creation_date(-1)
        document.save('file://' + os.path.abspath(out_file))

        # Cairo adds "/Producer" and "/CreationDate", and Poppler sometimes
        # fails to remove them, we have to use this terrible regex.
        # It should(tm) be alright though, because cairo's output format
        # for metadata is fixed.
        with open(out_file, 'rb') as f:
            out = re.sub(rb'<<[\s\n]*/Producer.*?>>', b' << >>', f.read(),
                         count=0, flags=re.DOTALL | re.IGNORECASE)
        with open(out_file, 'wb') as f:
            f.write(out)

        return True

    @staticmethod
    def __parse_metadata_field(data: str) -> Dict[str, str]:
        """ Extract the `<ns:key>value</ns:key>` pairs found in `data`,
        for the xmp, pdfx, pdf and xmpMM namespaces (case-insensitive).

        :return: a dict mapping each key to its value; should a key appear
                 several times, the last occurrence wins.
        """
        metadata = {}
        for (_, key, value) in re.findall(r"<(xmp|pdfx|pdf|xmpMM):(.+)>(.+)</\1:\2>", data, re.I):
            metadata[key] = value
        return metadata

    def get_meta(self) -> Dict[str, Union[str, dict]]:
        """ Return a dict with all the meta of the file
        """
        metadata = {}
        document = Poppler.Document.new_from_file(self.uri, None)

        for key in self.meta_list:
            value = document.get_property(key)  # query each property only once
            if value:
                metadata[key] = value
        if 'metadata' in metadata:
            # Flatten the fields embedded in the 'metadata' blob into the result
            metadata.update(self.__parse_metadata_field(metadata['metadata']))
        return metadata