1""" Handle PDF
2
3"""
4
5import os
6import re
7import logging
8import tempfile
9import io
10from typing import Dict, Union
11from distutils.version import LooseVersion
12
13import cairo
14import gi
15gi.require_version('Poppler', '0.18')
16from gi.repository import Poppler, GLib
17
18from . import abstract
19
# Fail at import time when the installed Poppler is older than 0.46.
poppler_version = Poppler.get_version()
if LooseVersion('0.46') > LooseVersion(poppler_version):  # pragma: no cover
    raise ValueError(
        "mat2 needs at least Poppler version 0.46 to work. "
        "The installed version is %s." % poppler_version
    )  # pragma: no cover
24
25
class PDFParser(abstract.AbstractParser):
    """Parser that reads and removes metadata from PDF files.

    Cleaning re-renders every page with Poppler onto a brand new cairo
    PDF surface, either directly (lightweight) or through an intermediate
    PNG raster (thorough), and then scrubs the metadata that the
    rendering toolchain itself wrote into the result.
    """

    mimetypes = {'application/pdf', }
    meta_list = {'author', 'creation-date', 'creator', 'format', 'keywords',
                 'metadata', 'mod-date', 'producer', 'subject', 'title',
                 'viewer-preferences'}

    def __init__(self, filename):
        """Set up the parser and check that `filename` is a loadable PDF.

        Raises:
            ValueError: if Poppler fails to open the file.
        """
        super().__init__(filename)
        # NOTE(review): the URI is built by plain concatenation, so
        # characters such as '#' or '%' in the path are not escaped;
        # confirm whether GLib.filename_to_uri should be used instead.
        self.uri = 'file://' + os.path.abspath(self.filename)
        # Raster scale used by the thorough cleaning: 200 output pixels for
        # every 72 page units (200 DPI if page sizes are in PDF points).
        self.__scale = 200 / 72.0
        try:  # Check now that the file is valid, to avoid surprises later
            Poppler.Document.new_from_file(self.uri, None)
        except GLib.GError as err:  # Invalid PDF
            raise ValueError from err

    def remove_all(self) -> bool:
        """Write a cleaned copy of the document to `self.output_filename`.

        Returns:
            True on success, False if a page could not be loaded.
        """
        if self.lightweight_cleaning is True:
            return self.__remove_all_lightweight()
        return self.__remove_all_thorough()

    @staticmethod
    def __create_tmp_file() -> str:
        """Create an empty temporary file and return its path.

        The descriptor handed back by `mkstemp` is closed immediately:
        the file is only ever accessed through its path afterwards, and
        keeping the descriptor around would leak one per cleaned document.
        """
        fd, tmp_path = tempfile.mkstemp()
        os.close(fd)
        return tmp_path

    def __remove_all_lightweight(self) -> bool:
        """
            Load the document into Poppler, render pages on a new PDFSurface.

            Returns True on success, False if a page could not be loaded.
            The temporary file is removed in every case.
        """
        document = Poppler.Document.new_from_file(self.uri, None)
        pages_count = document.get_n_pages()

        tmp_path = self.__create_tmp_file()
        try:
            pdf_surface = cairo.PDFSurface(tmp_path, 10, 10)  # resized later anyway
            try:
                pdf_context = cairo.Context(pdf_surface)  # context draws on the surface

                for pagenum in range(pages_count):
                    logging.info("Rendering page %d/%d", pagenum + 1, pages_count)
                    page = document.get_page(pagenum)
                    if page is None:  # pragma: no cover
                        logging.error("Unable to get PDF pages")
                        return False
                    page_width, page_height = page.get_size()
                    pdf_surface.set_size(page_width, page_height)
                    pdf_context.save()
                    page.render_for_printing(pdf_context)
                    pdf_context.restore()
                    pdf_context.show_page()  # draw pdf_context on pdf_surface
            finally:
                # Always finish the surface so the file is flushed and
                # released before it is read back or deleted.
                pdf_surface.finish()

            # Removes metadata added by the rendering itself
            self.__remove_superficial_meta(tmp_path, self.output_filename)
        finally:
            os.remove(tmp_path)

        return True

    def __remove_all_thorough(self) -> bool:
        """
            Load the document into Poppler, render pages on PNG,
            and shove those PNG into a new PDF.

            Returns True on success, False if a page could not be loaded.
            The temporary file is removed in every case.
        """
        document = Poppler.Document.new_from_file(self.uri, None)
        pages_count = document.get_n_pages()

        tmp_path = self.__create_tmp_file()
        try:
            pdf_surface = cairo.PDFSurface(tmp_path, 32, 32)  # resized later anyway
            try:
                pdf_context = cairo.Context(pdf_surface)

                for pagenum in range(pages_count):
                    page = document.get_page(pagenum)
                    if page is None:  # pragma: no cover
                        logging.error("Unable to get PDF pages")
                        return False
                    page_width, page_height = page.get_size()
                    logging.info("Rendering page %d/%d", pagenum + 1, pages_count)

                    # Rasterize the page at the configured scale.
                    width = int(page_width * self.__scale)
                    height = int(page_height * self.__scale)
                    img_surface = cairo.ImageSurface(cairo.FORMAT_ARGB32, width, height)
                    img_context = cairo.Context(img_surface)

                    img_context.scale(self.__scale, self.__scale)
                    page.render_for_printing(img_context)
                    img_context.show_page()

                    # Round-trip through an in-memory PNG so that only
                    # pixels are carried over into the new document.
                    buf = io.BytesIO()
                    img_surface.write_to_png(buf)
                    img_surface.finish()
                    buf.seek(0)

                    img = cairo.ImageSurface.create_from_png(buf)
                    if cairo.version_info < (1, 12, 0):
                        pdf_surface.set_size(width, height)
                    else:
                        # Keep the original page size and scale the image
                        # back down onto it.
                        pdf_surface.set_size(page_width, page_height)
                        pdf_surface.set_device_scale(1 / self.__scale, 1 / self.__scale)
                    pdf_context.set_source_surface(img, 0, 0)
                    pdf_context.paint()
                    pdf_context.show_page()  # draw pdf_context on pdf_surface
            finally:
                # Always finish the surface so the file is flushed and
                # released before it is read back or deleted.
                pdf_surface.finish()

            # Removes metadata added by Poppler
            self.__remove_superficial_meta(tmp_path, self.output_filename)
        finally:
            os.remove(tmp_path)

        return True

    @staticmethod
    def __remove_superficial_meta(in_file: str, out_file: str) -> bool:
        """Copy `in_file` to `out_file` with producer/creator/date blanked.

        Args:
            in_file: absolute path of the freshly rendered PDF.
            out_file: path where the scrubbed PDF is written.

        Returns:
            Always True.
        """
        document = Poppler.Document.new_from_file('file://' + in_file, None)
        document.set_producer('')
        document.set_creator('')
        document.set_creation_date(-1)
        document.save('file://' + os.path.abspath(out_file))

        # Cairo adds "/Producer" and "/CreationDate", and Poppler sometimes
        # fails to remove them, we have to use this terrible regex.
        # It should(tm) be alright though, because cairo's output format
        # for metadata is fixed.
        with open(out_file, 'rb') as f:
            out = re.sub(rb'<<[\s\n]*/Producer.*?>>', b' << >>', f.read(),
                         count=0, flags=re.DOTALL | re.IGNORECASE)
        with open(out_file, 'wb') as f:
            f.write(out)

        return True

    @staticmethod
    def __parse_metadata_field(data: str) -> Dict[str, str]:
        """Extract `<ns:key>value</ns:key>` pairs from a metadata string.

        Only the `xmp`, `pdfx`, `pdf` and `xmpMM` namespaces are matched
        (case-insensitively); when a key appears several times the last
        occurrence wins.
        """
        pattern = r"<(xmp|pdfx|pdf|xmpMM):(.+)>(.+)</\1:\2>"
        return {key: value for _, key, value in re.findall(pattern, data, re.I)}

    def get_meta(self) -> Dict[str, Union[str, dict]]:
        """ Return a dict with all the meta of the file

        Every non-empty document property listed in `meta_list` is
        reported; the entries parsed out of the raw `metadata` property,
        if any, are merged into the same dict.
        """
        metadata = {}  # type: Dict[str, Union[str, dict]]
        document = Poppler.Document.new_from_file(self.uri, None)

        for key in self.meta_list:
            value = document.get_property(key)
            if value:
                metadata[key] = value
        if 'metadata' in metadata:
            metadata.update(self.__parse_metadata_field(metadata['metadata']))
        return metadata
166