1#
2# diffoscope: in-depth comparison of files, archives, and directories
3#
4# Copyright © 2014-2015 Jérémy Bobbio <lunar@debian.org>
5# Copyright © 2015-2016, 2018-2021 Chris Lamb <lamby@debian.org>
6#
7# diffoscope is free software: you can redistribute it and/or modify
8# it under the terms of the GNU General Public License as published by
9# the Free Software Foundation, either version 3 of the License, or
10# (at your option) any later version.
11#
12# diffoscope is distributed in the hope that it will be useful,
13# but WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15# GNU General Public License for more details.
16#
17# You should have received a copy of the GNU General Public License
18# along with diffoscope.  If not, see <https://www.gnu.org/licenses/>.
19
20import re
21
22from diffoscope.tools import python_module_missing, tool_required
23from diffoscope.difference import Difference
24
25from .utils.file import File
26from .utils.command import Command
27
28try:
29    import PyPDF2
30except ImportError:  # noqa
31    python_module_missing("PyPDF2")
32    PyPDF2 = None
33
34
class Pdftotext(Command):
    """Command that runs the external ``pdftotext`` tool on ``self.path``."""

    @tool_required("pdftotext")
    def cmdline(self):
        """Return the argument list ``pdftotext <path> -``."""
        # NOTE(review): the trailing "-" is presumably pdftotext's
        # "write to stdout" output argument -- confirm against its man page.
        argv = ["pdftotext"]
        argv.extend((self.path, "-"))
        return argv
39
40
class Dumppdf(Command):
    """Command that runs the external ``dumppdf`` tool on ``self.path``."""

    @tool_required("dumppdf")
    def cmdline(self):
        """Return the argument list ``dumppdf -adt <path>``."""
        program, flags = "dumppdf", "-adt"
        return [program, flags, self.path]
45
46
class PdfFile(File):
    """Comparator for files whose type description starts with "PDF document".

    Differences are collected from up to four sources: PyPDF2 document info,
    PyPDF2 text annotations, ``pdftotext`` output and, as a last resort,
    ``dumppdf`` output.
    """

    DESCRIPTION = "PDF documents"
    FILE_TYPE_RE = re.compile(r"^PDF document\b")

    def compare_details(self, other, source=None):
        """Return a list of comparison results between this PDF and *other*.

        If PyPDF2 could not be imported, a comment suggesting its
        installation is added to this file and the PyPDF2-based comparisons
        are skipped.  The ``pdftotext`` comparison is always performed.  The
        ``dumppdf`` comparison is performed only if none of the earlier
        results is truthy.

        Results are appended unconditionally, so the returned list may
        contain falsy entries for comparisons that found nothing.  *source*
        is accepted but not used by this method.
        """
        xs = []

        if PyPDF2 is None:
            self.add_comment(
                "Installing the 'PyPDF2' package may produce a better output."
            )
        else:
            # Each PyPDF2-based section is a (dumper, comment) pair that is
            # compared as plain text; the order matches the report order.
            sections = (
                (self.dump_pypdf2_metadata, "Document info"),
                (self.dump_pypdf2_annotations, "Annotations"),
            )
            for dump, label in sections:
                difference = Difference.from_text(
                    dump(self),
                    dump(other),
                    self.path,
                    other.path,
                )
                if difference:
                    difference.add_comment(label)
                xs.append(difference)

        xs.append(Difference.from_operation(Pdftotext, self.path, other.path))

        # Don't include verbose dumppdf output unless we won't see any
        # differences without it.
        if not any(xs):
            xs.append(
                Difference.from_operation(Dumppdf, self.path, other.path)
            )

        return xs

    @staticmethod
    def dump_pypdf2_metadata(file):
        """Return the document info of *file* as sorted ``key: repr`` lines.

        Keys have their leading "/" characters stripped.  Returns an empty
        string when the PDF has no document info, and a parenthesised
        message when PyPDF2 raises ``PdfReadError``.
        """
        try:
            pdf = PyPDF2.PdfFileReader(file.path)
            document_info = pdf.getDocumentInfo()
        except PyPDF2.utils.PdfReadError as e:
            return f"(Could not extract metadata: {e})"

        if document_info is None:
            return ""

        return "\n".join(
            f"{k.lstrip('/')}: {v!r}" for k, v in sorted(document_info.items())
        )

    @staticmethod
    def dump_pypdf2_annotations(file):
        """Return the contents of all "/Text" annotations, one per line.

        Pages are visited in order.  Returns a parenthesised message when
        PyPDF2 raises ``PdfReadError`` while opening the file or determining
        its page count.
        """
        try:
            pdf = PyPDF2.PdfFileReader(file.path)
            # Counting pages can itself raise PdfReadError, so it belongs
            # under the same guard as opening the file.
            num_pages = pdf.getNumPages()
        except PyPDF2.utils.PdfReadError as e:
            return f"(Could not open file: {e})"

        xs = []
        for x in range(num_pages):
            page = pdf.getPage(x)

            try:
                for annot in page["/Annots"]:
                    obj = annot.getObject()
                    if obj["/Subtype"] == "/Text":
                        xs.append(obj["/Contents"])
            except Exception:
                # Best effort: pages without "/Annots" or with malformed
                # annotations are skipped.  Unlike a bare "except:", this
                # lets KeyboardInterrupt and SystemExit propagate.
                pass

        return "\n".join(xs)
127