#
# diffoscope: in-depth comparison of files, archives, and directories
#
# Copyright © 2014-2015 Jérémy Bobbio <lunar@debian.org>
# Copyright © 2015-2016, 2018-2021 Chris Lamb <lamby@debian.org>
#
# diffoscope is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# diffoscope is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with diffoscope. If not, see <https://www.gnu.org/licenses/>.

import re

from diffoscope.tools import python_module_missing, tool_required
from diffoscope.difference import Difference

from .utils.file import File
from .utils.command import Command

# PyPDF2 is optional: when it cannot be imported the module is reported as
# missing and the name is bound to None so callers can test for it.
try:
    import PyPDF2
except ImportError:  # noqa
    python_module_missing("PyPDF2")
    PyPDF2 = None


class Pdftotext(Command):
    """Command wrapper that runs ``pdftotext`` on ``self.path``."""

    @tool_required("pdftotext")
    def cmdline(self):
        """Return the argument vector: ``pdftotext <path> -``."""
        return ["pdftotext", self.path, "-"]


class Dumppdf(Command):
    """Command wrapper that runs ``dumppdf -adt`` on ``self.path``."""

    @tool_required("dumppdf")
    def cmdline(self):
        """Return the argument vector: ``dumppdf -adt <path>``."""
        return ["dumppdf", "-adt", self.path]


class PdfFile(File):
    """File type for inputs whose file type matches ``^PDF document``."""

    DESCRIPTION = "PDF documents"
    FILE_TYPE_RE = re.compile(r"^PDF document\b")

    def compare_details(self, other, source=None):
        """Collect the differences between this PDF and *other*.

        When PyPDF2 is importable, the document info and the text
        annotations of both files are compared as text.  The ``pdftotext``
        output is always compared.  The ``dumppdf`` output is only compared
        when none of the previous steps produced a truthy result.

        :param other: the file to compare against; its ``path`` attribute
            is read.
        :param source: not used by this method.
        :return: a list of the collected results.  Entries come straight
            from ``Difference.from_operation`` and may be falsy
            (presumably ``None`` when there is no difference -- confirm
            against ``Difference``).
        """
        xs = []

        if PyPDF2 is None:
            self.add_comment(
                "Installing the 'PyPDF2' package may produce a better output."
            )
        else:
            difference = Difference.from_text(
                self.dump_pypdf2_metadata(self),
                self.dump_pypdf2_metadata(other),
                self.path,
                other.path,
            )
            if difference:
                difference.add_comment("Document info")
                xs.append(difference)

            difference = Difference.from_text(
                self.dump_pypdf2_annotations(self),
                self.dump_pypdf2_annotations(other),
                self.path,
                other.path,
            )
            if difference:
                difference.add_comment("Annotations")
                xs.append(difference)

        xs.append(Difference.from_operation(Pdftotext, self.path, other.path))

        # Don't include verbose dumppdf output unless we won't see any
        # differences without it.
        if not any(xs):
            xs.append(
                Difference.from_operation(Dumppdf, self.path, other.path)
            )

        return xs

    @staticmethod
    def dump_pypdf2_metadata(file):
        """Return the document info of *file* as sorted ``key: value`` lines.

        :param file: object whose ``path`` attribute names the PDF to read.
        :return: one ``key: repr(value)`` line per document-info entry,
            sorted, with leading ``/`` characters stripped from each key;
            an empty string when the reader reports no document info; or a
            ``(Could not extract metadata: ...)`` placeholder when PyPDF2
            raises ``PdfReadError``.
        """
        try:
            pdf = PyPDF2.PdfFileReader(file.path)
            document_info = pdf.getDocumentInfo()
        except PyPDF2.utils.PdfReadError as e:
            return f"(Could not extract metadata: {e})"

        if document_info is None:
            return ""

        return "\n".join(
            f"{k.lstrip('/')}: {v!r}" for k, v in sorted(document_info.items())
        )

    @staticmethod
    def dump_pypdf2_annotations(file):
        """Return the contents of the ``/Text`` annotations of *file*.

        :param file: object whose ``path`` attribute names the PDF to read.
        :return: the ``/Contents`` of every annotation whose ``/Subtype``
            is ``/Text``, in page order, joined by newlines; or a
            ``(Could not open file: ...)`` placeholder when PyPDF2 raises
            ``PdfReadError`` while opening the file or counting its pages.
        """
        try:
            pdf = PyPDF2.PdfFileReader(file.path)
            # Counting pages is kept under the same guard so that a read
            # error raised here is reported instead of propagating.
            num_pages = pdf.getNumPages()
        except PyPDF2.utils.PdfReadError as e:
            return f"(Could not open file: {e})"

        xs = []
        for x in range(num_pages):
            page = pdf.getPage(x)

            try:
                for annot in page["/Annots"]:
                    obj = annot.getObject()
                    if obj["/Subtype"] == "/Text":
                        xs.append(obj["/Contents"])
            except Exception:
                # Best effort: a page without "/Annots", or with entries we
                # cannot read, is skipped (annotations already collected
                # from it are kept).  Catching Exception rather than using
                # a bare "except" lets KeyboardInterrupt and SystemExit
                # propagate.
                continue

        return "\n".join(xs)