1 //========================================================================
2 //
3 // pdfunite.cc
4 //
5 // This file is licensed under the GPLv2 or later
6 //
7 // Copyright (C) 2011-2015, 2017 Thomas Freitag <Thomas.Freitag@alfa.de>
8 // Copyright (C) 2012 Arseny Solokha <asolokha@gmx.com>
9 // Copyright (C) 2012 Fabio D'Urso <fabiodurso@hotmail.it>
10 // Copyright (C) 2012, 2014, 2017-2019, 2021 Albert Astals Cid <aacid@kde.org>
11 // Copyright (C) 2013 Adrian Johnson <ajohnson@redneon.com>
12 // Copyright (C) 2013 Hib Eris <hib@hiberis.nl>
13 // Copyright (C) 2015 Arthur Stavisky <vovodroid@gmail.com>
14 // Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich
15 // Copyright (C) 2018 Adam Reichold <adam.reichold@t-online.de>
16 // Copyright (C) 2019 Marek Kasik <mkasik@redhat.com>
17 // Copyright (C) 2019 Oliver Sander <oliver.sander@tu-dresden.de>
18 //
19 //========================================================================
20 
21 #include <PDFDoc.h>
22 #include <GlobalParams.h>
23 #include "parseargs.h"
24 #include "config.h"
25 #include <poppler-config.h>
26 #include <vector>
27 
28 static bool printVersion = false;
29 static bool printHelp = false;
30 
31 static const ArgDesc argDesc[] = { { "-v", argFlag, &printVersion, 0, "print copyright and version info" }, { "-h", argFlag, &printHelp, 0, "print usage information" }, { "-help", argFlag, &printHelp, 0, "print usage information" },
32                                    { "--help", argFlag, &printHelp, 0, "print usage information" },         { "-?", argFlag, &printHelp, 0, "print usage information" }, {} };
33 
doMergeNameTree(PDFDoc * doc,XRef * srcXRef,XRef * countRef,int oldRefNum,int newRefNum,Dict * srcNameTree,Dict * mergeNameTree,int numOffset)34 static void doMergeNameTree(PDFDoc *doc, XRef *srcXRef, XRef *countRef, int oldRefNum, int newRefNum, Dict *srcNameTree, Dict *mergeNameTree, int numOffset)
35 {
36     Object mergeNameArray = mergeNameTree->lookup("Names");
37     Object srcNameArray = srcNameTree->lookup("Names");
38     if (mergeNameArray.isArray() && srcNameArray.isArray()) {
39         Array *newNameArray = new Array(srcXRef);
40         int j = 0;
41         for (int i = 0; i < srcNameArray.arrayGetLength() - 1; i += 2) {
42             const Object &key = srcNameArray.arrayGetNF(i);
43             const Object &value = srcNameArray.arrayGetNF(i + 1);
44             if (key.isString() && value.isRef()) {
45                 while (j < mergeNameArray.arrayGetLength() - 1) {
46                     const Object &mkey = mergeNameArray.arrayGetNF(j);
47                     const Object &mvalue = mergeNameArray.arrayGetNF(j + 1);
48                     if (mkey.isString() && mvalue.isRef()) {
49                         if (mkey.getString()->cmp(key.getString()) < 0) {
50                             newNameArray->add(Object(new GooString(mkey.getString()->c_str())));
51                             newNameArray->add(Object({ mvalue.getRef().num + numOffset, mvalue.getRef().gen }));
52                             j += 2;
53                         } else if (mkey.getString()->cmp(key.getString()) == 0) {
54                             j += 2;
55                         } else {
56                             break;
57                         }
58                     } else {
59                         j += 2;
60                     }
61                 }
62                 newNameArray->add(Object(new GooString(key.getString()->c_str())));
63                 newNameArray->add(Object(value.getRef()));
64             }
65         }
66         while (j < mergeNameArray.arrayGetLength() - 1) {
67             const Object &mkey = mergeNameArray.arrayGetNF(j);
68             const Object &mvalue = mergeNameArray.arrayGetNF(j + 1);
69             if (mkey.isString() && mvalue.isRef()) {
70                 newNameArray->add(Object(new GooString(mkey.getString()->c_str())));
71                 newNameArray->add(Object({ mvalue.getRef().num + numOffset, mvalue.getRef().gen }));
72             }
73             j += 2;
74         }
75         srcNameTree->set("Names", Object(newNameArray));
76         doc->markPageObjects(mergeNameTree, srcXRef, countRef, numOffset, oldRefNum, newRefNum);
77     } else if (srcNameArray.isNull() && mergeNameArray.isArray()) {
78         Array *newNameArray = new Array(srcXRef);
79         for (int i = 0; i < mergeNameArray.arrayGetLength() - 1; i += 2) {
80             const Object &key = mergeNameArray.arrayGetNF(i);
81             const Object &value = mergeNameArray.arrayGetNF(i + 1);
82             if (key.isString() && value.isRef()) {
83                 newNameArray->add(Object(new GooString(key.getString()->c_str())));
84                 newNameArray->add(Object({ value.getRef().num + numOffset, value.getRef().gen }));
85             }
86         }
87         srcNameTree->add("Names", Object(newNameArray));
88         doc->markPageObjects(mergeNameTree, srcXRef, countRef, numOffset, oldRefNum, newRefNum);
89     }
90 }
91 
doMergeNameDict(PDFDoc * doc,XRef * srcXRef,XRef * countRef,int oldRefNum,int newRefNum,Dict * srcNameDict,Dict * mergeNameDict,int numOffset)92 static void doMergeNameDict(PDFDoc *doc, XRef *srcXRef, XRef *countRef, int oldRefNum, int newRefNum, Dict *srcNameDict, Dict *mergeNameDict, int numOffset)
93 {
94     for (int i = 0; i < mergeNameDict->getLength(); i++) {
95         const char *key = mergeNameDict->getKey(i);
96         Object mergeNameTree = mergeNameDict->lookup(key);
97         Object srcNameTree = srcNameDict->lookup(key);
98         if (srcNameTree.isDict() && mergeNameTree.isDict()) {
99             doMergeNameTree(doc, srcXRef, countRef, oldRefNum, newRefNum, srcNameTree.getDict(), mergeNameTree.getDict(), numOffset);
100         } else if (srcNameTree.isNull() && mergeNameTree.isDict()) {
101             Object newNameTree(new Dict(srcXRef));
102             doMergeNameTree(doc, srcXRef, countRef, oldRefNum, newRefNum, newNameTree.getDict(), mergeNameTree.getDict(), numOffset);
103             srcNameDict->add(key, std::move(newNameTree));
104         }
105     }
106 }
107 
doMergeFormDict(Dict * srcFormDict,Dict * mergeFormDict,int numOffset)108 static void doMergeFormDict(Dict *srcFormDict, Dict *mergeFormDict, int numOffset)
109 {
110     Object srcFields = srcFormDict->lookup("Fields");
111     Object mergeFields = mergeFormDict->lookup("Fields");
112     if (srcFields.isArray() && mergeFields.isArray()) {
113         for (int i = 0; i < mergeFields.arrayGetLength(); i++) {
114             const Object &value = mergeFields.arrayGetNF(i);
115             srcFields.arrayAdd(Object({ value.getRef().num + numOffset, value.getRef().gen }));
116         }
117     }
118 }
119 
120 ///////////////////////////////////////////////////////////////////////////
main(int argc,char * argv[])121 int main(int argc, char *argv[])
122 ///////////////////////////////////////////////////////////////////////////
123 // Merge PDF files given by arguments 1 to argc-2 and write the result
124 // to the file specified by argument argc-1.
125 ///////////////////////////////////////////////////////////////////////////
126 {
127     int objectsCount = 0;
128     unsigned int numOffset = 0;
129     std::vector<Object> pages;
130     std::vector<unsigned int> offsets;
131     XRef *yRef, *countRef;
132     FILE *f;
133     OutStream *outStr;
134     int i;
135     int j, rootNum;
136     std::vector<PDFDoc *> docs;
137     int majorVersion = 0;
138     int minorVersion = 0;
139     char *fileName = argv[argc - 1];
140 
141     const bool ok = parseArgs(argDesc, &argc, argv);
142     if (!ok || argc < 3 || printVersion || printHelp) {
143         fprintf(stderr, "pdfunite version %s\n", PACKAGE_VERSION);
144         fprintf(stderr, "%s\n", popplerCopyright);
145         fprintf(stderr, "%s\n", xpdfCopyright);
146         if (!printVersion) {
147             printUsage("pdfunite", "<PDF-sourcefile-1>..<PDF-sourcefile-n> <PDF-destfile>", argDesc);
148         }
149         if (printVersion || printHelp) {
150             return 0;
151         }
152         return 99;
153     }
154     globalParams = std::make_unique<GlobalParams>();
155 
156     for (i = 1; i < argc - 1; i++) {
157         GooString *gfileName = new GooString(argv[i]);
158         PDFDoc *doc = new PDFDoc(gfileName, nullptr, nullptr, nullptr);
159         if (doc->isOk() && !doc->isEncrypted() && doc->getXRef()->getCatalog().isDict()) {
160             docs.push_back(doc);
161             if (doc->getPDFMajorVersion() > majorVersion) {
162                 majorVersion = doc->getPDFMajorVersion();
163                 minorVersion = doc->getPDFMinorVersion();
164             } else if (doc->getPDFMajorVersion() == majorVersion) {
165                 if (doc->getPDFMinorVersion() > minorVersion) {
166                     minorVersion = doc->getPDFMinorVersion();
167                 }
168             }
169         } else if (doc->isOk()) {
170             if (doc->isEncrypted()) {
171                 error(errUnimplemented, -1, "Could not merge encrypted files ('{0:s}')", argv[i]);
172                 return -1;
173             } else if (!doc->getXRef()->getCatalog().isDict()) {
174                 error(errSyntaxError, -1, "XRef's Catalog is not a dictionary ('{0:s}')", argv[i]);
175                 return -1;
176             }
177         } else {
178             error(errSyntaxError, -1, "Could not merge damaged documents ('{0:s}')", argv[i]);
179             return -1;
180         }
181     }
182 
183     if (!(f = fopen(fileName, "wb"))) {
184         error(errIO, -1, "Could not open file '{0:s}'", fileName);
185         return -1;
186     }
187     outStr = new FileOutStream(f, 0);
188 
189     yRef = new XRef();
190     countRef = new XRef();
191     yRef->add(0, 65535, 0, false);
192     PDFDoc::writeHeader(outStr, majorVersion, minorVersion);
193 
194     // handle OutputIntents, AcroForm, OCProperties & Names
195     Object intents;
196     Object names;
197     Object afObj;
198     Object ocObj;
199     if (docs.size() >= 1) {
200         Object catObj = docs[0]->getXRef()->getCatalog();
201         Dict *catDict = catObj.getDict();
202         intents = catDict->lookup("OutputIntents");
203         afObj = catDict->lookupNF("AcroForm").copy();
204         Ref *refPage = docs[0]->getCatalog()->getPageRef(1);
205         if (!afObj.isNull() && refPage) {
206             docs[0]->markAcroForm(&afObj, yRef, countRef, 0, refPage->num, refPage->num);
207         }
208         ocObj = catDict->lookupNF("OCProperties").copy();
209         if (!ocObj.isNull() && ocObj.isDict() && refPage) {
210             docs[0]->markPageObjects(ocObj.getDict(), yRef, countRef, 0, refPage->num, refPage->num);
211         }
212         names = catDict->lookup("Names");
213         if (!names.isNull() && names.isDict() && refPage) {
214             docs[0]->markPageObjects(names.getDict(), yRef, countRef, 0, refPage->num, refPage->num);
215         }
216         if (intents.isArray() && intents.arrayGetLength() > 0) {
217             for (i = 1; i < (int)docs.size(); i++) {
218                 Object pagecatObj = docs[i]->getXRef()->getCatalog();
219                 Dict *pagecatDict = pagecatObj.getDict();
220                 Object pageintents = pagecatDict->lookup("OutputIntents");
221                 if (pageintents.isArray() && pageintents.arrayGetLength() > 0) {
222                     for (j = intents.arrayGetLength() - 1; j >= 0; j--) {
223                         Object intent = intents.arrayGet(j, 0);
224                         if (intent.isDict()) {
225                             Object idf = intent.dictLookup("OutputConditionIdentifier");
226                             if (idf.isString()) {
227                                 const GooString *gidf = idf.getString();
228                                 bool removeIntent = true;
229                                 for (int k = 0; k < pageintents.arrayGetLength(); k++) {
230                                     Object pgintent = pageintents.arrayGet(k, 0);
231                                     if (pgintent.isDict()) {
232                                         Object pgidf = pgintent.dictLookup("OutputConditionIdentifier");
233                                         if (pgidf.isString()) {
234                                             const GooString *gpgidf = pgidf.getString();
235                                             if (gpgidf->cmp(gidf) == 0) {
236                                                 removeIntent = false;
237                                                 break;
238                                             }
239                                         }
240                                     }
241                                 }
242                                 if (removeIntent) {
243                                     intents.arrayRemove(j);
244                                     error(errSyntaxWarning, -1, "Output intent {0:s} missing in pdf {1:s}, removed", gidf->c_str(), docs[i]->getFileName()->c_str());
245                                 }
246                             } else {
247                                 intents.arrayRemove(j);
248                                 error(errSyntaxWarning, -1, "Invalid output intent dict, missing required OutputConditionIdentifier");
249                             }
250                         } else {
251                             intents.arrayRemove(j);
252                         }
253                     }
254                 } else {
255                     error(errSyntaxWarning, -1, "Output intents differs, remove them all");
256                     break;
257                 }
258             }
259         }
260         if (intents.isArray() && intents.arrayGetLength() > 0) {
261             for (j = intents.arrayGetLength() - 1; j >= 0; j--) {
262                 Object intent = intents.arrayGet(j, 0);
263                 if (intent.isDict()) {
264                     docs[0]->markPageObjects(intent.getDict(), yRef, countRef, numOffset, 0, 0);
265                 } else {
266                     intents.arrayRemove(j);
267                 }
268             }
269         }
270     }
271 
272     for (i = 0; i < (int)docs.size(); i++) {
273         for (j = 1; j <= docs[i]->getNumPages(); j++) {
274             if (!docs[i]->getCatalog()->getPage(j)) {
275                 continue;
276             }
277 
278             const PDFRectangle *cropBox = nullptr;
279             if (docs[i]->getCatalog()->getPage(j)->isCropped())
280                 cropBox = docs[i]->getCatalog()->getPage(j)->getCropBox();
281             docs[i]->replacePageDict(j, docs[i]->getCatalog()->getPage(j)->getRotate(), docs[i]->getCatalog()->getPage(j)->getMediaBox(), cropBox);
282             Ref *refPage = docs[i]->getCatalog()->getPageRef(j);
283             Object page = docs[i]->getXRef()->fetch(*refPage);
284             Dict *pageDict = page.getDict();
285             Object *resDict = docs[i]->getCatalog()->getPage(j)->getResourceDictObject();
286             if (resDict->isDict()) {
287                 pageDict->set("Resources", resDict->copy());
288             }
289             pages.push_back(std::move(page));
290             offsets.push_back(numOffset);
291             docs[i]->markPageObjects(pageDict, yRef, countRef, numOffset, refPage->num, refPage->num);
292             Object annotsObj = pageDict->lookupNF("Annots").copy();
293             if (!annotsObj.isNull()) {
294                 docs[i]->markAnnotations(&annotsObj, yRef, countRef, numOffset, refPage->num, refPage->num);
295             }
296         }
297         Object pageCatObj = docs[i]->getXRef()->getCatalog();
298         Dict *pageCatDict = pageCatObj.getDict();
299         Object pageNames = pageCatDict->lookup("Names");
300         if (!pageNames.isNull() && pageNames.isDict()) {
301             if (!names.isDict()) {
302                 names = Object(new Dict(yRef));
303             }
304             doMergeNameDict(docs[i], yRef, countRef, 0, 0, names.getDict(), pageNames.getDict(), numOffset);
305         }
306         Object pageForm = pageCatDict->lookup("AcroForm");
307         if (i > 0 && !pageForm.isNull() && pageForm.isDict()) {
308             if (afObj.isNull()) {
309                 afObj = pageCatDict->lookupNF("AcroForm").copy();
310             } else if (afObj.isDict()) {
311                 doMergeFormDict(afObj.getDict(), pageForm.getDict(), numOffset);
312             }
313         }
314         objectsCount += docs[i]->writePageObjects(outStr, yRef, numOffset, true);
315         numOffset = yRef->getNumObjects() + 1;
316     }
317 
318     rootNum = yRef->getNumObjects() + 1;
319     yRef->add(rootNum, 0, outStr->getPos(), true);
320     outStr->printf("%d 0 obj\n", rootNum);
321     outStr->printf("<< /Type /Catalog /Pages %d 0 R", rootNum + 1);
322     // insert OutputIntents
323     if (intents.isArray() && intents.arrayGetLength() > 0) {
324         outStr->printf(" /OutputIntents [");
325         for (j = 0; j < intents.arrayGetLength(); j++) {
326             Object intent = intents.arrayGet(j, 0);
327             if (intent.isDict()) {
328                 PDFDoc::writeObject(&intent, outStr, yRef, 0, nullptr, cryptRC4, 0, 0, 0);
329             }
330         }
331         outStr->printf("]");
332     }
333     // insert AcroForm
334     if (!afObj.isNull()) {
335         outStr->printf(" /AcroForm ");
336         PDFDoc::writeObject(&afObj, outStr, yRef, 0, nullptr, cryptRC4, 0, 0, 0);
337     }
338     // insert OCProperties
339     if (!ocObj.isNull() && ocObj.isDict()) {
340         outStr->printf(" /OCProperties ");
341         PDFDoc::writeObject(&ocObj, outStr, yRef, 0, nullptr, cryptRC4, 0, 0, 0);
342     }
343     // insert Names
344     if (!names.isNull() && names.isDict()) {
345         outStr->printf(" /Names ");
346         PDFDoc::writeObject(&names, outStr, yRef, 0, nullptr, cryptRC4, 0, 0, 0);
347     }
348     outStr->printf(">>\nendobj\n");
349     objectsCount++;
350 
351     yRef->add(rootNum + 1, 0, outStr->getPos(), true);
352     outStr->printf("%d 0 obj\n", rootNum + 1);
353     outStr->printf("<< /Type /Pages /Kids [");
354     for (j = 0; j < (int)pages.size(); j++)
355         outStr->printf(" %d 0 R", rootNum + j + 2);
356     outStr->printf(" ] /Count %zd >>\nendobj\n", pages.size());
357     objectsCount++;
358 
359     for (i = 0; i < (int)pages.size(); i++) {
360         yRef->add(rootNum + i + 2, 0, outStr->getPos(), true);
361         outStr->printf("%d 0 obj\n", rootNum + i + 2);
362         outStr->printf("<< ");
363         Dict *pageDict = pages[i].getDict();
364         for (j = 0; j < pageDict->getLength(); j++) {
365             if (j > 0)
366                 outStr->printf(" ");
367             const char *key = pageDict->getKey(j);
368             Object value = pageDict->getValNF(j).copy();
369             if (strcmp(key, "Parent") == 0) {
370                 outStr->printf("/Parent %d 0 R", rootNum + 1);
371             } else {
372                 outStr->printf("/%s ", key);
373                 PDFDoc::writeObject(&value, outStr, yRef, offsets[i], nullptr, cryptRC4, 0, 0, 0);
374             }
375         }
376         outStr->printf(" >>\nendobj\n");
377         objectsCount++;
378     }
379     Goffset uxrefOffset = outStr->getPos();
380     Ref ref;
381     ref.num = rootNum;
382     ref.gen = 0;
383     Object trailerDict = PDFDoc::createTrailerDict(objectsCount, false, 0, &ref, yRef, fileName, outStr->getPos());
384     PDFDoc::writeXRefTableTrailer(std::move(trailerDict), yRef, true, // write all entries according to ISO 32000-1, 7.5.4 Cross-Reference Table: "For a file that has never been incrementally updated, the cross-reference section shall
385                                                                       // contain only one subsection, whose object numbering begins at 0."
386                                   uxrefOffset, outStr, yRef);
387 
388     outStr->close();
389     delete outStr;
390     fclose(f);
391     delete yRef;
392     delete countRef;
393     for (i = 0; i < (int)docs.size(); i++)
394         delete docs[i];
395     return 0;
396 }
397