1 //========================================================================
2 //
3 // pdfunite.cc
4 //
5 // This file is licensed under the GPLv2 or later
6 //
7 // Copyright (C) 2011-2015, 2017 Thomas Freitag <Thomas.Freitag@alfa.de>
8 // Copyright (C) 2012 Arseny Solokha <asolokha@gmx.com>
9 // Copyright (C) 2012 Fabio D'Urso <fabiodurso@hotmail.it>
10 // Copyright (C) 2012, 2014, 2017-2019, 2021 Albert Astals Cid <aacid@kde.org>
11 // Copyright (C) 2013 Adrian Johnson <ajohnson@redneon.com>
12 // Copyright (C) 2013 Hib Eris <hib@hiberis.nl>
13 // Copyright (C) 2015 Arthur Stavisky <vovodroid@gmail.com>
14 // Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich
15 // Copyright (C) 2018 Adam Reichold <adam.reichold@t-online.de>
16 // Copyright (C) 2019 Marek Kasik <mkasik@redhat.com>
17 // Copyright (C) 2019 Oliver Sander <oliver.sander@tu-dresden.de>
18 //
19 //========================================================================
20
21 #include <PDFDoc.h>
22 #include <GlobalParams.h>
23 #include "parseargs.h"
24 #include "config.h"
25 #include <poppler-config.h>
26 #include <vector>
27
28 static bool printVersion = false;
29 static bool printHelp = false;
30
31 static const ArgDesc argDesc[] = { { "-v", argFlag, &printVersion, 0, "print copyright and version info" }, { "-h", argFlag, &printHelp, 0, "print usage information" }, { "-help", argFlag, &printHelp, 0, "print usage information" },
32 { "--help", argFlag, &printHelp, 0, "print usage information" }, { "-?", argFlag, &printHelp, 0, "print usage information" }, {} };
33
doMergeNameTree(PDFDoc * doc,XRef * srcXRef,XRef * countRef,int oldRefNum,int newRefNum,Dict * srcNameTree,Dict * mergeNameTree,int numOffset)34 static void doMergeNameTree(PDFDoc *doc, XRef *srcXRef, XRef *countRef, int oldRefNum, int newRefNum, Dict *srcNameTree, Dict *mergeNameTree, int numOffset)
35 {
36 Object mergeNameArray = mergeNameTree->lookup("Names");
37 Object srcNameArray = srcNameTree->lookup("Names");
38 if (mergeNameArray.isArray() && srcNameArray.isArray()) {
39 Array *newNameArray = new Array(srcXRef);
40 int j = 0;
41 for (int i = 0; i < srcNameArray.arrayGetLength() - 1; i += 2) {
42 const Object &key = srcNameArray.arrayGetNF(i);
43 const Object &value = srcNameArray.arrayGetNF(i + 1);
44 if (key.isString() && value.isRef()) {
45 while (j < mergeNameArray.arrayGetLength() - 1) {
46 const Object &mkey = mergeNameArray.arrayGetNF(j);
47 const Object &mvalue = mergeNameArray.arrayGetNF(j + 1);
48 if (mkey.isString() && mvalue.isRef()) {
49 if (mkey.getString()->cmp(key.getString()) < 0) {
50 newNameArray->add(Object(new GooString(mkey.getString()->c_str())));
51 newNameArray->add(Object({ mvalue.getRef().num + numOffset, mvalue.getRef().gen }));
52 j += 2;
53 } else if (mkey.getString()->cmp(key.getString()) == 0) {
54 j += 2;
55 } else {
56 break;
57 }
58 } else {
59 j += 2;
60 }
61 }
62 newNameArray->add(Object(new GooString(key.getString()->c_str())));
63 newNameArray->add(Object(value.getRef()));
64 }
65 }
66 while (j < mergeNameArray.arrayGetLength() - 1) {
67 const Object &mkey = mergeNameArray.arrayGetNF(j);
68 const Object &mvalue = mergeNameArray.arrayGetNF(j + 1);
69 if (mkey.isString() && mvalue.isRef()) {
70 newNameArray->add(Object(new GooString(mkey.getString()->c_str())));
71 newNameArray->add(Object({ mvalue.getRef().num + numOffset, mvalue.getRef().gen }));
72 }
73 j += 2;
74 }
75 srcNameTree->set("Names", Object(newNameArray));
76 doc->markPageObjects(mergeNameTree, srcXRef, countRef, numOffset, oldRefNum, newRefNum);
77 } else if (srcNameArray.isNull() && mergeNameArray.isArray()) {
78 Array *newNameArray = new Array(srcXRef);
79 for (int i = 0; i < mergeNameArray.arrayGetLength() - 1; i += 2) {
80 const Object &key = mergeNameArray.arrayGetNF(i);
81 const Object &value = mergeNameArray.arrayGetNF(i + 1);
82 if (key.isString() && value.isRef()) {
83 newNameArray->add(Object(new GooString(key.getString()->c_str())));
84 newNameArray->add(Object({ value.getRef().num + numOffset, value.getRef().gen }));
85 }
86 }
87 srcNameTree->add("Names", Object(newNameArray));
88 doc->markPageObjects(mergeNameTree, srcXRef, countRef, numOffset, oldRefNum, newRefNum);
89 }
90 }
91
doMergeNameDict(PDFDoc * doc,XRef * srcXRef,XRef * countRef,int oldRefNum,int newRefNum,Dict * srcNameDict,Dict * mergeNameDict,int numOffset)92 static void doMergeNameDict(PDFDoc *doc, XRef *srcXRef, XRef *countRef, int oldRefNum, int newRefNum, Dict *srcNameDict, Dict *mergeNameDict, int numOffset)
93 {
94 for (int i = 0; i < mergeNameDict->getLength(); i++) {
95 const char *key = mergeNameDict->getKey(i);
96 Object mergeNameTree = mergeNameDict->lookup(key);
97 Object srcNameTree = srcNameDict->lookup(key);
98 if (srcNameTree.isDict() && mergeNameTree.isDict()) {
99 doMergeNameTree(doc, srcXRef, countRef, oldRefNum, newRefNum, srcNameTree.getDict(), mergeNameTree.getDict(), numOffset);
100 } else if (srcNameTree.isNull() && mergeNameTree.isDict()) {
101 Object newNameTree(new Dict(srcXRef));
102 doMergeNameTree(doc, srcXRef, countRef, oldRefNum, newRefNum, newNameTree.getDict(), mergeNameTree.getDict(), numOffset);
103 srcNameDict->add(key, std::move(newNameTree));
104 }
105 }
106 }
107
doMergeFormDict(Dict * srcFormDict,Dict * mergeFormDict,int numOffset)108 static void doMergeFormDict(Dict *srcFormDict, Dict *mergeFormDict, int numOffset)
109 {
110 Object srcFields = srcFormDict->lookup("Fields");
111 Object mergeFields = mergeFormDict->lookup("Fields");
112 if (srcFields.isArray() && mergeFields.isArray()) {
113 for (int i = 0; i < mergeFields.arrayGetLength(); i++) {
114 const Object &value = mergeFields.arrayGetNF(i);
115 srcFields.arrayAdd(Object({ value.getRef().num + numOffset, value.getRef().gen }));
116 }
117 }
118 }
119
120 ///////////////////////////////////////////////////////////////////////////
main(int argc,char * argv[])121 int main(int argc, char *argv[])
122 ///////////////////////////////////////////////////////////////////////////
123 // Merge PDF files given by arguments 1 to argc-2 and write the result
124 // to the file specified by argument argc-1.
125 ///////////////////////////////////////////////////////////////////////////
126 {
127 int objectsCount = 0;
128 unsigned int numOffset = 0;
129 std::vector<Object> pages;
130 std::vector<unsigned int> offsets;
131 XRef *yRef, *countRef;
132 FILE *f;
133 OutStream *outStr;
134 int i;
135 int j, rootNum;
136 std::vector<PDFDoc *> docs;
137 int majorVersion = 0;
138 int minorVersion = 0;
139 char *fileName = argv[argc - 1];
140
141 const bool ok = parseArgs(argDesc, &argc, argv);
142 if (!ok || argc < 3 || printVersion || printHelp) {
143 fprintf(stderr, "pdfunite version %s\n", PACKAGE_VERSION);
144 fprintf(stderr, "%s\n", popplerCopyright);
145 fprintf(stderr, "%s\n", xpdfCopyright);
146 if (!printVersion) {
147 printUsage("pdfunite", "<PDF-sourcefile-1>..<PDF-sourcefile-n> <PDF-destfile>", argDesc);
148 }
149 if (printVersion || printHelp) {
150 return 0;
151 }
152 return 99;
153 }
154 globalParams = std::make_unique<GlobalParams>();
155
156 for (i = 1; i < argc - 1; i++) {
157 GooString *gfileName = new GooString(argv[i]);
158 PDFDoc *doc = new PDFDoc(gfileName, nullptr, nullptr, nullptr);
159 if (doc->isOk() && !doc->isEncrypted() && doc->getXRef()->getCatalog().isDict()) {
160 docs.push_back(doc);
161 if (doc->getPDFMajorVersion() > majorVersion) {
162 majorVersion = doc->getPDFMajorVersion();
163 minorVersion = doc->getPDFMinorVersion();
164 } else if (doc->getPDFMajorVersion() == majorVersion) {
165 if (doc->getPDFMinorVersion() > minorVersion) {
166 minorVersion = doc->getPDFMinorVersion();
167 }
168 }
169 } else if (doc->isOk()) {
170 if (doc->isEncrypted()) {
171 error(errUnimplemented, -1, "Could not merge encrypted files ('{0:s}')", argv[i]);
172 return -1;
173 } else if (!doc->getXRef()->getCatalog().isDict()) {
174 error(errSyntaxError, -1, "XRef's Catalog is not a dictionary ('{0:s}')", argv[i]);
175 return -1;
176 }
177 } else {
178 error(errSyntaxError, -1, "Could not merge damaged documents ('{0:s}')", argv[i]);
179 return -1;
180 }
181 }
182
183 if (!(f = fopen(fileName, "wb"))) {
184 error(errIO, -1, "Could not open file '{0:s}'", fileName);
185 return -1;
186 }
187 outStr = new FileOutStream(f, 0);
188
189 yRef = new XRef();
190 countRef = new XRef();
191 yRef->add(0, 65535, 0, false);
192 PDFDoc::writeHeader(outStr, majorVersion, minorVersion);
193
194 // handle OutputIntents, AcroForm, OCProperties & Names
195 Object intents;
196 Object names;
197 Object afObj;
198 Object ocObj;
199 if (docs.size() >= 1) {
200 Object catObj = docs[0]->getXRef()->getCatalog();
201 Dict *catDict = catObj.getDict();
202 intents = catDict->lookup("OutputIntents");
203 afObj = catDict->lookupNF("AcroForm").copy();
204 Ref *refPage = docs[0]->getCatalog()->getPageRef(1);
205 if (!afObj.isNull() && refPage) {
206 docs[0]->markAcroForm(&afObj, yRef, countRef, 0, refPage->num, refPage->num);
207 }
208 ocObj = catDict->lookupNF("OCProperties").copy();
209 if (!ocObj.isNull() && ocObj.isDict() && refPage) {
210 docs[0]->markPageObjects(ocObj.getDict(), yRef, countRef, 0, refPage->num, refPage->num);
211 }
212 names = catDict->lookup("Names");
213 if (!names.isNull() && names.isDict() && refPage) {
214 docs[0]->markPageObjects(names.getDict(), yRef, countRef, 0, refPage->num, refPage->num);
215 }
216 if (intents.isArray() && intents.arrayGetLength() > 0) {
217 for (i = 1; i < (int)docs.size(); i++) {
218 Object pagecatObj = docs[i]->getXRef()->getCatalog();
219 Dict *pagecatDict = pagecatObj.getDict();
220 Object pageintents = pagecatDict->lookup("OutputIntents");
221 if (pageintents.isArray() && pageintents.arrayGetLength() > 0) {
222 for (j = intents.arrayGetLength() - 1; j >= 0; j--) {
223 Object intent = intents.arrayGet(j, 0);
224 if (intent.isDict()) {
225 Object idf = intent.dictLookup("OutputConditionIdentifier");
226 if (idf.isString()) {
227 const GooString *gidf = idf.getString();
228 bool removeIntent = true;
229 for (int k = 0; k < pageintents.arrayGetLength(); k++) {
230 Object pgintent = pageintents.arrayGet(k, 0);
231 if (pgintent.isDict()) {
232 Object pgidf = pgintent.dictLookup("OutputConditionIdentifier");
233 if (pgidf.isString()) {
234 const GooString *gpgidf = pgidf.getString();
235 if (gpgidf->cmp(gidf) == 0) {
236 removeIntent = false;
237 break;
238 }
239 }
240 }
241 }
242 if (removeIntent) {
243 intents.arrayRemove(j);
244 error(errSyntaxWarning, -1, "Output intent {0:s} missing in pdf {1:s}, removed", gidf->c_str(), docs[i]->getFileName()->c_str());
245 }
246 } else {
247 intents.arrayRemove(j);
248 error(errSyntaxWarning, -1, "Invalid output intent dict, missing required OutputConditionIdentifier");
249 }
250 } else {
251 intents.arrayRemove(j);
252 }
253 }
254 } else {
255 error(errSyntaxWarning, -1, "Output intents differs, remove them all");
256 break;
257 }
258 }
259 }
260 if (intents.isArray() && intents.arrayGetLength() > 0) {
261 for (j = intents.arrayGetLength() - 1; j >= 0; j--) {
262 Object intent = intents.arrayGet(j, 0);
263 if (intent.isDict()) {
264 docs[0]->markPageObjects(intent.getDict(), yRef, countRef, numOffset, 0, 0);
265 } else {
266 intents.arrayRemove(j);
267 }
268 }
269 }
270 }
271
272 for (i = 0; i < (int)docs.size(); i++) {
273 for (j = 1; j <= docs[i]->getNumPages(); j++) {
274 if (!docs[i]->getCatalog()->getPage(j)) {
275 continue;
276 }
277
278 const PDFRectangle *cropBox = nullptr;
279 if (docs[i]->getCatalog()->getPage(j)->isCropped())
280 cropBox = docs[i]->getCatalog()->getPage(j)->getCropBox();
281 docs[i]->replacePageDict(j, docs[i]->getCatalog()->getPage(j)->getRotate(), docs[i]->getCatalog()->getPage(j)->getMediaBox(), cropBox);
282 Ref *refPage = docs[i]->getCatalog()->getPageRef(j);
283 Object page = docs[i]->getXRef()->fetch(*refPage);
284 Dict *pageDict = page.getDict();
285 Object *resDict = docs[i]->getCatalog()->getPage(j)->getResourceDictObject();
286 if (resDict->isDict()) {
287 pageDict->set("Resources", resDict->copy());
288 }
289 pages.push_back(std::move(page));
290 offsets.push_back(numOffset);
291 docs[i]->markPageObjects(pageDict, yRef, countRef, numOffset, refPage->num, refPage->num);
292 Object annotsObj = pageDict->lookupNF("Annots").copy();
293 if (!annotsObj.isNull()) {
294 docs[i]->markAnnotations(&annotsObj, yRef, countRef, numOffset, refPage->num, refPage->num);
295 }
296 }
297 Object pageCatObj = docs[i]->getXRef()->getCatalog();
298 Dict *pageCatDict = pageCatObj.getDict();
299 Object pageNames = pageCatDict->lookup("Names");
300 if (!pageNames.isNull() && pageNames.isDict()) {
301 if (!names.isDict()) {
302 names = Object(new Dict(yRef));
303 }
304 doMergeNameDict(docs[i], yRef, countRef, 0, 0, names.getDict(), pageNames.getDict(), numOffset);
305 }
306 Object pageForm = pageCatDict->lookup("AcroForm");
307 if (i > 0 && !pageForm.isNull() && pageForm.isDict()) {
308 if (afObj.isNull()) {
309 afObj = pageCatDict->lookupNF("AcroForm").copy();
310 } else if (afObj.isDict()) {
311 doMergeFormDict(afObj.getDict(), pageForm.getDict(), numOffset);
312 }
313 }
314 objectsCount += docs[i]->writePageObjects(outStr, yRef, numOffset, true);
315 numOffset = yRef->getNumObjects() + 1;
316 }
317
318 rootNum = yRef->getNumObjects() + 1;
319 yRef->add(rootNum, 0, outStr->getPos(), true);
320 outStr->printf("%d 0 obj\n", rootNum);
321 outStr->printf("<< /Type /Catalog /Pages %d 0 R", rootNum + 1);
322 // insert OutputIntents
323 if (intents.isArray() && intents.arrayGetLength() > 0) {
324 outStr->printf(" /OutputIntents [");
325 for (j = 0; j < intents.arrayGetLength(); j++) {
326 Object intent = intents.arrayGet(j, 0);
327 if (intent.isDict()) {
328 PDFDoc::writeObject(&intent, outStr, yRef, 0, nullptr, cryptRC4, 0, 0, 0);
329 }
330 }
331 outStr->printf("]");
332 }
333 // insert AcroForm
334 if (!afObj.isNull()) {
335 outStr->printf(" /AcroForm ");
336 PDFDoc::writeObject(&afObj, outStr, yRef, 0, nullptr, cryptRC4, 0, 0, 0);
337 }
338 // insert OCProperties
339 if (!ocObj.isNull() && ocObj.isDict()) {
340 outStr->printf(" /OCProperties ");
341 PDFDoc::writeObject(&ocObj, outStr, yRef, 0, nullptr, cryptRC4, 0, 0, 0);
342 }
343 // insert Names
344 if (!names.isNull() && names.isDict()) {
345 outStr->printf(" /Names ");
346 PDFDoc::writeObject(&names, outStr, yRef, 0, nullptr, cryptRC4, 0, 0, 0);
347 }
348 outStr->printf(">>\nendobj\n");
349 objectsCount++;
350
351 yRef->add(rootNum + 1, 0, outStr->getPos(), true);
352 outStr->printf("%d 0 obj\n", rootNum + 1);
353 outStr->printf("<< /Type /Pages /Kids [");
354 for (j = 0; j < (int)pages.size(); j++)
355 outStr->printf(" %d 0 R", rootNum + j + 2);
356 outStr->printf(" ] /Count %zd >>\nendobj\n", pages.size());
357 objectsCount++;
358
359 for (i = 0; i < (int)pages.size(); i++) {
360 yRef->add(rootNum + i + 2, 0, outStr->getPos(), true);
361 outStr->printf("%d 0 obj\n", rootNum + i + 2);
362 outStr->printf("<< ");
363 Dict *pageDict = pages[i].getDict();
364 for (j = 0; j < pageDict->getLength(); j++) {
365 if (j > 0)
366 outStr->printf(" ");
367 const char *key = pageDict->getKey(j);
368 Object value = pageDict->getValNF(j).copy();
369 if (strcmp(key, "Parent") == 0) {
370 outStr->printf("/Parent %d 0 R", rootNum + 1);
371 } else {
372 outStr->printf("/%s ", key);
373 PDFDoc::writeObject(&value, outStr, yRef, offsets[i], nullptr, cryptRC4, 0, 0, 0);
374 }
375 }
376 outStr->printf(" >>\nendobj\n");
377 objectsCount++;
378 }
379 Goffset uxrefOffset = outStr->getPos();
380 Ref ref;
381 ref.num = rootNum;
382 ref.gen = 0;
383 Object trailerDict = PDFDoc::createTrailerDict(objectsCount, false, 0, &ref, yRef, fileName, outStr->getPos());
384 PDFDoc::writeXRefTableTrailer(std::move(trailerDict), yRef, true, // write all entries according to ISO 32000-1, 7.5.4 Cross-Reference Table: "For a file that has never been incrementally updated, the cross-reference section shall
385 // contain only one subsection, whose object numbering begins at 0."
386 uxrefOffset, outStr, yRef);
387
388 outStr->close();
389 delete outStr;
390 fclose(f);
391 delete yRef;
392 delete countRef;
393 for (i = 0; i < (int)docs.size(); i++)
394 delete docs[i];
395 return 0;
396 }
397