1 // See doc/linearization.
2
3 #include <qpdf/QPDF.hh>
4
5 #include <qpdf/QPDFExc.hh>
6 #include <qpdf/QTC.hh>
7 #include <qpdf/QUtil.hh>
8 #include <qpdf/Pl_Buffer.hh>
9 #include <qpdf/Pl_Flate.hh>
10 #include <qpdf/Pl_Count.hh>
11 #include <qpdf/BitWriter.hh>
12 #include <qpdf/BitStream.hh>
13
14 #include <iostream>
15 #include <algorithm>
16 #include <assert.h>
17 #include <math.h>
18 #include <string.h>
19
20 template <class T, class int_type>
21 static void
load_vector_int(BitStream & bit_stream,int nitems,std::vector<T> & vec,int bits_wanted,int_type T::* field)22 load_vector_int(BitStream& bit_stream, int nitems, std::vector<T>& vec,
23 int bits_wanted, int_type T::*field)
24 {
25 bool append = vec.empty();
26 // nitems times, read bits_wanted from the given bit stream,
27 // storing results in the ith vector entry.
28
29 for (size_t i = 0; i < QIntC::to_size(nitems); ++i)
30 {
31 if (append)
32 {
33 vec.push_back(T());
34 }
35 vec.at(i).*field = bit_stream.getBitsInt(QIntC::to_size(bits_wanted));
36 }
37 if (QIntC::to_int(vec.size()) != nitems)
38 {
39 throw std::logic_error("vector has wrong size in load_vector_int");
40 }
41 // The PDF spec says that each hint table starts at a byte
42 // boundary. Each "row" actually must start on a byte boundary.
43 bit_stream.skipToNextByte();
44 }
45
46 template <class T>
47 static void
load_vector_vector(BitStream & bit_stream,int nitems1,std::vector<T> & vec1,int T::* nitems2,int bits_wanted,std::vector<int> T::* vec2)48 load_vector_vector(BitStream& bit_stream,
49 int nitems1, std::vector<T>& vec1, int T::*nitems2,
50 int bits_wanted, std::vector<int> T::*vec2)
51 {
52 // nitems1 times, read nitems2 (from the ith element of vec1) items
53 // into the vec2 vector field of the ith item of vec1.
54 for (size_t i1 = 0; i1 < QIntC::to_size(nitems1); ++i1)
55 {
56 for (int i2 = 0; i2 < vec1.at(i1).*nitems2; ++i2)
57 {
58 (vec1.at(i1).*vec2).push_back(
59 bit_stream.getBitsInt(QIntC::to_size(bits_wanted)));
60 }
61 }
62 bit_stream.skipToNextByte();
63 }
64
65 bool
checkLinearization()66 QPDF::checkLinearization()
67 {
68 bool result = false;
69 try
70 {
71 readLinearizationData();
72 result = checkLinearizationInternal();
73 }
74 catch (std::runtime_error& e)
75 {
76 *this->m->err_stream
77 << "WARNING: error encountered while checking linearization data: "
78 << e.what() << std::endl;
79 }
80 return result;
81 }
82
83 bool
isLinearized()84 QPDF::isLinearized()
85 {
86 // If the first object in the file is a dictionary with a suitable
87 // /Linearized key and has an /L key that accurately indicates the
88 // file size, initialize this->m->lindict and return true.
89
90 // A linearized PDF spec's first object will be contained within
91 // the first 1024 bytes of the file and will be a dictionary with
92 // a valid /Linearized key. This routine looks for that and does
93 // no additional validation.
94
95 // The PDF spec says the linearization dictionary must be
96 // completely contained within the first 1024 bytes of the file.
97 // Add a byte for a null terminator.
98 static int const tbuf_size = 1025;
99
100 char* buf = new char[tbuf_size];
101 this->m->file->seek(0, SEEK_SET);
102 PointerHolder<char> b(true, buf);
103 memset(buf, '\0', tbuf_size);
104 this->m->file->read(buf, tbuf_size - 1);
105
106 int lindict_obj = -1;
107 char* p = buf;
108 while (lindict_obj == -1)
109 {
110 // Find a digit or end of buffer
111 while (((p - buf) < tbuf_size) && (! QUtil::is_digit(*p)))
112 {
113 ++p;
114 }
115 if (p - buf == tbuf_size)
116 {
117 break;
118 }
119 // Seek to the digit. Then skip over digits for a potential
120 // next iteration.
121 this->m->file->seek(p - buf, SEEK_SET);
122 while (((p - buf) < tbuf_size) && QUtil::is_digit(*p))
123 {
124 ++p;
125 }
126
127 QPDFTokenizer::Token t1 = readToken(this->m->file);
128 QPDFTokenizer::Token t2 = readToken(this->m->file);
129 QPDFTokenizer::Token t3 = readToken(this->m->file);
130 QPDFTokenizer::Token t4 = readToken(this->m->file);
131 if ((t1.getType() == QPDFTokenizer::tt_integer) &&
132 (t2.getType() == QPDFTokenizer::tt_integer) &&
133 (t3 == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "obj")) &&
134 (t4.getType() == QPDFTokenizer::tt_dict_open))
135 {
136 lindict_obj =
137 QIntC::to_int(QUtil::string_to_ll(t1.getValue().c_str()));
138 }
139 }
140
141 if (lindict_obj <= 0)
142 {
143 return false;
144 }
145
146 QPDFObjectHandle candidate = QPDFObjectHandle::Factory::newIndirect(
147 this, lindict_obj, 0);
148 if (! candidate.isDictionary())
149 {
150 return false;
151 }
152
153 QPDFObjectHandle linkey = candidate.getKey("/Linearized");
154 if (! (linkey.isNumber() &&
155 (QIntC::to_int(floor(linkey.getNumericValue())) == 1)))
156 {
157 return false;
158 }
159
160 QPDFObjectHandle L = candidate.getKey("/L");
161 if (L.isInteger())
162 {
163 qpdf_offset_t Li = L.getIntValue();
164 this->m->file->seek(0, SEEK_END);
165 if (Li != this->m->file->tell())
166 {
167 QTC::TC("qpdf", "QPDF /L mismatch");
168 return false;
169 }
170 else
171 {
172 this->m->linp.file_size = Li;
173 }
174 }
175
176 this->m->lindict = candidate;
177
178 return true;
179 }
180
181 void
readLinearizationData()182 QPDF::readLinearizationData()
183 {
184 // This function throws an exception (which is trapped by
185 // checkLinearization()) for any errors that prevent loading.
186
187 // Hint table parsing code needs at least 32 bits in a long.
188 assert(sizeof(long) >= 4);
189
190 if (! isLinearized())
191 {
192 throw std::logic_error("called readLinearizationData for file"
193 " that is not linearized");
194 }
195
196 // /L is read and stored in linp by isLinearized()
197 QPDFObjectHandle H = this->m->lindict.getKey("/H");
198 QPDFObjectHandle O = this->m->lindict.getKey("/O");
199 QPDFObjectHandle E = this->m->lindict.getKey("/E");
200 QPDFObjectHandle N = this->m->lindict.getKey("/N");
201 QPDFObjectHandle T = this->m->lindict.getKey("/T");
202 QPDFObjectHandle P = this->m->lindict.getKey("/P");
203
204 if (! (H.isArray() &&
205 O.isInteger() &&
206 E.isInteger() &&
207 N.isInteger() &&
208 T.isInteger() &&
209 (P.isInteger() || P.isNull())))
210 {
211 throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
212 "linearization dictionary",
213 this->m->file->getLastOffset(),
214 "some keys in linearization dictionary are of "
215 "the wrong type");
216 }
217
218 // Hint table array: offset length [ offset length ]
219 size_t n_H_items = toS(H.getArrayNItems());
220 if (! ((n_H_items == 2) || (n_H_items == 4)))
221 {
222 throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
223 "linearization dictionary",
224 this->m->file->getLastOffset(),
225 "H has the wrong number of items");
226 }
227
228 std::vector<int> H_items;
229 for (size_t i = 0; i < n_H_items; ++i)
230 {
231 QPDFObjectHandle oh(H.getArrayItem(toI(i)));
232 if (oh.isInteger())
233 {
234 H_items.push_back(oh.getIntValueAsInt());
235 }
236 else
237 {
238 throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
239 "linearization dictionary",
240 this->m->file->getLastOffset(),
241 "some H items are of the wrong type");
242 }
243 }
244
245 // H: hint table offset/length for primary and overflow hint tables
246 int H0_offset = H_items.at(0);
247 int H0_length = H_items.at(1);
248 int H1_offset = 0;
249 int H1_length = 0;
250 if (H_items.size() == 4)
251 {
252 // Acrobat doesn't read or write these (as PDF 1.4), so we
253 // don't have a way to generate a test case.
254 // QTC::TC("qpdf", "QPDF overflow hint table");
255 H1_offset = H_items.at(2);
256 H1_length = H_items.at(3);
257 }
258
259 // P: first page number
260 int first_page = 0;
261 if (P.isInteger())
262 {
263 QTC::TC("qpdf", "QPDF P present in lindict");
264 first_page = P.getIntValueAsInt();
265 }
266 else
267 {
268 QTC::TC("qpdf", "QPDF P absent in lindict");
269 }
270
271 // Store linearization parameter data
272
273 // Various places in the code use linp.npages, which is
274 // initialized from N, to pre-allocate memory, so make sure it's
275 // accurate and bail right now if it's not.
276 if (N.getIntValue() != static_cast<long long>(getAllPages().size()))
277 {
278 throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
279 "linearization hint table",
280 this->m->file->getLastOffset(),
281 "/N does not match number of pages");
282 }
283
284 // file_size initialized by isLinearized()
285 this->m->linp.first_page_object = O.getIntValueAsInt();
286 this->m->linp.first_page_end = E.getIntValue();
287 this->m->linp.npages = N.getIntValueAsInt();
288 this->m->linp.xref_zero_offset = T.getIntValue();
289 this->m->linp.first_page = first_page;
290 this->m->linp.H_offset = H0_offset;
291 this->m->linp.H_length = H0_length;
292
293 // Read hint streams
294
295 Pl_Buffer pb("hint buffer");
296 QPDFObjectHandle H0 = readHintStream(pb, H0_offset, toS(H0_length));
297 if (H1_offset)
298 {
299 (void) readHintStream(pb, H1_offset, toS(H1_length));
300 }
301
302 // PDF 1.4 hint tables that we ignore:
303
304 // /T thumbnail
305 // /A thread information
306 // /E named destination
307 // /V interactive form
308 // /I information dictionary
309 // /C logical structure
310 // /L page label
311
312 // Individual hint table offsets
313 QPDFObjectHandle HS = H0.getKey("/S"); // shared object
314 QPDFObjectHandle HO = H0.getKey("/O"); // outline
315
316 PointerHolder<Buffer> hbp = pb.getBuffer();
317 Buffer* hb = hbp.getPointer();
318 unsigned char const* h_buf = hb->getBuffer();
319 size_t h_size = hb->getSize();
320
321 readHPageOffset(BitStream(h_buf, h_size));
322
323 int HSi = HS.getIntValueAsInt();
324 if ((HSi < 0) || (toS(HSi) >= h_size))
325 {
326 throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
327 "linearization hint table",
328 this->m->file->getLastOffset(),
329 "/S (shared object) offset is out of bounds");
330 }
331 readHSharedObject(BitStream(h_buf + HSi, h_size - toS(HSi)));
332
333 if (HO.isInteger())
334 {
335 int HOi = HO.getIntValueAsInt();
336 if ((HOi < 0) || (toS(HOi) >= h_size))
337 {
338 throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
339 "linearization hint table",
340 this->m->file->getLastOffset(),
341 "/O (outline) offset is out of bounds");
342 }
343 readHGeneric(BitStream(h_buf + HOi, h_size - toS(HOi)),
344 this->m->outline_hints);
345 }
346 }
347
348 QPDFObjectHandle
readHintStream(Pipeline & pl,qpdf_offset_t offset,size_t length)349 QPDF::readHintStream(Pipeline& pl, qpdf_offset_t offset, size_t length)
350 {
351 int obj;
352 int gen;
353 QPDFObjectHandle H = readObjectAtOffset(
354 false, offset, "linearization hint stream", -1, 0, obj, gen);
355 ObjCache& oc = this->m->obj_cache[QPDFObjGen(obj, gen)];
356 qpdf_offset_t min_end_offset = oc.end_before_space;
357 qpdf_offset_t max_end_offset = oc.end_after_space;
358 if (! H.isStream())
359 {
360 throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
361 "linearization dictionary",
362 this->m->file->getLastOffset(),
363 "hint table is not a stream");
364 }
365
366 QPDFObjectHandle Hdict = H.getDict();
367
368 // Some versions of Acrobat make /Length indirect and place it
369 // immediately after the stream, increasing length to cover it,
370 // even though the specification says all objects in the
371 // linearization parameter dictionary must be direct. We have to
372 // get the file position of the end of length in this case.
373 QPDFObjectHandle length_obj = Hdict.getKey("/Length");
374 if (length_obj.isIndirect())
375 {
376 QTC::TC("qpdf", "QPDF hint table length indirect");
377 // Force resolution
378 (void) length_obj.getIntValue();
379 ObjCache& oc2 = this->m->obj_cache[length_obj.getObjGen()];
380 min_end_offset = oc2.end_before_space;
381 max_end_offset = oc2.end_after_space;
382 }
383 else
384 {
385 QTC::TC("qpdf", "QPDF hint table length direct");
386 }
387 qpdf_offset_t computed_end = offset + toO(length);
388 if ((computed_end < min_end_offset) ||
389 (computed_end > max_end_offset))
390 {
391 *this->m->err_stream << "expected = " << computed_end
392 << "; actual = " << min_end_offset << ".."
393 << max_end_offset << std::endl;
394 throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
395 "linearization dictionary",
396 this->m->file->getLastOffset(),
397 "hint table length mismatch");
398 }
399 H.pipeStreamData(&pl, 0, qpdf_dl_specialized);
400 return Hdict;
401 }
402
403 void
readHPageOffset(BitStream h)404 QPDF::readHPageOffset(BitStream h)
405 {
406 // All comments referring to the PDF spec refer to the spec for
407 // version 1.4.
408
409 HPageOffset& t = this->m->page_offset_hints;
410
411 t.min_nobjects = h.getBitsInt(32); // 1
412 t.first_page_offset = h.getBitsInt(32); // 2
413 t.nbits_delta_nobjects = h.getBitsInt(16); // 3
414 t.min_page_length = h.getBitsInt(32); // 4
415 t.nbits_delta_page_length = h.getBitsInt(16); // 5
416 t.min_content_offset = h.getBitsInt(32); // 6
417 t.nbits_delta_content_offset = h.getBitsInt(16); // 7
418 t.min_content_length = h.getBitsInt(32); // 8
419 t.nbits_delta_content_length = h.getBitsInt(16); // 9
420 t.nbits_nshared_objects = h.getBitsInt(16); // 10
421 t.nbits_shared_identifier = h.getBitsInt(16); // 11
422 t.nbits_shared_numerator = h.getBitsInt(16); // 12
423 t.shared_denominator = h.getBitsInt(16); // 13
424
425 std::vector<HPageOffsetEntry>& entries = t.entries;
426 entries.clear();
427 int nitems = this->m->linp.npages;
428 load_vector_int(h, nitems, entries,
429 t.nbits_delta_nobjects,
430 &HPageOffsetEntry::delta_nobjects);
431 load_vector_int(h, nitems, entries,
432 t.nbits_delta_page_length,
433 &HPageOffsetEntry::delta_page_length);
434 load_vector_int(h, nitems, entries,
435 t.nbits_nshared_objects,
436 &HPageOffsetEntry::nshared_objects);
437 load_vector_vector(h, nitems, entries,
438 &HPageOffsetEntry::nshared_objects,
439 t.nbits_shared_identifier,
440 &HPageOffsetEntry::shared_identifiers);
441 load_vector_vector(h, nitems, entries,
442 &HPageOffsetEntry::nshared_objects,
443 t.nbits_shared_numerator,
444 &HPageOffsetEntry::shared_numerators);
445 load_vector_int(h, nitems, entries,
446 t.nbits_delta_content_offset,
447 &HPageOffsetEntry::delta_content_offset);
448 load_vector_int(h, nitems, entries,
449 t.nbits_delta_content_length,
450 &HPageOffsetEntry::delta_content_length);
451 }
452
453 void
readHSharedObject(BitStream h)454 QPDF::readHSharedObject(BitStream h)
455 {
456 HSharedObject& t = this->m->shared_object_hints;
457
458 t.first_shared_obj = h.getBitsInt(32); // 1
459 t.first_shared_offset = h.getBitsInt(32); // 2
460 t.nshared_first_page = h.getBitsInt(32); // 3
461 t.nshared_total = h.getBitsInt(32); // 4
462 t.nbits_nobjects = h.getBitsInt(16); // 5
463 t.min_group_length = h.getBitsInt(32); // 6
464 t.nbits_delta_group_length = h.getBitsInt(16); // 7
465
466 QTC::TC("qpdf", "QPDF lin nshared_total > nshared_first_page",
467 (t.nshared_total > t.nshared_first_page) ? 1 : 0);
468
469 std::vector<HSharedObjectEntry>& entries = t.entries;
470 entries.clear();
471 int nitems = t.nshared_total;
472 load_vector_int(h, nitems, entries,
473 t.nbits_delta_group_length,
474 &HSharedObjectEntry::delta_group_length);
475 load_vector_int(h, nitems, entries,
476 1, &HSharedObjectEntry::signature_present);
477 for (size_t i = 0; i < toS(nitems); ++i)
478 {
479 if (entries.at(i).signature_present)
480 {
481 // Skip 128-bit MD5 hash. These are not supported by
482 // acrobat, so they should probably never be there. We
483 // have no test case for this.
484 for (int j = 0; j < 4; ++j)
485 {
486 (void) h.getBits(32);
487 }
488 }
489 }
490 load_vector_int(h, nitems, entries,
491 t.nbits_nobjects,
492 &HSharedObjectEntry::nobjects_minus_one);
493 }
494
495 void
readHGeneric(BitStream h,HGeneric & t)496 QPDF::readHGeneric(BitStream h, HGeneric& t)
497 {
498 t.first_object = h.getBitsInt(32); // 1
499 t.first_object_offset = h.getBitsInt(32); // 2
500 t.nobjects = h.getBitsInt(32); // 3
501 t.group_length = h.getBitsInt(32); // 4
502 }
503
504 bool
checkLinearizationInternal()505 QPDF::checkLinearizationInternal()
506 {
507 // All comments referring to the PDF spec refer to the spec for
508 // version 1.4.
509
510 std::list<std::string> errors;
511 std::list<std::string> warnings;
512
513 // Check all values in linearization parameter dictionary
514
515 LinParameters& p = this->m->linp;
516
517 // L: file size in bytes -- checked by isLinearized
518
519 // O: object number of first page
520 std::vector<QPDFObjectHandle> const& pages = getAllPages();
521 if (p.first_page_object != pages.at(0).getObjectID())
522 {
523 QTC::TC("qpdf", "QPDF err /O mismatch");
524 errors.push_back("first page object (/O) mismatch");
525 }
526
527 // N: number of pages
528 int npages = toI(pages.size());
529 if (p.npages != npages)
530 {
531 // Not tested in the test suite
532 errors.push_back("page count (/N) mismatch");
533 }
534
535 for (size_t i = 0; i < toS(npages); ++i)
536 {
537 QPDFObjectHandle const& page = pages.at(i);
538 QPDFObjGen og(page.getObjGen());
539 if (this->m->xref_table[og].getType() == 2)
540 {
541 errors.push_back("page dictionary for page " +
542 QUtil::uint_to_string(i) + " is compressed");
543 }
544 }
545
546 // T: offset of whitespace character preceding xref entry for object 0
547 this->m->file->seek(p.xref_zero_offset, SEEK_SET);
548 while (1)
549 {
550 char ch;
551 this->m->file->read(&ch, 1);
552 if (! ((ch == ' ') || (ch == '\r') || (ch == '\n')))
553 {
554 this->m->file->seek(-1, SEEK_CUR);
555 break;
556 }
557 }
558 if (this->m->file->tell() != this->m->first_xref_item_offset)
559 {
560 QTC::TC("qpdf", "QPDF err /T mismatch");
561 errors.push_back("space before first xref item (/T) mismatch "
562 "(computed = " +
563 QUtil::int_to_string(this->m->first_xref_item_offset) +
564 "; file = " +
565 QUtil::int_to_string(this->m->file->tell()));
566 }
567
568 // P: first page number -- Implementation note 124 says Acrobat
569 // ignores this value, so we will too.
570
571 // Check numbering of compressed objects in each xref section.
572 // For linearized files, all compressed objects are supposed to be
573 // at the end of the containing xref section if any object streams
574 // are in use.
575
576 if (this->m->uncompressed_after_compressed)
577 {
578 errors.push_back("linearized file contains an uncompressed object"
579 " after a compressed one in a cross-reference stream");
580 }
581
582 // Further checking requires optimization and order calculation.
583 // Don't allow optimization to make changes. If it has to, then
584 // the file is not properly linearized. We use the xref table to
585 // figure out which objects are compressed and which are
586 // uncompressed.
587 { // local scope
588 std::map<int, int> object_stream_data;
589 for (std::map<QPDFObjGen, QPDFXRefEntry>::const_iterator iter =
590 this->m->xref_table.begin();
591 iter != this->m->xref_table.end(); ++iter)
592 {
593 QPDFObjGen const& og = (*iter).first;
594 QPDFXRefEntry const& entry = (*iter).second;
595 if (entry.getType() == 2)
596 {
597 object_stream_data[og.getObj()] = entry.getObjStreamNumber();
598 }
599 }
600 optimize(object_stream_data, false);
601 calculateLinearizationData(object_stream_data);
602 }
603
604 // E: offset of end of first page -- Implementation note 123 says
605 // Acrobat includes on extra object here by mistake. pdlin fails
606 // to place thumbnail images in section 9, so when thumbnails are
607 // present, it also gets the wrong value for /E. It also doesn't
608 // count outlines here when it should even though it places them
609 // in part 6. This code fails to put thread information
610 // dictionaries in part 9, so it actually gets the wrong value for
611 // E when threads are present. In that case, it would probably
612 // agree with pdlin. As of this writing, the test suite doesn't
613 // contain any files with threads.
614
615 if (this->m->part6.empty())
616 {
617 stopOnError("linearization part 6 unexpectedly empty");
618 }
619 qpdf_offset_t min_E = -1;
620 qpdf_offset_t max_E = -1;
621 for (std::vector<QPDFObjectHandle>::iterator iter = this->m->part6.begin();
622 iter != this->m->part6.end(); ++iter)
623 {
624 QPDFObjGen og((*iter).getObjGen());
625 if (this->m->obj_cache.count(og) == 0)
626 {
627 // All objects have to have been dereferenced to be classified.
628 throw std::logic_error("linearization part6 object not in cache");
629 }
630 ObjCache const& oc = this->m->obj_cache[og];
631 min_E = std::max(min_E, oc.end_before_space);
632 max_E = std::max(max_E, oc.end_after_space);
633 }
634 if ((p.first_page_end < min_E) || (p.first_page_end > max_E))
635 {
636 QTC::TC("qpdf", "QPDF warn /E mismatch");
637 warnings.push_back("end of first page section (/E) mismatch: /E = " +
638 QUtil::int_to_string(p.first_page_end) +
639 "; computed = " +
640 QUtil::int_to_string(min_E) + ".." +
641 QUtil::int_to_string(max_E));
642 }
643
644 // Check hint tables
645
646 std::map<int, int> shared_idx_to_obj;
647 checkHSharedObject(errors, warnings, pages, shared_idx_to_obj);
648 checkHPageOffset(errors, warnings, pages, shared_idx_to_obj);
649 checkHOutlines(warnings);
650
651 // Report errors
652
653 bool result = true;
654
655 // Treat all linearization errors as warnings. Many of them occur
656 // in otherwise working files, so it's really misleading to treat
657 // them as errors. We'll hang onto the distinction in the code for
658 // now in case we ever have a chance to clean up the linearization
659 // code.
660 if (! errors.empty())
661 {
662 result = false;
663 for (std::list<std::string>::iterator iter = errors.begin();
664 iter != errors.end(); ++iter)
665 {
666 *this->m->err_stream << "WARNING: " << (*iter) << std::endl;
667 }
668 }
669
670 if (! warnings.empty())
671 {
672 result = false;
673 for (std::list<std::string>::iterator iter = warnings.begin();
674 iter != warnings.end(); ++iter)
675 {
676 *this->m->err_stream << "WARNING: " << (*iter) << std::endl;
677 }
678 }
679
680 return result;
681 }
682
683 qpdf_offset_t
maxEnd(ObjUser const & ou)684 QPDF::maxEnd(ObjUser const& ou)
685 {
686 if (this->m->obj_user_to_objects.count(ou) == 0)
687 {
688 stopOnError("no entry in object user table for requested object user");
689 }
690 std::set<QPDFObjGen> const& ogs = this->m->obj_user_to_objects[ou];
691 qpdf_offset_t end = 0;
692 for (std::set<QPDFObjGen>::const_iterator iter = ogs.begin();
693 iter != ogs.end(); ++iter)
694 {
695 QPDFObjGen const& og = *iter;
696 if (this->m->obj_cache.count(og) == 0)
697 {
698 stopOnError("unknown object referenced in object user table");
699 }
700 end = std::max(end, this->m->obj_cache[og].end_after_space);
701 }
702 return end;
703 }
704
705 qpdf_offset_t
getLinearizationOffset(QPDFObjGen const & og)706 QPDF::getLinearizationOffset(QPDFObjGen const& og)
707 {
708 QPDFXRefEntry entry = this->m->xref_table[og];
709 qpdf_offset_t result = 0;
710 switch (entry.getType())
711 {
712 case 1:
713 result = entry.getOffset();
714 break;
715
716 case 2:
717 // For compressed objects, return the offset of the object
718 // stream that contains them.
719 result = getLinearizationOffset(
720 QPDFObjGen(entry.getObjStreamNumber(), 0));
721 break;
722
723 default:
724 stopOnError(
725 "getLinearizationOffset called for xref entry not of type 1 or 2");
726 break;
727 }
728 return result;
729 }
730
731 QPDFObjectHandle
getUncompressedObject(QPDFObjectHandle & obj,std::map<int,int> const & object_stream_data)732 QPDF::getUncompressedObject(QPDFObjectHandle& obj,
733 std::map<int, int> const& object_stream_data)
734 {
735 if (obj.isNull() || (object_stream_data.count(obj.getObjectID()) == 0))
736 {
737 return obj;
738 }
739 else
740 {
741 int repl = (*(object_stream_data.find(obj.getObjectID()))).second;
742 return objGenToIndirect(QPDFObjGen(repl, 0));
743 }
744 }
745
746 int
lengthNextN(int first_object,int n,std::list<std::string> & errors)747 QPDF::lengthNextN(int first_object, int n,
748 std::list<std::string>& errors)
749 {
750 int length = 0;
751 for (int i = 0; i < n; ++i)
752 {
753 QPDFObjGen og(first_object + i, 0);
754 if (this->m->xref_table.count(og) == 0)
755 {
756 errors.push_back(
757 "no xref table entry for " +
758 QUtil::int_to_string(first_object + i) + " 0");
759 }
760 else
761 {
762 if (this->m->obj_cache.count(og) == 0)
763 {
764 stopOnError("found unknown object while"
765 " calculating length for linearization data");
766 }
767 length += toI(this->m->obj_cache[og].end_after_space -
768 getLinearizationOffset(og));
769 }
770 }
771 return length;
772 }
773
774 void
checkHPageOffset(std::list<std::string> & errors,std::list<std::string> & warnings,std::vector<QPDFObjectHandle> const & pages,std::map<int,int> & shared_idx_to_obj)775 QPDF::checkHPageOffset(std::list<std::string>& errors,
776 std::list<std::string>& warnings,
777 std::vector<QPDFObjectHandle> const& pages,
778 std::map<int, int>& shared_idx_to_obj)
779 {
780 // Implementation note 126 says Acrobat always sets
781 // delta_content_offset and delta_content_length in the page
782 // offset header dictionary to 0. It also states that
783 // min_content_offset in the per-page information is always 0,
784 // which is an incorrect value.
785
786 // Implementation note 127 explains that Acrobat always sets item
787 // 8 (min_content_length) to zero, item 9
788 // (nbits_delta_content_length) to the value of item 5
789 // (nbits_delta_page_length), and item 7 of each per-page hint
790 // table (delta_content_length) to item 2 (delta_page_length) of
791 // that entry. Acrobat ignores these values when reading files.
792
793 // Empirically, it also seems that Acrobat sometimes puts items
794 // under a page's /Resources dictionary in with shared objects
795 // even when they are private.
796
797 int npages = toI(pages.size());
798 qpdf_offset_t table_offset = adjusted_offset(
799 this->m->page_offset_hints.first_page_offset);
800 QPDFObjGen first_page_og(pages.at(0).getObjGen());
801 if (this->m->xref_table.count(first_page_og) == 0)
802 {
803 stopOnError("supposed first page object is not known");
804 }
805 qpdf_offset_t offset = getLinearizationOffset(first_page_og);
806 if (table_offset != offset)
807 {
808 warnings.push_back("first page object offset mismatch");
809 }
810
811 for (int pageno = 0; pageno < npages; ++pageno)
812 {
813 QPDFObjGen page_og(pages.at(toS(pageno)).getObjGen());
814 int first_object = page_og.getObj();
815 if (this->m->xref_table.count(page_og) == 0)
816 {
817 stopOnError("unknown object in page offset hint table");
818 }
819 offset = getLinearizationOffset(page_og);
820
821 HPageOffsetEntry& he =
822 this->m->page_offset_hints.entries.at(toS(pageno));
823 CHPageOffsetEntry& ce =
824 this->m->c_page_offset_data.entries.at(toS(pageno));
825 int h_nobjects = he.delta_nobjects +
826 this->m->page_offset_hints.min_nobjects;
827 if (h_nobjects != ce.nobjects)
828 {
829 // This happens with pdlin when there are thumbnails.
830 warnings.push_back(
831 "object count mismatch for page " +
832 QUtil::int_to_string(pageno) + ": hint table = " +
833 QUtil::int_to_string(h_nobjects) + "; computed = " +
834 QUtil::int_to_string(ce.nobjects));
835 }
836
837 // Use value for number of objects in hint table rather than
838 // computed value if there is a discrepancy.
839 int length = lengthNextN(first_object, h_nobjects, errors);
840 int h_length = toI(he.delta_page_length +
841 this->m->page_offset_hints.min_page_length);
842 if (length != h_length)
843 {
844 // This condition almost certainly indicates a bad hint
845 // table or a bug in this code.
846 errors.push_back(
847 "page length mismatch for page " +
848 QUtil::int_to_string(pageno) + ": hint table = " +
849 QUtil::int_to_string(h_length) + "; computed length = " +
850 QUtil::int_to_string(length) + " (offset = " +
851 QUtil::int_to_string(offset) + ")");
852 }
853
854 offset += h_length;
855
856 // Translate shared object indexes to object numbers.
857 std::set<int> hint_shared;
858 std::set<int> computed_shared;
859
860 if ((pageno == 0) && (he.nshared_objects > 0))
861 {
862 // pdlin and Acrobat both do this even though the spec
863 // states clearly and unambiguously that they should not.
864 warnings.push_back("page 0 has shared identifier entries");
865 }
866
867 for (size_t i = 0; i < toS(he.nshared_objects); ++i)
868 {
869 int idx = he.shared_identifiers.at(i);
870 if (shared_idx_to_obj.count(idx) == 0)
871 {
872 stopOnError(
873 "unable to get object for item in"
874 " shared objects hint table");
875 }
876 hint_shared.insert(shared_idx_to_obj[idx]);
877 }
878
879 for (size_t i = 0; i < toS(ce.nshared_objects); ++i)
880 {
881 int idx = ce.shared_identifiers.at(i);
882 if (idx >= this->m->c_shared_object_data.nshared_total)
883 {
884 stopOnError(
885 "index out of bounds for shared object hint table");
886 }
887 int obj = this->m->c_shared_object_data.entries.at(toS(idx)).object;
888 computed_shared.insert(obj);
889 }
890
891 for (std::set<int>::iterator iter = hint_shared.begin();
892 iter != hint_shared.end(); ++iter)
893 {
894 if (! computed_shared.count(*iter))
895 {
896 // pdlin puts thumbnails here even though it shouldn't
897 warnings.push_back(
898 "page " + QUtil::int_to_string(pageno) +
899 ": shared object " + QUtil::int_to_string(*iter) +
900 ": in hint table but not computed list");
901 }
902 }
903
904 for (std::set<int>::iterator iter = computed_shared.begin();
905 iter != computed_shared.end(); ++iter)
906 {
907 if (! hint_shared.count(*iter))
908 {
909 // Acrobat does not put some things including at least
910 // built-in fonts and procsets here, at least in some
911 // cases.
912 warnings.push_back(
913 "page " + QUtil::int_to_string(pageno) +
914 ": shared object " + QUtil::int_to_string(*iter) +
915 ": in computed list but not hint table");
916 }
917 }
918 }
919 }
920
921 void
checkHSharedObject(std::list<std::string> & errors,std::list<std::string> & warnings,std::vector<QPDFObjectHandle> const & pages,std::map<int,int> & idx_to_obj)922 QPDF::checkHSharedObject(std::list<std::string>& errors,
923 std::list<std::string>& warnings,
924 std::vector<QPDFObjectHandle> const& pages,
925 std::map<int, int>& idx_to_obj)
926 {
927 // Implementation note 125 says shared object groups always
928 // contain only one object. Implementation note 128 says that
929 // Acrobat always nbits_nobjects to zero. Implementation note 130
930 // says that Acrobat does not support more than one shared object
931 // per group. These are all consistent.
932
933 // Implementation note 129 states that MD5 signatures are not
934 // implemented in Acrobat, so signature_present must always be
935 // zero.
936
937 // Implementation note 131 states that first_shared_obj and
938 // first_shared_offset have meaningless values for single-page
939 // files.
940
941 // Empirically, Acrobat and pdlin generate incorrect values for
942 // these whenever there are no shared objects not referenced by
943 // the first page (i.e., nshared_total == nshared_first_page).
944
945 HSharedObject& so = this->m->shared_object_hints;
946 if (so.nshared_total < so.nshared_first_page)
947 {
948 errors.push_back("shared object hint table: ntotal < nfirst_page");
949 }
950 else
951 {
952 // The first nshared_first_page objects are consecutive
953 // objects starting with the first page object. The rest are
954 // consecutive starting from the first_shared_obj object.
955 int cur_object = pages.at(0).getObjectID();
956 for (int i = 0; i < so.nshared_total; ++i)
957 {
958 if (i == so.nshared_first_page)
959 {
960 QTC::TC("qpdf", "QPDF lin check shared past first page");
961 if (this->m->part8.empty())
962 {
963 errors.push_back(
964 "part 8 is empty but nshared_total > "
965 "nshared_first_page");
966 }
967 else
968 {
969 int obj = this->m->part8.at(0).getObjectID();
970 if (obj != so.first_shared_obj)
971 {
972 errors.push_back(
973 "first shared object number mismatch: "
974 "hint table = " +
975 QUtil::int_to_string(so.first_shared_obj) +
976 "; computed = " +
977 QUtil::int_to_string(obj));
978 }
979 }
980
981 cur_object = so.first_shared_obj;
982
983 QPDFObjGen og(cur_object, 0);
984 if (this->m->xref_table.count(og) == 0)
985 {
986 stopOnError("unknown object in shared object hint table");
987 }
988 qpdf_offset_t offset = getLinearizationOffset(og);
989 qpdf_offset_t h_offset =
990 adjusted_offset(so.first_shared_offset);
991 if (offset != h_offset)
992 {
993 errors.push_back(
994 "first shared object offset mismatch: hint table = " +
995 QUtil::int_to_string(h_offset) + "; computed = " +
996 QUtil::int_to_string(offset));
997 }
998 }
999
1000 idx_to_obj[i] = cur_object;
1001 HSharedObjectEntry& se = so.entries.at(toS(i));
1002 int nobjects = se.nobjects_minus_one + 1;
1003 int length = lengthNextN(cur_object, nobjects, errors);
1004 int h_length = so.min_group_length + se.delta_group_length;
1005 if (length != h_length)
1006 {
1007 errors.push_back(
1008 "shared object " + QUtil::int_to_string(i) +
1009 " length mismatch: hint table = " +
1010 QUtil::int_to_string(h_length) + "; computed = " +
1011 QUtil::int_to_string(length));
1012 }
1013 cur_object += nobjects;
1014 }
1015 }
1016 }
1017
1018 void
checkHOutlines(std::list<std::string> & warnings)1019 QPDF::checkHOutlines(std::list<std::string>& warnings)
1020 {
1021 // Empirically, Acrobat generates the correct value for the object
1022 // number but incorrectly stores the next object number's offset
1023 // as the offset, at least when outlines appear in part 6. It
1024 // also generates an incorrect value for length (specifically, the
1025 // length that would cover the correct number of objects from the
1026 // wrong starting place). pdlin appears to generate correct
1027 // values in those cases.
1028
1029 if (this->m->c_outline_data.nobjects == this->m->outline_hints.nobjects)
1030 {
1031 if (this->m->c_outline_data.nobjects == 0)
1032 {
1033 return;
1034 }
1035
1036 if (this->m->c_outline_data.first_object ==
1037 this->m->outline_hints.first_object)
1038 {
1039 // Check length and offset. Acrobat gets these wrong.
1040 QPDFObjectHandle outlines = getRoot().getKey("/Outlines");
1041 if (! outlines.isIndirect())
1042 {
1043 // This case is not exercised in test suite since not
1044 // permitted by the spec, but if this does occur, the
1045 // code below would fail.
1046 warnings.push_back(
1047 "/Outlines key of root dictionary is not indirect");
1048 return;
1049 }
1050 QPDFObjGen og(outlines.getObjGen());
1051 if (this->m->xref_table.count(og) == 0)
1052 {
1053 stopOnError("unknown object in outlines hint table");
1054 }
1055 qpdf_offset_t offset = getLinearizationOffset(og);
1056 ObjUser ou(ObjUser::ou_root_key, "/Outlines");
1057 int length = toI(maxEnd(ou) - offset);
1058 qpdf_offset_t table_offset =
1059 adjusted_offset(this->m->outline_hints.first_object_offset);
1060 if (offset != table_offset)
1061 {
1062 warnings.push_back(
1063 "incorrect offset in outlines table: hint table = " +
1064 QUtil::int_to_string(table_offset) +
1065 "; computed = " + QUtil::int_to_string(offset));
1066 }
1067 int table_length = this->m->outline_hints.group_length;
1068 if (length != table_length)
1069 {
1070 warnings.push_back(
1071 "incorrect length in outlines table: hint table = " +
1072 QUtil::int_to_string(table_length) +
1073 "; computed = " + QUtil::int_to_string(length));
1074 }
1075 }
1076 else
1077 {
1078 warnings.push_back("incorrect first object number in outline "
1079 "hints table.");
1080 }
1081 }
1082 else
1083 {
1084 warnings.push_back("incorrect object count in outline hint table");
1085 }
1086 }
1087
1088 void
showLinearizationData()1089 QPDF::showLinearizationData()
1090 {
1091 try
1092 {
1093 readLinearizationData();
1094 checkLinearizationInternal();
1095 dumpLinearizationDataInternal();
1096 }
1097 catch (QPDFExc& e)
1098 {
1099 *this->m->err_stream << e.what() << std::endl;
1100 }
1101 }
1102
1103 void
dumpLinearizationDataInternal()1104 QPDF::dumpLinearizationDataInternal()
1105 {
1106 *this->m->out_stream
1107 << this->m->file->getName() << ": linearization data:" << std::endl
1108 << std::endl;
1109
1110 *this->m->out_stream
1111 << "file_size: " << this->m->linp.file_size << std::endl
1112 << "first_page_object: " << this->m->linp.first_page_object << std::endl
1113 << "first_page_end: " << this->m->linp.first_page_end << std::endl
1114 << "npages: " << this->m->linp.npages << std::endl
1115 << "xref_zero_offset: " << this->m->linp.xref_zero_offset << std::endl
1116 << "first_page: " << this->m->linp.first_page << std::endl
1117 << "H_offset: " << this->m->linp.H_offset << std::endl
1118 << "H_length: " << this->m->linp.H_length << std::endl
1119 << std::endl;
1120
1121 *this->m->out_stream << "Page Offsets Hint Table" << std::endl
1122 << std::endl;
1123 dumpHPageOffset();
1124 *this->m->out_stream << std::endl
1125 << "Shared Objects Hint Table" << std::endl
1126 << std::endl;
1127 dumpHSharedObject();
1128
1129 if (this->m->outline_hints.nobjects > 0)
1130 {
1131 *this->m->out_stream << std::endl
1132 << "Outlines Hint Table" << std::endl
1133 << std::endl;
1134 dumpHGeneric(this->m->outline_hints);
1135 }
1136 }
1137
1138 qpdf_offset_t
adjusted_offset(qpdf_offset_t offset)1139 QPDF::adjusted_offset(qpdf_offset_t offset)
1140 {
1141 // All offsets >= H_offset have to be increased by H_length
1142 // since all hint table location values disregard the hint table
1143 // itself.
1144 if (offset >= this->m->linp.H_offset)
1145 {
1146 return offset + this->m->linp.H_length;
1147 }
1148 return offset;
1149 }
1150
1151
1152 void
dumpHPageOffset()1153 QPDF::dumpHPageOffset()
1154 {
1155 HPageOffset& t = this->m->page_offset_hints;
1156 *this->m->out_stream
1157 << "min_nobjects: " << t.min_nobjects
1158 << std::endl
1159 << "first_page_offset: " << adjusted_offset(t.first_page_offset)
1160 << std::endl
1161 << "nbits_delta_nobjects: " << t.nbits_delta_nobjects
1162 << std::endl
1163 << "min_page_length: " << t.min_page_length
1164 << std::endl
1165 << "nbits_delta_page_length: " << t.nbits_delta_page_length
1166 << std::endl
1167 << "min_content_offset: " << t.min_content_offset
1168 << std::endl
1169 << "nbits_delta_content_offset: " << t.nbits_delta_content_offset
1170 << std::endl
1171 << "min_content_length: " << t.min_content_length
1172 << std::endl
1173 << "nbits_delta_content_length: " << t.nbits_delta_content_length
1174 << std::endl
1175 << "nbits_nshared_objects: " << t.nbits_nshared_objects
1176 << std::endl
1177 << "nbits_shared_identifier: " << t.nbits_shared_identifier
1178 << std::endl
1179 << "nbits_shared_numerator: " << t.nbits_shared_numerator
1180 << std::endl
1181 << "shared_denominator: " << t.shared_denominator
1182 << std::endl;
1183
1184 for (size_t i1 = 0; i1 < toS(this->m->linp.npages); ++i1)
1185 {
1186 HPageOffsetEntry& pe = t.entries.at(i1);
1187 *this->m->out_stream
1188 << "Page " << i1 << ":" << std::endl
1189 << " nobjects: " << pe.delta_nobjects + t.min_nobjects
1190 << std::endl
1191 << " length: " << pe.delta_page_length + t.min_page_length
1192 << std::endl
1193 // content offset is relative to page, not file
1194 << " content_offset: "
1195 << pe.delta_content_offset + t.min_content_offset << std::endl
1196 << " content_length: "
1197 << pe.delta_content_length + t.min_content_length << std::endl
1198 << " nshared_objects: " << pe.nshared_objects << std::endl;
1199 for (size_t i2 = 0; i2 < toS(pe.nshared_objects); ++i2)
1200 {
1201 *this->m->out_stream << " identifier " << i2 << ": "
1202 << pe.shared_identifiers.at(i2) << std::endl;
1203 *this->m->out_stream << " numerator " << i2 << ": "
1204 << pe.shared_numerators.at(i2) << std::endl;
1205 }
1206 }
1207 }
1208
1209 void
dumpHSharedObject()1210 QPDF::dumpHSharedObject()
1211 {
1212 HSharedObject& t = this->m->shared_object_hints;
1213 *this->m->out_stream
1214 << "first_shared_obj: " << t.first_shared_obj
1215 << std::endl
1216 << "first_shared_offset: " << adjusted_offset(t.first_shared_offset)
1217 << std::endl
1218 << "nshared_first_page: " << t.nshared_first_page
1219 << std::endl
1220 << "nshared_total: " << t.nshared_total
1221 << std::endl
1222 << "nbits_nobjects: " << t.nbits_nobjects
1223 << std::endl
1224 << "min_group_length: " << t.min_group_length
1225 << std::endl
1226 << "nbits_delta_group_length: " << t.nbits_delta_group_length
1227 << std::endl;
1228
1229 for (size_t i = 0; i < toS(t.nshared_total); ++i)
1230 {
1231 HSharedObjectEntry& se = t.entries.at(i);
1232 *this->m->out_stream
1233 << "Shared Object " << i << ":" << std::endl
1234 << " group length: "
1235 << se.delta_group_length + t.min_group_length << std::endl;
1236 // PDF spec says signature present nobjects_minus_one are
1237 // always 0, so print them only if they have a non-zero value.
1238 if (se.signature_present)
1239 {
1240 *this->m->out_stream << " signature present" << std::endl;
1241 }
1242 if (se.nobjects_minus_one != 0)
1243 {
1244 *this->m->out_stream << " nobjects: "
1245 << se.nobjects_minus_one + 1 << std::endl;
1246 }
1247 }
1248 }
1249
1250 void
dumpHGeneric(HGeneric & t)1251 QPDF::dumpHGeneric(HGeneric& t)
1252 {
1253 *this->m->out_stream
1254 << "first_object: " << t.first_object
1255 << std::endl
1256 << "first_object_offset: " << adjusted_offset(t.first_object_offset)
1257 << std::endl
1258 << "nobjects: " << t.nobjects
1259 << std::endl
1260 << "group_length: " << t.group_length
1261 << std::endl;
1262 }
1263
1264 QPDFObjectHandle
objGenToIndirect(QPDFObjGen const & og)1265 QPDF::objGenToIndirect(QPDFObjGen const& og)
1266 {
1267 return getObjectByID(og.getObj(), og.getGen());
1268 }
1269
1270 void
calculateLinearizationData(std::map<int,int> const & object_stream_data)1271 QPDF::calculateLinearizationData(std::map<int, int> const& object_stream_data)
1272 {
1273 // This function calculates the ordering of objects, divides them
1274 // into the appropriate parts, and computes some values for the
1275 // linearization parameter dictionary and hint tables. The file
1276 // must be optimized (via calling optimize()) prior to calling
1277 // this function. Note that actual offsets and lengths are not
1278 // computed here, but anything related to object ordering is.
1279
1280 if (this->m->object_to_obj_users.empty())
1281 {
1282 // Note that we can't call optimize here because we don't know
1283 // whether it should be called with or without allow changes.
1284 throw std::logic_error(
1285 "INTERNAL ERROR: QPDF::calculateLinearizationData "
1286 "called before optimize()");
1287 }
1288
1289 // Separate objects into the categories sufficient for us to
1290 // determine which part of the linearized file should contain the
1291 // object. This categorization is useful for other purposes as
1292 // well. Part numbers refer to version 1.4 of the PDF spec.
1293
1294 // Parts 1, 3, 5, 10, and 11 don't contain any objects from the
1295 // original file (except the trailer dictionary in part 11).
1296
1297 // Part 4 is the document catalog (root) and the following root
1298 // keys: /ViewerPreferences, /PageMode, /Threads, /OpenAction,
1299 // /AcroForm, /Encrypt. Note that Thread information dictionaries
1300 // are supposed to appear in part 9, but we are disregarding that
1301 // recommendation for now.
1302
1303 // Part 6 is the first page section. It includes all remaining
1304 // objects referenced by the first page including shared objects
1305 // but not including thumbnails. Additionally, if /PageMode is
1306 // /Outlines, then information from /Outlines also appears here.
1307
1308 // Part 7 contains remaining objects private to pages other than
1309 // the first page.
1310
1311 // Part 8 contains all remaining shared objects except those that
1312 // are shared only within thumbnails.
1313
1314 // Part 9 contains all remaining objects.
1315
1316 // We sort objects into the following categories:
1317
1318 // * open_document: part 4
1319
1320 // * first_page_private: part 6
1321
1322 // * first_page_shared: part 6
1323
1324 // * other_page_private: part 7
1325
1326 // * other_page_shared: part 8
1327
1328 // * thumbnail_private: part 9
1329
1330 // * thumbnail_shared: part 9
1331
1332 // * other: part 9
1333
1334 // * outlines: part 6 or 9
1335
1336 this->m->part4.clear();
1337 this->m->part6.clear();
1338 this->m->part7.clear();
1339 this->m->part8.clear();
1340 this->m->part9.clear();
1341 this->m->c_linp = LinParameters();
1342 this->m->c_page_offset_data = CHPageOffset();
1343 this->m->c_shared_object_data = CHSharedObject();
1344 this->m->c_outline_data = HGeneric();
1345
1346 QPDFObjectHandle root = getRoot();
1347 bool outlines_in_first_page = false;
1348 QPDFObjectHandle pagemode = root.getKey("/PageMode");
1349 QTC::TC("qpdf", "QPDF categorize pagemode present",
1350 pagemode.isName() ? 1 : 0);
1351 if (pagemode.isName())
1352 {
1353 if (pagemode.getName() == "/UseOutlines")
1354 {
1355 if (root.hasKey("/Outlines"))
1356 {
1357 outlines_in_first_page = true;
1358 }
1359 else
1360 {
1361 QTC::TC("qpdf", "QPDF UseOutlines but no Outlines");
1362 }
1363 }
1364 QTC::TC("qpdf", "QPDF categorize pagemode outlines",
1365 outlines_in_first_page ? 1 : 0);
1366 }
1367
1368 std::set<std::string> open_document_keys;
1369 open_document_keys.insert("/ViewerPreferences");
1370 open_document_keys.insert("/PageMode");
1371 open_document_keys.insert("/Threads");
1372 open_document_keys.insert("/OpenAction");
1373 open_document_keys.insert("/AcroForm");
1374
1375 std::set<QPDFObjGen> lc_open_document;
1376 std::set<QPDFObjGen> lc_first_page_private;
1377 std::set<QPDFObjGen> lc_first_page_shared;
1378 std::set<QPDFObjGen> lc_other_page_private;
1379 std::set<QPDFObjGen> lc_other_page_shared;
1380 std::set<QPDFObjGen> lc_thumbnail_private;
1381 std::set<QPDFObjGen> lc_thumbnail_shared;
1382 std::set<QPDFObjGen> lc_other;
1383 std::set<QPDFObjGen> lc_outlines;
1384 std::set<QPDFObjGen> lc_root;
1385
1386 for (std::map<QPDFObjGen, std::set<ObjUser> >::iterator oiter =
1387 this->m->object_to_obj_users.begin();
1388 oiter != this->m->object_to_obj_users.end(); ++oiter)
1389 {
1390 QPDFObjGen const& og = (*oiter).first;
1391
1392 std::set<ObjUser>& ous = (*oiter).second;
1393
1394 bool in_open_document = false;
1395 bool in_first_page = false;
1396 int other_pages = 0;
1397 int thumbs = 0;
1398 int others = 0;
1399 bool in_outlines = false;
1400 bool is_root = false;
1401
1402 for (std::set<ObjUser>::iterator uiter = ous.begin();
1403 uiter != ous.end(); ++uiter)
1404 {
1405 ObjUser const& ou = *uiter;
1406 switch (ou.ou_type)
1407 {
1408 case ObjUser::ou_trailer_key:
1409 if (ou.key == "/Encrypt")
1410 {
1411 in_open_document = true;
1412 }
1413 else
1414 {
1415 ++others;
1416 }
1417 break;
1418
1419 case ObjUser::ou_thumb:
1420 ++thumbs;
1421 break;
1422
1423 case ObjUser::ou_root_key:
1424 if (open_document_keys.count(ou.key) > 0)
1425 {
1426 in_open_document = true;
1427 }
1428 else if (ou.key == "/Outlines")
1429 {
1430 in_outlines = true;
1431 }
1432 else
1433 {
1434 ++others;
1435 }
1436 break;
1437
1438 case ObjUser::ou_page:
1439 if (ou.pageno == 0)
1440 {
1441 in_first_page = true;
1442 }
1443 else
1444 {
1445 ++other_pages;
1446 }
1447 break;
1448
1449 case ObjUser::ou_root:
1450 is_root = true;
1451 break;
1452
1453 case ObjUser::ou_bad:
1454 stopOnError(
1455 "INTERNAL ERROR: QPDF::calculateLinearizationData: "
1456 "invalid user type");
1457 break;
1458 }
1459 }
1460
1461 if (is_root)
1462 {
1463 lc_root.insert(og);
1464 }
1465 else if (in_outlines)
1466 {
1467 lc_outlines.insert(og);
1468 }
1469 else if (in_open_document)
1470 {
1471 lc_open_document.insert(og);
1472 }
1473 else if ((in_first_page) &&
1474 (others == 0) && (other_pages == 0) && (thumbs == 0))
1475 {
1476 lc_first_page_private.insert(og);
1477 }
1478 else if (in_first_page)
1479 {
1480 lc_first_page_shared.insert(og);
1481 }
1482 else if ((other_pages == 1) && (others == 0) && (thumbs == 0))
1483 {
1484 lc_other_page_private.insert(og);
1485 }
1486 else if (other_pages > 1)
1487 {
1488 lc_other_page_shared.insert(og);
1489 }
1490 else if ((thumbs == 1) && (others == 0))
1491 {
1492 lc_thumbnail_private.insert(og);
1493 }
1494 else if (thumbs > 1)
1495 {
1496 lc_thumbnail_shared.insert(og);
1497 }
1498 else
1499 {
1500 lc_other.insert(og);
1501 }
1502 }
1503
1504 // Generate ordering for objects in the output file. Sometimes we
1505 // just dump right from a set into a vector. Rather than
1506 // optimizing this by going straight into the vector, we'll leave
1507 // these phases separate for now. That way, this section can be
1508 // concerned only with ordering, and the above section can be
1509 // considered only with categorization. Note that sets of
1510 // QPDFObjGens are sorted by QPDFObjGen. In a linearized file,
1511 // objects appear in sequence with the possible exception of hints
1512 // tables which we won't see here anyway. That means that running
1513 // calculateLinearizationData() on a linearized file should give
1514 // results identical to the original file ordering.
1515
1516 // We seem to traverse the page tree a lot in this code, but we
1517 // can address this for a future code optimization if necessary.
1518 // Premature optimization is the root of all evil.
1519 std::vector<QPDFObjectHandle> pages;
1520 { // local scope
1521 // Map all page objects to the containing object stream. This
1522 // should be a no-op in a properly linearized file.
1523 std::vector<QPDFObjectHandle> t = getAllPages();
1524 for (std::vector<QPDFObjectHandle>::iterator iter = t.begin();
1525 iter != t.end(); ++iter)
1526 {
1527 pages.push_back(getUncompressedObject(*iter, object_stream_data));
1528 }
1529 }
1530 int npages = toI(pages.size());
1531
1532 // We will be initializing some values of the computed hint
1533 // tables. Specifically, we can initialize any items that deal
1534 // with object numbers or counts but not any items that deal with
1535 // lengths or offsets. The code that writes linearized files will
1536 // have to fill in these values during the first pass. The
1537 // validation code can compute them relatively easily given the
1538 // rest of the information.
1539
1540 // npages is the size of the existing pages vector, which has been
1541 // created by traversing the pages tree, and as such is a
1542 // reasonable size.
1543 this->m->c_linp.npages = npages;
1544 this->m->c_page_offset_data.entries =
1545 std::vector<CHPageOffsetEntry>(toS(npages));
1546
1547 // Part 4: open document objects. We don't care about the order.
1548
1549 if (lc_root.size() != 1)
1550 {
1551 stopOnError("found other than one root while"
1552 " calculating linearization data");
1553 }
1554 this->m->part4.push_back(objGenToIndirect(*(lc_root.begin())));
1555 for (std::set<QPDFObjGen>::iterator iter = lc_open_document.begin();
1556 iter != lc_open_document.end(); ++iter)
1557 {
1558 this->m->part4.push_back(objGenToIndirect(*iter));
1559 }
1560
1561 // Part 6: first page objects. Note: implementation note 124
1562 // states that Acrobat always treats page 0 as the first page for
1563 // linearization regardless of /OpenAction. pdlin doesn't provide
1564 // any option to set this and also disregards /OpenAction. We
1565 // will do the same.
1566
1567 // First, place the actual first page object itself.
1568 if (pages.empty())
1569 {
1570 stopOnError("no pages found while calculating linearization data");
1571 }
1572 QPDFObjGen first_page_og(pages.at(0).getObjGen());
1573 if (! lc_first_page_private.count(first_page_og))
1574 {
1575 stopOnError(
1576 "INTERNAL ERROR: QPDF::calculateLinearizationData: first page "
1577 "object not in lc_first_page_private");
1578 }
1579 lc_first_page_private.erase(first_page_og);
1580 this->m->c_linp.first_page_object = pages.at(0).getObjectID();
1581 this->m->part6.push_back(pages.at(0));
1582
1583 // The PDF spec "recommends" an order for the rest of the objects,
1584 // but we are going to disregard it except to the extent that it
1585 // groups private and shared objects contiguously for the sake of
1586 // hint tables.
1587
1588 for (std::set<QPDFObjGen>::iterator iter = lc_first_page_private.begin();
1589 iter != lc_first_page_private.end(); ++iter)
1590 {
1591 this->m->part6.push_back(objGenToIndirect(*iter));
1592 }
1593
1594 for (std::set<QPDFObjGen>::iterator iter = lc_first_page_shared.begin();
1595 iter != lc_first_page_shared.end(); ++iter)
1596 {
1597 this->m->part6.push_back(objGenToIndirect(*iter));
1598 }
1599
1600 // Place the outline dictionary if it goes in the first page section.
1601 if (outlines_in_first_page)
1602 {
1603 pushOutlinesToPart(this->m->part6, lc_outlines, object_stream_data);
1604 }
1605
1606 // Fill in page offset hint table information for the first page.
1607 // The PDF spec says that nshared_objects should be zero for the
1608 // first page. pdlin does not appear to obey this, but it fills
1609 // in garbage values for all the shared object identifiers on the
1610 // first page.
1611
1612 this->m->c_page_offset_data.entries.at(0).nobjects =
1613 toI(this->m->part6.size());
1614
1615 // Part 7: other pages' private objects
1616
1617 // For each page in order:
1618 for (size_t i = 1; i < toS(npages); ++i)
1619 {
1620 // Place this page's page object
1621
1622 QPDFObjGen page_og(pages.at(i).getObjGen());
1623 if (! lc_other_page_private.count(page_og))
1624 {
1625 stopOnError(
1626 "INTERNAL ERROR: "
1627 "QPDF::calculateLinearizationData: page object for page " +
1628 QUtil::uint_to_string(i) + " not in lc_other_page_private");
1629 }
1630 lc_other_page_private.erase(page_og);
1631 this->m->part7.push_back(pages.at(i));
1632
1633 // Place all non-shared objects referenced by this page,
1634 // updating the page object count for the hint table.
1635
1636 this->m->c_page_offset_data.entries.at(i).nobjects = 1;
1637
1638 ObjUser ou(ObjUser::ou_page, toI(i));
1639 if (this->m->obj_user_to_objects.count(ou) == 0)
1640 {
1641 stopOnError("found unreferenced page while"
1642 " calculating linearization data");
1643 }
1644 std::set<QPDFObjGen> ogs = this->m->obj_user_to_objects[ou];
1645 for (std::set<QPDFObjGen>::iterator iter = ogs.begin();
1646 iter != ogs.end(); ++iter)
1647 {
1648 QPDFObjGen const& og = (*iter);
1649 if (lc_other_page_private.count(og))
1650 {
1651 lc_other_page_private.erase(og);
1652 this->m->part7.push_back(objGenToIndirect(og));
1653 ++this->m->c_page_offset_data.entries.at(i).nobjects;
1654 }
1655 }
1656 }
1657 // That should have covered all part7 objects.
1658 if (! lc_other_page_private.empty())
1659 {
1660 stopOnError(
1661 "INTERNAL ERROR:"
1662 " QPDF::calculateLinearizationData: lc_other_page_private is "
1663 "not empty after generation of part7");
1664 }
1665
1666 // Part 8: other pages' shared objects
1667
1668 // Order is unimportant.
1669 for (std::set<QPDFObjGen>::iterator iter = lc_other_page_shared.begin();
1670 iter != lc_other_page_shared.end(); ++iter)
1671 {
1672 this->m->part8.push_back(objGenToIndirect(*iter));
1673 }
1674
1675 // Part 9: other objects
1676
1677 // The PDF specification makes recommendations on ordering here.
1678 // We follow them only to a limited extent. Specifically, we put
1679 // the pages tree first, then private thumbnail objects in page
1680 // order, then shared thumbnail objects, and then outlines (unless
1681 // in part 6). After that, we throw all remaining objects in
1682 // arbitrary order.
1683
1684 // Place the pages tree.
1685 std::set<QPDFObjGen> pages_ogs =
1686 this->m->obj_user_to_objects[ObjUser(ObjUser::ou_root_key, "/Pages")];
1687 if (pages_ogs.empty())
1688 {
1689 stopOnError("found empty pages tree while"
1690 " calculating linearization data");
1691 }
1692 for (std::set<QPDFObjGen>::iterator iter = pages_ogs.begin();
1693 iter != pages_ogs.end(); ++iter)
1694 {
1695 QPDFObjGen const& og = *iter;
1696 if (lc_other.count(og))
1697 {
1698 lc_other.erase(og);
1699 this->m->part9.push_back(objGenToIndirect(og));
1700 }
1701 }
1702
1703 // Place private thumbnail images in page order. Slightly more
1704 // information would be required if we were going to bother with
1705 // thumbnail hint tables.
1706 for (size_t i = 0; i < toS(npages); ++i)
1707 {
1708 QPDFObjectHandle thumb = pages.at(i).getKey("/Thumb");
1709 thumb = getUncompressedObject(thumb, object_stream_data);
1710 if (! thumb.isNull())
1711 {
1712 // Output the thumbnail itself
1713 QPDFObjGen thumb_og(thumb.getObjGen());
1714 if (lc_thumbnail_private.count(thumb_og))
1715 {
1716 lc_thumbnail_private.erase(thumb_og);
1717 this->m->part9.push_back(thumb);
1718 }
1719 else
1720 {
1721 // No internal error this time...there's nothing to
1722 // stop this object from having been referred to
1723 // somewhere else outside of a page's /Thumb, and if
1724 // it had been, there's nothing to prevent it from
1725 // having been in some set other than
1726 // lc_thumbnail_private.
1727 }
1728 std::set<QPDFObjGen>& ogs =
1729 this->m->obj_user_to_objects[
1730 ObjUser(ObjUser::ou_thumb, toI(i))];
1731 for (std::set<QPDFObjGen>::iterator iter = ogs.begin();
1732 iter != ogs.end(); ++iter)
1733 {
1734 QPDFObjGen const& og = *iter;
1735 if (lc_thumbnail_private.count(og))
1736 {
1737 lc_thumbnail_private.erase(og);
1738 this->m->part9.push_back(objGenToIndirect(og));
1739 }
1740 }
1741 }
1742 }
1743 if (! lc_thumbnail_private.empty())
1744 {
1745 stopOnError(
1746 "INTERNAL ERROR: "
1747 "QPDF::calculateLinearizationData: lc_thumbnail_private "
1748 "not empty after placing thumbnails");
1749 }
1750
1751 // Place shared thumbnail objects
1752 for (std::set<QPDFObjGen>::iterator iter = lc_thumbnail_shared.begin();
1753 iter != lc_thumbnail_shared.end(); ++iter)
1754 {
1755 this->m->part9.push_back(objGenToIndirect(*iter));
1756 }
1757
1758 // Place outlines unless in first page
1759 if (! outlines_in_first_page)
1760 {
1761 pushOutlinesToPart(this->m->part9, lc_outlines, object_stream_data);
1762 }
1763
1764 // Place all remaining objects
1765 for (std::set<QPDFObjGen>::iterator iter = lc_other.begin();
1766 iter != lc_other.end(); ++iter)
1767 {
1768 this->m->part9.push_back(objGenToIndirect(*iter));
1769 }
1770
1771 // Make sure we got everything exactly once.
1772
1773 size_t num_placed =
1774 this->m->part4.size() + this->m->part6.size() + this->m->part7.size() +
1775 this->m->part8.size() + this->m->part9.size();
1776 size_t num_wanted = this->m->object_to_obj_users.size();
1777 if (num_placed != num_wanted)
1778 {
1779 stopOnError(
1780 "INTERNAL ERROR: QPDF::calculateLinearizationData: wrong "
1781 "number of objects placed (num_placed = " +
1782 QUtil::uint_to_string(num_placed) +
1783 "; number of objects: " +
1784 QUtil::uint_to_string(num_wanted));
1785 }
1786
1787 // Calculate shared object hint table information including
1788 // references to shared objects from page offset hint data.
1789
1790 // The shared object hint table consists of all part 6 (whether
1791 // shared or not) in order followed by all part 8 objects in
1792 // order. Add the objects to shared object data keeping a map of
1793 // object number to index. Then populate the shared object
1794 // information for the pages.
1795
1796 // Note that two objects never have the same object number, so we
1797 // can map from object number only without regards to generation.
1798 std::map<int, int> obj_to_index;
1799
1800 this->m->c_shared_object_data.nshared_first_page =
1801 toI(this->m->part6.size());
1802 this->m->c_shared_object_data.nshared_total =
1803 this->m->c_shared_object_data.nshared_first_page +
1804 toI(this->m->part8.size());
1805
1806 std::vector<CHSharedObjectEntry>& shared =
1807 this->m->c_shared_object_data.entries;
1808 for (std::vector<QPDFObjectHandle>::iterator iter = this->m->part6.begin();
1809 iter != this->m->part6.end(); ++iter)
1810 {
1811 QPDFObjectHandle& oh = *iter;
1812 int obj = oh.getObjectID();
1813 obj_to_index[obj] = toI(shared.size());
1814 shared.push_back(CHSharedObjectEntry(obj));
1815 }
1816 QTC::TC("qpdf", "QPDF lin part 8 empty", this->m->part8.empty() ? 1 : 0);
1817 if (! this->m->part8.empty())
1818 {
1819 this->m->c_shared_object_data.first_shared_obj =
1820 this->m->part8.at(0).getObjectID();
1821 for (std::vector<QPDFObjectHandle>::iterator iter =
1822 this->m->part8.begin();
1823 iter != this->m->part8.end(); ++iter)
1824 {
1825 QPDFObjectHandle& oh = *iter;
1826 int obj = oh.getObjectID();
1827 obj_to_index[obj] = toI(shared.size());
1828 shared.push_back(CHSharedObjectEntry(obj));
1829 }
1830 }
1831 if (static_cast<size_t>(this->m->c_shared_object_data.nshared_total) !=
1832 this->m->c_shared_object_data.entries.size())
1833 {
1834 stopOnError(
1835 "shared object hint table has wrong number of entries");
1836 }
1837
1838 // Now compute the list of shared objects for each page after the
1839 // first page.
1840
1841 for (size_t i = 1; i < toS(npages); ++i)
1842 {
1843 CHPageOffsetEntry& pe = this->m->c_page_offset_data.entries.at(i);
1844 ObjUser ou(ObjUser::ou_page, toI(i));
1845 if (this->m->obj_user_to_objects.count(ou) == 0)
1846 {
1847 stopOnError("found unreferenced page while"
1848 " calculating linearization data");
1849 }
1850 std::set<QPDFObjGen> const& ogs = this->m->obj_user_to_objects[ou];
1851 for (std::set<QPDFObjGen>::const_iterator iter = ogs.begin();
1852 iter != ogs.end(); ++iter)
1853 {
1854 QPDFObjGen const& og = *iter;
1855 if ((this->m->object_to_obj_users[og].size() > 1) &&
1856 (obj_to_index.count(og.getObj()) > 0))
1857 {
1858 int idx = obj_to_index[og.getObj()];
1859 ++pe.nshared_objects;
1860 pe.shared_identifiers.push_back(idx);
1861 }
1862 }
1863 }
1864 }
1865
1866 void
pushOutlinesToPart(std::vector<QPDFObjectHandle> & part,std::set<QPDFObjGen> & lc_outlines,std::map<int,int> const & object_stream_data)1867 QPDF::pushOutlinesToPart(
1868 std::vector<QPDFObjectHandle>& part,
1869 std::set<QPDFObjGen>& lc_outlines,
1870 std::map<int, int> const& object_stream_data)
1871 {
1872 QPDFObjectHandle root = getRoot();
1873 QPDFObjectHandle outlines = root.getKey("/Outlines");
1874 if (outlines.isNull())
1875 {
1876 return;
1877 }
1878 outlines = getUncompressedObject(outlines, object_stream_data);
1879 QPDFObjGen outlines_og(outlines.getObjGen());
1880 QTC::TC("qpdf", "QPDF lin outlines in part",
1881 ((&part == (&this->m->part6)) ? 0
1882 : (&part == (&this->m->part9)) ? 1
1883 : 9999)); // can't happen
1884 this->m->c_outline_data.first_object = outlines_og.getObj();
1885 this->m->c_outline_data.nobjects = 1;
1886 lc_outlines.erase(outlines_og);
1887 part.push_back(outlines);
1888 for (std::set<QPDFObjGen>::iterator iter = lc_outlines.begin();
1889 iter != lc_outlines.end(); ++iter)
1890 {
1891 part.push_back(objGenToIndirect(*iter));
1892 ++this->m->c_outline_data.nobjects;
1893 }
1894 }
1895
1896 void
getLinearizedParts(std::map<int,int> const & object_stream_data,std::vector<QPDFObjectHandle> & part4,std::vector<QPDFObjectHandle> & part6,std::vector<QPDFObjectHandle> & part7,std::vector<QPDFObjectHandle> & part8,std::vector<QPDFObjectHandle> & part9)1897 QPDF::getLinearizedParts(
1898 std::map<int, int> const& object_stream_data,
1899 std::vector<QPDFObjectHandle>& part4,
1900 std::vector<QPDFObjectHandle>& part6,
1901 std::vector<QPDFObjectHandle>& part7,
1902 std::vector<QPDFObjectHandle>& part8,
1903 std::vector<QPDFObjectHandle>& part9)
1904 {
1905 calculateLinearizationData(object_stream_data);
1906 part4 = this->m->part4;
1907 part6 = this->m->part6;
1908 part7 = this->m->part7;
1909 part8 = this->m->part8;
1910 part9 = this->m->part9;
1911 }
1912
// Return the number of bits needed to hold val, i.e. the position of
// its highest set bit (0 for val == 0, 1 for 1, 2 for 2..3, ...).
//
// The count is taken over val's unsigned bit pattern so that the loop
// always terminates.  A recursive formulation based on a signed right
// shift never reaches 0 for negative input where the shift is
// arithmetic.  For negative val the result is the full width of
// unsigned int.
static inline int nbits(int val)
{
    unsigned int remaining = static_cast<unsigned int>(val);
    int count = 0;
    while (remaining != 0)
    {
        ++count;
        remaining >>= 1;
    }
    return count;
}
1917
1918 int
outputLengthNextN(int in_object,int n,std::map<int,qpdf_offset_t> const & lengths,std::map<int,int> const & obj_renumber)1919 QPDF::outputLengthNextN(
1920 int in_object, int n,
1921 std::map<int, qpdf_offset_t> const& lengths,
1922 std::map<int, int> const& obj_renumber)
1923 {
1924 // Figure out the length of a series of n consecutive objects in
1925 // the output file starting with whatever object in_object from
1926 // the input file mapped to.
1927
1928 if (obj_renumber.count(in_object) == 0)
1929 {
1930 stopOnError("found object that is not renumbered while"
1931 " writing linearization data");
1932 }
1933 int first = (*(obj_renumber.find(in_object))).second;
1934 int length = 0;
1935 for (int i = 0; i < n; ++i)
1936 {
1937 if (lengths.count(first + i) == 0)
1938 {
1939 stopOnError("found item with unknown length"
1940 " while writing linearization data");
1941 }
1942 length += toI((*(lengths.find(first + toI(i)))).second);
1943 }
1944 return length;
1945 }
1946
1947 void
calculateHPageOffset(std::map<int,QPDFXRefEntry> const & xref,std::map<int,qpdf_offset_t> const & lengths,std::map<int,int> const & obj_renumber)1948 QPDF::calculateHPageOffset(
1949 std::map<int, QPDFXRefEntry> const& xref,
1950 std::map<int, qpdf_offset_t> const& lengths,
1951 std::map<int, int> const& obj_renumber)
1952 {
1953 // Page Offset Hint Table
1954
1955 // We are purposely leaving some values set to their initial zero
1956 // values.
1957
1958 std::vector<QPDFObjectHandle> const& pages = getAllPages();
1959 size_t npages = pages.size();
1960 CHPageOffset& cph = this->m->c_page_offset_data;
1961 std::vector<CHPageOffsetEntry>& cphe = cph.entries;
1962
1963 // Calculate minimum and maximum values for number of objects per
1964 // page and page length.
1965
1966 int min_nobjects = cphe.at(0).nobjects;
1967 int max_nobjects = min_nobjects;
1968 int min_length = outputLengthNextN(
1969 pages.at(0).getObjectID(), min_nobjects, lengths, obj_renumber);
1970 int max_length = min_length;
1971 int max_shared = cphe.at(0).nshared_objects;
1972
1973 HPageOffset& ph = this->m->page_offset_hints;
1974 std::vector<HPageOffsetEntry>& phe = ph.entries;
1975 // npages is the size of the existing pages array.
1976 phe = std::vector<HPageOffsetEntry>(npages);
1977
1978 for (unsigned int i = 0; i < npages; ++i)
1979 {
1980 // Calculate values for each page, assigning full values to
1981 // the delta items. They will be adjusted later.
1982
1983 // Repeat calculations for page 0 so we can assign to phe[i]
1984 // without duplicating those assignments.
1985
1986 int nobjects = cphe.at(i).nobjects;
1987 int length = outputLengthNextN(
1988 pages.at(i).getObjectID(), nobjects, lengths, obj_renumber);
1989 int nshared = cphe.at(i).nshared_objects;
1990
1991 min_nobjects = std::min(min_nobjects, nobjects);
1992 max_nobjects = std::max(max_nobjects, nobjects);
1993 min_length = std::min(min_length, length);
1994 max_length = std::max(max_length, length);
1995 max_shared = std::max(max_shared, nshared);
1996
1997 phe.at(i).delta_nobjects = nobjects;
1998 phe.at(i).delta_page_length = length;
1999 phe.at(i).nshared_objects = nshared;
2000 }
2001
2002 ph.min_nobjects = min_nobjects;
2003 int in_page0_id = pages.at(0).getObjectID();
2004 int out_page0_id = (*(obj_renumber.find(in_page0_id))).second;
2005 ph.first_page_offset = (*(xref.find(out_page0_id))).second.getOffset();
2006 ph.nbits_delta_nobjects = nbits(max_nobjects - min_nobjects);
2007 ph.min_page_length = min_length;
2008 ph.nbits_delta_page_length = nbits(max_length - min_length);
2009 ph.nbits_nshared_objects = nbits(max_shared);
2010 ph.nbits_shared_identifier =
2011 nbits(this->m->c_shared_object_data.nshared_total);
2012 ph.shared_denominator = 4; // doesn't matter
2013
2014 // It isn't clear how to compute content offset and content
2015 // length. Since we are not interleaving page objects with the
2016 // content stream, we'll use the same values for content length as
2017 // page length. We will use 0 as content offset because this is
2018 // what Adobe does (implementation note 127) and pdlin as well.
2019 ph.nbits_delta_content_length = ph.nbits_delta_page_length;
2020 ph.min_content_length = ph.min_page_length;
2021
2022 for (size_t i = 0; i < npages; ++i)
2023 {
2024 // Adjust delta entries
2025 if ((phe.at(i).delta_nobjects < min_nobjects) ||
2026 (phe.at(i).delta_page_length < min_length))
2027 {
2028 stopOnError("found too small delta nobjects or delta page length"
2029 " while writing linearization data");
2030 }
2031 phe.at(i).delta_nobjects -= min_nobjects;
2032 phe.at(i).delta_page_length -= min_length;
2033 phe.at(i).delta_content_length = phe.at(i).delta_page_length;
2034
2035 for (size_t j = 0; j < toS(cphe.at(i).nshared_objects); ++j)
2036 {
2037 phe.at(i).shared_identifiers.push_back(
2038 cphe.at(i).shared_identifiers.at(j));
2039 phe.at(i).shared_numerators.push_back(0);
2040 }
2041 }
2042 }
2043
2044 void
calculateHSharedObject(std::map<int,QPDFXRefEntry> const & xref,std::map<int,qpdf_offset_t> const & lengths,std::map<int,int> const & obj_renumber)2045 QPDF::calculateHSharedObject(
2046 std::map<int, QPDFXRefEntry> const& xref,
2047 std::map<int, qpdf_offset_t> const& lengths,
2048 std::map<int, int> const& obj_renumber)
2049 {
2050 CHSharedObject& cso = this->m->c_shared_object_data;
2051 std::vector<CHSharedObjectEntry>& csoe = cso.entries;
2052 HSharedObject& so = this->m->shared_object_hints;
2053 std::vector<HSharedObjectEntry>& soe = so.entries;
2054 soe.clear();
2055
2056 int min_length = outputLengthNextN(
2057 csoe.at(0).object, 1, lengths, obj_renumber);
2058 int max_length = min_length;
2059
2060 for (size_t i = 0; i < toS(cso.nshared_total); ++i)
2061 {
2062 // Assign absolute numbers to deltas; adjust later
2063 int length = outputLengthNextN(
2064 csoe.at(i).object, 1, lengths, obj_renumber);
2065 min_length = std::min(min_length, length);
2066 max_length = std::max(max_length, length);
2067 soe.push_back(HSharedObjectEntry());
2068 soe.at(i).delta_group_length = length;
2069 }
2070 if (soe.size() != QIntC::to_size(cso.nshared_total))
2071 {
2072 stopOnError("soe has wrong size after initialization");
2073 }
2074
2075 so.nshared_total = cso.nshared_total;
2076 so.nshared_first_page = cso.nshared_first_page;
2077 if (so.nshared_total > so.nshared_first_page)
2078 {
2079 so.first_shared_obj =
2080 (*(obj_renumber.find(cso.first_shared_obj))).second;
2081 so.first_shared_offset =
2082 (*(xref.find(so.first_shared_obj))).second.getOffset();
2083 }
2084 so.min_group_length = min_length;
2085 so.nbits_delta_group_length = nbits(max_length - min_length);
2086
2087 for (size_t i = 0; i < toS(cso.nshared_total); ++i)
2088 {
2089 // Adjust deltas
2090 if (soe.at(i).delta_group_length < min_length)
2091 {
2092 stopOnError("found too small group length while"
2093 " writing linearization data");
2094 }
2095 soe.at(i).delta_group_length -= min_length;
2096 }
2097 }
2098
2099 void
calculateHOutline(std::map<int,QPDFXRefEntry> const & xref,std::map<int,qpdf_offset_t> const & lengths,std::map<int,int> const & obj_renumber)2100 QPDF::calculateHOutline(
2101 std::map<int, QPDFXRefEntry> const& xref,
2102 std::map<int, qpdf_offset_t> const& lengths,
2103 std::map<int, int> const& obj_renumber)
2104 {
2105 HGeneric& cho = this->m->c_outline_data;
2106
2107 if (cho.nobjects == 0)
2108 {
2109 return;
2110 }
2111
2112 HGeneric& ho = this->m->outline_hints;
2113
2114 ho.first_object =
2115 (*(obj_renumber.find(cho.first_object))).second;
2116 ho.first_object_offset =
2117 (*(xref.find(ho.first_object))).second.getOffset();
2118 ho.nobjects = cho.nobjects;
2119 ho.group_length = outputLengthNextN(
2120 cho.first_object, ho.nobjects, lengths, obj_renumber);
2121 }
2122
2123 template <class T, class int_type>
2124 static void
write_vector_int(BitWriter & w,int nitems,std::vector<T> & vec,int bits,int_type T::* field)2125 write_vector_int(BitWriter& w, int nitems, std::vector<T>& vec,
2126 int bits, int_type T::*field)
2127 {
2128 // nitems times, write bits bits from the given field of the ith
2129 // vector to the given bit writer.
2130
2131 for (size_t i = 0; i < QIntC::to_size(nitems); ++i)
2132 {
2133 w.writeBits(QIntC::to_ulonglong(vec.at(i).*field),
2134 QIntC::to_size(bits));
2135 }
2136 // The PDF spec says that each hint table starts at a byte
2137 // boundary. Each "row" actually must start on a byte boundary.
2138 w.flush();
2139 }
2140
2141 template <class T>
2142 static void
write_vector_vector(BitWriter & w,int nitems1,std::vector<T> & vec1,int T::* nitems2,int bits,std::vector<int> T::* vec2)2143 write_vector_vector(BitWriter& w,
2144 int nitems1, std::vector<T>& vec1, int T::*nitems2,
2145 int bits, std::vector<int> T::*vec2)
2146 {
2147 // nitems1 times, write nitems2 (from the ith element of vec1) items
2148 // from the vec2 vector field of the ith item of vec1.
2149 for (size_t i1 = 0; i1 < QIntC::to_size(nitems1); ++i1)
2150 {
2151 for (size_t i2 = 0; i2 < QIntC::to_size(vec1.at(i1).*nitems2); ++i2)
2152 {
2153 w.writeBits(QIntC::to_ulonglong((vec1.at(i1).*vec2).at(i2)),
2154 QIntC::to_size(bits));
2155 }
2156 }
2157 w.flush();
2158 }
2159
2160
2161 void
writeHPageOffset(BitWriter & w)2162 QPDF::writeHPageOffset(BitWriter& w)
2163 {
2164 HPageOffset& t = this->m->page_offset_hints;
2165
2166 w.writeBitsInt(t.min_nobjects, 32); // 1
2167 w.writeBitsInt(toI(t.first_page_offset), 32); // 2
2168 w.writeBitsInt(t.nbits_delta_nobjects, 16); // 3
2169 w.writeBitsInt(t.min_page_length, 32); // 4
2170 w.writeBitsInt(t.nbits_delta_page_length, 16); // 5
2171 w.writeBitsInt(t.min_content_offset, 32); // 6
2172 w.writeBitsInt(t.nbits_delta_content_offset, 16); // 7
2173 w.writeBitsInt(t.min_content_length, 32); // 8
2174 w.writeBitsInt(t.nbits_delta_content_length, 16); // 9
2175 w.writeBitsInt(t.nbits_nshared_objects, 16); // 10
2176 w.writeBitsInt(t.nbits_shared_identifier, 16); // 11
2177 w.writeBitsInt(t.nbits_shared_numerator, 16); // 12
2178 w.writeBitsInt(t.shared_denominator, 16); // 13
2179
2180 int nitems = toI(getAllPages().size());
2181 std::vector<HPageOffsetEntry>& entries = t.entries;
2182
2183 write_vector_int(w, nitems, entries,
2184 t.nbits_delta_nobjects,
2185 &HPageOffsetEntry::delta_nobjects);
2186 write_vector_int(w, nitems, entries,
2187 t.nbits_delta_page_length,
2188 &HPageOffsetEntry::delta_page_length);
2189 write_vector_int(w, nitems, entries,
2190 t.nbits_nshared_objects,
2191 &HPageOffsetEntry::nshared_objects);
2192 write_vector_vector(w, nitems, entries,
2193 &HPageOffsetEntry::nshared_objects,
2194 t.nbits_shared_identifier,
2195 &HPageOffsetEntry::shared_identifiers);
2196 write_vector_vector(w, nitems, entries,
2197 &HPageOffsetEntry::nshared_objects,
2198 t.nbits_shared_numerator,
2199 &HPageOffsetEntry::shared_numerators);
2200 write_vector_int(w, nitems, entries,
2201 t.nbits_delta_content_offset,
2202 &HPageOffsetEntry::delta_content_offset);
2203 write_vector_int(w, nitems, entries,
2204 t.nbits_delta_content_length,
2205 &HPageOffsetEntry::delta_content_length);
2206 }
2207
2208 void
writeHSharedObject(BitWriter & w)2209 QPDF::writeHSharedObject(BitWriter& w)
2210 {
2211 HSharedObject& t = this->m->shared_object_hints;
2212
2213 w.writeBitsInt(t.first_shared_obj, 32); // 1
2214 w.writeBitsInt(toI(t.first_shared_offset), 32); // 2
2215 w.writeBitsInt(t.nshared_first_page, 32); // 3
2216 w.writeBitsInt(t.nshared_total, 32); // 4
2217 w.writeBitsInt(t.nbits_nobjects, 16); // 5
2218 w.writeBitsInt(t.min_group_length, 32); // 6
2219 w.writeBitsInt(t.nbits_delta_group_length, 16); // 7
2220
2221 QTC::TC("qpdf", "QPDF lin write nshared_total > nshared_first_page",
2222 (t.nshared_total > t.nshared_first_page) ? 1 : 0);
2223
2224 int nitems = t.nshared_total;
2225 std::vector<HSharedObjectEntry>& entries = t.entries;
2226
2227 write_vector_int(w, nitems, entries,
2228 t.nbits_delta_group_length,
2229 &HSharedObjectEntry::delta_group_length);
2230 write_vector_int(w, nitems, entries,
2231 1, &HSharedObjectEntry::signature_present);
2232 for (size_t i = 0; i < toS(nitems); ++i)
2233 {
2234 // If signature were present, we'd have to write a 128-bit hash.
2235 if (entries.at(i).signature_present != 0)
2236 {
2237 stopOnError("found unexpected signature present"
2238 " while writing linearization data");
2239 }
2240 }
2241 write_vector_int(w, nitems, entries,
2242 t.nbits_nobjects,
2243 &HSharedObjectEntry::nobjects_minus_one);
2244 }
2245
2246 void
writeHGeneric(BitWriter & w,HGeneric & t)2247 QPDF::writeHGeneric(BitWriter& w, HGeneric& t)
2248 {
2249 w.writeBitsInt(t.first_object, 32); // 1
2250 w.writeBitsInt(toI(t.first_object_offset), 32); // 2
2251 w.writeBitsInt(t.nobjects, 32); // 3
2252 w.writeBitsInt(t.group_length, 32); // 4
2253 }
2254
2255 void
generateHintStream(std::map<int,QPDFXRefEntry> const & xref,std::map<int,qpdf_offset_t> const & lengths,std::map<int,int> const & obj_renumber,PointerHolder<Buffer> & hint_buffer,int & S,int & O)2256 QPDF::generateHintStream(std::map<int, QPDFXRefEntry> const& xref,
2257 std::map<int, qpdf_offset_t> const& lengths,
2258 std::map<int, int> const& obj_renumber,
2259 PointerHolder<Buffer>& hint_buffer,
2260 int& S, int& O)
2261 {
2262 // Populate actual hint table values
2263 calculateHPageOffset(xref, lengths, obj_renumber);
2264 calculateHSharedObject(xref, lengths, obj_renumber);
2265 calculateHOutline(xref, lengths, obj_renumber);
2266
2267 // Write the hint stream itself into a compressed memory buffer.
2268 // Write through a counter so we can get offsets.
2269 Pl_Buffer hint_stream("hint stream");
2270 Pl_Flate f("compress hint stream", &hint_stream, Pl_Flate::a_deflate);
2271 Pl_Count c("count", &f);
2272 BitWriter w(&c);
2273
2274 writeHPageOffset(w);
2275 S = toI(c.getCount());
2276 writeHSharedObject(w);
2277 O = 0;
2278 if (this->m->outline_hints.nobjects > 0)
2279 {
2280 O = toI(c.getCount());
2281 writeHGeneric(w, this->m->outline_hints);
2282 }
2283 c.finish();
2284
2285 hint_buffer = hint_stream.getBuffer();
2286 }
2287