1 // See doc/linearization.
2 
3 #include <qpdf/QPDF.hh>
4 
5 #include <qpdf/QPDFExc.hh>
6 #include <qpdf/QTC.hh>
7 #include <qpdf/QUtil.hh>
8 #include <qpdf/Pl_Buffer.hh>
9 #include <qpdf/Pl_Flate.hh>
10 #include <qpdf/Pl_Count.hh>
11 #include <qpdf/BitWriter.hh>
12 #include <qpdf/BitStream.hh>
13 
14 #include <iostream>
15 #include <algorithm>
16 #include <assert.h>
17 #include <math.h>
18 #include <string.h>
19 
20 template <class T, class int_type>
21 static void
load_vector_int(BitStream & bit_stream,int nitems,std::vector<T> & vec,int bits_wanted,int_type T::* field)22 load_vector_int(BitStream& bit_stream, int nitems, std::vector<T>& vec,
23 		int bits_wanted, int_type T::*field)
24 {
25     bool append = vec.empty();
26     // nitems times, read bits_wanted from the given bit stream,
27     // storing results in the ith vector entry.
28 
29     for (size_t i = 0; i < QIntC::to_size(nitems); ++i)
30     {
31         if (append)
32         {
33             vec.push_back(T());
34         }
35 	vec.at(i).*field = bit_stream.getBitsInt(QIntC::to_size(bits_wanted));
36     }
37     if (QIntC::to_int(vec.size()) != nitems)
38     {
39         throw std::logic_error("vector has wrong size in load_vector_int");
40     }
41     // The PDF spec says that each hint table starts at a byte
42     // boundary.  Each "row" actually must start on a byte boundary.
43     bit_stream.skipToNextByte();
44 }
45 
46 template <class T>
47 static void
load_vector_vector(BitStream & bit_stream,int nitems1,std::vector<T> & vec1,int T::* nitems2,int bits_wanted,std::vector<int> T::* vec2)48 load_vector_vector(BitStream& bit_stream,
49 		   int nitems1, std::vector<T>& vec1, int T::*nitems2,
50 		   int bits_wanted, std::vector<int> T::*vec2)
51 {
52     // nitems1 times, read nitems2 (from the ith element of vec1) items
53     // into the vec2 vector field of the ith item of vec1.
54     for (size_t i1 = 0; i1 < QIntC::to_size(nitems1); ++i1)
55     {
56 	for (int i2 = 0; i2 < vec1.at(i1).*nitems2; ++i2)
57 	{
58 	    (vec1.at(i1).*vec2).push_back(
59                 bit_stream.getBitsInt(QIntC::to_size(bits_wanted)));
60 	}
61     }
62     bit_stream.skipToNextByte();
63 }
64 
65 bool
checkLinearization()66 QPDF::checkLinearization()
67 {
68     bool result = false;
69     try
70     {
71 	readLinearizationData();
72 	result = checkLinearizationInternal();
73     }
74     catch (std::runtime_error& e)
75     {
76 	*this->m->err_stream
77             << "WARNING: error encountered while checking linearization data: "
78             << e.what() << std::endl;
79     }
80     return result;
81 }
82 
83 bool
isLinearized()84 QPDF::isLinearized()
85 {
86     // If the first object in the file is a dictionary with a suitable
87     // /Linearized key and has an /L key that accurately indicates the
88     // file size, initialize this->m->lindict and return true.
89 
90     // A linearized PDF spec's first object will be contained within
91     // the first 1024 bytes of the file and will be a dictionary with
92     // a valid /Linearized key.  This routine looks for that and does
93     // no additional validation.
94 
95     // The PDF spec says the linearization dictionary must be
96     // completely contained within the first 1024 bytes of the file.
97     // Add a byte for a null terminator.
98     static int const tbuf_size = 1025;
99 
100     char* buf = new char[tbuf_size];
101     this->m->file->seek(0, SEEK_SET);
102     PointerHolder<char> b(true, buf);
103     memset(buf, '\0', tbuf_size);
104     this->m->file->read(buf, tbuf_size - 1);
105 
106     int lindict_obj = -1;
107     char* p = buf;
108     while (lindict_obj == -1)
109     {
110         // Find a digit or end of buffer
111         while (((p - buf) < tbuf_size) && (! QUtil::is_digit(*p)))
112         {
113             ++p;
114         }
115         if (p - buf == tbuf_size)
116         {
117             break;
118         }
119         // Seek to the digit. Then skip over digits for a potential
120         // next iteration.
121         this->m->file->seek(p - buf, SEEK_SET);
122         while (((p - buf) < tbuf_size) && QUtil::is_digit(*p))
123         {
124             ++p;
125         }
126 
127         QPDFTokenizer::Token t1 = readToken(this->m->file);
128         QPDFTokenizer::Token t2 = readToken(this->m->file);
129         QPDFTokenizer::Token t3 = readToken(this->m->file);
130         QPDFTokenizer::Token t4 = readToken(this->m->file);
131         if ((t1.getType() == QPDFTokenizer::tt_integer) &&
132             (t2.getType() == QPDFTokenizer::tt_integer) &&
133             (t3 == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "obj")) &&
134             (t4.getType() == QPDFTokenizer::tt_dict_open))
135         {
136 	    lindict_obj =
137                 QIntC::to_int(QUtil::string_to_ll(t1.getValue().c_str()));
138 	}
139     }
140 
141     if (lindict_obj <= 0)
142     {
143 	return false;
144     }
145 
146     QPDFObjectHandle candidate = QPDFObjectHandle::Factory::newIndirect(
147 	this, lindict_obj, 0);
148     if (! candidate.isDictionary())
149     {
150 	return false;
151     }
152 
153     QPDFObjectHandle linkey = candidate.getKey("/Linearized");
154     if (! (linkey.isNumber() &&
155            (QIntC::to_int(floor(linkey.getNumericValue())) == 1)))
156     {
157 	return false;
158     }
159 
160     QPDFObjectHandle L = candidate.getKey("/L");
161     if (L.isInteger())
162     {
163 	qpdf_offset_t Li = L.getIntValue();
164 	this->m->file->seek(0, SEEK_END);
165 	if (Li != this->m->file->tell())
166 	{
167 	    QTC::TC("qpdf", "QPDF /L mismatch");
168 	    return false;
169 	}
170 	else
171 	{
172 	    this->m->linp.file_size = Li;
173 	}
174     }
175 
176     this->m->lindict = candidate;
177 
178     return true;
179 }
180 
181 void
readLinearizationData()182 QPDF::readLinearizationData()
183 {
184     // This function throws an exception (which is trapped by
185     // checkLinearization()) for any errors that prevent loading.
186 
187     // Hint table parsing code needs at least 32 bits in a long.
188     assert(sizeof(long) >= 4);
189 
190     if (! isLinearized())
191     {
192 	throw std::logic_error("called readLinearizationData for file"
193 			       " that is not linearized");
194     }
195 
196     // /L is read and stored in linp by isLinearized()
197     QPDFObjectHandle H = this->m->lindict.getKey("/H");
198     QPDFObjectHandle O = this->m->lindict.getKey("/O");
199     QPDFObjectHandle E = this->m->lindict.getKey("/E");
200     QPDFObjectHandle N = this->m->lindict.getKey("/N");
201     QPDFObjectHandle T = this->m->lindict.getKey("/T");
202     QPDFObjectHandle P = this->m->lindict.getKey("/P");
203 
204     if (! (H.isArray() &&
205 	   O.isInteger() &&
206 	   E.isInteger() &&
207 	   N.isInteger() &&
208 	   T.isInteger() &&
209 	   (P.isInteger() || P.isNull())))
210     {
211 	throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
212 		      "linearization dictionary",
213 		      this->m->file->getLastOffset(),
214 		      "some keys in linearization dictionary are of "
215 		      "the wrong type");
216     }
217 
218     // Hint table array: offset length [ offset length ]
219     size_t n_H_items = toS(H.getArrayNItems());
220     if (! ((n_H_items == 2) || (n_H_items == 4)))
221     {
222 	throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
223 		      "linearization dictionary",
224 		      this->m->file->getLastOffset(),
225 		      "H has the wrong number of items");
226     }
227 
228     std::vector<int> H_items;
229     for (size_t i = 0; i < n_H_items; ++i)
230     {
231 	QPDFObjectHandle oh(H.getArrayItem(toI(i)));
232 	if (oh.isInteger())
233 	{
234 	    H_items.push_back(oh.getIntValueAsInt());
235 	}
236 	else
237 	{
238 	    throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
239 			  "linearization dictionary",
240 			  this->m->file->getLastOffset(),
241 			  "some H items are of the wrong type");
242 	}
243     }
244 
245     // H: hint table offset/length for primary and overflow hint tables
246     int H0_offset = H_items.at(0);
247     int H0_length = H_items.at(1);
248     int H1_offset = 0;
249     int H1_length = 0;
250     if (H_items.size() == 4)
251     {
252 	// Acrobat doesn't read or write these (as PDF 1.4), so we
253 	// don't have a way to generate a test case.
254 	// QTC::TC("qpdf", "QPDF overflow hint table");
255 	H1_offset = H_items.at(2);
256 	H1_length = H_items.at(3);
257     }
258 
259     // P: first page number
260     int first_page = 0;
261     if (P.isInteger())
262     {
263 	QTC::TC("qpdf", "QPDF P present in lindict");
264 	first_page = P.getIntValueAsInt();
265     }
266     else
267     {
268 	QTC::TC("qpdf", "QPDF P absent in lindict");
269     }
270 
271     // Store linearization parameter data
272 
273     // Various places in the code use linp.npages, which is
274     // initialized from N, to pre-allocate memory, so make sure it's
275     // accurate and bail right now if it's not.
276     if (N.getIntValue() != static_cast<long long>(getAllPages().size()))
277     {
278         throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
279                       "linearization hint table",
280                       this->m->file->getLastOffset(),
281                       "/N does not match number of pages");
282     }
283 
284     // file_size initialized by isLinearized()
285     this->m->linp.first_page_object = O.getIntValueAsInt();
286     this->m->linp.first_page_end = E.getIntValue();
287     this->m->linp.npages = N.getIntValueAsInt();
288     this->m->linp.xref_zero_offset = T.getIntValue();
289     this->m->linp.first_page = first_page;
290     this->m->linp.H_offset = H0_offset;
291     this->m->linp.H_length = H0_length;
292 
293     // Read hint streams
294 
295     Pl_Buffer pb("hint buffer");
296     QPDFObjectHandle H0 = readHintStream(pb, H0_offset, toS(H0_length));
297     if (H1_offset)
298     {
299 	(void) readHintStream(pb, H1_offset, toS(H1_length));
300     }
301 
302     // PDF 1.4 hint tables that we ignore:
303 
304     //  /T    thumbnail
305     //  /A    thread information
306     //  /E    named destination
307     //  /V    interactive form
308     //  /I    information dictionary
309     //  /C    logical structure
310     //  /L    page label
311 
312     // Individual hint table offsets
313     QPDFObjectHandle HS = H0.getKey("/S"); // shared object
314     QPDFObjectHandle HO = H0.getKey("/O"); // outline
315 
316     PointerHolder<Buffer> hbp = pb.getBuffer();
317     Buffer* hb = hbp.getPointer();
318     unsigned char const* h_buf = hb->getBuffer();
319     size_t h_size = hb->getSize();
320 
321     readHPageOffset(BitStream(h_buf, h_size));
322 
323     int HSi = HS.getIntValueAsInt();
324     if ((HSi < 0) || (toS(HSi) >= h_size))
325     {
326         throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
327                       "linearization hint table",
328                       this->m->file->getLastOffset(),
329                       "/S (shared object) offset is out of bounds");
330     }
331     readHSharedObject(BitStream(h_buf + HSi, h_size - toS(HSi)));
332 
333     if (HO.isInteger())
334     {
335 	int HOi = HO.getIntValueAsInt();
336         if ((HOi < 0) || (toS(HOi) >= h_size))
337         {
338             throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
339                           "linearization hint table",
340                           this->m->file->getLastOffset(),
341                           "/O (outline) offset is out of bounds");
342         }
343 	readHGeneric(BitStream(h_buf + HOi, h_size - toS(HOi)),
344 		     this->m->outline_hints);
345     }
346 }
347 
348 QPDFObjectHandle
readHintStream(Pipeline & pl,qpdf_offset_t offset,size_t length)349 QPDF::readHintStream(Pipeline& pl, qpdf_offset_t offset, size_t length)
350 {
351     int obj;
352     int gen;
353     QPDFObjectHandle H = readObjectAtOffset(
354 	false, offset, "linearization hint stream", -1, 0, obj, gen);
355     ObjCache& oc = this->m->obj_cache[QPDFObjGen(obj, gen)];
356     qpdf_offset_t min_end_offset = oc.end_before_space;
357     qpdf_offset_t max_end_offset = oc.end_after_space;
358     if (! H.isStream())
359     {
360 	throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
361 		      "linearization dictionary",
362 		      this->m->file->getLastOffset(),
363 		      "hint table is not a stream");
364     }
365 
366     QPDFObjectHandle Hdict = H.getDict();
367 
368     // Some versions of Acrobat make /Length indirect and place it
369     // immediately after the stream, increasing length to cover it,
370     // even though the specification says all objects in the
371     // linearization parameter dictionary must be direct.  We have to
372     // get the file position of the end of length in this case.
373     QPDFObjectHandle length_obj = Hdict.getKey("/Length");
374     if (length_obj.isIndirect())
375     {
376 	QTC::TC("qpdf", "QPDF hint table length indirect");
377 	// Force resolution
378 	(void) length_obj.getIntValue();
379 	ObjCache& oc2 = this->m->obj_cache[length_obj.getObjGen()];
380 	min_end_offset = oc2.end_before_space;
381 	max_end_offset = oc2.end_after_space;
382     }
383     else
384     {
385 	QTC::TC("qpdf", "QPDF hint table length direct");
386     }
387     qpdf_offset_t computed_end = offset + toO(length);
388     if ((computed_end < min_end_offset) ||
389 	(computed_end > max_end_offset))
390     {
391 	*this->m->err_stream << "expected = " << computed_end
392                              << "; actual = " << min_end_offset << ".."
393                              << max_end_offset << std::endl;
394 	throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
395 		      "linearization dictionary",
396 		      this->m->file->getLastOffset(),
397 		      "hint table length mismatch");
398     }
399     H.pipeStreamData(&pl, 0, qpdf_dl_specialized);
400     return Hdict;
401 }
402 
403 void
readHPageOffset(BitStream h)404 QPDF::readHPageOffset(BitStream h)
405 {
406     // All comments referring to the PDF spec refer to the spec for
407     // version 1.4.
408 
409     HPageOffset& t = this->m->page_offset_hints;
410 
411     t.min_nobjects = h.getBitsInt(32);                      // 1
412     t.first_page_offset = h.getBitsInt(32);                 // 2
413     t.nbits_delta_nobjects = h.getBitsInt(16);              // 3
414     t.min_page_length = h.getBitsInt(32);                   // 4
415     t.nbits_delta_page_length = h.getBitsInt(16);           // 5
416     t.min_content_offset = h.getBitsInt(32);                // 6
417     t.nbits_delta_content_offset = h.getBitsInt(16);        // 7
418     t.min_content_length = h.getBitsInt(32);                // 8
419     t.nbits_delta_content_length = h.getBitsInt(16);        // 9
420     t.nbits_nshared_objects = h.getBitsInt(16);             // 10
421     t.nbits_shared_identifier = h.getBitsInt(16);           // 11
422     t.nbits_shared_numerator = h.getBitsInt(16);            // 12
423     t.shared_denominator = h.getBitsInt(16);                // 13
424 
425     std::vector<HPageOffsetEntry>& entries = t.entries;
426     entries.clear();
427     int nitems = this->m->linp.npages;
428     load_vector_int(h, nitems, entries,
429 		    t.nbits_delta_nobjects,
430 		    &HPageOffsetEntry::delta_nobjects);
431     load_vector_int(h, nitems, entries,
432 		    t.nbits_delta_page_length,
433 		    &HPageOffsetEntry::delta_page_length);
434     load_vector_int(h, nitems, entries,
435 		    t.nbits_nshared_objects,
436 		    &HPageOffsetEntry::nshared_objects);
437     load_vector_vector(h, nitems, entries,
438 		       &HPageOffsetEntry::nshared_objects,
439 		       t.nbits_shared_identifier,
440 		       &HPageOffsetEntry::shared_identifiers);
441     load_vector_vector(h, nitems, entries,
442 		       &HPageOffsetEntry::nshared_objects,
443 		       t.nbits_shared_numerator,
444 		       &HPageOffsetEntry::shared_numerators);
445     load_vector_int(h, nitems, entries,
446 		    t.nbits_delta_content_offset,
447 		    &HPageOffsetEntry::delta_content_offset);
448     load_vector_int(h, nitems, entries,
449 		    t.nbits_delta_content_length,
450 		    &HPageOffsetEntry::delta_content_length);
451 }
452 
453 void
readHSharedObject(BitStream h)454 QPDF::readHSharedObject(BitStream h)
455 {
456     HSharedObject& t = this->m->shared_object_hints;
457 
458     t.first_shared_obj = h.getBitsInt(32);                  // 1
459     t.first_shared_offset = h.getBitsInt(32);               // 2
460     t.nshared_first_page = h.getBitsInt(32);                // 3
461     t.nshared_total = h.getBitsInt(32);                     // 4
462     t.nbits_nobjects = h.getBitsInt(16);                    // 5
463     t.min_group_length = h.getBitsInt(32);                  // 6
464     t.nbits_delta_group_length = h.getBitsInt(16);          // 7
465 
466     QTC::TC("qpdf", "QPDF lin nshared_total > nshared_first_page",
467 	    (t.nshared_total > t.nshared_first_page) ? 1 : 0);
468 
469     std::vector<HSharedObjectEntry>& entries = t.entries;
470     entries.clear();
471     int nitems = t.nshared_total;
472     load_vector_int(h, nitems, entries,
473 		    t.nbits_delta_group_length,
474 		    &HSharedObjectEntry::delta_group_length);
475     load_vector_int(h, nitems, entries,
476 		    1, &HSharedObjectEntry::signature_present);
477     for (size_t i = 0; i < toS(nitems); ++i)
478     {
479 	if (entries.at(i).signature_present)
480 	{
481 	    // Skip 128-bit MD5 hash.  These are not supported by
482 	    // acrobat, so they should probably never be there.  We
483 	    // have no test case for this.
484 	    for (int j = 0; j < 4; ++j)
485 	    {
486 		(void) h.getBits(32);
487 	    }
488 	}
489     }
490     load_vector_int(h, nitems, entries,
491 		    t.nbits_nobjects,
492 		    &HSharedObjectEntry::nobjects_minus_one);
493 }
494 
495 void
readHGeneric(BitStream h,HGeneric & t)496 QPDF::readHGeneric(BitStream h, HGeneric& t)
497 {
498     t.first_object = h.getBitsInt(32);                      // 1
499     t.first_object_offset = h.getBitsInt(32);               // 2
500     t.nobjects = h.getBitsInt(32);                          // 3
501     t.group_length = h.getBitsInt(32);                      // 4
502 }
503 
504 bool
checkLinearizationInternal()505 QPDF::checkLinearizationInternal()
506 {
507     // All comments referring to the PDF spec refer to the spec for
508     // version 1.4.
509 
510     std::list<std::string> errors;
511     std::list<std::string> warnings;
512 
513     // Check all values in linearization parameter dictionary
514 
515     LinParameters& p = this->m->linp;
516 
517     // L: file size in bytes -- checked by isLinearized
518 
519     // O: object number of first page
520     std::vector<QPDFObjectHandle> const& pages = getAllPages();
521     if (p.first_page_object != pages.at(0).getObjectID())
522     {
523 	QTC::TC("qpdf", "QPDF err /O mismatch");
524 	errors.push_back("first page object (/O) mismatch");
525     }
526 
527     // N: number of pages
528     int npages = toI(pages.size());
529     if (p.npages != npages)
530     {
531 	// Not tested in the test suite
532 	errors.push_back("page count (/N) mismatch");
533     }
534 
535     for (size_t i = 0; i < toS(npages); ++i)
536     {
537 	QPDFObjectHandle const& page = pages.at(i);
538 	QPDFObjGen og(page.getObjGen());
539 	if (this->m->xref_table[og].getType() == 2)
540 	{
541 	    errors.push_back("page dictionary for page " +
542 			     QUtil::uint_to_string(i) + " is compressed");
543 	}
544     }
545 
546     // T: offset of whitespace character preceding xref entry for object 0
547     this->m->file->seek(p.xref_zero_offset, SEEK_SET);
548     while (1)
549     {
550 	char ch;
551 	this->m->file->read(&ch, 1);
552 	if (! ((ch == ' ') || (ch == '\r') || (ch == '\n')))
553 	{
554 	    this->m->file->seek(-1, SEEK_CUR);
555 	    break;
556 	}
557     }
558     if (this->m->file->tell() != this->m->first_xref_item_offset)
559     {
560 	QTC::TC("qpdf", "QPDF err /T mismatch");
561 	errors.push_back("space before first xref item (/T) mismatch "
562 			 "(computed = " +
563 			 QUtil::int_to_string(this->m->first_xref_item_offset) +
564 			 "; file = " +
565 			 QUtil::int_to_string(this->m->file->tell()));
566     }
567 
568     // P: first page number -- Implementation note 124 says Acrobat
569     // ignores this value, so we will too.
570 
571     // Check numbering of compressed objects in each xref section.
572     // For linearized files, all compressed objects are supposed to be
573     // at the end of the containing xref section if any object streams
574     // are in use.
575 
576     if (this->m->uncompressed_after_compressed)
577     {
578 	errors.push_back("linearized file contains an uncompressed object"
579 			 " after a compressed one in a cross-reference stream");
580     }
581 
582     // Further checking requires optimization and order calculation.
583     // Don't allow optimization to make changes.  If it has to, then
584     // the file is not properly linearized.  We use the xref table to
585     // figure out which objects are compressed and which are
586     // uncompressed.
587     { // local scope
588 	std::map<int, int> object_stream_data;
589 	for (std::map<QPDFObjGen, QPDFXRefEntry>::const_iterator iter =
590 		 this->m->xref_table.begin();
591 	     iter != this->m->xref_table.end(); ++iter)
592 	{
593 	    QPDFObjGen const& og = (*iter).first;
594 	    QPDFXRefEntry const& entry = (*iter).second;
595 	    if (entry.getType() == 2)
596 	    {
597 		object_stream_data[og.getObj()] = entry.getObjStreamNumber();
598 	    }
599 	}
600 	optimize(object_stream_data, false);
601 	calculateLinearizationData(object_stream_data);
602     }
603 
604     // E: offset of end of first page -- Implementation note 123 says
605     // Acrobat includes on extra object here by mistake.  pdlin fails
606     // to place thumbnail images in section 9, so when thumbnails are
607     // present, it also gets the wrong value for /E.  It also doesn't
608     // count outlines here when it should even though it places them
609     // in part 6.  This code fails to put thread information
610     // dictionaries in part 9, so it actually gets the wrong value for
611     // E when threads are present.  In that case, it would probably
612     // agree with pdlin.  As of this writing, the test suite doesn't
613     // contain any files with threads.
614 
615     if (this->m->part6.empty())
616     {
617         stopOnError("linearization part 6 unexpectedly empty");
618     }
619     qpdf_offset_t min_E = -1;
620     qpdf_offset_t max_E = -1;
621     for (std::vector<QPDFObjectHandle>::iterator iter = this->m->part6.begin();
622 	 iter != this->m->part6.end(); ++iter)
623     {
624 	QPDFObjGen og((*iter).getObjGen());
625 	if (this->m->obj_cache.count(og) == 0)
626         {
627             // All objects have to have been dereferenced to be classified.
628             throw std::logic_error("linearization part6 object not in cache");
629         }
630 	ObjCache const& oc = this->m->obj_cache[og];
631 	min_E = std::max(min_E, oc.end_before_space);
632 	max_E = std::max(max_E, oc.end_after_space);
633     }
634     if ((p.first_page_end < min_E) || (p.first_page_end > max_E))
635     {
636 	QTC::TC("qpdf", "QPDF warn /E mismatch");
637 	warnings.push_back("end of first page section (/E) mismatch: /E = " +
638 			   QUtil::int_to_string(p.first_page_end) +
639 			   "; computed = " +
640 			   QUtil::int_to_string(min_E) + ".." +
641 			   QUtil::int_to_string(max_E));
642     }
643 
644     // Check hint tables
645 
646     std::map<int, int> shared_idx_to_obj;
647     checkHSharedObject(errors, warnings, pages, shared_idx_to_obj);
648     checkHPageOffset(errors, warnings, pages, shared_idx_to_obj);
649     checkHOutlines(warnings);
650 
651     // Report errors
652 
653     bool result = true;
654 
655     // Treat all linearization errors as warnings. Many of them occur
656     // in otherwise working files, so it's really misleading to treat
657     // them as errors. We'll hang onto the distinction in the code for
658     // now in case we ever have a chance to clean up the linearization
659     // code.
660     if (! errors.empty())
661     {
662 	result = false;
663 	for (std::list<std::string>::iterator iter = errors.begin();
664 	     iter != errors.end(); ++iter)
665 	{
666 	    *this->m->err_stream << "WARNING: " << (*iter) << std::endl;
667 	}
668     }
669 
670     if (! warnings.empty())
671     {
672 	result = false;
673 	for (std::list<std::string>::iterator iter = warnings.begin();
674 	     iter != warnings.end(); ++iter)
675 	{
676 	    *this->m->err_stream << "WARNING: " << (*iter) << std::endl;
677 	}
678     }
679 
680     return result;
681 }
682 
683 qpdf_offset_t
maxEnd(ObjUser const & ou)684 QPDF::maxEnd(ObjUser const& ou)
685 {
686     if (this->m->obj_user_to_objects.count(ou) == 0)
687     {
688         stopOnError("no entry in object user table for requested object user");
689     }
690     std::set<QPDFObjGen> const& ogs = this->m->obj_user_to_objects[ou];
691     qpdf_offset_t end = 0;
692     for (std::set<QPDFObjGen>::const_iterator iter = ogs.begin();
693 	 iter != ogs.end(); ++iter)
694     {
695 	QPDFObjGen const& og = *iter;
696 	if (this->m->obj_cache.count(og) == 0)
697         {
698             stopOnError("unknown object referenced in object user table");
699         }
700 	end = std::max(end, this->m->obj_cache[og].end_after_space);
701     }
702     return end;
703 }
704 
705 qpdf_offset_t
getLinearizationOffset(QPDFObjGen const & og)706 QPDF::getLinearizationOffset(QPDFObjGen const& og)
707 {
708     QPDFXRefEntry entry = this->m->xref_table[og];
709     qpdf_offset_t result = 0;
710     switch (entry.getType())
711     {
712       case 1:
713 	result = entry.getOffset();
714 	break;
715 
716       case 2:
717 	// For compressed objects, return the offset of the object
718 	// stream that contains them.
719 	result = getLinearizationOffset(
720             QPDFObjGen(entry.getObjStreamNumber(), 0));
721 	break;
722 
723       default:
724 	stopOnError(
725 	    "getLinearizationOffset called for xref entry not of type 1 or 2");
726 	break;
727     }
728     return result;
729 }
730 
731 QPDFObjectHandle
getUncompressedObject(QPDFObjectHandle & obj,std::map<int,int> const & object_stream_data)732 QPDF::getUncompressedObject(QPDFObjectHandle& obj,
733 			    std::map<int, int> const& object_stream_data)
734 {
735     if (obj.isNull() || (object_stream_data.count(obj.getObjectID()) == 0))
736     {
737 	return obj;
738     }
739     else
740     {
741 	int repl = (*(object_stream_data.find(obj.getObjectID()))).second;
742 	return objGenToIndirect(QPDFObjGen(repl, 0));
743     }
744 }
745 
746 int
lengthNextN(int first_object,int n,std::list<std::string> & errors)747 QPDF::lengthNextN(int first_object, int n,
748 		  std::list<std::string>& errors)
749 {
750     int length = 0;
751     for (int i = 0; i < n; ++i)
752     {
753 	QPDFObjGen og(first_object + i, 0);
754 	if (this->m->xref_table.count(og) == 0)
755 	{
756 	    errors.push_back(
757 		"no xref table entry for " +
758 		QUtil::int_to_string(first_object + i) + " 0");
759 	}
760 	else
761 	{
762 	    if (this->m->obj_cache.count(og) == 0)
763             {
764                 stopOnError("found unknown object while"
765                             " calculating length for linearization data");
766             }
767 	    length += toI(this->m->obj_cache[og].end_after_space -
768                           getLinearizationOffset(og));
769 	}
770     }
771     return length;
772 }
773 
774 void
checkHPageOffset(std::list<std::string> & errors,std::list<std::string> & warnings,std::vector<QPDFObjectHandle> const & pages,std::map<int,int> & shared_idx_to_obj)775 QPDF::checkHPageOffset(std::list<std::string>& errors,
776 		       std::list<std::string>& warnings,
777 		       std::vector<QPDFObjectHandle> const& pages,
778 		       std::map<int, int>& shared_idx_to_obj)
779 {
780     // Implementation note 126 says Acrobat always sets
781     // delta_content_offset and delta_content_length in the page
782     // offset header dictionary to 0.  It also states that
783     // min_content_offset in the per-page information is always 0,
784     // which is an incorrect value.
785 
786     // Implementation note 127 explains that Acrobat always sets item
787     // 8 (min_content_length) to zero, item 9
788     // (nbits_delta_content_length) to the value of item 5
789     // (nbits_delta_page_length), and item 7 of each per-page hint
790     // table (delta_content_length) to item 2 (delta_page_length) of
791     // that entry.  Acrobat ignores these values when reading files.
792 
793     // Empirically, it also seems that Acrobat sometimes puts items
794     // under a page's /Resources dictionary in with shared objects
795     // even when they are private.
796 
797     int npages = toI(pages.size());
798     qpdf_offset_t table_offset = adjusted_offset(
799 	this->m->page_offset_hints.first_page_offset);
800     QPDFObjGen first_page_og(pages.at(0).getObjGen());
801     if (this->m->xref_table.count(first_page_og) == 0)
802     {
803         stopOnError("supposed first page object is not known");
804     }
805     qpdf_offset_t offset = getLinearizationOffset(first_page_og);
806     if (table_offset != offset)
807     {
808 	warnings.push_back("first page object offset mismatch");
809     }
810 
811     for (int pageno = 0; pageno < npages; ++pageno)
812     {
813 	QPDFObjGen page_og(pages.at(toS(pageno)).getObjGen());
814 	int first_object = page_og.getObj();
815 	if (this->m->xref_table.count(page_og) == 0)
816         {
817             stopOnError("unknown object in page offset hint table");
818         }
819 	offset = getLinearizationOffset(page_og);
820 
821 	HPageOffsetEntry& he =
822             this->m->page_offset_hints.entries.at(toS(pageno));
823 	CHPageOffsetEntry& ce =
824             this->m->c_page_offset_data.entries.at(toS(pageno));
825 	int h_nobjects = he.delta_nobjects +
826 	    this->m->page_offset_hints.min_nobjects;
827 	if (h_nobjects != ce.nobjects)
828 	{
829 	    // This happens with pdlin when there are thumbnails.
830 	    warnings.push_back(
831 		"object count mismatch for page " +
832 		QUtil::int_to_string(pageno) + ": hint table = " +
833 		QUtil::int_to_string(h_nobjects) + "; computed = " +
834 		QUtil::int_to_string(ce.nobjects));
835 	}
836 
837 	// Use value for number of objects in hint table rather than
838 	// computed value if there is a discrepancy.
839 	int length = lengthNextN(first_object, h_nobjects, errors);
840 	int h_length = toI(he.delta_page_length +
841                            this->m->page_offset_hints.min_page_length);
842 	if (length != h_length)
843 	{
844 	    // This condition almost certainly indicates a bad hint
845 	    // table or a bug in this code.
846 	    errors.push_back(
847 		"page length mismatch for page " +
848 		QUtil::int_to_string(pageno) + ": hint table = " +
849 		QUtil::int_to_string(h_length) + "; computed length = " +
850 		QUtil::int_to_string(length) + " (offset = " +
851 		QUtil::int_to_string(offset) + ")");
852 	}
853 
854 	offset += h_length;
855 
856 	// Translate shared object indexes to object numbers.
857 	std::set<int> hint_shared;
858 	std::set<int> computed_shared;
859 
860 	if ((pageno == 0) && (he.nshared_objects > 0))
861 	{
862 	    // pdlin and Acrobat both do this even though the spec
863 	    // states clearly and unambiguously that they should not.
864 	    warnings.push_back("page 0 has shared identifier entries");
865 	}
866 
867 	for (size_t i = 0; i < toS(he.nshared_objects); ++i)
868 	{
869 	    int idx = he.shared_identifiers.at(i);
870 	    if (shared_idx_to_obj.count(idx) == 0)
871             {
872                 stopOnError(
873                     "unable to get object for item in"
874                     " shared objects hint table");
875             }
876 	    hint_shared.insert(shared_idx_to_obj[idx]);
877 	}
878 
879 	for (size_t i = 0; i < toS(ce.nshared_objects); ++i)
880 	{
881 	    int idx = ce.shared_identifiers.at(i);
882 	    if (idx >= this->m->c_shared_object_data.nshared_total)
883             {
884                 stopOnError(
885                     "index out of bounds for shared object hint table");
886             }
887 	    int obj = this->m->c_shared_object_data.entries.at(toS(idx)).object;
888 	    computed_shared.insert(obj);
889 	}
890 
891 	for (std::set<int>::iterator iter = hint_shared.begin();
892 	     iter != hint_shared.end(); ++iter)
893 	{
894 	    if (! computed_shared.count(*iter))
895 	    {
896 		// pdlin puts thumbnails here even though it shouldn't
897 		warnings.push_back(
898 		    "page " + QUtil::int_to_string(pageno) +
899 		    ": shared object " + QUtil::int_to_string(*iter) +
900 		    ": in hint table but not computed list");
901 	    }
902 	}
903 
904 	for (std::set<int>::iterator iter = computed_shared.begin();
905 	     iter != computed_shared.end(); ++iter)
906 	{
907 	    if (! hint_shared.count(*iter))
908 	    {
909 		// Acrobat does not put some things including at least
910 		// built-in fonts and procsets here, at least in some
911 		// cases.
912 		warnings.push_back(
913 		    "page " + QUtil::int_to_string(pageno) +
914 		    ": shared object " + QUtil::int_to_string(*iter) +
915 		    ": in computed list but not hint table");
916 	    }
917 	}
918     }
919 }
920 
921 void
checkHSharedObject(std::list<std::string> & errors,std::list<std::string> & warnings,std::vector<QPDFObjectHandle> const & pages,std::map<int,int> & idx_to_obj)922 QPDF::checkHSharedObject(std::list<std::string>& errors,
923 			 std::list<std::string>& warnings,
924 			 std::vector<QPDFObjectHandle> const& pages,
925 			 std::map<int, int>& idx_to_obj)
926 {
927     // Implementation note 125 says shared object groups always
928     // contain only one object.  Implementation note 128 says that
929     // Acrobat always nbits_nobjects to zero.  Implementation note 130
930     // says that Acrobat does not support more than one shared object
931     // per group.  These are all consistent.
932 
933     // Implementation note 129 states that MD5 signatures are not
934     // implemented in Acrobat, so signature_present must always be
935     // zero.
936 
937     // Implementation note 131 states that first_shared_obj and
938     // first_shared_offset have meaningless values for single-page
939     // files.
940 
941     // Empirically, Acrobat and pdlin generate incorrect values for
942     // these whenever there are no shared objects not referenced by
943     // the first page (i.e., nshared_total == nshared_first_page).
944 
945     HSharedObject& so = this->m->shared_object_hints;
946     if (so.nshared_total < so.nshared_first_page)
947     {
948 	errors.push_back("shared object hint table: ntotal < nfirst_page");
949     }
950     else
951     {
952 	// The first nshared_first_page objects are consecutive
953 	// objects starting with the first page object.  The rest are
954 	// consecutive starting from the first_shared_obj object.
955 	int cur_object = pages.at(0).getObjectID();
956 	for (int i = 0; i < so.nshared_total; ++i)
957 	{
958 	    if (i == so.nshared_first_page)
959 	    {
960 		QTC::TC("qpdf", "QPDF lin check shared past first page");
961 		if (this->m->part8.empty())
962 		{
963 		    errors.push_back(
964 			"part 8 is empty but nshared_total > "
965 			"nshared_first_page");
966 		}
967 		else
968 		{
969 		    int obj = this->m->part8.at(0).getObjectID();
970 		    if (obj != so.first_shared_obj)
971 		    {
972 			errors.push_back(
973 			    "first shared object number mismatch: "
974 			    "hint table = " +
975 			    QUtil::int_to_string(so.first_shared_obj) +
976 			    "; computed = " +
977 			    QUtil::int_to_string(obj));
978 		    }
979 		}
980 
981 		cur_object = so.first_shared_obj;
982 
983 		QPDFObjGen og(cur_object, 0);
984 		if (this->m->xref_table.count(og) == 0)
985                 {
986                     stopOnError("unknown object in shared object hint table");
987                 }
988 		qpdf_offset_t offset = getLinearizationOffset(og);
989 		qpdf_offset_t h_offset =
990                     adjusted_offset(so.first_shared_offset);
991 		if (offset != h_offset)
992 		{
993 		    errors.push_back(
994 			"first shared object offset mismatch: hint table = " +
995 			QUtil::int_to_string(h_offset) + "; computed = " +
996 			QUtil::int_to_string(offset));
997 		}
998 	    }
999 
1000 	    idx_to_obj[i] = cur_object;
1001 	    HSharedObjectEntry& se = so.entries.at(toS(i));
1002 	    int nobjects = se.nobjects_minus_one + 1;
1003 	    int length = lengthNextN(cur_object, nobjects, errors);
1004 	    int h_length = so.min_group_length + se.delta_group_length;
1005 	    if (length != h_length)
1006 	    {
1007 		errors.push_back(
1008 		    "shared object " + QUtil::int_to_string(i) +
1009 		    " length mismatch: hint table = " +
1010 		    QUtil::int_to_string(h_length) + "; computed = " +
1011 		    QUtil::int_to_string(length));
1012 	    }
1013 	    cur_object += nobjects;
1014 	}
1015     }
1016 }
1017 
1018 void
checkHOutlines(std::list<std::string> & warnings)1019 QPDF::checkHOutlines(std::list<std::string>& warnings)
1020 {
1021     // Empirically, Acrobat generates the correct value for the object
1022     // number but incorrectly stores the next object number's offset
1023     // as the offset, at least when outlines appear in part 6.  It
1024     // also generates an incorrect value for length (specifically, the
1025     // length that would cover the correct number of objects from the
1026     // wrong starting place).  pdlin appears to generate correct
1027     // values in those cases.
1028 
1029     if (this->m->c_outline_data.nobjects == this->m->outline_hints.nobjects)
1030     {
1031 	if (this->m->c_outline_data.nobjects == 0)
1032 	{
1033 	    return;
1034 	}
1035 
1036 	if (this->m->c_outline_data.first_object ==
1037 	    this->m->outline_hints.first_object)
1038 	{
1039 	    // Check length and offset.  Acrobat gets these wrong.
1040 	    QPDFObjectHandle outlines = getRoot().getKey("/Outlines");
1041             if (! outlines.isIndirect())
1042             {
1043                 // This case is not exercised in test suite since not
1044                 // permitted by the spec, but if this does occur, the
1045                 // code below would fail.
1046 		warnings.push_back(
1047 		    "/Outlines key of root dictionary is not indirect");
1048                 return;
1049             }
1050 	    QPDFObjGen og(outlines.getObjGen());
1051 	    if (this->m->xref_table.count(og) == 0)
1052             {
1053                 stopOnError("unknown object in outlines hint table");
1054             }
1055 	    qpdf_offset_t offset = getLinearizationOffset(og);
1056 	    ObjUser ou(ObjUser::ou_root_key, "/Outlines");
1057 	    int length = toI(maxEnd(ou) - offset);
1058 	    qpdf_offset_t table_offset =
1059 		adjusted_offset(this->m->outline_hints.first_object_offset);
1060 	    if (offset != table_offset)
1061 	    {
1062 		warnings.push_back(
1063 		    "incorrect offset in outlines table: hint table = " +
1064 		    QUtil::int_to_string(table_offset) +
1065 		    "; computed = " + QUtil::int_to_string(offset));
1066 	    }
1067 	    int table_length = this->m->outline_hints.group_length;
1068 	    if (length != table_length)
1069 	    {
1070 		warnings.push_back(
1071 		    "incorrect length in outlines table: hint table = " +
1072 		    QUtil::int_to_string(table_length) +
1073 		    "; computed = " + QUtil::int_to_string(length));
1074 	    }
1075 	}
1076 	else
1077 	{
1078 	    warnings.push_back("incorrect first object number in outline "
1079 			       "hints table.");
1080 	}
1081     }
1082     else
1083     {
1084 	warnings.push_back("incorrect object count in outline hint table");
1085     }
1086 }
1087 
1088 void
showLinearizationData()1089 QPDF::showLinearizationData()
1090 {
1091     try
1092     {
1093 	readLinearizationData();
1094 	checkLinearizationInternal();
1095 	dumpLinearizationDataInternal();
1096     }
1097     catch (QPDFExc& e)
1098     {
1099 	*this->m->err_stream << e.what() << std::endl;
1100     }
1101 }
1102 
1103 void
dumpLinearizationDataInternal()1104 QPDF::dumpLinearizationDataInternal()
1105 {
1106     *this->m->out_stream
1107         << this->m->file->getName() << ": linearization data:" << std::endl
1108         << std::endl;
1109 
1110     *this->m->out_stream
1111 	<< "file_size: " << this->m->linp.file_size << std::endl
1112 	<< "first_page_object: " << this->m->linp.first_page_object << std::endl
1113 	<< "first_page_end: " << this->m->linp.first_page_end << std::endl
1114 	<< "npages: " << this->m->linp.npages << std::endl
1115 	<< "xref_zero_offset: " << this->m->linp.xref_zero_offset << std::endl
1116 	<< "first_page: " << this->m->linp.first_page << std::endl
1117 	<< "H_offset: " << this->m->linp.H_offset << std::endl
1118 	<< "H_length: " << this->m->linp.H_length << std::endl
1119 	<< std::endl;
1120 
1121     *this->m->out_stream << "Page Offsets Hint Table" << std::endl
1122                          << std::endl;
1123     dumpHPageOffset();
1124     *this->m->out_stream << std::endl
1125                          << "Shared Objects Hint Table" << std::endl
1126                          << std::endl;
1127     dumpHSharedObject();
1128 
1129     if (this->m->outline_hints.nobjects > 0)
1130     {
1131 	*this->m->out_stream << std::endl
1132                              << "Outlines Hint Table" << std::endl
1133                              << std::endl;
1134 	dumpHGeneric(this->m->outline_hints);
1135     }
1136 }
1137 
1138 qpdf_offset_t
adjusted_offset(qpdf_offset_t offset)1139 QPDF::adjusted_offset(qpdf_offset_t offset)
1140 {
1141     // All offsets >= H_offset have to be increased by H_length
1142     // since all hint table location values disregard the hint table
1143     // itself.
1144     if (offset >= this->m->linp.H_offset)
1145     {
1146 	return offset + this->m->linp.H_length;
1147     }
1148     return offset;
1149 }
1150 
1151 
1152 void
dumpHPageOffset()1153 QPDF::dumpHPageOffset()
1154 {
1155     HPageOffset& t = this->m->page_offset_hints;
1156     *this->m->out_stream
1157 	<< "min_nobjects: " << t.min_nobjects
1158 	<< std::endl
1159 	<< "first_page_offset: " << adjusted_offset(t.first_page_offset)
1160 	<< std::endl
1161 	<< "nbits_delta_nobjects: " << t.nbits_delta_nobjects
1162 	<< std::endl
1163 	<< "min_page_length: " << t.min_page_length
1164 	<< std::endl
1165 	<< "nbits_delta_page_length: " << t.nbits_delta_page_length
1166 	<< std::endl
1167 	<< "min_content_offset: " << t.min_content_offset
1168 	<< std::endl
1169 	<< "nbits_delta_content_offset: " << t.nbits_delta_content_offset
1170 	<< std::endl
1171 	<< "min_content_length: " << t.min_content_length
1172 	<< std::endl
1173 	<< "nbits_delta_content_length: " << t.nbits_delta_content_length
1174 	<< std::endl
1175 	<< "nbits_nshared_objects: " << t.nbits_nshared_objects
1176 	<< std::endl
1177 	<< "nbits_shared_identifier: " << t.nbits_shared_identifier
1178 	<< std::endl
1179 	<< "nbits_shared_numerator: " << t.nbits_shared_numerator
1180 	<< std::endl
1181 	<< "shared_denominator: " << t.shared_denominator
1182 	<< std::endl;
1183 
1184     for (size_t i1 = 0; i1 < toS(this->m->linp.npages); ++i1)
1185     {
1186 	HPageOffsetEntry& pe = t.entries.at(i1);
1187 	*this->m->out_stream
1188 	    << "Page " << i1 << ":" << std::endl
1189 	    << "  nobjects: " << pe.delta_nobjects + t.min_nobjects
1190 	    << std::endl
1191 	    << "  length: " << pe.delta_page_length + t.min_page_length
1192 	    << std::endl
1193 	    // content offset is relative to page, not file
1194 	    << "  content_offset: "
1195 	    << pe.delta_content_offset + t.min_content_offset << std::endl
1196 	    << "  content_length: "
1197 	    << pe.delta_content_length + t.min_content_length << std::endl
1198 	    << "  nshared_objects: " << pe.nshared_objects << std::endl;
1199 	for (size_t i2 = 0; i2 < toS(pe.nshared_objects); ++i2)
1200 	{
1201 	    *this->m->out_stream << "    identifier " << i2 << ": "
1202                                  << pe.shared_identifiers.at(i2) << std::endl;
1203 	    *this->m->out_stream << "    numerator " << i2 << ": "
1204                                  << pe.shared_numerators.at(i2) << std::endl;
1205 	}
1206     }
1207 }
1208 
1209 void
dumpHSharedObject()1210 QPDF::dumpHSharedObject()
1211 {
1212     HSharedObject& t = this->m->shared_object_hints;
1213     *this->m->out_stream
1214 	<< "first_shared_obj: " << t.first_shared_obj
1215 	<< std::endl
1216 	<< "first_shared_offset: " << adjusted_offset(t.first_shared_offset)
1217 	<< std::endl
1218 	<< "nshared_first_page: " << t.nshared_first_page
1219 	<< std::endl
1220 	<< "nshared_total: " << t.nshared_total
1221 	<< std::endl
1222 	<< "nbits_nobjects: " << t.nbits_nobjects
1223 	<< std::endl
1224 	<< "min_group_length: " << t.min_group_length
1225 	<< std::endl
1226 	<< "nbits_delta_group_length: " << t.nbits_delta_group_length
1227 	<< std::endl;
1228 
1229     for (size_t i = 0; i < toS(t.nshared_total); ++i)
1230     {
1231 	HSharedObjectEntry& se = t.entries.at(i);
1232 	*this->m->out_stream
1233             << "Shared Object " << i << ":" << std::endl
1234             << "  group length: "
1235             << se.delta_group_length + t.min_group_length << std::endl;
1236 	// PDF spec says signature present nobjects_minus_one are
1237 	// always 0, so print them only if they have a non-zero value.
1238 	if (se.signature_present)
1239 	{
1240 	    *this->m->out_stream << "  signature present" << std::endl;
1241 	}
1242 	if (se.nobjects_minus_one != 0)
1243 	{
1244 	    *this->m->out_stream << "  nobjects: "
1245                                  << se.nobjects_minus_one + 1 << std::endl;
1246 	}
1247     }
1248 }
1249 
1250 void
dumpHGeneric(HGeneric & t)1251 QPDF::dumpHGeneric(HGeneric& t)
1252 {
1253     *this->m->out_stream
1254 	<< "first_object: " << t.first_object
1255 	<< std::endl
1256 	<< "first_object_offset: " << adjusted_offset(t.first_object_offset)
1257 	<< std::endl
1258 	<< "nobjects: " << t.nobjects
1259 	<< std::endl
1260 	<< "group_length: " << t.group_length
1261 	<< std::endl;
1262 }
1263 
1264 QPDFObjectHandle
objGenToIndirect(QPDFObjGen const & og)1265 QPDF::objGenToIndirect(QPDFObjGen const& og)
1266 {
1267     return getObjectByID(og.getObj(), og.getGen());
1268 }
1269 
1270 void
calculateLinearizationData(std::map<int,int> const & object_stream_data)1271 QPDF::calculateLinearizationData(std::map<int, int> const& object_stream_data)
1272 {
1273     // This function calculates the ordering of objects, divides them
1274     // into the appropriate parts, and computes some values for the
1275     // linearization parameter dictionary and hint tables.  The file
1276     // must be optimized (via calling optimize()) prior to calling
1277     // this function.  Note that actual offsets and lengths are not
1278     // computed here, but anything related to object ordering is.
1279 
1280     if (this->m->object_to_obj_users.empty())
1281     {
1282 	// Note that we can't call optimize here because we don't know
1283 	// whether it should be called with or without allow changes.
1284 	throw std::logic_error(
1285 	    "INTERNAL ERROR: QPDF::calculateLinearizationData "
1286 	    "called before optimize()");
1287     }
1288 
1289     // Separate objects into the categories sufficient for us to
1290     // determine which part of the linearized file should contain the
1291     // object.  This categorization is useful for other purposes as
1292     // well.  Part numbers refer to version 1.4 of the PDF spec.
1293 
1294     // Parts 1, 3, 5, 10, and 11 don't contain any objects from the
1295     // original file (except the trailer dictionary in part 11).
1296 
1297     // Part 4 is the document catalog (root) and the following root
1298     // keys: /ViewerPreferences, /PageMode, /Threads, /OpenAction,
1299     // /AcroForm, /Encrypt.  Note that Thread information dictionaries
1300     // are supposed to appear in part 9, but we are disregarding that
1301     // recommendation for now.
1302 
1303     // Part 6 is the first page section.  It includes all remaining
1304     // objects referenced by the first page including shared objects
1305     // but not including thumbnails.  Additionally, if /PageMode is
1306     // /Outlines, then information from /Outlines also appears here.
1307 
1308     // Part 7 contains remaining objects private to pages other than
1309     // the first page.
1310 
1311     // Part 8 contains all remaining shared objects except those that
1312     // are shared only within thumbnails.
1313 
1314     // Part 9 contains all remaining objects.
1315 
1316     // We sort objects into the following categories:
1317 
1318     //   * open_document: part 4
1319 
1320     //   * first_page_private: part 6
1321 
1322     //   * first_page_shared: part 6
1323 
1324     //   * other_page_private: part 7
1325 
1326     //   * other_page_shared: part 8
1327 
1328     //   * thumbnail_private: part 9
1329 
1330     //   * thumbnail_shared: part 9
1331 
1332     //   * other: part 9
1333 
1334     //   * outlines: part 6 or 9
1335 
1336     this->m->part4.clear();
1337     this->m->part6.clear();
1338     this->m->part7.clear();
1339     this->m->part8.clear();
1340     this->m->part9.clear();
1341     this->m->c_linp = LinParameters();
1342     this->m->c_page_offset_data = CHPageOffset();
1343     this->m->c_shared_object_data = CHSharedObject();
1344     this->m->c_outline_data = HGeneric();
1345 
1346     QPDFObjectHandle root = getRoot();
1347     bool outlines_in_first_page = false;
1348     QPDFObjectHandle pagemode = root.getKey("/PageMode");
1349     QTC::TC("qpdf", "QPDF categorize pagemode present",
1350 	    pagemode.isName() ? 1 : 0);
1351     if (pagemode.isName())
1352     {
1353 	if (pagemode.getName() == "/UseOutlines")
1354 	{
1355 	    if (root.hasKey("/Outlines"))
1356 	    {
1357 		outlines_in_first_page = true;
1358 	    }
1359 	    else
1360 	    {
1361 		QTC::TC("qpdf", "QPDF UseOutlines but no Outlines");
1362 	    }
1363 	}
1364 	QTC::TC("qpdf", "QPDF categorize pagemode outlines",
1365 		outlines_in_first_page ? 1 : 0);
1366     }
1367 
1368     std::set<std::string> open_document_keys;
1369     open_document_keys.insert("/ViewerPreferences");
1370     open_document_keys.insert("/PageMode");
1371     open_document_keys.insert("/Threads");
1372     open_document_keys.insert("/OpenAction");
1373     open_document_keys.insert("/AcroForm");
1374 
1375     std::set<QPDFObjGen> lc_open_document;
1376     std::set<QPDFObjGen> lc_first_page_private;
1377     std::set<QPDFObjGen> lc_first_page_shared;
1378     std::set<QPDFObjGen> lc_other_page_private;
1379     std::set<QPDFObjGen> lc_other_page_shared;
1380     std::set<QPDFObjGen> lc_thumbnail_private;
1381     std::set<QPDFObjGen> lc_thumbnail_shared;
1382     std::set<QPDFObjGen> lc_other;
1383     std::set<QPDFObjGen> lc_outlines;
1384     std::set<QPDFObjGen> lc_root;
1385 
1386     for (std::map<QPDFObjGen, std::set<ObjUser> >::iterator oiter =
1387 	     this->m->object_to_obj_users.begin();
1388 	 oiter != this->m->object_to_obj_users.end(); ++oiter)
1389     {
1390 	QPDFObjGen const& og = (*oiter).first;
1391 
1392 	std::set<ObjUser>& ous = (*oiter).second;
1393 
1394 	bool in_open_document = false;
1395 	bool in_first_page = false;
1396 	int other_pages = 0;
1397 	int thumbs = 0;
1398 	int others = 0;
1399 	bool in_outlines = false;
1400 	bool is_root = false;
1401 
1402 	for (std::set<ObjUser>::iterator uiter = ous.begin();
1403 	     uiter != ous.end(); ++uiter)
1404 	{
1405 	    ObjUser const& ou = *uiter;
1406 	    switch (ou.ou_type)
1407 	    {
1408 	      case ObjUser::ou_trailer_key:
1409 		if (ou.key == "/Encrypt")
1410 		{
1411 		    in_open_document = true;
1412 		}
1413 		else
1414 		{
1415 		    ++others;
1416 		}
1417 		break;
1418 
1419 	      case ObjUser::ou_thumb:
1420 		++thumbs;
1421 		break;
1422 
1423 	      case ObjUser::ou_root_key:
1424 		if (open_document_keys.count(ou.key) > 0)
1425 		{
1426 		    in_open_document = true;
1427 		}
1428 		else if (ou.key == "/Outlines")
1429 		{
1430 		    in_outlines = true;
1431 		}
1432 		else
1433 		{
1434 		    ++others;
1435 		}
1436 		break;
1437 
1438 	      case ObjUser::ou_page:
1439 		if (ou.pageno == 0)
1440 		{
1441 		    in_first_page = true;
1442 		}
1443 		else
1444 		{
1445 		    ++other_pages;
1446 		}
1447 		break;
1448 
1449 	      case ObjUser::ou_root:
1450 		is_root = true;
1451 		break;
1452 
1453 	      case ObjUser::ou_bad:
1454 		stopOnError(
1455 		    "INTERNAL ERROR: QPDF::calculateLinearizationData: "
1456 		    "invalid user type");
1457 		break;
1458 	    }
1459 	}
1460 
1461 	if (is_root)
1462 	{
1463 	    lc_root.insert(og);
1464 	}
1465 	else if (in_outlines)
1466 	{
1467 	    lc_outlines.insert(og);
1468 	}
1469 	else if (in_open_document)
1470 	{
1471 	    lc_open_document.insert(og);
1472 	}
1473 	else if ((in_first_page) &&
1474 		 (others == 0) && (other_pages == 0) && (thumbs == 0))
1475 	{
1476 	    lc_first_page_private.insert(og);
1477 	}
1478 	else if (in_first_page)
1479 	{
1480 	    lc_first_page_shared.insert(og);
1481 	}
1482 	else if ((other_pages == 1) && (others == 0) && (thumbs == 0))
1483 	{
1484 	    lc_other_page_private.insert(og);
1485 	}
1486 	else if (other_pages > 1)
1487 	{
1488 	    lc_other_page_shared.insert(og);
1489 	}
1490 	else if ((thumbs == 1) && (others == 0))
1491 	{
1492 	    lc_thumbnail_private.insert(og);
1493 	}
1494 	else if (thumbs > 1)
1495 	{
1496 	    lc_thumbnail_shared.insert(og);
1497 	}
1498 	else
1499 	{
1500 	    lc_other.insert(og);
1501 	}
1502     }
1503 
1504     // Generate ordering for objects in the output file.  Sometimes we
1505     // just dump right from a set into a vector.  Rather than
1506     // optimizing this by going straight into the vector, we'll leave
1507     // these phases separate for now.  That way, this section can be
1508     // concerned only with ordering, and the above section can be
    // concerned only with categorization.  Note that sets of
1510     // QPDFObjGens are sorted by QPDFObjGen.  In a linearized file,
1511     // objects appear in sequence with the possible exception of hints
1512     // tables which we won't see here anyway.  That means that running
1513     // calculateLinearizationData() on a linearized file should give
1514     // results identical to the original file ordering.
1515 
1516     // We seem to traverse the page tree a lot in this code, but we
1517     // can address this for a future code optimization if necessary.
1518     // Premature optimization is the root of all evil.
1519     std::vector<QPDFObjectHandle> pages;
1520     { // local scope
1521 	// Map all page objects to the containing object stream.  This
1522 	// should be a no-op in a properly linearized file.
1523 	std::vector<QPDFObjectHandle> t = getAllPages();
1524 	for (std::vector<QPDFObjectHandle>::iterator iter = t.begin();
1525 	     iter != t.end(); ++iter)
1526 	{
1527 	    pages.push_back(getUncompressedObject(*iter, object_stream_data));
1528 	}
1529     }
1530     int npages = toI(pages.size());
1531 
1532     // We will be initializing some values of the computed hint
1533     // tables.  Specifically, we can initialize any items that deal
1534     // with object numbers or counts but not any items that deal with
1535     // lengths or offsets.  The code that writes linearized files will
1536     // have to fill in these values during the first pass.  The
1537     // validation code can compute them relatively easily given the
1538     // rest of the information.
1539 
1540     // npages is the size of the existing pages vector, which has been
1541     // created by traversing the pages tree, and as such is a
1542     // reasonable size.
1543     this->m->c_linp.npages = npages;
1544     this->m->c_page_offset_data.entries =
1545         std::vector<CHPageOffsetEntry>(toS(npages));
1546 
1547     // Part 4: open document objects.  We don't care about the order.
1548 
1549     if (lc_root.size() != 1)
1550     {
1551         stopOnError("found other than one root while"
1552                     " calculating linearization data");
1553     }
1554     this->m->part4.push_back(objGenToIndirect(*(lc_root.begin())));
1555     for (std::set<QPDFObjGen>::iterator iter = lc_open_document.begin();
1556 	 iter != lc_open_document.end(); ++iter)
1557     {
1558 	this->m->part4.push_back(objGenToIndirect(*iter));
1559     }
1560 
1561     // Part 6: first page objects.  Note: implementation note 124
1562     // states that Acrobat always treats page 0 as the first page for
1563     // linearization regardless of /OpenAction.  pdlin doesn't provide
1564     // any option to set this and also disregards /OpenAction.  We
1565     // will do the same.
1566 
1567     // First, place the actual first page object itself.
1568     if (pages.empty())
1569     {
1570         stopOnError("no pages found while calculating linearization data");
1571     }
1572     QPDFObjGen first_page_og(pages.at(0).getObjGen());
1573     if (! lc_first_page_private.count(first_page_og))
1574     {
1575 	stopOnError(
1576 	    "INTERNAL ERROR: QPDF::calculateLinearizationData: first page "
1577 	    "object not in lc_first_page_private");
1578     }
1579     lc_first_page_private.erase(first_page_og);
1580     this->m->c_linp.first_page_object = pages.at(0).getObjectID();
1581     this->m->part6.push_back(pages.at(0));
1582 
1583     // The PDF spec "recommends" an order for the rest of the objects,
1584     // but we are going to disregard it except to the extent that it
1585     // groups private and shared objects contiguously for the sake of
1586     // hint tables.
1587 
1588     for (std::set<QPDFObjGen>::iterator iter = lc_first_page_private.begin();
1589 	 iter != lc_first_page_private.end(); ++iter)
1590     {
1591 	this->m->part6.push_back(objGenToIndirect(*iter));
1592     }
1593 
1594     for (std::set<QPDFObjGen>::iterator iter = lc_first_page_shared.begin();
1595 	 iter != lc_first_page_shared.end(); ++iter)
1596     {
1597 	this->m->part6.push_back(objGenToIndirect(*iter));
1598     }
1599 
1600     // Place the outline dictionary if it goes in the first page section.
1601     if (outlines_in_first_page)
1602     {
1603 	pushOutlinesToPart(this->m->part6, lc_outlines, object_stream_data);
1604     }
1605 
1606     // Fill in page offset hint table information for the first page.
1607     // The PDF spec says that nshared_objects should be zero for the
1608     // first page.  pdlin does not appear to obey this, but it fills
1609     // in garbage values for all the shared object identifiers on the
1610     // first page.
1611 
1612     this->m->c_page_offset_data.entries.at(0).nobjects =
1613         toI(this->m->part6.size());
1614 
1615     // Part 7: other pages' private objects
1616 
1617     // For each page in order:
1618     for (size_t i = 1; i < toS(npages); ++i)
1619     {
1620 	// Place this page's page object
1621 
1622 	QPDFObjGen page_og(pages.at(i).getObjGen());
1623 	if (! lc_other_page_private.count(page_og))
1624 	{
1625 	    stopOnError(
1626 		"INTERNAL ERROR: "
1627 		"QPDF::calculateLinearizationData: page object for page " +
1628 		QUtil::uint_to_string(i) + " not in lc_other_page_private");
1629 	}
1630 	lc_other_page_private.erase(page_og);
1631 	this->m->part7.push_back(pages.at(i));
1632 
1633 	// Place all non-shared objects referenced by this page,
1634 	// updating the page object count for the hint table.
1635 
1636 	this->m->c_page_offset_data.entries.at(i).nobjects = 1;
1637 
1638 	ObjUser ou(ObjUser::ou_page, toI(i));
1639 	if (this->m->obj_user_to_objects.count(ou) == 0)
1640         {
1641             stopOnError("found unreferenced page while"
1642                         " calculating linearization data");
1643         }
1644 	std::set<QPDFObjGen> ogs = this->m->obj_user_to_objects[ou];
1645 	for (std::set<QPDFObjGen>::iterator iter = ogs.begin();
1646 	     iter != ogs.end(); ++iter)
1647 	{
1648 	    QPDFObjGen const& og = (*iter);
1649 	    if (lc_other_page_private.count(og))
1650 	    {
1651 		lc_other_page_private.erase(og);
1652 		this->m->part7.push_back(objGenToIndirect(og));
1653 		++this->m->c_page_offset_data.entries.at(i).nobjects;
1654 	    }
1655 	}
1656     }
1657     // That should have covered all part7 objects.
1658     if (! lc_other_page_private.empty())
1659     {
1660 	stopOnError(
1661 	    "INTERNAL ERROR:"
1662 	    " QPDF::calculateLinearizationData: lc_other_page_private is "
1663 	    "not empty after generation of part7");
1664     }
1665 
1666     // Part 8: other pages' shared objects
1667 
1668     // Order is unimportant.
1669     for (std::set<QPDFObjGen>::iterator iter = lc_other_page_shared.begin();
1670 	 iter != lc_other_page_shared.end(); ++iter)
1671     {
1672 	this->m->part8.push_back(objGenToIndirect(*iter));
1673     }
1674 
1675     // Part 9: other objects
1676 
1677     // The PDF specification makes recommendations on ordering here.
1678     // We follow them only to a limited extent.  Specifically, we put
1679     // the pages tree first, then private thumbnail objects in page
1680     // order, then shared thumbnail objects, and then outlines (unless
1681     // in part 6).  After that, we throw all remaining objects in
1682     // arbitrary order.
1683 
1684     // Place the pages tree.
1685     std::set<QPDFObjGen> pages_ogs =
1686 	this->m->obj_user_to_objects[ObjUser(ObjUser::ou_root_key, "/Pages")];
1687     if (pages_ogs.empty())
1688     {
1689         stopOnError("found empty pages tree while"
1690                     " calculating linearization data");
1691     }
1692     for (std::set<QPDFObjGen>::iterator iter = pages_ogs.begin();
1693 	 iter != pages_ogs.end(); ++iter)
1694     {
1695 	QPDFObjGen const& og = *iter;
1696 	if (lc_other.count(og))
1697 	{
1698 	    lc_other.erase(og);
1699 	    this->m->part9.push_back(objGenToIndirect(og));
1700 	}
1701     }
1702 
1703     // Place private thumbnail images in page order.  Slightly more
1704     // information would be required if we were going to bother with
1705     // thumbnail hint tables.
1706     for (size_t i = 0; i < toS(npages); ++i)
1707     {
1708 	QPDFObjectHandle thumb = pages.at(i).getKey("/Thumb");
1709 	thumb = getUncompressedObject(thumb, object_stream_data);
1710 	if (! thumb.isNull())
1711 	{
1712 	    // Output the thumbnail itself
1713 	    QPDFObjGen thumb_og(thumb.getObjGen());
1714 	    if (lc_thumbnail_private.count(thumb_og))
1715 	    {
1716 		lc_thumbnail_private.erase(thumb_og);
1717 		this->m->part9.push_back(thumb);
1718 	    }
1719 	    else
1720 	    {
1721 		// No internal error this time...there's nothing to
1722 		// stop this object from having been referred to
1723 		// somewhere else outside of a page's /Thumb, and if
1724 		// it had been, there's nothing to prevent it from
1725 		// having been in some set other than
1726 		// lc_thumbnail_private.
1727 	    }
1728 	    std::set<QPDFObjGen>& ogs =
1729 		this->m->obj_user_to_objects[
1730                     ObjUser(ObjUser::ou_thumb, toI(i))];
1731 	    for (std::set<QPDFObjGen>::iterator iter = ogs.begin();
1732 		 iter != ogs.end(); ++iter)
1733 	    {
1734 		QPDFObjGen const& og = *iter;
1735 		if (lc_thumbnail_private.count(og))
1736 		{
1737 		    lc_thumbnail_private.erase(og);
1738 		    this->m->part9.push_back(objGenToIndirect(og));
1739 		}
1740 	    }
1741 	}
1742     }
1743     if (! lc_thumbnail_private.empty())
1744     {
1745 	stopOnError(
1746 	    "INTERNAL ERROR: "
1747 	    "QPDF::calculateLinearizationData: lc_thumbnail_private "
1748 	    "not empty after placing thumbnails");
1749     }
1750 
1751     // Place shared thumbnail objects
1752     for (std::set<QPDFObjGen>::iterator iter = lc_thumbnail_shared.begin();
1753 	 iter != lc_thumbnail_shared.end(); ++iter)
1754     {
1755 	this->m->part9.push_back(objGenToIndirect(*iter));
1756     }
1757 
1758     // Place outlines unless in first page
1759     if (! outlines_in_first_page)
1760     {
1761 	pushOutlinesToPart(this->m->part9, lc_outlines, object_stream_data);
1762     }
1763 
1764     // Place all remaining objects
1765     for (std::set<QPDFObjGen>::iterator iter = lc_other.begin();
1766 	 iter != lc_other.end(); ++iter)
1767     {
1768 	this->m->part9.push_back(objGenToIndirect(*iter));
1769     }
1770 
1771     // Make sure we got everything exactly once.
1772 
1773     size_t num_placed =
1774         this->m->part4.size() + this->m->part6.size() + this->m->part7.size() +
1775         this->m->part8.size() + this->m->part9.size();
1776     size_t num_wanted = this->m->object_to_obj_users.size();
1777     if (num_placed != num_wanted)
1778     {
1779 	stopOnError(
1780 	    "INTERNAL ERROR: QPDF::calculateLinearizationData: wrong "
1781 	    "number of objects placed (num_placed = " +
1782 	    QUtil::uint_to_string(num_placed) +
1783 	    "; number of objects: " +
1784 	    QUtil::uint_to_string(num_wanted));
1785     }
1786 
1787     // Calculate shared object hint table information including
1788     // references to shared objects from page offset hint data.
1789 
1790     // The shared object hint table consists of all part 6 (whether
1791     // shared or not) in order followed by all part 8 objects in
1792     // order.  Add the objects to shared object data keeping a map of
1793     // object number to index.  Then populate the shared object
1794     // information for the pages.
1795 
1796     // Note that two objects never have the same object number, so we
1797     // can map from object number only without regards to generation.
1798     std::map<int, int> obj_to_index;
1799 
1800     this->m->c_shared_object_data.nshared_first_page =
1801         toI(this->m->part6.size());
1802     this->m->c_shared_object_data.nshared_total =
1803 	this->m->c_shared_object_data.nshared_first_page +
1804         toI(this->m->part8.size());
1805 
1806     std::vector<CHSharedObjectEntry>& shared =
1807 	this->m->c_shared_object_data.entries;
1808     for (std::vector<QPDFObjectHandle>::iterator iter = this->m->part6.begin();
1809 	 iter != this->m->part6.end(); ++iter)
1810     {
1811 	QPDFObjectHandle& oh = *iter;
1812 	int obj = oh.getObjectID();
1813 	obj_to_index[obj] = toI(shared.size());
1814 	shared.push_back(CHSharedObjectEntry(obj));
1815     }
1816     QTC::TC("qpdf", "QPDF lin part 8 empty", this->m->part8.empty() ? 1 : 0);
1817     if (! this->m->part8.empty())
1818     {
1819 	this->m->c_shared_object_data.first_shared_obj =
1820 	    this->m->part8.at(0).getObjectID();
1821 	for (std::vector<QPDFObjectHandle>::iterator iter =
1822 		 this->m->part8.begin();
1823 	     iter != this->m->part8.end(); ++iter)
1824 	{
1825 	    QPDFObjectHandle& oh = *iter;
1826 	    int obj = oh.getObjectID();
1827 	    obj_to_index[obj] = toI(shared.size());
1828 	    shared.push_back(CHSharedObjectEntry(obj));
1829 	}
1830     }
1831     if (static_cast<size_t>(this->m->c_shared_object_data.nshared_total) !=
1832         this->m->c_shared_object_data.entries.size())
1833     {
1834         stopOnError(
1835             "shared object hint table has wrong number of entries");
1836     }
1837 
1838     // Now compute the list of shared objects for each page after the
1839     // first page.
1840 
1841     for (size_t i = 1; i < toS(npages); ++i)
1842     {
1843 	CHPageOffsetEntry& pe = this->m->c_page_offset_data.entries.at(i);
1844 	ObjUser ou(ObjUser::ou_page, toI(i));
1845 	if (this->m->obj_user_to_objects.count(ou) == 0)
1846         {
1847             stopOnError("found unreferenced page while"
1848                         " calculating linearization data");
1849         }
1850 	std::set<QPDFObjGen> const& ogs = this->m->obj_user_to_objects[ou];
1851 	for (std::set<QPDFObjGen>::const_iterator iter = ogs.begin();
1852 	     iter != ogs.end(); ++iter)
1853 	{
1854 	    QPDFObjGen const& og = *iter;
1855 	    if ((this->m->object_to_obj_users[og].size() > 1) &&
1856 		(obj_to_index.count(og.getObj()) > 0))
1857 	    {
1858 		int idx = obj_to_index[og.getObj()];
1859 		++pe.nshared_objects;
1860 		pe.shared_identifiers.push_back(idx);
1861 	    }
1862 	}
1863     }
1864 }
1865 
1866 void
pushOutlinesToPart(std::vector<QPDFObjectHandle> & part,std::set<QPDFObjGen> & lc_outlines,std::map<int,int> const & object_stream_data)1867 QPDF::pushOutlinesToPart(
1868     std::vector<QPDFObjectHandle>& part,
1869     std::set<QPDFObjGen>& lc_outlines,
1870     std::map<int, int> const& object_stream_data)
1871 {
1872     QPDFObjectHandle root = getRoot();
1873     QPDFObjectHandle outlines = root.getKey("/Outlines");
1874     if (outlines.isNull())
1875     {
1876 	return;
1877     }
1878     outlines = getUncompressedObject(outlines, object_stream_data);
1879     QPDFObjGen outlines_og(outlines.getObjGen());
1880     QTC::TC("qpdf", "QPDF lin outlines in part",
1881 	    ((&part == (&this->m->part6)) ? 0
1882 	     : (&part == (&this->m->part9)) ? 1
1883 	     : 9999));		// can't happen
1884     this->m->c_outline_data.first_object = outlines_og.getObj();
1885     this->m->c_outline_data.nobjects = 1;
1886     lc_outlines.erase(outlines_og);
1887     part.push_back(outlines);
1888     for (std::set<QPDFObjGen>::iterator iter = lc_outlines.begin();
1889 	 iter != lc_outlines.end(); ++iter)
1890     {
1891 	part.push_back(objGenToIndirect(*iter));
1892 	++this->m->c_outline_data.nobjects;
1893     }
1894 }
1895 
1896 void
getLinearizedParts(std::map<int,int> const & object_stream_data,std::vector<QPDFObjectHandle> & part4,std::vector<QPDFObjectHandle> & part6,std::vector<QPDFObjectHandle> & part7,std::vector<QPDFObjectHandle> & part8,std::vector<QPDFObjectHandle> & part9)1897 QPDF::getLinearizedParts(
1898     std::map<int, int> const& object_stream_data,
1899     std::vector<QPDFObjectHandle>& part4,
1900     std::vector<QPDFObjectHandle>& part6,
1901     std::vector<QPDFObjectHandle>& part7,
1902     std::vector<QPDFObjectHandle>& part8,
1903     std::vector<QPDFObjectHandle>& part9)
1904 {
1905     calculateLinearizationData(object_stream_data);
1906     part4 = this->m->part4;
1907     part6 = this->m->part6;
1908     part7 = this->m->part7;
1909     part8 = this->m->part8;
1910     part9 = this->m->part9;
1911 }
1912 
// Return the number of bits needed to represent val: 0 for 0,
// otherwise the 1-based position of the highest set bit (so 1 -> 1,
// 255 -> 8, 256 -> 9).
//
// The count is taken on the unsigned bit pattern of val.
// Right-shifting a negative int usually preserves the sign bit, so a
// signed shift would never reach zero; with the unsigned pattern the
// loop always terminates, and a negative val yields the full width
// of unsigned int.
static inline int nbits(int val)
{
    unsigned int remaining = static_cast<unsigned int>(val);
    int count = 0;
    while (remaining != 0)
    {
        ++count;
        remaining >>= 1;
    }
    return count;
}
1917 
1918 int
outputLengthNextN(int in_object,int n,std::map<int,qpdf_offset_t> const & lengths,std::map<int,int> const & obj_renumber)1919 QPDF::outputLengthNextN(
1920     int in_object, int n,
1921     std::map<int, qpdf_offset_t> const& lengths,
1922     std::map<int, int> const& obj_renumber)
1923 {
1924     // Figure out the length of a series of n consecutive objects in
1925     // the output file starting with whatever object in_object from
1926     // the input file mapped to.
1927 
1928     if (obj_renumber.count(in_object) == 0)
1929     {
1930         stopOnError("found object that is not renumbered while"
1931                     " writing linearization data");
1932     }
1933     int first = (*(obj_renumber.find(in_object))).second;
1934     int length = 0;
1935     for (int i = 0; i < n; ++i)
1936     {
1937 	if (lengths.count(first + i) == 0)
1938         {
1939             stopOnError("found item with unknown length"
1940                         " while writing linearization data");
1941         }
1942 	length += toI((*(lengths.find(first + toI(i)))).second);
1943     }
1944     return length;
1945 }
1946 
1947 void
calculateHPageOffset(std::map<int,QPDFXRefEntry> const & xref,std::map<int,qpdf_offset_t> const & lengths,std::map<int,int> const & obj_renumber)1948 QPDF::calculateHPageOffset(
1949     std::map<int, QPDFXRefEntry> const& xref,
1950     std::map<int, qpdf_offset_t> const& lengths,
1951     std::map<int, int> const& obj_renumber)
1952 {
1953     // Page Offset Hint Table
1954 
1955     // We are purposely leaving some values set to their initial zero
1956     // values.
1957 
1958     std::vector<QPDFObjectHandle> const& pages = getAllPages();
1959     size_t npages = pages.size();
1960     CHPageOffset& cph = this->m->c_page_offset_data;
1961     std::vector<CHPageOffsetEntry>& cphe = cph.entries;
1962 
1963     // Calculate minimum and maximum values for number of objects per
1964     // page and page length.
1965 
1966     int min_nobjects = cphe.at(0).nobjects;
1967     int max_nobjects = min_nobjects;
1968     int min_length = outputLengthNextN(
1969 	pages.at(0).getObjectID(), min_nobjects, lengths, obj_renumber);
1970     int max_length = min_length;
1971     int max_shared = cphe.at(0).nshared_objects;
1972 
1973     HPageOffset& ph = this->m->page_offset_hints;
1974     std::vector<HPageOffsetEntry>& phe = ph.entries;
1975     // npages is the size of the existing pages array.
1976     phe = std::vector<HPageOffsetEntry>(npages);
1977 
1978     for (unsigned int i = 0; i < npages; ++i)
1979     {
1980 	// Calculate values for each page, assigning full values to
1981 	// the delta items.  They will be adjusted later.
1982 
1983 	// Repeat calculations for page 0 so we can assign to phe[i]
1984 	// without duplicating those assignments.
1985 
1986 	int nobjects = cphe.at(i).nobjects;
1987 	int length = outputLengthNextN(
1988 	    pages.at(i).getObjectID(), nobjects, lengths, obj_renumber);
1989 	int nshared = cphe.at(i).nshared_objects;
1990 
1991 	min_nobjects = std::min(min_nobjects, nobjects);
1992 	max_nobjects = std::max(max_nobjects, nobjects);
1993 	min_length = std::min(min_length, length);
1994 	max_length = std::max(max_length, length);
1995 	max_shared = std::max(max_shared, nshared);
1996 
1997 	phe.at(i).delta_nobjects = nobjects;
1998 	phe.at(i).delta_page_length = length;
1999 	phe.at(i).nshared_objects = nshared;
2000     }
2001 
2002     ph.min_nobjects = min_nobjects;
2003     int in_page0_id = pages.at(0).getObjectID();
2004     int out_page0_id = (*(obj_renumber.find(in_page0_id))).second;
2005     ph.first_page_offset = (*(xref.find(out_page0_id))).second.getOffset();
2006     ph.nbits_delta_nobjects = nbits(max_nobjects - min_nobjects);
2007     ph.min_page_length = min_length;
2008     ph.nbits_delta_page_length = nbits(max_length - min_length);
2009     ph.nbits_nshared_objects = nbits(max_shared);
2010     ph.nbits_shared_identifier =
2011 	nbits(this->m->c_shared_object_data.nshared_total);
2012     ph.shared_denominator = 4;	// doesn't matter
2013 
2014     // It isn't clear how to compute content offset and content
2015     // length.  Since we are not interleaving page objects with the
2016     // content stream, we'll use the same values for content length as
2017     // page length.  We will use 0 as content offset because this is
2018     // what Adobe does (implementation note 127) and pdlin as well.
2019     ph.nbits_delta_content_length = ph.nbits_delta_page_length;
2020     ph.min_content_length = ph.min_page_length;
2021 
2022     for (size_t i = 0; i < npages; ++i)
2023     {
2024 	// Adjust delta entries
2025 	if ((phe.at(i).delta_nobjects < min_nobjects) ||
2026             (phe.at(i).delta_page_length < min_length))
2027         {
2028             stopOnError("found too small delta nobjects or delta page length"
2029                         " while writing linearization data");
2030         }
2031 	phe.at(i).delta_nobjects -= min_nobjects;
2032 	phe.at(i).delta_page_length -= min_length;
2033 	phe.at(i).delta_content_length = phe.at(i).delta_page_length;
2034 
2035 	for (size_t j = 0; j < toS(cphe.at(i).nshared_objects); ++j)
2036 	{
2037 	    phe.at(i).shared_identifiers.push_back(
2038 		cphe.at(i).shared_identifiers.at(j));
2039 	    phe.at(i).shared_numerators.push_back(0);
2040 	}
2041     }
2042 }
2043 
2044 void
calculateHSharedObject(std::map<int,QPDFXRefEntry> const & xref,std::map<int,qpdf_offset_t> const & lengths,std::map<int,int> const & obj_renumber)2045 QPDF::calculateHSharedObject(
2046     std::map<int, QPDFXRefEntry> const& xref,
2047     std::map<int, qpdf_offset_t> const& lengths,
2048     std::map<int, int> const& obj_renumber)
2049 {
2050     CHSharedObject& cso = this->m->c_shared_object_data;
2051     std::vector<CHSharedObjectEntry>& csoe = cso.entries;
2052     HSharedObject& so = this->m->shared_object_hints;
2053     std::vector<HSharedObjectEntry>& soe = so.entries;
2054     soe.clear();
2055 
2056     int min_length = outputLengthNextN(
2057 	csoe.at(0).object, 1, lengths, obj_renumber);
2058     int max_length = min_length;
2059 
2060     for (size_t i = 0; i < toS(cso.nshared_total); ++i)
2061     {
2062 	// Assign absolute numbers to deltas; adjust later
2063 	int length = outputLengthNextN(
2064 	    csoe.at(i).object, 1, lengths, obj_renumber);
2065 	min_length = std::min(min_length, length);
2066 	max_length = std::max(max_length, length);
2067         soe.push_back(HSharedObjectEntry());
2068 	soe.at(i).delta_group_length = length;
2069     }
2070     if (soe.size() != QIntC::to_size(cso.nshared_total))
2071     {
2072         stopOnError("soe has wrong size after initialization");
2073     }
2074 
2075     so.nshared_total = cso.nshared_total;
2076     so.nshared_first_page = cso.nshared_first_page;
2077     if (so.nshared_total > so.nshared_first_page)
2078     {
2079 	so.first_shared_obj =
2080 	    (*(obj_renumber.find(cso.first_shared_obj))).second;
2081 	so.first_shared_offset =
2082 	    (*(xref.find(so.first_shared_obj))).second.getOffset();
2083     }
2084     so.min_group_length = min_length;
2085     so.nbits_delta_group_length = nbits(max_length - min_length);
2086 
2087     for (size_t i = 0; i < toS(cso.nshared_total); ++i)
2088     {
2089 	// Adjust deltas
2090 	if (soe.at(i).delta_group_length < min_length)
2091         {
2092             stopOnError("found too small group length while"
2093                         " writing linearization data");
2094         }
2095 	soe.at(i).delta_group_length -= min_length;
2096     }
2097 }
2098 
2099 void
calculateHOutline(std::map<int,QPDFXRefEntry> const & xref,std::map<int,qpdf_offset_t> const & lengths,std::map<int,int> const & obj_renumber)2100 QPDF::calculateHOutline(
2101     std::map<int, QPDFXRefEntry> const& xref,
2102     std::map<int, qpdf_offset_t> const& lengths,
2103     std::map<int, int> const& obj_renumber)
2104 {
2105     HGeneric& cho = this->m->c_outline_data;
2106 
2107     if (cho.nobjects == 0)
2108     {
2109 	return;
2110     }
2111 
2112     HGeneric& ho = this->m->outline_hints;
2113 
2114     ho.first_object =
2115 	(*(obj_renumber.find(cho.first_object))).second;
2116     ho.first_object_offset =
2117 	(*(xref.find(ho.first_object))).second.getOffset();
2118     ho.nobjects = cho.nobjects;
2119     ho.group_length = outputLengthNextN(
2120 	cho.first_object, ho.nobjects, lengths, obj_renumber);
2121 }
2122 
2123 template <class T, class int_type>
2124 static void
write_vector_int(BitWriter & w,int nitems,std::vector<T> & vec,int bits,int_type T::* field)2125 write_vector_int(BitWriter& w, int nitems, std::vector<T>& vec,
2126 		 int bits, int_type T::*field)
2127 {
2128     // nitems times, write bits bits from the given field of the ith
2129     // vector to the given bit writer.
2130 
2131     for (size_t i = 0; i < QIntC::to_size(nitems); ++i)
2132     {
2133 	w.writeBits(QIntC::to_ulonglong(vec.at(i).*field),
2134                     QIntC::to_size(bits));
2135     }
2136     // The PDF spec says that each hint table starts at a byte
2137     // boundary.  Each "row" actually must start on a byte boundary.
2138     w.flush();
2139 }
2140 
2141 template <class T>
2142 static void
write_vector_vector(BitWriter & w,int nitems1,std::vector<T> & vec1,int T::* nitems2,int bits,std::vector<int> T::* vec2)2143 write_vector_vector(BitWriter& w,
2144 		    int nitems1, std::vector<T>& vec1, int T::*nitems2,
2145 		    int bits, std::vector<int> T::*vec2)
2146 {
2147     // nitems1 times, write nitems2 (from the ith element of vec1) items
2148     // from the vec2 vector field of the ith item of vec1.
2149     for (size_t i1 = 0; i1 < QIntC::to_size(nitems1); ++i1)
2150     {
2151 	for (size_t i2 = 0; i2 < QIntC::to_size(vec1.at(i1).*nitems2); ++i2)
2152 	{
2153 	    w.writeBits(QIntC::to_ulonglong((vec1.at(i1).*vec2).at(i2)),
2154                         QIntC::to_size(bits));
2155 	}
2156     }
2157     w.flush();
2158 }
2159 
2160 
2161 void
writeHPageOffset(BitWriter & w)2162 QPDF::writeHPageOffset(BitWriter& w)
2163 {
2164     HPageOffset& t = this->m->page_offset_hints;
2165 
2166     w.writeBitsInt(t.min_nobjects, 32);                     // 1
2167     w.writeBitsInt(toI(t.first_page_offset), 32);           // 2
2168     w.writeBitsInt(t.nbits_delta_nobjects, 16);             // 3
2169     w.writeBitsInt(t.min_page_length, 32);                  // 4
2170     w.writeBitsInt(t.nbits_delta_page_length, 16);          // 5
2171     w.writeBitsInt(t.min_content_offset, 32);               // 6
2172     w.writeBitsInt(t.nbits_delta_content_offset, 16);       // 7
2173     w.writeBitsInt(t.min_content_length, 32);               // 8
2174     w.writeBitsInt(t.nbits_delta_content_length, 16);       // 9
2175     w.writeBitsInt(t.nbits_nshared_objects, 16);            // 10
2176     w.writeBitsInt(t.nbits_shared_identifier, 16);          // 11
2177     w.writeBitsInt(t.nbits_shared_numerator, 16);           // 12
2178     w.writeBitsInt(t.shared_denominator, 16);               // 13
2179 
2180     int nitems = toI(getAllPages().size());
2181     std::vector<HPageOffsetEntry>& entries = t.entries;
2182 
2183     write_vector_int(w, nitems, entries,
2184 		     t.nbits_delta_nobjects,
2185 		     &HPageOffsetEntry::delta_nobjects);
2186     write_vector_int(w, nitems, entries,
2187 		     t.nbits_delta_page_length,
2188 		     &HPageOffsetEntry::delta_page_length);
2189     write_vector_int(w, nitems, entries,
2190 		     t.nbits_nshared_objects,
2191 		     &HPageOffsetEntry::nshared_objects);
2192     write_vector_vector(w, nitems, entries,
2193 			&HPageOffsetEntry::nshared_objects,
2194 			t.nbits_shared_identifier,
2195 			&HPageOffsetEntry::shared_identifiers);
2196     write_vector_vector(w, nitems, entries,
2197 			&HPageOffsetEntry::nshared_objects,
2198 			t.nbits_shared_numerator,
2199 			&HPageOffsetEntry::shared_numerators);
2200     write_vector_int(w, nitems, entries,
2201 		     t.nbits_delta_content_offset,
2202 		     &HPageOffsetEntry::delta_content_offset);
2203     write_vector_int(w, nitems, entries,
2204 		     t.nbits_delta_content_length,
2205 		     &HPageOffsetEntry::delta_content_length);
2206 }
2207 
2208 void
writeHSharedObject(BitWriter & w)2209 QPDF::writeHSharedObject(BitWriter& w)
2210 {
2211     HSharedObject& t = this->m->shared_object_hints;
2212 
2213     w.writeBitsInt(t.first_shared_obj, 32);                 // 1
2214     w.writeBitsInt(toI(t.first_shared_offset), 32);         // 2
2215     w.writeBitsInt(t.nshared_first_page, 32);               // 3
2216     w.writeBitsInt(t.nshared_total, 32);                    // 4
2217     w.writeBitsInt(t.nbits_nobjects, 16);                   // 5
2218     w.writeBitsInt(t.min_group_length, 32);                 // 6
2219     w.writeBitsInt(t.nbits_delta_group_length, 16);         // 7
2220 
2221     QTC::TC("qpdf", "QPDF lin write nshared_total > nshared_first_page",
2222 	    (t.nshared_total > t.nshared_first_page) ? 1 : 0);
2223 
2224     int nitems = t.nshared_total;
2225     std::vector<HSharedObjectEntry>& entries = t.entries;
2226 
2227     write_vector_int(w, nitems, entries,
2228 		     t.nbits_delta_group_length,
2229 		     &HSharedObjectEntry::delta_group_length);
2230     write_vector_int(w, nitems, entries,
2231 		     1, &HSharedObjectEntry::signature_present);
2232     for (size_t i = 0; i < toS(nitems); ++i)
2233     {
2234 	// If signature were present, we'd have to write a 128-bit hash.
2235 	if (entries.at(i).signature_present != 0)
2236         {
2237             stopOnError("found unexpected signature present"
2238                         " while writing linearization data");
2239         }
2240     }
2241     write_vector_int(w, nitems, entries,
2242 		     t.nbits_nobjects,
2243 		     &HSharedObjectEntry::nobjects_minus_one);
2244 }
2245 
2246 void
writeHGeneric(BitWriter & w,HGeneric & t)2247 QPDF::writeHGeneric(BitWriter& w, HGeneric& t)
2248 {
2249     w.writeBitsInt(t.first_object, 32);                     // 1
2250     w.writeBitsInt(toI(t.first_object_offset), 32);         // 2
2251     w.writeBitsInt(t.nobjects, 32);                         // 3
2252     w.writeBitsInt(t.group_length, 32);                     // 4
2253 }
2254 
2255 void
generateHintStream(std::map<int,QPDFXRefEntry> const & xref,std::map<int,qpdf_offset_t> const & lengths,std::map<int,int> const & obj_renumber,PointerHolder<Buffer> & hint_buffer,int & S,int & O)2256 QPDF::generateHintStream(std::map<int, QPDFXRefEntry> const& xref,
2257 			 std::map<int, qpdf_offset_t> const& lengths,
2258 			 std::map<int, int> const& obj_renumber,
2259 			 PointerHolder<Buffer>& hint_buffer,
2260 			 int& S, int& O)
2261 {
2262     // Populate actual hint table values
2263     calculateHPageOffset(xref, lengths, obj_renumber);
2264     calculateHSharedObject(xref, lengths, obj_renumber);
2265     calculateHOutline(xref, lengths, obj_renumber);
2266 
2267     // Write the hint stream itself into a compressed memory buffer.
2268     // Write through a counter so we can get offsets.
2269     Pl_Buffer hint_stream("hint stream");
2270     Pl_Flate f("compress hint stream", &hint_stream, Pl_Flate::a_deflate);
2271     Pl_Count c("count", &f);
2272     BitWriter w(&c);
2273 
2274     writeHPageOffset(w);
2275     S = toI(c.getCount());
2276     writeHSharedObject(w);
2277     O = 0;
2278     if (this->m->outline_hints.nobjects > 0)
2279     {
2280 	O = toI(c.getCount());
2281 	writeHGeneric(w, this->m->outline_hints);
2282     }
2283     c.finish();
2284 
2285     hint_buffer = hint_stream.getBuffer();
2286 }
2287