1 #include <qpdf/qpdf-config.h> // include first for large file support
2 #include <qpdf/QPDF.hh>
3
4 #include <atomic>
5 #include <vector>
6 #include <map>
7 #include <algorithm>
8 #include <limits>
9 #include <sstream>
10 #include <stdlib.h>
11 #include <string.h>
12 #include <memory.h>
13
14 #include <qpdf/QTC.hh>
15 #include <qpdf/QUtil.hh>
16 #include <qpdf/Pipeline.hh>
17 #include <qpdf/Pl_Discard.hh>
18 #include <qpdf/FileInputSource.hh>
19 #include <qpdf/BufferInputSource.hh>
20 #include <qpdf/OffsetInputSource.hh>
21
22 #include <qpdf/QPDFExc.hh>
23 #include <qpdf/QPDF_Null.hh>
24 #include <qpdf/QPDF_Dictionary.hh>
25 #include <qpdf/QPDF_Stream.hh>
26 #include <qpdf/QPDF_Array.hh>
27
28 std::string QPDF::qpdf_version = "10.5.0";
29
// Smallest self-contained PDF used by QPDF::emptyPDF(): a catalog
// that points at a page tree with no kids. The numbers in the xref
// entries and after startxref are byte offsets into this very string
// (object 1 at 9, object 2 at 58, xref at 110), so the text must not
// be altered without recomputing them.
static char const* EMPTY_PDF =
    // Header
    "%PDF-1.3\n"
    // Object 1: document catalog
    "1 0 obj\n"
    "<< /Type /Catalog /Pages 2 0 R >>\n"
    "endobj\n"
    // Object 2: empty page tree
    "2 0 obj\n"
    "<< /Type /Pages /Kids [] /Count 0 >>\n"
    "endobj\n"
    // Cross-reference table: free entry 0 plus the two objects above
    "xref\n"
    "0 3\n"
    "0000000000 65535 f \n"
    "0000000009 00000 n \n"
    "0000000058 00000 n \n"
    // Trailer and pointer back to the xref table
    "trailer << /Size 3 /Root 1 0 R >>\n"
    "startxref\n"
    "110\n"
    "%%EOF\n";
47
48 class InvalidInputSource: public InputSource
49 {
50 public:
51 virtual ~InvalidInputSource() = default;
findAndSkipNextEOL()52 virtual qpdf_offset_t findAndSkipNextEOL() override
53 {
54 throwException();
55 return 0;
56 }
getName() const57 virtual std::string const& getName() const override
58 {
59 static std::string name("closed input source");
60 return name;
61 }
tell()62 virtual qpdf_offset_t tell() override
63 {
64 throwException();
65 return 0;
66 }
seek(qpdf_offset_t offset,int whence)67 virtual void seek(qpdf_offset_t offset, int whence) override
68 {
69 throwException();
70 }
rewind()71 virtual void rewind() override
72 {
73 throwException();
74 }
read(char * buffer,size_t length)75 virtual size_t read(char* buffer, size_t length) override
76 {
77 throwException();
78 return 0;
79 }
unreadCh(char ch)80 virtual void unreadCh(char ch) override
81 {
82 throwException();
83 }
84
85 private:
throwException()86 void throwException()
87 {
88 throw std::logic_error(
89 "QPDF operation attempted on a QPDF object with no input source."
90 " QPDF operations are invalid before processFile (or another"
91 " process method) or after closeInputSource");
92 }
93 };
94
ForeignStreamData(PointerHolder<EncryptionParameters> encp,PointerHolder<InputSource> file,int foreign_objid,int foreign_generation,qpdf_offset_t offset,size_t length,QPDFObjectHandle local_dict)95 QPDF::ForeignStreamData::ForeignStreamData(
96 PointerHolder<EncryptionParameters> encp,
97 PointerHolder<InputSource> file,
98 int foreign_objid,
99 int foreign_generation,
100 qpdf_offset_t offset,
101 size_t length,
102 QPDFObjectHandle local_dict)
103 :
104 encp(encp),
105 file(file),
106 foreign_objid(foreign_objid),
107 foreign_generation(foreign_generation),
108 offset(offset),
109 length(length),
110 local_dict(local_dict)
111 {
112 }
113
CopiedStreamDataProvider(QPDF & destination_qpdf)114 QPDF::CopiedStreamDataProvider::CopiedStreamDataProvider(
115 QPDF& destination_qpdf) :
116 QPDFObjectHandle::StreamDataProvider(true),
117 destination_qpdf(destination_qpdf)
118 {
119 }
120
121 bool
provideStreamData(int objid,int generation,Pipeline * pipeline,bool suppress_warnings,bool will_retry)122 QPDF::CopiedStreamDataProvider::provideStreamData(
123 int objid, int generation, Pipeline* pipeline,
124 bool suppress_warnings, bool will_retry)
125 {
126 PointerHolder<ForeignStreamData> foreign_data =
127 this->foreign_stream_data[QPDFObjGen(objid, generation)];
128 bool result = false;
129 if (foreign_data.getPointer())
130 {
131 result = destination_qpdf.pipeForeignStreamData(
132 foreign_data, pipeline, suppress_warnings, will_retry);
133 QTC::TC("qpdf", "QPDF copy foreign with data",
134 result ? 0 : 1);
135 }
136 else
137 {
138 QPDFObjectHandle foreign_stream =
139 this->foreign_streams[QPDFObjGen(objid, generation)];
140 result = foreign_stream.pipeStreamData(
141 pipeline, nullptr, 0, qpdf_dl_none,
142 suppress_warnings, will_retry);
143 QTC::TC("qpdf", "QPDF copy foreign with foreign_stream",
144 result ? 0 : 1);
145 }
146 return result;
147 }
148
149 void
registerForeignStream(QPDFObjGen const & local_og,QPDFObjectHandle foreign_stream)150 QPDF::CopiedStreamDataProvider::registerForeignStream(
151 QPDFObjGen const& local_og, QPDFObjectHandle foreign_stream)
152 {
153 this->foreign_streams[local_og] = foreign_stream;
154 }
155
156 void
registerForeignStream(QPDFObjGen const & local_og,PointerHolder<ForeignStreamData> foreign_stream)157 QPDF::CopiedStreamDataProvider::registerForeignStream(
158 QPDFObjGen const& local_og,
159 PointerHolder<ForeignStreamData> foreign_stream)
160 {
161 this->foreign_stream_data[local_og] = foreign_stream;
162 }
163
StringDecrypter(QPDF * qpdf,int objid,int gen)164 QPDF::StringDecrypter::StringDecrypter(QPDF* qpdf, int objid, int gen) :
165 qpdf(qpdf),
166 objid(objid),
167 gen(gen)
168 {
169 }
170
171 void
decryptString(std::string & val)172 QPDF::StringDecrypter::decryptString(std::string& val)
173 {
174 qpdf->decryptString(val, objid, gen);
175 }
176
177 std::string const&
QPDFVersion()178 QPDF::QPDFVersion()
179 {
180 return QPDF::qpdf_version;
181 }
182
EncryptionParameters()183 QPDF::EncryptionParameters::EncryptionParameters() :
184 encrypted(false),
185 encryption_initialized(false),
186 encryption_V(0),
187 encryption_R(0),
188 encrypt_metadata(true),
189 cf_stream(e_none),
190 cf_string(e_none),
191 cf_file(e_none),
192 cached_key_objid(0),
193 cached_key_generation(0),
194 user_password_matched(false),
195 owner_password_matched(false)
196 {
197 }
198
Members()199 QPDF::Members::Members() :
200 unique_id(0),
201 file(new InvalidInputSource()),
202 provided_password_is_hex_key(false),
203 ignore_xref_streams(false),
204 suppress_warnings(false),
205 out_stream(&std::cout),
206 err_stream(&std::cerr),
207 attempt_recovery(true),
208 encp(new EncryptionParameters),
209 pushed_inherited_attributes_to_pages(false),
210 copied_stream_data_provider(0),
211 reconstructed_xref(false),
212 fixed_dangling_refs(false),
213 immediate_copy_from(false),
214 in_parse(false),
215 parsed(false),
216 ever_replaced_objects(false),
217 first_xref_item_offset(0),
218 uncompressed_after_compressed(false)
219 {
220 }
221
~Members()222 QPDF::Members::~Members()
223 {
224 }
225
QPDF()226 QPDF::QPDF() :
227 m(new Members())
228 {
229 m->tokenizer.allowEOF();
230 // Generate a unique ID. It just has to be unique among all QPDF
231 // objects allocated throughout the lifetime of this running
232 // application.
233 static std::atomic<unsigned long long> unique_id{0};
234 m->unique_id = unique_id.fetch_add(1ULL);
235 }
236
~QPDF()237 QPDF::~QPDF()
238 {
239 // If two objects are mutually referential (through each object
240 // having an array or dictionary that contains an indirect
241 // reference to the other), the circular references in the
242 // PointerHolder objects will prevent the objects from being
243 // deleted. Walk through all objects in the object cache, which
244 // is those objects that we read from the file, and break all
245 // resolved references. At this point, obviously no one is still
246 // using the QPDF object, but we'll explicitly clear the xref
247 // table anyway just to prevent any possibility of resolve()
248 // succeeding. Note that we can't break references like this at
249 // any time when the QPDF object is active. If we do, the next
250 // reference will reread the object from the file, which would
251 // have the effect of undoing any modifications that may have been
252 // made to any of the objects.
253 this->m->xref_table.clear();
254 for (std::map<QPDFObjGen, ObjCache>::iterator iter =
255 this->m->obj_cache.begin();
256 iter != this->m->obj_cache.end(); ++iter)
257 {
258 QPDFObject::ObjAccessor::releaseResolved(
259 (*iter).second.object.getPointer());
260 }
261 }
262
263 void
processFile(char const * filename,char const * password)264 QPDF::processFile(char const* filename, char const* password)
265 {
266 FileInputSource* fi = new FileInputSource();
267 fi->setFilename(filename);
268 processInputSource(fi, password);
269 }
270
271 void
processFile(char const * description,FILE * filep,bool close_file,char const * password)272 QPDF::processFile(char const* description, FILE* filep,
273 bool close_file, char const* password)
274 {
275 FileInputSource* fi = new FileInputSource();
276 fi->setFile(description, filep, close_file);
277 processInputSource(fi, password);
278 }
279
280 void
processMemoryFile(char const * description,char const * buf,size_t length,char const * password)281 QPDF::processMemoryFile(char const* description,
282 char const* buf, size_t length,
283 char const* password)
284 {
285 processInputSource(
286 new BufferInputSource(
287 description,
288 new Buffer(QUtil::unsigned_char_pointer(buf), length),
289 true),
290 password);
291 }
292
293 void
processInputSource(PointerHolder<InputSource> source,char const * password)294 QPDF::processInputSource(PointerHolder<InputSource> source,
295 char const* password)
296 {
297 this->m->file = source;
298 parse(password);
299 }
300
301 void
closeInputSource()302 QPDF::closeInputSource()
303 {
304 this->m->file = new InvalidInputSource();
305 }
306
307 void
setPasswordIsHexKey(bool val)308 QPDF::setPasswordIsHexKey(bool val)
309 {
310 this->m->provided_password_is_hex_key = val;
311 }
312
313 void
emptyPDF()314 QPDF::emptyPDF()
315 {
316 processMemoryFile("empty PDF", EMPTY_PDF, strlen(EMPTY_PDF));
317 }
318
319 void
registerStreamFilter(std::string const & filter_name,std::function<std::shared_ptr<QPDFStreamFilter> ()> factory)320 QPDF::registerStreamFilter(
321 std::string const& filter_name,
322 std::function<std::shared_ptr<QPDFStreamFilter> ()> factory)
323 {
324 QPDF_Stream::registerStreamFilter(filter_name, factory);
325 }
326
327 void
setIgnoreXRefStreams(bool val)328 QPDF::setIgnoreXRefStreams(bool val)
329 {
330 this->m->ignore_xref_streams = val;
331 }
332
333 void
setOutputStreams(std::ostream * out,std::ostream * err)334 QPDF::setOutputStreams(std::ostream* out, std::ostream* err)
335 {
336 this->m->out_stream = out ? out : &std::cout;
337 this->m->err_stream = err ? err : &std::cerr;
338 }
339
340 void
setSuppressWarnings(bool val)341 QPDF::setSuppressWarnings(bool val)
342 {
343 this->m->suppress_warnings = val;
344 }
345
346 void
setAttemptRecovery(bool val)347 QPDF::setAttemptRecovery(bool val)
348 {
349 this->m->attempt_recovery = val;
350 }
351
352 void
setImmediateCopyFrom(bool val)353 QPDF::setImmediateCopyFrom(bool val)
354 {
355 this->m->immediate_copy_from = val;
356 }
357
358 std::vector<QPDFExc>
getWarnings()359 QPDF::getWarnings()
360 {
361 std::vector<QPDFExc> result = this->m->warnings;
362 this->m->warnings.clear();
363 return result;
364 }
365
366 bool
anyWarnings() const367 QPDF::anyWarnings() const
368 {
369 return ! this->m->warnings.empty();
370 }
371
372 size_t
numWarnings() const373 QPDF::numWarnings() const
374 {
375 return this->m->warnings.size();
376 }
377
378 bool
findHeader()379 QPDF::findHeader()
380 {
381 qpdf_offset_t global_offset = this->m->file->tell();
382 std::string line = this->m->file->readLine(1024);
383 char const* p = line.c_str();
384 if (strncmp(p, "%PDF-", 5) != 0)
385 {
386 throw std::logic_error("findHeader is not looking at %PDF-");
387 }
388 p += 5;
389 std::string version;
390 // Note: The string returned by line.c_str() is always
391 // null-terminated. The code below never overruns the buffer
392 // because a null character always short-circuits further
393 // advancement.
394 bool valid = QUtil::is_digit(*p);
395 if (valid)
396 {
397 while (QUtil::is_digit(*p))
398 {
399 version.append(1, *p++);
400 }
401 if ((*p == '.') && QUtil::is_digit(*(p+1)))
402 {
403 version.append(1, *p++);
404 while (QUtil::is_digit(*p))
405 {
406 version.append(1, *p++);
407 }
408 }
409 else
410 {
411 valid = false;
412 }
413 }
414 if (valid)
415 {
416 this->m->pdf_version = version;
417 if (global_offset != 0)
418 {
419 // Empirical evidence strongly suggests that when there is
420 // leading material prior to the PDF header, all explicit
421 // offsets in the file are such that 0 points to the
422 // beginning of the header.
423 QTC::TC("qpdf", "QPDF global offset");
424 this->m->file = new OffsetInputSource(this->m->file, global_offset);
425 }
426 }
427 return valid;
428 }
429
430 bool
findStartxref()431 QPDF::findStartxref()
432 {
433 QPDFTokenizer::Token t = readToken(this->m->file);
434 if (t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "startxref"))
435 {
436 t = readToken(this->m->file);
437 if (t.getType() == QPDFTokenizer::tt_integer)
438 {
439 // Position in front of offset token
440 this->m->file->seek(this->m->file->getLastOffset(), SEEK_SET);
441 return true;
442 }
443 }
444 return false;
445 }
446
447 void
parse(char const * password)448 QPDF::parse(char const* password)
449 {
450 if (password)
451 {
452 this->m->encp->provided_password = password;
453 }
454
455 // Find the header anywhere in the first 1024 bytes of the file.
456 PatternFinder hf(*this, &QPDF::findHeader);
457 if (! this->m->file->findFirst("%PDF-", 0, 1024, hf))
458 {
459 QTC::TC("qpdf", "QPDF not a pdf file");
460 warn(QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
461 "", 0, "can't find PDF header"));
462 // QPDFWriter writes files that usually require at least
463 // version 1.2 for /FlateDecode
464 this->m->pdf_version = "1.2";
465 }
466
467 // PDF spec says %%EOF must be found within the last 1024 bytes of
468 // the file. We add an extra 30 characters to leave room for the
469 // startxref stuff.
470 this->m->file->seek(0, SEEK_END);
471 qpdf_offset_t end_offset = this->m->file->tell();
472 qpdf_offset_t start_offset = (end_offset > 1054 ? end_offset - 1054 : 0);
473 PatternFinder sf(*this, &QPDF::findStartxref);
474 qpdf_offset_t xref_offset = 0;
475 if (this->m->file->findLast("startxref", start_offset, 0, sf))
476 {
477 xref_offset = QUtil::string_to_ll(
478 readToken(this->m->file).getValue().c_str());
479 }
480
481 try
482 {
483 if (xref_offset == 0)
484 {
485 QTC::TC("qpdf", "QPDF can't find startxref");
486 throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(), "", 0,
487 "can't find startxref");
488 }
489 try
490 {
491 read_xref(xref_offset);
492 }
493 catch (QPDFExc&)
494 {
495 throw;
496 }
497 catch (std::exception& e)
498 {
499 throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(), "", 0,
500 std::string("error reading xref: ") + e.what());
501
502 }
503 }
504 catch (QPDFExc& e)
505 {
506 if (this->m->attempt_recovery)
507 {
508 reconstruct_xref(e);
509 QTC::TC("qpdf", "QPDF reconstructed xref table");
510 }
511 else
512 {
513 throw e;
514 }
515 }
516
517 initializeEncryption();
518 this->m->parsed = true;
519 }
520
521 void
inParse(bool v)522 QPDF::inParse(bool v)
523 {
524 if (this->m->in_parse == v)
525 {
526 // This happens of QPDFObjectHandle::parseInternal tries to
527 // resolve an indirect object while it is parsing.
528 throw std::logic_error(
529 "QPDF: re-entrant parsing detected. This is a qpdf bug."
530 " Please report at https://github.com/qpdf/qpdf/issues.");
531 }
532 this->m->in_parse = v;
533 }
534
535 void
warn(QPDFExc const & e)536 QPDF::warn(QPDFExc const& e)
537 {
538 this->m->warnings.push_back(e);
539 if (! this->m->suppress_warnings)
540 {
541 *this->m->err_stream
542 << "WARNING: "
543 << this->m->warnings.back().what() << std::endl;
544 }
545 }
546
547 void
setTrailer(QPDFObjectHandle obj)548 QPDF::setTrailer(QPDFObjectHandle obj)
549 {
550 if (this->m->trailer.isInitialized())
551 {
552 return;
553 }
554 this->m->trailer = obj;
555 }
556
557 void
reconstruct_xref(QPDFExc & e)558 QPDF::reconstruct_xref(QPDFExc& e)
559 {
560 if (this->m->reconstructed_xref)
561 {
562 // Avoid xref reconstruction infinite loops. This is getting
563 // very hard to reproduce because qpdf is throwing many fewer
564 // exceptions while parsing. Most situations are warnings now.
565 throw e;
566 }
567
568 this->m->reconstructed_xref = true;
569
570 warn(QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(), "", 0,
571 "file is damaged"));
572 warn(e);
573 warn(QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(), "", 0,
574 "Attempting to reconstruct cross-reference table"));
575
576 // Delete all references to type 1 (uncompressed) objects
577 std::set<QPDFObjGen> to_delete;
578 for (std::map<QPDFObjGen, QPDFXRefEntry>::iterator iter =
579 this->m->xref_table.begin();
580 iter != this->m->xref_table.end(); ++iter)
581 {
582 if (((*iter).second).getType() == 1)
583 {
584 to_delete.insert((*iter).first);
585 }
586 }
587 for (std::set<QPDFObjGen>::iterator iter = to_delete.begin();
588 iter != to_delete.end(); ++iter)
589 {
590 this->m->xref_table.erase(*iter);
591 }
592
593 this->m->file->seek(0, SEEK_END);
594 qpdf_offset_t eof = this->m->file->tell();
595 this->m->file->seek(0, SEEK_SET);
596 qpdf_offset_t line_start = 0;
597 // Don't allow very long tokens here during recovery.
598 static size_t const MAX_LEN = 100;
599 while (this->m->file->tell() < eof)
600 {
601 this->m->file->findAndSkipNextEOL();
602 qpdf_offset_t next_line_start = this->m->file->tell();
603 this->m->file->seek(line_start, SEEK_SET);
604 QPDFTokenizer::Token t1 = readToken(this->m->file, MAX_LEN);
605 qpdf_offset_t token_start =
606 this->m->file->tell() - toO(t1.getValue().length());
607 if (token_start >= next_line_start)
608 {
609 // don't process yet -- wait until we get to the line
610 // containing this token
611 }
612 else if (t1.getType() == QPDFTokenizer::tt_integer)
613 {
614 QPDFTokenizer::Token t2 =
615 readToken(this->m->file, MAX_LEN);
616 QPDFTokenizer::Token t3 =
617 readToken(this->m->file, MAX_LEN);
618 if ((t2.getType() == QPDFTokenizer::tt_integer) &&
619 (t3 == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "obj")))
620 {
621 int obj = QUtil::string_to_int(t1.getValue().c_str());
622 int gen = QUtil::string_to_int(t2.getValue().c_str());
623 insertXrefEntry(obj, 1, token_start, gen, true);
624 }
625 }
626 else if ((! this->m->trailer.isInitialized()) &&
627 (t1 == QPDFTokenizer::Token(
628 QPDFTokenizer::tt_word, "trailer")))
629 {
630 QPDFObjectHandle t =
631 readObject(this->m->file, "trailer", 0, 0, false);
632 if (! t.isDictionary())
633 {
634 // Oh well. It was worth a try.
635 }
636 else
637 {
638 setTrailer(t);
639 }
640 }
641 this->m->file->seek(next_line_start, SEEK_SET);
642 line_start = next_line_start;
643 }
644
645 if (! this->m->trailer.isInitialized())
646 {
647 // We could check the last encountered object to see if it was
648 // an xref stream. If so, we could try to get the trailer
649 // from there. This may make it possible to recover files
650 // with bad startxref pointers even when they have object
651 // streams.
652
653 throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(), "", 0,
654 "unable to find trailer "
655 "dictionary while recovering damaged file");
656 }
657
658 // We could iterate through the objects looking for streams and
659 // try to find objects inside of them, but it's probably not worth
660 // the trouble. Acrobat can't recover files with any errors in an
661 // xref stream, and this would be a real long shot anyway. If we
662 // wanted to do anything that involved looking at stream contents,
663 // we'd also have to call initializeEncryption() here. It's safe
664 // to call it more than once.
665 }
666
667 void
read_xref(qpdf_offset_t xref_offset)668 QPDF::read_xref(qpdf_offset_t xref_offset)
669 {
670 std::map<int, int> free_table;
671 std::set<qpdf_offset_t> visited;
672 while (xref_offset)
673 {
674 visited.insert(xref_offset);
675 char buf[7];
676 memset(buf, 0, sizeof(buf));
677 this->m->file->seek(xref_offset, SEEK_SET);
678 // Some files miss the mark a little with startxref. We could
679 // do a better job of searching in the neighborhood for
680 // something that looks like either an xref table or stream,
681 // but the simple heuristic of skipping whitespace can help
682 // with the xref table case and is harmless with the stream
683 // case.
684 bool done = false;
685 bool skipped_space = false;
686 while (! done)
687 {
688 char ch;
689 if (1 == this->m->file->read(&ch, 1))
690 {
691 if (QUtil::is_space(ch))
692 {
693 skipped_space = true;
694 }
695 else
696 {
697 this->m->file->unreadCh(ch);
698 done = true;
699 }
700 }
701 else
702 {
703 QTC::TC("qpdf", "QPDF eof skipping spaces before xref",
704 skipped_space ? 0 : 1);
705 done = true;
706 }
707 }
708
709 this->m->file->read(buf, sizeof(buf) - 1);
710 // The PDF spec says xref must be followed by a line
711 // terminator, but files exist in the wild where it is
712 // terminated by arbitrary whitespace.
713 if ((strncmp(buf, "xref", 4) == 0) &&
714 QUtil::is_space(buf[4]))
715 {
716 if (skipped_space)
717 {
718 QTC::TC("qpdf", "QPDF xref skipped space");
719 warn(QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
720 "", 0,
721 "extraneous whitespace seen before xref"));
722 }
723 QTC::TC("qpdf", "QPDF xref space",
724 ((buf[4] == '\n') ? 0 :
725 (buf[4] == '\r') ? 1 :
726 (buf[4] == ' ') ? 2 : 9999));
727 int skip = 4;
728 // buf is null-terminated, and QUtil::is_space('\0') is
729 // false, so this won't overrun.
730 while (QUtil::is_space(buf[skip]))
731 {
732 ++skip;
733 }
734 xref_offset = read_xrefTable(xref_offset + skip);
735 }
736 else
737 {
738 xref_offset = read_xrefStream(xref_offset);
739 }
740 if (visited.count(xref_offset) != 0)
741 {
742 QTC::TC("qpdf", "QPDF xref loop");
743 throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(), "", 0,
744 "loop detected following xref tables");
745 }
746 }
747
748 if (! this->m->trailer.isInitialized())
749 {
750 throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(), "", 0,
751 "unable to find trailer while reading xref");
752 }
753 int size = this->m->trailer.getKey("/Size").getIntValueAsInt();
754 int max_obj = 0;
755 if (! this->m->xref_table.empty())
756 {
757 max_obj = (*(this->m->xref_table.rbegin())).first.getObj();
758 }
759 if (! this->m->deleted_objects.empty())
760 {
761 max_obj = std::max(max_obj, *(this->m->deleted_objects.rbegin()));
762 }
763 if ((size < 1) || (size - 1 != max_obj))
764 {
765 QTC::TC("qpdf", "QPDF xref size mismatch");
766 warn(QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(), "", 0,
767 std::string("reported number of objects (") +
768 QUtil::int_to_string(size) +
769 ") is not one plus the highest object number (" +
770 QUtil::int_to_string(max_obj) + ")"));
771 }
772
773 // We no longer need the deleted_objects table, so go ahead and
774 // clear it out to make sure we never depend on its being set.
775 this->m->deleted_objects.clear();
776 }
777
778 bool
parse_xrefFirst(std::string const & line,int & obj,int & num,int & bytes)779 QPDF::parse_xrefFirst(std::string const& line,
780 int& obj, int& num, int& bytes)
781 {
782 // is_space and is_digit both return false on '\0', so this will
783 // not overrun the null-terminated buffer.
784 char const* p = line.c_str();
785 char const* start = line.c_str();
786
787 // Skip zero or more spaces
788 while (QUtil::is_space(*p))
789 {
790 ++p;
791 }
792 // Require digit
793 if (! QUtil::is_digit(*p))
794 {
795 return false;
796 }
797 // Gather digits
798 std::string obj_str;
799 while (QUtil::is_digit(*p))
800 {
801 obj_str.append(1, *p++);
802 }
803 // Require space
804 if (! QUtil::is_space(*p))
805 {
806 return false;
807 }
808 // Skip spaces
809 while (QUtil::is_space(*p))
810 {
811 ++p;
812 }
813 // Require digit
814 if (! QUtil::is_digit(*p))
815 {
816 return false;
817 }
818 // Gather digits
819 std::string num_str;
820 while (QUtil::is_digit(*p))
821 {
822 num_str.append(1, *p++);
823 }
824 // Skip any space including line terminators
825 while (QUtil::is_space(*p))
826 {
827 ++p;
828 }
829 bytes = toI(p - start);
830 obj = QUtil::string_to_int(obj_str.c_str());
831 num = QUtil::string_to_int(num_str.c_str());
832 return true;
833 }
834
835 bool
parse_xrefEntry(std::string const & line,qpdf_offset_t & f1,int & f2,char & type)836 QPDF::parse_xrefEntry(std::string const& line,
837 qpdf_offset_t& f1, int& f2, char& type)
838 {
839 // is_space and is_digit both return false on '\0', so this will
840 // not overrun the null-terminated buffer.
841 char const* p = line.c_str();
842
843 // Skip zero or more spaces. There aren't supposed to be any.
844 bool invalid = false;
845 while (QUtil::is_space(*p))
846 {
847 ++p;
848 QTC::TC("qpdf", "QPDF ignore first space in xref entry");
849 invalid = true;
850 }
851 // Require digit
852 if (! QUtil::is_digit(*p))
853 {
854 return false;
855 }
856 // Gather digits
857 std::string f1_str;
858 while (QUtil::is_digit(*p))
859 {
860 f1_str.append(1, *p++);
861 }
862 // Require space
863 if (! QUtil::is_space(*p))
864 {
865 return false;
866 }
867 if (QUtil::is_space(*(p+1)))
868 {
869 QTC::TC("qpdf", "QPDF ignore first extra space in xref entry");
870 invalid = true;
871 }
872 // Skip spaces
873 while (QUtil::is_space(*p))
874 {
875 ++p;
876 }
877 // Require digit
878 if (! QUtil::is_digit(*p))
879 {
880 return false;
881 }
882 // Gather digits
883 std::string f2_str;
884 while (QUtil::is_digit(*p))
885 {
886 f2_str.append(1, *p++);
887 }
888 // Require space
889 if (! QUtil::is_space(*p))
890 {
891 return false;
892 }
893 if (QUtil::is_space(*(p+1)))
894 {
895 QTC::TC("qpdf", "QPDF ignore second extra space in xref entry");
896 invalid = true;
897 }
898 // Skip spaces
899 while (QUtil::is_space(*p))
900 {
901 ++p;
902 }
903 if ((*p == 'f') || (*p == 'n'))
904 {
905 type = *p;
906 }
907 else
908 {
909 return false;
910 }
911 if ((f1_str.length() != 10) || (f2_str.length() != 5))
912 {
913 QTC::TC("qpdf", "QPDF ignore length error xref entry");
914 invalid = true;
915 }
916
917 if (invalid)
918 {
919 warn(QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
920 "xref table",
921 this->m->file->getLastOffset(),
922 "accepting invalid xref table entry"));
923 }
924
925 f1 = QUtil::string_to_ll(f1_str.c_str());
926 f2 = QUtil::string_to_int(f2_str.c_str());
927
928 return true;
929 }
930
931 qpdf_offset_t
read_xrefTable(qpdf_offset_t xref_offset)932 QPDF::read_xrefTable(qpdf_offset_t xref_offset)
933 {
934 std::vector<QPDFObjGen> deleted_items;
935
936 this->m->file->seek(xref_offset, SEEK_SET);
937 bool done = false;
938 while (! done)
939 {
940 char linebuf[51];
941 memset(linebuf, 0, sizeof(linebuf));
942 this->m->file->read(linebuf, sizeof(linebuf) - 1);
943 std::string line = linebuf;
944 int obj = 0;
945 int num = 0;
946 int bytes = 0;
947 if (! parse_xrefFirst(line, obj, num, bytes))
948 {
949 QTC::TC("qpdf", "QPDF invalid xref");
950 throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
951 "xref table", this->m->file->getLastOffset(),
952 "xref syntax invalid");
953 }
954 this->m->file->seek(this->m->file->getLastOffset() + bytes, SEEK_SET);
955 for (qpdf_offset_t i = obj; i - num < obj; ++i)
956 {
957 if (i == 0)
958 {
959 // This is needed by checkLinearization()
960 this->m->first_xref_item_offset = this->m->file->tell();
961 }
962 std::string xref_entry = this->m->file->readLine(30);
963 // For xref_table, these will always be small enough to be ints
964 qpdf_offset_t f1 = 0;
965 int f2 = 0;
966 char type = '\0';
967 if (! parse_xrefEntry(xref_entry, f1, f2, type))
968 {
969 QTC::TC("qpdf", "QPDF invalid xref entry");
970 throw QPDFExc(
971 qpdf_e_damaged_pdf, this->m->file->getName(),
972 "xref table", this->m->file->getLastOffset(),
973 "invalid xref entry (obj=" +
974 QUtil::int_to_string(i) + ")");
975 }
976 if (type == 'f')
977 {
978 // Save deleted items until after we've checked the
979 // XRefStm, if any.
980 deleted_items.push_back(QPDFObjGen(toI(i), f2));
981 }
982 else
983 {
984 insertXrefEntry(toI(i), 1, f1, f2);
985 }
986 }
987 qpdf_offset_t pos = this->m->file->tell();
988 QPDFTokenizer::Token t = readToken(this->m->file);
989 if (t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "trailer"))
990 {
991 done = true;
992 }
993 else
994 {
995 this->m->file->seek(pos, SEEK_SET);
996 }
997 }
998
999 // Set offset to previous xref table if any
1000 QPDFObjectHandle cur_trailer =
1001 readObject(this->m->file, "trailer", 0, 0, false);
1002 if (! cur_trailer.isDictionary())
1003 {
1004 QTC::TC("qpdf", "QPDF missing trailer");
1005 throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
1006 "", this->m->file->getLastOffset(),
1007 "expected trailer dictionary");
1008 }
1009
1010 if (! this->m->trailer.isInitialized())
1011 {
1012 setTrailer(cur_trailer);
1013
1014 if (! this->m->trailer.hasKey("/Size"))
1015 {
1016 QTC::TC("qpdf", "QPDF trailer lacks size");
1017 throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
1018 "trailer", this->m->file->getLastOffset(),
1019 "trailer dictionary lacks /Size key");
1020 }
1021 if (! this->m->trailer.getKey("/Size").isInteger())
1022 {
1023 QTC::TC("qpdf", "QPDF trailer size not integer");
1024 throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
1025 "trailer", this->m->file->getLastOffset(),
1026 "/Size key in trailer dictionary is not "
1027 "an integer");
1028 }
1029 }
1030
1031 if (cur_trailer.hasKey("/XRefStm"))
1032 {
1033 if (this->m->ignore_xref_streams)
1034 {
1035 QTC::TC("qpdf", "QPDF ignoring XRefStm in trailer");
1036 }
1037 else
1038 {
1039 if (cur_trailer.getKey("/XRefStm").isInteger())
1040 {
1041 // Read the xref stream but disregard any return value
1042 // -- we'll use our trailer's /Prev key instead of the
1043 // xref stream's.
1044 (void) read_xrefStream(
1045 cur_trailer.getKey("/XRefStm").getIntValue());
1046 }
1047 else
1048 {
1049 throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
1050 "xref stream", xref_offset,
1051 "invalid /XRefStm");
1052 }
1053 }
1054 }
1055
1056 // Handle any deleted items now that we've read the /XRefStm.
1057 for (std::vector<QPDFObjGen>::iterator iter = deleted_items.begin();
1058 iter != deleted_items.end(); ++iter)
1059 {
1060 QPDFObjGen& og = *iter;
1061 insertXrefEntry(og.getObj(), 0, 0, og.getGen());
1062 }
1063
1064 if (cur_trailer.hasKey("/Prev"))
1065 {
1066 if (! cur_trailer.getKey("/Prev").isInteger())
1067 {
1068 QTC::TC("qpdf", "QPDF trailer prev not integer");
1069 throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
1070 "trailer", this->m->file->getLastOffset(),
1071 "/Prev key in trailer dictionary is not "
1072 "an integer");
1073 }
1074 QTC::TC("qpdf", "QPDF prev key in trailer dictionary");
1075 xref_offset = cur_trailer.getKey("/Prev").getIntValue();
1076 }
1077 else
1078 {
1079 xref_offset = 0;
1080 }
1081
1082 return xref_offset;
1083 }
1084
1085 qpdf_offset_t
read_xrefStream(qpdf_offset_t xref_offset)1086 QPDF::read_xrefStream(qpdf_offset_t xref_offset)
1087 {
1088 bool found = false;
1089 if (! this->m->ignore_xref_streams)
1090 {
1091 int xobj;
1092 int xgen;
1093 QPDFObjectHandle xref_obj;
1094 try
1095 {
1096 xref_obj = readObjectAtOffset(
1097 false, xref_offset, "xref stream", -1, 0, xobj, xgen);
1098 }
1099 catch (QPDFExc&)
1100 {
1101 // ignore -- report error below
1102 }
1103 if (xref_obj.isInitialized() &&
1104 xref_obj.isStream() &&
1105 xref_obj.getDict().getKey("/Type").isName() &&
1106 xref_obj.getDict().getKey("/Type").getName() == "/XRef")
1107 {
1108 QTC::TC("qpdf", "QPDF found xref stream");
1109 found = true;
1110 xref_offset = processXRefStream(xref_offset, xref_obj);
1111 }
1112 }
1113
1114 if (! found)
1115 {
1116 QTC::TC("qpdf", "QPDF can't find xref");
1117 throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
1118 "", xref_offset, "xref not found");
1119 }
1120
1121 return xref_offset;
1122 }
1123
1124 qpdf_offset_t
processXRefStream(qpdf_offset_t xref_offset,QPDFObjectHandle & xref_obj)1125 QPDF::processXRefStream(qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj)
1126 {
1127 QPDFObjectHandle dict = xref_obj.getDict();
1128 QPDFObjectHandle W_obj = dict.getKey("/W");
1129 QPDFObjectHandle Index_obj = dict.getKey("/Index");
1130 if (! (W_obj.isArray() &&
1131 (W_obj.getArrayNItems() >= 3) &&
1132 W_obj.getArrayItem(0).isInteger() &&
1133 W_obj.getArrayItem(1).isInteger() &&
1134 W_obj.getArrayItem(2).isInteger() &&
1135 dict.getKey("/Size").isInteger() &&
1136 (Index_obj.isArray() || Index_obj.isNull())))
1137 {
1138 throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
1139 "xref stream", xref_offset,
1140 "Cross-reference stream does not have"
1141 " proper /W and /Index keys");
1142 }
1143
1144 int W[3];
1145 size_t entry_size = 0;
1146 int max_bytes = sizeof(qpdf_offset_t);
1147 for (int i = 0; i < 3; ++i)
1148 {
1149 W[i] = W_obj.getArrayItem(i).getIntValueAsInt();
1150 if (W[i] > max_bytes)
1151 {
1152 throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
1153 "xref stream", xref_offset,
1154 "Cross-reference stream's /W contains"
1155 " impossibly large values");
1156 }
1157 entry_size += toS(W[i]);
1158 }
1159 if (entry_size == 0)
1160 {
1161 throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
1162 "xref stream", xref_offset,
1163 "Cross-reference stream's /W indicates"
1164 " entry size of 0");
1165 }
1166 unsigned long long max_num_entries =
1167 static_cast<unsigned long long>(-1) / entry_size;
1168
1169 std::vector<long long> indx;
1170 if (Index_obj.isArray())
1171 {
1172 int n_index = Index_obj.getArrayNItems();
1173 if ((n_index % 2) || (n_index < 2))
1174 {
1175 throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
1176 "xref stream", xref_offset,
1177 "Cross-reference stream's /Index has an"
1178 " invalid number of values");
1179 }
1180 for (int i = 0; i < n_index; ++i)
1181 {
1182 if (Index_obj.getArrayItem(i).isInteger())
1183 {
1184 indx.push_back(Index_obj.getArrayItem(i).getIntValue());
1185 }
1186 else
1187 {
1188 throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
1189 "xref stream", xref_offset,
1190 "Cross-reference stream's /Index's item " +
1191 QUtil::int_to_string(i) +
1192 " is not an integer");
1193 }
1194 }
1195 QTC::TC("qpdf", "QPDF xref /Index is array",
1196 n_index == 2 ? 0 : 1);
1197 }
1198 else
1199 {
1200 QTC::TC("qpdf", "QPDF xref /Index is null");
1201 long long size = dict.getKey("/Size").getIntValue();
1202 indx.push_back(0);
1203 indx.push_back(size);
1204 }
1205
1206 size_t num_entries = 0;
1207 for (size_t i = 1; i < indx.size(); i += 2)
1208 {
1209 if (indx.at(i) > QIntC::to_longlong(max_num_entries - num_entries))
1210 {
1211 throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
1212 "xref stream", xref_offset,
1213 "Cross-reference stream claims to contain"
1214 " too many entries: " +
1215 QUtil::int_to_string(indx.at(i)) + " " +
1216 QUtil::uint_to_string(max_num_entries) + " " +
1217 QUtil::uint_to_string(num_entries));
1218 }
1219 num_entries += toS(indx.at(i));
1220 }
1221
1222 // entry_size and num_entries have both been validated to ensure
1223 // that this multiplication does not cause an overflow.
1224 size_t expected_size = entry_size * num_entries;
1225
1226 PointerHolder<Buffer> bp = xref_obj.getStreamData(qpdf_dl_specialized);
1227 size_t actual_size = bp->getSize();
1228
1229 if (expected_size != actual_size)
1230 {
1231 QPDFExc x(qpdf_e_damaged_pdf, this->m->file->getName(),
1232 "xref stream", xref_offset,
1233 "Cross-reference stream data has the wrong size;"
1234 " expected = " + QUtil::uint_to_string(expected_size) +
1235 "; actual = " + QUtil::uint_to_string(actual_size));
1236 if (expected_size > actual_size)
1237 {
1238 throw x;
1239 }
1240 else
1241 {
1242 warn(x);
1243 }
1244 }
1245
1246 size_t cur_chunk = 0;
1247 int chunk_count = 0;
1248
1249 bool saw_first_compressed_object = false;
1250
1251 // Actual size vs. expected size check above ensures that we will
1252 // not overflow any buffers here. We know that entry_size *
1253 // num_entries is equal to the size of the buffer.
1254 unsigned char const* data = bp->getBuffer();
1255 for (size_t i = 0; i < num_entries; ++i)
1256 {
1257 // Read this entry
1258 unsigned char const* entry = data + (entry_size * i);
1259 qpdf_offset_t fields[3];
1260 unsigned char const* p = entry;
1261 for (int j = 0; j < 3; ++j)
1262 {
1263 fields[j] = 0;
1264 if ((j == 0) && (W[0] == 0))
1265 {
1266 QTC::TC("qpdf", "QPDF default for xref stream field 0");
1267 fields[0] = 1;
1268 }
1269 for (int k = 0; k < W[j]; ++k)
1270 {
1271 fields[j] <<= 8;
1272 fields[j] += toI(*p++);
1273 }
1274 }
1275
1276 // Get the object and generation number. The object number is
1277 // based on /Index. The generation number is 0 unless this is
1278 // an uncompressed object record, in which case the generation
1279 // number appears as the third field.
1280 int obj = toI(indx.at(cur_chunk));
1281 if ((obj < 0) ||
1282 ((std::numeric_limits<int>::max() - obj) < chunk_count))
1283 {
1284 std::ostringstream msg;
1285 msg.imbue(std::locale::classic());
1286 msg << "adding " << chunk_count << " to " << obj
1287 << " while computing index in xref stream would cause"
1288 << " an integer overflow";
1289 throw std::range_error(msg.str());
1290 }
1291 obj += chunk_count;
1292 ++chunk_count;
1293 if (chunk_count >= indx.at(cur_chunk + 1))
1294 {
1295 cur_chunk += 2;
1296 chunk_count = 0;
1297 }
1298
1299 if (saw_first_compressed_object)
1300 {
1301 if (fields[0] != 2)
1302 {
1303 this->m->uncompressed_after_compressed = true;
1304 }
1305 }
1306 else if (fields[0] == 2)
1307 {
1308 saw_first_compressed_object = true;
1309 }
1310 if (obj == 0)
1311 {
1312 // This is needed by checkLinearization()
1313 this->m->first_xref_item_offset = xref_offset;
1314 }
1315 if (fields[0] == 0)
1316 {
1317 // Ignore fields[2], which we don't care about in this
1318 // case. This works around the issue of some PDF files
1319 // that put invalid values, like -1, here for deleted
1320 // objects.
1321 fields[2] = 0;
1322 }
1323 insertXrefEntry(obj, toI(fields[0]),
1324 fields[1], toI(fields[2]));
1325 }
1326
1327 if (! this->m->trailer.isInitialized())
1328 {
1329 setTrailer(dict);
1330 }
1331
1332 if (dict.hasKey("/Prev"))
1333 {
1334 if (! dict.getKey("/Prev").isInteger())
1335 {
1336 throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
1337 "xref stream", this->m->file->getLastOffset(),
1338 "/Prev key in xref stream dictionary is not "
1339 "an integer");
1340 }
1341 QTC::TC("qpdf", "QPDF prev key in xref stream dictionary");
1342 xref_offset = dict.getKey("/Prev").getIntValue();
1343 }
1344 else
1345 {
1346 xref_offset = 0;
1347 }
1348
1349 return xref_offset;
1350 }
1351
1352 void
insertXrefEntry(int obj,int f0,qpdf_offset_t f1,int f2,bool overwrite)1353 QPDF::insertXrefEntry(int obj, int f0, qpdf_offset_t f1, int f2, bool overwrite)
1354 {
1355 // Populate the xref table in such a way that the first reference
1356 // to an object that we see, which is the one in the latest xref
1357 // table in which it appears, is the one that gets stored. This
1358 // works because we are reading more recent appends before older
1359 // ones. Exception: if overwrite is true, then replace any
1360 // existing object. This is used in xref recovery mode, which
1361 // reads the file from beginning to end.
1362
1363 // If there is already an entry for this object and generation in
1364 // the table, it means that a later xref table has registered this
1365 // object. Disregard this one.
1366 { // private scope
1367 int gen = (f0 == 2 ? 0 : f2);
1368 QPDFObjGen og(obj, gen);
1369 if (this->m->xref_table.count(og))
1370 {
1371 if (overwrite)
1372 {
1373 QTC::TC("qpdf", "QPDF xref overwrite object");
1374 this->m->xref_table.erase(og);
1375 }
1376 else
1377 {
1378 QTC::TC("qpdf", "QPDF xref reused object");
1379 return;
1380 }
1381 }
1382 if (this->m->deleted_objects.count(obj))
1383 {
1384 QTC::TC("qpdf", "QPDF xref deleted object");
1385 return;
1386 }
1387 }
1388
1389 switch (f0)
1390 {
1391 case 0:
1392 this->m->deleted_objects.insert(obj);
1393 break;
1394
1395 case 1:
1396 // f2 is generation
1397 QTC::TC("qpdf", "QPDF xref gen > 0", ((f2 > 0) ? 1 : 0));
1398 this->m->xref_table[QPDFObjGen(obj, f2)] = QPDFXRefEntry(f0, f1, f2);
1399 break;
1400
1401 case 2:
1402 this->m->xref_table[QPDFObjGen(obj, 0)] = QPDFXRefEntry(f0, f1, f2);
1403 break;
1404
1405 default:
1406 throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
1407 "xref stream", this->m->file->getLastOffset(),
1408 "unknown xref stream entry type " +
1409 QUtil::int_to_string(f0));
1410 break;
1411 }
1412 }
1413
1414 void
showXRefTable()1415 QPDF::showXRefTable()
1416 {
1417 for (std::map<QPDFObjGen, QPDFXRefEntry>::iterator iter =
1418 this->m->xref_table.begin();
1419 iter != this->m->xref_table.end(); ++iter)
1420 {
1421 QPDFObjGen const& og = (*iter).first;
1422 QPDFXRefEntry const& entry = (*iter).second;
1423 *this->m->out_stream << og.getObj() << "/" << og.getGen() << ": ";
1424 switch (entry.getType())
1425 {
1426 case 1:
1427 *this->m->out_stream
1428 << "uncompressed; offset = " << entry.getOffset();
1429 break;
1430
1431 case 2:
1432 *this->m->out_stream
1433 << "compressed; stream = "
1434 << entry.getObjStreamNumber()
1435 << ", index = " << entry.getObjStreamIndex();
1436 break;
1437
1438 default:
1439 throw std::logic_error("unknown cross-reference table type while"
1440 " showing xref_table");
1441 break;
1442 }
1443 *this->m->out_stream << std::endl;
1444 }
1445 }
1446
1447 void
fixDanglingReferences(bool force)1448 QPDF::fixDanglingReferences(bool force)
1449 {
1450 if (this->m->fixed_dangling_refs && (! force))
1451 {
1452 return;
1453 }
1454 this->m->fixed_dangling_refs = true;
1455
1456 // Create a set of all known indirect objects including those
1457 // we've previously resolved and those that we have created.
1458 std::set<QPDFObjGen> to_process;
1459 for (std::map<QPDFObjGen, ObjCache>::iterator iter =
1460 this->m->obj_cache.begin();
1461 iter != this->m->obj_cache.end(); ++iter)
1462 {
1463 to_process.insert((*iter).first);
1464 }
1465 for (std::map<QPDFObjGen, QPDFXRefEntry>::iterator iter =
1466 this->m->xref_table.begin();
1467 iter != this->m->xref_table.end(); ++iter)
1468 {
1469 to_process.insert((*iter).first);
1470 }
1471
1472 // For each non-scalar item to process, put it in the queue.
1473 std::list<QPDFObjectHandle> queue;
1474 queue.push_back(this->m->trailer);
1475 for (std::set<QPDFObjGen>::iterator iter = to_process.begin();
1476 iter != to_process.end(); ++iter)
1477 {
1478 QPDFObjectHandle obj = QPDFObjectHandle::Factory::newIndirect(
1479 this, (*iter).getObj(), (*iter).getGen());
1480 if (obj.isDictionary() || obj.isArray())
1481 {
1482 queue.push_back(obj);
1483 }
1484 else if (obj.isStream())
1485 {
1486 queue.push_back(obj.getDict());
1487 }
1488 }
1489
1490 // Process the queue by recursively resolving all object
1491 // references. We don't need to do loop detection because we don't
1492 // traverse known indirect objects when processing the queue.
1493 while (! queue.empty())
1494 {
1495 QPDFObjectHandle obj = queue.front();
1496 queue.pop_front();
1497 std::list<QPDFObjectHandle> to_check;
1498 if (obj.isDictionary())
1499 {
1500 std::map<std::string, QPDFObjectHandle> members =
1501 obj.getDictAsMap();
1502 for (std::map<std::string, QPDFObjectHandle>::iterator iter =
1503 members.begin();
1504 iter != members.end(); ++iter)
1505 {
1506 to_check.push_back((*iter).second);
1507 }
1508 }
1509 else if (obj.isArray())
1510 {
1511 QPDF_Array* arr =
1512 dynamic_cast<QPDF_Array*>(
1513 QPDFObjectHandle::ObjAccessor::getObject(obj).getPointer());
1514 arr->addExplicitElementsToList(to_check);
1515 }
1516 for (std::list<QPDFObjectHandle>::iterator iter = to_check.begin();
1517 iter != to_check.end(); ++iter)
1518 {
1519 QPDFObjectHandle sub = *iter;
1520 if (sub.isIndirect())
1521 {
1522 if (sub.getOwningQPDF() == this)
1523 {
1524 QPDFObjGen og(sub.getObjGen());
1525 if (this->m->obj_cache.count(og) == 0)
1526 {
1527 QTC::TC("qpdf", "QPDF detected dangling ref");
1528 queue.push_back(sub);
1529 }
1530 }
1531 }
1532 else
1533 {
1534 queue.push_back(sub);
1535 }
1536 }
1537 }
1538 }
1539
1540 size_t
getObjectCount()1541 QPDF::getObjectCount()
1542 {
1543 // This method returns the next available indirect object number.
1544 // makeIndirectObject uses it for this purpose. After
1545 // fixDanglingReferences is called, all objects in the xref table
1546 // will also be in obj_cache.
1547 fixDanglingReferences();
1548 QPDFObjGen og(0, 0);
1549 if (! this->m->obj_cache.empty())
1550 {
1551 og = (*(this->m->obj_cache.rbegin())).first;
1552 }
1553 return toS(og.getObj());
1554 }
1555
1556 std::vector<QPDFObjectHandle>
getAllObjects()1557 QPDF::getAllObjects()
1558 {
1559 // After fixDanglingReferences is called, all objects are in the
1560 // object cache.
1561 fixDanglingReferences(true);
1562 std::vector<QPDFObjectHandle> result;
1563 for (std::map<QPDFObjGen, ObjCache>::iterator iter =
1564 this->m->obj_cache.begin();
1565 iter != this->m->obj_cache.end(); ++iter)
1566 {
1567
1568 QPDFObjGen const& og = (*iter).first;
1569 result.push_back(QPDFObjectHandle::Factory::newIndirect(
1570 this, og.getObj(), og.getGen()));
1571 }
1572 return result;
1573 }
1574
1575 void
setLastObjectDescription(std::string const & description,int objid,int generation)1576 QPDF::setLastObjectDescription(std::string const& description,
1577 int objid, int generation)
1578 {
1579 this->m->last_object_description.clear();
1580 if (! description.empty())
1581 {
1582 this->m->last_object_description += description;
1583 if (objid > 0)
1584 {
1585 this->m->last_object_description += ": ";
1586 }
1587 }
1588 if (objid > 0)
1589 {
1590 this->m->last_object_description += "object " +
1591 QUtil::int_to_string(objid) + " " +
1592 QUtil::int_to_string(generation);
1593 }
1594 }
1595
1596 QPDFObjectHandle
readObject(PointerHolder<InputSource> input,std::string const & description,int objid,int generation,bool in_object_stream)1597 QPDF::readObject(PointerHolder<InputSource> input,
1598 std::string const& description,
1599 int objid, int generation, bool in_object_stream)
1600 {
1601 setLastObjectDescription(description, objid, generation);
1602 qpdf_offset_t offset = input->tell();
1603
1604 bool empty = false;
1605 PointerHolder<StringDecrypter> decrypter_ph;
1606 StringDecrypter* decrypter = 0;
1607 if (this->m->encp->encrypted && (! in_object_stream))
1608 {
1609 decrypter_ph = new StringDecrypter(this, objid, generation);
1610 decrypter = decrypter_ph.getPointer();
1611 }
1612 QPDFObjectHandle object = QPDFObjectHandle::parse(
1613 input, this->m->last_object_description,
1614 this->m->tokenizer, empty, decrypter, this);
1615 if (empty)
1616 {
1617 // Nothing in the PDF spec appears to allow empty objects, but
1618 // they have been encountered in actual PDF files and Adobe
1619 // Reader appears to ignore them.
1620 warn(QPDFExc(qpdf_e_damaged_pdf, input->getName(),
1621 this->m->last_object_description,
1622 input->getLastOffset(),
1623 "empty object treated as null"));
1624 }
1625 else if (object.isDictionary() && (! in_object_stream))
1626 {
1627 // check for stream
1628 qpdf_offset_t cur_offset = input->tell();
1629 if (readToken(input) ==
1630 QPDFTokenizer::Token(QPDFTokenizer::tt_word, "stream"))
1631 {
1632 // The PDF specification states that the word "stream"
1633 // should be followed by either a carriage return and
1634 // a newline or by a newline alone. It specifically
1635 // disallowed following it by a carriage return alone
1636 // since, in that case, there would be no way to tell
1637 // whether the NL in a CR NL sequence was part of the
1638 // stream data. However, some readers, including
1639 // Adobe reader, accept a carriage return by itself
1640 // when followed by a non-newline character, so that's
1641 // what we do here. We have also seen files that have
1642 // extraneous whitespace between the stream keyword and
1643 // the newline.
1644 bool done = false;
1645 while (! done)
1646 {
1647 done = true;
1648 char ch;
1649 if (input->read(&ch, 1) == 0)
1650 {
1651 // A premature EOF here will result in some
1652 // other problem that will get reported at
1653 // another time.
1654 }
1655 else if (ch == '\n')
1656 {
1657 // ready to read stream data
1658 QTC::TC("qpdf", "QPDF stream with NL only");
1659 }
1660 else if (ch == '\r')
1661 {
1662 // Read another character
1663 if (input->read(&ch, 1) != 0)
1664 {
1665 if (ch == '\n')
1666 {
1667 // Ready to read stream data
1668 QTC::TC("qpdf", "QPDF stream with CRNL");
1669 }
1670 else
1671 {
1672 // Treat the \r by itself as the
1673 // whitespace after endstream and
1674 // start reading stream data in spite
1675 // of not having seen a newline.
1676 QTC::TC("qpdf", "QPDF stream with CR only");
1677 input->unreadCh(ch);
1678 warn(QPDFExc(
1679 qpdf_e_damaged_pdf,
1680 input->getName(),
1681 this->m->last_object_description,
1682 input->tell(),
1683 "stream keyword followed"
1684 " by carriage return only"));
1685 }
1686 }
1687 }
1688 else if (QUtil::is_space(ch))
1689 {
1690 warn(QPDFExc(
1691 qpdf_e_damaged_pdf,
1692 input->getName(),
1693 this->m->last_object_description,
1694 input->tell(),
1695 "stream keyword followed by"
1696 " extraneous whitespace"));
1697 done = false;
1698 }
1699 else
1700 {
1701 QTC::TC("qpdf", "QPDF stream without newline");
1702 input->unreadCh(ch);
1703 warn(QPDFExc(qpdf_e_damaged_pdf, input->getName(),
1704 this->m->last_object_description,
1705 input->tell(),
1706 "stream keyword not followed"
1707 " by proper line terminator"));
1708 }
1709 }
1710
1711 // Must get offset before accessing any additional
1712 // objects since resolving a previously unresolved
1713 // indirect object will change file position.
1714 qpdf_offset_t stream_offset = input->tell();
1715 size_t length = 0;
1716
1717 try
1718 {
1719 std::map<std::string, QPDFObjectHandle> dict =
1720 object.getDictAsMap();
1721
1722 if (dict.count("/Length") == 0)
1723 {
1724 QTC::TC("qpdf", "QPDF stream without length");
1725 throw QPDFExc(qpdf_e_damaged_pdf, input->getName(),
1726 this->m->last_object_description, offset,
1727 "stream dictionary lacks /Length key");
1728 }
1729
1730 QPDFObjectHandle length_obj = dict["/Length"];
1731 if (! length_obj.isInteger())
1732 {
1733 QTC::TC("qpdf", "QPDF stream length not integer");
1734 throw QPDFExc(qpdf_e_damaged_pdf, input->getName(),
1735 this->m->last_object_description, offset,
1736 "/Length key in stream dictionary is not "
1737 "an integer");
1738 }
1739
1740 length = toS(length_obj.getUIntValue());
1741 // Seek in two steps to avoid potential integer overflow
1742 input->seek(stream_offset, SEEK_SET);
1743 input->seek(toO(length), SEEK_CUR);
1744 if (! (readToken(input) ==
1745 QPDFTokenizer::Token(
1746 QPDFTokenizer::tt_word, "endstream")))
1747 {
1748 QTC::TC("qpdf", "QPDF missing endstream");
1749 throw QPDFExc(qpdf_e_damaged_pdf, input->getName(),
1750 this->m->last_object_description,
1751 input->getLastOffset(),
1752 "expected endstream");
1753 }
1754 }
1755 catch (QPDFExc& e)
1756 {
1757 if (this->m->attempt_recovery)
1758 {
1759 warn(e);
1760 length = recoverStreamLength(
1761 input, objid, generation, stream_offset);
1762 }
1763 else
1764 {
1765 throw e;
1766 }
1767 }
1768 object = QPDFObjectHandle::Factory::newStream(
1769 this, objid, generation, object, stream_offset, length);
1770 }
1771 else
1772 {
1773 input->seek(cur_offset, SEEK_SET);
1774 }
1775 }
1776
1777 // Override last_offset so that it points to the beginning of the
1778 // object we just read
1779 input->setLastOffset(offset);
1780 return object;
1781 }
1782
1783 bool
findEndstream()1784 QPDF::findEndstream()
1785 {
1786 // Find endstream or endobj. Position the input at that token.
1787 QPDFTokenizer::Token t = readToken(this->m->file, 20);
1788 if ((t.getType() == QPDFTokenizer::tt_word) &&
1789 ((t.getValue() == "endobj") ||
1790 (t.getValue() == "endstream")))
1791 {
1792 this->m->file->seek(this->m->file->getLastOffset(), SEEK_SET);
1793 return true;
1794 }
1795 return false;
1796 }
1797
1798 size_t
recoverStreamLength(PointerHolder<InputSource> input,int objid,int generation,qpdf_offset_t stream_offset)1799 QPDF::recoverStreamLength(PointerHolder<InputSource> input,
1800 int objid, int generation,
1801 qpdf_offset_t stream_offset)
1802 {
1803 // Try to reconstruct stream length by looking for
1804 // endstream or endobj
1805 warn(QPDFExc(qpdf_e_damaged_pdf, input->getName(),
1806 this->m->last_object_description, stream_offset,
1807 "attempting to recover stream length"));
1808
1809 PatternFinder ef(*this, &QPDF::findEndstream);
1810 size_t length = 0;
1811 if (this->m->file->findFirst("end", stream_offset, 0, ef))
1812 {
1813 length = toS(this->m->file->tell() - stream_offset);
1814 // Reread endstream but, if it was endobj, don't skip that.
1815 QPDFTokenizer::Token t = readToken(this->m->file);
1816 if (t.getValue() == "endobj")
1817 {
1818 this->m->file->seek(this->m->file->getLastOffset(), SEEK_SET);
1819 }
1820 }
1821
1822 if (length)
1823 {
1824 qpdf_offset_t this_obj_offset = 0;
1825 QPDFObjGen this_obj(0, 0);
1826
1827 // Make sure this is inside this object
1828 for (std::map<QPDFObjGen, QPDFXRefEntry>::iterator iter =
1829 this->m->xref_table.begin();
1830 iter != this->m->xref_table.end(); ++iter)
1831 {
1832 QPDFObjGen const& og = (*iter).first;
1833 QPDFXRefEntry const& entry = (*iter).second;
1834 if (entry.getType() == 1)
1835 {
1836 qpdf_offset_t obj_offset = entry.getOffset();
1837 if ((obj_offset > stream_offset) &&
1838 ((this_obj_offset == 0) ||
1839 (this_obj_offset > obj_offset)))
1840 {
1841 this_obj_offset = obj_offset;
1842 this_obj = og;
1843 }
1844 }
1845 }
1846 if (this_obj_offset &&
1847 (this_obj.getObj() == objid) &&
1848 (this_obj.getGen() == generation))
1849 {
1850 // Well, we found endstream\nendobj within the space
1851 // allowed for this object, so we're probably in good
1852 // shape.
1853 }
1854 else
1855 {
1856 QTC::TC("qpdf", "QPDF found wrong endstream in recovery");
1857 }
1858 }
1859
1860 if (length == 0)
1861 {
1862 warn(QPDFExc(qpdf_e_damaged_pdf, input->getName(),
1863 this->m->last_object_description, stream_offset,
1864 "unable to recover stream data;"
1865 " treating stream as empty"));
1866 }
1867 else
1868 {
1869 warn(QPDFExc(qpdf_e_damaged_pdf, input->getName(),
1870 this->m->last_object_description, stream_offset,
1871 "recovered stream length: " +
1872 QUtil::uint_to_string(length)));
1873 }
1874
1875 QTC::TC("qpdf", "QPDF recovered stream length");
1876 return length;
1877 }
1878
1879 QPDFTokenizer::Token
readToken(PointerHolder<InputSource> input,size_t max_len)1880 QPDF::readToken(PointerHolder<InputSource> input, size_t max_len)
1881 {
1882 return this->m->tokenizer.readToken(
1883 input, this->m->last_object_description, true, max_len);
1884 }
1885
1886 QPDFObjectHandle
readObjectAtOffset(bool try_recovery,qpdf_offset_t offset,std::string const & description,int exp_objid,int exp_generation,int & objid,int & generation)1887 QPDF::readObjectAtOffset(bool try_recovery,
1888 qpdf_offset_t offset, std::string const& description,
1889 int exp_objid, int exp_generation,
1890 int& objid, int& generation)
1891 {
1892 if (! this->m->attempt_recovery)
1893 {
1894 try_recovery = false;
1895 }
1896 setLastObjectDescription(description, exp_objid, exp_generation);
1897
1898 // Special case: if offset is 0, just return null. Some PDF
1899 // writers, in particular "Mac OS X 10.7.5 Quartz PDFContext", may
1900 // store deleted objects in the xref table as "0000000000 00000
1901 // n", which is not correct, but it won't hurt anything for to
1902 // ignore these.
1903 if (offset == 0)
1904 {
1905 QTC::TC("qpdf", "QPDF bogus 0 offset", 0);
1906 warn(QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
1907 this->m->last_object_description, 0,
1908 "object has offset 0"));
1909 return QPDFObjectHandle::newNull();
1910 }
1911
1912 this->m->file->seek(offset, SEEK_SET);
1913
1914 QPDFTokenizer::Token tobjid = readToken(this->m->file);
1915 QPDFTokenizer::Token tgen = readToken(this->m->file);
1916 QPDFTokenizer::Token tobj = readToken(this->m->file);
1917
1918 bool objidok = (tobjid.getType() == QPDFTokenizer::tt_integer);
1919 int genok = (tgen.getType() == QPDFTokenizer::tt_integer);
1920 int objok = (tobj == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "obj"));
1921
1922 QTC::TC("qpdf", "QPDF check objid", objidok ? 1 : 0);
1923 QTC::TC("qpdf", "QPDF check generation", genok ? 1 : 0);
1924 QTC::TC("qpdf", "QPDF check obj", objok ? 1 : 0);
1925
1926 try
1927 {
1928 if (! (objidok && genok && objok))
1929 {
1930 QTC::TC("qpdf", "QPDF expected n n obj");
1931 throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
1932 this->m->last_object_description, offset,
1933 "expected n n obj");
1934 }
1935 objid = QUtil::string_to_int(tobjid.getValue().c_str());
1936 generation = QUtil::string_to_int(tgen.getValue().c_str());
1937
1938 if (objid == 0)
1939 {
1940 QTC::TC("qpdf", "QPDF object id 0");
1941 throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
1942 this->m->last_object_description, offset,
1943 "object with ID 0");
1944 }
1945
1946 if ((exp_objid >= 0) &&
1947 (! ((objid == exp_objid) && (generation == exp_generation))))
1948 {
1949 QTC::TC("qpdf", "QPDF err wrong objid/generation");
1950 QPDFExc e(qpdf_e_damaged_pdf, this->m->file->getName(),
1951 this->m->last_object_description, offset,
1952 std::string("expected ") +
1953 QUtil::int_to_string(exp_objid) + " " +
1954 QUtil::int_to_string(exp_generation) + " obj");
1955 if (try_recovery)
1956 {
1957 // Will be retried below
1958 throw e;
1959 }
1960 else
1961 {
1962 // We can try reading the object anyway even if the ID
1963 // doesn't match.
1964 warn(e);
1965 }
1966 }
1967 }
1968 catch (QPDFExc& e)
1969 {
1970 if ((exp_objid >= 0) && try_recovery)
1971 {
1972 // Try again after reconstructing xref table
1973 reconstruct_xref(e);
1974 QPDFObjGen og(exp_objid, exp_generation);
1975 if (this->m->xref_table.count(og) &&
1976 (this->m->xref_table[og].getType() == 1))
1977 {
1978 qpdf_offset_t new_offset = this->m->xref_table[og].getOffset();
1979 QPDFObjectHandle result = readObjectAtOffset(
1980 false, new_offset, description,
1981 exp_objid, exp_generation, objid, generation);
1982 QTC::TC("qpdf", "QPDF recovered in readObjectAtOffset");
1983 return result;
1984 }
1985 else
1986 {
1987 QTC::TC("qpdf", "QPDF object gone after xref reconstruction");
1988 warn(QPDFExc(
1989 qpdf_e_damaged_pdf, this->m->file->getName(),
1990 "", 0,
1991 std::string(
1992 "object " +
1993 QUtil::int_to_string(exp_objid) +
1994 " " +
1995 QUtil::int_to_string(exp_generation) +
1996 " not found in file after regenerating"
1997 " cross reference table")));
1998 return QPDFObjectHandle::newNull();
1999 }
2000 }
2001 else
2002 {
2003 throw e;
2004 }
2005 }
2006
2007 QPDFObjectHandle oh = readObject(
2008 this->m->file, description, objid, generation, false);
2009
2010 if (! (readToken(this->m->file) ==
2011 QPDFTokenizer::Token(QPDFTokenizer::tt_word, "endobj")))
2012 {
2013 QTC::TC("qpdf", "QPDF err expected endobj");
2014 warn(QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
2015 this->m->last_object_description,
2016 this->m->file->getLastOffset(),
2017 "expected endobj"));
2018 }
2019
2020 QPDFObjGen og(objid, generation);
2021 if (! this->m->obj_cache.count(og))
2022 {
2023 // Store the object in the cache here so it gets cached
2024 // whether we first know the offset or whether we first know
2025 // the object ID and generation (in which we case we would get
2026 // here through resolve).
2027
2028 // Determine the end offset of this object before and after
2029 // white space. We use these numbers to validate
2030 // linearization hint tables. Offsets and lengths of objects
2031 // may imply the end of an object to be anywhere between these
2032 // values.
2033 qpdf_offset_t end_before_space = this->m->file->tell();
2034
2035 // skip over spaces
2036 while (true)
2037 {
2038 char ch;
2039 if (this->m->file->read(&ch, 1))
2040 {
2041 if (! isspace(static_cast<unsigned char>(ch)))
2042 {
2043 this->m->file->seek(-1, SEEK_CUR);
2044 break;
2045 }
2046 }
2047 else
2048 {
2049 throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
2050 this->m->last_object_description,
2051 this->m->file->tell(),
2052 "EOF after endobj");
2053 }
2054 }
2055 qpdf_offset_t end_after_space = this->m->file->tell();
2056
2057 this->m->obj_cache[og] =
2058 ObjCache(QPDFObjectHandle::ObjAccessor::getObject(oh),
2059 end_before_space, end_after_space);
2060 }
2061
2062 return oh;
2063 }
2064
2065 bool
objectChanged(QPDFObjGen const & og,PointerHolder<QPDFObject> & oph)2066 QPDF::objectChanged(QPDFObjGen const& og, PointerHolder<QPDFObject>& oph)
2067 {
2068 // See if the object cached at og, if any, is the one passed in.
2069 // QPDFObjectHandle uses this to detect outdated handles to
2070 // replaced or swapped objects. This is a somewhat expensive check
2071 // because it happens with every dereference of a
2072 // QPDFObjectHandle. To reduce the hit somewhat, short-circuit the
2073 // check if we never called a function that replaces an object
2074 // already in cache. It is important for functions that do this to
2075 // set ever_replaced_objects = true.
2076
2077 if (! this->m->ever_replaced_objects)
2078 {
2079 return false;
2080 }
2081 auto c = this->m->obj_cache.find(og);
2082 if (c == this->m->obj_cache.end())
2083 {
2084 return true;
2085 }
2086 return (c->second.object.getPointer() != oph.getPointer());
2087 }
2088
2089 PointerHolder<QPDFObject>
resolve(int objid,int generation)2090 QPDF::resolve(int objid, int generation)
2091 {
2092 // Check object cache before checking xref table. This allows us
2093 // to insert things into the object cache that don't actually
2094 // exist in the file.
2095 QPDFObjGen og(objid, generation);
2096 if (this->m->resolving.count(og))
2097 {
2098 // This can happen if an object references itself directly or
2099 // indirectly in some key that has to be resolved during
2100 // object parsing, such as stream length.
2101 QTC::TC("qpdf", "QPDF recursion loop in resolve");
2102 warn(QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
2103 "", this->m->file->getLastOffset(),
2104 "loop detected resolving object " +
2105 QUtil::int_to_string(objid) + " " +
2106 QUtil::int_to_string(generation)));
2107 return new QPDF_Null;
2108 }
2109 ResolveRecorder rr(this, og);
2110
2111 if ((! this->m->obj_cache.count(og)) && this->m->xref_table.count(og))
2112 {
2113 QPDFXRefEntry const& entry = this->m->xref_table[og];
2114 try
2115 {
2116 switch (entry.getType())
2117 {
2118 case 1:
2119 {
2120 qpdf_offset_t offset = entry.getOffset();
2121 // Object stored in cache by readObjectAtOffset
2122 int aobjid;
2123 int ageneration;
2124 QPDFObjectHandle oh =
2125 readObjectAtOffset(true, offset, "", objid, generation,
2126 aobjid, ageneration);
2127 }
2128 break;
2129
2130 case 2:
2131 resolveObjectsInStream(entry.getObjStreamNumber());
2132 break;
2133
2134 default:
2135 throw QPDFExc(qpdf_e_damaged_pdf,
2136 this->m->file->getName(), "", 0,
2137 "object " +
2138 QUtil::int_to_string(objid) + "/" +
2139 QUtil::int_to_string(generation) +
2140 " has unexpected xref entry type");
2141 }
2142 }
2143 catch (QPDFExc& e)
2144 {
2145 warn(e);
2146 }
2147 catch (std::exception& e)
2148 {
2149 warn(QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(), "", 0,
2150 "object " +
2151 QUtil::int_to_string(objid) + "/" +
2152 QUtil::int_to_string(generation) +
2153 ": error reading object: " + e.what()));
2154 }
2155 }
2156 if (this->m->obj_cache.count(og) == 0)
2157 {
2158 // PDF spec says unknown objects resolve to the null object.
2159 QTC::TC("qpdf", "QPDF resolve failure to null");
2160 QPDFObjectHandle oh = QPDFObjectHandle::newNull();
2161 this->m->obj_cache[og] =
2162 ObjCache(QPDFObjectHandle::ObjAccessor::getObject(oh), -1, -1);
2163 }
2164
2165 PointerHolder<QPDFObject> result(this->m->obj_cache[og].object);
2166 if (! result->hasDescription())
2167 {
2168 result->setDescription(
2169 this,
2170 "object " + QUtil::int_to_string(objid) + " " +
2171 QUtil::int_to_string(generation));
2172 }
2173 return result;
2174 }
2175
2176 void
resolveObjectsInStream(int obj_stream_number)2177 QPDF::resolveObjectsInStream(int obj_stream_number)
2178 {
2179 if (this->m->resolved_object_streams.count(obj_stream_number))
2180 {
2181 return;
2182 }
2183 this->m->resolved_object_streams.insert(obj_stream_number);
2184 // Force resolution of object stream
2185 QPDFObjectHandle obj_stream = getObjectByID(obj_stream_number, 0);
2186 if (! obj_stream.isStream())
2187 {
2188 throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
2189 this->m->last_object_description,
2190 this->m->file->getLastOffset(),
2191 "supposed object stream " +
2192 QUtil::int_to_string(obj_stream_number) +
2193 " is not a stream");
2194 }
2195
2196 // For linearization data in the object, use the data from the
2197 // object stream for the objects in the stream.
2198 QPDFObjGen stream_og(obj_stream_number, 0);
2199 qpdf_offset_t end_before_space =
2200 this->m->obj_cache[stream_og].end_before_space;
2201 qpdf_offset_t end_after_space =
2202 this->m->obj_cache[stream_og].end_after_space;
2203
2204 QPDFObjectHandle dict = obj_stream.getDict();
2205 if (! (dict.getKey("/Type").isName() &&
2206 dict.getKey("/Type").getName() == "/ObjStm"))
2207 {
2208 QTC::TC("qpdf", "QPDF ERR object stream with wrong type");
2209 warn(QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
2210 this->m->last_object_description,
2211 this->m->file->getLastOffset(),
2212 "supposed object stream " +
2213 QUtil::int_to_string(obj_stream_number) +
2214 " has wrong type"));
2215 }
2216
2217 if (! (dict.getKey("/N").isInteger() &&
2218 dict.getKey("/First").isInteger()))
2219 {
2220 throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
2221 this->m->last_object_description,
2222 this->m->file->getLastOffset(),
2223 "object stream " +
2224 QUtil::int_to_string(obj_stream_number) +
2225 " has incorrect keys");
2226 }
2227
2228 int n = dict.getKey("/N").getIntValueAsInt();
2229 int first = dict.getKey("/First").getIntValueAsInt();
2230
2231 std::map<int, int> offsets;
2232
2233 PointerHolder<Buffer> bp = obj_stream.getStreamData(qpdf_dl_specialized);
2234 PointerHolder<InputSource> input = new BufferInputSource(
2235 this->m->file->getName() +
2236 " object stream " + QUtil::int_to_string(obj_stream_number),
2237 bp.getPointer());
2238
2239 for (int i = 0; i < n; ++i)
2240 {
2241 QPDFTokenizer::Token tnum = readToken(input);
2242 QPDFTokenizer::Token toffset = readToken(input);
2243 if (! ((tnum.getType() == QPDFTokenizer::tt_integer) &&
2244 (toffset.getType() == QPDFTokenizer::tt_integer)))
2245 {
2246 throw QPDFExc(qpdf_e_damaged_pdf, input->getName(),
2247 this->m->last_object_description,
2248 input->getLastOffset(),
2249 "expected integer in object stream header");
2250 }
2251
2252 int num = QUtil::string_to_int(tnum.getValue().c_str());
2253 long long offset = QUtil::string_to_int(toffset.getValue().c_str());
2254 offsets[num] = QIntC::to_int(offset + first);
2255 }
2256
2257 // To avoid having to read the object stream multiple times, store
2258 // all objects that would be found here in the cache. Remember
2259 // that some objects stored here might have been overridden by new
2260 // objects appended to the file, so it is necessary to recheck the
2261 // xref table and only cache what would actually be resolved here.
2262 for (std::map<int, int>::iterator iter = offsets.begin();
2263 iter != offsets.end(); ++iter)
2264 {
2265 int obj = (*iter).first;
2266 QPDFObjGen og(obj, 0);
2267 QPDFXRefEntry const& entry = this->m->xref_table[og];
2268 if ((entry.getType() == 2) &&
2269 (entry.getObjStreamNumber() == obj_stream_number))
2270 {
2271 int offset = (*iter).second;
2272 input->seek(offset, SEEK_SET);
2273 QPDFObjectHandle oh = readObject(input, "", obj, 0, true);
2274 this->m->obj_cache[og] =
2275 ObjCache(QPDFObjectHandle::ObjAccessor::getObject(oh),
2276 end_before_space, end_after_space);
2277 }
2278 else
2279 {
2280 QTC::TC("qpdf", "QPDF not caching overridden objstm object");
2281 }
2282 }
2283 }
2284
2285 QPDFObjectHandle
makeIndirectObject(QPDFObjectHandle oh)2286 QPDF::makeIndirectObject(QPDFObjectHandle oh)
2287 {
2288 int max_objid = toI(getObjectCount());
2289 if (max_objid == std::numeric_limits<int>::max())
2290 {
2291 throw std::range_error(
2292 "max object id is too high to create new objects");
2293 }
2294 QPDFObjGen next(max_objid + 1, 0);
2295 this->m->obj_cache[next] =
2296 ObjCache(QPDFObjectHandle::ObjAccessor::getObject(oh), -1, -1);
2297 return QPDFObjectHandle::Factory::newIndirect(
2298 this, next.getObj(), next.getGen());
2299 }
2300
2301 QPDFObjectHandle
getObjectByObjGen(QPDFObjGen const & og)2302 QPDF::getObjectByObjGen(QPDFObjGen const& og)
2303 {
2304 return getObjectByID(og.getObj(), og.getGen());
2305 }
2306
2307 QPDFObjectHandle
getObjectByID(int objid,int generation)2308 QPDF::getObjectByID(int objid, int generation)
2309 {
2310 return QPDFObjectHandle::Factory::newIndirect(this, objid, generation);
2311 }
2312
2313 void
replaceObject(QPDFObjGen const & og,QPDFObjectHandle oh)2314 QPDF::replaceObject(QPDFObjGen const& og, QPDFObjectHandle oh)
2315 {
2316 replaceObject(og.getObj(), og.getGen(), oh);
2317 }
2318
2319 void
replaceObject(int objid,int generation,QPDFObjectHandle oh)2320 QPDF::replaceObject(int objid, int generation, QPDFObjectHandle oh)
2321 {
2322 if (oh.isIndirect())
2323 {
2324 QTC::TC("qpdf", "QPDF replaceObject called with indirect object");
2325 throw std::logic_error(
2326 "QPDF::replaceObject called with indirect object handle");
2327 }
2328
2329 // Force new object to appear in the cache
2330 resolve(objid, generation);
2331
2332 // Replace the object in the object cache
2333 QPDFObjGen og(objid, generation);
2334 this->m->ever_replaced_objects = true;
2335 this->m->obj_cache[og] =
2336 ObjCache(QPDFObjectHandle::ObjAccessor::getObject(oh), -1, -1);
2337 }
2338
2339 void
replaceReserved(QPDFObjectHandle reserved,QPDFObjectHandle replacement)2340 QPDF::replaceReserved(QPDFObjectHandle reserved,
2341 QPDFObjectHandle replacement)
2342 {
2343 QTC::TC("qpdf", "QPDF replaceReserved");
2344 reserved.assertReserved();
2345 replaceObject(reserved.getObjGen(), replacement);
2346 }
2347
2348 QPDFObjectHandle
copyForeignObject(QPDFObjectHandle foreign)2349 QPDF::copyForeignObject(QPDFObjectHandle foreign)
2350 {
2351 // Here's an explanation of what's going on here.
2352 //
2353 // A QPDFObjectHandle that is an indirect object has an owning
2354 // QPDF. The object ID and generation refers to an object in the
2355 // owning QPDF. When we copy the QPDFObjectHandle from a foreign
2356 // QPDF into the local QPDF, we have to replace all indirect
2357 // object references with references to the corresponding object
2358 // in the local file.
2359 //
2360 // To do this, we maintain mappings from foreign object IDs to
2361 // local object IDs for each foreign QPDF that we are copying
2362 // from. The mapping is stored in an ObjCopier, which contains a
2363 // mapping from the foreign ObjGen to the local QPDFObjectHandle.
2364 //
2365 // To copy, we do a deep traversal of the foreign object with loop
2366 // detection to discover all indirect objects that are
2367 // encountered, stopping at page boundaries. Whenever we encounter
2368 // an indirect object, we check to see if we have already created
2369 // a local copy of it. If not, we allocate a "reserved" object
2370 // (or, for a stream, just a new stream) and store in the map the
2371 // mapping from the foreign object ID to the new object. While we
2372 // do this, we keep a list of objects to copy.
2373 //
2374 // Once we are done with the traversal, we copy all the objects
2375 // that we need to copy. However, the copies will contain indirect
2376 // object IDs that refer to objects in the foreign file. We need
2377 // to replace them with references to objects in the local file.
2378 // This is what replaceForeignIndirectObjects does. Once we have
2379 // created a copy of the foreign object with all the indirect
2380 // references replaced with new ones in the local context, we can
2381 // replace the local reserved object with the copy. This mechanism
2382 // allows us to copy objects with circular references in any
2383 // order.
2384
2385 // For streams, rather than copying the objects, we set up the
2386 // stream data to pull from the original stream by using a stream
2387 // data provider. This is done in a manner that doesn't require
2388 // the original QPDF object but may require the original source of
2389 // the stream data with special handling for immediate_copy_from.
2390 // This logic is also in replaceForeignIndirectObjects.
2391
2392 // Note that we explicitly allow use of copyForeignObject on page
2393 // objects. It is a documented use case to copy pages this way if
2394 // the intention is to not update the pages tree.
2395 if (! foreign.isIndirect())
2396 {
2397 QTC::TC("qpdf", "QPDF copyForeign direct");
2398 throw std::logic_error(
2399 "QPDF::copyForeign called with direct object handle");
2400 }
2401 QPDF* other = foreign.getOwningQPDF();
2402 if (other == this)
2403 {
2404 QTC::TC("qpdf", "QPDF copyForeign not foreign");
2405 throw std::logic_error(
2406 "QPDF::copyForeign called with object from this QPDF");
2407 }
2408
2409 ObjCopier& obj_copier = this->m->object_copiers[other->m->unique_id];
2410 if (! obj_copier.visiting.empty())
2411 {
2412 throw std::logic_error("obj_copier.visiting is not empty"
2413 " at the beginning of copyForeignObject");
2414 }
2415
2416 // Make sure we have an object in this file for every referenced
2417 // object in the old file. obj_copier.object_map maps foreign
2418 // QPDFObjGen to local objects. For everything new that we have
2419 // to copy, the local object will be a reservation, unless it is a
2420 // stream, in which case the local object will already be a
2421 // stream.
2422 reserveObjects(foreign, obj_copier, true);
2423
2424 if (! obj_copier.visiting.empty())
2425 {
2426 throw std::logic_error("obj_copier.visiting is not empty"
2427 " after reserving objects");
2428 }
2429
2430 // Copy any new objects and replace the reservations.
2431 for (std::vector<QPDFObjectHandle>::iterator iter =
2432 obj_copier.to_copy.begin();
2433 iter != obj_copier.to_copy.end(); ++iter)
2434 {
2435 QPDFObjectHandle& to_copy = *iter;
2436 QPDFObjectHandle copy =
2437 replaceForeignIndirectObjects(to_copy, obj_copier, true);
2438 if (! to_copy.isStream())
2439 {
2440 QPDFObjGen og(to_copy.getObjGen());
2441 replaceReserved(obj_copier.object_map[og], copy);
2442 }
2443 }
2444 obj_copier.to_copy.clear();
2445
2446 return obj_copier.object_map[foreign.getObjGen()];
2447 }
2448
2449 void
reserveObjects(QPDFObjectHandle foreign,ObjCopier & obj_copier,bool top)2450 QPDF::reserveObjects(QPDFObjectHandle foreign, ObjCopier& obj_copier,
2451 bool top)
2452 {
2453 if (foreign.isReserved())
2454 {
2455 throw std::logic_error(
2456 "QPDF: attempting to copy a foreign reserved object");
2457 }
2458
2459 if (foreign.isPagesObject())
2460 {
2461 QTC::TC("qpdf", "QPDF not copying pages object");
2462 return;
2463 }
2464
2465 if ((! top) && foreign.isPageObject())
2466 {
2467 QTC::TC("qpdf", "QPDF not crossing page boundary");
2468 return;
2469 }
2470
2471 if (foreign.isIndirect())
2472 {
2473 QPDFObjGen foreign_og(foreign.getObjGen());
2474 if (obj_copier.visiting.find(foreign_og) != obj_copier.visiting.end())
2475 {
2476 QTC::TC("qpdf", "QPDF loop reserving objects");
2477 return;
2478 }
2479 if (obj_copier.object_map.find(foreign_og) !=
2480 obj_copier.object_map.end())
2481 {
2482 QTC::TC("qpdf", "QPDF already reserved object");
2483 return;
2484 }
2485 QTC::TC("qpdf", "QPDF copy indirect");
2486 obj_copier.visiting.insert(foreign_og);
2487 std::map<QPDFObjGen, QPDFObjectHandle>::iterator mapping =
2488 obj_copier.object_map.find(foreign_og);
2489 if (mapping == obj_copier.object_map.end())
2490 {
2491 obj_copier.to_copy.push_back(foreign);
2492 QPDFObjectHandle reservation;
2493 if (foreign.isStream())
2494 {
2495 reservation = QPDFObjectHandle::newStream(this);
2496 }
2497 else
2498 {
2499 reservation = QPDFObjectHandle::newReserved(this);
2500 }
2501 obj_copier.object_map[foreign_og] = reservation;
2502 }
2503 }
2504
2505 if (foreign.isArray())
2506 {
2507 QTC::TC("qpdf", "QPDF reserve array");
2508 int n = foreign.getArrayNItems();
2509 for (int i = 0; i < n; ++i)
2510 {
2511 reserveObjects(foreign.getArrayItem(i), obj_copier, false);
2512 }
2513 }
2514 else if (foreign.isDictionary())
2515 {
2516 QTC::TC("qpdf", "QPDF reserve dictionary");
2517 std::set<std::string> keys = foreign.getKeys();
2518 for (std::set<std::string>::iterator iter = keys.begin();
2519 iter != keys.end(); ++iter)
2520 {
2521 reserveObjects(foreign.getKey(*iter), obj_copier, false);
2522 }
2523 }
2524 else if (foreign.isStream())
2525 {
2526 QTC::TC("qpdf", "QPDF reserve stream");
2527 reserveObjects(foreign.getDict(), obj_copier, false);
2528 }
2529
2530 if (foreign.isIndirect())
2531 {
2532 QPDFObjGen foreign_og(foreign.getObjGen());
2533 obj_copier.visiting.erase(foreign_og);
2534 }
2535 }
2536
2537 QPDFObjectHandle
replaceForeignIndirectObjects(QPDFObjectHandle foreign,ObjCopier & obj_copier,bool top)2538 QPDF::replaceForeignIndirectObjects(
2539 QPDFObjectHandle foreign, ObjCopier& obj_copier, bool top)
2540 {
2541 QPDFObjectHandle result;
2542 if ((! top) && foreign.isIndirect())
2543 {
2544 QTC::TC("qpdf", "QPDF replace indirect");
2545 QPDFObjGen foreign_og(foreign.getObjGen());
2546 std::map<QPDFObjGen, QPDFObjectHandle>::iterator mapping =
2547 obj_copier.object_map.find(foreign_og);
2548 if (mapping == obj_copier.object_map.end())
2549 {
2550 // This case would occur if this is a reference to a Page
2551 // or Pages object that we didn't traverse into.
2552 QTC::TC("qpdf", "QPDF replace foreign indirect with null");
2553 result = QPDFObjectHandle::newNull();
2554 }
2555 else
2556 {
2557 result = obj_copier.object_map[foreign_og];
2558 }
2559 }
2560 else if (foreign.isArray())
2561 {
2562 QTC::TC("qpdf", "QPDF replace array");
2563 result = QPDFObjectHandle::newArray();
2564 int n = foreign.getArrayNItems();
2565 for (int i = 0; i < n; ++i)
2566 {
2567 result.appendItem(
2568 replaceForeignIndirectObjects(
2569 foreign.getArrayItem(i), obj_copier, false));
2570 }
2571 }
2572 else if (foreign.isDictionary())
2573 {
2574 QTC::TC("qpdf", "QPDF replace dictionary");
2575 result = QPDFObjectHandle::newDictionary();
2576 std::set<std::string> keys = foreign.getKeys();
2577 for (std::set<std::string>::iterator iter = keys.begin();
2578 iter != keys.end(); ++iter)
2579 {
2580 result.replaceKey(
2581 *iter,
2582 replaceForeignIndirectObjects(
2583 foreign.getKey(*iter), obj_copier, false));
2584 }
2585 }
2586 else if (foreign.isStream())
2587 {
2588 QTC::TC("qpdf", "QPDF replace stream");
2589 QPDFObjGen foreign_og(foreign.getObjGen());
2590 result = obj_copier.object_map[foreign_og];
2591 result.assertStream();
2592 QPDFObjectHandle dict = result.getDict();
2593 QPDFObjectHandle old_dict = foreign.getDict();
2594 std::set<std::string> keys = old_dict.getKeys();
2595 for (std::set<std::string>::iterator iter = keys.begin();
2596 iter != keys.end(); ++iter)
2597 {
2598 dict.replaceKey(
2599 *iter,
2600 replaceForeignIndirectObjects(
2601 old_dict.getKey(*iter), obj_copier, false));
2602 }
2603 copyStreamData(result, foreign);
2604 }
2605 else
2606 {
2607 foreign.assertScalar();
2608 result = foreign;
2609 result.makeDirect();
2610 }
2611
2612 if (top && (! result.isStream()) && result.isIndirect())
2613 {
2614 throw std::logic_error("replacement for foreign object is indirect");
2615 }
2616
2617 return result;
2618 }
2619
2620 void
copyStreamData(QPDFObjectHandle result,QPDFObjectHandle foreign)2621 QPDF::copyStreamData(QPDFObjectHandle result, QPDFObjectHandle foreign)
2622 {
2623 // This method was originally written for copying foreign streams,
2624 // but it is used by QPDFObjectHandle to copy streams from the
2625 // same QPDF object as well.
2626
2627 QPDFObjectHandle dict = result.getDict();
2628 QPDFObjectHandle old_dict = foreign.getDict();
2629 if (this->m->copied_stream_data_provider == 0)
2630 {
2631 this->m->copied_stream_data_provider =
2632 new CopiedStreamDataProvider(*this);
2633 this->m->copied_streams = this->m->copied_stream_data_provider;
2634 }
2635 QPDFObjGen local_og(result.getObjGen());
2636 // Copy information from the foreign stream so we can pipe its
2637 // data later without keeping the original QPDF object around.
2638 QPDF* foreign_stream_qpdf = foreign.getOwningQPDF();
2639 if (! foreign_stream_qpdf)
2640 {
2641 throw std::logic_error("unable to retrieve owning qpdf"
2642 " from foreign stream");
2643 }
2644 QPDF_Stream* stream =
2645 dynamic_cast<QPDF_Stream*>(
2646 QPDFObjectHandle::ObjAccessor::getObject(
2647 foreign).getPointer());
2648 if (! stream)
2649 {
2650 throw std::logic_error("unable to retrieve underlying"
2651 " stream object from foreign stream");
2652 }
2653 PointerHolder<Buffer> stream_buffer =
2654 stream->getStreamDataBuffer();
2655 if ((foreign_stream_qpdf->m->immediate_copy_from) &&
2656 (stream_buffer.getPointer() == 0))
2657 {
2658 // Pull the stream data into a buffer before attempting
2659 // the copy operation. Do it on the source stream so that
2660 // if the source stream is copied multiple times, we don't
2661 // have to keep duplicating the memory.
2662 QTC::TC("qpdf", "QPDF immediate copy stream data");
2663 foreign.replaceStreamData(foreign.getRawStreamData(),
2664 old_dict.getKey("/Filter"),
2665 old_dict.getKey("/DecodeParms"));
2666 stream_buffer = stream->getStreamDataBuffer();
2667 }
2668 PointerHolder<QPDFObjectHandle::StreamDataProvider> stream_provider =
2669 stream->getStreamDataProvider();
2670 if (stream_buffer.getPointer())
2671 {
2672 QTC::TC("qpdf", "QPDF copy foreign stream with buffer");
2673 result.replaceStreamData(stream_buffer,
2674 dict.getKey("/Filter"),
2675 dict.getKey("/DecodeParms"));
2676 }
2677 else if (stream_provider.getPointer())
2678 {
2679 // In this case, the remote stream's QPDF must stay in scope.
2680 QTC::TC("qpdf", "QPDF copy foreign stream with provider");
2681 this->m->copied_stream_data_provider->registerForeignStream(
2682 local_og, foreign);
2683 result.replaceStreamData(this->m->copied_streams,
2684 dict.getKey("/Filter"),
2685 dict.getKey("/DecodeParms"));
2686 }
2687 else
2688 {
2689 PointerHolder<ForeignStreamData> foreign_stream_data =
2690 new ForeignStreamData(
2691 foreign_stream_qpdf->m->encp,
2692 foreign_stream_qpdf->m->file,
2693 foreign.getObjectID(),
2694 foreign.getGeneration(),
2695 stream->getOffset(),
2696 stream->getLength(),
2697 dict);
2698 this->m->copied_stream_data_provider->registerForeignStream(
2699 local_og, foreign_stream_data);
2700 result.replaceStreamData(this->m->copied_streams,
2701 dict.getKey("/Filter"),
2702 dict.getKey("/DecodeParms"));
2703 }
2704 }
2705
2706 void
swapObjects(QPDFObjGen const & og1,QPDFObjGen const & og2)2707 QPDF::swapObjects(QPDFObjGen const& og1, QPDFObjGen const& og2)
2708 {
2709 swapObjects(og1.getObj(), og1.getGen(), og2.getObj(), og2.getGen());
2710 }
2711
2712 void
swapObjects(int objid1,int generation1,int objid2,int generation2)2713 QPDF::swapObjects(int objid1, int generation1, int objid2, int generation2)
2714 {
2715 // Force objects to be loaded into cache; then swap them in the
2716 // cache.
2717 resolve(objid1, generation1);
2718 resolve(objid2, generation2);
2719 QPDFObjGen og1(objid1, generation1);
2720 QPDFObjGen og2(objid2, generation2);
2721 ObjCache t = this->m->obj_cache[og1];
2722 this->m->ever_replaced_objects = true;
2723 this->m->obj_cache[og1] = this->m->obj_cache[og2];
2724 this->m->obj_cache[og2] = t;
2725 }
2726
2727 unsigned long long
getUniqueId() const2728 QPDF::getUniqueId() const
2729 {
2730 return this->m->unique_id;
2731 }
2732
2733 std::string
getFilename() const2734 QPDF::getFilename() const
2735 {
2736 return this->m->file->getName();
2737 }
2738
2739 std::string
getPDFVersion() const2740 QPDF::getPDFVersion() const
2741 {
2742 return this->m->pdf_version;
2743 }
2744
2745 int
getExtensionLevel()2746 QPDF::getExtensionLevel()
2747 {
2748 int result = 0;
2749 QPDFObjectHandle obj = getRoot();
2750 if (obj.hasKey("/Extensions"))
2751 {
2752 obj = obj.getKey("/Extensions");
2753 if (obj.isDictionary() && obj.hasKey("/ADBE"))
2754 {
2755 obj = obj.getKey("/ADBE");
2756 if (obj.isDictionary() && obj.hasKey("/ExtensionLevel"))
2757 {
2758 obj = obj.getKey("/ExtensionLevel");
2759 if (obj.isInteger())
2760 {
2761 result = obj.getIntValueAsInt();
2762 }
2763 }
2764 }
2765 }
2766 return result;
2767 }
2768
2769 QPDFObjectHandle
getTrailer()2770 QPDF::getTrailer()
2771 {
2772 return this->m->trailer;
2773 }
2774
2775 QPDFObjectHandle
getRoot()2776 QPDF::getRoot()
2777 {
2778 QPDFObjectHandle root = this->m->trailer.getKey("/Root");
2779 if (! root.isDictionary())
2780 {
2781 throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
2782 "", this->m->file->getLastOffset(),
2783 "unable to find /Root dictionary");
2784 }
2785 return root;
2786 }
2787
2788 std::map<QPDFObjGen, QPDFXRefEntry>
getXRefTable()2789 QPDF::getXRefTable()
2790 {
2791 if (! this->m->parsed)
2792 {
2793 throw std::logic_error("QPDF::getXRefTable called before parsing.");
2794 }
2795
2796 return this->m->xref_table;
2797 }
2798
2799 void
getObjectStreamData(std::map<int,int> & omap)2800 QPDF::getObjectStreamData(std::map<int, int>& omap)
2801 {
2802 for (std::map<QPDFObjGen, QPDFXRefEntry>::iterator iter =
2803 this->m->xref_table.begin();
2804 iter != this->m->xref_table.end(); ++iter)
2805 {
2806 QPDFObjGen const& og = (*iter).first;
2807 QPDFXRefEntry const& entry = (*iter).second;
2808 if (entry.getType() == 2)
2809 {
2810 omap[og.getObj()] = entry.getObjStreamNumber();
2811 }
2812 }
2813 }
2814
2815 std::vector<QPDFObjGen>
getCompressibleObjGens()2816 QPDF::getCompressibleObjGens()
2817 {
2818 // Return a list of objects that are allowed to be in object
2819 // streams. Walk through the objects by traversing the document
2820 // from the root, including a traversal of the pages tree. This
2821 // makes that objects that are on the same page are more likely to
2822 // be in the same object stream, which is slightly more efficient,
2823 // particularly with linearized files. This is better than
2824 // iterating through the xref table since it avoids preserving
2825 // orphaned items.
2826
2827 // Exclude encryption dictionary, if any
2828 QPDFObjectHandle encryption_dict =
2829 this->m->trailer.getKey("/Encrypt");
2830 QPDFObjGen encryption_dict_og = encryption_dict.getObjGen();
2831
2832 std::set<QPDFObjGen> visited;
2833 std::list<QPDFObjectHandle> queue;
2834 queue.push_front(this->m->trailer);
2835 std::vector<QPDFObjGen> result;
2836 while (! queue.empty())
2837 {
2838 QPDFObjectHandle obj = queue.front();
2839 queue.pop_front();
2840 if (obj.isIndirect())
2841 {
2842 QPDFObjGen og = obj.getObjGen();
2843 if (visited.count(og))
2844 {
2845 QTC::TC("qpdf", "QPDF loop detected traversing objects");
2846 continue;
2847 }
2848 if (og == encryption_dict_og)
2849 {
2850 QTC::TC("qpdf", "QPDF exclude encryption dictionary");
2851 }
2852 else if ((! obj.isStream()) &&
2853 (! (obj.isDictionary() &&
2854 obj.hasKey("/ByteRange") &&
2855 obj.hasKey("/Contents") &&
2856 obj.hasKey("/Type") &&
2857 obj.getKey("/Type").isName() &&
2858 obj.getKey("/Type").getName() == "/Sig")))
2859 {
2860 result.push_back(og);
2861 }
2862 visited.insert(og);
2863 }
2864 if (obj.isStream())
2865 {
2866 QPDFObjectHandle dict = obj.getDict();
2867 std::set<std::string> keys = dict.getKeys();
2868 for (std::set<std::string>::reverse_iterator iter = keys.rbegin();
2869 iter != keys.rend(); ++iter)
2870 {
2871 std::string const& key = *iter;
2872 QPDFObjectHandle value = dict.getKey(key);
2873 if (key == "/Length")
2874 {
2875 // omit stream lengths
2876 if (value.isIndirect())
2877 {
2878 QTC::TC("qpdf", "QPDF exclude indirect length");
2879 }
2880 }
2881 else
2882 {
2883 queue.push_front(value);
2884 }
2885 }
2886 }
2887 else if (obj.isDictionary())
2888 {
2889 std::set<std::string> keys = obj.getKeys();
2890 for (std::set<std::string>::reverse_iterator iter = keys.rbegin();
2891 iter != keys.rend(); ++iter)
2892 {
2893 queue.push_front(obj.getKey(*iter));
2894 }
2895 }
2896 else if (obj.isArray())
2897 {
2898 int n = obj.getArrayNItems();
2899 for (int i = 1; i <= n; ++i)
2900 {
2901 queue.push_front(obj.getArrayItem(n - i));
2902 }
2903 }
2904 }
2905
2906 return result;
2907 }
2908
2909 bool
pipeStreamData(PointerHolder<EncryptionParameters> encp,PointerHolder<InputSource> file,QPDF & qpdf_for_warning,int objid,int generation,qpdf_offset_t offset,size_t length,QPDFObjectHandle stream_dict,Pipeline * pipeline,bool suppress_warnings,bool will_retry)2910 QPDF::pipeStreamData(PointerHolder<EncryptionParameters> encp,
2911 PointerHolder<InputSource> file,
2912 QPDF& qpdf_for_warning,
2913 int objid, int generation,
2914 qpdf_offset_t offset, size_t length,
2915 QPDFObjectHandle stream_dict,
2916 Pipeline* pipeline,
2917 bool suppress_warnings,
2918 bool will_retry)
2919 {
2920 std::vector<PointerHolder<Pipeline> > to_delete;
2921 if (encp->encrypted)
2922 {
2923 decryptStream(encp, file, qpdf_for_warning,
2924 pipeline, objid, generation,
2925 stream_dict, to_delete);
2926 }
2927
2928 bool success = false;
2929 try
2930 {
2931 file->seek(offset, SEEK_SET);
2932 char buf[10240];
2933 while (length > 0)
2934 {
2935 size_t to_read = (sizeof(buf) < length ? sizeof(buf) : length);
2936 size_t len = file->read(buf, to_read);
2937 if (len == 0)
2938 {
2939 throw QPDFExc(qpdf_e_damaged_pdf,
2940 file->getName(),
2941 "",
2942 file->getLastOffset(),
2943 "unexpected EOF reading stream data");
2944 }
2945 length -= len;
2946 pipeline->write(QUtil::unsigned_char_pointer(buf), len);
2947 }
2948 pipeline->finish();
2949 success = true;
2950 }
2951 catch (QPDFExc& e)
2952 {
2953 if (! suppress_warnings)
2954 {
2955 qpdf_for_warning.warn(e);
2956 }
2957 }
2958 catch (std::exception& e)
2959 {
2960 if (! suppress_warnings)
2961 {
2962 QTC::TC("qpdf", "QPDF decoding error warning");
2963 qpdf_for_warning.warn(
2964 QPDFExc(qpdf_e_damaged_pdf, file->getName(),
2965 "", file->getLastOffset(),
2966 "error decoding stream data for object " +
2967 QUtil::int_to_string(objid) + " " +
2968 QUtil::int_to_string(generation) + ": " + e.what()));
2969 if (will_retry)
2970 {
2971 qpdf_for_warning.warn(
2972 QPDFExc(qpdf_e_damaged_pdf, file->getName(),
2973 "", file->getLastOffset(),
2974 "stream will be re-processed without"
2975 " filtering to avoid data loss"));
2976 }
2977 }
2978 }
2979 if (! success)
2980 {
2981 try
2982 {
2983 pipeline->finish();
2984 }
2985 catch (std::exception&)
2986 {
2987 // ignore
2988 }
2989 }
2990 return success;
2991 }
2992
2993 bool
pipeStreamData(int objid,int generation,qpdf_offset_t offset,size_t length,QPDFObjectHandle stream_dict,Pipeline * pipeline,bool suppress_warnings,bool will_retry)2994 QPDF::pipeStreamData(int objid, int generation,
2995 qpdf_offset_t offset, size_t length,
2996 QPDFObjectHandle stream_dict,
2997 Pipeline* pipeline,
2998 bool suppress_warnings,
2999 bool will_retry)
3000 {
3001 return pipeStreamData(
3002 this->m->encp, this->m->file, *this,
3003 objid, generation, offset, length,
3004 stream_dict, pipeline,
3005 suppress_warnings, will_retry);
3006 }
3007
3008 bool
pipeForeignStreamData(PointerHolder<ForeignStreamData> foreign,Pipeline * pipeline,bool suppress_warnings,bool will_retry)3009 QPDF::pipeForeignStreamData(
3010 PointerHolder<ForeignStreamData> foreign,
3011 Pipeline* pipeline,
3012 bool suppress_warnings, bool will_retry)
3013 {
3014 if (foreign->encp->encrypted)
3015 {
3016 QTC::TC("qpdf", "QPDF pipe foreign encrypted stream");
3017 }
3018 return pipeStreamData(
3019 foreign->encp, foreign->file, *this,
3020 foreign->foreign_objid, foreign->foreign_generation,
3021 foreign->offset, foreign->length,
3022 foreign->local_dict, pipeline,
3023 suppress_warnings, will_retry);
3024 }
3025
3026 void
stopOnError(std::string const & message)3027 QPDF::stopOnError(std::string const& message)
3028 {
3029 // Throw a generic exception when we lack context for something
3030 // more specific. New code should not use this. This method exists
3031 // to improve somewhat from calling assert in very old code.
3032 throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
3033 "", this->m->file->getLastOffset(), message);
3034 }
3035