1 #include <qpdf/qpdf-config.h>  // include first for large file support
2 #include <qpdf/QPDF.hh>
3 
4 #include <atomic>
5 #include <vector>
6 #include <map>
7 #include <algorithm>
8 #include <limits>
9 #include <sstream>
10 #include <stdlib.h>
11 #include <string.h>
12 #include <memory.h>
13 
14 #include <qpdf/QTC.hh>
15 #include <qpdf/QUtil.hh>
16 #include <qpdf/Pipeline.hh>
17 #include <qpdf/Pl_Discard.hh>
18 #include <qpdf/FileInputSource.hh>
19 #include <qpdf/BufferInputSource.hh>
20 #include <qpdf/OffsetInputSource.hh>
21 
22 #include <qpdf/QPDFExc.hh>
23 #include <qpdf/QPDF_Null.hh>
24 #include <qpdf/QPDF_Dictionary.hh>
25 #include <qpdf/QPDF_Stream.hh>
26 #include <qpdf/QPDF_Array.hh>
27 
28 std::string QPDF::qpdf_version = "10.5.0";
29 
// Minimal PDF with a catalog and an empty page tree, used by
// QPDF::emptyPDF(). The literal is grouped by logical section. The
// byte offsets recorded in the xref table (9 and 58) and after
// startxref (110) depend on the exact text that precedes them, so any
// edit here must keep them in sync.
static char const* EMPTY_PDF =
    // Header
    "%PDF-1.3\n"
    // Object 1: document catalog
    "1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n"
    // Object 2: page tree with no pages
    "2 0 obj\n<< /Type /Pages /Kids [] /Count 0 >>\nendobj\n"
    // Cross-reference table: free entry 0 plus objects 1 and 2
    "xref\n0 3\n"
    "0000000000 65535 f \n"
    "0000000009 00000 n \n"
    "0000000058 00000 n \n"
    // Trailer and pointer back to the xref table
    "trailer << /Size 3 /Root 1 0 R >>\n"
    "startxref\n110\n%%EOF\n";
47 
48 class InvalidInputSource: public InputSource
49 {
50   public:
51     virtual ~InvalidInputSource() = default;
findAndSkipNextEOL()52     virtual qpdf_offset_t findAndSkipNextEOL() override
53     {
54         throwException();
55         return 0;
56     }
getName() const57     virtual std::string const& getName() const override
58     {
59         static std::string name("closed input source");
60         return name;
61     }
tell()62     virtual qpdf_offset_t tell() override
63     {
64         throwException();
65         return 0;
66     }
seek(qpdf_offset_t offset,int whence)67     virtual void seek(qpdf_offset_t offset, int whence) override
68     {
69         throwException();
70     }
rewind()71     virtual void rewind() override
72     {
73         throwException();
74     }
read(char * buffer,size_t length)75     virtual size_t read(char* buffer, size_t length) override
76     {
77         throwException();
78         return 0;
79     }
unreadCh(char ch)80     virtual void unreadCh(char ch) override
81     {
82         throwException();
83     }
84 
85   private:
throwException()86     void throwException()
87     {
88         throw std::logic_error(
89             "QPDF operation attempted on a QPDF object with no input source."
90             " QPDF operations are invalid before processFile (or another"
91             " process method) or after closeInputSource");
92     }
93 };
94 
ForeignStreamData(PointerHolder<EncryptionParameters> encp,PointerHolder<InputSource> file,int foreign_objid,int foreign_generation,qpdf_offset_t offset,size_t length,QPDFObjectHandle local_dict)95 QPDF::ForeignStreamData::ForeignStreamData(
96     PointerHolder<EncryptionParameters> encp,
97     PointerHolder<InputSource> file,
98     int foreign_objid,
99     int foreign_generation,
100     qpdf_offset_t offset,
101     size_t length,
102     QPDFObjectHandle local_dict)
103     :
104     encp(encp),
105     file(file),
106     foreign_objid(foreign_objid),
107     foreign_generation(foreign_generation),
108     offset(offset),
109     length(length),
110     local_dict(local_dict)
111 {
112 }
113 
CopiedStreamDataProvider(QPDF & destination_qpdf)114 QPDF::CopiedStreamDataProvider::CopiedStreamDataProvider(
115     QPDF& destination_qpdf) :
116     QPDFObjectHandle::StreamDataProvider(true),
117     destination_qpdf(destination_qpdf)
118 {
119 }
120 
121 bool
provideStreamData(int objid,int generation,Pipeline * pipeline,bool suppress_warnings,bool will_retry)122 QPDF::CopiedStreamDataProvider::provideStreamData(
123     int objid, int generation, Pipeline* pipeline,
124     bool suppress_warnings, bool will_retry)
125 {
126     PointerHolder<ForeignStreamData> foreign_data =
127         this->foreign_stream_data[QPDFObjGen(objid, generation)];
128     bool result = false;
129     if (foreign_data.getPointer())
130     {
131         result = destination_qpdf.pipeForeignStreamData(
132             foreign_data, pipeline, suppress_warnings, will_retry);
133         QTC::TC("qpdf", "QPDF copy foreign with data",
134                 result ? 0 : 1);
135     }
136     else
137     {
138         QPDFObjectHandle foreign_stream =
139             this->foreign_streams[QPDFObjGen(objid, generation)];
140         result = foreign_stream.pipeStreamData(
141             pipeline, nullptr, 0, qpdf_dl_none,
142             suppress_warnings, will_retry);
143         QTC::TC("qpdf", "QPDF copy foreign with foreign_stream",
144                 result ? 0 : 1);
145     }
146     return result;
147 }
148 
149 void
registerForeignStream(QPDFObjGen const & local_og,QPDFObjectHandle foreign_stream)150 QPDF::CopiedStreamDataProvider::registerForeignStream(
151     QPDFObjGen const& local_og, QPDFObjectHandle foreign_stream)
152 {
153     this->foreign_streams[local_og] = foreign_stream;
154 }
155 
156 void
registerForeignStream(QPDFObjGen const & local_og,PointerHolder<ForeignStreamData> foreign_stream)157 QPDF::CopiedStreamDataProvider::registerForeignStream(
158     QPDFObjGen const& local_og,
159     PointerHolder<ForeignStreamData> foreign_stream)
160 {
161     this->foreign_stream_data[local_og] = foreign_stream;
162 }
163 
StringDecrypter(QPDF * qpdf,int objid,int gen)164 QPDF::StringDecrypter::StringDecrypter(QPDF* qpdf, int objid, int gen) :
165     qpdf(qpdf),
166     objid(objid),
167     gen(gen)
168 {
169 }
170 
171 void
decryptString(std::string & val)172 QPDF::StringDecrypter::decryptString(std::string& val)
173 {
174     qpdf->decryptString(val, objid, gen);
175 }
176 
177 std::string const&
QPDFVersion()178 QPDF::QPDFVersion()
179 {
180     return QPDF::qpdf_version;
181 }
182 
EncryptionParameters()183 QPDF::EncryptionParameters::EncryptionParameters() :
184     encrypted(false),
185     encryption_initialized(false),
186     encryption_V(0),
187     encryption_R(0),
188     encrypt_metadata(true),
189     cf_stream(e_none),
190     cf_string(e_none),
191     cf_file(e_none),
192     cached_key_objid(0),
193     cached_key_generation(0),
194     user_password_matched(false),
195     owner_password_matched(false)
196 {
197 }
198 
Members()199 QPDF::Members::Members() :
200     unique_id(0),
201     file(new InvalidInputSource()),
202     provided_password_is_hex_key(false),
203     ignore_xref_streams(false),
204     suppress_warnings(false),
205     out_stream(&std::cout),
206     err_stream(&std::cerr),
207     attempt_recovery(true),
208     encp(new EncryptionParameters),
209     pushed_inherited_attributes_to_pages(false),
210     copied_stream_data_provider(0),
211     reconstructed_xref(false),
212     fixed_dangling_refs(false),
213     immediate_copy_from(false),
214     in_parse(false),
215     parsed(false),
216     ever_replaced_objects(false),
217     first_xref_item_offset(0),
218     uncompressed_after_compressed(false)
219 {
220 }
221 
~Members()222 QPDF::Members::~Members()
223 {
224 }
225 
QPDF()226 QPDF::QPDF() :
227     m(new Members())
228 {
229     m->tokenizer.allowEOF();
230     // Generate a unique ID. It just has to be unique among all QPDF
231     // objects allocated throughout the lifetime of this running
232     // application.
233     static std::atomic<unsigned long long> unique_id{0};
234     m->unique_id = unique_id.fetch_add(1ULL);
235 }
236 
~QPDF()237 QPDF::~QPDF()
238 {
239     // If two objects are mutually referential (through each object
240     // having an array or dictionary that contains an indirect
241     // reference to the other), the circular references in the
242     // PointerHolder objects will prevent the objects from being
243     // deleted.  Walk through all objects in the object cache, which
244     // is those objects that we read from the file, and break all
245     // resolved references.  At this point, obviously no one is still
246     // using the QPDF object, but we'll explicitly clear the xref
247     // table anyway just to prevent any possibility of resolve()
248     // succeeding.  Note that we can't break references like this at
249     // any time when the QPDF object is active.  If we do, the next
250     // reference will reread the object from the file, which would
251     // have the effect of undoing any modifications that may have been
252     // made to any of the objects.
253     this->m->xref_table.clear();
254     for (std::map<QPDFObjGen, ObjCache>::iterator iter =
255              this->m->obj_cache.begin();
256 	 iter != this->m->obj_cache.end(); ++iter)
257     {
258 	QPDFObject::ObjAccessor::releaseResolved(
259 	    (*iter).second.object.getPointer());
260     }
261 }
262 
263 void
processFile(char const * filename,char const * password)264 QPDF::processFile(char const* filename, char const* password)
265 {
266     FileInputSource* fi = new FileInputSource();
267     fi->setFilename(filename);
268     processInputSource(fi, password);
269 }
270 
271 void
processFile(char const * description,FILE * filep,bool close_file,char const * password)272 QPDF::processFile(char const* description, FILE* filep,
273                   bool close_file, char const* password)
274 {
275     FileInputSource* fi = new FileInputSource();
276     fi->setFile(description, filep, close_file);
277     processInputSource(fi, password);
278 }
279 
280 void
processMemoryFile(char const * description,char const * buf,size_t length,char const * password)281 QPDF::processMemoryFile(char const* description,
282 			char const* buf, size_t length,
283 			char const* password)
284 {
285     processInputSource(
286 	new BufferInputSource(
287             description,
288             new Buffer(QUtil::unsigned_char_pointer(buf), length),
289             true),
290         password);
291 }
292 
293 void
processInputSource(PointerHolder<InputSource> source,char const * password)294 QPDF::processInputSource(PointerHolder<InputSource> source,
295                          char const* password)
296 {
297     this->m->file = source;
298     parse(password);
299 }
300 
301 void
closeInputSource()302 QPDF::closeInputSource()
303 {
304     this->m->file = new InvalidInputSource();
305 }
306 
307 void
setPasswordIsHexKey(bool val)308 QPDF::setPasswordIsHexKey(bool val)
309 {
310     this->m->provided_password_is_hex_key = val;
311 }
312 
313 void
emptyPDF()314 QPDF::emptyPDF()
315 {
316     processMemoryFile("empty PDF", EMPTY_PDF, strlen(EMPTY_PDF));
317 }
318 
319 void
registerStreamFilter(std::string const & filter_name,std::function<std::shared_ptr<QPDFStreamFilter> ()> factory)320 QPDF::registerStreamFilter(
321     std::string const& filter_name,
322     std::function<std::shared_ptr<QPDFStreamFilter> ()> factory)
323 {
324     QPDF_Stream::registerStreamFilter(filter_name, factory);
325 }
326 
327 void
setIgnoreXRefStreams(bool val)328 QPDF::setIgnoreXRefStreams(bool val)
329 {
330     this->m->ignore_xref_streams = val;
331 }
332 
333 void
setOutputStreams(std::ostream * out,std::ostream * err)334 QPDF::setOutputStreams(std::ostream* out, std::ostream* err)
335 {
336     this->m->out_stream = out ? out : &std::cout;
337     this->m->err_stream = err ? err : &std::cerr;
338 }
339 
340 void
setSuppressWarnings(bool val)341 QPDF::setSuppressWarnings(bool val)
342 {
343     this->m->suppress_warnings = val;
344 }
345 
346 void
setAttemptRecovery(bool val)347 QPDF::setAttemptRecovery(bool val)
348 {
349     this->m->attempt_recovery = val;
350 }
351 
352 void
setImmediateCopyFrom(bool val)353 QPDF::setImmediateCopyFrom(bool val)
354 {
355     this->m->immediate_copy_from = val;
356 }
357 
358 std::vector<QPDFExc>
getWarnings()359 QPDF::getWarnings()
360 {
361     std::vector<QPDFExc> result = this->m->warnings;
362     this->m->warnings.clear();
363     return result;
364 }
365 
366 bool
anyWarnings() const367 QPDF::anyWarnings() const
368 {
369     return ! this->m->warnings.empty();
370 }
371 
372 size_t
numWarnings() const373 QPDF::numWarnings() const
374 {
375     return this->m->warnings.size();
376 }
377 
378 bool
findHeader()379 QPDF::findHeader()
380 {
381     qpdf_offset_t global_offset = this->m->file->tell();
382     std::string line = this->m->file->readLine(1024);
383     char const* p = line.c_str();
384     if (strncmp(p, "%PDF-", 5) != 0)
385     {
386         throw std::logic_error("findHeader is not looking at %PDF-");
387     }
388     p += 5;
389     std::string version;
390     // Note: The string returned by line.c_str() is always
391     // null-terminated. The code below never overruns the buffer
392     // because a null character always short-circuits further
393     // advancement.
394     bool valid = QUtil::is_digit(*p);
395     if (valid)
396     {
397         while (QUtil::is_digit(*p))
398         {
399             version.append(1, *p++);
400         }
401         if ((*p == '.') && QUtil::is_digit(*(p+1)))
402         {
403             version.append(1, *p++);
404             while (QUtil::is_digit(*p))
405             {
406                 version.append(1, *p++);
407             }
408         }
409         else
410         {
411             valid = false;
412         }
413     }
414     if (valid)
415     {
416         this->m->pdf_version = version;
417         if (global_offset != 0)
418         {
419             // Empirical evidence strongly suggests that when there is
420             // leading material prior to the PDF header, all explicit
421             // offsets in the file are such that 0 points to the
422             // beginning of the header.
423             QTC::TC("qpdf", "QPDF global offset");
424             this->m->file = new OffsetInputSource(this->m->file, global_offset);
425         }
426     }
427     return valid;
428 }
429 
430 bool
findStartxref()431 QPDF::findStartxref()
432 {
433     QPDFTokenizer::Token t = readToken(this->m->file);
434     if (t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "startxref"))
435     {
436         t = readToken(this->m->file);
437         if (t.getType() == QPDFTokenizer::tt_integer)
438         {
439             // Position in front of offset token
440             this->m->file->seek(this->m->file->getLastOffset(), SEEK_SET);
441             return true;
442         }
443     }
444     return false;
445 }
446 
447 void
parse(char const * password)448 QPDF::parse(char const* password)
449 {
450     if (password)
451     {
452 	this->m->encp->provided_password = password;
453     }
454 
455     // Find the header anywhere in the first 1024 bytes of the file.
456     PatternFinder hf(*this, &QPDF::findHeader);
457     if (! this->m->file->findFirst("%PDF-", 0, 1024, hf))
458     {
459 	QTC::TC("qpdf", "QPDF not a pdf file");
460 	warn(QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
461                      "", 0, "can't find PDF header"));
462         // QPDFWriter writes files that usually require at least
463         // version 1.2 for /FlateDecode
464         this->m->pdf_version = "1.2";
465     }
466 
467     // PDF spec says %%EOF must be found within the last 1024 bytes of
468     // the file.  We add an extra 30 characters to leave room for the
469     // startxref stuff.
470     this->m->file->seek(0, SEEK_END);
471     qpdf_offset_t end_offset = this->m->file->tell();
472     qpdf_offset_t start_offset = (end_offset > 1054 ? end_offset - 1054 : 0);
473     PatternFinder sf(*this, &QPDF::findStartxref);
474     qpdf_offset_t xref_offset = 0;
475     if (this->m->file->findLast("startxref", start_offset, 0, sf))
476     {
477         xref_offset = QUtil::string_to_ll(
478             readToken(this->m->file).getValue().c_str());
479     }
480 
481     try
482     {
483 	if (xref_offset == 0)
484 	{
485 	    QTC::TC("qpdf", "QPDF can't find startxref");
486 	    throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(), "", 0,
487 			  "can't find startxref");
488 	}
489         try
490         {
491             read_xref(xref_offset);
492         }
493         catch (QPDFExc&)
494         {
495             throw;
496         }
497         catch (std::exception& e)
498         {
499 	    throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(), "", 0,
500 			  std::string("error reading xref: ") + e.what());
501 
502         }
503     }
504     catch (QPDFExc& e)
505     {
506 	if (this->m->attempt_recovery)
507 	{
508 	    reconstruct_xref(e);
509 	    QTC::TC("qpdf", "QPDF reconstructed xref table");
510 	}
511 	else
512 	{
513 	    throw e;
514 	}
515     }
516 
517     initializeEncryption();
518     this->m->parsed = true;
519 }
520 
521 void
inParse(bool v)522 QPDF::inParse(bool v)
523 {
524     if (this->m->in_parse == v)
525     {
526         // This happens of QPDFObjectHandle::parseInternal tries to
527         // resolve an indirect object while it is parsing.
528         throw std::logic_error(
529             "QPDF: re-entrant parsing detected. This is a qpdf bug."
530             " Please report at https://github.com/qpdf/qpdf/issues.");
531     }
532     this->m->in_parse = v;
533 }
534 
535 void
warn(QPDFExc const & e)536 QPDF::warn(QPDFExc const& e)
537 {
538     this->m->warnings.push_back(e);
539     if (! this->m->suppress_warnings)
540     {
541 	*this->m->err_stream
542             << "WARNING: "
543             << this->m->warnings.back().what() << std::endl;
544     }
545 }
546 
547 void
setTrailer(QPDFObjectHandle obj)548 QPDF::setTrailer(QPDFObjectHandle obj)
549 {
550     if (this->m->trailer.isInitialized())
551     {
552 	return;
553     }
554     this->m->trailer = obj;
555 }
556 
557 void
reconstruct_xref(QPDFExc & e)558 QPDF::reconstruct_xref(QPDFExc& e)
559 {
560     if (this->m->reconstructed_xref)
561     {
562         // Avoid xref reconstruction infinite loops. This is getting
563         // very hard to reproduce because qpdf is throwing many fewer
564         // exceptions while parsing. Most situations are warnings now.
565         throw e;
566     }
567 
568     this->m->reconstructed_xref = true;
569 
570     warn(QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(), "", 0,
571 		 "file is damaged"));
572     warn(e);
573     warn(QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(), "", 0,
574 		 "Attempting to reconstruct cross-reference table"));
575 
576     // Delete all references to type 1 (uncompressed) objects
577     std::set<QPDFObjGen> to_delete;
578     for (std::map<QPDFObjGen, QPDFXRefEntry>::iterator iter =
579 	     this->m->xref_table.begin();
580 	 iter != this->m->xref_table.end(); ++iter)
581     {
582 	if (((*iter).second).getType() == 1)
583 	{
584 	    to_delete.insert((*iter).first);
585 	}
586     }
587     for (std::set<QPDFObjGen>::iterator iter = to_delete.begin();
588 	 iter != to_delete.end(); ++iter)
589     {
590 	this->m->xref_table.erase(*iter);
591     }
592 
593     this->m->file->seek(0, SEEK_END);
594     qpdf_offset_t eof = this->m->file->tell();
595     this->m->file->seek(0, SEEK_SET);
596     qpdf_offset_t line_start = 0;
597     // Don't allow very long tokens here during recovery.
598     static size_t const MAX_LEN = 100;
599     while (this->m->file->tell() < eof)
600     {
601         this->m->file->findAndSkipNextEOL();
602         qpdf_offset_t next_line_start = this->m->file->tell();
603         this->m->file->seek(line_start, SEEK_SET);
604         QPDFTokenizer::Token t1 = readToken(this->m->file, MAX_LEN);
605         qpdf_offset_t token_start =
606             this->m->file->tell() - toO(t1.getValue().length());
607         if (token_start >= next_line_start)
608         {
609             // don't process yet -- wait until we get to the line
610             // containing this token
611         }
612 	else if (t1.getType() == QPDFTokenizer::tt_integer)
613         {
614             QPDFTokenizer::Token t2 =
615                 readToken(this->m->file, MAX_LEN);
616             QPDFTokenizer::Token t3 =
617                 readToken(this->m->file, MAX_LEN);
618             if ((t2.getType() == QPDFTokenizer::tt_integer) &&
619                 (t3 == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "obj")))
620             {
621                 int obj = QUtil::string_to_int(t1.getValue().c_str());
622                 int gen = QUtil::string_to_int(t2.getValue().c_str());
623                 insertXrefEntry(obj, 1, token_start, gen, true);
624             }
625         }
626         else if ((! this->m->trailer.isInitialized()) &&
627                  (t1 == QPDFTokenizer::Token(
628                      QPDFTokenizer::tt_word, "trailer")))
629         {
630             QPDFObjectHandle t =
631                     readObject(this->m->file, "trailer", 0, 0, false);
632             if (! t.isDictionary())
633             {
634                 // Oh well.  It was worth a try.
635             }
636             else
637             {
638                 setTrailer(t);
639             }
640 	}
641         this->m->file->seek(next_line_start, SEEK_SET);
642         line_start = next_line_start;
643     }
644 
645     if (! this->m->trailer.isInitialized())
646     {
647 	// We could check the last encountered object to see if it was
648 	// an xref stream.  If so, we could try to get the trailer
649 	// from there.  This may make it possible to recover files
650 	// with bad startxref pointers even when they have object
651 	// streams.
652 
653 	throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(), "", 0,
654 		      "unable to find trailer "
655 		      "dictionary while recovering damaged file");
656     }
657 
658     // We could iterate through the objects looking for streams and
659     // try to find objects inside of them, but it's probably not worth
660     // the trouble.  Acrobat can't recover files with any errors in an
661     // xref stream, and this would be a real long shot anyway.  If we
662     // wanted to do anything that involved looking at stream contents,
663     // we'd also have to call initializeEncryption() here.  It's safe
664     // to call it more than once.
665 }
666 
667 void
read_xref(qpdf_offset_t xref_offset)668 QPDF::read_xref(qpdf_offset_t xref_offset)
669 {
670     std::map<int, int> free_table;
671     std::set<qpdf_offset_t> visited;
672     while (xref_offset)
673     {
674         visited.insert(xref_offset);
675         char buf[7];
676         memset(buf, 0, sizeof(buf));
677 	this->m->file->seek(xref_offset, SEEK_SET);
678         // Some files miss the mark a little with startxref. We could
679         // do a better job of searching in the neighborhood for
680         // something that looks like either an xref table or stream,
681         // but the simple heuristic of skipping whitespace can help
682         // with the xref table case and is harmless with the stream
683         // case.
684         bool done = false;
685         bool skipped_space = false;
686         while (! done)
687         {
688             char ch;
689             if (1 == this->m->file->read(&ch, 1))
690             {
691                 if (QUtil::is_space(ch))
692                 {
693                     skipped_space = true;
694                 }
695                 else
696                 {
697                     this->m->file->unreadCh(ch);
698                     done = true;
699                 }
700             }
701             else
702             {
703                 QTC::TC("qpdf", "QPDF eof skipping spaces before xref",
704                         skipped_space ? 0 : 1);
705                 done = true;
706             }
707         }
708 
709 	this->m->file->read(buf, sizeof(buf) - 1);
710         // The PDF spec says xref must be followed by a line
711         // terminator, but files exist in the wild where it is
712         // terminated by arbitrary whitespace.
713         if ((strncmp(buf, "xref", 4) == 0) &&
714             QUtil::is_space(buf[4]))
715 	{
716             if (skipped_space)
717             {
718                 QTC::TC("qpdf", "QPDF xref skipped space");
719                 warn(QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
720                              "", 0,
721                              "extraneous whitespace seen before xref"));
722             }
723             QTC::TC("qpdf", "QPDF xref space",
724                     ((buf[4] == '\n') ? 0 :
725                      (buf[4] == '\r') ? 1 :
726                      (buf[4] == ' ') ? 2 : 9999));
727             int skip = 4;
728             // buf is null-terminated, and QUtil::is_space('\0') is
729             // false, so this won't overrun.
730             while (QUtil::is_space(buf[skip]))
731             {
732                 ++skip;
733             }
734             xref_offset = read_xrefTable(xref_offset + skip);
735 	}
736 	else
737 	{
738 	    xref_offset = read_xrefStream(xref_offset);
739 	}
740         if (visited.count(xref_offset) != 0)
741         {
742             QTC::TC("qpdf", "QPDF xref loop");
743             throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(), "", 0,
744                           "loop detected following xref tables");
745         }
746     }
747 
748     if (! this->m->trailer.isInitialized())
749     {
750         throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(), "", 0,
751                       "unable to find trailer while reading xref");
752     }
753     int size = this->m->trailer.getKey("/Size").getIntValueAsInt();
754     int max_obj = 0;
755     if (! this->m->xref_table.empty())
756     {
757 	max_obj = (*(this->m->xref_table.rbegin())).first.getObj();
758     }
759     if (! this->m->deleted_objects.empty())
760     {
761 	max_obj = std::max(max_obj, *(this->m->deleted_objects.rbegin()));
762     }
763     if ((size < 1) || (size - 1 != max_obj))
764     {
765 	QTC::TC("qpdf", "QPDF xref size mismatch");
766 	warn(QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(), "", 0,
767 		     std::string("reported number of objects (") +
768 		     QUtil::int_to_string(size) +
769 		     ") is not one plus the highest object number (" +
770 		     QUtil::int_to_string(max_obj) + ")"));
771     }
772 
773     // We no longer need the deleted_objects table, so go ahead and
774     // clear it out to make sure we never depend on its being set.
775     this->m->deleted_objects.clear();
776 }
777 
778 bool
parse_xrefFirst(std::string const & line,int & obj,int & num,int & bytes)779 QPDF::parse_xrefFirst(std::string const& line,
780                       int& obj, int& num, int& bytes)
781 {
782     // is_space and is_digit both return false on '\0', so this will
783     // not overrun the null-terminated buffer.
784     char const* p = line.c_str();
785     char const* start = line.c_str();
786 
787     // Skip zero or more spaces
788     while (QUtil::is_space(*p))
789     {
790         ++p;
791     }
792     // Require digit
793     if (! QUtil::is_digit(*p))
794     {
795         return false;
796     }
797     // Gather digits
798     std::string obj_str;
799     while (QUtil::is_digit(*p))
800     {
801         obj_str.append(1, *p++);
802     }
803     // Require space
804     if (! QUtil::is_space(*p))
805     {
806         return false;
807     }
808     // Skip spaces
809     while (QUtil::is_space(*p))
810     {
811         ++p;
812     }
813     // Require digit
814     if (! QUtil::is_digit(*p))
815     {
816         return false;
817     }
818     // Gather digits
819     std::string num_str;
820     while (QUtil::is_digit(*p))
821     {
822         num_str.append(1, *p++);
823     }
824     // Skip any space including line terminators
825     while (QUtil::is_space(*p))
826     {
827         ++p;
828     }
829     bytes = toI(p - start);
830     obj = QUtil::string_to_int(obj_str.c_str());
831     num = QUtil::string_to_int(num_str.c_str());
832     return true;
833 }
834 
835 bool
parse_xrefEntry(std::string const & line,qpdf_offset_t & f1,int & f2,char & type)836 QPDF::parse_xrefEntry(std::string const& line,
837                       qpdf_offset_t& f1, int& f2, char& type)
838 {
839     // is_space and is_digit both return false on '\0', so this will
840     // not overrun the null-terminated buffer.
841     char const* p = line.c_str();
842 
843     // Skip zero or more spaces. There aren't supposed to be any.
844     bool invalid = false;
845     while (QUtil::is_space(*p))
846     {
847         ++p;
848         QTC::TC("qpdf", "QPDF ignore first space in xref entry");
849         invalid = true;
850     }
851     // Require digit
852     if (! QUtil::is_digit(*p))
853     {
854         return false;
855     }
856     // Gather digits
857     std::string f1_str;
858     while (QUtil::is_digit(*p))
859     {
860         f1_str.append(1, *p++);
861     }
862     // Require space
863     if (! QUtil::is_space(*p))
864     {
865         return false;
866     }
867     if (QUtil::is_space(*(p+1)))
868     {
869         QTC::TC("qpdf", "QPDF ignore first extra space in xref entry");
870         invalid = true;
871     }
872     // Skip spaces
873     while (QUtil::is_space(*p))
874     {
875         ++p;
876     }
877     // Require digit
878     if (! QUtil::is_digit(*p))
879     {
880         return false;
881     }
882     // Gather digits
883     std::string f2_str;
884     while (QUtil::is_digit(*p))
885     {
886         f2_str.append(1, *p++);
887     }
888     // Require space
889     if (! QUtil::is_space(*p))
890     {
891         return false;
892     }
893     if (QUtil::is_space(*(p+1)))
894     {
895         QTC::TC("qpdf", "QPDF ignore second extra space in xref entry");
896         invalid = true;
897     }
898     // Skip spaces
899     while (QUtil::is_space(*p))
900     {
901         ++p;
902     }
903     if ((*p == 'f') || (*p == 'n'))
904     {
905         type = *p;
906     }
907     else
908     {
909         return false;
910     }
911     if ((f1_str.length() != 10) || (f2_str.length() != 5))
912     {
913         QTC::TC("qpdf", "QPDF ignore length error xref entry");
914         invalid = true;
915     }
916 
917     if (invalid)
918     {
919         warn(QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
920                      "xref table",
921                      this->m->file->getLastOffset(),
922                      "accepting invalid xref table entry"));
923     }
924 
925     f1 = QUtil::string_to_ll(f1_str.c_str());
926     f2 = QUtil::string_to_int(f2_str.c_str());
927 
928     return true;
929 }
930 
931 qpdf_offset_t
read_xrefTable(qpdf_offset_t xref_offset)932 QPDF::read_xrefTable(qpdf_offset_t xref_offset)
933 {
934     std::vector<QPDFObjGen> deleted_items;
935 
936     this->m->file->seek(xref_offset, SEEK_SET);
937     bool done = false;
938     while (! done)
939     {
940         char linebuf[51];
941         memset(linebuf, 0, sizeof(linebuf));
942         this->m->file->read(linebuf, sizeof(linebuf) - 1);
943 	std::string line = linebuf;
944         int obj = 0;
945         int num = 0;
946         int bytes = 0;
947         if (! parse_xrefFirst(line, obj, num, bytes))
948 	{
949 	    QTC::TC("qpdf", "QPDF invalid xref");
950 	    throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
951 			  "xref table", this->m->file->getLastOffset(),
952 			  "xref syntax invalid");
953 	}
954         this->m->file->seek(this->m->file->getLastOffset() + bytes, SEEK_SET);
955 	for (qpdf_offset_t i = obj; i - num < obj; ++i)
956 	{
957 	    if (i == 0)
958 	    {
959 		// This is needed by checkLinearization()
960 		this->m->first_xref_item_offset = this->m->file->tell();
961 	    }
962 	    std::string xref_entry = this->m->file->readLine(30);
963             // For xref_table, these will always be small enough to be ints
964 	    qpdf_offset_t f1 = 0;
965 	    int f2 = 0;
966 	    char type = '\0';
967             if (! parse_xrefEntry(xref_entry, f1, f2, type))
968 	    {
969 		QTC::TC("qpdf", "QPDF invalid xref entry");
970 		throw QPDFExc(
971 		    qpdf_e_damaged_pdf, this->m->file->getName(),
972 		    "xref table", this->m->file->getLastOffset(),
973 		    "invalid xref entry (obj=" +
974 		    QUtil::int_to_string(i) + ")");
975 	    }
976 	    if (type == 'f')
977 	    {
978 		// Save deleted items until after we've checked the
979 		// XRefStm, if any.
980 		deleted_items.push_back(QPDFObjGen(toI(i), f2));
981 	    }
982 	    else
983 	    {
984 		insertXrefEntry(toI(i), 1, f1, f2);
985 	    }
986 	}
987 	qpdf_offset_t pos = this->m->file->tell();
988 	QPDFTokenizer::Token t = readToken(this->m->file);
989 	if (t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "trailer"))
990 	{
991 	    done = true;
992 	}
993 	else
994 	{
995 	    this->m->file->seek(pos, SEEK_SET);
996 	}
997     }
998 
999     // Set offset to previous xref table if any
1000     QPDFObjectHandle cur_trailer =
1001 	readObject(this->m->file, "trailer", 0, 0, false);
1002     if (! cur_trailer.isDictionary())
1003     {
1004 	QTC::TC("qpdf", "QPDF missing trailer");
1005 	throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
1006 		      "", this->m->file->getLastOffset(),
1007 		      "expected trailer dictionary");
1008     }
1009 
1010     if (! this->m->trailer.isInitialized())
1011     {
1012 	setTrailer(cur_trailer);
1013 
1014 	if (! this->m->trailer.hasKey("/Size"))
1015 	{
1016 	    QTC::TC("qpdf", "QPDF trailer lacks size");
1017 	    throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
1018 			  "trailer", this->m->file->getLastOffset(),
1019 			  "trailer dictionary lacks /Size key");
1020 	}
1021 	if (! this->m->trailer.getKey("/Size").isInteger())
1022 	{
1023 	    QTC::TC("qpdf", "QPDF trailer size not integer");
1024 	    throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
1025 			  "trailer", this->m->file->getLastOffset(),
1026 			  "/Size key in trailer dictionary is not "
1027 			  "an integer");
1028 	}
1029     }
1030 
1031     if (cur_trailer.hasKey("/XRefStm"))
1032     {
1033 	if (this->m->ignore_xref_streams)
1034 	{
1035 	    QTC::TC("qpdf", "QPDF ignoring XRefStm in trailer");
1036 	}
1037 	else
1038 	{
1039 	    if (cur_trailer.getKey("/XRefStm").isInteger())
1040 	    {
1041 		// Read the xref stream but disregard any return value
1042 		// -- we'll use our trailer's /Prev key instead of the
1043 		// xref stream's.
1044 		(void) read_xrefStream(
1045 		    cur_trailer.getKey("/XRefStm").getIntValue());
1046 	    }
1047 	    else
1048 	    {
1049 		throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
1050 			      "xref stream", xref_offset,
1051 			      "invalid /XRefStm");
1052 	    }
1053 	}
1054     }
1055 
1056     // Handle any deleted items now that we've read the /XRefStm.
1057     for (std::vector<QPDFObjGen>::iterator iter = deleted_items.begin();
1058 	 iter != deleted_items.end(); ++iter)
1059     {
1060 	QPDFObjGen& og = *iter;
1061 	insertXrefEntry(og.getObj(), 0, 0, og.getGen());
1062     }
1063 
1064     if (cur_trailer.hasKey("/Prev"))
1065     {
1066 	if (! cur_trailer.getKey("/Prev").isInteger())
1067 	{
1068 	    QTC::TC("qpdf", "QPDF trailer prev not integer");
1069 	    throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
1070 			  "trailer", this->m->file->getLastOffset(),
1071 			  "/Prev key in trailer dictionary is not "
1072 			  "an integer");
1073 	}
1074 	QTC::TC("qpdf", "QPDF prev key in trailer dictionary");
1075 	xref_offset = cur_trailer.getKey("/Prev").getIntValue();
1076     }
1077     else
1078     {
1079 	xref_offset = 0;
1080     }
1081 
1082     return xref_offset;
1083 }
1084 
1085 qpdf_offset_t
read_xrefStream(qpdf_offset_t xref_offset)1086 QPDF::read_xrefStream(qpdf_offset_t xref_offset)
1087 {
1088     bool found = false;
1089     if (! this->m->ignore_xref_streams)
1090     {
1091 	int xobj;
1092 	int xgen;
1093 	QPDFObjectHandle xref_obj;
1094 	try
1095 	{
1096 	    xref_obj = readObjectAtOffset(
1097 		false, xref_offset, "xref stream", -1, 0, xobj, xgen);
1098 	}
1099 	catch (QPDFExc&)
1100 	{
1101 	    // ignore -- report error below
1102 	}
1103 	if (xref_obj.isInitialized() &&
1104 	    xref_obj.isStream() &&
1105 	    xref_obj.getDict().getKey("/Type").isName() &&
1106 	    xref_obj.getDict().getKey("/Type").getName() == "/XRef")
1107 	{
1108 	    QTC::TC("qpdf", "QPDF found xref stream");
1109 	    found = true;
1110 	    xref_offset = processXRefStream(xref_offset, xref_obj);
1111 	}
1112     }
1113 
1114     if (! found)
1115     {
1116 	QTC::TC("qpdf", "QPDF can't find xref");
1117 	throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
1118 		      "", xref_offset, "xref not found");
1119     }
1120 
1121     return xref_offset;
1122 }
1123 
1124 qpdf_offset_t
processXRefStream(qpdf_offset_t xref_offset,QPDFObjectHandle & xref_obj)1125 QPDF::processXRefStream(qpdf_offset_t xref_offset, QPDFObjectHandle& xref_obj)
1126 {
1127     QPDFObjectHandle dict = xref_obj.getDict();
1128     QPDFObjectHandle W_obj = dict.getKey("/W");
1129     QPDFObjectHandle Index_obj = dict.getKey("/Index");
1130     if (! (W_obj.isArray() &&
1131 	   (W_obj.getArrayNItems() >= 3) &&
1132 	   W_obj.getArrayItem(0).isInteger() &&
1133 	   W_obj.getArrayItem(1).isInteger() &&
1134 	   W_obj.getArrayItem(2).isInteger() &&
1135 	   dict.getKey("/Size").isInteger() &&
1136 	   (Index_obj.isArray() || Index_obj.isNull())))
1137     {
1138 	throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
1139 		      "xref stream", xref_offset,
1140 		      "Cross-reference stream does not have"
1141 		      " proper /W and /Index keys");
1142     }
1143 
1144     int W[3];
1145     size_t entry_size = 0;
1146     int max_bytes = sizeof(qpdf_offset_t);
1147     for (int i = 0; i < 3; ++i)
1148     {
1149 	W[i] = W_obj.getArrayItem(i).getIntValueAsInt();
1150         if (W[i] > max_bytes)
1151         {
1152             throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
1153                           "xref stream", xref_offset,
1154                           "Cross-reference stream's /W contains"
1155                           " impossibly large values");
1156         }
1157 	entry_size += toS(W[i]);
1158     }
1159     if (entry_size == 0)
1160     {
1161         throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
1162                       "xref stream", xref_offset,
1163                       "Cross-reference stream's /W indicates"
1164                       " entry size of 0");
1165     }
1166     unsigned long long max_num_entries =
1167         static_cast<unsigned long long>(-1) / entry_size;
1168 
1169     std::vector<long long> indx;
1170     if (Index_obj.isArray())
1171     {
1172 	int n_index = Index_obj.getArrayNItems();
1173 	if ((n_index % 2) || (n_index < 2))
1174 	{
1175 	    throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
1176 			  "xref stream", xref_offset,
1177 			  "Cross-reference stream's /Index has an"
1178 			  " invalid number of values");
1179 	}
1180 	for (int i = 0; i < n_index; ++i)
1181 	{
1182 	    if (Index_obj.getArrayItem(i).isInteger())
1183 	    {
1184 		indx.push_back(Index_obj.getArrayItem(i).getIntValue());
1185 	    }
1186 	    else
1187 	    {
1188 		throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
1189 			      "xref stream", xref_offset,
1190 			      "Cross-reference stream's /Index's item " +
1191 			      QUtil::int_to_string(i) +
1192 			      " is not an integer");
1193 	    }
1194 	}
1195 	QTC::TC("qpdf", "QPDF xref /Index is array",
1196 		n_index == 2 ? 0 : 1);
1197     }
1198     else
1199     {
1200 	QTC::TC("qpdf", "QPDF xref /Index is null");
1201 	long long size = dict.getKey("/Size").getIntValue();
1202 	indx.push_back(0);
1203 	indx.push_back(size);
1204     }
1205 
1206     size_t num_entries = 0;
1207     for (size_t i = 1; i < indx.size(); i += 2)
1208     {
1209         if (indx.at(i) > QIntC::to_longlong(max_num_entries - num_entries))
1210         {
1211             throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
1212                           "xref stream", xref_offset,
1213                           "Cross-reference stream claims to contain"
1214                           " too many entries: " +
1215                           QUtil::int_to_string(indx.at(i)) + " " +
1216                           QUtil::uint_to_string(max_num_entries) + " " +
1217                           QUtil::uint_to_string(num_entries));
1218         }
1219 	num_entries += toS(indx.at(i));
1220     }
1221 
1222     // entry_size and num_entries have both been validated to ensure
1223     // that this multiplication does not cause an overflow.
1224     size_t expected_size = entry_size * num_entries;
1225 
1226     PointerHolder<Buffer> bp = xref_obj.getStreamData(qpdf_dl_specialized);
1227     size_t actual_size = bp->getSize();
1228 
1229     if (expected_size != actual_size)
1230     {
1231 	QPDFExc x(qpdf_e_damaged_pdf, this->m->file->getName(),
1232 		  "xref stream", xref_offset,
1233 		  "Cross-reference stream data has the wrong size;"
1234 		  " expected = " + QUtil::uint_to_string(expected_size) +
1235 		  "; actual = " + QUtil::uint_to_string(actual_size));
1236 	if (expected_size > actual_size)
1237 	{
1238 	    throw x;
1239 	}
1240 	else
1241 	{
1242 	    warn(x);
1243 	}
1244     }
1245 
1246     size_t cur_chunk = 0;
1247     int chunk_count = 0;
1248 
1249     bool saw_first_compressed_object = false;
1250 
1251     // Actual size vs. expected size check above ensures that we will
1252     // not overflow any buffers here.  We know that entry_size *
1253     // num_entries is equal to the size of the buffer.
1254     unsigned char const* data = bp->getBuffer();
1255     for (size_t i = 0; i < num_entries; ++i)
1256     {
1257 	// Read this entry
1258 	unsigned char const* entry = data + (entry_size * i);
1259 	qpdf_offset_t fields[3];
1260 	unsigned char const* p = entry;
1261 	for (int j = 0; j < 3; ++j)
1262 	{
1263 	    fields[j] = 0;
1264 	    if ((j == 0) && (W[0] == 0))
1265 	    {
1266 		QTC::TC("qpdf", "QPDF default for xref stream field 0");
1267 		fields[0] = 1;
1268 	    }
1269 	    for (int k = 0; k < W[j]; ++k)
1270 	    {
1271 		fields[j] <<= 8;
1272 		fields[j] += toI(*p++);
1273 	    }
1274 	}
1275 
1276 	// Get the object and generation number.  The object number is
1277 	// based on /Index.  The generation number is 0 unless this is
1278 	// an uncompressed object record, in which case the generation
1279 	// number appears as the third field.
1280 	int obj = toI(indx.at(cur_chunk));
1281         if ((obj < 0) ||
1282             ((std::numeric_limits<int>::max() - obj) < chunk_count))
1283         {
1284             std::ostringstream msg;
1285             msg.imbue(std::locale::classic());
1286             msg << "adding " << chunk_count << " to " << obj
1287                 << " while computing index in xref stream would cause"
1288                 << " an integer overflow";
1289             throw std::range_error(msg.str());
1290         }
1291         obj += chunk_count;
1292 	++chunk_count;
1293 	if (chunk_count >= indx.at(cur_chunk + 1))
1294 	{
1295 	    cur_chunk += 2;
1296 	    chunk_count = 0;
1297 	}
1298 
1299 	if (saw_first_compressed_object)
1300 	{
1301 	    if (fields[0] != 2)
1302 	    {
1303 		this->m->uncompressed_after_compressed = true;
1304 	    }
1305 	}
1306 	else if (fields[0] == 2)
1307 	{
1308 	    saw_first_compressed_object = true;
1309 	}
1310 	if (obj == 0)
1311 	{
1312 	    // This is needed by checkLinearization()
1313 	    this->m->first_xref_item_offset = xref_offset;
1314 	}
1315         if (fields[0] == 0)
1316         {
1317             // Ignore fields[2], which we don't care about in this
1318             // case. This works around the issue of some PDF files
1319             // that put invalid values, like -1, here for deleted
1320             // objects.
1321             fields[2] = 0;
1322         }
1323 	insertXrefEntry(obj, toI(fields[0]),
1324                         fields[1], toI(fields[2]));
1325     }
1326 
1327     if (! this->m->trailer.isInitialized())
1328     {
1329 	setTrailer(dict);
1330     }
1331 
1332     if (dict.hasKey("/Prev"))
1333     {
1334 	if (! dict.getKey("/Prev").isInteger())
1335 	{
1336 	    throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
1337 			  "xref stream", this->m->file->getLastOffset(),
1338 			  "/Prev key in xref stream dictionary is not "
1339 			  "an integer");
1340 	}
1341 	QTC::TC("qpdf", "QPDF prev key in xref stream dictionary");
1342 	xref_offset = dict.getKey("/Prev").getIntValue();
1343     }
1344     else
1345     {
1346 	xref_offset = 0;
1347     }
1348 
1349     return xref_offset;
1350 }
1351 
1352 void
insertXrefEntry(int obj,int f0,qpdf_offset_t f1,int f2,bool overwrite)1353 QPDF::insertXrefEntry(int obj, int f0, qpdf_offset_t f1, int f2, bool overwrite)
1354 {
1355     // Populate the xref table in such a way that the first reference
1356     // to an object that we see, which is the one in the latest xref
1357     // table in which it appears, is the one that gets stored.  This
1358     // works because we are reading more recent appends before older
1359     // ones.  Exception: if overwrite is true, then replace any
1360     // existing object.  This is used in xref recovery mode, which
1361     // reads the file from beginning to end.
1362 
1363     // If there is already an entry for this object and generation in
1364     // the table, it means that a later xref table has registered this
1365     // object.  Disregard this one.
1366     { // private scope
1367 	int gen = (f0 == 2 ? 0 : f2);
1368 	QPDFObjGen og(obj, gen);
1369 	if (this->m->xref_table.count(og))
1370 	{
1371 	    if (overwrite)
1372 	    {
1373 		QTC::TC("qpdf", "QPDF xref overwrite object");
1374 		this->m->xref_table.erase(og);
1375 	    }
1376 	    else
1377 	    {
1378 		QTC::TC("qpdf", "QPDF xref reused object");
1379 		return;
1380 	    }
1381 	}
1382 	if (this->m->deleted_objects.count(obj))
1383 	{
1384 	    QTC::TC("qpdf", "QPDF xref deleted object");
1385 	    return;
1386 	}
1387     }
1388 
1389     switch (f0)
1390     {
1391       case 0:
1392 	this->m->deleted_objects.insert(obj);
1393 	break;
1394 
1395       case 1:
1396 	// f2 is generation
1397 	QTC::TC("qpdf", "QPDF xref gen > 0", ((f2 > 0) ? 1 : 0));
1398 	this->m->xref_table[QPDFObjGen(obj, f2)] = QPDFXRefEntry(f0, f1, f2);
1399 	break;
1400 
1401       case 2:
1402 	this->m->xref_table[QPDFObjGen(obj, 0)] = QPDFXRefEntry(f0, f1, f2);
1403 	break;
1404 
1405       default:
1406 	throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
1407 		      "xref stream", this->m->file->getLastOffset(),
1408 		      "unknown xref stream entry type " +
1409 		      QUtil::int_to_string(f0));
1410 	break;
1411     }
1412 }
1413 
1414 void
showXRefTable()1415 QPDF::showXRefTable()
1416 {
1417     for (std::map<QPDFObjGen, QPDFXRefEntry>::iterator iter =
1418 	     this->m->xref_table.begin();
1419 	 iter != this->m->xref_table.end(); ++iter)
1420     {
1421 	QPDFObjGen const& og = (*iter).first;
1422 	QPDFXRefEntry const& entry = (*iter).second;
1423 	*this->m->out_stream << og.getObj() << "/" << og.getGen() << ": ";
1424 	switch (entry.getType())
1425 	{
1426 	  case 1:
1427 	    *this->m->out_stream
1428                 << "uncompressed; offset = " << entry.getOffset();
1429 	    break;
1430 
1431 	  case 2:
1432 	    *this->m->out_stream
1433                 << "compressed; stream = "
1434                 << entry.getObjStreamNumber()
1435                 << ", index = " << entry.getObjStreamIndex();
1436 	    break;
1437 
1438 	  default:
1439 	    throw std::logic_error("unknown cross-reference table type while"
1440 				   " showing xref_table");
1441 	    break;
1442 	}
1443 	*this->m->out_stream << std::endl;
1444     }
1445 }
1446 
1447 void
fixDanglingReferences(bool force)1448 QPDF::fixDanglingReferences(bool force)
1449 {
1450     if (this->m->fixed_dangling_refs && (! force))
1451     {
1452         return;
1453     }
1454     this->m->fixed_dangling_refs = true;
1455 
1456     // Create a set of all known indirect objects including those
1457     // we've previously resolved and those that we have created.
1458     std::set<QPDFObjGen> to_process;
1459     for (std::map<QPDFObjGen, ObjCache>::iterator iter =
1460 	     this->m->obj_cache.begin();
1461 	 iter != this->m->obj_cache.end(); ++iter)
1462     {
1463 	to_process.insert((*iter).first);
1464     }
1465     for (std::map<QPDFObjGen, QPDFXRefEntry>::iterator iter =
1466 	     this->m->xref_table.begin();
1467 	 iter != this->m->xref_table.end(); ++iter)
1468     {
1469 	to_process.insert((*iter).first);
1470     }
1471 
1472     // For each non-scalar item to process, put it in the queue.
1473     std::list<QPDFObjectHandle> queue;
1474     queue.push_back(this->m->trailer);
1475     for (std::set<QPDFObjGen>::iterator iter = to_process.begin();
1476          iter != to_process.end(); ++iter)
1477     {
1478         QPDFObjectHandle obj = QPDFObjectHandle::Factory::newIndirect(
1479             this, (*iter).getObj(), (*iter).getGen());
1480         if (obj.isDictionary() || obj.isArray())
1481         {
1482             queue.push_back(obj);
1483         }
1484         else if (obj.isStream())
1485         {
1486             queue.push_back(obj.getDict());
1487         }
1488     }
1489 
1490     // Process the queue by recursively resolving all object
1491     // references. We don't need to do loop detection because we don't
1492     // traverse known indirect objects when processing the queue.
1493     while (! queue.empty())
1494     {
1495         QPDFObjectHandle obj = queue.front();
1496         queue.pop_front();
1497         std::list<QPDFObjectHandle> to_check;
1498         if (obj.isDictionary())
1499         {
1500             std::map<std::string, QPDFObjectHandle> members =
1501                 obj.getDictAsMap();
1502             for (std::map<std::string, QPDFObjectHandle>::iterator iter =
1503                      members.begin();
1504                  iter != members.end(); ++iter)
1505             {
1506                 to_check.push_back((*iter).second);
1507             }
1508         }
1509         else if (obj.isArray())
1510         {
1511             QPDF_Array* arr =
1512                 dynamic_cast<QPDF_Array*>(
1513                     QPDFObjectHandle::ObjAccessor::getObject(obj).getPointer());
1514             arr->addExplicitElementsToList(to_check);
1515         }
1516         for (std::list<QPDFObjectHandle>::iterator iter = to_check.begin();
1517              iter != to_check.end(); ++iter)
1518         {
1519             QPDFObjectHandle sub = *iter;
1520             if (sub.isIndirect())
1521             {
1522                 if (sub.getOwningQPDF() == this)
1523                 {
1524                     QPDFObjGen og(sub.getObjGen());
1525                     if (this->m->obj_cache.count(og) == 0)
1526                     {
1527                         QTC::TC("qpdf", "QPDF detected dangling ref");
1528                         queue.push_back(sub);
1529                     }
1530                 }
1531             }
1532             else
1533             {
1534                 queue.push_back(sub);
1535             }
1536         }
1537     }
1538 }
1539 
1540 size_t
getObjectCount()1541 QPDF::getObjectCount()
1542 {
1543     // This method returns the next available indirect object number.
1544     // makeIndirectObject uses it for this purpose. After
1545     // fixDanglingReferences is called, all objects in the xref table
1546     // will also be in obj_cache.
1547     fixDanglingReferences();
1548     QPDFObjGen og(0, 0);
1549     if (! this->m->obj_cache.empty())
1550     {
1551 	og = (*(this->m->obj_cache.rbegin())).first;
1552     }
1553     return toS(og.getObj());
1554 }
1555 
1556 std::vector<QPDFObjectHandle>
getAllObjects()1557 QPDF::getAllObjects()
1558 {
1559     // After fixDanglingReferences is called, all objects are in the
1560     // object cache.
1561     fixDanglingReferences(true);
1562     std::vector<QPDFObjectHandle> result;
1563     for (std::map<QPDFObjGen, ObjCache>::iterator iter =
1564 	     this->m->obj_cache.begin();
1565 	 iter != this->m->obj_cache.end(); ++iter)
1566     {
1567 
1568 	QPDFObjGen const& og = (*iter).first;
1569         result.push_back(QPDFObjectHandle::Factory::newIndirect(
1570                              this, og.getObj(), og.getGen()));
1571     }
1572     return result;
1573 }
1574 
1575 void
setLastObjectDescription(std::string const & description,int objid,int generation)1576 QPDF::setLastObjectDescription(std::string const& description,
1577 			       int objid, int generation)
1578 {
1579     this->m->last_object_description.clear();
1580     if (! description.empty())
1581     {
1582 	this->m->last_object_description += description;
1583 	if (objid > 0)
1584 	{
1585 	    this->m->last_object_description += ": ";
1586 	}
1587     }
1588     if (objid > 0)
1589     {
1590 	this->m->last_object_description += "object " +
1591 	    QUtil::int_to_string(objid) + " " +
1592 	    QUtil::int_to_string(generation);
1593     }
1594 }
1595 
1596 QPDFObjectHandle
readObject(PointerHolder<InputSource> input,std::string const & description,int objid,int generation,bool in_object_stream)1597 QPDF::readObject(PointerHolder<InputSource> input,
1598 		 std::string const& description,
1599 		 int objid, int generation, bool in_object_stream)
1600 {
1601     setLastObjectDescription(description, objid, generation);
1602     qpdf_offset_t offset = input->tell();
1603 
1604     bool empty = false;
1605     PointerHolder<StringDecrypter> decrypter_ph;
1606     StringDecrypter* decrypter = 0;
1607     if (this->m->encp->encrypted && (! in_object_stream))
1608     {
1609         decrypter_ph = new StringDecrypter(this, objid, generation);
1610         decrypter = decrypter_ph.getPointer();
1611     }
1612     QPDFObjectHandle object = QPDFObjectHandle::parse(
1613         input, this->m->last_object_description,
1614         this->m->tokenizer, empty, decrypter, this);
1615     if (empty)
1616     {
1617         // Nothing in the PDF spec appears to allow empty objects, but
1618         // they have been encountered in actual PDF files and Adobe
1619         // Reader appears to ignore them.
1620         warn(QPDFExc(qpdf_e_damaged_pdf, input->getName(),
1621                      this->m->last_object_description,
1622                      input->getLastOffset(),
1623                      "empty object treated as null"));
1624     }
1625     else if (object.isDictionary() && (! in_object_stream))
1626     {
1627         // check for stream
1628         qpdf_offset_t cur_offset = input->tell();
1629         if (readToken(input) ==
1630             QPDFTokenizer::Token(QPDFTokenizer::tt_word, "stream"))
1631         {
1632             // The PDF specification states that the word "stream"
1633             // should be followed by either a carriage return and
1634             // a newline or by a newline alone.  It specifically
1635             // disallowed following it by a carriage return alone
1636             // since, in that case, there would be no way to tell
1637             // whether the NL in a CR NL sequence was part of the
1638             // stream data.  However, some readers, including
1639             // Adobe reader, accept a carriage return by itself
1640             // when followed by a non-newline character, so that's
1641             // what we do here. We have also seen files that have
1642             // extraneous whitespace between the stream keyword and
1643             // the newline.
1644             bool done = false;
1645             while (! done)
1646             {
1647                 done = true;
1648                 char ch;
1649                 if (input->read(&ch, 1) == 0)
1650                 {
1651                     // A premature EOF here will result in some
1652                     // other problem that will get reported at
1653                     // another time.
1654                 }
1655                 else if (ch == '\n')
1656                 {
1657                     // ready to read stream data
1658                     QTC::TC("qpdf", "QPDF stream with NL only");
1659                 }
1660                 else if (ch == '\r')
1661                 {
1662                     // Read another character
1663                     if (input->read(&ch, 1) != 0)
1664                     {
1665                         if (ch == '\n')
1666                         {
1667                             // Ready to read stream data
1668                             QTC::TC("qpdf", "QPDF stream with CRNL");
1669                         }
1670                         else
1671                         {
1672                             // Treat the \r by itself as the
1673                             // whitespace after endstream and
1674                             // start reading stream data in spite
1675                             // of not having seen a newline.
1676                             QTC::TC("qpdf", "QPDF stream with CR only");
1677                             input->unreadCh(ch);
1678                             warn(QPDFExc(
1679                                      qpdf_e_damaged_pdf,
1680                                      input->getName(),
1681                                      this->m->last_object_description,
1682                                      input->tell(),
1683                                      "stream keyword followed"
1684                                      " by carriage return only"));
1685                         }
1686                     }
1687                 }
1688                 else if (QUtil::is_space(ch))
1689                 {
1690                     warn(QPDFExc(
1691                              qpdf_e_damaged_pdf,
1692                              input->getName(),
1693                              this->m->last_object_description,
1694                              input->tell(),
1695                              "stream keyword followed by"
1696                              " extraneous whitespace"));
1697                     done = false;
1698                 }
1699                 else
1700                 {
1701                     QTC::TC("qpdf", "QPDF stream without newline");
1702                     input->unreadCh(ch);
1703                     warn(QPDFExc(qpdf_e_damaged_pdf, input->getName(),
1704                                  this->m->last_object_description,
1705                                  input->tell(),
1706                                  "stream keyword not followed"
1707                                  " by proper line terminator"));
1708                 }
1709             }
1710 
1711             // Must get offset before accessing any additional
1712             // objects since resolving a previously unresolved
1713             // indirect object will change file position.
1714             qpdf_offset_t stream_offset = input->tell();
1715             size_t length = 0;
1716 
1717             try
1718             {
1719                 std::map<std::string, QPDFObjectHandle> dict =
1720                     object.getDictAsMap();
1721 
1722                 if (dict.count("/Length") == 0)
1723                 {
1724                     QTC::TC("qpdf", "QPDF stream without length");
1725                     throw QPDFExc(qpdf_e_damaged_pdf, input->getName(),
1726                                   this->m->last_object_description, offset,
1727                                   "stream dictionary lacks /Length key");
1728                 }
1729 
1730                 QPDFObjectHandle length_obj = dict["/Length"];
1731                 if (! length_obj.isInteger())
1732                 {
1733                     QTC::TC("qpdf", "QPDF stream length not integer");
1734                     throw QPDFExc(qpdf_e_damaged_pdf, input->getName(),
1735                                   this->m->last_object_description, offset,
1736                                   "/Length key in stream dictionary is not "
1737                                   "an integer");
1738                 }
1739 
1740                 length = toS(length_obj.getUIntValue());
1741                 // Seek in two steps to avoid potential integer overflow
1742                 input->seek(stream_offset, SEEK_SET);
1743                 input->seek(toO(length), SEEK_CUR);
1744                 if (! (readToken(input) ==
1745                        QPDFTokenizer::Token(
1746                            QPDFTokenizer::tt_word, "endstream")))
1747                 {
1748                     QTC::TC("qpdf", "QPDF missing endstream");
1749                     throw QPDFExc(qpdf_e_damaged_pdf, input->getName(),
1750                                   this->m->last_object_description,
1751                                   input->getLastOffset(),
1752                                   "expected endstream");
1753                 }
1754             }
1755             catch (QPDFExc& e)
1756             {
1757                 if (this->m->attempt_recovery)
1758                 {
1759                     warn(e);
1760                     length = recoverStreamLength(
1761                         input, objid, generation, stream_offset);
1762                 }
1763                 else
1764                 {
1765                     throw e;
1766                 }
1767             }
1768             object = QPDFObjectHandle::Factory::newStream(
1769                 this, objid, generation, object, stream_offset, length);
1770         }
1771         else
1772         {
1773             input->seek(cur_offset, SEEK_SET);
1774         }
1775     }
1776 
1777     // Override last_offset so that it points to the beginning of the
1778     // object we just read
1779     input->setLastOffset(offset);
1780     return object;
1781 }
1782 
1783 bool
findEndstream()1784 QPDF::findEndstream()
1785 {
1786     // Find endstream or endobj. Position the input at that token.
1787     QPDFTokenizer::Token t = readToken(this->m->file, 20);
1788     if ((t.getType() == QPDFTokenizer::tt_word) &&
1789         ((t.getValue() == "endobj") ||
1790          (t.getValue() == "endstream")))
1791     {
1792         this->m->file->seek(this->m->file->getLastOffset(), SEEK_SET);
1793         return true;
1794     }
1795     return false;
1796 }
1797 
1798 size_t
recoverStreamLength(PointerHolder<InputSource> input,int objid,int generation,qpdf_offset_t stream_offset)1799 QPDF::recoverStreamLength(PointerHolder<InputSource> input,
1800 			  int objid, int generation,
1801                           qpdf_offset_t stream_offset)
1802 {
1803     // Try to reconstruct stream length by looking for
1804     // endstream or endobj
1805     warn(QPDFExc(qpdf_e_damaged_pdf, input->getName(),
1806 		 this->m->last_object_description, stream_offset,
1807 		 "attempting to recover stream length"));
1808 
1809     PatternFinder ef(*this, &QPDF::findEndstream);
1810     size_t length = 0;
1811     if (this->m->file->findFirst("end", stream_offset, 0, ef))
1812     {
1813         length = toS(this->m->file->tell() - stream_offset);
1814         // Reread endstream but, if it was endobj, don't skip that.
1815         QPDFTokenizer::Token t = readToken(this->m->file);
1816         if (t.getValue() == "endobj")
1817         {
1818             this->m->file->seek(this->m->file->getLastOffset(), SEEK_SET);
1819         }
1820     }
1821 
1822     if (length)
1823     {
1824 	qpdf_offset_t this_obj_offset = 0;
1825 	QPDFObjGen this_obj(0, 0);
1826 
1827 	// Make sure this is inside this object
1828 	for (std::map<QPDFObjGen, QPDFXRefEntry>::iterator iter =
1829 		 this->m->xref_table.begin();
1830 	     iter != this->m->xref_table.end(); ++iter)
1831 	{
1832 	    QPDFObjGen const& og = (*iter).first;
1833 	    QPDFXRefEntry const& entry = (*iter).second;
1834 	    if (entry.getType() == 1)
1835 	    {
1836 		qpdf_offset_t obj_offset = entry.getOffset();
1837 		if ((obj_offset > stream_offset) &&
1838 		    ((this_obj_offset == 0) ||
1839 		     (this_obj_offset > obj_offset)))
1840 		{
1841 		    this_obj_offset = obj_offset;
1842 		    this_obj = og;
1843 		}
1844 	    }
1845 	}
1846 	if (this_obj_offset &&
1847 	    (this_obj.getObj() == objid) &&
1848 	    (this_obj.getGen() == generation))
1849 	{
1850 	    // Well, we found endstream\nendobj within the space
1851 	    // allowed for this object, so we're probably in good
1852 	    // shape.
1853 	}
1854 	else
1855 	{
1856 	    QTC::TC("qpdf", "QPDF found wrong endstream in recovery");
1857 	}
1858     }
1859 
1860     if (length == 0)
1861     {
1862         warn(QPDFExc(qpdf_e_damaged_pdf, input->getName(),
1863                      this->m->last_object_description, stream_offset,
1864                      "unable to recover stream data;"
1865                      " treating stream as empty"));
1866     }
1867     else
1868     {
1869         warn(QPDFExc(qpdf_e_damaged_pdf, input->getName(),
1870                      this->m->last_object_description, stream_offset,
1871                      "recovered stream length: " +
1872                      QUtil::uint_to_string(length)));
1873     }
1874 
1875     QTC::TC("qpdf", "QPDF recovered stream length");
1876     return length;
1877 }
1878 
1879 QPDFTokenizer::Token
readToken(PointerHolder<InputSource> input,size_t max_len)1880 QPDF::readToken(PointerHolder<InputSource> input, size_t max_len)
1881 {
1882     return this->m->tokenizer.readToken(
1883         input, this->m->last_object_description, true, max_len);
1884 }
1885 
1886 QPDFObjectHandle
readObjectAtOffset(bool try_recovery,qpdf_offset_t offset,std::string const & description,int exp_objid,int exp_generation,int & objid,int & generation)1887 QPDF::readObjectAtOffset(bool try_recovery,
1888 			 qpdf_offset_t offset, std::string const& description,
1889 			 int exp_objid, int exp_generation,
1890 			 int& objid, int& generation)
1891 {
1892     if (! this->m->attempt_recovery)
1893     {
1894         try_recovery = false;
1895     }
1896     setLastObjectDescription(description, exp_objid, exp_generation);
1897 
1898     // Special case: if offset is 0, just return null.  Some PDF
1899     // writers, in particular "Mac OS X 10.7.5 Quartz PDFContext", may
1900     // store deleted objects in the xref table as "0000000000 00000
1901     // n", which is not correct, but it won't hurt anything for to
1902     // ignore these.
1903     if (offset == 0)
1904     {
1905         QTC::TC("qpdf", "QPDF bogus 0 offset", 0);
1906 	warn(QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
1907 		     this->m->last_object_description, 0,
1908 		     "object has offset 0"));
1909         return QPDFObjectHandle::newNull();
1910     }
1911 
1912     this->m->file->seek(offset, SEEK_SET);
1913 
1914     QPDFTokenizer::Token tobjid = readToken(this->m->file);
1915     QPDFTokenizer::Token tgen = readToken(this->m->file);
1916     QPDFTokenizer::Token tobj = readToken(this->m->file);
1917 
1918     bool objidok = (tobjid.getType() == QPDFTokenizer::tt_integer);
1919     int genok = (tgen.getType() == QPDFTokenizer::tt_integer);
1920     int objok = (tobj == QPDFTokenizer::Token(QPDFTokenizer::tt_word, "obj"));
1921 
1922     QTC::TC("qpdf", "QPDF check objid", objidok ? 1 : 0);
1923     QTC::TC("qpdf", "QPDF check generation", genok ? 1 : 0);
1924     QTC::TC("qpdf", "QPDF check obj", objok ? 1 : 0);
1925 
1926     try
1927     {
1928 	if (! (objidok && genok && objok))
1929 	{
1930 	    QTC::TC("qpdf", "QPDF expected n n obj");
1931 	    throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
1932 			  this->m->last_object_description, offset,
1933 			  "expected n n obj");
1934 	}
1935 	objid = QUtil::string_to_int(tobjid.getValue().c_str());
1936 	generation = QUtil::string_to_int(tgen.getValue().c_str());
1937 
1938         if (objid == 0)
1939         {
1940             QTC::TC("qpdf", "QPDF object id 0");
1941             throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
1942                           this->m->last_object_description, offset,
1943                           "object with ID 0");
1944         }
1945 
1946 	if ((exp_objid >= 0) &&
1947 	    (! ((objid == exp_objid) && (generation == exp_generation))))
1948 	{
1949 	    QTC::TC("qpdf", "QPDF err wrong objid/generation");
1950 	    QPDFExc e(qpdf_e_damaged_pdf, this->m->file->getName(),
1951                       this->m->last_object_description, offset,
1952                       std::string("expected ") +
1953                       QUtil::int_to_string(exp_objid) + " " +
1954                       QUtil::int_to_string(exp_generation) + " obj");
1955             if (try_recovery)
1956             {
1957                 // Will be retried below
1958                 throw e;
1959             }
1960             else
1961             {
1962                 // We can try reading the object anyway even if the ID
1963                 // doesn't match.
1964                 warn(e);
1965             }
1966 	}
1967     }
1968     catch (QPDFExc& e)
1969     {
1970 	if ((exp_objid >= 0) && try_recovery)
1971 	{
1972 	    // Try again after reconstructing xref table
1973 	    reconstruct_xref(e);
1974 	    QPDFObjGen og(exp_objid, exp_generation);
1975 	    if (this->m->xref_table.count(og) &&
1976 		(this->m->xref_table[og].getType() == 1))
1977 	    {
1978 		qpdf_offset_t new_offset = this->m->xref_table[og].getOffset();
1979 		QPDFObjectHandle result = readObjectAtOffset(
1980 		    false, new_offset, description,
1981 		    exp_objid, exp_generation, objid, generation);
1982 		QTC::TC("qpdf", "QPDF recovered in readObjectAtOffset");
1983 		return result;
1984 	    }
1985 	    else
1986 	    {
1987 		QTC::TC("qpdf", "QPDF object gone after xref reconstruction");
1988 		warn(QPDFExc(
1989 			 qpdf_e_damaged_pdf, this->m->file->getName(),
1990 			 "", 0,
1991 			 std::string(
1992 			     "object " +
1993 			     QUtil::int_to_string(exp_objid) +
1994 			     " " +
1995 			     QUtil::int_to_string(exp_generation) +
1996 			     " not found in file after regenerating"
1997 			     " cross reference table")));
1998 		return QPDFObjectHandle::newNull();
1999 	    }
2000 	}
2001 	else
2002 	{
2003 	    throw e;
2004 	}
2005     }
2006 
2007     QPDFObjectHandle oh = readObject(
2008 	this->m->file, description, objid, generation, false);
2009 
2010     if (! (readToken(this->m->file) ==
2011 	   QPDFTokenizer::Token(QPDFTokenizer::tt_word, "endobj")))
2012     {
2013 	QTC::TC("qpdf", "QPDF err expected endobj");
2014 	warn(QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
2015 		     this->m->last_object_description,
2016                      this->m->file->getLastOffset(),
2017 		     "expected endobj"));
2018     }
2019 
2020     QPDFObjGen og(objid, generation);
2021     if (! this->m->obj_cache.count(og))
2022     {
2023 	// Store the object in the cache here so it gets cached
2024 	// whether we first know the offset or whether we first know
2025 	// the object ID and generation (in which we case we would get
2026 	// here through resolve).
2027 
2028 	// Determine the end offset of this object before and after
2029 	// white space.  We use these numbers to validate
2030 	// linearization hint tables.  Offsets and lengths of objects
2031 	// may imply the end of an object to be anywhere between these
2032 	// values.
2033 	qpdf_offset_t end_before_space = this->m->file->tell();
2034 
2035 	// skip over spaces
2036 	while (true)
2037 	{
2038 	    char ch;
2039 	    if (this->m->file->read(&ch, 1))
2040 	    {
2041 		if (! isspace(static_cast<unsigned char>(ch)))
2042 		{
2043 		    this->m->file->seek(-1, SEEK_CUR);
2044 		    break;
2045 		}
2046 	    }
2047 	    else
2048 	    {
2049 		throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
2050 			      this->m->last_object_description,
2051                               this->m->file->tell(),
2052 			      "EOF after endobj");
2053 	    }
2054 	}
2055 	qpdf_offset_t end_after_space = this->m->file->tell();
2056 
2057 	this->m->obj_cache[og] =
2058 	    ObjCache(QPDFObjectHandle::ObjAccessor::getObject(oh),
2059 		     end_before_space, end_after_space);
2060     }
2061 
2062     return oh;
2063 }
2064 
2065 bool
objectChanged(QPDFObjGen const & og,PointerHolder<QPDFObject> & oph)2066 QPDF::objectChanged(QPDFObjGen const& og, PointerHolder<QPDFObject>& oph)
2067 {
2068     // See if the object cached at og, if any, is the one passed in.
2069     // QPDFObjectHandle uses this to detect outdated handles to
2070     // replaced or swapped objects. This is a somewhat expensive check
2071     // because it happens with every dereference of a
2072     // QPDFObjectHandle. To reduce the hit somewhat, short-circuit the
2073     // check if we never called a function that replaces an object
2074     // already in cache. It is important for functions that do this to
2075     // set ever_replaced_objects = true.
2076 
2077     if (! this->m->ever_replaced_objects)
2078     {
2079         return false;
2080     }
2081     auto c = this->m->obj_cache.find(og);
2082     if (c == this->m->obj_cache.end())
2083     {
2084         return true;
2085     }
2086     return (c->second.object.getPointer() != oph.getPointer());
2087 }
2088 
2089 PointerHolder<QPDFObject>
resolve(int objid,int generation)2090 QPDF::resolve(int objid, int generation)
2091 {
2092     // Check object cache before checking xref table.  This allows us
2093     // to insert things into the object cache that don't actually
2094     // exist in the file.
2095     QPDFObjGen og(objid, generation);
2096     if (this->m->resolving.count(og))
2097     {
2098         // This can happen if an object references itself directly or
2099         // indirectly in some key that has to be resolved during
2100         // object parsing, such as stream length.
2101 	QTC::TC("qpdf", "QPDF recursion loop in resolve");
2102 	warn(QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
2103 		     "", this->m->file->getLastOffset(),
2104 		     "loop detected resolving object " +
2105 		     QUtil::int_to_string(objid) + " " +
2106 		     QUtil::int_to_string(generation)));
2107         return new QPDF_Null;
2108     }
2109     ResolveRecorder rr(this, og);
2110 
2111     if ((! this->m->obj_cache.count(og)) && this->m->xref_table.count(og))
2112     {
2113 	QPDFXRefEntry const& entry = this->m->xref_table[og];
2114         try
2115         {
2116             switch (entry.getType())
2117             {
2118               case 1:
2119                 {
2120                     qpdf_offset_t offset = entry.getOffset();
2121                     // Object stored in cache by readObjectAtOffset
2122                     int aobjid;
2123                     int ageneration;
2124                     QPDFObjectHandle oh =
2125                         readObjectAtOffset(true, offset, "", objid, generation,
2126                                            aobjid, ageneration);
2127                 }
2128                 break;
2129 
2130               case 2:
2131                 resolveObjectsInStream(entry.getObjStreamNumber());
2132                 break;
2133 
2134               default:
2135                 throw QPDFExc(qpdf_e_damaged_pdf,
2136                               this->m->file->getName(), "", 0,
2137                               "object " +
2138                               QUtil::int_to_string(objid) + "/" +
2139                               QUtil::int_to_string(generation) +
2140                               " has unexpected xref entry type");
2141             }
2142         }
2143         catch (QPDFExc& e)
2144         {
2145             warn(e);
2146         }
2147         catch (std::exception& e)
2148         {
2149             warn(QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(), "", 0,
2150                          "object " +
2151                          QUtil::int_to_string(objid) + "/" +
2152                          QUtil::int_to_string(generation) +
2153                          ": error reading object: " + e.what()));
2154         }
2155     }
2156     if (this->m->obj_cache.count(og) == 0)
2157     {
2158         // PDF spec says unknown objects resolve to the null object.
2159         QTC::TC("qpdf", "QPDF resolve failure to null");
2160         QPDFObjectHandle oh = QPDFObjectHandle::newNull();
2161         this->m->obj_cache[og] =
2162             ObjCache(QPDFObjectHandle::ObjAccessor::getObject(oh), -1, -1);
2163     }
2164 
2165     PointerHolder<QPDFObject> result(this->m->obj_cache[og].object);
2166     if (! result->hasDescription())
2167     {
2168         result->setDescription(
2169             this,
2170             "object " + QUtil::int_to_string(objid) + " " +
2171             QUtil::int_to_string(generation));
2172     }
2173     return result;
2174 }
2175 
2176 void
resolveObjectsInStream(int obj_stream_number)2177 QPDF::resolveObjectsInStream(int obj_stream_number)
2178 {
2179     if (this->m->resolved_object_streams.count(obj_stream_number))
2180     {
2181         return;
2182     }
2183     this->m->resolved_object_streams.insert(obj_stream_number);
2184     // Force resolution of object stream
2185     QPDFObjectHandle obj_stream = getObjectByID(obj_stream_number, 0);
2186     if (! obj_stream.isStream())
2187     {
2188 	throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
2189 		      this->m->last_object_description,
2190 		      this->m->file->getLastOffset(),
2191 		      "supposed object stream " +
2192 		      QUtil::int_to_string(obj_stream_number) +
2193 		      " is not a stream");
2194     }
2195 
2196     // For linearization data in the object, use the data from the
2197     // object stream for the objects in the stream.
2198     QPDFObjGen stream_og(obj_stream_number, 0);
2199     qpdf_offset_t end_before_space =
2200         this->m->obj_cache[stream_og].end_before_space;
2201     qpdf_offset_t end_after_space =
2202         this->m->obj_cache[stream_og].end_after_space;
2203 
2204     QPDFObjectHandle dict = obj_stream.getDict();
2205     if (! (dict.getKey("/Type").isName() &&
2206 	   dict.getKey("/Type").getName() == "/ObjStm"))
2207     {
2208 	QTC::TC("qpdf", "QPDF ERR object stream with wrong type");
2209 	warn(QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
2210                      this->m->last_object_description,
2211                      this->m->file->getLastOffset(),
2212                      "supposed object stream " +
2213                      QUtil::int_to_string(obj_stream_number) +
2214                      " has wrong type"));
2215     }
2216 
2217     if (! (dict.getKey("/N").isInteger() &&
2218 	   dict.getKey("/First").isInteger()))
2219     {
2220 	throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
2221 		      this->m->last_object_description,
2222 		      this->m->file->getLastOffset(),
2223 		      "object stream " +
2224 		      QUtil::int_to_string(obj_stream_number) +
2225 		      " has incorrect keys");
2226     }
2227 
2228     int n = dict.getKey("/N").getIntValueAsInt();
2229     int first = dict.getKey("/First").getIntValueAsInt();
2230 
2231     std::map<int, int> offsets;
2232 
2233     PointerHolder<Buffer> bp = obj_stream.getStreamData(qpdf_dl_specialized);
2234     PointerHolder<InputSource> input = new BufferInputSource(
2235         this->m->file->getName() +
2236         " object stream " + QUtil::int_to_string(obj_stream_number),
2237 	bp.getPointer());
2238 
2239     for (int i = 0; i < n; ++i)
2240     {
2241 	QPDFTokenizer::Token tnum = readToken(input);
2242 	QPDFTokenizer::Token toffset = readToken(input);
2243 	if (! ((tnum.getType() == QPDFTokenizer::tt_integer) &&
2244 	       (toffset.getType() == QPDFTokenizer::tt_integer)))
2245 	{
2246 	    throw QPDFExc(qpdf_e_damaged_pdf, input->getName(),
2247 			  this->m->last_object_description,
2248                           input->getLastOffset(),
2249 			  "expected integer in object stream header");
2250 	}
2251 
2252 	int num = QUtil::string_to_int(tnum.getValue().c_str());
2253 	long long offset = QUtil::string_to_int(toffset.getValue().c_str());
2254 	offsets[num] = QIntC::to_int(offset + first);
2255     }
2256 
2257     // To avoid having to read the object stream multiple times, store
2258     // all objects that would be found here in the cache.  Remember
2259     // that some objects stored here might have been overridden by new
2260     // objects appended to the file, so it is necessary to recheck the
2261     // xref table and only cache what would actually be resolved here.
2262     for (std::map<int, int>::iterator iter = offsets.begin();
2263 	 iter != offsets.end(); ++iter)
2264     {
2265 	int obj = (*iter).first;
2266 	QPDFObjGen og(obj, 0);
2267         QPDFXRefEntry const& entry = this->m->xref_table[og];
2268         if ((entry.getType() == 2) &&
2269             (entry.getObjStreamNumber() == obj_stream_number))
2270         {
2271             int offset = (*iter).second;
2272             input->seek(offset, SEEK_SET);
2273             QPDFObjectHandle oh = readObject(input, "", obj, 0, true);
2274             this->m->obj_cache[og] =
2275                 ObjCache(QPDFObjectHandle::ObjAccessor::getObject(oh),
2276                          end_before_space, end_after_space);
2277         }
2278         else
2279         {
2280             QTC::TC("qpdf", "QPDF not caching overridden objstm object");
2281         }
2282     }
2283 }
2284 
2285 QPDFObjectHandle
makeIndirectObject(QPDFObjectHandle oh)2286 QPDF::makeIndirectObject(QPDFObjectHandle oh)
2287 {
2288     int max_objid = toI(getObjectCount());
2289     if (max_objid == std::numeric_limits<int>::max())
2290     {
2291         throw std::range_error(
2292             "max object id is too high to create new objects");
2293     }
2294     QPDFObjGen next(max_objid + 1, 0);
2295     this->m->obj_cache[next] =
2296 	ObjCache(QPDFObjectHandle::ObjAccessor::getObject(oh), -1, -1);
2297     return QPDFObjectHandle::Factory::newIndirect(
2298         this, next.getObj(), next.getGen());
2299 }
2300 
2301 QPDFObjectHandle
getObjectByObjGen(QPDFObjGen const & og)2302 QPDF::getObjectByObjGen(QPDFObjGen const& og)
2303 {
2304     return getObjectByID(og.getObj(), og.getGen());
2305 }
2306 
2307 QPDFObjectHandle
getObjectByID(int objid,int generation)2308 QPDF::getObjectByID(int objid, int generation)
2309 {
2310     return QPDFObjectHandle::Factory::newIndirect(this, objid, generation);
2311 }
2312 
2313 void
replaceObject(QPDFObjGen const & og,QPDFObjectHandle oh)2314 QPDF::replaceObject(QPDFObjGen const& og, QPDFObjectHandle oh)
2315 {
2316     replaceObject(og.getObj(), og.getGen(), oh);
2317 }
2318 
2319 void
replaceObject(int objid,int generation,QPDFObjectHandle oh)2320 QPDF::replaceObject(int objid, int generation, QPDFObjectHandle oh)
2321 {
2322     if (oh.isIndirect())
2323     {
2324 	QTC::TC("qpdf", "QPDF replaceObject called with indirect object");
2325 	throw std::logic_error(
2326 	    "QPDF::replaceObject called with indirect object handle");
2327     }
2328 
2329     // Force new object to appear in the cache
2330     resolve(objid, generation);
2331 
2332     // Replace the object in the object cache
2333     QPDFObjGen og(objid, generation);
2334     this->m->ever_replaced_objects = true;
2335     this->m->obj_cache[og] =
2336 	ObjCache(QPDFObjectHandle::ObjAccessor::getObject(oh), -1, -1);
2337 }
2338 
2339 void
replaceReserved(QPDFObjectHandle reserved,QPDFObjectHandle replacement)2340 QPDF::replaceReserved(QPDFObjectHandle reserved,
2341                       QPDFObjectHandle replacement)
2342 {
2343     QTC::TC("qpdf", "QPDF replaceReserved");
2344     reserved.assertReserved();
2345     replaceObject(reserved.getObjGen(), replacement);
2346 }
2347 
2348 QPDFObjectHandle
copyForeignObject(QPDFObjectHandle foreign)2349 QPDF::copyForeignObject(QPDFObjectHandle foreign)
2350 {
2351     // Here's an explanation of what's going on here.
2352     //
2353     // A QPDFObjectHandle that is an indirect object has an owning
2354     // QPDF. The object ID and generation refers to an object in the
2355     // owning QPDF. When we copy the QPDFObjectHandle from a foreign
2356     // QPDF into the local QPDF, we have to replace all indirect
2357     // object references with references to the corresponding object
2358     // in the local file.
2359     //
2360     // To do this, we maintain mappings from foreign object IDs to
2361     // local object IDs for each foreign QPDF that we are copying
2362     // from. The mapping is stored in an ObjCopier, which contains a
2363     // mapping from the foreign ObjGen to the local QPDFObjectHandle.
2364     //
2365     // To copy, we do a deep traversal of the foreign object with loop
2366     // detection to discover all indirect objects that are
2367     // encountered, stopping at page boundaries. Whenever we encounter
2368     // an indirect object, we check to see if we have already created
2369     // a local copy of it. If not, we allocate a "reserved" object
2370     // (or, for a stream, just a new stream) and store in the map the
2371     // mapping from the foreign object ID to the new object. While we
2372     // do this, we keep a list of objects to copy.
2373     //
2374     // Once we are done with the traversal, we copy all the objects
2375     // that we need to copy. However, the copies will contain indirect
2376     // object IDs that refer to objects in the foreign file. We need
2377     // to replace them with references to objects in the local file.
2378     // This is what replaceForeignIndirectObjects does. Once we have
2379     // created a copy of the foreign object with all the indirect
2380     // references replaced with new ones in the local context, we can
2381     // replace the local reserved object with the copy. This mechanism
2382     // allows us to copy objects with circular references in any
2383     // order.
2384 
2385     // For streams, rather than copying the objects, we set up the
2386     // stream data to pull from the original stream by using a stream
2387     // data provider. This is done in a manner that doesn't require
2388     // the original QPDF object but may require the original source of
2389     // the stream data with special handling for immediate_copy_from.
2390     // This logic is also in replaceForeignIndirectObjects.
2391 
2392     // Note that we explicitly allow use of copyForeignObject on page
2393     // objects. It is a documented use case to copy pages this way if
2394     // the intention is to not update the pages tree.
2395     if (! foreign.isIndirect())
2396     {
2397         QTC::TC("qpdf", "QPDF copyForeign direct");
2398 	throw std::logic_error(
2399 	    "QPDF::copyForeign called with direct object handle");
2400     }
2401     QPDF* other = foreign.getOwningQPDF();
2402     if (other == this)
2403     {
2404         QTC::TC("qpdf", "QPDF copyForeign not foreign");
2405         throw std::logic_error(
2406             "QPDF::copyForeign called with object from this QPDF");
2407     }
2408 
2409     ObjCopier& obj_copier = this->m->object_copiers[other->m->unique_id];
2410     if (! obj_copier.visiting.empty())
2411     {
2412         throw std::logic_error("obj_copier.visiting is not empty"
2413                                " at the beginning of copyForeignObject");
2414     }
2415 
2416     // Make sure we have an object in this file for every referenced
2417     // object in the old file.  obj_copier.object_map maps foreign
2418     // QPDFObjGen to local objects.  For everything new that we have
2419     // to copy, the local object will be a reservation, unless it is a
2420     // stream, in which case the local object will already be a
2421     // stream.
2422     reserveObjects(foreign, obj_copier, true);
2423 
2424     if (! obj_copier.visiting.empty())
2425     {
2426         throw std::logic_error("obj_copier.visiting is not empty"
2427                                " after reserving objects");
2428     }
2429 
2430     // Copy any new objects and replace the reservations.
2431     for (std::vector<QPDFObjectHandle>::iterator iter =
2432              obj_copier.to_copy.begin();
2433          iter != obj_copier.to_copy.end(); ++iter)
2434     {
2435         QPDFObjectHandle& to_copy = *iter;
2436         QPDFObjectHandle copy =
2437             replaceForeignIndirectObjects(to_copy, obj_copier, true);
2438         if (! to_copy.isStream())
2439         {
2440             QPDFObjGen og(to_copy.getObjGen());
2441             replaceReserved(obj_copier.object_map[og], copy);
2442         }
2443     }
2444     obj_copier.to_copy.clear();
2445 
2446     return obj_copier.object_map[foreign.getObjGen()];
2447 }
2448 
2449 void
reserveObjects(QPDFObjectHandle foreign,ObjCopier & obj_copier,bool top)2450 QPDF::reserveObjects(QPDFObjectHandle foreign, ObjCopier& obj_copier,
2451                      bool top)
2452 {
2453     if (foreign.isReserved())
2454     {
2455         throw std::logic_error(
2456             "QPDF: attempting to copy a foreign reserved object");
2457     }
2458 
2459     if (foreign.isPagesObject())
2460     {
2461         QTC::TC("qpdf", "QPDF not copying pages object");
2462         return;
2463     }
2464 
2465     if ((! top) && foreign.isPageObject())
2466     {
2467         QTC::TC("qpdf", "QPDF not crossing page boundary");
2468         return;
2469     }
2470 
2471     if (foreign.isIndirect())
2472     {
2473         QPDFObjGen foreign_og(foreign.getObjGen());
2474         if (obj_copier.visiting.find(foreign_og) != obj_copier.visiting.end())
2475         {
2476             QTC::TC("qpdf", "QPDF loop reserving objects");
2477             return;
2478         }
2479         if (obj_copier.object_map.find(foreign_og) !=
2480             obj_copier.object_map.end())
2481         {
2482             QTC::TC("qpdf", "QPDF already reserved object");
2483             return;
2484         }
2485         QTC::TC("qpdf", "QPDF copy indirect");
2486         obj_copier.visiting.insert(foreign_og);
2487         std::map<QPDFObjGen, QPDFObjectHandle>::iterator mapping =
2488             obj_copier.object_map.find(foreign_og);
2489         if (mapping == obj_copier.object_map.end())
2490         {
2491             obj_copier.to_copy.push_back(foreign);
2492             QPDFObjectHandle reservation;
2493             if (foreign.isStream())
2494             {
2495                 reservation = QPDFObjectHandle::newStream(this);
2496             }
2497             else
2498             {
2499                 reservation = QPDFObjectHandle::newReserved(this);
2500             }
2501             obj_copier.object_map[foreign_og] = reservation;
2502         }
2503     }
2504 
2505     if (foreign.isArray())
2506     {
2507         QTC::TC("qpdf", "QPDF reserve array");
2508 	int n = foreign.getArrayNItems();
2509 	for (int i = 0; i < n; ++i)
2510 	{
2511             reserveObjects(foreign.getArrayItem(i), obj_copier, false);
2512 	}
2513     }
2514     else if (foreign.isDictionary())
2515     {
2516         QTC::TC("qpdf", "QPDF reserve dictionary");
2517 	std::set<std::string> keys = foreign.getKeys();
2518 	for (std::set<std::string>::iterator iter = keys.begin();
2519 	     iter != keys.end(); ++iter)
2520 	{
2521             reserveObjects(foreign.getKey(*iter), obj_copier, false);
2522 	}
2523     }
2524     else if (foreign.isStream())
2525     {
2526         QTC::TC("qpdf", "QPDF reserve stream");
2527         reserveObjects(foreign.getDict(), obj_copier, false);
2528     }
2529 
2530     if (foreign.isIndirect())
2531     {
2532         QPDFObjGen foreign_og(foreign.getObjGen());
2533         obj_copier.visiting.erase(foreign_og);
2534     }
2535 }
2536 
2537 QPDFObjectHandle
replaceForeignIndirectObjects(QPDFObjectHandle foreign,ObjCopier & obj_copier,bool top)2538 QPDF::replaceForeignIndirectObjects(
2539     QPDFObjectHandle foreign, ObjCopier& obj_copier, bool top)
2540 {
2541     QPDFObjectHandle result;
2542     if ((! top) && foreign.isIndirect())
2543     {
2544         QTC::TC("qpdf", "QPDF replace indirect");
2545         QPDFObjGen foreign_og(foreign.getObjGen());
2546         std::map<QPDFObjGen, QPDFObjectHandle>::iterator mapping =
2547             obj_copier.object_map.find(foreign_og);
2548         if (mapping == obj_copier.object_map.end())
2549         {
2550             // This case would occur if this is a reference to a Page
2551             // or Pages object that we didn't traverse into.
2552             QTC::TC("qpdf", "QPDF replace foreign indirect with null");
2553             result = QPDFObjectHandle::newNull();
2554         }
2555         else
2556         {
2557             result = obj_copier.object_map[foreign_og];
2558         }
2559     }
2560     else if (foreign.isArray())
2561     {
2562         QTC::TC("qpdf", "QPDF replace array");
2563         result = QPDFObjectHandle::newArray();
2564 	int n = foreign.getArrayNItems();
2565 	for (int i = 0; i < n; ++i)
2566 	{
2567             result.appendItem(
2568                 replaceForeignIndirectObjects(
2569                     foreign.getArrayItem(i), obj_copier, false));
2570 	}
2571     }
2572     else if (foreign.isDictionary())
2573     {
2574         QTC::TC("qpdf", "QPDF replace dictionary");
2575         result = QPDFObjectHandle::newDictionary();
2576 	std::set<std::string> keys = foreign.getKeys();
2577 	for (std::set<std::string>::iterator iter = keys.begin();
2578 	     iter != keys.end(); ++iter)
2579 	{
2580             result.replaceKey(
2581                 *iter,
2582                 replaceForeignIndirectObjects(
2583                     foreign.getKey(*iter), obj_copier, false));
2584 	}
2585     }
2586     else if (foreign.isStream())
2587     {
2588         QTC::TC("qpdf", "QPDF replace stream");
2589         QPDFObjGen foreign_og(foreign.getObjGen());
2590         result = obj_copier.object_map[foreign_og];
2591         result.assertStream();
2592         QPDFObjectHandle dict = result.getDict();
2593         QPDFObjectHandle old_dict = foreign.getDict();
2594         std::set<std::string> keys = old_dict.getKeys();
2595         for (std::set<std::string>::iterator iter = keys.begin();
2596              iter != keys.end(); ++iter)
2597         {
2598             dict.replaceKey(
2599                 *iter,
2600                 replaceForeignIndirectObjects(
2601                     old_dict.getKey(*iter), obj_copier, false));
2602         }
2603         copyStreamData(result, foreign);
2604     }
2605     else
2606     {
2607         foreign.assertScalar();
2608         result = foreign;
2609         result.makeDirect();
2610     }
2611 
2612     if (top && (! result.isStream()) && result.isIndirect())
2613     {
2614         throw std::logic_error("replacement for foreign object is indirect");
2615     }
2616 
2617     return result;
2618 }
2619 
2620 void
copyStreamData(QPDFObjectHandle result,QPDFObjectHandle foreign)2621 QPDF::copyStreamData(QPDFObjectHandle result, QPDFObjectHandle foreign)
2622 {
2623     // This method was originally written for copying foreign streams,
2624     // but it is used by QPDFObjectHandle to copy streams from the
2625     // same QPDF object as well.
2626 
2627     QPDFObjectHandle dict = result.getDict();
2628     QPDFObjectHandle old_dict = foreign.getDict();
2629     if (this->m->copied_stream_data_provider == 0)
2630     {
2631         this->m->copied_stream_data_provider =
2632             new CopiedStreamDataProvider(*this);
2633         this->m->copied_streams = this->m->copied_stream_data_provider;
2634     }
2635     QPDFObjGen local_og(result.getObjGen());
2636     // Copy information from the foreign stream so we can pipe its
2637     // data later without keeping the original QPDF object around.
2638     QPDF* foreign_stream_qpdf = foreign.getOwningQPDF();
2639     if (! foreign_stream_qpdf)
2640     {
2641         throw std::logic_error("unable to retrieve owning qpdf"
2642                                " from foreign stream");
2643     }
2644     QPDF_Stream* stream =
2645         dynamic_cast<QPDF_Stream*>(
2646             QPDFObjectHandle::ObjAccessor::getObject(
2647                 foreign).getPointer());
2648     if (! stream)
2649     {
2650         throw std::logic_error("unable to retrieve underlying"
2651                                " stream object from foreign stream");
2652     }
2653     PointerHolder<Buffer> stream_buffer =
2654         stream->getStreamDataBuffer();
2655     if ((foreign_stream_qpdf->m->immediate_copy_from) &&
2656         (stream_buffer.getPointer() == 0))
2657     {
2658         // Pull the stream data into a buffer before attempting
2659         // the copy operation. Do it on the source stream so that
2660         // if the source stream is copied multiple times, we don't
2661         // have to keep duplicating the memory.
2662         QTC::TC("qpdf", "QPDF immediate copy stream data");
2663         foreign.replaceStreamData(foreign.getRawStreamData(),
2664                                   old_dict.getKey("/Filter"),
2665                                   old_dict.getKey("/DecodeParms"));
2666         stream_buffer = stream->getStreamDataBuffer();
2667     }
2668     PointerHolder<QPDFObjectHandle::StreamDataProvider> stream_provider =
2669         stream->getStreamDataProvider();
2670     if (stream_buffer.getPointer())
2671     {
2672         QTC::TC("qpdf", "QPDF copy foreign stream with buffer");
2673         result.replaceStreamData(stream_buffer,
2674                                  dict.getKey("/Filter"),
2675                                  dict.getKey("/DecodeParms"));
2676     }
2677     else if (stream_provider.getPointer())
2678     {
2679         // In this case, the remote stream's QPDF must stay in scope.
2680         QTC::TC("qpdf", "QPDF copy foreign stream with provider");
2681         this->m->copied_stream_data_provider->registerForeignStream(
2682             local_og, foreign);
2683         result.replaceStreamData(this->m->copied_streams,
2684                                  dict.getKey("/Filter"),
2685                                  dict.getKey("/DecodeParms"));
2686     }
2687     else
2688     {
2689         PointerHolder<ForeignStreamData> foreign_stream_data =
2690             new ForeignStreamData(
2691                 foreign_stream_qpdf->m->encp,
2692                 foreign_stream_qpdf->m->file,
2693                 foreign.getObjectID(),
2694                 foreign.getGeneration(),
2695                 stream->getOffset(),
2696                 stream->getLength(),
2697                 dict);
2698         this->m->copied_stream_data_provider->registerForeignStream(
2699             local_og, foreign_stream_data);
2700         result.replaceStreamData(this->m->copied_streams,
2701                                  dict.getKey("/Filter"),
2702                                  dict.getKey("/DecodeParms"));
2703     }
2704 }
2705 
2706 void
swapObjects(QPDFObjGen const & og1,QPDFObjGen const & og2)2707 QPDF::swapObjects(QPDFObjGen const& og1, QPDFObjGen const& og2)
2708 {
2709     swapObjects(og1.getObj(), og1.getGen(), og2.getObj(), og2.getGen());
2710 }
2711 
2712 void
swapObjects(int objid1,int generation1,int objid2,int generation2)2713 QPDF::swapObjects(int objid1, int generation1, int objid2, int generation2)
2714 {
2715     // Force objects to be loaded into cache; then swap them in the
2716     // cache.
2717     resolve(objid1, generation1);
2718     resolve(objid2, generation2);
2719     QPDFObjGen og1(objid1, generation1);
2720     QPDFObjGen og2(objid2, generation2);
2721     ObjCache t = this->m->obj_cache[og1];
2722     this->m->ever_replaced_objects = true;
2723     this->m->obj_cache[og1] = this->m->obj_cache[og2];
2724     this->m->obj_cache[og2] = t;
2725 }
2726 
2727 unsigned long long
getUniqueId() const2728 QPDF::getUniqueId() const
2729 {
2730     return this->m->unique_id;
2731 }
2732 
2733 std::string
getFilename() const2734 QPDF::getFilename() const
2735 {
2736     return this->m->file->getName();
2737 }
2738 
2739 std::string
getPDFVersion() const2740 QPDF::getPDFVersion() const
2741 {
2742     return this->m->pdf_version;
2743 }
2744 
2745 int
getExtensionLevel()2746 QPDF::getExtensionLevel()
2747 {
2748     int result = 0;
2749     QPDFObjectHandle obj = getRoot();
2750     if (obj.hasKey("/Extensions"))
2751     {
2752         obj = obj.getKey("/Extensions");
2753         if (obj.isDictionary() && obj.hasKey("/ADBE"))
2754         {
2755             obj = obj.getKey("/ADBE");
2756             if (obj.isDictionary() && obj.hasKey("/ExtensionLevel"))
2757             {
2758                 obj = obj.getKey("/ExtensionLevel");
2759                 if (obj.isInteger())
2760                 {
2761                     result = obj.getIntValueAsInt();
2762                 }
2763             }
2764         }
2765     }
2766     return result;
2767 }
2768 
2769 QPDFObjectHandle
getTrailer()2770 QPDF::getTrailer()
2771 {
2772     return this->m->trailer;
2773 }
2774 
2775 QPDFObjectHandle
getRoot()2776 QPDF::getRoot()
2777 {
2778     QPDFObjectHandle root = this->m->trailer.getKey("/Root");
2779     if (! root.isDictionary())
2780     {
2781         throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
2782                       "", this->m->file->getLastOffset(),
2783                       "unable to find /Root dictionary");
2784     }
2785     return root;
2786 }
2787 
2788 std::map<QPDFObjGen, QPDFXRefEntry>
getXRefTable()2789 QPDF::getXRefTable()
2790 {
2791     if (! this->m->parsed)
2792     {
2793         throw std::logic_error("QPDF::getXRefTable called before parsing.");
2794     }
2795 
2796     return this->m->xref_table;
2797 }
2798 
2799 void
getObjectStreamData(std::map<int,int> & omap)2800 QPDF::getObjectStreamData(std::map<int, int>& omap)
2801 {
2802     for (std::map<QPDFObjGen, QPDFXRefEntry>::iterator iter =
2803 	     this->m->xref_table.begin();
2804 	 iter != this->m->xref_table.end(); ++iter)
2805     {
2806 	QPDFObjGen const& og = (*iter).first;
2807 	QPDFXRefEntry const& entry = (*iter).second;
2808 	if (entry.getType() == 2)
2809 	{
2810 	    omap[og.getObj()] = entry.getObjStreamNumber();
2811 	}
2812     }
2813 }
2814 
2815 std::vector<QPDFObjGen>
getCompressibleObjGens()2816 QPDF::getCompressibleObjGens()
2817 {
2818     // Return a list of objects that are allowed to be in object
2819     // streams.  Walk through the objects by traversing the document
2820     // from the root, including a traversal of the pages tree.  This
2821     // makes that objects that are on the same page are more likely to
2822     // be in the same object stream, which is slightly more efficient,
2823     // particularly with linearized files.  This is better than
2824     // iterating through the xref table since it avoids preserving
2825     // orphaned items.
2826 
2827     // Exclude encryption dictionary, if any
2828     QPDFObjectHandle encryption_dict =
2829         this->m->trailer.getKey("/Encrypt");
2830     QPDFObjGen encryption_dict_og = encryption_dict.getObjGen();
2831 
2832     std::set<QPDFObjGen> visited;
2833     std::list<QPDFObjectHandle> queue;
2834     queue.push_front(this->m->trailer);
2835     std::vector<QPDFObjGen> result;
2836     while (! queue.empty())
2837     {
2838 	QPDFObjectHandle obj = queue.front();
2839 	queue.pop_front();
2840 	if (obj.isIndirect())
2841 	{
2842 	    QPDFObjGen og = obj.getObjGen();
2843 	    if (visited.count(og))
2844 	    {
2845 		QTC::TC("qpdf", "QPDF loop detected traversing objects");
2846 		continue;
2847 	    }
2848 	    if (og == encryption_dict_og)
2849 	    {
2850 		QTC::TC("qpdf", "QPDF exclude encryption dictionary");
2851 	    }
2852 	    else if ((! obj.isStream()) &&
2853 		     (! (obj.isDictionary() &&
2854                          obj.hasKey("/ByteRange") &&
2855                          obj.hasKey("/Contents") &&
2856                          obj.hasKey("/Type") &&
2857                          obj.getKey("/Type").isName() &&
2858                          obj.getKey("/Type").getName() == "/Sig")))
2859 	    {
2860 		result.push_back(og);
2861 	    }
2862 	    visited.insert(og);
2863 	}
2864 	if (obj.isStream())
2865 	{
2866 	    QPDFObjectHandle dict = obj.getDict();
2867 	    std::set<std::string> keys = dict.getKeys();
2868 	    for (std::set<std::string>::reverse_iterator iter = keys.rbegin();
2869 		 iter != keys.rend(); ++iter)
2870 	    {
2871 		std::string const& key = *iter;
2872 		QPDFObjectHandle value = dict.getKey(key);
2873 		if (key == "/Length")
2874 		{
2875 		    // omit stream lengths
2876 		    if (value.isIndirect())
2877 		    {
2878 			QTC::TC("qpdf", "QPDF exclude indirect length");
2879 		    }
2880 		}
2881 		else
2882 		{
2883 		    queue.push_front(value);
2884 		}
2885 	    }
2886 	}
2887 	else if (obj.isDictionary())
2888 	{
2889 	    std::set<std::string> keys = obj.getKeys();
2890 	    for (std::set<std::string>::reverse_iterator iter = keys.rbegin();
2891 		 iter != keys.rend(); ++iter)
2892 	    {
2893 		queue.push_front(obj.getKey(*iter));
2894 	    }
2895 	}
2896 	else if (obj.isArray())
2897 	{
2898 	    int n = obj.getArrayNItems();
2899 	    for (int i = 1; i <= n; ++i)
2900 	    {
2901 		queue.push_front(obj.getArrayItem(n - i));
2902 	    }
2903 	}
2904     }
2905 
2906     return result;
2907 }
2908 
2909 bool
pipeStreamData(PointerHolder<EncryptionParameters> encp,PointerHolder<InputSource> file,QPDF & qpdf_for_warning,int objid,int generation,qpdf_offset_t offset,size_t length,QPDFObjectHandle stream_dict,Pipeline * pipeline,bool suppress_warnings,bool will_retry)2910 QPDF::pipeStreamData(PointerHolder<EncryptionParameters> encp,
2911                      PointerHolder<InputSource> file,
2912                      QPDF& qpdf_for_warning,
2913                      int objid, int generation,
2914 		     qpdf_offset_t offset, size_t length,
2915 		     QPDFObjectHandle stream_dict,
2916 		     Pipeline* pipeline,
2917                      bool suppress_warnings,
2918                      bool will_retry)
2919 {
2920     std::vector<PointerHolder<Pipeline> > to_delete;
2921     if (encp->encrypted)
2922     {
2923 	decryptStream(encp, file, qpdf_for_warning,
2924                       pipeline, objid, generation,
2925                       stream_dict, to_delete);
2926     }
2927 
2928     bool success = false;
2929     try
2930     {
2931 	file->seek(offset, SEEK_SET);
2932 	char buf[10240];
2933 	while (length > 0)
2934 	{
2935 	    size_t to_read = (sizeof(buf) < length ? sizeof(buf) : length);
2936 	    size_t len = file->read(buf, to_read);
2937 	    if (len == 0)
2938 	    {
2939 		throw QPDFExc(qpdf_e_damaged_pdf,
2940 			      file->getName(),
2941 			      "",
2942 			      file->getLastOffset(),
2943 			      "unexpected EOF reading stream data");
2944 	    }
2945 	    length -= len;
2946 	    pipeline->write(QUtil::unsigned_char_pointer(buf), len);
2947 	}
2948         pipeline->finish();
2949         success = true;
2950     }
2951     catch (QPDFExc& e)
2952     {
2953         if (! suppress_warnings)
2954         {
2955             qpdf_for_warning.warn(e);
2956         }
2957     }
2958     catch (std::exception& e)
2959     {
2960         if (! suppress_warnings)
2961         {
2962             QTC::TC("qpdf", "QPDF decoding error warning");
2963             qpdf_for_warning.warn(
2964                 QPDFExc(qpdf_e_damaged_pdf, file->getName(),
2965                         "", file->getLastOffset(),
2966                         "error decoding stream data for object " +
2967                         QUtil::int_to_string(objid) + " " +
2968                         QUtil::int_to_string(generation) + ": " + e.what()));
2969             if (will_retry)
2970             {
2971                 qpdf_for_warning.warn(
2972                     QPDFExc(qpdf_e_damaged_pdf, file->getName(),
2973                             "", file->getLastOffset(),
2974                             "stream will be re-processed without"
2975                             " filtering to avoid data loss"));
2976             }
2977         }
2978     }
2979     if (! success)
2980     {
2981         try
2982         {
2983             pipeline->finish();
2984         }
2985         catch (std::exception&)
2986         {
2987             // ignore
2988         }
2989     }
2990     return success;
2991 }
2992 
2993 bool
pipeStreamData(int objid,int generation,qpdf_offset_t offset,size_t length,QPDFObjectHandle stream_dict,Pipeline * pipeline,bool suppress_warnings,bool will_retry)2994 QPDF::pipeStreamData(int objid, int generation,
2995 		     qpdf_offset_t offset, size_t length,
2996 		     QPDFObjectHandle stream_dict,
2997 		     Pipeline* pipeline,
2998                      bool suppress_warnings,
2999                      bool will_retry)
3000 {
3001     return pipeStreamData(
3002         this->m->encp, this->m->file, *this,
3003         objid, generation, offset, length,
3004         stream_dict, pipeline,
3005         suppress_warnings, will_retry);
3006 }
3007 
3008 bool
pipeForeignStreamData(PointerHolder<ForeignStreamData> foreign,Pipeline * pipeline,bool suppress_warnings,bool will_retry)3009 QPDF::pipeForeignStreamData(
3010     PointerHolder<ForeignStreamData> foreign,
3011     Pipeline* pipeline,
3012     bool suppress_warnings, bool will_retry)
3013 {
3014     if (foreign->encp->encrypted)
3015     {
3016         QTC::TC("qpdf", "QPDF pipe foreign encrypted stream");
3017     }
3018     return pipeStreamData(
3019         foreign->encp, foreign->file, *this,
3020         foreign->foreign_objid, foreign->foreign_generation,
3021         foreign->offset, foreign->length,
3022         foreign->local_dict, pipeline,
3023         suppress_warnings, will_retry);
3024 }
3025 
3026 void
stopOnError(std::string const & message)3027 QPDF::stopOnError(std::string const& message)
3028 {
3029     // Throw a generic exception when we lack context for something
3030     // more specific. New code should not use this. This method exists
3031     // to improve somewhat from calling assert in very old code.
3032     throw QPDFExc(qpdf_e_damaged_pdf, this->m->file->getName(),
3033                   "", this->m->file->getLastOffset(), message);
3034 }
3035