1 /*
2 Source File : PDFParser.cpp
3
4
5 Copyright 2011 Gal Kahana PDFWriter
6
7 Licensed under the Apache License, Version 2.0 (the "License");
8 you may not use this file except in compliance with the License.
9 You may obtain a copy of the License at
10
11 http://www.apache.org/licenses/LICENSE-2.0
12
13 Unless required by applicable law or agreed to in writing, software
14 distributed under the License is distributed on an "AS IS" BASIS,
15 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 See the License for the specific language governing permissions and
17 limitations under the License.
18
19
20 */
21 #include "PDFParser.h"
22 #include "IByteReaderWithPosition.h"
23 #include "PDFParserTokenizer.h"
24 #include "Trace.h"
25 #include "PDFInteger.h"
26 #include "PDFObject.h"
27 #include "PDFSymbol.h"
28 #include "BoxingBase.h"
29 #include "PDFDictionary.h"
30 #include "BoxingBase.h"
31 #include "PDFIndirectObjectReference.h"
32 #include "PDFName.h"
33 #include "PDFArray.h"
34 #include "RefCountPtr.h"
35 #include "PDFObjectCast.h"
36 #include "PDFStreamInput.h"
37 #include "InputLimitedStream.h"
38 #include "InputFlateDecodeStream.h"
39 #include "InputStreamSkipperStream.h"
40 #include "InputPredictorPNGUpStream.h"
41 #include "InputPredictorPNGNoneStream.h"
42 #include "InputPredictorPNGSubStream.h"
43 #include "InputPredictorPNGAverageStream.h"
44 #include "InputPredictorPNGPaethStream.h"
45 #include "InputPredictorPNGOptimumStream.h"
46 #include "InputPredictorTIFFSubStream.h"
47 #include "InputAscii85DecodeStream.h"
48 #include "IPDFParserExtender.h"
49 #include "InputDCTDecodeStream.h"
50
51 #include <algorithm>
52 using namespace PDFHummus;
53
PDFParser(void)54 PDFParser::PDFParser(void)
55 {
56 mStream = NULL;
57 mTrailer = NULL;
58 mXrefTable = NULL;
59 mPagesObjectIDs = NULL;
60 mParserExtender = NULL;
61 mAllowExtendingSegments = true; // Gal 19.9.2013: here's some policy changer. basically i'm supposed to ignore all segments that declare objects past the trailer
62 // declared size. but i would like to allow files that do extend. as this is incompatible with the specs, i'll make
63 // this boolean dendent. i will sometimes make it public so ppl can actually modify this policy. for now, it's internal
64 }
65
~PDFParser(void)66 PDFParser::~PDFParser(void)
67 {
68 ResetParser();
69 }
70
ResetParser()71 void PDFParser::ResetParser()
72 {
73 mTrailer = NULL;
74 delete[] mXrefTable;
75 mXrefTable = NULL;
76 delete[] mPagesObjectIDs;
77 mPagesObjectIDs = NULL;
78 mStream = NULL;
79 mCurrentPositionProvider.Assign(NULL);
80
81 ObjectIDTypeToObjectStreamHeaderEntryMap::iterator it = mObjectStreamsCache.begin();
82 for(; it != mObjectStreamsCache.end();++it)
83 delete[] it->second;
84 mObjectStreamsCache.clear();
85
86 }
87
StartPDFParsing(IByteReaderWithPosition * inSourceStream)88 EStatusCode PDFParser::StartPDFParsing(IByteReaderWithPosition* inSourceStream)
89 {
90 EStatusCode status;
91
92 ResetParser();
93
94 mStream = inSourceStream;
95 mCurrentPositionProvider.Assign(mStream);
96 mObjectParser.SetReadStream(inSourceStream,&mCurrentPositionProvider);
97
98 do
99 {
100 status = ParseHeaderLine();
101 if(status != PDFHummus::eSuccess)
102 break;
103
104 // initialize reading from end
105 mLastReadPositionFromEnd = 0;
106 mEncounteredFileStart = false;
107 mLastAvailableIndex = mCurrentBufferIndex = mLinesBuffer;
108
109 status = ParseEOFLine();
110 if(status != PDFHummus::eSuccess)
111 break;
112
113 status = ParseLastXrefPosition();
114 if(status != PDFHummus::eSuccess)
115 break;
116
117 status = ParseFileDirectory(); // that would be the xref and trailer
118 if(status != PDFHummus::eSuccess)
119 break;
120
121 if(IsEncrypted())
122 {
123 // not parsing pages for encrypted docs.
124 // not commiting..and there's a practical reason.
125 // lower level objects will be in object streams (for those PDFs that have them)
126 // and the may not be accessed
127 mPagesCount = 0;
128 mPagesObjectIDs = NULL;
129 }
130 else
131 {
132 status = ParsePagesObjectIDs();
133 if(status != PDFHummus::eSuccess)
134 break;
135 }
136 }while(false);
137
138 return status;
139 }
140
GetObjectParser()141 PDFObjectParser& PDFParser::GetObjectParser()
142 {
143 return mObjectParser;
144 }
145
146 static const std::string scPDFMagic = "%PDF-";
ParseHeaderLine()147 EStatusCode PDFParser::ParseHeaderLine()
148 {
149 PDFParserTokenizer tokenizer;
150
151 tokenizer.SetReadStream(mStream);
152 BoolAndString tokenizerResult = tokenizer.GetNextToken();
153
154 if(!tokenizerResult.first)
155 {
156 TRACE_LOG("PDFParser::ParseHeaderLine, no tokens in PDF input. in other words - it's empty.");
157 return PDFHummus::eFailure;
158 }
159
160 if(tokenizerResult.second.compare(0,scPDFMagic.size(),scPDFMagic) != 0)
161 {
162 TRACE_LOG1("PDFParser::ParseHeaderLine, file does not begin as a PDF file. a PDF file should start with \"%PDF-\". file header = %s",tokenizerResult.second.c_str());
163 return PDFHummus::eFailure;
164 }
165
166 mPDFLevel = Double(tokenizerResult.second.substr(scPDFMagic.size()));
167 return PDFHummus::eSuccess;
168 }
169
170 static const std::string scEOF = "%%EOF";
ParseEOFLine()171 EStatusCode PDFParser::ParseEOFLine()
172 {
173 /* go back till you hit token. this should be the EOF. go back till line start and get the token...if it's not EOF, fail.
174 since EOF is a comment, then if there's anything else in that line it will either be before %%EOF, which means %%EOF won't be taken, or after -
175 in which case it'd be part of the comment. in any case - if it's not exactly EOF, there will be a failure. but i am allowing
176 extra empty lines after %%EOF
177 */
178 if(GoBackTillToken())
179 {
180 GoBackTillLineStart();
181 mStream->SetPositionFromEnd(GetCurrentPositionFromEnd());
182
183 PDFParserTokenizer aTokenizer;
184 aTokenizer.SetReadStream(mStream);
185 BoolAndString token = aTokenizer.GetNextToken();
186
187 if(token.first && (token.second.substr(0,scEOF.length()) == scEOF))
188 {
189 return PDFHummus::eSuccess;
190 }
191 else
192 {
193 TRACE_LOG("PDFParser::ParseEOFLine, failure, last line not %%EOF");
194 return PDFHummus::eFailure;
195 }
196 }
197 else
198 {
199 TRACE_LOG("PDFParser::ParseEOFLine, Couldn't find tokens in file");
200 return PDFHummus::eFailure;
201 }
202 }
203
GetCurrentPositionFromEnd()204 LongBufferSizeType PDFParser::GetCurrentPositionFromEnd()
205 {
206 return mLastReadPositionFromEnd-(mCurrentBufferIndex-mLinesBuffer);
207 }
208
GoBackTillToken()209 bool PDFParser::GoBackTillToken()
210 {
211 Byte buffer;
212 bool foundToken = false;
213
214 while(ReadBack(buffer))
215 {
216 if(!IsPDFWhiteSpace(buffer))
217 {
218 foundToken = true;
219 break;
220 }
221 }
222 return foundToken;
223 }
224
GoBackTillNonToken()225 bool PDFParser::GoBackTillNonToken()
226 {
227 Byte buffer;
228 bool foundNonToken = false;
229
230 while(ReadBack(buffer))
231 {
232 if(IsPDFWhiteSpace(buffer))
233 {
234 foundNonToken = true;
235 break;
236 }
237 }
238 return foundNonToken;
239 }
240
241 static const Byte scWhiteSpaces[] = {0,0x9,0xA,0xC,0xD,0x20};
IsPDFWhiteSpace(Byte inCharacter)242 bool PDFParser::IsPDFWhiteSpace(Byte inCharacter)
243 {
244 bool isWhiteSpace = false;
245 for(int i=0; i < 6 && !isWhiteSpace; ++i)
246 isWhiteSpace = (scWhiteSpaces[i] == inCharacter);
247 return isWhiteSpace;
248 }
249
250
251 static const char scCR = '\r';
252 static const char scLN = '\n';
GoBackTillLineStart()253 void PDFParser::GoBackTillLineStart()
254 {
255 Byte buffer;
256
257 while(ReadBack(buffer))
258 {
259 if(scLN == buffer || scCR == buffer)
260 break;
261 }
262 }
263
ReadBack(Byte & outValue)264 bool PDFParser::ReadBack(Byte& outValue)
265 {
266 if(IsBeginOfFile())
267 return false;
268
269 if(mCurrentBufferIndex > mLinesBuffer)
270 {
271 --mCurrentBufferIndex;
272 outValue = *mCurrentBufferIndex;
273 return true;
274 }
275 else
276 {
277 ReadNextBufferFromEnd(); // must be able to read...but could be 0 bytes
278 if(mCurrentBufferIndex > mLinesBuffer)
279 {
280 --mCurrentBufferIndex;
281 outValue = *mCurrentBufferIndex;
282 return true;
283 }
284 else
285 return false;
286 }
287 }
288
ReadNextBufferFromEnd()289 bool PDFParser::ReadNextBufferFromEnd()
290 {
291 if(mEncounteredFileStart)
292 {
293 return false;
294 }
295 else
296 {
297 mStream->SetPositionFromEnd(mLastReadPositionFromEnd + LINE_BUFFER_SIZE);
298 LongBufferSizeType readAmount = mStream->Read(mLinesBuffer,LINE_BUFFER_SIZE);
299 if(0 == readAmount)
300 return false;
301 mLastAvailableIndex = mLinesBuffer + readAmount;
302 mCurrentBufferIndex = mLastAvailableIndex;
303 mLastReadPositionFromEnd+= readAmount;
304 mEncounteredFileStart = readAmount < LINE_BUFFER_SIZE;
305 return true;
306 }
307 }
308
IsBeginOfFile()309 bool PDFParser::IsBeginOfFile()
310 {
311 return mEncounteredFileStart && (mCurrentBufferIndex == mLinesBuffer);
312 }
313
314 static const std::string scStartxref = "startxref";
ParseLastXrefPosition()315 EStatusCode PDFParser::ParseLastXrefPosition()
316 {
317 EStatusCode status = PDFHummus::eSuccess;
318
319 // next two lines should be the xref position and then "startxref"
320
321 do
322 {
323
324 // find and read xref position
325 if(!GoBackTillToken())
326 {
327 status = PDFHummus::eFailure;
328 TRACE_LOG("PDFParser::ParseXrefPosition, couldn't find xref position token");
329 break;
330 }
331
332 GoBackTillLineStart();
333
334 // now go forward, and here i'm guessing a bit, till you get to either and integer, or the startxref keyword
335 mStream->SetPositionFromEnd(GetCurrentPositionFromEnd());
336
337 mObjectParser.ResetReadState();
338 RefCountPtr<PDFObject> anObject(mObjectParser.ParseNewObject(mParserExtender));
339
340 if(anObject->GetType() == PDFObject::ePDFObjectInteger)
341 {
342 mLastXrefPosition = (LongFilePositionType)((PDFInteger*)anObject.GetPtr())->GetValue();
343
344 // find and read startxref keyword
345 if(!GoBackTillToken())
346 {
347 status = PDFHummus::eFailure;
348 TRACE_LOG("PDFParser::ParseXrefPosition, couldn't find startxref keyword");
349 break;
350 }
351
352 GoBackTillLineStart();
353 mStream->SetPositionFromEnd(GetCurrentPositionFromEnd());
354
355 mObjectParser.ResetReadState();
356 PDFObjectCastPtr<PDFSymbol> startxRef(mObjectParser.ParseNewObject(mParserExtender));
357
358 if(!startxRef || startxRef->GetValue() != scStartxref)
359 {
360 status = PDFHummus::eFailure;
361 TRACE_LOG("PDFParser::ParseXrefPosition, syntax error in reading xref position");
362 break;
363 }
364 }
365 else // this means that the line is not only integer, a bit more complicated path, look for startxref and then the next would be the number
366 {
367 bool foundStartXref = (anObject->GetType() == PDFObject::ePDFObjectSymbol) && (((PDFSymbol*)anObject.GetPtr())->GetValue() == scStartxref);
368
369 while(!foundStartXref && mStream->NotEnded())
370 {
371 PDFObjectCastPtr<PDFSymbol> startxRef(mObjectParser.ParseNewObject(mParserExtender));
372 foundStartXref = startxRef.GetPtr() && (startxRef->GetValue() == scStartxref);
373 }
374
375 if(!foundStartXref)
376 {
377 status = PDFHummus::eFailure;
378 TRACE_LOG("PDFParser::ParseXrefPosition, could not find startxref keyword");
379 break;
380 }
381
382 PDFObjectCastPtr<PDFInteger> xrefPosition(mObjectParser.ParseNewObject(mParserExtender));
383 if(!xrefPosition)
384 {
385 status = PDFHummus::eFailure;
386 TRACE_LOG("PDFParser::ParseXrefPosition, syntax error in reading xref position");
387 break;
388 }
389
390 mLastXrefPosition = xrefPosition->GetValue();
391 }
392
393 }while(false);
394
395 return status;
396
397 }
398
399 static const std::string scTrailer = "trailer";
ParseTrailerDictionary()400 EStatusCode PDFParser::ParseTrailerDictionary()
401 {
402
403 EStatusCode status = PDFHummus::eSuccess;
404 bool foundTrailer = false;
405
406 do
407 {
408 PDFParserTokenizer aTokenizer;
409 aTokenizer.SetReadStream(mStream);
410
411 do
412 {
413 BoolAndString token = aTokenizer.GetNextToken();
414 if(!token.first)
415 break;
416 foundTrailer = (scTrailer == token.second);
417 }while(!foundTrailer);
418
419
420 if(!foundTrailer)
421 {
422 status = PDFHummus::eFailure;
423 TRACE_LOG("PDFParser::ParseTrailerDictionary, trailer not found...");
424 break;
425 }
426
427 // k. now that all is well, just parse the damn dictionary, which is actually...the easiest part.
428 mObjectParser.ResetReadState();
429 PDFObjectCastPtr<PDFDictionary> dictionaryObject(mObjectParser.ParseNewObject(mParserExtender));
430 if(!dictionaryObject)
431 {
432 status = PDFHummus::eFailure;
433 TRACE_LOG("PDFParser::ParseTrailerDictionary, failure to parse trailer dictionary");
434 break;
435 }
436
437 mTrailer = dictionaryObject;
438 }while(false);
439
440 return status;
441 }
442
BuildXrefTableFromTable()443 EStatusCode PDFParser::BuildXrefTableFromTable()
444 {
445 EStatusCode status;
446
447 do
448 {
449 status = DetermineXrefSize();
450 if(status != PDFHummus::eSuccess)
451 break;
452
453 status = InitializeXref();
454 if(status != PDFHummus::eSuccess)
455 break;
456
457 if(mTrailer->Exists("Prev"))
458 {
459 status = ParsePreviousXrefs(mTrailer.GetPtr());
460 if(status != PDFHummus::eSuccess)
461 break;
462 }
463
464 XrefEntryInput* extendedTable = NULL;
465 ObjectIDType extendedTableSize;
466 status = ParseXrefFromXrefTable(mXrefTable,mXrefSize,mLastXrefPosition,&extendedTable,&extendedTableSize);
467 if(status != PDFHummus::eSuccess)
468 break;
469
470 // Table may have been extended, in which case replace the pointer and current size
471 if(extendedTable)
472 {
473 mXrefSize = extendedTableSize;
474 delete[] mXrefTable;
475 mXrefTable = extendedTable;
476 }
477
478 // For hybrids, check also XRefStm entry
479 PDFObjectCastPtr<PDFInteger> xrefStmReference(mTrailer->QueryDirectObject("XRefStm"));
480 if(!xrefStmReference)
481 break;
482 // if exists, merge update xref
483 status = ParseXrefFromXrefStream(mXrefTable,mXrefSize,xrefStmReference->GetValue(),&extendedTable,&extendedTableSize);
484 if(status != PDFHummus::eSuccess)
485 {
486 TRACE_LOG("PDFParser::ParseDirectory, failure to parse xref in hybrid mode");
487 break;
488 }
489 if(extendedTable)
490 {
491 mXrefSize = extendedTableSize;
492 delete[] mXrefTable;
493 mXrefTable = extendedTable;
494 }
495 }while(false);
496
497 return status;
498 }
499
DetermineXrefSize()500 EStatusCode PDFParser::DetermineXrefSize()
501 {
502 PDFObjectCastPtr<PDFInteger> aSize(mTrailer->QueryDirectObject("Size"));
503
504 if(!aSize)
505 {
506 return PDFHummus::eFailure;
507 }
508 else
509 {
510 mXrefSize = (ObjectIDType)aSize->GetValue();
511 return PDFHummus::eSuccess;
512 }
513 }
514
InitializeXref()515 EStatusCode PDFParser::InitializeXref()
516 {
517 mXrefTable = new XrefEntryInput[mXrefSize];
518 return PDFHummus::eSuccess;
519 }
520
521 typedef BoxingBaseWithRW<ObjectIDType> ObjectIDTypeBox;
522 typedef BoxingBaseWithRW<unsigned long> ULong;
523 typedef BoxingBaseWithRW<LongFilePositionType> LongFilePositionTypeBox;
524
525 static const std::string scXref = "xref";
ParseXrefFromXrefTable(XrefEntryInput * inXrefTable,ObjectIDType inXrefSize,LongFilePositionType inXrefPosition,XrefEntryInput ** outExtendedTable,ObjectIDType * outExtendedTableSize)526 EStatusCode PDFParser::ParseXrefFromXrefTable(XrefEntryInput* inXrefTable,
527 ObjectIDType inXrefSize,
528 LongFilePositionType inXrefPosition,
529 XrefEntryInput** outExtendedTable,
530 ObjectIDType* outExtendedTableSize)
531 {
532 // K. cross ref starts at xref position
533 // and ends with trailer (or when exahausted the number of objects...whichever first)
534 // i'm gonna tokanize them, for easier reading
535 PDFParserTokenizer tokenizer;
536 BoolAndString token;
537 EStatusCode status = PDFHummus::eSuccess;
538 ObjectIDType firstNonSectionObject;
539 Byte entry[20];
540
541 *outExtendedTable = NULL;
542
543 tokenizer.SetReadStream(mStream);
544 MovePositionInStream(inXrefPosition);
545
546 // Note that at times, the xref is being read "on empty". meaning - entries will be read but they will not affect the actual xref.
547 // This is done because final xref might be smaller than the prev xrefs, and i'm only interested in objects that are in the final xref.
548 // That said - i still want to be in the position of the trailer after this function is being executed.
549
550 do
551 {
552 // first token must be "xref", so just verify
553 token = tokenizer.GetNextToken();
554 if(!token.first || token.second != scXref)
555 {
556 TRACE_LOG1("PDFParser::ParseXref, error in parsing xref, expected to find \"xref\" keyword, found = %s",token.second.c_str());
557 status = PDFHummus::eFailure;
558 break;
559 }
560
561 ObjectIDType currentObject = 0;
562
563 while(PDFHummus::eSuccess == status)
564 {
565 token = tokenizer.GetNextToken();
566 if(!token.first)
567 {
568 TRACE_LOG("PDFParser::ParseXref, failed to read tokens, while reading xref");
569 status = PDFHummus::eFailure;
570 break;
571 }
572
573 // token may be either start of section or "trailer"
574 if(scTrailer == token.second)
575 break;
576
577 currentObject = ObjectIDTypeBox(token.second);
578 token = tokenizer.GetNextToken();
579 if(!token.first)
580 {
581 TRACE_LOG("PDFParser::ParseXref, unable to read section size, while reading xref");
582 status = PDFHummus::eFailure;
583 break;
584 }
585 if(ObjectIDTypeBox(token.second) == 0)
586 continue; // probably will never happen
587 firstNonSectionObject = currentObject + ObjectIDTypeBox(token.second);
588
589 // if the segment declared objects above the xref size, consult policy on what to do
590 if(firstNonSectionObject > inXrefSize && mAllowExtendingSegments)
591 {
592 inXrefTable = ExtendXrefTableToSize(inXrefTable,inXrefSize,firstNonSectionObject);
593 inXrefSize = firstNonSectionObject;
594 if(*outExtendedTable)
595 delete[] *outExtendedTable;
596 *outExtendedTable = inXrefTable;
597 *outExtendedTableSize = firstNonSectionObject;
598 }
599
600
601 // first row...not sure where starts...so skip till passing all endlines
602 do
603 {
604 if(mStream->Read(entry,1) != 1)
605 {
606 TRACE_LOG("PDFParser::ParseXref, failed to read xref entry");
607 status = PDFHummus::eFailure;
608 break;
609 }
610 }while(IsPDFWhiteSpace(entry[0]));
611
612 // now read extra 19
613 if(mStream->Read(entry+1,19) != 19)
614 {
615 TRACE_LOG("PDFParser::ParseXref, failed to read xref entry");
616 status = PDFHummus::eFailure;
617 break;
618 }
619 if(currentObject < inXrefSize)
620 {
621 inXrefTable[currentObject].mObjectPosition = LongFilePositionTypeBox((const char*)entry);
622 inXrefTable[currentObject].mRivision = ULong((const char*)(entry+11));
623 inXrefTable[currentObject].mType = entry[17] == 'n' ? eXrefEntryExisting:eXrefEntryDelete;
624 }
625 ++currentObject;
626
627
628
629 // now parse the section.
630 while(currentObject < firstNonSectionObject)
631 {
632 if(mStream->Read(entry,20) != 20)
633 {
634 TRACE_LOG("PDFParser::ParseXref, failed to read xref entry");
635 status = PDFHummus::eFailure;
636 break;
637 }
638 if(currentObject < inXrefSize)
639 {
640 inXrefTable[currentObject].mObjectPosition = LongFilePositionTypeBox((const char*)entry);
641 inXrefTable[currentObject].mRivision = ULong((const char*)(entry+11));
642 inXrefTable[currentObject].mType = entry[17] == 'n' ? eXrefEntryExisting:eXrefEntryDelete;
643 }
644 ++currentObject;
645 }
646 }
647 if(status != PDFHummus::eSuccess)
648 break;
649
650 }while(false);
651 mObjectParser.ResetReadState(); // cause read without consulting with the tokenizer...so now there shouldn't be available tokens
652
653 return status;
654 }
655
ExtendXrefTableToSize(XrefEntryInput * inXrefTable,ObjectIDType inOldSize,ObjectIDType inNewSize)656 XrefEntryInput* PDFParser::ExtendXrefTableToSize(XrefEntryInput* inXrefTable,ObjectIDType inOldSize,ObjectIDType inNewSize)
657 {
658 XrefEntryInput* newTable = new XrefEntryInput[inNewSize];
659
660 for(ObjectIDType i = 0; i < inOldSize; ++i)
661 newTable[i] = inXrefTable[i];
662 return newTable;
663 }
664
GetTrailer()665 PDFDictionary* PDFParser::GetTrailer()
666 {
667 return mTrailer.GetPtr();
668 }
669
GetPDFLevel()670 double PDFParser::GetPDFLevel()
671 {
672 return mPDFLevel;
673 }
674
ParseNewObject(ObjectIDType inObjectId)675 PDFObject* PDFParser::ParseNewObject(ObjectIDType inObjectId)
676 {
677 if(inObjectId >= mXrefSize)
678 {
679 return NULL;
680 }
681 else if(eXrefEntryExisting == mXrefTable[inObjectId].mType)
682 {
683 return ParseExistingInDirectObject(inObjectId);
684 }
685 else if(eXrefEntryStreamObject == mXrefTable[inObjectId].mType)
686 {
687 return ParseExistingInDirectStreamObject(inObjectId);
688 }
689 else
690 return NULL;
691 }
692
GetObjectsCount()693 ObjectIDType PDFParser::GetObjectsCount()
694 {
695 return mXrefSize;
696 }
697
698 static const std::string scObj = "obj";
ParseExistingInDirectObject(ObjectIDType inObjectID)699 PDFObject* PDFParser::ParseExistingInDirectObject(ObjectIDType inObjectID)
700 {
701 PDFObject* readObject = NULL;
702
703 MovePositionInStream(mXrefTable[inObjectID].mObjectPosition);
704
705 do
706 {
707 // should get us to the ObjectNumber ObjectVersion obj section
708 // verify that it's good and if so continue to parse the object itself
709
710 // verify object ID
711 PDFObjectCastPtr<PDFInteger> idObject(mObjectParser.ParseNewObject(mParserExtender));
712
713 if(!idObject)
714 {
715 TRACE_LOG("PDFParser::ParseExistingInDirectObject, failed to read object declaration, ID");
716 break;
717 }
718
719 if((ObjectIDType)idObject->GetValue() != inObjectID)
720 {
721 TRACE_LOG2("PDFParser::ParseExistingInDirectObject, failed to read object declaration, exepected ID = %ld, found %ld",
722 inObjectID,idObject->GetValue());
723 break;
724 }
725
726 // verify object Version
727 PDFObjectCastPtr<PDFInteger> versionObject(mObjectParser.ParseNewObject(mParserExtender));
728
729 if(!versionObject)
730 {
731 TRACE_LOG("PDFParser::ParseExistingInDirectObject, failed to read object declaration, Version");
732 break;
733 }
734
735 if(mParserExtender)
736 mParserExtender->OnObjectStart(inObjectID,versionObject->GetValue());
737
738 if((unsigned long)versionObject->GetValue() != mXrefTable[inObjectID].mRivision)
739 {
740 TRACE_LOG2("PDFParser::ParseExistingInDirectObject, failed to read object declaration, exepected version = %ld, found %ld",
741 mXrefTable[inObjectID].mRivision,versionObject->GetValue());
742 break;
743 }
744
745 // now the obj keyword
746 PDFObjectCastPtr<PDFSymbol> objKeyword(mObjectParser.ParseNewObject(mParserExtender));
747
748 if(!objKeyword)
749 {
750 TRACE_LOG("PDFParser::ParseExistingInDirectObject, failed to read object declaration, obj keyword");
751 break;
752 }
753
754 if(objKeyword->GetValue() != scObj)
755 {
756 TRACE_LOG1("PDFParser::ParseExistingInDirectObject, failed to read object declaration, expected obj keyword found %s",
757 objKeyword->GetValue().c_str());
758 break;
759 }
760
761 readObject = mObjectParser.ParseNewObject(mParserExtender);
762
763 if(mParserExtender)
764 mParserExtender->OnObjectEnd(readObject);
765 }while(false);
766
767 return readObject;
768 }
769
ParsePagesObjectIDs()770 EStatusCode PDFParser::ParsePagesObjectIDs()
771 {
772 EStatusCode status = PDFHummus::eSuccess;
773
774 // m.k plan is to look for the catalog, then find the pages, then initialize the array to the count at the root, and then just recursively loop
775 // the pages by order of pages and fill up the IDs. easy.
776
777 do
778 {
779 // get catalogue, verify indirect reference
780 PDFObjectCastPtr<PDFIndirectObjectReference> catalogReference(mTrailer->QueryDirectObject("Root"));
781 if(!catalogReference)
782 {
783 TRACE_LOG("PDFParser::ParsePagesObjectIDs, failed to read catalog reference in trailer");
784 status = PDFHummus::eFailure;
785 break;
786 }
787
788 PDFObjectCastPtr<PDFDictionary> catalog(ParseNewObject(catalogReference->mObjectID));
789 if(!catalog)
790 {
791 TRACE_LOG("PDFParser::ParsePagesObjectIDs, failed to read catalog");
792 status = PDFHummus::eFailure;
793 break;
794 }
795
796 // get pages, verify indirect reference
797 PDFObjectCastPtr<PDFIndirectObjectReference> pagesReference(catalog->QueryDirectObject("Pages"));
798 if(!pagesReference)
799 {
800 TRACE_LOG("PDFParser::ParsePagesObjectIDs, failed to read pages reference in catalog");
801 status = PDFHummus::eFailure;
802 break;
803 }
804
805 PDFObjectCastPtr<PDFDictionary> pages(ParseNewObject(pagesReference->mObjectID));
806 if(!pages)
807 {
808 TRACE_LOG("PDFParser::ParsePagesObjectIDs, failed to read pages");
809 status = PDFHummus::eFailure;
810 break;
811 }
812
813 PDFObjectCastPtr<PDFInteger> totalPagesCount(QueryDictionaryObject(pages.GetPtr(),"Count"));
814 if(!totalPagesCount)
815 {
816 TRACE_LOG("PDFParser::ParsePagesObjectIDs, failed to read pages count");
817 status = PDFHummus::eFailure;
818 break;
819 }
820
821 mPagesCount = (unsigned long)totalPagesCount->GetValue();
822 mPagesObjectIDs = new ObjectIDType[mPagesCount];
823
824 // now iterate through pages objects, and fill up the IDs [don't really need the object ID for the root pages tree...but whatever
825 status = ParsePagesIDs(pages.GetPtr(),pagesReference->mObjectID);
826
827 }while(false);
828
829 return status;
830 }
831
ParsePagesIDs(PDFDictionary * inPageNode,ObjectIDType inNodeObjectID)832 EStatusCode PDFParser::ParsePagesIDs(PDFDictionary* inPageNode,ObjectIDType inNodeObjectID)
833 {
834 unsigned long currentPageIndex = 0;
835
836 return ParsePagesIDs(inPageNode,inNodeObjectID,currentPageIndex);
837 }
838
839 static const std::string scPage = "Page";
840 static const std::string scPages = "Pages";
ParsePagesIDs(PDFDictionary * inPageNode,ObjectIDType inNodeObjectID,unsigned long & ioCurrentPageIndex)841 EStatusCode PDFParser::ParsePagesIDs(PDFDictionary* inPageNode,ObjectIDType inNodeObjectID,unsigned long& ioCurrentPageIndex)
842 {
843 // recursion.
844 // if this is a page, write it's node object ID in the current page index and +1
845 // if this is a pagetree, loop it's kids, for each parsing the kid, running the recursion on it, and deleting
846
847 EStatusCode status = PDFHummus::eSuccess;
848
849 do
850 {
851 PDFObjectCastPtr<PDFName> objectType(inPageNode->QueryDirectObject("Type"));
852 if(!objectType)
853 {
854 TRACE_LOG("PDFParser::ParsePagesIDs, can't read object type");
855 status = PDFHummus::eFailure;
856 break;
857 }
858
859 if(scPage == objectType->GetValue())
860 {
861 // a Page
862 if(ioCurrentPageIndex >= mPagesCount)
863 {
864 TRACE_LOG("PDFParser::ParsePagesIDs, there are more pages than the page count specifies. fail.");
865 status = PDFHummus::eFailure;
866 break;
867 }
868
869 mPagesObjectIDs[ioCurrentPageIndex] = inNodeObjectID;
870 ++ioCurrentPageIndex;
871 }
872 else if(scPages == objectType->GetValue())
873 {
874 // a Page tree node
875 PDFObjectCastPtr<PDFArray> kidsObject(inPageNode->QueryDirectObject("Kids"));
876 if(!kidsObject)
877 {
878 TRACE_LOG("PDFParser::ParsePagesIDs, unable to find page kids array");
879 status = PDFHummus::eFailure;
880 break;
881 }
882
883 SingleValueContainerIterator<PDFObjectVector> it = kidsObject->GetIterator();
884
885 while(it.MoveNext() && PDFHummus::eSuccess == status)
886 {
887 if(it.GetItem()->GetType() != PDFObject::ePDFObjectIndirectObjectReference)
888 {
889 TRACE_LOG1("PDFParser::ParsePagesIDs, unexpected type for a Kids array object, type = %s",PDFObject::scPDFObjectTypeLabel[it.GetItem()->GetType()]);
890 status = PDFHummus::eFailure;
891 break;
892 }
893
894 PDFObjectCastPtr<PDFDictionary> pageNodeObject(ParseNewObject(((PDFIndirectObjectReference*)it.GetItem())->mObjectID));
895 if(!pageNodeObject)
896 {
897 TRACE_LOG("PDFParser::ParsePagesIDs, unable to parse page node object from kids reference");
898 status = PDFHummus::eFailure;
899 break;
900 }
901
902 status = ParsePagesIDs(pageNodeObject.GetPtr(),((PDFIndirectObjectReference*)it.GetItem())->mObjectID,ioCurrentPageIndex);
903 }
904 }
905 else
906 {
907 TRACE_LOG1("PDFParser::ParsePagesIDs, unexpected object type. should be either Page or Pages, found %s",objectType->GetValue().c_str());
908 status = PDFHummus::eFailure;
909 break;
910 }
911 }while(false);
912
913 return status;
914 }
915
GetPagesCount()916 unsigned long PDFParser::GetPagesCount()
917 {
918 return mPagesCount;
919 }
920
GetPageObjectID(unsigned long inPageIndex)921 ObjectIDType PDFParser::GetPageObjectID(unsigned long inPageIndex)
922 {
923 if(mPagesCount <= inPageIndex)
924 return 0;
925
926 return mPagesObjectIDs[inPageIndex];
927 }
928
929
ParsePage(unsigned long inPageIndex)930 PDFDictionary* PDFParser::ParsePage(unsigned long inPageIndex)
931 {
932 if(mPagesCount <= inPageIndex)
933 return NULL;
934
935 PDFObjectCastPtr<PDFDictionary> pageObject(ParseNewObject(mPagesObjectIDs[inPageIndex]));
936
937 if(!pageObject)
938 {
939 TRACE_LOG1("PDFParser::ParsePage, couldn't find page object for index %ld",inPageIndex);
940 return NULL;
941 }
942
943 PDFObjectCastPtr<PDFName> objectType(pageObject->QueryDirectObject("Type"));
944
945 if(scPage == objectType->GetValue())
946 {
947 pageObject->AddRef();
948 return pageObject.GetPtr();
949 }
950 else
951 {
952 TRACE_LOG1("PDFParser::ParsePage, page object listed in page array for %ld is actually not a page",inPageIndex);
953 return NULL;
954 }
955 }
956
QueryDictionaryObject(PDFDictionary * inDictionary,const std::string & inName)957 PDFObject* PDFParser::QueryDictionaryObject(PDFDictionary* inDictionary,const std::string& inName)
958 {
959 RefCountPtr<PDFObject> anObject(inDictionary->QueryDirectObject(inName));
960
961 if(anObject.GetPtr() == NULL)
962 return NULL;
963
964 if(anObject->GetType() == PDFObject::ePDFObjectIndirectObjectReference)
965 {
966 PDFObject* theActualObject = ParseNewObject(((PDFIndirectObjectReference*)anObject.GetPtr())->mObjectID);
967 return theActualObject;
968 }
969 else
970 {
971 anObject->AddRef(); // adding ref to increase owners
972 return anObject.GetPtr();
973 }
974 }
975
QueryArrayObject(PDFArray * inArray,unsigned long inIndex)976 PDFObject* PDFParser::QueryArrayObject(PDFArray* inArray,unsigned long inIndex)
977 {
978 RefCountPtr<PDFObject> anObject(inArray->QueryObject(inIndex));
979
980 if(anObject.GetPtr() == NULL)
981 return NULL;
982
983 if(anObject->GetType() == PDFObject::ePDFObjectIndirectObjectReference)
984 {
985 PDFObject* theActualObject = ParseNewObject(((PDFIndirectObjectReference*)anObject.GetPtr())->mObjectID);
986 return theActualObject;
987 }
988 else
989 {
990 anObject->AddRef(); // adding ref to increase owners
991 return anObject.GetPtr();
992 }
993
994 }
995
ParsePreviousXrefs(PDFDictionary * inTrailer)996 EStatusCode PDFParser::ParsePreviousXrefs(PDFDictionary* inTrailer)
997 {
998 PDFObjectCastPtr<PDFInteger> previousPosition(inTrailer->QueryDirectObject("Prev"));
999 if(!previousPosition)
1000 {
1001 TRACE_LOG("PDFParser::ParsePreviousXrefs, unexpected, prev is not integer");
1002 return PDFHummus::eFailure;
1003 }
1004
1005 EStatusCode status;
1006
1007 XrefEntryInput* aTable = new XrefEntryInput[mXrefSize];
1008 do
1009 {
1010 PDFDictionary* trailerP = NULL;
1011
1012 XrefEntryInput* extendedTable = NULL;
1013 ObjectIDType extendedTableSize;
1014 status = ParseDirectory(previousPosition->GetValue(),aTable,mXrefSize,&trailerP,&extendedTable,&extendedTableSize);
1015 if(status != PDFHummus::eSuccess)
1016 break;
1017 RefCountPtr<PDFDictionary> trailer(trailerP);
1018
1019 if(trailer->Exists("Prev"))
1020 {
1021 status = ParsePreviousXrefs(trailer.GetPtr());
1022 if(status != PDFHummus::eSuccess)
1023 break;
1024 }
1025
1026
1027 // Table may have been extended, in which case replace the pointer and current size
1028 ObjectIDType newTableSize;
1029 if(extendedTable)
1030 {
1031 newTableSize = extendedTableSize;
1032 delete[] aTable;
1033 aTable = extendedTable;
1034 }
1035 else
1036 newTableSize = mXrefSize;
1037 MergeXrefWithMainXref(aTable,newTableSize);
1038 }
1039 while(false);
1040
1041 delete[] aTable;
1042 return status;
1043 }
1044
ParseDirectory(LongFilePositionType inXrefPosition,XrefEntryInput * inXrefTable,ObjectIDType inXrefSize,PDFDictionary ** outTrailer,XrefEntryInput ** outExtendedTable,ObjectIDType * outExtendedTableSize)1045 EStatusCode PDFParser::ParseDirectory(LongFilePositionType inXrefPosition,
1046 XrefEntryInput* inXrefTable,
1047 ObjectIDType inXrefSize,
1048 PDFDictionary** outTrailer,
1049 XrefEntryInput** outExtendedTable,
1050 ObjectIDType* outExtendedTableSize)
1051 {
1052 EStatusCode status = PDFHummus::eSuccess;
1053
1054 MovePositionInStream(inXrefPosition);
1055
1056 do
1057 {
1058 // take the object, so that we can check whether this is an Xref or an Xref stream
1059 RefCountPtr<PDFObject> anObject(mObjectParser.ParseNewObject(mParserExtender));
1060 if(!anObject)
1061 {
1062 status = PDFHummus::eFailure;
1063 break;
1064 }
1065
1066 if(anObject->GetType() == PDFObject::ePDFObjectSymbol && ((PDFSymbol*)anObject.GetPtr())->GetValue() == scXref)
1067 {
1068 // This is the case of a regular xref table. note that as oppose to the main trailer case
1069 // i already have a limit of Xrefsize (which is determined by the main trailer Size entry)
1070 // so i don't have to parse the trailer in advance, but rather just read the file in the natural order:
1071 // first - the xref then the trailer.
1072 status = ParseXrefFromXrefTable(inXrefTable,inXrefSize,inXrefPosition,outExtendedTable,outExtendedTableSize);
1073 if(status != PDFHummus::eSuccess)
1074 {
1075 TRACE_LOG1("PDFParser::ParseDirectory, failed to parse xref table in %ld",inXrefPosition);
1076 break;
1077 }
1078
1079 if(outExtendedTable)
1080 {
1081 inXrefTable = *outExtendedTable;
1082 inXrefSize = *outExtendedTableSize;
1083 }
1084
1085 // at this point we should be after the token of the "trailer"
1086 PDFObjectCastPtr<PDFDictionary> trailerDictionary(mObjectParser.ParseNewObject(mParserExtender));
1087 if(!trailerDictionary)
1088 {
1089 status = PDFHummus::eFailure;
1090 TRACE_LOG("PDFParser::ParseDirectory, failure to parse trailer dictionary");
1091 break;
1092 }
1093
1094 // For hybrids, check also XRefStm entry
1095 PDFObjectCastPtr<PDFInteger> xrefStmReference(trailerDictionary->QueryDirectObject("XRefStm"));
1096 if(xrefStmReference.GetPtr())
1097 {
1098 // if exists, merge update xref
1099 status = ParseXrefFromXrefStream(inXrefTable,inXrefSize,xrefStmReference->GetValue(),outExtendedTable,outExtendedTableSize);
1100 if(status != PDFHummus::eSuccess)
1101 {
1102 TRACE_LOG("PDFParser::ParseDirectory, failure to parse xref in hybrid mode");
1103 break;
1104 }
1105 }
1106
1107 trailerDictionary->AddRef();
1108 *outTrailer = trailerDictionary.GetPtr();
1109 }
1110 else if(anObject->GetType() == PDFObject::ePDFObjectInteger && ((PDFInteger*)anObject.GetPtr())->GetValue() > 0)
1111 {
1112 // Xref stream case. make some validations, grab the xref stream object details, and parse it
1113
1114 PDFObjectCastPtr<PDFInteger> versionObject(mObjectParser.ParseNewObject(mParserExtender));
1115
1116 if(!versionObject)
1117 {
1118 TRACE_LOG("PDFParser::ParseDirectory, failed to read xref object declaration, Version");
1119 status = PDFHummus::eFailure;
1120 break;
1121 }
1122
1123 if(mParserExtender)
1124 mParserExtender->OnObjectStart(((PDFInteger*)anObject.GetPtr())->GetValue(),versionObject->GetValue());
1125
1126
1127 PDFObjectCastPtr<PDFSymbol> objKeyword(mObjectParser.ParseNewObject(mParserExtender));
1128
1129 if(!objKeyword)
1130 {
1131 TRACE_LOG("PDFParser::ParseDirectory, failed to read xref object declaration, obj keyword");
1132 status = PDFHummus::eFailure;
1133 break;
1134 }
1135
1136 if(objKeyword->GetValue() != scObj)
1137 {
1138 TRACE_LOG1("PDFParser::ParseDirectory, failed to read xref object declaration, expected obj keyword found %s",
1139 objKeyword->GetValue().c_str());
1140 status = PDFHummus::eFailure;
1141 break;
1142 }
1143
1144 PDFObjectCastPtr<PDFStreamInput> xrefStream(mObjectParser.ParseNewObject(mParserExtender));
1145 if(!xrefStream)
1146 {
1147 TRACE_LOG("PDFParser::BuildXrefTableAndTrailerFromXrefStream, failure to parse xref stream");
1148 status = PDFHummus::eFailure;
1149 break;
1150 }
1151
1152 if(mParserExtender)
1153 mParserExtender->OnObjectEnd(xrefStream.GetPtr());
1154
1155 *outTrailer = xrefStream->QueryStreamDictionary();
1156
1157 status = ParseXrefFromXrefStream(inXrefTable,inXrefSize,xrefStream.GetPtr(),outExtendedTable,outExtendedTableSize);
1158 if(status != PDFHummus::eSuccess)
1159 break;
1160 }
1161 else
1162 {
1163 TRACE_LOG("PDFParser::ParseDirectory,Unexpected object at xref start");
1164 status = PDFHummus::eFailure;
1165 }
1166 }while(false);
1167 return status;
1168 }
1169
MergeXrefWithMainXref(XrefEntryInput * inTableToMerge,ObjectIDType inMergedTableSize)1170 void PDFParser::MergeXrefWithMainXref(XrefEntryInput* inTableToMerge,ObjectIDType inMergedTableSize)
1171 {
1172 if(inMergedTableSize > mXrefSize)
1173 {
1174 XrefEntryInput* newTable = ExtendXrefTableToSize(mXrefTable, mXrefSize, inMergedTableSize);
1175 mXrefSize = inMergedTableSize;
1176 delete[] mXrefTable;
1177 mXrefTable = newTable;
1178 }
1179
1180 for(ObjectIDType i = 0; i < mXrefSize; ++i)
1181 {
1182 if(inTableToMerge[i].mType != eXrefEntryUndefined)
1183 mXrefTable[i] = inTableToMerge[i];
1184 }
1185 }
1186
1187
ParseFileDirectory()1188 EStatusCode PDFParser::ParseFileDirectory()
1189 {
1190 EStatusCode status = PDFHummus::eSuccess;
1191
1192
1193 MovePositionInStream(mLastXrefPosition);
1194
1195 do
1196 {
1197 // take the object, so that we can check whether this is an Xref or an Xref stream
1198 RefCountPtr<PDFObject> anObject(mObjectParser.ParseNewObject(mParserExtender));
1199 if(!anObject)
1200 {
1201 status = PDFHummus::eFailure;
1202 break;
1203 }
1204
1205 if(anObject->GetType() == PDFObject::ePDFObjectSymbol && ((PDFSymbol*)anObject.GetPtr())->GetValue() == scXref)
1206 {
1207 // this would be a normal xref case
1208 // jump lines till you get to a line where the token is "trailer". then parse.
1209 status = ParseTrailerDictionary();
1210 if(status != PDFHummus::eSuccess)
1211 break;
1212
1213 status = BuildXrefTableFromTable();
1214 if(status != PDFHummus::eSuccess)
1215 break;
1216 }
1217 else if(anObject->GetType() == PDFObject::ePDFObjectInteger && ((PDFInteger*)anObject.GetPtr())->GetValue() > 0)
1218 {
1219 // Xref stream case
1220 status = BuildXrefTableAndTrailerFromXrefStream(((PDFInteger*)anObject.GetPtr())->GetValue());
1221 if(status != PDFHummus::eSuccess)
1222 break;
1223
1224 }
1225 else
1226 {
1227 TRACE_LOG("PDFParser::ParseFileDirectory,Unexpected object at xref start");
1228 status = eFailure;
1229 }
1230
1231
1232 }while(false);
1233
1234
1235
1236 return status;
1237 }
1238
BuildXrefTableAndTrailerFromXrefStream(long long inXrefStreamObjectID)1239 EStatusCode PDFParser::BuildXrefTableAndTrailerFromXrefStream(long long inXrefStreamObjectID)
1240 {
1241 // xref stream is trailer and stream togather. need to parse them both.
1242 // the object parser is now after the object ID. so verify that next we goot a version and the obj keyword
1243 // then parse the xref stream
1244 EStatusCode status = PDFHummus::eSuccess;
1245
1246 PDFObjectCastPtr<PDFInteger> versionObject(mObjectParser.ParseNewObject(mParserExtender));
1247
1248 do
1249 {
1250 if(!versionObject)
1251 {
1252 TRACE_LOG("PDFParser::BuildXrefTableAndTrailerFromXrefStream, failed to read xref object declaration, Version");
1253 status = PDFHummus::eFailure;
1254 break;
1255 }
1256
1257
1258 if(mParserExtender)
1259 mParserExtender->OnObjectStart(inXrefStreamObjectID,versionObject->GetValue());
1260
1261 PDFObjectCastPtr<PDFSymbol> objKeyword(mObjectParser.ParseNewObject(mParserExtender));
1262
1263 if(!objKeyword)
1264 {
1265 TRACE_LOG("PDFParser::BuildXrefTableAndTrailerFromXrefStream, failed to read xref object declaration, obj keyword");
1266 status = PDFHummus::eFailure;
1267 break;
1268 }
1269
1270 if(objKeyword->GetValue() != scObj)
1271 {
1272 TRACE_LOG1("PDFParser::BuildXrefTableAndTrailerFromXrefStream, failed to read xref object declaration, expected obj keyword found %s",
1273 objKeyword->GetValue().c_str());
1274 status = PDFHummus::eFailure;
1275 break;
1276 }
1277
1278 // k. now just parse the object which should be a stream
1279 PDFObjectCastPtr<PDFStreamInput> xrefStream(mObjectParser.ParseNewObject(mParserExtender));
1280 if(!xrefStream)
1281 {
1282 TRACE_LOG("PDFParser::BuildXrefTableAndTrailerFromXrefStream, failure to parse xref stream");
1283 status = PDFHummus::eFailure;
1284 break;
1285 }
1286
1287 if(mParserExtender)
1288 mParserExtender->OnObjectEnd(xrefStream.GetPtr());
1289
1290 RefCountPtr<PDFDictionary> xrefDictionary(xrefStream->QueryStreamDictionary());
1291 mTrailer = xrefDictionary;
1292
1293 status = DetermineXrefSize();
1294 if(status != PDFHummus::eSuccess)
1295 break;
1296
1297 status = InitializeXref();
1298 if(status != PDFHummus::eSuccess)
1299 break;
1300
1301 if(mTrailer->Exists("Prev"))
1302 {
1303 status = ParsePreviousXrefs(mTrailer.GetPtr());
1304 if(status != PDFHummus::eSuccess)
1305 break;
1306 }
1307
1308 XrefEntryInput* extendedTable = NULL;
1309 ObjectIDType extendedTableSize;
1310 status = ParseXrefFromXrefStream(mXrefTable,mXrefSize,xrefStream.GetPtr(),&extendedTable,&extendedTableSize);
1311 if(status != PDFHummus::eSuccess)
1312 break;
1313
1314 // Table may have been extended, in which case replace the pointer and current size
1315 if(extendedTable)
1316 {
1317 mXrefSize = extendedTableSize;
1318 delete[] mXrefTable;
1319 mXrefTable = extendedTable;
1320 }
1321
1322 }while(false);
1323
1324 return status;
1325
1326 }
1327
ParseXrefFromXrefStream(XrefEntryInput * inXrefTable,ObjectIDType inXrefSize,LongFilePositionType inXrefPosition,XrefEntryInput ** outExtendedTable,ObjectIDType * outExtendedTableSize)1328 EStatusCode PDFParser::ParseXrefFromXrefStream(XrefEntryInput* inXrefTable,
1329 ObjectIDType inXrefSize,
1330 LongFilePositionType inXrefPosition,
1331 XrefEntryInput** outExtendedTable,
1332 ObjectIDType* outExtendedTableSize)
1333 {
1334 EStatusCode status = PDFHummus::eSuccess;
1335
1336 MovePositionInStream(inXrefPosition);
1337
1338 do
1339 {
1340 // take the object, so that we can check whether this is an Xref or an Xref stream
1341 PDFObjectCastPtr<PDFInteger> anObject(mObjectParser.ParseNewObject(mParserExtender));
1342 if(!anObject || anObject->GetValue() <= 0)
1343 {
1344 TRACE_LOG1("PDFParser::ParseXrefFromXrefStream, expecting object number for xref stream at %ld",inXrefPosition);
1345 status = PDFHummus::eFailure;
1346 break;
1347 }
1348
1349 PDFObjectCastPtr<PDFInteger> versionObject(mObjectParser.ParseNewObject(mParserExtender));
1350
1351 if(!versionObject)
1352 {
1353 TRACE_LOG("PDFParser::ParseXrefFromXrefStream, failed to read xref object declaration, Version");
1354 status = PDFHummus::eFailure;
1355 break;
1356 }
1357
1358 if(mParserExtender)
1359 mParserExtender->OnObjectStart(anObject->GetValue(),versionObject->GetValue());
1360
1361 PDFObjectCastPtr<PDFSymbol> objKeyword(mObjectParser.ParseNewObject(mParserExtender));
1362
1363 if(!objKeyword)
1364 {
1365 TRACE_LOG("PDFParser::ParseXrefFromXrefStream, failed to read xref object declaration, obj keyword");
1366 status = PDFHummus::eFailure;
1367 break;
1368 }
1369
1370 if(objKeyword->GetValue() != scObj)
1371 {
1372 TRACE_LOG1("PDFParser::ParseXrefFromXrefStream, failed to read xref object declaration, expected obj keyword found %s",
1373 objKeyword->GetValue().c_str());
1374 status = PDFHummus::eFailure;
1375 break;
1376 }
1377
1378 PDFObjectCastPtr<PDFStreamInput> xrefStream(mObjectParser.ParseNewObject(mParserExtender));
1379 if(!xrefStream)
1380 {
1381 TRACE_LOG("PDFParser::ParseXrefFromXrefStream, failure to parse xref stream");
1382 status = PDFHummus::eFailure;
1383 break;
1384 }
1385
1386 if(mParserExtender)
1387 mParserExtender->OnObjectEnd(xrefStream.GetPtr());
1388
1389 status = ParseXrefFromXrefStream(inXrefTable,inXrefSize,xrefStream.GetPtr(),outExtendedTable,outExtendedTableSize);
1390 }while(false);
1391 return status;
1392 }
1393
ParseXrefFromXrefStream(XrefEntryInput * inXrefTable,ObjectIDType inXrefSize,PDFStreamInput * inXrefStream,XrefEntryInput ** outExtendedTable,ObjectIDType * outExtendedTableSize)1394 EStatusCode PDFParser::ParseXrefFromXrefStream(XrefEntryInput* inXrefTable,
1395 ObjectIDType inXrefSize,
1396 PDFStreamInput* inXrefStream,
1397 XrefEntryInput** outExtendedTable,
1398 ObjectIDType* outExtendedTableSize)
1399 {
1400 // 1. Setup the stream to read from the stream start location
1401 // 2. Set it up with an input stream to decode if required
1402 // 3. if there are subsections, loop them, otherwise assume a single section of 0..size
1403 // 4. for each subsection use the base number as starting, and count as well, to read the stream entries to the right position in the table
1404 // The entries are read using the "W" value. make sure to read even values that you don't need.
1405
1406 EStatusCode status = PDFHummus::eSuccess;
1407
1408 outExtendedTable = NULL;
1409
1410 IByteReader* xrefStreamSource = CreateInputStreamReader(inXrefStream);
1411 int* widthsArray = NULL;
1412
1413 do
1414 {
1415 if(!xrefStreamSource)
1416 {
1417 status = PDFHummus::eFailure;
1418 break;
1419 }
1420
1421 RefCountPtr<PDFDictionary> streamDictionary(inXrefStream->QueryStreamDictionary());
1422
1423 // setup w array
1424 PDFObjectCastPtr<PDFArray> wArray(QueryDictionaryObject(streamDictionary.GetPtr(),"W"));
1425 if(!wArray)
1426 {
1427 TRACE_LOG("PDFParser::ParseXrefFromXrefStream, W array not available. failing");
1428 status = PDFHummus::eFailure;
1429 break;
1430 }
1431
1432 widthsArray = new int[wArray->GetLength()];
1433 for(unsigned long i=0;i <wArray->GetLength();++i)
1434 {
1435 PDFObjectCastPtr<PDFInteger> widthObject(wArray->QueryObject(i));
1436 if(!widthObject)
1437 {
1438 TRACE_LOG("PDFParser::ParseXrefFromXrefStream, wrong items in width array (supposed to have only integers)");
1439 status = PDFHummus::eFailure;
1440 break;
1441 }
1442 widthsArray[i] = (int)widthObject->GetValue();
1443 }
1444 if(status != PDFHummus::eSuccess)
1445 break;
1446
1447 // read the segments from the stream
1448 PDFObjectCastPtr<PDFArray> subsectionsIndex(QueryDictionaryObject(streamDictionary.GetPtr(),"Index"));
1449 MovePositionInStream(inXrefStream->GetStreamContentStart());
1450
1451 if(!subsectionsIndex)
1452 {
1453 PDFObjectCastPtr<PDFInteger> xrefSize(QueryDictionaryObject(streamDictionary.GetPtr(),"Size"));
1454 if(!xrefSize)
1455 {
1456 TRACE_LOG("PDFParser::ParseXrefFromXrefStream, xref size does not exist for this stream");
1457 status = PDFHummus::eFailure;
1458 break;
1459 }
1460
1461 // if reading objects past expected range interesting consult policy
1462 ObjectIDType readXrefSize = (ObjectIDType)xrefSize->GetValue();
1463 if(readXrefSize > inXrefSize)
1464 {
1465 if(mAllowExtendingSegments)
1466 {
1467 inXrefTable = ExtendXrefTableToSize(inXrefTable,inXrefSize,readXrefSize);
1468 inXrefSize = readXrefSize;
1469 if(outExtendedTable && *outExtendedTable)
1470 delete[] *outExtendedTable;
1471 *outExtendedTable = inXrefTable;
1472 *outExtendedTableSize = readXrefSize;
1473 }
1474 else
1475 break;
1476 }
1477 status = ReadXrefStreamSegment(inXrefTable,0,readXrefSize,xrefStreamSource,widthsArray,wArray->GetLength());
1478 }
1479 else
1480 {
1481 SingleValueContainerIterator<PDFObjectVector> segmentsIterator = subsectionsIndex->GetIterator();
1482 PDFObjectCastPtr<PDFInteger> segmentValue;
1483 while(segmentsIterator.MoveNext() && PDFHummus::eSuccess == status)
1484 {
1485 segmentValue = segmentsIterator.GetItem();
1486 if(!segmentValue)
1487 {
1488 TRACE_LOG("PDFParser::ParseXrefFromXrefStream, found non integer value in Index array of xref stream");
1489 status = PDFHummus::eFailure;
1490 break;
1491 }
1492 ObjectIDType startObject = (ObjectIDType)segmentValue->GetValue();
1493 if(!segmentsIterator.MoveNext())
1494 {
1495 TRACE_LOG("PDFParser::ParseXrefFromXrefStream,Index array of xref stream should have an even number of values");
1496 status = PDFHummus::eFailure;
1497 break;
1498 }
1499
1500 segmentValue = segmentsIterator.GetItem();
1501 if(!segmentValue)
1502 {
1503 TRACE_LOG("PDFParser::ParseXrefFromXrefStream, found non integer value in Index array of xref stream");
1504 status = PDFHummus::eFailure;
1505 break;
1506 }
1507 ObjectIDType objectsCount = (ObjectIDType)segmentValue->GetValue();
1508 // if reading objects past expected range interesting consult policy
1509 if(startObject + objectsCount > inXrefSize)
1510 {
1511 if(mAllowExtendingSegments)
1512 {
1513 inXrefTable = ExtendXrefTableToSize(inXrefTable,inXrefSize,startObject + objectsCount);
1514 inXrefSize = startObject + objectsCount;
1515 if(outExtendedTable && *outExtendedTable)
1516 delete[] *outExtendedTable;
1517 *outExtendedTable = inXrefTable;
1518 *outExtendedTableSize = startObject + objectsCount;
1519 }
1520 else
1521 break;
1522 }
1523 status = ReadXrefStreamSegment(inXrefTable,startObject,std::min<ObjectIDType>(objectsCount,inXrefSize - startObject),xrefStreamSource,widthsArray,wArray->GetLength());
1524 }
1525 }
1526 }while(false);
1527
1528 delete xrefStreamSource;
1529 delete[] widthsArray;
1530 return status;
1531 }
1532
MovePositionInStream(LongFilePositionType inPosition)1533 void PDFParser::MovePositionInStream(LongFilePositionType inPosition)
1534 {
1535 mStream->SetPosition(inPosition);
1536 mObjectParser.ResetReadState();
1537 }
1538
ReadXrefStreamSegment(XrefEntryInput * inXrefTable,ObjectIDType inSegmentStartObject,ObjectIDType inSegmentCount,IByteReader * inReadFrom,int * inEntryWidths,unsigned long inEntryWidthsSize)1539 EStatusCode PDFParser::ReadXrefStreamSegment(XrefEntryInput* inXrefTable,
1540 ObjectIDType inSegmentStartObject,
1541 ObjectIDType inSegmentCount,
1542 IByteReader* inReadFrom,
1543 int* inEntryWidths,
1544 unsigned long inEntryWidthsSize)
1545 {
1546 ObjectIDType objectToRead = inSegmentStartObject;
1547 EStatusCode status = PDFHummus::eSuccess;
1548 if(inEntryWidthsSize != 3)
1549 {
1550 TRACE_LOG("PDFParser::ReadXrefStreamSegment, can handle only 3 length entries");
1551 return PDFHummus::eFailure;
1552 }
1553
1554 // Note - i'm also checking that the stream is not ended. in non-finite segments, it could be that the particular
1555 // stream does no define all objects...just the "updated" ones
1556 for(; (objectToRead < inSegmentStartObject + inSegmentCount) && PDFHummus::eSuccess == status && inReadFrom->NotEnded();++objectToRead)
1557 {
1558 long long entryType;
1559 status = ReadXrefSegmentValue(inReadFrom,inEntryWidths[0],entryType);
1560 if(status != PDFHummus::eSuccess)
1561 break;
1562 status = ReadXrefSegmentValue(inReadFrom,inEntryWidths[1],inXrefTable[objectToRead].mObjectPosition);
1563 if(status != PDFHummus::eSuccess)
1564 break;
1565 status = ReadXrefSegmentValue(inReadFrom,inEntryWidths[2],inXrefTable[objectToRead].mRivision);
1566 if(status != PDFHummus::eSuccess)
1567 break;
1568
1569 if(0 == entryType)
1570 {
1571 inXrefTable[objectToRead].mType = eXrefEntryDelete;
1572 }
1573 else if (1 == entryType)
1574 {
1575 inXrefTable[objectToRead].mType = eXrefEntryExisting;
1576 }
1577 else if(2 == entryType)
1578 {
1579 inXrefTable[objectToRead].mType = eXrefEntryStreamObject;
1580 }
1581 else
1582 {
1583 TRACE_LOG("PDFParser::ReadXrefStreamSegment, unfamiliar entry type. must be either 0,1 or 2");
1584 status = PDFHummus::eFailure;
1585 }
1586 }
1587 return status;
1588 }
1589
ReadXrefSegmentValue(IByteReader * inSource,int inEntrySize,long long & outValue)1590 EStatusCode PDFParser::ReadXrefSegmentValue(IByteReader* inSource,int inEntrySize,long long& outValue)
1591 {
1592 outValue = 0;
1593 Byte buffer;
1594 EStatusCode status = PDFHummus::eSuccess;
1595
1596 for(int i=0;i<inEntrySize && PDFHummus::eSuccess == status;++i)
1597 {
1598 status = (inSource->Read(&buffer,1) == 1 ? PDFHummus::eSuccess : PDFHummus::eFailure);
1599 if(status != PDFHummus::eFailure)
1600 outValue = (outValue<<8) + buffer;
1601 }
1602 return status;
1603 }
1604
ReadXrefSegmentValue(IByteReader * inSource,int inEntrySize,ObjectIDType & outValue)1605 EStatusCode PDFParser::ReadXrefSegmentValue(IByteReader* inSource,int inEntrySize,ObjectIDType& outValue)
1606 {
1607 outValue = 0;
1608 Byte buffer;
1609 EStatusCode status = PDFHummus::eSuccess;
1610
1611 for(int i=0;i<inEntrySize && PDFHummus::eSuccess == status;++i)
1612 {
1613 status = (inSource->Read(&buffer,1) == 1 ? PDFHummus::eSuccess : PDFHummus::eFailure);
1614 if(status != PDFHummus::eFailure)
1615 outValue = (outValue<<8) + buffer;
1616 }
1617 return status;
1618 }
1619
ParseExistingInDirectStreamObject(ObjectIDType inObjectId)1620 PDFObject* PDFParser::ParseExistingInDirectStreamObject(ObjectIDType inObjectId)
1621 {
1622 // parsing an object in an object stream requires the following:
1623 // 1. Setting the position to this object stream
1624 // 2. Reading the stream First and N. store.
1625 // 3. Creating a stream reader for the initial stream position and length, possibly decoding with flate
1626 // 4. Read the stream header. store.
1627 // 5. Jump to the right object position (or decode till its position)
1628 // 6. Read the object
1629
1630 EStatusCode status = PDFHummus::eSuccess;
1631 ObjectStreamHeaderEntry* objectStreamHeader;
1632 IByteReader* objectSource = NULL;
1633
1634 InputStreamSkipperStream skipperStream;
1635 ObjectIDType objectStreamID;
1636 PDFObject* anObject = NULL;
1637
1638 do
1639 {
1640 objectStreamID = (ObjectIDType)mXrefTable[inObjectId].mObjectPosition;
1641 PDFObjectCastPtr<PDFStreamInput> objectStream(ParseNewObject(objectStreamID));
1642 if(!objectStream)
1643 {
1644 TRACE_LOG2("PDFParser::ParseExistingInDirectStreamObject, failed to parse object %ld. failed to find object stream for it, which should be %ld",
1645 inObjectId,mXrefTable[inObjectId].mObjectPosition);
1646 status = PDFHummus::eFailure;
1647 break;
1648 }
1649
1650 RefCountPtr<PDFDictionary> streamDictionary(objectStream->QueryStreamDictionary());
1651
1652 PDFObjectCastPtr<PDFInteger> streamObjectsCount(QueryDictionaryObject(streamDictionary.GetPtr(),"N"));
1653 if(!streamObjectsCount)
1654 {
1655 TRACE_LOG1("PDFParser::ParseExistingInDirectStreamObject, no N key in stream dictionary %ld",objectStreamID);
1656 status = PDFHummus::eFailure;
1657 break;
1658 }
1659 ObjectIDType objectsCount = (ObjectIDType)streamObjectsCount->GetValue();
1660
1661 PDFObjectCastPtr<PDFInteger> firstStreamObjectPosition(QueryDictionaryObject(streamDictionary.GetPtr(),"First"));
1662 if(!streamObjectsCount)
1663 {
1664 TRACE_LOG1("PDFParser::ParseExistingInDirectStreamObject, no First key in stream dictionary %ld",objectStreamID);
1665 status = PDFHummus::eFailure;
1666 break;
1667 }
1668
1669 objectSource = CreateInputStreamReader(objectStream.GetPtr());
1670 skipperStream.Assign(objectSource);
1671 MovePositionInStream(objectStream->GetStreamContentStart());
1672
1673 mObjectParser.SetReadStream(&skipperStream,&skipperStream);
1674
1675 ObjectIDTypeToObjectStreamHeaderEntryMap::iterator it = mObjectStreamsCache.find(objectStreamID);
1676
1677 if(it == mObjectStreamsCache.end())
1678 {
1679 objectStreamHeader = new ObjectStreamHeaderEntry[objectsCount];
1680 status = ParseObjectStreamHeader(objectStreamHeader,objectsCount);
1681 if(status != PDFHummus::eSuccess)
1682 {
1683 delete[] objectStreamHeader;
1684 break;
1685 }
1686 it = mObjectStreamsCache.insert(ObjectIDTypeToObjectStreamHeaderEntryMap::value_type(objectStreamID,objectStreamHeader)).first;
1687 }
1688 objectStreamHeader = it->second;
1689
1690 // verify that i got the right object ID
1691 if(objectsCount <= mXrefTable[inObjectId].mRivision || objectStreamHeader[mXrefTable[inObjectId].mRivision].mObjectNumber != inObjectId)
1692 {
1693 TRACE_LOG2("PDFParser::ParseXrefFromXrefStream, wrong object. expecting to find object ID %ld, and found %ld",
1694 inObjectId,
1695 objectsCount <= mXrefTable[inObjectId].mRivision ?
1696 -1 :
1697 objectStreamHeader[mXrefTable[inObjectId].mRivision].mObjectNumber);
1698 status = PDFHummus::eFailure;
1699 break;
1700 }
1701
1702 // when parsing the header, should be at position already..so don't skip if already there [using GetCurrentPosition to see if parsed some]
1703 if(mXrefTable[inObjectId].mRivision != 0 || skipperStream.GetCurrentPosition() == 0)
1704 {
1705 LongFilePositionType objectPositionInStream = objectStreamHeader[mXrefTable[inObjectId].mRivision].mObjectOffset +
1706 firstStreamObjectPosition->GetValue();
1707 skipperStream.SkipTo(objectPositionInStream);
1708 mObjectParser.ResetReadState();
1709 }
1710
1711 anObject = mObjectParser.ParseNewObject(mParserExtender);
1712
1713 }while(false);
1714
1715 mObjectParser.SetReadStream(mStream,&mCurrentPositionProvider);
1716
1717 if(PDFHummus::eSuccess == status)
1718 {
1719 return anObject;
1720 }
1721 else
1722 {
1723 if(anObject)
1724 anObject->Release();
1725 return NULL;
1726 }
1727 }
1728
ParseObjectStreamHeader(ObjectStreamHeaderEntry * inHeaderInfo,ObjectIDType inObjectsCount)1729 EStatusCode PDFParser::ParseObjectStreamHeader(ObjectStreamHeaderEntry* inHeaderInfo,ObjectIDType inObjectsCount)
1730 {
1731 ObjectIDType currentObject = 0;
1732 EStatusCode status = PDFHummus::eSuccess;
1733
1734 while(currentObject < inObjectsCount && (PDFHummus::eSuccess == status))
1735 {
1736 PDFObjectCastPtr<PDFInteger> objectNumber(mObjectParser.ParseNewObject(mParserExtender));
1737 if(!objectNumber)
1738 {
1739 TRACE_LOG("PDFParser::ParseObjectStreamHeader, parsing failed when reading object number. either not enough objects, or of the wrong type");
1740 status = PDFHummus::eFailure;
1741 break;
1742 }
1743
1744 PDFObjectCastPtr<PDFInteger> objectPosition(mObjectParser.ParseNewObject(mParserExtender));
1745 if(!objectPosition)
1746 {
1747 TRACE_LOG("PDFParser::ParseObjectStreamHeader, parsing failed when reading object position. either not enough objects, or of the wrong type");
1748 status = PDFHummus::eFailure;
1749 break;
1750 }
1751 inHeaderInfo[currentObject].mObjectNumber = (ObjectIDType)(objectNumber->GetValue());
1752 inHeaderInfo[currentObject].mObjectOffset = objectPosition->GetValue();
1753 ++currentObject;
1754 }
1755 return status;
1756 }
1757
CreateInputStreamReader(PDFStreamInput * inStream)1758 IByteReader* PDFParser::CreateInputStreamReader(PDFStreamInput* inStream)
1759 {
1760 RefCountPtr<PDFDictionary> streamDictionary(inStream->QueryStreamDictionary());
1761 IByteReader* result = NULL;
1762 EStatusCode status = PDFHummus::eSuccess;
1763
1764 do
1765 {
1766
1767 // setup stream according to length and possible filter
1768 PDFObjectCastPtr<PDFInteger> lengthObject(QueryDictionaryObject(streamDictionary.GetPtr(),"Length"));
1769 if(!lengthObject)
1770 {
1771 TRACE_LOG("PDFParser::CreateInputStreamReader, stream does not have length, failing");
1772 status = PDFHummus::eFailure;
1773 break;
1774 }
1775
1776 result = new InputLimitedStream(mStream,lengthObject->GetValue(),false);
1777
1778 // call for parser extender for encryption implementation
1779 if(mParserExtender)
1780 result = mParserExtender->CreateDecryptionFilterForStream(result);
1781
1782 RefCountPtr<PDFObject> filterObject(QueryDictionaryObject(streamDictionary.GetPtr(),"Filter"));
1783 if(!filterObject)
1784 {
1785 // no filter, so stop here
1786 break;
1787 }
1788
1789 if(filterObject->GetType() == PDFObject::ePDFObjectArray)
1790 {
1791 PDFArray* filterObjectArray = (PDFArray*)filterObject.GetPtr();
1792 PDFObjectCastPtr<PDFArray> decodeParams(QueryDictionaryObject(streamDictionary.GetPtr(),"DecodeParms"));
1793 for(unsigned long i=0; i < filterObjectArray->GetLength() && eSuccess == status;++i)
1794 {
1795 PDFObjectCastPtr<PDFName> filterObjectItem(filterObjectArray->QueryObject(i));
1796 if(!filterObjectItem)
1797 {
1798 TRACE_LOG("PDFParser::CreateInputStreamReader, filter item in an array is not a name. should be a name");
1799 status = PDFHummus::eFailure;
1800 break;
1801 }
1802
1803 EStatusCodeAndIByteReader createStatus;
1804 if(!decodeParams)
1805 {
1806 createStatus = CreateFilterForStream(result,filterObjectItem.GetPtr(), NULL);
1807 }
1808 else
1809 {
1810 PDFObjectCastPtr<PDFDictionary> decodeParamsItem(QueryArrayObject(decodeParams.GetPtr(),i));
1811
1812 createStatus = CreateFilterForStream(result,(PDFName*)filterObject.GetPtr(), !decodeParamsItem ? NULL: decodeParamsItem.GetPtr());
1813 }
1814
1815 if(createStatus.first != eSuccess)
1816 {
1817 status = PDFHummus::eFailure;
1818 break;
1819 }
1820 else
1821 result = createStatus.second;
1822 }
1823 }
1824 else if(filterObject->GetType() == PDFObject::ePDFObjectName)
1825 {
1826 PDFObjectCastPtr<PDFDictionary> decodeParams(QueryDictionaryObject(streamDictionary.GetPtr(),"DecodeParms"));
1827
1828 EStatusCodeAndIByteReader createStatus = CreateFilterForStream(result,(PDFName*)filterObject.GetPtr(), !decodeParams ? NULL: decodeParams.GetPtr());
1829 if(createStatus.first != eSuccess)
1830 {
1831 status = PDFHummus::eFailure;
1832 break;
1833 }
1834 else
1835 result = createStatus.second;
1836
1837 }
1838 else
1839 {
1840 TRACE_LOG("PDFParser::CreateInputStreamReader, filter parameter is of unkown type. only array and name are supported.");
1841 status = PDFHummus::eFailure;
1842 break;
1843 }
1844
1845 }while(false);
1846
1847
1848 if(status != PDFHummus::eSuccess)
1849 {
1850 delete result;
1851 result = NULL;
1852 }
1853 return result;
1854 }
1855
CreateFilterForStream(IByteReader * inStream,PDFName * inFilterName,PDFDictionary * inDecodeParams)1856 EStatusCodeAndIByteReader PDFParser::CreateFilterForStream(IByteReader* inStream,PDFName* inFilterName,PDFDictionary* inDecodeParams)
1857 {
1858 EStatusCode status = eSuccess;
1859 IByteReader* result = NULL;
1860
1861 do
1862 {
1863
1864 if(inFilterName->GetValue() == "FlateDecode")
1865 {
1866 InputFlateDecodeStream* flateStream;
1867 flateStream = new InputFlateDecodeStream(NULL); // assigning null, so later delete, if failure occurs won't delete the input stream
1868 result = flateStream;
1869
1870 // check for predictor n' such
1871 if(!inDecodeParams)
1872 {
1873 // no predictor, stop here
1874 flateStream->Assign(inStream);
1875 break;
1876 }
1877
1878 // read predictor, and apply the relevant predictor function
1879 PDFObjectCastPtr<PDFInteger> predictor(QueryDictionaryObject(inDecodeParams,"Predictor"));
1880
1881 if(!predictor || predictor->GetValue() == 1)
1882 {
1883 // no predictor or default, stop here
1884 flateStream->Assign(inStream);
1885 break;
1886 }
1887
1888 PDFObjectCastPtr<PDFInteger> columns(QueryDictionaryObject(inDecodeParams,"Columns"));
1889 LongBufferSizeType columnsValue = columns.GetPtr() ?
1890 (IOBasicTypes::LongBufferSizeType)columns->GetValue() :
1891 1;
1892
1893 switch(predictor->GetValue())
1894 {
1895 case 2:
1896 {
1897 PDFObjectCastPtr<PDFInteger> colors(QueryDictionaryObject(inDecodeParams,"Colors"));
1898 PDFObjectCastPtr<PDFInteger> bitsPerComponent(QueryDictionaryObject(inDecodeParams,"BitsPerComponent"));
1899 result = new InputPredictorTIFFSubStream(result,
1900 colors.GetPtr() ?
1901 (IOBasicTypes::LongBufferSizeType)colors->GetValue() :
1902 1,
1903 bitsPerComponent.GetPtr() ?
1904 (IOBasicTypes::Byte)colors->GetValue() :
1905 8,
1906 columnsValue);
1907 break;
1908 }
1909 case 10:
1910 {
1911 result = new InputPredictorPNGNoneStream(result,columnsValue);
1912 break;
1913 }
1914 case 11:
1915 {
1916 result = new InputPredictorPNGSubStream(result,columnsValue);
1917 break;
1918 }
1919 case 12:
1920 {
1921
1922 result = new InputPredictorPNGUpStream(result,columnsValue);
1923 break;
1924 }
1925 case 13:
1926 {
1927
1928 result = new InputPredictorPNGAverageStream(result,columnsValue);
1929 break;
1930 }
1931 case 14:
1932 {
1933 result = new InputPredictorPNGPaethStream(result,columnsValue);
1934 break;
1935 }
1936 case 15:
1937 {
1938 result = new InputPredictorPNGOptimumStream(result,columnsValue);
1939 break;
1940 }
1941 default:
1942 {
1943 TRACE_LOG("PDFParser::CreateFilterForStream, supporting only predictor of types 1,2,10,11,12,13,14,15, failing");
1944 status = PDFHummus::eFailure;
1945 break;
1946 }
1947 }
1948 flateStream->Assign(inStream);
1949 }
1950 else if(inFilterName->GetValue() == "ASCII85Decode")
1951 {
1952 result = new InputAscii85DecodeStream(inStream);
1953 }
1954 #ifndef PDFHUMMUS_NO_DCT
1955 else if(inFilterName->GetValue() == "DCTDecode")
1956 {
1957 result = new InputDCTDecodeStream(inStream);
1958 }
1959 #endif
1960 else if(mParserExtender)
1961 {
1962 result = mParserExtender->CreateFilterForStream(inStream,inFilterName,inDecodeParams);
1963 if(result == inStream)
1964 {
1965 TRACE_LOG1("PDFParser::CreateFilterForStream, filter is not supported by extender - %s",inFilterName->GetValue().c_str());
1966 status = PDFHummus::eFailure;
1967 break;
1968 }
1969 }
1970 else
1971 {
1972 TRACE_LOG("PDFParser::CreateFilterForStream, supporting only flate decode and ascii 85 decode, failing");
1973 status = PDFHummus::eFailure;
1974 break;
1975 }
1976 }while(false);
1977
1978 if(status != PDFHummus::eSuccess)
1979 {
1980 delete result;
1981 result = NULL;
1982 }
1983 return EStatusCodeAndIByteReader(status,result);
1984
1985 }
1986
StartReadingFromStream(PDFStreamInput * inStream)1987 IByteReader* PDFParser::StartReadingFromStream(PDFStreamInput* inStream)
1988 {
1989 IByteReader* result = CreateInputStreamReader(inStream);
1990 if(result)
1991 MovePositionInStream(inStream->GetStreamContentStart());
1992 return result;
1993 }
1994
StartStateFileParsing(IByteReaderWithPosition * inSourceStream)1995 EStatusCode PDFParser::StartStateFileParsing(IByteReaderWithPosition* inSourceStream)
1996 {
1997 EStatusCode status;
1998
1999 ResetParser();
2000
2001 mStream = inSourceStream;
2002 mCurrentPositionProvider.Assign(mStream);
2003 mObjectParser.SetReadStream(inSourceStream,&mCurrentPositionProvider);
2004
2005 do
2006 {
2007 // initialize reading from end
2008 mLastReadPositionFromEnd = 0;
2009 mEncounteredFileStart = false;
2010 mLastAvailableIndex = mCurrentBufferIndex = mLinesBuffer;
2011
2012 status = ParseEOFLine();
2013 if(status != PDFHummus::eSuccess)
2014 break;
2015
2016 status = ParseLastXrefPosition();
2017 if(status != PDFHummus::eSuccess)
2018 break;
2019
2020 status = ParseFileDirectory(); // that would be the xref and trailer
2021 if(status != PDFHummus::eSuccess)
2022 break;
2023
2024 }while(false);
2025
2026 return status;
2027 }
2028
IsEncrypted()2029 bool PDFParser::IsEncrypted()
2030 {
2031 PDFObjectCastPtr<PDFDictionary> encryptionDictionary(QueryDictionaryObject(mTrailer.GetPtr(),"Encrypt"));
2032 return encryptionDictionary.GetPtr() != NULL ;
2033 }
2034
SetParserExtender(IPDFParserExtender * inParserExtender)2035 void PDFParser::SetParserExtender(IPDFParserExtender* inParserExtender)
2036 {
2037 mParserExtender = inParserExtender;
2038 }
2039
IsEncryptionSupported()2040 bool PDFParser::IsEncryptionSupported()
2041 {
2042 return mParserExtender && mParserExtender->DoesSupportEncryption();
2043 }
2044
GetXrefSize()2045 ObjectIDType PDFParser::GetXrefSize()
2046 {
2047 return mXrefSize;
2048 }
2049
GetXrefEntry(ObjectIDType inObjectID)2050 XrefEntryInput* PDFParser::GetXrefEntry(ObjectIDType inObjectID)
2051 {
2052 return (inObjectID < mXrefSize) ? mXrefTable+inObjectID : NULL;
2053 }
2054
GetXrefPosition()2055 LongFilePositionType PDFParser::GetXrefPosition()
2056 {
2057 return mLastXrefPosition;
2058 }
2059
GetParserStream()2060 IByteReaderWithPosition* PDFParser::GetParserStream()
2061 {
2062 return mStream;
2063 }
2064