1 /*
2    Source File : PDFParser.cpp
3 
4 
5    Copyright 2011 Gal Kahana PDFWriter
6 
7    Licensed under the Apache License, Version 2.0 (the "License");
8    you may not use this file except in compliance with the License.
9    You may obtain a copy of the License at
10 
11        http://www.apache.org/licenses/LICENSE-2.0
12 
13    Unless required by applicable law or agreed to in writing, software
14    distributed under the License is distributed on an "AS IS" BASIS,
15    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16    See the License for the specific language governing permissions and
17    limitations under the License.
18 
19 
20 */
21 #include "PDFParser.h"
22 #include "IByteReaderWithPosition.h"
23 #include "PDFParserTokenizer.h"
24 #include "Trace.h"
25 #include "PDFInteger.h"
26 #include "PDFObject.h"
27 #include "PDFSymbol.h"
28 #include "BoxingBase.h"
29 #include "PDFDictionary.h"
30 #include "BoxingBase.h"
31 #include "PDFIndirectObjectReference.h"
32 #include "PDFName.h"
33 #include "PDFArray.h"
34 #include "RefCountPtr.h"
35 #include "PDFObjectCast.h"
36 #include "PDFStreamInput.h"
37 #include "InputLimitedStream.h"
38 #include "InputFlateDecodeStream.h"
39 #include "InputStreamSkipperStream.h"
40 #include "InputPredictorPNGUpStream.h"
41 #include "InputPredictorPNGNoneStream.h"
42 #include "InputPredictorPNGSubStream.h"
43 #include "InputPredictorPNGAverageStream.h"
44 #include "InputPredictorPNGPaethStream.h"
45 #include "InputPredictorPNGOptimumStream.h"
46 #include "InputPredictorTIFFSubStream.h"
47 #include "InputAscii85DecodeStream.h"
48 #include "IPDFParserExtender.h"
49 #include "InputDCTDecodeStream.h"
50 
51 #include  <algorithm>
52 using namespace PDFHummus;
53 
PDFParser(void)54 PDFParser::PDFParser(void)
55 {
56 	mStream = NULL;
57 	mTrailer = NULL;
58 	mXrefTable = NULL;
59 	mPagesObjectIDs = NULL;
60 	mParserExtender = NULL;
61     mAllowExtendingSegments = true; // Gal 19.9.2013: here's some policy changer. basically i'm supposed to ignore all segments that declare objects past the trailer
62                                     // declared size. but i would like to allow files that do extend. as this is incompatible with the specs, i'll make
63                                     // this boolean dendent. i will sometimes make it public so ppl can actually modify this policy. for now, it's internal
64 }
65 
~PDFParser(void)66 PDFParser::~PDFParser(void)
67 {
68 	ResetParser();
69 }
70 
ResetParser()71 void PDFParser::ResetParser()
72 {
73 	mTrailer = NULL;
74 	delete[] mXrefTable;
75 	mXrefTable = NULL;
76 	delete[] mPagesObjectIDs;
77 	mPagesObjectIDs = NULL;
78 	mStream = NULL;
79 	mCurrentPositionProvider.Assign(NULL);
80 
81 	ObjectIDTypeToObjectStreamHeaderEntryMap::iterator it = mObjectStreamsCache.begin();
82 	for(; it != mObjectStreamsCache.end();++it)
83 		delete[] it->second;
84 	mObjectStreamsCache.clear();
85 
86 }
87 
StartPDFParsing(IByteReaderWithPosition * inSourceStream)88 EStatusCode PDFParser::StartPDFParsing(IByteReaderWithPosition* inSourceStream)
89 {
90 	EStatusCode status;
91 
92 	ResetParser();
93 
94 	mStream = inSourceStream;
95 	mCurrentPositionProvider.Assign(mStream);
96 	mObjectParser.SetReadStream(inSourceStream,&mCurrentPositionProvider);
97 
98 	do
99 	{
100 		status = ParseHeaderLine();
101 		if(status != PDFHummus::eSuccess)
102 			break;
103 
104 		// initialize reading from end
105 		mLastReadPositionFromEnd = 0;
106 		mEncounteredFileStart = false;
107 		mLastAvailableIndex = mCurrentBufferIndex = mLinesBuffer;
108 
109 		status = ParseEOFLine();
110 		if(status != PDFHummus::eSuccess)
111 			break;
112 
113 		status = ParseLastXrefPosition();
114 		if(status != PDFHummus::eSuccess)
115 			break;
116 
117 		status = ParseFileDirectory(); // that would be the xref and trailer
118 		if(status != PDFHummus::eSuccess)
119 			break;
120 
121 		if(IsEncrypted())
122 		{
123 			// not parsing pages for encrypted docs.
124 			// not commiting..and there's a practical reason.
125 			// lower level objects will be in object streams (for those PDFs that have them)
126 			// and the may not be accessed
127 			mPagesCount = 0;
128 			mPagesObjectIDs = NULL;
129 		}
130 		else
131 		{
132 			status = ParsePagesObjectIDs();
133 			if(status != PDFHummus::eSuccess)
134 				break;
135 		}
136 	}while(false);
137 
138 	return status;
139 }
140 
GetObjectParser()141 PDFObjectParser& PDFParser::GetObjectParser()
142 {
143 	return mObjectParser;
144 }
145 
146 static const std::string scPDFMagic = "%PDF-";
ParseHeaderLine()147 EStatusCode PDFParser::ParseHeaderLine()
148 {
149 	PDFParserTokenizer tokenizer;
150 
151 	tokenizer.SetReadStream(mStream);
152 	BoolAndString tokenizerResult = tokenizer.GetNextToken();
153 
154 	if(!tokenizerResult.first)
155 	{
156 		TRACE_LOG("PDFParser::ParseHeaderLine, no tokens in PDF input. in other words - it's empty.");
157 		return PDFHummus::eFailure;
158 	}
159 
160 	if(tokenizerResult.second.compare(0,scPDFMagic.size(),scPDFMagic) != 0)
161 	{
162 		TRACE_LOG1("PDFParser::ParseHeaderLine, file does not begin as a PDF file. a PDF file should start with \"%PDF-\". file header = %s",tokenizerResult.second.c_str());
163 		return PDFHummus::eFailure;
164 	}
165 
166 	mPDFLevel = Double(tokenizerResult.second.substr(scPDFMagic.size()));
167 	return PDFHummus::eSuccess;
168 }
169 
170 static const std::string scEOF = "%%EOF";
ParseEOFLine()171 EStatusCode PDFParser::ParseEOFLine()
172 {
173 	/* go back till you hit token. this should be the EOF. go back till line start and get the token...if it's not EOF, fail.
174 	   since EOF is a comment, then if there's anything else in that line it will either be before %%EOF, which means %%EOF won't be taken, or after -
175 	   in which case it'd be part of the comment. in any case - if it's not exactly EOF, there will be a failure. but i am allowing
176 	   extra empty lines after %%EOF
177 	*/
178 	if(GoBackTillToken())
179 	{
180 		GoBackTillLineStart();
181 		mStream->SetPositionFromEnd(GetCurrentPositionFromEnd());
182 
183 		PDFParserTokenizer aTokenizer;
184 		aTokenizer.SetReadStream(mStream);
185 		BoolAndString token = aTokenizer.GetNextToken();
186 
187 		if(token.first && (token.second.substr(0,scEOF.length()) == scEOF))
188 		{
189 			return PDFHummus::eSuccess;
190 		}
191 		else
192 		{
193 			TRACE_LOG("PDFParser::ParseEOFLine, failure, last line not %%EOF");
194 			return PDFHummus::eFailure;
195 		}
196 	}
197 	else
198 	{
199 		TRACE_LOG("PDFParser::ParseEOFLine, Couldn't find tokens in file");
200 		return PDFHummus::eFailure;
201 	}
202 }
203 
GetCurrentPositionFromEnd()204 LongBufferSizeType PDFParser::GetCurrentPositionFromEnd()
205 {
206 	return mLastReadPositionFromEnd-(mCurrentBufferIndex-mLinesBuffer);
207 }
208 
GoBackTillToken()209 bool PDFParser::GoBackTillToken()
210 {
211 	Byte buffer;
212 	bool foundToken = false;
213 
214 	while(ReadBack(buffer))
215 	{
216 		if(!IsPDFWhiteSpace(buffer))
217 		{
218 			foundToken = true;
219 			break;
220 		}
221 	}
222 	return foundToken;
223 }
224 
GoBackTillNonToken()225 bool PDFParser::GoBackTillNonToken()
226 {
227 	Byte buffer;
228 	bool foundNonToken = false;
229 
230 	while(ReadBack(buffer))
231 	{
232 		if(IsPDFWhiteSpace(buffer))
233 		{
234 			foundNonToken = true;
235 			break;
236 		}
237 	}
238 	return foundNonToken;
239 }
240 
241 static const Byte scWhiteSpaces[] = {0,0x9,0xA,0xC,0xD,0x20};
IsPDFWhiteSpace(Byte inCharacter)242 bool PDFParser::IsPDFWhiteSpace(Byte inCharacter)
243 {
244 	bool isWhiteSpace = false;
245 	for(int i=0; i < 6 && !isWhiteSpace; ++i)
246 		isWhiteSpace =  (scWhiteSpaces[i] == inCharacter);
247 	return isWhiteSpace;
248 }
249 
250 
251 static const char scCR = '\r';
252 static const char scLN = '\n';
GoBackTillLineStart()253 void PDFParser::GoBackTillLineStart()
254 {
255 	Byte buffer;
256 
257 	while(ReadBack(buffer))
258 	{
259 		if(scLN == buffer || scCR == buffer)
260 			break;
261 	}
262 }
263 
ReadBack(Byte & outValue)264 bool PDFParser::ReadBack(Byte& outValue)
265 {
266 	if(IsBeginOfFile())
267 		return false;
268 
269 	if(mCurrentBufferIndex > mLinesBuffer)
270 	{
271 		--mCurrentBufferIndex;
272 		outValue = *mCurrentBufferIndex;
273 		return true;
274 	}
275 	else
276 	{
277 		ReadNextBufferFromEnd(); // must be able to read...but could be 0 bytes
278 		if(mCurrentBufferIndex > mLinesBuffer)
279 		{
280 			--mCurrentBufferIndex;
281 			outValue = *mCurrentBufferIndex;
282 			return true;
283 		}
284 		else
285 			return false;
286 	}
287 }
288 
ReadNextBufferFromEnd()289 bool PDFParser::ReadNextBufferFromEnd()
290 {
291 	if(mEncounteredFileStart)
292 	{
293 		return false;
294 	}
295 	else
296 	{
297 		mStream->SetPositionFromEnd(mLastReadPositionFromEnd + LINE_BUFFER_SIZE);
298 		LongBufferSizeType readAmount = mStream->Read(mLinesBuffer,LINE_BUFFER_SIZE);
299 		if(0 == readAmount)
300 			return false;
301 		mLastAvailableIndex = mLinesBuffer + readAmount;
302 		mCurrentBufferIndex = mLastAvailableIndex;
303 		mLastReadPositionFromEnd+= readAmount;
304 		mEncounteredFileStart = readAmount < LINE_BUFFER_SIZE;
305 		return true;
306 	}
307 }
308 
IsBeginOfFile()309 bool PDFParser::IsBeginOfFile()
310 {
311 	return mEncounteredFileStart && (mCurrentBufferIndex == mLinesBuffer);
312 }
313 
314 static const std::string scStartxref = "startxref";
ParseLastXrefPosition()315 EStatusCode PDFParser::ParseLastXrefPosition()
316 {
317 	EStatusCode status = PDFHummus::eSuccess;
318 
319 	// next two lines should be the xref position and then "startxref"
320 
321 	do
322 	{
323 
324 		// find and read xref position
325 		if(!GoBackTillToken())
326 		{
327 			status = PDFHummus::eFailure;
328 			TRACE_LOG("PDFParser::ParseXrefPosition, couldn't find xref position token");
329 			break;
330 		}
331 
332 		GoBackTillLineStart();
333 
334 		// now go forward, and here i'm guessing a bit, till you get to either and integer, or the startxref keyword
335 		mStream->SetPositionFromEnd(GetCurrentPositionFromEnd());
336 
337 		mObjectParser.ResetReadState();
338 		RefCountPtr<PDFObject> anObject(mObjectParser.ParseNewObject(mParserExtender));
339 
340 		if(anObject->GetType() == PDFObject::ePDFObjectInteger)
341 		{
342 			mLastXrefPosition = (LongFilePositionType)((PDFInteger*)anObject.GetPtr())->GetValue();
343 
344 			// find and read startxref keyword
345 			if(!GoBackTillToken())
346 			{
347 				status = PDFHummus::eFailure;
348 				TRACE_LOG("PDFParser::ParseXrefPosition, couldn't find startxref keyword");
349 				break;
350 			}
351 
352 			GoBackTillLineStart();
353 			mStream->SetPositionFromEnd(GetCurrentPositionFromEnd());
354 
355 			mObjectParser.ResetReadState();
356 			PDFObjectCastPtr<PDFSymbol> startxRef(mObjectParser.ParseNewObject(mParserExtender));
357 
358 			if(!startxRef || startxRef->GetValue() != scStartxref)
359 			{
360 				status = PDFHummus::eFailure;
361 				TRACE_LOG("PDFParser::ParseXrefPosition, syntax error in reading xref position");
362 				break;
363 			}
364 		}
365 		else // this means that the line is not only integer, a bit more complicated path, look for startxref and then the next would be the number
366 		{
367 			bool foundStartXref = (anObject->GetType() == PDFObject::ePDFObjectSymbol) && (((PDFSymbol*)anObject.GetPtr())->GetValue() == scStartxref);
368 
369 			while(!foundStartXref && mStream->NotEnded())
370 			{
371 				PDFObjectCastPtr<PDFSymbol> startxRef(mObjectParser.ParseNewObject(mParserExtender));
372 				foundStartXref = startxRef.GetPtr() && (startxRef->GetValue() == scStartxref);
373 			}
374 
375 			if(!foundStartXref)
376 			{
377 				status = PDFHummus::eFailure;
378 				TRACE_LOG("PDFParser::ParseXrefPosition, could not find startxref keyword");
379 				break;
380 			}
381 
382 			PDFObjectCastPtr<PDFInteger> xrefPosition(mObjectParser.ParseNewObject(mParserExtender));
383 			if(!xrefPosition)
384 			{
385 				status = PDFHummus::eFailure;
386 				TRACE_LOG("PDFParser::ParseXrefPosition, syntax error in reading xref position");
387 				break;
388 			}
389 
390 			mLastXrefPosition = xrefPosition->GetValue();
391 		}
392 
393 	}while(false);
394 
395 	return status;
396 
397 }
398 
399 static const std::string scTrailer = "trailer";
ParseTrailerDictionary()400 EStatusCode PDFParser::ParseTrailerDictionary()
401 {
402 
403 	EStatusCode status = PDFHummus::eSuccess;
404 	bool foundTrailer = false;
405 
406 	do
407 	{
408 		PDFParserTokenizer aTokenizer;
409 		aTokenizer.SetReadStream(mStream);
410 
411 		do
412 		{
413 			BoolAndString token = aTokenizer.GetNextToken();
414 			if(!token.first)
415 				break;
416 			foundTrailer = (scTrailer == token.second);
417 		}while(!foundTrailer);
418 
419 
420 		if(!foundTrailer)
421 		{
422 			status = PDFHummus::eFailure;
423 			TRACE_LOG("PDFParser::ParseTrailerDictionary, trailer not found...");
424 			break;
425 		}
426 
427 		// k. now that all is well, just parse the damn dictionary, which is actually...the easiest part.
428 		mObjectParser.ResetReadState();
429 		PDFObjectCastPtr<PDFDictionary> dictionaryObject(mObjectParser.ParseNewObject(mParserExtender));
430 		if(!dictionaryObject)
431 		{
432 			status = PDFHummus::eFailure;
433 			TRACE_LOG("PDFParser::ParseTrailerDictionary, failure to parse trailer dictionary");
434 			break;
435 		}
436 
437 		mTrailer = dictionaryObject;
438 	}while(false);
439 
440 	return status;
441 }
442 
BuildXrefTableFromTable()443 EStatusCode PDFParser::BuildXrefTableFromTable()
444 {
445 	EStatusCode status;
446 
447 	do
448 	{
449 		status = DetermineXrefSize();
450 		if(status != PDFHummus::eSuccess)
451 			break;
452 
453 		status = InitializeXref();
454 		if(status != PDFHummus::eSuccess)
455 			break;
456 
457 		if(mTrailer->Exists("Prev"))
458 		{
459 			status = ParsePreviousXrefs(mTrailer.GetPtr());
460 			if(status != PDFHummus::eSuccess)
461 				break;
462 		}
463 
464         XrefEntryInput* extendedTable = NULL;
465         ObjectIDType extendedTableSize;
466 		status = ParseXrefFromXrefTable(mXrefTable,mXrefSize,mLastXrefPosition,&extendedTable,&extendedTableSize);
467 		if(status != PDFHummus::eSuccess)
468 			break;
469 
470         // Table may have been extended, in which case replace the pointer and current size
471         if(extendedTable)
472         {
473             mXrefSize = extendedTableSize;
474             delete[] mXrefTable;
475             mXrefTable = extendedTable;
476         }
477 
478 		// For hybrids, check also XRefStm entry
479 		PDFObjectCastPtr<PDFInteger> xrefStmReference(mTrailer->QueryDirectObject("XRefStm"));
480 		if(!xrefStmReference)
481 			break;
482 		// if exists, merge update xref
483 		status = ParseXrefFromXrefStream(mXrefTable,mXrefSize,xrefStmReference->GetValue(),&extendedTable,&extendedTableSize);
484 		if(status != PDFHummus::eSuccess)
485 		{
486 			TRACE_LOG("PDFParser::ParseDirectory, failure to parse xref in hybrid mode");
487 			break;
488 		}
489         if(extendedTable)
490         {
491             mXrefSize = extendedTableSize;
492             delete[] mXrefTable;
493             mXrefTable = extendedTable;
494         }
495 	}while(false);
496 
497 	return status;
498 }
499 
DetermineXrefSize()500 EStatusCode PDFParser::DetermineXrefSize()
501 {
502 	PDFObjectCastPtr<PDFInteger> aSize(mTrailer->QueryDirectObject("Size"));
503 
504 	if(!aSize)
505 	{
506 		return PDFHummus::eFailure;
507 	}
508 	else
509 	{
510 		mXrefSize = (ObjectIDType)aSize->GetValue();
511 		return PDFHummus::eSuccess;
512 	}
513 }
514 
InitializeXref()515 EStatusCode PDFParser::InitializeXref()
516 {
517 	mXrefTable = new XrefEntryInput[mXrefSize];
518 	return PDFHummus::eSuccess;
519 }
520 
521 typedef BoxingBaseWithRW<ObjectIDType> ObjectIDTypeBox;
522 typedef BoxingBaseWithRW<unsigned long> ULong;
523 typedef BoxingBaseWithRW<LongFilePositionType> LongFilePositionTypeBox;
524 
525 static const std::string scXref = "xref";
ParseXrefFromXrefTable(XrefEntryInput * inXrefTable,ObjectIDType inXrefSize,LongFilePositionType inXrefPosition,XrefEntryInput ** outExtendedTable,ObjectIDType * outExtendedTableSize)526 EStatusCode PDFParser::ParseXrefFromXrefTable(XrefEntryInput* inXrefTable,
527                                               ObjectIDType inXrefSize,
528                                               LongFilePositionType inXrefPosition,
529                                               XrefEntryInput** outExtendedTable,
530                                               ObjectIDType* outExtendedTableSize)
531 {
532 	// K. cross ref starts at  xref position
533 	// and ends with trailer (or when exahausted the number of objects...whichever first)
534 	// i'm gonna tokanize them, for easier reading
535 	PDFParserTokenizer tokenizer;
536 	BoolAndString token;
537 	EStatusCode status = PDFHummus::eSuccess;
538 	ObjectIDType firstNonSectionObject;
539 	Byte entry[20];
540 
541     *outExtendedTable = NULL;
542 
543 	tokenizer.SetReadStream(mStream);
544 	MovePositionInStream(inXrefPosition);
545 
546 	// Note that at times, the xref is being read "on empty". meaning - entries will be read but they will not affect the actual xref.
547 	// This is done because final xref might be smaller than the prev xrefs, and i'm only interested in objects that are in the final xref.
548 	// That said - i still want to be in the position of the trailer after this function is being executed.
549 
550 	do
551 	{
552 		// first token must be "xref", so just verify
553 		token = tokenizer.GetNextToken();
554 		if(!token.first || token.second != scXref)
555 		{
556 			TRACE_LOG1("PDFParser::ParseXref, error in parsing xref, expected to find \"xref\" keyword, found = %s",token.second.c_str());
557 			status = PDFHummus::eFailure;
558 			break;
559 		}
560 
561 		ObjectIDType currentObject = 0;
562 
563 		while(PDFHummus::eSuccess == status)
564 		{
565 			token = tokenizer.GetNextToken();
566 			if(!token.first)
567 			{
568 				TRACE_LOG("PDFParser::ParseXref, failed to read tokens, while reading xref");
569 				status = PDFHummus::eFailure;
570 				break;
571 			}
572 
573 			// token may be either start of section or "trailer"
574 			if(scTrailer == token.second)
575 				break;
576 
577 			currentObject = ObjectIDTypeBox(token.second);
578 			token = tokenizer.GetNextToken();
579 			if(!token.first)
580 			{
581 				TRACE_LOG("PDFParser::ParseXref, unable to read section size, while reading xref");
582 				status = PDFHummus::eFailure;
583 				break;
584 			}
585 			if(ObjectIDTypeBox(token.second) == 0)
586 				continue; // probably will never happen
587 			firstNonSectionObject = currentObject + ObjectIDTypeBox(token.second);
588 
589             // if the segment declared objects above the xref size, consult policy on what to do
590             if(firstNonSectionObject > inXrefSize && mAllowExtendingSegments)
591             {
592                 inXrefTable = ExtendXrefTableToSize(inXrefTable,inXrefSize,firstNonSectionObject);
593                 inXrefSize = firstNonSectionObject;
594                 if(*outExtendedTable)
595                     delete[] *outExtendedTable;
596                 *outExtendedTable = inXrefTable;
597                 *outExtendedTableSize = firstNonSectionObject;
598             }
599 
600 
601 			// first row...not sure where starts...so skip till passing all endlines
602 			do
603 			{
604 				if(mStream->Read(entry,1) != 1)
605 				{
606 					TRACE_LOG("PDFParser::ParseXref, failed to read xref entry");
607 					status = PDFHummus::eFailure;
608 					break;
609 				}
610 			}while(IsPDFWhiteSpace(entry[0]));
611 
612 			// now read extra 19
613 			if(mStream->Read(entry+1,19) != 19)
614 			{
615 				TRACE_LOG("PDFParser::ParseXref, failed to read xref entry");
616 				status = PDFHummus::eFailure;
617 				break;
618 			}
619 			if(currentObject < inXrefSize)
620 			{
621 				inXrefTable[currentObject].mObjectPosition = LongFilePositionTypeBox((const char*)entry);
622 				inXrefTable[currentObject].mRivision = ULong((const char*)(entry+11));
623 				inXrefTable[currentObject].mType = entry[17] == 'n' ? eXrefEntryExisting:eXrefEntryDelete;
624 			}
625 			++currentObject;
626 
627 
628 
629 			// now parse the section.
630 			while(currentObject < firstNonSectionObject)
631 			{
632 				if(mStream->Read(entry,20) != 20)
633 				{
634 					TRACE_LOG("PDFParser::ParseXref, failed to read xref entry");
635 					status = PDFHummus::eFailure;
636 					break;
637 				}
638 				if(currentObject < inXrefSize)
639 				{
640 					inXrefTable[currentObject].mObjectPosition = LongFilePositionTypeBox((const char*)entry);
641 					inXrefTable[currentObject].mRivision = ULong((const char*)(entry+11));
642 					inXrefTable[currentObject].mType = entry[17] == 'n' ? eXrefEntryExisting:eXrefEntryDelete;
643 				}
644 				++currentObject;
645 			}
646 		}
647 		if(status != PDFHummus::eSuccess)
648 			break;
649 
650 	}while(false);
651 	mObjectParser.ResetReadState(); // cause read without consulting with the tokenizer...so now there shouldn't be available tokens
652 
653 	return status;
654 }
655 
ExtendXrefTableToSize(XrefEntryInput * inXrefTable,ObjectIDType inOldSize,ObjectIDType inNewSize)656 XrefEntryInput* PDFParser::ExtendXrefTableToSize(XrefEntryInput* inXrefTable,ObjectIDType inOldSize,ObjectIDType inNewSize)
657 {
658     XrefEntryInput* newTable = new XrefEntryInput[inNewSize];
659 
660 	for(ObjectIDType i = 0; i < inOldSize; ++i)
661         newTable[i] =	inXrefTable[i];
662     return newTable;
663 }
664 
GetTrailer()665 PDFDictionary* PDFParser::GetTrailer()
666 {
667 	return mTrailer.GetPtr();
668 }
669 
GetPDFLevel()670 double PDFParser::GetPDFLevel()
671 {
672 	return mPDFLevel;
673 }
674 
ParseNewObject(ObjectIDType inObjectId)675 PDFObject* PDFParser::ParseNewObject(ObjectIDType inObjectId)
676 {
677 	if(inObjectId >= mXrefSize)
678 	{
679 		return NULL;
680 	}
681 	else if(eXrefEntryExisting == mXrefTable[inObjectId].mType)
682 	{
683 		return ParseExistingInDirectObject(inObjectId);
684 	}
685 	else if(eXrefEntryStreamObject == mXrefTable[inObjectId].mType)
686 	{
687 		return ParseExistingInDirectStreamObject(inObjectId);
688 	}
689 	else
690 		return NULL;
691 }
692 
GetObjectsCount()693 ObjectIDType PDFParser::GetObjectsCount()
694 {
695 	return mXrefSize;
696 }
697 
698 static const std::string scObj = "obj";
ParseExistingInDirectObject(ObjectIDType inObjectID)699 PDFObject* PDFParser::ParseExistingInDirectObject(ObjectIDType inObjectID)
700 {
701 	PDFObject* readObject = NULL;
702 
703 	MovePositionInStream(mXrefTable[inObjectID].mObjectPosition);
704 
705 	do
706 	{
707 		// should get us to the ObjectNumber ObjectVersion obj section
708 		// verify that it's good and if so continue to parse the object itself
709 
710 		// verify object ID
711 		PDFObjectCastPtr<PDFInteger> idObject(mObjectParser.ParseNewObject(mParserExtender));
712 
713 		if(!idObject)
714 		{
715 			TRACE_LOG("PDFParser::ParseExistingInDirectObject, failed to read object declaration, ID");
716 			break;
717 		}
718 
719 		if((ObjectIDType)idObject->GetValue() != inObjectID)
720 		{
721 			TRACE_LOG2("PDFParser::ParseExistingInDirectObject, failed to read object declaration, exepected ID = %ld, found %ld",
722 				inObjectID,idObject->GetValue());
723 			break;
724 		}
725 
726 		// verify object Version
727 		PDFObjectCastPtr<PDFInteger> versionObject(mObjectParser.ParseNewObject(mParserExtender));
728 
729 		if(!versionObject)
730 		{
731 			TRACE_LOG("PDFParser::ParseExistingInDirectObject, failed to read object declaration, Version");
732 			break;
733 		}
734 
735 		if(mParserExtender)
736 			mParserExtender->OnObjectStart(inObjectID,versionObject->GetValue());
737 
738 		if((unsigned long)versionObject->GetValue() != mXrefTable[inObjectID].mRivision)
739 		{
740 			TRACE_LOG2("PDFParser::ParseExistingInDirectObject, failed to read object declaration, exepected version = %ld, found %ld",
741 				mXrefTable[inObjectID].mRivision,versionObject->GetValue());
742 			break;
743 		}
744 
745 		// now the obj keyword
746 		PDFObjectCastPtr<PDFSymbol> objKeyword(mObjectParser.ParseNewObject(mParserExtender));
747 
748 		if(!objKeyword)
749 		{
750 			TRACE_LOG("PDFParser::ParseExistingInDirectObject, failed to read object declaration, obj keyword");
751 			break;
752 		}
753 
754 		if(objKeyword->GetValue() != scObj)
755 		{
756 			TRACE_LOG1("PDFParser::ParseExistingInDirectObject, failed to read object declaration, expected obj keyword found %s",
757 				objKeyword->GetValue().c_str());
758 			break;
759 		}
760 
761 		readObject = mObjectParser.ParseNewObject(mParserExtender);
762 
763 		if(mParserExtender)
764 			mParserExtender->OnObjectEnd(readObject);
765 	}while(false);
766 
767 	return readObject;
768 }
769 
ParsePagesObjectIDs()770 EStatusCode PDFParser::ParsePagesObjectIDs()
771 {
772 	EStatusCode status = PDFHummus::eSuccess;
773 
774 	// m.k plan is to look for the catalog, then find the pages, then initialize the array to the count at the root, and then just recursively loop
775 	// the pages by order of pages and fill up the IDs. easy.
776 
777 	do
778 	{
779 		// get catalogue, verify indirect reference
780 		PDFObjectCastPtr<PDFIndirectObjectReference> catalogReference(mTrailer->QueryDirectObject("Root"));
781 		if(!catalogReference)
782 		{
783 			TRACE_LOG("PDFParser::ParsePagesObjectIDs, failed to read catalog reference in trailer");
784 			status = PDFHummus::eFailure;
785 			break;
786 		}
787 
788 		PDFObjectCastPtr<PDFDictionary> catalog(ParseNewObject(catalogReference->mObjectID));
789 		if(!catalog)
790 		{
791 			TRACE_LOG("PDFParser::ParsePagesObjectIDs, failed to read catalog");
792 			status = PDFHummus::eFailure;
793 			break;
794 		}
795 
796 		// get pages, verify indirect reference
797 		PDFObjectCastPtr<PDFIndirectObjectReference> pagesReference(catalog->QueryDirectObject("Pages"));
798 		if(!pagesReference)
799 		{
800 			TRACE_LOG("PDFParser::ParsePagesObjectIDs, failed to read pages reference in catalog");
801 			status = PDFHummus::eFailure;
802 			break;
803 		}
804 
805 		PDFObjectCastPtr<PDFDictionary> pages(ParseNewObject(pagesReference->mObjectID));
806 		if(!pages)
807 		{
808 			TRACE_LOG("PDFParser::ParsePagesObjectIDs, failed to read pages");
809 			status = PDFHummus::eFailure;
810 			break;
811 		}
812 
813 		PDFObjectCastPtr<PDFInteger> totalPagesCount(QueryDictionaryObject(pages.GetPtr(),"Count"));
814 		if(!totalPagesCount)
815 		{
816 			TRACE_LOG("PDFParser::ParsePagesObjectIDs, failed to read pages count");
817 			status = PDFHummus::eFailure;
818 			break;
819 		}
820 
821 		mPagesCount = (unsigned long)totalPagesCount->GetValue();
822 		mPagesObjectIDs = new ObjectIDType[mPagesCount];
823 
824 		// now iterate through pages objects, and fill up the IDs [don't really need the object ID for the root pages tree...but whatever
825 		status = ParsePagesIDs(pages.GetPtr(),pagesReference->mObjectID);
826 
827 	}while(false);
828 
829 	return status;
830 }
831 
ParsePagesIDs(PDFDictionary * inPageNode,ObjectIDType inNodeObjectID)832 EStatusCode PDFParser::ParsePagesIDs(PDFDictionary* inPageNode,ObjectIDType inNodeObjectID)
833 {
834 	unsigned long currentPageIndex = 0;
835 
836 	return ParsePagesIDs(inPageNode,inNodeObjectID,currentPageIndex);
837 }
838 
839 static const std::string scPage = "Page";
840 static const std::string scPages = "Pages";
ParsePagesIDs(PDFDictionary * inPageNode,ObjectIDType inNodeObjectID,unsigned long & ioCurrentPageIndex)841 EStatusCode PDFParser::ParsePagesIDs(PDFDictionary* inPageNode,ObjectIDType inNodeObjectID,unsigned long& ioCurrentPageIndex)
842 {
843 	// recursion.
844 	// if this is a page, write it's node object ID in the current page index and +1
845 	// if this is a pagetree, loop it's kids, for each parsing the kid, running the recursion on it, and deleting
846 
847 	EStatusCode status = PDFHummus::eSuccess;
848 
849 	do
850 	{
851 		PDFObjectCastPtr<PDFName> objectType(inPageNode->QueryDirectObject("Type"));
852 		if(!objectType)
853 		{
854 			TRACE_LOG("PDFParser::ParsePagesIDs, can't read object type");
855 			status = PDFHummus::eFailure;
856 			break;
857 		}
858 
859 		if(scPage == objectType->GetValue())
860 		{
861 			// a Page
862 			if(ioCurrentPageIndex >= mPagesCount)
863 			{
864 				TRACE_LOG("PDFParser::ParsePagesIDs, there are more pages than the page count specifies. fail.");
865 				status = PDFHummus::eFailure;
866 				break;
867 			}
868 
869 			mPagesObjectIDs[ioCurrentPageIndex] = inNodeObjectID;
870 			++ioCurrentPageIndex;
871 		}
872 		else if(scPages == objectType->GetValue())
873 		{
874 			// a Page tree node
875 			PDFObjectCastPtr<PDFArray> kidsObject(inPageNode->QueryDirectObject("Kids"));
876 			if(!kidsObject)
877 			{
878 				TRACE_LOG("PDFParser::ParsePagesIDs, unable to find page kids array");
879 				status = PDFHummus::eFailure;
880 				break;
881 			}
882 
883 			SingleValueContainerIterator<PDFObjectVector> it = kidsObject->GetIterator();
884 
885 			while(it.MoveNext() && PDFHummus::eSuccess == status)
886 			{
887 				if(it.GetItem()->GetType() != PDFObject::ePDFObjectIndirectObjectReference)
888 				{
889 					TRACE_LOG1("PDFParser::ParsePagesIDs, unexpected type for a Kids array object, type = %s",PDFObject::scPDFObjectTypeLabel[it.GetItem()->GetType()]);
890 					status = PDFHummus::eFailure;
891 					break;
892 				}
893 
894 				PDFObjectCastPtr<PDFDictionary> pageNodeObject(ParseNewObject(((PDFIndirectObjectReference*)it.GetItem())->mObjectID));
895 				if(!pageNodeObject)
896 				{
897 					TRACE_LOG("PDFParser::ParsePagesIDs, unable to parse page node object from kids reference");
898 					status = PDFHummus::eFailure;
899 					break;
900 				}
901 
902 				status = ParsePagesIDs(pageNodeObject.GetPtr(),((PDFIndirectObjectReference*)it.GetItem())->mObjectID,ioCurrentPageIndex);
903 			}
904 		}
905 		else
906 		{
907 			TRACE_LOG1("PDFParser::ParsePagesIDs, unexpected object type. should be either Page or Pages, found %s",objectType->GetValue().c_str());
908 			status = PDFHummus::eFailure;
909 			break;
910 		}
911 	}while(false);
912 
913 	return status;
914 }
915 
GetPagesCount()916 unsigned long  PDFParser::GetPagesCount()
917 {
918 	return mPagesCount;
919 }
920 
GetPageObjectID(unsigned long inPageIndex)921 ObjectIDType PDFParser::GetPageObjectID(unsigned long inPageIndex)
922 {
923 	if(mPagesCount <= inPageIndex)
924 		return 0;
925 
926 	return mPagesObjectIDs[inPageIndex];
927 }
928 
929 
ParsePage(unsigned long inPageIndex)930 PDFDictionary* PDFParser::ParsePage(unsigned long inPageIndex)
931 {
932 	if(mPagesCount <= inPageIndex)
933 		return NULL;
934 
935 	PDFObjectCastPtr<PDFDictionary> pageObject(ParseNewObject(mPagesObjectIDs[inPageIndex]));
936 
937 	if(!pageObject)
938 	{
939 		TRACE_LOG1("PDFParser::ParsePage, couldn't find page object for index %ld",inPageIndex);
940 		return NULL;
941 	}
942 
943 	PDFObjectCastPtr<PDFName> objectType(pageObject->QueryDirectObject("Type"));
944 
945 	if(scPage == objectType->GetValue())
946 	{
947 		pageObject->AddRef();
948 		return pageObject.GetPtr();
949 	}
950 	else
951 	{
952 		TRACE_LOG1("PDFParser::ParsePage, page object listed in page array for %ld is actually not a page",inPageIndex);
953 		return NULL;
954 	}
955 }
956 
QueryDictionaryObject(PDFDictionary * inDictionary,const std::string & inName)957 PDFObject* PDFParser::QueryDictionaryObject(PDFDictionary* inDictionary,const std::string& inName)
958 {
959 	RefCountPtr<PDFObject> anObject(inDictionary->QueryDirectObject(inName));
960 
961 	if(anObject.GetPtr() == NULL)
962 		return NULL;
963 
964 	if(anObject->GetType() == PDFObject::ePDFObjectIndirectObjectReference)
965 	{
966 		PDFObject* theActualObject = ParseNewObject(((PDFIndirectObjectReference*)anObject.GetPtr())->mObjectID);
967 		return theActualObject;
968 	}
969 	else
970 	{
971 		anObject->AddRef(); // adding ref to increase owners
972 		return anObject.GetPtr();
973 	}
974 }
975 
QueryArrayObject(PDFArray * inArray,unsigned long inIndex)976 PDFObject* PDFParser::QueryArrayObject(PDFArray* inArray,unsigned long inIndex)
977 {
978 	RefCountPtr<PDFObject> anObject(inArray->QueryObject(inIndex));
979 
980 	if(anObject.GetPtr() == NULL)
981 		return NULL;
982 
983 	if(anObject->GetType() == PDFObject::ePDFObjectIndirectObjectReference)
984 	{
985 		PDFObject* theActualObject = ParseNewObject(((PDFIndirectObjectReference*)anObject.GetPtr())->mObjectID);
986 		return theActualObject;
987 	}
988 	else
989 	{
990 		anObject->AddRef(); // adding ref to increase owners
991 		return anObject.GetPtr();
992 	}
993 
994 }
995 
ParsePreviousXrefs(PDFDictionary * inTrailer)996 EStatusCode PDFParser::ParsePreviousXrefs(PDFDictionary* inTrailer)
997 {
998 	PDFObjectCastPtr<PDFInteger> previousPosition(inTrailer->QueryDirectObject("Prev"));
999 	if(!previousPosition)
1000 	{
1001 		TRACE_LOG("PDFParser::ParsePreviousXrefs, unexpected, prev is not integer");
1002 		return PDFHummus::eFailure;
1003 	}
1004 
1005 	EStatusCode status;
1006 
1007 	XrefEntryInput* aTable = new XrefEntryInput[mXrefSize];
1008 	do
1009 	{
1010 		PDFDictionary* trailerP = NULL;
1011 
1012         XrefEntryInput* extendedTable = NULL;
1013         ObjectIDType extendedTableSize;
1014 		status = ParseDirectory(previousPosition->GetValue(),aTable,mXrefSize,&trailerP,&extendedTable,&extendedTableSize);
1015 		if(status != PDFHummus::eSuccess)
1016 			break;
1017 		RefCountPtr<PDFDictionary> trailer(trailerP);
1018 
1019 		if(trailer->Exists("Prev"))
1020 		{
1021 			status = ParsePreviousXrefs(trailer.GetPtr());
1022 			if(status != PDFHummus::eSuccess)
1023 				break;
1024 		}
1025 
1026 
1027         // Table may have been extended, in which case replace the pointer and current size
1028         ObjectIDType newTableSize;
1029         if(extendedTable)
1030         {
1031             newTableSize = extendedTableSize;
1032             delete[] aTable;
1033             aTable = extendedTable;
1034         }
1035         else
1036             newTableSize = mXrefSize;
1037         MergeXrefWithMainXref(aTable,newTableSize);
1038 	}
1039 	while(false);
1040 
1041 	delete[] aTable;
1042 	return status;
1043 }
1044 
ParseDirectory(LongFilePositionType inXrefPosition,XrefEntryInput * inXrefTable,ObjectIDType inXrefSize,PDFDictionary ** outTrailer,XrefEntryInput ** outExtendedTable,ObjectIDType * outExtendedTableSize)1045 EStatusCode PDFParser::ParseDirectory(LongFilePositionType inXrefPosition,
1046 									  XrefEntryInput* inXrefTable,
1047 									  ObjectIDType inXrefSize,
1048 									  PDFDictionary** outTrailer,
1049                                       XrefEntryInput** outExtendedTable,
1050                                       ObjectIDType* outExtendedTableSize)
1051 {
1052 	EStatusCode status = PDFHummus::eSuccess;
1053 
1054 	MovePositionInStream(inXrefPosition);
1055 
1056 	do
1057 	{
1058 		// take the object, so that we can check whether this is an Xref or an Xref stream
1059 		RefCountPtr<PDFObject> anObject(mObjectParser.ParseNewObject(mParserExtender));
1060 		if(!anObject)
1061 		{
1062 			status = PDFHummus::eFailure;
1063 			break;
1064 		}
1065 
1066 		if(anObject->GetType() == PDFObject::ePDFObjectSymbol && ((PDFSymbol*)anObject.GetPtr())->GetValue() == scXref)
1067 		{
1068 			// This is the case of a regular xref table. note that as oppose to the main trailer case
1069 			// i already have a limit of Xrefsize (which is determined by the main trailer Size entry)
1070 			// so i don't have to parse the trailer in advance, but rather just read the file in the natural order:
1071 			// first - the xref then the trailer.
1072 			status = ParseXrefFromXrefTable(inXrefTable,inXrefSize,inXrefPosition,outExtendedTable,outExtendedTableSize);
1073 			if(status != PDFHummus::eSuccess)
1074 			{
1075 				TRACE_LOG1("PDFParser::ParseDirectory, failed to parse xref table in %ld",inXrefPosition);
1076 				break;
1077 			}
1078 
1079             if(outExtendedTable)
1080             {
1081                 inXrefTable = *outExtendedTable;
1082                 inXrefSize = *outExtendedTableSize;
1083             }
1084 
1085 			// at this point we should be after the token of the "trailer"
1086 			PDFObjectCastPtr<PDFDictionary> trailerDictionary(mObjectParser.ParseNewObject(mParserExtender));
1087 			if(!trailerDictionary)
1088 			{
1089 				status = PDFHummus::eFailure;
1090 				TRACE_LOG("PDFParser::ParseDirectory, failure to parse trailer dictionary");
1091 				break;
1092 			}
1093 
1094 			// For hybrids, check also XRefStm entry
1095 			PDFObjectCastPtr<PDFInteger> xrefStmReference(trailerDictionary->QueryDirectObject("XRefStm"));
1096 			if(xrefStmReference.GetPtr())
1097 			{
1098 				// if exists, merge update xref
1099 				status = ParseXrefFromXrefStream(inXrefTable,inXrefSize,xrefStmReference->GetValue(),outExtendedTable,outExtendedTableSize);
1100 				if(status != PDFHummus::eSuccess)
1101 				{
1102 					TRACE_LOG("PDFParser::ParseDirectory, failure to parse xref in hybrid mode");
1103 					break;
1104 				}
1105 			}
1106 
1107 			trailerDictionary->AddRef();
1108 			*outTrailer = trailerDictionary.GetPtr();
1109 		}
1110 		else if(anObject->GetType() == PDFObject::ePDFObjectInteger && ((PDFInteger*)anObject.GetPtr())->GetValue() > 0)
1111 		{
1112 			// Xref stream case. make some validations, grab the xref stream object details, and parse it
1113 
1114 			PDFObjectCastPtr<PDFInteger> versionObject(mObjectParser.ParseNewObject(mParserExtender));
1115 
1116 			if(!versionObject)
1117 			{
1118 				TRACE_LOG("PDFParser::ParseDirectory, failed to read xref object declaration, Version");
1119 				status = PDFHummus::eFailure;
1120 				break;
1121 			}
1122 
1123 			if(mParserExtender)
1124 				mParserExtender->OnObjectStart(((PDFInteger*)anObject.GetPtr())->GetValue(),versionObject->GetValue());
1125 
1126 
1127 			PDFObjectCastPtr<PDFSymbol> objKeyword(mObjectParser.ParseNewObject(mParserExtender));
1128 
1129 			if(!objKeyword)
1130 			{
1131 				TRACE_LOG("PDFParser::ParseDirectory, failed to read xref object declaration, obj keyword");
1132 				status = PDFHummus::eFailure;
1133 				break;
1134 			}
1135 
1136 			if(objKeyword->GetValue() != scObj)
1137 			{
1138 				TRACE_LOG1("PDFParser::ParseDirectory, failed to read xref object declaration, expected obj keyword found %s",
1139 					objKeyword->GetValue().c_str());
1140 				status = PDFHummus::eFailure;
1141 				break;
1142 			}
1143 
1144 			PDFObjectCastPtr<PDFStreamInput> xrefStream(mObjectParser.ParseNewObject(mParserExtender));
1145 			if(!xrefStream)
1146 			{
1147 				TRACE_LOG("PDFParser::BuildXrefTableAndTrailerFromXrefStream, failure to parse xref stream");
1148 				status = PDFHummus::eFailure;
1149 				break;
1150 			}
1151 
1152 			if(mParserExtender)
1153 				mParserExtender->OnObjectEnd(xrefStream.GetPtr());
1154 
1155 			*outTrailer = xrefStream->QueryStreamDictionary();
1156 
1157 			status = ParseXrefFromXrefStream(inXrefTable,inXrefSize,xrefStream.GetPtr(),outExtendedTable,outExtendedTableSize);
1158 			if(status != PDFHummus::eSuccess)
1159 				break;
1160 		}
1161 		else
1162 		{
1163 			TRACE_LOG("PDFParser::ParseDirectory,Unexpected object at xref start");
1164 			status = PDFHummus::eFailure;
1165 		}
1166 	}while(false);
1167 	return status;
1168 }
1169 
MergeXrefWithMainXref(XrefEntryInput * inTableToMerge,ObjectIDType inMergedTableSize)1170 void PDFParser::MergeXrefWithMainXref(XrefEntryInput* inTableToMerge,ObjectIDType inMergedTableSize)
1171 {
1172     if(inMergedTableSize > mXrefSize)
1173     {
1174         XrefEntryInput* newTable = ExtendXrefTableToSize(mXrefTable, mXrefSize, inMergedTableSize);
1175         mXrefSize = inMergedTableSize;
1176         delete[] mXrefTable;
1177         mXrefTable = newTable;
1178     }
1179 
1180 	for(ObjectIDType i = 0; i < mXrefSize; ++i)
1181 	{
1182 		if(inTableToMerge[i].mType != eXrefEntryUndefined)
1183 			mXrefTable[i] =	inTableToMerge[i];
1184 	}
1185 }
1186 
1187 
ParseFileDirectory()1188 EStatusCode PDFParser::ParseFileDirectory()
1189 {
1190 	EStatusCode status = PDFHummus::eSuccess;
1191 
1192 
1193 	MovePositionInStream(mLastXrefPosition);
1194 
1195 	do
1196 	{
1197 		// take the object, so that we can check whether this is an Xref or an Xref stream
1198 		RefCountPtr<PDFObject> anObject(mObjectParser.ParseNewObject(mParserExtender));
1199 		if(!anObject)
1200 		{
1201 			status = PDFHummus::eFailure;
1202 			break;
1203 		}
1204 
1205 		if(anObject->GetType() == PDFObject::ePDFObjectSymbol && ((PDFSymbol*)anObject.GetPtr())->GetValue() == scXref)
1206 		{
1207 			// this would be a normal xref case
1208 			// jump lines till you get to a line where the token is "trailer". then parse.
1209 			status = ParseTrailerDictionary();
1210 			if(status != PDFHummus::eSuccess)
1211 				break;
1212 
1213 			status = BuildXrefTableFromTable();
1214 			if(status != PDFHummus::eSuccess)
1215 				break;
1216 		}
1217 		else if(anObject->GetType() == PDFObject::ePDFObjectInteger && ((PDFInteger*)anObject.GetPtr())->GetValue() > 0)
1218 		{
1219 			// Xref stream case
1220 			status = BuildXrefTableAndTrailerFromXrefStream(((PDFInteger*)anObject.GetPtr())->GetValue());
1221 			if(status != PDFHummus::eSuccess)
1222 				break;
1223 
1224 		}
1225 		else
1226 		{
1227 			TRACE_LOG("PDFParser::ParseFileDirectory,Unexpected object at xref start");
1228 			status = eFailure;
1229 		}
1230 
1231 
1232 	}while(false);
1233 
1234 
1235 
1236 	return status;
1237 }
1238 
BuildXrefTableAndTrailerFromXrefStream(long long inXrefStreamObjectID)1239 EStatusCode PDFParser::BuildXrefTableAndTrailerFromXrefStream(long long inXrefStreamObjectID)
1240 {
1241 	// xref stream is trailer and stream togather. need to parse them both.
1242 	// the object parser is now after the object ID. so verify that next we goot a version and the obj keyword
1243 	// then parse the xref stream
1244 	EStatusCode status = PDFHummus::eSuccess;
1245 
1246 	PDFObjectCastPtr<PDFInteger> versionObject(mObjectParser.ParseNewObject(mParserExtender));
1247 
1248 	do
1249 	{
1250 		if(!versionObject)
1251 		{
1252 			TRACE_LOG("PDFParser::BuildXrefTableAndTrailerFromXrefStream, failed to read xref object declaration, Version");
1253 			status = PDFHummus::eFailure;
1254 			break;
1255 		}
1256 
1257 
1258 		if(mParserExtender)
1259 			mParserExtender->OnObjectStart(inXrefStreamObjectID,versionObject->GetValue());
1260 
1261 		PDFObjectCastPtr<PDFSymbol> objKeyword(mObjectParser.ParseNewObject(mParserExtender));
1262 
1263 		if(!objKeyword)
1264 		{
1265 			TRACE_LOG("PDFParser::BuildXrefTableAndTrailerFromXrefStream, failed to read xref object declaration, obj keyword");
1266 			status = PDFHummus::eFailure;
1267 			break;
1268 		}
1269 
1270 		if(objKeyword->GetValue() != scObj)
1271 		{
1272 			TRACE_LOG1("PDFParser::BuildXrefTableAndTrailerFromXrefStream, failed to read xref object declaration, expected obj keyword found %s",
1273 				objKeyword->GetValue().c_str());
1274 			status = PDFHummus::eFailure;
1275 			break;
1276 		}
1277 
1278 		// k. now just parse the object which should be a stream
1279 		PDFObjectCastPtr<PDFStreamInput> xrefStream(mObjectParser.ParseNewObject(mParserExtender));
1280 		if(!xrefStream)
1281 		{
1282 			TRACE_LOG("PDFParser::BuildXrefTableAndTrailerFromXrefStream, failure to parse xref stream");
1283 			status = PDFHummus::eFailure;
1284 			break;
1285 		}
1286 
1287 		if(mParserExtender)
1288 			mParserExtender->OnObjectEnd(xrefStream.GetPtr());
1289 
1290 		RefCountPtr<PDFDictionary> xrefDictionary(xrefStream->QueryStreamDictionary());
1291 		mTrailer = xrefDictionary;
1292 
1293 		status = DetermineXrefSize();
1294 		if(status != PDFHummus::eSuccess)
1295 			break;
1296 
1297 		status = InitializeXref();
1298 		if(status != PDFHummus::eSuccess)
1299 			break;
1300 
1301 		if(mTrailer->Exists("Prev"))
1302 		{
1303 			status = ParsePreviousXrefs(mTrailer.GetPtr());
1304 			if(status != PDFHummus::eSuccess)
1305 				break;
1306 		}
1307 
1308         XrefEntryInput* extendedTable = NULL;
1309         ObjectIDType extendedTableSize;
1310 		status = ParseXrefFromXrefStream(mXrefTable,mXrefSize,xrefStream.GetPtr(),&extendedTable,&extendedTableSize);
1311 		if(status != PDFHummus::eSuccess)
1312 			break;
1313 
1314         // Table may have been extended, in which case replace the pointer and current size
1315         if(extendedTable)
1316         {
1317             mXrefSize = extendedTableSize;
1318             delete[] mXrefTable;
1319             mXrefTable = extendedTable;
1320         }
1321 
1322 	}while(false);
1323 
1324 	return status;
1325 
1326 }
1327 
ParseXrefFromXrefStream(XrefEntryInput * inXrefTable,ObjectIDType inXrefSize,LongFilePositionType inXrefPosition,XrefEntryInput ** outExtendedTable,ObjectIDType * outExtendedTableSize)1328 EStatusCode PDFParser::ParseXrefFromXrefStream(XrefEntryInput* inXrefTable,
1329                                                ObjectIDType inXrefSize,
1330                                                LongFilePositionType inXrefPosition,
1331                                                XrefEntryInput** outExtendedTable,
1332                                                ObjectIDType* outExtendedTableSize)
1333 {
1334 	EStatusCode status = PDFHummus::eSuccess;
1335 
1336 	MovePositionInStream(inXrefPosition);
1337 
1338 	do
1339 	{
1340 		// take the object, so that we can check whether this is an Xref or an Xref stream
1341 		PDFObjectCastPtr<PDFInteger> anObject(mObjectParser.ParseNewObject(mParserExtender));
1342 		if(!anObject || anObject->GetValue() <= 0)
1343 		{
1344 			TRACE_LOG1("PDFParser::ParseXrefFromXrefStream, expecting object number for xref stream at %ld",inXrefPosition);
1345 			status = PDFHummus::eFailure;
1346 			break;
1347 		}
1348 
1349 		PDFObjectCastPtr<PDFInteger> versionObject(mObjectParser.ParseNewObject(mParserExtender));
1350 
1351 		if(!versionObject)
1352 		{
1353 			TRACE_LOG("PDFParser::ParseXrefFromXrefStream, failed to read xref object declaration, Version");
1354 			status = PDFHummus::eFailure;
1355 			break;
1356 		}
1357 
1358 		if(mParserExtender)
1359 			mParserExtender->OnObjectStart(anObject->GetValue(),versionObject->GetValue());
1360 
1361 		PDFObjectCastPtr<PDFSymbol> objKeyword(mObjectParser.ParseNewObject(mParserExtender));
1362 
1363 		if(!objKeyword)
1364 		{
1365 			TRACE_LOG("PDFParser::ParseXrefFromXrefStream, failed to read xref object declaration, obj keyword");
1366 			status = PDFHummus::eFailure;
1367 			break;
1368 		}
1369 
1370 		if(objKeyword->GetValue() != scObj)
1371 		{
1372 			TRACE_LOG1("PDFParser::ParseXrefFromXrefStream, failed to read xref object declaration, expected obj keyword found %s",
1373 				objKeyword->GetValue().c_str());
1374 			status = PDFHummus::eFailure;
1375 			break;
1376 		}
1377 
1378 		PDFObjectCastPtr<PDFStreamInput> xrefStream(mObjectParser.ParseNewObject(mParserExtender));
1379 		if(!xrefStream)
1380 		{
1381 			TRACE_LOG("PDFParser::ParseXrefFromXrefStream, failure to parse xref stream");
1382 			status = PDFHummus::eFailure;
1383 			break;
1384 		}
1385 
1386 		if(mParserExtender)
1387 			mParserExtender->OnObjectEnd(xrefStream.GetPtr());
1388 
1389 		status = ParseXrefFromXrefStream(inXrefTable,inXrefSize,xrefStream.GetPtr(),outExtendedTable,outExtendedTableSize);
1390 	}while(false);
1391 	return status;
1392 }
1393 
ParseXrefFromXrefStream(XrefEntryInput * inXrefTable,ObjectIDType inXrefSize,PDFStreamInput * inXrefStream,XrefEntryInput ** outExtendedTable,ObjectIDType * outExtendedTableSize)1394 EStatusCode PDFParser::ParseXrefFromXrefStream(XrefEntryInput* inXrefTable,
1395                                                ObjectIDType inXrefSize,
1396                                                PDFStreamInput* inXrefStream,
1397                                                XrefEntryInput** outExtendedTable,
1398                                                ObjectIDType* outExtendedTableSize)
1399 {
1400 	// 1. Setup the stream to read from the stream start location
1401 	// 2. Set it up with an input stream to decode if required
1402 	// 3. if there are subsections, loop them, otherwise assume a single section of 0..size
1403 	// 4. for each subsection use the base number as starting, and count as well, to read the stream entries to the right position in the table
1404 	//    The entries are read using the "W" value. make sure to read even values that you don't need.
1405 
1406 	EStatusCode status = PDFHummus::eSuccess;
1407 
1408     outExtendedTable = NULL;
1409 
1410 	IByteReader* xrefStreamSource = CreateInputStreamReader(inXrefStream);
1411 	int* widthsArray = NULL;
1412 
1413 	do
1414 	{
1415 		if(!xrefStreamSource)
1416 		{
1417 			status = PDFHummus::eFailure;
1418 			break;
1419 		}
1420 
1421 		RefCountPtr<PDFDictionary> streamDictionary(inXrefStream->QueryStreamDictionary());
1422 
1423 		// setup w array
1424 		PDFObjectCastPtr<PDFArray> wArray(QueryDictionaryObject(streamDictionary.GetPtr(),"W"));
1425 		if(!wArray)
1426 		{
1427 			TRACE_LOG("PDFParser::ParseXrefFromXrefStream, W array not available. failing");
1428 			status = PDFHummus::eFailure;
1429 			break;
1430 		}
1431 
1432 		widthsArray = new int[wArray->GetLength()];
1433 		for(unsigned long i=0;i <wArray->GetLength();++i)
1434 		{
1435 			PDFObjectCastPtr<PDFInteger> widthObject(wArray->QueryObject(i));
1436 			if(!widthObject)
1437 			{
1438 				TRACE_LOG("PDFParser::ParseXrefFromXrefStream, wrong items in width array (supposed to have only integers)");
1439 				status = PDFHummus::eFailure;
1440 				break;
1441 			}
1442 			widthsArray[i] = (int)widthObject->GetValue();
1443 		}
1444 		if(status != PDFHummus::eSuccess)
1445 			break;
1446 
1447 		// read the segments from the stream
1448 		PDFObjectCastPtr<PDFArray> subsectionsIndex(QueryDictionaryObject(streamDictionary.GetPtr(),"Index"));
1449 		MovePositionInStream(inXrefStream->GetStreamContentStart());
1450 
1451 		if(!subsectionsIndex)
1452 		{
1453 			PDFObjectCastPtr<PDFInteger> xrefSize(QueryDictionaryObject(streamDictionary.GetPtr(),"Size"));
1454 			if(!xrefSize)
1455 			{
1456 				TRACE_LOG("PDFParser::ParseXrefFromXrefStream, xref size does not exist for this stream");
1457 				status = PDFHummus::eFailure;
1458 				break;
1459 			}
1460 
1461             // if reading objects past expected range interesting consult policy
1462             ObjectIDType readXrefSize = (ObjectIDType)xrefSize->GetValue();
1463             if(readXrefSize > inXrefSize)
1464             {
1465                 if(mAllowExtendingSegments)
1466                 {
1467                     inXrefTable = ExtendXrefTableToSize(inXrefTable,inXrefSize,readXrefSize);
1468                     inXrefSize = readXrefSize;
1469                     if(outExtendedTable && *outExtendedTable)
1470                         delete[] *outExtendedTable;
1471                     *outExtendedTable = inXrefTable;
1472                     *outExtendedTableSize = readXrefSize;
1473                 }
1474                 else
1475                     break;
1476             }
1477 			status = ReadXrefStreamSegment(inXrefTable,0,readXrefSize,xrefStreamSource,widthsArray,wArray->GetLength());
1478 		}
1479 		else
1480 		{
1481 			SingleValueContainerIterator<PDFObjectVector> segmentsIterator  = subsectionsIndex->GetIterator();
1482 			PDFObjectCastPtr<PDFInteger> segmentValue;
1483 			while(segmentsIterator.MoveNext() && PDFHummus::eSuccess == status)
1484 			{
1485 				segmentValue = segmentsIterator.GetItem();
1486 				if(!segmentValue)
1487 				{
1488 					TRACE_LOG("PDFParser::ParseXrefFromXrefStream, found non integer value in Index array of xref stream");
1489 					status = PDFHummus::eFailure;
1490 					break;
1491 				}
1492 				ObjectIDType startObject = (ObjectIDType)segmentValue->GetValue();
1493 				if(!segmentsIterator.MoveNext())
1494 				{
1495 					TRACE_LOG("PDFParser::ParseXrefFromXrefStream,Index array of xref stream should have an even number of values");
1496 					status = PDFHummus::eFailure;
1497 					break;
1498 				}
1499 
1500 				segmentValue = segmentsIterator.GetItem();
1501 				if(!segmentValue)
1502 				{
1503 					TRACE_LOG("PDFParser::ParseXrefFromXrefStream, found non integer value in Index array of xref stream");
1504 					status = PDFHummus::eFailure;
1505 					break;
1506 				}
1507 				ObjectIDType objectsCount = (ObjectIDType)segmentValue->GetValue();
1508 				// if reading objects past expected range interesting consult policy
1509 				if(startObject +  objectsCount > inXrefSize)
1510                 {
1511                     if(mAllowExtendingSegments)
1512                     {
1513                         inXrefTable = ExtendXrefTableToSize(inXrefTable,inXrefSize,startObject +  objectsCount);
1514                         inXrefSize = startObject +  objectsCount;
1515                         if(outExtendedTable && *outExtendedTable)
1516                             delete[] *outExtendedTable;
1517                         *outExtendedTable = inXrefTable;
1518                         *outExtendedTableSize = startObject +  objectsCount;
1519                     }
1520                     else
1521                         break;
1522                 }
1523 				status = ReadXrefStreamSegment(inXrefTable,startObject,std::min<ObjectIDType>(objectsCount,inXrefSize - startObject),xrefStreamSource,widthsArray,wArray->GetLength());
1524 			}
1525 		}
1526 	}while(false);
1527 
1528 	delete xrefStreamSource;
1529 	delete[] widthsArray;
1530 	return status;
1531 }
1532 
// Sets the position of mStream to inPosition and then resets the object parser's read state,
// so that parsing continues from the new position.
void PDFParser::MovePositionInStream(LongFilePositionType inPosition)
{
	mStream->SetPosition(inPosition);
	mObjectParser.ResetReadState();
}
1538 
ReadXrefStreamSegment(XrefEntryInput * inXrefTable,ObjectIDType inSegmentStartObject,ObjectIDType inSegmentCount,IByteReader * inReadFrom,int * inEntryWidths,unsigned long inEntryWidthsSize)1539 EStatusCode PDFParser::ReadXrefStreamSegment(XrefEntryInput* inXrefTable,
1540 											 ObjectIDType inSegmentStartObject,
1541 											 ObjectIDType inSegmentCount,
1542 											 IByteReader* inReadFrom,
1543 											 int* inEntryWidths,
1544 											 unsigned long inEntryWidthsSize)
1545 {
1546 	ObjectIDType objectToRead = inSegmentStartObject;
1547 	EStatusCode status = PDFHummus::eSuccess;
1548 	if(inEntryWidthsSize != 3)
1549 	{
1550 		TRACE_LOG("PDFParser::ReadXrefStreamSegment, can handle only 3 length entries");
1551 		return PDFHummus::eFailure;
1552 	}
1553 
1554 	// Note - i'm also checking that the stream is not ended. in non-finite segments, it could be that the particular
1555 	// stream does no define all objects...just the "updated" ones
1556 	for(; (objectToRead < inSegmentStartObject + inSegmentCount) && PDFHummus::eSuccess == status && inReadFrom->NotEnded();++objectToRead)
1557 	{
1558 		long long entryType;
1559 		status = ReadXrefSegmentValue(inReadFrom,inEntryWidths[0],entryType);
1560 		if(status != PDFHummus::eSuccess)
1561 			break;
1562 		status = ReadXrefSegmentValue(inReadFrom,inEntryWidths[1],inXrefTable[objectToRead].mObjectPosition);
1563 		if(status != PDFHummus::eSuccess)
1564 			break;
1565 		status = ReadXrefSegmentValue(inReadFrom,inEntryWidths[2],inXrefTable[objectToRead].mRivision);
1566 		if(status != PDFHummus::eSuccess)
1567 			break;
1568 
1569 		if(0 == entryType)
1570 		{
1571 			inXrefTable[objectToRead].mType = eXrefEntryDelete;
1572 		}
1573 		else if (1 == entryType)
1574 		{
1575 			inXrefTable[objectToRead].mType = eXrefEntryExisting;
1576 		}
1577 		else if(2 == entryType)
1578 		{
1579 			inXrefTable[objectToRead].mType = eXrefEntryStreamObject;
1580 		}
1581 		else
1582 		{
1583 			TRACE_LOG("PDFParser::ReadXrefStreamSegment, unfamiliar entry type. must be either 0,1 or 2");
1584 			status = PDFHummus::eFailure;
1585 		}
1586 	}
1587 	return status;
1588 }
1589 
ReadXrefSegmentValue(IByteReader * inSource,int inEntrySize,long long & outValue)1590 EStatusCode PDFParser::ReadXrefSegmentValue(IByteReader* inSource,int inEntrySize,long long& outValue)
1591 {
1592 	outValue = 0;
1593 	Byte buffer;
1594 	EStatusCode status = PDFHummus::eSuccess;
1595 
1596 	for(int i=0;i<inEntrySize && PDFHummus::eSuccess == status;++i)
1597 	{
1598 		status = (inSource->Read(&buffer,1) == 1 ? PDFHummus::eSuccess : PDFHummus::eFailure);
1599 		if(status != PDFHummus::eFailure)
1600 			outValue = (outValue<<8) + buffer;
1601 	}
1602 	return status;
1603 }
1604 
ReadXrefSegmentValue(IByteReader * inSource,int inEntrySize,ObjectIDType & outValue)1605 EStatusCode PDFParser::ReadXrefSegmentValue(IByteReader* inSource,int inEntrySize,ObjectIDType& outValue)
1606 {
1607 	outValue = 0;
1608 	Byte buffer;
1609 	EStatusCode status = PDFHummus::eSuccess;
1610 
1611 	for(int i=0;i<inEntrySize && PDFHummus::eSuccess == status;++i)
1612 	{
1613 		status = (inSource->Read(&buffer,1) == 1 ? PDFHummus::eSuccess : PDFHummus::eFailure);
1614 		if(status != PDFHummus::eFailure)
1615 			outValue = (outValue<<8) + buffer;
1616 	}
1617 	return status;
1618 }
1619 
ParseExistingInDirectStreamObject(ObjectIDType inObjectId)1620 PDFObject* PDFParser::ParseExistingInDirectStreamObject(ObjectIDType inObjectId)
1621 {
1622 	// parsing an object in an object stream requires the following:
1623 	// 1. Setting the position to this object stream
1624 	// 2. Reading the stream First and N. store.
1625 	// 3. Creating a stream reader for the initial stream position and length, possibly decoding with flate
1626 	// 4. Read the stream header. store.
1627 	// 5. Jump to the right object position (or decode till its position)
1628 	// 6. Read the object
1629 
1630 	EStatusCode status = PDFHummus::eSuccess;
1631 	ObjectStreamHeaderEntry* objectStreamHeader;
1632 	IByteReader* objectSource = NULL;
1633 
1634 	InputStreamSkipperStream skipperStream;
1635 	ObjectIDType objectStreamID;
1636 	PDFObject* anObject = NULL;
1637 
1638 	do
1639 	{
1640 		objectStreamID = (ObjectIDType)mXrefTable[inObjectId].mObjectPosition;
1641 		PDFObjectCastPtr<PDFStreamInput> objectStream(ParseNewObject(objectStreamID));
1642 		if(!objectStream)
1643 		{
1644 			TRACE_LOG2("PDFParser::ParseExistingInDirectStreamObject, failed to parse object %ld. failed to find object stream for it, which should be %ld",
1645 						inObjectId,mXrefTable[inObjectId].mObjectPosition);
1646 			status = PDFHummus::eFailure;
1647 			break;
1648 		}
1649 
1650 		RefCountPtr<PDFDictionary> streamDictionary(objectStream->QueryStreamDictionary());
1651 
1652 		PDFObjectCastPtr<PDFInteger> streamObjectsCount(QueryDictionaryObject(streamDictionary.GetPtr(),"N"));
1653 		if(!streamObjectsCount)
1654 		{
1655 			TRACE_LOG1("PDFParser::ParseExistingInDirectStreamObject, no N key in stream dictionary %ld",objectStreamID);
1656 			status = PDFHummus::eFailure;
1657 			break;
1658 		}
1659 		ObjectIDType objectsCount = (ObjectIDType)streamObjectsCount->GetValue();
1660 
1661 		PDFObjectCastPtr<PDFInteger> firstStreamObjectPosition(QueryDictionaryObject(streamDictionary.GetPtr(),"First"));
1662 		if(!streamObjectsCount)
1663 		{
1664 			TRACE_LOG1("PDFParser::ParseExistingInDirectStreamObject, no First key in stream dictionary %ld",objectStreamID);
1665 			status = PDFHummus::eFailure;
1666 			break;
1667 		}
1668 
1669 		objectSource = CreateInputStreamReader(objectStream.GetPtr());
1670 		skipperStream.Assign(objectSource);
1671 		MovePositionInStream(objectStream->GetStreamContentStart());
1672 
1673 		mObjectParser.SetReadStream(&skipperStream,&skipperStream);
1674 
1675 		ObjectIDTypeToObjectStreamHeaderEntryMap::iterator it = mObjectStreamsCache.find(objectStreamID);
1676 
1677 		if(it == mObjectStreamsCache.end())
1678 		{
1679 			objectStreamHeader = new ObjectStreamHeaderEntry[objectsCount];
1680 			status = ParseObjectStreamHeader(objectStreamHeader,objectsCount);
1681 			if(status != PDFHummus::eSuccess)
1682 			{
1683 				delete[] objectStreamHeader;
1684 				break;
1685 			}
1686 			it = mObjectStreamsCache.insert(ObjectIDTypeToObjectStreamHeaderEntryMap::value_type(objectStreamID,objectStreamHeader)).first;
1687 		}
1688 		objectStreamHeader = it->second;
1689 
1690 		// verify that i got the right object ID
1691 		if(objectsCount <= mXrefTable[inObjectId].mRivision || objectStreamHeader[mXrefTable[inObjectId].mRivision].mObjectNumber != inObjectId)
1692 		{
1693 			TRACE_LOG2("PDFParser::ParseXrefFromXrefStream, wrong object. expecting to find object ID %ld, and found %ld",
1694 						inObjectId,
1695 						objectsCount <= mXrefTable[inObjectId].mRivision ?
1696 							-1 :
1697 							objectStreamHeader[mXrefTable[inObjectId].mRivision].mObjectNumber);
1698 			status = PDFHummus::eFailure;
1699 			break;
1700 		}
1701 
1702 		// when parsing the header, should be at position already..so don't skip if already there [using GetCurrentPosition to see if parsed some]
1703 		if(mXrefTable[inObjectId].mRivision != 0 || skipperStream.GetCurrentPosition() == 0)
1704 		{
1705 			LongFilePositionType objectPositionInStream = objectStreamHeader[mXrefTable[inObjectId].mRivision].mObjectOffset +
1706 														  firstStreamObjectPosition->GetValue();
1707 			skipperStream.SkipTo(objectPositionInStream);
1708 			mObjectParser.ResetReadState();
1709 		}
1710 
1711 		anObject = mObjectParser.ParseNewObject(mParserExtender);
1712 
1713 	}while(false);
1714 
1715 	mObjectParser.SetReadStream(mStream,&mCurrentPositionProvider);
1716 
1717 	if(PDFHummus::eSuccess == status)
1718 	{
1719 		return anObject;
1720 	}
1721 	else
1722 	{
1723 		if(anObject)
1724 			anObject->Release();
1725 		return NULL;
1726 	}
1727 }
1728 
ParseObjectStreamHeader(ObjectStreamHeaderEntry * inHeaderInfo,ObjectIDType inObjectsCount)1729 EStatusCode PDFParser::ParseObjectStreamHeader(ObjectStreamHeaderEntry* inHeaderInfo,ObjectIDType inObjectsCount)
1730 {
1731 	ObjectIDType currentObject = 0;
1732 	EStatusCode status = PDFHummus::eSuccess;
1733 
1734 	while(currentObject < inObjectsCount && (PDFHummus::eSuccess == status))
1735 	{
1736 		PDFObjectCastPtr<PDFInteger> objectNumber(mObjectParser.ParseNewObject(mParserExtender));
1737 		if(!objectNumber)
1738 		{
1739 			TRACE_LOG("PDFParser::ParseObjectStreamHeader, parsing failed when reading object number. either not enough objects, or of the wrong type");
1740 			status = PDFHummus::eFailure;
1741 			break;
1742 		}
1743 
1744 		PDFObjectCastPtr<PDFInteger> objectPosition(mObjectParser.ParseNewObject(mParserExtender));
1745 		if(!objectPosition)
1746 		{
1747 			TRACE_LOG("PDFParser::ParseObjectStreamHeader, parsing failed when reading object position. either not enough objects, or of the wrong type");
1748 			status = PDFHummus::eFailure;
1749 			break;
1750 		}
1751 		inHeaderInfo[currentObject].mObjectNumber = (ObjectIDType)(objectNumber->GetValue());
1752 		inHeaderInfo[currentObject].mObjectOffset = objectPosition->GetValue();
1753 		++currentObject;
1754 	}
1755 	return status;
1756 }
1757 
CreateInputStreamReader(PDFStreamInput * inStream)1758 IByteReader* PDFParser::CreateInputStreamReader(PDFStreamInput* inStream)
1759 {
1760 	RefCountPtr<PDFDictionary> streamDictionary(inStream->QueryStreamDictionary());
1761 	IByteReader* result = NULL;
1762 	EStatusCode status = PDFHummus::eSuccess;
1763 
1764 	do
1765 	{
1766 
1767 		// setup stream according to length and possible filter
1768 		PDFObjectCastPtr<PDFInteger> lengthObject(QueryDictionaryObject(streamDictionary.GetPtr(),"Length"));
1769 		if(!lengthObject)
1770 		{
1771 			TRACE_LOG("PDFParser::CreateInputStreamReader, stream does not have length, failing");
1772 			status = PDFHummus::eFailure;
1773 			break;
1774 		}
1775 
1776 		result = new InputLimitedStream(mStream,lengthObject->GetValue(),false);
1777 
1778 		// call for parser extender for encryption implementation
1779 		if(mParserExtender)
1780 			result = mParserExtender->CreateDecryptionFilterForStream(result);
1781 
1782 		RefCountPtr<PDFObject> filterObject(QueryDictionaryObject(streamDictionary.GetPtr(),"Filter"));
1783 		if(!filterObject)
1784 		{
1785 			// no filter, so stop here
1786 			break;
1787 		}
1788 
1789 		if(filterObject->GetType() == PDFObject::ePDFObjectArray)
1790 		{
1791 			PDFArray* filterObjectArray = (PDFArray*)filterObject.GetPtr();
1792 			PDFObjectCastPtr<PDFArray> decodeParams(QueryDictionaryObject(streamDictionary.GetPtr(),"DecodeParms"));
1793 			for(unsigned long i=0; i < filterObjectArray->GetLength() && eSuccess == status;++i)
1794 			{
1795 				PDFObjectCastPtr<PDFName> filterObjectItem(filterObjectArray->QueryObject(i));
1796 				if(!filterObjectItem)
1797 				{
1798 					TRACE_LOG("PDFParser::CreateInputStreamReader, filter item in an array is not a name. should be a name");
1799 					status = PDFHummus::eFailure;
1800 					break;
1801 				}
1802 
1803 				EStatusCodeAndIByteReader createStatus;
1804 				if(!decodeParams)
1805 				{
1806 					 createStatus = CreateFilterForStream(result,filterObjectItem.GetPtr(), NULL);
1807 				}
1808 				else
1809 				{
1810 					PDFObjectCastPtr<PDFDictionary> decodeParamsItem(QueryArrayObject(decodeParams.GetPtr(),i));
1811 
1812 					createStatus = CreateFilterForStream(result,(PDFName*)filterObject.GetPtr(), !decodeParamsItem ? NULL: decodeParamsItem.GetPtr());
1813 				}
1814 
1815 				if(createStatus.first != eSuccess)
1816 				{
1817 					status = PDFHummus::eFailure;
1818 					break;
1819 				}
1820 				else
1821 					result = createStatus.second;
1822 			}
1823 		}
1824 		else if(filterObject->GetType() == PDFObject::ePDFObjectName)
1825 		{
1826 			PDFObjectCastPtr<PDFDictionary> decodeParams(QueryDictionaryObject(streamDictionary.GetPtr(),"DecodeParms"));
1827 
1828 			EStatusCodeAndIByteReader createStatus = CreateFilterForStream(result,(PDFName*)filterObject.GetPtr(), !decodeParams ? NULL: decodeParams.GetPtr());
1829 			if(createStatus.first != eSuccess)
1830 			{
1831 				status = PDFHummus::eFailure;
1832 				break;
1833 			}
1834 			else
1835 				result = createStatus.second;
1836 
1837 		}
1838 		else
1839 		{
1840 			TRACE_LOG("PDFParser::CreateInputStreamReader, filter parameter is of unkown type. only array and name are supported.");
1841 			status = PDFHummus::eFailure;
1842 			break;
1843 		}
1844 
1845 	}while(false);
1846 
1847 
1848 	if(status != PDFHummus::eSuccess)
1849 	{
1850 		delete result;
1851 		result = NULL;
1852 	}
1853 	return result;
1854 }
1855 
CreateFilterForStream(IByteReader * inStream,PDFName * inFilterName,PDFDictionary * inDecodeParams)1856 EStatusCodeAndIByteReader PDFParser::CreateFilterForStream(IByteReader* inStream,PDFName* inFilterName,PDFDictionary* inDecodeParams)
1857 {
1858 	EStatusCode status = eSuccess;
1859 	IByteReader* result = NULL;
1860 
1861 	do
1862 	{
1863 
1864 		if(inFilterName->GetValue() == "FlateDecode")
1865 		{
1866 			InputFlateDecodeStream* flateStream;
1867 			flateStream = new InputFlateDecodeStream(NULL); // assigning null, so later delete, if failure occurs won't delete the input stream
1868 			result = flateStream;
1869 
1870 			// check for predictor n' such
1871 			if(!inDecodeParams)
1872 			{
1873 				// no predictor, stop here
1874 				flateStream->Assign(inStream);
1875 				break;
1876 			}
1877 
1878 			// read predictor, and apply the relevant predictor function
1879 			PDFObjectCastPtr<PDFInteger> predictor(QueryDictionaryObject(inDecodeParams,"Predictor"));
1880 
1881 			if(!predictor || predictor->GetValue() == 1)
1882 			{
1883 				// no predictor or default, stop here
1884 				flateStream->Assign(inStream);
1885 				break;
1886 			}
1887 
1888 			PDFObjectCastPtr<PDFInteger> columns(QueryDictionaryObject(inDecodeParams,"Columns"));
1889 			LongBufferSizeType columnsValue = columns.GetPtr() ?
1890 																(IOBasicTypes::LongBufferSizeType)columns->GetValue() :
1891 																1;
1892 
1893 			switch(predictor->GetValue())
1894 			{
1895 				case 2:
1896 				{
1897 					PDFObjectCastPtr<PDFInteger> colors(QueryDictionaryObject(inDecodeParams,"Colors"));
1898 					PDFObjectCastPtr<PDFInteger> bitsPerComponent(QueryDictionaryObject(inDecodeParams,"BitsPerComponent"));
1899 					result = new InputPredictorTIFFSubStream(result,
1900 															 colors.GetPtr() ?
1901 																(IOBasicTypes::LongBufferSizeType)colors->GetValue() :
1902 																1,
1903 															 bitsPerComponent.GetPtr() ?
1904 																(IOBasicTypes::Byte)colors->GetValue() :
1905 																8,
1906 																columnsValue);
1907 					break;
1908 				}
1909 				case 10:
1910 				{
1911 					result = new InputPredictorPNGNoneStream(result,columnsValue);
1912 					break;
1913 				}
1914 				case 11:
1915 				{
1916 					result = new InputPredictorPNGSubStream(result,columnsValue);
1917 					break;
1918 				}
1919 				case 12:
1920 				{
1921 
1922 					result =  new InputPredictorPNGUpStream(result,columnsValue);
1923 					break;
1924 				}
1925 				case 13:
1926 				{
1927 
1928 					result =  new InputPredictorPNGAverageStream(result,columnsValue);
1929 					break;
1930 				}
1931 				case 14:
1932 				{
1933 					result =  new InputPredictorPNGPaethStream(result,columnsValue);
1934 					break;
1935 				}
1936 				case 15:
1937 				{
1938 					result =  new InputPredictorPNGOptimumStream(result,columnsValue);
1939 					break;
1940 				}
1941 				default:
1942 				{
1943 					TRACE_LOG("PDFParser::CreateFilterForStream, supporting only predictor of types 1,2,10,11,12,13,14,15, failing");
1944 					status = PDFHummus::eFailure;
1945 					break;
1946 				}
1947 			}
1948 			flateStream->Assign(inStream);
1949 		}
1950 		else if(inFilterName->GetValue() == "ASCII85Decode")
1951 		{
1952 			result = new InputAscii85DecodeStream(inStream);
1953 		}
1954 #ifndef PDFHUMMUS_NO_DCT
1955         else if(inFilterName->GetValue() == "DCTDecode")
1956         {
1957             result = new InputDCTDecodeStream(inStream);
1958         }
1959 #endif
1960 		else if(mParserExtender)
1961 		{
1962 			result = mParserExtender->CreateFilterForStream(inStream,inFilterName,inDecodeParams);
1963 			if(result == inStream)
1964 			{
1965 				TRACE_LOG1("PDFParser::CreateFilterForStream, filter is not supported by extender - %s",inFilterName->GetValue().c_str());
1966 				status = PDFHummus::eFailure;
1967 				break;
1968 			}
1969 		}
1970 		else
1971 		{
1972 			TRACE_LOG("PDFParser::CreateFilterForStream, supporting only flate decode and ascii 85 decode, failing");
1973 			status = PDFHummus::eFailure;
1974 			break;
1975 		}
1976 	}while(false);
1977 
1978 	if(status != PDFHummus::eSuccess)
1979 	{
1980 		delete result;
1981 		result = NULL;
1982 	}
1983 	return EStatusCodeAndIByteReader(status,result);
1984 
1985 }
1986 
StartReadingFromStream(PDFStreamInput * inStream)1987 IByteReader* PDFParser::StartReadingFromStream(PDFStreamInput* inStream)
1988 {
1989     IByteReader* result = CreateInputStreamReader(inStream);
1990     if(result)
1991         MovePositionInStream(inStream->GetStreamContentStart());
1992     return result;
1993 }
1994 
StartStateFileParsing(IByteReaderWithPosition * inSourceStream)1995 EStatusCode PDFParser::StartStateFileParsing(IByteReaderWithPosition* inSourceStream)
1996 {
1997 	EStatusCode status;
1998 
1999 	ResetParser();
2000 
2001 	mStream = inSourceStream;
2002 	mCurrentPositionProvider.Assign(mStream);
2003 	mObjectParser.SetReadStream(inSourceStream,&mCurrentPositionProvider);
2004 
2005 	do
2006 	{
2007 		// initialize reading from end
2008 		mLastReadPositionFromEnd = 0;
2009 		mEncounteredFileStart = false;
2010 		mLastAvailableIndex = mCurrentBufferIndex = mLinesBuffer;
2011 
2012 		status = ParseEOFLine();
2013 		if(status != PDFHummus::eSuccess)
2014 			break;
2015 
2016 		status = ParseLastXrefPosition();
2017 		if(status != PDFHummus::eSuccess)
2018 			break;
2019 
2020 		status = ParseFileDirectory(); // that would be the xref and trailer
2021 		if(status != PDFHummus::eSuccess)
2022 			break;
2023 
2024 	}while(false);
2025 
2026 	return status;
2027 }
2028 
IsEncrypted()2029 bool PDFParser::IsEncrypted()
2030 {
2031 	PDFObjectCastPtr<PDFDictionary> encryptionDictionary(QueryDictionaryObject(mTrailer.GetPtr(),"Encrypt"));
2032 	return encryptionDictionary.GetPtr() != NULL ;
2033 }
2034 
SetParserExtender(IPDFParserExtender * inParserExtender)2035 void PDFParser::SetParserExtender(IPDFParserExtender* inParserExtender)
2036 {
2037 	mParserExtender = inParserExtender;
2038 }
2039 
IsEncryptionSupported()2040 bool PDFParser::IsEncryptionSupported()
2041 {
2042 	return mParserExtender && mParserExtender->DoesSupportEncryption();
2043 }
2044 
GetXrefSize()2045 ObjectIDType PDFParser::GetXrefSize()
2046 {
2047     return mXrefSize;
2048 }
2049 
GetXrefEntry(ObjectIDType inObjectID)2050 XrefEntryInput* PDFParser::GetXrefEntry(ObjectIDType inObjectID)
2051 {
2052     return (inObjectID < mXrefSize) ? mXrefTable+inObjectID : NULL;
2053 }
2054 
GetXrefPosition()2055 LongFilePositionType PDFParser::GetXrefPosition()
2056 {
2057     return mLastXrefPosition;
2058 }
2059 
GetParserStream()2060 IByteReaderWithPosition* PDFParser::GetParserStream()
2061 {
2062     return mStream;
2063 }
2064