1 /***************************************************************************
2  *   Copyright (C) 2005 by Dominik Seichter                                *
3  *   domseichter@web.de                                                    *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU Library General Public License as       *
7  *   published by the Free Software Foundation; either version 2 of the    *
8  *   License, or (at your option) any later version.                       *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU Library General Public     *
16  *   License along with this program; if not, write to the                 *
17  *   Free Software Foundation, Inc.,                                       *
18  *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
19  *                                                                         *
20  *   In addition, as a special exception, the copyright holders give       *
21  *   permission to link the code of portions of this program with the      *
22  *   OpenSSL library under certain conditions as described in each         *
23  *   individual source file, and distribute linked combinations            *
24  *   including the two.                                                    *
25  *   You must obey the GNU General Public License in all respects          *
26  *   for all of the code used other than OpenSSL.  If you modify           *
27  *   file(s) with this exception, you may extend this exception to your    *
28  *   version of the file(s), but you are not obligated to do so.  If you   *
29  *   do not wish to do so, delete this exception statement from your       *
30  *   version.  If you delete this exception statement from all source      *
31  *   files in the program, then also delete it here.                       *
32  ***************************************************************************/
33 
34 #include "PdfParserObject.h"
35 
36 #include "PdfArray.h"
37 #include "PdfDictionary.h"
38 #include "PdfEncrypt.h"
39 #include "PdfInputDevice.h"
40 #include "PdfInputStream.h"
41 #include "PdfParser.h"
42 #include "PdfStream.h"
43 #include "PdfVariant.h"
44 #include "PdfDefinesPrivate.h"
45 
46 #include <iostream>
47 #include <sstream>
48 
49 namespace PoDoFo {
50 
51 using namespace std;
52 
// Character counts of the PDF keywords that ParseFileComplete() compares
// tokens against with strncmp(). Each value equals strlen() of the keyword.
static const int s_nLenEndObj    = 6; // strlen("endobj");
static const int s_nLenStream    = 6; // strlen("stream");
// Disabled: only referenced by the commented-out "endstream" check at the
// end of ParseStream().
//static const int s_nLenEndStream = 9; // strlen("endstream");
56 
PdfParserObject(PdfVecObjects * pCreator,const PdfRefCountedInputDevice & rDevice,const PdfRefCountedBuffer & rBuffer,pdf_long lOffset)57 PdfParserObject::PdfParserObject( PdfVecObjects* pCreator, const PdfRefCountedInputDevice & rDevice,
58                                   const PdfRefCountedBuffer & rBuffer, pdf_long lOffset )
59     : PdfObject( PdfVariant::NullValue ), PdfTokenizer( rDevice, rBuffer ), m_pEncrypt( NULL )
60 {
61     m_pOwner = pCreator;
62 
63     InitPdfParserObject();
64 
65     m_lOffset = lOffset == -1 ? m_device.Device()->Tell() : lOffset;
66 }
67 
PdfParserObject(const PdfRefCountedBuffer & rBuffer)68 PdfParserObject::PdfParserObject( const PdfRefCountedBuffer & rBuffer )
69     : PdfObject( PdfVariant::NullValue ), PdfTokenizer( PdfRefCountedInputDevice(), rBuffer ),
70       m_pEncrypt( NULL )
71 {
72     InitPdfParserObject();
73 }
74 
~PdfParserObject()75 PdfParserObject::~PdfParserObject()
76 {
77 
78 }
79 
InitPdfParserObject()80 void PdfParserObject::InitPdfParserObject()
81 {
82     m_bIsTrailer        = false;
83 
84     // Whether or not demand loading is disabled we still don't load
85     // anything in the ctor. This just controls whether ::ParseFile(...)
86     // forces an immediate demand load, or lets it genuinely happen
87     // on demand.
88     m_bLoadOnDemand     = false;
89 
90     // We rely heavily on the demand loading infrastructure whether or not
91     // we *actually* delay loading.
92     EnableDelayedLoading();
93     EnableDelayedStreamLoading();
94 
95     m_lOffset           = -1;
96 
97     m_bStream           = false;
98     m_lStreamOffset     = 0;
99 }
100 
ReadObjectNumber()101 void PdfParserObject::ReadObjectNumber()
102 {
103     try {
104         pdf_long obj = this->GetNextNumber();
105         pdf_long gen = this->GetNextNumber();
106 
107         m_reference = PdfReference( static_cast<unsigned int>(obj), static_cast<pdf_uint16>(gen) );
108     } catch( PdfError & e ) {
109         e.AddToCallstack( __FILE__, __LINE__, "Object and generation number cannot be read." );
110         throw e;
111     }
112 
113     if( !this->IsNextToken( "obj" ))
114     {
115         std::ostringstream oss;
116         oss << "Error while reading object " << m_reference.ObjectNumber() << " "
117             << m_reference.GenerationNumber() << ": Next token is not 'obj'." << std::endl;
118         PODOFO_RAISE_ERROR_INFO( ePdfError_NoObject, oss.str().c_str() );
119     }
120 }
121 
ParseFile(PdfEncrypt * pEncrypt,bool bIsTrailer)122 void PdfParserObject::ParseFile( PdfEncrypt* pEncrypt, bool bIsTrailer )
123 {
124     if( !m_device.Device() )
125     {
126         PODOFO_RAISE_ERROR( ePdfError_InvalidHandle );
127     }
128 
129     if( m_lOffset > -1 )
130         m_device.Device()->Seek( m_lOffset );
131 
132     if( !bIsTrailer )
133         ReadObjectNumber();
134 
135 #if defined(PODOFO_VERBOSE_DEBUG)
136     std::cerr << "Parsing object number: " << m_reference.ObjectNumber()
137               << " " << m_reference.GenerationNumber() << " obj"
138               << " " << m_lOffset << " offset"
139               << " (DL: " << ( m_bLoadOnDemand ? "on" : "off" ) << ")"
140               << endl;
141 #endif // PODOFO_VERBOSE_DEBUG
142 
143     m_lOffset    = m_device.Device()->Tell();
144     m_pEncrypt   = pEncrypt;
145     m_bIsTrailer = bIsTrailer;
146 
147     if( !m_bLoadOnDemand )
148     {
149         // Force immediate loading of the object.  We need to do this through
150         // the deferred loading machinery to avoid getting the object into an
151         // inconsistent state.
152         // We can't do a full DelayedStreamLoad() because the stream might use
153         // an indirect /Length or /Length1 key that hasn't been read yet.
154         DelayedLoad();
155 
156         // TODO: support immediate loading of the stream here too. For that, we need
157         // to be able to trigger the reading of not-yet-parsed indirect objects
158         // such as might appear in a /Length key with an indirect reference.
159 
160 #if defined(PODOFO_EXTRA_CHECKS)
161         // Sanity check - the variant base must be fully loaded now
162         if (!DelayedLoadDone() )
163         {
164             // We don't know what went wrong, but the internal state is
165             // broken or the API rules aren't being followed and we
166             // can't carry on.
167             PODOFO_RAISE_ERROR( ePdfError_InternalLogic );
168         }
169 #endif // PODOF_EXTRA_CHECKS
170     }
171 }
172 
173 // Only called via the demand loading mechanism
174 // Be very careful to avoid recursive demand loads via PdfVariant
175 // or PdfObject method calls here.
ParseFileComplete(bool bIsTrailer)176 void PdfParserObject::ParseFileComplete( bool bIsTrailer )
177 {
178 #if defined(PODOFO_EXTRA_CHECKS)
179     PODOFO_ASSERT( DelayedLoadInProgress() );
180     PODOFO_ASSERT( !DelayedLoadDone() );
181 #endif
182     const char* pszToken;
183 
184     m_device.Device()->Seek( m_lOffset );
185     if( m_pEncrypt )
186         m_pEncrypt->SetCurrentReference( m_reference );
187 
188     // Do not call GetNextVariant directly,
189     // but GetNextToken, to handle empty objects like:
190     // 13 0 obj
191     // endobj
192 
193     EPdfTokenType eTokenType;
194     bool gotToken = this->GetNextToken( pszToken, &eTokenType );
195 
196     if (!gotToken)
197     {
198         PODOFO_RAISE_ERROR_INFO( ePdfError_UnexpectedEOF, "Expected variant." );
199     }
200 
201     // Check if we have an empty object or data
202     if( strncmp( pszToken, "endobj", s_nLenEndObj ) != 0 )
203     {
204         this->GetNextVariant( pszToken, eTokenType, *this, m_pEncrypt );
205         this->SetDirty( false );
206 
207         if( !bIsTrailer )
208         {
209             bool gotToken = this->GetNextToken( pszToken );
210             if (!gotToken)
211             {
212                 PODOFO_RAISE_ERROR_INFO( ePdfError_UnexpectedEOF, "Expected 'endobj' or (if dict) 'stream', got EOF." );
213             }
214             if( strncmp( pszToken, "endobj", s_nLenEndObj ) == 0 )
215                 ; // nothing to do, just validate that the PDF is correct
216             // If it's a dictionary, it might have a stream, so check for that
217             else if( this->IsDictionary() && strncmp( pszToken, "stream", s_nLenStream ) == 0 )
218             {
219                 m_bStream = true;
220                 m_lStreamOffset = m_device.Device()->Tell(); // NOTE: whitespace after "stream" handle in stream parser!
221 
222                 // Most of the code relies on PdfObjects that are dictionaries
223                 // to have the datatype ePdfDataType_Dictionary and not Stream.
224                 // Please use PdfObject::HasStream to check wether it has a stream.
225                 //
226                 // Commenting this out is right now easier than fixing all code to check
227                 // either for ePdfDataType_Stream or ePdfDataType_Dictionary
228                 //
229                 //eDataType = ePdfDataType_Stream;	// reset the object type to stream!
230             }
231             else
232             {
233                 PODOFO_RAISE_ERROR_INFO( ePdfError_NoObject, pszToken );
234             }
235         }
236     }
237 }
238 
239 
240 // Only called during delayed loading. Must be careful to avoid
241 // triggering recursive delay loading due to use of accessors of
242 // PdfVariant or PdfObject.
ParseStream()243 void PdfParserObject::ParseStream()
244 {
245 #if defined(PODOFO_EXTRA_CHECKS)
246     PODOFO_ASSERT( DelayedLoadDone() );
247     PODOFO_ASSERT( DelayedStreamLoadInProgress() );
248     PODOFO_ASSERT( !DelayedStreamLoadDone() );
249 #endif
250 
251     pdf_int64         lLen  = -1;
252     int          c;
253 
254     if( !m_device.Device() || !m_pOwner )
255     {
256         PODOFO_RAISE_ERROR( ePdfError_InvalidHandle );
257     }
258 
259     m_device.Device()->Seek( m_lStreamOffset );
260 
261     do
262     {
263         // Skip spaces between the stream keyword and the carriage return/line feed or line feed
264         // Actually, this is not required by PDF Reference, but certain PDFs have additionals whitespaces
265         c = m_device.Device()->Look();
266         if ( c == ' ' )
267             c = m_device.Device()->GetChar();
268     } while ( c == ' ' );
269 
270     // From the PDF Reference manual
271     // The keyword stream that follows
272     // the stream dictionary should be followed by an end-of-line marker consisting of
273     // either a carriage return and a line feed or just a line feed, and not by a carriage re-
274     // turn alone.
275     if( PdfTokenizer::IsWhitespace( c ) )
276     {
277         c = m_device.Device()->GetChar();
278 
279         if( c == '\r' )
280         {
281             c = m_device.Device()->Look();
282             if( c == '\n' )
283             {
284                 c = m_device.Device()->GetChar();
285             }
286         }
287     }
288 
289     pdf_long fLoc = m_device.Device()->Tell();	// we need to save this, since loading the Length key could disturb it!
290 
291     PdfObject* pObj = this->GetDictionary_NoDL().GetKey( PdfName::KeyLength );
292     if( pObj && pObj->IsNumber() )
293     {
294         lLen = pObj->GetNumber();
295     }
296     else if( pObj && pObj->IsReference() )
297     {
298         pObj = m_pOwner->GetObject( pObj->GetReference() );
299         if( !pObj )
300         {
301             PODOFO_RAISE_ERROR_INFO( ePdfError_InvalidHandle, "/Length key referenced indirect object that could not be loaded" );
302         }
303 
304         /*PdfError::LogMessage(eLogSeverity_Information,
305                              "Reading object %i 0 R with type: %s\n",
306                              pObj->Reference().ObjectNumber(), pObj->GetDataTypeString());*/
307 
308         if( !pObj->IsNumber() )
309         {
310             PODOFO_RAISE_ERROR_INFO( ePdfError_InvalidStreamLength, "/Length key for stream referenced non-number" );
311         }
312 
313         lLen = pObj->GetNumber();
314 
315         // DS: This code makes no sense,
316         //     as empty streams with length 0 are valid, too.
317         //if( !lLen )
318         //{
319         //    PODOFO_RAISE_ERROR( ePdfError_InvalidStreamLength );
320         //}
321 
322         // we do not use indirect references for the length of the document
323         // DS: Even though do not remove the length key,
324         //     as 2 or more object might use the same object for key lengths.
325         //     Deleting the length object of the first object will make
326         //     all other objects non readable.
327         //     If you want those length object to be removed,
328         //     run the garbage collection of PdfVecObjects over your PDF.
329         //delete m_pOwner->RemoveObject( pObj->Reference() );
330     }
331     else
332     {
333         PODOFO_RAISE_ERROR( ePdfError_InvalidStreamLength );
334     }
335 
336     m_device.Device()->Seek( fLoc );	// reset it before reading!
337     PdfDeviceInputStream reader( m_device.Device() );
338 
339 	if( m_pEncrypt && !m_pEncrypt->IsMetadataEncrypted() ) {
340 		// If metadata is not encrypted the Filter is set to "Crypt"
341 		PdfObject* pFilterObj = this->GetDictionary_NoDL().GetKey( PdfName::KeyFilter );
342         if( pFilterObj && pFilterObj->IsReference() )
343             pFilterObj = m_pOwner->GetObject( pFilterObj->GetReference() );
344 		if( pFilterObj && pFilterObj->IsArray() ) {
345 			PdfArray filters = pFilterObj->GetArray();
346 			for(PdfArray::iterator it = filters.begin(); it != filters.end(); it++) {
347                 PdfObject *filter = &*it;
348                 if( filter->IsReference() )
349                     filter = m_pOwner->GetObject( filter->GetReference() );
350                 if( filter && filter->IsName() )
351                     if( filter->GetName() == "Crypt" )
352 						m_pEncrypt = 0;
353 			}
354 		}
355 	}
356     if( m_pEncrypt )
357     {
358         m_pEncrypt->SetCurrentReference( m_reference );
359         PdfInputStream* pInput = m_pEncrypt->CreateEncryptionInputStream( &reader );
360         this->GetStream_NoDL()->SetRawData( pInput, static_cast<pdf_long>(lLen) );
361         delete pInput;
362     }
363     else
364         this->GetStream_NoDL()->SetRawData( &reader, static_cast<pdf_long>(lLen) );
365 
366     this->SetDirty( false );
367     /*
368     SAFE_OP( GetNextStringFromFile( ) );
369     if( strncmp( m_buffer.Buffer(), "endstream", s_nLenEndStream ) != 0 )
370         return ERROR_PDF_MISSING_ENDSTREAM;
371     */
372 }
373 
374 
DelayedLoadImpl()375 void PdfParserObject::DelayedLoadImpl()
376 {
377 #if defined(PODOFO_EXTRA_CHECKS)
378     // DelayedLoadImpl() should only ever be called via DelayedLoad(),
379     // which ensures that it is never called repeatedly.
380     PODOFO_ASSERT( !DelayedLoadDone() );
381     PODOFO_ASSERT( DelayedLoadInProgress() );
382 #endif
383 
384     ParseFileComplete( m_bIsTrailer );
385 
386     // If we complete without throwing DelayedLoadDone will be set
387     // for us.
388 }
389 
DelayedStreamLoadImpl()390 void PdfParserObject::DelayedStreamLoadImpl()
391 {
392 #if defined(PODOFO_EXTRA_CHECKS)
393     // DelayedLoad() must've been called, either directly earlier
394     // or via DelayedStreamLoad. DelayedLoad() will throw if the load
395     // failed, so if we're being called this condition must be true.
396     PODOFO_ASSERT( DelayedLoadDone() );
397 
398     // Similarly, we should not be being called unless the stream isn't
399     // already loaded.
400     PODOFO_ASSERT( !DelayedStreamLoadDone() );
401     PODOFO_ASSERT( DelayedStreamLoadInProgress() );
402 #endif
403 
404     // Note: we can't use HasStream() here because it'll call DelayedStreamLoad()
405     // causing a nasty loop. test m_pStream directly instead.
406     if( this->HasStreamToParse() && !m_pStream )
407     {
408         try {
409             this->ParseStream();
410         } catch( PdfError & e ) {
411             // TODO: track object ptr in error info so we don't have to do this memory-intensive
412             // formatting here.
413             std::ostringstream s;
414             s << "Unable to parse the stream for object " << Reference().ObjectNumber() << ' '
415               << Reference().GenerationNumber() << " obj .";
416             e.AddToCallstack( __FILE__, __LINE__, s.str().c_str());
417             throw e;
418         }
419     }
420 
421     // If we complete without throwing the stream will be flagged as loaded.
422 }
423 
FreeObjectMemory(bool bForce)424 void PdfParserObject::FreeObjectMemory( bool bForce )
425 {
426     if( this->IsLoadOnDemand() && (bForce || !this->IsDirty()) )
427     {
428         PdfVariant::Clear();
429 
430         delete m_pStream;
431         m_pStream = NULL;
432 
433         EnableDelayedLoading();
434         EnableDelayedStreamLoading();
435     }
436 }
437 
438 };
439