1 /***************************************************************************
2 * Copyright (C) 2005 by Dominik Seichter *
3 * domseichter@web.de *
4 * *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU Library General Public License as *
7 * published by the Free Software Foundation; either version 2 of the *
8 * License, or (at your option) any later version. *
9 * *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU General Public License for more details. *
14 * *
15 * You should have received a copy of the GNU Library General Public *
16 * License along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
19 * *
20 * In addition, as a special exception, the copyright holders give *
21 * permission to link the code of portions of this program with the *
22 * OpenSSL library under certain conditions as described in each *
23 * individual source file, and distribute linked combinations *
24 * including the two. *
25 * You must obey the GNU General Public License in all respects *
26 * for all of the code used other than OpenSSL. If you modify *
27 * file(s) with this exception, you may extend this exception to your *
28 * version of the file(s), but you are not obligated to do so. If you *
29 * do not wish to do so, delete this exception statement from your *
30 * version. If you delete this exception statement from all source *
31 * files in the program, then also delete it here. *
32 ***************************************************************************/
33
34 #include "PdfParserObject.h"
35
36 #include "PdfArray.h"
37 #include "PdfDictionary.h"
38 #include "PdfEncrypt.h"
39 #include "PdfInputDevice.h"
40 #include "PdfInputStream.h"
41 #include "PdfParser.h"
42 #include "PdfStream.h"
43 #include "PdfVariant.h"
44 #include "PdfDefinesPrivate.h"
45
46 #include <iostream>
47 #include <sstream>
48
49 namespace PoDoFo {
50
51 using namespace std;
52
// Lengths of the PDF keywords that tokens are compared against with strncmp().
// Each length is taken from the literal itself; sizeof counts the terminating
// NUL, hence the "- 1".
static const int s_nLenEndObj = static_cast<int>( sizeof( "endobj" ) - 1 );
static const int s_nLenStream = static_cast<int>( sizeof( "stream" ) - 1 );
//static const int s_nLenEndStream = 9; // strlen("endstream");
56
PdfParserObject(PdfVecObjects * pCreator,const PdfRefCountedInputDevice & rDevice,const PdfRefCountedBuffer & rBuffer,pdf_long lOffset)57 PdfParserObject::PdfParserObject( PdfVecObjects* pCreator, const PdfRefCountedInputDevice & rDevice,
58 const PdfRefCountedBuffer & rBuffer, pdf_long lOffset )
59 : PdfObject( PdfVariant::NullValue ), PdfTokenizer( rDevice, rBuffer ), m_pEncrypt( NULL )
60 {
61 m_pOwner = pCreator;
62
63 InitPdfParserObject();
64
65 m_lOffset = lOffset == -1 ? m_device.Device()->Tell() : lOffset;
66 }
67
PdfParserObject(const PdfRefCountedBuffer & rBuffer)68 PdfParserObject::PdfParserObject( const PdfRefCountedBuffer & rBuffer )
69 : PdfObject( PdfVariant::NullValue ), PdfTokenizer( PdfRefCountedInputDevice(), rBuffer ),
70 m_pEncrypt( NULL )
71 {
72 InitPdfParserObject();
73 }
74
~PdfParserObject()75 PdfParserObject::~PdfParserObject()
76 {
77
78 }
79
InitPdfParserObject()80 void PdfParserObject::InitPdfParserObject()
81 {
82 m_bIsTrailer = false;
83
84 // Whether or not demand loading is disabled we still don't load
85 // anything in the ctor. This just controls whether ::ParseFile(...)
86 // forces an immediate demand load, or lets it genuinely happen
87 // on demand.
88 m_bLoadOnDemand = false;
89
90 // We rely heavily on the demand loading infrastructure whether or not
91 // we *actually* delay loading.
92 EnableDelayedLoading();
93 EnableDelayedStreamLoading();
94
95 m_lOffset = -1;
96
97 m_bStream = false;
98 m_lStreamOffset = 0;
99 }
100
ReadObjectNumber()101 void PdfParserObject::ReadObjectNumber()
102 {
103 try {
104 pdf_long obj = this->GetNextNumber();
105 pdf_long gen = this->GetNextNumber();
106
107 m_reference = PdfReference( static_cast<unsigned int>(obj), static_cast<pdf_uint16>(gen) );
108 } catch( PdfError & e ) {
109 e.AddToCallstack( __FILE__, __LINE__, "Object and generation number cannot be read." );
110 throw e;
111 }
112
113 if( !this->IsNextToken( "obj" ))
114 {
115 std::ostringstream oss;
116 oss << "Error while reading object " << m_reference.ObjectNumber() << " "
117 << m_reference.GenerationNumber() << ": Next token is not 'obj'." << std::endl;
118 PODOFO_RAISE_ERROR_INFO( ePdfError_NoObject, oss.str().c_str() );
119 }
120 }
121
ParseFile(PdfEncrypt * pEncrypt,bool bIsTrailer)122 void PdfParserObject::ParseFile( PdfEncrypt* pEncrypt, bool bIsTrailer )
123 {
124 if( !m_device.Device() )
125 {
126 PODOFO_RAISE_ERROR( ePdfError_InvalidHandle );
127 }
128
129 if( m_lOffset > -1 )
130 m_device.Device()->Seek( m_lOffset );
131
132 if( !bIsTrailer )
133 ReadObjectNumber();
134
135 #if defined(PODOFO_VERBOSE_DEBUG)
136 std::cerr << "Parsing object number: " << m_reference.ObjectNumber()
137 << " " << m_reference.GenerationNumber() << " obj"
138 << " " << m_lOffset << " offset"
139 << " (DL: " << ( m_bLoadOnDemand ? "on" : "off" ) << ")"
140 << endl;
141 #endif // PODOFO_VERBOSE_DEBUG
142
143 m_lOffset = m_device.Device()->Tell();
144 m_pEncrypt = pEncrypt;
145 m_bIsTrailer = bIsTrailer;
146
147 if( !m_bLoadOnDemand )
148 {
149 // Force immediate loading of the object. We need to do this through
150 // the deferred loading machinery to avoid getting the object into an
151 // inconsistent state.
152 // We can't do a full DelayedStreamLoad() because the stream might use
153 // an indirect /Length or /Length1 key that hasn't been read yet.
154 DelayedLoad();
155
156 // TODO: support immediate loading of the stream here too. For that, we need
157 // to be able to trigger the reading of not-yet-parsed indirect objects
158 // such as might appear in a /Length key with an indirect reference.
159
160 #if defined(PODOFO_EXTRA_CHECKS)
161 // Sanity check - the variant base must be fully loaded now
162 if (!DelayedLoadDone() )
163 {
164 // We don't know what went wrong, but the internal state is
165 // broken or the API rules aren't being followed and we
166 // can't carry on.
167 PODOFO_RAISE_ERROR( ePdfError_InternalLogic );
168 }
169 #endif // PODOF_EXTRA_CHECKS
170 }
171 }
172
173 // Only called via the demand loading mechanism
174 // Be very careful to avoid recursive demand loads via PdfVariant
175 // or PdfObject method calls here.
ParseFileComplete(bool bIsTrailer)176 void PdfParserObject::ParseFileComplete( bool bIsTrailer )
177 {
178 #if defined(PODOFO_EXTRA_CHECKS)
179 PODOFO_ASSERT( DelayedLoadInProgress() );
180 PODOFO_ASSERT( !DelayedLoadDone() );
181 #endif
182 const char* pszToken;
183
184 m_device.Device()->Seek( m_lOffset );
185 if( m_pEncrypt )
186 m_pEncrypt->SetCurrentReference( m_reference );
187
188 // Do not call GetNextVariant directly,
189 // but GetNextToken, to handle empty objects like:
190 // 13 0 obj
191 // endobj
192
193 EPdfTokenType eTokenType;
194 bool gotToken = this->GetNextToken( pszToken, &eTokenType );
195
196 if (!gotToken)
197 {
198 PODOFO_RAISE_ERROR_INFO( ePdfError_UnexpectedEOF, "Expected variant." );
199 }
200
201 // Check if we have an empty object or data
202 if( strncmp( pszToken, "endobj", s_nLenEndObj ) != 0 )
203 {
204 this->GetNextVariant( pszToken, eTokenType, *this, m_pEncrypt );
205 this->SetDirty( false );
206
207 if( !bIsTrailer )
208 {
209 bool gotToken = this->GetNextToken( pszToken );
210 if (!gotToken)
211 {
212 PODOFO_RAISE_ERROR_INFO( ePdfError_UnexpectedEOF, "Expected 'endobj' or (if dict) 'stream', got EOF." );
213 }
214 if( strncmp( pszToken, "endobj", s_nLenEndObj ) == 0 )
215 ; // nothing to do, just validate that the PDF is correct
216 // If it's a dictionary, it might have a stream, so check for that
217 else if( this->IsDictionary() && strncmp( pszToken, "stream", s_nLenStream ) == 0 )
218 {
219 m_bStream = true;
220 m_lStreamOffset = m_device.Device()->Tell(); // NOTE: whitespace after "stream" handle in stream parser!
221
222 // Most of the code relies on PdfObjects that are dictionaries
223 // to have the datatype ePdfDataType_Dictionary and not Stream.
224 // Please use PdfObject::HasStream to check wether it has a stream.
225 //
226 // Commenting this out is right now easier than fixing all code to check
227 // either for ePdfDataType_Stream or ePdfDataType_Dictionary
228 //
229 //eDataType = ePdfDataType_Stream; // reset the object type to stream!
230 }
231 else
232 {
233 PODOFO_RAISE_ERROR_INFO( ePdfError_NoObject, pszToken );
234 }
235 }
236 }
237 }
238
239
240 // Only called during delayed loading. Must be careful to avoid
241 // triggering recursive delay loading due to use of accessors of
242 // PdfVariant or PdfObject.
ParseStream()243 void PdfParserObject::ParseStream()
244 {
245 #if defined(PODOFO_EXTRA_CHECKS)
246 PODOFO_ASSERT( DelayedLoadDone() );
247 PODOFO_ASSERT( DelayedStreamLoadInProgress() );
248 PODOFO_ASSERT( !DelayedStreamLoadDone() );
249 #endif
250
251 pdf_int64 lLen = -1;
252 int c;
253
254 if( !m_device.Device() || !m_pOwner )
255 {
256 PODOFO_RAISE_ERROR( ePdfError_InvalidHandle );
257 }
258
259 m_device.Device()->Seek( m_lStreamOffset );
260
261 do
262 {
263 // Skip spaces between the stream keyword and the carriage return/line feed or line feed
264 // Actually, this is not required by PDF Reference, but certain PDFs have additionals whitespaces
265 c = m_device.Device()->Look();
266 if ( c == ' ' )
267 c = m_device.Device()->GetChar();
268 } while ( c == ' ' );
269
270 // From the PDF Reference manual
271 // The keyword stream that follows
272 // the stream dictionary should be followed by an end-of-line marker consisting of
273 // either a carriage return and a line feed or just a line feed, and not by a carriage re-
274 // turn alone.
275 if( PdfTokenizer::IsWhitespace( c ) )
276 {
277 c = m_device.Device()->GetChar();
278
279 if( c == '\r' )
280 {
281 c = m_device.Device()->Look();
282 if( c == '\n' )
283 {
284 c = m_device.Device()->GetChar();
285 }
286 }
287 }
288
289 pdf_long fLoc = m_device.Device()->Tell(); // we need to save this, since loading the Length key could disturb it!
290
291 PdfObject* pObj = this->GetDictionary_NoDL().GetKey( PdfName::KeyLength );
292 if( pObj && pObj->IsNumber() )
293 {
294 lLen = pObj->GetNumber();
295 }
296 else if( pObj && pObj->IsReference() )
297 {
298 pObj = m_pOwner->GetObject( pObj->GetReference() );
299 if( !pObj )
300 {
301 PODOFO_RAISE_ERROR_INFO( ePdfError_InvalidHandle, "/Length key referenced indirect object that could not be loaded" );
302 }
303
304 /*PdfError::LogMessage(eLogSeverity_Information,
305 "Reading object %i 0 R with type: %s\n",
306 pObj->Reference().ObjectNumber(), pObj->GetDataTypeString());*/
307
308 if( !pObj->IsNumber() )
309 {
310 PODOFO_RAISE_ERROR_INFO( ePdfError_InvalidStreamLength, "/Length key for stream referenced non-number" );
311 }
312
313 lLen = pObj->GetNumber();
314
315 // DS: This code makes no sense,
316 // as empty streams with length 0 are valid, too.
317 //if( !lLen )
318 //{
319 // PODOFO_RAISE_ERROR( ePdfError_InvalidStreamLength );
320 //}
321
322 // we do not use indirect references for the length of the document
323 // DS: Even though do not remove the length key,
324 // as 2 or more object might use the same object for key lengths.
325 // Deleting the length object of the first object will make
326 // all other objects non readable.
327 // If you want those length object to be removed,
328 // run the garbage collection of PdfVecObjects over your PDF.
329 //delete m_pOwner->RemoveObject( pObj->Reference() );
330 }
331 else
332 {
333 PODOFO_RAISE_ERROR( ePdfError_InvalidStreamLength );
334 }
335
336 m_device.Device()->Seek( fLoc ); // reset it before reading!
337 PdfDeviceInputStream reader( m_device.Device() );
338
339 if( m_pEncrypt && !m_pEncrypt->IsMetadataEncrypted() ) {
340 // If metadata is not encrypted the Filter is set to "Crypt"
341 PdfObject* pFilterObj = this->GetDictionary_NoDL().GetKey( PdfName::KeyFilter );
342 if( pFilterObj && pFilterObj->IsReference() )
343 pFilterObj = m_pOwner->GetObject( pFilterObj->GetReference() );
344 if( pFilterObj && pFilterObj->IsArray() ) {
345 PdfArray filters = pFilterObj->GetArray();
346 for(PdfArray::iterator it = filters.begin(); it != filters.end(); it++) {
347 PdfObject *filter = &*it;
348 if( filter->IsReference() )
349 filter = m_pOwner->GetObject( filter->GetReference() );
350 if( filter && filter->IsName() )
351 if( filter->GetName() == "Crypt" )
352 m_pEncrypt = 0;
353 }
354 }
355 }
356 if( m_pEncrypt )
357 {
358 m_pEncrypt->SetCurrentReference( m_reference );
359 PdfInputStream* pInput = m_pEncrypt->CreateEncryptionInputStream( &reader );
360 this->GetStream_NoDL()->SetRawData( pInput, static_cast<pdf_long>(lLen) );
361 delete pInput;
362 }
363 else
364 this->GetStream_NoDL()->SetRawData( &reader, static_cast<pdf_long>(lLen) );
365
366 this->SetDirty( false );
367 /*
368 SAFE_OP( GetNextStringFromFile( ) );
369 if( strncmp( m_buffer.Buffer(), "endstream", s_nLenEndStream ) != 0 )
370 return ERROR_PDF_MISSING_ENDSTREAM;
371 */
372 }
373
374
DelayedLoadImpl()375 void PdfParserObject::DelayedLoadImpl()
376 {
377 #if defined(PODOFO_EXTRA_CHECKS)
378 // DelayedLoadImpl() should only ever be called via DelayedLoad(),
379 // which ensures that it is never called repeatedly.
380 PODOFO_ASSERT( !DelayedLoadDone() );
381 PODOFO_ASSERT( DelayedLoadInProgress() );
382 #endif
383
384 ParseFileComplete( m_bIsTrailer );
385
386 // If we complete without throwing DelayedLoadDone will be set
387 // for us.
388 }
389
DelayedStreamLoadImpl()390 void PdfParserObject::DelayedStreamLoadImpl()
391 {
392 #if defined(PODOFO_EXTRA_CHECKS)
393 // DelayedLoad() must've been called, either directly earlier
394 // or via DelayedStreamLoad. DelayedLoad() will throw if the load
395 // failed, so if we're being called this condition must be true.
396 PODOFO_ASSERT( DelayedLoadDone() );
397
398 // Similarly, we should not be being called unless the stream isn't
399 // already loaded.
400 PODOFO_ASSERT( !DelayedStreamLoadDone() );
401 PODOFO_ASSERT( DelayedStreamLoadInProgress() );
402 #endif
403
404 // Note: we can't use HasStream() here because it'll call DelayedStreamLoad()
405 // causing a nasty loop. test m_pStream directly instead.
406 if( this->HasStreamToParse() && !m_pStream )
407 {
408 try {
409 this->ParseStream();
410 } catch( PdfError & e ) {
411 // TODO: track object ptr in error info so we don't have to do this memory-intensive
412 // formatting here.
413 std::ostringstream s;
414 s << "Unable to parse the stream for object " << Reference().ObjectNumber() << ' '
415 << Reference().GenerationNumber() << " obj .";
416 e.AddToCallstack( __FILE__, __LINE__, s.str().c_str());
417 throw e;
418 }
419 }
420
421 // If we complete without throwing the stream will be flagged as loaded.
422 }
423
FreeObjectMemory(bool bForce)424 void PdfParserObject::FreeObjectMemory( bool bForce )
425 {
426 if( this->IsLoadOnDemand() && (bForce || !this->IsDirty()) )
427 {
428 PdfVariant::Clear();
429
430 delete m_pStream;
431 m_pStream = NULL;
432
433 EnableDelayedLoading();
434 EnableDelayedStreamLoading();
435 }
436 }
437
438 };
439