1 /***************************************************************************
2  *   Copyright (C) 2007 by Dominik Seichter                                *
3  *   domseichter@web.de                                                    *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU Library General Public License as       *
7  *   published by the Free Software Foundation; either version 2 of the    *
8  *   License, or (at your option) any later version.                       *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU Library General Public     *
16  *   License along with this program; if not, write to the                 *
17  *   Free Software Foundation, Inc.,                                       *
18  *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
19  *                                                                         *
20  *   In addition, as a special exception, the copyright holders give       *
21  *   permission to link the code of portions of this program with the      *
22  *   OpenSSL library under certain conditions as described in each         *
23  *   individual source file, and distribute linked combinations            *
24  *   including the two.                                                    *
25  *   You must obey the GNU General Public License in all respects          *
26  *   for all of the code used other than OpenSSL.  If you modify           *
27  *   file(s) with this exception, you may extend this exception to your    *
28  *   version of the file(s), but you are not obligated to do so.  If you   *
29  *   do not wish to do so, delete this exception statement from your       *
30  *   version.  If you delete this exception statement from all source      *
31  *   files in the program, then also delete it here.                       *
32  ***************************************************************************/
33 
34 #include "PdfContentsTokenizer.h"
35 
36 #include "PdfCanvas.h"
37 #include "PdfInputDevice.h"
38 #include "PdfOutputStream.h"
39 #include "PdfStream.h"
40 #include "PdfVecObjects.h"
41 #include "PdfData.h"
42 #include "PdfDefinesPrivate.h"
43 
44 #include <iostream>
45 
46 namespace PoDoFo {
47 
PdfContentsTokenizer(PdfCanvas * pCanvas)48 PdfContentsTokenizer::PdfContentsTokenizer( PdfCanvas* pCanvas )
49     : PdfTokenizer(), m_readingInlineImgData(false)
50 {
51     if( !pCanvas )
52     {
53         PODOFO_RAISE_ERROR( ePdfError_InvalidHandle );
54     }
55 
56     PdfObject* pContents = pCanvas->GetContents();
57     if( pContents && pContents->IsArray()  )
58     {
59         PdfArray& a = pContents->GetArray();
60         for ( PdfArray::iterator it = a.begin(); it != a.end() ; ++it )
61         {
62             if ( !(*it).IsReference() )
63             {
64                 PODOFO_RAISE_ERROR_INFO( ePdfError_InvalidDataType, "/Contents array contained non-references" );
65 
66             }
67 
68             if ( !pContents->GetOwner()->GetObject( (*it).GetReference() ) )
69             {
70                 // some damaged PDFs may have dangling references
71                 PODOFO_RAISE_ERROR_INFO( ePdfError_InvalidDataType, "/Contents array NULL reference" );
72             }
73 
74             m_lstContents.push_back( pContents->GetOwner()->GetObject( (*it).GetReference() ) );
75         }
76     }
77     else if ( pContents && pContents->HasStream() )
78     {
79         m_lstContents.push_back( pContents );
80     }
81     else if ( pContents && pContents->IsDictionary() )
82     {
83         m_lstContents.push_back( pContents );
84         PdfError::LogMessage(eLogSeverity_Information,
85                   "PdfContentsTokenizer: found canvas-dictionary without stream => empty page");
86         // OC 18.09.2010 BugFix: Found an empty page in a PDF document:
87         //    103 0 obj
88         //    <<
89         //    /Type /Page
90         //    /MediaBox [ 0 0 595 842 ]
91         //    /Parent 3 0 R
92         //    /Resources <<
93         //    /ProcSet [ /PDF ]
94         //    >>
95         //    /Rotate 0
96         //    >>
97         //    endobj
98     }
99     else
100     {
101         PODOFO_RAISE_ERROR_INFO( ePdfError_InvalidDataType, "Page /Contents not stream or array of streams" );
102     }
103 
104     if( m_lstContents.size() )
105     {
106         SetCurrentContentsStream( m_lstContents.front() );
107         m_lstContents.pop_front();
108     }
109 }
110 
SetCurrentContentsStream(const PdfObject * pObject)111 void PdfContentsTokenizer::SetCurrentContentsStream( const PdfObject* pObject )
112 {
113     PODOFO_RAISE_LOGIC_IF( pObject == NULL, "Content stream object == NULL!" );
114 
115     const PdfStream* pStream = pObject->GetStream();
116 
117     PdfRefCountedBuffer buffer(0);
118     PdfBufferOutputStream stream( &buffer );
119     if( pStream )
120         pStream->GetFilteredCopy( &stream );
121 
122     m_device = PdfRefCountedInputDevice( buffer.GetBuffer(), buffer.GetSize() );
123 }
124 
GetNextToken(const char * & pszToken,EPdfTokenType * peType)125 bool PdfContentsTokenizer::GetNextToken( const char*& pszToken , EPdfTokenType* peType )
126 {
127 	bool result = PdfTokenizer::GetNextToken(pszToken, peType);
128 	while (!result) {
129 		if( !m_lstContents.size() )
130 			return false;
131 
132 		SetCurrentContentsStream( m_lstContents.front() );
133 		m_lstContents.pop_front();
134 		result = PdfTokenizer::GetNextToken(pszToken, peType);
135 	}
136 	return result;
137 }
138 
139 
ReadNext(EPdfContentsType & reType,const char * & rpszKeyword,PdfVariant & rVariant)140 bool PdfContentsTokenizer::ReadNext( EPdfContentsType& reType, const char*& rpszKeyword, PdfVariant & rVariant )
141 {
142     if (m_readingInlineImgData)
143         return ReadInlineImgData(reType, rpszKeyword, rVariant);
144     EPdfTokenType eTokenType;
145     EPdfDataType  eDataType;
146     const char*   pszToken;
147 
148     // While officially the keyword pointer is undefined if not needed, it
149     // costs us practically nothing to zero it (in case someone fails to check
150     // the return value and/or reType). Do so. We won't nullify the variant
151     // since that has a real cost.
152     //rpszKeyword = 0;
153 
154     // If we've run out of data in this stream and there's another one to read,
155     // switch to reading the next stream.
156     //if( m_device.Device() && m_device.Device()->Eof() && m_lstContents.size() )
157     //{
158     //    SetCurrentContentsStream( m_lstContents.front() );
159     //    m_lstContents.pop_front();
160     //}
161 
162     bool gotToken = this->GetNextToken( pszToken, &eTokenType );
163     if ( !gotToken )
164     {
165         if ( m_lstContents.size() )
166         {
167         // We ran out of tokens in this stream. Switch to the next stream
168         // and try again.
169             SetCurrentContentsStream( m_lstContents.front() );
170             m_lstContents.pop_front();
171             return ReadNext( reType, rpszKeyword, rVariant );
172         }
173         else
174         {
175             // No more content stream tokens to read.
176             return false;
177         }
178     }
179 
180     eDataType = this->DetermineDataType( pszToken, eTokenType, rVariant );
181 
182     // asume we read a variant unless we discover otherwise later.
183     reType = ePdfContentsType_Variant;
184 
185     switch( eDataType )
186     {
187         case ePdfDataType_Null:
188         case ePdfDataType_Bool:
189         case ePdfDataType_Number:
190         case ePdfDataType_Real:
191             // the data was already read into rVariant by the DetermineDataType function
192             break;
193 
194         case ePdfDataType_Reference:
195         {
196             // references are invalid in content streams
197             PODOFO_RAISE_ERROR_INFO( ePdfError_InvalidDataType, "references are invalid in content streams" );
198             break;
199         }
200 
201         case ePdfDataType_Dictionary:
202             this->ReadDictionary( rVariant, NULL );
203             break;
204         case ePdfDataType_Array:
205             this->ReadArray( rVariant, NULL );
206             break;
207         case ePdfDataType_String:
208             this->ReadString( rVariant, NULL );
209             break;
210         case ePdfDataType_HexString:
211             this->ReadHexString( rVariant, NULL );
212             break;
213         case ePdfDataType_Name:
214             this->ReadName( rVariant );
215             break;
216 
217         case ePdfDataType_Unknown:
218         case ePdfDataType_RawData:
219         default:
220             // Assume we have a keyword
221             reType     = ePdfContentsType_Keyword;
222             rpszKeyword = pszToken;
223             break;
224     }
225     std::string idKW ("ID");
226     if ((reType == ePdfContentsType_Keyword) && (idKW.compare(rpszKeyword) == 0) )
227         m_readingInlineImgData = true;
228     return true;
229 }
230 
ReadInlineImgData(EPdfContentsType & reType,const char * &,PdfVariant & rVariant)231 bool PdfContentsTokenizer::ReadInlineImgData( EPdfContentsType& reType, const char*&, PdfVariant & rVariant )
232 {
233     int  c;
234     pdf_int64  counter  = 0;
235     if( !m_device.Device() )
236     {
237         PODOFO_RAISE_ERROR( ePdfError_InvalidHandle );
238     }
239 
240     // consume the only whitespace between ID and data
241     c = m_device.Device()->Look();
242     if( PdfTokenizer::IsWhitespace( c ) )
243     {
244         c = m_device.Device()->GetChar();
245     }
246 
247     while((c = m_device.Device()->Look()) != EOF)
248     {
249         c = m_device.Device()->GetChar();
250         if (c=='E' &&  m_device.Device()->Look()=='I')
251         {
252             // Consume character
253             m_device.Device()->GetChar();
254             int w = m_device.Device()->Look();
255             if (w==EOF || PdfTokenizer::IsWhitespace(w))
256             {
257                 // EI is followed by whitespace => stop
258                 m_device.Device()->Seek(-2, std::ios::cur); // put back "EI"
259                 m_buffer.GetBuffer()[counter] = '\0';
260                 rVariant = PdfData(m_buffer.GetBuffer(), static_cast<size_t>(counter));
261                 reType = ePdfContentsType_ImageData;
262                 m_readingInlineImgData = false;
263                 return true;
264             }
265             else
266             {
267                 // no whitespace after EI => do not stop
268                 m_device.Device()->Seek(-1, std::ios::cur); // put back "I"
269                 m_buffer.GetBuffer()[counter] = c;
270                 ++counter;
271             }
272         }
273         else
274         {
275             m_buffer.GetBuffer()[counter] = c;
276             ++counter;
277         }
278 
279         if (counter ==  static_cast<pdf_int64>(m_buffer.GetSize()))
280         {
281             // image is larger than buffer => resize buffer
282             m_buffer.Resize(m_buffer.GetSize()*2);
283         }
284     }
285 
286     return false;
287 }
288 };
289