1 /***************************************************************************
2 * Copyright (C) 2007 by Dominik Seichter *
3 * domseichter@web.de *
4 * *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU Library General Public License as *
7 * published by the Free Software Foundation; either version 2 of the *
8 * License, or (at your option) any later version. *
9 * *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU General Public License for more details. *
14 * *
15 * You should have received a copy of the GNU Library General Public *
16 * License along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
19 * *
20 * In addition, as a special exception, the copyright holders give *
21 * permission to link the code of portions of this program with the *
22 * OpenSSL library under certain conditions as described in each *
23 * individual source file, and distribute linked combinations *
24 * including the two. *
25 * You must obey the GNU General Public License in all respects *
26 * for all of the code used other than OpenSSL. If you modify *
27 * file(s) with this exception, you may extend this exception to your *
28 * version of the file(s), but you are not obligated to do so. If you *
29 * do not wish to do so, delete this exception statement from your *
30 * version. If you delete this exception statement from all source *
31 * files in the program, then also delete it here. *
32 ***************************************************************************/
33
34 #include "PdfContentsTokenizer.h"
35
36 #include "PdfCanvas.h"
37 #include "PdfInputDevice.h"
38 #include "PdfOutputStream.h"
39 #include "PdfStream.h"
40 #include "PdfVecObjects.h"
41 #include "PdfData.h"
42 #include "PdfDefinesPrivate.h"
43
44 #include <iostream>
45
46 namespace PoDoFo {
47
PdfContentsTokenizer(PdfCanvas * pCanvas)48 PdfContentsTokenizer::PdfContentsTokenizer( PdfCanvas* pCanvas )
49 : PdfTokenizer(), m_readingInlineImgData(false)
50 {
51 if( !pCanvas )
52 {
53 PODOFO_RAISE_ERROR( ePdfError_InvalidHandle );
54 }
55
56 PdfObject* pContents = pCanvas->GetContents();
57 if( pContents && pContents->IsArray() )
58 {
59 PdfArray& a = pContents->GetArray();
60 for ( PdfArray::iterator it = a.begin(); it != a.end() ; ++it )
61 {
62 if ( !(*it).IsReference() )
63 {
64 PODOFO_RAISE_ERROR_INFO( ePdfError_InvalidDataType, "/Contents array contained non-references" );
65
66 }
67
68 if ( !pContents->GetOwner()->GetObject( (*it).GetReference() ) )
69 {
70 // some damaged PDFs may have dangling references
71 PODOFO_RAISE_ERROR_INFO( ePdfError_InvalidDataType, "/Contents array NULL reference" );
72 }
73
74 m_lstContents.push_back( pContents->GetOwner()->GetObject( (*it).GetReference() ) );
75 }
76 }
77 else if ( pContents && pContents->HasStream() )
78 {
79 m_lstContents.push_back( pContents );
80 }
81 else if ( pContents && pContents->IsDictionary() )
82 {
83 m_lstContents.push_back( pContents );
84 PdfError::LogMessage(eLogSeverity_Information,
85 "PdfContentsTokenizer: found canvas-dictionary without stream => empty page");
86 // OC 18.09.2010 BugFix: Found an empty page in a PDF document:
87 // 103 0 obj
88 // <<
89 // /Type /Page
90 // /MediaBox [ 0 0 595 842 ]
91 // /Parent 3 0 R
92 // /Resources <<
93 // /ProcSet [ /PDF ]
94 // >>
95 // /Rotate 0
96 // >>
97 // endobj
98 }
99 else
100 {
101 PODOFO_RAISE_ERROR_INFO( ePdfError_InvalidDataType, "Page /Contents not stream or array of streams" );
102 }
103
104 if( m_lstContents.size() )
105 {
106 SetCurrentContentsStream( m_lstContents.front() );
107 m_lstContents.pop_front();
108 }
109 }
110
SetCurrentContentsStream(const PdfObject * pObject)111 void PdfContentsTokenizer::SetCurrentContentsStream( const PdfObject* pObject )
112 {
113 PODOFO_RAISE_LOGIC_IF( pObject == NULL, "Content stream object == NULL!" );
114
115 const PdfStream* pStream = pObject->GetStream();
116
117 PdfRefCountedBuffer buffer(0);
118 PdfBufferOutputStream stream( &buffer );
119 if( pStream )
120 pStream->GetFilteredCopy( &stream );
121
122 m_device = PdfRefCountedInputDevice( buffer.GetBuffer(), buffer.GetSize() );
123 }
124
GetNextToken(const char * & pszToken,EPdfTokenType * peType)125 bool PdfContentsTokenizer::GetNextToken( const char*& pszToken , EPdfTokenType* peType )
126 {
127 bool result = PdfTokenizer::GetNextToken(pszToken, peType);
128 while (!result) {
129 if( !m_lstContents.size() )
130 return false;
131
132 SetCurrentContentsStream( m_lstContents.front() );
133 m_lstContents.pop_front();
134 result = PdfTokenizer::GetNextToken(pszToken, peType);
135 }
136 return result;
137 }
138
139
ReadNext(EPdfContentsType & reType,const char * & rpszKeyword,PdfVariant & rVariant)140 bool PdfContentsTokenizer::ReadNext( EPdfContentsType& reType, const char*& rpszKeyword, PdfVariant & rVariant )
141 {
142 if (m_readingInlineImgData)
143 return ReadInlineImgData(reType, rpszKeyword, rVariant);
144 EPdfTokenType eTokenType;
145 EPdfDataType eDataType;
146 const char* pszToken;
147
148 // While officially the keyword pointer is undefined if not needed, it
149 // costs us practically nothing to zero it (in case someone fails to check
150 // the return value and/or reType). Do so. We won't nullify the variant
151 // since that has a real cost.
152 //rpszKeyword = 0;
153
154 // If we've run out of data in this stream and there's another one to read,
155 // switch to reading the next stream.
156 //if( m_device.Device() && m_device.Device()->Eof() && m_lstContents.size() )
157 //{
158 // SetCurrentContentsStream( m_lstContents.front() );
159 // m_lstContents.pop_front();
160 //}
161
162 bool gotToken = this->GetNextToken( pszToken, &eTokenType );
163 if ( !gotToken )
164 {
165 if ( m_lstContents.size() )
166 {
167 // We ran out of tokens in this stream. Switch to the next stream
168 // and try again.
169 SetCurrentContentsStream( m_lstContents.front() );
170 m_lstContents.pop_front();
171 return ReadNext( reType, rpszKeyword, rVariant );
172 }
173 else
174 {
175 // No more content stream tokens to read.
176 return false;
177 }
178 }
179
180 eDataType = this->DetermineDataType( pszToken, eTokenType, rVariant );
181
182 // asume we read a variant unless we discover otherwise later.
183 reType = ePdfContentsType_Variant;
184
185 switch( eDataType )
186 {
187 case ePdfDataType_Null:
188 case ePdfDataType_Bool:
189 case ePdfDataType_Number:
190 case ePdfDataType_Real:
191 // the data was already read into rVariant by the DetermineDataType function
192 break;
193
194 case ePdfDataType_Reference:
195 {
196 // references are invalid in content streams
197 PODOFO_RAISE_ERROR_INFO( ePdfError_InvalidDataType, "references are invalid in content streams" );
198 break;
199 }
200
201 case ePdfDataType_Dictionary:
202 this->ReadDictionary( rVariant, NULL );
203 break;
204 case ePdfDataType_Array:
205 this->ReadArray( rVariant, NULL );
206 break;
207 case ePdfDataType_String:
208 this->ReadString( rVariant, NULL );
209 break;
210 case ePdfDataType_HexString:
211 this->ReadHexString( rVariant, NULL );
212 break;
213 case ePdfDataType_Name:
214 this->ReadName( rVariant );
215 break;
216
217 case ePdfDataType_Unknown:
218 case ePdfDataType_RawData:
219 default:
220 // Assume we have a keyword
221 reType = ePdfContentsType_Keyword;
222 rpszKeyword = pszToken;
223 break;
224 }
225 std::string idKW ("ID");
226 if ((reType == ePdfContentsType_Keyword) && (idKW.compare(rpszKeyword) == 0) )
227 m_readingInlineImgData = true;
228 return true;
229 }
230
ReadInlineImgData(EPdfContentsType & reType,const char * &,PdfVariant & rVariant)231 bool PdfContentsTokenizer::ReadInlineImgData( EPdfContentsType& reType, const char*&, PdfVariant & rVariant )
232 {
233 int c;
234 pdf_int64 counter = 0;
235 if( !m_device.Device() )
236 {
237 PODOFO_RAISE_ERROR( ePdfError_InvalidHandle );
238 }
239
240 // consume the only whitespace between ID and data
241 c = m_device.Device()->Look();
242 if( PdfTokenizer::IsWhitespace( c ) )
243 {
244 c = m_device.Device()->GetChar();
245 }
246
247 while((c = m_device.Device()->Look()) != EOF)
248 {
249 c = m_device.Device()->GetChar();
250 if (c=='E' && m_device.Device()->Look()=='I')
251 {
252 // Consume character
253 m_device.Device()->GetChar();
254 int w = m_device.Device()->Look();
255 if (w==EOF || PdfTokenizer::IsWhitespace(w))
256 {
257 // EI is followed by whitespace => stop
258 m_device.Device()->Seek(-2, std::ios::cur); // put back "EI"
259 m_buffer.GetBuffer()[counter] = '\0';
260 rVariant = PdfData(m_buffer.GetBuffer(), static_cast<size_t>(counter));
261 reType = ePdfContentsType_ImageData;
262 m_readingInlineImgData = false;
263 return true;
264 }
265 else
266 {
267 // no whitespace after EI => do not stop
268 m_device.Device()->Seek(-1, std::ios::cur); // put back "I"
269 m_buffer.GetBuffer()[counter] = c;
270 ++counter;
271 }
272 }
273 else
274 {
275 m_buffer.GetBuffer()[counter] = c;
276 ++counter;
277 }
278
279 if (counter == static_cast<pdf_int64>(m_buffer.GetSize()))
280 {
281 // image is larger than buffer => resize buffer
282 m_buffer.Resize(m_buffer.GetSize()*2);
283 }
284 }
285
286 return false;
287 }
288 };
289