1 //========================================================================
2 //
3 // Parser.cc
4 //
5 // Copyright 1996-2003 Glyph & Cog, LLC
6 //
7 //========================================================================
8
9 //========================================================================
10 //
11 // Modified under the Poppler project - http://poppler.freedesktop.org
12 //
13 // All changes made under the Poppler project to this file are licensed
14 // under GPL version 2 or later
15 //
16 // Copyright (C) 2006, 2009, 201, 2010, 2013, 2014, 2017-2020 Albert Astals Cid <aacid@kde.org>
17 // Copyright (C) 2006 Krzysztof Kowalczyk <kkowalczyk@gmail.com>
18 // Copyright (C) 2009 Ilya Gorenbein <igorenbein@finjan.com>
19 // Copyright (C) 2012 Hib Eris <hib@hiberis.nl>
20 // Copyright (C) 2013 Adrian Johnson <ajohnson@redneon.com>
21 // Copyright (C) 2013 Thomas Freitag <Thomas.Freitag@alfa.de>
22 // Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich
23 // Copyright (C) 2018, 2019 Adam Reichold <adam.reichold@t-online.de>
24 // Copyright (C) 2018 Marek Kasik <mkasik@redhat.com>
25 //
26 // To see a description of the changes please see the Changelog file that
27 // came with your tarball or type make ChangeLog if you are building from git
28 //
29 //========================================================================
30
31 #include <config.h>
32
33 #include <cstddef>
34 #include "Object.h"
35 #include "Array.h"
36 #include "Dict.h"
37 #include "Decrypt.h"
38 #include "Parser.h"
39 #include "XRef.h"
40 #include "Error.h"
41
// Maximum nesting depth of objects accepted by the parser. The limit
// catches infinite loops in the object structure, as well as technically
// valid files whose deeply nested arrays would otherwise exhaust the stack.
// A typed, file-local constant is used instead of a macro so the name obeys
// normal scoping rules.
static constexpr int recursionLimit = 500;
46
Parser(XRef * xrefA,Stream * streamA,bool allowStreamsA)47 Parser::Parser(XRef *xrefA, Stream *streamA, bool allowStreamsA) : lexer { xrefA, streamA }
48 {
49 allowStreams = allowStreamsA;
50 buf1 = lexer.getObj();
51 buf2 = lexer.getObj();
52 inlineImg = 0;
53 }
54
Parser(XRef * xrefA,Object * objectA,bool allowStreamsA)55 Parser::Parser(XRef *xrefA, Object *objectA, bool allowStreamsA) : lexer { xrefA, objectA }
56 {
57 allowStreams = allowStreamsA;
58 buf1 = lexer.getObj();
59 buf2 = lexer.getObj();
60 inlineImg = 0;
61 }
62
63 Parser::~Parser() = default;
64
getObj(int recursion)65 Object Parser::getObj(int recursion)
66 {
67 return getObj(false, nullptr, cryptRC4, 0, 0, 0, recursion);
68 }
69
decryptedString(const GooString * s,const unsigned char * fileKey,CryptAlgorithm encAlgorithm,int keyLength,int objNum,int objGen)70 static std::unique_ptr<GooString> decryptedString(const GooString *s, const unsigned char *fileKey, CryptAlgorithm encAlgorithm, int keyLength, int objNum, int objGen)
71 {
72 DecryptStream decrypt(new MemStream(s->c_str(), 0, s->getLength(), Object(objNull)), fileKey, encAlgorithm, keyLength, { objNum, objGen });
73 decrypt.reset();
74 std::unique_ptr<GooString> res = std::make_unique<GooString>();
75 int c;
76 while ((c = decrypt.getChar()) != EOF) {
77 res->append((char)c);
78 }
79 return res;
80 }
81
getObj(bool simpleOnly,const unsigned char * fileKey,CryptAlgorithm encAlgorithm,int keyLength,int objNum,int objGen,int recursion,bool strict,bool decryptString)82 Object Parser::getObj(bool simpleOnly, const unsigned char *fileKey, CryptAlgorithm encAlgorithm, int keyLength, int objNum, int objGen, int recursion, bool strict, bool decryptString)
83 {
84 Object obj;
85 Stream *str;
86
87 // refill buffer after inline image data
88 if (inlineImg == 2) {
89 buf1 = lexer.getObj();
90 buf2 = lexer.getObj();
91 inlineImg = 0;
92 }
93
94 if (unlikely(recursion >= recursionLimit)) {
95 return Object(objError);
96 }
97
98 // array
99 if (!simpleOnly && buf1.isCmd("[")) {
100 shift();
101 obj = Object(new Array(lexer.getXRef()));
102 while (!buf1.isCmd("]") && !buf1.isEOF() && recursion + 1 < recursionLimit) {
103 Object obj2 = getObj(false, fileKey, encAlgorithm, keyLength, objNum, objGen, recursion + 1);
104 obj.arrayAdd(std::move(obj2));
105 }
106 if (recursion + 1 >= recursionLimit && strict)
107 goto err;
108 if (buf1.isEOF()) {
109 error(errSyntaxError, getPos(), "End of file inside array");
110 if (strict)
111 goto err;
112 }
113 shift();
114
115 // dictionary or stream
116 } else if (!simpleOnly && buf1.isCmd("<<")) {
117 shift(objNum);
118 obj = Object(new Dict(lexer.getXRef()));
119 bool hasContentsEntry = false;
120 while (!buf1.isCmd(">>") && !buf1.isEOF()) {
121 if (!buf1.isName()) {
122 error(errSyntaxError, getPos(), "Dictionary key must be a name object");
123 if (strict)
124 goto err;
125 shift();
126 } else {
127 // buf1 will go away in shift(), so keep the key
128 const auto key = std::move(buf1);
129 shift();
130 if (buf1.isEOF() || buf1.isError()) {
131 if (strict && buf1.isError())
132 goto err;
133 break;
134 }
135 // We don't decrypt strings that are the value of "Contents" key entries. We decrypt them if needed a few lines below.
136 // The "Contents" field of Sig dictionaries is not encrypted, but we can't know the type of the dictionary here yet
137 // so we don't decrypt any Contents and if later we find it's not a Sig dictionary we decrypt it
138 const bool isContents = !hasContentsEntry && key.isName("Contents");
139 hasContentsEntry = hasContentsEntry || isContents;
140 Object obj2 = getObj(false, fileKey, encAlgorithm, keyLength, objNum, objGen, recursion + 1, /*strict*/ false, /*decryptString*/ !isContents);
141 if (unlikely(obj2.isError() && recursion + 1 >= recursionLimit)) {
142 break;
143 }
144 obj.dictAdd(key.getName(), std::move(obj2));
145 }
146 }
147 if (buf1.isEOF()) {
148 error(errSyntaxError, getPos(), "End of file inside dictionary");
149 if (strict)
150 goto err;
151 }
152 if (fileKey && hasContentsEntry) {
153 Dict *dict = obj.getDict();
154 const bool isSigDict = dict->is("Sig");
155 if (!isSigDict) {
156 const Object &contentsObj = dict->lookupNF("Contents");
157 if (contentsObj.isString()) {
158 std::unique_ptr<GooString> s = decryptedString(contentsObj.getString(), fileKey, encAlgorithm, keyLength, objNum, objGen);
159 dict->set("Contents", Object(s.release()));
160 }
161 }
162 }
163 // stream objects are not allowed inside content streams or
164 // object streams
165 if (buf2.isCmd("stream")) {
166 if (allowStreams && (str = makeStream(std::move(obj), fileKey, encAlgorithm, keyLength, objNum, objGen, recursion + 1, strict))) {
167 return Object(str);
168 } else {
169 return Object(objError);
170 }
171 } else {
172 shift();
173 }
174
175 // indirect reference or integer
176 } else if (buf1.isInt()) {
177 const int num = buf1.getInt();
178 shift();
179 if (buf1.isInt() && buf2.isCmd("R")) {
180 const int gen = buf1.getInt();
181 shift();
182 shift();
183
184 if (unlikely(num <= 0 || gen < 0)) {
185 return Object();
186 }
187
188 Ref r;
189 r.num = num;
190 r.gen = gen;
191 return Object(r);
192 } else {
193 return Object(num);
194 }
195
196 // string
197 } else if (decryptString && buf1.isString() && fileKey) {
198 std::unique_ptr<GooString> s2 = decryptedString(buf1.getString(), fileKey, encAlgorithm, keyLength, objNum, objGen);
199 obj = Object(s2.release());
200 shift();
201
202 // simple object
203 } else {
204 // avoid re-allocating memory for complex objects like strings by
205 // shallow copy of <buf1> to <obj> and nulling <buf1> so that
206 // subsequent buf1.free() won't free this memory
207 obj = std::move(buf1);
208 shift();
209 }
210
211 return obj;
212
213 err:
214 return Object(objError);
215 }
216
makeStream(Object && dict,const unsigned char * fileKey,CryptAlgorithm encAlgorithm,int keyLength,int objNum,int objGen,int recursion,bool strict)217 Stream *Parser::makeStream(Object &&dict, const unsigned char *fileKey, CryptAlgorithm encAlgorithm, int keyLength, int objNum, int objGen, int recursion, bool strict)
218 {
219 BaseStream *baseStr;
220 Stream *str;
221 Goffset length;
222 Goffset pos, endPos;
223
224 if (XRef *xref = lexer.getXRef()) {
225 XRefEntry *entry = xref->getEntry(objNum, false);
226 if (entry) {
227 if (!entry->getFlag(XRefEntry::Parsing) || (objNum == 0 && objGen == 0)) {
228 entry->setFlag(XRefEntry::Parsing, true);
229 } else {
230 error(errSyntaxError, getPos(), "Object '{0:d} {1:d} obj' is being already parsed", objNum, objGen);
231 return nullptr;
232 }
233 }
234 }
235
236 // get stream start position
237 lexer.skipToNextLine();
238 if (!(str = lexer.getStream())) {
239 return nullptr;
240 }
241 pos = str->getPos();
242
243 // get length
244 Object obj = dict.dictLookup("Length", recursion);
245 if (obj.isInt()) {
246 length = obj.getInt();
247 } else if (obj.isInt64()) {
248 length = obj.getInt64();
249 } else {
250 error(errSyntaxError, getPos(), "Bad 'Length' attribute in stream");
251 if (strict)
252 return nullptr;
253 length = 0;
254 }
255
256 // check for length in damaged file
257 if (lexer.hasXRef() && lexer.getXRef()->getStreamEnd(pos, &endPos)) {
258 length = endPos - pos;
259 }
260
261 // in badly damaged PDF files, we can run off the end of the input
262 // stream immediately after the "stream" token
263 if (!lexer.getStream()) {
264 return nullptr;
265 }
266 baseStr = lexer.getStream()->getBaseStream();
267
268 // skip over stream data
269 if (Lexer::LOOK_VALUE_NOT_CACHED != lexer.lookCharLastValueCached) {
270 // take into account the fact that we've cached one value
271 pos = pos - 1;
272 lexer.lookCharLastValueCached = Lexer::LOOK_VALUE_NOT_CACHED;
273 }
274 if (unlikely(length < 0)) {
275 return nullptr;
276 }
277 if (unlikely(pos > LLONG_MAX - length)) {
278 return nullptr;
279 }
280 lexer.setPos(pos + length);
281
282 // refill token buffers and check for 'endstream'
283 shift(); // kill '>>'
284 shift("endstream", objNum); // kill 'stream'
285 if (buf1.isCmd("endstream")) {
286 shift();
287 } else {
288 error(errSyntaxError, getPos(), "Missing 'endstream' or incorrect stream length");
289 if (strict)
290 return nullptr;
291 if (lexer.hasXRef() && lexer.getStream()) {
292 // shift until we find the proper endstream or we change to another object or reach eof
293 length = lexer.getPos() - pos;
294 if (buf1.isCmd("endstream")) {
295 dict.dictSet("Length", Object(length));
296 }
297 } else {
298 // When building the xref we can't use it so use this
299 // kludge for broken PDF files: just add 5k to the length, and
300 // hope its enough
301 if (length < LLONG_MAX - pos - 5000)
302 length += 5000;
303 }
304 }
305
306 // make base stream
307 str = baseStr->makeSubStream(pos, true, length, std::move(dict));
308
309 // handle decryption
310 if (fileKey) {
311 str = new DecryptStream(str, fileKey, encAlgorithm, keyLength, { objNum, objGen });
312 }
313
314 // get filters
315 str = str->addFilters(str->getDict(), recursion);
316
317 if (XRef *xref = lexer.getXRef()) {
318 // Don't try to reuse the entry from the block at the start
319 // of the function, xref can change in the middle because of
320 // reconstruction
321 XRefEntry *entry = xref->getEntry(objNum, false);
322 if (entry) {
323 entry->setFlag(XRefEntry::Parsing, false);
324 }
325 }
326
327 return str;
328 }
329
shift(int objNum)330 void Parser::shift(int objNum)
331 {
332 if (inlineImg > 0) {
333 if (inlineImg < 2) {
334 ++inlineImg;
335 } else {
336 // in a damaged content stream, if 'ID' shows up in the middle
337 // of a dictionary, we need to reset
338 inlineImg = 0;
339 }
340 } else if (buf2.isCmd("ID")) {
341 lexer.skipChar(); // skip char after 'ID' command
342 inlineImg = 1;
343 }
344 buf1 = std::move(buf2);
345 if (inlineImg > 0) // don't buffer inline image data
346 buf2.setToNull();
347 else {
348 buf2 = lexer.getObj(objNum);
349 }
350 }
351
shift(const char * cmdA,int objNum)352 void Parser::shift(const char *cmdA, int objNum)
353 {
354 if (inlineImg > 0) {
355 if (inlineImg < 2) {
356 ++inlineImg;
357 } else {
358 // in a damaged content stream, if 'ID' shows up in the middle
359 // of a dictionary, we need to reset
360 inlineImg = 0;
361 }
362 } else if (buf2.isCmd("ID")) {
363 lexer.skipChar(); // skip char after 'ID' command
364 inlineImg = 1;
365 }
366 buf1 = std::move(buf2);
367 if (inlineImg > 0) {
368 buf2.setToNull();
369 } else if (buf1.isCmd(cmdA)) {
370 buf2 = lexer.getObj(objNum);
371 } else {
372 buf2 = lexer.getObj(cmdA, objNum);
373 }
374 }
375