1 //========================================================================
2 //
3 // Parser.cc
4 //
5 // Copyright 1996-2003 Glyph & Cog, LLC
6 //
7 //========================================================================
8 
9 //========================================================================
10 //
11 // Modified under the Poppler project - http://poppler.freedesktop.org
12 //
13 // All changes made under the Poppler project to this file are licensed
14 // under GPL version 2 or later
15 //
16 // Copyright (C) 2006, 2009, 201, 2010, 2013, 2014, 2017-2020 Albert Astals Cid <aacid@kde.org>
17 // Copyright (C) 2006 Krzysztof Kowalczyk <kkowalczyk@gmail.com>
18 // Copyright (C) 2009 Ilya Gorenbein <igorenbein@finjan.com>
19 // Copyright (C) 2012 Hib Eris <hib@hiberis.nl>
20 // Copyright (C) 2013 Adrian Johnson <ajohnson@redneon.com>
21 // Copyright (C) 2013 Thomas Freitag <Thomas.Freitag@alfa.de>
22 // Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich
23 // Copyright (C) 2018, 2019 Adam Reichold <adam.reichold@t-online.de>
24 // Copyright (C) 2018 Marek Kasik <mkasik@redhat.com>
25 //
26 // To see a description of the changes please see the Changelog file that
27 // came with your tarball or type make ChangeLog if you are building from git
28 //
29 //========================================================================
30 
31 #include <config.h>
32 
33 #include <cstddef>
34 #include "Object.h"
35 #include "Array.h"
36 #include "Dict.h"
37 #include "Decrypt.h"
38 #include "Parser.h"
39 #include "XRef.h"
40 #include "Error.h"
41 
// Max number of nested objects.  This is used to catch infinite loops
// in the object structure, and also technically valid files with so
// many nested arrays that parsing them would consume all the stack.
// Parser::getObj() returns an error object once its recursion argument
// reaches this value.
#define recursionLimit 500
46 
Parser(XRef * xrefA,Stream * streamA,bool allowStreamsA)47 Parser::Parser(XRef *xrefA, Stream *streamA, bool allowStreamsA) : lexer { xrefA, streamA }
48 {
49     allowStreams = allowStreamsA;
50     buf1 = lexer.getObj();
51     buf2 = lexer.getObj();
52     inlineImg = 0;
53 }
54 
Parser(XRef * xrefA,Object * objectA,bool allowStreamsA)55 Parser::Parser(XRef *xrefA, Object *objectA, bool allowStreamsA) : lexer { xrefA, objectA }
56 {
57     allowStreams = allowStreamsA;
58     buf1 = lexer.getObj();
59     buf2 = lexer.getObj();
60     inlineImg = 0;
61 }
62 
// Nothing to release by hand: the compiler-generated destructor is used
// and simply destroys the members.
Parser::~Parser() = default;
64 
getObj(int recursion)65 Object Parser::getObj(int recursion)
66 {
67     return getObj(false, nullptr, cryptRC4, 0, 0, 0, recursion);
68 }
69 
decryptedString(const GooString * s,const unsigned char * fileKey,CryptAlgorithm encAlgorithm,int keyLength,int objNum,int objGen)70 static std::unique_ptr<GooString> decryptedString(const GooString *s, const unsigned char *fileKey, CryptAlgorithm encAlgorithm, int keyLength, int objNum, int objGen)
71 {
72     DecryptStream decrypt(new MemStream(s->c_str(), 0, s->getLength(), Object(objNull)), fileKey, encAlgorithm, keyLength, { objNum, objGen });
73     decrypt.reset();
74     std::unique_ptr<GooString> res = std::make_unique<GooString>();
75     int c;
76     while ((c = decrypt.getChar()) != EOF) {
77         res->append((char)c);
78     }
79     return res;
80 }
81 
getObj(bool simpleOnly,const unsigned char * fileKey,CryptAlgorithm encAlgorithm,int keyLength,int objNum,int objGen,int recursion,bool strict,bool decryptString)82 Object Parser::getObj(bool simpleOnly, const unsigned char *fileKey, CryptAlgorithm encAlgorithm, int keyLength, int objNum, int objGen, int recursion, bool strict, bool decryptString)
83 {
84     Object obj;
85     Stream *str;
86 
87     // refill buffer after inline image data
88     if (inlineImg == 2) {
89         buf1 = lexer.getObj();
90         buf2 = lexer.getObj();
91         inlineImg = 0;
92     }
93 
94     if (unlikely(recursion >= recursionLimit)) {
95         return Object(objError);
96     }
97 
98     // array
99     if (!simpleOnly && buf1.isCmd("[")) {
100         shift();
101         obj = Object(new Array(lexer.getXRef()));
102         while (!buf1.isCmd("]") && !buf1.isEOF() && recursion + 1 < recursionLimit) {
103             Object obj2 = getObj(false, fileKey, encAlgorithm, keyLength, objNum, objGen, recursion + 1);
104             obj.arrayAdd(std::move(obj2));
105         }
106         if (recursion + 1 >= recursionLimit && strict)
107             goto err;
108         if (buf1.isEOF()) {
109             error(errSyntaxError, getPos(), "End of file inside array");
110             if (strict)
111                 goto err;
112         }
113         shift();
114 
115         // dictionary or stream
116     } else if (!simpleOnly && buf1.isCmd("<<")) {
117         shift(objNum);
118         obj = Object(new Dict(lexer.getXRef()));
119         bool hasContentsEntry = false;
120         while (!buf1.isCmd(">>") && !buf1.isEOF()) {
121             if (!buf1.isName()) {
122                 error(errSyntaxError, getPos(), "Dictionary key must be a name object");
123                 if (strict)
124                     goto err;
125                 shift();
126             } else {
127                 // buf1 will go away in shift(), so keep the key
128                 const auto key = std::move(buf1);
129                 shift();
130                 if (buf1.isEOF() || buf1.isError()) {
131                     if (strict && buf1.isError())
132                         goto err;
133                     break;
134                 }
135                 // We don't decrypt strings that are the value of "Contents" key entries. We decrypt them if needed a few lines below.
136                 // The "Contents" field of Sig dictionaries is not encrypted, but we can't know the type of the dictionary here yet
137                 // so we don't decrypt any Contents and if later we find it's not a Sig dictionary we decrypt it
138                 const bool isContents = !hasContentsEntry && key.isName("Contents");
139                 hasContentsEntry = hasContentsEntry || isContents;
140                 Object obj2 = getObj(false, fileKey, encAlgorithm, keyLength, objNum, objGen, recursion + 1, /*strict*/ false, /*decryptString*/ !isContents);
141                 if (unlikely(obj2.isError() && recursion + 1 >= recursionLimit)) {
142                     break;
143                 }
144                 obj.dictAdd(key.getName(), std::move(obj2));
145             }
146         }
147         if (buf1.isEOF()) {
148             error(errSyntaxError, getPos(), "End of file inside dictionary");
149             if (strict)
150                 goto err;
151         }
152         if (fileKey && hasContentsEntry) {
153             Dict *dict = obj.getDict();
154             const bool isSigDict = dict->is("Sig");
155             if (!isSigDict) {
156                 const Object &contentsObj = dict->lookupNF("Contents");
157                 if (contentsObj.isString()) {
158                     std::unique_ptr<GooString> s = decryptedString(contentsObj.getString(), fileKey, encAlgorithm, keyLength, objNum, objGen);
159                     dict->set("Contents", Object(s.release()));
160                 }
161             }
162         }
163         // stream objects are not allowed inside content streams or
164         // object streams
165         if (buf2.isCmd("stream")) {
166             if (allowStreams && (str = makeStream(std::move(obj), fileKey, encAlgorithm, keyLength, objNum, objGen, recursion + 1, strict))) {
167                 return Object(str);
168             } else {
169                 return Object(objError);
170             }
171         } else {
172             shift();
173         }
174 
175         // indirect reference or integer
176     } else if (buf1.isInt()) {
177         const int num = buf1.getInt();
178         shift();
179         if (buf1.isInt() && buf2.isCmd("R")) {
180             const int gen = buf1.getInt();
181             shift();
182             shift();
183 
184             if (unlikely(num <= 0 || gen < 0)) {
185                 return Object();
186             }
187 
188             Ref r;
189             r.num = num;
190             r.gen = gen;
191             return Object(r);
192         } else {
193             return Object(num);
194         }
195 
196         // string
197     } else if (decryptString && buf1.isString() && fileKey) {
198         std::unique_ptr<GooString> s2 = decryptedString(buf1.getString(), fileKey, encAlgorithm, keyLength, objNum, objGen);
199         obj = Object(s2.release());
200         shift();
201 
202         // simple object
203     } else {
204         // avoid re-allocating memory for complex objects like strings by
205         // shallow copy of <buf1> to <obj> and nulling <buf1> so that
206         // subsequent buf1.free() won't free this memory
207         obj = std::move(buf1);
208         shift();
209     }
210 
211     return obj;
212 
213 err:
214     return Object(objError);
215 }
216 
makeStream(Object && dict,const unsigned char * fileKey,CryptAlgorithm encAlgorithm,int keyLength,int objNum,int objGen,int recursion,bool strict)217 Stream *Parser::makeStream(Object &&dict, const unsigned char *fileKey, CryptAlgorithm encAlgorithm, int keyLength, int objNum, int objGen, int recursion, bool strict)
218 {
219     BaseStream *baseStr;
220     Stream *str;
221     Goffset length;
222     Goffset pos, endPos;
223 
224     if (XRef *xref = lexer.getXRef()) {
225         XRefEntry *entry = xref->getEntry(objNum, false);
226         if (entry) {
227             if (!entry->getFlag(XRefEntry::Parsing) || (objNum == 0 && objGen == 0)) {
228                 entry->setFlag(XRefEntry::Parsing, true);
229             } else {
230                 error(errSyntaxError, getPos(), "Object '{0:d} {1:d} obj' is being already parsed", objNum, objGen);
231                 return nullptr;
232             }
233         }
234     }
235 
236     // get stream start position
237     lexer.skipToNextLine();
238     if (!(str = lexer.getStream())) {
239         return nullptr;
240     }
241     pos = str->getPos();
242 
243     // get length
244     Object obj = dict.dictLookup("Length", recursion);
245     if (obj.isInt()) {
246         length = obj.getInt();
247     } else if (obj.isInt64()) {
248         length = obj.getInt64();
249     } else {
250         error(errSyntaxError, getPos(), "Bad 'Length' attribute in stream");
251         if (strict)
252             return nullptr;
253         length = 0;
254     }
255 
256     // check for length in damaged file
257     if (lexer.hasXRef() && lexer.getXRef()->getStreamEnd(pos, &endPos)) {
258         length = endPos - pos;
259     }
260 
261     // in badly damaged PDF files, we can run off the end of the input
262     // stream immediately after the "stream" token
263     if (!lexer.getStream()) {
264         return nullptr;
265     }
266     baseStr = lexer.getStream()->getBaseStream();
267 
268     // skip over stream data
269     if (Lexer::LOOK_VALUE_NOT_CACHED != lexer.lookCharLastValueCached) {
270         // take into account the fact that we've cached one value
271         pos = pos - 1;
272         lexer.lookCharLastValueCached = Lexer::LOOK_VALUE_NOT_CACHED;
273     }
274     if (unlikely(length < 0)) {
275         return nullptr;
276     }
277     if (unlikely(pos > LLONG_MAX - length)) {
278         return nullptr;
279     }
280     lexer.setPos(pos + length);
281 
282     // refill token buffers and check for 'endstream'
283     shift(); // kill '>>'
284     shift("endstream", objNum); // kill 'stream'
285     if (buf1.isCmd("endstream")) {
286         shift();
287     } else {
288         error(errSyntaxError, getPos(), "Missing 'endstream' or incorrect stream length");
289         if (strict)
290             return nullptr;
291         if (lexer.hasXRef() && lexer.getStream()) {
292             // shift until we find the proper endstream or we change to another object or reach eof
293             length = lexer.getPos() - pos;
294             if (buf1.isCmd("endstream")) {
295                 dict.dictSet("Length", Object(length));
296             }
297         } else {
298             // When building the xref we can't use it so use this
299             // kludge for broken PDF files: just add 5k to the length, and
300             // hope its enough
301             if (length < LLONG_MAX - pos - 5000)
302                 length += 5000;
303         }
304     }
305 
306     // make base stream
307     str = baseStr->makeSubStream(pos, true, length, std::move(dict));
308 
309     // handle decryption
310     if (fileKey) {
311         str = new DecryptStream(str, fileKey, encAlgorithm, keyLength, { objNum, objGen });
312     }
313 
314     // get filters
315     str = str->addFilters(str->getDict(), recursion);
316 
317     if (XRef *xref = lexer.getXRef()) {
318         // Don't try to reuse the entry from the block at the start
319         // of the function, xref can change in the middle because of
320         // reconstruction
321         XRefEntry *entry = xref->getEntry(objNum, false);
322         if (entry) {
323             entry->setFlag(XRefEntry::Parsing, false);
324         }
325     }
326 
327     return str;
328 }
329 
shift(int objNum)330 void Parser::shift(int objNum)
331 {
332     if (inlineImg > 0) {
333         if (inlineImg < 2) {
334             ++inlineImg;
335         } else {
336             // in a damaged content stream, if 'ID' shows up in the middle
337             // of a dictionary, we need to reset
338             inlineImg = 0;
339         }
340     } else if (buf2.isCmd("ID")) {
341         lexer.skipChar(); // skip char after 'ID' command
342         inlineImg = 1;
343     }
344     buf1 = std::move(buf2);
345     if (inlineImg > 0) // don't buffer inline image data
346         buf2.setToNull();
347     else {
348         buf2 = lexer.getObj(objNum);
349     }
350 }
351 
shift(const char * cmdA,int objNum)352 void Parser::shift(const char *cmdA, int objNum)
353 {
354     if (inlineImg > 0) {
355         if (inlineImg < 2) {
356             ++inlineImg;
357         } else {
358             // in a damaged content stream, if 'ID' shows up in the middle
359             // of a dictionary, we need to reset
360             inlineImg = 0;
361         }
362     } else if (buf2.isCmd("ID")) {
363         lexer.skipChar(); // skip char after 'ID' command
364         inlineImg = 1;
365     }
366     buf1 = std::move(buf2);
367     if (inlineImg > 0) {
368         buf2.setToNull();
369     } else if (buf1.isCmd(cmdA)) {
370         buf2 = lexer.getObj(objNum);
371     } else {
372         buf2 = lexer.getObj(cmdA, objNum);
373     }
374 }
375