1 // Copyright 2016 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #include "core/fpdfapi/page/cpdf_streamparser.h"
8 
9 #include <algorithm>
10 #include <memory>
11 #include <sstream>
12 #include <utility>
13 
14 #include "constants/stream_dict_common.h"
15 #include "core/fpdfapi/page/cpdf_docpagedata.h"
16 #include "core/fpdfapi/parser/cpdf_array.h"
17 #include "core/fpdfapi/parser/cpdf_boolean.h"
18 #include "core/fpdfapi/parser/cpdf_dictionary.h"
19 #include "core/fpdfapi/parser/cpdf_name.h"
20 #include "core/fpdfapi/parser/cpdf_null.h"
21 #include "core/fpdfapi/parser/cpdf_number.h"
22 #include "core/fpdfapi/parser/cpdf_stream.h"
23 #include "core/fpdfapi/parser/cpdf_string.h"
24 #include "core/fpdfapi/parser/fpdf_parser_decode.h"
25 #include "core/fpdfapi/parser/fpdf_parser_utility.h"
26 #include "core/fxcodec/fx_codec.h"
27 #include "core/fxcodec/jpeg/jpegmodule.h"
28 #include "core/fxcodec/scanlinedecoder.h"
29 #include "core/fxcrt/fx_extension.h"
30 #include "core/fxcrt/fx_memory_wrappers.h"
31 #include "core/fxcrt/fx_safe_types.h"
32 
33 namespace {
34 
// Deepest array/dictionary nesting ReadNextObject() will descend into.
constexpr uint32_t kMaxNestedParsingLevel = 512;
// Longest result, in bytes, that ReadString()/ReadHexString() will return.
constexpr size_t kMaxStringLength = 32767;

// Spellings of the literal words that parse to boolean and null objects.
constexpr char kTrue[] = "true";
constexpr char kFalse[] = "false";
constexpr char kNull[] = "null";
41 
DecodeAllScanlines(std::unique_ptr<ScanlineDecoder> pDecoder)42 uint32_t DecodeAllScanlines(std::unique_ptr<ScanlineDecoder> pDecoder) {
43   if (!pDecoder)
44     return FX_INVALID_OFFSET;
45 
46   int ncomps = pDecoder->CountComps();
47   int bpc = pDecoder->GetBPC();
48   int width = pDecoder->GetWidth();
49   int height = pDecoder->GetHeight();
50   if (width <= 0 || height <= 0)
51     return FX_INVALID_OFFSET;
52 
53   FX_SAFE_UINT32 size = fxcodec::CalculatePitch8(bpc, ncomps, width);
54   size *= height;
55   if (size.ValueOrDefault(0) == 0)
56     return FX_INVALID_OFFSET;
57 
58   for (int row = 0; row < height; ++row) {
59     if (!pDecoder->GetScanline(row))
60       break;
61   }
62   return pDecoder->GetSrcOffset();
63 }
64 
DecodeInlineStream(pdfium::span<const uint8_t> src_span,int width,int height,const ByteString & decoder,const CPDF_Dictionary * pParam,uint32_t orig_size)65 uint32_t DecodeInlineStream(pdfium::span<const uint8_t> src_span,
66                             int width,
67                             int height,
68                             const ByteString& decoder,
69                             const CPDF_Dictionary* pParam,
70                             uint32_t orig_size) {
71   // |decoder| should not be an abbreviation.
72   ASSERT(decoder != "A85");
73   ASSERT(decoder != "AHx");
74   ASSERT(decoder != "CCF");
75   ASSERT(decoder != "DCT");
76   ASSERT(decoder != "Fl");
77   ASSERT(decoder != "LZW");
78   ASSERT(decoder != "RL");
79 
80   std::unique_ptr<uint8_t, FxFreeDeleter> ignored_result;
81   uint32_t ignored_size;
82   if (decoder == "FlateDecode") {
83     return FlateOrLZWDecode(false, src_span, pParam, orig_size, &ignored_result,
84                             &ignored_size);
85   }
86   if (decoder == "LZWDecode") {
87     return FlateOrLZWDecode(true, src_span, pParam, 0, &ignored_result,
88                             &ignored_size);
89   }
90   if (decoder == "DCTDecode") {
91     std::unique_ptr<ScanlineDecoder> pDecoder = JpegModule::CreateDecoder(
92         src_span, width, height, 0,
93         !pParam || pParam->GetIntegerFor("ColorTransform", 1));
94     return DecodeAllScanlines(std::move(pDecoder));
95   }
96   if (decoder == "CCITTFaxDecode") {
97     std::unique_ptr<ScanlineDecoder> pDecoder =
98         CreateFaxDecoder(src_span, width, height, pParam);
99     return DecodeAllScanlines(std::move(pDecoder));
100   }
101 
102   if (decoder == "ASCII85Decode")
103     return A85Decode(src_span, &ignored_result, &ignored_size);
104   if (decoder == "ASCIIHexDecode")
105     return HexDecode(src_span, &ignored_result, &ignored_size);
106   if (decoder == "RunLengthDecode")
107     return RunLengthDecode(src_span, &ignored_result, &ignored_size);
108 
109   return FX_INVALID_OFFSET;
110 }
111 
112 }  // namespace
113 
CPDF_StreamParser(pdfium::span<const uint8_t> span)114 CPDF_StreamParser::CPDF_StreamParser(pdfium::span<const uint8_t> span)
115     : m_pBuf(span) {}
116 
CPDF_StreamParser(pdfium::span<const uint8_t> span,const WeakPtr<ByteStringPool> & pPool)117 CPDF_StreamParser::CPDF_StreamParser(pdfium::span<const uint8_t> span,
118                                      const WeakPtr<ByteStringPool>& pPool)
119     : m_pPool(pPool), m_pBuf(span) {}
120 
121 CPDF_StreamParser::~CPDF_StreamParser() = default;
122 
ReadInlineStream(CPDF_Document * pDoc,RetainPtr<CPDF_Dictionary> pDict,const CPDF_Object * pCSObj)123 RetainPtr<CPDF_Stream> CPDF_StreamParser::ReadInlineStream(
124     CPDF_Document* pDoc,
125     RetainPtr<CPDF_Dictionary> pDict,
126     const CPDF_Object* pCSObj) {
127   if (m_Pos < m_pBuf.size() && PDFCharIsWhitespace(m_pBuf[m_Pos]))
128     m_Pos++;
129 
130   if (m_Pos == m_pBuf.size())
131     return nullptr;
132 
133   ByteString decoder;
134   const CPDF_Dictionary* pParam = nullptr;
135   CPDF_Object* pFilter = pDict->GetDirectObjectFor("Filter");
136   if (pFilter) {
137     const CPDF_Array* pArray = pFilter->AsArray();
138     if (pArray) {
139       decoder = pArray->GetStringAt(0);
140       const CPDF_Array* pParams =
141           pDict->GetArrayFor(pdfium::stream::kDecodeParms);
142       if (pParams)
143         pParam = pParams->GetDictAt(0);
144     } else {
145       decoder = pFilter->GetString();
146       pParam = pDict->GetDictFor(pdfium::stream::kDecodeParms);
147     }
148   }
149   uint32_t width = pDict->GetIntegerFor("Width");
150   uint32_t height = pDict->GetIntegerFor("Height");
151   uint32_t bpc = 1;
152   uint32_t nComponents = 1;
153   if (pCSObj) {
154     RetainPtr<CPDF_ColorSpace> pCS =
155         CPDF_DocPageData::FromDocument(pDoc)->GetColorSpace(pCSObj, nullptr);
156     nComponents = pCS ? pCS->CountComponents() : 3;
157     bpc = pDict->GetIntegerFor("BitsPerComponent");
158   }
159   FX_SAFE_UINT32 size = fxcodec::CalculatePitch8(bpc, nComponents, width);
160   size *= height;
161   if (!size.IsValid())
162     return nullptr;
163 
164   uint32_t dwOrigSize = size.ValueOrDie();
165   std::unique_ptr<uint8_t, FxFreeDeleter> pData;
166   uint32_t dwStreamSize;
167   if (decoder.IsEmpty()) {
168     dwOrigSize = std::min<uint32_t>(dwOrigSize, m_pBuf.size() - m_Pos);
169     pData.reset(FX_AllocUninit(uint8_t, dwOrigSize));
170     auto copy_span = m_pBuf.subspan(m_Pos, dwOrigSize);
171     memcpy(pData.get(), copy_span.data(), copy_span.size());
172     dwStreamSize = dwOrigSize;
173     m_Pos += dwOrigSize;
174   } else {
175     dwStreamSize = DecodeInlineStream(m_pBuf.subspan(m_Pos), width, height,
176                                       decoder, pParam, dwOrigSize);
177     if (!pdfium::base::IsValueInRangeForNumericType<int>(dwStreamSize))
178       return nullptr;
179 
180     uint32_t dwSavePos = m_Pos;
181     m_Pos += dwStreamSize;
182     while (1) {
183       uint32_t dwPrevPos = m_Pos;
184       CPDF_StreamParser::SyntaxType type = ParseNextElement();
185       if (type == CPDF_StreamParser::EndOfData)
186         break;
187 
188       if (type != CPDF_StreamParser::Keyword) {
189         dwStreamSize += m_Pos - dwPrevPos;
190         continue;
191       }
192       if (GetWord() == "EI") {
193         m_Pos = dwPrevPos;
194         break;
195       }
196       dwStreamSize += m_Pos - dwPrevPos;
197     }
198     m_Pos = dwSavePos;
199     pData.reset(FX_AllocUninit(uint8_t, dwStreamSize));
200     auto copy_span = m_pBuf.subspan(m_Pos, dwStreamSize);
201     memcpy(pData.get(), copy_span.data(), copy_span.size());
202     m_Pos += dwStreamSize;
203   }
204   pDict->SetNewFor<CPDF_Number>("Length", static_cast<int>(dwStreamSize));
205   return pdfium::MakeRetain<CPDF_Stream>(std::move(pData), dwStreamSize,
206                                          std::move(pDict));
207 }
208 
ParseNextElement()209 CPDF_StreamParser::SyntaxType CPDF_StreamParser::ParseNextElement() {
210   m_pLastObj.Reset();
211   m_WordSize = 0;
212   if (!PositionIsInBounds())
213     return EndOfData;
214 
215   uint8_t ch = m_pBuf[m_Pos++];
216   while (1) {
217     while (PDFCharIsWhitespace(ch)) {
218       if (!PositionIsInBounds())
219         return EndOfData;
220 
221       ch = m_pBuf[m_Pos++];
222     }
223 
224     if (ch != '%')
225       break;
226 
227     while (1) {
228       if (!PositionIsInBounds())
229         return EndOfData;
230 
231       ch = m_pBuf[m_Pos++];
232       if (PDFCharIsLineEnding(ch))
233         break;
234     }
235   }
236 
237   if (PDFCharIsDelimiter(ch) && ch != '/') {
238     m_Pos--;
239     m_pLastObj = ReadNextObject(false, false, 0);
240     return Others;
241   }
242 
243   bool bIsNumber = true;
244   while (1) {
245     if (m_WordSize < kMaxWordLength)
246       m_WordBuffer[m_WordSize++] = ch;
247 
248     if (!PDFCharIsNumeric(ch))
249       bIsNumber = false;
250 
251     if (!PositionIsInBounds())
252       break;
253 
254     ch = m_pBuf[m_Pos++];
255 
256     if (PDFCharIsDelimiter(ch) || PDFCharIsWhitespace(ch)) {
257       m_Pos--;
258       break;
259     }
260   }
261 
262   m_WordBuffer[m_WordSize] = 0;
263   if (bIsNumber)
264     return Number;
265 
266   if (m_WordBuffer[0] == '/')
267     return Name;
268 
269   if (m_WordSize == 4) {
270     if (WordBufferMatches(kTrue)) {
271       m_pLastObj = pdfium::MakeRetain<CPDF_Boolean>(true);
272       return Others;
273     }
274     if (WordBufferMatches(kNull)) {
275       m_pLastObj = pdfium::MakeRetain<CPDF_Null>();
276       return Others;
277     }
278   } else if (m_WordSize == 5) {
279     if (WordBufferMatches(kFalse)) {
280       m_pLastObj = pdfium::MakeRetain<CPDF_Boolean>(false);
281       return Others;
282     }
283   }
284   return Keyword;
285 }
286 
ReadNextObject(bool bAllowNestedArray,bool bInArray,uint32_t dwRecursionLevel)287 RetainPtr<CPDF_Object> CPDF_StreamParser::ReadNextObject(
288     bool bAllowNestedArray,
289     bool bInArray,
290     uint32_t dwRecursionLevel) {
291   bool bIsNumber;
292   // Must get the next word before returning to avoid infinite loops.
293   GetNextWord(bIsNumber);
294   if (!m_WordSize || dwRecursionLevel > kMaxNestedParsingLevel)
295     return nullptr;
296 
297   if (bIsNumber) {
298     m_WordBuffer[m_WordSize] = 0;
299     return pdfium::MakeRetain<CPDF_Number>(
300         ByteStringView(m_WordBuffer, m_WordSize));
301   }
302 
303   int first_char = m_WordBuffer[0];
304   if (first_char == '/') {
305     ByteString name =
306         PDF_NameDecode(ByteStringView(m_WordBuffer + 1, m_WordSize - 1));
307     return pdfium::MakeRetain<CPDF_Name>(m_pPool, name);
308   }
309 
310   if (first_char == '(') {
311     ByteString str = ReadString();
312     return pdfium::MakeRetain<CPDF_String>(m_pPool, str, false);
313   }
314 
315   if (first_char == '<') {
316     if (m_WordSize == 1)
317       return pdfium::MakeRetain<CPDF_String>(m_pPool, ReadHexString(), true);
318 
319     auto pDict = pdfium::MakeRetain<CPDF_Dictionary>(m_pPool);
320     while (1) {
321       GetNextWord(bIsNumber);
322       if (m_WordSize == 2 && m_WordBuffer[0] == '>')
323         break;
324 
325       if (!m_WordSize || m_WordBuffer[0] != '/')
326         return nullptr;
327 
328       ByteString key =
329           PDF_NameDecode(ByteStringView(m_WordBuffer + 1, m_WordSize - 1));
330       RetainPtr<CPDF_Object> pObj =
331           ReadNextObject(true, bInArray, dwRecursionLevel + 1);
332       if (!pObj)
333         return nullptr;
334 
335       if (!key.IsEmpty())
336         pDict->SetFor(key, std::move(pObj));
337     }
338     return pDict;
339   }
340 
341   if (first_char == '[') {
342     if ((!bAllowNestedArray && bInArray))
343       return nullptr;
344 
345     auto pArray = pdfium::MakeRetain<CPDF_Array>();
346     while (1) {
347       RetainPtr<CPDF_Object> pObj =
348           ReadNextObject(bAllowNestedArray, true, dwRecursionLevel + 1);
349       if (pObj) {
350         pArray->Append(std::move(pObj));
351         continue;
352       }
353       if (!m_WordSize || m_WordBuffer[0] == ']')
354         break;
355     }
356     return pArray;
357   }
358 
359   if (WordBufferMatches(kFalse))
360     return pdfium::MakeRetain<CPDF_Boolean>(false);
361   if (WordBufferMatches(kTrue))
362     return pdfium::MakeRetain<CPDF_Boolean>(true);
363   if (WordBufferMatches(kNull))
364     return pdfium::MakeRetain<CPDF_Null>();
365   return nullptr;
366 }
367 
368 // TODO(npm): the following methods are almost identical in cpdf_syntaxparser
GetNextWord(bool & bIsNumber)369 void CPDF_StreamParser::GetNextWord(bool& bIsNumber) {
370   m_WordSize = 0;
371   bIsNumber = true;
372   if (!PositionIsInBounds())
373     return;
374 
375   uint8_t ch = m_pBuf[m_Pos++];
376   while (1) {
377     while (PDFCharIsWhitespace(ch)) {
378       if (!PositionIsInBounds()) {
379         return;
380       }
381       ch = m_pBuf[m_Pos++];
382     }
383 
384     if (ch != '%')
385       break;
386 
387     while (1) {
388       if (!PositionIsInBounds())
389         return;
390       ch = m_pBuf[m_Pos++];
391       if (PDFCharIsLineEnding(ch))
392         break;
393     }
394   }
395 
396   if (PDFCharIsDelimiter(ch)) {
397     bIsNumber = false;
398     m_WordBuffer[m_WordSize++] = ch;
399     if (ch == '/') {
400       while (1) {
401         if (!PositionIsInBounds())
402           return;
403         ch = m_pBuf[m_Pos++];
404         if (!PDFCharIsOther(ch) && !PDFCharIsNumeric(ch)) {
405           m_Pos--;
406           return;
407         }
408         if (m_WordSize < kMaxWordLength)
409           m_WordBuffer[m_WordSize++] = ch;
410       }
411     } else if (ch == '<') {
412       if (!PositionIsInBounds())
413         return;
414       ch = m_pBuf[m_Pos++];
415       if (ch == '<')
416         m_WordBuffer[m_WordSize++] = ch;
417       else
418         m_Pos--;
419     } else if (ch == '>') {
420       if (!PositionIsInBounds())
421         return;
422       ch = m_pBuf[m_Pos++];
423       if (ch == '>')
424         m_WordBuffer[m_WordSize++] = ch;
425       else
426         m_Pos--;
427     }
428     return;
429   }
430 
431   while (1) {
432     if (m_WordSize < kMaxWordLength)
433       m_WordBuffer[m_WordSize++] = ch;
434     if (!PDFCharIsNumeric(ch))
435       bIsNumber = false;
436     if (!PositionIsInBounds())
437       return;
438 
439     ch = m_pBuf[m_Pos++];
440     if (PDFCharIsDelimiter(ch) || PDFCharIsWhitespace(ch)) {
441       m_Pos--;
442       break;
443     }
444   }
445 }
446 
ReadString()447 ByteString CPDF_StreamParser::ReadString() {
448   if (!PositionIsInBounds())
449     return ByteString();
450 
451   uint8_t ch = m_pBuf[m_Pos++];
452   std::ostringstream buf;
453   int parlevel = 0;
454   int status = 0;
455   int iEscCode = 0;
456   while (1) {
457     switch (status) {
458       case 0:
459         if (ch == ')') {
460           if (parlevel == 0) {
461             if (buf.tellp() <= 0)
462               return ByteString();
463 
464             return ByteString(
465                 buf.str().c_str(),
466                 std::min(static_cast<size_t>(buf.tellp()), kMaxStringLength));
467           }
468           parlevel--;
469           buf << ')';
470         } else if (ch == '(') {
471           parlevel++;
472           buf << '(';
473         } else if (ch == '\\') {
474           status = 1;
475         } else {
476           buf << static_cast<char>(ch);
477         }
478         break;
479       case 1:
480         if (FXSYS_IsOctalDigit(ch)) {
481           iEscCode = FXSYS_DecimalCharToInt(static_cast<char>(ch));
482           status = 2;
483           break;
484         }
485         if (ch == '\r') {
486           status = 4;
487           break;
488         }
489         if (ch == '\n') {
490           // Do nothing.
491         } else if (ch == 'n') {
492           buf << '\n';
493         } else if (ch == 'r') {
494           buf << '\r';
495         } else if (ch == 't') {
496           buf << '\t';
497         } else if (ch == 'b') {
498           buf << '\b';
499         } else if (ch == 'f') {
500           buf << '\f';
501         } else {
502           buf << static_cast<char>(ch);
503         }
504         status = 0;
505         break;
506       case 2:
507         if (FXSYS_IsOctalDigit(ch)) {
508           iEscCode =
509               iEscCode * 8 + FXSYS_DecimalCharToInt(static_cast<char>(ch));
510           status = 3;
511         } else {
512           buf << static_cast<char>(iEscCode);
513           status = 0;
514           continue;
515         }
516         break;
517       case 3:
518         if (FXSYS_IsOctalDigit(ch)) {
519           iEscCode =
520               iEscCode * 8 + FXSYS_DecimalCharToInt(static_cast<char>(ch));
521           buf << static_cast<char>(iEscCode);
522           status = 0;
523         } else {
524           buf << static_cast<char>(iEscCode);
525           status = 0;
526           continue;
527         }
528         break;
529       case 4:
530         status = 0;
531         if (ch != '\n')
532           continue;
533         break;
534     }
535     if (!PositionIsInBounds())
536       break;
537 
538     ch = m_pBuf[m_Pos++];
539   }
540   if (PositionIsInBounds())
541     ++m_Pos;
542 
543   if (buf.tellp() <= 0)
544     return ByteString();
545 
546   return ByteString(
547       buf.str().c_str(),
548       std::min(static_cast<size_t>(buf.tellp()), kMaxStringLength));
549 }
550 
ReadHexString()551 ByteString CPDF_StreamParser::ReadHexString() {
552   if (!PositionIsInBounds())
553     return ByteString();
554 
555   std::ostringstream buf;
556   bool bFirst = true;
557   int code = 0;
558   while (PositionIsInBounds()) {
559     uint8_t ch = m_pBuf[m_Pos++];
560     if (ch == '>')
561       break;
562 
563     if (!std::isxdigit(ch))
564       continue;
565 
566     int val = FXSYS_HexCharToInt(ch);
567     if (bFirst) {
568       code = val * 16;
569     } else {
570       code += val;
571       buf << static_cast<uint8_t>(code);
572     }
573     bFirst = !bFirst;
574   }
575   if (!bFirst)
576     buf << static_cast<char>(code);
577 
578   if (buf.tellp() <= 0)
579     return ByteString();
580 
581   return ByteString(
582       buf.str().c_str(),
583       std::min(static_cast<size_t>(buf.tellp()), kMaxStringLength));
584 }
585 
PositionIsInBounds() const586 bool CPDF_StreamParser::PositionIsInBounds() const {
587   return m_Pos < m_pBuf.size();
588 }
589 
WordBufferMatches(const char * pWord) const590 bool CPDF_StreamParser::WordBufferMatches(const char* pWord) const {
591   const size_t iLength = strlen(pWord);
592   return m_WordSize == iLength && memcmp(m_WordBuffer, pWord, iLength) == 0;
593 }
594