1 // Copyright 2016 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7 #include "core/fpdfapi/page/cpdf_streamparser.h"
8
9 #include <algorithm>
10 #include <memory>
11 #include <sstream>
12 #include <utility>
13
14 #include "constants/stream_dict_common.h"
15 #include "core/fpdfapi/page/cpdf_docpagedata.h"
16 #include "core/fpdfapi/parser/cpdf_array.h"
17 #include "core/fpdfapi/parser/cpdf_boolean.h"
18 #include "core/fpdfapi/parser/cpdf_dictionary.h"
19 #include "core/fpdfapi/parser/cpdf_name.h"
20 #include "core/fpdfapi/parser/cpdf_null.h"
21 #include "core/fpdfapi/parser/cpdf_number.h"
22 #include "core/fpdfapi/parser/cpdf_stream.h"
23 #include "core/fpdfapi/parser/cpdf_string.h"
24 #include "core/fpdfapi/parser/fpdf_parser_decode.h"
25 #include "core/fpdfapi/parser/fpdf_parser_utility.h"
26 #include "core/fxcodec/fx_codec.h"
27 #include "core/fxcodec/jpeg/jpegmodule.h"
28 #include "core/fxcodec/scanlinedecoder.h"
29 #include "core/fxcrt/fx_extension.h"
30 #include "core/fxcrt/fx_memory_wrappers.h"
31 #include "core/fxcrt/fx_safe_types.h"
32
33 namespace {
34
// Upper bound on the recursion depth accepted by
// CPDF_StreamParser::ReadNextObject() for nested arrays/dictionaries.
constexpr uint32_t kMaxNestedParsingLevel = 512;

// Literal and hex strings are truncated to this many bytes when returned.
constexpr size_t kMaxStringLength = 32767;

// Spellings of the keyword objects recognized by the parser.
constexpr char kTrue[] = "true";
constexpr char kFalse[] = "false";
constexpr char kNull[] = "null";
41
DecodeAllScanlines(std::unique_ptr<ScanlineDecoder> pDecoder)42 uint32_t DecodeAllScanlines(std::unique_ptr<ScanlineDecoder> pDecoder) {
43 if (!pDecoder)
44 return FX_INVALID_OFFSET;
45
46 int ncomps = pDecoder->CountComps();
47 int bpc = pDecoder->GetBPC();
48 int width = pDecoder->GetWidth();
49 int height = pDecoder->GetHeight();
50 if (width <= 0 || height <= 0)
51 return FX_INVALID_OFFSET;
52
53 FX_SAFE_UINT32 size = fxcodec::CalculatePitch8(bpc, ncomps, width);
54 size *= height;
55 if (size.ValueOrDefault(0) == 0)
56 return FX_INVALID_OFFSET;
57
58 for (int row = 0; row < height; ++row) {
59 if (!pDecoder->GetScanline(row))
60 break;
61 }
62 return pDecoder->GetSrcOffset();
63 }
64
DecodeInlineStream(pdfium::span<const uint8_t> src_span,int width,int height,const ByteString & decoder,const CPDF_Dictionary * pParam,uint32_t orig_size)65 uint32_t DecodeInlineStream(pdfium::span<const uint8_t> src_span,
66 int width,
67 int height,
68 const ByteString& decoder,
69 const CPDF_Dictionary* pParam,
70 uint32_t orig_size) {
71 // |decoder| should not be an abbreviation.
72 ASSERT(decoder != "A85");
73 ASSERT(decoder != "AHx");
74 ASSERT(decoder != "CCF");
75 ASSERT(decoder != "DCT");
76 ASSERT(decoder != "Fl");
77 ASSERT(decoder != "LZW");
78 ASSERT(decoder != "RL");
79
80 std::unique_ptr<uint8_t, FxFreeDeleter> ignored_result;
81 uint32_t ignored_size;
82 if (decoder == "FlateDecode") {
83 return FlateOrLZWDecode(false, src_span, pParam, orig_size, &ignored_result,
84 &ignored_size);
85 }
86 if (decoder == "LZWDecode") {
87 return FlateOrLZWDecode(true, src_span, pParam, 0, &ignored_result,
88 &ignored_size);
89 }
90 if (decoder == "DCTDecode") {
91 std::unique_ptr<ScanlineDecoder> pDecoder = JpegModule::CreateDecoder(
92 src_span, width, height, 0,
93 !pParam || pParam->GetIntegerFor("ColorTransform", 1));
94 return DecodeAllScanlines(std::move(pDecoder));
95 }
96 if (decoder == "CCITTFaxDecode") {
97 std::unique_ptr<ScanlineDecoder> pDecoder =
98 CreateFaxDecoder(src_span, width, height, pParam);
99 return DecodeAllScanlines(std::move(pDecoder));
100 }
101
102 if (decoder == "ASCII85Decode")
103 return A85Decode(src_span, &ignored_result, &ignored_size);
104 if (decoder == "ASCIIHexDecode")
105 return HexDecode(src_span, &ignored_result, &ignored_size);
106 if (decoder == "RunLengthDecode")
107 return RunLengthDecode(src_span, &ignored_result, &ignored_size);
108
109 return FX_INVALID_OFFSET;
110 }
111
112 } // namespace
113
// Creates a parser that reads from |span|. Only m_pBuf is set explicitly by
// this constructor.
CPDF_StreamParser::CPDF_StreamParser(pdfium::span<const uint8_t> span)
    : m_pBuf(span) {}
116
// Creates a parser that reads from |span| and passes |pPool| to the name,
// string and dictionary objects it creates while parsing.
CPDF_StreamParser::CPDF_StreamParser(pdfium::span<const uint8_t> span,
                                     const WeakPtr<ByteStringPool>& pPool)
    : m_pPool(pPool), m_pBuf(span) {}
120
// Defaulted out of line; no custom teardown is performed here.
CPDF_StreamParser::~CPDF_StreamParser() = default;
122
ReadInlineStream(CPDF_Document * pDoc,RetainPtr<CPDF_Dictionary> pDict,const CPDF_Object * pCSObj)123 RetainPtr<CPDF_Stream> CPDF_StreamParser::ReadInlineStream(
124 CPDF_Document* pDoc,
125 RetainPtr<CPDF_Dictionary> pDict,
126 const CPDF_Object* pCSObj) {
127 if (m_Pos < m_pBuf.size() && PDFCharIsWhitespace(m_pBuf[m_Pos]))
128 m_Pos++;
129
130 if (m_Pos == m_pBuf.size())
131 return nullptr;
132
133 ByteString decoder;
134 const CPDF_Dictionary* pParam = nullptr;
135 CPDF_Object* pFilter = pDict->GetDirectObjectFor("Filter");
136 if (pFilter) {
137 const CPDF_Array* pArray = pFilter->AsArray();
138 if (pArray) {
139 decoder = pArray->GetStringAt(0);
140 const CPDF_Array* pParams =
141 pDict->GetArrayFor(pdfium::stream::kDecodeParms);
142 if (pParams)
143 pParam = pParams->GetDictAt(0);
144 } else {
145 decoder = pFilter->GetString();
146 pParam = pDict->GetDictFor(pdfium::stream::kDecodeParms);
147 }
148 }
149 uint32_t width = pDict->GetIntegerFor("Width");
150 uint32_t height = pDict->GetIntegerFor("Height");
151 uint32_t bpc = 1;
152 uint32_t nComponents = 1;
153 if (pCSObj) {
154 RetainPtr<CPDF_ColorSpace> pCS =
155 CPDF_DocPageData::FromDocument(pDoc)->GetColorSpace(pCSObj, nullptr);
156 nComponents = pCS ? pCS->CountComponents() : 3;
157 bpc = pDict->GetIntegerFor("BitsPerComponent");
158 }
159 FX_SAFE_UINT32 size = fxcodec::CalculatePitch8(bpc, nComponents, width);
160 size *= height;
161 if (!size.IsValid())
162 return nullptr;
163
164 uint32_t dwOrigSize = size.ValueOrDie();
165 std::unique_ptr<uint8_t, FxFreeDeleter> pData;
166 uint32_t dwStreamSize;
167 if (decoder.IsEmpty()) {
168 dwOrigSize = std::min<uint32_t>(dwOrigSize, m_pBuf.size() - m_Pos);
169 pData.reset(FX_AllocUninit(uint8_t, dwOrigSize));
170 auto copy_span = m_pBuf.subspan(m_Pos, dwOrigSize);
171 memcpy(pData.get(), copy_span.data(), copy_span.size());
172 dwStreamSize = dwOrigSize;
173 m_Pos += dwOrigSize;
174 } else {
175 dwStreamSize = DecodeInlineStream(m_pBuf.subspan(m_Pos), width, height,
176 decoder, pParam, dwOrigSize);
177 if (!pdfium::base::IsValueInRangeForNumericType<int>(dwStreamSize))
178 return nullptr;
179
180 uint32_t dwSavePos = m_Pos;
181 m_Pos += dwStreamSize;
182 while (1) {
183 uint32_t dwPrevPos = m_Pos;
184 CPDF_StreamParser::SyntaxType type = ParseNextElement();
185 if (type == CPDF_StreamParser::EndOfData)
186 break;
187
188 if (type != CPDF_StreamParser::Keyword) {
189 dwStreamSize += m_Pos - dwPrevPos;
190 continue;
191 }
192 if (GetWord() == "EI") {
193 m_Pos = dwPrevPos;
194 break;
195 }
196 dwStreamSize += m_Pos - dwPrevPos;
197 }
198 m_Pos = dwSavePos;
199 pData.reset(FX_AllocUninit(uint8_t, dwStreamSize));
200 auto copy_span = m_pBuf.subspan(m_Pos, dwStreamSize);
201 memcpy(pData.get(), copy_span.data(), copy_span.size());
202 m_Pos += dwStreamSize;
203 }
204 pDict->SetNewFor<CPDF_Number>("Length", static_cast<int>(dwStreamSize));
205 return pdfium::MakeRetain<CPDF_Stream>(std::move(pData), dwStreamSize,
206 std::move(pDict));
207 }
208
// Scans the next syntactic element from the buffer and classifies it.
//
// Clears m_pLastObj and the word buffer first. Returns:
//   EndOfData - nothing but whitespace/comments (or nothing at all) remains.
//   Others    - an object was parsed; it is left in m_pLastObj (this may be
//               null if ReadNextObject() failed).
//   Number    - the word consists only of numeric characters.
//   Name      - the word starts with '/'.
//   Keyword   - any other word.
// For Number/Name/Keyword the text is left NUL-terminated in m_WordBuffer with
// its length in m_WordSize.
CPDF_StreamParser::SyntaxType CPDF_StreamParser::ParseNextElement() {
  m_pLastObj.Reset();
  m_WordSize = 0;
  if (!PositionIsInBounds())
    return EndOfData;

  uint8_t ch = m_pBuf[m_Pos++];
  // Skip any mix of whitespace runs and '%' comments.
  while (1) {
    while (PDFCharIsWhitespace(ch)) {
      if (!PositionIsInBounds())
        return EndOfData;

      ch = m_pBuf[m_Pos++];
    }

    if (ch != '%')
      break;

    // Inside a comment: discard everything up to the first line ending.
    while (1) {
      if (!PositionIsInBounds())
        return EndOfData;

      ch = m_pBuf[m_Pos++];
      if (PDFCharIsLineEnding(ch))
        break;
    }
  }

  // Any delimiter except '/' starts a compound object or string; un-read the
  // delimiter and let ReadNextObject() handle it.
  if (PDFCharIsDelimiter(ch) && ch != '/') {
    m_Pos--;
    m_pLastObj = ReadNextObject(false, false, 0);
    return Others;
  }

  bool bIsNumber = true;
  while (1) {
    // Characters beyond kMaxWordLength are consumed but not stored.
    if (m_WordSize < kMaxWordLength)
      m_WordBuffer[m_WordSize++] = ch;

    if (!PDFCharIsNumeric(ch))
      bIsNumber = false;

    if (!PositionIsInBounds())
      break;

    ch = m_pBuf[m_Pos++];

    // The terminating delimiter/whitespace is not part of the word; leave it
    // for the next call.
    if (PDFCharIsDelimiter(ch) || PDFCharIsWhitespace(ch)) {
      m_Pos--;
      break;
    }
  }

  m_WordBuffer[m_WordSize] = 0;
  if (bIsNumber)
    return Number;

  if (m_WordBuffer[0] == '/')
    return Name;

  // "true", "false" and "null" are materialized as objects right away.
  if (m_WordSize == 4) {
    if (WordBufferMatches(kTrue)) {
      m_pLastObj = pdfium::MakeRetain<CPDF_Boolean>(true);
      return Others;
    }
    if (WordBufferMatches(kNull)) {
      m_pLastObj = pdfium::MakeRetain<CPDF_Null>();
      return Others;
    }
  } else if (m_WordSize == 5) {
    if (WordBufferMatches(kFalse)) {
      m_pLastObj = pdfium::MakeRetain<CPDF_Boolean>(false);
      return Others;
    }
  }
  return Keyword;
}
286
ReadNextObject(bool bAllowNestedArray,bool bInArray,uint32_t dwRecursionLevel)287 RetainPtr<CPDF_Object> CPDF_StreamParser::ReadNextObject(
288 bool bAllowNestedArray,
289 bool bInArray,
290 uint32_t dwRecursionLevel) {
291 bool bIsNumber;
292 // Must get the next word before returning to avoid infinite loops.
293 GetNextWord(bIsNumber);
294 if (!m_WordSize || dwRecursionLevel > kMaxNestedParsingLevel)
295 return nullptr;
296
297 if (bIsNumber) {
298 m_WordBuffer[m_WordSize] = 0;
299 return pdfium::MakeRetain<CPDF_Number>(
300 ByteStringView(m_WordBuffer, m_WordSize));
301 }
302
303 int first_char = m_WordBuffer[0];
304 if (first_char == '/') {
305 ByteString name =
306 PDF_NameDecode(ByteStringView(m_WordBuffer + 1, m_WordSize - 1));
307 return pdfium::MakeRetain<CPDF_Name>(m_pPool, name);
308 }
309
310 if (first_char == '(') {
311 ByteString str = ReadString();
312 return pdfium::MakeRetain<CPDF_String>(m_pPool, str, false);
313 }
314
315 if (first_char == '<') {
316 if (m_WordSize == 1)
317 return pdfium::MakeRetain<CPDF_String>(m_pPool, ReadHexString(), true);
318
319 auto pDict = pdfium::MakeRetain<CPDF_Dictionary>(m_pPool);
320 while (1) {
321 GetNextWord(bIsNumber);
322 if (m_WordSize == 2 && m_WordBuffer[0] == '>')
323 break;
324
325 if (!m_WordSize || m_WordBuffer[0] != '/')
326 return nullptr;
327
328 ByteString key =
329 PDF_NameDecode(ByteStringView(m_WordBuffer + 1, m_WordSize - 1));
330 RetainPtr<CPDF_Object> pObj =
331 ReadNextObject(true, bInArray, dwRecursionLevel + 1);
332 if (!pObj)
333 return nullptr;
334
335 if (!key.IsEmpty())
336 pDict->SetFor(key, std::move(pObj));
337 }
338 return pDict;
339 }
340
341 if (first_char == '[') {
342 if ((!bAllowNestedArray && bInArray))
343 return nullptr;
344
345 auto pArray = pdfium::MakeRetain<CPDF_Array>();
346 while (1) {
347 RetainPtr<CPDF_Object> pObj =
348 ReadNextObject(bAllowNestedArray, true, dwRecursionLevel + 1);
349 if (pObj) {
350 pArray->Append(std::move(pObj));
351 continue;
352 }
353 if (!m_WordSize || m_WordBuffer[0] == ']')
354 break;
355 }
356 return pArray;
357 }
358
359 if (WordBufferMatches(kFalse))
360 return pdfium::MakeRetain<CPDF_Boolean>(false);
361 if (WordBufferMatches(kTrue))
362 return pdfium::MakeRetain<CPDF_Boolean>(true);
363 if (WordBufferMatches(kNull))
364 return pdfium::MakeRetain<CPDF_Null>();
365 return nullptr;
366 }
367
// TODO(npm): the following methods are almost identical to their counterparts
// in cpdf_syntaxparser.
// Reads the next word into m_WordBuffer / m_WordSize, skipping leading
// whitespace and '%' comments.
//
// |bIsNumber| is set to true only when every character of the word is numeric
// according to PDFCharIsNumeric(); it is also left true when no word is found
// (m_WordSize == 0), so callers must check m_WordSize first. A delimiter forms
// a word of its own, except that '/' is followed by the name characters and
// "<<" / ">>" are returned as two-character words. The buffer is not
// NUL-terminated here.
void CPDF_StreamParser::GetNextWord(bool& bIsNumber) {
  m_WordSize = 0;
  bIsNumber = true;
  if (!PositionIsInBounds())
    return;

  uint8_t ch = m_pBuf[m_Pos++];
  // Skip any mix of whitespace runs and '%' comments.
  while (1) {
    while (PDFCharIsWhitespace(ch)) {
      if (!PositionIsInBounds()) {
        return;
      }
      ch = m_pBuf[m_Pos++];
    }

    if (ch != '%')
      break;

    // Inside a comment: discard everything up to the first line ending.
    while (1) {
      if (!PositionIsInBounds())
        return;
      ch = m_pBuf[m_Pos++];
      if (PDFCharIsLineEnding(ch))
        break;
    }
  }

  if (PDFCharIsDelimiter(ch)) {
    bIsNumber = false;
    m_WordBuffer[m_WordSize++] = ch;
    if (ch == '/') {
      // Name: collect characters until one that is neither "other" nor
      // numeric, and leave that terminator unread.
      while (1) {
        if (!PositionIsInBounds())
          return;
        ch = m_pBuf[m_Pos++];
        if (!PDFCharIsOther(ch) && !PDFCharIsNumeric(ch)) {
          m_Pos--;
          return;
        }
        // Characters beyond kMaxWordLength are consumed but not stored.
        if (m_WordSize < kMaxWordLength)
          m_WordBuffer[m_WordSize++] = ch;
      }
    } else if (ch == '<') {
      // Peek one character to tell "<<" from a single '<'.
      if (!PositionIsInBounds())
        return;
      ch = m_pBuf[m_Pos++];
      if (ch == '<')
        m_WordBuffer[m_WordSize++] = ch;
      else
        m_Pos--;
    } else if (ch == '>') {
      // Peek one character to tell ">>" from a single '>'.
      if (!PositionIsInBounds())
        return;
      ch = m_pBuf[m_Pos++];
      if (ch == '>')
        m_WordBuffer[m_WordSize++] = ch;
      else
        m_Pos--;
    }
    return;
  }

  // Regular word: runs until the next delimiter or whitespace, which is left
  // unread.
  while (1) {
    if (m_WordSize < kMaxWordLength)
      m_WordBuffer[m_WordSize++] = ch;
    if (!PDFCharIsNumeric(ch))
      bIsNumber = false;
    if (!PositionIsInBounds())
      return;

    ch = m_pBuf[m_Pos++];
    if (PDFCharIsDelimiter(ch) || PDFCharIsWhitespace(ch)) {
      m_Pos--;
      break;
    }
  }
}
446
ReadString()447 ByteString CPDF_StreamParser::ReadString() {
448 if (!PositionIsInBounds())
449 return ByteString();
450
451 uint8_t ch = m_pBuf[m_Pos++];
452 std::ostringstream buf;
453 int parlevel = 0;
454 int status = 0;
455 int iEscCode = 0;
456 while (1) {
457 switch (status) {
458 case 0:
459 if (ch == ')') {
460 if (parlevel == 0) {
461 if (buf.tellp() <= 0)
462 return ByteString();
463
464 return ByteString(
465 buf.str().c_str(),
466 std::min(static_cast<size_t>(buf.tellp()), kMaxStringLength));
467 }
468 parlevel--;
469 buf << ')';
470 } else if (ch == '(') {
471 parlevel++;
472 buf << '(';
473 } else if (ch == '\\') {
474 status = 1;
475 } else {
476 buf << static_cast<char>(ch);
477 }
478 break;
479 case 1:
480 if (FXSYS_IsOctalDigit(ch)) {
481 iEscCode = FXSYS_DecimalCharToInt(static_cast<char>(ch));
482 status = 2;
483 break;
484 }
485 if (ch == '\r') {
486 status = 4;
487 break;
488 }
489 if (ch == '\n') {
490 // Do nothing.
491 } else if (ch == 'n') {
492 buf << '\n';
493 } else if (ch == 'r') {
494 buf << '\r';
495 } else if (ch == 't') {
496 buf << '\t';
497 } else if (ch == 'b') {
498 buf << '\b';
499 } else if (ch == 'f') {
500 buf << '\f';
501 } else {
502 buf << static_cast<char>(ch);
503 }
504 status = 0;
505 break;
506 case 2:
507 if (FXSYS_IsOctalDigit(ch)) {
508 iEscCode =
509 iEscCode * 8 + FXSYS_DecimalCharToInt(static_cast<char>(ch));
510 status = 3;
511 } else {
512 buf << static_cast<char>(iEscCode);
513 status = 0;
514 continue;
515 }
516 break;
517 case 3:
518 if (FXSYS_IsOctalDigit(ch)) {
519 iEscCode =
520 iEscCode * 8 + FXSYS_DecimalCharToInt(static_cast<char>(ch));
521 buf << static_cast<char>(iEscCode);
522 status = 0;
523 } else {
524 buf << static_cast<char>(iEscCode);
525 status = 0;
526 continue;
527 }
528 break;
529 case 4:
530 status = 0;
531 if (ch != '\n')
532 continue;
533 break;
534 }
535 if (!PositionIsInBounds())
536 break;
537
538 ch = m_pBuf[m_Pos++];
539 }
540 if (PositionIsInBounds())
541 ++m_Pos;
542
543 if (buf.tellp() <= 0)
544 return ByteString();
545
546 return ByteString(
547 buf.str().c_str(),
548 std::min(static_cast<size_t>(buf.tellp()), kMaxStringLength));
549 }
550
ReadHexString()551 ByteString CPDF_StreamParser::ReadHexString() {
552 if (!PositionIsInBounds())
553 return ByteString();
554
555 std::ostringstream buf;
556 bool bFirst = true;
557 int code = 0;
558 while (PositionIsInBounds()) {
559 uint8_t ch = m_pBuf[m_Pos++];
560 if (ch == '>')
561 break;
562
563 if (!std::isxdigit(ch))
564 continue;
565
566 int val = FXSYS_HexCharToInt(ch);
567 if (bFirst) {
568 code = val * 16;
569 } else {
570 code += val;
571 buf << static_cast<uint8_t>(code);
572 }
573 bFirst = !bFirst;
574 }
575 if (!bFirst)
576 buf << static_cast<char>(code);
577
578 if (buf.tellp() <= 0)
579 return ByteString();
580
581 return ByteString(
582 buf.str().c_str(),
583 std::min(static_cast<size_t>(buf.tellp()), kMaxStringLength));
584 }
585
PositionIsInBounds() const586 bool CPDF_StreamParser::PositionIsInBounds() const {
587 return m_Pos < m_pBuf.size();
588 }
589
WordBufferMatches(const char * pWord) const590 bool CPDF_StreamParser::WordBufferMatches(const char* pWord) const {
591 const size_t iLength = strlen(pWord);
592 return m_WordSize == iLength && memcmp(m_WordBuffer, pWord, iLength) == 0;
593 }
594