1 #include <qpdf/QPDFTokenizer.hh>
2
3 // DO NOT USE ctype -- it is locale dependent for some things, and
4 // it's not worth the risk of including it in case it may accidentally
5 // be used.
6
7 #include <qpdf/QTC.hh>
8 #include <qpdf/QPDFExc.hh>
9 #include <qpdf/QUtil.hh>
10 #include <qpdf/QPDFObjectHandle.hh>
11 #include <qpdf/QIntC.hh>
12
13 #include <stdexcept>
14 #include <stdlib.h>
15 #include <string.h>
16
// Return true when ch ends a literal token: one of the whitespace
// characters space, \t, \n, \v, \f, \r, one of the characters
// ( ) < > [ ] { } / %, or a null byte. The comparison is spelled out
// by hand so that nothing locale-dependent is involved.
static bool is_delimiter(char ch)
{
    switch (ch)
    {
      case '\0':
      case ' ':
      case '\t':
      case '\n':
      case '\v':
      case '\f':
      case '\r':
      case '(':
      case ')':
      case '<':
      case '>':
      case '[':
      case ']':
      case '{':
      case '}':
      case '/':
      case '%':
        return true;

      default:
        return false;
    }
}
21
22 class QPDFWordTokenFinder: public InputSource::Finder
23 {
24 public:
QPDFWordTokenFinder(PointerHolder<InputSource> is,std::string const & str)25 QPDFWordTokenFinder(PointerHolder<InputSource> is,
26 std::string const& str) :
27 is(is),
28 str(str)
29 {
30 }
~QPDFWordTokenFinder()31 virtual ~QPDFWordTokenFinder()
32 {
33 }
34 virtual bool check();
35
36 private:
37 PointerHolder<InputSource> is;
38 std::string str;
39 };
40
41 bool
check()42 QPDFWordTokenFinder::check()
43 {
44 // Find a word token matching the given string, preceded by a
45 // delimiter, and followed by a delimiter or EOF.
46 QPDFTokenizer tokenizer;
47 QPDFTokenizer::Token t = tokenizer.readToken(is, "finder", true);
48 qpdf_offset_t pos = is->tell();
49 if (! (t == QPDFTokenizer::Token(QPDFTokenizer::tt_word, str)))
50 {
51 QTC::TC("qpdf", "QPDFTokenizer finder found wrong word");
52 return false;
53 }
54 qpdf_offset_t token_start = is->getLastOffset();
55 char next;
56 bool next_okay = false;
57 if (is->read(&next, 1) == 0)
58 {
59 QTC::TC("qpdf", "QPDFTokenizer inline image at EOF");
60 next_okay = true;
61 }
62 else
63 {
64 next_okay = is_delimiter(next);
65 }
66 is->seek(pos, SEEK_SET);
67 if (! next_okay)
68 {
69 return false;
70 }
71 if (token_start == 0)
72 {
73 // Can't actually happen...we never start the search at the
74 // beginning of the input.
75 return false;
76 }
77 return true;
78 }
79
Members()80 QPDFTokenizer::Members::Members() :
81 allow_eof(false),
82 include_ignorable(false)
83 {
84 reset();
85 }
86
87 void
reset()88 QPDFTokenizer::Members::reset()
89 {
90 state = st_top;
91 type = tt_bad;
92 val = "";
93 raw_val = "";
94 error_message = "";
95 unread_char = false;
96 char_to_unread = '\0';
97 inline_image_bytes = 0;
98 string_depth = 0;
99 string_ignoring_newline = false;
100 last_char_was_bs = false;
101 last_char_was_cr = false;
102 }
103
~Members()104 QPDFTokenizer::Members::~Members()
105 {
106 }
107
Token(token_type_e type,std::string const & value)108 QPDFTokenizer::Token::Token(token_type_e type, std::string const& value) :
109 type(type),
110 value(value),
111 raw_value(value)
112 {
113 if (type == tt_string)
114 {
115 raw_value = QPDFObjectHandle::newString(value).unparse();
116 }
117 else if (type == tt_name)
118 {
119 raw_value = QPDFObjectHandle::newName(value).unparse();
120 }
121 }
122
123
124
QPDFTokenizer()125 QPDFTokenizer::QPDFTokenizer() :
126 m(new Members())
127 {
128 }
129
130 void
allowEOF()131 QPDFTokenizer::allowEOF()
132 {
133 this->m->allow_eof = true;
134 }
135
136 void
includeIgnorable()137 QPDFTokenizer::includeIgnorable()
138 {
139 this->m->include_ignorable = true;
140 }
141
142 bool
isSpace(char ch)143 QPDFTokenizer::isSpace(char ch)
144 {
145 return ((ch == '\0') || QUtil::is_space(ch));
146 }
147
148 bool
isDelimiter(char ch)149 QPDFTokenizer::isDelimiter(char ch)
150 {
151 return is_delimiter(ch);
152 }
153
154 void
resolveLiteral()155 QPDFTokenizer::resolveLiteral()
156 {
157 if ((this->m->val.length() > 0) && (this->m->val.at(0) == '/'))
158 {
159 this->m->type = tt_name;
160 // Deal with # in name token. Note: '/' by itself is a
161 // valid name, so don't strip leading /. That way we
162 // don't have to deal with the empty string as a name.
163 std::string nval = "/";
164 size_t len = this->m->val.length();
165 for (size_t i = 1; i < len; ++i)
166 {
167 char ch = this->m->val.at(i);
168 if (ch == '#')
169 {
170 if ((i + 2 < len) &&
171 QUtil::is_hex_digit(this->m->val.at(i+1)) &&
172 QUtil::is_hex_digit(this->m->val.at(i+2)))
173 {
174 char num[3];
175 num[0] = this->m->val.at(i+1);
176 num[1] = this->m->val.at(i+2);
177 num[2] = '\0';
178 char ch2 = static_cast<char>(strtol(num, 0, 16));
179 if (ch2 == '\0')
180 {
181 this->m->type = tt_bad;
182 QTC::TC("qpdf", "QPDFTokenizer null in name");
183 this->m->error_message =
184 "null character not allowed in name token";
185 nval += "#00";
186 }
187 else
188 {
189 nval.append(1, ch2);
190 }
191 i += 2;
192 }
193 else
194 {
195 QTC::TC("qpdf", "QPDFTokenizer bad name");
196 this->m->error_message =
197 "name with stray # will not work with PDF >= 1.2";
198 // Use null to encode a bad # -- this is reversed
199 // in QPDF_Name::normalizeName.
200 nval += '\0';
201 }
202 }
203 else
204 {
205 nval.append(1, ch);
206 }
207 }
208 this->m->val = nval;
209 }
210 else if (QUtil::is_number(this->m->val.c_str()))
211 {
212 if (this->m->val.find('.') != std::string::npos)
213 {
214 this->m->type = tt_real;
215 }
216 else
217 {
218 this->m->type = tt_integer;
219 }
220 }
221 else if ((this->m->val == "true") || (this->m->val == "false"))
222 {
223 this->m->type = tt_bool;
224 }
225 else if (this->m->val == "null")
226 {
227 this->m->type = tt_null;
228 }
229 else
230 {
231 // I don't really know what it is, so leave it as tt_word.
232 // Lots of cases ($, #, etc.) other than actual words fall
233 // into this category, but that's okay at least for now.
234 this->m->type = tt_word;
235 }
236 }
237
// Feed a single character to the tokenizer's state machine.
//
// Depending on the current state, the character is appended to the
// token value, changes the state, or completes a token (state becomes
// st_token_ready and type is set). When a character ends a token
// without being part of it, it is recorded in char_to_unread with
// unread_char set. A completed token of type tt_word is classified
// further by resolveLiteral(). Unless the tokenizer is between tokens
// or the character was marked as unread, the character is also
// appended to raw_val.
//
// Throws std::logic_error if a token is already waiting, or if the
// state is not one this function knows how to handle.
void
QPDFTokenizer::presentCharacter(char ch)
{
    if (this->m->state == st_token_ready)
    {
        throw std::logic_error(
            "INTERNAL ERROR: QPDF tokenizer presented character "
            "while token is waiting");
    }

    // Copy of the character as presented; this is what gets appended
    // to raw_val at the end.
    char orig_ch = ch;

    // State machine is implemented such that some characters may be
    // handled more than once. This happens whenever you have to use
    // the character that caused a state change in the new state.

    // Cleared when the first chain below did not consume the
    // character, in which case the second chain processes it.
    bool handled = true;
    if (this->m->state == st_top)
    {
        // Note: we specifically do not use ctype here. It is
        // locale-dependent.
        if (isSpace(ch))
        {
            // Whitespace is collected only when ignorable tokens were
            // requested; otherwise it is dropped and we stay in st_top.
            if (this->m->include_ignorable)
            {
                this->m->state = st_in_space;
                this->m->val += ch;
            }
        }
        else if (ch == '%')
        {
            this->m->state = st_in_comment;
            if (this->m->include_ignorable)
            {
                this->m->val += ch;
            }
        }
        else if (ch == '(')
        {
            // Start of a literal string: initialize nesting depth and
            // all escape-handling state.
            this->m->string_depth = 1;
            this->m->string_ignoring_newline = false;
            memset(this->m->bs_num_register, '\0',
                   sizeof(this->m->bs_num_register));
            this->m->last_char_was_bs = false;
            this->m->last_char_was_cr = false;
            this->m->state = st_in_string;
        }
        else if (ch == '<')
        {
            // Could be << or the start of a hex string; decided by the
            // next character.
            this->m->state = st_lt;
        }
        else if (ch == '>')
        {
            this->m->state = st_gt;
        }
        else
        {
            this->m->val += ch;
            if (ch == ')')
            {
                this->m->type = tt_bad;
                QTC::TC("qpdf", "QPDFTokenizer bad )");
                this->m->error_message = "unexpected )";
                this->m->state = st_token_ready;
            }
            else if (ch == '[')
            {
                this->m->type = tt_array_open;
                this->m->state = st_token_ready;
            }
            else if (ch == ']')
            {
                this->m->type = tt_array_close;
                this->m->state = st_token_ready;
            }
            else if (ch == '{')
            {
                this->m->type = tt_brace_open;
                this->m->state = st_token_ready;
            }
            else if (ch == '}')
            {
                this->m->type = tt_brace_close;
                this->m->state = st_token_ready;
            }
            else
            {
                // Anything else begins a literal, which runs until the
                // next delimiter.
                this->m->state = st_literal;
            }
        }
    }
    else if (this->m->state == st_in_space)
    {
        // We only enter this state if include_ignorable is true.
        if (! isSpace(ch))
        {
            // First non-space character ends the space token and is
            // handed back for the next token.
            this->m->type = tt_space;
            this->m->unread_char = true;
            this->m->char_to_unread = ch;
            this->m->state = st_token_ready;
        }
        else
        {
            this->m->val += ch;
        }
    }
    else if (this->m->state == st_in_comment)
    {
        // A comment runs up to, but does not include, the next CR or
        // LF.
        if ((ch == '\r') || (ch == '\n'))
        {
            if (this->m->include_ignorable)
            {
                this->m->type = tt_comment;
                this->m->unread_char = true;
                this->m->char_to_unread = ch;
                this->m->state = st_token_ready;
            }
            else
            {
                this->m->state = st_top;
            }
        }
        else if (this->m->include_ignorable)
        {
            this->m->val += ch;
        }
    }
    else if (this->m->state == st_lt)
    {
        if (ch == '<')
        {
            this->m->val = "<<";
            this->m->type = tt_dict_open;
            this->m->state = st_token_ready;
        }
        else
        {
            // Not <<, so this character is the first character of a
            // hex string. Leave it unhandled so that the hex string
            // code below processes it.
            handled = false;
            this->m->state = st_in_hexstring;
        }
    }
    else if (this->m->state == st_gt)
    {
        if (ch == '>')
        {
            this->m->val = ">>";
            this->m->type = tt_dict_close;
            this->m->state = st_token_ready;
        }
        else
        {
            // A lone > is an error; the character after it is handed
            // back for the next token.
            this->m->val = ">";
            this->m->type = tt_bad;
            QTC::TC("qpdf", "QPDFTokenizer bad >");
            this->m->error_message = "unexpected >";
            this->m->unread_char = true;
            this->m->char_to_unread = ch;
            this->m->state = st_token_ready;
        }
    }
    else if (this->m->state == st_in_string)
    {
        // string_ignoring_newline is set after backslash-CR so that an
        // immediately following LF is skipped; any other character
        // cancels it.
        if (this->m->string_ignoring_newline && (ch != '\n'))
        {
            this->m->string_ignoring_newline = false;
        }

        // bs_num_register holds the digits of a pending \ddd escape as
        // a null-terminated string, so its strlen is the digit count.
        size_t bs_num_count = strlen(this->m->bs_num_register);
        bool ch_is_octal = ((ch >= '0') && (ch <= '7'));
        if ((bs_num_count == 3) || ((bs_num_count > 0) && (! ch_is_octal)))
        {
            // We've accumulated \ddd. PDF Spec says to ignore
            // high-order overflow.
            this->m->val += static_cast<char>(
                strtol(this->m->bs_num_register, 0, 8));
            memset(this->m->bs_num_register, '\0',
                   sizeof(this->m->bs_num_register));
            bs_num_count = 0;
        }

        if (this->m->string_ignoring_newline && (ch == '\n'))
        {
            // ignore
            this->m->string_ignoring_newline = false;
        }
        else if (ch_is_octal &&
                 (this->m->last_char_was_bs || (bs_num_count > 0)))
        {
            // Start or continue an octal escape.
            this->m->bs_num_register[bs_num_count++] = ch;
        }
        else if (this->m->last_char_was_bs)
        {
            // Character following a backslash
            switch (ch)
            {
              case 'n':
                this->m->val += '\n';
                break;

              case 'r':
                this->m->val += '\r';
                break;

              case 't':
                this->m->val += '\t';
                break;

              case 'b':
                this->m->val += '\b';
                break;

              case 'f':
                this->m->val += '\f';
                break;

              case '\n':
                // backslash-newline contributes nothing to the value
                break;

              case '\r':
                // backslash-CR contributes nothing either; also skip
                // an LF that directly follows
                this->m->string_ignoring_newline = true;
                break;

              default:
                // PDF spec says backslash is ignored before anything else
                this->m->val += ch;
                break;
            }
        }
        else if (ch == '\\')
        {
            // last_char_was_bs is set/cleared below as appropriate
            if (bs_num_count)
            {
                throw std::logic_error(
                    "INTERNAL ERROR: QPDFTokenizer: bs_num_count != 0 "
                    "when ch == '\\'");
            }
        }
        else if (ch == '(')
        {
            // Nested parenthesis: part of the value
            this->m->val += ch;
            ++this->m->string_depth;
        }
        else if ((ch == ')') && (--this->m->string_depth == 0))
        {
            // Outermost closing parenthesis ends the string. A )
            // that only closes a nested level falls through to the
            // final else below and is appended to the value.
            this->m->type = tt_string;
            this->m->state = st_token_ready;
        }
        else if (ch == '\r')
        {
            // CR by itself is converted to LF
            this->m->val += '\n';
        }
        else if (ch == '\n')
        {
            // CR LF is converted to LF
            if (! this->m->last_char_was_cr)
            {
                this->m->val += ch;
            }
        }
        else
        {
            this->m->val += ch;
        }

        // Record what this character was for the benefit of the next
        // one. A backslash that was itself preceded by a backslash
        // does not count as starting a new escape.
        this->m->last_char_was_cr =
            ((! this->m->string_ignoring_newline) && (ch == '\r'));
        this->m->last_char_was_bs =
            ((! this->m->last_char_was_bs) && (ch == '\\'));
    }
    else if (this->m->state == st_literal)
    {
        if (isDelimiter(ch))
        {
            // A C-locale whitespace character or delimiter terminates
            // token. It is important to unread the whitespace
            // character even though it is ignored since it may be the
            // newline after a stream keyword. Removing it here could
            // make the stream-reading code break on some files,
            // though not on any files in the test suite as of this
            // writing.

            // tt_word is provisional; resolveLiteral() below refines it.
            this->m->type = tt_word;
            this->m->unread_char = true;
            this->m->char_to_unread = ch;
            this->m->state = st_token_ready;
        }
        else
        {
            this->m->val += ch;
        }
    }
    else if (this->m->state == st_inline_image)
    {
        // Collect bytes until the value reaches the length stored in
        // inline_image_bytes.
        this->m->val += ch;
        size_t len = this->m->val.length();
        if (len == this->m->inline_image_bytes)
        {
            QTC::TC("qpdf", "QPDFTokenizer found EI by byte count");
            this->m->type = tt_inline_image;
            this->m->inline_image_bytes = 0;
            this->m->state = st_token_ready;
        }
    }
    else
    {
        // Not one of the states above (st_in_hexstring ends up here).
        handled = false;
    }

    if (handled)
    {
        // okay
    }
    else if (this->m->state == st_in_hexstring)
    {
        if (ch == '>')
        {
            this->m->type = tt_string;
            this->m->state = st_token_ready;
            if (this->m->val.length() % 2)
            {
                // PDF spec says odd hexstrings have implicit
                // trailing 0.
                this->m->val += '0';
            }
            // Convert each pair of hex digits into one byte.
            char num[3];
            num[2] = '\0';
            std::string nval;
            for (unsigned int i = 0; i < this->m->val.length(); i += 2)
            {
                num[0] = this->m->val.at(i);
                num[1] = this->m->val.at(i+1);
                char nch = static_cast<char>(strtol(num, 0, 16));
                nval += nch;
            }
            this->m->val = nval;
        }
        else if (QUtil::is_hex_digit(ch))
        {
            this->m->val += ch;
        }
        else if (isSpace(ch))
        {
            // ignore
        }
        else
        {
            this->m->type = tt_bad;
            QTC::TC("qpdf", "QPDFTokenizer bad hexstring character");
            this->m->error_message = std::string("invalid character (") +
                ch + ") in hexstring";
            this->m->state = st_token_ready;
        }
    }
    else
    {
        throw std::logic_error(
            "INTERNAL ERROR: invalid state while reading token");
    }

    if ((this->m->state == st_token_ready) && (this->m->type == tt_word))
    {
        resolveLiteral();
    }

    // Keep the raw text of the token: skip characters seen while
    // between tokens and the character that is being handed back.
    if (! (betweenTokens() ||
           ((this->m->state == st_token_ready) && this->m->unread_char)))
    {
        this->m->raw_val += orig_ch;
    }
}
609
610 void
presentEOF()611 QPDFTokenizer::presentEOF()
612 {
613 if (this->m->state == st_literal)
614 {
615 QTC::TC("qpdf", "QPDFTokenizer EOF reading appendable token");
616 resolveLiteral();
617 }
618 else if ((this->m->include_ignorable) && (this->m->state == st_in_space))
619 {
620 this->m->type = tt_space;
621 }
622 else if ((this->m->include_ignorable) && (this->m->state == st_in_comment))
623 {
624 this->m->type = tt_comment;
625 }
626 else if (betweenTokens())
627 {
628 this->m->type = tt_eof;
629 }
630 else if (this->m->state != st_token_ready)
631 {
632 QTC::TC("qpdf", "QPDFTokenizer EOF reading token");
633 this->m->type = tt_bad;
634 this->m->error_message = "EOF while reading token";
635 }
636
637 this->m->state = st_token_ready;
638 }
639
640 void
expectInlineImage(PointerHolder<InputSource> input)641 QPDFTokenizer::expectInlineImage(PointerHolder<InputSource> input)
642 {
643 if (this->m->state != st_top)
644 {
645 throw std::logic_error("QPDFTokenizer::expectInlineImage called"
646 " when tokenizer is in improper state");
647 }
648 findEI(input);
649 this->m->state = st_inline_image;
650 }
651
652 void
findEI(PointerHolder<InputSource> input)653 QPDFTokenizer::findEI(PointerHolder<InputSource> input)
654 {
655 if (! input.getPointer())
656 {
657 return;
658 }
659
660 qpdf_offset_t last_offset = input->getLastOffset();
661 qpdf_offset_t pos = input->tell();
662
663 // Use QPDFWordTokenFinder to find EI surrounded by delimiters.
664 // Then read the next several tokens or up to EOF. If we find any
665 // suspicious-looking or tokens, this is probably still part of
666 // the image data, so keep looking for EI. Stop at the first EI
667 // that passes. If we get to the end without finding one, return
668 // the last EI we found. Store the number of bytes expected in the
669 // inline image including the EI and use that to break out of
670 // inline image, falling back to the old method if needed.
671
672 bool okay = false;
673 bool first_try = true;
674 while (! okay)
675 {
676 QPDFWordTokenFinder f(input, "EI");
677 if (! input->findFirst("EI", input->tell(), 0, f))
678 {
679 break;
680 }
681 this->m->inline_image_bytes = QIntC::to_size(input->tell() - pos - 2);
682
683 QPDFTokenizer check;
684 bool found_bad = false;
685 // Look at the next 10 tokens or up to EOF. The next inline
686 // image's image data would look like bad tokens, but there
687 // will always be at least 10 tokens between one inline
688 // image's EI and the next valid one's ID since width, height,
689 // bits per pixel, and color space are all required as well as
690 // a BI and ID. If we get 10 good tokens in a row or hit EOF,
691 // we can be pretty sure we've found the actual EI.
692 for (int i = 0; i < 10; ++i)
693 {
694 QPDFTokenizer::Token t =
695 check.readToken(input, "checker", true);
696 token_type_e type = t.getType();
697 if (type == tt_eof)
698 {
699 okay = true;
700 }
701 else if (type == tt_bad)
702 {
703 found_bad = true;
704 }
705 else if (type == tt_word)
706 {
707 // The qpdf tokenizer lumps alphabetic and otherwise
708 // uncategorized characters into "words". We recognize
709 // strings of alphabetic characters as potential valid
710 // operators for purposes of telling whether we're in
711 // valid content or not. It's not perfect, but it
712 // should work more reliably than what we used to do,
713 // which was already good enough for the vast majority
714 // of files.
715 bool found_alpha = false;
716 bool found_non_printable = false;
717 bool found_other = false;
718 std::string value = t.getValue();
719 for (std::string::iterator iter = value.begin();
720 iter != value.end(); ++iter)
721 {
722 signed char ch = *iter;
723 if (((ch >= 'a') && (ch <= 'z')) ||
724 ((ch >= 'A') && (ch <= 'Z')) ||
725 (ch == '*'))
726 {
727 // Treat '*' as alpha since there are valid
728 // PDF operators that contain * along with
729 // alphabetic characters.
730 found_alpha = true;
731 }
732 else if ((ch < 32) && (! isSpace(ch)))
733 {
734 // ch is signed, so characters outside of
735 // 7-bit will be < 0.
736 found_non_printable = true;
737 break;
738 }
739 else
740 {
741 found_other = true;
742 }
743 }
744 if (found_non_printable || (found_alpha && found_other))
745 {
746 found_bad = true;
747 }
748 }
749 if (okay || found_bad)
750 {
751 break;
752 }
753 }
754 if (! found_bad)
755 {
756 okay = true;
757 }
758 if (! okay)
759 {
760 first_try = false;
761 }
762 }
763 if (okay && (! first_try))
764 {
765 QTC::TC("qpdf", "QPDFTokenizer found EI after more than one try");
766 }
767
768 input->seek(pos, SEEK_SET);
769 input->setLastOffset(last_offset);
770 }
771
772 bool
getToken(Token & token,bool & unread_char,char & ch)773 QPDFTokenizer::getToken(Token& token, bool& unread_char, char& ch)
774 {
775 bool ready = (this->m->state == st_token_ready);
776 unread_char = this->m->unread_char;
777 ch = this->m->char_to_unread;
778 if (ready)
779 {
780 if (this->m->type == tt_bad)
781 {
782 this->m->val = this->m->raw_val;
783 }
784 token = Token(this->m->type, this->m->val,
785 this->m->raw_val, this->m->error_message);
786 this->m->reset();
787 }
788 return ready;
789 }
790
791 bool
betweenTokens()792 QPDFTokenizer::betweenTokens()
793 {
794 return ((this->m->state == st_top) ||
795 ((! this->m->include_ignorable) &&
796 ((this->m->state == st_in_comment) ||
797 (this->m->state == st_in_space))));
798 }
799
800 QPDFTokenizer::Token
readToken(PointerHolder<InputSource> input,std::string const & context,bool allow_bad,size_t max_len)801 QPDFTokenizer::readToken(PointerHolder<InputSource> input,
802 std::string const& context,
803 bool allow_bad,
804 size_t max_len)
805 {
806 qpdf_offset_t offset = input->tell();
807 Token token;
808 bool unread_char;
809 char char_to_unread;
810 bool presented_eof = false;
811 while (! getToken(token, unread_char, char_to_unread))
812 {
813 char ch;
814 if (input->read(&ch, 1) == 0)
815 {
816 if (! presented_eof)
817 {
818 presentEOF();
819 presented_eof = true;
820 if ((this->m->type == tt_eof) && (! this->m->allow_eof))
821 {
822 // Nothing in the qpdf library calls readToken
823 // without allowEOF anymore, so this case is not
824 // exercised.
825 this->m->type = tt_bad;
826 this->m->error_message = "unexpected EOF";
827 offset = input->getLastOffset();
828 }
829 }
830 else
831 {
832 throw std::logic_error(
833 "getToken returned false after presenting EOF");
834 }
835 }
836 else
837 {
838 presentCharacter(ch);
839 if (betweenTokens() && (input->getLastOffset() == offset))
840 {
841 ++offset;
842 }
843 if (max_len && (this->m->raw_val.length() >= max_len) &&
844 (this->m->state != st_token_ready))
845 {
846 // terminate this token now
847 QTC::TC("qpdf", "QPDFTokenizer block long token");
848 this->m->type = tt_bad;
849 this->m->state = st_token_ready;
850 this->m->error_message =
851 "exceeded allowable length while reading token";
852 }
853 }
854 }
855
856 if (unread_char)
857 {
858 input->unreadCh(char_to_unread);
859 }
860
861 if (token.getType() != tt_eof)
862 {
863 input->setLastOffset(offset);
864 }
865
866 if (token.getType() == tt_bad)
867 {
868 if (allow_bad)
869 {
870 QTC::TC("qpdf", "QPDFTokenizer allowing bad token");
871 }
872 else
873 {
874 throw QPDFExc(qpdf_e_damaged_pdf, input->getName(),
875 context, offset, token.getErrorMessage());
876 }
877 }
878
879 return token;
880 }
881