1 // Copyright (c) 2005-2021 Jay Berkenbilt
2 //
3 // This file is part of qpdf.
4 //
5 // Licensed under the Apache License, Version 2.0 (the "License");
6 // you may not use this file except in compliance with the License.
7 // You may obtain a copy of the License at
8 //
9 //   http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
17 // Versions of qpdf prior to version 7 were released under the terms
18 // of version 2.0 of the Artistic License. At your option, you may
19 // continue to consider qpdf to be licensed under those terms. Please
20 // see the manual for additional information.
21 
22 #ifndef QPDFTOKENIZER_HH
23 #define QPDFTOKENIZER_HH
24 
#include <qpdf/DLL.h>

#include <qpdf/InputSource.hh>
#include <qpdf/PointerHolder.hh>
#include <stdio.h>
#include <string>
#include <utility>
31 
32 class QPDFTokenizer
33 {
34   public:
35     // Token type tt_eof is only returned of allowEOF() is called on
36     // the tokenizer. tt_eof was introduced in QPDF version 4.1.
37     // tt_space, tt_comment, and tt_inline_image were added in QPDF
38     // version 8.
39     enum token_type_e
40     {
41 	tt_bad,
42 	tt_array_close,
43 	tt_array_open,
44 	tt_brace_close,
45 	tt_brace_open,
46 	tt_dict_close,
47 	tt_dict_open,
48 	tt_integer,
49 	tt_name,
50 	tt_real,
51 	tt_string,
52 	tt_null,
53 	tt_bool,
54 	tt_word,
55         tt_eof,
56         tt_space,
57         tt_comment,
58         tt_inline_image,
59     };
60 
61     class Token
62     {
63       public:
Token()64 	Token() : type(tt_bad) {}
65         QPDF_DLL
66 	Token(token_type_e type, std::string const& value);
Token(token_type_e type,std::string const & value,std::string raw_value,std::string error_message)67 	Token(token_type_e type, std::string const& value,
68 	      std::string raw_value, std::string error_message) :
69 	    type(type),
70 	    value(value),
71 	    raw_value(raw_value),
72 	    error_message(error_message)
73 	{
74 	}
getType() const75 	token_type_e getType() const
76 	{
77 	    return this->type;
78 	}
getValue() const79 	std::string const& getValue() const
80 	{
81 	    return this->value;
82 	}
getRawValue() const83 	std::string const& getRawValue() const
84 	{
85 	    return this->raw_value;
86 	}
getErrorMessage() const87 	std::string const& getErrorMessage() const
88 	{
89 	    return this->error_message;
90 	}
operator ==(Token const & rhs) const91 	bool operator==(Token const& rhs) const
92 	{
93 	    // Ignore fields other than type and value
94 	    return ((this->type != tt_bad) &&
95 		    (this->type == rhs.type) &&
96 		    (this->value == rhs.value));
97 	}
98 
99       private:
100 	token_type_e type;
101 	std::string value;
102 	std::string raw_value;
103 	std::string error_message;
104     };
105 
106     QPDF_DLL
107     QPDFTokenizer();
108 
109     // If called, treat EOF as a separate token type instead of an
110     // error.  This was introduced in QPDF 4.1 to facilitate
111     // tokenizing content streams.
112     QPDF_DLL
113     void allowEOF();
114 
115     // If called, readToken will return "ignorable" tokens for space
116     // and comments. This was added in QPDF 8.
117     QPDF_DLL
118     void includeIgnorable();
119 
120     // There are two modes of operation: push and pull. The pull
121     // method is easier but requires an input source. The push method
122     // is more complicated but can be used to tokenize a stream of
123     // incoming characters in a pipeline.
124 
125     // Push mode:
126 
127     // Keep presenting characters with presentCharacter() and
128     // presentEOF() and calling getToken() until getToken() returns
129     // true. When it does, be sure to check unread_ch and to unread ch
130     // if it is true.
131 
132     // It these are called when a token is available, an exception
133     // will be thrown.
134     QPDF_DLL
135     void presentCharacter(char ch);
136     QPDF_DLL
137     void presentEOF();
138 
139     // If a token is available, return true and initialize token with
140     // the token, unread_char with whether or not we have to unread
141     // the last character, and if unread_char, ch with the character
142     // to unread.
143     QPDF_DLL
144     bool getToken(Token& token, bool& unread_char, char& ch);
145 
146     // This function returns true of the current character is between
147     // tokens (i.e., white space that is not part of a string) or is
148     // part of a comment.  A tokenizing filter can call this to
149     // determine whether to output the character.
150     QPDF_DLL
151     bool betweenTokens();
152 
153     // Pull mode:
154 
155     // Read a token from an input source. Context describes the
156     // context in which the token is being read and is used in the
157     // exception thrown if there is an error. After a token is read,
158     // the position of the input source returned by input->tell()
159     // points to just after the token, and the input source's "last
160     // offset" as returned by input->getLastOffset() points to the
161     // beginning of the token.
162     QPDF_DLL
163     Token readToken(PointerHolder<InputSource> input,
164                     std::string const& context,
165                     bool allow_bad = false,
166                     size_t max_len = 0);
167 
168     // Calling this method puts the tokenizer in a state for reading
169     // inline images. You should call this method after reading the
170     // character following the ID operator. In that state, it will
171     // return all data up to BUT NOT INCLUDING the next EI token.
172     // After you call this method, the next call to readToken (or the
173     // token created next time getToken returns true) will either be
174     // tt_inline_image or tt_bad. This is the only way readToken
175     // returns a tt_inline_image token.
176     QPDF_DLL
177     void expectInlineImage(PointerHolder<InputSource> input);
178 
179   private:
180     QPDFTokenizer(QPDFTokenizer const&) = delete;
181     QPDFTokenizer& operator=(QPDFTokenizer const&) = delete;
182 
183     void resolveLiteral();
184     bool isSpace(char);
185     bool isDelimiter(char);
186     void findEI(PointerHolder<InputSource> input);
187 
188     enum state_e {
189         st_top, st_in_space, st_in_comment, st_in_string, st_lt, st_gt,
190         st_literal, st_in_hexstring, st_inline_image, st_token_ready
191     };
192 
193     class Members
194     {
195         friend class QPDFTokenizer;
196 
197       public:
198         QPDF_DLL
199         ~Members();
200 
201       private:
202         Members();
203         Members(Members const&);
204         void reset();
205 
206         // Lexer state
207         state_e state;
208 
209         bool allow_eof;
210         bool include_ignorable;
211 
212         // Current token accumulation
213         token_type_e type;
214         std::string val;
215         std::string raw_val;
216         std::string error_message;
217         bool unread_char;
218         char char_to_unread;
219         size_t inline_image_bytes;
220 
221         // State for strings
222         int string_depth;
223         bool string_ignoring_newline;
224         char bs_num_register[4];
225         bool last_char_was_bs;
226         bool last_char_was_cr;
227     };
228     PointerHolder<Members> m;
229 };
230 
231 #endif // QPDFTOKENIZER_HH
232