1 // Copyright (c) 2005-2021 Jay Berkenbilt 2 // 3 // This file is part of qpdf. 4 // 5 // Licensed under the Apache License, Version 2.0 (the "License"); 6 // you may not use this file except in compliance with the License. 7 // You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 // 17 // Versions of qpdf prior to version 7 were released under the terms 18 // of version 2.0 of the Artistic License. At your option, you may 19 // continue to consider qpdf to be licensed under those terms. Please 20 // see the manual for additional information. 21 22 #ifndef QPDFTOKENIZER_HH 23 #define QPDFTOKENIZER_HH 24 25 #include <qpdf/DLL.h> 26 27 #include <qpdf/InputSource.hh> 28 #include <qpdf/PointerHolder.hh> 29 #include <string> 30 #include <stdio.h> 31 32 class QPDFTokenizer 33 { 34 public: 35 // Token type tt_eof is only returned of allowEOF() is called on 36 // the tokenizer. tt_eof was introduced in QPDF version 4.1. 37 // tt_space, tt_comment, and tt_inline_image were added in QPDF 38 // version 8. 39 enum token_type_e 40 { 41 tt_bad, 42 tt_array_close, 43 tt_array_open, 44 tt_brace_close, 45 tt_brace_open, 46 tt_dict_close, 47 tt_dict_open, 48 tt_integer, 49 tt_name, 50 tt_real, 51 tt_string, 52 tt_null, 53 tt_bool, 54 tt_word, 55 tt_eof, 56 tt_space, 57 tt_comment, 58 tt_inline_image, 59 }; 60 61 class Token 62 { 63 public: Token()64 Token() : type(tt_bad) {} 65 QPDF_DLL 66 Token(token_type_e type, std::string const& value); Token(token_type_e type,std::string const & value,std::string raw_value,std::string error_message)67 Token(token_type_e type, std::string const& value, 68 std::string raw_value, std::string error_message) : 69 type(type), 70 value(value), 71 raw_value(raw_value), 72 error_message(error_message) 73 { 74 } getType() const75 token_type_e getType() const 76 { 77 return this->type; 78 } getValue() const79 std::string const& getValue() const 80 { 81 return this->value; 82 } getRawValue() const83 std::string const& getRawValue() const 84 { 85 return this->raw_value; 86 } getErrorMessage() const87 std::string const& getErrorMessage() const 88 { 89 return this->error_message; 90 } operator ==(Token const & rhs) const91 bool operator==(Token const& rhs) const 92 { 93 // Ignore fields other than type and value 94 return ((this->type != tt_bad) && 95 (this->type == rhs.type) && 96 (this->value == rhs.value)); 97 } 98 99 private: 100 token_type_e type; 101 std::string value; 102 std::string raw_value; 103 std::string error_message; 104 }; 105 106 QPDF_DLL 107 QPDFTokenizer(); 108 109 // If called, treat EOF as a separate token type instead of an 110 // error. This was introduced in QPDF 4.1 to facilitate 111 // tokenizing content streams. 112 QPDF_DLL 113 void allowEOF(); 114 115 // If called, readToken will return "ignorable" tokens for space 116 // and comments. This was added in QPDF 8. 117 QPDF_DLL 118 void includeIgnorable(); 119 120 // There are two modes of operation: push and pull. The pull 121 // method is easier but requires an input source. The push method 122 // is more complicated but can be used to tokenize a stream of 123 // incoming characters in a pipeline. 124 125 // Push mode: 126 127 // Keep presenting characters with presentCharacter() and 128 // presentEOF() and calling getToken() until getToken() returns 129 // true. When it does, be sure to check unread_ch and to unread ch 130 // if it is true. 131 132 // It these are called when a token is available, an exception 133 // will be thrown. 134 QPDF_DLL 135 void presentCharacter(char ch); 136 QPDF_DLL 137 void presentEOF(); 138 139 // If a token is available, return true and initialize token with 140 // the token, unread_char with whether or not we have to unread 141 // the last character, and if unread_char, ch with the character 142 // to unread. 143 QPDF_DLL 144 bool getToken(Token& token, bool& unread_char, char& ch); 145 146 // This function returns true of the current character is between 147 // tokens (i.e., white space that is not part of a string) or is 148 // part of a comment. A tokenizing filter can call this to 149 // determine whether to output the character. 150 QPDF_DLL 151 bool betweenTokens(); 152 153 // Pull mode: 154 155 // Read a token from an input source. Context describes the 156 // context in which the token is being read and is used in the 157 // exception thrown if there is an error. After a token is read, 158 // the position of the input source returned by input->tell() 159 // points to just after the token, and the input source's "last 160 // offset" as returned by input->getLastOffset() points to the 161 // beginning of the token. 162 QPDF_DLL 163 Token readToken(PointerHolder<InputSource> input, 164 std::string const& context, 165 bool allow_bad = false, 166 size_t max_len = 0); 167 168 // Calling this method puts the tokenizer in a state for reading 169 // inline images. You should call this method after reading the 170 // character following the ID operator. In that state, it will 171 // return all data up to BUT NOT INCLUDING the next EI token. 172 // After you call this method, the next call to readToken (or the 173 // token created next time getToken returns true) will either be 174 // tt_inline_image or tt_bad. This is the only way readToken 175 // returns a tt_inline_image token. 176 QPDF_DLL 177 void expectInlineImage(PointerHolder<InputSource> input); 178 179 private: 180 QPDFTokenizer(QPDFTokenizer const&) = delete; 181 QPDFTokenizer& operator=(QPDFTokenizer const&) = delete; 182 183 void resolveLiteral(); 184 bool isSpace(char); 185 bool isDelimiter(char); 186 void findEI(PointerHolder<InputSource> input); 187 188 enum state_e { 189 st_top, st_in_space, st_in_comment, st_in_string, st_lt, st_gt, 190 st_literal, st_in_hexstring, st_inline_image, st_token_ready 191 }; 192 193 class Members 194 { 195 friend class QPDFTokenizer; 196 197 public: 198 QPDF_DLL 199 ~Members(); 200 201 private: 202 Members(); 203 Members(Members const&); 204 void reset(); 205 206 // Lexer state 207 state_e state; 208 209 bool allow_eof; 210 bool include_ignorable; 211 212 // Current token accumulation 213 token_type_e type; 214 std::string val; 215 std::string raw_val; 216 std::string error_message; 217 bool unread_char; 218 char char_to_unread; 219 size_t inline_image_bytes; 220 221 // State for strings 222 int string_depth; 223 bool string_ignoring_newline; 224 char bs_num_register[4]; 225 bool last_char_was_bs; 226 bool last_char_was_cr; 227 }; 228 PointerHolder<Members> m; 229 }; 230 231 #endif // QPDFTOKENIZER_HH 232