1 /* 2 * Copyright 2006-2008 The FLWOR Foundation. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef ZORBA_JSON_PARSER_H 18 #define ZORBA_JSON_PARSER_H 19 20 #include <zorba/config.h> 21 22 #include <exception> 23 #include <iostream> 24 #include <stack> 25 #include <string> 26 27 #include <zorba/internal/diagnostic.h> 28 29 #include "zorbatypes/zstring.h" 30 31 #include "cxx_util.h" 32 #include "unicode_util.h" 33 34 namespace zorba { 35 namespace json { 36 37 /////////////////////////////////////////////////////////////////////////////// 38 39 typedef internal::diagnostic::location location; 40 41 /////////////////////////////////////////////////////////////////////////////// 42 43 /** 44 * A JSON %type is the type of JSON data. This isn't used by the lexer or 45 * parser implementation at all, but it's handy. 46 */ 47 enum type { 48 none, // meaning "not set" as opposed to "null" 49 array, 50 boolean, 51 null, 52 number, 53 object, 54 string 55 }; 56 extern char const *const type_string_of[]; 57 58 inline std::ostream& operator<<( std::ostream &o, type t ) { 59 return o << type_string_of[ t ]; 60 } 61 62 /** 63 * A JSON %token. Tokens have a type, location at which they were found, and 64 * sometimes a value. 65 * 66 * See: "RFC 4627: The application/json Media Type for JavaScript Object 67 * Notation (JSON)." 68 */ 69 class token { 70 // see http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2007/n2333.html 71 struct pointer_conversion { int valid; }; 72 typedef int pointer_conversion::*explicit_bool; 73 public: 74 typedef zstring value_type; 75 76 /** 77 * The types of tokens in JSON. The first 6 constants have values that 78 * correspond to the actual structural characters used by JSON; the rest were 79 * assigned non-standard, mnemonic values for convenience. 80 */ 81 enum type { 82 none, 83 begin_array = '[', 84 begin_object = '{', 85 end_array = ']', 86 end_object = '}', 87 name_separator = ':', 88 value_separator = ',', 89 string = 'S', 90 number = 'N', 91 json_false = 'F', 92 json_null = '0', 93 json_true = 'T', 94 }; 95 96 /** 97 * Default constructor. 98 */ 99 token(); 100 101 /** 102 * Clears this %token. 103 */ clear()104 void clear() { 105 type_ = none; 106 value_.clear(); 107 } 108 109 /** 110 * Gets the location at which this %token was found. 111 * 112 * @return Returns said location. 113 */ get_loc()114 location const& get_loc() const { 115 return loc_; 116 } 117 118 /** 119 * Gets the type of this %token. 120 * 121 * @return Returns said type. 122 */ get_type()123 type get_type() const { 124 return type_; 125 } 126 127 /** 128 * Gets the value of this %token, if any. Only %token types string, number, 129 * false, null, and true have a value. 130 * 131 * @return Returns said value or the empty string. 132 */ get_value()133 value_type const& get_value() const { 134 return value_; 135 } 136 137 /** 138 * Conversion to \c bool. 139 * 140 * @return Returns \c true only if this token's type is not \c none. 141 */ explicit_bool()142 operator explicit_bool() const { 143 return type_ ? &pointer_conversion::valid : nullptr; 144 } 145 146 private: 147 location loc_; 148 type type_; 149 value_type value_; 150 151 friend class lexer; 152 }; 153 154 /** 155 * Map a token's type to a JSON type. 156 * 157 * @param tt The token::type to map. 158 * @return Returns the corresponding JSON type or \c none if \a tt doesn't map. 159 */ 160 type map_type( token::type tt ); 161 162 /** 163 * Emits the given token type to an ostream. 164 * 165 * @param o The ostream to emit to. 166 * @param tt The token type to emit. 167 * @return Returns \a o. 168 */ 169 std::ostream& operator<<( std::ostream &o, token::type tt ); 170 171 /** 172 * Emits the given token to an ostream. 173 * 174 * @param o The ostream to emit to. 175 * @param t The token to emit. 176 * @return Returns \a o. 177 */ 178 std::ostream& operator<<( std::ostream &o, token const &t ); 179 180 /** 181 * Compares two tokens for equality. 182 * 183 * @param t1 The first token. 184 * @param t2 The second token. 185 * @return Returns \c true only if the two tokens' types and values are equal. 186 */ 187 inline bool operator==( token const &t1, token const &t2 ) { 188 return t1.get_type() == t2.get_type() && t1.get_value() == t2.get_value(); 189 } 190 191 /** 192 * Compares a token's type to another type for equality. 193 * 194 * @param t The token whose type to compare. 195 * @param tt The type to compare to. 196 * @return Returns \c true only if the token's type equals \a tt. 197 */ 198 inline bool operator==( token const &t, token::type tt ) { 199 return t.get_type() == tt; 200 } 201 202 /** 203 * Compares a token's type to another type for equality. 204 * 205 * @param tt The type to compare. 206 * @param t The token whose type to compare to. 207 * @return Returns \c true only if \a tt equals the token's type. 208 */ 209 inline bool operator==( token::type tt, token const &t ) { 210 return t == tt; 211 } 212 213 /** 214 * Compares a token's value to a C string for equality. 215 * 216 * @param t The token whose value to compare. 217 * @param value The value to compare to. 218 * @return Returns \c true only if the token's value equals \a value. 219 */ 220 inline bool operator==( token const &t, char const *value ) { 221 return t.get_value() == value; 222 } 223 224 /** 225 * Compares a C string to a token's value for equality. 226 * 227 * @param value The value to compare. 228 * @param t The token whose value to compare to. 229 * @return Returns \c true only if \a value equals the token's value. 230 */ 231 inline bool operator==( char const *value, token const &t ) { 232 return t == value; 233 } 234 235 /** 236 * Compares two tokens for inequality. 237 * 238 * @param t1 The first token. 239 * @param t2 The second token. 240 * @return Returns \c true if either the two tokens' types or values are not 241 * equal. 242 */ 243 inline bool operator!=( token const &t1, token const &t2 ) { 244 return !(t1 == t2); 245 } 246 247 /** 248 * Compares a token's type to another type for inequality. 249 * 250 * @param t The token whose type to compare. 251 * @param tt The type to compare to. 252 * @return Returns \c true only if the token's type is not equal to \a tt. 253 */ 254 inline bool operator!=( token const &t, token::type tt ) { 255 return !(t == tt); 256 } 257 258 /** 259 * Compares a token's type to another type for inequality. 260 * 261 * @param tt The type to compare. 262 * @param t The token whose type to compare to. 263 * @return Returns \c true only if \a tt is not equal to the token's type. 264 */ 265 inline bool operator!=( token::type tt, token const &t ) { 266 return !(tt == t); 267 } 268 269 /** 270 * Compares a token's value to a C string for inequality. 271 * 272 * @param t The token whose value to compare. 273 * @param value The value to compare to. 274 * @return Returns \c true only if the token's value is not equal to \a value. 275 */ 276 inline bool operator!=( token const &t, char const *value ) { 277 return !(t == value); 278 } 279 280 /** 281 * Compares a token's value to a C string for inequality. 282 * 283 * @param value The value to compare. 284 * @param t The token whose value to compare to. 285 * @return Returns \c true only if \a value is not equal to the token's value. 286 */ 287 inline bool operator!=( char const *value, token const &t ) { 288 return !(value == t); 289 } 290 291 /////////////////////////////////////////////////////////////////////////////// 292 293 /** 294 * An %exception is the root of the JSON %exception hierarchy. 295 */ 296 class exception : public std::exception { 297 public: 298 ~exception() throw(); 299 300 /** 301 * Gets the location in the JSON source whence this exception was thrown. 302 */ get_loc()303 location const& get_loc() const { 304 return loc_; 305 } 306 307 // inherited 308 char const* what() const throw(); 309 310 protected: 311 exception( location const &loc, std::string const &message ); 312 313 private: 314 location loc_; 315 std::string message_; 316 }; 317 318 /** 319 * This exception is thrown when an illegal character is encountered in a JSON 320 * data stream. 321 */ 322 class illegal_character : public exception { 323 public: 324 illegal_character( location const &loc, char c ); 325 ~illegal_character() throw(); 326 327 /** 328 * Gets the illegal character. 329 * 330 * @return Returns said character. 331 */ get_char()332 char get_char() const { 333 return c_; 334 } 335 336 private: 337 char c_; 338 }; 339 340 /** 341 * This exception is thrown when an illegal Unicode code-point escape sequence 342 * (\uHHHH) is encountered. 343 */ 344 class illegal_codepoint : public exception { 345 public: 346 illegal_codepoint( location const &loc, token::value_type const &cp ); 347 ~illegal_codepoint() throw(); 348 349 /** 350 * Gets the illegal code-point. 351 * 352 * @return Returns said code-point. 353 */ get_codepoint()354 token::value_type get_codepoint() const { 355 return codepoint_; 356 } 357 358 private: 359 token::value_type codepoint_; 360 }; 361 362 /** 363 * This exception is thrown when an illegal character follows a backslash 364 * (escape) within a string literal. The legal escape characters are: 365 * ["/\bfnrtu]. 366 */ 367 class illegal_escape : public exception { 368 public: 369 illegal_escape( location const &loc, char escape ); 370 ~illegal_escape() throw(); 371 372 /** 373 * Gets the illegal escape character. 374 * 375 * @return Returns said character. 376 */ get_escape()377 char get_escape() const { 378 return esc_; 379 } 380 381 private: 382 char esc_; 383 }; 384 385 /** 386 * This exception is thrown when a literal other than \c false, \c null, or 387 * \c true is encountered. 388 */ 389 class illegal_literal : public exception { 390 public: 391 illegal_literal( location const &loc ); 392 ~illegal_literal() throw(); 393 }; 394 395 /** 396 * This exception is thrown when an illegal number is encountered. 397 */ 398 class illegal_number : public exception { 399 public: 400 illegal_number( location const &loc ); 401 ~illegal_number() throw(); 402 }; 403 404 /** 405 * This exception is thrown when an unexpected token is encountered. 406 */ 407 class unexpected_token : public exception { 408 public: 409 unexpected_token( token const &t ); 410 ~unexpected_token() throw(); 411 412 /** 413 * Gets the unexpected token 414 * 415 * @return Returns said token. 416 */ get_token()417 token const& get_token() const { 418 return token_; 419 } 420 421 private: 422 token token_; 423 }; 424 425 /** 426 * This exception is thrown when an EOF is encountered before a string's 427 * terminating quote. 428 */ 429 class unterminated_string : public exception { 430 public: 431 unterminated_string( location const &loc ); 432 ~unterminated_string() throw(); 433 }; 434 435 /////////////////////////////////////////////////////////////////////////////// 436 437 /** 438 * A %lexer extracts JSON tokens from an istream. 439 */ 440 class lexer { 441 public: 442 typedef location::line_type line_type; 443 typedef location::column_type column_type; 444 445 /** 446 * Constructs a %lexer on the given istream. 447 * 448 * @param in The istream to read from. 449 */ 450 lexer( std::istream &in ); 451 452 /** 453 * Gets the next token, if any. 454 * 455 * @param result A pointer to the token to get into. 456 * @return Returns \c true only if there was a next token. 457 * @throws exception upon error. 458 */ 459 bool next( token *result ); 460 461 /** 462 * Sets the file location. 463 * 464 * @param file The source file name. 465 * @param line The source line number. 466 * @param col The source column number. 467 */ 468 void set_loc( char const *file, line_type line, column_type col ); 469 470 private: cur_loc()471 location cur_loc() const { 472 return location( file_, line_, col_ ); 473 } 474 475 bool get_char( char* = nullptr ); 476 bool peek_char( char* ); 477 unicode::code_point parse_codepoint(); 478 token::type parse_literal( char, token::value_type* ); 479 void parse_number( char, token::value_type* ); 480 void parse_string( token::value_type* ); 481 482 std::istream *in_; 483 std::string file_; 484 line_type line_; 485 column_type col_; 486 location cur_loc_; 487 }; 488 489 /////////////////////////////////////////////////////////////////////////////// 490 491 /** 492 * A %parser extracts JSON tokens from an istream while checking to ensure the 493 * token sequence is valid. 494 */ 495 class parser { 496 public: 497 typedef lexer::line_type line_type; 498 typedef lexer::column_type column_type; 499 500 /** 501 * Constructs a %parser on the given istream. 502 * 503 * @param in The istream to read from. 504 */ 505 parser( std::istream &in ); 506 507 /** 508 * Gets the next token, if any. 509 * 510 * @param result A pointer to the token to receive the token. 511 * @return Returns \c true only if there was a next token. 512 * @throws exception upon error. 513 */ 514 bool next( token *result ); 515 516 /** 517 * Peeks at the next token, if any. 518 * 519 * @param result A pointer to the token to receive the token, if any. 520 * @return Returns the type of the peeked token. 521 * @throws exception upon error. 522 */ 523 token::type peek( token *result = nullptr ); 524 525 /** 526 * Sets the file location. 527 * 528 * @param file The source file name. 529 * @param line The source line number. 530 * @param col The source column number. 531 */ set_loc(char const * file,line_type line,column_type col)532 void set_loc( char const *file, line_type line, column_type col ) { 533 lexer_.set_loc( file, line, col ); 534 } 535 536 private: 537 enum state { 538 A0, A1, A2, // Array 539 E0, E1, // Element 540 J0, J1, // JSON 541 M0, M1, // Member 542 O0, O1, O2, // Object 543 P0, P1, // Pair 544 V0 // Value 545 }; 546 547 friend std::ostream& operator<<( std::ostream&, state ); 548 549 bool get_token( token* ); 550 bool get_token_debug( int, token* ); 551 bool matches_token( token::type, token* ); 552 bool matches_token_debug( int, token::type, token* ); 553 token::type peek_token(); 554 token::type peek_token_debug( int ); 555 void require_token( token::type, token* ); 556 void require_token_debug( int, token::type, token* ); 557 558 lexer lexer_; 559 token peeked_token_; 560 std::stack<state> state_stack_; 561 state state_; 562 }; 563 564 /////////////////////////////////////////////////////////////////////////////// 565 566 } // namespace json 567 } // namespace zorba 568 569 #endif /* ZORBA_JSON_PARSER_H */ 570 /* vim:set et sw=2 ts=2: */ 571