1 /*
2  * Copyright 2006-2008 The FLWOR Foundation.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef ZORBA_JSON_PARSER_H
18 #define ZORBA_JSON_PARSER_H
19 
20 #include <zorba/config.h>
21 
22 #include <exception>
23 #include <iostream>
24 #include <stack>
25 #include <string>
26 
27 #include <zorba/internal/diagnostic.h>
28 
29 #include "zorbatypes/zstring.h"
30 
31 #include "cxx_util.h"
32 #include "unicode_util.h"
33 
34 namespace zorba {
35 namespace json {
36 
37 ///////////////////////////////////////////////////////////////////////////////
38 
// The source-location type used by the tokens, exceptions, and lexer declared
// below; it is an alias for internal::diagnostic::location.
typedef internal::diagnostic::location location;
40 
41 ///////////////////////////////////////////////////////////////////////////////
42 
/**
 * The kinds of JSON data.  Neither the lexer nor the parser implementation
 * relies on this enumeration; it is provided purely as a convenience.
 */
enum type {
  none = 0, // "not set" -- not to be confused with the JSON value "null"
  array,
  boolean,
  null,
  number,
  object,
  string
};
56 extern char const *const type_string_of[];
57 
58 inline std::ostream& operator<<( std::ostream &o, type t ) {
59   return o << type_string_of[ t ];
60 }
61 
62 /**
63  * A JSON %token.  Tokens have a type, location at which they were found, and
64  * sometimes a value.
65  *
66  * See: "RFC 4627: The application/json Media Type for JavaScript Object
67  * Notation (JSON)."
68  */
69 class token {
70   // see http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2007/n2333.html
71   struct pointer_conversion { int valid; };
72   typedef int pointer_conversion::*explicit_bool;
73 public:
74   typedef zstring value_type;
75 
76   /**
77    * The types of tokens in JSON.  The first 6 constants have values that
78    * correspond to the actual structural characters used by JSON; the rest were
79    * assigned non-standard, mnemonic values for convenience.
80    */
81   enum type {
82     none,
83     begin_array     = '[',
84     begin_object    = '{',
85     end_array       = ']',
86     end_object      = '}',
87     name_separator  = ':',
88     value_separator = ',',
89     string          = 'S',
90     number          = 'N',
91     json_false      = 'F',
92     json_null       = '0',
93     json_true       = 'T',
94   };
95 
96   /**
97    * Default constructor.
98    */
99   token();
100 
101   /**
102    * Clears this %token.
103    */
clear()104   void clear() {
105     type_ = none;
106     value_.clear();
107   }
108 
109   /**
110    * Gets the location at which this %token was found.
111    *
112    * @return Returns said location.
113    */
get_loc()114   location const& get_loc() const {
115     return loc_;
116   }
117 
118   /**
119    * Gets the type of this %token.
120    *
121    * @return Returns said type.
122    */
get_type()123   type get_type() const {
124     return type_;
125   }
126 
127   /**
128    * Gets the value of this %token, if any.  Only %token types string, number,
129    * false, null, and true have a value.
130    *
131    * @return Returns said value or the empty string.
132    */
get_value()133   value_type const& get_value() const {
134     return value_;
135   }
136 
137   /**
138    * Conversion to \c bool.
139    *
140    * @return Returns \c true only if this token's type is not \c none.
141    */
explicit_bool()142   operator explicit_bool() const {
143     return type_ ? &pointer_conversion::valid : nullptr;
144   }
145 
146 private:
147   location loc_;
148   type type_;
149   value_type value_;
150 
151   friend class lexer;
152 };
153 
/**
 * Maps a token's type to a JSON type.
 *
 * Note that the return type is the namespace-level json::type enumeration,
 * not token::type.
 *
 * @param tt The token::type to map.
 * @return Returns the corresponding JSON type or \c none if \a tt doesn't map.
 */
type map_type( token::type tt );
161 
/**
 * Emits the given token type to an ostream.  (This overload is for
 * token::type; there is a separate overload for the JSON \c type
 * enumeration.)
 *
 * @param o The ostream to emit to.
 * @param tt The token type to emit.
 * @return Returns \a o.
 */
std::ostream& operator<<( std::ostream &o, token::type tt );
170 
/**
 * Emits the given token to an ostream.  The exact output format is determined
 * by the out-of-line definition.
 *
 * @param o The ostream to emit to.
 * @param t The token to emit.
 * @return Returns \a o.
 */
std::ostream& operator<<( std::ostream &o, token const &t );
179 
180 /**
181  * Compares two tokens for equality.
182  *
183  * @param t1 The first token.
184  * @param t2 The second token.
185  * @return Returns \c true only if the two tokens' types and values are equal.
186  */
187 inline bool operator==( token const &t1, token const &t2 ) {
188   return t1.get_type() == t2.get_type() && t1.get_value() == t2.get_value();
189 }
190 
191 /**
192  * Compares a token's type to another type for equality.
193  *
194  * @param t The token whose type to compare.
195  * @param tt The type to compare to.
196  * @return Returns \c true only if the token's type equals \a tt.
197  */
198 inline bool operator==( token const &t, token::type tt ) {
199   return t.get_type() == tt;
200 }
201 
202 /**
203  * Compares a token's type to another type for equality.
204  *
205  * @param tt The type to compare.
206  * @param t The token whose type to compare to.
207  * @return Returns \c true only if \a tt equals the token's type.
208  */
209 inline bool operator==( token::type tt, token const &t ) {
210   return t == tt;
211 }
212 
213 /**
214  * Compares a token's value to a C string for equality.
215  *
216  * @param t The token whose value to compare.
217  * @param value The value to compare to.
218  * @return Returns \c true only if the token's value equals \a value.
219  */
220 inline bool operator==( token const &t, char const *value ) {
221   return t.get_value() == value;
222 }
223 
224 /**
225  * Compares a C string to a token's value for equality.
226  *
227  * @param value The value to compare.
228  * @param t The token whose value to compare to.
229  * @return Returns \c true only if \a value equals the token's value.
230  */
231 inline bool operator==( char const *value, token const &t ) {
232   return t == value;
233 }
234 
235 /**
236  * Compares two tokens for inequality.
237  *
238  * @param t1 The first token.
239  * @param t2 The second token.
240  * @return Returns \c true if either the two tokens' types or values are not
241  * equal.
242  */
243 inline bool operator!=( token const &t1, token const &t2 ) {
244   return !(t1 == t2);
245 }
246 
247 /**
248  * Compares a token's type to another type for inequality.
249  *
250  * @param t The token whose type to compare.
251  * @param tt The type to compare to.
252  * @return Returns \c true only if the token's type is not equal to \a tt.
253  */
254 inline bool operator!=( token const &t, token::type tt ) {
255   return !(t == tt);
256 }
257 
258 /**
259  * Compares a token's type to another type for inequality.
260  *
261  * @param tt The type to compare.
262  * @param t The token whose type to compare to.
263  * @return Returns \c true only if \a tt is not equal to the token's type.
264  */
265 inline bool operator!=( token::type tt, token const &t ) {
266   return !(tt == t);
267 }
268 
269 /**
270  * Compares a token's value to a C string for inequality.
271  *
272  * @param t The token whose value to compare.
273  * @param value The value to compare to.
274  * @return Returns \c true only if the token's value is not equal to \a value.
275  */
276 inline bool operator!=( token const &t, char const *value ) {
277   return !(t == value);
278 }
279 
280 /**
281  * Compares a token's value to a C string for inequality.
282  *
283  * @param value The value to compare.
284  * @param t The token whose value to compare to.
285  * @return Returns \c true only if \a value is not equal to the token's value.
286  */
287 inline bool operator!=( char const *value, token const &t ) {
288   return !(value == t);
289 }
290 
291 ///////////////////////////////////////////////////////////////////////////////
292 
/**
 * An %exception is the root of the JSON %exception hierarchy.  Its
 * constructor is protected, so only derived classes can be instantiated.
 */
class exception : public std::exception {
public:
  ~exception() throw();

  /**
   * Gets the location in the JSON source whence this exception was thrown.
   *
   * @return Returns said location.
   */
  location const& get_loc() const {
    return loc_;
  }

  // inherited from std::exception; defined out of line
  char const* what() const throw();

protected:
  /**
   * Constructs an %exception; callable by derived classes only.
   *
   * @param loc The location in the JSON source of the error.
   * @param message The error message.
   */
  exception( location const &loc, std::string const &message );

private:
  location loc_;
  std::string message_;   // presumably what what() reports -- confirm in .cpp
};
317 
/**
 * This exception is thrown when an illegal character is encountered in a JSON
 * data stream.
 */
class illegal_character : public exception {
public:
  /**
   * Constructs an %illegal_character exception.
   *
   * @param loc The location of the illegal character.
   * @param c The illegal character.
   */
  illegal_character( location const &loc, char c );
  ~illegal_character() throw();

  /**
   * Gets the illegal character.
   *
   * @return Returns said character.
   */
  char get_char() const {
    return c_;
  }

private:
  char c_;
};
339 
/**
 * This exception is thrown when an illegal Unicode code-point escape sequence
 * (\uHHHH) is encountered.
 */
class illegal_codepoint : public exception {
public:
  /**
   * Constructs an %illegal_codepoint exception.
   *
   * @param loc The location of the illegal code-point.
   * @param cp The illegal code-point, as a string.
   */
  illegal_codepoint( location const &loc, token::value_type const &cp );
  ~illegal_codepoint() throw();

  /**
   * Gets the illegal code-point.
   *
   * NOTE(review): unlike the other getters in this header, this one returns
   * a copy rather than a const reference.
   *
   * @return Returns said code-point.
   */
  token::value_type get_codepoint() const {
    return codepoint_;
  }

private:
  token::value_type codepoint_;
};
361 
/**
 * This exception is thrown when an illegal character follows a backslash
 * (escape) within a string literal.  The legal escape characters are:
 * ["/\bfnrtu].
 */
class illegal_escape : public exception {
public:
  /**
   * Constructs an %illegal_escape exception.
   *
   * @param loc The location of the illegal escape.
   * @param escape The illegal escape character.
   */
  illegal_escape( location const &loc, char escape );
  ~illegal_escape() throw();

  /**
   * Gets the illegal escape character.
   *
   * @return Returns said character.
   */
  char get_escape() const {
    return esc_;
  }

private:
  char esc_;
};
384 
385 /**
386  * This exception is thrown when a literal other than \c false, \c null, or
387  * \c true is encountered.
388  */
389 class illegal_literal : public exception {
390 public:
391   illegal_literal( location const &loc );
392   ~illegal_literal() throw();
393 };
394 
395 /**
396  * This exception is thrown when an illegal number is encountered.
397  */
398 class illegal_number : public exception {
399 public:
400   illegal_number( location const &loc );
401   ~illegal_number() throw();
402 };
403 
404 /**
405  * This exception is thrown when an unexpected token is encountered.
406  */
407 class unexpected_token : public exception {
408 public:
409   unexpected_token( token const &t );
410   ~unexpected_token() throw();
411 
412   /**
413    * Gets the unexpected token
414    *
415    * @return Returns said token.
416    */
get_token()417   token const& get_token() const {
418     return token_;
419   }
420 
421 private:
422   token token_;
423 };
424 
425 /**
426  * This exception is thrown when an EOF is encountered before a string's
427  * terminating quote.
428  */
429 class unterminated_string : public exception {
430 public:
431   unterminated_string( location const &loc );
432   ~unterminated_string() throw();
433 };
434 
435 ///////////////////////////////////////////////////////////////////////////////
436 
437 /**
438  * A %lexer extracts JSON tokens from an istream.
439  */
440 class lexer {
441 public:
442   typedef location::line_type line_type;
443   typedef location::column_type column_type;
444 
445   /**
446    * Constructs a %lexer on the given istream.
447    *
448    * @param in The istream to read from.
449    */
450   lexer( std::istream &in );
451 
452   /**
453    * Gets the next token, if any.
454    *
455    * @param result A pointer to the token to get into.
456    * @return Returns \c true only if there was a next token.
457    * @throws exception upon error.
458    */
459   bool next( token *result );
460 
461   /**
462    * Sets the file location.
463    *
464    * @param file The source file name.
465    * @param line The source line number.
466    * @param col  The source column number.
467    */
468   void set_loc( char const *file, line_type line, column_type col );
469 
470 private:
cur_loc()471   location cur_loc() const {
472     return location( file_, line_, col_ );
473   }
474 
475   bool get_char( char* = nullptr );
476   bool peek_char( char* );
477   unicode::code_point parse_codepoint();
478   token::type parse_literal( char, token::value_type* );
479   void parse_number( char, token::value_type* );
480   void parse_string( token::value_type* );
481 
482   std::istream *in_;
483   std::string file_;
484   line_type line_;
485   column_type col_;
486   location cur_loc_;
487 };
488 
489 ///////////////////////////////////////////////////////////////////////////////
490 
491 /**
492  * A %parser extracts JSON tokens from an istream while checking to ensure the
493  * token sequence is valid.
494  */
495 class parser {
496 public:
497   typedef lexer::line_type line_type;
498   typedef lexer::column_type column_type;
499 
500   /**
501    * Constructs a %parser on the given istream.
502    *
503    * @param in The istream to read from.
504    */
505   parser( std::istream &in );
506 
507   /**
508    * Gets the next token, if any.
509    *
510    * @param result A pointer to the token to receive the token.
511    * @return Returns \c true only if there was a next token.
512    * @throws exception upon error.
513    */
514   bool next( token *result );
515 
516   /**
517    * Peeks at the next token, if any.
518    *
519    * @param result A pointer to the token to receive the token, if any.
520    * @return Returns the type of the peeked token.
521    * @throws exception upon error.
522    */
523   token::type peek( token *result = nullptr );
524 
525   /**
526    * Sets the file location.
527    *
528    * @param file The source file name.
529    * @param line The source line number.
530    * @param col  The source column number.
531    */
set_loc(char const * file,line_type line,column_type col)532   void set_loc( char const *file, line_type line, column_type col ) {
533     lexer_.set_loc( file, line, col );
534   }
535 
536 private:
537   enum state {
538     A0, A1, A2, // Array
539     E0, E1,     // Element
540     J0, J1,     // JSON
541     M0, M1,     // Member
542     O0, O1, O2, // Object
543     P0, P1,     // Pair
544     V0          // Value
545   };
546 
547   friend std::ostream& operator<<( std::ostream&, state );
548 
549   bool get_token( token* );
550   bool get_token_debug( int, token* );
551   bool matches_token( token::type, token* );
552   bool matches_token_debug( int, token::type, token* );
553   token::type peek_token();
554   token::type peek_token_debug( int );
555   void require_token( token::type, token* );
556   void require_token_debug( int, token::type, token* );
557 
558   lexer lexer_;
559   token peeked_token_;
560   std::stack<state> state_stack_;
561   state state_;
562 };
563 
564 ///////////////////////////////////////////////////////////////////////////////
565 
566 } // namespace json
567 } // namespace zorba
568 
569 #endif /* ZORBA_JSON_PARSER_H */
570 /* vim:set et sw=2 ts=2: */
571