1 /* Copyright (c) 2015, 2021, Oracle and/or its affiliates.
2 
3    This program is free software; you can redistribute it and/or modify
4    it under the terms of the GNU General Public License, version 2.0,
5    as published by the Free Software Foundation.
6 
7    This program is also distributed with certain software (including
8    but not limited to OpenSSL) that is licensed under separate terms,
9    as designated in a particular file or component or in included license
10    documentation.  The authors of MySQL hereby grant you an additional
11    permission to link the program and your derivative works with the
12    separately licensed software that they have included with MySQL.
13 
14    This program is distributed in the hope that it will be useful,
15    but WITHOUT ANY WARRANTY; without even the implied warranty of
16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17    GNU General Public License, version 2.0, for more details.
18 
19    You should have received a copy of the GNU General Public License
20    along with this program; if not, write to the Free Software
21    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
22 
23 /*
24   @file json_path.cc
25 
26   This file contains implementation support for the JSON path abstraction.
27   The path abstraction is described by the functional spec
28   attached to WL#7909.
29  */
30 
31 #include "json_path.h"
32 
33 #include "json_dom.h"
34 #include "mysqld.h"                             // key_memory_JSON
35 #include "rapidjson/rapidjson.h"                // rapidjson::UTF8<char>::Decode
36 #include "rapidjson/memorystream.h"             // rapidjson::MemoryStream
37 #include "sql_const.h"                          // STRING_BUFFER_USUAL_SIZE
38 #include "sql_string.h"                         // String
39 #include "template_utils.h"                     // down_cast
40 
41 #include <m_ctype.h>
42 
43 #include <cwctype>
44 #include <memory>                               // auto_ptr
45 #include <string>
46 
/*
  Helpers for the parsing methods of Json_path.

  PARSER_RETURN expects a "bool *status" and a "const char *charptr" to
  be in scope where it is used: it stores the outcome in *status and
  returns the current parse position from the enclosing function.
*/
#define PARSER_RETURN(retval) { *status= (retval); return charptr; }

// Characters that have a special meaning in a path expression.
#define SCOPE '$'
#define BEGIN_MEMBER '.'
#define BEGIN_ARRAY '['
#define END_ARRAY ']'
#define DOUBLE_QUOTE '"'
#define WILDCARD '*'
#define PRINTABLE_SPACE ' '

// Defined further down in this file.
bool is_digit(unsigned codepoint);
bool is_ecmascript_identifier(const char *name, size_t name_length);
59 
60 // Json_path_leg
61 
get_type() const62 enum_json_path_leg_type Json_path_leg::get_type() const
63 {
64   return m_leg_type;
65 }
66 
get_member_name_length() const67 size_t Json_path_leg::get_member_name_length() const
68 {
69   return m_member_name.size();
70 }
71 
get_member_name() const72 const char *Json_path_leg::get_member_name() const
73 {
74   return m_member_name.data();
75 }
76 
get_array_cell_index() const77 size_t Json_path_leg::get_array_cell_index() const
78 {
79   return m_array_cell_index;
80 }
81 
to_string(String * buf) const82 bool Json_path_leg::to_string(String *buf) const
83 {
84   switch(m_leg_type)
85   {
86   case jpl_member:
87     return buf->append(BEGIN_MEMBER) ||
88       (is_ecmascript_identifier(get_member_name(),
89                                 get_member_name_length()) ?
90        buf->append(get_member_name(), get_member_name_length()) :
91        double_quote(get_member_name(), get_member_name_length(), buf));
92   case jpl_array_cell:
93     return buf->append(BEGIN_ARRAY) ||
94       buf->append_ulonglong(m_array_cell_index) ||
95       buf->append(END_ARRAY);
96   case jpl_member_wildcard:
97     return buf->append(BEGIN_MEMBER) || buf->append(WILDCARD);
98   case jpl_array_cell_wildcard:
99     return buf->append(BEGIN_ARRAY) || buf->append(WILDCARD) ||
100       buf->append(END_ARRAY);
101   case jpl_ellipsis:
102     return buf->append(WILDCARD) || buf->append(WILDCARD);
103   }
104 
105   // Unknown leg type.
106   DBUG_ABORT();                                 /* purecov: inspected */
107   return true;                                  /* purecov: inspected */
108 }
109 
110 // Json_path_clone
111 
Json_path_clone()112 Json_path_clone::Json_path_clone()
113   : m_path_legs(key_memory_JSON)
114 {}
115 
116 
~Json_path_clone()117 Json_path_clone::~Json_path_clone()
118 {
119   clear();
120 }
121 
122 
leg_count() const123 size_t Json_path_clone::leg_count() const { return m_path_legs.size(); }
124 
125 
get_leg_at(const size_t index) const126 const Json_path_leg *Json_path_clone::get_leg_at(const size_t index) const
127 {
128   if (index >= m_path_legs.size())
129   {
130     return NULL;
131   }
132 
133   return m_path_legs.at(index);
134 }
135 
136 
append(const Json_path_leg * leg)137 bool Json_path_clone::append(const Json_path_leg *leg)
138 {
139   return m_path_legs.push_back(leg);
140 }
141 
142 
set(Json_seekable_path * source)143 bool Json_path_clone::set(Json_seekable_path *source)
144 {
145   clear();
146 
147   size_t legcount= source->leg_count();
148   for (size_t idx= 0; idx < legcount; idx++)
149   {
150     Json_path_leg *path_leg= (Json_path_leg *) source->get_leg_at(idx);
151     if (append(path_leg))
152     {
153       return true;
154     }
155   }
156 
157   return false;
158 }
159 
160 
pop()161 const Json_path_leg *Json_path_clone::pop()
162 {
163   assert(m_path_legs.size() > 0);
164   const Json_path_leg *p= m_path_legs.back();
165   m_path_legs.pop_back();
166   return p;
167 }
168 
169 
clear()170 void Json_path_clone::clear()
171 {
172   m_path_legs.clear();
173 }
174 
175 
contains_ellipsis() const176 bool Json_path_clone::contains_ellipsis() const
177 {
178   for (Path_leg_pointers::const_iterator iter= m_path_legs.begin();
179        iter != m_path_legs.end(); ++iter)
180   {
181     const Json_path_leg *path_leg= *iter;
182     if (path_leg->get_type() == jpl_ellipsis)
183       return true;
184   }
185 
186   return false;
187 }
188 
189 
190 // Json_path
191 
Json_path()192 Json_path::Json_path()
193   : m_path_legs(key_memory_JSON)
194 {}
195 
196 
~Json_path()197 Json_path::~Json_path()
198 {
199   m_path_legs.clear();
200 }
201 
202 
leg_count() const203 size_t Json_path::leg_count() const { return m_path_legs.size(); }
204 
205 
get_leg_at(const size_t index) const206 const Json_path_leg *Json_path::get_leg_at(const size_t index) const
207 {
208   if (index >= m_path_legs.size())
209   {
210     return NULL;
211   }
212 
213   return &m_path_legs.at(index);
214 }
215 
216 
append(const Json_path_leg & leg)217 bool Json_path::append(const Json_path_leg &leg)
218 {
219   return m_path_legs.push_back(leg);
220 }
221 
pop()222 Json_path_leg Json_path::pop()
223 {
224   assert(m_path_legs.size() > 0);
225   Json_path_leg p= m_path_legs.back();
226   m_path_legs.pop_back();
227   return p;
228 }
229 
clear()230 void Json_path::clear()
231 {
232   m_path_legs.clear();
233 }
234 
to_string(String * buf) const235 bool Json_path::to_string(String *buf) const
236 {
237   /*
238     3-part scope prefixes are not needed by wl7909.
239     There is no way to test them at the SQL level right now
240     since they would raise errors in all possible use-cases.
241     Support for them can be added in some follow-on worklog
242     which actually needs them.
243 
244     This is where we would put pretty-printing support
245     for 3-part scope prefixes.
246   */
247 
248   if (buf->append(SCOPE))
249     return true;
250 
251   for (Path_leg_vector::const_iterator iter= m_path_legs.begin();
252        iter != m_path_legs.end(); ++iter)
253   {
254     if (iter->to_string(buf))
255       return true;
256   }
257 
258   return false;
259 }
260 
261 
is_wildcard_or_ellipsis(const Json_path_leg & leg)262 static inline bool is_wildcard_or_ellipsis(const Json_path_leg &leg)
263 {
264   switch (leg.get_type())
265   {
266   case jpl_member_wildcard:
267   case jpl_array_cell_wildcard:
268   case jpl_ellipsis:
269     return true;
270   default:
271     return false;
272   }
273 }
274 
275 
contains_wildcard_or_ellipsis() const276 bool Json_path::contains_wildcard_or_ellipsis() const
277 {
278   return std::find_if(m_path_legs.begin(), m_path_legs.end(),
279                       is_wildcard_or_ellipsis) != m_path_legs.end();
280 }
281 
282 
is_ellipsis(const Json_path_leg & leg)283 static inline bool is_ellipsis(const Json_path_leg &leg)
284 {
285   return leg.get_type() == jpl_ellipsis;
286 }
287 
288 
contains_ellipsis() const289 bool Json_path::contains_ellipsis() const
290 {
291   return std::find_if(m_path_legs.begin(), m_path_legs.end(),
292                       is_ellipsis) != m_path_legs.end();
293 }
294 
295 
296 // Json_path parsing
297 
initialize()298 void Json_path::initialize()
299 {
300   m_path_legs.clear();
301 }
302 
303 /** Top level parsing factory method */
parse_path(const bool begins_with_column_id,const size_t path_length,const char * path_expression,Json_path * path,size_t * bad_index)304 bool parse_path(const bool begins_with_column_id, const size_t path_length,
305                 const char *path_expression, Json_path *path, size_t *bad_index)
306 {
307   bool  status= false;
308 
309   const char *end_of_parsed_path=
310     path->parse_path(begins_with_column_id, path_length, path_expression,
311                      &status);
312 
313   if (status)
314   {
315     *bad_index= 0;
316     return false;
317   }
318 
319   *bad_index= end_of_parsed_path - path_expression;
320   return true;
321 }
322 
323 
324 /**
325   Purge leading whitespace in a string.
326   @param[in] str  the string to purge whitespace from
327   @param[in] end  the end of the input string
328   @return pointer to the first non-whitespace character in str
329 */
purge_whitespace(const char * str,const char * end)330 static inline const char *purge_whitespace(const char *str, const char *end)
331 {
332   while (str < end && my_isspace(&my_charset_utf8mb4_bin, *str))
333     ++str;
334   return str;
335 }
336 
337 
parse_path(const bool begins_with_column_id,const size_t path_length,const char * path_expression,bool * status)338 const char *Json_path::parse_path(const bool begins_with_column_id,
339                                   const size_t path_length,
340                                   const char *path_expression,
341                                   bool *status)
342 {
343   initialize();
344 
345   const char *charptr= path_expression;
346   const char *endptr= path_expression + path_length;
347 
348   if (begins_with_column_id)
349   {
350     /*
351       3-part scope prefixes are not needed by wl7909.
352       There is no way to test them at the SQL level right now
353       since they would raise errors in all possible use-cases.
354       Support for them can be added in some follow-on worklog
355       which actually needs them.
356 
357       This is where we would add parsing support
358       for 3-part scope prefixes.
359     */
360 
361     // not supported yet
362     PARSER_RETURN(false);
363   }
364   else
365   {
366     // the first non-whitespace character must be $
367     charptr= purge_whitespace(charptr, endptr);
368     if ((charptr >= endptr) || (*charptr++ != SCOPE))
369       PARSER_RETURN(false);
370   }
371 
372   // now add the legs
373   *status= true;
374   while (*status)
375   {
376     charptr= purge_whitespace(charptr, endptr);
377     if (charptr >= endptr)
378       break;                                    // input exhausted
379 
380     charptr= parse_path_leg(charptr, endptr, status);
381   }
382 
383   // a path may not end with an ellipsis
384   if (m_path_legs.size() > 0 && is_ellipsis(m_path_legs.back()))
385   {
386     *status= false;
387   }
388 
389   return charptr;
390 }
391 
392 
parse_path_leg(const char * charptr,const char * endptr,bool * status)393 const char *Json_path::parse_path_leg(const char *charptr,
394                                       const char *endptr,
395                                       bool *status)
396 {
397   switch (*charptr)
398   {
399   case BEGIN_ARRAY:
400     return parse_array_leg(charptr, endptr, status);
401   case BEGIN_MEMBER:
402     return parse_member_leg(charptr, endptr, status);
403   case WILDCARD:
404     return parse_ellipsis_leg(charptr, endptr, status);
405   default:
406     PARSER_RETURN(false);
407   }
408 }
409 
410 
parse_ellipsis_leg(const char * charptr,const char * endptr,bool * status)411 const char *Json_path::parse_ellipsis_leg(const char *charptr,
412                                           const char *endptr,
413                                           bool *status)
414 {
415   // assume the worst
416   *status= false;
417 
418   // advance past the first *
419   charptr++;
420 
421   // must be followed by a second *
422   if ((charptr >= endptr) || (*charptr++ != WILDCARD))
423   {
424     PARSER_RETURN(false);
425   }
426 
427   // may not be the last leg
428   if (charptr >= endptr)
429   {
430     PARSER_RETURN(false);
431   }
432 
433   // forbid the hard-to-read *** combination
434   if (*charptr == WILDCARD)
435   {
436     PARSER_RETURN(false);
437   }
438 
439   PARSER_RETURN(!append(Json_path_leg(jpl_ellipsis)));
440 }
441 
442 
parse_array_leg(const char * charptr,const char * endptr,bool * status)443 const char *Json_path::parse_array_leg(const char *charptr,
444                                        const char *endptr,
445                                        bool *status)
446 {
447   // assume the worst
448   *status= false;
449 
450   // advance past the [
451   charptr++;
452 
453   charptr= purge_whitespace(charptr, endptr);
454   if (charptr >= endptr)
455     PARSER_RETURN(false);                       // input exhausted
456 
457   if (*charptr == WILDCARD)
458   {
459     charptr++;
460 
461     if (append(Json_path_leg(jpl_array_cell_wildcard)))
462       PARSER_RETURN(false);                   /* purecov: inspected */
463   }
464   else
465   {
466     // Not a WILDCARD. Must be an array index.
467     const char *number_start= charptr;
468 
469     while ((charptr < endptr) && is_digit(*charptr))
470     {
471       charptr++;
472     }
473     if (charptr == number_start)
474     {
475       PARSER_RETURN(false);
476     }
477 
478     int dummy_err;
479     longlong cell_index= my_strntoll(&my_charset_utf8mb4_bin, number_start,
480                                      charptr - number_start, 10,
481                                      (char**) 0, &dummy_err);
482 
483     if (dummy_err != 0)
484     {
485       PARSER_RETURN(false);
486     }
487 
488     if (append(Json_path_leg(static_cast<size_t>(cell_index))))
489       PARSER_RETURN(false);                   /* purecov: inspected */
490   }
491 
492   // the next non-whitespace should be the closing ]
493   charptr= purge_whitespace(charptr, endptr);
494   if ((charptr < endptr) && (*charptr++ == END_ARRAY))
495   {
496     // all is well
497     PARSER_RETURN(true);
498   }
499 
500   // An error has occurred.
501   PARSER_RETURN(false);
502 }
503 
504 
505 /**
506   Find the end of a member name in a JSON path. The name could be
507   either a quoted or an unquoted identifier.
508 
509   @param start the start of the member name
510   @param end the end of the JSON path expression
511   @return pointer to the position right after the end of the name, or
512   to the position right after the end of the string if the input
513   string is an unterminated quoted identifier
514 */
find_end_of_member_name(const char * start,const char * end)515 static const char *find_end_of_member_name(const char *start, const char *end)
516 {
517   const char *str= start;
518 
519   /*
520     If we have a double-quoted name, the end of the name is the next
521     unescaped double quote.
522   */
523   if (*str == DOUBLE_QUOTE)
524   {
525     str++;                   // Advance past the opening double quote.
526     while (str < end)
527     {
528       switch (*str++)
529       {
530       case '\\':
531         /*
532           Skip the next character after a backslash. It cannot mark
533           the end of the quoted string.
534         */
535         str++;
536         break;
537       case DOUBLE_QUOTE:
538         // An unescaped double quote marks the end of the quoted string.
539         return str;
540       }
541     }
542 
543     /*
544       Whoops. No terminating quote was found. Just return the end of
545       the string. When we send the unterminated string through the
546       JSON parser, it will detect and report the syntax error, so
547       there is no need to handle the syntax error here.
548     */
549     return end;
550   }
551 
552   /*
553     If we have an unquoted name, the name is terminated by whitespace
554     or [ or . or * or end-of-string.
555   */
556   while (str < end &&
557          !my_isspace(&my_charset_utf8mb4_bin, *str) &&
558          *str != BEGIN_ARRAY &&
559          *str != BEGIN_MEMBER &&
560          *str != WILDCARD)
561   {
562     str++;
563   }
564 
565   return str;
566 }
567 
568 
569 /**
570   Parse a quoted member name using the rapidjson parser, so that we
571   get the name without the enclosing quotes and with any escape
572   sequences replaced with the actual characters.
573 
574   It is the caller's responsibility to destroy the returned
575   Json_string when it's done with it.
576 
577   @param str the input string
578   @param len the length of the input string
579   @return a Json_string that represents the member name, or NULL if
580   the input string is not a valid name
581 */
parse_name_with_rapidjson(const char * str,size_t len)582 static const Json_string *parse_name_with_rapidjson(const char *str, size_t len)
583 {
584   const Json_dom *dom= Json_dom::parse(str, len, NULL, NULL);
585 
586   if (dom != NULL && dom->json_type() == Json_dom::J_STRING)
587     return down_cast<const Json_string *>(dom);
588 
589   delete dom;
590   return NULL;
591 }
592 
593 
parse_member_leg(const char * charptr,const char * endptr,bool * status)594 const char *Json_path::parse_member_leg(const char *charptr,
595                                         const char *endptr,
596                                         bool *status)
597 {
598   // advance past the .
599   charptr++;
600 
601   charptr= purge_whitespace(charptr, endptr);
602   if (charptr >= endptr)
603     PARSER_RETURN(false);                       // input exhausted
604 
605   if (*charptr == WILDCARD)
606   {
607     charptr++;
608 
609     if (append(Json_path_leg(jpl_member_wildcard)))
610       PARSER_RETURN(false);                   /* purecov: inspected */
611   }
612   else
613   {
614     const char *key_start= charptr;
615     const char *key_end= find_end_of_member_name(key_start, endptr);
616     const bool was_quoted= (*key_start == DOUBLE_QUOTE);
617 
618     charptr= key_end;
619 
620     std::auto_ptr<const Json_string> jstr;
621 
622     if (was_quoted)
623     {
624       /*
625         Send the quoted name through the parser to unquote and
626         unescape it.
627       */
628       jstr.reset(parse_name_with_rapidjson(key_start, key_end - key_start));
629     }
630     else
631     {
632       /*
633         An unquoted name may contain escape sequences. Wrap it in
634         double quotes and send it through the JSON parser to unescape
635         it.
636       */
637       char buff[STRING_BUFFER_USUAL_SIZE];
638       String strbuff(buff, sizeof(buff), &my_charset_utf8mb4_bin);
639       strbuff.length(0);
640       if (strbuff.append(DOUBLE_QUOTE) ||
641           strbuff.append(key_start, key_end - key_start) ||
642           strbuff.append(DOUBLE_QUOTE))
643         PARSER_RETURN(false);                 /* purecov: inspected */
644       jstr.reset(parse_name_with_rapidjson(strbuff.ptr(), strbuff.length()));
645     }
646 
647     if (jstr.get() == NULL)
648       PARSER_RETURN(false);
649 
650     // empty key names are illegal
651     if (jstr->size() == 0)
652       PARSER_RETURN(false);
653 
654     // unquoted names must be valid ECMAScript identifiers
655     if (!was_quoted &&
656         !is_ecmascript_identifier(jstr->value().data(), jstr->size()))
657       PARSER_RETURN(false);
658 
659     // Looking good.
660     if (append(Json_path_leg(jstr->value())))
661       PARSER_RETURN(false);                   /* purecov: inspected */
662   }
663 
664   PARSER_RETURN(true);
665 }
666 
667 
/**
   Return true if the codepoint lies in the range U+0300 to U+036F
   (both inclusive), which is the range this file treats as Unicode
   combining marks.

   @param codepoint [in] A unicode codepoint.

   @return True if the codepoint is in that range.
*/
inline bool unicode_combining_mark(unsigned codepoint)
{
  const unsigned first_mark= 0x300;
  const unsigned last_mark= 0x36F;

  return codepoint >= first_mark && codepoint <= last_mark;
}
679 
680 /**
681    Return true if the codepoint is a Unicode letter.
682 
683    This was the best
684    recommendation from the old-timers about how to answer this question.
685    But as you can see from the need to call unicode_combining_mark(),
686    my_isalpha() isn't good enough. It probably has many other defects.
687 
688    FIXME
689 */
is_letter(unsigned codepoint)690 bool is_letter(unsigned codepoint)
691 {
692   /*
693     The Unicode combining mark \u036F passes the my_isalpha() test.
694     That doesn't inspire much confidence in the correctness
695     of my_isalpha().
696    */
697   if (unicode_combining_mark(codepoint))
698   {
699     return false;
700   }
701   return my_isalpha(&my_charset_utf8mb4_bin, codepoint);
702 }
703 
704 
705 /**
706    Return true if the codepoint is a Unicode digit.
707 
708    This was the best
709    recommendation from the old-times about how to answer this question.
710 */
is_digit(unsigned codepoint)711 bool is_digit(unsigned codepoint)
712 {
713   return my_isdigit(&my_charset_utf8mb4_bin, codepoint);
714 }
715 
716 
/**
   Return true if the codepoint is Unicode connector punctuation.

   The codepoint is looked up in a fixed list of ten connector
   punctuation characters.

   @param codepoint [in] A unicode codepoint.

   @return True if the codepoint is in the list.
*/
bool is_connector_punctuation(unsigned codepoint)
{
  static const unsigned connectors[]=
  {
    0x5F,    // low line
    0x203F,  // undertie
    0x2040,  // character tie
    0x2054,  // inverted undertie
    0xFE33,  // presentation form for vertical low line
    0xFE34,  // presentation form for vertical wavy low line
    0xFE4D,  // dashed low line
    0xFE4E,  // centerline low line
    0xFE4F,  // wavy low line
    0xFF3F   // fullwidth low line
  };
  const size_t connector_count= sizeof(connectors) / sizeof(connectors[0]);

  for (size_t idx= 0; idx < connector_count; idx++)
  {
    if (connectors[idx] == codepoint)
      return true;
  }

  return false;
}
743 
744 
745 /**
746    Returns true if the name is a valid ECMAScript identifier.
747 
748    The name
749    must be a sequence of UTF8-encoded bytes. All escape sequences
750    have been replaced with UTF8-encoded bytes.
751 
752    @param[in] name        name to check
753    @param[in] name_length its length
754 
755    @return True if the name is a valid ECMAScript identifier. False otherwise.
756 */
is_ecmascript_identifier(const char * name,size_t name_length)757 bool is_ecmascript_identifier(const char *name, size_t name_length)
758 {
759   /*
760     At this point, The unicode escape sequences have already
761     been replaced with the corresponding UTF-8 bytes. Now we apply
762     the rules here: https://es5.github.io/x7.html#x7.6
763   */
764   rapidjson::MemoryStream input_stream(name, name_length);
765   unsigned  codepoint;
766 
767   while (input_stream.Tell() < name_length)
768   {
769     bool  first_codepoint= (input_stream.Tell() == 0);
770     if (!rapidjson::UTF8<char>::Decode(input_stream, &codepoint))
771       return false;
772 
773     // a unicode letter
774     if (is_letter(codepoint))
775       continue;
776     // $ is ok
777     if (codepoint == 0x24)
778       continue;
779     // _ is ok
780     if (codepoint == 0x5F)
781       continue;
782 
783     /*
784       the first character must be one of the above.
785       more possibilities are available for subsequent characters.
786     */
787 
788     if (first_codepoint)
789     {
790       return false;
791     }
792     else
793     {
794       // unicode combining mark
795       if (unicode_combining_mark(codepoint))
796         continue;
797 
798       // a unicode digit
799       if (is_digit(codepoint))
800         continue;
801       if (is_connector_punctuation(codepoint))
802         continue;
803       // <ZWNJ>
804       if (codepoint == 0x200C)
805         continue;
806       // <ZWJ>
807       if (codepoint == 0x200D)
808         continue;
809     }
810 
811     // nope
812     return false;
813   }
814 
815   return true;
816 }
817