1 /* Copyright (c) 2015, 2021, Oracle and/or its affiliates.
2
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License, version 2.0,
5 as published by the Free Software Foundation.
6
7 This program is also distributed with certain software (including
8 but not limited to OpenSSL) that is licensed under separate terms,
9 as designated in a particular file or component or in included license
10 documentation. The authors of MySQL hereby grant you an additional
11 permission to link the program and your derivative works with the
12 separately licensed software that they have included with MySQL.
13
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License, version 2.0, for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
22
23 /*
24 @file json_path.cc
25
26 This file contains implementation support for the JSON path abstraction.
27 The path abstraction is described by the functional spec
28 attached to WL#7909.
29 */
30
31 #include "json_path.h"
32
33 #include "json_dom.h"
34 #include "mysqld.h" // key_memory_JSON
35 #include "rapidjson/rapidjson.h" // rapidjson::UTF8<char>::Decode
36 #include "rapidjson/memorystream.h" // rapidjson::MemoryStream
37 #include "sql_const.h" // STRING_BUFFER_USUAL_SIZE
38 #include "sql_string.h" // String
39 #include "template_utils.h" // down_cast
40
41 #include <m_ctype.h>
42
43 #include <cwctype>
44 #include <memory> // auto_ptr
45 #include <string>
46
// For use in Json_path::parse_path and the leg parsing helpers.
/*
  Store retval in *status and return the current parse position.

  The macro expects two locals to be in scope at the point of use:
  status (bool *) and charptr (const char *). The body is wrapped in
  do/while(0) so that the expansion plus the caller's trailing semicolon
  forms exactly one statement, which keeps the macro safe to use as the
  unbraced body of an if/else.
*/
#define PARSER_RETURN(retval) \
  do { *status= (retval); return charptr; } while (0)
// Characters with a special meaning in a path expression.
#define SCOPE '$'
#define BEGIN_MEMBER '.'
#define BEGIN_ARRAY '['
#define END_ARRAY ']'
#define DOUBLE_QUOTE '\"'
#define WILDCARD '*'
#define PRINTABLE_SPACE ' '
56
57 bool is_ecmascript_identifier(const char *name, size_t name_length);
58 bool is_digit(unsigned codepoint);
59
60 // Json_path_leg
61
get_type() const62 enum_json_path_leg_type Json_path_leg::get_type() const
63 {
64 return m_leg_type;
65 }
66
get_member_name_length() const67 size_t Json_path_leg::get_member_name_length() const
68 {
69 return m_member_name.size();
70 }
71
get_member_name() const72 const char *Json_path_leg::get_member_name() const
73 {
74 return m_member_name.data();
75 }
76
get_array_cell_index() const77 size_t Json_path_leg::get_array_cell_index() const
78 {
79 return m_array_cell_index;
80 }
81
to_string(String * buf) const82 bool Json_path_leg::to_string(String *buf) const
83 {
84 switch(m_leg_type)
85 {
86 case jpl_member:
87 return buf->append(BEGIN_MEMBER) ||
88 (is_ecmascript_identifier(get_member_name(),
89 get_member_name_length()) ?
90 buf->append(get_member_name(), get_member_name_length()) :
91 double_quote(get_member_name(), get_member_name_length(), buf));
92 case jpl_array_cell:
93 return buf->append(BEGIN_ARRAY) ||
94 buf->append_ulonglong(m_array_cell_index) ||
95 buf->append(END_ARRAY);
96 case jpl_member_wildcard:
97 return buf->append(BEGIN_MEMBER) || buf->append(WILDCARD);
98 case jpl_array_cell_wildcard:
99 return buf->append(BEGIN_ARRAY) || buf->append(WILDCARD) ||
100 buf->append(END_ARRAY);
101 case jpl_ellipsis:
102 return buf->append(WILDCARD) || buf->append(WILDCARD);
103 }
104
105 // Unknown leg type.
106 DBUG_ABORT(); /* purecov: inspected */
107 return true; /* purecov: inspected */
108 }
109
110 // Json_path_clone
111
Json_path_clone()112 Json_path_clone::Json_path_clone()
113 : m_path_legs(key_memory_JSON)
114 {}
115
116
~Json_path_clone()117 Json_path_clone::~Json_path_clone()
118 {
119 clear();
120 }
121
122
leg_count() const123 size_t Json_path_clone::leg_count() const { return m_path_legs.size(); }
124
125
get_leg_at(const size_t index) const126 const Json_path_leg *Json_path_clone::get_leg_at(const size_t index) const
127 {
128 if (index >= m_path_legs.size())
129 {
130 return NULL;
131 }
132
133 return m_path_legs.at(index);
134 }
135
136
append(const Json_path_leg * leg)137 bool Json_path_clone::append(const Json_path_leg *leg)
138 {
139 return m_path_legs.push_back(leg);
140 }
141
142
set(Json_seekable_path * source)143 bool Json_path_clone::set(Json_seekable_path *source)
144 {
145 clear();
146
147 size_t legcount= source->leg_count();
148 for (size_t idx= 0; idx < legcount; idx++)
149 {
150 Json_path_leg *path_leg= (Json_path_leg *) source->get_leg_at(idx);
151 if (append(path_leg))
152 {
153 return true;
154 }
155 }
156
157 return false;
158 }
159
160
pop()161 const Json_path_leg *Json_path_clone::pop()
162 {
163 assert(m_path_legs.size() > 0);
164 const Json_path_leg *p= m_path_legs.back();
165 m_path_legs.pop_back();
166 return p;
167 }
168
169
clear()170 void Json_path_clone::clear()
171 {
172 m_path_legs.clear();
173 }
174
175
contains_ellipsis() const176 bool Json_path_clone::contains_ellipsis() const
177 {
178 for (Path_leg_pointers::const_iterator iter= m_path_legs.begin();
179 iter != m_path_legs.end(); ++iter)
180 {
181 const Json_path_leg *path_leg= *iter;
182 if (path_leg->get_type() == jpl_ellipsis)
183 return true;
184 }
185
186 return false;
187 }
188
189
190 // Json_path
191
Json_path()192 Json_path::Json_path()
193 : m_path_legs(key_memory_JSON)
194 {}
195
196
~Json_path()197 Json_path::~Json_path()
198 {
199 m_path_legs.clear();
200 }
201
202
leg_count() const203 size_t Json_path::leg_count() const { return m_path_legs.size(); }
204
205
get_leg_at(const size_t index) const206 const Json_path_leg *Json_path::get_leg_at(const size_t index) const
207 {
208 if (index >= m_path_legs.size())
209 {
210 return NULL;
211 }
212
213 return &m_path_legs.at(index);
214 }
215
216
append(const Json_path_leg & leg)217 bool Json_path::append(const Json_path_leg &leg)
218 {
219 return m_path_legs.push_back(leg);
220 }
221
pop()222 Json_path_leg Json_path::pop()
223 {
224 assert(m_path_legs.size() > 0);
225 Json_path_leg p= m_path_legs.back();
226 m_path_legs.pop_back();
227 return p;
228 }
229
clear()230 void Json_path::clear()
231 {
232 m_path_legs.clear();
233 }
234
to_string(String * buf) const235 bool Json_path::to_string(String *buf) const
236 {
237 /*
238 3-part scope prefixes are not needed by wl7909.
239 There is no way to test them at the SQL level right now
240 since they would raise errors in all possible use-cases.
241 Support for them can be added in some follow-on worklog
242 which actually needs them.
243
244 This is where we would put pretty-printing support
245 for 3-part scope prefixes.
246 */
247
248 if (buf->append(SCOPE))
249 return true;
250
251 for (Path_leg_vector::const_iterator iter= m_path_legs.begin();
252 iter != m_path_legs.end(); ++iter)
253 {
254 if (iter->to_string(buf))
255 return true;
256 }
257
258 return false;
259 }
260
261
is_wildcard_or_ellipsis(const Json_path_leg & leg)262 static inline bool is_wildcard_or_ellipsis(const Json_path_leg &leg)
263 {
264 switch (leg.get_type())
265 {
266 case jpl_member_wildcard:
267 case jpl_array_cell_wildcard:
268 case jpl_ellipsis:
269 return true;
270 default:
271 return false;
272 }
273 }
274
275
contains_wildcard_or_ellipsis() const276 bool Json_path::contains_wildcard_or_ellipsis() const
277 {
278 return std::find_if(m_path_legs.begin(), m_path_legs.end(),
279 is_wildcard_or_ellipsis) != m_path_legs.end();
280 }
281
282
is_ellipsis(const Json_path_leg & leg)283 static inline bool is_ellipsis(const Json_path_leg &leg)
284 {
285 return leg.get_type() == jpl_ellipsis;
286 }
287
288
contains_ellipsis() const289 bool Json_path::contains_ellipsis() const
290 {
291 return std::find_if(m_path_legs.begin(), m_path_legs.end(),
292 is_ellipsis) != m_path_legs.end();
293 }
294
295
296 // Json_path parsing
297
initialize()298 void Json_path::initialize()
299 {
300 m_path_legs.clear();
301 }
302
303 /** Top level parsing factory method */
parse_path(const bool begins_with_column_id,const size_t path_length,const char * path_expression,Json_path * path,size_t * bad_index)304 bool parse_path(const bool begins_with_column_id, const size_t path_length,
305 const char *path_expression, Json_path *path, size_t *bad_index)
306 {
307 bool status= false;
308
309 const char *end_of_parsed_path=
310 path->parse_path(begins_with_column_id, path_length, path_expression,
311 &status);
312
313 if (status)
314 {
315 *bad_index= 0;
316 return false;
317 }
318
319 *bad_index= end_of_parsed_path - path_expression;
320 return true;
321 }
322
323
324 /**
325 Purge leading whitespace in a string.
326 @param[in] str the string to purge whitespace from
327 @param[in] end the end of the input string
328 @return pointer to the first non-whitespace character in str
329 */
purge_whitespace(const char * str,const char * end)330 static inline const char *purge_whitespace(const char *str, const char *end)
331 {
332 while (str < end && my_isspace(&my_charset_utf8mb4_bin, *str))
333 ++str;
334 return str;
335 }
336
337
parse_path(const bool begins_with_column_id,const size_t path_length,const char * path_expression,bool * status)338 const char *Json_path::parse_path(const bool begins_with_column_id,
339 const size_t path_length,
340 const char *path_expression,
341 bool *status)
342 {
343 initialize();
344
345 const char *charptr= path_expression;
346 const char *endptr= path_expression + path_length;
347
348 if (begins_with_column_id)
349 {
350 /*
351 3-part scope prefixes are not needed by wl7909.
352 There is no way to test them at the SQL level right now
353 since they would raise errors in all possible use-cases.
354 Support for them can be added in some follow-on worklog
355 which actually needs them.
356
357 This is where we would add parsing support
358 for 3-part scope prefixes.
359 */
360
361 // not supported yet
362 PARSER_RETURN(false);
363 }
364 else
365 {
366 // the first non-whitespace character must be $
367 charptr= purge_whitespace(charptr, endptr);
368 if ((charptr >= endptr) || (*charptr++ != SCOPE))
369 PARSER_RETURN(false);
370 }
371
372 // now add the legs
373 *status= true;
374 while (*status)
375 {
376 charptr= purge_whitespace(charptr, endptr);
377 if (charptr >= endptr)
378 break; // input exhausted
379
380 charptr= parse_path_leg(charptr, endptr, status);
381 }
382
383 // a path may not end with an ellipsis
384 if (m_path_legs.size() > 0 && is_ellipsis(m_path_legs.back()))
385 {
386 *status= false;
387 }
388
389 return charptr;
390 }
391
392
parse_path_leg(const char * charptr,const char * endptr,bool * status)393 const char *Json_path::parse_path_leg(const char *charptr,
394 const char *endptr,
395 bool *status)
396 {
397 switch (*charptr)
398 {
399 case BEGIN_ARRAY:
400 return parse_array_leg(charptr, endptr, status);
401 case BEGIN_MEMBER:
402 return parse_member_leg(charptr, endptr, status);
403 case WILDCARD:
404 return parse_ellipsis_leg(charptr, endptr, status);
405 default:
406 PARSER_RETURN(false);
407 }
408 }
409
410
parse_ellipsis_leg(const char * charptr,const char * endptr,bool * status)411 const char *Json_path::parse_ellipsis_leg(const char *charptr,
412 const char *endptr,
413 bool *status)
414 {
415 // assume the worst
416 *status= false;
417
418 // advance past the first *
419 charptr++;
420
421 // must be followed by a second *
422 if ((charptr >= endptr) || (*charptr++ != WILDCARD))
423 {
424 PARSER_RETURN(false);
425 }
426
427 // may not be the last leg
428 if (charptr >= endptr)
429 {
430 PARSER_RETURN(false);
431 }
432
433 // forbid the hard-to-read *** combination
434 if (*charptr == WILDCARD)
435 {
436 PARSER_RETURN(false);
437 }
438
439 PARSER_RETURN(!append(Json_path_leg(jpl_ellipsis)));
440 }
441
442
parse_array_leg(const char * charptr,const char * endptr,bool * status)443 const char *Json_path::parse_array_leg(const char *charptr,
444 const char *endptr,
445 bool *status)
446 {
447 // assume the worst
448 *status= false;
449
450 // advance past the [
451 charptr++;
452
453 charptr= purge_whitespace(charptr, endptr);
454 if (charptr >= endptr)
455 PARSER_RETURN(false); // input exhausted
456
457 if (*charptr == WILDCARD)
458 {
459 charptr++;
460
461 if (append(Json_path_leg(jpl_array_cell_wildcard)))
462 PARSER_RETURN(false); /* purecov: inspected */
463 }
464 else
465 {
466 // Not a WILDCARD. Must be an array index.
467 const char *number_start= charptr;
468
469 while ((charptr < endptr) && is_digit(*charptr))
470 {
471 charptr++;
472 }
473 if (charptr == number_start)
474 {
475 PARSER_RETURN(false);
476 }
477
478 int dummy_err;
479 longlong cell_index= my_strntoll(&my_charset_utf8mb4_bin, number_start,
480 charptr - number_start, 10,
481 (char**) 0, &dummy_err);
482
483 if (dummy_err != 0)
484 {
485 PARSER_RETURN(false);
486 }
487
488 if (append(Json_path_leg(static_cast<size_t>(cell_index))))
489 PARSER_RETURN(false); /* purecov: inspected */
490 }
491
492 // the next non-whitespace should be the closing ]
493 charptr= purge_whitespace(charptr, endptr);
494 if ((charptr < endptr) && (*charptr++ == END_ARRAY))
495 {
496 // all is well
497 PARSER_RETURN(true);
498 }
499
500 // An error has occurred.
501 PARSER_RETURN(false);
502 }
503
504
505 /**
506 Find the end of a member name in a JSON path. The name could be
507 either a quoted or an unquoted identifier.
508
509 @param start the start of the member name
510 @param end the end of the JSON path expression
511 @return pointer to the position right after the end of the name, or
512 to the position right after the end of the string if the input
513 string is an unterminated quoted identifier
514 */
find_end_of_member_name(const char * start,const char * end)515 static const char *find_end_of_member_name(const char *start, const char *end)
516 {
517 const char *str= start;
518
519 /*
520 If we have a double-quoted name, the end of the name is the next
521 unescaped double quote.
522 */
523 if (*str == DOUBLE_QUOTE)
524 {
525 str++; // Advance past the opening double quote.
526 while (str < end)
527 {
528 switch (*str++)
529 {
530 case '\\':
531 /*
532 Skip the next character after a backslash. It cannot mark
533 the end of the quoted string.
534 */
535 str++;
536 break;
537 case DOUBLE_QUOTE:
538 // An unescaped double quote marks the end of the quoted string.
539 return str;
540 }
541 }
542
543 /*
544 Whoops. No terminating quote was found. Just return the end of
545 the string. When we send the unterminated string through the
546 JSON parser, it will detect and report the syntax error, so
547 there is no need to handle the syntax error here.
548 */
549 return end;
550 }
551
552 /*
553 If we have an unquoted name, the name is terminated by whitespace
554 or [ or . or * or end-of-string.
555 */
556 while (str < end &&
557 !my_isspace(&my_charset_utf8mb4_bin, *str) &&
558 *str != BEGIN_ARRAY &&
559 *str != BEGIN_MEMBER &&
560 *str != WILDCARD)
561 {
562 str++;
563 }
564
565 return str;
566 }
567
568
569 /**
570 Parse a quoted member name using the rapidjson parser, so that we
571 get the name without the enclosing quotes and with any escape
572 sequences replaced with the actual characters.
573
574 It is the caller's responsibility to destroy the returned
575 Json_string when it's done with it.
576
577 @param str the input string
578 @param len the length of the input string
579 @return a Json_string that represents the member name, or NULL if
580 the input string is not a valid name
581 */
parse_name_with_rapidjson(const char * str,size_t len)582 static const Json_string *parse_name_with_rapidjson(const char *str, size_t len)
583 {
584 const Json_dom *dom= Json_dom::parse(str, len, NULL, NULL);
585
586 if (dom != NULL && dom->json_type() == Json_dom::J_STRING)
587 return down_cast<const Json_string *>(dom);
588
589 delete dom;
590 return NULL;
591 }
592
593
parse_member_leg(const char * charptr,const char * endptr,bool * status)594 const char *Json_path::parse_member_leg(const char *charptr,
595 const char *endptr,
596 bool *status)
597 {
598 // advance past the .
599 charptr++;
600
601 charptr= purge_whitespace(charptr, endptr);
602 if (charptr >= endptr)
603 PARSER_RETURN(false); // input exhausted
604
605 if (*charptr == WILDCARD)
606 {
607 charptr++;
608
609 if (append(Json_path_leg(jpl_member_wildcard)))
610 PARSER_RETURN(false); /* purecov: inspected */
611 }
612 else
613 {
614 const char *key_start= charptr;
615 const char *key_end= find_end_of_member_name(key_start, endptr);
616 const bool was_quoted= (*key_start == DOUBLE_QUOTE);
617
618 charptr= key_end;
619
620 std::auto_ptr<const Json_string> jstr;
621
622 if (was_quoted)
623 {
624 /*
625 Send the quoted name through the parser to unquote and
626 unescape it.
627 */
628 jstr.reset(parse_name_with_rapidjson(key_start, key_end - key_start));
629 }
630 else
631 {
632 /*
633 An unquoted name may contain escape sequences. Wrap it in
634 double quotes and send it through the JSON parser to unescape
635 it.
636 */
637 char buff[STRING_BUFFER_USUAL_SIZE];
638 String strbuff(buff, sizeof(buff), &my_charset_utf8mb4_bin);
639 strbuff.length(0);
640 if (strbuff.append(DOUBLE_QUOTE) ||
641 strbuff.append(key_start, key_end - key_start) ||
642 strbuff.append(DOUBLE_QUOTE))
643 PARSER_RETURN(false); /* purecov: inspected */
644 jstr.reset(parse_name_with_rapidjson(strbuff.ptr(), strbuff.length()));
645 }
646
647 if (jstr.get() == NULL)
648 PARSER_RETURN(false);
649
650 // empty key names are illegal
651 if (jstr->size() == 0)
652 PARSER_RETURN(false);
653
654 // unquoted names must be valid ECMAScript identifiers
655 if (!was_quoted &&
656 !is_ecmascript_identifier(jstr->value().data(), jstr->size()))
657 PARSER_RETURN(false);
658
659 // Looking good.
660 if (append(Json_path_leg(jstr->value())))
661 PARSER_RETURN(false); /* purecov: inspected */
662 }
663
664 PARSER_RETURN(true);
665 }
666
667
/**
  Check whether a codepoint lies in the range 0x300 - 0x36F (both
  inclusive), which this file treats as the Unicode combining marks.

  @param codepoint [in] A unicode codepoint.

  @return True if the codepoint is inside that range.
*/
inline bool unicode_combining_mark(unsigned codepoint)
{
  const unsigned first_mark= 0x300;
  const unsigned last_mark= 0x36F;

  return !(codepoint < first_mark || codepoint > last_mark);
}
679
680 /**
681 Return true if the codepoint is a Unicode letter.
682
683 This was the best
684 recommendation from the old-timers about how to answer this question.
685 But as you can see from the need to call unicode_combining_mark(),
686 my_isalpha() isn't good enough. It probably has many other defects.
687
688 FIXME
689 */
is_letter(unsigned codepoint)690 bool is_letter(unsigned codepoint)
691 {
692 /*
693 The Unicode combining mark \u036F passes the my_isalpha() test.
694 That doesn't inspire much confidence in the correctness
695 of my_isalpha().
696 */
697 if (unicode_combining_mark(codepoint))
698 {
699 return false;
700 }
701 return my_isalpha(&my_charset_utf8mb4_bin, codepoint);
702 }
703
704
705 /**
706 Return true if the codepoint is a Unicode digit.
707
708 This was the best
709 recommendation from the old-times about how to answer this question.
710 */
is_digit(unsigned codepoint)711 bool is_digit(unsigned codepoint)
712 {
713 return my_isdigit(&my_charset_utf8mb4_bin, codepoint);
714 }
715
716
/**
  Return true if the codepoint is Unicode connector punctuation.

  @param codepoint [in] A unicode codepoint.

  @return True if the codepoint is one of the connector punctuation
  characters listed in the function body.
*/
bool is_connector_punctuation(unsigned codepoint)
{
  static const unsigned connectors[]=
  {
    0x5F,                     // low line
    0x203F,                   // undertie
    0x2040,                   // character tie
    0x2054,                   // inverted undertie
    0xFE33,                   // presentation form for vertical low line
    0xFE34,                   // presentation form for vertical wavy low line
    0xFE4D,                   // dashed low line
    0xFE4E,                   // centerline low line
    0xFE4F,                   // wavy low line
    0xFF3F                    // fullwidth low line
  };
  const size_t connector_count= sizeof(connectors) / sizeof(connectors[0]);

  for (size_t i= 0; i < connector_count; i++)
  {
    if (connectors[i] == codepoint)
      return true;
  }

  return false;
}
743
744
745 /**
746 Returns true if the name is a valid ECMAScript identifier.
747
748 The name
749 must be a sequence of UTF8-encoded bytes. All escape sequences
750 have been replaced with UTF8-encoded bytes.
751
752 @param[in] name name to check
753 @param[in] name_length its length
754
755 @return True if the name is a valid ECMAScript identifier. False otherwise.
756 */
is_ecmascript_identifier(const char * name,size_t name_length)757 bool is_ecmascript_identifier(const char *name, size_t name_length)
758 {
759 /*
760 At this point, The unicode escape sequences have already
761 been replaced with the corresponding UTF-8 bytes. Now we apply
762 the rules here: https://es5.github.io/x7.html#x7.6
763 */
764 rapidjson::MemoryStream input_stream(name, name_length);
765 unsigned codepoint;
766
767 while (input_stream.Tell() < name_length)
768 {
769 bool first_codepoint= (input_stream.Tell() == 0);
770 if (!rapidjson::UTF8<char>::Decode(input_stream, &codepoint))
771 return false;
772
773 // a unicode letter
774 if (is_letter(codepoint))
775 continue;
776 // $ is ok
777 if (codepoint == 0x24)
778 continue;
779 // _ is ok
780 if (codepoint == 0x5F)
781 continue;
782
783 /*
784 the first character must be one of the above.
785 more possibilities are available for subsequent characters.
786 */
787
788 if (first_codepoint)
789 {
790 return false;
791 }
792 else
793 {
794 // unicode combining mark
795 if (unicode_combining_mark(codepoint))
796 continue;
797
798 // a unicode digit
799 if (is_digit(codepoint))
800 continue;
801 if (is_connector_punctuation(codepoint))
802 continue;
803 // <ZWNJ>
804 if (codepoint == 0x200C)
805 continue;
806 // <ZWJ>
807 if (codepoint == 0x200D)
808 continue;
809 }
810
811 // nope
812 return false;
813 }
814
815 return true;
816 }
817