from __future__ import unicode_literals, print_function, absolute_import, division, generators, nested_scopes
import logging
import six
from six.moves import xrange
from itertools import *  # noqa
from .exceptions import JSONPathError

# Get logger name
logger = logging.getLogger(__name__)

# Turn on/off the automatic creation of id attributes
# ... could be a kwarg pervasively but uses are rare and simple today
auto_id_field = None

# Sentinel distinguishing "field is absent" from "field is present with value None"
# (see `Fields.get_field_datum`).
NOT_SET = object()

# Placeholder dictionary key under which `_create_list_key()` stores a list when an
# index has to be created inside an empty dict; removed again by `_clean_list_keys()`.
LIST_KEY = object()


class JSONPath(object):
    """
    The base class for JSONPath abstract syntax; those
    methods stubbed here are the interface to supported
    JSONPath semantics.
    """

    def find(self, data):
        """
        All `JSONPath` types support `find()`, which returns an iterable of `DatumInContext`s.
        They keep track of the path followed to the current location, so if the calling code
        has some opinion about that, it can be passed in here as a starting point.
        """
        raise NotImplementedError()

    def find_or_create(self, data):
        """
        Like `find()`; subclasses that are able to create missing nodes override this.
        The base implementation simply delegates to `find()`.
        """
        return self.find(data)

    def update(self, data, val):
        """
        Returns `data` with the specified path replaced by `val`. Only updates
        if the specified path exists.
        """
        raise NotImplementedError()

    def update_or_create(self, data, val):
        """
        Like `update()`; subclasses that are able to create missing nodes override this.
        The base implementation simply delegates to `update()`.
        """
        return self.update(data, val)

    def filter(self, fn, data):
        """
        Returns `data` with the specified path filtering nodes according
        the filter evaluation result returned by the filter function.

        Arguments:
            fn (function): unary function that accepts one argument
                and returns bool.
            data (dict|list|tuple): JSON object to filter.
        """
        raise NotImplementedError()

    def child(self, child):
        """
        Equivalent to Child(self, child) but with some canonicalization:
        a leading `This`/`Root` on the left is dropped, a `This` on the right
        is dropped, and a `Root` on the right replaces everything before it.
        """
        if isinstance(self, (This, Root)):
            return child
        elif isinstance(child, This):
            return self
        elif isinstance(child, Root):
            return child
        else:
            return Child(self, child)

    def make_datum(self, value):
        """
        Return `value` unchanged if it already is a `DatumInContext`,
        otherwise wrap it as a context-free datum located at the root.
        """
        if isinstance(value, DatumInContext):
            return value
        else:
            return DatumInContext(value, path=Root(), context=None)


class DatumInContext(object):
    """
    Represents a datum along a path from a context.

    Essentially a zipper but with a structure represented by JsonPath,
    and where the context is more of a parent pointer than a proper
    representation of the context.

    For quick-and-dirty work, this proxies any non-special attributes
    to the underlying datum, but the actual datum can (and usually should)
    be retrieved via the `value` attribute.

    To place `datum` within another, use `datum.in_context(context=..., path=...)`
    which extends the path. If the datum already has a context, it places the entire
    context within that passed in, so an object can be built from the inside
    out.
    """
    @classmethod
    def wrap(cls, data):
        """Return `data` itself if it is already an instance of `cls`, else `cls(data)`."""
        if isinstance(data, cls):
            return data
        else:
            return cls(data)

    def __init__(self, value, path=None, context=None):
        self.value = value
        # A datum without an explicit path refers to itself.
        self.path = path or This()
        self.context = None if context is None else DatumInContext.wrap(context)

    def in_context(self, context, path):
        """
        Return a copy of this datum placed inside `context` at `path`.

        If this datum has no context yet, `context` becomes its context and `path`
        its path. Otherwise this datum keeps its own path and its *existing* context
        is (recursively) placed inside `context`.
        """
        context = DatumInContext.wrap(context)

        if self.context:
            # Push the new context underneath our existing one. Recursing on the
            # passed-in `context` instead (as an earlier revision did) would discard
            # `self.context` and never terminate when `context` has a context itself.
            return DatumInContext(value=self.value, path=self.path,
                                  context=self.context.in_context(context=context, path=path))
        else:
            return DatumInContext(value=self.value, path=path, context=context)

    @property
    def full_path(self):
        """The path from the outermost context down to this datum."""
        return self.path if self.context is None else self.context.full_path.child(self.path)

    @property
    def id_pseudopath(self):
        """
        Looks like a path, but with ids stuck in when available
        """
        try:
            pseudopath = Fields(str(self.value[auto_id_field]))
        except (TypeError, AttributeError, KeyError):  # This may not be all the interesting exceptions
            pseudopath = self.path

        if self.context:
            return self.context.id_pseudopath.child(pseudopath)
        else:
            return pseudopath

    def __repr__(self):
        return '%s(value=%r, path=%r, context=%r)' % (self.__class__.__name__, self.value, self.path, self.context)

    def __eq__(self, other):
        return isinstance(other, DatumInContext) and other.value == self.value and other.path == self.path and self.context == other.context


class AutoIdForDatum(DatumInContext):
    """
    This behaves like a DatumInContext, but the value is
    always the path leading up to it, not including the "id",
    and with any "id" fields along the way replacing the prior
    segment of the path

    For example, it will make "foo.bar.id" return a datum
    that behaves like DatumInContext(value="foo.bar", path="foo.bar.id").

    This is disabled by default; it can be turned on by
    setting the `auto_id_field` global to a value other
    than `None`.
    """

    def __init__(self, datum, id_field=None):
        """
        Invariant is that datum.path is the path from context to datum. The auto id
        will either be the id in the datum (if present) or the id of the context
        followed by the path to the datum.

        The path to this datum is always the path to the context, the path to the
        datum, and then the auto id field.
        """
        # NOTE: the parent __init__ is intentionally not called; `value`, `path`
        # and `context` are read-only properties derived from `datum` below.
        self.datum = datum
        self.id_field = id_field or auto_id_field

    @property
    def value(self):
        return str(self.datum.id_pseudopath)

    @property
    def path(self):
        return self.id_field

    @property
    def context(self):
        return self.datum

    def __repr__(self):
        return '%s(%r)' % (self.__class__.__name__, self.datum)

    def in_context(self, context, path):
        """Place the wrapped datum inside `context`, keeping this auto id's `id_field`."""
        return AutoIdForDatum(self.datum.in_context(context=context, path=path), id_field=self.id_field)

    def __eq__(self, other):
        return isinstance(other, AutoIdForDatum) and other.datum == self.datum and self.id_field == other.id_field


class Root(JSONPath):
    """
    The JSONPath referring to the "root" object. Concrete syntax is '$'.
    The root is the topmost datum without any context attached.
    """

    def find(self, data):
        """Walk up the context chain of `data` and return the topmost datum in a list."""
        if not isinstance(data, DatumInContext):
            return [DatumInContext(data, path=Root(), context=None)]

        if data.context is None:
            return [DatumInContext(data.value, context=None, path=Root())]

        return Root().find(data.context)

    def update(self, data, val):
        """Replacing the root simply yields `val`."""
        return val

    def filter(self, fn, data):
        """Return `data` when `fn(data)` is truthy, otherwise `None`."""
        return data if fn(data) else None

    def __str__(self):
        return '$'

    def __repr__(self):
        return 'Root()'

    def __eq__(self, other):
        return isinstance(other, Root)


class This(JSONPath):
    """
    The JSONPath referring to the current datum. Concrete syntax is '@'.
    """

    def find(self, datum):
        """Return the current datum (wrapped if necessary) in a one-element list."""
        return [DatumInContext.wrap(datum)]

    def update(self, data, val):
        """Replacing the current datum simply yields `val`."""
        return val

    def filter(self, fn, data):
        """Return `data` when `fn(data)` is truthy, otherwise `None`."""
        return data if fn(data) else None

    def __str__(self):
        return '`this`'

    def __repr__(self):
        return 'This()'

    def __eq__(self, other):
        return isinstance(other, This)


class Child(JSONPath):
    """
    JSONPath that first matches the left, then the right.
    Concrete syntax is <left> '.' <right>
    """

    def __init__(self, left, right):
        self.left = left
        self.right = right

    def find(self, datum):
        """
        Extra special case: auto ids do not have children,
        so cut it off right now rather than auto id the auto id
        """

        return [submatch
                for subdata in self.left.find(datum)
                if not isinstance(subdata, AutoIdForDatum)
                for submatch in self.right.find(subdata)]

    def update(self, data, val):
        """Apply the right-hand update to the value of every left-hand match; returns `data`."""
        for datum in self.left.find(data):
            self.right.update(datum.value, val)
        return data

    def find_or_create(self, datum):
        """Like `find()`, but delegates to `find_or_create()` on both sides."""
        datum = DatumInContext.wrap(datum)
        submatches = []
        for subdata in self.left.find_or_create(datum):
            if isinstance(subdata, AutoIdForDatum):
                # Extra special case: auto ids do not have children,
                # so cut it off right now rather than auto id the auto id
                continue
            submatches.extend(self.right.find_or_create(subdata))
        return submatches

    def update_or_create(self, data, val):
        """
        Like `update()`, but delegates to the `*_or_create` variants on both sides and
        strips the temporary `LIST_KEY` placeholders from the result.
        """
        for datum in self.left.find_or_create(data):
            self.right.update_or_create(datum.value, val)
        return _clean_list_keys(data)

    def filter(self, fn, data):
        """Apply the right-hand filter to the value of every left-hand match; returns `data`."""
        for datum in self.left.find(data):
            self.right.filter(fn, datum.value)
        return data

    def __eq__(self, other):
        return isinstance(other, Child) and self.left == other.left and self.right == other.right

    def __str__(self):
        return '%s.%s' % (self.left, self.right)

    def __repr__(self):
        return '%s(%r, %r)' % (self.__class__.__name__, self.left, self.right)


class Parent(JSONPath):
    """
    JSONPath that matches the parent node of the current match.
    Will crash if no such parent exists.
    Available via named operator `parent`.
    """

    def find(self, datum):
        """Return the context (parent) of `datum` in a one-element list."""
        datum = DatumInContext.wrap(datum)
        return [datum.context]

    def __eq__(self, other):
        return isinstance(other, Parent)

    def __str__(self):
        return '`parent`'

    def __repr__(self):
        return 'Parent()'


class Where(JSONPath):
    """
    JSONPath that first matches the left, and then
    filters for only those nodes that have
    a match on the right.

    WARNING: Subject to change. May want to have "contains"
    or some other better word for it.
    """

    def __init__(self, left, right):
        self.left = left
        self.right = right

    def find(self, data):
        """Return the left-hand matches for which the right-hand path finds anything."""
        return [subdata for subdata in self.left.find(data) if self.right.find(subdata)]

    def update(self, data, val):
        """Run each match's own path update against `data`; returns `data`."""
        for datum in self.find(data):
            datum.path.update(data, val)
        return data

    def filter(self, fn, data):
        """Run each match's own path filter against that match's value; returns `data`."""
        for datum in self.find(data):
            datum.path.filter(fn, datum.value)
        return data

    def __str__(self):
        return '%s where %s' % (self.left, self.right)

    def __eq__(self, other):
        return isinstance(other, Where) and other.left == self.left and other.right == self.right


class Descendants(JSONPath):
    """
    JSONPath that matches first the left expression then any descendant
    of it which matches the right expression.
    """

    def __init__(self, left, right):
        self.left = left
        self.right = right

    def find(self, datum):
        # <left> .. <right> ==> <left> . (<right> | *..<right> | [*]..<right>)
        #
        # With a wonky caveat that since Slice() has funky coercions
        # we cannot just delegate to that equivalence or we'll hit an
        # infinite loop. So right here we implement the coercion-free version.

        # Get all left matches into a list
        left_matches = self.left.find(datum)
        if not isinstance(left_matches, list):
            left_matches = [left_matches]

        def match_recursively(node):
            right_matches = self.right.find(node)

            # Manually do the * or [*] to avoid coercion and recurse just the right-hand pattern
            if isinstance(node.value, list):
                recursive_matches = [submatch
                                     for i, item in enumerate(node.value)
                                     for submatch in match_recursively(DatumInContext(item, context=node, path=Index(i)))]

            elif isinstance(node.value, dict):
                recursive_matches = [submatch
                                     for field in node.value.keys()
                                     for submatch in match_recursively(DatumInContext(node.value[field], context=node, path=Fields(field)))]

            else:
                recursive_matches = []

            return right_matches + list(recursive_matches)

        # TODO: repeatable iterator instead of list?
        return [submatch
                for left_match in left_matches
                for submatch in match_recursively(left_match)]

    def is_singular(self):
        return False

    def _apply_recursively(self, data, apply_right):
        """
        Call `apply_right(container)` on the value of every left-hand match and on every
        list/dict nested inside it (parents before children); returns `data`.
        """
        # Get all left matches into a list
        left_matches = self.left.find(data)
        if not isinstance(left_matches, list):
            left_matches = [left_matches]

        def recurse(node):
            # Only mutable values corresponding to JSON containers are visited
            if not isinstance(node, (list, dict)):
                return

            apply_right(node)

            # Manually do the * or [*] to avoid coercion and recurse just the right-hand pattern
            if isinstance(node, list):
                for i in range(0, len(node)):
                    recurse(node[i])
            else:
                for field in node.keys():
                    recurse(node[field])

        for submatch in left_matches:
            recurse(submatch.value)

        return data

    def update(self, data, val):
        """Apply the right-hand update at every nesting level below each left-hand match."""
        return self._apply_recursively(data, lambda node: self.right.update(node, val))

    def filter(self, fn, data):
        """Apply the right-hand filter at every nesting level below each left-hand match."""
        return self._apply_recursively(data, lambda node: self.right.filter(fn, node))

    def __str__(self):
        return '%s..%s' % (self.left, self.right)

    def __eq__(self, other):
        return isinstance(other, Descendants) and self.left == other.left and self.right == other.right

    def __repr__(self):
        return '%s(%r, %r)' % (self.__class__.__name__, self.left, self.right)


class Union(JSONPath):
    """
    JSONPath that returns the union of the results of each match.
    This is pretty shoddily implemented for now. The nicest semantics
    in case of mismatched bits (list vs atomic) is to put
    them all in a list, but I haven't done that yet.

    WARNING: Any appearance of this being the _concatenation_ is
    coincidence. It may even be a bug! (or laziness)
    """
    def __init__(self, left, right):
        self.left = left
        self.right = right

    def is_singular(self):
        return False

    def find(self, data):
        """Return the left-hand matches followed by the right-hand matches."""
        return self.left.find(data) + self.right.find(data)


class Intersect(JSONPath):
    """
    JSONPath for bits that match *both* patterns.

    This can be accomplished a couple of ways. The most
    efficient is to actually build the intersected
    AST as in building a state machine for matching the
    intersection of regular languages. The next
    idea is to build a filtered data and match against
    that.
    """
    def __init__(self, left, right):
        self.left = left
        self.right = right

    def is_singular(self):
        return False

    def find(self, data):
        # Not implemented yet; see the class docstring for the candidate approaches.
        raise NotImplementedError()


class Fields(JSONPath):
    """
    JSONPath referring to some field of the current object.
    Concrete syntax is comma-separated field names.

    WARNING: If '*' is any of the field names, then they will
    all be returned.
    """

    def __init__(self, *fields):
        self.fields = fields

    @staticmethod
    def get_field_datum(datum, field, create):
        """
        Return the datum for `field` of `datum.value`, or `None` if there is none.

        When `field` equals the global `auto_id_field` an `AutoIdForDatum` is returned.
        When the field is missing and `create` is true, an empty dict is stored under
        it first. Values that do not behave like a dict yield `None`.
        """
        if field == auto_id_field:
            return AutoIdForDatum(datum)
        try:
            field_value = datum.value.get(field, NOT_SET)
            if field_value is NOT_SET:
                if create:
                    datum.value[field] = field_value = {}
                else:
                    return None
            return DatumInContext(field_value, path=Fields(field), context=datum)
        except (TypeError, AttributeError):
            return None

    def reified_fields(self, datum):
        """
        Return the concrete field names to look at for `datum`: the configured names,
        or - when '*' is among them - every key of `datum.value` (plus the auto id
        field when enabled). Values without `keys()` yield an empty tuple.
        """
        if '*' not in self.fields:
            return self.fields

        try:
            fields = tuple(datum.value.keys())
        except AttributeError:
            return ()
        return fields if auto_id_field is None else fields + (auto_id_field,)

    def find(self, datum):
        return self._find_base(datum, create=False)

    def find_or_create(self, datum):
        return self._find_base(datum, create=True)

    def _find_base(self, datum, create):
        """Collect the datum of every reified field, skipping those that do not exist."""
        datum = DatumInContext.wrap(datum)
        field_data = [self.get_field_datum(datum, field, create)
                      for field in self.reified_fields(datum)]
        return [fd for fd in field_data if fd is not None]

    def update(self, data, val):
        return self._update_base(data, val, create=False)

    def update_or_create(self, data, val):
        return self._update_base(data, val, create=True)

    def _update_base(self, data, val, create):
        """
        Set every reified field present in `data` to `val`; returns `data`.

        If `val` is callable it is invoked as `val(current_value, data, field)` instead
        of being assigned. With `create`, missing fields are first created as `{}`.
        """
        if data is not None:
            for field in self.reified_fields(DatumInContext.wrap(data)):
                if field not in data and create:
                    data[field] = {}
                if field in data:
                    if hasattr(val, '__call__'):
                        val(data[field], data, field)
                    else:
                        data[field] = val
        return data

    def filter(self, fn, data):
        """Remove every reified field whose value makes `fn` return true; returns `data`."""
        if data is not None:
            for field in self.reified_fields(DatumInContext.wrap(data)):
                if field in data and fn(data[field]):
                    data.pop(field)
        return data

    def __str__(self):
        return ','.join(map(str, self.fields))

    def __repr__(self):
        return '%s(%s)' % (self.__class__.__name__, ','.join(map(repr, self.fields)))

    def __eq__(self, other):
        return isinstance(other, Fields) and tuple(self.fields) == tuple(other.fields)


class Index(JSONPath):
    """
    JSONPath that matches indices of the current datum, or none if not large enough.
    Concrete syntax is brackets.

    WARNING: If the datum is None or not long enough, it will not crash but will not match anything.
    NOTE: For the concrete syntax of `[*]`, the abstract syntax is a Slice() with no parameters (equiv to `[:]`)
    """

    def __init__(self, index):
        self.index = index

    def find(self, datum):
        return self._find_base(datum, create=False)

    def find_or_create(self, datum):
        return self._find_base(datum, create=True)

    def _find_base(self, datum, create):
        """
        Return the element at `self.index` as a one-element list, or `[]` if the value
        is empty or too short. With `create`, an empty dict is first turned into a
        placeholder list and the list is padded with `{}` up to the index.
        """
        datum = DatumInContext.wrap(datum)
        if create:
            if datum.value == {}:
                datum.value = _create_list_key(datum.value)
            self._pad_value(datum.value)
        if datum.value and len(datum.value) > self.index:
            return [DatumInContext(datum.value[self.index], path=self, context=datum)]
        else:
            return []

    def update(self, data, val):
        return self._update_base(data, val, create=False)

    def update_or_create(self, data, val):
        return self._update_base(data, val, create=True)

    def _update_base(self, data, val, create):
        """
        Set `data[self.index]` to `val` (or call `val(current, data, index)` when `val`
        is callable) if `data` is long enough; returns `data`.

        With `create`, an empty dict is first turned into a placeholder list and the
        list is padded with `{}` up to the index; in the empty-dict case the returned
        object is that new list.
        """
        if create:
            if data == {}:
                data = _create_list_key(data)
            self._pad_value(data)
        # The length check guards the callable branch as well, so a too-short
        # sequence is silently skipped instead of raising IndexError.
        if len(data) > self.index:
            if hasattr(val, '__call__'):
                val.__call__(data[self.index], data, self.index)
            else:
                data[self.index] = val
        return data

    def filter(self, fn, data):
        """Remove the element at `self.index` if it exists and `fn` returns true for it."""
        if len(data) > self.index and fn(data[self.index]):
            data.pop(self.index)  # relies on mutation :(
        return data

    def __eq__(self, other):
        return isinstance(other, Index) and self.index == other.index

    def __str__(self):
        return '[%i]' % self.index

    def __repr__(self):
        return '%s(index=%r)' % (self.__class__.__name__, self.index)

    def _pad_value(self, value):
        """Extend `value` in place with empty dicts until `self.index` is a valid index."""
        if len(value) <= self.index:
            pad = self.index - len(value) + 1
            value += [{} for __ in range(pad)]


class Slice(JSONPath):
    """
    JSONPath matching a slice of an array.

    Because of a mismatch between JSON and XML when schema-unaware,
    this always returns an iterable; if the incoming data
    was not a list, then it returns a one element list _containing_ that
    data.

    Consider these two docs, and their schema-unaware translation to JSON:

        <a><b>hello</b></a> ==> {"a": {"b": "hello"}}
        <a><b>hello</b><b>goodbye</b></a> ==> {"a": {"b": ["hello", "goodbye"]}}

    If there were a schema, it would be known that "b" should always be an
    array (unless the schema were wonky, but that is too much to fix here)
    so when querying with JSON if the one writing the JSON knows that it
    should be an array, they can write a slice operator and it will coerce
    a non-array value to an array.

    This may be a bit unfortunate because it would be nice to always have
    an iterator, but dictionaries and other objects may also be iterable,
    so this is the compromise.
    """
    def __init__(self, start=None, end=None, step=None):
        self.start = start
        self.end = end
        self.step = step

    def _is_wildcard(self):
        """True when no bound is set, i.e. the slice is `[*]`."""
        return self.start is None and self.end is None and self.step is None

    def find(self, datum):
        """Return the selected elements of `datum.value`, each with an `Index` path."""
        datum = DatumInContext.wrap(datum)

        # Used for catching null value instead of empty list in path
        # NOTE(review): this also short-circuits other falsy values (0, '', {}),
        # which therefore never get coerced below - kept as is for compatibility.
        if not datum.value:
            return []
        # Here's the hack. If it is a dictionary or some kind of constant,
        # put it in a single-element list. Floats are included so they are
        # coerced like ints instead of failing on len() further down.
        coerced_types = (dict, float) + tuple(six.integer_types) + tuple(six.string_types)
        if isinstance(datum.value, coerced_types):
            return self.find(DatumInContext([datum.value], path=datum.path, context=datum.context))

        # Some iterators do not support slicing but we can still
        # at least work for '*'
        if self._is_wildcard():
            return [DatumInContext(datum.value[i], path=Index(i), context=datum) for i in xrange(0, len(datum.value))]
        else:
            return [DatumInContext(datum.value[i], path=Index(i), context=datum) for i in range(0, len(datum.value))[self.start:self.end:self.step]]

    def update(self, data, val):
        """Update every selected index of `data` with `val`; returns `data`."""
        for datum in self.find(data):
            datum.path.update(data, val)
        return data

    def filter(self, fn, data):
        """
        Remove every selected element for which `fn` returns true; returns `data`.

        Removing an element shifts the following indices, so after each removal the
        matches are recomputed; the loop ends once a full pass removes nothing.
        """
        while True:
            length = len(data)
            for datum in self.find(data):
                data = datum.path.filter(fn, data)
                if len(data) < length:
                    break

            if length == len(data):
                break
        return data

    def __str__(self):
        if self._is_wildcard():
            return '[*]'

        # Always emit the ':' after start so that a slice can never be confused with
        # an index ('[1:]' vs '[1]') and a lone step is not rendered as an end
        # ('[::2]' vs '[:2]'). Bounds are compared against None so 0 is kept.
        text = '%s:%s' % ('' if self.start is None else self.start,
                          '' if self.end is None else '%d' % self.end)
        if self.step is not None:
            text += ':%d' % self.step
        return '[%s]' % text

    def __repr__(self):
        return '%s(start=%r,end=%r,step=%r)' % (self.__class__.__name__, self.start, self.end, self.step)

    def __eq__(self, other):
        return isinstance(other, Slice) and other.start == self.start and self.end == other.end and other.step == self.step


def _create_list_key(dict_):
    """
    Adds a list to a dictionary by reference and returns the list.

    See `_clean_list_keys()`
    """
    dict_[LIST_KEY] = new_list = [{}]
    return new_list


def _clean_list_keys(dict_):
    """
    Replace {LIST_KEY: ['foo', 'bar']} with ['foo', 'bar'].

    Dicts and lists are processed recursively and in place; any other value
    (including a top-level non-container) is returned unchanged.

    >>> _clean_list_keys({LIST_KEY: ['foo', 'bar']})
    ['foo', 'bar']

    """
    if isinstance(dict_, list):
        # The top-level data may itself be a list (e.g. a path starting with an index).
        dict_[:] = [_clean_list_keys(v) if isinstance(v, (dict, list)) else v
                    for v in dict_]
        return dict_
    if not isinstance(dict_, dict):
        return dict_

    # Snapshot the items: values are reassigned while walking.
    for key, value in list(dict_.items()):
        if isinstance(value, (dict, list)):
            dict_[key] = _clean_list_keys(value)
    if LIST_KEY in dict_:
        return dict_[LIST_KEY]
    return dict_