1#!/usr/bin/env python 2 3# In order to run the docstrings: 4# python3 -m deepdiff.diff 5# You might need to run it many times since dictionaries come in different orders 6# every time you run the docstrings. 7# However the docstring expects it in a specific order in order to pass! 8import difflib 9import logging 10from copy import deepcopy 11from math import isclose as is_close 12from collections.abc import Mapping, Iterable 13from collections import defaultdict 14from itertools import zip_longest 15from ordered_set import OrderedSet 16from deepdiff.helper import (strings, bytes_type, numbers, times, ListItemRemovedOrAdded, notpresent, 17 IndexedHash, unprocessed, add_to_frozen_set, 18 convert_item_or_items_into_set_else_none, get_type, 19 convert_item_or_items_into_compiled_regexes_else_none, 20 type_is_subclass_of_type_group, type_in_type_group, get_doc, 21 number_to_string, datetime_normalize, KEY_TO_VAL_STR, booleans, 22 np_ndarray, get_numpy_ndarray_rows, OrderedSetPlus, RepeatedTimer, 23 TEXT_VIEW, TREE_VIEW, DELTA_VIEW, 24 np, get_truncate_datetime, dict_, CannotCompare) 25from deepdiff.serialization import SerializationMixin 26from deepdiff.distance import DistanceMixin 27from deepdiff.model import ( 28 RemapDict, ResultDict, TextResult, TreeResult, DiffLevel, 29 DictRelationship, AttributeRelationship, 30 SubscriptableIterableRelationship, NonSubscriptableIterableRelationship, 31 SetRelationship, NumpyArrayRelationship, CUSTOM_FIELD) 32from deepdiff.deephash import DeepHash, combine_hashes_lists 33from deepdiff.base import Base 34from deepdiff.lfucache import LFUCache, DummyLFU 35 36logger = logging.getLogger(__name__) 37 38MAX_PASSES_REACHED_MSG = ( 39 'DeepDiff has reached the max number of passes of {}. ' 40 'You can possibly get more accurate results by increasing the max_passes parameter.') 41 42MAX_DIFFS_REACHED_MSG = ( 43 'DeepDiff has reached the max number of diffs of {}. ' 44 'You can possibly get more accurate results by increasing the max_diffs parameter.') 45 46 47notpresent_indexed = IndexedHash(indexes=[0], item=notpresent) 48 49doc = get_doc('diff_doc.rst') 50 51 52PROGRESS_MSG = "DeepDiff {} seconds in progress. Pass #{}, Diff #{}" 53 54 55def _report_progress(_stats, progress_logger, duration): 56 """ 57 Report the progress every few seconds. 58 """ 59 progress_logger(PROGRESS_MSG.format(duration, _stats[PASSES_COUNT], _stats[DIFF_COUNT])) 60 61 62DISTANCE_CACHE_HIT_COUNT = 'DISTANCE CACHE HIT COUNT' 63DIFF_COUNT = 'DIFF COUNT' 64PASSES_COUNT = 'PASSES COUNT' 65MAX_PASS_LIMIT_REACHED = 'MAX PASS LIMIT REACHED' 66MAX_DIFF_LIMIT_REACHED = 'MAX DIFF LIMIT REACHED' 67DISTANCE_CACHE_ENABLED = 'DISTANCE CACHE ENABLED' 68PREVIOUS_DIFF_COUNT = 'PREVIOUS DIFF COUNT' 69PREVIOUS_DISTANCE_CACHE_HIT_COUNT = 'PREVIOUS DISTANCE CACHE HIT COUNT' 70CANT_FIND_NUMPY_MSG = 'Unable to import numpy. This must be a bug in DeepDiff since a numpy array is detected.' 71INVALID_VIEW_MSG = 'The only valid values for the view parameter are text and tree. But {} was passed.' 72CUTOFF_RANGE_ERROR_MSG = 'cutoff_distance_for_pairs needs to be a positive float max 1.' 73VERBOSE_LEVEL_RANGE_MSG = 'verbose_level should be 0, 1, or 2.' 74PURGE_LEVEL_RANGE_MSG = 'cache_purge_level should be 0, 1, or 2.' 75_ENABLE_CACHE_EVERY_X_DIFF = '_ENABLE_CACHE_EVERY_X_DIFF' 76 77# What is the threshold to consider 2 items to be pairs. Only used when ignore_order = True. 78CUTOFF_DISTANCE_FOR_PAIRS_DEFAULT = 0.3 79 80# What is the threshold to calculate pairs of items between 2 iterables. 81# For example 2 iterables that have nothing in common, do not need their pairs to be calculated. 82CUTOFF_INTERSECTION_FOR_PAIRS_DEFAULT = 0.7 83 84DEEPHASH_PARAM_KEYS = ( 85 'exclude_types', 86 'exclude_paths', 87 'exclude_regex_paths', 88 'hasher', 89 'significant_digits', 90 'number_format_notation', 91 'ignore_string_type_changes', 92 'ignore_numeric_type_changes', 93 'ignore_type_in_groups', 94 'ignore_type_subclasses', 95 'ignore_string_case', 96 'exclude_obj_callback', 97 'ignore_private_variables',) 98 99 100class DeepDiff(ResultDict, SerializationMixin, DistanceMixin, Base): 101 __doc__ = doc 102 103 CACHE_AUTO_ADJUST_THRESHOLD = 0.25 104 105 def __init__(self, 106 t1, 107 t2, 108 cutoff_distance_for_pairs=CUTOFF_DISTANCE_FOR_PAIRS_DEFAULT, 109 cutoff_intersection_for_pairs=CUTOFF_INTERSECTION_FOR_PAIRS_DEFAULT, 110 cache_size=0, 111 cache_tuning_sample_size=0, 112 cache_purge_level=1, 113 exclude_paths=None, 114 exclude_regex_paths=None, 115 exclude_types=None, 116 exclude_obj_callback=None, 117 get_deep_distance=False, 118 group_by=None, 119 hasher=None, 120 hashes=None, 121 ignore_order=False, 122 ignore_order_func=None, 123 ignore_type_in_groups=None, 124 ignore_string_type_changes=False, 125 ignore_numeric_type_changes=False, 126 ignore_type_subclasses=False, 127 ignore_string_case=False, 128 ignore_nan_inequality=False, 129 ignore_private_variables=True, 130 log_frequency_in_sec=0, 131 math_epsilon=None, 132 max_passes=10000000, 133 max_diffs=None, 134 number_format_notation="f", 135 number_to_string_func=None, 136 progress_logger=logger.info, 137 report_repetition=False, 138 significant_digits=None, 139 truncate_datetime=None, 140 verbose_level=1, 141 view=TEXT_VIEW, 142 iterable_compare_func=None, 143 custom_operators=None, 144 _original_type=None, 145 _parameters=None, 146 _shared_parameters=None, 147 **kwargs): 148 super().__init__() 149 if kwargs: 150 raise ValueError(( 151 "The following parameter(s) are not valid: %s\n" 152 "The valid parameters are ignore_order, report_repetition, significant_digits, " 153 "number_format_notation, exclude_paths, exclude_types, exclude_regex_paths, ignore_type_in_groups, " 154 "ignore_string_type_changes, ignore_numeric_type_changes, ignore_type_subclasses, truncate_datetime, " 155 "ignore_private_variables, ignore_nan_inequality, number_to_string_func, verbose_level, " 156 "view, hasher, hashes, max_passes, max_diffs, " 157 "cutoff_distance_for_pairs, cutoff_intersection_for_pairs, log_frequency_in_sec, cache_size, " 158 "cache_tuning_sample_size, get_deep_distance, group_by, cache_purge_level, " 159 "math_epsilon, iterable_compare_func, _original_type, " 160 "ignore_order_func, custom_operators, " 161 "_parameters and _shared_parameters.") % ', '.join(kwargs.keys())) 162 163 if _parameters: 164 self.__dict__.update(_parameters) 165 else: 166 self.custom_operators = custom_operators or [] 167 self.ignore_order = ignore_order 168 169 self.ignore_order_func = ignore_order_func or (lambda *_args, **_kwargs: ignore_order) 170 171 ignore_type_in_groups = ignore_type_in_groups or [] 172 if numbers == ignore_type_in_groups or numbers in ignore_type_in_groups: 173 ignore_numeric_type_changes = True 174 self.ignore_numeric_type_changes = ignore_numeric_type_changes 175 if strings == ignore_type_in_groups or strings in ignore_type_in_groups: 176 ignore_string_type_changes = True 177 self.ignore_string_type_changes = ignore_string_type_changes 178 self.ignore_type_in_groups = self.get_ignore_types_in_groups( 179 ignore_type_in_groups=ignore_type_in_groups, 180 ignore_string_type_changes=ignore_string_type_changes, 181 ignore_numeric_type_changes=ignore_numeric_type_changes, 182 ignore_type_subclasses=ignore_type_subclasses) 183 self.report_repetition = report_repetition 184 self.exclude_paths = convert_item_or_items_into_set_else_none(exclude_paths) 185 self.exclude_regex_paths = convert_item_or_items_into_compiled_regexes_else_none(exclude_regex_paths) 186 self.exclude_types = set(exclude_types) if exclude_types else None 187 self.exclude_types_tuple = tuple(exclude_types) if exclude_types else None # we need tuple for checking isinstance 188 self.ignore_type_subclasses = ignore_type_subclasses 189 self.type_check_func = type_is_subclass_of_type_group if ignore_type_subclasses else type_in_type_group 190 self.ignore_string_case = ignore_string_case 191 self.exclude_obj_callback = exclude_obj_callback 192 self.number_to_string = number_to_string_func or number_to_string 193 self.iterable_compare_func = iterable_compare_func 194 self.ignore_private_variables = ignore_private_variables 195 self.ignore_nan_inequality = ignore_nan_inequality 196 self.hasher = hasher 197 self.cache_tuning_sample_size = cache_tuning_sample_size 198 self.group_by = group_by 199 200 self.significant_digits = self.get_significant_digits(significant_digits, ignore_numeric_type_changes) 201 self.math_epsilon = math_epsilon 202 if self.math_epsilon is not None and self.ignore_order: 203 logger.warning("math_epsilon will be ignored. It cannot be used when ignore_order is True.") 204 self.truncate_datetime = get_truncate_datetime(truncate_datetime) 205 self.number_format_notation = number_format_notation 206 if verbose_level in {0, 1, 2}: 207 self.verbose_level = verbose_level 208 else: 209 raise ValueError(VERBOSE_LEVEL_RANGE_MSG) 210 if cache_purge_level not in {0, 1, 2}: 211 raise ValueError(PURGE_LEVEL_RANGE_MSG) 212 self.view = view 213 # Setting up the cache for dynamic programming. One dictionary per instance of root of DeepDiff running. 214 self.max_passes = max_passes 215 self.max_diffs = max_diffs 216 self.cutoff_distance_for_pairs = float(cutoff_distance_for_pairs) 217 self.cutoff_intersection_for_pairs = float(cutoff_intersection_for_pairs) 218 if self.cutoff_distance_for_pairs < 0 or self.cutoff_distance_for_pairs > 1: 219 raise ValueError(CUTOFF_RANGE_ERROR_MSG) 220 # _Parameters are the clean _parameters to initialize DeepDiff with so we avoid all the above 221 # cleaning functionalities when running DeepDiff recursively. 222 # However DeepHash has its own set of _parameters that are slightly different than DeepDIff. 223 # DeepDiff _parameters are transformed to DeepHash _parameters via _get_deephash_params method. 224 self.progress_logger = progress_logger 225 self.cache_size = cache_size 226 _parameters = self.__dict__.copy() 227 228 # Non-Root 229 if _shared_parameters: 230 self.is_root = False 231 self._shared_parameters = _shared_parameters 232 self.__dict__.update(_shared_parameters) 233 # We are in some pass other than root 234 progress_timer = None 235 # Root 236 else: 237 self.is_root = True 238 # Caching the DeepDiff results for dynamic programming 239 self._distance_cache = LFUCache(cache_size) if cache_size else DummyLFU() 240 self._stats = { 241 PASSES_COUNT: 0, 242 DIFF_COUNT: 0, 243 DISTANCE_CACHE_HIT_COUNT: 0, 244 PREVIOUS_DIFF_COUNT: 0, 245 PREVIOUS_DISTANCE_CACHE_HIT_COUNT: 0, 246 MAX_PASS_LIMIT_REACHED: False, 247 MAX_DIFF_LIMIT_REACHED: False, 248 DISTANCE_CACHE_ENABLED: bool(cache_size), 249 } 250 self.hashes = dict_() if hashes is None else hashes 251 self._numpy_paths = dict_() # if _numpy_paths is None else _numpy_paths 252 self._shared_parameters = { 253 'hashes': self.hashes, 254 '_stats': self._stats, 255 '_distance_cache': self._distance_cache, 256 '_numpy_paths': self._numpy_paths, 257 _ENABLE_CACHE_EVERY_X_DIFF: self.cache_tuning_sample_size * 10, 258 } 259 if log_frequency_in_sec: 260 # Creating a progress log reporter that runs in a separate thread every log_frequency_in_sec seconds. 261 progress_timer = RepeatedTimer(log_frequency_in_sec, _report_progress, self._stats, progress_logger) 262 else: 263 progress_timer = None 264 265 self._parameters = _parameters 266 self.deephash_parameters = self._get_deephash_params() 267 self.tree = TreeResult() 268 if group_by and self.is_root: 269 try: 270 original_t1 = t1 271 t1 = self._group_iterable_to_dict(t1, group_by, item_name='t1') 272 except (KeyError, ValueError): 273 pass 274 else: 275 try: 276 t2 = self._group_iterable_to_dict(t2, group_by, item_name='t2') 277 except (KeyError, ValueError): 278 t1 = original_t1 279 280 self.t1 = t1 281 self.t2 = t2 282 283 try: 284 root = DiffLevel(t1, t2, verbose_level=self.verbose_level) 285 # _original_type is only used to pass the original type of the data. Currently only used for numpy arrays. 286 # The reason is that we convert the numpy array to python list and then later for distance calculations 287 # we convert only the the last dimension of it into numpy arrays. 288 self._diff(root, parents_ids=frozenset({id(t1)}), _original_type=_original_type) 289 290 if get_deep_distance and view in {TEXT_VIEW, TREE_VIEW}: 291 self.tree['deep_distance'] = self._get_rough_distance() 292 293 self.tree.remove_empty_keys() 294 view_results = self._get_view_results(self.view) 295 self.update(view_results) 296 finally: 297 if self.is_root: 298 if cache_purge_level: 299 del self._distance_cache 300 del self.hashes 301 del self._shared_parameters 302 del self._parameters 303 for key in (PREVIOUS_DIFF_COUNT, PREVIOUS_DISTANCE_CACHE_HIT_COUNT, 304 DISTANCE_CACHE_ENABLED): 305 del self._stats[key] 306 if progress_timer: 307 duration = progress_timer.stop() 308 self._stats['DURATION SEC'] = duration 309 logger.info('stats {}'.format(self.get_stats())) 310 if cache_purge_level == 2: 311 self.__dict__.clear() 312 313 def _get_deephash_params(self): 314 result = {key: self._parameters[key] for key in DEEPHASH_PARAM_KEYS} 315 result['ignore_repetition'] = not self.report_repetition 316 result['number_to_string_func'] = self.number_to_string 317 return result 318 319 def _report_result(self, report_type, level): 320 """ 321 Add a detected change to the reference-style result dictionary. 322 report_type will be added to level. 323 (We'll create the text-style report from there later.) 324 :param report_type: A well defined string key describing the type of change. 325 Examples: "set_item_added", "values_changed" 326 :param parent: A DiffLevel object describing the objects in question in their 327 before-change and after-change object structure. 328 329 :rtype: None 330 """ 331 332 if not self._skip_this(level): 333 level.report_type = report_type 334 self.tree[report_type].add(level) 335 336 def custom_report_result(self, report_type, level, extra_info=None): 337 """ 338 Add a detected change to the reference-style result dictionary. 339 report_type will be added to level. 340 (We'll create the text-style report from there later.) 341 :param report_type: A well defined string key describing the type of change. 342 Examples: "set_item_added", "values_changed" 343 :param parent: A DiffLevel object describing the objects in question in their 344 before-change and after-change object structure. 345 :param extra_info: A dict that describe this result 346 :rtype: None 347 """ 348 349 if not self._skip_this(level): 350 level.report_type = report_type 351 level.additional[CUSTOM_FIELD] = extra_info 352 self.tree[report_type].add(level) 353 354 @staticmethod 355 def _dict_from_slots(object): 356 def unmangle(attribute): 357 if attribute.startswith('__') and attribute != '__weakref__': 358 return '_{type}{attribute}'.format( 359 type=type(object).__name__, 360 attribute=attribute 361 ) 362 return attribute 363 364 all_slots = [] 365 366 if isinstance(object, type): 367 mro = object.__mro__ # pragma: no cover. I have not been able to write a test for this case. But we still check for it. 368 else: 369 mro = object.__class__.__mro__ 370 371 for type_in_mro in mro: 372 slots = getattr(type_in_mro, '__slots__', None) 373 if slots: 374 if isinstance(slots, strings): 375 all_slots.append(slots) 376 else: 377 all_slots.extend(slots) 378 379 return {i: getattr(object, unmangle(i)) for i in all_slots} 380 381 def _diff_obj(self, level, parents_ids=frozenset(), 382 is_namedtuple=False): 383 """Difference of 2 objects""" 384 try: 385 if is_namedtuple: 386 t1 = level.t1._asdict() 387 t2 = level.t2._asdict() 388 else: 389 t1 = level.t1.__dict__ 390 t2 = level.t2.__dict__ 391 except AttributeError: 392 try: 393 t1 = self._dict_from_slots(level.t1) 394 t2 = self._dict_from_slots(level.t2) 395 except AttributeError: 396 self._report_result('unprocessed', level) 397 return 398 399 self._diff_dict( 400 level, 401 parents_ids, 402 print_as_attribute=True, 403 override=True, 404 override_t1=t1, 405 override_t2=t2) 406 407 def _skip_this(self, level): 408 """ 409 Check whether this comparison should be skipped because one of the objects to compare meets exclusion criteria. 410 :rtype: bool 411 """ 412 skip = False 413 if self.exclude_paths and level.path() in self.exclude_paths: 414 skip = True 415 elif self.exclude_regex_paths and any( 416 [exclude_regex_path.search(level.path()) for exclude_regex_path in self.exclude_regex_paths]): 417 skip = True 418 elif self.exclude_types_tuple and \ 419 (isinstance(level.t1, self.exclude_types_tuple) or isinstance(level.t2, self.exclude_types_tuple)): 420 skip = True 421 elif self.exclude_obj_callback and \ 422 (self.exclude_obj_callback(level.t1, level.path()) or self.exclude_obj_callback(level.t2, level.path())): 423 skip = True 424 425 return skip 426 427 def _get_clean_to_keys_mapping(self, keys, level): 428 """ 429 Get a dictionary of cleaned value of keys to the keys themselves. 430 This is mainly used to transform the keys when the type changes of keys should be ignored. 431 432 TODO: needs also some key conversion for groups of types other than the built-in strings and numbers. 433 """ 434 result = dict_() 435 for key in keys: 436 if self.ignore_string_type_changes and isinstance(key, bytes): 437 clean_key = key.decode('utf-8') 438 elif isinstance(key, numbers): 439 type_ = "number" if self.ignore_numeric_type_changes else key.__class__.__name__ 440 clean_key = self.number_to_string(key, significant_digits=self.significant_digits, 441 number_format_notation=self.number_format_notation) 442 clean_key = KEY_TO_VAL_STR.format(type_, clean_key) 443 else: 444 clean_key = key 445 if clean_key in result: 446 logger.warning(('{} and {} in {} become the same key when ignore_numeric_type_changes' 447 'or ignore_numeric_type_changes are set to be true.').format( 448 key, result[clean_key], level.path())) 449 else: 450 result[clean_key] = key 451 return result 452 453 def _diff_dict(self, 454 level, 455 parents_ids=frozenset([]), 456 print_as_attribute=False, 457 override=False, 458 override_t1=None, 459 override_t2=None): 460 """Difference of 2 dictionaries""" 461 if override: 462 # for special stuff like custom objects and named tuples we receive preprocessed t1 and t2 463 # but must not spoil the chain (=level) with it 464 t1 = override_t1 465 t2 = override_t2 466 else: 467 t1 = level.t1 468 t2 = level.t2 469 470 if print_as_attribute: 471 item_added_key = "attribute_added" 472 item_removed_key = "attribute_removed" 473 rel_class = AttributeRelationship 474 else: 475 item_added_key = "dictionary_item_added" 476 item_removed_key = "dictionary_item_removed" 477 rel_class = DictRelationship 478 479 if self.ignore_private_variables: 480 t1_keys = OrderedSet([key for key in t1 if not(isinstance(key, str) and key.startswith('__'))]) 481 t2_keys = OrderedSet([key for key in t2 if not(isinstance(key, str) and key.startswith('__'))]) 482 else: 483 t1_keys = OrderedSet(t1.keys()) 484 t2_keys = OrderedSet(t2.keys()) 485 if self.ignore_string_type_changes or self.ignore_numeric_type_changes: 486 t1_clean_to_keys = self._get_clean_to_keys_mapping(keys=t1_keys, level=level) 487 t2_clean_to_keys = self._get_clean_to_keys_mapping(keys=t2_keys, level=level) 488 t1_keys = OrderedSet(t1_clean_to_keys.keys()) 489 t2_keys = OrderedSet(t2_clean_to_keys.keys()) 490 else: 491 t1_clean_to_keys = t2_clean_to_keys = None 492 493 t_keys_intersect = t2_keys.intersection(t1_keys) 494 495 t_keys_added = t2_keys - t_keys_intersect 496 t_keys_removed = t1_keys - t_keys_intersect 497 498 for key in t_keys_added: 499 if self._count_diff() is StopIteration: 500 return 501 502 key = t2_clean_to_keys[key] if t2_clean_to_keys else key 503 change_level = level.branch_deeper( 504 notpresent, 505 t2[key], 506 child_relationship_class=rel_class, 507 child_relationship_param=key) 508 self._report_result(item_added_key, change_level) 509 510 for key in t_keys_removed: 511 if self._count_diff() is StopIteration: 512 return # pragma: no cover. This is already covered for addition. 513 514 key = t1_clean_to_keys[key] if t1_clean_to_keys else key 515 change_level = level.branch_deeper( 516 t1[key], 517 notpresent, 518 child_relationship_class=rel_class, 519 child_relationship_param=key) 520 self._report_result(item_removed_key, change_level) 521 522 for key in t_keys_intersect: # key present in both dicts - need to compare values 523 if self._count_diff() is StopIteration: 524 return # pragma: no cover. This is already covered for addition. 525 526 key1 = t1_clean_to_keys[key] if t1_clean_to_keys else key 527 key2 = t2_clean_to_keys[key] if t2_clean_to_keys else key 528 item_id = id(t1[key1]) 529 if parents_ids and item_id in parents_ids: 530 continue 531 parents_ids_added = add_to_frozen_set(parents_ids, item_id) 532 533 # Go one level deeper 534 next_level = level.branch_deeper( 535 t1[key1], 536 t2[key2], 537 child_relationship_class=rel_class, 538 child_relationship_param=key) 539 self._diff(next_level, parents_ids_added) 540 541 def _diff_set(self, level): 542 """Difference of sets""" 543 t1_hashtable = self._create_hashtable(level, 't1') 544 t2_hashtable = self._create_hashtable(level, 't2') 545 546 t1_hashes = set(t1_hashtable.keys()) 547 t2_hashes = set(t2_hashtable.keys()) 548 549 hashes_added = t2_hashes - t1_hashes 550 hashes_removed = t1_hashes - t2_hashes 551 552 items_added = [t2_hashtable[i].item for i in hashes_added] 553 items_removed = [t1_hashtable[i].item for i in hashes_removed] 554 555 for item in items_added: 556 if self._count_diff() is StopIteration: 557 return # pragma: no cover. This is already covered for addition. 558 559 change_level = level.branch_deeper( 560 notpresent, item, child_relationship_class=SetRelationship) 561 self._report_result('set_item_added', change_level) 562 563 for item in items_removed: 564 if self._count_diff() is StopIteration: 565 return # pragma: no cover. This is already covered for addition. 566 567 change_level = level.branch_deeper( 568 item, notpresent, child_relationship_class=SetRelationship) 569 self._report_result('set_item_removed', change_level) 570 571 @staticmethod 572 def _iterables_subscriptable(t1, t2): 573 try: 574 if getattr(t1, '__getitem__') and getattr(t2, '__getitem__'): 575 return True 576 else: # pragma: no cover 577 return False # should never happen 578 except AttributeError: 579 return False 580 581 def _diff_iterable(self, level, parents_ids=frozenset(), _original_type=None): 582 """Difference of iterables""" 583 if self.ignore_order_func(level): 584 self._diff_iterable_with_deephash(level, parents_ids, _original_type=_original_type) 585 else: 586 self._diff_iterable_in_order(level, parents_ids, _original_type=_original_type) 587 588 def _compare_in_order(self, level): 589 """ 590 Default compare if `iterable_compare_func` is not provided. 591 This will compare in sequence order. 592 """ 593 594 return [((i, i), (x, y)) for i, (x, y) in enumerate( 595 zip_longest( 596 level.t1, level.t2, fillvalue=ListItemRemovedOrAdded))] 597 598 def _get_matching_pairs(self, level): 599 """ 600 Given a level get matching pairs. This returns list of two tuples in the form: 601 [ 602 (t1 index, t2 index), (t1 item, t2 item) 603 ] 604 605 This will compare using the passed in `iterable_compare_func` if available. 606 Default it to compare in order 607 """ 608 609 if(self.iterable_compare_func is None): 610 # Match in order if there is no compare function provided 611 return self._compare_in_order(level) 612 try: 613 matches = [] 614 y_matched = set() 615 y_index_matched = set() 616 for i, x in enumerate(level.t1): 617 x_found = False 618 for j, y in enumerate(level.t2): 619 620 if(j in y_index_matched): 621 # This ensures a one-to-one relationship of matches from t1 to t2. 622 # If y this index in t2 has already been matched to another x 623 # it cannot have another match, so just continue. 624 continue 625 626 if(self.iterable_compare_func(x, y, level)): 627 deep_hash = DeepHash(y, 628 hashes=self.hashes, 629 apply_hash=True, 630 **self.deephash_parameters, 631 ) 632 y_index_matched.add(j) 633 y_matched.add(deep_hash[y]) 634 matches.append(((i, j), (x, y))) 635 x_found = True 636 break 637 638 if(not x_found): 639 matches.append(((i, -1), (x, ListItemRemovedOrAdded))) 640 for j, y in enumerate(level.t2): 641 642 deep_hash = DeepHash(y, 643 hashes=self.hashes, 644 apply_hash=True, 645 **self.deephash_parameters, 646 ) 647 if(deep_hash[y] not in y_matched): 648 matches.append(((-1, j), (ListItemRemovedOrAdded, y))) 649 return matches 650 except CannotCompare: 651 return self._compare_in_order(level) 652 653 def _diff_iterable_in_order(self, level, parents_ids=frozenset(), _original_type=None): 654 # We're handling both subscriptable and non-subscriptable iterables. Which one is it? 655 subscriptable = self._iterables_subscriptable(level.t1, level.t2) 656 if subscriptable: 657 child_relationship_class = SubscriptableIterableRelationship 658 else: 659 child_relationship_class = NonSubscriptableIterableRelationship 660 661 for (i, j), (x, y) in self._get_matching_pairs(level): 662 if self._count_diff() is StopIteration: 663 return # pragma: no cover. This is already covered for addition. 664 665 if y is ListItemRemovedOrAdded: # item removed completely 666 change_level = level.branch_deeper( 667 x, 668 notpresent, 669 child_relationship_class=child_relationship_class, 670 child_relationship_param=i) 671 self._report_result('iterable_item_removed', change_level) 672 673 elif x is ListItemRemovedOrAdded: # new item added 674 change_level = level.branch_deeper( 675 notpresent, 676 y, 677 child_relationship_class=child_relationship_class, 678 child_relationship_param=j) 679 self._report_result('iterable_item_added', change_level) 680 681 else: # check if item value has changed 682 683 if (i != j): 684 # Item moved 685 change_level = level.branch_deeper( 686 x, 687 y, 688 child_relationship_class=child_relationship_class, 689 child_relationship_param=i, 690 child_relationship_param2=j 691 ) 692 self._report_result('iterable_item_moved', change_level) 693 694 item_id = id(x) 695 if parents_ids and item_id in parents_ids: 696 continue 697 parents_ids_added = add_to_frozen_set(parents_ids, item_id) 698 699 # Go one level deeper 700 next_level = level.branch_deeper( 701 x, 702 y, 703 child_relationship_class=child_relationship_class, 704 child_relationship_param=i) 705 self._diff(next_level, parents_ids_added) 706 707 def _diff_str(self, level): 708 """Compare strings""" 709 if self.ignore_string_case: 710 level.t1 = level.t1.lower() 711 level.t2 = level.t2.lower() 712 713 if type(level.t1) == type(level.t2) and level.t1 == level.t2: # NOQA 714 return 715 716 # do we add a diff for convenience? 717 do_diff = True 718 t1_str = level.t1 719 t2_str = level.t2 720 721 if isinstance(level.t1, bytes_type): 722 try: 723 t1_str = level.t1.decode('ascii') 724 except UnicodeDecodeError: 725 do_diff = False 726 727 if isinstance(level.t2, bytes_type): 728 try: 729 t2_str = level.t2.decode('ascii') 730 except UnicodeDecodeError: 731 do_diff = False 732 733 if t1_str == t2_str: 734 return 735 736 if do_diff: 737 if '\n' in t1_str or '\n' in t2_str: 738 diff = difflib.unified_diff( 739 t1_str.splitlines(), t2_str.splitlines(), lineterm='') 740 diff = list(diff) 741 if diff: 742 level.additional['diff'] = '\n'.join(diff) 743 744 self._report_result('values_changed', level) 745 746 def _diff_tuple(self, level, parents_ids): 747 # Checking to see if it has _fields. Which probably means it is a named 748 # tuple. 749 try: 750 level.t1._asdict 751 # It must be a normal tuple 752 except AttributeError: 753 self._diff_iterable(level, parents_ids) 754 # We assume it is a namedtuple then 755 else: 756 self._diff_obj(level, parents_ids, is_namedtuple=True) 757 758 def _add_hash(self, hashes, item_hash, item, i): 759 if item_hash in hashes: 760 hashes[item_hash].indexes.append(i) 761 else: 762 hashes[item_hash] = IndexedHash(indexes=[i], item=item) 763 764 def _create_hashtable(self, level, t): 765 """Create hashtable of {item_hash: (indexes, item)}""" 766 obj = getattr(level, t) 767 768 local_hashes = dict_() 769 for (i, item) in enumerate(obj): 770 try: 771 parent = "{}[{}]".format(level.path(), i) 772 # Note: in the DeepDiff we only calculate the hash of items when we have to. 773 # So self.hashes does not include hashes of all objects in t1 and t2. 774 # It only includes the ones needed when comparing iterables. 775 # The self.hashes dictionary gets shared between different runs of DeepHash 776 # So that any object that is already calculated to have a hash is not re-calculated. 777 deep_hash = DeepHash(item, 778 hashes=self.hashes, 779 parent=parent, 780 apply_hash=True, 781 **self.deephash_parameters, 782 ) 783 item_hash = deep_hash[item] 784 except Exception as e: # pragma: no cover 785 logger.error("Can not produce a hash for %s." 786 "Not counting this object.\n %s" % 787 (level.path(), e)) 788 else: 789 if item_hash is unprocessed: # pragma: no cover 790 logger.warning("Item %s was not processed while hashing " 791 "thus not counting this object." % 792 level.path()) 793 else: 794 self._add_hash(hashes=local_hashes, item_hash=item_hash, item=item, i=i) 795 796 # Also we hash the iterables themselves too so that we can later create cache keys from those hashes. 797 try: 798 DeepHash( 799 obj, 800 hashes=self.hashes, 801 parent=level.path(), 802 apply_hash=True, 803 **self.deephash_parameters, 804 ) 805 except Exception as e: # pragma: no cover 806 logger.error("Can not produce a hash for iterable %s. %s" % 807 (level.path(), e)) 808 return local_hashes 809 810 @staticmethod 811 def _get_distance_cache_key(added_hash, removed_hash): 812 key1, key2 = (added_hash, removed_hash) if added_hash > removed_hash else (removed_hash, added_hash) 813 if isinstance(key1, int): 814 # If the hash function produces integers we convert them to hex values. 815 # This was used when the default hash function was Murmur3 128bit which produces integers. 816 key1 = hex(key1).encode('utf-8') 817 key2 = hex(key2).encode('utf-8') 818 elif isinstance(key1, str): 819 key1 = key1.encode('utf-8') 820 key2 = key2.encode('utf-8') 821 return key1 + b'--' + key2 + b'dc' 822 823 def _get_rough_distance_of_hashed_objs( 824 self, added_hash, removed_hash, added_hash_obj, removed_hash_obj, _original_type=None): 825 # We need the rough distance between the 2 objects to see if they qualify to be pairs or not 826 _distance = cache_key = None 827 if self._stats[DISTANCE_CACHE_ENABLED]: 828 cache_key = self._get_distance_cache_key(added_hash, removed_hash) 829 if cache_key in self._distance_cache: 830 self._stats[DISTANCE_CACHE_HIT_COUNT] += 1 831 _distance = self._distance_cache.get(cache_key) 832 if _distance is None: 833 # We can only cache the rough distance and not the actual diff result for reuse. 834 # The reason is that we have modified the parameters explicitly so they are different and can't 835 # be used for diff reporting 836 diff = DeepDiff( 837 removed_hash_obj.item, added_hash_obj.item, 838 _parameters=self._parameters, 839 _shared_parameters=self._shared_parameters, 840 view=DELTA_VIEW, 841 _original_type=_original_type, 842 iterable_compare_func=self.iterable_compare_func, 843 ) 844 _distance = diff._get_rough_distance() 845 if cache_key and self._stats[DISTANCE_CACHE_ENABLED]: 846 self._distance_cache.set(cache_key, value=_distance) 847 return _distance 848 849 def _get_most_in_common_pairs_in_iterables( 850 self, hashes_added, hashes_removed, t1_hashtable, t2_hashtable, parents_ids, _original_type): 851 """ 852 Get the closest pairs between items that are removed and items that are added. 853 854 returns a dictionary of hashes that are closest to each other. 855 The dictionary is going to be symmetrical so any key will be a value too and otherwise. 856 857 Note that due to the current reporting structure in DeepDiff, we don't compare an item that 858 was added to an item that is in both t1 and t2. 859 860 For example 861 862 [{1, 2}, {4, 5, 6}] 863 [{1, 2}, {1, 2, 3}] 864 865 is only compared between {4, 5, 6} and {1, 2, 3} even though technically {1, 2, 3} is 866 just one item different than {1, 2} 867 868 Perhaps in future we can have a report key that is item duplicated and modified instead of just added. 869 """ 870 cache_key = None 871 if self._stats[DISTANCE_CACHE_ENABLED]: 872 cache_key = combine_hashes_lists(items=[hashes_added, hashes_removed], prefix='pairs_cache') 873 if cache_key in self._distance_cache: 874 return self._distance_cache.get(cache_key).copy() 875 876 # A dictionary of hashes to distances and each distance to an ordered set of hashes. 877 # It tells us about the distance of each object from other objects. 878 # And the objects with the same distances are grouped together in an ordered set. 879 # It also includes a "max" key that is just the value of the biggest current distance in the 880 # most_in_common_pairs dictionary. 881 most_in_common_pairs = defaultdict(lambda: defaultdict(OrderedSetPlus)) 882 pairs = dict_() 883 884 pre_calced_distances = None 885 886 if hashes_added and hashes_removed and np and len(hashes_added) > 1 and len(hashes_removed) > 1: 887 # pre-calculates distances ONLY for 1D arrays whether an _original_type 888 # was explicitly passed or a homogeneous array is detected. 889 # Numpy is needed for this optimization. 890 pre_calced_distances = self._precalculate_numpy_arrays_distance( 891 hashes_added, hashes_removed, t1_hashtable, t2_hashtable, _original_type) 892 893 if hashes_added and hashes_removed and self.iterable_compare_func and len(hashes_added) > 1 and len(hashes_removed) > 1: 894 pre_calced_distances = self._precalculate_distance_by_custom_compare_func( 895 hashes_added, hashes_removed, t1_hashtable, t2_hashtable, _original_type) 896 897 for added_hash in hashes_added: 898 for removed_hash in hashes_removed: 899 added_hash_obj = t2_hashtable[added_hash] 900 removed_hash_obj = t1_hashtable[removed_hash] 901 902 # Loop is detected 903 if id(removed_hash_obj.item) in parents_ids: 904 continue 905 906 _distance = None 907 if pre_calced_distances: 908 _distance = pre_calced_distances.get("{}--{}".format(added_hash, removed_hash)) 909 if _distance is None: 910 _distance = self._get_rough_distance_of_hashed_objs( 911 added_hash, removed_hash, added_hash_obj, removed_hash_obj, _original_type) 912 # Left for future debugging 913 # print(f'{Fore.RED}distance of {added_hash_obj.item} and {removed_hash_obj.item}: {_distance}{Style.RESET_ALL}') 914 # Discard potential pairs that are too far. 915 if _distance >= self.cutoff_distance_for_pairs: 916 continue 917 pairs_of_item = most_in_common_pairs[added_hash] 918 pairs_of_item[_distance].add(removed_hash) 919 used_to_hashes = set() 920 921 distances_to_from_hashes = defaultdict(OrderedSetPlus) 922 for from_hash, distances_to_to_hashes in most_in_common_pairs.items(): 923 # del distances_to_to_hashes['max'] 924 for dist in distances_to_to_hashes: 925 distances_to_from_hashes[dist].add(from_hash) 926 927 for dist in sorted(distances_to_from_hashes.keys()): 928 from_hashes = distances_to_from_hashes[dist] 929 while from_hashes: 930 from_hash = from_hashes.lpop() 931 if from_hash not in used_to_hashes: 932 to_hashes = most_in_common_pairs[from_hash][dist] 933 while to_hashes: 934 to_hash = to_hashes.lpop() 935 if to_hash not in used_to_hashes: 936 used_to_hashes.add(from_hash) 937 used_to_hashes.add(to_hash) 938 # Left for future debugging: 939 # print(f'{bcolors.FAIL}Adding {t2_hashtable[from_hash].item} as a pairs of {t1_hashtable[to_hash].item} with distance of {dist}{bcolors.ENDC}') 940 pairs[from_hash] = to_hash 941 942 inverse_pairs = {v: k for k, v in pairs.items()} 943 pairs.update(inverse_pairs) 944 if cache_key and self._stats[DISTANCE_CACHE_ENABLED]: 945 self._distance_cache.set(cache_key, value=pairs) 946 return pairs.copy() 947 948 def _diff_iterable_with_deephash(self, level, parents_ids, _original_type=None): 949 """Diff of hashable or unhashable iterables. Only used when ignoring the order.""" 950 951 full_t1_hashtable = self._create_hashtable(level, 't1') 952 full_t2_hashtable = self._create_hashtable(level, 't2') 953 t1_hashes = OrderedSetPlus(full_t1_hashtable.keys()) 954 t2_hashes = OrderedSetPlus(full_t2_hashtable.keys()) 955 hashes_added = t2_hashes - t1_hashes 956 hashes_removed = t1_hashes - t2_hashes 957 958 # Deciding whether to calculate pairs or not. 959 if (len(hashes_added) + len(hashes_removed)) / (len(full_t1_hashtable) + len(full_t2_hashtable) + 1) > self.cutoff_intersection_for_pairs: 960 get_pairs = False 961 else: 962 get_pairs = True 963 964 # reduce the size of hashtables 965 if self.report_repetition: 966 t1_hashtable = full_t1_hashtable 967 t2_hashtable = full_t2_hashtable 968 else: 969 t1_hashtable = {k: v for k, v in full_t1_hashtable.items() if k in hashes_removed} 970 t2_hashtable = {k: v for k, v in full_t2_hashtable.items() if k in hashes_added} 971 972 if self._stats[PASSES_COUNT] < self.max_passes and get_pairs: 973 self._stats[PASSES_COUNT] += 1 974 pairs = self._get_most_in_common_pairs_in_iterables( 975 hashes_added, hashes_removed, t1_hashtable, t2_hashtable, parents_ids, _original_type) 976 elif get_pairs: 977 if not self._stats[MAX_PASS_LIMIT_REACHED]: 978 self._stats[MAX_PASS_LIMIT_REACHED] = True 979 logger.warning(MAX_PASSES_REACHED_MSG.format(self.max_passes)) 980 pairs = dict_() 981 else: 982 pairs = dict_() 983 984 def get_other_pair(hash_value, in_t1=True): 985 """ 986 Gets the other paired indexed hash item to the hash_value in the pairs dictionary 987 in_t1: are we looking for the other pair in t1 or t2? 988 """ 989 if in_t1: 990 hashtable = t1_hashtable 991 the_other_hashes = hashes_removed 992 else: 993 hashtable = t2_hashtable 994 the_other_hashes = hashes_added 995 other = pairs.pop(hash_value, notpresent) 996 if other is notpresent: 997 other = notpresent_indexed 998 else: 999 # The pairs are symmetrical. 1000 # removing the other direction of pair 1001 # so it does not get used. 1002 del pairs[other] 1003 the_other_hashes.remove(other) 1004 other = hashtable[other] 1005 return other 1006 1007 if self.report_repetition: 1008 for hash_value in hashes_added: 1009 if self._count_diff() is StopIteration: 1010 return # pragma: no cover. This is already covered for addition (when report_repetition=False). 1011 other = get_other_pair(hash_value) 1012 item_id = id(other.item) 1013 indexes = t2_hashtable[hash_value].indexes if other.item is notpresent else other.indexes 1014 for i in indexes: 1015 change_level = level.branch_deeper( 1016 other.item, 1017 t2_hashtable[hash_value].item, 1018 child_relationship_class=SubscriptableIterableRelationship, 1019 child_relationship_param=i 1020 ) 1021 if other.item is notpresent: 1022 self._report_result('iterable_item_added', change_level) 1023 else: 1024 parents_ids_added = add_to_frozen_set(parents_ids, item_id) 1025 self._diff(change_level, parents_ids_added) 1026 for hash_value in hashes_removed: 1027 if self._count_diff() is StopIteration: 1028 return # pragma: no cover. This is already covered for addition. 1029 other = get_other_pair(hash_value, in_t1=False) 1030 item_id = id(other.item) 1031 for i in t1_hashtable[hash_value].indexes: 1032 change_level = level.branch_deeper( 1033 t1_hashtable[hash_value].item, 1034 other.item, 1035 child_relationship_class=SubscriptableIterableRelationship, 1036 child_relationship_param=i) 1037 if other.item is notpresent: 1038 self._report_result('iterable_item_removed', change_level) 1039 else: 1040 # I was not able to make a test case for the following 2 lines since the cases end up 1041 # getting resolved above in the hashes_added calcs. However I am leaving these 2 lines 1042 # in case things change in future. 1043 parents_ids_added = add_to_frozen_set(parents_ids, item_id) # pragma: no cover. 1044 self._diff(change_level, parents_ids_added) # pragma: no cover. 1045 1046 items_intersect = t2_hashes.intersection(t1_hashes) 1047 1048 for hash_value in items_intersect: 1049 t1_indexes = t1_hashtable[hash_value].indexes 1050 t2_indexes = t2_hashtable[hash_value].indexes 1051 t1_indexes_len = len(t1_indexes) 1052 t2_indexes_len = len(t2_indexes) 1053 if t1_indexes_len != t2_indexes_len: # this is a repetition change! 1054 # create "change" entry, keep current level untouched to handle further changes 1055 repetition_change_level = level.branch_deeper( 1056 t1_hashtable[hash_value].item, 1057 t2_hashtable[hash_value].item, # nb: those are equal! 1058 child_relationship_class=SubscriptableIterableRelationship, 1059 child_relationship_param=t1_hashtable[hash_value] 1060 .indexes[0]) 1061 repetition_change_level.additional['repetition'] = RemapDict( 1062 old_repeat=t1_indexes_len, 1063 new_repeat=t2_indexes_len, 1064 old_indexes=t1_indexes, 1065 new_indexes=t2_indexes) 1066 self._report_result('repetition_change', 1067 repetition_change_level) 1068 1069 else: 1070 for hash_value in hashes_added: 1071 if self._count_diff() is StopIteration: 1072 return 1073 other = get_other_pair(hash_value) 1074 item_id = id(other.item) 1075 index = t2_hashtable[hash_value].indexes[0] if other.item is notpresent else other.indexes[0] 1076 change_level = level.branch_deeper( 1077 other.item, 1078 t2_hashtable[hash_value].item, 1079 child_relationship_class=SubscriptableIterableRelationship, 1080 child_relationship_param=index) 1081 if other.item is notpresent: 1082 self._report_result('iterable_item_added', change_level) 1083 else: 1084 parents_ids_added = add_to_frozen_set(parents_ids, item_id) 1085 self._diff(change_level, parents_ids_added) 1086 1087 for hash_value in hashes_removed: 1088 if self._count_diff() is StopIteration: 1089 return # pragma: no cover. This is already covered for addition. 1090 other = get_other_pair(hash_value, in_t1=False) 1091 item_id = id(other.item) 1092 change_level = level.branch_deeper( 1093 t1_hashtable[hash_value].item, 1094 other.item, 1095 child_relationship_class=SubscriptableIterableRelationship, 1096 child_relationship_param=t1_hashtable[hash_value].indexes[ 1097 0]) 1098 if other.item is notpresent: 1099 self._report_result('iterable_item_removed', change_level) 1100 else: 1101 # Just like the case when report_repetition = True, these lines never run currently. 1102 # However they will stay here in case things change in future. 1103 parents_ids_added = add_to_frozen_set(parents_ids, item_id) # pragma: no cover. 1104 self._diff(change_level, parents_ids_added) # pragma: no cover. 1105 1106 def _diff_booleans(self, level): 1107 if level.t1 != level.t2: 1108 self._report_result('values_changed', level) 1109 1110 def _diff_numbers(self, level): 1111 """Diff Numbers""" 1112 t1_type = "number" if self.ignore_numeric_type_changes else level.t1.__class__.__name__ 1113 t2_type = "number" if self.ignore_numeric_type_changes else level.t2.__class__.__name__ 1114 1115 if self.math_epsilon is not None: 1116 if not is_close(level.t1, level.t2, abs_tol=self.math_epsilon): 1117 self._report_result('values_changed', level) 1118 elif self.significant_digits is None: 1119 if level.t1 != level.t2: 1120 self._report_result('values_changed', level) 1121 else: 1122 # Bernhard10: I use string formatting for comparison, to be consistent with usecases where 1123 # data is read from files that were previousely written from python and 1124 # to be consistent with on-screen representation of numbers. 1125 # Other options would be abs(t1-t2)<10**-self.significant_digits 1126 # or math.is_close (python3.5+) 1127 # Note that abs(3.25-3.251) = 0.0009999999999998899 < 0.001 1128 # Note also that "{:.3f}".format(1.1135) = 1.113, but "{:.3f}".format(1.11351) = 1.114 1129 # For Decimals, format seems to round 2.5 to 2 and 3.5 to 4 (to closest even number) 1130 t1_s = self.number_to_string(level.t1, 1131 significant_digits=self.significant_digits, 1132 number_format_notation=self.number_format_notation) 1133 t2_s = self.number_to_string(level.t2, 1134 significant_digits=self.significant_digits, 1135 number_format_notation=self.number_format_notation) 1136 1137 t1_s = KEY_TO_VAL_STR.format(t1_type, t1_s) 1138 t2_s = KEY_TO_VAL_STR.format(t2_type, t2_s) 1139 if t1_s != t2_s: 1140 self._report_result('values_changed', level) 1141 1142 def _diff_datetimes(self, level): 1143 """Diff DateTimes""" 1144 if self.truncate_datetime: 1145 level.t1 = datetime_normalize(self.truncate_datetime, level.t1) 1146 level.t2 = datetime_normalize(self.truncate_datetime, level.t2) 1147 1148 if level.t1 != level.t2: 1149 self._report_result('values_changed', level) 1150 1151 def _diff_numpy_array(self, level, parents_ids=frozenset()): 1152 """Diff numpy arrays""" 1153 if level.path() not in self._numpy_paths: 1154 self._numpy_paths[level.path()] = get_type(level.t2).__name__ 1155 if np is None: 1156 # This line should never be run. If it is ever called means the type check detected a numpy array 1157 # which means numpy module needs to be available. So np can't be None. 1158 raise ImportError(CANT_FIND_NUMPY_MSG) # pragma: no cover 1159 1160 if not self.ignore_order_func(level): 1161 # fast checks 1162 if self.significant_digits is None: 1163 if np.array_equal(level.t1, level.t2): 1164 return # all good 1165 else: 1166 try: 1167 np.testing.assert_almost_equal(level.t1, level.t2, decimal=self.significant_digits) 1168 return # all good 1169 except AssertionError: 1170 pass # do detailed checking below 1171 1172 # compare array meta-data 1173 _original_type = level.t1.dtype 1174 if level.t1.shape != level.t2.shape: 1175 # arrays are converted to python lists so that certain features of DeepDiff can apply on them easier. 1176 # They will be converted back to Numpy at their final dimension. 1177 level.t1 = level.t1.tolist() 1178 level.t2 = level.t2.tolist() 1179 self._diff_iterable(level, parents_ids, _original_type=_original_type) 1180 else: 1181 # metadata same -- the difference is in the content 1182 shape = level.t1.shape 1183 dimensions = len(shape) 1184 if dimensions == 1: 1185 self._diff_iterable(level, parents_ids, _original_type=_original_type) 1186 elif self.ignore_order_func(level): 1187 # arrays are converted to python lists so that certain features of DeepDiff can apply on them easier. 1188 # They will be converted back to Numpy at their final dimension. 1189 level.t1 = level.t1.tolist() 1190 level.t2 = level.t2.tolist() 1191 self._diff_iterable_with_deephash(level, parents_ids, _original_type=_original_type) 1192 else: 1193 for (t1_path, t1_row), (t2_path, t2_row) in zip( 1194 get_numpy_ndarray_rows(level.t1, shape), 1195 get_numpy_ndarray_rows(level.t2, shape)): 1196 1197 new_level = level.branch_deeper( 1198 t1_row, 1199 t2_row, 1200 child_relationship_class=NumpyArrayRelationship, 1201 child_relationship_param=t1_path) 1202 1203 self._diff_iterable_in_order(new_level, parents_ids, _original_type=_original_type) 1204 1205 def _diff_types(self, level): 1206 """Diff types""" 1207 level.report_type = 'type_changes' 1208 self._report_result('type_changes', level) 1209 1210 def _count_diff(self): 1211 if (self.max_diffs is not None and self._stats[DIFF_COUNT] > self.max_diffs): 1212 if not self._stats[MAX_DIFF_LIMIT_REACHED]: 1213 self._stats[MAX_DIFF_LIMIT_REACHED] = True 1214 logger.warning(MAX_DIFFS_REACHED_MSG.format(self.max_diffs)) 1215 return StopIteration 1216 self._stats[DIFF_COUNT] += 1 1217 if self.cache_size and self.cache_tuning_sample_size: 1218 self._auto_tune_cache() 1219 1220 def _auto_tune_cache(self): 1221 take_sample = (self._stats[DIFF_COUNT] % self.cache_tuning_sample_size == 0) 1222 if self.cache_tuning_sample_size: 1223 if self._stats[DISTANCE_CACHE_ENABLED]: 1224 if take_sample: 1225 self._auto_off_cache() 1226 # Turn on the cache once in a while 1227 elif self._stats[DIFF_COUNT] % self._shared_parameters[_ENABLE_CACHE_EVERY_X_DIFF] == 0: 1228 self.progress_logger('Re-enabling the distance and level caches.') 1229 # decreasing the sampling frequency 1230 self._shared_parameters[_ENABLE_CACHE_EVERY_X_DIFF] *= 10 1231 self._stats[DISTANCE_CACHE_ENABLED] = True 1232 if take_sample: 1233 for key in (PREVIOUS_DIFF_COUNT, PREVIOUS_DISTANCE_CACHE_HIT_COUNT): 1234 self._stats[key] = self._stats[key[9:]] 1235 1236 def _auto_off_cache(self): 1237 """ 1238 Auto adjust the cache based on the usage 1239 """ 1240 if self._stats[DISTANCE_CACHE_ENABLED]: 1241 angle = (self._stats[DISTANCE_CACHE_HIT_COUNT] - self._stats['PREVIOUS {}'.format(DISTANCE_CACHE_HIT_COUNT)]) / (self._stats[DIFF_COUNT] - self._stats[PREVIOUS_DIFF_COUNT]) 1242 if angle < self.CACHE_AUTO_ADJUST_THRESHOLD: 1243 self._stats[DISTANCE_CACHE_ENABLED] = False 1244 self.progress_logger('Due to minimal cache hits, {} is disabled.'.format('distance cache')) 1245 1246 def _use_custom_operator(self, level): 1247 """ 1248 For each level we check all custom operators. 1249 If any one of them was a match for the level, we run the diff of the operator. 1250 If the operator returned True, the operator must have decided these objects should not 1251 be compared anymore. It might have already reported their results. 1252 In that case the report will appear in the final results of this diff. 1253 Otherwise basically the 2 objects in the level are being omitted from the results. 1254 """ 1255 1256 # used = False 1257 1258 # for operator in self.custom_operators: 1259 # if operator.match(level): 1260 # prevent_default = operator.diff(level, self) 1261 # used = True if prevent_default is None else prevent_default 1262 1263 # return used 1264 1265 for operator in self.custom_operators: 1266 if operator.match(level): 1267 prevent_default = operator.give_up_diffing(level=level, diff_instance=self) 1268 if prevent_default: 1269 return True 1270 1271 return False 1272 1273 def _diff(self, level, parents_ids=frozenset(), _original_type=None): 1274 """ 1275 The main diff method 1276 1277 **parameters** 1278 1279 level: the tree level or tree node 1280 parents_ids: the ids of all the parent objects in the tree from the current node. 1281 _original_type: If the objects had an original type that was different than what currently exists in the level.t1 and t2 1282 """ 1283 if self._count_diff() is StopIteration: 1284 return 1285 1286 if self._use_custom_operator(level): 1287 return 1288 1289 if level.t1 is level.t2: 1290 return 1291 1292 if self._skip_this(level): 1293 return 1294 1295 if get_type(level.t1) != get_type(level.t2): 1296 report_type_change = True 1297 for type_group in self.ignore_type_in_groups: 1298 if self.type_check_func(level.t1, type_group) and self.type_check_func(level.t2, type_group): 1299 report_type_change = False 1300 break 1301 if report_type_change: 1302 self._diff_types(level) 1303 return 1304 # This is an edge case where t1=None or t2=None and None is in the ignore type group. 1305 if level.t1 is None or level.t2 is None: 1306 self._report_result('values_changed', level) 1307 return 1308 1309 if self.ignore_nan_inequality and isinstance(level.t1, float) and str(level.t1) == str(level.t2) == 'nan': 1310 return 1311 1312 if isinstance(level.t1, booleans): 1313 self._diff_booleans(level) 1314 1315 if isinstance(level.t1, strings): 1316 self._diff_str(level) 1317 1318 elif isinstance(level.t1, times): 1319 self._diff_datetimes(level) 1320 1321 elif isinstance(level.t1, numbers): 1322 self._diff_numbers(level) 1323 1324 elif isinstance(level.t1, Mapping): 1325 self._diff_dict(level, parents_ids) 1326 1327 elif isinstance(level.t1, tuple): 1328 self._diff_tuple(level, parents_ids) 1329 1330 elif isinstance(level.t1, (set, frozenset, OrderedSet)): 1331 self._diff_set(level) 1332 1333 elif isinstance(level.t1, np_ndarray): 1334 self._diff_numpy_array(level, parents_ids) 1335 1336 elif isinstance(level.t1, Iterable): 1337 self._diff_iterable(level, parents_ids, _original_type=_original_type) 1338 1339 else: 1340 self._diff_obj(level, parents_ids) 1341 1342 def _get_view_results(self, view): 1343 """ 1344 Get the results based on the view 1345 """ 1346 result = self.tree 1347 if not self.report_repetition: # and self.is_root: 1348 result.mutual_add_removes_to_become_value_changes() 1349 if view == TREE_VIEW: 1350 pass 1351 elif view == TEXT_VIEW: 1352 result = TextResult(tree_results=self.tree, verbose_level=self.verbose_level) 1353 result.remove_empty_keys() 1354 elif view == DELTA_VIEW: 1355 result = self._to_delta_dict(report_repetition_required=False) 1356 else: 1357 raise ValueError(INVALID_VIEW_MSG.format(view)) 1358 return result 1359 1360 def _group_iterable_to_dict(self, item, group_by, item_name): 1361 """ 1362 Convert a list of dictionaries into a dictionary of dictionaries 1363 where the key is the value of the group_by key in each dictionary. 1364 """ 1365 if isinstance(item, Iterable) and not isinstance(item, Mapping): 1366 result = {} 1367 item_copy = deepcopy(item) 1368 for row in item_copy: 1369 if isinstance(row, Mapping): 1370 try: 1371 key = row.pop(group_by) 1372 except KeyError: 1373 logger.error("Unable to group {} by {}. The key is missing in {}".format(item_name, group_by, row)) 1374 raise 1375 result[key] = row 1376 else: 1377 msg = "Unable to group {} by {} since the item {} is not a dictionary.".format(item_name, group_by, row) 1378 logger.error(msg) 1379 raise ValueError(msg) 1380 return result 1381 msg = "Unable to group {} by {}".format(item_name, group_by) 1382 logger.error(msg) 1383 raise ValueError(msg) 1384 1385 def get_stats(self): 1386 """ 1387 Get some stats on internals of the DeepDiff run. 1388 """ 1389 return self._stats 1390 1391 1392if __name__ == "__main__": # pragma: no cover 1393 import doctest 1394 doctest.testmod() 1395