#!/usr/bin/env python
import logging
from collections.abc import Iterable, MutableMapping
from collections import defaultdict
from hashlib import sha1, sha256
from enum import Enum
from deepdiff.helper import (strings, numbers, times, unprocessed, not_hashed, add_to_frozen_set,
                             convert_item_or_items_into_set_else_none, get_doc,
                             convert_item_or_items_into_compiled_regexes_else_none,
                             get_id, type_is_subclass_of_type_group, type_in_type_group,
                             number_to_string, datetime_normalize, KEY_TO_VAL_STR, short_repr,
                             get_truncate_datetime, dict_)
from deepdiff.base import Base
logger = logging.getLogger(__name__)

# Sentinel key under which the list of objects that could not be hashed is stored.
UNPROCESSED_KEY = object()

EMPTY_FROZENSET = frozenset()

# Indexed by a bool: False -> item access ("[key]"), True -> attribute access (".key").
INDEX_VS_ATTRIBUTE = ('[%s]', '.%s')


HASH_LOOKUP_ERR_MSG = '{} is not one of the hashed items.'


def sha256hex(obj):
    """Return the hexadecimal SHA-256 digest of obj.

    str input is encoded as UTF-8 first; anything else is handed to
    hashlib.sha256 unchanged.
    """
    if isinstance(obj, str):
        obj = obj.encode('utf-8')
    return sha256(obj).hexdigest()


def sha1hex(obj):
    """Return the hexadecimal SHA-1 digest of obj.

    str input is encoded as UTF-8 first; anything else is handed to
    hashlib.sha1 unchanged.
    """
    if isinstance(obj, str):
        obj = obj.encode('utf-8')
    return sha1(obj).hexdigest()


default_hasher = sha256hex


def combine_hashes_lists(items, prefix):
    """
    Combines lists of hashes into one hash
    This can be optimized in future.
    It needs to work with both murmur3 hashes (int) and sha256 (str)
    Although murmur3 is not used anymore.

    items is an iterable of collections of hashes and prefix is a str (or
    UTF-8 encoded bytes) that is prepended to the combined hash. The combined
    hash is always computed with default_hasher.
    """
    if isinstance(prefix, bytes):
        prefix = prefix.decode('utf-8')
    # The hashes inside each item are sorted so that their order does not affect
    # the result. The order of the items themselves still matters.
    hashes_bytes = b''.join(
        (''.join(map(str, sorted(item))) + '--').encode('utf-8') for item in items)
    return prefix + str(default_hasher(hashes_bytes))


class BoolObj(Enum):
    """Stand-ins for True and False when they are used as keys of the hashes mapping."""
    TRUE = 1
    FALSE = 0


def prepare_string_for_hashing(obj, ignore_string_type_changes=False, ignore_string_case=False):
    """
    Clean type conversions

    bytes are decoded as UTF-8 (bytes that are not valid UTF-8 raise
    UnicodeDecodeError). Unless ignore_string_type_changes is set, the name of
    the original class is combined with the text through KEY_TO_VAL_STR so that
    str and bytes with the same content give different results. When
    ignore_string_case is set the final text is lower-cased.
    """
    original_type = obj.__class__.__name__
    if isinstance(obj, bytes):
        obj = obj.decode('utf-8')
    if not ignore_string_type_changes:
        obj = KEY_TO_VAL_STR.format(original_type, obj)
    if ignore_string_case:
        obj = obj.lower()
    return obj


doc = get_doc('deephash_doc.rst')


class DeepHash(Base):
    __doc__ = doc

    def __init__(self,
                 obj,
                 *,
                 hashes=None,
                 exclude_types=None,
                 exclude_paths=None,
                 exclude_regex_paths=None,
                 hasher=None,
                 ignore_repetition=True,
                 significant_digits=None,
                 truncate_datetime=None,
                 number_format_notation="f",
                 apply_hash=True,
                 ignore_type_in_groups=None,
                 ignore_string_type_changes=False,
                 ignore_numeric_type_changes=False,
                 ignore_type_subclasses=False,
                 ignore_string_case=False,
                 exclude_obj_callback=None,
                 number_to_string_func=None,
                 ignore_private_variables=True,
                 parent="root",
                 **kwargs):
        """
        Hash obj and everything reachable from it; results are kept in self.hashes.

        Parameters
        ----------
        obj : the object to hash.
        hashes : a MutableMapping that is used (and mutated) directly, or another
            DeepHash whose hashes mapping is shared. Anything else starts a new mapping.
        exclude_types : iterable of types; instances of them are skipped.
        exclude_paths : path or paths to skip.
        exclude_regex_paths : regex or regexes; a path that any of them finds is skipped.
        hasher : callable applied to the prepared string of each object.
            Defaults to sha256hex.
        ignore_repetition : when True, repeated items of an iterable are counted once.
            Otherwise the number of repetitions is part of the hash.
        significant_digits, number_format_notation : when significant_digits resolves to
            a value other than None, numbers are converted with number_to_string using
            these settings.
        truncate_datetime : passed through get_truncate_datetime and then to
            datetime_normalize for time objects.
        apply_hash : when False, the prepared strings are stored instead of their hashes.
        ignore_type_in_groups, ignore_string_type_changes, ignore_numeric_type_changes,
        ignore_type_subclasses : forwarded to Base.get_ignore_types_in_groups. The string
            and numeric flags are also used directly when strings and numbers are prepared.
        ignore_string_case : lower-case strings before hashing.
        exclude_obj_callback : callable(obj, path); a truthy return skips the object.
        number_to_string_func : replacement for the default number_to_string helper.
        ignore_private_variables : skip string keys that start with a double underscore.
        parent : the path used for obj itself.

        Raises
        ------
        ValueError : when unknown keyword arguments are passed.
        """
        if kwargs:
            raise ValueError(
                ("The following parameter(s) are not valid: %s\n"
                 "The valid parameters are obj, hashes, exclude_types, significant_digits, truncate_datetime, "
                 "exclude_paths, exclude_regex_paths, hasher, ignore_repetition, "
                 "number_format_notation, apply_hash, ignore_type_in_groups, ignore_string_type_changes, "
                 "ignore_numeric_type_changes, ignore_type_subclasses, ignore_string_case, "
                 "exclude_obj_callback, number_to_string_func, ignore_private_variables, parent") % ', '.join(kwargs.keys()))
        if isinstance(hashes, MutableMapping):
            self.hashes = hashes
        elif isinstance(hashes, DeepHash):
            self.hashes = hashes.hashes
        else:
            self.hashes = dict_()
        exclude_types = set() if exclude_types is None else set(exclude_types)
        self.exclude_types_tuple = tuple(exclude_types)  # we need tuple for checking isinstance
        self.ignore_repetition = ignore_repetition
        self.exclude_paths = convert_item_or_items_into_set_else_none(exclude_paths)
        self.exclude_regex_paths = convert_item_or_items_into_compiled_regexes_else_none(exclude_regex_paths)
        self.hasher = default_hasher if hasher is None else hasher
        # Note: this resets the unprocessed list even when a hashes mapping was passed in.
        self.hashes[UNPROCESSED_KEY] = []

        self.significant_digits = self.get_significant_digits(significant_digits, ignore_numeric_type_changes)
        self.truncate_datetime = get_truncate_datetime(truncate_datetime)
        self.number_format_notation = number_format_notation
        self.ignore_type_in_groups = self.get_ignore_types_in_groups(
            ignore_type_in_groups=ignore_type_in_groups,
            ignore_string_type_changes=ignore_string_type_changes,
            ignore_numeric_type_changes=ignore_numeric_type_changes,
            ignore_type_subclasses=ignore_type_subclasses)
        self.ignore_string_type_changes = ignore_string_type_changes
        self.ignore_numeric_type_changes = ignore_numeric_type_changes
        self.ignore_string_case = ignore_string_case
        self.exclude_obj_callback = exclude_obj_callback
        # makes the hash return constant size result if true
        # the only time it should be set to False is when
        # testing the individual hash functions for different types of objects.
        self.apply_hash = apply_hash
        self.type_check_func = type_is_subclass_of_type_group if ignore_type_subclasses else type_in_type_group
        self.number_to_string = number_to_string_func or number_to_string
        self.ignore_private_variables = ignore_private_variables

        self._hash(obj, parent=parent, parents_ids=frozenset({get_id(obj)}))

        # Keep the unprocessed list only when there is something to report.
        if self.hashes[UNPROCESSED_KEY]:
            logger.warning("Can not hash the following items: {}.".format(self.hashes[UNPROCESSED_KEY]))
        else:
            del self.hashes[UNPROCESSED_KEY]

    # Expose the module-level hashers as class attributes.
    sha256hex = sha256hex
    sha1hex = sha1hex

    def __getitem__(self, obj, extract_index=0):
        """Return the hash of obj. Raises KeyError if obj was not hashed."""
        return self._getitem(self.hashes, obj, extract_index=extract_index)

    @staticmethod
    def _getitem(hashes, obj, extract_index=0):
        """
        extract_index is zero for hash and 1 for count and None to get them both.
        To keep it backward compatible, we only get the hash by default so it is set to zero by default.

        The lookup is tried with obj itself (True/False are mapped to BoolObj first)
        and then with get_id(obj), which is how unhashable objects are stored.
        Raises KeyError when neither key is present.
        """

        key = obj
        if obj is True:
            key = BoolObj.TRUE
        elif obj is False:
            key = BoolObj.FALSE

        try:
            result_n_count = hashes[key]
        except (TypeError, KeyError):
            # TypeError: obj is unhashable. KeyError: obj is not stored under its own value.
            key = get_id(obj)
            try:
                result_n_count = hashes[key]
            except KeyError:
                raise KeyError(HASH_LOOKUP_ERR_MSG.format(obj)) from None

        # The unprocessed entry is a plain list, not a (hash, count) pair.
        if obj is UNPROCESSED_KEY:
            extract_index = None

        return result_n_count if extract_index is None else result_n_count[extract_index]

    def __contains__(self, obj):
        """Return True when obj (or its id, for unhashable objects) has a stored hash."""
        # Use the same bool -> BoolObj mapping as _getitem so that
        # `obj in deep_hash` and `deep_hash[obj]` agree for True and False.
        key = obj
        if obj is True:
            key = BoolObj.TRUE
        elif obj is False:
            key = BoolObj.FALSE
        try:
            result = key in self.hashes
        except (TypeError, KeyError):
            result = False
        if not result:
            result = get_id(obj) in self.hashes
        return result

    def get(self, key, default=None, extract_index=0):
        """
        Get method for the hashes dictionary.
        It can extract the hash for a given key that is already calculated when extract_index=0
        or the count of items that went to building the object when extract_index=1.
        Returns default when the key was not hashed.
        """
        return self.get_key(self.hashes, key, default=default, extract_index=extract_index)

    @staticmethod
    def get_key(hashes, key, default=None, extract_index=0):
        """
        get_key method for the hashes dictionary.
        It can extract the hash for a given key that is already calculated when extract_index=0
        or the count of items that went to building the object when extract_index=1.
        Returns default when the key was not hashed.
        """
        try:
            result = DeepHash._getitem(hashes, key, extract_index=extract_index)
        except KeyError:
            result = default
        return result

    def _get_objects_to_hashes_dict(self, extract_index=0):
        """
        A dictionary containing only the objects to hashes,
        or a dictionary of objects to the count of items that went to build them.
        extract_index=0 for hashes and extract_index=1 for counts.
        The unprocessed entry, if present, is copied over as is.
        """
        result = dict_()
        for key, value in self.hashes.items():
            if key is UNPROCESSED_KEY:
                result[key] = value
            else:
                result[key] = value[extract_index]
        return result

    def __eq__(self, other):
        """Compare with another DeepHash (hashes and counts) or with a plain mapping of hashes."""
        if isinstance(other, DeepHash):
            return self.hashes == other.hashes
        else:
            # We only care about the hashes
            return self._get_objects_to_hashes_dict() == other

    # NOTE(review): __req__ is not a special method name that Python looks up;
    # it is kept only in case something references it by name.
    __req__ = __eq__

    def __repr__(self):
        """
        Hide the counts since it will be confusing to see them when they are hidden everywhere else.
        """
        return short_repr(self._get_objects_to_hashes_dict(extract_index=0), max_length=500)

    __str__ = __repr__

    def __bool__(self):
        return bool(self.hashes)

    def keys(self):
        return self.hashes.keys()

    # NOTE(review): unlike _get_objects_to_hashes_dict, values() and items() do not
    # special-case UNPROCESSED_KEY, so for that entry they yield the first
    # unprocessed object rather than the whole list.
    def values(self):
        return (i[0] for i in self.hashes.values())  # Just grab the item and not its count

    def items(self):
        return ((i, v[0]) for i, v in self.hashes.items())

    def _prep_obj(self, obj, parent, parents_ids=EMPTY_FROZENSET, is_namedtuple=False):
        """
        Prepare a generic object by hashing its attributes as a dictionary.

        The attributes come from _asdict() for named tuples, otherwise from
        __dict__, falling back to __slots__. An object with none of these is
        recorded as unprocessed and (unprocessed, 0) is returned.
        Returns (prepared string, counts).
        """
        original_type = type(obj) if not isinstance(obj, type) else obj
        try:
            if is_namedtuple:
                obj = obj._asdict()
            else:
                obj = obj.__dict__
        except AttributeError:
            try:
                obj = {i: getattr(obj, i) for i in obj.__slots__}
            except AttributeError:
                self.hashes[UNPROCESSED_KEY].append(obj)
                return (unprocessed, 0)

        result, counts = self._prep_dict(obj, parent=parent, parents_ids=parents_ids,
                                         print_as_attribute=True, original_type=original_type)
        result = "nt{}".format(result) if is_namedtuple else "obj{}".format(result)
        return result, counts

    def _skip_this(self, obj, parent):
        """Return True when obj at path parent is excluded by path, regex, type or callback."""
        if self.exclude_paths and parent in self.exclude_paths:
            return True
        if self.exclude_regex_paths and any(
                exclude_regex_path.search(parent) for exclude_regex_path in self.exclude_regex_paths):
            return True
        if self.exclude_types_tuple and isinstance(obj, self.exclude_types_tuple):
            return True
        if self.exclude_obj_callback and self.exclude_obj_callback(obj, parent):
            return True
        return False

    def _prep_dict(self, obj, parent, parents_ids=EMPTY_FROZENSET, print_as_attribute=False, original_type=None):
        """
        Prepare a mapping: hash every key and value and join the sorted pairs.

        print_as_attribute is True when obj holds the attributes of an object;
        paths are then written as "parent.key" instead of "parent['key']" and the
        result is labelled with the type name (or with the names of the matching
        ignore_type_in_groups group) instead of 'dict'.
        Returns (prepared string, counts).
        """

        result = []
        counts = 1

        key_text = "%s{}".format(INDEX_VS_ATTRIBUTE[print_as_attribute])
        for key, item in obj.items():
            # Every key is counted, including the ones that are skipped below.
            counts += 1
            # ignore private variables
            if self.ignore_private_variables and isinstance(key, str) and key.startswith('__'):
                continue
            key_formatted = "'%s'" % key if not print_as_attribute and isinstance(key, strings) else key
            key_in_report = key_text % (parent, key_formatted)

            key_hash, _ = self._hash(key, parent=key_in_report, parents_ids=parents_ids)
            if not key_hash:
                continue
            item_id = get_id(item)
            # Skip values that are already on the current path (cycles) or are excluded.
            if (parents_ids and item_id in parents_ids) or self._skip_this(item, parent=key_in_report):
                continue
            parents_ids_added = add_to_frozen_set(parents_ids, item_id)
            hashed, count = self._hash(item, parent=key_in_report, parents_ids=parents_ids_added)
            hashed = KEY_TO_VAL_STR.format(key_hash, hashed)
            result.append(hashed)
            counts += count

        # Sorting makes the result independent of the order of the keys.
        result.sort()
        result = ';'.join(result)
        if print_as_attribute:
            type_ = original_type or type(obj)
            type_str = type_.__name__
            for type_group in self.ignore_type_in_groups:
                if self.type_check_func(type_, type_group):
                    type_str = ','.join(map(lambda x: x.__name__, type_group))
                    break
        else:
            type_str = 'dict'
        return "{}:{{{}}}".format(type_str, result), counts

    def _prep_iterable(self, obj, parent, parents_ids=EMPTY_FROZENSET):
        """
        Prepare an iterable: hash every item and join the sorted item hashes.

        The order of the items does not affect the result. When
        ignore_repetition is False, each hash is suffixed with the number of
        times it occurred. Returns (prepared string, counts).
        """

        counts = 1
        result = defaultdict(int)

        for i, item in enumerate(obj):
            new_parent = "{}[{}]".format(parent, i)
            if self._skip_this(item, parent=new_parent):
                continue

            item_id = get_id(item)
            # Skip items that are already on the current path (cycles).
            if parents_ids and item_id in parents_ids:
                continue

            parents_ids_added = add_to_frozen_set(parents_ids, item_id)
            hashed, count = self._hash(item, parent=new_parent, parents_ids=parents_ids_added)
            # counting repetitions
            result[hashed] += 1
            counts += count

        if self.ignore_repetition:
            result = list(result.keys())
        else:
            result = [
                '{}|{}'.format(i, v) for i, v in result.items()
            ]

        result = sorted(map(str, result))  # making sure the result items are string and sorted so join command works.
        result = ','.join(result)
        result = KEY_TO_VAL_STR.format(type(obj).__name__, result)

        return result, counts

    def _prep_bool(self, obj):
        """Map a truthy or falsy value to BoolObj.TRUE or BoolObj.FALSE."""
        return BoolObj.TRUE if obj else BoolObj.FALSE

    def _prep_number(self, obj):
        """
        Prepare a number, labelled with its type name or with "number" when
        ignore_numeric_type_changes is set. The value is converted with
        self.number_to_string when significant_digits is not None.
        """
        type_ = "number" if self.ignore_numeric_type_changes else obj.__class__.__name__
        if self.significant_digits is not None:
            obj = self.number_to_string(obj, significant_digits=self.significant_digits,
                                        number_format_notation=self.number_format_notation)
        return KEY_TO_VAL_STR.format(type_, obj)

    def _prep_datetime(self, obj):
        """Prepare a time object, normalized by datetime_normalize and labelled 'datetime'."""
        type_ = 'datetime'
        obj = datetime_normalize(self.truncate_datetime, obj)
        return KEY_TO_VAL_STR.format(type_, obj)

    def _prep_tuple(self, obj, parent, parents_ids):
        """Prepare a tuple: as an object when it is a named tuple, otherwise as an iterable."""
        # Checking to see if it has _asdict. Which probably means it is a named
        # tuple.
        try:
            obj._asdict
        # It must be a normal tuple
        except AttributeError:
            result, counts = self._prep_iterable(obj=obj, parent=parent, parents_ids=parents_ids)
        # We assume it is a namedtuple then
        else:
            result, counts = self._prep_obj(obj, parent, parents_ids=parents_ids, is_namedtuple=True)
        return result, counts

    def _hash(self, obj, parent, parents_ids=EMPTY_FROZENSET):
        """
        The main hash method.

        Dispatches on the type of obj to build a string representation of it,
        hashes that string with self.hasher when apply_hash is set, stores
        (result, counts) in self.hashes and returns it.

        parent is the path of obj and parents_ids holds the ids of the objects
        on the current path. Excluded objects return (None, 0) and are not stored.
        """
        counts = 1

        if isinstance(obj, bool):
            # Booleans are replaced by BoolObj members and are not looked up in the cache.
            obj = self._prep_bool(obj)
            result = None
        else:
            result = not_hashed
            try:
                result, counts = self.hashes[obj]
            except (TypeError, KeyError):
                # Unhashable or not hashed yet: compute it below.
                pass
            else:
                return result, counts

        if self._skip_this(obj, parent):
            return None, 0

        elif obj is None:
            result = 'NONE'

        elif isinstance(obj, strings):
            result = prepare_string_for_hashing(
                obj, ignore_string_type_changes=self.ignore_string_type_changes,
                ignore_string_case=self.ignore_string_case)

        elif isinstance(obj, times):
            result = self._prep_datetime(obj)

        elif isinstance(obj, numbers):
            result = self._prep_number(obj)

        elif isinstance(obj, MutableMapping):
            result, counts = self._prep_dict(obj=obj, parent=parent, parents_ids=parents_ids)

        elif isinstance(obj, tuple):
            result, counts = self._prep_tuple(obj=obj, parent=parent, parents_ids=parents_ids)

        elif isinstance(obj, Iterable):
            result, counts = self._prep_iterable(obj=obj, parent=parent, parents_ids=parents_ids)

        elif obj == BoolObj.TRUE or obj == BoolObj.FALSE:
            result = 'bool:true' if obj is BoolObj.TRUE else 'bool:false'
        else:
            result, counts = self._prep_obj(obj=obj, parent=parent, parents_ids=parents_ids)

        if result is not_hashed:  # pragma: no cover
            self.hashes[UNPROCESSED_KEY].append(obj)

        elif result is unprocessed:
            # _prep_obj has already recorded obj in the unprocessed list.
            pass

        elif self.apply_hash:
            if isinstance(obj, strings):
                # Strings were already cleaned by prepare_string_for_hashing above.
                result_cleaned = result
            else:
                result_cleaned = prepare_string_for_hashing(
                    result, ignore_string_type_changes=self.ignore_string_type_changes,
                    ignore_string_case=self.ignore_string_case)
            result = self.hasher(result_cleaned)

        # It is important to keep the hash of all objects.
        # The hashes will be later used for comparing the objects.
        # Object to hash when possible otherwise ObjectID to hash
        try:
            self.hashes[obj] = (result, counts)
        except TypeError:
            obj_id = get_id(obj)
            self.hashes[obj_id] = (result, counts)

        return result, counts


if __name__ == "__main__":  # pragma: no cover
    import doctest
    doctest.testmod()