1#!/usr/bin/env python
2import logging
3from collections.abc import Iterable, MutableMapping
4from collections import defaultdict
5from hashlib import sha1, sha256
6from enum import Enum
7from deepdiff.helper import (strings, numbers, times, unprocessed, not_hashed, add_to_frozen_set,
8                             convert_item_or_items_into_set_else_none, get_doc,
9                             convert_item_or_items_into_compiled_regexes_else_none,
10                             get_id, type_is_subclass_of_type_group, type_in_type_group,
11                             number_to_string, datetime_normalize, KEY_TO_VAL_STR, short_repr,
12                             get_truncate_datetime, dict_)
13from deepdiff.base import Base
# Module-level logger; DeepHash.__init__ uses it to warn about items it could not hash.
logger = logging.getLogger(__name__)

# Sentinel key of a hashes mapping under which the list of unhashable items is collected.
UNPROCESSED_KEY = object()

# Shared immutable default for the `parents_ids` arguments of the DeepHash helpers.
EMPTY_FROZENSET = frozenset()

# Path-suffix templates indexed by the `print_as_attribute` flag in DeepHash._prep_dict:
# index 0 (False) -> item access '[%s]', index 1 (True) -> attribute access '.%s'.
INDEX_VS_ATTRIBUTE = ('[%s]', '.%s')


# Message template of the KeyError raised by DeepHash._getitem for objects that were not hashed.
HASH_LOOKUP_ERR_MSG = '{} is not one of the hashed items.'
24
25
def sha256hex(obj):
    """
    Return the hexadecimal SHA-256 digest of obj.

    A str is encoded as UTF-8 first; anything else is handed to sha256 as is.
    """
    payload = obj.encode('utf-8') if isinstance(obj, str) else obj
    digest = sha256(payload)
    return digest.hexdigest()
31
32
def sha1hex(obj):
    """
    Return the hexadecimal SHA-1 digest of obj.

    A str is encoded as UTF-8 first; anything else is handed to sha1 as is.
    """
    payload = obj.encode('utf-8') if isinstance(obj, str) else obj
    digest = sha1(payload)
    return digest.hexdigest()
38
39
# Hash function used by combine_hashes_lists and by DeepHash when no `hasher` is passed.
default_hasher = sha256hex
41
42
def combine_hashes_lists(items, prefix):
    """
    Combine lists of hashes into one hash.

    It needs to work with both murmur3 hashes (int) and sha256 (str),
    although murmur3 is not used anymore; every hash is therefore passed
    through str() before being joined.

    Parameters:
        items: an iterable of collections of hashes. The hashes inside each
            collection are sorted first so that their order does not affect
            the result. The order of the collections themselves does matter.
        prefix: str or bytes (bytes are decoded as UTF-8) that is prepended
            to the combined hash.

    Returns:
        prefix + str(default_hasher(<joined bytes>)).
    """
    if isinstance(prefix, bytes):
        prefix = prefix.decode('utf-8')
    # Build the payload with one join instead of repeated `bytes +=`, which
    # copies the whole accumulated buffer on every iteration.
    hashes_bytes = b''.join(
        (''.join(map(str, sorted(item))) + '--').encode('utf-8')
        for item in items
    )
    return prefix + str(default_hasher(hashes_bytes))
58
59
class BoolObj(Enum):
    """
    Stand-ins for the booleans True and False.

    DeepHash._prep_bool converts a bool into one of these members before it is
    used as a key of the hashes mapping, and DeepHash._getitem performs the same
    translation on lookup.
    NOTE(review): presumably this exists because True == 1 and False == 0 would
    otherwise share dictionary slots with the integers 1 and 0 -- confirm.
    """
    TRUE = 1
    FALSE = 0
63
64
def prepare_string_for_hashing(obj, ignore_string_type_changes=False, ignore_string_case=False,
                               encodings=None, ignore_encoding_errors=False):
    """
    Normalize a str or bytes object into the text that gets hashed.

    Parameters:
        obj: the str or bytes object to normalize.
        ignore_string_type_changes: when False the result is formatted with
            KEY_TO_VAL_STR using the name of the original type, so that str and
            bytes with the same content differ. When True only the text is returned.
        ignore_string_case: when True the final text is lower-cased.
        encodings: encoding name, or iterable of encoding names, tried in order
            to decode bytes. Defaults to UTF-8 only, which is the historical behavior.
        ignore_encoding_errors: when True and no encoding decodes the bytes,
            decode with the first encoding and drop the undecodable bytes
            instead of raising.

    Returns:
        The normalized str.

    Raises:
        UnicodeDecodeError: if obj is bytes, none of the encodings can decode it
            and ignore_encoding_errors is False.
    """
    original_type = obj.__class__.__name__
    if isinstance(obj, bytes):
        if encodings is None:
            candidates = ('utf-8',)
        elif isinstance(encodings, str):
            # A bare encoding name must not be iterated character by character.
            candidates = (encodings,)
        else:
            candidates = tuple(encodings) or ('utf-8',)

        last_error = None
        for encoding in candidates:
            try:
                obj = obj.decode(encoding)
                break
            except UnicodeDecodeError as err:
                last_error = err
        else:
            # No candidate decoded the bytes strictly.
            if not ignore_encoding_errors:
                raise last_error
            obj = obj.decode(candidates[0], errors='ignore')
    if not ignore_string_type_changes:
        obj = KEY_TO_VAL_STR.format(original_type, obj)
    if ignore_string_case:
        obj = obj.lower()
    return obj
77
78
# Text that becomes DeepHash.__doc__; presumably loaded by get_doc from the
# 'deephash_doc.rst' file -- confirm against deepdiff.helper.get_doc.
doc = get_doc('deephash_doc.rst')
80
81
class DeepHash(Base):
    # The class documentation is the text produced by get_doc('deephash_doc.rst').
    __doc__ = doc

    def __init__(self,
                 obj,
                 *,
                 hashes=None,
                 exclude_types=None,
                 exclude_paths=None,
                 exclude_regex_paths=None,
                 hasher=None,
                 ignore_repetition=True,
                 significant_digits=None,
                 truncate_datetime=None,
                 number_format_notation="f",
                 apply_hash=True,
                 ignore_type_in_groups=None,
                 ignore_string_type_changes=False,
                 ignore_numeric_type_changes=False,
                 ignore_type_subclasses=False,
                 ignore_string_case=False,
                 exclude_obj_callback=None,
                 number_to_string_func=None,
                 ignore_private_variables=True,
                 parent="root",
                 **kwargs):
        """
        Hash obj and everything reachable from it, storing the results in self.hashes.

        self.hashes maps each hashed object (or the identifier returned by
        get_id for unhashable objects) to a ``(hash, count)`` tuple.

        hashes may be a MutableMapping, which is then used and mutated in place,
        or another DeepHash, whose mapping is then shared. Anything else starts
        a fresh mapping.

        Raises:
            ValueError: if an unknown keyword argument is passed.
        """
        if kwargs:
            raise ValueError(
                ("The following parameter(s) are not valid: %s\n"
                 "The valid parameters are obj, hashes, exclude_types, significant_digits, truncate_datetime, "
                 "exclude_paths, exclude_regex_paths, hasher, ignore_repetition, "
                 "number_format_notation, apply_hash, ignore_type_in_groups, ignore_string_type_changes, "
                 "ignore_numeric_type_changes, ignore_type_subclasses, ignore_string_case, "
                 "exclude_obj_callback, number_to_string_func, ignore_private_variables, parent"
                 ) % ', '.join(kwargs))
        if isinstance(hashes, MutableMapping):
            self.hashes = hashes
        elif isinstance(hashes, DeepHash):
            self.hashes = hashes.hashes
        else:
            self.hashes = dict_()
        exclude_types = set() if exclude_types is None else set(exclude_types)
        self.exclude_types_tuple = tuple(exclude_types)  # we need tuple for checking isinstance
        self.ignore_repetition = ignore_repetition
        self.exclude_paths = convert_item_or_items_into_set_else_none(exclude_paths)
        self.exclude_regex_paths = convert_item_or_items_into_compiled_regexes_else_none(exclude_regex_paths)
        self.hasher = default_hasher if hasher is None else hasher
        # Always start this run with an empty list of unprocessed items,
        # even when the hashes mapping is shared with a previous run.
        self.hashes[UNPROCESSED_KEY] = []

        self.significant_digits = self.get_significant_digits(significant_digits, ignore_numeric_type_changes)
        self.truncate_datetime = get_truncate_datetime(truncate_datetime)
        self.number_format_notation = number_format_notation
        self.ignore_type_in_groups = self.get_ignore_types_in_groups(
            ignore_type_in_groups=ignore_type_in_groups,
            ignore_string_type_changes=ignore_string_type_changes,
            ignore_numeric_type_changes=ignore_numeric_type_changes,
            ignore_type_subclasses=ignore_type_subclasses)
        self.ignore_string_type_changes = ignore_string_type_changes
        self.ignore_numeric_type_changes = ignore_numeric_type_changes
        self.ignore_string_case = ignore_string_case
        self.exclude_obj_callback = exclude_obj_callback
        # makes the hash return constant size result if true
        # the only time it should be set to False is when
        # testing the individual hash functions for different types of objects.
        self.apply_hash = apply_hash
        self.type_check_func = type_is_subclass_of_type_group if ignore_type_subclasses else type_in_type_group
        self.number_to_string = number_to_string_func or number_to_string
        self.ignore_private_variables = ignore_private_variables

        self._hash(obj, parent=parent, parents_ids=frozenset({get_id(obj)}))

        # Keep the UNPROCESSED_KEY entry only when there is something to report.
        if self.hashes[UNPROCESSED_KEY]:
            logger.warning("Can not hash the following items: {}.".format(self.hashes[UNPROCESSED_KEY]))
        else:
            del self.hashes[UNPROCESSED_KEY]

    # Expose the module-level hashers as class attributes.
    sha256hex = sha256hex
    sha1hex = sha1hex

    @staticmethod
    def _bool_to_key(obj):
        """Return the BoolObj stand-in for True/False and obj itself for anything else."""
        if obj is True:
            return BoolObj.TRUE
        if obj is False:
            return BoolObj.FALSE
        return obj

    def __getitem__(self, obj, extract_index=0):
        """Return the hash of obj; see _getitem for the meaning of extract_index."""
        return self._getitem(self.hashes, obj, extract_index=extract_index)

    @staticmethod
    def _getitem(hashes, obj, extract_index=0):
        """
        Look obj up in hashes, first by the object itself and then by get_id(obj).

        extract_index is zero for hash and 1 for count and None to get them both.
        To keep it backward compatible, we only get the hash by default so it is set to zero by default.

        Raises:
            KeyError: if obj is found neither by itself nor by its get_id identifier.
        """
        key = DeepHash._bool_to_key(obj)

        try:
            result_n_count = hashes[key]
        except (TypeError, KeyError):
            # Unhashable objects are stored under their get_id identifier.
            key = get_id(obj)
            try:
                result_n_count = hashes[key]
            except KeyError:
                raise KeyError(HASH_LOOKUP_ERR_MSG.format(obj)) from None

        # The value stored under UNPROCESSED_KEY is a plain list, not a (hash, count) pair.
        if obj is UNPROCESSED_KEY:
            extract_index = None

        return result_n_count if extract_index is None else result_n_count[extract_index]

    def __contains__(self, obj):
        """
        Tell whether obj has an entry in the hashes, using the same key
        translation as item access: True/False are looked up through their
        BoolObj stand-ins and unhashable objects through get_id.
        """
        key = self._bool_to_key(obj)
        try:
            result = key in self.hashes
        except (TypeError, KeyError):
            result = False
        if not result:
            result = get_id(obj) in self.hashes
        return result

    def get(self, key, default=None, extract_index=0):
        """
        Get method for the hashes dictionary.
        It can extract the hash for a given key that is already calculated when extract_index=0
        or the count of items that went to building the object when extract_index=1.
        Returns default when the key is not one of the hashed items.
        """
        return self.get_key(self.hashes, key, default=default, extract_index=extract_index)

    @staticmethod
    def get_key(hashes, key, default=None, extract_index=0):
        """
        get_key method for the hashes dictionary.
        It can extract the hash for a given key that is already calculated when extract_index=0
        or the count of items that went to building the object when extract_index=1.
        Returns default when the key is not one of the hashed items.
        """
        try:
            result = DeepHash._getitem(hashes, key, extract_index=extract_index)
        except KeyError:
            result = default
        return result

    def _get_objects_to_hashes_dict(self, extract_index=0):
        """
        A dictionary containing only the objects to hashes,
        or a dictionary of objects to the count of items that went to build them.
        extract_index=0 for hashes and extract_index=1 for counts.
        The UNPROCESSED_KEY entry, if present, is copied over unchanged.
        """
        result = dict_()
        for key, value in self.hashes.items():
            if key is UNPROCESSED_KEY:
                result[key] = value
            else:
                result[key] = value[extract_index]
        return result

    def __eq__(self, other):
        """
        Compare against another DeepHash (full hashes mappings, counts included)
        or against any other object (object-to-hash mapping only).
        """
        if isinstance(other, DeepHash):
            return self.hashes == other.hashes
        else:
            # We only care about the hashes
            return self._get_objects_to_hashes_dict() == other

    # NOTE: `__req__` is not a hook Python ever calls (reflected equality uses
    # `__eq__` itself); the alias is kept only for backward compatibility.
    __req__ = __eq__

    def __repr__(self):
        """
        Hide the counts since it will be confusing to see them when they are hidden everywhere else.
        """
        return short_repr(self._get_objects_to_hashes_dict(extract_index=0), max_length=500)

    __str__ = __repr__

    def __bool__(self):
        """A DeepHash is truthy when its hashes mapping is not empty."""
        return bool(self.hashes)

    def keys(self):
        """Return the keys view of the hashes mapping."""
        return self.hashes.keys()

    def values(self):
        """Yield the first element (the hash) of every value in the hashes mapping."""
        return (i[0] for i in self.hashes.values())  # Just grab the item and not its count

    def items(self):
        """Yield (key, hash) pairs, leaving the counts out."""
        return ((i, v[0]) for i, v in self.hashes.items())

    def _prep_obj(self, obj, parent, parents_ids=EMPTY_FROZENSET, is_namedtuple=False):
        """
        Prepare an arbitrary object (or a namedtuple) by hashing its attributes.

        The attributes come from ``_asdict()`` for namedtuples, otherwise from
        ``__dict__`` and, failing that, from the names listed in ``__slots__``.
        If none of these is available the object is recorded under
        UNPROCESSED_KEY and ``(unprocessed, 0)`` is returned.
        """
        original_type = type(obj) if not isinstance(obj, type) else obj
        try:
            if is_namedtuple:
                obj = obj._asdict()
            else:
                obj = obj.__dict__
        except AttributeError:
            try:
                obj = {i: getattr(obj, i) for i in obj.__slots__}
            except AttributeError:
                self.hashes[UNPROCESSED_KEY].append(obj)
                return (unprocessed, 0)

        result, counts = self._prep_dict(obj, parent=parent, parents_ids=parents_ids,
                                         print_as_attribute=True, original_type=original_type)
        result = "nt{}".format(result) if is_namedtuple else "obj{}".format(result)
        return result, counts

    def _skip_this(self, obj, parent):
        """
        Return True when obj, located at path parent, is excluded by
        exclude_paths, exclude_regex_paths, exclude_types or exclude_obj_callback.
        """
        if self.exclude_paths and parent in self.exclude_paths:
            return True
        # A generator lets any() stop at the first matching pattern.
        if self.exclude_regex_paths and any(
                exclude_regex_path.search(parent) for exclude_regex_path in self.exclude_regex_paths):
            return True
        if self.exclude_types_tuple and isinstance(obj, self.exclude_types_tuple):
            return True
        if self.exclude_obj_callback and self.exclude_obj_callback(obj, parent):
            return True
        return False

    def _prep_dict(self, obj, parent, parents_ids=EMPTY_FROZENSET, print_as_attribute=False, original_type=None):
        """
        Prepare a mapping (or the attribute mapping of an object when
        print_as_attribute is True) and return ``(text, counts)``.

        Each key/value pair contributes KEY_TO_VAL_STR.format(key_hash, value_hash);
        the contributions are sorted, so insertion order does not matter.
        """
        result = []
        counts = 1

        # Either "%s[%s]" or "%s.%s" depending on print_as_attribute.
        key_text = "%s{}".format(INDEX_VS_ATTRIBUTE[print_as_attribute])
        for key, item in obj.items():
            counts += 1
            # ignore private variables
            if self.ignore_private_variables and isinstance(key, str) and key.startswith('__'):
                continue
            key_formatted = "'%s'" % key if not print_as_attribute and isinstance(key, strings) else key
            key_in_report = key_text % (parent, key_formatted)

            key_hash, _ = self._hash(key, parent=key_in_report, parents_ids=parents_ids)
            if not key_hash:
                continue
            item_id = get_id(item)
            # Skip values that are one of their own ancestors (cycles) or that are excluded.
            if (parents_ids and item_id in parents_ids) or self._skip_this(item, parent=key_in_report):
                continue
            parents_ids_added = add_to_frozen_set(parents_ids, item_id)
            hashed, count = self._hash(item, parent=key_in_report, parents_ids=parents_ids_added)
            hashed = KEY_TO_VAL_STR.format(key_hash, hashed)
            result.append(hashed)
            counts += count

        result.sort()
        result = ';'.join(result)
        if print_as_attribute:
            type_ = original_type or type(obj)
            type_str = type_.__name__
            # When the type belongs to an ignored type group, label it with the
            # whole group so every member of the group gets the same label.
            for type_group in self.ignore_type_in_groups:
                if self.type_check_func(type_, type_group):
                    type_str = ','.join(map(lambda x: x.__name__, type_group))
                    break
        else:
            type_str = 'dict'
        return "{}:{{{}}}".format(type_str, result), counts

    def _prep_iterable(self, obj, parent, parents_ids=EMPTY_FROZENSET):
        """
        Prepare an iterable and return ``(text, counts)``.

        The item hashes are sorted, so the order of the items does not matter.
        Repetitions are collapsed when ignore_repetition is True; otherwise
        every item hash is suffixed with the number of times it occurred.
        """
        counts = 1
        result = defaultdict(int)

        for i, item in enumerate(obj):
            new_parent = "{}[{}]".format(parent, i)
            if self._skip_this(item, parent=new_parent):
                continue

            item_id = get_id(item)
            # Skip items that are one of their own ancestors (cycles).
            if parents_ids and item_id in parents_ids:
                continue

            parents_ids_added = add_to_frozen_set(parents_ids, item_id)
            hashed, count = self._hash(item, parent=new_parent, parents_ids=parents_ids_added)
            # counting repetitions
            result[hashed] += 1
            counts += count

        if self.ignore_repetition:
            result = list(result.keys())
        else:
            result = [
                '{}|{}'.format(i, v) for i, v in result.items()
            ]

        result = sorted(map(str, result))  # making sure the result items are string and sorted so join command works.
        result = ','.join(result)
        result = KEY_TO_VAL_STR.format(type(obj).__name__, result)

        return result, counts

    def _prep_bool(self, obj):
        """Return the BoolObj member that stands in for the truth value of obj."""
        return BoolObj.TRUE if obj else BoolObj.FALSE

    def _prep_number(self, obj):
        """
        Prepare a number: label it "number" when numeric type changes are ignored,
        otherwise with its class name, and render it through number_to_string
        when significant_digits is set.
        """
        type_ = "number" if self.ignore_numeric_type_changes else obj.__class__.__name__
        if self.significant_digits is not None:
            obj = self.number_to_string(obj, significant_digits=self.significant_digits,
                                        number_format_notation=self.number_format_notation)
        return KEY_TO_VAL_STR.format(type_, obj)

    def _prep_datetime(self, obj):
        """Prepare a time-like object after passing it through datetime_normalize."""
        type_ = 'datetime'
        obj = datetime_normalize(self.truncate_datetime, obj)
        return KEY_TO_VAL_STR.format(type_, obj)

    def _prep_tuple(self, obj, parent, parents_ids):
        """Prepare a tuple: as an object when it looks like a namedtuple, else as an iterable."""
        # Checking to see if it has _asdict. Which probably means it is a named
        # tuple.
        try:
            obj._asdict
        # It must be a normal tuple
        except AttributeError:
            result, counts = self._prep_iterable(obj=obj, parent=parent, parents_ids=parents_ids)
        # We assume it is a namedtuple then
        else:
            result, counts = self._prep_obj(obj, parent, parents_ids=parents_ids, is_namedtuple=True)
        return result, counts

    def _hash(self, obj, parent, parents_ids=EMPTY_FROZENSET):
        """
        The main hash method.

        Hash obj, recursing into containers and objects, store the outcome in
        self.hashes and return it as ``(result, counts)``. Objects that already
        have an entry in self.hashes are returned from there. Excluded objects
        return ``(None, 0)`` and are not stored.
        """
        counts = 1

        # Booleans are replaced by their BoolObj stand-in before being used as a key.
        if isinstance(obj, bool):
            obj = self._prep_bool(obj)
        result = not_hashed

        try:
            result, counts = self.hashes[obj]
        except (TypeError, KeyError):
            pass
        else:
            return result, counts

        if self._skip_this(obj, parent):
            return None, 0

        if obj is None:
            result = 'NONE'

        elif isinstance(obj, strings):
            result = prepare_string_for_hashing(
                obj, ignore_string_type_changes=self.ignore_string_type_changes,
                ignore_string_case=self.ignore_string_case)

        elif isinstance(obj, times):
            result = self._prep_datetime(obj)

        elif isinstance(obj, numbers):
            result = self._prep_number(obj)

        elif isinstance(obj, MutableMapping):
            result, counts = self._prep_dict(obj=obj, parent=parent, parents_ids=parents_ids)

        elif isinstance(obj, tuple):
            result, counts = self._prep_tuple(obj=obj, parent=parent, parents_ids=parents_ids)

        elif isinstance(obj, Iterable):
            result, counts = self._prep_iterable(obj=obj, parent=parent, parents_ids=parents_ids)

        # Type check instead of `==`, so a user-defined __eq__ on an arbitrary
        # object can never make it pass for a boolean.
        elif isinstance(obj, BoolObj):
            result = 'bool:true' if obj is BoolObj.TRUE else 'bool:false'
        else:
            result, counts = self._prep_obj(obj=obj, parent=parent, parents_ids=parents_ids)

        if result is not_hashed:  # pragma: no cover
            self.hashes[UNPROCESSED_KEY].append(obj)

        elif result is unprocessed:
            # _prep_obj has already recorded the object under UNPROCESSED_KEY.
            pass

        elif self.apply_hash:
            if isinstance(obj, strings):
                # Strings were already normalized above.
                result_cleaned = result
            else:
                result_cleaned = prepare_string_for_hashing(
                    result, ignore_string_type_changes=self.ignore_string_type_changes,
                    ignore_string_case=self.ignore_string_case)
            result = self.hasher(result_cleaned)

        # It is important to keep the hash of all objects.
        # The hashes will be later used for comparing the objects.
        # Object to hash when possible otherwise ObjectID to hash
        try:
            self.hashes[obj] = (result, counts)
        except TypeError:
            obj_id = get_id(obj)
            self.hashes[obj_id] = (result, counts)

        return result, counts
471
472
# When the module is executed as a script, run the doctests found in it.
if __name__ == "__main__":  # pragma: no cover
    import doctest
    doctest.testmod()
476