# MAEC Distance Measure-related Classes - BETA
# Copyright (c) 2018, The MITRE Corporation
# All rights reserved

# See LICENSE.txt for complete terms
import sys
try:
    import numpy
except ImportError:
    sys.stderr.write("Error: unable to import required numpy module.\nSee https://pypi.python.org/pypi/numpy.\n")
import maec
import itertools
import math
from maec.package.package import Package
from maec.package.malware_subject import MalwareSubject
from maec.utils.deduplicator import BundleDeduplicator
from maec.utils.merge import merge_malware_subjects
from maec.analytics.static_features import static_features_dict

class DynamicFeatureVector(object):
    '''Generate a feature vector for a Malware Subject based on its dynamic features'''
    def __init__(self, malware_subject, deduplicator, ignored_object_properties, ignored_actions):
        self.deduplicator = deduplicator
        self.dynamic_features = []
        self.unique_dynamic_features = []
        self.ignored_object_properties = ignored_object_properties
        self.ignored_actions = ignored_actions
        # Extract the features and build the vector
        self.extract_features(malware_subject)
        # Calculate the unique features
        self.get_unique_features()

    def create_action_vector(self, action):
        '''Create a vector from a single Action'''
        action_vector = set()
        # Add the Action Name to the set
        if action.name:
            action_vector.add("act:" + action.name.value)
        # Add the Object values to the set
        if action.associated_objects:
            for associated_object in action.associated_objects:
                if associated_object.properties:
                    object_vector = self.deduplicator.get_object_values(associated_object)
                    updated_vector = set()
                    for entry in object_vector:
                        updated_vector.add(entry.replace(',', ';').rstrip('\n'))
                    action_vector.update(updated_vector)
        return action_vector
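    # A hypothetical example of the resulting set for a "create file" Action
    # (the exact entries depend on the "property:value" strings returned by
    # BundleDeduplicator.get_object_values):
    #   {'act:create file', 'file_name:dropped.exe', 'file_format:pe32'}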

    def create_dynamic_vectors(self, malware_subject):
        '''Create a list of action/object vectors for an input Malware Subject'''
        action_vectors = []
        # Extract the Bundles from the Malware Subject
        bundles = malware_subject.get_all_bundles()
        for bundle in bundles:
            # Create the vector for each Action
            all_actions = bundle.get_all_actions()
            for action in all_actions:
                action_vector = self.create_action_vector(action)
                if action_vector:
                    action_vectors.append(action_vector)
        return action_vectors

    def extract_features(self, malware_subject):
        '''Extract the dynamic features from the Malware Subject'''
        # Extract the Dynamic (Action) features
        self.dynamic_features = self.create_dynamic_vectors(malware_subject)
        # Prune the Dynamic features
        self.prune_dynamic_features()

    def prune_dynamic_features(self, min_length=2):
        '''Prune the dynamic features based on ignored Object properties/Actions'''
        pruned_dynamic_features = []
        for dynamic_vector in self.dynamic_features:
            ignore_vector = False
            pruned_vector = set()
            # Do the minimum length check (to prune Actions with no Objects)
            if len(dynamic_vector) < min_length:
                continue
            # Prune any vectors with ignored Actions or Object properties
            for entity in dynamic_vector:
                split_entity = str(entity).split(':', 1)
                if split_entity[0] == 'act':
                    action_name = split_entity[1]
                    if action_name in self.ignored_actions:
                        ignore_vector = True
                        break
                    else:
                        pruned_vector.add(entity)
                elif split_entity[0] in self.ignored_object_properties:
                    continue
                else:
                    pruned_vector.add(entity)
            if ignore_vector:
                continue
            pruned_dynamic_features.append(pruned_vector)
        # Update the existing dynamic features with the pruned versions
        self.dynamic_features = pruned_dynamic_features

    def get_unique_features(self):
        '''Calculate the set of dynamic features that occur exactly once for the Malware Subject'''
        self.unique_dynamic_features = [x for x in self.dynamic_features if self.dynamic_features.count(x) == 1]

class StaticFeatureVector(object):
    '''Generate a feature vector for a Malware Subject based on its static features'''
    def __init__(self, malware_subject, deduplicator):
        self.deduplicator = deduplicator
        self.static_features = {}
        self.unique_static_features = {}
        # Extract the features and build the vector
        self.extract_features(malware_subject)
        # Calculate the unique features
        self.get_unique_features()

    def create_object_vector(self, object, static_feature_dict, callback_function=None):
        '''Create a vector from a single Object'''
        object_vector = self.deduplicator.get_object_values(object)
        for entity_string in object_vector:
            split_string = entity_string.split(':', 1)
            feature_path = str(split_string[0])
            feature_value = str(split_string[1]).lower()
            # Test if this is a feature that we want to keep
            if feature_path in static_features_dict:
                feature_dict = static_features_dict[feature_path]
                feature_name = feature_dict['feature_name']
                # Set the key in the object feature dictionary
                if feature_name in static_feature_dict:
                    # Test if multiple values are allowed for this feature
                    if 'options' in feature_dict and 'allow_multiple' in feature_dict['options']:
                        if isinstance(static_feature_dict[feature_name], list):
                            static_feature_dict[feature_name].append(feature_value)
                        else:
                            static_feature_dict[feature_name] = [static_feature_dict[feature_name], feature_value]
                    # If multiple values are not allowed, use a callback function to determine what to do
                    # E.g., if two different tools report the same value differently, this can be used to resolve that
                    # Callback function parameters: feature name, existing feature value, new feature value
                    elif callback_function:
                        existing_value = static_feature_dict[feature_name]
                        static_feature_dict[feature_name] = callback_function(feature_name, existing_value, feature_value)
                else:
                    static_feature_dict[feature_name] = feature_value
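    # Hypothetical example of the accumulated static_feature_dict after processing
    # a PE file Object (the actual feature names come from
    # maec.analytics.static_features.static_features_dict):
    #   {'size_in_bytes': '4096',
    #    'address_of_entry_point': '0x4a10',
    #    'section_entropies': ['2.13', '6.84']}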

    def create_static_vectors(self, malware_subject):
        '''Create a vector of static features for an input Malware Subject'''
        static_feature_dict = {}
        # Extract any features from the Malware Instance Object Attributes of the Malware Subject
        if malware_subject.malware_instance_object_attributes and malware_subject.malware_instance_object_attributes.properties:
            # Add the properties of the Object to the feature dict
            self.create_object_vector(malware_subject.malware_instance_object_attributes, static_feature_dict)
        # Extract any features from the Bundles in the Malware Subject
        bundles = malware_subject.get_all_bundles()
        for bundle in bundles:
            # Test the Bundle's content_type to make sure we're dealing with static analysis tool output
            if bundle.content_type == 'static analysis tool output':
                # Extract the Objects from the Bundle
                for obj in bundle.get_all_objects():
                    if obj.properties:
                        # Add the properties of the Object to the feature dict
                        self.create_object_vector(obj, static_feature_dict)
        # Always return a dictionary (possibly empty) so that downstream code
        # does not have to handle a None value
        return static_feature_dict

    def extract_features(self, malware_subject):
        '''Extract the static features from the Malware Subject'''
        # Extract the Static features
        self.static_features = self.create_static_vectors(malware_subject)

    def get_unique_features(self):
        '''Calculate the set of static features for the Malware Subject, with duplicate list values removed'''
        self.unique_static_features = {}
        for feature_name, feature_value in self.static_features.items():
            # Prune any duplicate values from list-type features
            if isinstance(feature_value, list):
                pruned_value_list = []
                for value in feature_value:
                    if value not in pruned_value_list:
                        pruned_value_list.append(value)
                self.unique_static_features[feature_name] = pruned_value_list
            else:
                self.unique_static_features[feature_name] = feature_value

class Distance(object):
    '''Calculates the distance between two or more MAEC entities.
       Currently supports only Packages or Malware Subjects.'''
    def __init__(self, maec_entity_list):
        self.maec_entity_list = maec_entity_list
        # Options dictionary
        # Currently available options:
        # use_dynamic_features : True/False. Use dynamic features (Actions) in the distance calculation.
        # use_static_features : True/False. Use static features (File/PE attributes) in the distance calculation.
        self.options_dict = {'use_dynamic_features' : True,
                             'use_static_features' : True}
        self.deduplicator = BundleDeduplicator()
        self.feature_vectors = {}
        self.superset_dynamic_vectors = []
        self.superset_static_vectors = {}
        # A list of normalized/merged Malware Subjects
        self.normalized_subjects = []
        # Dictionary of distances
        # Key = Malware Subject ID
        # Value = dictionary of distances
        #     key = Malware Subject ID
        #     value = distance
        self.distances = {}
        # Dictionary of static features to use in the distance calculation
        # Also defines how they should be post-processed/compared
        # NOTE: The default features here are merely a suggestion!
        # Options:
        # datatype = Required. The datatype of the values for the feature.
        #            Possible values: hex, hex list, int, int list, float, float list, string.
        # normalize = Optional. Normalize/scale the data.
        #             True by default.
        # scale log = Optional. Use logarithmic scaling for numeric features.
        #             True by default.
        # bin = Optional. For numerical features, use bins for the distance measure.
        # number of bins = Optional. Valid only if bin = True. The number of bins to use in binning.
        # use_raw_value = Optional. Use the raw value for the field, without any post-processing.
        #                           All other options are ignored when this setting is used.
        self.compared_static_features = {'imported_files' : {'datatype' : 'string'},
                                         'section_entropies' : {'datatype' : 'float list', 'scale log' : False},
                                         'section_virtual_sizes' : {'datatype' : 'hex list', 'scale log' : False},
                                         'address_of_entry_point' : {'datatype' : 'hex', 'scale log' : False, 'bin' : True},
                                         'size_in_bytes' : {'datatype' : 'int', 'bin' : True},
                                         'size_of_initialized_data' : {'datatype' : 'hex', 'scale log' : False, 'bin' : True, 'number of bins' : 5},
                                         'size_of_image' : {'datatype' : 'hex', 'bin' : True}}
        # List of ignored Object properties, for use in dynamic vector creation
        self.ignored_object_properties = ['address',
                                          'hashes/simple_hash_value',
                                          'id_',
                                          'type_',
                                          'pid',
                                          'size_in_bytes']
        # List of ignored Actions (not useful/difficult to correlate on), for use in dynamic vector creation
        self.ignored_actions = ['map view of section',
                                'create section',
                                'create thread',
                                'open section']
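    # For example, a caller can restrict the comparison to static (file/PE)
    # features only by toggling the options after construction (sketch):
    #   distance = Distance([package_1, package_2])
    #   distance.options_dict['use_dynamic_features'] = False
    #   distance.calculate()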

    def bin_list(self, numeric_value, numeric_list, n=10):
        '''Bin a numeric value into a bucket, based on a parent list of values.
           n = number of buckets to use (default = 10).'''
        bin_vector = numpy.array([0] * n)
        # Sanity checking for lists with a single value
        if len(numeric_list) == 1:
            bin_vector[n-1] = 1
            return bin_vector
        max_list = max(numeric_list)
        min_list = min(numeric_list)
        bucket_size = (max_list - min_list) / float(n)
        # Sanity checking for lists where all of the values are identical
        if bucket_size == 0:
            bin_vector[n-1] = 1
            return bin_vector
        bin_value = int(math.floor((numeric_value - min_list) / bucket_size))
        if bin_value == n:
            bin_value -= 1
        bin_vector[bin_value] = 1
        return bin_vector
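    # Worked example: bin_list(0.25, [0.0, 0.25, 0.5, 0.75, 1.0], n=4) gives a
    # bucket size of 0.25 and bin_value = floor((0.25 - 0.0) / 0.25) = 1, so the
    # returned vector is array([0, 1, 0, 0]).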

    def add_log(self, number, log_list):
        '''Append the natural logarithm of a number to a list (zero is appended as-is)'''
        if number != 0:
            log_list.append(float(math.log(number)))
        else:
            log_list.append(float(number))

    def normalize_numeric(self, numeric_value, numeric_list, normalize=True, scale_log=True):
        '''Scale a numeric value, based on a parent list of values.
           Return the scaled/normalized form.'''
        # Sanity check for zeros
        if numeric_value == 0:
            return float(0)
        if normalize:
            if scale_log:
                log_list = []
                for number in numeric_list:
                    self.add_log(number, log_list)
                return math.log(float(numeric_value))/max(log_list)
            else:
                return float(numeric_value)/max(numeric_list)
        else:
            return numeric_value
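    # Worked example: normalize_numeric(100, [10, 100, 1000]) with the default
    # log scaling returns log(100)/log(1000), roughly 0.667; with scale_log=False
    # it would instead return 100/1000 = 0.1.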

    def normalize_numeric_list(self, value_list, numeric_list, normalize=True, scale_log=True):
        '''Scale a list of numeric values, based on a parent list of numeric value lists.
           Return the scaled/normalized form.'''
        # Find the maximum length of all of the lists
        max_len = max(len(p) for p in numeric_list)
        if normalize:
            # Find the maximum value in all of the lists
            max_val = max(max(p) for p in numeric_list)
            if scale_log:
                log_list = []
                for vector_entry in value_list:
                    self.add_log(vector_entry, log_list)
                # Scale the list
                scaled_list = [float(x)/math.log(max_val) for x in log_list]
                scaled_vector = numpy.array(scaled_list)
                # Resize (zero-pad) the vector to the maximum length
                scaled_vector.resize(max_len, refcheck=False)
                return scaled_vector
            else:
                # Scale the list
                scaled_list = [float(x)/max_val for x in value_list]
                scaled_vector = numpy.array(scaled_list)
                # Resize (zero-pad) the vector to the maximum length
                scaled_vector.resize(max_len, refcheck=False)
                return scaled_vector
        else:
            # Resize (zero-pad) the vector to the maximum length and return it
            # (ndarray.resize operates in place and returns None)
            value_list.resize(max_len, refcheck=False)
            return value_list

    def build_string_vector(self, string_list, superset_string_list, ignore_case=True):
        '''Build a vector from an input list of strings and superset list of strings.'''
        # Flatten the superset list
        flattened_string_list = self.flatten_vector(superset_string_list)
        # List of ignored/skipped strings
        ignored_strings = ['none']
        # List of unique strings
        unique_strings = []
        # First, build up the unique strings
        for string in flattened_string_list:
            normalized_string = string
            # Ignore case if specified
            if ignore_case:
                normalized_string = string.lower()
            if normalized_string not in ignored_strings and normalized_string not in unique_strings:
                unique_strings.append(normalized_string)
        # Next, build the actual strings vector
        string_vector = numpy.array([0] * len(unique_strings))
        normalized_string_list = string_list
        # Ignore case if specified
        if ignore_case:
            normalized_string_list = [str(x).lower() for x in string_list]
        for i in range(0, len(unique_strings)):
            if unique_strings[i] in normalized_string_list:
                string_vector[i] = 1
            else:
                string_vector[i] = 0
        return string_vector
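    # Hypothetical example: if the superset of 'imported_files' values across all
    # subjects flattens to ['kernel32.dll', 'user32.dll', 'ws2_32.dll'], a subject
    # that imports only kernel32.dll and ws2_32.dll is encoded as array([1, 0, 1]).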

    def preprocess_entities(self, dereference=True):
        '''Pre-process the MAEC entities'''
        malware_subjects = []
        # Dereference and normalize the Malware Subjects in the Package
        for entity in self.maec_entity_list:
            # Test if we're dealing with a Package or a Malware Subject
            if isinstance(entity, Package):
                for malware_subject in entity.malware_subjects:
                    # Dereference the Bundles in the Malware Subject
                    if dereference:
                        malware_subject.dereference_bundles()
                    # Normalize the Bundles in the Malware Subject
                    malware_subject.normalize_bundles()
                    # Add the Malware Subject to the list
                    malware_subjects.append(malware_subject)
            elif isinstance(entity, MalwareSubject):
                # Dereference the Bundles in the Malware Subject
                if dereference:
                    entity.dereference_bundles()
                # Normalize the Bundles in the Malware Subject
                entity.normalize_bundles()
                # Add the Malware Subject to the list
                malware_subjects.append(entity)
        # Merge the Malware Subjects by hash (if possible)
        return merge_malware_subjects(malware_subjects)

    def generate_feature_vectors(self, merged_subjects):
        '''Generate a feature vector for each of the merged Malware Subjects'''
        for malware_subject in merged_subjects:
            feature_vector_dict = {'dynamic' : DynamicFeatureVector(malware_subject, self.deduplicator, self.ignored_object_properties, self.ignored_actions),
                                   'static' : StaticFeatureVector(malware_subject, self.deduplicator)}
            self.feature_vectors[malware_subject.id_] = feature_vector_dict

    def flatten_vector(self, vector_entry_list):
        '''Generate a single, flattened vector from an input list of vectors or values.'''
        component_list = []
        for vector_entry in vector_entry_list:
            if isinstance(vector_entry, (numpy.ndarray, list)):
                for component in vector_entry:
                    component_list.append(component)
            else:
                component_list.append(vector_entry)
        return component_list

    def normalize_vectors(self, vector_1, vector_2):
        '''Normalize two input vectors so that they have similar composition.'''
        for i in range(0, len(vector_1)):
            if type(vector_1[i]) != type(vector_2[i]):
                if isinstance(vector_1[i], numpy.ndarray) and not isinstance(vector_2[i], numpy.ndarray):
                    vector_2[i] = numpy.array([0] * len(vector_1[i]))
                elif not isinstance(vector_1[i], numpy.ndarray) and isinstance(vector_2[i], numpy.ndarray):
                    vector_1[i] = numpy.array([0] * len(vector_2[i]))

    def create_static_result_vector(self, static_vector):
        '''Construct the static result (matching) vector for a corresponding feature vector'''
        results_vector = []
        for feature_name in self.compared_static_features:
            # Test if we wish to use the feature in the comparison
            if feature_name in static_vector.unique_static_features:
                # Get the value of the feature
                feature_value = static_vector.unique_static_features[feature_name]
                # Get the options dictionary for the feature
                feature_options_dict = self.compared_static_features[feature_name]
                feature_items = self.superset_static_vectors[feature_name]
                # Check if the raw value setting is specified
                if 'use_raw_value' in feature_options_dict:
                    results_vector.append(feature_value)
                    continue
                # Determine if numeric values should be logarithmically scaled - True by default
                scale_log = True
                if 'scale log' in feature_options_dict:
                    scale_log = feature_options_dict['scale log']
                # Determine if numeric values should be normalized - True by default
                normalize = True
                if 'normalize' in feature_options_dict:
                    normalize = feature_options_dict['normalize']
                # Normalize the items for the feature based on the specified datatype
                # Use this to construct the results vector
                normalized_value = None
                # Normalize on hex values
                if feature_options_dict['datatype'] == 'hex':
                    converted_types = [int(x, 0) for x in feature_items]
                    normalized_value = self.normalize_numeric(int(feature_value, 0), converted_types, normalize, scale_log)
                # Normalize on lists of hex values
                elif feature_options_dict['datatype'] == 'hex list':
                    converted_types = [numpy.array([int(x, 0) for x in y]) for y in feature_items]
                    normalized_value = self.normalize_numeric_list(numpy.array([int(x, 0) for x in feature_value]), converted_types, normalize, scale_log)
                # Normalize on int values
                elif feature_options_dict['datatype'] == 'int':
                    converted_types = [int(x) for x in feature_items]
                    normalized_value = self.normalize_numeric(int(feature_value), converted_types, normalize, scale_log)
                # Normalize on lists of int values
                elif feature_options_dict['datatype'] == 'int list':
                    converted_types = [numpy.array([int(x) for x in y]) for y in feature_items]
                    normalized_value = self.normalize_numeric_list(numpy.array([int(x) for x in feature_value]), converted_types, normalize, scale_log)
                # Normalize on float values
                elif feature_options_dict['datatype'] == 'float':
                    converted_types = [float(x) for x in feature_items]
                    normalized_value = self.normalize_numeric(float(feature_value), converted_types, normalize, scale_log)
                # Normalize on lists of float values
                elif feature_options_dict['datatype'] == 'float list':
                    converted_types = [numpy.array([float(x) for x in y]) for y in feature_items]
                    normalized_value = self.normalize_numeric_list(numpy.array([float(x) for x in feature_value]), converted_types, normalize, scale_log)
                # Normalize on string values
                elif feature_options_dict['datatype'] == 'string':
                    string_vector = self.build_string_vector(feature_value, feature_items)
                    results_vector.append(string_vector)
                # Bin any values, if specified in the options dictionary
                if 'bin' in feature_options_dict and feature_options_dict['bin']:
                    normalized_items = [self.normalize_numeric(x, converted_types, normalize, scale_log) for x in converted_types]
                    if 'number of bins' in feature_options_dict:
                        binned_value = self.bin_list(normalized_value, normalized_items, feature_options_dict['number of bins'])
                    else:
                        binned_value = self.bin_list(normalized_value, normalized_items)
                    results_vector.append(binned_value)
                elif normalized_value is not None:
                    results_vector.append(normalized_value)
            else:
                results_vector.append(0)
        return results_vector
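    # The resulting vector is heterogeneous: one entry per compared static feature.
    # An entry may be a normalized numeric scalar, a numpy array (a scaled numeric
    # list, a binned value, or a string membership vector), the raw value when
    # 'use_raw_value' is set, or 0 when the Malware Subject lacks the feature.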

    def create_dynamic_result_vector(self, dynamic_vector):
        '''Construct the dynamic result (matching) vector for a corresponding feature vector'''
        # Construct the results vector for the dynamic vectors
        results_vector = numpy.array([0] * len(self.superset_dynamic_vectors))
        for i, vector in enumerate(self.superset_dynamic_vectors):
            if vector in dynamic_vector.unique_dynamic_features:
                results_vector[i] = 1
        return results_vector
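    # The dynamic result vector is a membership vector over the global superset of
    # unique action vectors: entry i is 1 when the i-th superset action vector also
    # appears in this Malware Subject's unique dynamic features.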

    def create_superset_vectors(self):
        '''Calculate vector supersets from the feature vectors'''
        for feature_vector_dict in self.feature_vectors.values():
            dynamic_vector = feature_vector_dict['dynamic']
            static_vector = feature_vector_dict['static']
            # Build the superset of dynamic vectors
            for vector in dynamic_vector.unique_dynamic_features:
                if vector not in self.superset_dynamic_vectors:
                    self.superset_dynamic_vectors.append(vector)
            # Build the superset of static vectors
            for feature_name, feature_value in static_vector.unique_static_features.items():
                if feature_name not in self.superset_static_vectors:
                    self.superset_static_vectors[feature_name] = [feature_value]
                else:
                    self.superset_static_vectors[feature_name].append(feature_value)

    def euclidean_distance(self, vector_1, vector_2):
        '''Calculate the Euclidean distance between two input vectors'''
        distance = 0.0
        for i in range(0, len(vector_1)):
            if isinstance(vector_1[i], float):
                distance += math.pow(vector_1[i] - vector_2[i], 2)
            elif isinstance(vector_1[i], numpy.ndarray):
                for vi in range(0, len(vector_1[i])):
                    distance += math.pow(vector_1[i][vi] - vector_2[i][vi], 2)
            elif isinstance(vector_1[i], (int, str)):
                if vector_1[i] != vector_2[i]:
                    distance += 1.0
        return math.sqrt(distance)
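    # The input vectors may mix floats, numpy arrays (compared element-wise), and
    # other scalars such as ints or strings (which contribute 1 when unequal).
    # For example, euclidean_distance([0.5, numpy.array([1, 0]), 'pe32'],
    # [0.1, numpy.array([0, 0]), 'pe32']) is sqrt(0.4**2 + 1**2 + 0) ~= 1.077.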

    def populate_hashes_mapping(self, malware_subject_list):
        '''Populate and return the Malware Subject -> Hashes mapping from an input list of Malware Subjects.'''
        hashes_mapping = {}
        for malware_subject in malware_subject_list:
            # Always create an entry (possibly empty) so that callers can safely index by ID
            hashes_dict = {}
            mal_inst_obj = malware_subject.malware_instance_object_attributes
            if mal_inst_obj and mal_inst_obj.properties and mal_inst_obj.properties.hashes:
                for hash_entry in mal_inst_obj.properties.hashes:
                    hash_type = None
                    hash_value = None
                    if hash_entry.type_:
                        hash_type = hash_entry.type_.value
                    if hash_entry.simple_hash_value:
                        hash_value = hash_entry.simple_hash_value.value
                    elif hash_entry.fuzzy_hash_value:
                        hash_value = hash_entry.fuzzy_hash_value.value
                    if hash_type and hash_value:
                        hashes_dict[str(hash_type).lower()] = str(hash_value).lower()
            hashes_mapping[malware_subject.id_] = hashes_dict
        return hashes_mapping

    def perform_calculation(self):
        '''Perform the actual distance calculation.
           Store the results in the distances dictionary.'''
        # Determine the different combinations of Malware Subjects
        combinations = itertools.combinations(self.feature_vectors, r=2)
        for combination in combinations:
            if self.options_dict['use_dynamic_features']:
                dynamic_vectors = (self.feature_vectors[combination[0]]['dynamic_result'],
                                   self.feature_vectors[combination[1]]['dynamic_result'])
            if self.options_dict['use_static_features']:
                static_vectors = (self.feature_vectors[combination[0]]['static_result'],
                                  self.feature_vectors[combination[1]]['static_result'])
                # Normalize the static vectors (to make them equal length)
                self.normalize_vectors(static_vectors[0], static_vectors[1])
            # Generate the combined vectors if necessary and calculate the distance
            if self.options_dict['use_dynamic_features'] and self.options_dict['use_static_features']:
                result_vectors = (numpy.array(list(dynamic_vectors[0]) + self.flatten_vector(static_vectors[0])),
                                  numpy.array(list(dynamic_vectors[1]) + self.flatten_vector(static_vectors[1])))
            elif self.options_dict['use_dynamic_features'] and not self.options_dict['use_static_features']:
                result_vectors = (numpy.array(list(dynamic_vectors[0])),
                                  numpy.array(list(dynamic_vectors[1])))
            elif not self.options_dict['use_dynamic_features'] and self.options_dict['use_static_features']:
                result_vectors = (self.flatten_vector(static_vectors[0]),
                                  self.flatten_vector(static_vectors[1]))
            distance = self.euclidean_distance(result_vectors[0], result_vectors[1])
            # Add the result to the distances dictionary
            for i in range(0, 2):
                opposite = 1 - i
                if combination[i] not in self.distances:
                    self.distances[combination[i]] = {combination[opposite] : distance}
                else:
                    self.distances[combination[i]][combination[opposite]] = distance

    def calculate(self):
        '''Calculate the distances between the input Malware Subjects.'''
        # Pre-process and merge the entities
        self.normalized_subjects = self.preprocess_entities()
        # Generate the feature vectors for the entities
        self.generate_feature_vectors(self.normalized_subjects)
        # Build up the supersets of unique vectors
        self.create_superset_vectors()
        # Construct the result vectors
        for feature_vector_dict in self.feature_vectors.values():
            if self.options_dict['use_dynamic_features']:
                # Construct the dynamic result vector
                feature_vector_dict['dynamic_result'] = self.create_dynamic_result_vector(feature_vector_dict['dynamic'])
            if self.options_dict['use_static_features']:
                # Construct the static result vector
                feature_vector_dict['static_result'] = self.create_static_result_vector(feature_vector_dict['static'])
        # Perform the actual distance calculation
        self.perform_calculation()

    def print_distances(self, file_object, default_label='md5', delimiter=','):
        '''Print the distances between the Malware Subjects in delimited matrix format
           to a File-like object.

           Try to use the MD5s of the Malware Subjects as the default label.
           Uses commas as the default delimiter, for CSV-like output.'''
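        # Hypothetical output for three Malware Subjects labeled by MD5
        # (the distance values shown are illustrative):
        #   ,<md5_1>,<md5_2>,<md5_3>
        #   <md5_1>,0.0,2.45,3.16
        #   <md5_2>,2.45,0.0,1.73
        #   <md5_3>,3.16,1.73,0.0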
        hashes_mapping = self.populate_hashes_mapping(self.normalized_subjects)
        distance_strings = []
        # Generate the header string and individual distance strings
        header_string = delimiter
        for malware_subject in self.normalized_subjects:
            distance_string = ''
            hashes = hashes_mapping[malware_subject.id_]
            if default_label in hashes:
                distance_string += (hashes[default_label] + delimiter)
                header_string += (hashes[default_label] + delimiter)
            else:
                distance_string += (malware_subject.id_ + delimiter)
                header_string += (malware_subject.id_ + delimiter)
            for other_malware_subject in self.normalized_subjects:
                if malware_subject.id_ == other_malware_subject.id_:
                    distance_string += ('0.0' + delimiter)
                else:
                    distance_string += (str(self.distances[malware_subject.id_][other_malware_subject.id_])
                                        + delimiter)
            distance_strings.append(distance_string.rstrip(delimiter))

        # Print the header and distance strings
        file_object.write(header_string.rstrip(delimiter) + "\n")
        for distance_string in distance_strings:
            file_object.write(distance_string + "\n")
        file_object.flush()

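# Minimal usage sketch (assumes the input MAEC documents have already been parsed
# into maec Package or MalwareSubject API objects, and that this module is
# importable as maec.analytics.distance):
#
#   import sys
#   from maec.analytics.distance import Distance
#
#   distance = Distance([package_1, package_2])
#   distance.calculate()
#   distance.print_distances(sys.stdout)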