# MAEC Distance Measure-related Classes - BETA
# Copyright (c) 2018, The MITRE Corporation
# All rights reserved

# See LICENSE.txt for complete terms
import sys
try:
    import numpy
except ImportError:
    # Report the failure on stderr: it is an error, and stdout may be piped
    # (e.g. when print_distances writes CSV output to stdout).
    sys.stderr.write("Error: unable to import required numpy module.\nSee https://pypi.python.org/pypi/numpy.")
import os
import subprocess
import collections
import maec
import itertools
import math
from maec.package.package import Package
from maec.package.malware_subject import MalwareSubject
from maec.utils.deduplicator import BundleDeduplicator
from maec.utils.merge import merge_malware_subjects
from maec.analytics.static_features import static_features_dict

class DynamicFeatureVector(object):
    '''Generate a feature vector for a Malware Subject based on its dynamic features.

    Each dynamic feature is a set of strings describing one Action and the
    values of its associated Objects (e.g. {"act:create file", "file_path:..."}).
    '''
    def __init__(self, malware_subject, deduplicator, ignored_object_properties, ignored_actions):
        self.deduplicator = deduplicator
        # All pruned action/object vectors extracted from the Malware Subject
        self.dynamic_features = []
        # The subset of dynamic_features that occurs exactly once
        self.unique_dynamic_features = []
        self.ignored_object_properties = ignored_object_properties
        self.ignored_actions = ignored_actions
        # Extract the features and build the vector
        self.extract_features(malware_subject)
        # Calculate the unique features
        self.get_unique_features()

    def create_action_vector(self, action):
        '''Create a vector (a set of strings) from a single Action.'''
        action_vector = set()
        # Add the Action Name to the set
        if action.name:
            action_vector.add("act:" + action.name.value)
        # Add the Object values to the set
        if action.associated_objects:
            for associated_object in action.associated_objects:
                if associated_object.properties:
                    object_vector = self.deduplicator.get_object_values(associated_object)
                    updated_vector = set()
                    for entry in object_vector:
                        # Commas would break the CSV-style output downstream,
                        # so replace them with semicolons
                        updated_vector.add(entry.replace(',', ';').rstrip('\n'))
                    action_vector.update(updated_vector)
        return action_vector

    def create_dynamic_vectors(self, malware_subject):
        '''Create a list of unique action/object pair vectors for an input Malware Subject.'''
        action_vectors = []
        # Extract the Bundles from the Malware Subject
        bundles = malware_subject.get_all_bundles()
        for bundle in bundles:
            # Create the vector for each Action
            all_actions = bundle.get_all_actions()
            for action in all_actions:
                action_vector = self.create_action_vector(action)
                if action_vector:
                    action_vectors.append(action_vector)
        return action_vectors

    def extract_features(self, malware_subject):
        '''Extract the dynamic features from the Malware Subject.'''
        # Extract the Dynamic (Action) features
        self.dynamic_features = self.create_dynamic_vectors(malware_subject)
        # Prune the Dynamic features
        self.prune_dynamic_features()

    def prune_dynamic_features(self, min_length=2):
        '''Prune the dynamic features based on ignored Object properties/Actions.

        Vectors shorter than min_length (i.e. Actions with no Objects) are
        dropped, as are vectors whose Action name is in ignored_actions.
        Entries whose property path is in ignored_object_properties are
        removed from the surviving vectors.
        '''
        pruned_dynamic_features = []
        for dynamic_vector in self.dynamic_features:
            ignore_vector = False
            pruned_vector = set()
            # Do the minimum length check (to prune Actions with no Objects)
            if len(dynamic_vector) < min_length:
                continue
            # Prune any vectors with ignored actions or object properties.
            # Split only on the first colon so action names/values that
            # themselves contain colons are not truncated.
            for entity in dynamic_vector:
                split_entity = str(entity).split(':', 1)
                if split_entity[0] == 'act':
                    action_name = split_entity[1]
                    if action_name in self.ignored_actions:
                        ignore_vector = True
                        break
                    else:
                        pruned_vector.add(entity)
                elif split_entity[0] in self.ignored_object_properties:
                    continue
                else:
                    pruned_vector.add(entity)
            if not ignore_vector:
                pruned_dynamic_features.append(pruned_vector)
        # Update the existing dynamic features with the pruned versions
        self.dynamic_features = pruned_dynamic_features

    def get_unique_features(self):
        '''Calculate the dynamic features that occur exactly once for the Malware Subject.'''
        # Count by frozenset so the uniqueness test is O(1) per vector instead
        # of the quadratic list.count() scan. frozenset/set equality matches
        # plain set equality, so the result is unchanged.
        counts = collections.Counter(frozenset(v) for v in self.dynamic_features)
        self.unique_dynamic_features = [v for v in self.dynamic_features
                                        if counts[frozenset(v)] == 1]
class StaticFeatureVector(object):
    '''Generate a feature vector for a Malware Subject based on its static features.

    Features are stored as a mapping of feature name -> value (or list of
    values when a feature allows multiples).
    '''
    def __init__(self, malware_subject, deduplicator):
        self.deduplicator = deduplicator
        # Mapping of feature name -> value (or list of values)
        self.static_features = {}
        # Same mapping, with duplicate values removed from list-type entries
        self.unique_static_features = {}
        # Extract the features and build the vector
        self.extract_features(malware_subject)
        # Calculate the unique features
        self.get_unique_features()

    def create_object_vector(self, object, static_feature_dict, callback_function=None):
        '''Add the features of a single Object to static_feature_dict.

        callback_function(feature_name, existing_value, new_value) is used to
        resolve conflicts for features that do not allow multiple values —
        e.g., if two different tools report the same value differently.
        '''
        object_vector = self.deduplicator.get_object_values(object)
        for entity_string in object_vector:
            # Split only on the first colon so values that themselves contain
            # colons (e.g. Windows file paths) are not truncated
            split_string = entity_string.split(':', 1)
            if len(split_string) < 2:
                # Malformed entry with no value portion; skip it
                continue
            feature_path = str(split_string[0])
            feature_value = str(split_string[1]).lower()
            # Test if this is a feature that we want to keep
            if feature_path in static_features_dict:
                feature_dict = static_features_dict[feature_path]
                feature_name = feature_dict['feature_name']
                # Set the key in the object feature dictionary
                if feature_name in static_feature_dict:
                    # Test if multiple values are allowed for this feature
                    if 'options' in feature_dict and 'allow_multiple' in feature_dict['options']:
                        if isinstance(static_feature_dict[feature_name], list):
                            static_feature_dict[feature_name].append(feature_value)
                        else:
                            static_feature_dict[feature_name] = [static_feature_dict[feature_name], feature_value]
                    # If multiples aren't allowed, use the callback to decide
                    # which value wins
                    elif callback_function:
                        existing_value = static_feature_dict[feature_name]
                        static_feature_dict[feature_name] = callback_function(feature_name, existing_value, feature_value)
                else:
                    static_feature_dict[feature_name] = feature_value

    def create_static_vectors(self, malware_subject):
        '''Create a dictionary of static features for an input Malware Subject.

        Always returns a dict (possibly empty) so that downstream .items()
        calls are safe even when no static features were found.
        '''
        static_feature_dict = {}
        # Extract any features from the Malware Instance Object Attributes of the Malware Subject
        if malware_subject.malware_instance_object_attributes and malware_subject.malware_instance_object_attributes.properties:
            # Add the properties of the Object to the feature dict
            self.create_object_vector(malware_subject.malware_instance_object_attributes, static_feature_dict)
        # Extract any features from the Bundles in the Malware Subject
        for bundle in malware_subject.get_all_bundles():
            # Only use Bundles that hold static analysis tool output
            if bundle.content_type and bundle.content_type == 'static analysis tool output':
                # Extract the Objects from the Bundle
                for obj in bundle.get_all_objects():
                    if obj.properties:
                        # Add the properties of the Object to the feature dict
                        self.create_object_vector(obj, static_feature_dict)
        return static_feature_dict

    def extract_features(self, malware_subject):
        '''Extract the static features from the Malware Subject.'''
        self.static_features = self.create_static_vectors(malware_subject)

    def get_unique_features(self):
        '''Calculate the unique set of static features for the Malware Subject.'''
        self.unique_static_features = {}
        for feature_name, feature_value in self.static_features.items():
            if isinstance(feature_value, list):
                # Deduplicate list-type values, preserving first-seen order
                pruned_value_list = []
                for value in feature_value:
                    if value not in pruned_value_list:
                        pruned_value_list.append(value)
                self.unique_static_features[feature_name] = pruned_value_list
            else:
                self.unique_static_features[feature_name] = feature_value
class Distance(object):
    '''Calculates distance between two or more MAEC entities.
    Currently supports only Packages or Malware Subjects.'''
    def __init__(self, maec_entity_list):
        self.maec_entity_list = maec_entity_list
        # Options dictionary
        # currently available options:
        #   use_dynamic_features : True/False. Use dynamic features (Actions) in the distance calculation.
        #   use_static_features : True/False. Use static features (File/PE attributes) in the distance calculation.
        self.options_dict = {'use_dynamic_features' : True,
                             'use_static_features' : True}
        self.deduplicator = BundleDeduplicator()
        # Malware Subject ID -> {'dynamic': ..., 'static': ...} feature vectors
        self.feature_vectors = {}
        self.superset_dynamic_vectors = []
        self.superset_static_vectors = {}
        # A list of normalized/merged Malware Subjects
        self.normalized_subjects = []
        # Dictionary of distances
        # Key = Malware Subject ID
        # Value = dictionary of distances
        #           key = Malware Subject ID
        #           value = distance
        self.distances = {}
        # Dictionary of static features to use in the distance calculation
        # Also, defines how they should be post-processed/compared
        # NOTE: The default features here are merely a suggestion!
        # Options:
        #   datatype = Required. The datatype of the values for the feature.
        #              Possible values: hex, hex list, int, int list, float, float list, string.
        #   normalize = Optional. Normalize/scale the data. True by default.
        #   scale log = Optional. Use logarithmic scaling for the list of numeric features. True by default.
        #   bin = Optional. For numerical features, use bins for the distance measure.
        #   number of bins = Optional. Valid only if bin = true. The number of bins to use in binning.
        #   use_raw_value = Optional. Use the raw value for the field, without any post-processing.
        #                   All other options are ignored when this setting is used.
        self.compared_static_features = {'imported_files' : {'datatype' : 'string'},
                                         'section_entropies' : {'datatype' : 'float list', 'scale log' : False},
                                         'section_virtual_sizes' : {'datatype' : 'hex list', 'scale log' : False},
                                         'address_of_entry_point' : {'datatype' : 'hex', 'scale log' : False, 'bin' : True},
                                         'size_in_bytes' : {'datatype' : 'int', 'bin' : True},
                                         'size_of_initialized_data' : {'datatype' : 'hex', 'scale log' : False, 'bin' : True, 'number of bins' : 5},
                                         'size_of_image' : {'datatype' : 'hex', 'bin' : True}}
        # List of ignored object attributes, for use in dynamic vector creation
        self.ignored_object_properties = ['address',
                                          'hashes/simple_hash_value',
                                          'id_',
                                          'type_',
                                          'pid',
                                          'size_in_bytes']
        # List of ignored actions (not useful/difficult to correlate on), for use in dynamic vector creation
        self.ignored_actions = ['map view of section',
                                'create section',
                                'create thread',
                                'open section']

    def bin_list(self, numeric_value, numeric_list, n=10):
        '''Bin a numeric value into a bucket, based on a parent list of values.

        n = number of buckets to use (default = 10).
        Returns a one-hot numpy array of length n.'''
        bin_vector = numpy.array([0] * n)
        max_list = max(numeric_list)
        min_list = min(numeric_list)
        # Sanity checking: with a single value, or all values equal, the
        # bucket width would be zero (division by zero), so place the value
        # in the last bucket
        if len(numeric_list) == 1 or max_list == min_list:
            bin_vector[n - 1] = 1
            return bin_vector
        # Use float division so integer inputs don't floor the bucket width
        bucket_size = float(max_list - min_list) / n
        bin_value = int(math.floor((numeric_value - min_list) / bucket_size))
        # The maximum value lands one past the last bucket; clamp it
        if bin_value == n:
            bin_value -= 1
        bin_vector[bin_value] = 1
        return bin_vector

    def add_log(self, number, log_list):
        '''Append the natural log of a number to a list (zero is appended as 0.0).'''
        if number != 0:
            log_list.append(float(math.log(number)))
        else:
            log_list.append(float(number))

    def normalize_numeric(self, numeric_value, numeric_list, normalize=True, scale_log=True):
        '''Scale a numeric value, based on a parent list of values.
        Return the scaled/normalized form.'''
        # Sanity check for zeros
        if numeric_value == 0:
            return float(0)
        if normalize:
            if scale_log:
                log_list = []
                for number in numeric_list:
                    self.add_log(number, log_list)
                return math.log(float(numeric_value)) / max(log_list)
            else:
                return float(numeric_value) / max(numeric_list)
        else:
            return numeric_value

    def normalize_numeric_list(self, value_list, numeric_list, normalize=True, scale_log=True):
        '''Scale a list of numeric values, based on a parent list of numeric value lists.
        Return the scaled/normalized form as a numpy array padded to the
        maximum list length.'''
        # Find the maximum length of all of the lists
        max_len = max(len(p) for p in numeric_list)
        if normalize:
            # Find the maximum value in all of the lists
            max_val = max(max(p) for p in numeric_list)
            if scale_log:
                log_list = []
                for vector_entry in value_list:
                    self.add_log(vector_entry, log_list)
                # Scale the list logarithmically
                scaled_list = [float(x) / math.log(max_val) for x in log_list]
            else:
                # Scale the list linearly
                scaled_list = [float(x) / max_val for x in value_list]
            scaled_vector = numpy.array(scaled_list)
            # Pad the vector with zeros up to the maximum length
            scaled_vector.resize(max_len, refcheck=False)
            return scaled_vector
        else:
            # ndarray.resize operates in place and returns None, so resize
            # first and then return the vector itself
            value_list.resize(max_len, refcheck=False)
            return value_list

    def build_string_vector(self, string_list, superset_string_list, ignore_case=True):
        '''Build a 0/1 membership vector from an input list of strings and a
        superset list of strings.'''
        # Flatten the superset list
        flattened_string_list = self.flatten_vector(superset_string_list)
        # List of ignored/skipped strings
        ignored_strings = ['none']
        # List of unique strings
        unique_strings = []
        # First, build up the unique strings
        for string in flattened_string_list:
            normalized_string = string
            # Ignore case if specified
            if ignore_case:
                normalized_string = string.lower()
            if normalized_string not in ignored_strings and normalized_string not in unique_strings:
                unique_strings.append(normalized_string)
        # Next, build the actual strings vector
        string_vector = numpy.array([0] * len(unique_strings))
        normalized_string_list = string_list
        # Ignore case if specified
        if ignore_case:
            normalized_string_list = [str(x).lower() for x in string_list]
        for i, unique_string in enumerate(unique_strings):
            string_vector[i] = 1 if unique_string in normalized_string_list else 0
        return string_vector

    def preprocess_entities(self, dereference=True):
        '''Pre-process the MAEC entities: dereference/normalize their Bundles
        and merge Malware Subjects that share hashes.'''
        malware_subjects = []
        # Dereference and normalize the Malware Subjects in the Package
        for entity in self.maec_entity_list:
            # Test if we're dealing with a Package or Malware Subject
            if isinstance(entity, Package):
                for malware_subject in entity.malware_subjects:
                    # Dereference the Bundles in the Malware Subject
                    if dereference:
                        malware_subject.dereference_bundles()
                    # Normalize the Bundles in the Malware Subject
                    malware_subject.normalize_bundles()
                    # Add the Malware Subject to the list
                    malware_subjects.append(malware_subject)
            elif isinstance(entity, MalwareSubject):
                # Dereference the Bundles in the Malware Subject
                if dereference:
                    entity.dereference_bundles()
                # Normalize the Bundles in the Malware Subject
                entity.normalize_bundles()
                # Add the Malware Subject to the list
                # (the original appended the stale loop variable
                # 'malware_subject' here instead of 'entity')
                malware_subjects.append(entity)
        # Merge the Malware Subjects by hash (if possible)
        return merge_malware_subjects(malware_subjects)

    def generate_feature_vectors(self, merged_subjects):
        '''Generate a feature vector for each of the merged Malware Subjects.'''
        for malware_subject in merged_subjects:
            feature_vector_dict = {'dynamic' : DynamicFeatureVector(malware_subject, self.deduplicator, self.ignored_object_properties, self.ignored_actions),
                                   'static' : StaticFeatureVector(malware_subject, self.deduplicator)}
            self.feature_vectors[malware_subject.id_] = feature_vector_dict

    def flatten_vector(self, vector_entry_list):
        '''Generate a single, flattened list from an input list of vectors or values.'''
        component_list = []
        for vector_entry in vector_entry_list:
            if isinstance(vector_entry, (numpy.ndarray, list)):
                component_list.extend(vector_entry)
            else:
                component_list.append(vector_entry)
        return component_list

    def normalize_vectors(self, vector_1, vector_2):
        '''Normalize two input vectors so that they have similar composition:
        wherever one holds an array and the other a scalar, the scalar is
        replaced with a zero array of matching length.'''
        for i in range(0, len(vector_1)):
            if type(vector_1[i]) != type(vector_2[i]):
                if isinstance(vector_1[i], numpy.ndarray) and not isinstance(vector_2[i], numpy.ndarray):
                    vector_2[i] = numpy.array([0] * len(vector_1[i]))
                elif not isinstance(vector_1[i], numpy.ndarray) and isinstance(vector_2[i], numpy.ndarray):
                    vector_1[i] = numpy.array([0] * len(vector_2[i]))

    def create_static_result_vector(self, static_vector):
        '''Construct the static result (matching) vector for a corresponding feature vector.

        Each compared feature present in the superset contributes exactly one
        entry per subject (a normalized value, a binned one-hot vector, a
        string membership vector, or a 0 placeholder), keeping result vectors
        aligned across subjects.'''
        results_vector = []
        for feature_name in self.compared_static_features:
            # Test if this subject has a value for the feature
            if feature_name in static_vector.unique_static_features:
                # Get the value of the feature
                feature_value = static_vector.unique_static_features[feature_name]
                # Get the options dictionary for the feature
                feature_options_dict = self.compared_static_features[feature_name]
                feature_items = self.superset_static_vectors[feature_name]
                # Check if the raw value setting is specified
                if 'use_raw_value' in feature_options_dict:
                    results_vector.append(feature_value)
                    continue
                # Logarithmic scaling of numeric values - true by default
                scale_log = feature_options_dict.get('scale log', True)
                # Normalization of numeric values - true by default
                normalize = feature_options_dict.get('normalize', True)
                datatype = feature_options_dict['datatype']
                # Normalize the value for the feature based on the specified datatype
                normalized_value = None
                converted_types = None
                if datatype == 'hex':
                    converted_types = [int(x, 0) for x in feature_items]
                    normalized_value = self.normalize_numeric(int(feature_value, 0), converted_types, normalize, scale_log)
                elif datatype == 'hex list':
                    converted_types = [numpy.array([int(x, 0) for x in y]) for y in feature_items]
                    normalized_value = self.normalize_numeric_list(numpy.array([int(x, 0) for x in feature_value]), converted_types, normalize, scale_log)
                elif datatype == 'int':
                    converted_types = [int(x) for x in feature_items]
                    normalized_value = self.normalize_numeric(int(feature_value), converted_types, normalize, scale_log)
                elif datatype == 'int list':
                    converted_types = [numpy.array([int(x) for x in y]) for y in feature_items]
                    normalized_value = self.normalize_numeric_list(numpy.array([int(x) for x in feature_value]), converted_types, normalize, scale_log)
                elif datatype == 'float':
                    converted_types = [float(x) for x in feature_items]
                    normalized_value = self.normalize_numeric(float(feature_value), converted_types, normalize, scale_log)
                elif datatype == 'float list':
                    converted_types = [numpy.array([float(x) for x in y]) for y in feature_items]
                    normalized_value = self.normalize_numeric_list(numpy.array([float(x) for x in feature_value]), converted_types, normalize, scale_log)
                elif datatype == 'string':
                    # String features contribute a membership vector directly
                    results_vector.append(self.build_string_vector(feature_value, feature_items))
                    continue
                # Bin scalar values, if specified in the options dictionary.
                # (The original passed scale_log positionally into
                # normalize_numeric's 'normalize' slot; pass both explicitly.)
                if feature_options_dict.get('bin') and normalized_value is not None and datatype in ('hex', 'int', 'float'):
                    normalized_items = [self.normalize_numeric(x, converted_types, normalize, scale_log) for x in converted_types]
                    if 'number of bins' in feature_options_dict:
                        binned_value = self.bin_list(normalized_value, normalized_items, feature_options_dict['number of bins'])
                    else:
                        binned_value = self.bin_list(normalized_value, normalized_items)
                    results_vector.append(binned_value)
                elif normalized_value is not None:
                    results_vector.append(normalized_value)
                else:
                    # Unrecognized datatype: contribute a neutral slot
                    results_vector.append(0)
            elif feature_name in self.superset_static_vectors:
                # Keep result vectors aligned across subjects: a subject that
                # lacks a feature another subject has still contributes a slot
                results_vector.append(0)
        return results_vector

    def create_dynamic_result_vector(self, dynamic_vector):
        '''Construct the dynamic result (matching) vector for a corresponding feature vector.

        Returns a 0/1 numpy array with one entry per vector in the dynamic
        superset, set to 1 when the subject has that unique feature.'''
        results_vector = numpy.array([0] * len(self.superset_dynamic_vectors))
        for i, vector in enumerate(self.superset_dynamic_vectors):
            if vector in dynamic_vector.unique_dynamic_features:
                results_vector[i] = 1
        return results_vector

    def create_superset_vectors(self):
        '''Calculate vector supersets from the feature vectors.'''
        for feature_vector_dict in self.feature_vectors.values():
            dynamic_vector = feature_vector_dict['dynamic']
            static_vector = feature_vector_dict['static']
            # Build the superset of dynamic vectors
            for vector in dynamic_vector.unique_dynamic_features:
                if vector not in self.superset_dynamic_vectors:
                    self.superset_dynamic_vectors.append(vector)
            # Build the superset of static vectors
            for feature_name, feature_value in static_vector.unique_static_features.items():
                if feature_name not in self.superset_static_vectors:
                    self.superset_static_vectors[feature_name] = [feature_value]
                else:
                    self.superset_static_vectors[feature_name].append(feature_value)

    def euclidean_distance(self, vector_1, vector_2):
        '''Calculate the Euclidean distance between two input vectors.

        Floats (and nested arrays) contribute squared differences; ints and
        strings contribute 1 per mismatch (a Hamming-style term).'''
        distance = 0.0
        for i in range(0, len(vector_1)):
            # Include numpy scalar types, which do not subclass the Python
            # numeric types on Python 3
            if isinstance(vector_1[i], (float, numpy.floating)):
                distance += math.pow(vector_1[i] - vector_2[i], 2)
            elif isinstance(vector_1[i], numpy.ndarray):
                for vi in range(0, len(vector_1[i])):
                    distance += math.pow(vector_1[i][vi] - vector_2[i][vi], 2)
            elif isinstance(vector_1[i], (int, numpy.integer)):
                if vector_1[i] != vector_2[i]:
                    distance += 1.0
            elif isinstance(vector_1[i], str):
                if vector_1[i] != vector_2[i]:
                    distance += 1.0
        return math.sqrt(distance)

    def populate_hashes_mapping(self, malware_subject_list):
        '''Populate and return the Malware Subject ID -> hashes dict mapping
        from an input list of Malware Subjects.'''
        hashes_mapping = {}
        for malware_subject in malware_subject_list:
            mal_inst_obj = malware_subject.malware_instance_object_attributes
            # Guard against Malware Subjects with no instance object attributes
            if mal_inst_obj and mal_inst_obj.properties and mal_inst_obj.properties.hashes:
                hashes_dict = {}
                for hash_obj in mal_inst_obj.properties.hashes:
                    hash_type = None
                    hash_value = None
                    if hash_obj.type_:
                        hash_type = hash_obj.type_.value
                    if hash_obj.simple_hash_value:
                        hash_value = hash_obj.simple_hash_value.value
                    elif hash_obj.fuzzy_hash_value:
                        hash_value = hash_obj.fuzzy_hash_value.value
                    if hash_type and hash_value:
                        hashes_dict[str(hash_type).lower()] = str(hash_value).lower()
                hashes_mapping[malware_subject.id_] = hashes_dict
        return hashes_mapping

    def perform_calculation(self):
        '''Perform the actual distance calculation.
        Store the results in the distances dictionary.

        Raises ValueError if both feature options are disabled, since no
        result vectors could be constructed.'''
        use_dynamic = self.options_dict['use_dynamic_features']
        use_static = self.options_dict['use_static_features']
        if not use_dynamic and not use_static:
            raise ValueError('At least one of use_dynamic_features or use_static_features must be enabled')
        # Determine the different combinations of Malware Subjects
        combinations = itertools.combinations(self.feature_vectors, r=2)
        for combination in combinations:
            if use_dynamic:
                dynamic_vectors = (self.feature_vectors[combination[0]]['dynamic_result'],
                                   self.feature_vectors[combination[1]]['dynamic_result'])
            if use_static:
                static_vectors = (self.feature_vectors[combination[0]]['static_result'],
                                  self.feature_vectors[combination[1]]['static_result'])
                # Normalize the static vectors (to make them equal composition)
                self.normalize_vectors(static_vectors[0], static_vectors[1])
            # Generate the combined vectors if necessary and calculate the distance
            if use_dynamic and use_static:
                result_vectors = (numpy.array(list(dynamic_vectors[0]) + self.flatten_vector(static_vectors[0])),
                                  numpy.array(list(dynamic_vectors[1]) + self.flatten_vector(static_vectors[1])))
            elif use_dynamic:
                result_vectors = (numpy.array(list(dynamic_vectors[0])),
                                  numpy.array(list(dynamic_vectors[1])))
            else:
                result_vectors = (self.flatten_vector(static_vectors[0]),
                                  self.flatten_vector(static_vectors[1]))
            distance = self.euclidean_distance(result_vectors[0], result_vectors[1])
            # Add the result to the distances dictionary, in both directions
            for i in range(0, 2):
                opposite = 1 - i
                if combination[i] not in self.distances:
                    self.distances[combination[i]] = {combination[opposite] : distance}
                else:
                    self.distances[combination[i]][combination[opposite]] = distance

    def calculate(self):
        '''Calculate the distances between the input Malware Subjects.'''
        # Pre-process and merge the entities
        self.normalized_subjects = self.preprocess_entities()
        # Generate the feature vectors for the entities
        self.generate_feature_vectors(self.normalized_subjects)
        # Build up the supersets of unique vectors
        self.create_superset_vectors()
        # Construct the result vectors
        for feature_vector_dict in self.feature_vectors.values():
            if self.options_dict['use_dynamic_features']:
                # Construct the dynamic result vector
                feature_vector_dict['dynamic_result'] = self.create_dynamic_result_vector(feature_vector_dict['dynamic'])
            if self.options_dict['use_static_features']:
                # Construct the static result vector
                feature_vector_dict['static_result'] = self.create_static_result_vector(feature_vector_dict['static'])
        # Perform the actual distance calculation
        self.perform_calculation()

    def print_distances(self, file_object, default_label='md5', delimiter=','):
        '''Print the distances between the Malware Subjects in delimited matrix format
        to a File-like object.

        Try to use the MD5s of the Malware Subjects as the default label.
        Uses commas as the default delimiter, for CSV-like output.'''
        hashes_mapping = self.populate_hashes_mapping(self.normalized_subjects)
        distance_strings = []
        # Generate the header string and individual distance strings
        header_string = '' + delimiter
        for malware_subject in self.normalized_subjects:
            distance_string = ''
            hashes = hashes_mapping.get(malware_subject.id_, {})
            if default_label in hashes:
                distance_string += (hashes[default_label] + delimiter)
                header_string += (hashes[default_label] + delimiter)
            else:
                # Fall back to the Malware Subject ID as the label
                distance_string += (malware_subject.id_ + delimiter)
                header_string += (malware_subject.id_ + delimiter)
            for other_malware_subject in self.normalized_subjects:
                if malware_subject.id_ == other_malware_subject.id_:
                    # Distance to self is always zero
                    distance_string += ('0.0' + delimiter)
                else:
                    distance_string += (str(self.distances[malware_subject.id_][other_malware_subject.id_])
                                        + delimiter)
            distance_strings.append(distance_string.rstrip(delimiter))

        # Print the header and distance strings
        file_object.write(header_string.rstrip(delimiter) + "\n")
        for distance_string in distance_strings:
            file_object.write(distance_string + "\n")
        file_object.flush()