1import sys 2import gzip 3from collections import OrderedDict 4import numpy as np 5from copy import deepcopy 6 7import pyBigWig 8from deeptools import getScorePerBigWigBin 9from deeptools import mapReduce 10from deeptools.utilities import toString, toBytes, smartLabels 11from deeptools.heatmapper_utilities import getProfileTicks 12 13 14old_settings = np.seterr(all='ignore') 15 16 17def chopRegions(exonsInput, left=0, right=0): 18 """ 19 exons is a list of (start, end) tuples. The goal is to chop these into 20 separate lists of tuples, to take care or unscaled regions. "left" and 21 "right" denote regions of a given size to exclude from the normal binning 22 process (unscaled regions). 23 24 This outputs three lists of (start, end) tuples: 25 26 leftBins: 5' unscaled regions 27 bodyBins: body bins for scaling 28 rightBins: 3' unscaled regions 29 30 In addition are two integers 31 padLeft: Number of bases of padding on the left (due to not being able to fulfill "left") 32 padRight: As above, but on the right side 33 """ 34 leftBins = [] 35 rightBins = [] 36 padLeft = 0 37 padRight = 0 38 exons = deepcopy(exonsInput) 39 while len(exons) > 0 and left > 0: 40 width = exons[0][1] - exons[0][0] 41 if width <= left: 42 leftBins.append(exons[0]) 43 del exons[0] 44 left -= width 45 else: 46 leftBins.append((exons[0][0], exons[0][0] + left)) 47 exons[0] = (exons[0][0] + left, exons[0][1]) 48 left = 0 49 if left > 0: 50 padLeft = left 51 52 while len(exons) > 0 and right > 0: 53 width = exons[-1][1] - exons[-1][0] 54 if width <= right: 55 rightBins.append(exons[-1]) 56 del exons[-1] 57 right -= width 58 else: 59 rightBins.append((exons[-1][1] - right, exons[-1][1])) 60 exons[-1] = (exons[-1][0], exons[-1][1] - right) 61 right = 0 62 if right > 0: 63 padRight = right 64 65 return leftBins, exons, rightBins[::-1], padLeft, padRight 66 67 68def chopRegionsFromMiddle(exonsInput, left=0, right=0): 69 """ 70 Like chopRegions(), above, but returns two lists of tuples on each side of 71 the center point of the exons. 72 73 The steps are as follow: 74 75 1) Find the center point of the set of exons (e.g., [(0, 200), (300, 400), (800, 900)] would be centered at 200) 76 * If a given exon spans the center point then the exon is split 77 2) The given number of bases at the end of the left-of-center list are extracted 78 * If the set of exons don't contain enough bases, then padLeft is incremented accordingly 79 3) As above but for the right-of-center list 80 4) A tuple of (#2, #3, pading on the left, and padding on the right) is returned 81 """ 82 leftBins = [] 83 rightBins = [] 84 size = sum([x[1] - x[0] for x in exonsInput]) 85 middle = size // 2 86 cumulativeSum = 0 87 padLeft = 0 88 padRight = 0 89 exons = deepcopy(exonsInput) 90 91 # Split exons in half 92 for exon in exons: 93 size = exon[1] - exon[0] 94 if cumulativeSum >= middle: 95 rightBins.append(exon) 96 elif cumulativeSum + size < middle: 97 leftBins.append(exon) 98 else: 99 # Don't add 0-width exonic bins! 100 if exon[0] < exon[1] - cumulativeSum - size + middle: 101 leftBins.append((exon[0], exon[1] - cumulativeSum - size + middle)) 102 if exon[1] - cumulativeSum - size + middle < exon[1]: 103 rightBins.append((exon[1] - cumulativeSum - size + middle, exon[1])) 104 cumulativeSum += size 105 106 # Trim leftBins/adjust padLeft 107 lSum = sum([x[1] - x[0] for x in leftBins]) 108 if lSum > left: 109 lSum = 0 110 for i, exon in enumerate(leftBins[::-1]): 111 size = exon[1] - exon[0] 112 if lSum + size > left: 113 leftBins[-i - 1] = (exon[1] + lSum - left, exon[1]) 114 break 115 lSum += size 116 if lSum == left: 117 break 118 i += 1 119 if i < len(leftBins): 120 leftBins = leftBins[-i:] 121 elif lSum < left: 122 padLeft = left - lSum 123 124 # Trim rightBins/adjust padRight 125 rSum = sum([x[1] - x[0] for x in rightBins]) 126 if rSum > right: 127 rSum = 0 128 for i, exon in enumerate(rightBins): 129 size = exon[1] - exon[0] 130 if rSum + size > right: 131 rightBins[i] = (exon[0], exon[1] - rSum - size + right) 132 break 133 rSum += size 134 if rSum == right: 135 break 136 rightBins = rightBins[:i + 1] 137 elif rSum < right: 138 padRight = right - rSum 139 140 return leftBins, rightBins, padLeft, padRight 141 142 143def trimZones(zones, maxLength, binSize, padRight): 144 """ 145 Given a (variable length) list of lists of (start, end) tuples, trim/remove and tuple that extends past maxLength (e.g., the end of a chromosome) 146 147 Returns the trimmed zones and padding 148 """ 149 output = [] 150 for zone, nbins in zones: 151 outZone = [] 152 changed = False 153 for reg in zone: 154 if reg[0] >= maxLength: 155 changed = True 156 padRight += reg[1] - reg[0] 157 continue 158 159 if reg[1] > maxLength: 160 changed = True 161 padRight += reg[1] - maxLength 162 reg = (reg[0], maxLength) 163 if reg[1] > reg[0]: 164 outZone.append(reg) 165 if changed: 166 nBins = sum(x[1] - x[0] for x in outZone) // binSize 167 else: 168 nBins = nbins 169 output.append((outZone, nBins)) 170 return output, padRight 171 172 173def compute_sub_matrix_wrapper(args): 174 return heatmapper.compute_sub_matrix_worker(*args) 175 176 177class heatmapper(object): 178 """ 179 Class to handle the reading and 180 plotting of matrices. 181 """ 182 183 def __init__(self): 184 self.parameters = None 185 self.lengthDict = None 186 self.matrix = None 187 self.regions = None 188 self.blackList = None 189 self.quiet = True 190 # These are parameters that were single values in versions <3 but are now internally lists. See issue #614 191 self.special_params = set(['unscaled 5 prime', 'unscaled 3 prime', 'body', 'downstream', 'upstream', 'ref point', 'bin size']) 192 193 def getTicks(self, idx): 194 """ 195 This is essentially a wrapper around getProfileTicks to accomdate the fact that each column has its own ticks. 196 """ 197 xticks, xtickslabel = getProfileTicks(self, self.reference_point_label[idx], self.startLabel, self.endLabel, idx) 198 return xticks, xtickslabel 199 200 def computeMatrix(self, score_file_list, regions_file, parameters, blackListFileName=None, verbose=False, allArgs=None): 201 """ 202 Splits into 203 multiple cores the computation of the scores 204 per bin for each region (defined by a hash '#' 205 in the regions (BED/GFF) file. 206 """ 207 if parameters['body'] > 0 and \ 208 parameters['body'] % parameters['bin size'] > 0: 209 exit("The --regionBodyLength has to be " 210 "a multiple of --binSize.\nCurrently the " 211 "values are {} {} for\nregionsBodyLength and " 212 "binSize respectively\n".format(parameters['body'], 213 parameters['bin size'])) 214 215 # the beforeRegionStartLength is extended such that 216 # length is a multiple of binSize 217 if parameters['downstream'] % parameters['bin size'] > 0: 218 exit("Length of region after the body has to be " 219 "a multiple of --binSize.\nCurrent value " 220 "is {}\n".format(parameters['downstream'])) 221 222 if parameters['upstream'] % parameters['bin size'] > 0: 223 exit("Length of region before the body has to be a multiple of " 224 "--binSize\nCurrent value is {}\n".format(parameters['upstream'])) 225 226 if parameters['unscaled 5 prime'] % parameters['bin size'] > 0: 227 exit("Length of the unscaled 5 prime region has to be a multiple of " 228 "--binSize\nCurrent value is {}\n".format(parameters['unscaled 5 prime'])) 229 230 if parameters['unscaled 3 prime'] % parameters['bin size'] > 0: 231 exit("Length of the unscaled 5 prime region has to be a multiple of " 232 "--binSize\nCurrent value is {}\n".format(parameters['unscaled 3 prime'])) 233 234 if parameters['unscaled 5 prime'] + parameters['unscaled 3 prime'] > 0 and parameters['body'] == 0: 235 exit('Unscaled 5- and 3-prime regions only make sense with the scale-regions subcommand.\n') 236 237 # Take care of GTF options 238 transcriptID = "transcript" 239 exonID = "exon" 240 transcript_id_designator = "transcript_id" 241 keepExons = False 242 self.quiet = False 243 if allArgs is not None: 244 allArgs = vars(allArgs) 245 transcriptID = allArgs.get("transcriptID", transcriptID) 246 exonID = allArgs.get("exonID", exonID) 247 transcript_id_designator = allArgs.get("transcript_id_designator", transcript_id_designator) 248 keepExons = allArgs.get("keepExons", keepExons) 249 self.quiet = allArgs.get("quiet", self.quiet) 250 251 chromSizes, _ = getScorePerBigWigBin.getChromSizes(score_file_list) 252 res, labels = mapReduce.mapReduce([score_file_list, parameters], 253 compute_sub_matrix_wrapper, 254 chromSizes, 255 self_=self, 256 bedFile=regions_file, 257 blackListFileName=blackListFileName, 258 numberOfProcessors=parameters['proc number'], 259 includeLabels=True, 260 transcriptID=transcriptID, 261 exonID=exonID, 262 transcript_id_designator=transcript_id_designator, 263 keepExons=keepExons, 264 verbose=verbose) 265 # each worker in the pool returns a tuple containing 266 # the submatrix data, the regions that correspond to the 267 # submatrix, and the number of regions lacking scores 268 # Since this is largely unsorted, we need to sort by group 269 270 # merge all the submatrices into matrix 271 matrix = np.concatenate([r[0] for r in res], axis=0) 272 regions = [] 273 regions_no_score = 0 274 for idx in range(len(res)): 275 if len(res[idx][1]): 276 regions.extend(res[idx][1]) 277 regions_no_score += res[idx][2] 278 groups = [x[3] for x in regions] 279 foo = sorted(zip(groups, list(range(len(regions))), regions)) 280 sortIdx = [x[1] for x in foo] 281 regions = [x[2] for x in foo] 282 matrix = matrix[sortIdx] 283 284 # mask invalid (nan) values 285 matrix = np.ma.masked_invalid(matrix) 286 287 assert matrix.shape[0] == len(regions), \ 288 "matrix length does not match regions length" 289 290 if len(regions) == 0: 291 sys.stderr.write("\nERROR: Either the BED file does not contain any valid regions or there are none remaining after filtering.\n") 292 exit(1) 293 if regions_no_score == len(regions): 294 exit("\nERROR: None of the BED regions could be found in the bigWig" 295 "file.\nPlease check that the bigwig file is valid and " 296 "that the chromosome names between the BED file and " 297 "the bigWig file correspond to each other\n") 298 299 if regions_no_score > len(regions) * 0.75: 300 file_type = 'bigwig' if score_file_list[0].endswith(".bw") else "BAM" 301 prcnt = 100 * float(regions_no_score) / len(regions) 302 sys.stderr.write( 303 "\n\nWarning: {0:.2f}% of regions are *not* associated\n" 304 "to any score in the given {1} file. Check that the\n" 305 "chromosome names from the BED file are consistent with\n" 306 "the chromosome names in the given {2} file and that both\n" 307 "files refer to the same species\n\n".format(prcnt, 308 file_type, 309 file_type)) 310 311 self.parameters = parameters 312 313 numcols = matrix.shape[1] 314 num_ind_cols = self.get_num_individual_matrix_cols() 315 sample_boundaries = list(range(0, numcols + num_ind_cols, num_ind_cols)) 316 if allArgs is not None and allArgs['samplesLabel'] is not None: 317 sample_labels = allArgs['samplesLabel'] 318 else: 319 sample_labels = smartLabels(score_file_list) 320 321 # Determine the group boundaries 322 group_boundaries = [] 323 group_labels_filtered = [] 324 last_idx = -1 325 for x in range(len(regions)): 326 if regions[x][3] != last_idx: 327 last_idx = regions[x][3] 328 group_boundaries.append(x) 329 group_labels_filtered.append(labels[last_idx]) 330 group_boundaries.append(len(regions)) 331 332 # check if a given group is too small. Groups that 333 # are too small can't be plotted and an exception is thrown. 334 group_len = np.diff(group_boundaries) 335 if len(group_len) > 1: 336 sum_len = sum(group_len) 337 group_frac = [float(x) / sum_len for x in group_len] 338 if min(group_frac) <= 0.002: 339 sys.stderr.write( 340 "One of the groups defined in the bed file is " 341 "too small.\nGroups that are too small can't be plotted. " 342 "\n") 343 344 self.matrix = _matrix(regions, matrix, 345 group_boundaries, 346 sample_boundaries, 347 group_labels_filtered, 348 sample_labels) 349 350 if parameters['skip zeros']: 351 self.matrix.removeempty() 352 353 @staticmethod 354 def compute_sub_matrix_worker(self, chrom, start, end, score_file_list, parameters, regions): 355 """ 356 Returns 357 ------- 358 numpy matrix 359 A numpy matrix that contains per each row the values found per each of the regions given 360 """ 361 if parameters['verbose']: 362 sys.stderr.write("Processing {}:{}-{}\n".format(chrom, start, end)) 363 364 # read BAM or scores file 365 score_file_handles = [] 366 for sc_file in score_file_list: 367 score_file_handles.append(pyBigWig.open(sc_file)) 368 369 # determine the number of matrix columns based on the lengths 370 # given by the user, times the number of score files 371 matrix_cols = len(score_file_list) * \ 372 ((parameters['downstream'] + 373 parameters['unscaled 5 prime'] + parameters['unscaled 3 prime'] + 374 parameters['upstream'] + parameters['body']) // 375 parameters['bin size']) 376 377 # create an empty matrix to store the values 378 sub_matrix = np.zeros((len(regions), matrix_cols)) 379 sub_matrix[:] = np.NAN 380 381 j = 0 382 sub_regions = [] 383 regions_no_score = 0 384 for transcript in regions: 385 feature_chrom = transcript[0] 386 exons = transcript[1] 387 feature_start = exons[0][0] 388 feature_end = exons[-1][1] 389 feature_name = transcript[2] 390 feature_strand = transcript[4] 391 padLeft = 0 392 padRight = 0 393 padLeftNaN = 0 394 padRightNaN = 0 395 upstream = [] 396 downstream = [] 397 398 # get the body length 399 body_length = np.sum([x[1] - x[0] for x in exons]) - parameters['unscaled 5 prime'] - parameters['unscaled 3 prime'] 400 401 # print some information 402 if parameters['body'] > 0 and \ 403 body_length < parameters['bin size']: 404 if not self.quiet: 405 sys.stderr.write("A region that is shorter than the bin size (possibly only after accounting for unscaled regions) was found: " 406 "({0}) {1} {2}:{3}:{4}. Skipping...\n".format((body_length - parameters['unscaled 5 prime'] - parameters['unscaled 3 prime']), 407 feature_name, feature_chrom, 408 feature_start, feature_end)) 409 coverage = np.zeros(matrix_cols) 410 if not parameters['missing data as zero']: 411 coverage[:] = np.nan 412 else: 413 if feature_strand == '-': 414 if parameters['downstream'] > 0: 415 upstream = [(feature_start - parameters['downstream'], feature_start)] 416 if parameters['upstream'] > 0: 417 downstream = [(feature_end, feature_end + parameters['upstream'])] 418 unscaled5prime, body, unscaled3prime, padLeft, padRight = chopRegions(exons, left=parameters['unscaled 3 prime'], right=parameters['unscaled 5 prime']) 419 # bins per zone 420 a = parameters['downstream'] // parameters['bin size'] 421 b = parameters['unscaled 3 prime'] // parameters['bin size'] 422 d = parameters['unscaled 5 prime'] // parameters['bin size'] 423 e = parameters['upstream'] // parameters['bin size'] 424 else: 425 if parameters['upstream'] > 0: 426 upstream = [(feature_start - parameters['upstream'], feature_start)] 427 if parameters['downstream'] > 0: 428 downstream = [(feature_end, feature_end + parameters['downstream'])] 429 unscaled5prime, body, unscaled3prime, padLeft, padRight = chopRegions(exons, left=parameters['unscaled 5 prime'], right=parameters['unscaled 3 prime']) 430 a = parameters['upstream'] // parameters['bin size'] 431 b = parameters['unscaled 5 prime'] // parameters['bin size'] 432 d = parameters['unscaled 3 prime'] // parameters['bin size'] 433 e = parameters['downstream'] // parameters['bin size'] 434 c = parameters['body'] // parameters['bin size'] 435 436 # build zones (each is a list of tuples) 437 # zone0: region before the region start, 438 # zone1: unscaled 5 prime region 439 # zone2: the body of the region 440 # zone3: unscaled 3 prime region 441 # zone4: the region from the end of the region downstream 442 # the format for each zone is: [(start, end), ...], number of bins 443 # Note that for "reference-point", upstream/downstream will go 444 # through the exons (if requested) and then possibly continue 445 # on the other side (unless parameters['nan after end'] is true) 446 if parameters['body'] > 0: 447 zones = [(upstream, a), (unscaled5prime, b), (body, c), (unscaled3prime, d), (downstream, e)] 448 elif parameters['ref point'] == 'TES': # around TES 449 if feature_strand == '-': 450 downstream, body, unscaled3prime, padRight, _ = chopRegions(exons, left=parameters['upstream']) 451 if padRight > 0 and parameters['nan after end'] is True: 452 padRightNaN += padRight 453 elif padRight > 0: 454 downstream.append((downstream[-1][1], downstream[-1][1] + padRight)) 455 padRight = 0 456 else: 457 unscale5prime, body, upstream, _, padLeft = chopRegions(exons, right=parameters['upstream']) 458 if padLeft > 0 and parameters['nan after end'] is True: 459 padLeftNaN += padLeft 460 elif padLeft > 0: 461 upstream.insert(0, (upstream[0][0] - padLeft, upstream[0][0])) 462 padLeft = 0 463 e = np.sum([x[1] - x[0] for x in downstream]) // parameters['bin size'] 464 a = np.sum([x[1] - x[0] for x in upstream]) // parameters['bin size'] 465 zones = [(upstream, a), (downstream, e)] 466 elif parameters['ref point'] == 'center': # at the region center 467 if feature_strand == '-': 468 upstream, downstream, padLeft, padRight = chopRegionsFromMiddle(exons, left=parameters['downstream'], right=parameters['upstream']) 469 else: 470 upstream, downstream, padLeft, padRight = chopRegionsFromMiddle(exons, left=parameters['upstream'], right=parameters['downstream']) 471 if padLeft > 0 and parameters['nan after end'] is True: 472 padLeftNaN += padLeft 473 elif padLeft > 0: 474 if len(upstream) > 0: 475 upstream.insert(0, (upstream[0][0] - padLeft, upstream[0][0])) 476 else: 477 upstream = [(downstream[0][0] - padLeft, downstream[0][0])] 478 padLeft = 0 479 if padRight > 0 and parameters['nan after end'] is True: 480 padRightNaN += padRight 481 elif padRight > 0: 482 downstream.append((downstream[-1][1], downstream[-1][1] + padRight)) 483 padRight = 0 484 a = np.sum([x[1] - x[0] for x in upstream]) // parameters['bin size'] 485 e = np.sum([x[1] - x[0] for x in downstream]) // parameters['bin size'] 486 # It's possible for a/e to be floats or 0 yet upstream/downstream isn't empty 487 if a < 1: 488 upstream = [] 489 a = 0 490 if e < 1: 491 downstream = [] 492 e = 0 493 zones = [(upstream, a), (downstream, e)] 494 else: # around TSS 495 if feature_strand == '-': 496 unscale5prime, body, upstream, _, padLeft = chopRegions(exons, right=parameters['downstream']) 497 if padLeft > 0 and parameters['nan after end'] is True: 498 padLeftNaN += padLeft 499 elif padLeft > 0: 500 upstream.insert(0, (upstream[0][0] - padLeft, upstream[0][0])) 501 padLeft = 0 502 else: 503 downstream, body, unscaled3prime, padRight, _ = chopRegions(exons, left=parameters['downstream']) 504 if padRight > 0 and parameters['nan after end'] is True: 505 padRightNaN += padRight 506 elif padRight > 0: 507 downstream.append((downstream[-1][1], downstream[-1][1] + padRight)) 508 padRight = 0 509 a = np.sum([x[1] - x[0] for x in upstream]) // parameters['bin size'] 510 e = np.sum([x[1] - x[0] for x in downstream]) // parameters['bin size'] 511 zones = [(upstream, a), (downstream, e)] 512 513 foo = parameters['upstream'] 514 bar = parameters['downstream'] 515 if feature_strand == '-': 516 foo, bar = bar, foo 517 if padLeftNaN > 0: 518 expected = foo // parameters['bin size'] 519 padLeftNaN = int(round(float(padLeftNaN) / parameters['bin size'])) 520 if expected - padLeftNaN - a > 0: 521 padLeftNaN += 1 522 if padRightNaN > 0: 523 expected = bar // parameters['bin size'] 524 padRightNaN = int(round(float(padRightNaN) / parameters['bin size'])) 525 if expected - padRightNaN - e > 0: 526 padRightNaN += 1 527 528 coverage = [] 529 # compute the values for each of the files being processed. 530 # "cov" is a numpy array of bins 531 for sc_handler in score_file_handles: 532 # We're only supporting bigWig files at this point 533 cov = heatmapper.coverage_from_big_wig( 534 sc_handler, feature_chrom, zones, 535 parameters['bin size'], 536 parameters['bin avg type'], 537 parameters['missing data as zero'], 538 not self.quiet) 539 540 if padLeftNaN > 0: 541 cov = np.concatenate([[np.nan] * padLeftNaN, cov]) 542 if padRightNaN > 0: 543 cov = np.concatenate([cov, [np.nan] * padRightNaN]) 544 545 if feature_strand == "-": 546 cov = cov[::-1] 547 548 coverage = np.hstack([coverage, cov]) 549 550 if coverage is None: 551 regions_no_score += 1 552 if not self.quiet: 553 sys.stderr.write( 554 "No data was found for region " 555 "{0} {1}:{2}-{3}. Skipping...\n".format( 556 feature_name, feature_chrom, 557 feature_start, feature_end)) 558 559 coverage = np.zeros(matrix_cols) 560 if not parameters['missing data as zero']: 561 coverage[:] = np.nan 562 563 try: 564 temp = coverage.copy() 565 temp[np.isnan(temp)] = 0 566 except: 567 if not self.quiet: 568 sys.stderr.write( 569 "No scores defined for region " 570 "{0} {1}:{2}-{3}. Skipping...\n".format(feature_name, 571 feature_chrom, 572 feature_start, 573 feature_end)) 574 coverage = np.zeros(matrix_cols) 575 if not parameters['missing data as zero']: 576 coverage[:] = np.nan 577 578 if parameters['min threshold'] is not None and coverage.min() <= parameters['min threshold']: 579 continue 580 if parameters['max threshold'] is not None and coverage.max() >= parameters['max threshold']: 581 continue 582 if parameters['scale'] != 1: 583 coverage = parameters['scale'] * coverage 584 585 sub_matrix[j, :] = coverage 586 587 sub_regions.append(transcript) 588 j += 1 589 590 # remove empty rows 591 sub_matrix = sub_matrix[0:j, :] 592 if len(sub_regions) != len(sub_matrix[:, 0]): 593 sys.stderr.write("regions lengths do not match\n") 594 return sub_matrix, sub_regions, regions_no_score 595 596 @staticmethod 597 def coverage_from_array(valuesArray, zones, binSize, avgType): 598 try: 599 valuesArray[0] 600 except (IndexError, TypeError) as detail: 601 sys.stderr.write("{0}\nvalues array value: {1}, zones {2}\n".format(detail, valuesArray, zones)) 602 603 cvglist = [] 604 zoneEnd = 0 605 valStart = 0 606 valEnd = 0 607 for zone, nBins in zones: 608 if nBins: 609 # linspace is used to more or less evenly partition the data points into the given number of bins 610 zoneEnd += nBins 611 valStart = valEnd 612 valEnd += np.sum([x[1] - x[0] for x in zone]) 613 counts_list = [] 614 615 # Partition the space into bins 616 if nBins == 1: 617 pos_array = np.array([valStart]) 618 else: 619 pos_array = np.linspace(valStart, valEnd, nBins, endpoint=False, dtype=int) 620 pos_array = np.append(pos_array, valEnd) 621 622 idx = 0 623 while idx < nBins: 624 idxStart = int(pos_array[idx]) 625 idxEnd = max(int(pos_array[idx + 1]), idxStart + 1) 626 try: 627 counts_list.append(heatmapper.my_average(valuesArray[idxStart:idxEnd], avgType)) 628 except Exception as detail: 629 sys.stderr.write("Exception found: {0}\n".format(detail)) 630 idx += 1 631 cvglist.append(np.array(counts_list)) 632 633 return np.concatenate(cvglist) 634 635 @staticmethod 636 def change_chrom_names(chrom): 637 """ 638 Changes UCSC chromosome names to ensembl chromosome names 639 and vice versa. 640 """ 641 if chrom.startswith('chr'): 642 # remove the chr part from chromosome name 643 chrom = chrom[3:] 644 if chrom == "M": 645 chrom = "MT" 646 else: 647 # prefix with 'chr' the chromosome name 648 chrom = 'chr' + chrom 649 if chrom == "chrMT": 650 chrom = "chrM" 651 652 return chrom 653 654 @staticmethod 655 def coverage_from_big_wig(bigwig, chrom, zones, binSize, avgType, nansAsZeros=False, verbose=True): 656 657 """ 658 uses pyBigWig 659 to query a region define by chrom and zones. 660 The output is an array that contains the bigwig 661 value per base pair. The summary over bins is 662 done in a later step when coverage_from_array is called. 663 This method is more reliable than querying the bins 664 directly from the bigwig, which should be more efficient. 665 666 By default, any region, even if no chromosome match is found 667 on the bigwig file, produces a result. In other words 668 no regions are skipped. 669 670 zones: array as follows zone0: region before the region start, 671 zone1: 5' unscaled region (if present) 672 zone2: the body of the region (not always present) 673 zone3: 3' unscaled region (if present) 674 zone4: the region from the end of the region downstream 675 676 each zone is a tuple containing start, end, and number of bins 677 678 679 This is useful if several matrices wants to be merged 680 or if the sorted BED output of one computeMatrix operation 681 needs to be used for other cases 682 """ 683 nVals = 0 684 for zone, _ in zones: 685 for region in zone: 686 nVals += region[1] - region[0] 687 688 values_array = np.zeros(nVals) 689 if not nansAsZeros: 690 values_array[:] = np.nan 691 if chrom not in list(bigwig.chroms().keys()): 692 unmod_name = chrom 693 chrom = heatmapper.change_chrom_names(chrom) 694 if chrom not in list(bigwig.chroms().keys()): 695 if verbose: 696 sys.stderr.write("Warning: Your chromosome names do not match.\nPlease check that the " 697 "chromosome names in your BED file\ncorrespond to the names in your " 698 "bigWig file.\nAn empty line will be added to your heatmap.\nThe problematic " 699 "chromosome name is {0}\n\n".format(unmod_name)) 700 701 # return empty nan array 702 return heatmapper.coverage_from_array(values_array, zones, binSize, avgType) 703 704 maxLen = bigwig.chroms(chrom) 705 startIdx = 0 706 endIdx = 0 707 for zone, _ in zones: 708 for region in zone: 709 startIdx = endIdx 710 if region[0] < 0: 711 endIdx += abs(region[0]) 712 values_array[startIdx:endIdx] = np.nan 713 startIdx = endIdx 714 start = max(0, region[0]) 715 end = min(maxLen, region[1]) 716 endIdx += end - start 717 if start < end: 718 # This won't be the case if we extend off the front of a chromosome, such as (-100, 0) 719 values_array[startIdx:endIdx] = bigwig.values(chrom, start, end) 720 if end < region[1]: 721 startIdx = endIdx 722 endIdx += region[1] - end 723 values_array[startIdx:endIdx] = np.nan 724 725 # replaces nans for zeros 726 if nansAsZeros: 727 values_array[np.isnan(values_array)] = 0 728 729 return heatmapper.coverage_from_array(values_array, zones, 730 binSize, avgType) 731 732 @staticmethod 733 def my_average(valuesArray, avgType='mean'): 734 """ 735 computes the mean, median, etc but only for those values 736 that are not Nan 737 """ 738 valuesArray = np.ma.masked_invalid(valuesArray) 739 avg = np.ma.__getattribute__(avgType)(valuesArray) 740 if isinstance(avg, np.ma.core.MaskedConstant): 741 return np.nan 742 else: 743 return avg 744 745 def matrix_from_dict(self, matrixDict, regionsDict, parameters): 746 self.regionsDict = regionsDict 747 self.matrixDict = matrixDict 748 self.parameters = parameters 749 self.lengthDict = OrderedDict() 750 self.matrixAvgsDict = OrderedDict() 751 752 def read_matrix_file(self, matrix_file): 753 # reads a bed file containing the position 754 # of genomic intervals 755 # In case a hash sign '#' is found in the 756 # file, this is considered as a delimiter 757 # to split the heatmap into groups 758 759 import json 760 regions = [] 761 matrix_rows = [] 762 current_group_index = 0 763 max_group_bound = None 764 765 fh = gzip.open(matrix_file) 766 for line in fh: 767 line = toString(line).strip() 768 # read the header file containing the parameters 769 # used 770 if line.startswith("@"): 771 # the parameters used are saved using 772 # json 773 self.parameters = json.loads(line[1:].strip()) 774 max_group_bound = self.parameters['group_boundaries'][1] 775 continue 776 777 # split the line into bed interval and matrix values 778 region = line.split('\t') 779 chrom, start, end, name, score, strand = region[0:6] 780 matrix_row = np.ma.masked_invalid(np.fromiter(region[6:], np.float)) 781 matrix_rows.append(matrix_row) 782 starts = start.split(",") 783 ends = end.split(",") 784 regs = [(int(x), int(y)) for x, y in zip(starts, ends)] 785 # get the group index 786 if len(regions) >= max_group_bound: 787 current_group_index += 1 788 max_group_bound = self.parameters['group_boundaries'][current_group_index + 1] 789 regions.append([chrom, regs, name, max_group_bound, strand, score]) 790 791 matrix = np.vstack(matrix_rows) 792 self.matrix = _matrix(regions, matrix, self.parameters['group_boundaries'], 793 self.parameters['sample_boundaries'], 794 group_labels=self.parameters['group_labels'], 795 sample_labels=self.parameters['sample_labels']) 796 797 if 'sort regions' in self.parameters: 798 self.matrix.set_sorting_method(self.parameters['sort regions'], 799 self.parameters['sort using']) 800 801 # Versions of computeMatrix before 3.0 didn't have an entry of these per column, fix that 802 nSamples = len(self.matrix.sample_labels) 803 h = dict() 804 for k, v in self.parameters.items(): 805 if k in self.special_params and type(v) is not list: 806 v = [v] * nSamples 807 if len(v) == 0: 808 v = [None] * nSamples 809 h[k] = v 810 self.parameters = h 811 812 return 813 814 def save_matrix(self, file_name): 815 """ 816 saves the data required to reconstruct the matrix 817 the format is: 818 A header containing the parameters used to create the matrix 819 encoded as: 820 @key:value\tkey2:value2 etc... 821 The rest of the file has the same first 5 columns of a 822 BED file: chromosome name, start, end, name, score and strand, 823 all separated by tabs. After the fifth column the matrix 824 values are appended separated by tabs. 825 Groups are separated by adding a line starting with a hash (#) 826 and followed by the group name. 827 828 The file is gzipped. 829 """ 830 import json 831 self.parameters['sample_labels'] = self.matrix.sample_labels 832 self.parameters['group_labels'] = self.matrix.group_labels 833 self.parameters['sample_boundaries'] = self.matrix.sample_boundaries 834 self.parameters['group_boundaries'] = self.matrix.group_boundaries 835 836 # Redo the parameters, ensuring things related to ticks and labels are repeated appropriately 837 nSamples = len(self.matrix.sample_labels) 838 h = dict() 839 for k, v in self.parameters.items(): 840 if type(v) is list and len(v) == 0: 841 v = None 842 if k in self.special_params and type(v) is not list: 843 v = [v] * nSamples 844 if len(v) == 0: 845 v = [None] * nSamples 846 h[k] = v 847 fh = gzip.open(file_name, 'wb') 848 params_str = json.dumps(h, separators=(',', ':')) 849 fh.write(toBytes("@" + params_str + "\n")) 850 score_list = np.ma.masked_invalid(np.mean(self.matrix.matrix, axis=1)) 851 for idx, region in enumerate(self.matrix.regions): 852 # join np_array values 853 # keeping nans while converting them to strings 854 if not np.ma.is_masked(score_list[idx]): 855 np.float(score_list[idx]) 856 matrix_values = "\t".join( 857 np.char.mod('%f', self.matrix.matrix[idx, :])) 858 starts = ["{0}".format(x[0]) for x in region[1]] 859 ends = ["{0}".format(x[1]) for x in region[1]] 860 starts = ",".join(starts) 861 ends = ",".join(ends) 862 # BEDish format (we don't currently store the score) 863 fh.write( 864 toBytes('{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n'.format( 865 region[0], 866 starts, 867 ends, 868 region[2], 869 region[5], 870 region[4], 871 matrix_values))) 872 fh.close() 873 874 def save_tabulated_values(self, file_handle, reference_point_label='TSS', start_label='TSS', end_label='TES', averagetype='mean'): 875 """ 876 Saves the values averaged by col using the avg_type 877 given 878 879 Args: 880 file_handle: file name to save the file 881 reference_point_label: Name of the reference point label 882 start_label: Name of the star label 883 end_label: Name of the end label 884 averagetype: average type (e.g. mean, median, std) 885 886 """ 887 # get X labels 888 w = self.parameters['bin size'] 889 b = self.parameters['upstream'] 890 a = self.parameters['downstream'] 891 c = self.parameters.get('unscaled 5 prime', 0) 892 d = self.parameters.get('unscaled 3 prime', 0) 893 m = self.parameters['body'] 894 895 xticks = [] 896 xtickslabel = [] 897 for idx in range(self.matrix.get_num_samples()): 898 if b[idx] < 1e5: 899 quotient = 1000 900 symbol = 'Kb' 901 else: 902 quotient = 1e6 903 symbol = 'Mb' 904 905 if m[idx] == 0: 906 last = 0 907 if len(xticks): 908 last = xticks[-1] 909 xticks.extend([last + (k / w[idx]) for k in [w[idx], b[idx], b[idx] + a[idx]]]) 910 xtickslabel.extend(['{0:.1f}{1}'.format(-(float(b[idx]) / quotient), symbol), reference_point_label, 911 '{0:.1f}{1}'.format(float(a[idx]) / quotient, symbol)]) 912 913 else: 914 xticks_values = [w[idx]] 915 916 # only if upstream region is set, add a x tick 917 if b[idx] > 0: 918 xticks_values.append(b[idx]) 919 xtickslabel.append('{0:.1f}{1}'.format(-(float(b[idx]) / quotient), symbol)) 920 921 xtickslabel.append(start_label) 922 923 if c[idx] > 0: 924 xticks_values.append(b[idx] + c[idx]) 925 xtickslabel.append("") 926 927 if d[idx] > 0: 928 xticks_values.append(b[idx] + c[idx] + m[idx]) 929 xtickslabel.append("") 930 931 xticks_values.append(b[idx] + c[idx] + m[idx] + d[idx]) 932 xtickslabel.append(end_label) 933 934 if a[idx] > 0: 935 xticks_values.append(b[idx] + c[idx] + m[idx] + d[idx] + a[idx]) 936 xtickslabel.append('{0:.1f}{1}'.format(float(a[idx]) / quotient, symbol)) 937 938 last = 0 939 if len(xticks): 940 last = xticks[-1] 941 xticks.extend([last + (k / w[idx]) for k in xticks_values]) 942 x_axis = np.arange(xticks[-1]) + 1 943 labs = [] 944 for x_value in x_axis: 945 if x_value in xticks and xtickslabel[xticks.index(x_value)]: 946 labs.append(xtickslabel[xticks.index(x_value)]) 947 elif x_value in xticks: 948 labs.append("tick") 949 else: 950 labs.append("") 951 952 with open(file_handle, 'w') as fh: 953 # write labels 954 fh.write("bin labels\t\t{}\n".format("\t".join(labs))) 955 fh.write('bins\t\t{}\n'.format("\t".join([str(x) for x in x_axis]))) 956 957 for sample_idx in range(self.matrix.get_num_samples()): 958 for group_idx in range(self.matrix.get_num_groups()): 959 sub_matrix = self.matrix.get_matrix(group_idx, sample_idx) 960 values = [str(x) for x in np.ma.__getattribute__(averagetype)(sub_matrix['matrix'], axis=0)] 961 fh.write("{}\t{}\t{}\n".format(sub_matrix['sample'], sub_matrix['group'], "\t".join(values))) 962 963 def save_matrix_values(self, file_name): 964 # print a header telling the group names and their length 965 fh = open(file_name, 'wb') 966 info = [] 967 groups_len = np.diff(self.matrix.group_boundaries) 968 for i in range(len(self.matrix.group_labels)): 969 info.append("{}:{}".format(self.matrix.group_labels[i], 970 groups_len[i])) 971 fh.write(toBytes("#{}\n".format("\t".join(info)))) 972 # add to header the x axis values 973 fh.write(toBytes("#downstream:{}\tupstream:{}\tbody:{}\tbin size:{}\tunscaled 5 prime:{}\tunscaled 3 prime:{}\n".format( 974 self.parameters['downstream'], 975 self.parameters['upstream'], 976 self.parameters['body'], 977 self.parameters['bin size'], 978 self.parameters.get('unscaled 5 prime', 0), 979 self.parameters.get('unscaled 3 prime', 0)))) 980 sample_len = np.diff(self.matrix.sample_boundaries) 981 for i in range(len(self.matrix.sample_labels)): 982 info.extend([self.matrix.sample_labels[i]] * sample_len[i]) 983 fh.write(toBytes("{}\n".format("\t".join(info)))) 984 985 fh.close() 986 # reopen again using append mode 987 fh = open(file_name, 'ab') 988 np.savetxt(fh, self.matrix.matrix, fmt="%.4g", delimiter="\t") 989 fh.close() 990 991 def save_BED(self, file_handle): 992 boundaries = np.array(self.matrix.group_boundaries) 993 # Add a header 994 file_handle.write("#chrom\tstart\tend\tname\tscore\tstrand\tthickStart\tthickEnd\titemRGB\tblockCount\tblockSizes\tblockStart\tdeepTools_group") 995 if self.matrix.silhouette is not None: 996 file_handle.write("\tsilhouette") 997 file_handle.write("\n") 998 for idx, region in enumerate(self.matrix.regions): 999 # the label id corresponds to the last boundary 1000 # that is smaller than the region index. 1001 # for example for a boundary array = [0, 10, 20] 1002 # and labels ['a', 'b', 'c'], 1003 # for index 5, the label is 'a', for 1004 # index 10, the label is 'b' etc 1005 label_idx = np.flatnonzero(boundaries <= idx)[-1] 1006 starts = ["{0}".format(x[0]) for x in region[1]] 1007 ends = ["{0}".format(x[1]) for x in region[1]] 1008 starts = ",".join(starts) 1009 ends = ",".join(ends) 1010 file_handle.write( 1011 '{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{1}\t{2}\t0'.format( 1012 region[0], 1013 region[1][0][0], 1014 region[1][-1][1], 1015 region[2], 1016 region[5], 1017 region[4])) 1018 file_handle.write( 1019 '\t{0}\t{1}\t{2}\t{3}'.format( 1020 len(region[1]), 1021 ",".join([str(int(y) - int(x)) for x, y in region[1]]), 1022 ",".join([str(int(x) - int(starts[0])) for x, y in region[1]]), 1023 self.matrix.group_labels[label_idx])) 1024 if self.matrix.silhouette is not None: 1025 file_handle.write("\t{}".format(self.matrix.silhouette[idx])) 1026 file_handle.write("\n") 1027 file_handle.close() 1028 1029 @staticmethod 1030 def matrix_avg(matrix, avgType='mean'): 1031 matrix = np.ma.masked_invalid(matrix) 1032 return np.ma.__getattribute__(avgType)(matrix, axis=0) 1033 1034 def get_individual_matrices(self, matrix): 1035 """In case multiple matrices are saved one after the other 1036 this method splits them appart. 1037 Returns a list containing the matrices 1038 """ 1039 num_cols = matrix.shape[1] 1040 num_ind_cols = self.get_num_individual_matrix_cols() 1041 matrices_list = [] 1042 for i in range(0, num_cols, num_ind_cols): 1043 if i + num_ind_cols > num_cols: 1044 break 1045 matrices_list.append(matrix[:, i:i + num_ind_cols]) 1046 return matrices_list 1047 1048 def get_num_individual_matrix_cols(self): 1049 """ 1050 returns the number of columns that 1051 each matrix should have. This is done because 1052 the final matrix that is plotted can be composed 1053 of smaller matrices that are merged one after 1054 the other. 1055 """ 1056 matrixCols = ((self.parameters['downstream'] + self.parameters['upstream'] + self.parameters['body'] + self.parameters['unscaled 5 prime'] + self.parameters['unscaled 3 prime']) // 1057 self.parameters['bin size']) 1058 1059 return matrixCols 1060 1061 1062def computeSilhouetteScore(d, idx, labels): 1063 """ 1064 Given a square distance matrix with NaN diagonals, compute the silhouette score 1065 of a given row (idx). Each row should have an associated label (labels). 1066 """ 1067 keep = ~np.isnan(d[idx, ]) 1068 foo = np.bincount(labels[keep], weights=d[idx, ][keep]) 1069 groupSizes = np.bincount(labels[keep]) 1070 intraIdx = labels[idx] 1071 if groupSizes[intraIdx] == 1: 1072 return 0 1073 intra = foo[labels[idx]] / groupSizes[intraIdx] 1074 interMask = np.arange(len(foo))[np.arange(len(foo)) != labels[idx]] 1075 inter = np.min(foo[interMask] / groupSizes[interMask]) 1076 return (inter - intra) / max(inter, intra) 1077 1078 1079class _matrix(object): 1080 """ 1081 class to hold heatmapper matrices 1082 The base data is a large matrix 1083 with definition to know the boundaries for row and col divisions. 1084 Col divisions represent groups within a subset, e.g. Active and 1085 inactive from PolII bigwig data. 1086 1087 Row division represent different samples, for example 1088 PolII in males vs. PolII in females. 1089 1090 This is an internal class of the heatmapper class 1091 """ 1092 1093 def __init__(self, regions, matrix, group_boundaries, sample_boundaries, 1094 group_labels=None, sample_labels=None): 1095 1096 # simple checks 1097 assert matrix.shape[0] == group_boundaries[-1], \ 1098 "row max do not match matrix shape" 1099 assert matrix.shape[1] == sample_boundaries[-1], \ 1100 "col max do not match matrix shape" 1101 1102 self.regions = regions 1103 self.matrix = matrix 1104 self.group_boundaries = group_boundaries 1105 self.sample_boundaries = sample_boundaries 1106 self.sort_method = None 1107 self.sort_using = None 1108 self.silhouette = None 1109 1110 if group_labels is None: 1111 self.group_labels = ['group {}'.format(x) 1112 for x in range(len(group_boundaries) - 1)] 1113 else: 1114 assert len(group_labels) == len(group_boundaries) - 1, \ 1115 "number of group labels does not match number of groups" 1116 self.group_labels = group_labels 1117 1118 if sample_labels is None: 1119 self.sample_labels = ['sample {}'.format(x) 1120 for x in range(len(sample_boundaries) - 1)] 1121 else: 1122 assert len(sample_labels) == len(sample_boundaries) - 1, \ 1123 "number of sample labels does not match number of samples" 1124 self.sample_labels = sample_labels 1125 1126 def get_matrix(self, group, sample): 1127 """ 1128 Returns a sub matrix from the large 1129 matrix. Group and sample are ids, 1130 thus, row = 0, col=0 get the first group 1131 of the first sample. 1132 1133 Returns 1134 ------- 1135 dictionary containing the matrix, 1136 the group label and the sample label 1137 """ 1138 group_start = self.group_boundaries[group] 1139 group_end = self.group_boundaries[group + 1] 1140 sample_start = self.sample_boundaries[sample] 1141 sample_end = self.sample_boundaries[sample + 1] 1142 1143 return {'matrix': np.ma.masked_invalid(self.matrix[group_start:group_end, :][:, sample_start:sample_end]), 1144 'group': self.group_labels[group], 1145 'sample': self.sample_labels[sample]} 1146 1147 def get_num_samples(self): 1148 return len(self.sample_labels) 1149 1150 def get_num_groups(self): 1151 return len(self.group_labels) 1152 1153 def set_group_labels(self, new_labels): 1154 """ sets new labels for groups 1155 """ 1156 if len(new_labels) != len(self.group_labels): 1157 raise ValueError("length new labels != length original labels") 1158 self.group_labels = new_labels 1159 1160 def set_sample_labels(self, new_labels): 1161 """ sets new labels for groups 1162 """ 1163 if len(new_labels) != len(self.sample_labels): 1164 raise ValueError("length new labels != length original labels") 1165 self.sample_labels = new_labels 1166 1167 def set_sorting_method(self, sort_method, sort_using): 1168 self.sort_method = sort_method 1169 self.sort_using = sort_using 1170 1171 def get_regions(self): 1172 """Returns the regions per group 1173 1174 Returns 1175 ------ 1176 list 1177 1178 Each element of the list is itself a list 1179 of dictionaries containing the regions info: 1180 chrom, start, end, strand, name etc. 1181 1182 Each element of the list corresponds to each 1183 of the groups 1184 """ 1185 regions = [] 1186 for idx in range(len(self.group_labels)): 1187 start = self.group_boundaries[idx] 1188 end = self.group_boundaries[idx + 1] 1189 regions.append(self.regions[start:end]) 1190 1191 return regions 1192 1193 def sort_groups(self, sort_using='mean', sort_method='no', sample_list=None): 1194 """ 1195 Sorts and rearranges the submatrices according to the 1196 sorting method given. 1197 """ 1198 if sort_method == 'no': 1199 return 1200 1201 if (sample_list is not None) and (len(sample_list) > 0): 1202 # get the ids that correspond to the selected sample list 1203 idx_to_keep = [] 1204 for sample_idx in sample_list: 1205 idx_to_keep += range(self.sample_boundaries[sample_idx], self.sample_boundaries[sample_idx + 1]) 1206 1207 matrix = self.matrix[:, idx_to_keep] 1208 1209 else: 1210 matrix = self.matrix 1211 1212 # compute the row average: 1213 if sort_using == 'region_length': 1214 matrix_avgs = list() 1215 for x in self.regions: 1216 matrix_avgs.append(np.sum([bar[1] - bar[0] for bar in x[1]])) 1217 matrix_avgs = np.array(matrix_avgs) 1218 elif sort_using == 'mean': 1219 matrix_avgs = np.nanmean(matrix, axis=1) 1220 elif sort_using == 'mean': 1221 matrix_avgs = np.nanmean(matrix, axis=1) 1222 elif sort_using == 'median': 1223 matrix_avgs = np.nanmedian(matrix, axis=1) 1224 elif sort_using == 'max': 1225 matrix_avgs = np.nanmax(matrix, axis=1) 1226 elif sort_using == 'min': 1227 matrix_avgs = np.nanmin(matrix, axis=1) 1228 elif sort_using == 'sum': 1229 matrix_avgs = np.nansum(matrix, axis=1) 1230 else: 1231 sys.exit("{} is an unsupported sorting method".format(sort_using)) 1232 1233 # order per group 1234 _sorted_regions = [] 1235 _sorted_matrix = [] 1236 for idx in range(len(self.group_labels)): 1237 start = self.group_boundaries[idx] 1238 end = self.group_boundaries[idx + 1] 1239 order = matrix_avgs[start:end].argsort() 1240 if sort_method == 'descend': 1241 order = order[::-1] 1242 _sorted_matrix.append(self.matrix[start:end, :][order, :]) 1243 # sort the regions 1244 _reg = self.regions[start:end] 1245 for idx in order: 1246 _sorted_regions.append(_reg[idx]) 1247 1248 self.matrix = np.vstack(_sorted_matrix) 1249 self.regions = _sorted_regions 1250 self.set_sorting_method(sort_method, sort_using) 1251 1252 def hmcluster(self, k, evaluate_silhouette=True, method='kmeans', clustering_samples=None): 1253 matrix = np.asarray(self.matrix) 1254 matrix_to_cluster = matrix 1255 if clustering_samples is not None: 1256 assert all(i > 0 for i in clustering_samples),\ 1257 "all indices should be bigger than or equal to 1." 1258 assert all(i <= len(self.sample_labels) for i in 1259 clustering_samples),\ 1260 "each index should be smaller than or equal to {}(total "\ 1261 "number of samples.)".format(len(self.sample_labels)) 1262 1263 clustering_samples = np.asarray(clustering_samples) - 1 1264 1265 samples_cols = [] 1266 for idx in clustering_samples: 1267 samples_cols += range(self.sample_boundaries[idx], 1268 self.sample_boundaries[idx + 1]) 1269 1270 matrix_to_cluster = matrix_to_cluster[:, samples_cols] 1271 if np.any(np.isnan(matrix_to_cluster)): 1272 # replace nans for 0 otherwise kmeans produces a weird behaviour 1273 sys.stderr.write("*Warning* For clustering nan values have to be replaced by zeros \n") 1274 matrix_to_cluster[np.isnan(matrix_to_cluster)] = 0 1275 1276 if method == 'kmeans': 1277 from scipy.cluster.vq import vq, kmeans 1278 1279 centroids, _ = kmeans(matrix_to_cluster, k) 1280 # order the centroids in an attempt to 1281 # get the same cluster order 1282 cluster_labels, _ = vq(matrix_to_cluster, centroids) 1283 1284 if method == 'hierarchical': 1285 # normally too slow for large data sets 1286 from scipy.cluster.hierarchy import fcluster, linkage 1287 Z = linkage(matrix_to_cluster, method='ward', metric='euclidean') 1288 cluster_labels = fcluster(Z, k, criterion='maxclust') 1289 # hierarchical clustering labels from 1 .. k 1290 # while k-means labels 0 .. k -1 1291 # Thus, for consistency, we subtract 1 1292 cluster_labels -= 1 1293 1294 # sort clusters 1295 _clustered_mean = [] 1296 _cluster_ids_list = [] 1297 for cluster in range(k): 1298 cluster_ids = np.flatnonzero(cluster_labels == cluster) 1299 _cluster_ids_list.append(cluster_ids) 1300 _clustered_mean.append(matrix_to_cluster[cluster_ids, :].mean()) 1301 1302 # reorder clusters based on mean 1303 cluster_order = np.argsort(_clustered_mean)[::-1] 1304 # create groups using the clustering 1305 self.group_labels = [] 1306 self.group_boundaries = [0] 1307 _clustered_regions = [] 1308 _clustered_matrix = [] 1309 cluster_number = 1 1310 for cluster in cluster_order: 1311 self.group_labels.append("cluster_{}".format(cluster_number)) 1312 cluster_number += 1 1313 cluster_ids = _cluster_ids_list[cluster] 1314 self.group_boundaries.append(self.group_boundaries[-1] + 1315 len(cluster_ids)) 1316 _clustered_matrix.append(self.matrix[cluster_ids, :]) 1317 for idx in cluster_ids: 1318 _clustered_regions.append(self.regions[idx]) 1319 1320 self.regions = _clustered_regions 1321 self.matrix = np.vstack(_clustered_matrix) 1322 1323 return idx 1324 1325 def computeSilhouette(self, k): 1326 if k > 1: 1327 from scipy.spatial.distance import pdist, squareform 1328 1329 silhouette = np.repeat(0.0, self.group_boundaries[-1]) 1330 groupSizes = np.subtract(self.group_boundaries[1:], self.group_boundaries[:-1]) 1331 labels = np.repeat(np.arange(k), groupSizes) 1332 1333 d = pdist(self.matrix) 1334 d2 = squareform(d) 1335 np.fill_diagonal(d2, np.nan) # This excludes the diagonal 1336 for idx in range(len(labels)): 1337 silhouette[idx] = computeSilhouetteScore(d2, idx, labels) 1338 sys.stderr.write("The average silhouette score is: {}\n".format(np.mean(silhouette))) 1339 self.silhouette = silhouette 1340 1341 def removeempty(self): 1342 """ 1343 removes matrix rows containing only zeros or nans 1344 """ 1345 to_keep = [] 1346 score_list = np.ma.masked_invalid(np.mean(self.matrix, axis=1)) 1347 for idx, region in enumerate(self.regions): 1348 if np.ma.is_masked(score_list[idx]) or np.float(score_list[idx]) == 0: 1349 continue 1350 else: 1351 to_keep.append(idx) 1352 self.regions = [self.regions[x] for x in to_keep] 1353 self.matrix = self.matrix[to_keep, :] 1354 # adjust sample boundaries 1355 to_keep = np.array(to_keep) 1356 self.group_boundaries = [len(to_keep[to_keep < x]) for x in self.group_boundaries] 1357 1358 def flatten(self): 1359 """ 1360 flatten and remove nans from matrix. Useful 1361 to get max and mins from matrix. 1362 1363 :return flattened matrix 1364 """ 1365 matrix_flatten = np.asarray(self.matrix.flatten()) 1366 # nans are removed from the flattened array 1367 matrix_flatten = matrix_flatten[~np.isnan(matrix_flatten)] 1368 if len(matrix_flatten) == 0: 1369 num_nan = len(np.flatnonzero(np.isnan(self.matrix.flatten()))) 1370 raise ValueError("matrix only contains nans " 1371 "(total nans: {})".format(num_nan)) 1372 return matrix_flatten 1373