1import sys
2import gzip
3from collections import OrderedDict
4import numpy as np
5from copy import deepcopy
6
7import pyBigWig
8from deeptools import getScorePerBigWigBin
9from deeptools import mapReduce
10from deeptools.utilities import toString, toBytes, smartLabels
11from deeptools.heatmapper_utilities import getProfileTicks
12
13
14old_settings = np.seterr(all='ignore')
15
16
17def chopRegions(exonsInput, left=0, right=0):
18    """
19    exons is a list of (start, end) tuples. The goal is to chop these into
20    separate lists of tuples, to take care or unscaled regions. "left" and
21    "right" denote regions of a given size to exclude from the normal binning
22    process (unscaled regions).
23
24    This outputs three lists of (start, end) tuples:
25
26    leftBins: 5' unscaled regions
27    bodyBins: body bins for scaling
28    rightBins: 3' unscaled regions
29
30    In addition are two integers
31    padLeft: Number of bases of padding on the left (due to not being able to fulfill "left")
32    padRight: As above, but on the right side
33    """
34    leftBins = []
35    rightBins = []
36    padLeft = 0
37    padRight = 0
38    exons = deepcopy(exonsInput)
39    while len(exons) > 0 and left > 0:
40        width = exons[0][1] - exons[0][0]
41        if width <= left:
42            leftBins.append(exons[0])
43            del exons[0]
44            left -= width
45        else:
46            leftBins.append((exons[0][0], exons[0][0] + left))
47            exons[0] = (exons[0][0] + left, exons[0][1])
48            left = 0
49    if left > 0:
50        padLeft = left
51
52    while len(exons) > 0 and right > 0:
53        width = exons[-1][1] - exons[-1][0]
54        if width <= right:
55            rightBins.append(exons[-1])
56            del exons[-1]
57            right -= width
58        else:
59            rightBins.append((exons[-1][1] - right, exons[-1][1]))
60            exons[-1] = (exons[-1][0], exons[-1][1] - right)
61            right = 0
62    if right > 0:
63        padRight = right
64
65    return leftBins, exons, rightBins[::-1], padLeft, padRight
66
67
68def chopRegionsFromMiddle(exonsInput, left=0, right=0):
69    """
70    Like chopRegions(), above, but returns two lists of tuples on each side of
71    the center point of the exons.
72
73    The steps are as follow:
74
75     1) Find the center point of the set of exons (e.g., [(0, 200), (300, 400), (800, 900)] would be centered at 200)
76       * If a given exon spans the center point then the exon is split
77     2) The given number of bases at the end of the left-of-center list are extracted
78       * If the set of exons don't contain enough bases, then padLeft is incremented accordingly
79     3) As above but for the right-of-center list
80     4) A tuple of (#2, #3, pading on the left, and padding on the right) is returned
81    """
82    leftBins = []
83    rightBins = []
84    size = sum([x[1] - x[0] for x in exonsInput])
85    middle = size // 2
86    cumulativeSum = 0
87    padLeft = 0
88    padRight = 0
89    exons = deepcopy(exonsInput)
90
91    # Split exons in half
92    for exon in exons:
93        size = exon[1] - exon[0]
94        if cumulativeSum >= middle:
95            rightBins.append(exon)
96        elif cumulativeSum + size < middle:
97            leftBins.append(exon)
98        else:
99            # Don't add 0-width exonic bins!
100            if exon[0] < exon[1] - cumulativeSum - size + middle:
101                leftBins.append((exon[0], exon[1] - cumulativeSum - size + middle))
102            if exon[1] - cumulativeSum - size + middle < exon[1]:
103                rightBins.append((exon[1] - cumulativeSum - size + middle, exon[1]))
104        cumulativeSum += size
105
106    # Trim leftBins/adjust padLeft
107    lSum = sum([x[1] - x[0] for x in leftBins])
108    if lSum > left:
109        lSum = 0
110        for i, exon in enumerate(leftBins[::-1]):
111            size = exon[1] - exon[0]
112            if lSum + size > left:
113                leftBins[-i - 1] = (exon[1] + lSum - left, exon[1])
114                break
115            lSum += size
116            if lSum == left:
117                break
118        i += 1
119        if i < len(leftBins):
120            leftBins = leftBins[-i:]
121    elif lSum < left:
122        padLeft = left - lSum
123
124    # Trim rightBins/adjust padRight
125    rSum = sum([x[1] - x[0] for x in rightBins])
126    if rSum > right:
127        rSum = 0
128        for i, exon in enumerate(rightBins):
129            size = exon[1] - exon[0]
130            if rSum + size > right:
131                rightBins[i] = (exon[0], exon[1] - rSum - size + right)
132                break
133            rSum += size
134            if rSum == right:
135                break
136        rightBins = rightBins[:i + 1]
137    elif rSum < right:
138        padRight = right - rSum
139
140    return leftBins, rightBins, padLeft, padRight
141
142
143def trimZones(zones, maxLength, binSize, padRight):
144    """
145    Given a (variable length) list of lists of (start, end) tuples, trim/remove and tuple that extends past maxLength (e.g., the end of a chromosome)
146
147    Returns the trimmed zones and padding
148    """
149    output = []
150    for zone, nbins in zones:
151        outZone = []
152        changed = False
153        for reg in zone:
154            if reg[0] >= maxLength:
155                changed = True
156                padRight += reg[1] - reg[0]
157                continue
158
159            if reg[1] > maxLength:
160                changed = True
161                padRight += reg[1] - maxLength
162                reg = (reg[0], maxLength)
163            if reg[1] > reg[0]:
164                outZone.append(reg)
165        if changed:
166            nBins = sum(x[1] - x[0] for x in outZone) // binSize
167        else:
168            nBins = nbins
169        output.append((outZone, nBins))
170    return output, padRight
171
172
173def compute_sub_matrix_wrapper(args):
174    return heatmapper.compute_sub_matrix_worker(*args)
175
176
177class heatmapper(object):
178    """
179    Class to handle the reading and
180    plotting of matrices.
181    """
182
183    def __init__(self):
184        self.parameters = None
185        self.lengthDict = None
186        self.matrix = None
187        self.regions = None
188        self.blackList = None
189        self.quiet = True
190        # These are parameters that were single values in versions <3 but are now internally lists. See issue #614
191        self.special_params = set(['unscaled 5 prime', 'unscaled 3 prime', 'body', 'downstream', 'upstream', 'ref point', 'bin size'])
192
193    def getTicks(self, idx):
194        """
195        This is essentially a wrapper around getProfileTicks to accomdate the fact that each column has its own ticks.
196        """
197        xticks, xtickslabel = getProfileTicks(self, self.reference_point_label[idx], self.startLabel, self.endLabel, idx)
198        return xticks, xtickslabel
199
200    def computeMatrix(self, score_file_list, regions_file, parameters, blackListFileName=None, verbose=False, allArgs=None):
201        """
202        Splits into
203        multiple cores the computation of the scores
204        per bin for each region (defined by a hash '#'
205        in the regions (BED/GFF) file.
206        """
207        if parameters['body'] > 0 and \
208                parameters['body'] % parameters['bin size'] > 0:
209            exit("The --regionBodyLength has to be "
210                 "a multiple of --binSize.\nCurrently the "
211                 "values are {} {} for\nregionsBodyLength and "
212                 "binSize respectively\n".format(parameters['body'],
213                                                 parameters['bin size']))
214
215        # the beforeRegionStartLength is extended such that
216        # length is a multiple of binSize
217        if parameters['downstream'] % parameters['bin size'] > 0:
218            exit("Length of region after the body has to be "
219                 "a multiple of --binSize.\nCurrent value "
220                 "is {}\n".format(parameters['downstream']))
221
222        if parameters['upstream'] % parameters['bin size'] > 0:
223            exit("Length of region before the body has to be a multiple of "
224                 "--binSize\nCurrent value is {}\n".format(parameters['upstream']))
225
226        if parameters['unscaled 5 prime'] % parameters['bin size'] > 0:
227            exit("Length of the unscaled 5 prime region has to be a multiple of "
228                 "--binSize\nCurrent value is {}\n".format(parameters['unscaled 5 prime']))
229
230        if parameters['unscaled 3 prime'] % parameters['bin size'] > 0:
231            exit("Length of the unscaled 5 prime region has to be a multiple of "
232                 "--binSize\nCurrent value is {}\n".format(parameters['unscaled 3 prime']))
233
234        if parameters['unscaled 5 prime'] + parameters['unscaled 3 prime'] > 0 and parameters['body'] == 0:
235            exit('Unscaled 5- and 3-prime regions only make sense with the scale-regions subcommand.\n')
236
237        # Take care of GTF options
238        transcriptID = "transcript"
239        exonID = "exon"
240        transcript_id_designator = "transcript_id"
241        keepExons = False
242        self.quiet = False
243        if allArgs is not None:
244            allArgs = vars(allArgs)
245            transcriptID = allArgs.get("transcriptID", transcriptID)
246            exonID = allArgs.get("exonID", exonID)
247            transcript_id_designator = allArgs.get("transcript_id_designator", transcript_id_designator)
248            keepExons = allArgs.get("keepExons", keepExons)
249            self.quiet = allArgs.get("quiet", self.quiet)
250
251        chromSizes, _ = getScorePerBigWigBin.getChromSizes(score_file_list)
252        res, labels = mapReduce.mapReduce([score_file_list, parameters],
253                                          compute_sub_matrix_wrapper,
254                                          chromSizes,
255                                          self_=self,
256                                          bedFile=regions_file,
257                                          blackListFileName=blackListFileName,
258                                          numberOfProcessors=parameters['proc number'],
259                                          includeLabels=True,
260                                          transcriptID=transcriptID,
261                                          exonID=exonID,
262                                          transcript_id_designator=transcript_id_designator,
263                                          keepExons=keepExons,
264                                          verbose=verbose)
265        # each worker in the pool returns a tuple containing
266        # the submatrix data, the regions that correspond to the
267        # submatrix, and the number of regions lacking scores
268        # Since this is largely unsorted, we need to sort by group
269
270        # merge all the submatrices into matrix
271        matrix = np.concatenate([r[0] for r in res], axis=0)
272        regions = []
273        regions_no_score = 0
274        for idx in range(len(res)):
275            if len(res[idx][1]):
276                regions.extend(res[idx][1])
277                regions_no_score += res[idx][2]
278        groups = [x[3] for x in regions]
279        foo = sorted(zip(groups, list(range(len(regions))), regions))
280        sortIdx = [x[1] for x in foo]
281        regions = [x[2] for x in foo]
282        matrix = matrix[sortIdx]
283
284        # mask invalid (nan) values
285        matrix = np.ma.masked_invalid(matrix)
286
287        assert matrix.shape[0] == len(regions), \
288            "matrix length does not match regions length"
289
290        if len(regions) == 0:
291            sys.stderr.write("\nERROR: Either the BED file does not contain any valid regions or there are none remaining after filtering.\n")
292            exit(1)
293        if regions_no_score == len(regions):
294            exit("\nERROR: None of the BED regions could be found in the bigWig"
295                 "file.\nPlease check that the bigwig file is valid and "
296                 "that the chromosome names between the BED file and "
297                 "the bigWig file correspond to each other\n")
298
299        if regions_no_score > len(regions) * 0.75:
300            file_type = 'bigwig' if score_file_list[0].endswith(".bw") else "BAM"
301            prcnt = 100 * float(regions_no_score) / len(regions)
302            sys.stderr.write(
303                "\n\nWarning: {0:.2f}% of regions are *not* associated\n"
304                "to any score in the given {1} file. Check that the\n"
305                "chromosome names from the BED file are consistent with\n"
306                "the chromosome names in the given {2} file and that both\n"
307                "files refer to the same species\n\n".format(prcnt,
308                                                             file_type,
309                                                             file_type))
310
311        self.parameters = parameters
312
313        numcols = matrix.shape[1]
314        num_ind_cols = self.get_num_individual_matrix_cols()
315        sample_boundaries = list(range(0, numcols + num_ind_cols, num_ind_cols))
316        if allArgs is not None and allArgs['samplesLabel'] is not None:
317            sample_labels = allArgs['samplesLabel']
318        else:
319            sample_labels = smartLabels(score_file_list)
320
321        # Determine the group boundaries
322        group_boundaries = []
323        group_labels_filtered = []
324        last_idx = -1
325        for x in range(len(regions)):
326            if regions[x][3] != last_idx:
327                last_idx = regions[x][3]
328                group_boundaries.append(x)
329                group_labels_filtered.append(labels[last_idx])
330        group_boundaries.append(len(regions))
331
332        # check if a given group is too small. Groups that
333        # are too small can't be plotted and an exception is thrown.
334        group_len = np.diff(group_boundaries)
335        if len(group_len) > 1:
336            sum_len = sum(group_len)
337            group_frac = [float(x) / sum_len for x in group_len]
338            if min(group_frac) <= 0.002:
339                sys.stderr.write(
340                    "One of the groups defined in the bed file is "
341                    "too small.\nGroups that are too small can't be plotted. "
342                    "\n")
343
344        self.matrix = _matrix(regions, matrix,
345                              group_boundaries,
346                              sample_boundaries,
347                              group_labels_filtered,
348                              sample_labels)
349
350        if parameters['skip zeros']:
351            self.matrix.removeempty()
352
353    @staticmethod
354    def compute_sub_matrix_worker(self, chrom, start, end, score_file_list, parameters, regions):
355        """
356        Returns
357        -------
358        numpy matrix
359            A numpy matrix that contains per each row the values found per each of the regions given
360        """
361        if parameters['verbose']:
362            sys.stderr.write("Processing {}:{}-{}\n".format(chrom, start, end))
363
364        # read BAM or scores file
365        score_file_handles = []
366        for sc_file in score_file_list:
367            score_file_handles.append(pyBigWig.open(sc_file))
368
369        # determine the number of matrix columns based on the lengths
370        # given by the user, times the number of score files
371        matrix_cols = len(score_file_list) * \
372            ((parameters['downstream'] +
373              parameters['unscaled 5 prime'] + parameters['unscaled 3 prime'] +
374              parameters['upstream'] + parameters['body']) //
375             parameters['bin size'])
376
377        # create an empty matrix to store the values
378        sub_matrix = np.zeros((len(regions), matrix_cols))
379        sub_matrix[:] = np.NAN
380
381        j = 0
382        sub_regions = []
383        regions_no_score = 0
384        for transcript in regions:
385            feature_chrom = transcript[0]
386            exons = transcript[1]
387            feature_start = exons[0][0]
388            feature_end = exons[-1][1]
389            feature_name = transcript[2]
390            feature_strand = transcript[4]
391            padLeft = 0
392            padRight = 0
393            padLeftNaN = 0
394            padRightNaN = 0
395            upstream = []
396            downstream = []
397
398            # get the body length
399            body_length = np.sum([x[1] - x[0] for x in exons]) - parameters['unscaled 5 prime'] - parameters['unscaled 3 prime']
400
401            # print some information
402            if parameters['body'] > 0 and \
403                    body_length < parameters['bin size']:
404                if not self.quiet:
405                    sys.stderr.write("A region that is shorter than the bin size (possibly only after accounting for unscaled regions) was found: "
406                                     "({0}) {1} {2}:{3}:{4}. Skipping...\n".format((body_length - parameters['unscaled 5 prime'] - parameters['unscaled 3 prime']),
407                                                                                   feature_name, feature_chrom,
408                                                                                   feature_start, feature_end))
409                coverage = np.zeros(matrix_cols)
410                if not parameters['missing data as zero']:
411                    coverage[:] = np.nan
412            else:
413                if feature_strand == '-':
414                    if parameters['downstream'] > 0:
415                        upstream = [(feature_start - parameters['downstream'], feature_start)]
416                    if parameters['upstream'] > 0:
417                        downstream = [(feature_end, feature_end + parameters['upstream'])]
418                    unscaled5prime, body, unscaled3prime, padLeft, padRight = chopRegions(exons, left=parameters['unscaled 3 prime'], right=parameters['unscaled 5 prime'])
419                    # bins per zone
420                    a = parameters['downstream'] // parameters['bin size']
421                    b = parameters['unscaled 3 prime'] // parameters['bin size']
422                    d = parameters['unscaled 5 prime'] // parameters['bin size']
423                    e = parameters['upstream'] // parameters['bin size']
424                else:
425                    if parameters['upstream'] > 0:
426                        upstream = [(feature_start - parameters['upstream'], feature_start)]
427                    if parameters['downstream'] > 0:
428                        downstream = [(feature_end, feature_end + parameters['downstream'])]
429                    unscaled5prime, body, unscaled3prime, padLeft, padRight = chopRegions(exons, left=parameters['unscaled 5 prime'], right=parameters['unscaled 3 prime'])
430                    a = parameters['upstream'] // parameters['bin size']
431                    b = parameters['unscaled 5 prime'] // parameters['bin size']
432                    d = parameters['unscaled 3 prime'] // parameters['bin size']
433                    e = parameters['downstream'] // parameters['bin size']
434                c = parameters['body'] // parameters['bin size']
435
436                # build zones (each is a list of tuples)
437                #  zone0: region before the region start,
438                #  zone1: unscaled 5 prime region
439                #  zone2: the body of the region
440                #  zone3: unscaled 3 prime region
441                #  zone4: the region from the end of the region downstream
442                #  the format for each zone is: [(start, end), ...], number of bins
443                # Note that for "reference-point", upstream/downstream will go
444                # through the exons (if requested) and then possibly continue
445                # on the other side (unless parameters['nan after end'] is true)
446                if parameters['body'] > 0:
447                    zones = [(upstream, a), (unscaled5prime, b), (body, c), (unscaled3prime, d), (downstream, e)]
448                elif parameters['ref point'] == 'TES':  # around TES
449                    if feature_strand == '-':
450                        downstream, body, unscaled3prime, padRight, _ = chopRegions(exons, left=parameters['upstream'])
451                        if padRight > 0 and parameters['nan after end'] is True:
452                            padRightNaN += padRight
453                        elif padRight > 0:
454                            downstream.append((downstream[-1][1], downstream[-1][1] + padRight))
455                        padRight = 0
456                    else:
457                        unscale5prime, body, upstream, _, padLeft = chopRegions(exons, right=parameters['upstream'])
458                        if padLeft > 0 and parameters['nan after end'] is True:
459                            padLeftNaN += padLeft
460                        elif padLeft > 0:
461                            upstream.insert(0, (upstream[0][0] - padLeft, upstream[0][0]))
462                        padLeft = 0
463                    e = np.sum([x[1] - x[0] for x in downstream]) // parameters['bin size']
464                    a = np.sum([x[1] - x[0] for x in upstream]) // parameters['bin size']
465                    zones = [(upstream, a), (downstream, e)]
466                elif parameters['ref point'] == 'center':  # at the region center
467                    if feature_strand == '-':
468                        upstream, downstream, padLeft, padRight = chopRegionsFromMiddle(exons, left=parameters['downstream'], right=parameters['upstream'])
469                    else:
470                        upstream, downstream, padLeft, padRight = chopRegionsFromMiddle(exons, left=parameters['upstream'], right=parameters['downstream'])
471                    if padLeft > 0 and parameters['nan after end'] is True:
472                        padLeftNaN += padLeft
473                    elif padLeft > 0:
474                        if len(upstream) > 0:
475                            upstream.insert(0, (upstream[0][0] - padLeft, upstream[0][0]))
476                        else:
477                            upstream = [(downstream[0][0] - padLeft, downstream[0][0])]
478                    padLeft = 0
479                    if padRight > 0 and parameters['nan after end'] is True:
480                        padRightNaN += padRight
481                    elif padRight > 0:
482                        downstream.append((downstream[-1][1], downstream[-1][1] + padRight))
483                    padRight = 0
484                    a = np.sum([x[1] - x[0] for x in upstream]) // parameters['bin size']
485                    e = np.sum([x[1] - x[0] for x in downstream]) // parameters['bin size']
486                    # It's possible for a/e to be floats or 0 yet upstream/downstream isn't empty
487                    if a < 1:
488                        upstream = []
489                        a = 0
490                    if e < 1:
491                        downstream = []
492                        e = 0
493                    zones = [(upstream, a), (downstream, e)]
494                else:  # around TSS
495                    if feature_strand == '-':
496                        unscale5prime, body, upstream, _, padLeft = chopRegions(exons, right=parameters['downstream'])
497                        if padLeft > 0 and parameters['nan after end'] is True:
498                            padLeftNaN += padLeft
499                        elif padLeft > 0:
500                            upstream.insert(0, (upstream[0][0] - padLeft, upstream[0][0]))
501                        padLeft = 0
502                    else:
503                        downstream, body, unscaled3prime, padRight, _ = chopRegions(exons, left=parameters['downstream'])
504                        if padRight > 0 and parameters['nan after end'] is True:
505                            padRightNaN += padRight
506                        elif padRight > 0:
507                            downstream.append((downstream[-1][1], downstream[-1][1] + padRight))
508                        padRight = 0
509                    a = np.sum([x[1] - x[0] for x in upstream]) // parameters['bin size']
510                    e = np.sum([x[1] - x[0] for x in downstream]) // parameters['bin size']
511                    zones = [(upstream, a), (downstream, e)]
512
513                foo = parameters['upstream']
514                bar = parameters['downstream']
515                if feature_strand == '-':
516                    foo, bar = bar, foo
517                if padLeftNaN > 0:
518                    expected = foo // parameters['bin size']
519                    padLeftNaN = int(round(float(padLeftNaN) / parameters['bin size']))
520                    if expected - padLeftNaN - a > 0:
521                        padLeftNaN += 1
522                if padRightNaN > 0:
523                    expected = bar // parameters['bin size']
524                    padRightNaN = int(round(float(padRightNaN) / parameters['bin size']))
525                    if expected - padRightNaN - e > 0:
526                        padRightNaN += 1
527
528                coverage = []
529                # compute the values for each of the files being processed.
530                # "cov" is a numpy array of bins
531                for sc_handler in score_file_handles:
532                    # We're only supporting bigWig files at this point
533                    cov = heatmapper.coverage_from_big_wig(
534                        sc_handler, feature_chrom, zones,
535                        parameters['bin size'],
536                        parameters['bin avg type'],
537                        parameters['missing data as zero'],
538                        not self.quiet)
539
540                    if padLeftNaN > 0:
541                        cov = np.concatenate([[np.nan] * padLeftNaN, cov])
542                    if padRightNaN > 0:
543                        cov = np.concatenate([cov, [np.nan] * padRightNaN])
544
545                    if feature_strand == "-":
546                        cov = cov[::-1]
547
548                    coverage = np.hstack([coverage, cov])
549
550            if coverage is None:
551                regions_no_score += 1
552                if not self.quiet:
553                    sys.stderr.write(
554                        "No data was found for region "
555                        "{0} {1}:{2}-{3}. Skipping...\n".format(
556                            feature_name, feature_chrom,
557                            feature_start, feature_end))
558
559                coverage = np.zeros(matrix_cols)
560                if not parameters['missing data as zero']:
561                    coverage[:] = np.nan
562
563            try:
564                temp = coverage.copy()
565                temp[np.isnan(temp)] = 0
566            except:
567                if not self.quiet:
568                    sys.stderr.write(
569                        "No scores defined for region "
570                        "{0} {1}:{2}-{3}. Skipping...\n".format(feature_name,
571                                                                feature_chrom,
572                                                                feature_start,
573                                                                feature_end))
574                coverage = np.zeros(matrix_cols)
575                if not parameters['missing data as zero']:
576                    coverage[:] = np.nan
577
578            if parameters['min threshold'] is not None and coverage.min() <= parameters['min threshold']:
579                continue
580            if parameters['max threshold'] is not None and coverage.max() >= parameters['max threshold']:
581                continue
582            if parameters['scale'] != 1:
583                coverage = parameters['scale'] * coverage
584
585            sub_matrix[j, :] = coverage
586
587            sub_regions.append(transcript)
588            j += 1
589
590        # remove empty rows
591        sub_matrix = sub_matrix[0:j, :]
592        if len(sub_regions) != len(sub_matrix[:, 0]):
593            sys.stderr.write("regions lengths do not match\n")
594        return sub_matrix, sub_regions, regions_no_score
595
596    @staticmethod
597    def coverage_from_array(valuesArray, zones, binSize, avgType):
598        try:
599            valuesArray[0]
600        except (IndexError, TypeError) as detail:
601            sys.stderr.write("{0}\nvalues array value: {1}, zones {2}\n".format(detail, valuesArray, zones))
602
603        cvglist = []
604        zoneEnd = 0
605        valStart = 0
606        valEnd = 0
607        for zone, nBins in zones:
608            if nBins:
609                # linspace is used to more or less evenly partition the data points into the given number of bins
610                zoneEnd += nBins
611                valStart = valEnd
612                valEnd += np.sum([x[1] - x[0] for x in zone])
613                counts_list = []
614
615                # Partition the space into bins
616                if nBins == 1:
617                    pos_array = np.array([valStart])
618                else:
619                    pos_array = np.linspace(valStart, valEnd, nBins, endpoint=False, dtype=int)
620                pos_array = np.append(pos_array, valEnd)
621
622                idx = 0
623                while idx < nBins:
624                    idxStart = int(pos_array[idx])
625                    idxEnd = max(int(pos_array[idx + 1]), idxStart + 1)
626                    try:
627                        counts_list.append(heatmapper.my_average(valuesArray[idxStart:idxEnd], avgType))
628                    except Exception as detail:
629                        sys.stderr.write("Exception found: {0}\n".format(detail))
630                    idx += 1
631                cvglist.append(np.array(counts_list))
632
633        return np.concatenate(cvglist)
634
635    @staticmethod
636    def change_chrom_names(chrom):
637        """
638        Changes UCSC chromosome names to ensembl chromosome names
639        and vice versa.
640        """
641        if chrom.startswith('chr'):
642            # remove the chr part from chromosome name
643            chrom = chrom[3:]
644            if chrom == "M":
645                chrom = "MT"
646        else:
647            # prefix with 'chr' the chromosome name
648            chrom = 'chr' + chrom
649            if chrom == "chrMT":
650                chrom = "chrM"
651
652        return chrom
653
654    @staticmethod
655    def coverage_from_big_wig(bigwig, chrom, zones, binSize, avgType, nansAsZeros=False, verbose=True):
656
657        """
658        uses pyBigWig
659        to query a region define by chrom and zones.
660        The output is an array that contains the bigwig
661        value per base pair. The summary over bins is
662        done in a later step when coverage_from_array is called.
663        This method is more reliable than querying the bins
664        directly from the bigwig, which should be more efficient.
665
666        By default, any region, even if no chromosome match is found
667        on the bigwig file, produces a result. In other words
668        no regions are skipped.
669
670        zones: array as follows zone0: region before the region start,
671                                zone1: 5' unscaled region (if present)
672                                zone2: the body of the region (not always present)
673                                zone3: 3' unscaled region (if present)
674                                zone4: the region from the end of the region downstream
675
676               each zone is a tuple containing start, end, and number of bins
677
678
679        This is useful if several matrices wants to be merged
680        or if the sorted BED output of one computeMatrix operation
681        needs to be used for other cases
682        """
683        nVals = 0
684        for zone, _ in zones:
685            for region in zone:
686                nVals += region[1] - region[0]
687
688        values_array = np.zeros(nVals)
689        if not nansAsZeros:
690            values_array[:] = np.nan
691        if chrom not in list(bigwig.chroms().keys()):
692            unmod_name = chrom
693            chrom = heatmapper.change_chrom_names(chrom)
694            if chrom not in list(bigwig.chroms().keys()):
695                if verbose:
696                    sys.stderr.write("Warning: Your chromosome names do not match.\nPlease check that the "
697                                     "chromosome names in your BED file\ncorrespond to the names in your "
698                                     "bigWig file.\nAn empty line will be added to your heatmap.\nThe problematic "
699                                     "chromosome name is {0}\n\n".format(unmod_name))
700
701                # return empty nan array
702                return heatmapper.coverage_from_array(values_array, zones, binSize, avgType)
703
704        maxLen = bigwig.chroms(chrom)
705        startIdx = 0
706        endIdx = 0
707        for zone, _ in zones:
708            for region in zone:
709                startIdx = endIdx
710                if region[0] < 0:
711                    endIdx += abs(region[0])
712                    values_array[startIdx:endIdx] = np.nan
713                    startIdx = endIdx
714                start = max(0, region[0])
715                end = min(maxLen, region[1])
716                endIdx += end - start
717                if start < end:
718                    # This won't be the case if we extend off the front of a chromosome, such as (-100, 0)
719                    values_array[startIdx:endIdx] = bigwig.values(chrom, start, end)
720                if end < region[1]:
721                    startIdx = endIdx
722                    endIdx += region[1] - end
723                    values_array[startIdx:endIdx] = np.nan
724
725        # replaces nans for zeros
726        if nansAsZeros:
727            values_array[np.isnan(values_array)] = 0
728
729        return heatmapper.coverage_from_array(values_array, zones,
730                                              binSize, avgType)
731
732    @staticmethod
733    def my_average(valuesArray, avgType='mean'):
734        """
735        computes the mean, median, etc but only for those values
736        that are not Nan
737        """
738        valuesArray = np.ma.masked_invalid(valuesArray)
739        avg = np.ma.__getattribute__(avgType)(valuesArray)
740        if isinstance(avg, np.ma.core.MaskedConstant):
741            return np.nan
742        else:
743            return avg
744
745    def matrix_from_dict(self, matrixDict, regionsDict, parameters):
746        self.regionsDict = regionsDict
747        self.matrixDict = matrixDict
748        self.parameters = parameters
749        self.lengthDict = OrderedDict()
750        self.matrixAvgsDict = OrderedDict()
751
752    def read_matrix_file(self, matrix_file):
753        # reads a bed file containing the position
754        # of genomic intervals
755        # In case a hash sign '#' is found in the
756        # file, this is considered as a delimiter
757        # to split the heatmap into groups
758
759        import json
760        regions = []
761        matrix_rows = []
762        current_group_index = 0
763        max_group_bound = None
764
765        fh = gzip.open(matrix_file)
766        for line in fh:
767            line = toString(line).strip()
768            # read the header file containing the parameters
769            # used
770            if line.startswith("@"):
771                # the parameters used are saved using
772                # json
773                self.parameters = json.loads(line[1:].strip())
774                max_group_bound = self.parameters['group_boundaries'][1]
775                continue
776
777            # split the line into bed interval and matrix values
778            region = line.split('\t')
779            chrom, start, end, name, score, strand = region[0:6]
780            matrix_row = np.ma.masked_invalid(np.fromiter(region[6:], np.float))
781            matrix_rows.append(matrix_row)
782            starts = start.split(",")
783            ends = end.split(",")
784            regs = [(int(x), int(y)) for x, y in zip(starts, ends)]
785            # get the group index
786            if len(regions) >= max_group_bound:
787                current_group_index += 1
788                max_group_bound = self.parameters['group_boundaries'][current_group_index + 1]
789            regions.append([chrom, regs, name, max_group_bound, strand, score])
790
791        matrix = np.vstack(matrix_rows)
792        self.matrix = _matrix(regions, matrix, self.parameters['group_boundaries'],
793                              self.parameters['sample_boundaries'],
794                              group_labels=self.parameters['group_labels'],
795                              sample_labels=self.parameters['sample_labels'])
796
797        if 'sort regions' in self.parameters:
798            self.matrix.set_sorting_method(self.parameters['sort regions'],
799                                           self.parameters['sort using'])
800
801        # Versions of computeMatrix before 3.0 didn't have an entry of these per column, fix that
802        nSamples = len(self.matrix.sample_labels)
803        h = dict()
804        for k, v in self.parameters.items():
805            if k in self.special_params and type(v) is not list:
806                v = [v] * nSamples
807                if len(v) == 0:
808                    v = [None] * nSamples
809            h[k] = v
810        self.parameters = h
811
812        return
813
814    def save_matrix(self, file_name):
815        """
816        saves the data required to reconstruct the matrix
817        the format is:
818        A header containing the parameters used to create the matrix
819        encoded as:
820        @key:value\tkey2:value2 etc...
821        The rest of the file has the same first 5 columns of a
822        BED file: chromosome name, start, end, name, score and strand,
823        all separated by tabs. After the fifth column the matrix
824        values are appended separated by tabs.
825        Groups are separated by adding a line starting with a hash (#)
826        and followed by the group name.
827
828        The file is gzipped.
829        """
830        import json
831        self.parameters['sample_labels'] = self.matrix.sample_labels
832        self.parameters['group_labels'] = self.matrix.group_labels
833        self.parameters['sample_boundaries'] = self.matrix.sample_boundaries
834        self.parameters['group_boundaries'] = self.matrix.group_boundaries
835
836        # Redo the parameters, ensuring things related to ticks and labels are repeated appropriately
837        nSamples = len(self.matrix.sample_labels)
838        h = dict()
839        for k, v in self.parameters.items():
840            if type(v) is list and len(v) == 0:
841                v = None
842            if k in self.special_params and type(v) is not list:
843                v = [v] * nSamples
844                if len(v) == 0:
845                    v = [None] * nSamples
846            h[k] = v
847        fh = gzip.open(file_name, 'wb')
848        params_str = json.dumps(h, separators=(',', ':'))
849        fh.write(toBytes("@" + params_str + "\n"))
850        score_list = np.ma.masked_invalid(np.mean(self.matrix.matrix, axis=1))
851        for idx, region in enumerate(self.matrix.regions):
852            # join np_array values
853            # keeping nans while converting them to strings
854            if not np.ma.is_masked(score_list[idx]):
855                np.float(score_list[idx])
856            matrix_values = "\t".join(
857                np.char.mod('%f', self.matrix.matrix[idx, :]))
858            starts = ["{0}".format(x[0]) for x in region[1]]
859            ends = ["{0}".format(x[1]) for x in region[1]]
860            starts = ",".join(starts)
861            ends = ",".join(ends)
862            # BEDish format (we don't currently store the score)
863            fh.write(
864                toBytes('{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n'.format(
865                        region[0],
866                        starts,
867                        ends,
868                        region[2],
869                        region[5],
870                        region[4],
871                        matrix_values)))
872        fh.close()
873
874    def save_tabulated_values(self, file_handle, reference_point_label='TSS', start_label='TSS', end_label='TES', averagetype='mean'):
875        """
876        Saves the values averaged by col using the avg_type
877        given
878
879        Args:
880            file_handle: file name to save the file
881            reference_point_label: Name of the reference point label
882            start_label: Name of the star label
883            end_label: Name of the end label
884            averagetype: average type (e.g. mean, median, std)
885
886        """
887        #  get X labels
888        w = self.parameters['bin size']
889        b = self.parameters['upstream']
890        a = self.parameters['downstream']
891        c = self.parameters.get('unscaled 5 prime', 0)
892        d = self.parameters.get('unscaled 3 prime', 0)
893        m = self.parameters['body']
894
895        xticks = []
896        xtickslabel = []
897        for idx in range(self.matrix.get_num_samples()):
898            if b[idx] < 1e5:
899                quotient = 1000
900                symbol = 'Kb'
901            else:
902                quotient = 1e6
903                symbol = 'Mb'
904
905            if m[idx] == 0:
906                last = 0
907                if len(xticks):
908                    last = xticks[-1]
909                xticks.extend([last + (k / w[idx]) for k in [w[idx], b[idx], b[idx] + a[idx]]])
910                xtickslabel.extend(['{0:.1f}{1}'.format(-(float(b[idx]) / quotient), symbol), reference_point_label,
911                                    '{0:.1f}{1}'.format(float(a[idx]) / quotient, symbol)])
912
913            else:
914                xticks_values = [w[idx]]
915
916                # only if upstream region is set, add a x tick
917                if b[idx] > 0:
918                    xticks_values.append(b[idx])
919                    xtickslabel.append('{0:.1f}{1}'.format(-(float(b[idx]) / quotient), symbol))
920
921                xtickslabel.append(start_label)
922
923                if c[idx] > 0:
924                    xticks_values.append(b[idx] + c[idx])
925                    xtickslabel.append("")
926
927                if d[idx] > 0:
928                    xticks_values.append(b[idx] + c[idx] + m[idx])
929                    xtickslabel.append("")
930
931                xticks_values.append(b[idx] + c[idx] + m[idx] + d[idx])
932                xtickslabel.append(end_label)
933
934                if a[idx] > 0:
935                    xticks_values.append(b[idx] + c[idx] + m[idx] + d[idx] + a[idx])
936                    xtickslabel.append('{0:.1f}{1}'.format(float(a[idx]) / quotient, symbol))
937
938                last = 0
939                if len(xticks):
940                    last = xticks[-1]
941                xticks.extend([last + (k / w[idx]) for k in xticks_values])
942        x_axis = np.arange(xticks[-1]) + 1
943        labs = []
944        for x_value in x_axis:
945            if x_value in xticks and xtickslabel[xticks.index(x_value)]:
946                labs.append(xtickslabel[xticks.index(x_value)])
947            elif x_value in xticks:
948                labs.append("tick")
949            else:
950                labs.append("")
951
952        with open(file_handle, 'w') as fh:
953            # write labels
954            fh.write("bin labels\t\t{}\n".format("\t".join(labs)))
955            fh.write('bins\t\t{}\n'.format("\t".join([str(x) for x in x_axis])))
956
957            for sample_idx in range(self.matrix.get_num_samples()):
958                for group_idx in range(self.matrix.get_num_groups()):
959                    sub_matrix = self.matrix.get_matrix(group_idx, sample_idx)
960                    values = [str(x) for x in np.ma.__getattribute__(averagetype)(sub_matrix['matrix'], axis=0)]
961                    fh.write("{}\t{}\t{}\n".format(sub_matrix['sample'], sub_matrix['group'], "\t".join(values)))
962
963    def save_matrix_values(self, file_name):
964        # print a header telling the group names and their length
965        fh = open(file_name, 'wb')
966        info = []
967        groups_len = np.diff(self.matrix.group_boundaries)
968        for i in range(len(self.matrix.group_labels)):
969            info.append("{}:{}".format(self.matrix.group_labels[i],
970                                       groups_len[i]))
971        fh.write(toBytes("#{}\n".format("\t".join(info))))
972        # add to header the x axis values
973        fh.write(toBytes("#downstream:{}\tupstream:{}\tbody:{}\tbin size:{}\tunscaled 5 prime:{}\tunscaled 3 prime:{}\n".format(
974                 self.parameters['downstream'],
975                 self.parameters['upstream'],
976                 self.parameters['body'],
977                 self.parameters['bin size'],
978                 self.parameters.get('unscaled 5 prime', 0),
979                 self.parameters.get('unscaled 3 prime', 0))))
980        sample_len = np.diff(self.matrix.sample_boundaries)
981        for i in range(len(self.matrix.sample_labels)):
982            info.extend([self.matrix.sample_labels[i]] * sample_len[i])
983        fh.write(toBytes("{}\n".format("\t".join(info))))
984
985        fh.close()
986        # reopen again using append mode
987        fh = open(file_name, 'ab')
988        np.savetxt(fh, self.matrix.matrix, fmt="%.4g", delimiter="\t")
989        fh.close()
990
991    def save_BED(self, file_handle):
992        boundaries = np.array(self.matrix.group_boundaries)
993        # Add a header
994        file_handle.write("#chrom\tstart\tend\tname\tscore\tstrand\tthickStart\tthickEnd\titemRGB\tblockCount\tblockSizes\tblockStart\tdeepTools_group")
995        if self.matrix.silhouette is not None:
996            file_handle.write("\tsilhouette")
997        file_handle.write("\n")
998        for idx, region in enumerate(self.matrix.regions):
999            # the label id corresponds to the last boundary
1000            # that is smaller than the region index.
1001            # for example for a boundary array = [0, 10, 20]
1002            # and labels ['a', 'b', 'c'],
1003            # for index 5, the label is 'a', for
1004            # index 10, the label is 'b' etc
1005            label_idx = np.flatnonzero(boundaries <= idx)[-1]
1006            starts = ["{0}".format(x[0]) for x in region[1]]
1007            ends = ["{0}".format(x[1]) for x in region[1]]
1008            starts = ",".join(starts)
1009            ends = ",".join(ends)
1010            file_handle.write(
1011                '{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{1}\t{2}\t0'.format(
1012                    region[0],
1013                    region[1][0][0],
1014                    region[1][-1][1],
1015                    region[2],
1016                    region[5],
1017                    region[4]))
1018            file_handle.write(
1019                '\t{0}\t{1}\t{2}\t{3}'.format(
1020                    len(region[1]),
1021                    ",".join([str(int(y) - int(x)) for x, y in region[1]]),
1022                    ",".join([str(int(x) - int(starts[0])) for x, y in region[1]]),
1023                    self.matrix.group_labels[label_idx]))
1024            if self.matrix.silhouette is not None:
1025                file_handle.write("\t{}".format(self.matrix.silhouette[idx]))
1026            file_handle.write("\n")
1027        file_handle.close()
1028
1029    @staticmethod
1030    def matrix_avg(matrix, avgType='mean'):
1031        matrix = np.ma.masked_invalid(matrix)
1032        return np.ma.__getattribute__(avgType)(matrix, axis=0)
1033
1034    def get_individual_matrices(self, matrix):
1035        """In case multiple matrices are saved one after the other
1036        this method splits them appart.
1037        Returns a list containing the matrices
1038        """
1039        num_cols = matrix.shape[1]
1040        num_ind_cols = self.get_num_individual_matrix_cols()
1041        matrices_list = []
1042        for i in range(0, num_cols, num_ind_cols):
1043            if i + num_ind_cols > num_cols:
1044                break
1045            matrices_list.append(matrix[:, i:i + num_ind_cols])
1046        return matrices_list
1047
1048    def get_num_individual_matrix_cols(self):
1049        """
1050        returns the number of columns  that
1051        each matrix should have. This is done because
1052        the final matrix that is plotted can be composed
1053        of smaller matrices that are merged one after
1054        the other.
1055        """
1056        matrixCols = ((self.parameters['downstream'] + self.parameters['upstream'] + self.parameters['body'] + self.parameters['unscaled 5 prime'] + self.parameters['unscaled 3 prime']) //
1057                      self.parameters['bin size'])
1058
1059        return matrixCols
1060
1061
1062def computeSilhouetteScore(d, idx, labels):
1063    """
1064    Given a square distance matrix with NaN diagonals, compute the silhouette score
1065    of a given row (idx). Each row should have an associated label (labels).
1066    """
1067    keep = ~np.isnan(d[idx, ])
1068    foo = np.bincount(labels[keep], weights=d[idx, ][keep])
1069    groupSizes = np.bincount(labels[keep])
1070    intraIdx = labels[idx]
1071    if groupSizes[intraIdx] == 1:
1072        return 0
1073    intra = foo[labels[idx]] / groupSizes[intraIdx]
1074    interMask = np.arange(len(foo))[np.arange(len(foo)) != labels[idx]]
1075    inter = np.min(foo[interMask] / groupSizes[interMask])
1076    return (inter - intra) / max(inter, intra)
1077
1078
1079class _matrix(object):
1080    """
1081    class to hold heatmapper matrices
1082    The base data is a large matrix
1083    with definition to know the boundaries for row and col divisions.
1084    Col divisions represent groups within a subset, e.g. Active and
1085    inactive from PolII bigwig data.
1086
1087    Row division represent different samples, for example
1088    PolII in males vs. PolII in females.
1089
1090    This is an internal class of the heatmapper class
1091    """
1092
1093    def __init__(self, regions, matrix, group_boundaries, sample_boundaries,
1094                 group_labels=None, sample_labels=None):
1095
1096        # simple checks
1097        assert matrix.shape[0] == group_boundaries[-1], \
1098            "row max do not match matrix shape"
1099        assert matrix.shape[1] == sample_boundaries[-1], \
1100            "col max do not match matrix shape"
1101
1102        self.regions = regions
1103        self.matrix = matrix
1104        self.group_boundaries = group_boundaries
1105        self.sample_boundaries = sample_boundaries
1106        self.sort_method = None
1107        self.sort_using = None
1108        self.silhouette = None
1109
1110        if group_labels is None:
1111            self.group_labels = ['group {}'.format(x)
1112                                 for x in range(len(group_boundaries) - 1)]
1113        else:
1114            assert len(group_labels) == len(group_boundaries) - 1, \
1115                "number of group labels does not match number of groups"
1116            self.group_labels = group_labels
1117
1118        if sample_labels is None:
1119            self.sample_labels = ['sample {}'.format(x)
1120                                  for x in range(len(sample_boundaries) - 1)]
1121        else:
1122            assert len(sample_labels) == len(sample_boundaries) - 1, \
1123                "number of sample labels does not match number of samples"
1124            self.sample_labels = sample_labels
1125
1126    def get_matrix(self, group, sample):
1127        """
1128        Returns a sub matrix from the large
1129        matrix. Group and sample are ids,
1130        thus, row = 0, col=0 get the first group
1131        of the first sample.
1132
1133        Returns
1134        -------
1135        dictionary containing the matrix,
1136        the group label and the sample label
1137        """
1138        group_start = self.group_boundaries[group]
1139        group_end = self.group_boundaries[group + 1]
1140        sample_start = self.sample_boundaries[sample]
1141        sample_end = self.sample_boundaries[sample + 1]
1142
1143        return {'matrix': np.ma.masked_invalid(self.matrix[group_start:group_end, :][:, sample_start:sample_end]),
1144                'group': self.group_labels[group],
1145                'sample': self.sample_labels[sample]}
1146
1147    def get_num_samples(self):
1148        return len(self.sample_labels)
1149
1150    def get_num_groups(self):
1151        return len(self.group_labels)
1152
1153    def set_group_labels(self, new_labels):
1154        """ sets new labels for groups
1155        """
1156        if len(new_labels) != len(self.group_labels):
1157            raise ValueError("length new labels != length original labels")
1158        self.group_labels = new_labels
1159
1160    def set_sample_labels(self, new_labels):
1161        """ sets new labels for groups
1162        """
1163        if len(new_labels) != len(self.sample_labels):
1164            raise ValueError("length new labels != length original labels")
1165        self.sample_labels = new_labels
1166
1167    def set_sorting_method(self, sort_method, sort_using):
1168        self.sort_method = sort_method
1169        self.sort_using = sort_using
1170
1171    def get_regions(self):
1172        """Returns the regions per group
1173
1174        Returns
1175        ------
1176        list
1177
1178            Each element of the list is itself a list
1179            of dictionaries containing the regions info:
1180            chrom, start, end, strand, name etc.
1181
1182            Each element of the list corresponds to each
1183            of the groups
1184        """
1185        regions = []
1186        for idx in range(len(self.group_labels)):
1187            start = self.group_boundaries[idx]
1188            end = self.group_boundaries[idx + 1]
1189            regions.append(self.regions[start:end])
1190
1191        return regions
1192
1193    def sort_groups(self, sort_using='mean', sort_method='no', sample_list=None):
1194        """
1195        Sorts and rearranges the submatrices according to the
1196        sorting method given.
1197        """
1198        if sort_method == 'no':
1199            return
1200
1201        if (sample_list is not None) and (len(sample_list) > 0):
1202            # get the ids that correspond to the selected sample list
1203            idx_to_keep = []
1204            for sample_idx in sample_list:
1205                idx_to_keep += range(self.sample_boundaries[sample_idx], self.sample_boundaries[sample_idx + 1])
1206
1207            matrix = self.matrix[:, idx_to_keep]
1208
1209        else:
1210            matrix = self.matrix
1211
1212        # compute the row average:
1213        if sort_using == 'region_length':
1214            matrix_avgs = list()
1215            for x in self.regions:
1216                matrix_avgs.append(np.sum([bar[1] - bar[0] for bar in x[1]]))
1217            matrix_avgs = np.array(matrix_avgs)
1218        elif sort_using == 'mean':
1219            matrix_avgs = np.nanmean(matrix, axis=1)
1220        elif sort_using == 'mean':
1221            matrix_avgs = np.nanmean(matrix, axis=1)
1222        elif sort_using == 'median':
1223            matrix_avgs = np.nanmedian(matrix, axis=1)
1224        elif sort_using == 'max':
1225            matrix_avgs = np.nanmax(matrix, axis=1)
1226        elif sort_using == 'min':
1227            matrix_avgs = np.nanmin(matrix, axis=1)
1228        elif sort_using == 'sum':
1229            matrix_avgs = np.nansum(matrix, axis=1)
1230        else:
1231            sys.exit("{} is an unsupported sorting method".format(sort_using))
1232
1233        # order per group
1234        _sorted_regions = []
1235        _sorted_matrix = []
1236        for idx in range(len(self.group_labels)):
1237            start = self.group_boundaries[idx]
1238            end = self.group_boundaries[idx + 1]
1239            order = matrix_avgs[start:end].argsort()
1240            if sort_method == 'descend':
1241                order = order[::-1]
1242            _sorted_matrix.append(self.matrix[start:end, :][order, :])
1243            # sort the regions
1244            _reg = self.regions[start:end]
1245            for idx in order:
1246                _sorted_regions.append(_reg[idx])
1247
1248        self.matrix = np.vstack(_sorted_matrix)
1249        self.regions = _sorted_regions
1250        self.set_sorting_method(sort_method, sort_using)
1251
1252    def hmcluster(self, k, evaluate_silhouette=True, method='kmeans', clustering_samples=None):
1253        matrix = np.asarray(self.matrix)
1254        matrix_to_cluster = matrix
1255        if clustering_samples is not None:
1256            assert all(i > 0 for i in clustering_samples),\
1257                "all indices should be bigger than or equal to 1."
1258            assert all(i <= len(self.sample_labels) for i in
1259                       clustering_samples),\
1260                "each index should be smaller than or equal to {}(total "\
1261                "number of samples.)".format(len(self.sample_labels))
1262
1263            clustering_samples = np.asarray(clustering_samples) - 1
1264
1265            samples_cols = []
1266            for idx in clustering_samples:
1267                samples_cols += range(self.sample_boundaries[idx],
1268                                      self.sample_boundaries[idx + 1])
1269
1270            matrix_to_cluster = matrix_to_cluster[:, samples_cols]
1271        if np.any(np.isnan(matrix_to_cluster)):
1272            # replace nans for 0 otherwise kmeans produces a weird behaviour
1273            sys.stderr.write("*Warning* For clustering nan values have to be replaced by zeros \n")
1274            matrix_to_cluster[np.isnan(matrix_to_cluster)] = 0
1275
1276        if method == 'kmeans':
1277            from scipy.cluster.vq import vq, kmeans
1278
1279            centroids, _ = kmeans(matrix_to_cluster, k)
1280            # order the centroids in an attempt to
1281            # get the same cluster order
1282            cluster_labels, _ = vq(matrix_to_cluster, centroids)
1283
1284        if method == 'hierarchical':
1285            # normally too slow for large data sets
1286            from scipy.cluster.hierarchy import fcluster, linkage
1287            Z = linkage(matrix_to_cluster, method='ward', metric='euclidean')
1288            cluster_labels = fcluster(Z, k, criterion='maxclust')
1289            # hierarchical clustering labels from 1 .. k
1290            # while k-means labels 0 .. k -1
1291            # Thus, for consistency, we subtract 1
1292            cluster_labels -= 1
1293
1294        # sort clusters
1295        _clustered_mean = []
1296        _cluster_ids_list = []
1297        for cluster in range(k):
1298            cluster_ids = np.flatnonzero(cluster_labels == cluster)
1299            _cluster_ids_list.append(cluster_ids)
1300            _clustered_mean.append(matrix_to_cluster[cluster_ids, :].mean())
1301
1302        # reorder clusters based on mean
1303        cluster_order = np.argsort(_clustered_mean)[::-1]
1304        # create groups using the clustering
1305        self.group_labels = []
1306        self.group_boundaries = [0]
1307        _clustered_regions = []
1308        _clustered_matrix = []
1309        cluster_number = 1
1310        for cluster in cluster_order:
1311            self.group_labels.append("cluster_{}".format(cluster_number))
1312            cluster_number += 1
1313            cluster_ids = _cluster_ids_list[cluster]
1314            self.group_boundaries.append(self.group_boundaries[-1] +
1315                                         len(cluster_ids))
1316            _clustered_matrix.append(self.matrix[cluster_ids, :])
1317            for idx in cluster_ids:
1318                _clustered_regions.append(self.regions[idx])
1319
1320        self.regions = _clustered_regions
1321        self.matrix = np.vstack(_clustered_matrix)
1322
1323        return idx
1324
1325    def computeSilhouette(self, k):
1326        if k > 1:
1327            from scipy.spatial.distance import pdist, squareform
1328
1329            silhouette = np.repeat(0.0, self.group_boundaries[-1])
1330            groupSizes = np.subtract(self.group_boundaries[1:], self.group_boundaries[:-1])
1331            labels = np.repeat(np.arange(k), groupSizes)
1332
1333            d = pdist(self.matrix)
1334            d2 = squareform(d)
1335            np.fill_diagonal(d2, np.nan)  # This excludes the diagonal
1336            for idx in range(len(labels)):
1337                silhouette[idx] = computeSilhouetteScore(d2, idx, labels)
1338            sys.stderr.write("The average silhouette score is: {}\n".format(np.mean(silhouette)))
1339            self.silhouette = silhouette
1340
1341    def removeempty(self):
1342        """
1343        removes matrix rows containing only zeros or nans
1344        """
1345        to_keep = []
1346        score_list = np.ma.masked_invalid(np.mean(self.matrix, axis=1))
1347        for idx, region in enumerate(self.regions):
1348            if np.ma.is_masked(score_list[idx]) or np.float(score_list[idx]) == 0:
1349                continue
1350            else:
1351                to_keep.append(idx)
1352        self.regions = [self.regions[x] for x in to_keep]
1353        self.matrix = self.matrix[to_keep, :]
1354        # adjust sample boundaries
1355        to_keep = np.array(to_keep)
1356        self.group_boundaries = [len(to_keep[to_keep < x]) for x in self.group_boundaries]
1357
1358    def flatten(self):
1359        """
1360        flatten and remove nans from matrix. Useful
1361        to get max and mins from matrix.
1362
1363        :return flattened matrix
1364        """
1365        matrix_flatten = np.asarray(self.matrix.flatten())
1366        # nans are removed from the flattened array
1367        matrix_flatten = matrix_flatten[~np.isnan(matrix_flatten)]
1368        if len(matrix_flatten) == 0:
1369            num_nan = len(np.flatnonzero(np.isnan(self.matrix.flatten())))
1370            raise ValueError("matrix only contains nans "
1371                             "(total nans: {})".format(num_nan))
1372        return matrix_flatten
1373