1import numpy as np
2import multiprocessing
3import time
4
5from deeptools import countReadsPerBin
6from deeptools.utilities import getTLen
7from deeptoolsintervals import GTF
8
9
10class SumCoveragePerBin(countReadsPerBin.CountReadsPerBin):
11    r"""This is an extension of CountReadsPerBin for use with plotFingerprint.
12    There, we need to sum the per-base coverage.
13    """
14    def get_coverage_of_region(self, bamHandle, chrom, regions,
15                               fragmentFromRead_func=None):
16        """
17        Returns a numpy array that corresponds to the number of reads
18        that overlap with each tile.
19
20        >>> test = Tester()
21        >>> import pysam
22        >>> c = SumCoveragePerBin([], stepSize=1, extendReads=300)
23
24        For this case the reads are length 36. The number of overlapping
25        read fragments is 4 and 5 for the positions tested. Note that reads are
26        NOT extended, due to there being a 0 length input list of BAM files!
27
28        >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile_PE), 'chr2',
29        ... [(5000833, 5000834), (5000834, 5000835)])
30        array([4., 5.])
31
32        In the following  case the reads length is 50. Reads are not extended.
33
34        >>> c.extendReads=False
35        >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile2), '3R', [(148, 150), (150, 152), (152, 154)])
36        array([2., 4., 4.])
37
38
39        """
40        if not fragmentFromRead_func:
41            fragmentFromRead_func = self.get_fragment_from_read
42        nbins = len(regions)
43        if len(regions[0]) == 3:
44            nbins = 0
45            for reg in regions:
46                nbins += (reg[1] - reg[0]) // reg[2]
47        coverages = np.zeros(nbins, dtype='float64')
48
49        if self.defaultFragmentLength == 'read length':
50            extension = 0
51        else:
52            extension = self.maxPairedFragmentLength
53
54        blackList = None
55        if self.blackListFileName is not None:
56            blackList = GTF(self.blackListFileName)
57
58        vector_start = 0
59        for idx, reg in enumerate(regions):
60            if len(reg) == 3:
61                tileSize = int(reg[2])
62                nRegBins = (reg[1] - reg[0]) // tileSize
63            else:
64                nRegBins = 1
65                tileSize = int(reg[1] - reg[0])
66
67            # Blacklisted regions have a coverage of 0
68            if blackList and blackList.findOverlaps(chrom, reg[0], reg[1]):
69                continue
70            regStart = int(max(0, reg[0] - extension))
71            regEnd = reg[1] + int(extension)
72
73            # If alignments are extended and there's a blacklist, ensure that no
74            # reads originating in a blacklist are fetched
75            if blackList and reg[0] > 0 and extension > 0:
76                o = blackList.findOverlaps(chrom, regStart, reg[0])
77                if o is not None and len(o) > 0:
78                    regStart = o[-1][1]
79                o = blackList.findOverlaps(chrom, reg[1], regEnd)
80                if o is not None and len(o) > 0:
81                    regEnd = o[0][0]
82
83            start_time = time.time()
84            # caching seems faster. TODO: profile the function
85            c = 0
86            try:
87                # BAM input
88                if chrom not in bamHandle.references:
89                    raise NameError("chromosome {} not found in bam file".format(chrom))
90            except:
91                # bigWig input, as used by plotFingerprint
92                if bamHandle.chroms(chrom):
93                    _ = np.array(bamHandle.stats(chrom, regStart, regEnd, type="mean", nBins=nRegBins), dtype=np.float)
94                    _[np.isnan(_)] = 0.0
95                    _ = _ * tileSize
96                    coverages += _
97                    continue
98                else:
99                    raise NameError("chromosome {} not found in bigWig file with chroms {}".format(chrom, bamHandle.chroms()))
100
101            prev_pos = set()
102            lpos = None
103            # of previous processed read pair
104            for read in bamHandle.fetch(chrom, regStart, regEnd):
105                if read.is_unmapped:
106                    continue
107                if self.minMappingQuality and read.mapq < self.minMappingQuality:
108                    continue
109
110                # filter reads based on SAM flag
111                if self.samFlag_include and read.flag & self.samFlag_include != self.samFlag_include:
112                    continue
113                if self.samFlag_exclude and read.flag & self.samFlag_exclude != 0:
114                    continue
115
116                # Fragment lengths
117                tLen = getTLen(read)
118                if self.minFragmentLength > 0 and tLen < self.minFragmentLength:
119                    continue
120                if self.maxFragmentLength > 0 and tLen > self.maxFragmentLength:
121                    continue
122
123                # get rid of duplicate reads that have same position on each of the
124                # pairs
125                if self.ignoreDuplicates:
126                    # Assuming more or less concordant reads, use the fragment bounds, otherwise the start positions
127                    if tLen >= 0:
128                        s = read.pos
129                        e = s + tLen
130                    else:
131                        s = read.pnext
132                        e = s - tLen
133                    if read.reference_id != read.next_reference_id:
134                        e = read.pnext
135                    if lpos is not None and lpos == read.reference_start \
136                            and (s, e, read.next_reference_id, read.is_reverse) in prev_pos:
137                        continue
138                    if lpos != read.reference_start:
139                        prev_pos.clear()
140                    lpos = read.reference_start
141                    prev_pos.add((s, e, read.next_reference_id, read.is_reverse))
142
143                # since reads can be split (e.g. RNA-seq reads) each part of the
144                # read that maps is called a position block.
145                try:
146                    position_blocks = fragmentFromRead_func(read)
147                except TypeError:
148                    # the get_fragment_from_read functions returns None in some cases.
149                    # Those cases are to be skipped, hence the continue line.
150                    continue
151
152                last_eIdx = None
153                for fragmentStart, fragmentEnd in position_blocks:
154                    if fragmentEnd is None or fragmentStart is None:
155                        continue
156                    fragmentLength = fragmentEnd - fragmentStart
157                    if fragmentLength == 0:
158                        continue
159                    # skip reads that are not in the region being
160                    # evaluated.
161                    if fragmentEnd <= reg[0] or fragmentStart >= reg[1]:
162                        continue
163
164                    if fragmentStart < reg[0]:
165                        fragmentStart = reg[0]
166                    if fragmentEnd > reg[0] + len(coverages) * tileSize:
167                        fragmentEnd = reg[0] + len(coverages) * tileSize
168
169                    sIdx = vector_start + max((fragmentStart - reg[0]) // tileSize, 0)
170                    eIdx = vector_start + min(np.ceil(float(fragmentEnd - reg[0]) / tileSize).astype('int'), nRegBins)
171                    if eIdx >= len(coverages):
172                        eIdx = len(coverages) - 1
173                    if last_eIdx is not None:
174                        sIdx = max(last_eIdx, sIdx)
175                        if sIdx >= eIdx:
176                            continue
177
178                    # First bin
179                    if fragmentEnd < reg[0] + (sIdx + 1) * tileSize:
180                        _ = fragmentEnd - fragmentStart
181                    else:
182                        _ = reg[0] + (sIdx + 1) * tileSize - fragmentStart
183                    if _ > tileSize:
184                        _ = tileSize
185                    coverages[sIdx] += _
186                    _ = sIdx + 1
187                    while _ < eIdx:
188                        coverages[_] += tileSize
189                        _ += 1
190                    while eIdx - sIdx >= nRegBins:
191                        eIdx -= 1
192                    if eIdx > sIdx:
193                        _ = fragmentEnd - (reg[0] + eIdx * tileSize)
194                        if _ > tileSize:
195                            _ = tileSize
196                        elif _ < 0:
197                            _ = 0
198                        coverages[eIdx] += _
199                    last_eIdx = eIdx
200
201                c += 1
202
203            if self.verbose:
204                endTime = time.time()
205                print("%s,  processing %s (%.1f per sec) reads @ %s:%s-%s" % (
206                    multiprocessing.current_process().name, c, c / (endTime - start_time), chrom, reg[0], reg[1]))
207
208            vector_start += nRegBins
209
210        # change zeros to NAN
211        if self.zerosToNans:
212            coverages[coverages == 0] = np.nan
213
214        return coverages
215
216
217class Tester(object):
218
219    def __init__(self):
220        """
221        The distribution of reads between the two bam files is as follows.
222
223        They cover 200 bp
224
225          0                              100                           200
226          |------------------------------------------------------------|
227        A                                ===============
228                                                        ===============
229
230
231        B                 ===============               ===============
232                                         ===============
233                                                        ===============
234        """
235        import os
236        self.root = os.path.dirname(os.path.abspath(__file__)) + "/test/test_data/"
237        self.bamFile1 = self.root + "testA.bam"
238        self.bamFile2 = self.root + "testB.bam"
239        self.bamFile_PE = self.root + "test_paired2.bam"
240        self.chrom = '3R'
241