import numpy as np
import multiprocessing
import time

from deeptools import countReadsPerBin
from deeptools.utilities import getTLen
from deeptoolsintervals import GTF


class SumCoveragePerBin(countReadsPerBin.CountReadsPerBin):
    r"""This is an extension of CountReadsPerBin for use with plotFingerprint.
    There, we need to sum the per-base coverage.
    """
    def get_coverage_of_region(self, bamHandle, chrom, regions,
                               fragmentFromRead_func=None):
        """
        Returns a numpy array that corresponds to the number of reads
        that overlap with each tile.

        >>> test = Tester()
        >>> import pysam
        >>> c = SumCoveragePerBin([], stepSize=1, extendReads=300)

        For this case the reads are length 36. The number of overlapping
        read fragments is 4 and 5 for the positions tested. Note that reads are
        NOT extended, due to there being a 0 length input list of BAM files!

        >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile_PE), 'chr2',
        ... [(5000833, 5000834), (5000834, 5000835)])
        array([4., 5.])

        In the following case the reads length is 50. Reads are not extended.

        >>> c.extendReads=False
        >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile2), '3R', [(148, 150), (150, 152), (152, 154)])
        array([2., 4., 4.])


        """
        if not fragmentFromRead_func:
            fragmentFromRead_func = self.get_fragment_from_read
        # One bin per region by default; 3-tuple regions carry an explicit
        # tile size in reg[2] and are subdivided into multiple bins.
        nbins = len(regions)
        if len(regions[0]) == 3:
            nbins = 0
            for reg in regions:
                nbins += (reg[1] - reg[0]) // reg[2]
        coverages = np.zeros(nbins, dtype='float64')

        # When fragments are extended we must fetch reads starting up to
        # maxPairedFragmentLength before/after the region of interest.
        if self.defaultFragmentLength == 'read length':
            extension = 0
        else:
            extension = self.maxPairedFragmentLength

        blackList = None
        if self.blackListFileName is not None:
            blackList = GTF(self.blackListFileName)

        vector_start = 0
        for idx, reg in enumerate(regions):
            if len(reg) == 3:
                tileSize = int(reg[2])
                nRegBins = (reg[1] - reg[0]) // tileSize
            else:
                nRegBins = 1
                tileSize = int(reg[1] - reg[0])

            # Blacklisted regions have a coverage of 0
            if blackList and blackList.findOverlaps(chrom, reg[0], reg[1]):
                continue
            regStart = int(max(0, reg[0] - extension))
            regEnd = reg[1] + int(extension)

            # If alignments are extended and there's a blacklist, ensure that no
            # reads originating in a blacklist are fetched
            if blackList and reg[0] > 0 and extension > 0:
                o = blackList.findOverlaps(chrom, regStart, reg[0])
                if o is not None and len(o) > 0:
                    regStart = o[-1][1]
                o = blackList.findOverlaps(chrom, reg[1], regEnd)
                if o is not None and len(o) > 0:
                    regEnd = o[0][0]

            start_time = time.time()
            # caching seems faster. TODO: profile the function
            c = 0
            try:
                # BAM input
                if chrom not in bamHandle.references:
                    raise NameError("chromosome {} not found in bam file".format(chrom))
            except (AttributeError, NameError):
                # bigWig input, as used by plotFingerprint: pyBigWig handles
                # have no .references attribute (AttributeError above).
                if bamHandle.chroms(chrom):
                    # NOTE: np.float was removed in NumPy 1.24; np.float64 is
                    # the equivalent concrete dtype.
                    _ = np.array(bamHandle.stats(chrom, regStart, regEnd, type="mean", nBins=nRegBins), dtype=np.float64)
                    _[np.isnan(_)] = 0.0
                    # stats() returns per-bin means; multiply by the bin width
                    # to recover the summed per-base coverage.
                    _ = _ * tileSize
                    coverages += _
                    continue
                else:
                    raise NameError("chromosome {} not found in bigWig file with chroms {}".format(chrom, bamHandle.chroms()))

            prev_pos = set()
            lpos = None
            # of previous processed read pair
            for read in bamHandle.fetch(chrom, regStart, regEnd):
                if read.is_unmapped:
                    continue
                if self.minMappingQuality and read.mapq < self.minMappingQuality:
                    continue

                # filter reads based on SAM flag
                if self.samFlag_include and read.flag & self.samFlag_include != self.samFlag_include:
                    continue
                if self.samFlag_exclude and read.flag & self.samFlag_exclude != 0:
                    continue

                # Fragment lengths
                tLen = getTLen(read)
                if self.minFragmentLength > 0 and tLen < self.minFragmentLength:
                    continue
                if self.maxFragmentLength > 0 and tLen > self.maxFragmentLength:
                    continue

                # get rid of duplicate reads that have same position on each of the
                # pairs
                if self.ignoreDuplicates:
                    # Assuming more or less concordant reads, use the fragment bounds, otherwise the start positions
                    if tLen >= 0:
                        s = read.pos
                        e = s + tLen
                    else:
                        s = read.pnext
                        e = s - tLen
                    if read.reference_id != read.next_reference_id:
                        e = read.pnext
                    if lpos is not None and lpos == read.reference_start \
                            and (s, e, read.next_reference_id, read.is_reverse) in prev_pos:
                        continue
                    # reads are coordinate-sorted, so the dedup cache can be
                    # reset whenever the start position advances
                    if lpos != read.reference_start:
                        prev_pos.clear()
                    lpos = read.reference_start
                    prev_pos.add((s, e, read.next_reference_id, read.is_reverse))

                # since reads can be split (e.g. RNA-seq reads) each part of the
                # read that maps is called a position block.
                try:
                    position_blocks = fragmentFromRead_func(read)
                except TypeError:
                    # the get_fragment_from_read functions returns None in some cases.
                    # Those cases are to be skipped, hence the continue line.
                    continue

                last_eIdx = None
                for fragmentStart, fragmentEnd in position_blocks:
                    if fragmentEnd is None or fragmentStart is None:
                        continue
                    fragmentLength = fragmentEnd - fragmentStart
                    if fragmentLength == 0:
                        continue
                    # skip reads that are not in the region being
                    # evaluated.
                    if fragmentEnd <= reg[0] or fragmentStart >= reg[1]:
                        continue

                    # clip the fragment to the region/coverage vector bounds
                    if fragmentStart < reg[0]:
                        fragmentStart = reg[0]
                    if fragmentEnd > reg[0] + len(coverages) * tileSize:
                        fragmentEnd = reg[0] + len(coverages) * tileSize

                    sIdx = vector_start + max((fragmentStart - reg[0]) // tileSize, 0)
                    eIdx = vector_start + min(np.ceil(float(fragmentEnd - reg[0]) / tileSize).astype('int'), nRegBins)
                    if eIdx >= len(coverages):
                        eIdx = len(coverages) - 1
                    if last_eIdx is not None:
                        sIdx = max(last_eIdx, sIdx)
                    if sIdx >= eIdx:
                        continue

                    # First bin
                    if fragmentEnd < reg[0] + (sIdx + 1) * tileSize:
                        _ = fragmentEnd - fragmentStart
                    else:
                        _ = reg[0] + (sIdx + 1) * tileSize - fragmentStart
                    if _ > tileSize:
                        _ = tileSize
                    coverages[sIdx] += _
                    # interior bins are fully covered by the fragment
                    _ = sIdx + 1
                    while _ < eIdx:
                        coverages[_] += tileSize
                        _ += 1
                    while eIdx - sIdx >= nRegBins:
                        eIdx -= 1
                    # Last bin gets only the portion of the fragment that
                    # actually falls inside it.
                    if eIdx > sIdx:
                        _ = fragmentEnd - (reg[0] + eIdx * tileSize)
                        if _ > tileSize:
                            _ = tileSize
                        elif _ < 0:
                            _ = 0
                        coverages[eIdx] += _
                    last_eIdx = eIdx

                c += 1

            if self.verbose:
                endTime = time.time()
                print("%s, processing %s (%.1f per sec) reads @ %s:%s-%s" % (
                    multiprocessing.current_process().name, c, c / (endTime - start_time), chrom, reg[0], reg[1]))

            vector_start += nRegBins

        # change zeros to NAN
        if self.zerosToNans:
            coverages[coverages == 0] = np.nan

        return coverages


class Tester(object):

    def __init__(self):
        """
        The distribution of reads between the two bam files is as follows.

        They cover 200 bp

          0                              100                           200
          |------------------------------------------------------------|
        A                                ===============
                                                        ===============


        B                 ===============               ===============
                                         ===============
                                                        ===============
        """
        import os
        self.root = os.path.dirname(os.path.abspath(__file__)) + "/test/test_data/"
        self.bamFile1 = self.root + "testA.bam"
        self.bamFile2 = self.root + "testB.bam"
        self.bamFile_PE = self.root + "test_paired2.bam"
        self.chrom = '3R'