1import sys 2import pysam 3from deeptools.mapReduce import mapReduce 4 5 6def countReadsInInterval(args): 7 chrom, start, end, fname, toEOF = args 8 9 bam = openBam(fname) 10 mapped = 0 11 unmapped = 0 12 for b in bam.fetch(chrom, start, end): 13 if chrom == "*": 14 unmapped += 1 15 continue 16 if b.pos < start: 17 continue 18 if not b.is_unmapped: 19 mapped += 1 20 else: 21 unmapped += 1 22 return mapped, unmapped, chrom 23 24 25def getMappingStats(bam, nThreads): 26 """ 27 This is used for CRAM files, since idxstats() and .mapped/.unmapped are meaningless 28 29 This requires pysam > 0.13.0 30 """ 31 header = [(x, y) for x, y in zip(bam.references, bam.lengths)] 32 res = mapReduce([bam.filename, False], countReadsInInterval, header, numberOfProcessors=nThreads) 33 34 mapped = sum([x[0] for x in res]) 35 unmapped = sum([x[1] for x in res]) 36 stats = {x[0]: [0, 0] for x in header} 37 for r in res: 38 stats[r[2]][0] += r[0] 39 stats[r[2]][1] += r[1] 40 41 # We need to count the number of unmapped reads as well 42 unmapped += bam.count("*") 43 44 return mapped, unmapped, stats 45 46 47def openBam(bamFile, returnStats=False, nThreads=1, minimalDecoding=True): 48 """ 49 A wrapper for opening BAM/CRAM files. 50 51 bamFile: str 52 A BAM/CRAM file name 53 54 returnStats: bool 55 Return a tuple of (file_handle, nMappedReads, nUnmappedReads, statsDict). 56 These additional values are needed by some downstream functions, since one 57 can't use file_handle.mapped on CRAM files (or idxstats()) 58 59 nThreads: int 60 If returnStats is True, number of threads to use for computing statistics 61 62 minimalDecoding: Bool 63 For CRAM files, don't decode the read name, sequence, qual, or auxiliary tag fields (these aren't used by most functions). 64 65 Returns either the file handle or a tuple as described in returnStats 66 """ 67 format_options = ["required_fields=0x1FF"] 68 if sys.version_info.major >= 3: 69 format_options = [b"required_fields=0x1FF"] 70 if not minimalDecoding: 71 format_options = None 72 try: 73 bam = pysam.Samfile(bamFile, 'rb', format_options=format_options) 74 except IOError: 75 sys.exit("The file '{}' does not exist".format(bamFile)) 76 except: 77 sys.exit("The file '{}' does not have BAM or CRAM format ".format(bamFile)) 78 79 try: 80 assert(bam.check_index() is not False) 81 except: 82 sys.exit("'{}' does not appear to have an index. You MUST index the file first!".format(bamFile)) 83 84 if bam.is_cram and returnStats: 85 mapped, unmapped, stats = getMappingStats(bam, nThreads) 86 elif bam.is_bam: 87 mapped = bam.mapped 88 unmapped = bam.unmapped 89 90 # Make the dictionary to hold the stats 91 if returnStats: 92 stats = {chrom.contig: [chrom.mapped, chrom.unmapped] for chrom in bam.get_index_statistics()} 93 94 if bam.is_bam or (bam.is_cram and returnStats): 95 if mapped == 0: 96 sys.stderr.write("WARNING! '{}' does not have any mapped reads. Please " 97 "check that the file is properly indexed and " 98 "that it contains mapped reads.\n".format(bamFile)) 99 100 if returnStats: 101 return bam, mapped, unmapped, stats 102 else: 103 return bam 104