1import sys
2import pysam
3from deeptools.mapReduce import mapReduce
4
5
6def countReadsInInterval(args):
7    chrom, start, end, fname, toEOF = args
8
9    bam = openBam(fname)
10    mapped = 0
11    unmapped = 0
12    for b in bam.fetch(chrom, start, end):
13        if chrom == "*":
14            unmapped += 1
15            continue
16        if b.pos < start:
17            continue
18        if not b.is_unmapped:
19            mapped += 1
20        else:
21            unmapped += 1
22    return mapped, unmapped, chrom
23
24
25def getMappingStats(bam, nThreads):
26    """
27    This is used for CRAM files, since idxstats() and .mapped/.unmapped are meaningless
28
29    This requires pysam > 0.13.0
30    """
31    header = [(x, y) for x, y in zip(bam.references, bam.lengths)]
32    res = mapReduce([bam.filename, False], countReadsInInterval, header, numberOfProcessors=nThreads)
33
34    mapped = sum([x[0] for x in res])
35    unmapped = sum([x[1] for x in res])
36    stats = {x[0]: [0, 0] for x in header}
37    for r in res:
38        stats[r[2]][0] += r[0]
39        stats[r[2]][1] += r[1]
40
41    # We need to count the number of unmapped reads as well
42    unmapped += bam.count("*")
43
44    return mapped, unmapped, stats
45
46
47def openBam(bamFile, returnStats=False, nThreads=1, minimalDecoding=True):
48    """
49    A wrapper for opening BAM/CRAM files.
50
51    bamFile: str
52        A BAM/CRAM file name
53
54    returnStats: bool
55        Return a tuple of (file_handle, nMappedReads, nUnmappedReads, statsDict).
56        These additional values are needed by some downstream functions, since one
57        can't use file_handle.mapped on CRAM files (or idxstats())
58
59    nThreads: int
60        If returnStats is True, number of threads to use for computing statistics
61
62    minimalDecoding: Bool
63        For CRAM files, don't decode the read name, sequence, qual, or auxiliary tag fields (these aren't used by most functions).
64
65    Returns either the file handle or a tuple as described in returnStats
66    """
67    format_options = ["required_fields=0x1FF"]
68    if sys.version_info.major >= 3:
69        format_options = [b"required_fields=0x1FF"]
70    if not minimalDecoding:
71        format_options = None
72    try:
73        bam = pysam.Samfile(bamFile, 'rb', format_options=format_options)
74    except IOError:
75        sys.exit("The file '{}' does not exist".format(bamFile))
76    except:
77        sys.exit("The file '{}' does not have BAM or CRAM format ".format(bamFile))
78
79    try:
80        assert(bam.check_index() is not False)
81    except:
82        sys.exit("'{}' does not appear to have an index. You MUST index the file first!".format(bamFile))
83
84    if bam.is_cram and returnStats:
85        mapped, unmapped, stats = getMappingStats(bam, nThreads)
86    elif bam.is_bam:
87        mapped = bam.mapped
88        unmapped = bam.unmapped
89
90        # Make the dictionary to hold the stats
91        if returnStats:
92            stats = {chrom.contig: [chrom.mapped, chrom.unmapped] for chrom in bam.get_index_statistics()}
93
94    if bam.is_bam or (bam.is_cram and returnStats):
95        if mapped == 0:
96            sys.stderr.write("WARNING! '{}' does not have any mapped reads. Please "
97                             "check that the file is properly indexed and "
98                             "that it contains mapped reads.\n".format(bamFile))
99
100    if returnStats:
101        return bam, mapped, unmapped, stats
102    else:
103        return bam
104