1###############################################################################
2#
3# profile.py - calculate percentage of reads mapped to each bin
4#
5###############################################################################
6#                                                                             #
7#    This program is free software: you can redistribute it and/or modify     #
8#    it under the terms of the GNU General Public License as published by     #
9#    the Free Software Foundation, either version 3 of the License, or        #
10#    (at your option) any later version.                                      #
11#                                                                             #
12#    This program is distributed in the hope that it will be useful,          #
13#    but WITHOUT ANY WARRANTY; without even the implied warranty of           #
14#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            #
15#    GNU General Public License for more details.                             #
16#                                                                             #
17#    You should have received a copy of the GNU General Public License        #
18#    along with this program. If not, see <http://www.gnu.org/licenses/>.     #
19#                                                                             #
20###############################################################################
21
22import logging
23
24import prettytable
25
26from checkm.defaultValues import DefaultValues
27from common import checkFileExists, reassignStdOut, restoreStdOut
28
29
30class Profile():
31    def __init__(self):
32        self.logger = logging.getLogger('timestamp')
33
34    def run(self, coverageFile, outFile, bTabTable):
35        checkFileExists(coverageFile)
36
37        # get number of reads mapped to each bin
38        self.logger.info('Determining number of reads mapped to each bin.')
39
40        readsMappedToBin = {}
41        binSize = {}
42        totalMappedReads = {}
43        bHeader = True
44        for line in open(coverageFile):
45            if bHeader:
46                bHeader = False
47                continue
48
49            lineSplit = line.split('\t')
50
51            # seqId = lineSplit[0]
52            binId = lineSplit[1]
53
54            seqLen = int(lineSplit[2])
55            binSize[binId] = binSize.get(binId, 0) + seqLen
56
57            if binId not in readsMappedToBin:
58                readsMappedToBin[binId] = {}
59
60            for i in xrange(3, len(lineSplit), 3):
61                bamId = lineSplit[i]
62                mappedReads = int(lineSplit[i + 2])
63
64                totalMappedReads[bamId] = totalMappedReads.get(bamId, 0) + mappedReads
65                readsMappedToBin[binId][bamId] = readsMappedToBin[binId].get(bamId, 0) + mappedReads
66
67        # calculate percentage of mapped reads to binned populations
68        perMappedReads = {}
69        normBinCoverage = {}
70        sumNormBinCoverage = {}
71        for binId, bamIds in readsMappedToBin.iteritems():
72            perMappedReads[binId] = {}
73            normBinCoverage[binId] = {}
74
75            for bamId in bamIds:
76                perMR = float(readsMappedToBin[binId][bamId]) / totalMappedReads[bamId]
77                perMappedReads[binId][bamId] = perMR
78
79                if binId == DefaultValues.UNBINNED:
80                    continue
81
82                normCoverage = perMR / binSize[binId]
83                normBinCoverage[binId][bamId] = normCoverage
84                sumNormBinCoverage[bamId] = sumNormBinCoverage.get(bamId, 0) + normCoverage
85
86        for binId, bamIds in normBinCoverage.iteritems():
87            for bamId in bamIds:
88                if sumNormBinCoverage[bamId] != 0:
89                    normBinCoverage[binId][bamId] /= sumNormBinCoverage[bamId]
90                else:
91                    normBinCoverage[binId][bamId] = 0
92
93        # write community profile
94        oldStdOut = reassignStdOut(outFile)
95
96        sortedBinIds = sorted(readsMappedToBin.keys())
97        sortedBamIds = sorted(readsMappedToBin[sortedBinIds[0]].keys())
98
99        header = ['Bin Id', 'Bin size (Mbp)']
100        for bamId in sortedBamIds:
101            header += [bamId + ': mapped reads']
102            header += [bamId + ': % mapped reads']
103            header += [bamId + ': % binned populations']
104            header += [bamId + ': % community']
105
106        if bTabTable:
107            print('\t'.join(header))
108        else:
109            pTable = prettytable.PrettyTable(header)
110            pTable.float_format = '.2'
111            pTable.align = 'c'
112            pTable.align[header[0]] = 'l'
113            pTable.hrules = prettytable.FRAME
114            pTable.vrules = prettytable.NONE
115
116        for binId in sortedBinIds:
117            row = [binId]
118            row += [float(binSize[binId]) / 1e6]
119
120            for bamId in sortedBamIds:
121                row += [readsMappedToBin[binId][bamId]]
122                row += [perMappedReads[binId][bamId] * 100.0]
123
124                if DefaultValues.UNBINNED in perMappedReads:
125                    unbinnedPercentage = perMappedReads[DefaultValues.UNBINNED][bamId]
126                else:
127                    unbinnedPercentage = 0
128
129                if binId == DefaultValues.UNBINNED:
130                    row += ['NA']
131                    row += [unbinnedPercentage * 100.0]
132                else:
133                    row += [normBinCoverage[binId][bamId] * 100.0]
134                    row += [normBinCoverage[binId][bamId] * 100.0 * (1.0 - unbinnedPercentage)]
135
136            if bTabTable:
137                print('\t'.join(map(str, row)))
138            else:
139                pTable.add_row(row)
140
141        if not bTabTable:
142            print(pTable.get_string())
143
144        restoreStdOut(outFile, oldStdOut)
145