1###############################################################################
2#
3# binStatistics.py - calculate statistics for each putative genome bin
4#
5###############################################################################
6#                                                                             #
7#    This program is free software: you can redistribute it and/or modify     #
8#    it under the terms of the GNU General Public License as published by     #
9#    the Free Software Foundation, either version 3 of the License, or        #
10#    (at your option) any later version.                                      #
11#                                                                             #
12#    This program is distributed in the hope that it will be useful,          #
13#    but WITHOUT ANY WARRANTY; without even the implied warranty of           #
14#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            #
15#    GNU General Public License for more details.                             #
16#                                                                             #
17#    You should have received a copy of the GNU General Public License        #
18#    along with this program. If not, see <http://www.gnu.org/licenses/>.     #
19#                                                                             #
20###############################################################################
21
22import os
23import sys
24import multiprocessing as mp
25import math
26import logging
27
28from checkm.defaultValues import DefaultValues
29from checkm.util.seqUtils import readFasta, baseCount, calculateN50
30from checkm.common import binIdFromFilename, makeSurePathExists
31from checkm.prodigal import ProdigalGeneFeatureParser
32
33from numpy import mean
34
35
36class BinStatistics():
37    """Statistics for genomes.
38
39    This class calculates statistics for genomes comprised
40    of one or more scaffolds. Mean statistics are weighted by
41    scaffold length. The following statistics are calculated:
42     - bin assignment
43     - mean GC
44     - mean and median scaffold length
45     - N50
46     - mean coverage
47     - mean tetranucleotide signature
48
49    """
50
51    def __init__(self, threads=1):
52        """Initialization.
53
54        Parameters
55        ----------
56        cpus : int
57            Number of cpus to use.
58        """
59
60        self.logger = logging.getLogger('timestamp')
61        self.totalThreads = threads
62
63    def calculate(self, binFiles, outDir, binStatsFile):
64        """Calculate statistics for each putative genome bin."""
65
66        # process each bin
67        self.logger.info("Calculating genome statistics for %d bins with %d threads:" % (len(binFiles), self.totalThreads))
68
69        workerQueue = mp.Queue()
70        writerQueue = mp.Queue()
71
72        for binFile in binFiles:
73            workerQueue.put(binFile)
74
75        for _ in range(self.totalThreads):
76            workerQueue.put(None)
77
78        try:
79            calcProc = [mp.Process(target=self.__processBin, args=(outDir, workerQueue, writerQueue)) for _ in range(self.totalThreads)]
80            writeProc = mp.Process(target=self.__reportProgress, args=(outDir, binStatsFile, len(binFiles), writerQueue))
81
82            writeProc.start()
83
84            for p in calcProc:
85                p.start()
86
87            for p in calcProc:
88                p.join()
89
90            writerQueue.put((None, None))
91            writeProc.join()
92        except:
93            # make sure all processes are terminated
94            for p in calcProc:
95                p.terminate()
96
97            writeProc.terminate()
98
99    def __processBin(self, outDir, queueIn, queueOut):
100        """Thread safe bin processing."""
101        while True:
102            binFile = queueIn.get(block=True, timeout=None)
103            if binFile == None:
104                break
105
106            binStats = {}
107
108            binId = binIdFromFilename(binFile)
109            binDir = os.path.join(outDir, 'bins', binId)
110            makeSurePathExists(binDir)
111
112            # read scaffolds
113            scaffolds = readFasta(binFile)
114
115            # calculate GC statistics
116            GC, stdGC = self.calculateGC(scaffolds)
117            binStats['GC'] = GC
118            binStats['GC std'] = stdGC
119
120            # calculate statistics related to contigs and scaffolds
121            maxScaffoldLen, maxContigLen, genomeSize, scaffold_N50, contig_N50, scaffoldAvgLen, contigAvgLen, numContigs, numAmbiguousBases = self.calculateSeqStats(scaffolds)
122            binStats['Genome size'] = genomeSize
123            binStats['# ambiguous bases'] = numAmbiguousBases
124            binStats['# scaffolds'] = len(scaffolds)
125            binStats['# contigs'] = numContigs
126            binStats['Longest scaffold'] = maxScaffoldLen
127            binStats['Longest contig'] = maxContigLen
128            binStats['N50 (scaffolds)'] = scaffold_N50
129            binStats['N50 (contigs)'] = contig_N50
130            binStats['Mean scaffold length'] = scaffoldAvgLen
131            binStats['Mean contig length'] = contigAvgLen
132
133            # calculate coding density statistics
134            codingDensity, translationTable, numORFs = self.calculateCodingDensity(binDir, scaffolds, genomeSize)
135            binStats['Coding density'] = codingDensity
136            binStats['Translation table'] = translationTable
137            binStats['# predicted genes'] = numORFs
138
139            queueOut.put((binId, binStats))
140
141    def __reportProgress(self, outDir, binStatsFile, numBins, queueIn):
142        """Report number of processed bins and write statistics to file."""
143
144        storagePath = os.path.join(outDir, 'storage')
145        fout = open(os.path.join(storagePath, binStatsFile), 'w')
146
147        numProcessedBins = 0
148        if self.logger.getEffectiveLevel() <= logging.INFO:
149            statusStr = '    Finished processing %d of %d (%.2f%%) bins.' % (numProcessedBins, numBins, float(numProcessedBins) * 100 / numBins)
150            sys.stderr.write('%s\r' % statusStr)
151            sys.stderr.flush()
152
153        while True:
154            binId, curBinStats = queueIn.get(block=True, timeout=None)
155            if binId == None:
156                break
157
158            fout.write(binId + '\t' + str(curBinStats) + '\n')
159            if self.logger.getEffectiveLevel() <= logging.INFO:
160                numProcessedBins += 1
161                statusStr = '    Finished processing %d of %d (%.2f%%) bins.' % (numProcessedBins, numBins, float(numProcessedBins) * 100 / numBins)
162                sys.stderr.write('%s\r' % statusStr)
163                sys.stderr.flush()
164
165        if self.logger.getEffectiveLevel() <= logging.INFO:
166            sys.stderr.write('\n')
167
168        fout.close()
169
170    def calculateGC(self, seqs, seqStats=None):
171        """Calculate fraction of nucleotides that are G or C."""
172        totalGC = 0
173        totalAT = 0
174        gcPerSeq = []
175        for seqId, seq in seqs.iteritems():
176            a, c, g, t = baseCount(seq)
177
178            gc = g + c
179            at = a + t
180
181            totalGC += gc
182            totalAT += at
183
184            if (gc + at) > 0:
185                gcContent = float(gc) / (gc + at)
186            else:
187                gcContent = 0.0
188
189            if seqStats:
190                seqStats[seqId]['GC'] = gcContent
191
192            if len(seq) > DefaultValues.MIN_SEQ_LEN_GC_STD:
193                gcPerSeq.append(gcContent)
194
195        if (totalGC + totalAT) > 0:
196            GC = float(totalGC) / (totalGC + totalAT)
197        else:
198            GC = 0.0
199
200        varGC = 0
201        if len(gcPerSeq) > 1:
202            varGC = mean(map(lambda x: (x - GC) ** 2, gcPerSeq))
203
204        return GC, math.sqrt(varGC)
205
206    def calculateSeqStats(self, scaffolds, seqStats=None):
207        """Calculate scaffold length statistics (min length, max length, total length, N50, # contigs)."""
208        scaffoldLens = []
209        contigLens = []
210        numAmbiguousBases = 0
211        for scaffoldId, scaffold in scaffolds.iteritems():
212            scaffoldLen = len(scaffold)
213            scaffoldLens.append(scaffoldLen)
214
215            splitScaffold = scaffold.split(DefaultValues.CONTIG_BREAK)
216            lenContigsInScaffold = []
217            for contig in splitScaffold:
218                contigLen = len(contig.replace('N', ''))
219                if contigLen > 0:
220                    lenContigsInScaffold.append(contigLen)
221
222            contigLens += lenContigsInScaffold
223
224            if seqStats:
225                seqStats[scaffoldId]['Length'] = scaffoldLen
226                seqStats[scaffoldId]['Total contig length'] = sum(lenContigsInScaffold)
227                seqStats[scaffoldId]['# contigs'] = len(lenContigsInScaffold)
228
229            numAmbiguousBases += scaffold.count('N') + scaffold.count('n')
230
231        scaffold_N50 = calculateN50(scaffoldLens)
232        contig_N50 = calculateN50(contigLens)
233
234        return max(scaffoldLens), max(contigLens), sum(scaffoldLens), scaffold_N50, contig_N50, mean(scaffoldLens), mean(contigLens), len(contigLens), numAmbiguousBases
235
236    def calculateCodingDensity(self, outDir, scaffolds, genomeSize):
237        """Calculate coding density of putative genome bin."""
238        gffFile = os.path.join(outDir, DefaultValues.PRODIGAL_GFF)
239        if os.path.exists(gffFile):
240            prodigalParserGFF = ProdigalGeneFeatureParser(gffFile)
241
242            aaFile = os.path.join(outDir, DefaultValues.PRODIGAL_AA)  # use AA file as nucleotide file is optional
243            aaGenes = readFasta(aaFile)
244
245            codingBasePairs = 0  # self.__calculateCodingBases(aaGenes)
246            for scaffold_id in scaffolds.keys():
247                codingBasePairs += prodigalParserGFF.codingBases(scaffold_id)
248
249            return float(codingBasePairs) / genomeSize, prodigalParserGFF.translationTable, len(aaGenes)
250        else:
251            # there is no gene feature file (perhaps the user specified pre-calculated genes)
252            # so calculating the coding density is not possible
253            return -1, -1, -1
254
255    def __calculateCodingBases(self, aaGenes):
256        """Calculate number of coding bases in a set of genes."""
257        codingBasePairs = 0
258        for _geneId, gene in aaGenes.iteritems():
259            codingBasePairs += len(gene) * 3
260
261        return codingBasePairs
262
263    def sequenceStats(self, outDir, binFile):
264        """Calculate statistics for all sequences within a bin."""
265
266        # read scaffolds
267        seqs = readFasta(binFile)
268
269        seqStats = {}
270        for seqId in seqs:
271            seqStats[seqId] = {}
272
273        self.calculateGC(seqs, seqStats)
274        self.calculateSeqStats(seqs, seqStats)
275
276        binId = binIdFromFilename(binFile)
277        aaFile = os.path.join(outDir, 'bins', binId, DefaultValues.PRODIGAL_AA)
278        if os.path.exists(aaFile):
279            aaGenes = readFasta(aaFile)
280            for geneId, gene in aaGenes.iteritems():
281                seqId = geneId[0:geneId.rfind('_')]
282                seqStats[seqId]['# ORFs'] = seqStats[seqId].get('# ORFs', 0) + 1
283                seqStats[seqId]['Coding bases'] = seqStats[seqId].get('Coding bases', 0) + len(gene) * 3
284        else:
285            # missing amino acid file likely indicates users used a pre-called gene file, so
286            # just set some defaults
287            seqStats[seqId]['# ORFs'] = seqStats[seqId].get('# ORFs', 0) + 1
288            seqStats[seqId]['Coding bases'] = seqStats[seqId].get('Coding bases', 0) + len(gene) * 3
289
290        return seqStats
291