1############################################################################### 2# 3# binStatistics.py - calculate statistics for each putative genome bin 4# 5############################################################################### 6# # 7# This program is free software: you can redistribute it and/or modify # 8# it under the terms of the GNU General Public License as published by # 9# the Free Software Foundation, either version 3 of the License, or # 10# (at your option) any later version. # 11# # 12# This program is distributed in the hope that it will be useful, # 13# but WITHOUT ANY WARRANTY; without even the implied warranty of # 14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # 15# GNU General Public License for more details. # 16# # 17# You should have received a copy of the GNU General Public License # 18# along with this program. If not, see <http://www.gnu.org/licenses/>. # 19# # 20############################################################################### 21 22import os 23import sys 24import multiprocessing as mp 25import math 26import logging 27 28from checkm.defaultValues import DefaultValues 29from checkm.util.seqUtils import readFasta, baseCount, calculateN50 30from checkm.common import binIdFromFilename, makeSurePathExists 31from checkm.prodigal import ProdigalGeneFeatureParser 32 33from numpy import mean 34 35 36class BinStatistics(): 37 """Statistics for genomes. 38 39 This class calculates statistics for genomes comprised 40 of one or more scaffolds. Mean statistics are weighted by 41 scaffold length. The following statistics are calculated: 42 - bin assignment 43 - mean GC 44 - mean and median scaffold length 45 - N50 46 - mean coverage 47 - mean tetranucleotide signature 48 49 """ 50 51 def __init__(self, threads=1): 52 """Initialization. 53 54 Parameters 55 ---------- 56 cpus : int 57 Number of cpus to use. 58 """ 59 60 self.logger = logging.getLogger('timestamp') 61 self.totalThreads = threads 62 63 def calculate(self, binFiles, outDir, binStatsFile): 64 """Calculate statistics for each putative genome bin.""" 65 66 # process each bin 67 self.logger.info("Calculating genome statistics for %d bins with %d threads:" % (len(binFiles), self.totalThreads)) 68 69 workerQueue = mp.Queue() 70 writerQueue = mp.Queue() 71 72 for binFile in binFiles: 73 workerQueue.put(binFile) 74 75 for _ in range(self.totalThreads): 76 workerQueue.put(None) 77 78 try: 79 calcProc = [mp.Process(target=self.__processBin, args=(outDir, workerQueue, writerQueue)) for _ in range(self.totalThreads)] 80 writeProc = mp.Process(target=self.__reportProgress, args=(outDir, binStatsFile, len(binFiles), writerQueue)) 81 82 writeProc.start() 83 84 for p in calcProc: 85 p.start() 86 87 for p in calcProc: 88 p.join() 89 90 writerQueue.put((None, None)) 91 writeProc.join() 92 except: 93 # make sure all processes are terminated 94 for p in calcProc: 95 p.terminate() 96 97 writeProc.terminate() 98 99 def __processBin(self, outDir, queueIn, queueOut): 100 """Thread safe bin processing.""" 101 while True: 102 binFile = queueIn.get(block=True, timeout=None) 103 if binFile == None: 104 break 105 106 binStats = {} 107 108 binId = binIdFromFilename(binFile) 109 binDir = os.path.join(outDir, 'bins', binId) 110 makeSurePathExists(binDir) 111 112 # read scaffolds 113 scaffolds = readFasta(binFile) 114 115 # calculate GC statistics 116 GC, stdGC = self.calculateGC(scaffolds) 117 binStats['GC'] = GC 118 binStats['GC std'] = stdGC 119 120 # calculate statistics related to contigs and scaffolds 121 maxScaffoldLen, maxContigLen, genomeSize, scaffold_N50, contig_N50, scaffoldAvgLen, contigAvgLen, numContigs, numAmbiguousBases = self.calculateSeqStats(scaffolds) 122 binStats['Genome size'] = genomeSize 123 binStats['# ambiguous bases'] = numAmbiguousBases 124 binStats['# scaffolds'] = len(scaffolds) 125 binStats['# contigs'] = numContigs 126 binStats['Longest scaffold'] = maxScaffoldLen 127 binStats['Longest contig'] = maxContigLen 128 binStats['N50 (scaffolds)'] = scaffold_N50 129 binStats['N50 (contigs)'] = contig_N50 130 binStats['Mean scaffold length'] = scaffoldAvgLen 131 binStats['Mean contig length'] = contigAvgLen 132 133 # calculate coding density statistics 134 codingDensity, translationTable, numORFs = self.calculateCodingDensity(binDir, scaffolds, genomeSize) 135 binStats['Coding density'] = codingDensity 136 binStats['Translation table'] = translationTable 137 binStats['# predicted genes'] = numORFs 138 139 queueOut.put((binId, binStats)) 140 141 def __reportProgress(self, outDir, binStatsFile, numBins, queueIn): 142 """Report number of processed bins and write statistics to file.""" 143 144 storagePath = os.path.join(outDir, 'storage') 145 fout = open(os.path.join(storagePath, binStatsFile), 'w') 146 147 numProcessedBins = 0 148 if self.logger.getEffectiveLevel() <= logging.INFO: 149 statusStr = ' Finished processing %d of %d (%.2f%%) bins.' % (numProcessedBins, numBins, float(numProcessedBins) * 100 / numBins) 150 sys.stderr.write('%s\r' % statusStr) 151 sys.stderr.flush() 152 153 while True: 154 binId, curBinStats = queueIn.get(block=True, timeout=None) 155 if binId == None: 156 break 157 158 fout.write(binId + '\t' + str(curBinStats) + '\n') 159 if self.logger.getEffectiveLevel() <= logging.INFO: 160 numProcessedBins += 1 161 statusStr = ' Finished processing %d of %d (%.2f%%) bins.' % (numProcessedBins, numBins, float(numProcessedBins) * 100 / numBins) 162 sys.stderr.write('%s\r' % statusStr) 163 sys.stderr.flush() 164 165 if self.logger.getEffectiveLevel() <= logging.INFO: 166 sys.stderr.write('\n') 167 168 fout.close() 169 170 def calculateGC(self, seqs, seqStats=None): 171 """Calculate fraction of nucleotides that are G or C.""" 172 totalGC = 0 173 totalAT = 0 174 gcPerSeq = [] 175 for seqId, seq in seqs.iteritems(): 176 a, c, g, t = baseCount(seq) 177 178 gc = g + c 179 at = a + t 180 181 totalGC += gc 182 totalAT += at 183 184 if (gc + at) > 0: 185 gcContent = float(gc) / (gc + at) 186 else: 187 gcContent = 0.0 188 189 if seqStats: 190 seqStats[seqId]['GC'] = gcContent 191 192 if len(seq) > DefaultValues.MIN_SEQ_LEN_GC_STD: 193 gcPerSeq.append(gcContent) 194 195 if (totalGC + totalAT) > 0: 196 GC = float(totalGC) / (totalGC + totalAT) 197 else: 198 GC = 0.0 199 200 varGC = 0 201 if len(gcPerSeq) > 1: 202 varGC = mean(map(lambda x: (x - GC) ** 2, gcPerSeq)) 203 204 return GC, math.sqrt(varGC) 205 206 def calculateSeqStats(self, scaffolds, seqStats=None): 207 """Calculate scaffold length statistics (min length, max length, total length, N50, # contigs).""" 208 scaffoldLens = [] 209 contigLens = [] 210 numAmbiguousBases = 0 211 for scaffoldId, scaffold in scaffolds.iteritems(): 212 scaffoldLen = len(scaffold) 213 scaffoldLens.append(scaffoldLen) 214 215 splitScaffold = scaffold.split(DefaultValues.CONTIG_BREAK) 216 lenContigsInScaffold = [] 217 for contig in splitScaffold: 218 contigLen = len(contig.replace('N', '')) 219 if contigLen > 0: 220 lenContigsInScaffold.append(contigLen) 221 222 contigLens += lenContigsInScaffold 223 224 if seqStats: 225 seqStats[scaffoldId]['Length'] = scaffoldLen 226 seqStats[scaffoldId]['Total contig length'] = sum(lenContigsInScaffold) 227 seqStats[scaffoldId]['# contigs'] = len(lenContigsInScaffold) 228 229 numAmbiguousBases += scaffold.count('N') + scaffold.count('n') 230 231 scaffold_N50 = calculateN50(scaffoldLens) 232 contig_N50 = calculateN50(contigLens) 233 234 return max(scaffoldLens), max(contigLens), sum(scaffoldLens), scaffold_N50, contig_N50, mean(scaffoldLens), mean(contigLens), len(contigLens), numAmbiguousBases 235 236 def calculateCodingDensity(self, outDir, scaffolds, genomeSize): 237 """Calculate coding density of putative genome bin.""" 238 gffFile = os.path.join(outDir, DefaultValues.PRODIGAL_GFF) 239 if os.path.exists(gffFile): 240 prodigalParserGFF = ProdigalGeneFeatureParser(gffFile) 241 242 aaFile = os.path.join(outDir, DefaultValues.PRODIGAL_AA) # use AA file as nucleotide file is optional 243 aaGenes = readFasta(aaFile) 244 245 codingBasePairs = 0 # self.__calculateCodingBases(aaGenes) 246 for scaffold_id in scaffolds.keys(): 247 codingBasePairs += prodigalParserGFF.codingBases(scaffold_id) 248 249 return float(codingBasePairs) / genomeSize, prodigalParserGFF.translationTable, len(aaGenes) 250 else: 251 # there is no gene feature file (perhaps the user specified pre-calculated genes) 252 # so calculating the coding density is not possible 253 return -1, -1, -1 254 255 def __calculateCodingBases(self, aaGenes): 256 """Calculate number of coding bases in a set of genes.""" 257 codingBasePairs = 0 258 for _geneId, gene in aaGenes.iteritems(): 259 codingBasePairs += len(gene) * 3 260 261 return codingBasePairs 262 263 def sequenceStats(self, outDir, binFile): 264 """Calculate statistics for all sequences within a bin.""" 265 266 # read scaffolds 267 seqs = readFasta(binFile) 268 269 seqStats = {} 270 for seqId in seqs: 271 seqStats[seqId] = {} 272 273 self.calculateGC(seqs, seqStats) 274 self.calculateSeqStats(seqs, seqStats) 275 276 binId = binIdFromFilename(binFile) 277 aaFile = os.path.join(outDir, 'bins', binId, DefaultValues.PRODIGAL_AA) 278 if os.path.exists(aaFile): 279 aaGenes = readFasta(aaFile) 280 for geneId, gene in aaGenes.iteritems(): 281 seqId = geneId[0:geneId.rfind('_')] 282 seqStats[seqId]['# ORFs'] = seqStats[seqId].get('# ORFs', 0) + 1 283 seqStats[seqId]['Coding bases'] = seqStats[seqId].get('Coding bases', 0) + len(gene) * 3 284 else: 285 # missing amino acid file likely indicates users used a pre-called gene file, so 286 # just set some defaults 287 seqStats[seqId]['# ORFs'] = seqStats[seqId].get('# ORFs', 0) + 1 288 seqStats[seqId]['Coding bases'] = seqStats[seqId].get('Coding bases', 0) + len(gene) * 3 289 290 return seqStats 291