1############################################################################### 2# 3# binComparer.py - compare two sets of bins 4# 5############################################################################### 6# # 7# This program is free software: you can redistribute it and/or modify # 8# it under the terms of the GNU General Public License as published by # 9# the Free Software Foundation, either version 3 of the License, or # 10# (at your option) any later version. # 11# # 12# This program is distributed in the hope that it will be useful, # 13# but WITHOUT ANY WARRANTY; without even the implied warranty of # 14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # 15# GNU General Public License for more details. # 16# # 17# You should have received a copy of the GNU General Public License # 18# along with this program. If not, see <http://www.gnu.org/licenses/>. # 19# # 20############################################################################### 21 22__author__ = "Ben Woodcroft" 23__copyright__ = "Copyright 2014" 24__credits__ = ["Ben Woodcroft"] 25__license__ = "GPL3" 26__maintainer__ = "Ben Woodcroft" 27__email__ = "" 28 29import logging 30import math 31import csv 32 33from common import binIdFromFilename 34from checkm.util.seqUtils import readFasta 35 36 37class UnionBin: 38 def __init__(self, binningIndex, completeness, contamination, binFile): 39 self.binningIndex = binningIndex 40 self.completeness = completeness 41 self.contamination = contamination 42 self.binId = binIdFromFilename(binFile) 43 self.seqs = readFasta(binFile) 44 self.binFile = binFile 45 46 def numBasesOverlapping(self, anotherUnionBin): 47 commonContigs = set(self.seqs.keys()).intersection(anotherUnionBin.seqs.keys()) 48 return sum([len(self.seqs[contig]) for contig in commonContigs]) 49 50 def numBases(self): 51 return sum([len(seq) for seq in self.seqs.values()]) 52 53 def compContSquaredScored(self): 54 # The divide by 100s are not necessary for direct comparison, but have the 55 # advantage of a perfect bin having a score of 1 and terrible bins 0. 56 return self.completeness / 100 * math.pow(1 - (self.contamination / 100), 2) 57 58 59class UnionCheckmQaTsv: 60 def __init__(self, tsvFile): 61 csv.register_dialect('tabffs', delimiter='\t', quoting=csv.QUOTE_NONE) 62 self.binIdToCompleteness = {} 63 self.binIdToContamination = {} 64 with open(tsvFile) as f: 65 for row in csv.DictReader(f, dialect='tabffs'): 66 binId = row['Bin Id'] 67 self.binIdToCompleteness[binId] = float(row['Completeness']) 68 self.binIdToContamination[binId] = float(row['Contamination']) 69 70 def completeness(self, binId): 71 return self.binIdToCompleteness[binId] 72 73 def contamination(self, binId): 74 return self.binIdToContamination[binId] 75 76 77class BinUnion(object): 78 def __init__(self): 79 self.logger = logging.getLogger('timestamp') 80 81 def report(self, binFolders, binFileSets, checkmQaTsvs, unionBinOutputFile, multiplyBinnedContigsFile, minCompleteness=0, maxContamination=0): 82 # Read QA files 83 qas = [UnionCheckmQaTsv(f) for f in checkmQaTsvs] 84 85 bestCandidates = self.getBestCandidates(binFileSets, qas, minCompleteness, maxContamination) 86 87 numBinsBinningWise = [0] * len(binFolders) 88 for candidate in bestCandidates: 89 numBinsBinningWise[candidate.binningIndex] += 1 90 91 for binIndex, numBins in enumerate(numBinsBinningWise): 92 self.logger.info("Kept %i out of %i bins from %s" % (numBins, 93 len(binFileSets[binIndex]), 94 binFolders[binIndex] 95 )) 96 97 with open(multiplyBinnedContigsFile, 'w') as multiplyBinnedOutput: 98 self.printMultiplyBinnedContigs(bestCandidates, multiplyBinnedOutput) 99 100 with open(unionBinOutputFile, 'w') as out: 101 for candidate in bestCandidates: 102 out.write(candidate.binFile) 103 out.write("\n") 104 105 self.logger.info("Wrote %i bins to %s" % (len(bestCandidates), 106 unionBinOutputFile, 107 )) 108 109 def getBestCandidates(self, binFileSets, qas, minCompleteness, maxContamination): 110 111 # Take the first set of bins as the best set yet 112 bestCandidates = [] 113 for f in binFileSets[0]: 114 if qas[0].completeness(binIdFromFilename(f)) >= minCompleteness and qas[0].contamination(binIdFromFilename(f)) <= maxContamination: 115 bestCandidates.append(UnionBin(0, 116 qas[0].completeness(binIdFromFilename(f)), 117 qas[0].contamination(binIdFromFilename(f)), 118 f 119 )) 120 121 # For each bin in the second or after set, 122 for binningIndex, binFileSet in enumerate(binFileSets): 123 if binningIndex == 0: 124 continue 125 126 currentRoundCandidatesToAdd = [] 127 for binFile in binFileSet: 128 # Is it >50% (by sequence) aligned with any of the bins in the best set? 129 binId = binIdFromFilename(binFile) 130 131 if qas[binningIndex].completeness(binId) >= minCompleteness and qas[binningIndex].contamination(binId) <= maxContamination: 132 133 current = UnionBin(binningIndex, 134 qas[binningIndex].completeness(binId), 135 qas[binningIndex].contamination(binId), 136 binFile) 137 138 fiftyPercent = 0.5 * current.numBases() 139 accountedFor = False 140 for i, bestBin in enumerate(bestCandidates): 141 overlap = current.numBasesOverlapping(bestBin) 142 fiftyPercentBest = 0.5 * bestBin.numBases() 143 144 if overlap > fiftyPercent or overlap > fiftyPercentBest: 145 self.logger.debug("Comparing best bin %s and current bin %s, overlap is %i" % (bestBin.binId, current.binId, overlap)) 146 147 if overlap > fiftyPercent and overlap > fiftyPercentBest: 148 accountedFor = True 149 # Choose the best one 150 if current.compContSquaredScored() > bestBin.compContSquaredScored(): 151 self.logger.debug("The newly found bin is better, going with that") 152 # Found a better one, replace the best bin with that 153 bestCandidates[i] = current 154 # There's a bug here, but is sufficiently rare and hard to fix that meh. If a multiple bins have 155 # the same contig, then it is possible that a bin can overlap > 50% with more 156 # than one bin. So by breaking out of this for loop we may not be replacing the 'optimal' 157 # bin. But then should it be a 1:1 swap anyway? meh. 158 break 159 elif overlap > fiftyPercent or overlap > fiftyPercentBest: 160 self.logger.warn("Bins %s and %s with sizes %i and %i overlap by %i bases and so have unusual overlap ratios, proceeding as if they are distinct bins" % ( 161 bestBin.binId, 162 current.binId, 163 bestBin.numBases(), 164 current.numBases(), 165 overlap 166 )) 167 # Bins don't overlap, continue to go through the loop again 168 169 if not accountedFor: 170 currentRoundCandidatesToAdd.append(current) 171 172 # Add all the bins that hit no other bins to the bestCandidates list 173 # Do this after so that bins are not compared to themselves (saves some time?) 174 for b in currentRoundCandidatesToAdd: 175 self.logger.debug("Adding unmatched bin %s from %s" % (b.binId, b.binningIndex)) 176 bestCandidates.append(b) 177 178 return bestCandidates 179 180 def printMultiplyBinnedContigs(self, bestCandidates, multiplyBinnedOutput): 181 contigToBin = {} 182 for binn in bestCandidates: 183 for contigName in binn.seqs.keys(): 184 if contigName in contigToBin: 185 contigToBin[contigName].append(binn) 186 else: 187 contigToBin[contigName] = [binn] 188 189 # IPython.embed() 190 numMultiplyBinnedContigs = 0 191 multiplyBinnedContigsLength = 0 192 for contigName, binList in contigToBin.iteritems(): 193 if len(binList) > 1: 194 numMultiplyBinnedContigs += 1 195 multiplyBinnedContigsLength += len(binList[0].seqs[contigName]) 196 197 for binn in binList: 198 multiplyBinnedOutput.write("\t".join((contigName, 199 str(len(binList[0].seqs[contigName])), 200 binn.binFile 201 ))) 202 multiplyBinnedOutput.write("\n") 203 204 if numMultiplyBinnedContigs != 0: 205 self.logger.warn(" [Warning] Found %i contigs totaling %i bp that are shared between multiple bins" % (numMultiplyBinnedContigs, 206 multiplyBinnedContigsLength)) 207 self.logger.warn(" [Warning] Wrote summary to %s" % multiplyBinnedOutput.name) 208