1###############################################################################
2#
3# binComparer.py - compare two sets of bins
4#
5###############################################################################
6#                                                                             #
7#    This program is free software: you can redistribute it and/or modify     #
8#    it under the terms of the GNU General Public License as published by     #
9#    the Free Software Foundation, either version 3 of the License, or        #
10#    (at your option) any later version.                                      #
11#                                                                             #
12#    This program is distributed in the hope that it will be useful,          #
13#    but WITHOUT ANY WARRANTY; without even the implied warranty of           #
14#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            #
15#    GNU General Public License for more details.                             #
16#                                                                             #
17#    You should have received a copy of the GNU General Public License        #
18#    along with this program. If not, see <http://www.gnu.org/licenses/>.     #
19#                                                                             #
20###############################################################################
21
22__author__ = "Ben Woodcroft"
23__copyright__ = "Copyright 2014"
24__credits__ = ["Ben Woodcroft"]
25__license__ = "GPL3"
26__maintainer__ = "Ben Woodcroft"
27__email__ = ""
28
29import logging
30import math
31import csv
32
33from common import binIdFromFilename
34from checkm.util.seqUtils import readFasta
35
36
37class UnionBin:
38    def __init__(self, binningIndex, completeness, contamination, binFile):
39        self.binningIndex = binningIndex
40        self.completeness = completeness
41        self.contamination = contamination
42        self.binId = binIdFromFilename(binFile)
43        self.seqs = readFasta(binFile)
44        self.binFile = binFile
45
46    def numBasesOverlapping(self, anotherUnionBin):
47        commonContigs = set(self.seqs.keys()).intersection(anotherUnionBin.seqs.keys())
48        return sum([len(self.seqs[contig]) for contig in commonContigs])
49
50    def numBases(self):
51        return sum([len(seq) for seq in self.seqs.values()])
52
53    def compContSquaredScored(self):
54        # The divide by 100s are not necessary for direct comparison, but have the
55        # advantage of a perfect bin having a score of 1 and terrible bins 0.
56        return self.completeness / 100 * math.pow(1 - (self.contamination / 100), 2)
57
58
59class UnionCheckmQaTsv:
60    def __init__(self, tsvFile):
61        csv.register_dialect('tabffs', delimiter='\t', quoting=csv.QUOTE_NONE)
62        self.binIdToCompleteness = {}
63        self.binIdToContamination = {}
64        with open(tsvFile) as f:
65            for row in csv.DictReader(f, dialect='tabffs'):
66                binId = row['Bin Id']
67                self.binIdToCompleteness[binId] = float(row['Completeness'])
68                self.binIdToContamination[binId] = float(row['Contamination'])
69
70    def completeness(self, binId):
71        return self.binIdToCompleteness[binId]
72
73    def contamination(self, binId):
74        return self.binIdToContamination[binId]
75
76
77class BinUnion(object):
78    def __init__(self):
79        self.logger = logging.getLogger('timestamp')
80
81    def report(self, binFolders, binFileSets, checkmQaTsvs, unionBinOutputFile, multiplyBinnedContigsFile, minCompleteness=0, maxContamination=0):
82        # Read QA files
83        qas = [UnionCheckmQaTsv(f) for f in checkmQaTsvs]
84
85        bestCandidates = self.getBestCandidates(binFileSets, qas, minCompleteness, maxContamination)
86
87        numBinsBinningWise = [0] * len(binFolders)
88        for candidate in bestCandidates:
89            numBinsBinningWise[candidate.binningIndex] += 1
90
91        for binIndex, numBins in enumerate(numBinsBinningWise):
92            self.logger.info("Kept %i out of %i bins from %s" % (numBins,
93                                                                len(binFileSets[binIndex]),
94                                                                binFolders[binIndex]
95                                                                ))
96
97        with open(multiplyBinnedContigsFile, 'w') as multiplyBinnedOutput:
98            self.printMultiplyBinnedContigs(bestCandidates, multiplyBinnedOutput)
99
100        with open(unionBinOutputFile, 'w') as out:
101            for candidate in bestCandidates:
102                out.write(candidate.binFile)
103                out.write("\n")
104
105        self.logger.info("Wrote %i bins to %s" % (len(bestCandidates),
106                                                  unionBinOutputFile,
107                                                  ))
108
109    def getBestCandidates(self, binFileSets, qas, minCompleteness, maxContamination):
110
111        # Take the first set of bins as the best set yet
112        bestCandidates = []
113        for f in binFileSets[0]:
114            if qas[0].completeness(binIdFromFilename(f)) >= minCompleteness and qas[0].contamination(binIdFromFilename(f)) <= maxContamination:
115                bestCandidates.append(UnionBin(0,
116                                       qas[0].completeness(binIdFromFilename(f)),
117                                       qas[0].contamination(binIdFromFilename(f)),
118                                       f
119                                       ))
120
121        # For each bin in the second or after set,
122        for binningIndex, binFileSet in enumerate(binFileSets):
123            if binningIndex == 0:
124                continue
125
126            currentRoundCandidatesToAdd = []
127            for binFile in binFileSet:
128                # Is it >50% (by sequence) aligned with any of the bins in the best set?
129                binId = binIdFromFilename(binFile)
130
131                if qas[binningIndex].completeness(binId) >= minCompleteness and qas[binningIndex].contamination(binId) <= maxContamination:
132
133                    current = UnionBin(binningIndex,
134                                       qas[binningIndex].completeness(binId),
135                                       qas[binningIndex].contamination(binId),
136                                       binFile)
137
138                    fiftyPercent = 0.5 * current.numBases()
139                    accountedFor = False
140                    for i, bestBin in enumerate(bestCandidates):
141                        overlap = current.numBasesOverlapping(bestBin)
142                        fiftyPercentBest = 0.5 * bestBin.numBases()
143
144                        if overlap > fiftyPercent or overlap > fiftyPercentBest:
145                            self.logger.debug("Comparing best bin %s and current bin %s, overlap is %i" % (bestBin.binId, current.binId, overlap))
146
147                        if overlap > fiftyPercent and overlap > fiftyPercentBest:
148                            accountedFor = True
149                            # Choose the best one
150                            if current.compContSquaredScored() > bestBin.compContSquaredScored():
151                                self.logger.debug("The newly found bin is better, going with that")
152                                # Found a better one, replace the best bin with that
153                                bestCandidates[i] = current
154                                # There's a bug here, but is sufficiently rare and hard to fix that meh. If a multiple bins have
155                                # the same contig, then it is possible that a bin can overlap > 50% with more
156                                # than one bin. So by breaking out of this for loop we may not be replacing the 'optimal'
157                                # bin. But then should it be a 1:1 swap anyway? meh.
158                                break
159                        elif overlap > fiftyPercent or overlap > fiftyPercentBest:
160                            self.logger.warn("Bins %s and %s with sizes %i and %i overlap by %i bases and so have unusual overlap ratios, proceeding as if they are distinct bins" % (
161                                                                                                                                                          bestBin.binId,
162                                                                                                                                                          current.binId,
163                                                                                                                                                          bestBin.numBases(),
164                                                                                                                                                          current.numBases(),
165                                                                                                                                                          overlap
166                                                                                                                                                          ))
167                            # Bins don't overlap, continue to go through the loop again
168
169                    if not accountedFor:
170                        currentRoundCandidatesToAdd.append(current)
171
172            # Add all the bins that hit no other bins to the bestCandidates list
173            # Do this after so that bins are not compared to themselves (saves some time?)
174            for b in currentRoundCandidatesToAdd:
175                self.logger.debug("Adding unmatched bin %s from %s" % (b.binId, b.binningIndex))
176                bestCandidates.append(b)
177
178        return bestCandidates
179
180    def printMultiplyBinnedContigs(self, bestCandidates, multiplyBinnedOutput):
181        contigToBin = {}
182        for binn in bestCandidates:
183            for contigName in binn.seqs.keys():
184                if contigName in contigToBin:
185                    contigToBin[contigName].append(binn)
186                else:
187                    contigToBin[contigName] = [binn]
188
189        # IPython.embed()
190        numMultiplyBinnedContigs = 0
191        multiplyBinnedContigsLength = 0
192        for contigName, binList in contigToBin.iteritems():
193            if len(binList) > 1:
194                numMultiplyBinnedContigs += 1
195                multiplyBinnedContigsLength += len(binList[0].seqs[contigName])
196
197                for binn in binList:
198                    multiplyBinnedOutput.write("\t".join((contigName,
199                                                           str(len(binList[0].seqs[contigName])),
200                                                           binn.binFile
201                                                           )))
202                    multiplyBinnedOutput.write("\n")
203
204        if numMultiplyBinnedContigs != 0:
205            self.logger.warn("   [Warning] Found %i contigs totaling %i bp that are shared between multiple bins" % (numMultiplyBinnedContigs,
206                                                                                                          multiplyBinnedContigsLength))
207            self.logger.warn("   [Warning] Wrote summary to %s" % multiplyBinnedOutput.name)
208