1###############################################################################
2#
3# merger.py - identify bins with complementary sets of marker genes
4#
5###############################################################################
6#                                                                             #
7#    This program is free software: you can redistribute it and/or modify     #
8#    it under the terms of the GNU General Public License as published by     #
9#    the Free Software Foundation, either version 3 of the License, or        #
10#    (at your option) any later version.                                      #
11#                                                                             #
12#    This program is distributed in the hope that it will be useful,          #
13#    but WITHOUT ANY WARRANTY; without even the implied warranty of           #
14#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            #
15#    GNU General Public License for more details.                             #
16#                                                                             #
17#    You should have received a copy of the GNU General Public License        #
18#    along with this program. If not, see <http://www.gnu.org/licenses/>.     #
19#                                                                             #
20###############################################################################
21
22import os
23import sys
24import logging
25
26from checkm.common import checkDirExists
27from checkm.resultsParser import ResultsParser
28
29
30class Merger():
31    def __init__(self):
32        self.logger = logging.getLogger('timestamp')
33
34    def run(self, binFiles, outDir, hmmTableFile,
35                binIdToModels, binIdToBinMarkerSets,
36                minDeltaComp, maxDeltaCont,
37                minMergedComp, maxMergedCont):
38        checkDirExists(outDir)
39
40        self.logger.info('Comparing marker sets between all pairs of bins.')
41
42        # ensure all bins are using the same marker set
43        markerGenesI = binIdToBinMarkerSets[binIdToBinMarkerSets.keys()[0]].mostSpecificMarkerSet().getMarkerGenes()
44        for binIdJ in binIdToBinMarkerSets:
45            if markerGenesI != binIdToBinMarkerSets[binIdJ].mostSpecificMarkerSet().getMarkerGenes():
46                self.logger.error('  [Error] All bins must use the same marker set to assess potential mergers.')
47                sys.exit(0)
48
49        # parse HMM information
50        resultsParser = ResultsParser(binIdToModels)
51
52        # get HMM hits to each bin
53        resultsParser.parseBinHits(outDir, hmmTableFile)
54
55        # determine union and intersection of marker sets for each pair of bins
56        outputFile = os.path.join(outDir, "merger.tsv")
57        fout = open(outputFile, 'w')
58        fout.write('Bin Id 1\tBin Id 2')
59        fout.write('\tBin 1 completeness\tBin 1 contamination')
60        fout.write('\tBin 2 completeness\tBin 2 contamination')
61        fout.write('\tDelta completeness\tDelta contamination\tMerger delta')
62        fout.write('\tMerged completeness\tMerged contamination\n')
63
64        binMarkerHits = resultsParser.results
65        binIds = sorted(binMarkerHits.keys())
66        for i in xrange(0, len(binMarkerHits)):
67            binIdI = binIds[i]
68
69            geneCountsI = binMarkerHits[binIdI].geneCounts(binIdToBinMarkerSets[binIdI].mostSpecificMarkerSet(), binMarkerHits[binIdI].markerHits, True)
70            completenessI, contaminationI = geneCountsI[6:8]
71
72            for j in xrange(i + 1, len(binMarkerHits)):
73                binIdJ = binIds[j]
74
75                geneCountsJ = binMarkerHits[binIdJ].geneCounts(binIdToBinMarkerSets[binIdJ].mostSpecificMarkerSet(), binMarkerHits[binIdJ].markerHits, True)
76                completenessJ, contaminationJ = geneCountsJ[6:8]
77
78                # merge together hits from both bins and calculate completeness and contamination
79                mergedHits = {}
80                for markerId, hits in binMarkerHits[binIdI].markerHits.iteritems():
81                    mergedHits[markerId] = list(hits)
82
83                for markerId, hits in binMarkerHits[binIdJ].markerHits.iteritems():
84                    if markerId in mergedHits:
85                        mergedHits[markerId].extend(hits)
86                    else:
87                        mergedHits[markerId] = hits
88
89                geneCountsMerged = binMarkerHits[binIdI].geneCounts(binIdToBinMarkerSets[binIdJ].mostSpecificMarkerSet(), mergedHits, True)
90                completenessMerged, contaminationMerged = geneCountsMerged[6:8]
91
92                if not (completenessMerged >= minMergedComp and contaminationMerged < maxMergedCont):
93                    continue
94
95                # calculate merged statistics
96                deltaComp = completenessMerged - max(completenessI, completenessJ)
97                deltaCont = contaminationMerged - max(contaminationI, contaminationJ)
98                delta = deltaComp - deltaCont
99
100                if deltaComp >= minDeltaComp and deltaCont < maxDeltaCont:
101                    fout.write('%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n' %
102                                                                        (binIdI, binIdJ,
103                                                                         completenessI, contaminationI,
104                                                                         completenessJ, contaminationJ,
105                                                                         deltaComp, deltaCont, delta,
106                                                                         completenessMerged, contaminationMerged))
107
108        fout.close()
109
110        return outputFile
111