1############################################################################### 2# 3# merger.py - identify bins with complementary sets of marker genes 4# 5############################################################################### 6# # 7# This program is free software: you can redistribute it and/or modify # 8# it under the terms of the GNU General Public License as published by # 9# the Free Software Foundation, either version 3 of the License, or # 10# (at your option) any later version. # 11# # 12# This program is distributed in the hope that it will be useful, # 13# but WITHOUT ANY WARRANTY; without even the implied warranty of # 14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # 15# GNU General Public License for more details. # 16# # 17# You should have received a copy of the GNU General Public License # 18# along with this program. If not, see <http://www.gnu.org/licenses/>. # 19# # 20############################################################################### 21 22import os 23import sys 24import logging 25 26from checkm.common import checkDirExists 27from checkm.resultsParser import ResultsParser 28 29 30class Merger(): 31 def __init__(self): 32 self.logger = logging.getLogger('timestamp') 33 34 def run(self, binFiles, outDir, hmmTableFile, 35 binIdToModels, binIdToBinMarkerSets, 36 minDeltaComp, maxDeltaCont, 37 minMergedComp, maxMergedCont): 38 checkDirExists(outDir) 39 40 self.logger.info('Comparing marker sets between all pairs of bins.') 41 42 # ensure all bins are using the same marker set 43 markerGenesI = binIdToBinMarkerSets[binIdToBinMarkerSets.keys()[0]].mostSpecificMarkerSet().getMarkerGenes() 44 for binIdJ in binIdToBinMarkerSets: 45 if markerGenesI != binIdToBinMarkerSets[binIdJ].mostSpecificMarkerSet().getMarkerGenes(): 46 self.logger.error(' [Error] All bins must use the same marker set to assess potential mergers.') 47 sys.exit(0) 48 49 # parse HMM information 50 resultsParser = ResultsParser(binIdToModels) 51 52 # get HMM hits to each bin 53 resultsParser.parseBinHits(outDir, hmmTableFile) 54 55 # determine union and intersection of marker sets for each pair of bins 56 outputFile = os.path.join(outDir, "merger.tsv") 57 fout = open(outputFile, 'w') 58 fout.write('Bin Id 1\tBin Id 2') 59 fout.write('\tBin 1 completeness\tBin 1 contamination') 60 fout.write('\tBin 2 completeness\tBin 2 contamination') 61 fout.write('\tDelta completeness\tDelta contamination\tMerger delta') 62 fout.write('\tMerged completeness\tMerged contamination\n') 63 64 binMarkerHits = resultsParser.results 65 binIds = sorted(binMarkerHits.keys()) 66 for i in xrange(0, len(binMarkerHits)): 67 binIdI = binIds[i] 68 69 geneCountsI = binMarkerHits[binIdI].geneCounts(binIdToBinMarkerSets[binIdI].mostSpecificMarkerSet(), binMarkerHits[binIdI].markerHits, True) 70 completenessI, contaminationI = geneCountsI[6:8] 71 72 for j in xrange(i + 1, len(binMarkerHits)): 73 binIdJ = binIds[j] 74 75 geneCountsJ = binMarkerHits[binIdJ].geneCounts(binIdToBinMarkerSets[binIdJ].mostSpecificMarkerSet(), binMarkerHits[binIdJ].markerHits, True) 76 completenessJ, contaminationJ = geneCountsJ[6:8] 77 78 # merge together hits from both bins and calculate completeness and contamination 79 mergedHits = {} 80 for markerId, hits in binMarkerHits[binIdI].markerHits.iteritems(): 81 mergedHits[markerId] = list(hits) 82 83 for markerId, hits in binMarkerHits[binIdJ].markerHits.iteritems(): 84 if markerId in mergedHits: 85 mergedHits[markerId].extend(hits) 86 else: 87 mergedHits[markerId] = hits 88 89 geneCountsMerged = binMarkerHits[binIdI].geneCounts(binIdToBinMarkerSets[binIdJ].mostSpecificMarkerSet(), mergedHits, True) 90 completenessMerged, contaminationMerged = geneCountsMerged[6:8] 91 92 if not (completenessMerged >= minMergedComp and contaminationMerged < maxMergedCont): 93 continue 94 95 # calculate merged statistics 96 deltaComp = completenessMerged - max(completenessI, completenessJ) 97 deltaCont = contaminationMerged - max(contaminationI, contaminationJ) 98 delta = deltaComp - deltaCont 99 100 if deltaComp >= minDeltaComp and deltaCont < maxDeltaCont: 101 fout.write('%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n' % 102 (binIdI, binIdJ, 103 completenessI, contaminationI, 104 completenessJ, contaminationJ, 105 deltaComp, deltaCont, delta, 106 completenessMerged, contaminationMerged)) 107 108 fout.close() 109 110 return outputFile 111