1############################################################################### 2# 3# binComparer.py - compare two sets of bins 4# 5############################################################################### 6# # 7# This program is free software: you can redistribute it and/or modify # 8# it under the terms of the GNU General Public License as published by # 9# the Free Software Foundation, either version 3 of the License, or # 10# (at your option) any later version. # 11# # 12# This program is distributed in the hope that it will be useful, # 13# but WITHOUT ANY WARRANTY; without even the implied warranty of # 14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # 15# GNU General Public License for more details. # 16# # 17# You should have received a copy of the GNU General Public License # 18# along with this program. If not, see <http://www.gnu.org/licenses/>. # 19# # 20############################################################################### 21 22import logging 23from collections import defaultdict 24 25from common import binIdFromFilename 26from checkm.util.seqUtils import readFasta, readFastaSeqIds 27 28 29class BinComparer(object): 30 def __init__(self): 31 self.logger = logging.getLogger('timestamp') 32 33 def __readBins(self, binFiles): 34 bins = {} 35 for binFile in binFiles: 36 binId = binIdFromFilename(binFile) 37 bins[binId] = set(readFastaSeqIds(binFile)) 38 39 return bins 40 41 def __binningStats(self, bins, seqLens): 42 totalUniqueBinnedSeqs = 0 43 totalUniqueBinnedBases = 0 44 45 binStats = {} 46 processedSeqs = set() 47 repeats = set() 48 for binId, seqs in bins.iteritems(): 49 numBinnedBases = 0 50 for seqId in seqs: 51 numBinnedBases += seqLens[seqId] 52 if seqId not in processedSeqs: 53 processedSeqs.add(seqId) 54 totalUniqueBinnedBases += seqLens[seqId] 55 totalUniqueBinnedSeqs += 1 56 else: 57 repeats.add(seqId) 58 59 binStats[binId] = [len(seqs), numBinnedBases] 60 61 return binStats, totalUniqueBinnedSeqs, totalUniqueBinnedBases, len(repeats) 62 63 def report(self, binFiles1, binFiles2, seqFile, outputFile): 64 # determine total number of sequences 65 self.logger.info('Reading sequences.') 66 seqs = readFasta(seqFile) 67 68 seqLens = {} 69 totalBases = 0 70 numSeq1K = 0 71 totalBases1K = 0 72 numSeq5K = 0 73 totalBases5K = 0 74 for seqId, seq in seqs.iteritems(): 75 seqLen = len(seq) 76 seqLens[seqId] = seqLen 77 totalBases += seqLen 78 if seqLen >= 1000: 79 numSeq1K += 1 80 totalBases1K += seqLen 81 if seqLen >= 5000: 82 numSeq5K += 1 83 totalBases5K += seqLen 84 85 # determine sequences in each bin 86 bins1 = self.__readBins(binFiles1) 87 bins2 = self.__readBins(binFiles2) 88 89 # determine bin stats 90 binStats1, totalUniqueBinnedSeqs1, totalUniqueBinnedBases1, numRepeats1 = self.__binningStats(bins1, seqLens) 91 binStats2, totalUniqueBinnedSeqs2, totalUniqueBinnedBases2, numRepeats2 = self.__binningStats(bins2, seqLens) 92 93 # sort bins by size 94 binStats1 = sorted(binStats1.iteritems(), key=lambda x: x[1][1], reverse=True) 95 binStats2 = sorted(binStats2.iteritems(), key=lambda x: x[1][1], reverse=True) 96 97 # report summary results 98 self.logger.info('Total seqs = %d (%.2f Mbp)' % (len(seqs), float(totalBases) / 1e6)) 99 self.logger.info(' # seqs > 1 kbp = %d (%.2f Mbp)' % (numSeq1K, float(totalBases1K) / 1e6)) 100 self.logger.info(' # seqs > 5 kbp = %d (%.2f Mbp)' % (numSeq5K, float(totalBases5K) / 1e6)) 101 self.logger.info('Binned seqs statistics:') 102 self.logger.info(' 1) # bins: %s, # binned seqs: %d (%.2f%%), # binned bases: %.2f Mbp (%.2f%%), # seqs in multiple bins: %d' 103 % (len(bins1), 104 totalUniqueBinnedSeqs1, 105 float(totalUniqueBinnedSeqs1) * 100 / len(seqs), 106 float(totalUniqueBinnedBases1) / 1e6, 107 float(totalUniqueBinnedBases1) * 100 / totalBases, 108 numRepeats1)) 109 self.logger.info(' 2) # bins: %s, # binned seqs: %d (%.2f%%), # binned bases: %.2f Mbp (%.2f%%), # seqs in multiple bins: %d' 110 % (len(bins2), 111 totalUniqueBinnedSeqs2, 112 float(totalUniqueBinnedSeqs2) * 100 / len(seqs), 113 float(totalUniqueBinnedBases2) / 1e6, 114 float(totalUniqueBinnedBases2) * 100 / totalBases, 115 numRepeats2)) 116 117 # output report 118 fout = open(outputFile, 'w') 119 for data in binStats2: 120 fout.write('\t' + data[0]) 121 fout.write('\tunbinned\t# seqs\t# bases (Mbp)\tBest match\t% bases in common\t% seqs in common\n') 122 123 totalSeqsInCommon2 = defaultdict(int) 124 maxBasesInCommon2 = defaultdict(int) 125 maxSeqsInCommon2 = defaultdict(int) 126 bestMatchingBin2 = {} 127 binnedSeqs2 = defaultdict(set) 128 for data1 in binStats1: 129 binId1 = data1[0] 130 fout.write(binId1) 131 132 seqs1 = bins1[binId1] 133 134 maxBasesInCommon = 0 135 maxSeqsInCommon = 0 136 bestMatchingBin = 'n/a' 137 binnedSeqs = set() 138 for data2 in binStats2: 139 binId2 = data2[0] 140 seqs2 = bins2[binId2] 141 142 seqsInCommon = seqs1.intersection(seqs2) 143 binnedSeqs.update(seqsInCommon) 144 numSeqsInCommon = len(seqsInCommon) 145 fout.write('\t' + str(numSeqsInCommon)) 146 147 basesInCommon = 0 148 for seqId in seqsInCommon: 149 basesInCommon += seqLens[seqId] 150 151 if basesInCommon > maxBasesInCommon: 152 maxBasesInCommon = basesInCommon 153 maxSeqsInCommon = numSeqsInCommon 154 bestMatchingBin = binId2 155 156 if basesInCommon > maxBasesInCommon2[binId2]: 157 maxBasesInCommon2[binId2] = basesInCommon 158 maxSeqsInCommon2[binId2] = numSeqsInCommon 159 bestMatchingBin2[binId2] = binId1 160 161 binnedSeqs2[binId2].update(seqsInCommon) 162 fout.write('\t%d\t%d\t%.2f\t%s\t%.2f\t%.2f\n' % (len(seqs1) - len(binnedSeqs), 163 data1[1][0], 164 float(data1[1][1]) / 1e6, 165 bestMatchingBin, 166 float(maxBasesInCommon) * 100 / data1[1][1], 167 float(maxSeqsInCommon) * 100 / data1[1][0], 168 )) 169 170 fout.write('unbinned') 171 for data in binStats2: 172 binId = data[0] 173 fout.write('\t%d' % (len(bins2[binId]) - len(binnedSeqs2[binId]))) 174 fout.write('\n') 175 176 fout.write('# seqs') 177 for data in binStats2: 178 fout.write('\t%d' % data[1][0]) 179 fout.write('\n') 180 181 fout.write('# bases (Mbp)') 182 for data in binStats2: 183 fout.write('\t%.2f' % (float(data[1][1]) / 1e6)) 184 fout.write('\n') 185 186 fout.write('Best match') 187 for data in binStats2: 188 binId = data[0] 189 fout.write('\t%s' % bestMatchingBin2.get(binId, 'n/a')) 190 fout.write('\n') 191 192 fout.write('% bases in common') 193 for data in binStats2: 194 binId = data[0] 195 fout.write('\t%.2f' % (float(maxBasesInCommon2[binId]) * 100 / data[1][1])) 196 fout.write('\n') 197 198 fout.write('% seqs in common') 199 for data in binStats2: 200 binId = data[0] 201 fout.write('\t%.2f' % (float(maxSeqsInCommon2[binId]) * 100 / data[1][0])) 202 fout.write('\n') 203 204 fout.close() 205