1###############################################################################
2#
3# binComparer.py - compare two sets of bins
4#
5###############################################################################
6#                                                                             #
7#    This program is free software: you can redistribute it and/or modify     #
8#    it under the terms of the GNU General Public License as published by     #
9#    the Free Software Foundation, either version 3 of the License, or        #
10#    (at your option) any later version.                                      #
11#                                                                             #
12#    This program is distributed in the hope that it will be useful,          #
13#    but WITHOUT ANY WARRANTY; without even the implied warranty of           #
14#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            #
15#    GNU General Public License for more details.                             #
16#                                                                             #
17#    You should have received a copy of the GNU General Public License        #
18#    along with this program. If not, see <http://www.gnu.org/licenses/>.     #
19#                                                                             #
20###############################################################################
21
22import logging
23from collections import defaultdict
24
25from common import binIdFromFilename
26from checkm.util.seqUtils import readFasta, readFastaSeqIds
27
28
29class BinComparer(object):
30    def __init__(self):
31        self.logger = logging.getLogger('timestamp')
32
33    def __readBins(self, binFiles):
34        bins = {}
35        for binFile in binFiles:
36            binId = binIdFromFilename(binFile)
37            bins[binId] = set(readFastaSeqIds(binFile))
38
39        return bins
40
41    def __binningStats(self, bins, seqLens):
42        totalUniqueBinnedSeqs = 0
43        totalUniqueBinnedBases = 0
44
45        binStats = {}
46        processedSeqs = set()
47        repeats = set()
48        for binId, seqs in bins.iteritems():
49            numBinnedBases = 0
50            for seqId in seqs:
51                numBinnedBases += seqLens[seqId]
52                if seqId not in processedSeqs:
53                    processedSeqs.add(seqId)
54                    totalUniqueBinnedBases += seqLens[seqId]
55                    totalUniqueBinnedSeqs += 1
56                else:
57                    repeats.add(seqId)
58
59            binStats[binId] = [len(seqs), numBinnedBases]
60
61        return binStats, totalUniqueBinnedSeqs, totalUniqueBinnedBases, len(repeats)
62
63    def report(self, binFiles1, binFiles2, seqFile, outputFile):
64        # determine total number of sequences
65        self.logger.info('Reading sequences.')
66        seqs = readFasta(seqFile)
67
68        seqLens = {}
69        totalBases = 0
70        numSeq1K = 0
71        totalBases1K = 0
72        numSeq5K = 0
73        totalBases5K = 0
74        for seqId, seq in seqs.iteritems():
75            seqLen = len(seq)
76            seqLens[seqId] = seqLen
77            totalBases += seqLen
78            if seqLen >= 1000:
79                numSeq1K += 1
80                totalBases1K += seqLen
81            if seqLen >= 5000:
82                numSeq5K += 1
83                totalBases5K += seqLen
84
85        # determine sequences in each bin
86        bins1 = self.__readBins(binFiles1)
87        bins2 = self.__readBins(binFiles2)
88
89        # determine bin stats
90        binStats1, totalUniqueBinnedSeqs1, totalUniqueBinnedBases1, numRepeats1 = self.__binningStats(bins1, seqLens)
91        binStats2, totalUniqueBinnedSeqs2, totalUniqueBinnedBases2, numRepeats2 = self.__binningStats(bins2, seqLens)
92
93        # sort bins by size
94        binStats1 = sorted(binStats1.iteritems(), key=lambda x: x[1][1], reverse=True)
95        binStats2 = sorted(binStats2.iteritems(), key=lambda x: x[1][1], reverse=True)
96
97        # report summary results
98        self.logger.info('Total seqs = %d (%.2f Mbp)' % (len(seqs), float(totalBases) / 1e6))
99        self.logger.info('  # seqs > 1 kbp = %d (%.2f Mbp)' % (numSeq1K, float(totalBases1K) / 1e6))
100        self.logger.info('  # seqs > 5 kbp = %d (%.2f Mbp)' % (numSeq5K, float(totalBases5K) / 1e6))
101        self.logger.info('Binned seqs statistics:')
102        self.logger.info('  1) # bins: %s, # binned seqs: %d (%.2f%%), # binned bases: %.2f Mbp (%.2f%%), # seqs in multiple bins: %d'
103                                % (len(bins1),
104                                   totalUniqueBinnedSeqs1,
105                                   float(totalUniqueBinnedSeqs1) * 100 / len(seqs),
106                                   float(totalUniqueBinnedBases1) / 1e6,
107                                   float(totalUniqueBinnedBases1) * 100 / totalBases,
108                                   numRepeats1))
109        self.logger.info('  2) # bins: %s, # binned seqs: %d (%.2f%%), # binned bases: %.2f Mbp (%.2f%%), # seqs in multiple bins: %d'
110                                % (len(bins2),
111                                   totalUniqueBinnedSeqs2,
112                                   float(totalUniqueBinnedSeqs2) * 100 / len(seqs),
113                                   float(totalUniqueBinnedBases2) / 1e6,
114                                   float(totalUniqueBinnedBases2) * 100 / totalBases,
115                                   numRepeats2))
116
117        # output report
118        fout = open(outputFile, 'w')
119        for data in binStats2:
120            fout.write('\t' + data[0])
121        fout.write('\tunbinned\t# seqs\t# bases (Mbp)\tBest match\t% bases in common\t% seqs in common\n')
122
123        totalSeqsInCommon2 = defaultdict(int)
124        maxBasesInCommon2 = defaultdict(int)
125        maxSeqsInCommon2 = defaultdict(int)
126        bestMatchingBin2 = {}
127        binnedSeqs2 = defaultdict(set)
128        for data1 in binStats1:
129            binId1 = data1[0]
130            fout.write(binId1)
131
132            seqs1 = bins1[binId1]
133
134            maxBasesInCommon = 0
135            maxSeqsInCommon = 0
136            bestMatchingBin = 'n/a'
137            binnedSeqs = set()
138            for data2 in binStats2:
139                binId2 = data2[0]
140                seqs2 = bins2[binId2]
141
142                seqsInCommon = seqs1.intersection(seqs2)
143                binnedSeqs.update(seqsInCommon)
144                numSeqsInCommon = len(seqsInCommon)
145                fout.write('\t' + str(numSeqsInCommon))
146
147                basesInCommon = 0
148                for seqId in seqsInCommon:
149                    basesInCommon += seqLens[seqId]
150
151                if basesInCommon > maxBasesInCommon:
152                    maxBasesInCommon = basesInCommon
153                    maxSeqsInCommon = numSeqsInCommon
154                    bestMatchingBin = binId2
155
156                if basesInCommon > maxBasesInCommon2[binId2]:
157                    maxBasesInCommon2[binId2] = basesInCommon
158                    maxSeqsInCommon2[binId2] = numSeqsInCommon
159                    bestMatchingBin2[binId2] = binId1
160
161                binnedSeqs2[binId2].update(seqsInCommon)
162            fout.write('\t%d\t%d\t%.2f\t%s\t%.2f\t%.2f\n' % (len(seqs1) - len(binnedSeqs),
163                                                             data1[1][0],
164                                                             float(data1[1][1]) / 1e6,
165                                                             bestMatchingBin,
166                                                             float(maxBasesInCommon) * 100 / data1[1][1],
167                                                             float(maxSeqsInCommon) * 100 / data1[1][0],
168                                                             ))
169
170        fout.write('unbinned')
171        for data in binStats2:
172            binId = data[0]
173            fout.write('\t%d' % (len(bins2[binId]) - len(binnedSeqs2[binId])))
174        fout.write('\n')
175
176        fout.write('# seqs')
177        for data in binStats2:
178            fout.write('\t%d' % data[1][0])
179        fout.write('\n')
180
181        fout.write('# bases (Mbp)')
182        for data in binStats2:
183            fout.write('\t%.2f' % (float(data[1][1]) / 1e6))
184        fout.write('\n')
185
186        fout.write('Best match')
187        for data in binStats2:
188            binId = data[0]
189            fout.write('\t%s' % bestMatchingBin2.get(binId, 'n/a'))
190        fout.write('\n')
191
192        fout.write('% bases in common')
193        for data in binStats2:
194            binId = data[0]
195            fout.write('\t%.2f' % (float(maxBasesInCommon2[binId]) * 100 / data[1][1]))
196        fout.write('\n')
197
198        fout.write('% seqs in common')
199        for data in binStats2:
200            binId = data[0]
201            fout.write('\t%.2f' % (float(maxSeqsInCommon2[binId]) * 100 / data[1][0]))
202        fout.write('\n')
203
204        fout.close()
205