1###############################################################################
2#
3# unbinned.py - identify unbinned sequences
4#
5###############################################################################
6#                                                                             #
7#    This program is free software: you can redistribute it and/or modify     #
8#    it under the terms of the GNU General Public License as published by     #
9#    the Free Software Foundation, either version 3 of the License, or        #
10#    (at your option) any later version.                                      #
11#                                                                             #
12#    This program is distributed in the hope that it will be useful,          #
13#    but WITHOUT ANY WARRANTY; without even the implied warranty of           #
14#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            #
15#    GNU General Public License for more details.                             #
16#                                                                             #
17#    You should have received a copy of the GNU General Public License        #
18#    along with this program. If not, see <http://www.gnu.org/licenses/>.     #
19#                                                                             #
20###############################################################################
21
22import logging
23
24from checkm.common import checkFileExists
25
26from checkm.util.seqUtils import baseCount, readFasta
27
28
29class Unbinned():
30    def __init__(self):
31        self.logger = logging.getLogger('timestamp')
32
33    def run(self, binFiles, seqFile, outSeqFile, outStatsFile, minSeqLen):
34        checkFileExists(seqFile)
35
36        # get list of sequences in bins
37        self.logger.info('Reading binned sequences.')
38
39        binnedSeqs = {}
40        totalBinnedBases = 0
41        for binFile in binFiles:
42            seqs = readFasta(binFile)
43            binnedSeqs.update(seqs)
44            for seq in seqs.values():
45                totalBinnedBases += len(seq)
46
47        self.logger.info('  Read %d (%.2f Mbp) binned sequences.' % (len(binnedSeqs), float(totalBinnedBases) / 1e6))
48
49        # get list of all sequences
50        self.logger.info('Reading all sequences.')
51        allSeqs = readFasta(seqFile)
52        totalBases = 0
53        for seq in allSeqs.values():
54            totalBases += len(seq)
55        self.logger.info('  Read %d (%.2f Mbp) sequences.' % (len(allSeqs), float(totalBases) / 1e6))
56
57        # write all unbinned sequences
58        self.logger.info('Identifying unbinned sequences >= %d bp.' % minSeqLen)
59        seqOut = open(outSeqFile, 'w')
60
61        statsOut = open(outStatsFile, 'w')
62        statsOut.write('Sequence Id\tLength\tGC\n')
63
64        unbinnedCount = 0
65        unbinnedBases = 0
66        for seqId, seq in allSeqs.iteritems():
67            if seqId not in binnedSeqs:
68                if len(seq) >= minSeqLen:
69                    unbinnedCount += 1
70                    seqOut.write('>' + seqId + '\n')
71                    seqOut.write(seq + '\n')
72
73                    unbinnedBases += len(seq)
74
75                    a, c, g, t = baseCount(seq)
76
77                    statsOut.write('%s\t%d\t%.2f\n' % (seqId, len(seq), float(g + c) * 100 / (a + c + g + t)))
78
79        seqOut.close()
80        statsOut.close()
81
82        self.logger.info('  Identified %d (%.2f Mbp) unbinned sequences.' % (unbinnedCount, float(unbinnedBases) / 1e6))
83
84        self.logger.info('Percentage of unbinned sequences: %.2f%%' % (unbinnedCount * 100.0 / len(allSeqs)))
85        self.logger.info('Percentage of unbinned bases: %.2f%%' % (unbinnedBases * 100.0 / totalBases))
86