1############################################################################### 2# 3# unbinned.py - identify unbinned sequences 4# 5############################################################################### 6# # 7# This program is free software: you can redistribute it and/or modify # 8# it under the terms of the GNU General Public License as published by # 9# the Free Software Foundation, either version 3 of the License, or # 10# (at your option) any later version. # 11# # 12# This program is distributed in the hope that it will be useful, # 13# but WITHOUT ANY WARRANTY; without even the implied warranty of # 14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # 15# GNU General Public License for more details. # 16# # 17# You should have received a copy of the GNU General Public License # 18# along with this program. If not, see <http://www.gnu.org/licenses/>. # 19# # 20############################################################################### 21 22import logging 23 24from checkm.common import checkFileExists 25 26from checkm.util.seqUtils import baseCount, readFasta 27 28 29class Unbinned(): 30 def __init__(self): 31 self.logger = logging.getLogger('timestamp') 32 33 def run(self, binFiles, seqFile, outSeqFile, outStatsFile, minSeqLen): 34 checkFileExists(seqFile) 35 36 # get list of sequences in bins 37 self.logger.info('Reading binned sequences.') 38 39 binnedSeqs = {} 40 totalBinnedBases = 0 41 for binFile in binFiles: 42 seqs = readFasta(binFile) 43 binnedSeqs.update(seqs) 44 for seq in seqs.values(): 45 totalBinnedBases += len(seq) 46 47 self.logger.info(' Read %d (%.2f Mbp) binned sequences.' % (len(binnedSeqs), float(totalBinnedBases) / 1e6)) 48 49 # get list of all sequences 50 self.logger.info('Reading all sequences.') 51 allSeqs = readFasta(seqFile) 52 totalBases = 0 53 for seq in allSeqs.values(): 54 totalBases += len(seq) 55 self.logger.info(' Read %d (%.2f Mbp) sequences.' % (len(allSeqs), float(totalBases) / 1e6)) 56 57 # write all unbinned sequences 58 self.logger.info('Identifying unbinned sequences >= %d bp.' % minSeqLen) 59 seqOut = open(outSeqFile, 'w') 60 61 statsOut = open(outStatsFile, 'w') 62 statsOut.write('Sequence Id\tLength\tGC\n') 63 64 unbinnedCount = 0 65 unbinnedBases = 0 66 for seqId, seq in allSeqs.iteritems(): 67 if seqId not in binnedSeqs: 68 if len(seq) >= minSeqLen: 69 unbinnedCount += 1 70 seqOut.write('>' + seqId + '\n') 71 seqOut.write(seq + '\n') 72 73 unbinnedBases += len(seq) 74 75 a, c, g, t = baseCount(seq) 76 77 statsOut.write('%s\t%d\t%.2f\n' % (seqId, len(seq), float(g + c) * 100 / (a + c + g + t))) 78 79 seqOut.close() 80 statsOut.close() 81 82 self.logger.info(' Identified %d (%.2f Mbp) unbinned sequences.' % (unbinnedCount, float(unbinnedBases) / 1e6)) 83 84 self.logger.info('Percentage of unbinned sequences: %.2f%%' % (unbinnedCount * 100.0 / len(allSeqs))) 85 self.logger.info('Percentage of unbinned bases: %.2f%%' % (unbinnedBases * 100.0 / totalBases)) 86