1###############################################################################
2#
3# ssuFinder.py - identify SSU (16S rRNA) in genome bins
4#
5###############################################################################
6#                                                                             #
7#    This program is free software: you can redistribute it and/or modify     #
8#    it under the terms of the GNU General Public License as published by     #
9#    the Free Software Foundation, either version 3 of the License, or        #
10#    (at your option) any later version.                                      #
11#                                                                             #
12#    This program is distributed in the hope that it will be useful,          #
13#    but WITHOUT ANY WARRANTY; without even the implied warranty of           #
14#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            #
15#    GNU General Public License for more details.                             #
16#                                                                             #
17#    You should have received a copy of the GNU General Public License        #
18#    along with this program. If not, see <http://www.gnu.org/licenses/>.     #
19#                                                                             #
20###############################################################################
21
22import os
23import logging
24
25from checkm.defaultValues import DefaultValues
26from checkm.common import binIdFromFilename
27from checkm.util.seqUtils import readFasta, readFastaSeqIds
28
29
30class SSU_Finder(object):
31    def __init__(self, threads):
32        self.logger = logging.getLogger('timestamp')
33
34        self.totalThreads = threads
35
36        self.bacteriaModelFile = os.path.join(DefaultValues.CHECKM_DATA_DIR, 'hmms_ssu', 'SSU_bacteria.hmm')
37        self.archaeaModelFile = os.path.join(DefaultValues.CHECKM_DATA_DIR, 'hmms_ssu', 'SSU_archaea.hmm')
38        self.eukModelFile = os.path.join(DefaultValues.CHECKM_DATA_DIR, 'hmms_ssu', 'SSU_euk.hmm')
39
40    def __hmmSearch(self, seqFile, evalue, outputPrefix):
41        if seqFile.endswith('gz'):
42            pipe = 'zcat ' + seqFile + ' | '
43        else:
44            pipe = 'cat ' + seqFile + ' | '
45
46        self.logger.info('Identifying bacterial 16S.')
47        os.system(pipe + 'nhmmer --noali --cpu ' + str(self.totalThreads) + ' -o ' + outputPrefix + '.bacteria.txt --tblout ' + outputPrefix + '.bacteria.table.txt -E ' + str(evalue) + ' ' + self.bacteriaModelFile + ' -')
48
49        self.logger.info('Identifying archaeal 16S.')
50        os.system(pipe + 'nhmmer --noali --cpu ' + str(self.totalThreads) + ' -o ' + outputPrefix + '.archaea.txt --tblout ' + outputPrefix + '.archaea.table.txt -E ' + str(evalue) + ' ' + self.archaeaModelFile + ' -')
51
52        self.logger.info('Identifying eukaryotic 18S.')
53        os.system(pipe + 'nhmmer --noali --cpu ' + str(self.totalThreads) + ' -o ' + outputPrefix + '.euk.txt --tblout ' + outputPrefix + '.euk.table.txt -E ' + str(evalue) + ' ' + self.eukModelFile + ' -')
54
55    def __readHits(self, resultsFile, domain, evalueThreshold):
56        """Parse hits from nhmmer output."""
57        seqInfo = {}
58
59        bReadHit = False
60        for line in open(resultsFile):
61            if line[0:2] == '>>':
62                lineSplit = line.split()
63                seqId = lineSplit[1]
64                bReadHit = True
65                hitCounter = 0
66                continue
67            elif line.strip() == '':
68                bReadHit = False
69            elif bReadHit:
70                hitCounter += 1
71                if hitCounter >= 3:
72                    lineSplit = line.split()
73
74                    iEvalue = lineSplit[3]
75                    aliFrom = int(lineSplit[7])
76                    aliTo = int(lineSplit[8])
77
78                    revComp = False
79                    if aliFrom > aliTo:
80                        revComp = True
81                        aliFrom, aliTo = aliTo, aliFrom
82
83                    alignLen = int(aliTo) - int(aliFrom)
84
85                    if float(iEvalue) <= evalueThreshold:
86                        seqInfo[seqId] = seqInfo.get(seqId, []) + [[domain, iEvalue, str(aliFrom), str(aliTo), str(alignLen), str(revComp)]]
87
88        return seqInfo
89
90    def __addHit(self, hits, seqId, info, concatenateThreshold):
91        """Add hits from individual HMMs and concatenate nearby hits."""
92
93        # check if this is the first hit to this sequence
94        if seqId not in hits:
95            hits[seqId] = info
96            return
97
98        # check if hits to sequence are close enough to warrant concatenating them,
99        # otherwise record both hits
100        baseSeqId = seqId
101        index = 1
102        bConcatenate = False
103        concateSeqId = seqId
104        while(True):
105            # if hits overlap then retain only the longest
106            startNew = int(info[2])
107            endNew = int(info[3])
108            bRevNew = bool(info[5])
109
110            start = int(hits[seqId][2])
111            end = int(hits[seqId][3])
112            bRev = bool(info[5])
113
114            # check if hits should be concatenated
115            if abs(start - endNew) < concatenateThreshold and bRevNew == bRev:
116                # new hit closely preceded old hit and is on same strand
117                del hits[seqId]
118                info[2] = str(startNew)
119                info[3] = str(end)
120                info[4] = str(end - startNew)
121                hits[concateSeqId] = info
122                bConcatenate = True
123
124            elif abs(startNew - end) < concatenateThreshold and bRevNew == bRev:
125                # new hit closely follows old hit and is on same strand
126                del hits[seqId]
127                info[2] = str(start)
128                info[3] = str(endNew)
129                info[4] = str(endNew - start)
130                hits[concateSeqId] = info
131                bConcatenate = True
132
133            index += 1
134            newSeqId = baseSeqId + '-#' + str(index)
135            if bConcatenate:
136                if newSeqId in hits:
137                    seqId = newSeqId  # see if other sequences concatenate
138                else:
139                    break
140            else:
141                # hits are not close enough to concatenate
142                if newSeqId in hits:
143                    seqId = newSeqId  # see if the new hit overlaps with this
144                    concateSeqId = newSeqId
145                else:
146                    hits[newSeqId] = info
147                    break
148
149    def __addDomainHit(self, hits, seqId, info):
150        """Add hits from different domain models and concatenate nearby hits."""
151        if seqId not in hits:
152            hits[seqId] = info
153            return
154
155        baseSeqId = seqId
156        overlapSeqId = seqId
157
158        index = 1
159        bOverlap = False
160        while(True):
161            # if hits overlap then retain only the longest
162            startNew = int(info[2])
163            endNew = int(info[3])
164            lengthNew = int(info[4])
165
166            start = int(hits[seqId][2])
167            end = int(hits[seqId][3])
168            length = int(hits[seqId][4])
169
170            if (startNew <= start and endNew >= start) or (start <= startNew and end >= startNew):
171                bOverlap = True
172
173                if lengthNew > length:
174                    hits[overlapSeqId] = info
175                else:
176                    hits[overlapSeqId] = hits[seqId]
177
178                if overlapSeqId != seqId:
179                    del hits[seqId]
180
181            index += 1
182            newSeqId = baseSeqId + '-#' + str(index)
183            if newSeqId in hits:
184                seqId = newSeqId  # see if the new hit overlaps with this
185                if not bOverlap:
186                    overlapSeqId = seqId
187            else:
188                break
189
190        if not bOverlap:
191            hits[newSeqId] = info
192
193    def run(self, contigFile, binFiles, outputDir, evalueThreshold, concatenateThreshold):
194        # make sure output directory exists
195        if not os.path.exists(outputDir):
196            os.makedirs(outputDir)
197
198        # get bin id of binned contigs
199        self.logger.info('Determining bin assignment of sequences.')
200        seqIdToBinId = {}
201        for f in binFiles:
202            binId = binIdFromFilename(f)
203            seqIds = readFastaSeqIds(f)
204            for seqId in seqIds:
205                seqIdToBinId[seqId] = binId
206
207        # identify 16S reads from contigs/scaffolds
208        self.logger.info('Identifying SSU rRNAs on sequences.')
209        self.__hmmSearch(contigFile, evalueThreshold, os.path.join(outputDir, 'ssu'))
210
211        # read HMM hits
212        hitsPerDomain = {}
213        for domain in ['archaea', 'bacteria', 'euk']:
214            hits = {}
215
216            seqInfo = self.__readHits(os.path.join(outputDir, 'ssu' + '.' + domain + '.txt'), domain, evalueThreshold)
217            if len(seqInfo) > 0:
218                for seqId, seqHits in seqInfo.iteritems():
219                    for hit in seqHits:
220                        self.__addHit(hits, seqId, hit, concatenateThreshold)
221
222            hitsPerDomain[domain] = hits
223
224        # find best domain hit for each sequence
225        bestHits = {}
226        for _, hits in hitsPerDomain.iteritems():
227            for seqId, info in hits.iteritems():
228                if '-#' in seqId:
229                    seqId = seqId[0:seqId.rfind('-#')]
230
231                self.__addDomainHit(bestHits, seqId, info)
232
233        # write summary file and putative SSU rRNAs to file
234        summaryFile = os.path.join(outputDir, 'ssu_summary.tsv')
235        summaryOut = open(summaryFile, 'w')
236        summaryOut.write('Bin Id\tSeq. Id\tHMM\ti-Evalue\tStart hit\tEnd hit\t16S/18S gene length\tRev. Complement\tSequence length\n')
237
238        seqFile = os.path.join(outputDir, 'ssu.fna')
239        seqOut = open(seqFile, 'w')
240
241        seqs = readFasta(contigFile)
242
243        hitsToBins = {}
244        for seqId in bestHits:
245            origSeqId = seqId
246            if '-#' in seqId:
247                seqId = seqId[0:seqId.rfind('-#')]
248
249            if seqId in seqIdToBinId:
250                binId = seqIdToBinId[seqId]
251            else:
252                binId = DefaultValues.UNBINNED
253
254            seqInfo = [origSeqId] + bestHits[origSeqId]
255            hitsToBins[binId] = hitsToBins.get(binId, []) + [seqInfo]
256
257        for binId in sorted(hitsToBins.keys()):
258            for seqInfo in hitsToBins[binId]:
259                seqId = seqInfo[0]
260                if '-#' in seqId:
261                    seqId = seqId[0:seqId.rfind('-#')]
262
263                seq = seqs[seqId]
264                summaryOut.write(binId + '\t' + '\t'.join(seqInfo) + '\t' + str(len(seq)) + '\n')
265                seqOut.write('>' + binId + DefaultValues.SEQ_CONCAT_CHAR + seqInfo[0] + '\n')
266                seqOut.write(seq[int(seqInfo[3]) + 1:int(seqInfo[4]) + 1] + '\n')
267
268        summaryOut.close()
269        seqOut.close()
270
271        self.logger.info('Identified ' + str(len(bestHits)) + ' putative SSU genes.')
272        self.logger.info('Summary of identified hits written to: ' + summaryFile)
273        self.logger.info('SSU sequences written to: ' + seqFile)
274