1############################################################################### 2# 3# ssuFinder.py - identify SSU (16S rRNA) in genome bins 4# 5############################################################################### 6# # 7# This program is free software: you can redistribute it and/or modify # 8# it under the terms of the GNU General Public License as published by # 9# the Free Software Foundation, either version 3 of the License, or # 10# (at your option) any later version. # 11# # 12# This program is distributed in the hope that it will be useful, # 13# but WITHOUT ANY WARRANTY; without even the implied warranty of # 14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # 15# GNU General Public License for more details. # 16# # 17# You should have received a copy of the GNU General Public License # 18# along with this program. If not, see <http://www.gnu.org/licenses/>. # 19# # 20############################################################################### 21 22import os 23import logging 24 25from checkm.defaultValues import DefaultValues 26from checkm.common import binIdFromFilename 27from checkm.util.seqUtils import readFasta, readFastaSeqIds 28 29 30class SSU_Finder(object): 31 def __init__(self, threads): 32 self.logger = logging.getLogger('timestamp') 33 34 self.totalThreads = threads 35 36 self.bacteriaModelFile = os.path.join(DefaultValues.CHECKM_DATA_DIR, 'hmms_ssu', 'SSU_bacteria.hmm') 37 self.archaeaModelFile = os.path.join(DefaultValues.CHECKM_DATA_DIR, 'hmms_ssu', 'SSU_archaea.hmm') 38 self.eukModelFile = os.path.join(DefaultValues.CHECKM_DATA_DIR, 'hmms_ssu', 'SSU_euk.hmm') 39 40 def __hmmSearch(self, seqFile, evalue, outputPrefix): 41 if seqFile.endswith('gz'): 42 pipe = 'zcat ' + seqFile + ' | ' 43 else: 44 pipe = 'cat ' + seqFile + ' | ' 45 46 self.logger.info('Identifying bacterial 16S.') 47 os.system(pipe + 'nhmmer --noali --cpu ' + str(self.totalThreads) + ' -o ' + outputPrefix + '.bacteria.txt --tblout ' + outputPrefix + '.bacteria.table.txt -E ' + str(evalue) + ' ' + self.bacteriaModelFile + ' -') 48 49 self.logger.info('Identifying archaeal 16S.') 50 os.system(pipe + 'nhmmer --noali --cpu ' + str(self.totalThreads) + ' -o ' + outputPrefix + '.archaea.txt --tblout ' + outputPrefix + '.archaea.table.txt -E ' + str(evalue) + ' ' + self.archaeaModelFile + ' -') 51 52 self.logger.info('Identifying eukaryotic 18S.') 53 os.system(pipe + 'nhmmer --noali --cpu ' + str(self.totalThreads) + ' -o ' + outputPrefix + '.euk.txt --tblout ' + outputPrefix + '.euk.table.txt -E ' + str(evalue) + ' ' + self.eukModelFile + ' -') 54 55 def __readHits(self, resultsFile, domain, evalueThreshold): 56 """Parse hits from nhmmer output.""" 57 seqInfo = {} 58 59 bReadHit = False 60 for line in open(resultsFile): 61 if line[0:2] == '>>': 62 lineSplit = line.split() 63 seqId = lineSplit[1] 64 bReadHit = True 65 hitCounter = 0 66 continue 67 elif line.strip() == '': 68 bReadHit = False 69 elif bReadHit: 70 hitCounter += 1 71 if hitCounter >= 3: 72 lineSplit = line.split() 73 74 iEvalue = lineSplit[3] 75 aliFrom = int(lineSplit[7]) 76 aliTo = int(lineSplit[8]) 77 78 revComp = False 79 if aliFrom > aliTo: 80 revComp = True 81 aliFrom, aliTo = aliTo, aliFrom 82 83 alignLen = int(aliTo) - int(aliFrom) 84 85 if float(iEvalue) <= evalueThreshold: 86 seqInfo[seqId] = seqInfo.get(seqId, []) + [[domain, iEvalue, str(aliFrom), str(aliTo), str(alignLen), str(revComp)]] 87 88 return seqInfo 89 90 def __addHit(self, hits, seqId, info, concatenateThreshold): 91 """Add hits from individual HMMs and concatenate nearby hits.""" 92 93 # check if this is the first hit to this sequence 94 if seqId not in hits: 95 hits[seqId] = info 96 return 97 98 # check if hits to sequence are close enough to warrant concatenating them, 99 # otherwise record both hits 100 baseSeqId = seqId 101 index = 1 102 bConcatenate = False 103 concateSeqId = seqId 104 while(True): 105 # if hits overlap then retain only the longest 106 startNew = int(info[2]) 107 endNew = int(info[3]) 108 bRevNew = bool(info[5]) 109 110 start = int(hits[seqId][2]) 111 end = int(hits[seqId][3]) 112 bRev = bool(info[5]) 113 114 # check if hits should be concatenated 115 if abs(start - endNew) < concatenateThreshold and bRevNew == bRev: 116 # new hit closely preceded old hit and is on same strand 117 del hits[seqId] 118 info[2] = str(startNew) 119 info[3] = str(end) 120 info[4] = str(end - startNew) 121 hits[concateSeqId] = info 122 bConcatenate = True 123 124 elif abs(startNew - end) < concatenateThreshold and bRevNew == bRev: 125 # new hit closely follows old hit and is on same strand 126 del hits[seqId] 127 info[2] = str(start) 128 info[3] = str(endNew) 129 info[4] = str(endNew - start) 130 hits[concateSeqId] = info 131 bConcatenate = True 132 133 index += 1 134 newSeqId = baseSeqId + '-#' + str(index) 135 if bConcatenate: 136 if newSeqId in hits: 137 seqId = newSeqId # see if other sequences concatenate 138 else: 139 break 140 else: 141 # hits are not close enough to concatenate 142 if newSeqId in hits: 143 seqId = newSeqId # see if the new hit overlaps with this 144 concateSeqId = newSeqId 145 else: 146 hits[newSeqId] = info 147 break 148 149 def __addDomainHit(self, hits, seqId, info): 150 """Add hits from different domain models and concatenate nearby hits.""" 151 if seqId not in hits: 152 hits[seqId] = info 153 return 154 155 baseSeqId = seqId 156 overlapSeqId = seqId 157 158 index = 1 159 bOverlap = False 160 while(True): 161 # if hits overlap then retain only the longest 162 startNew = int(info[2]) 163 endNew = int(info[3]) 164 lengthNew = int(info[4]) 165 166 start = int(hits[seqId][2]) 167 end = int(hits[seqId][3]) 168 length = int(hits[seqId][4]) 169 170 if (startNew <= start and endNew >= start) or (start <= startNew and end >= startNew): 171 bOverlap = True 172 173 if lengthNew > length: 174 hits[overlapSeqId] = info 175 else: 176 hits[overlapSeqId] = hits[seqId] 177 178 if overlapSeqId != seqId: 179 del hits[seqId] 180 181 index += 1 182 newSeqId = baseSeqId + '-#' + str(index) 183 if newSeqId in hits: 184 seqId = newSeqId # see if the new hit overlaps with this 185 if not bOverlap: 186 overlapSeqId = seqId 187 else: 188 break 189 190 if not bOverlap: 191 hits[newSeqId] = info 192 193 def run(self, contigFile, binFiles, outputDir, evalueThreshold, concatenateThreshold): 194 # make sure output directory exists 195 if not os.path.exists(outputDir): 196 os.makedirs(outputDir) 197 198 # get bin id of binned contigs 199 self.logger.info('Determining bin assignment of sequences.') 200 seqIdToBinId = {} 201 for f in binFiles: 202 binId = binIdFromFilename(f) 203 seqIds = readFastaSeqIds(f) 204 for seqId in seqIds: 205 seqIdToBinId[seqId] = binId 206 207 # identify 16S reads from contigs/scaffolds 208 self.logger.info('Identifying SSU rRNAs on sequences.') 209 self.__hmmSearch(contigFile, evalueThreshold, os.path.join(outputDir, 'ssu')) 210 211 # read HMM hits 212 hitsPerDomain = {} 213 for domain in ['archaea', 'bacteria', 'euk']: 214 hits = {} 215 216 seqInfo = self.__readHits(os.path.join(outputDir, 'ssu' + '.' + domain + '.txt'), domain, evalueThreshold) 217 if len(seqInfo) > 0: 218 for seqId, seqHits in seqInfo.iteritems(): 219 for hit in seqHits: 220 self.__addHit(hits, seqId, hit, concatenateThreshold) 221 222 hitsPerDomain[domain] = hits 223 224 # find best domain hit for each sequence 225 bestHits = {} 226 for _, hits in hitsPerDomain.iteritems(): 227 for seqId, info in hits.iteritems(): 228 if '-#' in seqId: 229 seqId = seqId[0:seqId.rfind('-#')] 230 231 self.__addDomainHit(bestHits, seqId, info) 232 233 # write summary file and putative SSU rRNAs to file 234 summaryFile = os.path.join(outputDir, 'ssu_summary.tsv') 235 summaryOut = open(summaryFile, 'w') 236 summaryOut.write('Bin Id\tSeq. Id\tHMM\ti-Evalue\tStart hit\tEnd hit\t16S/18S gene length\tRev. Complement\tSequence length\n') 237 238 seqFile = os.path.join(outputDir, 'ssu.fna') 239 seqOut = open(seqFile, 'w') 240 241 seqs = readFasta(contigFile) 242 243 hitsToBins = {} 244 for seqId in bestHits: 245 origSeqId = seqId 246 if '-#' in seqId: 247 seqId = seqId[0:seqId.rfind('-#')] 248 249 if seqId in seqIdToBinId: 250 binId = seqIdToBinId[seqId] 251 else: 252 binId = DefaultValues.UNBINNED 253 254 seqInfo = [origSeqId] + bestHits[origSeqId] 255 hitsToBins[binId] = hitsToBins.get(binId, []) + [seqInfo] 256 257 for binId in sorted(hitsToBins.keys()): 258 for seqInfo in hitsToBins[binId]: 259 seqId = seqInfo[0] 260 if '-#' in seqId: 261 seqId = seqId[0:seqId.rfind('-#')] 262 263 seq = seqs[seqId] 264 summaryOut.write(binId + '\t' + '\t'.join(seqInfo) + '\t' + str(len(seq)) + '\n') 265 seqOut.write('>' + binId + DefaultValues.SEQ_CONCAT_CHAR + seqInfo[0] + '\n') 266 seqOut.write(seq[int(seqInfo[3]) + 1:int(seqInfo[4]) + 1] + '\n') 267 268 summaryOut.close() 269 seqOut.close() 270 271 self.logger.info('Identified ' + str(len(bestHits)) + ' putative SSU genes.') 272 self.logger.info('Summary of identified hits written to: ' + summaryFile) 273 self.logger.info('SSU sequences written to: ' + seqFile) 274