1############################################################################### 2# 3# hmmer.py - runs HMMER and provides functions for parsing output 4# 5############################################################################### 6# # 7# This program is free software: you can redistribute it and/or modify # 8# it under the terms of the GNU General Public License as published by # 9# the Free Software Foundation, either version 3 of the License, or # 10# (at your option) any later version. # 11# # 12# This program is distributed in the hope that it will be useful, # 13# but WITHOUT ANY WARRANTY; without even the implied warranty of # 14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # 15# GNU General Public License for more details. # 16# # 17# You should have received a copy of the GNU General Public License # 18# along with this program. If not, see <http://www.gnu.org/licenses/>. # 19# # 20############################################################################### 21 22import os 23import sys 24import logging 25import subprocess 26from re import split as re_split 27 28 29class FormatError(BaseException): 30 pass 31 32 33class HMMERError(BaseException): 34 pass 35 36 37class HMMMERModeError(BaseException): 38 pass 39 40 41class HMMERRunner(): 42 """Wrapper for running HMMER3.""" 43 def __init__(self, mode="dom"): 44 self.logger = logging.getLogger('timestamp') 45 46 # make sure HMMER is installed 47 self.checkForHMMER() 48 49 # set the mode 50 if mode == "dom": 51 self.mode = 'domtblout' 52 elif mode == "tbl": 53 self.mode = 'tblout' 54 elif mode == 'align': 55 self.mode = 'align' 56 elif mode == 'fetch': 57 self.mode = 'fetch' 58 else: 59 raise HMMMERModeError("Mode %s not understood" % mode) 60 61 def search(self, db, query, tableOut, hmmerOut, cmdlineOptions='', bKeepOutput=True): 62 """Run hmmsearch""" 63 # make the output dir and files 64 if self.mode != 'domtblout' and self.mode != 'tblout': 65 raise HMMMERModeError("Mode %s not compatible with search" % self.mode) 66 67 if not bKeepOutput: 68 hmmerOut = '/dev/null' 69 70 cmd = ('hmmsearch --%s %s %s %s %s > %s' % (self.mode, tableOut, cmdlineOptions, db, query, hmmerOut)) 71 os.system(cmd) 72 73 def align(self, db, query, outputFile, writeMode='>', outputFormat='PSIBLAST', trim=True): 74 """Run hmmalign""" 75 if self.mode != 'align': 76 raise HMMMERModeError("Mode %s not compatible with align" % self.mode) 77 78 # build up the option string 79 opts = '' 80 if trim: 81 opts += ' --trim ' 82 83 # run hmmer 84 cmd = 'hmmalign --allcol %s --outformat %s %s %s %s %s' % (opts, outputFormat, db, query, writeMode, outputFile) 85 rtn = os.system(cmd) 86 if rtn == 256: 87 # assume user has a newer version of HMMER (>= 3.1b1) and the allcol parameter is no longer valid 88 cmd = cmd.replace('--allcol', '') 89 os.system(cmd) 90 91 def fetch(self, db, key, fetchFileName, bKeyFile=False): 92 """Run hmmfetch""" 93 if self.mode != 'fetch': 94 raise HMMMERModeError("Mode %s not compatible with fetch" % self.mode) 95 96 keyFileOpt = '' 97 if bKeyFile: 98 keyFileOpt = '-f' 99 100 os.system('hmmfetch ' + keyFileOpt + ' %s %s > %s' % (db, key, fetchFileName)) 101 102 def press(self, hmmModelFile): 103 """Press a HMM file.""" 104 os.system('hmmpress %s > /dev/null' % hmmModelFile) 105 106 def index(self, hmmModelFile): 107 """Index a HMM file.""" 108 if self.mode != 'fetch': 109 raise HMMMERModeError("Mode %s not compatible with fetch" % self.mode) 110 111 os.system('hmmfetch --index %s > /dev/null' % hmmModelFile) 112 113 def checkForHMMER(self): 114 """Check to see if HMMER is on the system path.""" 115 try: 116 subprocess.call(['hmmsearch', '-h'], stdout=open(os.devnull, 'w'), stderr=subprocess.STDOUT) 117 except: 118 self.logger.error(" [Error] Make sure HMMER executables (e.g., hmmsearch, hmmfetch) are on your system path.") 119 sys.exit(1) 120 121 122class HMMERParser(): 123 """Parses tabular output.""" 124 def __init__(self, fileHandle, mode='dom'): 125 self.handle = fileHandle 126 if mode == 'dom': 127 self.mode = 'domtblout' 128 elif mode == 'tbl': 129 self.mode = 'tblout' 130 else: 131 raise HMMERError("Mode %s not understood, please use 'dom' or 'tbl'" % mode) 132 133 def next(self): 134 """Process each hit in the file.""" 135 while 1: 136 if self.mode == 'domtblout': 137 hit = self.readHitsDOM() 138 elif self.mode == 'tblout': 139 hit = self.readHitsTBL() 140 else: 141 raise HMMERError("Mode %s not understood" % self.mode) 142 143 if hit == {}: 144 return None 145 else: 146 return hit 147 148 def readHitsTBL(self): 149 """Process single hit in tblout format.""" 150 """ 151We expect line to look like: 152NODE_110054_length_1926_cov_24.692627_41_3 - Ribosomal_S9 PF00380.14 5.9e-48 158.7 0.0 6.7e-48 158.5 0.0 1.0 1 0 0 1 1 1 1 # 1370 # 1756 # 1 # ID=41_3;partial=00;start_type=ATG;rbs_motif=None;rbs_spacer=None 153 """ 154 while (1): 155 line = self.handle.readline().rstrip() 156 try: 157 if line[0] != '#' and len(line) != 0: 158 dMatch = re_split(r'\s+', line.rstrip()) 159 if len(dMatch) < 19: 160 raise FormatError("Error processing line:\n%s" % (line)) 161 refined_match = dMatch[0:18] + [" ".join([str(i) for i in dMatch[18:]])] 162 return HmmerHitTBL(refined_match) 163 except IndexError: 164 return {} 165 166 def readHitsDOM(self): 167 """Process single hit in domtblout format.""" 168 """ 169We expect the line to look like: 170NODE_925902_length_6780_cov_18.428171_754_2 - 399 PGK PF00162.14 384 2.2e-164 543.7 0.1 1 1 1.3e-167 2.5e-164 543.5 0.1 1 384 9 386 9 386 1.00 # 1767 # 2963 # -1 # ID=754_2;partial=00;start_type=ATG;rbs_motif=AGGA;rbs_spacer=5-10bp 171 """ 172 while (1): 173 line = self.handle.readline().rstrip() 174 try: 175 if line[0] != '#' and len(line) != 0: 176 dMatch = re_split(r'\s+', line.rstrip()) 177 if len(dMatch) < 23: 178 raise FormatError("Error processing line:\n%s" % (line)) 179 refined_match = dMatch[0:22] + [" ".join([str(i) for i in dMatch[22:]])] 180 return HmmerHitDOM(refined_match) 181 except IndexError: 182 return {} 183 184 185class HmmerHitTBL(): 186 """Encapsulate a HMMER hit given in tblout format.""" 187 def __init__(self, values): 188 if len(values) == 19: 189 self.target_name = values[0] 190 self.target_accession = values[1] 191 self.query_name = values[2] 192 193 self.query_accession = values[3] 194 if self.query_accession == '-': 195 self.query_accession = self.query_name 196 197 self.full_e_value = float(values[4]) 198 self.full_score = float(values[5]) 199 self.full_bias = float(values[6]) 200 self.best_e_value = float(values[7]) 201 self.best_score = float(values[8]) 202 self.best_bias = float(values[9]) 203 self.exp = float(values[10]) 204 self.reg = int(values[11]) 205 self.clu = int(values[12]) 206 self.ov = int(values[13]) 207 self.env = int(values[14]) 208 self.dom = int(values[15]) 209 self.rep = int(values[16]) 210 self.inc = int(values[17]) 211 self.target_description = values[18] 212 213 def __str__(self): 214 return "\t".join([self.target_name, 215 self.target_accession, 216 self.query_name, 217 self.query_accession, 218 str(self.full_e_value), 219 str(self.full_score), 220 str(self.full_bias), 221 str(self.best_e_value), 222 str(self.best_score), 223 str(self.best_bias), 224 str(self.exp), 225 str(self.reg), 226 str(self.clu), 227 str(self.ov), 228 str(self.env), 229 str(self.dom), 230 str(self.rep), 231 str(self.inc), 232 self.target_description 233 ] 234 ) 235 236 237class HmmerHitDOM(): 238 """Encapsulate a HMMER hit given in domtblout format.""" 239 def __init__(self, values): 240 if len(values) == 23: 241 self.target_name = values[0] 242 self.target_accession = values[1] 243 self.target_length = int(values[2]) 244 self.query_name = values[3] 245 246 self.query_accession = values[4] 247 if self.query_accession == '-': 248 self.query_accession = self.query_name 249 250 self.query_length = int(values[5]) 251 self.full_e_value = float(values[6]) 252 self.full_score = float(values[7]) 253 self.full_bias = float(values[8]) 254 self.dom = int(values[9]) 255 self.ndom = int(values[10]) 256 self.c_evalue = float(values[11]) 257 self.i_evalue = float(values[12]) 258 self.dom_score = float(values[13]) 259 self.dom_bias = float(values[14]) 260 self.hmm_from = int(values[15]) 261 self.hmm_to = int(values[16]) 262 self.ali_from = int(values[17]) 263 self.ali_to = int(values[18]) 264 self.env_from = int(values[19]) 265 self.env_to = int(values[20]) 266 self.acc = float(values[21]) 267 self.target_description = values[22] 268 269 def __str__(self): 270 return "\t".join([self.target_name, 271 self.target_accession, 272 str(self.target_length), 273 self.query_name, 274 self.query_accession, 275 str(self.query_length), 276 str(self.full_e_value), 277 str(self.full_score), 278 str(self.full_bias), 279 str(self.dom), 280 str(self.ndom), 281 str(self.c_evalue), 282 str(self.i_evalue), 283 str(self.dom_score), 284 str(self.dom_bias), 285 str(self.hmm_from), 286 str(self.hmm_to), 287 str(self.ali_from), 288 str(self.ali_to), 289 str(self.env_from), 290 str(self.env_to), 291 str(self.acc), 292 self.target_description] 293 ) 294