1############################################################################### 2# 3# pfam.py - methods for identifying PFAM genes from HMMER models. 4# 5############################################################################### 6# # 7# This program is free software: you can redistribute it and/or modify # 8# it under the terms of the GNU General Public License as published by # 9# the Free Software Foundation, either version 3 of the License, or # 10# (at your option) any later version. # 11# # 12# This program is distributed in the hope that it will be useful, # 13# but WITHOUT ANY WARRANTY; without even the implied warranty of # 14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # 15# GNU General Public License for more details. # 16# # 17# You should have received a copy of the GNU General Public License # 18# along with this program. If not, see <http://www.gnu.org/licenses/>. # 19# # 20############################################################################### 21 22from collections import defaultdict 23 24from checkm.common import checkFileExists 25 26 27class PFAM(object): 28 def __init__(self, pfamClanFile): 29 self.pfamClanFile = pfamClanFile 30 self.idToAcc = {} # map ids to pfam accessions 31 self.clan = {} # map pfam accessions to clans 32 self.nested = {} # map pfam accessions to nested pfam accessions 33 34 def __readClansAndNesting(self): 35 checkFileExists(self.pfamClanFile) 36 37 idNested = defaultdict(list) 38 for line in open(self.pfamClanFile): 39 if '#=GF ID' in line: 40 ID = line.split()[2].strip() 41 elif '#=GF AC' in line: 42 pfamAcc = line.split()[2].strip() 43 pfamAcc = pfamAcc[0:pfamAcc.rfind('.')] 44 self.idToAcc[ID] = pfamAcc 45 elif '#=GF CL' in line: 46 clanId = line.split()[2].strip() 47 self.clan[pfamAcc] = clanId 48 elif '#=GF NE' in line: 49 nestedId = line.split()[2].strip() 50 idNested[nestedId].append(ID) 51 idNested[ID].append(nestedId) 52 53 # set nested structure to use pfam accessions instead of IDs 54 for ID, nested in idNested.iteritems(): 55 pfamAcc = self.idToAcc[ID] 56 self.nested[pfamAcc] = set([self.idToAcc[x] for x in nested]) 57 58 def pfamIdToClanId(self): 59 """Determine clan of each pfam.""" 60 checkFileExists(self.pfamClanFile) 61 62 d = {} 63 for line in open(self.pfamClanFile): 64 if '#=GF AC' in line: 65 pfamAcc = line.split()[2].strip() 66 elif '#=GF CL' in line: 67 clanId = line.split()[2].strip() 68 d[pfamAcc] = clanId 69 70 return d 71 72 def genesInClan(self): 73 """Determine all genes within a each clan.""" 74 checkFileExists(self.pfamClanFile) 75 76 d = defaultdict(set) 77 for line in open(self.pfamClanFile): 78 if '#=GF AC' in line: 79 pfamAcc = line.split()[2].strip() 80 elif '#=GF CL' in line: 81 clanId = line.split()[2].strip() 82 d[clanId].update([pfamAcc]) 83 84 return d 85 86 def filterHitsFromSameClan(self, markerHits): 87 """Filter hits to ORF from same clan.""" 88 89 # check if clan and nesting need to be computer 90 if len(self.clan) == 0: 91 self.__readClansAndNesting() 92 93 # determine all PFAM hits to each ORF and setup initial set of filtered markers 94 filteredMarkers = defaultdict(list) 95 hitsToORFs = defaultdict(list) 96 for markerId, hits in markerHits.iteritems(): 97 if markerId.startswith('PF'): 98 for hit in hits: 99 hitsToORFs[hit.target_name].append(hit) 100 else: 101 # retain all non-PFAM markers 102 filteredMarkers[markerId] = hits 103 104 # for each gene, take only the best hit for each PFAM clan 105 for _, hits in hitsToORFs.iteritems(): 106 # sort in ascending order of e-value 107 hits.sort(key=lambda x: x.full_e_value) 108 109 filtered = set() 110 for i in xrange(0, len(hits)): 111 if i in filtered: 112 continue 113 114 pfamIdI = hits[i].query_accession 115 pfamIdI = pfamIdI[0:pfamIdI.rfind('.')] 116 clanI = self.clan.get(pfamIdI, None) 117 startI = hits[i].ali_from 118 endI = hits[i].ali_to 119 120 for j in xrange(i + 1, len(hits)): 121 if j in filtered: 122 continue 123 124 pfamIdJ = hits[j].query_accession 125 pfamIdJ = pfamIdJ[0:pfamIdJ.rfind('.')] 126 clanJ = self.clan.get(pfamIdJ, None) 127 startJ = hits[j].ali_from 128 endJ = hits[j].ali_to 129 130 # check if hits are from the same clan 131 if pfamIdI != None and pfamIdJ != None and clanI == clanJ: 132 # check if hits overlap 133 if (startI <= startJ and endI > startJ) or (startJ <= startI and endJ > startI): 134 # check if pfams are nested 135 if not (pfamIdI in self.nested and pfamIdJ in self.nested[pfamIdI]): 136 # hits should be filtered as it is from the same clan, overlaps, and is not 137 # nested with a pfam hit with a lower e-value 138 filtered.add(j) 139 140 # tabulate unfiltered hits 141 for i in xrange(0, len(hits)): 142 if i in filtered: 143 continue 144 145 filteredMarkers[hits[i].query_accession].append(hits[i]) 146 147 return filteredMarkers 148 149 def genesInSameClan(self, genes): 150 """Get all genes from the PFAM clans spanned by the input gene set.""" 151 152 # get a list of clans spanned by the input gene set 153 pfamIdToClanId = self.pfamIdToClanId() 154 155 clans = set() 156 for gene in genes: 157 clanId = pfamIdToClanId.get(gene, None) 158 if clanId != None: 159 clans.add(clanId) 160 161 # get a list of all other genes from these clans 162 genesInClan = self.genesInClan() 163 164 allGenesInClans = set() 165 for clan in clans: 166 allGenesInClans.update(genesInClan[clan]) 167 168 return allGenesInClans - genes 169