1###############################################################################
2#
3# pfam.py - methods for identifying PFAM genes from HMMER models.
4#
5###############################################################################
6#                                                                             #
7#    This program is free software: you can redistribute it and/or modify     #
8#    it under the terms of the GNU General Public License as published by     #
9#    the Free Software Foundation, either version 3 of the License, or        #
10#    (at your option) any later version.                                      #
11#                                                                             #
12#    This program is distributed in the hope that it will be useful,          #
13#    but WITHOUT ANY WARRANTY; without even the implied warranty of           #
14#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            #
15#    GNU General Public License for more details.                             #
16#                                                                             #
17#    You should have received a copy of the GNU General Public License        #
18#    along with this program. If not, see <http://www.gnu.org/licenses/>.     #
19#                                                                             #
20###############################################################################
21
22from collections import defaultdict
23
24from checkm.common import checkFileExists
25
26
27class PFAM(object):
28    def __init__(self, pfamClanFile):
29        self.pfamClanFile = pfamClanFile
30        self.idToAcc = {}  # map ids to pfam accessions
31        self.clan = {}  # map pfam accessions to clans
32        self.nested = {}  # map pfam accessions to nested pfam accessions
33
34    def __readClansAndNesting(self):
35        checkFileExists(self.pfamClanFile)
36
37        idNested = defaultdict(list)
38        for line in open(self.pfamClanFile):
39            if '#=GF ID' in line:
40                ID = line.split()[2].strip()
41            elif '#=GF AC' in line:
42                pfamAcc = line.split()[2].strip()
43                pfamAcc = pfamAcc[0:pfamAcc.rfind('.')]
44                self.idToAcc[ID] = pfamAcc
45            elif '#=GF CL' in line:
46                clanId = line.split()[2].strip()
47                self.clan[pfamAcc] = clanId
48            elif '#=GF NE' in line:
49                nestedId = line.split()[2].strip()
50                idNested[nestedId].append(ID)
51                idNested[ID].append(nestedId)
52
53        # set nested structure to use pfam accessions instead of IDs
54        for ID, nested in idNested.iteritems():
55            pfamAcc = self.idToAcc[ID]
56            self.nested[pfamAcc] = set([self.idToAcc[x] for x in nested])
57
58    def pfamIdToClanId(self):
59        """Determine clan of each pfam."""
60        checkFileExists(self.pfamClanFile)
61
62        d = {}
63        for line in open(self.pfamClanFile):
64            if '#=GF AC' in line:
65                pfamAcc = line.split()[2].strip()
66            elif '#=GF CL' in line:
67                clanId = line.split()[2].strip()
68                d[pfamAcc] = clanId
69
70        return d
71
72    def genesInClan(self):
73        """Determine all genes within a each clan."""
74        checkFileExists(self.pfamClanFile)
75
76        d = defaultdict(set)
77        for line in open(self.pfamClanFile):
78            if '#=GF AC' in line:
79                pfamAcc = line.split()[2].strip()
80            elif '#=GF CL' in line:
81                clanId = line.split()[2].strip()
82                d[clanId].update([pfamAcc])
83
84        return d
85
86    def filterHitsFromSameClan(self, markerHits):
87        """Filter hits to ORF from same clan."""
88
89        # check if clan and nesting need to be computer
90        if len(self.clan) == 0:
91            self.__readClansAndNesting()
92
93        # determine all PFAM hits to each ORF and setup initial set of filtered markers
94        filteredMarkers = defaultdict(list)
95        hitsToORFs = defaultdict(list)
96        for markerId, hits in markerHits.iteritems():
97            if markerId.startswith('PF'):
98                for hit in hits:
99                    hitsToORFs[hit.target_name].append(hit)
100            else:
101                # retain all non-PFAM markers
102                filteredMarkers[markerId] = hits
103
104        # for each gene, take only the best hit for each PFAM clan
105        for _, hits in hitsToORFs.iteritems():
106            # sort in ascending order of e-value
107            hits.sort(key=lambda x: x.full_e_value)
108
109            filtered = set()
110            for i in xrange(0, len(hits)):
111                if i in filtered:
112                    continue
113
114                pfamIdI = hits[i].query_accession
115                pfamIdI = pfamIdI[0:pfamIdI.rfind('.')]
116                clanI = self.clan.get(pfamIdI, None)
117                startI = hits[i].ali_from
118                endI = hits[i].ali_to
119
120                for j in xrange(i + 1, len(hits)):
121                    if j in filtered:
122                        continue
123
124                    pfamIdJ = hits[j].query_accession
125                    pfamIdJ = pfamIdJ[0:pfamIdJ.rfind('.')]
126                    clanJ = self.clan.get(pfamIdJ, None)
127                    startJ = hits[j].ali_from
128                    endJ = hits[j].ali_to
129
130                    # check if hits are from the same clan
131                    if pfamIdI != None and pfamIdJ != None and clanI == clanJ:
132                        # check if hits overlap
133                        if (startI <= startJ and endI > startJ) or (startJ <= startI and endJ > startI):
134                            # check if pfams are nested
135                            if not (pfamIdI in self.nested and pfamIdJ in self.nested[pfamIdI]):
136                                # hits should be filtered as it is from the same clan, overlaps, and is not
137                                # nested with a pfam hit with a lower e-value
138                                filtered.add(j)
139
140            # tabulate unfiltered hits
141            for i in xrange(0, len(hits)):
142                if i in filtered:
143                    continue
144
145                filteredMarkers[hits[i].query_accession].append(hits[i])
146
147        return filteredMarkers
148
149    def genesInSameClan(self, genes):
150        """Get all genes from the PFAM clans spanned by the input gene set."""
151
152        # get a list of clans spanned by the input gene set
153        pfamIdToClanId = self.pfamIdToClanId()
154
155        clans = set()
156        for gene in genes:
157            clanId = pfamIdToClanId.get(gene, None)
158            if clanId != None:
159                clans.add(clanId)
160
161        # get a list of all other genes from these clans
162        genesInClan = self.genesInClan()
163
164        allGenesInClans = set()
165        for clan in clans:
166            allGenesInClans.update(genesInClan[clan])
167
168        return allGenesInClans - genes
169