1###############################################################################
2#
3# taxonParser.py - parse taxonomic-specific marker sets
4#
5###############################################################################
6#                                                                             #
7#    This program is free software: you can redistribute it and/or modify     #
8#    it under the terms of the GNU General Public License as published by     #
9#    the Free Software Foundation, either version 3 of the License, or        #
10#    (at your option) any later version.                                      #
11#                                                                             #
12#    This program is distributed in the hope that it will be useful,          #
13#    but WITHOUT ANY WARRANTY; without even the implied warranty of           #
14#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            #
15#    GNU General Public License for more details.                             #
16#                                                                             #
17#    You should have received a copy of the GNU General Public License        #
18#    along with this program. If not, see <http://www.gnu.org/licenses/>.     #
19#                                                                             #
20###############################################################################
21
22import logging
23from collections import defaultdict
24
25import checkm.prettytable as prettytable
26
27from checkm.markerSets import BinMarkerSets, MarkerSet
28from checkm.util.taxonomyUtils import taxonomicRanks, ranksByLevel, ranksByLabel
29
30from checkm.defaultValues import DefaultValues
31
32
class TaxonParser():
    """Parse taxonomic-specific marker sets.

    Marker sets are read from the tab-separated file referenced by
    DefaultValues.TAXON_MARKER_SETS and indexed first by rank label and
    then by taxon name.
    """

    def __init__(self):
        self.logger = logging.getLogger('timestamp')

    def readMarkerSets(self):
        """Read every marker set listed in the taxon marker set file.

        Each line is split on tabs; column 0 is the rank label, column 1 the
        taxon, column 2 the lineage string, column 3 the number of genomes
        and column 6 a Python expression describing the marker set. Markers
        listed in DefaultValues.MARKERS_TO_EXCLUDE are removed from each set.

        Returns
        -------
        defaultdict(dict)
            Mapping d[rank][taxon] -> MarkerSet.
        """
        taxonMarkerSets = defaultdict(dict)

        # the context manager releases the file handle even when a line
        # fails to parse
        with open(DefaultValues.TAXON_MARKER_SETS) as fin:
            for line in fin:
                lineSplit = line.split('\t')
                rank = lineSplit[0]
                taxon = lineSplit[1]
                lineage = lineSplit[2]
                numGenomes = int(lineSplit[3])

                # NOTE(review): eval() executes whatever expression is stored
                # in the file; this is only safe while the marker set file is
                # trusted data -- confirm it can never be user supplied.
                markerSet = eval(lineSplit[6].rstrip())

                ms = MarkerSet(ranksByLabel[rank], lineage, numGenomes, markerSet)
                ms.removeMarkers(DefaultValues.MARKERS_TO_EXCLUDE)

                taxonMarkerSets[rank][taxon] = ms

        return taxonMarkerSets

    def list(self, rankFilter='ALL'):
        """List all available marker sets from the specified rank.

        A table with one row per marker set is printed to stdout. Ranks are
        visited in the order given by taxonomicRanks and taxa are sorted
        within each rank.

        Parameters
        ----------
        rankFilter : str
            Rank label to report, or 'ALL' to report every rank.
        """

        taxonMarkerSets = self.readMarkerSets()

        header = ['Rank', 'Taxon', '# genomes', '# marker genes', '# marker sets']
        pTable = prettytable.PrettyTable(header)
        pTable.align = 'c'
        pTable.align['Rank'] = 'l'
        pTable.align['Taxon'] = 'l'
        pTable.hrules = prettytable.FRAME
        pTable.vrules = prettytable.NONE

        for rank in taxonomicRanks:
            if rankFilter == 'ALL' or rankFilter == rank:
                for taxon in sorted(taxonMarkerSets[rank]):
                    markerSet = taxonMarkerSets[rank][taxon]

                    numMarkers, numMarkerSets = markerSet.size()
                    pTable.add_row([rank, taxon, markerSet.numGenomes, numMarkers, numMarkerSets])

        # single-argument, parenthesised print produces identical output under
        # Python 2 and is also valid Python 3 syntax
        print('')
        print(pTable.get_string())

    def markerSet(self, rank, taxon, markerFile):
        """Obtain specified taxonomic-specific marker set.

        The lineage string of the requested marker set is split on ';' and
        walked in reverse order (last entry first). The marker set found for
        each entry is added to a BinMarkerSets collection, which is then
        written to markerFile after the standard header line.

        Parameters
        ----------
        rank : str
            Rank label of the desired taxon.
        taxon : str
            Name of the desired taxon.
        markerFile : str
            Path of the marker file to write.

        Returns
        -------
        bool
            True if the marker file was written, False if the rank or taxon
            is not recognized (an error is logged and no file is written).
        """

        taxonMarkerSets = self.readMarkerSets()

        if rank not in taxonMarkerSets:
            self.logger.error('Unrecognized taxonomic rank: ' + rank)
            return False
        elif taxon not in taxonMarkerSets[rank]:
            self.logger.error('Unrecognized taxon: %s (in rank %s): ' % (taxon, rank))
            return False

        selectedSet = taxonMarkerSets[rank][taxon]

        taxonomy = selectedSet.lineageStr.split(';')[::-1]
        binMarkerSets = BinMarkerSets(taxon, BinMarkerSets.TAXONOMIC_MARKER_SET)

        # dedicated loop variables keep the caller-supplied rank and taxon
        # intact instead of overwriting the parameters
        curRank = rank
        for i, curTaxon in enumerate(taxonomy):
            # once the rank is 'life' it is kept for all remaining entries
            if curRank != 'life':
                curRank = ranksByLevel[len(taxonomy) - i - 1]

            # species marker sets are keyed by the two leading entries of the
            # reversed lineage joined with a space
            if curRank == 'species':
                curTaxon = taxonomy[1] + ' ' + taxonomy[0]

            curSet = taxonMarkerSets[curRank][curTaxon]
            numMarkers, numMarkerSets = curSet.size()
            self.logger.info('Marker set for %s contains %d marker genes arranged in %d sets.' % (curTaxon, numMarkers, numMarkerSets))
            self.logger.info('Marker set inferred from %d reference genomes.' % curSet.numGenomes)

            # the marker set is relabelled with its own taxon before being stored
            curSet.lineageStr = curTaxon
            binMarkerSets.addMarkerSet(curSet)

        # the context manager closes the output file even if writing fails
        with open(markerFile, 'w') as fout:
            fout.write(DefaultValues.TAXON_MARKER_FILE_HEADER + '\n')
            binMarkerSets.write(fout)

        return True
116