###############################################################################
#
# taxonParser.py - parse taxonomic-specific marker sets
#
###############################################################################
#                                                                             #
#    This program is free software: you can redistribute it and/or modify     #
#    it under the terms of the GNU General Public License as published by     #
#    the Free Software Foundation, either version 3 of the License, or        #
#    (at your option) any later version.                                      #
#                                                                             #
#    This program is distributed in the hope that it will be useful,          #
#    but WITHOUT ANY WARRANTY; without even the implied warranty of           #
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            #
#    GNU General Public License for more details.                             #
#                                                                             #
#    You should have received a copy of the GNU General Public License        #
#    along with this program. If not, see <http://www.gnu.org/licenses/>.     #
#                                                                             #
###############################################################################

import logging
from collections import defaultdict

import checkm.prettytable as prettytable

from checkm.markerSets import BinMarkerSets, MarkerSet
from checkm.util.taxonomyUtils import taxonomicRanks, ranksByLevel, ranksByLabel

from checkm.defaultValues import DefaultValues


class TaxonParser():
    """Parse taxonomic-specific marker sets."""

    def __init__(self):
        """Attach the logger named 'timestamp' used for all messages."""
        self.logger = logging.getLogger('timestamp')

    def readMarkerSets(self):
        """Read every marker set listed in DefaultValues.TAXON_MARKER_SETS.

        Each non-blank line of the file is split on tabs; the fields used
        are column 0 (rank label), 1 (taxon), 2 (lineage), 3 (number of
        genomes, parsed as int) and 6 (serialized marker set).  Markers in
        DefaultValues.MARKERS_TO_EXCLUDE are removed from every set.

        Returns
        -------
        defaultdict(dict)
            Mapping d[rank][taxon] -> MarkerSet.
        """
        taxonMarkerSets = defaultdict(dict)

        # The context manager guarantees the file handle is released even
        # if a malformed line raises part-way through parsing.
        with open(DefaultValues.TAXON_MARKER_SETS) as markerSetFile:
            for line in markerSetFile:
                # Tolerate blank lines (e.g. a trailing newline at EOF)
                # instead of failing with an IndexError on the split below.
                if not line.strip():
                    continue

                lineSplit = line.split('\t')
                rank = lineSplit[0]
                taxon = lineSplit[1]
                lineage = lineSplit[2]
                numGenomes = int(lineSplit[3])

                # NOTE(review): eval() executes whatever expression is
                # stored in the file, so this is only safe while the marker
                # set file is a trusted, packaged resource.  The serialized
                # form is not visible from here, so it is left as-is rather
                # than swapped for a stricter parser - confirm before
                # pointing this at user-supplied files.
                markerSet = eval(lineSplit[6].rstrip())

                ms = MarkerSet(ranksByLabel[rank], lineage, numGenomes, markerSet)
                ms.removeMarkers(DefaultValues.MARKERS_TO_EXCLUDE)

                taxonMarkerSets[rank][taxon] = ms

        return taxonMarkerSets

    def list(self, rankFilter='ALL'):
        """Print a table of the available marker sets.

        Parameters
        ----------
        rankFilter : str
            'ALL' to report every rank in taxonomicRanks, or a single rank
            label to restrict the table to that rank.

        The table is written to stdout with one row per taxon (sorted by
        taxon name within each rank) giving the rank, taxon, number of
        genomes, number of marker genes and number of marker sets.
        """

        taxonMarkerSets = self.readMarkerSets()

        header = ['Rank', 'Taxon', '# genomes', '# marker genes', '# marker sets']
        pTable = prettytable.PrettyTable(header)
        pTable.align = 'c'
        pTable.align['Rank'] = 'l'
        pTable.align['Taxon'] = 'l'
        pTable.hrules = prettytable.FRAME
        pTable.vrules = prettytable.NONE

        for rank in taxonomicRanks:
            if rankFilter == 'ALL' or rankFilter == rank:
                for taxon in sorted(taxonMarkerSets[rank]):
                    markerSet = taxonMarkerSets[rank][taxon]

                    numMarkers, numMarkerSets = markerSet.size()
                    pTable.add_row([rank, taxon, markerSet.numGenomes, numMarkers, numMarkerSets])

        # Single-argument, parenthesised print produces identical output
        # under the Python 2 print statement and the Python 3 function.
        print('')
        print(pTable.get_string())

    def markerSet(self, rank, taxon, markerFile):
        """Obtain specified taxonomic-specific marker set.

        Looks up the marker set for (rank, taxon), then walks its lineage
        string (split on ';' and reversed, so the requested taxon comes
        first) and collects the marker set of every taxon along it into a
        single BinMarkerSets, which is written to markerFile.

        Parameters
        ----------
        rank : str
            Rank label of the desired taxon.
        taxon : str
            Name of the desired taxon within that rank.
        markerFile : str
            Path of the marker file to create (overwritten if present).

        Returns
        -------
        bool
            True once the marker file has been written; False (after
            logging an error) if the rank or taxon is not recognized.
        """

        taxonMarkerSets = self.readMarkerSets()

        if rank not in taxonMarkerSets:
            self.logger.error('Unrecognized taxonomic rank: ' + rank)
            return False
        elif taxon not in taxonMarkerSets[rank]:
            self.logger.error('Unrecognized taxon: %s (in rank %s): ' % (taxon, rank))
            return False

        markerSet = taxonMarkerSets[rank][taxon]

        taxonomy = markerSet.lineageStr.split(';')[::-1]
        binMarkerSets = BinMarkerSets(taxon, BinMarkerSets.TAXONOMIC_MARKER_SET)

        # Separate locals are used for the rank/taxon being visited so the
        # method's own parameters are not silently overwritten by the loop.
        curRank = rank
        for i, lineageTaxon in enumerate(taxonomy):
            # A request for rank 'life' keeps that rank for every entry;
            # otherwise the rank is derived from the entry's depth in the
            # lineage.
            if curRank != 'life':
                curRank = ranksByLevel[len(taxonomy) - i - 1]

            curTaxon = lineageTaxon
            if curRank == 'species':
                # Species are keyed by the two most specific lineage
                # entries joined with a space (taxonomy[1] then taxonomy[0]).
                curTaxon = taxonomy[1] + ' ' + taxonomy[0]

            markerSet = taxonMarkerSets[curRank][curTaxon]
            numMarkers, numMarkerSets = markerSet.size()
            self.logger.info('Marker set for %s contains %d marker genes arranged in %d sets.' % (curTaxon, numMarkers, numMarkerSets))
            self.logger.info('Marker set inferred from %d reference genomes.' % markerSet.numGenomes)

            # The full lineage string is replaced by the taxon name before
            # the set is added to the collection that gets written out.
            markerSet.lineageStr = curTaxon
            binMarkerSets.addMarkerSet(markerSet)

        # Context manager closes the output file even if writing fails.
        with open(markerFile, 'w') as fout:
            fout.write(DefaultValues.TAXON_MARKER_FILE_HEADER + '\n')
            binMarkerSets.write(fout)

        return True