1############################################################################### 2# 3# uniqueMarkers.py - find a set of markers that are descriptive for a taxonomy 4# 5############################################################################### 6# # 7# This program is free software: you can redistribute it and/or modify # 8# it under the terms of the GNU General Public License as published by # 9# the Free Software Foundation, either version 3 of the License, or # 10# (at your option) any later version. # 11# # 12# This program is distributed in the hope that it will be useful, # 13# but WITHOUT ANY WARRANTY; without even the implied warranty of # 14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # 15# GNU General Public License for more details. # 16# # 17# You should have received a copy of the GNU General Public License # 18# along with this program. If not, see <http://www.gnu.org/licenses/>. # 19# # 20############################################################################### 21 22import argparse 23import sqlite3 24import re 25from collections import defaultdict 26 27 28def parseTaxonomy(taxonomy): 29 tax = re.compile('[;,:]?\s?[kpcofgs]__|[;,:]') 30 return tax.split(taxonomy)[1:] 31 32 33def getTaxonId(cursor, *args): 34 ranks = ['Domain', 'Phylum', 'Class', '"Order"', 'Family', 'Genus', 'Species'] 35 query = [] 36 for rank, value in zip(ranks, args): 37 query.append(' %s = \'%s\' ' % (rank, value)) 38 query.append(' %s IS NULL' % ranks[len(args)]) 39 query_string = 'AND'.join(query) 40 result = cursor.execute('SELECT Id, "Count" FROM taxons WHERE %s' % query_string) 41 return result.fetchall() 42 43 44def getOppositeRankSpecificTaxonId(cursor, *args): 45 ''' Get all other taxon lineages at the same level as the requested taxon 46 ''' 47 ranks = ['Domain', 'Phylum', 'Class', '"Order"', 'Family', 'Genus', 'Species'] 48 query = [] 49 for rank, value in zip(ranks, args[:-1]): 50 query.append(' %s = \'%s\' ' % (rank, value)) 51 query.append(' %s != \'%s\' ' % (ranks[len(args) - 1], args[-1])) 52 query.append(' %s IS NULL' % ranks[len(args)]) 53 query_string = 'AND'.join(query) 54 print query_string 55 result = cursor.execute('SELECT Id, "Count" FROM taxons WHERE %s' % query_string) 56 return result.fetchall() 57 58 59def getOppositeRankInspecificTaxonId(cursor, *args): 60 ''' Get all other taxon lineages at the same level as the requested taxon 61 ''' 62 ranks = ['Domain', 'Phylum', 'Class', '"Order"', 'Family', 'Genus', 'Species'] 63 query = [] 64 for rank, value in zip(ranks, args): 65 query.append(' %s != \'%s\' ' % (rank, value)) 66 # query.append(' %s IS NULL' % ranks[len(args)]) 67 query_string = query[-1] 68 result = cursor.execute('SELECT Id, "Count" FROM taxons WHERE %s' % query_string) 69 return result.fetchall() 70 71 72def getMarkersFromTaxon(cursor, taxid): 73 result = cursor.execute('''SELECT Marker, "Count" FROM marker_mapping WHERE Taxon = ?''', (taxid,)) 74 return result.fetchall() 75 76 77def getMarkersNotInTaxon(cursor, taxid): 78 result = cursor.execute('''SELECT Marker, "Count" FROM marker_mapping WHERE Taxon != ?''', (taxid,)) 79 return result.fetchall() 80 81 82def countAllGenomes(cursor): 83 result = cursor.execute('''SELECT Id, "Count" FROM taxons''') 84 return result.fetchall() 85 86 87def countGenomesInTaxon(cursor, taxId): 88 result = cursor.execute('''SELECT "Count" FROM taxons WHERE Id = ?''', (taxId,)) 89 return result.fetchone()[0] 90 91 92def getDescriptiveMarkers(cursor, markers): 93 result = cursor.execute('''SELECT Acc, Name FROM markers WHERE Id = ?''', markers) 94 return result.fetchone() 95 96 97def doWork(args): 98 taxon_ranks = parseTaxonomy(args.taxonomy) 99 con = sqlite3.connect(args.database) 100 cur = con.cursor() 101 102 taxon_ids = getTaxonId(cur, *taxon_ranks) 103 if len(taxon_ids) > 1: 104 raise RuntimeError("Taxon string returns more than one lineage "\ 105 "please be more specific") 106 else: 107 tax_id, tax_count = taxon_ids[0] 108 all_markers = getMarkersFromTaxon(cur, tax_id) 109 marker_in_taxon_mapping = {} 110 for (Id, count) in all_markers: 111 if float(count) / float(tax_count) >= args.include: 112 marker_in_taxon_mapping[Id] = float(count) / float(tax_count) 113 114 opposite_taxons = getOppositeRankInspecificTaxonId(cur, *taxon_ranks) 115 markers_from_others = defaultdict(int) 116 others_total_count = 0 117 for (Id, count) in opposite_taxons: 118 others_total_count += count 119 this_taxon_count = getMarkersFromTaxon(cur, Id) 120 for (Id, count) in this_taxon_count: 121 markers_from_others[Id] += count 122 123 descriptive_markers = [] 124 for marker_id, _ in marker_in_taxon_mapping.items(): 125 if marker_id in markers_from_others: 126 fraction_in_others = float(markers_from_others[marker_id]) / float(others_total_count) 127 if fraction_in_others <= args.exclude: 128 descriptive_markers.append((marker_id,)) 129 else: 130 # not found in anything else 131 descriptive_markers.append((marker_id,)) 132 133 des_markers = [] 134 for i in descriptive_markers: 135 des_markers.append(getDescriptiveMarkers(cur, i)) 136 137 for des_acc, des_name in des_markers: 138 print des_acc, des_name 139 140if __name__ == '__main__': 141 142 parser = argparse.ArgumentParser() 143 144 parser.add_argument('-t', '--taxon-string', 145 default='k__Bacteria;p__Proteobacteria', 146 dest='taxonomy', help="specify the taxonomy") 147 parser.add_argument('-i', '--inclusive-percent', dest='include', 148 default=0.95, help="The required percentage of member of the "\ 149 "specified taxonomy must have a particular marker") 150 parser.add_argument('-e', '--exclusive-percent', dest='exclude', 151 default=0.05, help="The maximum prevelence of the marker in "\ 152 "all other non-target taxons") 153 parser.add_argument('-d', '--database', dest='database', 154 default='markers.db', help='specify path to database') 155 156 # parse the arguments 157 args = parser.parse_args() 158 159 # do what we came here to do 160 doWork(args) 161