1###############################################################################
2#
3# uniqueMarkers.py - find a set of markers that are descriptive for a taxonomy
4#
5###############################################################################
6#                                                                             #
7#    This program is free software: you can redistribute it and/or modify     #
8#    it under the terms of the GNU General Public License as published by     #
9#    the Free Software Foundation, either version 3 of the License, or        #
10#    (at your option) any later version.                                      #
11#                                                                             #
12#    This program is distributed in the hope that it will be useful,          #
13#    but WITHOUT ANY WARRANTY; without even the implied warranty of           #
14#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            #
15#    GNU General Public License for more details.                             #
16#                                                                             #
17#    You should have received a copy of the GNU General Public License        #
18#    along with this program. If not, see <http://www.gnu.org/licenses/>.     #
19#                                                                             #
20###############################################################################
21
22import argparse
23import sqlite3
24import re
25from collections import defaultdict
26
27
28def parseTaxonomy(taxonomy):
29    tax = re.compile('[;,:]?\s?[kpcofgs]__|[;,:]')
30    return tax.split(taxonomy)[1:]
31
32
33def getTaxonId(cursor, *args):
34    ranks = ['Domain', 'Phylum', 'Class', '"Order"', 'Family', 'Genus', 'Species']
35    query = []
36    for rank, value in zip(ranks, args):
37        query.append(' %s = \'%s\' ' % (rank, value))
38    query.append(' %s IS NULL' % ranks[len(args)])
39    query_string = 'AND'.join(query)
40    result = cursor.execute('SELECT Id, "Count" FROM taxons WHERE %s' % query_string)
41    return result.fetchall()
42
43
44def getOppositeRankSpecificTaxonId(cursor, *args):
45    ''' Get all other taxon lineages at the same level as the requested taxon
46    '''
47    ranks = ['Domain', 'Phylum', 'Class', '"Order"', 'Family', 'Genus', 'Species']
48    query = []
49    for rank, value in zip(ranks, args[:-1]):
50        query.append(' %s = \'%s\' ' % (rank, value))
51    query.append(' %s != \'%s\' ' % (ranks[len(args) - 1], args[-1]))
52    query.append(' %s IS NULL' % ranks[len(args)])
53    query_string = 'AND'.join(query)
54    print query_string
55    result = cursor.execute('SELECT Id, "Count" FROM taxons WHERE %s' % query_string)
56    return result.fetchall()
57
58
59def getOppositeRankInspecificTaxonId(cursor, *args):
60    ''' Get all other taxon lineages at the same level as the requested taxon
61    '''
62    ranks = ['Domain', 'Phylum', 'Class', '"Order"', 'Family', 'Genus', 'Species']
63    query = []
64    for rank, value in zip(ranks, args):
65        query.append(' %s != \'%s\' ' % (rank, value))
66    # query.append(' %s IS NULL' % ranks[len(args)])
67    query_string = query[-1]
68    result = cursor.execute('SELECT Id, "Count" FROM taxons WHERE %s' % query_string)
69    return result.fetchall()
70
71
72def getMarkersFromTaxon(cursor, taxid):
73    result = cursor.execute('''SELECT Marker, "Count" FROM marker_mapping WHERE Taxon = ?''', (taxid,))
74    return result.fetchall()
75
76
77def getMarkersNotInTaxon(cursor, taxid):
78    result = cursor.execute('''SELECT Marker, "Count" FROM marker_mapping WHERE Taxon != ?''', (taxid,))
79    return result.fetchall()
80
81
82def countAllGenomes(cursor):
83    result = cursor.execute('''SELECT Id, "Count" FROM taxons''')
84    return result.fetchall()
85
86
87def countGenomesInTaxon(cursor, taxId):
88    result = cursor.execute('''SELECT "Count" FROM taxons WHERE Id = ?''', (taxId,))
89    return result.fetchone()[0]
90
91
92def getDescriptiveMarkers(cursor, markers):
93    result = cursor.execute('''SELECT Acc, Name FROM markers WHERE Id = ?''', markers)
94    return result.fetchone()
95
96
97def doWork(args):
98    taxon_ranks = parseTaxonomy(args.taxonomy)
99    con = sqlite3.connect(args.database)
100    cur = con.cursor()
101
102    taxon_ids = getTaxonId(cur, *taxon_ranks)
103    if len(taxon_ids) > 1:
104        raise RuntimeError("Taxon string returns more than one lineage "\
105                "please be more specific")
106    else:
107        tax_id, tax_count = taxon_ids[0]
108        all_markers = getMarkersFromTaxon(cur, tax_id)
109        marker_in_taxon_mapping = {}
110        for (Id, count) in all_markers:
111            if float(count) / float(tax_count) >= args.include:
112                marker_in_taxon_mapping[Id] = float(count) / float(tax_count)
113
114        opposite_taxons = getOppositeRankInspecificTaxonId(cur, *taxon_ranks)
115        markers_from_others = defaultdict(int)
116        others_total_count = 0
117        for (Id, count) in opposite_taxons:
118            others_total_count += count
119            this_taxon_count = getMarkersFromTaxon(cur, Id)
120            for (Id, count) in this_taxon_count:
121                markers_from_others[Id] += count
122
123        descriptive_markers = []
124        for marker_id, _ in marker_in_taxon_mapping.items():
125            if marker_id in markers_from_others:
126                fraction_in_others = float(markers_from_others[marker_id]) / float(others_total_count)
127                if fraction_in_others <= args.exclude:
128                    descriptive_markers.append((marker_id,))
129            else:
130                # not found in anything else
131                descriptive_markers.append((marker_id,))
132
133        des_markers = []
134        for i in descriptive_markers:
135            des_markers.append(getDescriptiveMarkers(cur, i))
136
137        for des_acc, des_name in des_markers:
138            print des_acc, des_name
139
140if __name__ == '__main__':
141
142    parser = argparse.ArgumentParser()
143
144    parser.add_argument('-t', '--taxon-string',
145            default='k__Bacteria;p__Proteobacteria',
146            dest='taxonomy', help="specify the taxonomy")
147    parser.add_argument('-i', '--inclusive-percent', dest='include',
148            default=0.95, help="The required percentage of member of the "\
149            "specified taxonomy must have a particular marker")
150    parser.add_argument('-e', '--exclusive-percent', dest='exclude',
151            default=0.05, help="The maximum prevelence of the marker in "\
152            "all other non-target taxons")
153    parser.add_argument('-d', '--database', dest='database',
154        default='markers.db', help='specify path to database')
155
156    # parse the arguments
157    args = parser.parse_args()
158
159    # do what we came here to do
160    doWork(args)
161