1#!/usr/bin/env python
2
3###############################################################################
4#                                                                             #
5#    This program is free software: you can redistribute it and/or modify     #
6#    it under the terms of the GNU General Public License as published by     #
7#    the Free Software Foundation, either version 3 of the License, or        #
8#    (at your option) any later version.                                      #
9#                                                                             #
10#    This program is distributed in the hope that it will be useful,          #
11#    but WITHOUT ANY WARRANTY; without even the implied warranty of           #
12#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            #
13#    GNU General Public License for more details.                             #
14#                                                                             #
15#    You should have received a copy of the GNU General Public License        #
16#    along with this program. If not, see <http://www.gnu.org/licenses/>.     #
17#                                                                             #
18###############################################################################
19
20"""
21Calculate marker set for varying lineages, number of genomes, and marker set parameters.
22"""
23
24__author__ = 'Donovan Parks'
25__copyright__ = 'Copyright 2013'
26__credits__ = ['Donovan Parks']
27__license__ = 'GPL3'
28__version__ = '1.0.0'
29__maintainer__ = 'Donovan Parks'
30__email__ = 'donovan.parks@gmail.com'
31__status__ = 'Development'
32
33import sys
34import argparse
35import random
36
37from numpy import mean, std, arange
38
39from lib.img import IMG
40from lib.markerSet import MarkerGenes
41from lib.plots.errorbar import ErrorBar
42from lib.plots.boxplot import BoxPlot
43
44class MarkerSetTest(object):
45    def __init__(self):
46        pass
47
48    def run(self, taxonomyStr, ubiquityThreshold, singleCopyThreshold, replicates, minGenomes, maxGenomes, stepSize):
49        img = IMG()
50        markergenes = MarkerGenes()
51
52        genomeIds = img.genomeIdsByTaxonomy(taxonomyStr, 'Final')
53
54        print 'Lineage ' + taxonomyStr + ' contains ' + str(len(genomeIds)) + ' genomes.'
55        if len(genomeIds) < minGenomes:
56            sys.stderr.write('[Error] Insufficent number of genomes.\n')
57            sys.exit()
58
59        print ''
60        print 'Ubiquity threshold: ' + str(ubiquityThreshold)
61        print 'Single-copy threshold: ' + str(singleCopyThreshold)
62
63        meanMarkerSetSize = []
64        stdMarkerSetSize = []
65        markerSetSizes = []
66        if maxGenomes == -1:
67            maxGenomes = len(genomeIds)
68
69        if maxGenomes > len(genomeIds):
70            maxGenomes = len(genomeIds)
71
72        countTable = img.countTable(genomeIds)
73        countTable = img.filterTable(genomeIds, countTable)
74
75        for numGenomes in xrange(minGenomes, maxGenomes, stepSize):
76            markerSetSize = []
77            for _ in xrange(0, replicates):
78                genomeIdSubset = random.sample(genomeIds, numGenomes)
79
80                markerGenes = markergenes.identify(genomeIdSubset, countTable, ubiquityThreshold*len(genomeIdSubset), singleCopyThreshold*len(genomeIdSubset))
81                geneDistTable = img.geneDistTable(genomeIdSubset, markerGenes, spacingBetweenContigs=1e6)
82                colocatedGenes = img.colocatedGenes(geneDistTable)
83                colocatedSets = img.colocatedSets(colocatedGenes, markerGenes)
84
85                markerSetSize.append(len(colocatedSets))
86
87            markerSetSizes.append(markerSetSize)
88
89            m = mean(markerSetSize)
90            meanMarkerSetSize.append(m)
91
92            s = std(markerSetSize)
93            stdMarkerSetSize.append(s)
94
95            print ''
96            print 'Genomes: ' + str(numGenomes) + ', Ubiquity > ' + str(int(ubiquityThreshold*len(genomeIdSubset))) + ', Single-copy > ' + str(int(singleCopyThreshold*len(genomeIdSubset)))
97            print 'Mean: %.2f +/- %.2f' % (m, s)
98            print 'Min: %d, Max: %d' %(min(markerSetSize), max(markerSetSize))
99
100        # plot data
101        errorBar = ErrorBar()
102        plotFilename = './images/markerset.' + taxonomyStr.replace(';','_') + '.' + str(ubiquityThreshold) + '-' + str(singleCopyThreshold) +  '.errorbar.png'
103        title = taxonomyStr.replace(';', '; ') + '\n' + 'Ubiquity = %.2f' % ubiquityThreshold + ', Single-copy = %.2f' % singleCopyThreshold
104        errorBar.plot(plotFilename, arange(minGenomes, maxGenomes, stepSize), meanMarkerSetSize, stdMarkerSetSize, 'Number of Genomes', 'Marker Set Size', title)
105
106        boxPlot = BoxPlot()
107        plotFilename = './images/markerset.' + taxonomyStr.replace(';','_') + '.' + str(ubiquityThreshold) + '-' + str(singleCopyThreshold) +  '.boxplot.png'
108        boxPlot.plot(plotFilename, markerSetSizes, arange(minGenomes, maxGenomes, stepSize), 'Number of Genomes', 'Marker Set Size', True, title)
109
110if __name__ == '__main__':
111    parser = argparse.ArgumentParser(description="Plot change in marker set size for varying subsets of genomes.",
112                                      formatter_class=argparse.ArgumentDefaultsHelpFormatter)
113
114    parser.add_argument('-T', '--taxonomy', help='IMG taxonomy string indicating lineage of interest', default = 'universal')
115    parser.add_argument('-u', '--ubiquity', help='Ubiquity threshold for defining marker set', type=float, default = 0.97)
116    parser.add_argument('-s', '--single_copy', help='Single-copy threshold for defining marker set', type=float, default = 0.97)
117    parser.add_argument('-r', '--replicates', help='Resampling replicates to perform for varying genomes sets', type=int, default=100)
118    parser.add_argument('-i', '--min_genomes', help='Minimum genomes to consider when building marker set', type=int, default=10)
119    parser.add_argument('-a', '--max_genomes', help='Maximum genomes to consider when building marker set, use -1 for all genomes', type=int, default=200)
120    parser.add_argument('-x', '--step_size', help='Step size for increasing number of genomes under consideration', type=int, default=5)
121
122    args = parser.parse_args()
123
124    markerSetTest = MarkerSetTest()
125    markerSetTest.run(args.taxonomy, args.ubiquity, args.single_copy, args.replicates, args.min_genomes, args.max_genomes, args.step_size)
126