1#!/usr/bin/env python
2
3###############################################################################
4#                                                                             #
5#    This program is free software: you can redistribute it and/or modify     #
6#    it under the terms of the GNU General Public License as published by     #
7#    the Free Software Foundation, either version 3 of the License, or        #
8#    (at your option) any later version.                                      #
9#                                                                             #
10#    This program is distributed in the hope that it will be useful,          #
11#    but WITHOUT ANY WARRANTY; without even the implied warranty of           #
12#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            #
13#    GNU General Public License for more details.                             #
14#                                                                             #
15#    You should have received a copy of the GNU General Public License        #
16#    along with this program. If not, see <http://www.gnu.org/licenses/>.     #
17#                                                                             #
18###############################################################################
19
20"""
21Perform simulation to show that marker sets give better completion estimations
22  compared to marker genes.
23"""
24
25__author__ = 'Donovan Parks'
26__copyright__ = 'Copyright 2013'
27__credits__ = ['Donovan Parks']
28__license__ = 'GPL3'
29__version__ = '1.0.0'
30__maintainer__ = 'Donovan Parks'
31__email__ = 'donovan.parks@gmail.com'
32__status__ = 'Development'
33
34import argparse
35import random
36
37from lib.img import IMG
38from lib.taxonomyUtils import ranksByLabel
39from lib.plots.boxplot import BoxPlot
40
41class SimMarkerGenesVsMarkerSets(object):
42    def __init__(self):
43        pass
44
45    def run(self, taxonomyStr, ubiquityThreshold, singleCopyThreshold, percentCompletion, numReplicates, numGenomes, contigLen):
46        img = IMG()
47
48        genomeIds = img.genomeIdsByTaxonomy(taxonomyStr, 'Final')
49        print '\nLineage ' + taxonomyStr + ' contains ' + str(len(genomeIds)) + ' genomes.'
50
51        # build marker genes and colocated marker sets
52        countTable = img.countTable(genomeIds)
53        markerGenes = img.markerGenes(genomeIds, countTable, ubiquityThreshold*len(genomeIds), singleCopyThreshold*len(genomeIds))
54        print '  Marker genes: ' + str(len(markerGenes))
55
56        geneDistTable = img.geneDistTable(genomeIds, markerGenes, spacingBetweenContigs=1e6)
57        colocatedGenes = img.colocatedGenes(geneDistTable)
58        colocatedSets = img.colocatedSets(colocatedGenes, markerGenes)
59        print '  Co-located gene sets: ' + str(len(colocatedSets))
60
61
62        # random sample genomes
63        if numGenomes == -1:
64            rndGenomeIds = genomeIds
65        else:
66            rndGenomeIds = random.sample(genomeIds, numGenomes)
67
68        # estimate completion for each genome using both the marker genes and marker sets
69        metadata = img.genomeMetadata('Final')
70        plotLabels = []
71        plotData = []
72        for genomeId in rndGenomeIds:
73            mgCompletion = []
74            msCompletion = []
75            for _ in xrange(0, numReplicates):
76                startPartialGenomeContigs = img.sampleGenome(metadata[genomeId]['genome size'], percentCompletion, contigLen)
77
78                # calculate completion with marker genes
79                containedMarkerGenes = img.containedMarkerGenes(markerGenes, geneDistTable[genomeId], startPartialGenomeContigs, contigLen)
80                mgCompletion.append(float(len(containedMarkerGenes))/len(markerGenes) - percentCompletion)
81
82                # calculate completion with marker set
83                comp = 0.0
84                for cs in colocatedSets:
85                    present = 0
86                    for contigId in cs:
87                        if contigId in containedMarkerGenes:
88                            present += 1
89
90                    comp += float(present) / len(cs)
91                msCompletion.append(comp / len(colocatedSets) - percentCompletion)
92
93            plotData.append(mgCompletion)
94            plotData.append(msCompletion)
95
96            species = ' '.join(metadata[genomeId]['taxonomy'][ranksByLabel['Genus']:])
97
98            plotLabels.append(species + ' (' + genomeId + ')')
99            plotLabels.append('')
100
101        # plot data
102        boxPlot = BoxPlot()
103        plotFilename = './images/sim.MGvsMS.' + taxonomyStr.replace(';','_') + '.' + str(percentCompletion) + '.errorbar.png'
104        title = taxonomyStr.replace(';', '; ') + '\n' + 'Percent completion = %.2f' % percentCompletion
105        boxPlot.plot(plotFilename, plotData, plotLabels, r'$\Delta$' + ' Percent Completion', '', False, title)
106
107if __name__ == '__main__':
108    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
109
110    parser.add_argument('-T', '--taxonomy', help='IMG taxonomy string indicating lineage of interest', default = 'prokaryotes')
111    parser.add_argument('-u', '--ubiquity', help='Ubiquity threshold for defining marker set', type=float, default = 0.97)
112    parser.add_argument('-s', '--single_copy', help='Single-copy threshold for defining marker set', type=float, default = 0.97)
113    parser.add_argument('-p', '--percent_complete', help='Percent completion to simulate', type=float, default = 0.75)
114    parser.add_argument('-r', '--replicates', help='Replicates per genome.', type=int, default = 100)
115    parser.add_argument('-g', '--num_genomes', help='Number of random genomes to consider (-1 for all)', type=int, default = 20)
116    parser.add_argument('-c', '--contig_len', help='Length of contigs to simulate', type=int, default = 5000)
117
118    args = parser.parse_args()
119
120    simMarkerGenesVsMarkerSets = SimMarkerGenesVsMarkerSets()
121    simMarkerGenesVsMarkerSets.run(args.taxonomy, args.ubiquity, args.single_copy, args.percent_complete, args.replicates, args.num_genomes, args.contig_len)
122