1#!/usr/bin/env python 2 3############################################################################### 4# # 5# This program is free software: you can redistribute it and/or modify # 6# it under the terms of the GNU General Public License as published by # 7# the Free Software Foundation, either version 3 of the License, or # 8# (at your option) any later version. # 9# # 10# This program is distributed in the hope that it will be useful, # 11# but WITHOUT ANY WARRANTY; without even the implied warranty of # 12# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # 13# GNU General Public License for more details. # 14# # 15# You should have received a copy of the GNU General Public License # 16# along with this program. If not, see <http://www.gnu.org/licenses/>. # 17# # 18############################################################################### 19 20""" 21Perform simulation to show that marker sets give better completion estimations 22 compared to marker genes. 23""" 24 25__author__ = 'Donovan Parks' 26__copyright__ = 'Copyright 2013' 27__credits__ = ['Donovan Parks'] 28__license__ = 'GPL3' 29__version__ = '1.0.0' 30__maintainer__ = 'Donovan Parks' 31__email__ = 'donovan.parks@gmail.com' 32__status__ = 'Development' 33 34import argparse 35import random 36 37from lib.img import IMG 38from lib.taxonomyUtils import ranksByLabel 39from lib.plots.boxplot import BoxPlot 40 41class SimMarkerGenesVsMarkerSets(object): 42 def __init__(self): 43 pass 44 45 def run(self, taxonomyStr, ubiquityThreshold, singleCopyThreshold, percentCompletion, numReplicates, numGenomes, contigLen): 46 img = IMG() 47 48 genomeIds = img.genomeIdsByTaxonomy(taxonomyStr, 'Final') 49 print '\nLineage ' + taxonomyStr + ' contains ' + str(len(genomeIds)) + ' genomes.' 50 51 # build marker genes and colocated marker sets 52 countTable = img.countTable(genomeIds) 53 markerGenes = img.markerGenes(genomeIds, countTable, ubiquityThreshold*len(genomeIds), singleCopyThreshold*len(genomeIds)) 54 print ' Marker genes: ' + str(len(markerGenes)) 55 56 geneDistTable = img.geneDistTable(genomeIds, markerGenes, spacingBetweenContigs=1e6) 57 colocatedGenes = img.colocatedGenes(geneDistTable) 58 colocatedSets = img.colocatedSets(colocatedGenes, markerGenes) 59 print ' Co-located gene sets: ' + str(len(colocatedSets)) 60 61 62 # random sample genomes 63 if numGenomes == -1: 64 rndGenomeIds = genomeIds 65 else: 66 rndGenomeIds = random.sample(genomeIds, numGenomes) 67 68 # estimate completion for each genome using both the marker genes and marker sets 69 metadata = img.genomeMetadata('Final') 70 plotLabels = [] 71 plotData = [] 72 for genomeId in rndGenomeIds: 73 mgCompletion = [] 74 msCompletion = [] 75 for _ in xrange(0, numReplicates): 76 startPartialGenomeContigs = img.sampleGenome(metadata[genomeId]['genome size'], percentCompletion, contigLen) 77 78 # calculate completion with marker genes 79 containedMarkerGenes = img.containedMarkerGenes(markerGenes, geneDistTable[genomeId], startPartialGenomeContigs, contigLen) 80 mgCompletion.append(float(len(containedMarkerGenes))/len(markerGenes) - percentCompletion) 81 82 # calculate completion with marker set 83 comp = 0.0 84 for cs in colocatedSets: 85 present = 0 86 for contigId in cs: 87 if contigId in containedMarkerGenes: 88 present += 1 89 90 comp += float(present) / len(cs) 91 msCompletion.append(comp / len(colocatedSets) - percentCompletion) 92 93 plotData.append(mgCompletion) 94 plotData.append(msCompletion) 95 96 species = ' '.join(metadata[genomeId]['taxonomy'][ranksByLabel['Genus']:]) 97 98 plotLabels.append(species + ' (' + genomeId + ')') 99 plotLabels.append('') 100 101 # plot data 102 boxPlot = BoxPlot() 103 plotFilename = './images/sim.MGvsMS.' + taxonomyStr.replace(';','_') + '.' + str(percentCompletion) + '.errorbar.png' 104 title = taxonomyStr.replace(';', '; ') + '\n' + 'Percent completion = %.2f' % percentCompletion 105 boxPlot.plot(plotFilename, plotData, plotLabels, r'$\Delta$' + ' Percent Completion', '', False, title) 106 107if __name__ == '__main__': 108 parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 109 110 parser.add_argument('-T', '--taxonomy', help='IMG taxonomy string indicating lineage of interest', default = 'prokaryotes') 111 parser.add_argument('-u', '--ubiquity', help='Ubiquity threshold for defining marker set', type=float, default = 0.97) 112 parser.add_argument('-s', '--single_copy', help='Single-copy threshold for defining marker set', type=float, default = 0.97) 113 parser.add_argument('-p', '--percent_complete', help='Percent completion to simulate', type=float, default = 0.75) 114 parser.add_argument('-r', '--replicates', help='Replicates per genome.', type=int, default = 100) 115 parser.add_argument('-g', '--num_genomes', help='Number of random genomes to consider (-1 for all)', type=int, default = 20) 116 parser.add_argument('-c', '--contig_len', help='Length of contigs to simulate', type=int, default = 5000) 117 118 args = parser.parse_args() 119 120 simMarkerGenesVsMarkerSets = SimMarkerGenesVsMarkerSets() 121 simMarkerGenesVsMarkerSets.run(args.taxonomy, args.ubiquity, args.single_copy, args.percent_complete, args.replicates, args.num_genomes, args.contig_len) 122