1#!/usr/bin/env python 2 3############################################################################### 4# # 5# This program is free software: you can redistribute it and/or modify # 6# it under the terms of the GNU General Public License as published by # 7# the Free Software Foundation, either version 3 of the License, or # 8# (at your option) any later version. # 9# # 10# This program is distributed in the hope that it will be useful, # 11# but WITHOUT ANY WARRANTY; without even the implied warranty of # 12# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # 13# GNU General Public License for more details. # 14# # 15# You should have received a copy of the GNU General Public License # 16# along with this program. If not, see <http://www.gnu.org/licenses/>. # 17# # 18############################################################################### 19 20""" 21Calculate marker set for varying lineages, number of genomes, and marker set parameters. 22""" 23 24__author__ = 'Donovan Parks' 25__copyright__ = 'Copyright 2013' 26__credits__ = ['Donovan Parks'] 27__license__ = 'GPL3' 28__version__ = '1.0.0' 29__maintainer__ = 'Donovan Parks' 30__email__ = 'donovan.parks@gmail.com' 31__status__ = 'Development' 32 33import sys 34import argparse 35import random 36 37from numpy import mean, std, arange 38 39from lib.img import IMG 40from lib.markerSet import MarkerGenes 41from lib.plots.errorbar import ErrorBar 42from lib.plots.boxplot import BoxPlot 43 44class MarkerSetTest(object): 45 def __init__(self): 46 pass 47 48 def run(self, taxonomyStr, ubiquityThreshold, singleCopyThreshold, replicates, minGenomes, maxGenomes, stepSize): 49 img = IMG() 50 markergenes = MarkerGenes() 51 52 genomeIds = img.genomeIdsByTaxonomy(taxonomyStr, 'Final') 53 54 print 'Lineage ' + taxonomyStr + ' contains ' + str(len(genomeIds)) + ' genomes.' 55 if len(genomeIds) < minGenomes: 56 sys.stderr.write('[Error] Insufficent number of genomes.\n') 57 sys.exit() 58 59 print '' 60 print 'Ubiquity threshold: ' + str(ubiquityThreshold) 61 print 'Single-copy threshold: ' + str(singleCopyThreshold) 62 63 meanMarkerSetSize = [] 64 stdMarkerSetSize = [] 65 markerSetSizes = [] 66 if maxGenomes == -1: 67 maxGenomes = len(genomeIds) 68 69 if maxGenomes > len(genomeIds): 70 maxGenomes = len(genomeIds) 71 72 countTable = img.countTable(genomeIds) 73 countTable = img.filterTable(genomeIds, countTable) 74 75 for numGenomes in xrange(minGenomes, maxGenomes, stepSize): 76 markerSetSize = [] 77 for _ in xrange(0, replicates): 78 genomeIdSubset = random.sample(genomeIds, numGenomes) 79 80 markerGenes = markergenes.identify(genomeIdSubset, countTable, ubiquityThreshold*len(genomeIdSubset), singleCopyThreshold*len(genomeIdSubset)) 81 geneDistTable = img.geneDistTable(genomeIdSubset, markerGenes, spacingBetweenContigs=1e6) 82 colocatedGenes = img.colocatedGenes(geneDistTable) 83 colocatedSets = img.colocatedSets(colocatedGenes, markerGenes) 84 85 markerSetSize.append(len(colocatedSets)) 86 87 markerSetSizes.append(markerSetSize) 88 89 m = mean(markerSetSize) 90 meanMarkerSetSize.append(m) 91 92 s = std(markerSetSize) 93 stdMarkerSetSize.append(s) 94 95 print '' 96 print 'Genomes: ' + str(numGenomes) + ', Ubiquity > ' + str(int(ubiquityThreshold*len(genomeIdSubset))) + ', Single-copy > ' + str(int(singleCopyThreshold*len(genomeIdSubset))) 97 print 'Mean: %.2f +/- %.2f' % (m, s) 98 print 'Min: %d, Max: %d' %(min(markerSetSize), max(markerSetSize)) 99 100 # plot data 101 errorBar = ErrorBar() 102 plotFilename = './images/markerset.' + taxonomyStr.replace(';','_') + '.' + str(ubiquityThreshold) + '-' + str(singleCopyThreshold) + '.errorbar.png' 103 title = taxonomyStr.replace(';', '; ') + '\n' + 'Ubiquity = %.2f' % ubiquityThreshold + ', Single-copy = %.2f' % singleCopyThreshold 104 errorBar.plot(plotFilename, arange(minGenomes, maxGenomes, stepSize), meanMarkerSetSize, stdMarkerSetSize, 'Number of Genomes', 'Marker Set Size', title) 105 106 boxPlot = BoxPlot() 107 plotFilename = './images/markerset.' + taxonomyStr.replace(';','_') + '.' + str(ubiquityThreshold) + '-' + str(singleCopyThreshold) + '.boxplot.png' 108 boxPlot.plot(plotFilename, markerSetSizes, arange(minGenomes, maxGenomes, stepSize), 'Number of Genomes', 'Marker Set Size', True, title) 109 110if __name__ == '__main__': 111 parser = argparse.ArgumentParser(description="Plot change in marker set size for varying subsets of genomes.", 112 formatter_class=argparse.ArgumentDefaultsHelpFormatter) 113 114 parser.add_argument('-T', '--taxonomy', help='IMG taxonomy string indicating lineage of interest', default = 'universal') 115 parser.add_argument('-u', '--ubiquity', help='Ubiquity threshold for defining marker set', type=float, default = 0.97) 116 parser.add_argument('-s', '--single_copy', help='Single-copy threshold for defining marker set', type=float, default = 0.97) 117 parser.add_argument('-r', '--replicates', help='Resampling replicates to perform for varying genomes sets', type=int, default=100) 118 parser.add_argument('-i', '--min_genomes', help='Minimum genomes to consider when building marker set', type=int, default=10) 119 parser.add_argument('-a', '--max_genomes', help='Maximum genomes to consider when building marker set, use -1 for all genomes', type=int, default=200) 120 parser.add_argument('-x', '--step_size', help='Step size for increasing number of genomes under consideration', type=int, default=5) 121 122 args = parser.parse_args() 123 124 markerSetTest = MarkerSetTest() 125 markerSetTest.run(args.taxonomy, args.ubiquity, args.single_copy, args.replicates, args.min_genomes, args.max_genomes, args.step_size) 126