1#!/usr/bin/env python 2 3############################################################################### 4# # 5# This program is free software: you can redistribute it and/or modify # 6# it under the terms of the GNU General Public License as published by # 7# the Free Software Foundation, either version 3 of the License, or # 8# (at your option) any later version. # 9# # 10# This program is distributed in the hope that it will be useful, # 11# but WITHOUT ANY WARRANTY; without even the implied warranty of # 12# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # 13# GNU General Public License for more details. # 14# # 15# You should have received a copy of the GNU General Public License # 16# along with this program. If not, see <http://www.gnu.org/licenses/>. # 17# # 18############################################################################### 19 20""" 21Identify genes that are consistently co-located within genomes from a specific lineage. 22""" 23 24__author__ = 'Donovan Parks' 25__copyright__ = 'Copyright 2013' 26__credits__ = ['Donovan Parks'] 27__license__ = 'GPL3' 28__version__ = '1.0.0' 29__maintainer__ = 'Donovan Parks' 30__email__ = 'donovan.parks@gmail.com' 31__status__ = 'Development' 32 33import argparse 34 35from lib.img import IMG 36from lib.markerSet import MarkerSet 37 38class IdentifyColocatedGenes(object): 39 def __init__(self): 40 pass 41 42 def run(self, ubiquityThreshold, singleCopyThreshold, minGenomes, minMarkers, mostSpecificRank, distThreshold, genomeThreshold): 43 img = IMG() 44 markerset = MarkerSet() 45 46 lineages = img.lineagesSorted(mostSpecificRank) 47 48 fout = open('./data/colocated.tsv', 'w', 1) 49 fout.write('Lineage\t# genomes\t# markers\t# co-located sets\tCo-located markers\n') 50 51 lineageCount = 0 52 for lineage in lineages: 53 lineageCount += 1 54 55 genomeIds = img.genomeIdsByTaxonomy(lineage, 'Final') 56 if len(genomeIds) < minGenomes: 57 continue 58 59 countTable = img.countTable(genomeIds) 60 markerGenes = markerset.markerGenes(genomeIds, countTable, ubiquityThreshold*len(genomeIds), singleCopyThreshold*len(genomeIds)) 61 62 geneDistTable = img.geneDistTable(genomeIds, markerGenes) 63 colocatedGenes = markerset.colocatedGenes(geneDistTable, distThreshold, genomeThreshold) 64 colocatedSets = markerset.colocatedSets(colocatedGenes, markerGenes) 65 if len(colocatedSets) < minMarkers: 66 continue 67 68 print '\nLineage ' + lineage + ' contains ' + str(len(genomeIds)) + ' genomes (' + str(lineageCount) + ' of ' + str(len(lineages)) + ').' 69 print ' Marker genes: ' + str(len(markerGenes)) 70 print ' Co-located gene sets: ' + str(len(colocatedSets)) 71 72 fout.write(lineage + '\t' + str(len(genomeIds)) + '\t' + str(len(markerGenes)) + '\t' + str(len(colocatedSets))) 73 for cs in colocatedSets: 74 fout.write('\t' + ', '.join(cs)) 75 fout.write('\n') 76 77 fout.close() 78 79if __name__ == '__main__': 80 parser = argparse.ArgumentParser(description="Identify co-located genes within genomes for a specific lineage.", 81 formatter_class=argparse.ArgumentDefaultsHelpFormatter) 82 83 parser.add_argument('-u', '--ubiquity', help='Ubiquity threshold for defining marker set', type=float, default = 0.97) 84 parser.add_argument('-s', '--single_copy', help='Single-copy threshold for defining marker set', type=float, default = 0.97) 85 parser.add_argument('-m', '--min_genomes', help='Minimum genomes required to include in analysis', type=int, default = 40) 86 parser.add_argument('-x', '--min_markers', help='Minimum markers required to include in analysis', type=int, default = 20) 87 parser.add_argument('-r', '--most_specific_rank', help='Most specific rank to include in analysis', type=int, default = 5) 88 parser.add_argument('-d', '--distance_threshold', help='Distance, as percentage of genome length, to be considered co-located', type=float, default=0.001) 89 parser.add_argument('-g', '--genome_threshold', help='Percentage of genomes required to be considered co-located', type=float, default=0.95) 90 91 args = parser.parse_args() 92 93 identifyColocatedGenes = IdentifyColocatedGenes() 94 identifyColocatedGenes.run(args.ubiquity, args.single_copy, args.min_genomes, args.min_markers, args.most_specific_rank, args.distance_threshold, args.genome_threshold) 95