1#!/usr/bin/env python
2
3###############################################################################
4#                                                                             #
5#    This program is free software: you can redistribute it and/or modify     #
6#    it under the terms of the GNU General Public License as published by     #
7#    the Free Software Foundation, either version 3 of the License, or        #
8#    (at your option) any later version.                                      #
9#                                                                             #
10#    This program is distributed in the hope that it will be useful,          #
11#    but WITHOUT ANY WARRANTY; without even the implied warranty of           #
12#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            #
13#    GNU General Public License for more details.                             #
14#                                                                             #
15#    You should have received a copy of the GNU General Public License        #
16#    along with this program. If not, see <http://www.gnu.org/licenses/>.     #
17#                                                                             #
18###############################################################################
19
20"""
21Identify genes that are consistently co-located within genomes from a specific lineage.
22"""
23
24__author__ = 'Donovan Parks'
25__copyright__ = 'Copyright 2013'
26__credits__ = ['Donovan Parks']
27__license__ = 'GPL3'
28__version__ = '1.0.0'
29__maintainer__ = 'Donovan Parks'
30__email__ = 'donovan.parks@gmail.com'
31__status__ = 'Development'
32
33import argparse
34
35from lib.img import IMG
36from lib.markerSet import MarkerSet
37
38class IdentifyColocatedGenes(object):
39    def __init__(self):
40        pass
41
42    def run(self, ubiquityThreshold, singleCopyThreshold, minGenomes, minMarkers, mostSpecificRank, distThreshold, genomeThreshold):
43        img = IMG()
44        markerset = MarkerSet()
45
46        lineages = img.lineagesSorted(mostSpecificRank)
47
48        fout = open('./data/colocated.tsv', 'w', 1)
49        fout.write('Lineage\t# genomes\t# markers\t# co-located sets\tCo-located markers\n')
50
51        lineageCount = 0
52        for lineage in lineages:
53            lineageCount += 1
54
55            genomeIds = img.genomeIdsByTaxonomy(lineage, 'Final')
56            if len(genomeIds) < minGenomes:
57                continue
58
59            countTable = img.countTable(genomeIds)
60            markerGenes = markerset.markerGenes(genomeIds, countTable, ubiquityThreshold*len(genomeIds), singleCopyThreshold*len(genomeIds))
61
62            geneDistTable = img.geneDistTable(genomeIds, markerGenes)
63            colocatedGenes = markerset.colocatedGenes(geneDistTable, distThreshold, genomeThreshold)
64            colocatedSets = markerset.colocatedSets(colocatedGenes, markerGenes)
65            if len(colocatedSets) < minMarkers:
66                continue
67
68            print '\nLineage ' + lineage + ' contains ' + str(len(genomeIds)) + ' genomes (' + str(lineageCount) + ' of ' + str(len(lineages)) + ').'
69            print '  Marker genes: ' + str(len(markerGenes))
70            print '  Co-located gene sets: ' + str(len(colocatedSets))
71
72            fout.write(lineage + '\t' + str(len(genomeIds)) + '\t' + str(len(markerGenes)) + '\t' + str(len(colocatedSets)))
73            for cs in colocatedSets:
74                fout.write('\t' + ', '.join(cs))
75            fout.write('\n')
76
77        fout.close()
78
79if __name__ == '__main__':
80    parser = argparse.ArgumentParser(description="Identify co-located genes within genomes for a specific lineage.",
81                                      formatter_class=argparse.ArgumentDefaultsHelpFormatter)
82
83    parser.add_argument('-u', '--ubiquity', help='Ubiquity threshold for defining marker set', type=float, default = 0.97)
84    parser.add_argument('-s', '--single_copy', help='Single-copy threshold for defining marker set', type=float, default = 0.97)
85    parser.add_argument('-m', '--min_genomes', help='Minimum genomes required to include in analysis', type=int, default = 40)
86    parser.add_argument('-x', '--min_markers', help='Minimum markers required to include in analysis', type=int, default = 20)
87    parser.add_argument('-r', '--most_specific_rank', help='Most specific rank to include in analysis', type=int, default = 5)
88    parser.add_argument('-d', '--distance_threshold', help='Distance, as percentage of genome length, to be considered co-located', type=float, default=0.001)
89    parser.add_argument('-g', '--genome_threshold', help='Percentage of genomes required to be considered co-located', type=float, default=0.95)
90
91    args = parser.parse_args()
92
93    identifyColocatedGenes = IdentifyColocatedGenes()
94    identifyColocatedGenes.run(args.ubiquity, args.single_copy, args.min_genomes, args.min_markers, args.most_specific_rank, args.distance_threshold, args.genome_threshold)
95