1#!/usr/bin/env python
2
3###############################################################################
4#                                                                             #
5#    This program is free software: you can redistribute it and/or modify     #
6#    it under the terms of the GNU General Public License as published by     #
7#    the Free Software Foundation, either version 3 of the License, or        #
8#    (at your option) any later version.                                      #
9#                                                                             #
10#    This program is distributed in the hope that it will be useful,          #
11#    but WITHOUT ANY WARRANTY; without even the implied warranty of           #
12#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            #
13#    GNU General Public License for more details.                             #
14#                                                                             #
15#    You should have received a copy of the GNU General Public License        #
16#    along with this program. If not, see <http://www.gnu.org/licenses/>.     #
17#                                                                             #
18###############################################################################
19
20__prog_desc__ = 'identify genomes suitable for calculating marker genes'
21
22__author__ = 'Donovan Parks'
23__copyright__ = 'Copyright 2013'
24__credits__ = ['Donovan Parks']
25__license__ = 'GPL3'
26__version__ = '0.1'
27__maintainer__ = 'Donovan Parks'
28__email__ = 'donovan.parks@gmail.com'
29__status__ = 'Development'
30
31import os
32import argparse
33from collections import defaultdict
34
35from checkm.lib.img import IMG
36from lib.markerSetBuilder import MarkerSetBuilder
37
38class QcGenomes(object):
39    def __init__(self):
40        pass
41
42    def __genomeString(self, genomeId, metadata, completeness, contamination, missingMarkers, duplicateMarkers):
43        genomeStr = genomeId + '\t' + '; '.join(metadata[genomeId]['taxonomy'])
44        genomeStr += '\t%.2f' % (float(metadata[genomeId]['genome size']) / 1e6)
45        genomeStr +='\t' + str(metadata[genomeId]['scaffold count'])
46        genomeStr +='\t' + str(metadata[genomeId]['gene count'])
47        genomeStr +='\t' + str(metadata[genomeId]['coding base count'])
48        genomeStr +='\t' + str(metadata[genomeId]['N50'])
49        genomeStr +='\t' + metadata[genomeId]['biotic relationships']
50        genomeStr +='\t' + metadata[genomeId]['status']
51        genomeStr +='\t%.3f\t%.3f' % (completeness, contamination)
52        genomeStr += '\t' + ', '.join(list(missingMarkers))
53        genomeStr += '\t' + ', '.join(list(duplicateMarkers)) + '\n'
54
55        return genomeStr
56
57    def run(self, inputMetadataFile, outputMetadataFile, outputDir, ubiquityThreshold, singleCopyThreshold, trustedCompleteness, trustedContamination):
58        img = IMG()
59        markerSetBuilder = MarkerSetBuilder()
60
61        allOut = open(os.path.join(outputDir, 'genomes_all.tsv'), 'w')
62        allOut.write('Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tGene count\tCoding base count\tN50\tBiotic Relationship\tStatus\tCompleteness\tContamination\tMissing markers\tDuplicate markers\n')
63
64        trustedOut = open(os.path.join(outputDir, 'genomes_trusted.tsv'), 'w')
65        trustedOut.write('Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tGene count\tCoding base count\tN50\tBiotic Relationship\tStatus\tCompleteness\tContamination\tMissing markers\tDuplicate markers\n')
66
67        filteredOut = open(os.path.join(outputDir, 'genomes_filtered.tsv'), 'w')
68        filteredOut.write('Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tGene count\tCoding base count\tN50\tBiotic Relationship\tStatus\tCompleteness\tContamination\tMissing markers\tDuplicate markers\n')
69
70        metadataOut = open(outputMetadataFile, 'w')
71
72        # read input metadata file
73        metadata = img.genomeMetadataFromFile(inputMetadataFile)
74
75        finishedGenomes = defaultdict(set)
76        allGenomes = defaultdict(set)
77
78        metadataLine = {}
79
80        bHeader = True
81        for line in open(inputMetadataFile):
82            if bHeader:
83                metadataOut.write(line)
84                bHeader = False
85                continue
86
87            lineSplit = line.split('\t')
88            genomeId = lineSplit[0]
89            domain = lineSplit[1]
90            status = lineSplit[2]
91
92            if status == 'Finished':
93                finishedGenomes[domain].add(genomeId)
94
95            allGenomes[domain].add(genomeId)
96            metadataLine[genomeId] = line
97
98        allTrustedGenomeIds = set()
99        for lineage, allLineageGenomeIds in allGenomes.iteritems():
100            print '[' + lineage + ']'
101            print '  Number of genomes: %d' % len(allLineageGenomeIds)
102
103            # tabulate genomes from each phylum
104            allPhylumCounts = {}
105            for genomeId in allLineageGenomeIds:
106                taxon = metadata[genomeId]['taxonomy'][1]
107                allPhylumCounts[taxon] = allPhylumCounts.get(taxon, 0) + 1
108
109            # identify marker genes for finished genomes
110            print '\nDetermining initial marker gene sets for genome filtering.'
111            markerSet = markerSetBuilder.buildMarkerSet(finishedGenomes[lineage], ubiquityThreshold, singleCopyThreshold)
112
113            print '  Marker set consists of %s marker genes organized into %d sets.' % (markerSet.numMarkers(), markerSet.numSets())
114            fout = open(os.path.join(outputDir, 'trusted_marker_sets_' + lineage + '.txt'), 'w')
115            fout.write(str(markerSet.markerSet))
116            fout.close()
117
118            # identifying trusted genomes (highly complete, low contamination genomes)
119            print '\nIdentifying highly complete, low contamination genomes.'
120            trustedGenomeIds = set()
121            filteredGenomes = set()
122            retainedStatus = {}
123            filteredStatus = {}
124            geneCountTable = img.geneCountTable(allLineageGenomeIds)
125            for genomeId in allLineageGenomeIds:
126                completeness, contamination, missingMarkers, duplicateMarkers = markerSetBuilder.genomeCheck(markerSet.markerSet, genomeId, geneCountTable)
127
128                genomeStr = self.__genomeString(genomeId, metadata, completeness, contamination, missingMarkers, duplicateMarkers)
129
130                if completeness >= trustedCompleteness and contamination <= trustedContamination:
131                    trustedGenomeIds.add(genomeId)
132                    allTrustedGenomeIds.add(genomeId)
133                    retainedStatus[metadata[genomeId]['status']] = retainedStatus.get(metadata[genomeId]['status'], 0) + 1
134
135                    trustedOut.write(genomeStr)
136                    allOut.write(genomeStr)
137
138                    metadataOut.write(metadataLine[genomeId])
139                else:
140                    filteredGenomes.add(genomeId)
141                    filteredStatus[metadata[genomeId]['status']] = filteredStatus.get(metadata[genomeId]['status'], 0) + 1
142
143                    filteredOut.write(genomeStr)
144                    allOut.write(genomeStr)
145
146            print '  Filtered genomes: %d (%.2f%%)' % (len(filteredGenomes), len(filteredGenomes)*100.0 / len(allLineageGenomeIds))
147            print '  ' + str(filteredStatus)
148            print '  \nTrusted genomes: %d (%.2f%%)' % (len(trustedGenomeIds), len(trustedGenomeIds)*100.0 / len(allLineageGenomeIds))
149            print '  ' + str(retainedStatus)
150
151            # determine status of retained genomes
152            print '\nTrusted genomes by phylum:'
153            trustedPhylumCounts = {}
154            for genomeId in trustedGenomeIds:
155                taxon = metadata[genomeId]['taxonomy'][1]
156                trustedPhylumCounts[taxon] = trustedPhylumCounts.get(taxon, 0) + 1
157
158            for phylum, count in allPhylumCounts.iteritems():
159                print '  ' + phylum + ': %d of %d' % (trustedPhylumCounts.get(phylum, 0), count)
160            print ''
161
162        allOut.close()
163        trustedOut.close()
164        filteredOut.close()
165        metadataOut.close()
166
167        # write out lineage statistics for genome distribution
168        allStats = {}
169        trustedStats = {}
170
171        for r in xrange(0, 6): # Domain to Genus
172            for genomeId, data in metadata.iteritems():
173                taxaStr = ';'.join(data['taxonomy'][0:r+1])
174                allStats[taxaStr] = allStats.get(taxaStr, 0) + 1
175                if genomeId in allTrustedGenomeIds:
176                    trustedStats[taxaStr] = trustedStats.get(taxaStr, 0) + 1
177
178        sortedLineages = img.lineagesSorted(metadata)
179
180        fout = open(os.path.join(outputDir, 'lineage_stats.tsv'), 'w')
181        fout.write('Lineage\tGenomes with metadata\tTrusted genomes\n')
182        for lineage in sortedLineages:
183            fout.write(lineage + '\t' + str(allStats.get(lineage, 0))+ '\t' + str(trustedStats.get(lineage, 0))+ '\n')
184        fout.close()
185
186if __name__ == '__main__':
187    print 'QcGenomes v' + __version__ + ': ' + __prog_desc__
188    print '  by ' + __author__ + ' (' + __email__ + ')' + '\n'
189
190    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
191    parser.add_argument('input_file', help='input IMG metadata file')
192    parser.add_argument('output_file', help='new IMG metadata file')
193    parser.add_argument('output_dir', help='output dir')
194
195    parser.add_argument('-u', '--ubiquity', help='ubiquity threshold for defining marker set', type=float, default = 0.97)
196    parser.add_argument('-s', '--single_copy', help='single-copy threshold for defining marker set', type=float, default = 0.97)
197    parser.add_argument('-c', '--trusted_completeness', help='completeness threshold for defining trusted genomes', type=float, default = 0.97)
198    parser.add_argument('-d', '--trusted_contamination', help='contamination threshold for defining trusted genomes', type=float, default = 0.03)
199
200    args = parser.parse_args()
201
202    qcGenomes = QcGenomes()
203    qcGenomes.run(args.input_file, args.output_file, args.output_dir, args.ubiquity, args.single_copy, args.trusted_completeness, args.trusted_contamination)
204