1#!/usr/bin/env python 2 3############################################################################### 4# # 5# This program is free software: you can redistribute it and/or modify # 6# it under the terms of the GNU General Public License as published by # 7# the Free Software Foundation, either version 3 of the License, or # 8# (at your option) any later version. # 9# # 10# This program is distributed in the hope that it will be useful, # 11# but WITHOUT ANY WARRANTY; without even the implied warranty of # 12# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # 13# GNU General Public License for more details. # 14# # 15# You should have received a copy of the GNU General Public License # 16# along with this program. If not, see <http://www.gnu.org/licenses/>. # 17# # 18############################################################################### 19 20__prog_desc__ = 'identify genomes suitable for calculating marker genes' 21 22__author__ = 'Donovan Parks' 23__copyright__ = 'Copyright 2013' 24__credits__ = ['Donovan Parks'] 25__license__ = 'GPL3' 26__version__ = '0.1' 27__maintainer__ = 'Donovan Parks' 28__email__ = 'donovan.parks@gmail.com' 29__status__ = 'Development' 30 31import os 32import argparse 33from collections import defaultdict 34 35from checkm.lib.img import IMG 36from lib.markerSetBuilder import MarkerSetBuilder 37 38class QcGenomes(object): 39 def __init__(self): 40 pass 41 42 def __genomeString(self, genomeId, metadata, completeness, contamination, missingMarkers, duplicateMarkers): 43 genomeStr = genomeId + '\t' + '; '.join(metadata[genomeId]['taxonomy']) 44 genomeStr += '\t%.2f' % (float(metadata[genomeId]['genome size']) / 1e6) 45 genomeStr +='\t' + str(metadata[genomeId]['scaffold count']) 46 genomeStr +='\t' + str(metadata[genomeId]['gene count']) 47 genomeStr +='\t' + str(metadata[genomeId]['coding base count']) 48 genomeStr +='\t' + str(metadata[genomeId]['N50']) 49 genomeStr +='\t' + metadata[genomeId]['biotic relationships'] 50 genomeStr +='\t' + metadata[genomeId]['status'] 51 genomeStr +='\t%.3f\t%.3f' % (completeness, contamination) 52 genomeStr += '\t' + ', '.join(list(missingMarkers)) 53 genomeStr += '\t' + ', '.join(list(duplicateMarkers)) + '\n' 54 55 return genomeStr 56 57 def run(self, inputMetadataFile, outputMetadataFile, outputDir, ubiquityThreshold, singleCopyThreshold, trustedCompleteness, trustedContamination): 58 img = IMG() 59 markerSetBuilder = MarkerSetBuilder() 60 61 allOut = open(os.path.join(outputDir, 'genomes_all.tsv'), 'w') 62 allOut.write('Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tGene count\tCoding base count\tN50\tBiotic Relationship\tStatus\tCompleteness\tContamination\tMissing markers\tDuplicate markers\n') 63 64 trustedOut = open(os.path.join(outputDir, 'genomes_trusted.tsv'), 'w') 65 trustedOut.write('Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tGene count\tCoding base count\tN50\tBiotic Relationship\tStatus\tCompleteness\tContamination\tMissing markers\tDuplicate markers\n') 66 67 filteredOut = open(os.path.join(outputDir, 'genomes_filtered.tsv'), 'w') 68 filteredOut.write('Genome Id\tLineage\tGenome size (Mbps)\tScaffold count\tGene count\tCoding base count\tN50\tBiotic Relationship\tStatus\tCompleteness\tContamination\tMissing markers\tDuplicate markers\n') 69 70 metadataOut = open(outputMetadataFile, 'w') 71 72 # read input metadata file 73 metadata = img.genomeMetadataFromFile(inputMetadataFile) 74 75 finishedGenomes = defaultdict(set) 76 allGenomes = defaultdict(set) 77 78 metadataLine = {} 79 80 bHeader = True 81 for line in open(inputMetadataFile): 82 if bHeader: 83 metadataOut.write(line) 84 bHeader = False 85 continue 86 87 lineSplit = line.split('\t') 88 genomeId = lineSplit[0] 89 domain = lineSplit[1] 90 status = lineSplit[2] 91 92 if status == 'Finished': 93 finishedGenomes[domain].add(genomeId) 94 95 allGenomes[domain].add(genomeId) 96 metadataLine[genomeId] = line 97 98 allTrustedGenomeIds = set() 99 for lineage, allLineageGenomeIds in allGenomes.iteritems(): 100 print '[' + lineage + ']' 101 print ' Number of genomes: %d' % len(allLineageGenomeIds) 102 103 # tabulate genomes from each phylum 104 allPhylumCounts = {} 105 for genomeId in allLineageGenomeIds: 106 taxon = metadata[genomeId]['taxonomy'][1] 107 allPhylumCounts[taxon] = allPhylumCounts.get(taxon, 0) + 1 108 109 # identify marker genes for finished genomes 110 print '\nDetermining initial marker gene sets for genome filtering.' 111 markerSet = markerSetBuilder.buildMarkerSet(finishedGenomes[lineage], ubiquityThreshold, singleCopyThreshold) 112 113 print ' Marker set consists of %s marker genes organized into %d sets.' % (markerSet.numMarkers(), markerSet.numSets()) 114 fout = open(os.path.join(outputDir, 'trusted_marker_sets_' + lineage + '.txt'), 'w') 115 fout.write(str(markerSet.markerSet)) 116 fout.close() 117 118 # identifying trusted genomes (highly complete, low contamination genomes) 119 print '\nIdentifying highly complete, low contamination genomes.' 120 trustedGenomeIds = set() 121 filteredGenomes = set() 122 retainedStatus = {} 123 filteredStatus = {} 124 geneCountTable = img.geneCountTable(allLineageGenomeIds) 125 for genomeId in allLineageGenomeIds: 126 completeness, contamination, missingMarkers, duplicateMarkers = markerSetBuilder.genomeCheck(markerSet.markerSet, genomeId, geneCountTable) 127 128 genomeStr = self.__genomeString(genomeId, metadata, completeness, contamination, missingMarkers, duplicateMarkers) 129 130 if completeness >= trustedCompleteness and contamination <= trustedContamination: 131 trustedGenomeIds.add(genomeId) 132 allTrustedGenomeIds.add(genomeId) 133 retainedStatus[metadata[genomeId]['status']] = retainedStatus.get(metadata[genomeId]['status'], 0) + 1 134 135 trustedOut.write(genomeStr) 136 allOut.write(genomeStr) 137 138 metadataOut.write(metadataLine[genomeId]) 139 else: 140 filteredGenomes.add(genomeId) 141 filteredStatus[metadata[genomeId]['status']] = filteredStatus.get(metadata[genomeId]['status'], 0) + 1 142 143 filteredOut.write(genomeStr) 144 allOut.write(genomeStr) 145 146 print ' Filtered genomes: %d (%.2f%%)' % (len(filteredGenomes), len(filteredGenomes)*100.0 / len(allLineageGenomeIds)) 147 print ' ' + str(filteredStatus) 148 print ' \nTrusted genomes: %d (%.2f%%)' % (len(trustedGenomeIds), len(trustedGenomeIds)*100.0 / len(allLineageGenomeIds)) 149 print ' ' + str(retainedStatus) 150 151 # determine status of retained genomes 152 print '\nTrusted genomes by phylum:' 153 trustedPhylumCounts = {} 154 for genomeId in trustedGenomeIds: 155 taxon = metadata[genomeId]['taxonomy'][1] 156 trustedPhylumCounts[taxon] = trustedPhylumCounts.get(taxon, 0) + 1 157 158 for phylum, count in allPhylumCounts.iteritems(): 159 print ' ' + phylum + ': %d of %d' % (trustedPhylumCounts.get(phylum, 0), count) 160 print '' 161 162 allOut.close() 163 trustedOut.close() 164 filteredOut.close() 165 metadataOut.close() 166 167 # write out lineage statistics for genome distribution 168 allStats = {} 169 trustedStats = {} 170 171 for r in xrange(0, 6): # Domain to Genus 172 for genomeId, data in metadata.iteritems(): 173 taxaStr = ';'.join(data['taxonomy'][0:r+1]) 174 allStats[taxaStr] = allStats.get(taxaStr, 0) + 1 175 if genomeId in allTrustedGenomeIds: 176 trustedStats[taxaStr] = trustedStats.get(taxaStr, 0) + 1 177 178 sortedLineages = img.lineagesSorted(metadata) 179 180 fout = open(os.path.join(outputDir, 'lineage_stats.tsv'), 'w') 181 fout.write('Lineage\tGenomes with metadata\tTrusted genomes\n') 182 for lineage in sortedLineages: 183 fout.write(lineage + '\t' + str(allStats.get(lineage, 0))+ '\t' + str(trustedStats.get(lineage, 0))+ '\n') 184 fout.close() 185 186if __name__ == '__main__': 187 print 'QcGenomes v' + __version__ + ': ' + __prog_desc__ 188 print ' by ' + __author__ + ' (' + __email__ + ')' + '\n' 189 190 parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 191 parser.add_argument('input_file', help='input IMG metadata file') 192 parser.add_argument('output_file', help='new IMG metadata file') 193 parser.add_argument('output_dir', help='output dir') 194 195 parser.add_argument('-u', '--ubiquity', help='ubiquity threshold for defining marker set', type=float, default = 0.97) 196 parser.add_argument('-s', '--single_copy', help='single-copy threshold for defining marker set', type=float, default = 0.97) 197 parser.add_argument('-c', '--trusted_completeness', help='completeness threshold for defining trusted genomes', type=float, default = 0.97) 198 parser.add_argument('-d', '--trusted_contamination', help='contamination threshold for defining trusted genomes', type=float, default = 0.03) 199 200 args = parser.parse_args() 201 202 qcGenomes = QcGenomes() 203 qcGenomes.run(args.input_file, args.output_file, args.output_dir, args.ubiquity, args.single_copy, args.trusted_completeness, args.trusted_contamination) 204