1###############################################################################
2#                                                                             #
3#    This program is free software: you can redistribute it and/or modify     #
4#    it under the terms of the GNU General Public License as published by     #
5#    the Free Software Foundation, either version 3 of the License, or        #
6#    (at your option) any later version.                                      #
7#                                                                             #
8#    This program is distributed in the hope that it will be useful,          #
9#    but WITHOUT ANY WARRANTY; without even the implied warranty of           #
10#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            #
11#    GNU General Public License for more details.                             #
12#                                                                             #
13#    You should have received a copy of the GNU General Public License        #
14#    along with this program. If not, see <http://www.gnu.org/licenses/>.     #
15#                                                                             #
16###############################################################################
17
18import os
19import sys
20import logging
21from collections import defaultdict
22
23import numpy as np
24
25from checkm.defaultValues import DefaultValues
26from checkm.timeKeeper import TimeKeeper
27from checkm.markerSets import MarkerSetParser, BinMarkerSets
28from checkm.resultsParser import ResultsParser
29from checkm.hmmerAligner import HmmerAligner
30from checkm.markerGeneFinder import MarkerGeneFinder
31from checkm.pplacer import PplacerRunner
32from checkm.treeParser import TreeParser
33from checkm.taxonParser import TaxonParser
34from checkm.aminoAcidIdentity import AminoAcidIdentity
35from checkm.binComparer import BinComparer
36from checkm.binUnion import BinUnion
37from checkm.binStatistics import BinStatistics
38from checkm.coverage import Coverage
39from checkm.coverageWindows import CoverageWindows
40from checkm.genomicSignatures import GenomicSignatures
41from checkm.unbinned import Unbinned
42from checkm.merger import Merger
43from checkm.profile import Profile
44from checkm.binTools import BinTools
45from checkm.ssuFinder import SSU_Finder
46from checkm.PCA import PCA
47from checkm.common import (makeSurePathExists,
48                            checkFileExists,
49                            binIdFromFilename,
50                            reassignStdOut,
51                            restoreStdOut,
52                            getBinIdsFromOutDir,
53                            checkDirExists)
54
55from checkm.plot.gcPlots import GcPlots
56from checkm.plot.codingDensityPlots import CodingDensityPlots
57from checkm.plot.tetraDistPlots import TetraDistPlots
58from checkm.plot.distributionPlots import DistributionPlots
59from checkm.plot.nxPlot import NxPlot
60from checkm.plot.cumulativeLengthPlot import CumulativeLengthPlot
61from checkm.plot.lengthHistogram import LengthHistogram
62from checkm.plot.markerGenePosPlot import MarkerGenePosPlot
63from checkm.plot.parallelCoordPlot import ParallelCoordPlot
64from checkm.plot.binQAPlot import BinQAPlot
65from checkm.plot.pcaPlot import PcaPlot
66from checkm.plot.gcBiasPlots import GcBiasPlot
67
68from checkm.util.seqUtils import checkNuclotideSeqs, checkProteinSeqs
69
70from checkm.checkmData import DBManager
71
72from checkm.test.test_ecoli import VerifyEcoli
73
74
75class OptionsParser():
76    def __init__(self):
77        self.logger = logging.getLogger('timestamp')
78        self.timeKeeper = TimeKeeper()
79
80    def updateCheckM_DB(self, options):
81        self.logger.info('[CheckM - data] Check for database updates. [%s]' % options.action[0])
82
83        action = options.action
84        if action and action[0] == 'setRoot' and len(action) > 1:
85            DBM = DBManager(set_path=action[1])
86        else:
87            self.logger.error('Path to the CheckM reference data must be specified.')
88
89    def binFiles(self, binFolder, binExtension):
90        binFiles = []
91        if binFolder is not None:
92            all_files = os.listdir(binFolder)
93            for f in all_files:
94                if f.endswith(binExtension):
95                    binFile = os.path.join(binFolder, f)
96                    if os.stat(binFile).st_size == 0:
97                        self.logger.warning("  [Warning] Skipping bin %s as it has a size of 0 bytes." % f)
98                    else:
99                        binFiles.append(binFile)
100
101        if not binFiles:
102            self.logger.error("No bins found. Check the extension (-x) used to identify bins.")
103            sys.exit(1)
104
105        return sorted(binFiles)
106
107    def tree(self, options):
108        """Tree command"""
109        self.logger.info('[CheckM - tree] Placing bins in reference genome tree.')
110
111        binFiles = self.binFiles(options.bin_dir, options.extension)
112
113        if not options.bCalledGenes:
114            if not checkNuclotideSeqs(binFiles):
115                return
116        else:
117            if not checkProteinSeqs(binFiles):
118                return
119
120        # setup directory structure
121        makeSurePathExists(os.path.join(options.output_dir, 'bins'))
122        makeSurePathExists(os.path.join(options.output_dir, 'storage'))
123
124        # find phylogenetically informative genes in genome bins
125        mgf = MarkerGeneFinder(options.threads)
126        binIdToModels = mgf.find(binFiles,
127                                 options.output_dir,
128                                 DefaultValues.HMMER_TABLE_PHYLO_OUT,
129                                 DefaultValues.HMMER_PHYLO_OUT,
130                                 DefaultValues.PHYLO_HMM_MODELS,
131                                 options.bKeepAlignment,
132                                 options.bNucORFs,
133                                 options.bCalledGenes)
134
135        # write model information to file
136        markerSetParser = MarkerSetParser(options.threads)
137        hmmModelInfoFile = os.path.join(options.output_dir, 'storage', DefaultValues.PHYLO_HMM_MODEL_INFO)
138        markerSetParser.writeBinModels(binIdToModels, hmmModelInfoFile)
139
140        # calculate statistics for each genome bin
141
142        binStats = BinStatistics(options.threads)
143        binStats.calculate(binFiles, options.output_dir, DefaultValues.BIN_STATS_PHYLO_OUT)
144
145        # align identified marker genes
146
147        HA = HmmerAligner(options.threads)
148        resultsParser = HA.makeAlignmentToPhyloMarkers(options.output_dir,
149                                                            DefaultValues.PHYLO_HMM_MODELS,
150                                                            DefaultValues.HMMER_TABLE_PHYLO_OUT,
151                                                            binIdToModels,
152                                                            False,
153                                                            DefaultValues.E_VAL,
154                                                            DefaultValues.LENGTH,
155                                                            False,
156                                                            os.path.join(options.output_dir, 'storage', 'tree')
157                                                            )
158
159        # place bins into genome tree
160
161        pplacer = PplacerRunner(threads=options.pplacer_threads)  # fix at one thread to keep memory requirements reasonable
162        pplacer.run(binFiles, resultsParser, options.output_dir, options.bReducedTree)
163
164        self.timeKeeper.printTimeStamp()
165
166    def treeQA(self, options):
167        """QA command"""
168        self.logger.info('[CheckM - tree_qa] Assessing phylogenetic markers found in each bin.')
169
170        checkDirExists(options.tree_dir)
171
172        # set HMM file for each bin
173        markerSetParser = MarkerSetParser()
174        hmmModelInfoFile = os.path.join(options.tree_dir, 'storage', DefaultValues.PHYLO_HMM_MODEL_INFO)
175        binIdToModels = markerSetParser.loadBinModels(hmmModelInfoFile)
176
177        # calculate marker gene statistics
178        RP = ResultsParser(binIdToModels)
179        binStats = RP.analyseResults(options.tree_dir,
180                                          DefaultValues.BIN_STATS_PHYLO_OUT,
181                                          DefaultValues.HMMER_TABLE_PHYLO_OUT)
182
183        # determine taxonomy of each bin
184
185        treeParser = TreeParser()
186        treeParser.printSummary(options.out_format, options.tree_dir, RP, options.bTabTable, options.file, binStats)
187
188        if options.file != '':
189            self.logger.info('QA information written to: ' + options.file)
190
191        self.timeKeeper.printTimeStamp()
192
193    def lineageSet(self, options, db=None):
194        """Lineage set command"""
195        self.logger.info('[CheckM - lineage_set] Inferring lineage-specific marker sets.')
196
197        checkDirExists(options.tree_dir)
198
199        # set HMM file for each bin
200        markerSetParser = MarkerSetParser()
201        hmmModelInfoFile = os.path.join(options.tree_dir, 'storage', DefaultValues.PHYLO_HMM_MODEL_INFO)
202        binIdToModels = markerSetParser.loadBinModels(hmmModelInfoFile)
203
204        # calculate marker gene statistics
205        resultsParser = ResultsParser(binIdToModels)
206        resultsParser.analyseResults(options.tree_dir,
207                                      DefaultValues.BIN_STATS_PHYLO_OUT,
208                                      DefaultValues.HMMER_TABLE_PHYLO_OUT)
209
210        # These options are incompatible with how the lineage-specific marker set is selected, so
211        # the default values are currently hard-coded
212
213        options.num_genomes_markers = 2
214        options.bootstrap = 0
215        options.bRequireTaxonomy = False
216
217        treeParser = TreeParser()
218        treeParser.getBinMarkerSets(options.tree_dir, options.marker_file,
219                                    options.num_genomes_markers,
220                                    options.bootstrap, options.bNoLineageSpecificRefinement,
221                                    options.bForceDomain, options.bRequireTaxonomy,
222                                    resultsParser, options.unique, options.multi)
223
224
225        self.logger.info('Marker set written to: ' + options.marker_file)
226
227        self.timeKeeper.printTimeStamp()
228
229    def taxonList(self, options, db=None):
230        """Lineage set command"""
231        self.logger.info('[CheckM - taxon_list] Listing available taxonomic-specific marker sets.')
232
233        taxonParser = TaxonParser()
234        taxonParser.list(options.rank)
235
236        self.timeKeeper.printTimeStamp()
237
238    def taxonSet(self, options, db=None):
239        """Taxon set command"""
240        self.logger.info('[CheckM - taxon_set] Generate taxonomic-specific marker set.')
241
242        path = os.path.split(options.marker_file)[0]
243        if path:
244            makeSurePathExists(path)
245
246        taxonParser = TaxonParser()
247        bValidSet = taxonParser.markerSet(options.rank, options.taxon, options.marker_file)
248
249        if bValidSet:
250
251            self.logger.info('Marker set written to: ' + options.marker_file)
252
253        self.timeKeeper.printTimeStamp()
254
255    def analyze(self, options, db=None):
256        """Analyze command"""
257        self.logger.info('[CheckM - analyze] Identifying marker genes in bins.')
258
259        binFiles = self.binFiles(options.bin_dir, options.extension)
260
261        if not options.bCalledGenes:
262            if not checkNuclotideSeqs(binFiles):
263                return
264        else:
265            if not checkProteinSeqs(binFiles):
266                return
267
268        # setup directory structure
269        makeSurePathExists(options.output_dir)
270        makeSurePathExists(os.path.join(options.output_dir, 'bins'))
271        makeSurePathExists(os.path.join(options.output_dir, 'storage'))
272        makeSurePathExists(os.path.join(options.output_dir, 'storage', 'aai_qa'))
273
274        # find marker genes in genome bins
275        mgf = MarkerGeneFinder(options.threads)
276        binIdToModels = mgf.find(binFiles,
277                                 options.output_dir,
278                                 DefaultValues.HMMER_TABLE_OUT,
279                                 DefaultValues.HMMER_OUT,
280                                 options.marker_file,
281                                 options.bKeepAlignment,
282                                 options.bNucORFs,
283                                 options.bCalledGenes)
284
285        markerSetParser = MarkerSetParser(options.threads)
286        binIdToBinMarkerSets = markerSetParser.getMarkerSets(options.output_dir,
287                                                             getBinIdsFromOutDir(options.output_dir),
288                                                             options.marker_file)
289
290        hmmModelInfoFile = os.path.join(options.output_dir, 'storage', DefaultValues.CHECKM_HMM_MODEL_INFO)
291        markerSetParser.writeBinModels(binIdToModels, hmmModelInfoFile)
292
293        self.timeKeeper.printTimeStamp()
294
295        # HMM model file
296        if markerSetParser.markerFileType(options.marker_file) == BinMarkerSets.HMM_MODELS_SET:
297            markerFile = options.marker_file
298        else:
299            markerFile = DefaultValues.HMM_MODELS
300
301        # align marker genes with multiple hits within a bin
302
303        HA = HmmerAligner(options.threads)
304        HA.makeAlignmentsOfMultipleHits(options.output_dir,
305                                          markerFile,
306                                          DefaultValues.HMMER_TABLE_OUT,
307                                          binIdToModels,
308                                          binIdToBinMarkerSets,
309                                          False,
310                                          DefaultValues.E_VAL,
311                                          DefaultValues.LENGTH,
312                                          os.path.join(options.output_dir, 'storage', 'aai_qa')
313                                          )
314
315        self.timeKeeper.printTimeStamp()
316
317        # calculate statistics for each genome bin
318
319        binStats = BinStatistics(options.threads)
320        binStats.calculate(binFiles, options.output_dir, DefaultValues.BIN_STATS_OUT)
321
322        self.timeKeeper.printTimeStamp()
323
324        # align top hit to each marker if requested
325        if options.bAlignTopHit:
326            alignmentOutputFolder = os.path.join(options.output_dir, 'storage', 'alignments')
327            makeSurePathExists(alignmentOutputFolder)
328
329
330            HA = HmmerAligner(options.threads)
331            resultsParser = HA.makeAlignmentTopHit(options.output_dir,
332                                                                options.marker_file,
333                                                                DefaultValues.HMMER_TABLE_OUT,
334                                                                binIdToModels,
335                                                                False,
336                                                                DefaultValues.E_VAL,
337                                                                DefaultValues.LENGTH,
338                                                                True,
339                                                                alignmentOutputFolder
340                                                                )
341
342            # report marker gene data
343            fout = open(os.path.join(alignmentOutputFolder, 'alignment_info.tsv'), 'w')
344            fout.write('Marker Id\tLength (bp)\n')
345            markerIds = resultsParser.models[resultsParser.models.keys()[0]].keys()
346            for markerId in markerIds:
347                fout.write('%s\t%d\n' % (markerId, resultsParser.models[resultsParser.models.keys()[0]][markerId].leng))
348            fout.close()
349
350
351            self.logger.info('Alignments to top hits stored in: ' + alignmentOutputFolder)
352
353            self.timeKeeper.printTimeStamp()
354
355    def qa(self, options):
356        """QA command"""
357        self.logger.info('[CheckM - qa] Tabulating genome statistics.')
358
359        checkDirExists(options.analyze_dir)
360
361        if options.exclude_markers:
362            checkFileExists(options.exclude_markers)
363
364        # calculate AAI between marks with multiple hits in a single bin
365        aai = AminoAcidIdentity()
366        aai.run(options.aai_strain, options.analyze_dir, options.alignment_file)
367
368        # get HMM file for each bin
369
370        markerSetParser = MarkerSetParser(options.threads)
371
372        hmmModelInfoFile = os.path.join(options.analyze_dir, 'storage', DefaultValues.CHECKM_HMM_MODEL_INFO)
373        binIdToModels = markerSetParser.loadBinModels(hmmModelInfoFile)
374
375        binIdToBinMarkerSets = markerSetParser.getMarkerSets(options.analyze_dir,
376                                                             getBinIdsFromOutDir(options.analyze_dir),
377                                                             options.marker_file,
378                                                             options.exclude_markers)
379
380        # get results for each bin
381        RP = ResultsParser(binIdToModels)
382        RP.analyseResults(options.analyze_dir,
383                          DefaultValues.BIN_STATS_OUT,
384                          DefaultValues.HMMER_TABLE_OUT,
385                          bIgnoreThresholds=options.bIgnoreThresholds,
386                          evalueThreshold=options.e_value,
387                          lengthThreshold=options.length,
388                          bSkipPseudoGeneCorrection=options.bSkipPseudoGeneCorrection,
389                          bSkipAdjCorrection=options.bSkipAdjCorrection
390                          )
391
392
393        RP.printSummary(options.out_format,
394                        aai, binIdToBinMarkerSets,
395                        options.bIndividualMarkers,
396                        options.coverage_file,
397                        options.bTabTable,
398                        options.file,
399                        anaFolder=options.analyze_dir)
400        RP.cacheResults(options.analyze_dir,
401                            binIdToBinMarkerSets,
402                            options.bIndividualMarkers)
403
404        if options.file != '':
405            self.logger.info('QA information written to: ' + options.file)
406
407        self.timeKeeper.printTimeStamp()
408
409    def gcPlot(self, options):
410        """GC plot command"""
411        self.logger.info('[CheckM - gc_plot] Creating GC histogram and delta-GC plot.')
412
413        checkDirExists(options.bin_dir)
414        makeSurePathExists(options.output_dir)
415
416        binFiles = self.binFiles(options.bin_dir, options.extension)
417
418        plots = GcPlots(options)
419        filesProcessed = 1
420        for f in binFiles:
421            self.logger.info('Plotting GC plots for %s (%d of %d)' % (f, filesProcessed, len(binFiles)))
422            filesProcessed += 1
423
424            plots.plot(f, options.distributions)
425
426            binId = binIdFromFilename(f)
427            outputFile = os.path.join(options.output_dir, binId) + '.gc_plots.' + options.image_type
428            plots.savePlot(outputFile, dpi=options.dpi)
429            self.logger.info('Plot written to: ' + outputFile)
430
431        self.timeKeeper.printTimeStamp()
432
433    def codingDensityPlot(self, options):
434        """Coding density plot command"""
435        self.logger.info('[CheckM - coding_plot] Creating coding density histogram and delta-CD plot.')
436
437        checkDirExists(options.bin_dir)
438        makeSurePathExists(options.output_dir)
439
440        binFiles = self.binFiles(options.bin_dir, options.extension)
441
442        plots = CodingDensityPlots(options)
443        filesProcessed = 1
444        for f in binFiles:
445            self.logger.info('Plotting coding density plots for %s (%d of %d)' % (f, filesProcessed, len(binFiles)))
446            filesProcessed += 1
447
448            plots.plot(f, options.distributions)
449
450            binId = binIdFromFilename(f)
451            outputFile = os.path.join(options.output_dir, binId) + '.coding_density_plots.' + options.image_type
452            plots.savePlot(outputFile, dpi=options.dpi)
453            self.logger.info('Plot written to: ' + outputFile)
454
455        self.timeKeeper.printTimeStamp()
456
457    def tetraDistPlot(self, options):
458        """Tetranucleotide distance plot command"""
459        self.logger.info('[CheckM - tetra_plot] Creating tetra-distance histogram and delta-TD plot.')
460
461        checkDirExists(options.bin_dir)
462        makeSurePathExists(options.output_dir)
463
464        binFiles = self.binFiles(options.bin_dir, options.extension)
465
466        genomicSignatures = GenomicSignatures(K=4, threads=1)
467        tetraSigs = genomicSignatures.read(options.tetra_profile)
468
469        plots = TetraDistPlots(options)
470        filesProcessed = 1
471        for f in binFiles:
472            self.logger.info('Plotting tetranuclotide distance plots for %s (%d of %d)' % (f, filesProcessed, len(binFiles)))
473            filesProcessed += 1
474
475            binId = binIdFromFilename(f)
476            plots.plot(f, tetraSigs, options.distributions)
477
478            outputFile = os.path.join(options.output_dir, binId) + '.tetra_dist_plots.' + options.image_type
479            plots.savePlot(outputFile, dpi=options.dpi)
480            self.logger.info('Plot written to: ' + outputFile)
481
482        self.timeKeeper.printTimeStamp()
483
484    def distributionPlots(self, options):
485        """Reference distribution plot command"""
486        self.logger.info('[CheckM - dist_plot] Creating GC, CD, and TD distribution plots.')
487
488        checkDirExists(options.bin_dir)
489        makeSurePathExists(options.output_dir)
490
491        binFiles = self.binFiles(options.bin_dir, options.extension)
492
493        genomicSignatures = GenomicSignatures(K=4, threads=1)
494        tetraSigs = genomicSignatures.read(options.tetra_profile)
495
496        plots = DistributionPlots(options)
497        filesProcessed = 1
498        for f in binFiles:
499            self.logger.info('Plotting reference distribution plots for %s (%d of %d)' % (f, filesProcessed, len(binFiles)))
500            filesProcessed += 1
501
502            binId = binIdFromFilename(f)
503            plots.plot(f, tetraSigs, options.distributions)
504
505            outputFile = os.path.join(options.output_dir, binId) + '.ref_dist_plots.' + options.image_type
506            plots.savePlot(outputFile, dpi=options.dpi)
507            self.logger.info('Plot written to: ' + outputFile)
508
509        self.timeKeeper.printTimeStamp()
510
511    def tetraPcaPlot(self, options):
512        """PCA plot of tetranucleotide signatures"""
513        self.logger.info('[CheckM - tetra_pca] Creating PCA plot of tetranucleotide signatures.')
514
515        checkDirExists(options.bin_dir)
516        makeSurePathExists(options.output_dir)
517
518        binFiles = self.binFiles(options.bin_dir, options.extension)
519
520        self.logger.info('Computing PCA of tetranuclotide signatures.\n')
521        pca = PCA()
522        seqIds, pc, variance = pca.pcaFile(options.tetra_profile, fraction=1.0, bCenter=True, bScale=False)
523
524        plots = PcaPlot(options)
525        filesProcessed = 1
526        for f in binFiles:
527            self.logger.info('Plotting PCA of tetranuclotide signatures for %s (%d of %d)' % (f, filesProcessed, len(binFiles)))
528            filesProcessed += 1
529
530            plots.plot(f, seqIds, pc, variance)
531
532            binId = binIdFromFilename(f)
533            outputFile = os.path.join(options.output_dir, binId) + '.tetra_pca_plots.' + options.image_type
534            plots.savePlot(outputFile, dpi=options.dpi)
535            self.logger.info('Plot written to: ' + outputFile)
536
537        self.timeKeeper.printTimeStamp()
538
539    def coveragePcaPlot(self, options):
540        """PCA plot of coverage profiles"""
541        self.logger.info('[CheckM - cov_pca] Creating PCA plot of coverage profiles.')
542
543        checkDirExists(options.bin_dir)
544        checkFileExists(options.coverage_file)
545        makeSurePathExists(options.output_dir)
546
547        binFiles = self.binFiles(options.bin_dir, options.extension)
548
549        coverage = Coverage(threads=1)
550        coverageStats = coverage.parseCoverage(options.coverage_file)
551
552        seqIds = []
553        coverageProfiles = []
554        for binId, seqDict in coverageStats.iteritems():
555            for seqId, bamDict in seqDict.iteritems():
556                seqIds.append(seqId)
557
558                coverages = []
559                for _, coverage in bamDict.iteritems():
560                    coverages.append(coverage)
561
562                coverageProfiles.append(coverages)
563
564        coverageProfiles = np.array(coverageProfiles)
565        if coverageProfiles.shape[1] < 2:
566            self.logger.error('Coverage profile is 1 dimensional. PCA requires at least 2 dimensions.')
567            sys.exit(1)
568
569        self.logger.info('Computing PCA of coverage profiles.\n')
570        pca = PCA()
571        pc, variance = pca.pcaMatrix(coverageProfiles, fraction=1.0, bCenter=True, bScale=False)
572
573        plots = PcaPlot(options)
574        filesProcessed = 1
575        for f in binFiles:
576            self.logger.info('Plotting PCA of coverage profiles for %s (%d of %d)' % (f, filesProcessed, len(binFiles)))
577            filesProcessed += 1
578
579            plots.plot(f, seqIds, pc, variance)
580
581            binId = binIdFromFilename(f)
582            outputFile = os.path.join(options.output_dir, binId) + '.cov_pca_plots.' + options.image_type
583            plots.savePlot(outputFile, dpi=options.dpi)
584            self.logger.info('Plot written to: ' + outputFile)
585
586        self.timeKeeper.printTimeStamp()
587
588    def gcBiasPlot(self, options):
589        """GC bias plot command"""
590
591        self.logger.info('[CheckM - gc_bias_plot] Plotting bin coverage as a function of GC.')
592
593        checkDirExists(options.bin_dir)
594        makeSurePathExists(options.output_dir)
595
596        binFiles = self.binFiles(options.bin_dir, options.extension)
597
598        coverageWindows = CoverageWindows(options.threads)
599        coverageProfile = coverageWindows.run(binFiles, options.bam_file, options.all_reads, options.min_align, options.max_edit_dist, options.window_size)
600
601        plots = GcBiasPlot(options)
602        filesProcessed = 1
603        for f in binFiles:
604            self.logger.info('Plotting GC plots for %s (%d of %d)' % (f, filesProcessed, len(binFiles)))
605            filesProcessed += 1
606
607            plots.plot(f, coverageProfile)
608
609            binId = binIdFromFilename(f)
610            outputFile = os.path.join(options.output_dir, binId) + '.gc_bias_plot.' + options.image_type
611            plots.savePlot(outputFile, dpi=options.dpi)
612            self.logger.info('Plot written to: ' + outputFile)
613
614        self.timeKeeper.printTimeStamp()
615
616    def nxPlot(self, options):
617        """Nx-plot command"""
618
619        self.logger.info('[CheckM - nx_plot] Creating Nx-plots.')
620
621        checkDirExists(options.bin_dir)
622        makeSurePathExists(options.output_dir)
623
624        binFiles = self.binFiles(options.bin_dir, options.extension)
625
626        nx = NxPlot(options)
627        filesProcessed = 1
628        for f in binFiles:
629            binId = binIdFromFilename(f)
630            self.logger.info('Plotting Nx-plot for %s (%d of %d)' % (binId, filesProcessed, len(binFiles)))
631            filesProcessed += 1
632            nx.plot(f)
633
634            outputFile = os.path.join(options.output_dir, binId) + '.nx_plot.' + options.image_type
635            nx.savePlot(outputFile, dpi=options.dpi)
636            self.logger.info('Plot written to: ' + outputFile)
637
638        self.timeKeeper.printTimeStamp()
639
640    def cumulativeLengthPlot(self, options):
641        """Cumulative sequence length plot command"""
642
643        self.logger.info('[CheckM - len_plot] Creating cumulative sequence length plot.')
644
645        checkDirExists(options.bin_dir)
646        makeSurePathExists(options.output_dir)
647
648        binFiles = self.binFiles(options.bin_dir, options.extension)
649
650        plot = CumulativeLengthPlot(options)
651        filesProcessed = 1
652        for f in binFiles:
653            binId = binIdFromFilename(f)
654            self.logger.info('Plotting cumulative sequence length plot for %s (%d of %d)' % (binId, filesProcessed, len(binFiles)))
655            filesProcessed += 1
656            plot.plot(f)
657
658            outputFile = os.path.join(options.output_dir, binId) + '.seq_len_plot.' + options.image_type
659            plot.savePlot(outputFile, dpi=options.dpi)
660            self.logger.info('Plot written to: ' + outputFile)
661
662        self.timeKeeper.printTimeStamp()
663
664    def lengthHistogram(self, options):
665        """Sequence length histogram command"""
666
667        self.logger.info('[CheckM - len_hist] Creating sequence length histogram.')
668
669        checkDirExists(options.bin_dir)
670        makeSurePathExists(options.output_dir)
671
672        binFiles = self.binFiles(options.bin_dir, options.extension)
673
674        plot = LengthHistogram(options)
675        filesProcessed = 1
676        for f in binFiles:
677            binId = binIdFromFilename(f)
678            self.logger.info('Plotting sequence length histogram for %s (%d of %d)' % (binId, filesProcessed, len(binFiles)))
679            filesProcessed += 1
680            plot.plot(f)
681
682            outputFile = os.path.join(options.output_dir, binId) + '.len_hist.' + options.image_type
683            plot.savePlot(outputFile, dpi=options.dpi)
684            self.logger.info('Plot written to: ' + outputFile)
685
686        self.timeKeeper.printTimeStamp()
687
688    def markerPlot(self, options):
689        """Marker gene position plot command"""
690
691        self.logger.info('[CheckM - marker_plot] Creating marker gene position plot.')
692
693        checkDirExists(options.bin_dir)
694        makeSurePathExists(options.output_dir)
695
696        # generate plot for each bin
697        binFiles = self.binFiles(options.bin_dir, options.extension)
698
699        resultsParser = ResultsParser(None)
700        markerGeneStats = resultsParser.parseMarkerGeneStats(options.results_dir)
701        binStats = resultsParser.parseBinStatsExt(options.results_dir)
702
703        plot = MarkerGenePosPlot(options)
704        filesProcessed = 1
705        for f in binFiles:
706            binId = binIdFromFilename(f)
707            self.logger.info('Plotting marker gene position plot for %s (%d of %d)' % (binId, filesProcessed, len(binFiles)))
708            filesProcessed += 1
709
710            if binId not in markerGeneStats or binId not in binStats:
711                continue  # bin has no marker genes
712
713            bPlotted = plot.plot(f, markerGeneStats[binId], binStats[binId])
714
715            if bPlotted:
716                outputFile = os.path.join(options.output_dir, binId) + '.marker_pos_plot.' + options.image_type
717                plot.savePlot(outputFile, dpi=options.dpi)
718                self.logger.info('Plot written to: ' + outputFile)
719            else:
720                self.logger.info('No marker genes found in bin.')
721
722        self.timeKeeper.printTimeStamp()
723
724    def parallelCoordPlot(self, options):
725        """Parallel coordinate plot command"""
726
727        self.logger.info('[CheckM - par_plot] Creating parallel coordinate plot of GC and coverage.')
728
729        checkDirExists(options.bin_dir)
730        makeSurePathExists(options.output_dir)
731        checkFileExists(options.coverage_file)
732
733        binFiles = self.binFiles(options.bin_dir, options.extension)
734
735        # read coverage stats file
736        coverage = Coverage(threads=1)
737        coverageStats = coverage.parseCoverage(options.coverage_file)
738
739        # calculate sequence stats for all bins
740        self.logger.info('Calculating sequence statistics for each bin.')
741        binStats = BinStatistics()
742        seqStats = {}
743        for f in binFiles:
744            binId = binIdFromFilename(f)
745            seqStats[binId] = binStats.sequenceStats(options.results_dir, f)
746
747        # create plot for each bin
748
749        plot = ParallelCoordPlot(options)
750        filesProcessed = 1
751        for f in binFiles:
752            binId = binIdFromFilename(f)
753            self.logger.info('Plotting parallel coordinates for %s (%d of %d)' % (binId, filesProcessed, len(binFiles)))
754            filesProcessed += 1
755
756            plot.plot(binId, seqStats, coverageStats)
757
758            outputFile = os.path.join(options.output_dir, binId) + '.paralel_coord_plot.' + options.image_type
759            plot.savePlot(outputFile, dpi=options.dpi)
760            self.logger.info('Plot written to: ' + outputFile)
761
762        self.timeKeeper.printTimeStamp()
763
764    def binQAPlot(self, options):
765        """Bin QA plot command"""
766
767        self.logger.info('[CheckM - bin_qa_plot] Creating bar plot of bin quality.')
768
769        checkDirExists(options.bin_dir)
770        makeSurePathExists(options.output_dir)
771
772        binFiles = self.binFiles(options.bin_dir, options.extension)
773
774        # read model info
775        # hmmModelInfoFile = os.path.join(options.analyze_dir, 'storage', DefaultValues.CHECKM_HMM_MODEL_INFO)
776        # binIdToModels = markerSetParser.loadBinModels(hmmModelInfoFile)
777
778        # read sequence stats file
779        resultsParser = ResultsParser(None)
780        binStatsExt = resultsParser.parseBinStatsExt(options.results_dir)
781
782        # create plot for each bin
783        plot = BinQAPlot(options)
784        bMakePlot = True
785        if not options.bIgnoreHetero:
786            aai = AminoAcidIdentity()
787            aai.run(options.aai_strain, options.results_dir, None)
788            bMakePlot = plot.plot(binFiles, binStatsExt, options.bIgnoreHetero, aai.aaiHetero)
789        else:
790            bMakePlot = plot.plot(binFiles, binStatsExt, options.bIgnoreHetero, None)
791
792        if bMakePlot:
793            outputFile = os.path.join(options.output_dir, 'bin_qa_plot.' + options.image_type)
794            plot.savePlot(outputFile, dpi=options.dpi)
795
796            self.logger.info('Plot written to: ' + outputFile)
797
798        self.timeKeeper.printTimeStamp()
799
800    def unbinned(self, options):
801        """Unbinned Command"""
802
803        self.logger.info('[CheckM - unbinned] Identify unbinned sequences.')
804
805        checkDirExists(options.bin_dir)
806
807        binFiles = self.binFiles(options.bin_dir, options.extension)
808
809        unbinned = Unbinned()
810        unbinned.run(binFiles, options.seq_file, options.output_seq_file, options.output_stats_file, options.min_seq_len)
811
812
813        self.logger.info('Unbinned sequences written to: ' + options.output_seq_file)
814        self.logger.info('Unbinned sequences statistics written to: ' + options.output_stats_file)
815
816        self.timeKeeper.printTimeStamp()
817
818    def coverage(self, options):
819        """Coverage command"""
820
821        self.logger.info('[CheckM - coverage] Calculating coverage of sequences.')
822
823        checkDirExists(options.bin_dir)
824        makeSurePathExists(os.path.dirname(options.output_file))
825
826        binFiles = self.binFiles(options.bin_dir, options.extension)
827
828        coverage = Coverage(options.threads)
829        coverage.run(binFiles, options.bam_files, options.output_file, options.all_reads,
830                        options.min_align, options.max_edit_dist, options.min_qc)
831
832        self.logger.info('Coverage information written to: ' + options.output_file)
833
834        self.timeKeeper.printTimeStamp()
835
836    def tetraSignatures(self, options):
837        """Tetranucleotide signature command"""
838
839        self.logger.info('[CheckM - tetra] Calculating tetranucleotide signature of sequences.')
840
841        checkFileExists(options.seq_file)
842        makeSurePathExists(os.path.dirname(options.output_file))
843
844
845        tetraSig = GenomicSignatures(4, options.threads)
846        tetraSig.calculate(options.seq_file, options.output_file)
847
848        self.logger.info('Tetranucletoide signatures written to: ' + options.output_file)
849
850        self.timeKeeper.printTimeStamp()
851
852    def profile(self, options):
853        """Profile command"""
854
855        self.logger.info('[CheckM - profile] Calculating percentage of reads mapped to each bin.')
856
857        checkFileExists(options.coverage_file)
858
859        profile = Profile()
860        profile.run(options.coverage_file, options.file, options.bTabTable)
861
862        if options.file != '':
863            self.logger.info('Profile information written to: ' + options.file)
864
865        self.timeKeeper.printTimeStamp()
866
867    def merge(self, options):
868        """Merge command"""
869
870        self.logger.info('[CheckM - merge] Identifying bins with complementary sets of marker genes.')
871
872        checkDirExists(options.bin_dir)
873
874        binFiles = self.binFiles(options.bin_dir, options.extension)
875
876        if not options.bCalledGenes:
877            if not checkNuclotideSeqs(binFiles):
878                return
879        else:
880            if not checkProteinSeqs(binFiles):
881                return
882
883        markerSetParser = MarkerSetParser()
884        if markerSetParser.markerFileType(options.marker_file) == BinMarkerSets.TREE_MARKER_SET:
885            self.logger.error('Merge command requires a taxonomic-specific marker set or a user-defined HMM file.\n')
886            return
887
888        # setup directory structure
889        makeSurePathExists(options.output_dir)
890        makeSurePathExists(os.path.join(options.output_dir, 'bins'))
891        makeSurePathExists(os.path.join(options.output_dir, 'storage'))
892        makeSurePathExists(os.path.join(options.output_dir, 'storage', 'hmms'))
893
894        binIds = []
895        for binFile in binFiles:
896            binIds.append(binIdFromFilename(binFile))
897
898        # find marker genes in genome bins
899        mgf = MarkerGeneFinder(options.threads)
900        binIdToModels = mgf.find(binFiles,
901                         options.output_dir,
902                         "merger.table.txt",
903                         "merger.hmmer3",
904                         options.marker_file,
905                         False,
906                         False,
907                         options.bCalledGenes)
908
909        # get HMM file for each bin
910        markerSetParser = MarkerSetParser()
911        binIdToBinMarkerSets = markerSetParser.getMarkerSets(options.output_dir, binIds, options.marker_file)
912
913        # compare markers found in each bin
914
915        merger = Merger()
916        outputFile = merger.run(binFiles, options.output_dir, "merger.table.txt", binIdToModels, binIdToBinMarkerSets,
917                                options.delta_comp, options.delta_cont, options.merged_comp, options.merged_cont)
918
919        self.logger.info('Merger information written to: ' + outputFile)
920
921        self.timeKeeper.printTimeStamp()
922
923    def outliers(self, options):
924        """Outlier command"""
925
926        self.logger.info('[CheckM - outlier] Identifying outliers in bins.')
927
928        checkDirExists(options.bin_dir)
929        checkFileExists(options.tetra_profile)
930        makeSurePathExists(os.path.dirname(options.output_file))
931
932        binFiles = self.binFiles(options.bin_dir, options.extension)
933
934        binTools = BinTools()
935        binTools.identifyOutliers(options.output_dir, binFiles, options.tetra_profile, options.distributions, options.report_type, options.output_file)
936
937        self.logger.info('Outlier information written to: ' + options.output_file)
938
939        self.timeKeeper.printTimeStamp()
940
941    def joinTables(self, options):
942        """Join tables command"""
943
944        self.logger.info('[CheckM - join_tables] Joining tables containing bin information.')
945
946        # read all tables
947        headers = {}
948        rows = defaultdict(dict)
949        binIds = set()
950        for f in options.tables:
951            with open(f) as fin:
952                headers[f] = [x.strip() for x in fin.readline().split('\t')][1:]
953
954                for line in fin:
955                    lineSplit = [x.strip() for x in line.split('\t')]
956
957                    binId = lineSplit[0]
958                    binIds.add(binId)
959
960                    for i, header in enumerate(headers[f]):
961                        rows[binId][header] = lineSplit[i + 1]
962
963        # write merge table
964        oldStdOut = reassignStdOut(options.file)
965
966        row = 'Bin Id'
967        for f in options.tables:
968            row += '\t' + '\t'.join(headers[f])
969        print(row)
970
971        for binId in binIds:
972            row = binId
973            for f in options.tables:
974                for header in headers[f]:
975                    row += '\t' + rows[binId].get(header, '')
976            print(row)
977
978        restoreStdOut(options.file, oldStdOut)
979
980        if options.file:
981            self.logger.info('Joined table written to: ' + options.file)
982
983        self.timeKeeper.printTimeStamp()
984
985    def modify(self, options):
986        """Modify command"""
987
988        self.logger.info('[CheckM - modify] Modifying sequences in bin.')
989
990        makeSurePathExists(os.path.dirname(options.output_file))
991
992        if not (options.add or options.remove or options.outlier_file):
993            self.logger.error('No modification to bin requested.\n')
994            sys.exit(1)
995
996        if (options.add or options.remove) and options.outlier_file:
997            self.logger.error("The 'outlier_file' option cannot be specified with 'add' or 'remove'.\n")
998            sys.exit(1)
999
1000        binTools = BinTools()
1001
1002        if options.add or options.remove:
1003            binTools.modify(options.bin_file, options.seq_file, options.add, options.remove, options.output_file)
1004        elif options.outlier_file:
1005            binTools.removeOutliers(options.bin_file, options.outlier_file, options.output_file)
1006
1007        self.logger.info('Modified bin written to: ' + options.output_file)
1008
1009        self.timeKeeper.printTimeStamp()
1010
1011    def unique(self, options):
1012        """Unique command"""
1013
1014        self.logger.info('[CheckM - unique] Ensuring no sequences are assigned to multiple bins.')
1015
1016        binFiles = self.binFiles(options.bin_dir, options.extension)
1017
1018        binTools = BinTools()
1019        binTools.unique(binFiles)
1020
1021        self.timeKeeper.printTimeStamp()
1022
1023    def ssuFinder(self, options):
1024        """SSU finder command"""
1025
1026        self.logger.info('[CheckM - ssu_finder] Identifying SSU (16S/18S) rRNAs in sequences.')
1027
1028        binFiles = self.binFiles(options.bin_dir, options.extension)
1029
1030        checkFileExists(options.seq_file)
1031        makeSurePathExists(options.output_dir)
1032
1033        ssuFinder = SSU_Finder(options.threads)
1034        ssuFinder.run(options.seq_file, binFiles, options.output_dir, options.evalue, options.concatenate)
1035
1036        self.timeKeeper.printTimeStamp()
1037
1038    def binCompare(self, options):
1039        """Bin compare command"""
1040
1041        self.logger.info('[CheckM - bin_compare] Comparing two sets of bins.')
1042
1043        checkDirExists(options.bin_dir1)
1044        checkDirExists(options.bin_dir2)
1045
1046        binFiles1 = self.binFiles(options.bin_dir1, options.extension1)
1047        binFiles2 = self.binFiles(options.bin_dir2, options.extension2)
1048
1049        binComparer = BinComparer()
1050        binComparer.report(binFiles1, binFiles2, options.seq_file, options.output_file)
1051
1052
1053        self.logger.info('Detailed bin comparison written to: ' + options.output_file)
1054
1055        self.timeKeeper.printTimeStamp()
1056
1057    def binUnion(self, options):
1058        """Bin union command"""
1059
1060        self.logger.info('[CheckM - bin_union] Redundancy reduce multiple sets of bins into a single set.')
1061
1062        output_dir = options.output_dir
1063        makeSurePathExists(output_dir)
1064
1065        bin_dirs = []
1066        checkmQaTsvs = []
1067        for i, arg in enumerate(options.bin_or_checkm_qa_table):
1068            if i % 2 == 0:
1069                checkDirExists(arg)
1070                bin_dirs.append(arg)
1071            else:
1072                checkFileExists(arg)
1073                checkmQaTsvs.append(arg)
1074
1075        if len(bin_dirs) < 2:
1076            self.logger.error("Need to specify at least two bin folders, found %i: " % len(bin_dirs))
1077            sys.exit(1)
1078        if len(bin_dirs) != len(checkmQaTsvs):
1079            self.logger.error("Need to specify the same number of bin folders as checkm_qa_tsv files, found %i and %i, respectively: " % (len(bin_dirs), len(checkmQaTsvs)))
1080            sys.exit(1)
1081
1082        binFileSets = []
1083        for bin_dir in bin_dirs:
1084            self.logger.info("Reading fasta files with extension %s from bin folder %s" % (options.extension, bin_dir))
1085            binFileSets.append(self.binFiles(bin_dir, options.extension))
1086
1087        binUnion = BinUnion()
1088
1089        contigConflictsOutputFile = os.path.join(output_dir, 'contigConflicts.csv')
1090        unionBinOutputFile = os.path.join(output_dir, 'union.txt')
1091        binUnion.report(bin_dirs, binFileSets, checkmQaTsvs, unionBinOutputFile, contigConflictsOutputFile, options.min_completeness, options.max_contamination)
1092
1093    def test(self, options):
1094        """Quick test of CheckM"""
1095
1096        self.logger.info('[CheckM - Test] Processing E.coli K12-W3310 to verify operation of CheckM.')
1097
1098        verifyEcoli = VerifyEcoli()
1099        verifyEcoli.run(self, options.output_dir)
1100
1101        self.timeKeeper.printTimeStamp()
1102
1103    def parseOptions(self, options):
1104        """Parse user options and call the correct pipeline(s)"""
1105
1106        try:
1107            if options.file == "stdout":
1108                options.file = ''
1109        except:
1110            pass
1111
1112
1113        if(options.subparser_name == "data"):
1114            self.updateCheckM_DB(options)
1115        elif(options.subparser_name == 'tree'):
1116            self.tree(options)
1117        elif(options.subparser_name == 'tree_qa'):
1118            self.treeQA(options)
1119        elif(options.subparser_name == 'lineage_set'):
1120            self.lineageSet(options)
1121        elif(options.subparser_name == 'taxon_list'):
1122            self.taxonList(options)
1123        elif(options.subparser_name == 'taxon_set'):
1124            self.taxonSet(options)
1125        elif(options.subparser_name == 'analyze'):
1126            self.analyze(options)
1127        elif(options.subparser_name == 'qa'):
1128            self.qa(options)
1129        elif(options.subparser_name == 'lineage_wf'):
1130            options.marker_file = os.path.join(options.output_dir, 'lineage.ms')
1131            options.tree_dir = options.output_dir
1132            options.analyze_dir = options.output_dir
1133            options.out_format = 1
1134            options.bAlignTopHit = False
1135            options.exclude_markers = None
1136            options.coverage_file = None
1137
1138            self.tree(options)
1139            self.lineageSet(options)
1140            self.analyze(options)
1141            self.qa(options)
1142        elif(options.subparser_name == 'taxonomy_wf'):
1143            options.marker_file = os.path.join(options.output_dir, options.taxon + '.ms')
1144            options.analyze_dir = options.output_dir
1145            options.out_format = 1
1146            options.bAlignTopHit = False
1147            options.exclude_markers = None
1148
1149            self.taxonSet(options)
1150            self.analyze(options)
1151            self.qa(options)
1152        elif(options.subparser_name == 'gc_plot'):
1153            self.gcPlot(options)
1154        elif(options.subparser_name == 'coding_plot'):
1155            self.codingDensityPlot(options)
1156        elif(options.subparser_name == 'tetra_plot'):
1157            self.tetraDistPlot(options)
1158        elif(options.subparser_name == 'dist_plot'):
1159            self.distributionPlots(options)
1160        elif(options.subparser_name == 'nx_plot'):
1161            self.nxPlot(options)
1162        elif(options.subparser_name == 'len_plot'):
1163            self.cumulativeLengthPlot(options)
1164        elif(options.subparser_name == 'len_hist'):
1165            self.lengthHistogram(options)
1166        elif(options.subparser_name == 'marker_plot'):
1167            self.markerPlot(options)
1168        elif(options.subparser_name == 'par_plot'):
1169            self.parallelCoordPlot(options)
1170        elif(options.subparser_name == 'tetra_pca'):
1171            self.tetraPcaPlot(options)
1172        elif(options.subparser_name == 'cov_pca'):
1173            self.coveragePcaPlot(options)
1174        elif(options.subparser_name == 'gc_bias_plot'):
1175            self.gcBiasPlot(options)
1176        elif(options.subparser_name == 'bin_qa_plot'):
1177            self.binQAPlot(options)
1178        elif(options.subparser_name == 'unbinned'):
1179            self.unbinned(options)
1180        elif(options.subparser_name == 'coverage'):
1181            self.coverage(options)
1182        elif(options.subparser_name == 'tetra'):
1183            self.tetraSignatures(options)
1184        elif(options.subparser_name == 'profile'):
1185            self.profile(options)
1186        elif(options.subparser_name == 'merge'):
1187            self.merge(options)
1188        elif(options.subparser_name == 'outliers'):
1189            self.outliers(options)
1190        elif(options.subparser_name == 'join_tables'):
1191            self.joinTables(options)
1192        elif(options.subparser_name == 'modify'):
1193            self.modify(options)
1194        elif(options.subparser_name == 'unique'):
1195            self.unique(options)
1196        elif(options.subparser_name == 'ssu_finder'):
1197            self.ssuFinder(options)
1198        elif(options.subparser_name == 'bin_compare'):
1199            self.binCompare(options)
1200        elif(options.subparser_name == 'bin_union'):
1201            self.binUnion(options)
1202        elif(options.subparser_name == 'test'):
1203            options.bCalledGenes = False
1204            options.pplacer_threads = 1
1205            self.test(options)
1206        else:
1207            self.logger.error('Unknown CheckM command: ' + options.subparser_name + '\n')
1208            sys.exit(1)
1209
1210        return 0
1211