1############################################################################### 2# # 3# This program is free software: you can redistribute it and/or modify # 4# it under the terms of the GNU General Public License as published by # 5# the Free Software Foundation, either version 3 of the License, or # 6# (at your option) any later version. # 7# # 8# This program is distributed in the hope that it will be useful, # 9# but WITHOUT ANY WARRANTY; without even the implied warranty of # 10# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # 11# GNU General Public License for more details. # 12# # 13# You should have received a copy of the GNU General Public License # 14# along with this program. If not, see <http://www.gnu.org/licenses/>. # 15# # 16############################################################################### 17 18import os 19import sys 20import logging 21from collections import defaultdict 22 23import numpy as np 24 25from checkm.defaultValues import DefaultValues 26from checkm.timeKeeper import TimeKeeper 27from checkm.markerSets import MarkerSetParser, BinMarkerSets 28from checkm.resultsParser import ResultsParser 29from checkm.hmmerAligner import HmmerAligner 30from checkm.markerGeneFinder import MarkerGeneFinder 31from checkm.pplacer import PplacerRunner 32from checkm.treeParser import TreeParser 33from checkm.taxonParser import TaxonParser 34from checkm.aminoAcidIdentity import AminoAcidIdentity 35from checkm.binComparer import BinComparer 36from checkm.binUnion import BinUnion 37from checkm.binStatistics import BinStatistics 38from checkm.coverage import Coverage 39from checkm.coverageWindows import CoverageWindows 40from checkm.genomicSignatures import GenomicSignatures 41from checkm.unbinned import Unbinned 42from checkm.merger import Merger 43from checkm.profile import Profile 44from checkm.binTools import BinTools 45from checkm.ssuFinder import SSU_Finder 46from checkm.PCA import PCA 47from checkm.common import (makeSurePathExists, 48 checkFileExists, 49 binIdFromFilename, 50 reassignStdOut, 51 restoreStdOut, 52 getBinIdsFromOutDir, 53 checkDirExists) 54 55from checkm.plot.gcPlots import GcPlots 56from checkm.plot.codingDensityPlots import CodingDensityPlots 57from checkm.plot.tetraDistPlots import TetraDistPlots 58from checkm.plot.distributionPlots import DistributionPlots 59from checkm.plot.nxPlot import NxPlot 60from checkm.plot.cumulativeLengthPlot import CumulativeLengthPlot 61from checkm.plot.lengthHistogram import LengthHistogram 62from checkm.plot.markerGenePosPlot import MarkerGenePosPlot 63from checkm.plot.parallelCoordPlot import ParallelCoordPlot 64from checkm.plot.binQAPlot import BinQAPlot 65from checkm.plot.pcaPlot import PcaPlot 66from checkm.plot.gcBiasPlots import GcBiasPlot 67 68from checkm.util.seqUtils import checkNuclotideSeqs, checkProteinSeqs 69 70from checkm.checkmData import DBManager 71 72from checkm.test.test_ecoli import VerifyEcoli 73 74 75class OptionsParser(): 76 def __init__(self): 77 self.logger = logging.getLogger('timestamp') 78 self.timeKeeper = TimeKeeper() 79 80 def updateCheckM_DB(self, options): 81 self.logger.info('[CheckM - data] Check for database updates. [%s]' % options.action[0]) 82 83 action = options.action 84 if action and action[0] == 'setRoot' and len(action) > 1: 85 DBM = DBManager(set_path=action[1]) 86 else: 87 self.logger.error('Path to the CheckM reference data must be specified.') 88 89 def binFiles(self, binFolder, binExtension): 90 binFiles = [] 91 if binFolder is not None: 92 all_files = os.listdir(binFolder) 93 for f in all_files: 94 if f.endswith(binExtension): 95 binFile = os.path.join(binFolder, f) 96 if os.stat(binFile).st_size == 0: 97 self.logger.warning(" [Warning] Skipping bin %s as it has a size of 0 bytes." % f) 98 else: 99 binFiles.append(binFile) 100 101 if not binFiles: 102 self.logger.error("No bins found. Check the extension (-x) used to identify bins.") 103 sys.exit(1) 104 105 return sorted(binFiles) 106 107 def tree(self, options): 108 """Tree command""" 109 self.logger.info('[CheckM - tree] Placing bins in reference genome tree.') 110 111 binFiles = self.binFiles(options.bin_dir, options.extension) 112 113 if not options.bCalledGenes: 114 if not checkNuclotideSeqs(binFiles): 115 return 116 else: 117 if not checkProteinSeqs(binFiles): 118 return 119 120 # setup directory structure 121 makeSurePathExists(os.path.join(options.output_dir, 'bins')) 122 makeSurePathExists(os.path.join(options.output_dir, 'storage')) 123 124 # find phylogenetically informative genes in genome bins 125 mgf = MarkerGeneFinder(options.threads) 126 binIdToModels = mgf.find(binFiles, 127 options.output_dir, 128 DefaultValues.HMMER_TABLE_PHYLO_OUT, 129 DefaultValues.HMMER_PHYLO_OUT, 130 DefaultValues.PHYLO_HMM_MODELS, 131 options.bKeepAlignment, 132 options.bNucORFs, 133 options.bCalledGenes) 134 135 # write model information to file 136 markerSetParser = MarkerSetParser(options.threads) 137 hmmModelInfoFile = os.path.join(options.output_dir, 'storage', DefaultValues.PHYLO_HMM_MODEL_INFO) 138 markerSetParser.writeBinModels(binIdToModels, hmmModelInfoFile) 139 140 # calculate statistics for each genome bin 141 142 binStats = BinStatistics(options.threads) 143 binStats.calculate(binFiles, options.output_dir, DefaultValues.BIN_STATS_PHYLO_OUT) 144 145 # align identified marker genes 146 147 HA = HmmerAligner(options.threads) 148 resultsParser = HA.makeAlignmentToPhyloMarkers(options.output_dir, 149 DefaultValues.PHYLO_HMM_MODELS, 150 DefaultValues.HMMER_TABLE_PHYLO_OUT, 151 binIdToModels, 152 False, 153 DefaultValues.E_VAL, 154 DefaultValues.LENGTH, 155 False, 156 os.path.join(options.output_dir, 'storage', 'tree') 157 ) 158 159 # place bins into genome tree 160 161 pplacer = PplacerRunner(threads=options.pplacer_threads) # fix at one thread to keep memory requirements reasonable 162 pplacer.run(binFiles, resultsParser, options.output_dir, options.bReducedTree) 163 164 self.timeKeeper.printTimeStamp() 165 166 def treeQA(self, options): 167 """QA command""" 168 self.logger.info('[CheckM - tree_qa] Assessing phylogenetic markers found in each bin.') 169 170 checkDirExists(options.tree_dir) 171 172 # set HMM file for each bin 173 markerSetParser = MarkerSetParser() 174 hmmModelInfoFile = os.path.join(options.tree_dir, 'storage', DefaultValues.PHYLO_HMM_MODEL_INFO) 175 binIdToModels = markerSetParser.loadBinModels(hmmModelInfoFile) 176 177 # calculate marker gene statistics 178 RP = ResultsParser(binIdToModels) 179 binStats = RP.analyseResults(options.tree_dir, 180 DefaultValues.BIN_STATS_PHYLO_OUT, 181 DefaultValues.HMMER_TABLE_PHYLO_OUT) 182 183 # determine taxonomy of each bin 184 185 treeParser = TreeParser() 186 treeParser.printSummary(options.out_format, options.tree_dir, RP, options.bTabTable, options.file, binStats) 187 188 if options.file != '': 189 self.logger.info('QA information written to: ' + options.file) 190 191 self.timeKeeper.printTimeStamp() 192 193 def lineageSet(self, options, db=None): 194 """Lineage set command""" 195 self.logger.info('[CheckM - lineage_set] Inferring lineage-specific marker sets.') 196 197 checkDirExists(options.tree_dir) 198 199 # set HMM file for each bin 200 markerSetParser = MarkerSetParser() 201 hmmModelInfoFile = os.path.join(options.tree_dir, 'storage', DefaultValues.PHYLO_HMM_MODEL_INFO) 202 binIdToModels = markerSetParser.loadBinModels(hmmModelInfoFile) 203 204 # calculate marker gene statistics 205 resultsParser = ResultsParser(binIdToModels) 206 resultsParser.analyseResults(options.tree_dir, 207 DefaultValues.BIN_STATS_PHYLO_OUT, 208 DefaultValues.HMMER_TABLE_PHYLO_OUT) 209 210 # These options are incompatible with how the lineage-specific marker set is selected, so 211 # the default values are currently hard-coded 212 213 options.num_genomes_markers = 2 214 options.bootstrap = 0 215 options.bRequireTaxonomy = False 216 217 treeParser = TreeParser() 218 treeParser.getBinMarkerSets(options.tree_dir, options.marker_file, 219 options.num_genomes_markers, 220 options.bootstrap, options.bNoLineageSpecificRefinement, 221 options.bForceDomain, options.bRequireTaxonomy, 222 resultsParser, options.unique, options.multi) 223 224 225 self.logger.info('Marker set written to: ' + options.marker_file) 226 227 self.timeKeeper.printTimeStamp() 228 229 def taxonList(self, options, db=None): 230 """Lineage set command""" 231 self.logger.info('[CheckM - taxon_list] Listing available taxonomic-specific marker sets.') 232 233 taxonParser = TaxonParser() 234 taxonParser.list(options.rank) 235 236 self.timeKeeper.printTimeStamp() 237 238 def taxonSet(self, options, db=None): 239 """Taxon set command""" 240 self.logger.info('[CheckM - taxon_set] Generate taxonomic-specific marker set.') 241 242 path = os.path.split(options.marker_file)[0] 243 if path: 244 makeSurePathExists(path) 245 246 taxonParser = TaxonParser() 247 bValidSet = taxonParser.markerSet(options.rank, options.taxon, options.marker_file) 248 249 if bValidSet: 250 251 self.logger.info('Marker set written to: ' + options.marker_file) 252 253 self.timeKeeper.printTimeStamp() 254 255 def analyze(self, options, db=None): 256 """Analyze command""" 257 self.logger.info('[CheckM - analyze] Identifying marker genes in bins.') 258 259 binFiles = self.binFiles(options.bin_dir, options.extension) 260 261 if not options.bCalledGenes: 262 if not checkNuclotideSeqs(binFiles): 263 return 264 else: 265 if not checkProteinSeqs(binFiles): 266 return 267 268 # setup directory structure 269 makeSurePathExists(options.output_dir) 270 makeSurePathExists(os.path.join(options.output_dir, 'bins')) 271 makeSurePathExists(os.path.join(options.output_dir, 'storage')) 272 makeSurePathExists(os.path.join(options.output_dir, 'storage', 'aai_qa')) 273 274 # find marker genes in genome bins 275 mgf = MarkerGeneFinder(options.threads) 276 binIdToModels = mgf.find(binFiles, 277 options.output_dir, 278 DefaultValues.HMMER_TABLE_OUT, 279 DefaultValues.HMMER_OUT, 280 options.marker_file, 281 options.bKeepAlignment, 282 options.bNucORFs, 283 options.bCalledGenes) 284 285 markerSetParser = MarkerSetParser(options.threads) 286 binIdToBinMarkerSets = markerSetParser.getMarkerSets(options.output_dir, 287 getBinIdsFromOutDir(options.output_dir), 288 options.marker_file) 289 290 hmmModelInfoFile = os.path.join(options.output_dir, 'storage', DefaultValues.CHECKM_HMM_MODEL_INFO) 291 markerSetParser.writeBinModels(binIdToModels, hmmModelInfoFile) 292 293 self.timeKeeper.printTimeStamp() 294 295 # HMM model file 296 if markerSetParser.markerFileType(options.marker_file) == BinMarkerSets.HMM_MODELS_SET: 297 markerFile = options.marker_file 298 else: 299 markerFile = DefaultValues.HMM_MODELS 300 301 # align marker genes with multiple hits within a bin 302 303 HA = HmmerAligner(options.threads) 304 HA.makeAlignmentsOfMultipleHits(options.output_dir, 305 markerFile, 306 DefaultValues.HMMER_TABLE_OUT, 307 binIdToModels, 308 binIdToBinMarkerSets, 309 False, 310 DefaultValues.E_VAL, 311 DefaultValues.LENGTH, 312 os.path.join(options.output_dir, 'storage', 'aai_qa') 313 ) 314 315 self.timeKeeper.printTimeStamp() 316 317 # calculate statistics for each genome bin 318 319 binStats = BinStatistics(options.threads) 320 binStats.calculate(binFiles, options.output_dir, DefaultValues.BIN_STATS_OUT) 321 322 self.timeKeeper.printTimeStamp() 323 324 # align top hit to each marker if requested 325 if options.bAlignTopHit: 326 alignmentOutputFolder = os.path.join(options.output_dir, 'storage', 'alignments') 327 makeSurePathExists(alignmentOutputFolder) 328 329 330 HA = HmmerAligner(options.threads) 331 resultsParser = HA.makeAlignmentTopHit(options.output_dir, 332 options.marker_file, 333 DefaultValues.HMMER_TABLE_OUT, 334 binIdToModels, 335 False, 336 DefaultValues.E_VAL, 337 DefaultValues.LENGTH, 338 True, 339 alignmentOutputFolder 340 ) 341 342 # report marker gene data 343 fout = open(os.path.join(alignmentOutputFolder, 'alignment_info.tsv'), 'w') 344 fout.write('Marker Id\tLength (bp)\n') 345 markerIds = resultsParser.models[resultsParser.models.keys()[0]].keys() 346 for markerId in markerIds: 347 fout.write('%s\t%d\n' % (markerId, resultsParser.models[resultsParser.models.keys()[0]][markerId].leng)) 348 fout.close() 349 350 351 self.logger.info('Alignments to top hits stored in: ' + alignmentOutputFolder) 352 353 self.timeKeeper.printTimeStamp() 354 355 def qa(self, options): 356 """QA command""" 357 self.logger.info('[CheckM - qa] Tabulating genome statistics.') 358 359 checkDirExists(options.analyze_dir) 360 361 if options.exclude_markers: 362 checkFileExists(options.exclude_markers) 363 364 # calculate AAI between marks with multiple hits in a single bin 365 aai = AminoAcidIdentity() 366 aai.run(options.aai_strain, options.analyze_dir, options.alignment_file) 367 368 # get HMM file for each bin 369 370 markerSetParser = MarkerSetParser(options.threads) 371 372 hmmModelInfoFile = os.path.join(options.analyze_dir, 'storage', DefaultValues.CHECKM_HMM_MODEL_INFO) 373 binIdToModels = markerSetParser.loadBinModels(hmmModelInfoFile) 374 375 binIdToBinMarkerSets = markerSetParser.getMarkerSets(options.analyze_dir, 376 getBinIdsFromOutDir(options.analyze_dir), 377 options.marker_file, 378 options.exclude_markers) 379 380 # get results for each bin 381 RP = ResultsParser(binIdToModels) 382 RP.analyseResults(options.analyze_dir, 383 DefaultValues.BIN_STATS_OUT, 384 DefaultValues.HMMER_TABLE_OUT, 385 bIgnoreThresholds=options.bIgnoreThresholds, 386 evalueThreshold=options.e_value, 387 lengthThreshold=options.length, 388 bSkipPseudoGeneCorrection=options.bSkipPseudoGeneCorrection, 389 bSkipAdjCorrection=options.bSkipAdjCorrection 390 ) 391 392 393 RP.printSummary(options.out_format, 394 aai, binIdToBinMarkerSets, 395 options.bIndividualMarkers, 396 options.coverage_file, 397 options.bTabTable, 398 options.file, 399 anaFolder=options.analyze_dir) 400 RP.cacheResults(options.analyze_dir, 401 binIdToBinMarkerSets, 402 options.bIndividualMarkers) 403 404 if options.file != '': 405 self.logger.info('QA information written to: ' + options.file) 406 407 self.timeKeeper.printTimeStamp() 408 409 def gcPlot(self, options): 410 """GC plot command""" 411 self.logger.info('[CheckM - gc_plot] Creating GC histogram and delta-GC plot.') 412 413 checkDirExists(options.bin_dir) 414 makeSurePathExists(options.output_dir) 415 416 binFiles = self.binFiles(options.bin_dir, options.extension) 417 418 plots = GcPlots(options) 419 filesProcessed = 1 420 for f in binFiles: 421 self.logger.info('Plotting GC plots for %s (%d of %d)' % (f, filesProcessed, len(binFiles))) 422 filesProcessed += 1 423 424 plots.plot(f, options.distributions) 425 426 binId = binIdFromFilename(f) 427 outputFile = os.path.join(options.output_dir, binId) + '.gc_plots.' + options.image_type 428 plots.savePlot(outputFile, dpi=options.dpi) 429 self.logger.info('Plot written to: ' + outputFile) 430 431 self.timeKeeper.printTimeStamp() 432 433 def codingDensityPlot(self, options): 434 """Coding density plot command""" 435 self.logger.info('[CheckM - coding_plot] Creating coding density histogram and delta-CD plot.') 436 437 checkDirExists(options.bin_dir) 438 makeSurePathExists(options.output_dir) 439 440 binFiles = self.binFiles(options.bin_dir, options.extension) 441 442 plots = CodingDensityPlots(options) 443 filesProcessed = 1 444 for f in binFiles: 445 self.logger.info('Plotting coding density plots for %s (%d of %d)' % (f, filesProcessed, len(binFiles))) 446 filesProcessed += 1 447 448 plots.plot(f, options.distributions) 449 450 binId = binIdFromFilename(f) 451 outputFile = os.path.join(options.output_dir, binId) + '.coding_density_plots.' + options.image_type 452 plots.savePlot(outputFile, dpi=options.dpi) 453 self.logger.info('Plot written to: ' + outputFile) 454 455 self.timeKeeper.printTimeStamp() 456 457 def tetraDistPlot(self, options): 458 """Tetranucleotide distance plot command""" 459 self.logger.info('[CheckM - tetra_plot] Creating tetra-distance histogram and delta-TD plot.') 460 461 checkDirExists(options.bin_dir) 462 makeSurePathExists(options.output_dir) 463 464 binFiles = self.binFiles(options.bin_dir, options.extension) 465 466 genomicSignatures = GenomicSignatures(K=4, threads=1) 467 tetraSigs = genomicSignatures.read(options.tetra_profile) 468 469 plots = TetraDistPlots(options) 470 filesProcessed = 1 471 for f in binFiles: 472 self.logger.info('Plotting tetranuclotide distance plots for %s (%d of %d)' % (f, filesProcessed, len(binFiles))) 473 filesProcessed += 1 474 475 binId = binIdFromFilename(f) 476 plots.plot(f, tetraSigs, options.distributions) 477 478 outputFile = os.path.join(options.output_dir, binId) + '.tetra_dist_plots.' + options.image_type 479 plots.savePlot(outputFile, dpi=options.dpi) 480 self.logger.info('Plot written to: ' + outputFile) 481 482 self.timeKeeper.printTimeStamp() 483 484 def distributionPlots(self, options): 485 """Reference distribution plot command""" 486 self.logger.info('[CheckM - dist_plot] Creating GC, CD, and TD distribution plots.') 487 488 checkDirExists(options.bin_dir) 489 makeSurePathExists(options.output_dir) 490 491 binFiles = self.binFiles(options.bin_dir, options.extension) 492 493 genomicSignatures = GenomicSignatures(K=4, threads=1) 494 tetraSigs = genomicSignatures.read(options.tetra_profile) 495 496 plots = DistributionPlots(options) 497 filesProcessed = 1 498 for f in binFiles: 499 self.logger.info('Plotting reference distribution plots for %s (%d of %d)' % (f, filesProcessed, len(binFiles))) 500 filesProcessed += 1 501 502 binId = binIdFromFilename(f) 503 plots.plot(f, tetraSigs, options.distributions) 504 505 outputFile = os.path.join(options.output_dir, binId) + '.ref_dist_plots.' + options.image_type 506 plots.savePlot(outputFile, dpi=options.dpi) 507 self.logger.info('Plot written to: ' + outputFile) 508 509 self.timeKeeper.printTimeStamp() 510 511 def tetraPcaPlot(self, options): 512 """PCA plot of tetranucleotide signatures""" 513 self.logger.info('[CheckM - tetra_pca] Creating PCA plot of tetranucleotide signatures.') 514 515 checkDirExists(options.bin_dir) 516 makeSurePathExists(options.output_dir) 517 518 binFiles = self.binFiles(options.bin_dir, options.extension) 519 520 self.logger.info('Computing PCA of tetranuclotide signatures.\n') 521 pca = PCA() 522 seqIds, pc, variance = pca.pcaFile(options.tetra_profile, fraction=1.0, bCenter=True, bScale=False) 523 524 plots = PcaPlot(options) 525 filesProcessed = 1 526 for f in binFiles: 527 self.logger.info('Plotting PCA of tetranuclotide signatures for %s (%d of %d)' % (f, filesProcessed, len(binFiles))) 528 filesProcessed += 1 529 530 plots.plot(f, seqIds, pc, variance) 531 532 binId = binIdFromFilename(f) 533 outputFile = os.path.join(options.output_dir, binId) + '.tetra_pca_plots.' + options.image_type 534 plots.savePlot(outputFile, dpi=options.dpi) 535 self.logger.info('Plot written to: ' + outputFile) 536 537 self.timeKeeper.printTimeStamp() 538 539 def coveragePcaPlot(self, options): 540 """PCA plot of coverage profiles""" 541 self.logger.info('[CheckM - cov_pca] Creating PCA plot of coverage profiles.') 542 543 checkDirExists(options.bin_dir) 544 checkFileExists(options.coverage_file) 545 makeSurePathExists(options.output_dir) 546 547 binFiles = self.binFiles(options.bin_dir, options.extension) 548 549 coverage = Coverage(threads=1) 550 coverageStats = coverage.parseCoverage(options.coverage_file) 551 552 seqIds = [] 553 coverageProfiles = [] 554 for binId, seqDict in coverageStats.iteritems(): 555 for seqId, bamDict in seqDict.iteritems(): 556 seqIds.append(seqId) 557 558 coverages = [] 559 for _, coverage in bamDict.iteritems(): 560 coverages.append(coverage) 561 562 coverageProfiles.append(coverages) 563 564 coverageProfiles = np.array(coverageProfiles) 565 if coverageProfiles.shape[1] < 2: 566 self.logger.error('Coverage profile is 1 dimensional. PCA requires at least 2 dimensions.') 567 sys.exit(1) 568 569 self.logger.info('Computing PCA of coverage profiles.\n') 570 pca = PCA() 571 pc, variance = pca.pcaMatrix(coverageProfiles, fraction=1.0, bCenter=True, bScale=False) 572 573 plots = PcaPlot(options) 574 filesProcessed = 1 575 for f in binFiles: 576 self.logger.info('Plotting PCA of coverage profiles for %s (%d of %d)' % (f, filesProcessed, len(binFiles))) 577 filesProcessed += 1 578 579 plots.plot(f, seqIds, pc, variance) 580 581 binId = binIdFromFilename(f) 582 outputFile = os.path.join(options.output_dir, binId) + '.cov_pca_plots.' + options.image_type 583 plots.savePlot(outputFile, dpi=options.dpi) 584 self.logger.info('Plot written to: ' + outputFile) 585 586 self.timeKeeper.printTimeStamp() 587 588 def gcBiasPlot(self, options): 589 """GC bias plot command""" 590 591 self.logger.info('[CheckM - gc_bias_plot] Plotting bin coverage as a function of GC.') 592 593 checkDirExists(options.bin_dir) 594 makeSurePathExists(options.output_dir) 595 596 binFiles = self.binFiles(options.bin_dir, options.extension) 597 598 coverageWindows = CoverageWindows(options.threads) 599 coverageProfile = coverageWindows.run(binFiles, options.bam_file, options.all_reads, options.min_align, options.max_edit_dist, options.window_size) 600 601 plots = GcBiasPlot(options) 602 filesProcessed = 1 603 for f in binFiles: 604 self.logger.info('Plotting GC plots for %s (%d of %d)' % (f, filesProcessed, len(binFiles))) 605 filesProcessed += 1 606 607 plots.plot(f, coverageProfile) 608 609 binId = binIdFromFilename(f) 610 outputFile = os.path.join(options.output_dir, binId) + '.gc_bias_plot.' + options.image_type 611 plots.savePlot(outputFile, dpi=options.dpi) 612 self.logger.info('Plot written to: ' + outputFile) 613 614 self.timeKeeper.printTimeStamp() 615 616 def nxPlot(self, options): 617 """Nx-plot command""" 618 619 self.logger.info('[CheckM - nx_plot] Creating Nx-plots.') 620 621 checkDirExists(options.bin_dir) 622 makeSurePathExists(options.output_dir) 623 624 binFiles = self.binFiles(options.bin_dir, options.extension) 625 626 nx = NxPlot(options) 627 filesProcessed = 1 628 for f in binFiles: 629 binId = binIdFromFilename(f) 630 self.logger.info('Plotting Nx-plot for %s (%d of %d)' % (binId, filesProcessed, len(binFiles))) 631 filesProcessed += 1 632 nx.plot(f) 633 634 outputFile = os.path.join(options.output_dir, binId) + '.nx_plot.' + options.image_type 635 nx.savePlot(outputFile, dpi=options.dpi) 636 self.logger.info('Plot written to: ' + outputFile) 637 638 self.timeKeeper.printTimeStamp() 639 640 def cumulativeLengthPlot(self, options): 641 """Cumulative sequence length plot command""" 642 643 self.logger.info('[CheckM - len_plot] Creating cumulative sequence length plot.') 644 645 checkDirExists(options.bin_dir) 646 makeSurePathExists(options.output_dir) 647 648 binFiles = self.binFiles(options.bin_dir, options.extension) 649 650 plot = CumulativeLengthPlot(options) 651 filesProcessed = 1 652 for f in binFiles: 653 binId = binIdFromFilename(f) 654 self.logger.info('Plotting cumulative sequence length plot for %s (%d of %d)' % (binId, filesProcessed, len(binFiles))) 655 filesProcessed += 1 656 plot.plot(f) 657 658 outputFile = os.path.join(options.output_dir, binId) + '.seq_len_plot.' + options.image_type 659 plot.savePlot(outputFile, dpi=options.dpi) 660 self.logger.info('Plot written to: ' + outputFile) 661 662 self.timeKeeper.printTimeStamp() 663 664 def lengthHistogram(self, options): 665 """Sequence length histogram command""" 666 667 self.logger.info('[CheckM - len_hist] Creating sequence length histogram.') 668 669 checkDirExists(options.bin_dir) 670 makeSurePathExists(options.output_dir) 671 672 binFiles = self.binFiles(options.bin_dir, options.extension) 673 674 plot = LengthHistogram(options) 675 filesProcessed = 1 676 for f in binFiles: 677 binId = binIdFromFilename(f) 678 self.logger.info('Plotting sequence length histogram for %s (%d of %d)' % (binId, filesProcessed, len(binFiles))) 679 filesProcessed += 1 680 plot.plot(f) 681 682 outputFile = os.path.join(options.output_dir, binId) + '.len_hist.' + options.image_type 683 plot.savePlot(outputFile, dpi=options.dpi) 684 self.logger.info('Plot written to: ' + outputFile) 685 686 self.timeKeeper.printTimeStamp() 687 688 def markerPlot(self, options): 689 """Marker gene position plot command""" 690 691 self.logger.info('[CheckM - marker_plot] Creating marker gene position plot.') 692 693 checkDirExists(options.bin_dir) 694 makeSurePathExists(options.output_dir) 695 696 # generate plot for each bin 697 binFiles = self.binFiles(options.bin_dir, options.extension) 698 699 resultsParser = ResultsParser(None) 700 markerGeneStats = resultsParser.parseMarkerGeneStats(options.results_dir) 701 binStats = resultsParser.parseBinStatsExt(options.results_dir) 702 703 plot = MarkerGenePosPlot(options) 704 filesProcessed = 1 705 for f in binFiles: 706 binId = binIdFromFilename(f) 707 self.logger.info('Plotting marker gene position plot for %s (%d of %d)' % (binId, filesProcessed, len(binFiles))) 708 filesProcessed += 1 709 710 if binId not in markerGeneStats or binId not in binStats: 711 continue # bin has no marker genes 712 713 bPlotted = plot.plot(f, markerGeneStats[binId], binStats[binId]) 714 715 if bPlotted: 716 outputFile = os.path.join(options.output_dir, binId) + '.marker_pos_plot.' + options.image_type 717 plot.savePlot(outputFile, dpi=options.dpi) 718 self.logger.info('Plot written to: ' + outputFile) 719 else: 720 self.logger.info('No marker genes found in bin.') 721 722 self.timeKeeper.printTimeStamp() 723 724 def parallelCoordPlot(self, options): 725 """Parallel coordinate plot command""" 726 727 self.logger.info('[CheckM - par_plot] Creating parallel coordinate plot of GC and coverage.') 728 729 checkDirExists(options.bin_dir) 730 makeSurePathExists(options.output_dir) 731 checkFileExists(options.coverage_file) 732 733 binFiles = self.binFiles(options.bin_dir, options.extension) 734 735 # read coverage stats file 736 coverage = Coverage(threads=1) 737 coverageStats = coverage.parseCoverage(options.coverage_file) 738 739 # calculate sequence stats for all bins 740 self.logger.info('Calculating sequence statistics for each bin.') 741 binStats = BinStatistics() 742 seqStats = {} 743 for f in binFiles: 744 binId = binIdFromFilename(f) 745 seqStats[binId] = binStats.sequenceStats(options.results_dir, f) 746 747 # create plot for each bin 748 749 plot = ParallelCoordPlot(options) 750 filesProcessed = 1 751 for f in binFiles: 752 binId = binIdFromFilename(f) 753 self.logger.info('Plotting parallel coordinates for %s (%d of %d)' % (binId, filesProcessed, len(binFiles))) 754 filesProcessed += 1 755 756 plot.plot(binId, seqStats, coverageStats) 757 758 outputFile = os.path.join(options.output_dir, binId) + '.paralel_coord_plot.' + options.image_type 759 plot.savePlot(outputFile, dpi=options.dpi) 760 self.logger.info('Plot written to: ' + outputFile) 761 762 self.timeKeeper.printTimeStamp() 763 764 def binQAPlot(self, options): 765 """Bin QA plot command""" 766 767 self.logger.info('[CheckM - bin_qa_plot] Creating bar plot of bin quality.') 768 769 checkDirExists(options.bin_dir) 770 makeSurePathExists(options.output_dir) 771 772 binFiles = self.binFiles(options.bin_dir, options.extension) 773 774 # read model info 775 # hmmModelInfoFile = os.path.join(options.analyze_dir, 'storage', DefaultValues.CHECKM_HMM_MODEL_INFO) 776 # binIdToModels = markerSetParser.loadBinModels(hmmModelInfoFile) 777 778 # read sequence stats file 779 resultsParser = ResultsParser(None) 780 binStatsExt = resultsParser.parseBinStatsExt(options.results_dir) 781 782 # create plot for each bin 783 plot = BinQAPlot(options) 784 bMakePlot = True 785 if not options.bIgnoreHetero: 786 aai = AminoAcidIdentity() 787 aai.run(options.aai_strain, options.results_dir, None) 788 bMakePlot = plot.plot(binFiles, binStatsExt, options.bIgnoreHetero, aai.aaiHetero) 789 else: 790 bMakePlot = plot.plot(binFiles, binStatsExt, options.bIgnoreHetero, None) 791 792 if bMakePlot: 793 outputFile = os.path.join(options.output_dir, 'bin_qa_plot.' + options.image_type) 794 plot.savePlot(outputFile, dpi=options.dpi) 795 796 self.logger.info('Plot written to: ' + outputFile) 797 798 self.timeKeeper.printTimeStamp() 799 800 def unbinned(self, options): 801 """Unbinned Command""" 802 803 self.logger.info('[CheckM - unbinned] Identify unbinned sequences.') 804 805 checkDirExists(options.bin_dir) 806 807 binFiles = self.binFiles(options.bin_dir, options.extension) 808 809 unbinned = Unbinned() 810 unbinned.run(binFiles, options.seq_file, options.output_seq_file, options.output_stats_file, options.min_seq_len) 811 812 813 self.logger.info('Unbinned sequences written to: ' + options.output_seq_file) 814 self.logger.info('Unbinned sequences statistics written to: ' + options.output_stats_file) 815 816 self.timeKeeper.printTimeStamp() 817 818 def coverage(self, options): 819 """Coverage command""" 820 821 self.logger.info('[CheckM - coverage] Calculating coverage of sequences.') 822 823 checkDirExists(options.bin_dir) 824 makeSurePathExists(os.path.dirname(options.output_file)) 825 826 binFiles = self.binFiles(options.bin_dir, options.extension) 827 828 coverage = Coverage(options.threads) 829 coverage.run(binFiles, options.bam_files, options.output_file, options.all_reads, 830 options.min_align, options.max_edit_dist, options.min_qc) 831 832 self.logger.info('Coverage information written to: ' + options.output_file) 833 834 self.timeKeeper.printTimeStamp() 835 836 def tetraSignatures(self, options): 837 """Tetranucleotide signature command""" 838 839 self.logger.info('[CheckM - tetra] Calculating tetranucleotide signature of sequences.') 840 841 checkFileExists(options.seq_file) 842 makeSurePathExists(os.path.dirname(options.output_file)) 843 844 845 tetraSig = GenomicSignatures(4, options.threads) 846 tetraSig.calculate(options.seq_file, options.output_file) 847 848 self.logger.info('Tetranucletoide signatures written to: ' + options.output_file) 849 850 self.timeKeeper.printTimeStamp() 851 852 def profile(self, options): 853 """Profile command""" 854 855 self.logger.info('[CheckM - profile] Calculating percentage of reads mapped to each bin.') 856 857 checkFileExists(options.coverage_file) 858 859 profile = Profile() 860 profile.run(options.coverage_file, options.file, options.bTabTable) 861 862 if options.file != '': 863 self.logger.info('Profile information written to: ' + options.file) 864 865 self.timeKeeper.printTimeStamp() 866 867 def merge(self, options): 868 """Merge command""" 869 870 self.logger.info('[CheckM - merge] Identifying bins with complementary sets of marker genes.') 871 872 checkDirExists(options.bin_dir) 873 874 binFiles = self.binFiles(options.bin_dir, options.extension) 875 876 if not options.bCalledGenes: 877 if not checkNuclotideSeqs(binFiles): 878 return 879 else: 880 if not checkProteinSeqs(binFiles): 881 return 882 883 markerSetParser = MarkerSetParser() 884 if markerSetParser.markerFileType(options.marker_file) == BinMarkerSets.TREE_MARKER_SET: 885 self.logger.error('Merge command requires a taxonomic-specific marker set or a user-defined HMM file.\n') 886 return 887 888 # setup directory structure 889 makeSurePathExists(options.output_dir) 890 makeSurePathExists(os.path.join(options.output_dir, 'bins')) 891 makeSurePathExists(os.path.join(options.output_dir, 'storage')) 892 makeSurePathExists(os.path.join(options.output_dir, 'storage', 'hmms')) 893 894 binIds = [] 895 for binFile in binFiles: 896 binIds.append(binIdFromFilename(binFile)) 897 898 # find marker genes in genome bins 899 mgf = MarkerGeneFinder(options.threads) 900 binIdToModels = mgf.find(binFiles, 901 options.output_dir, 902 "merger.table.txt", 903 "merger.hmmer3", 904 options.marker_file, 905 False, 906 False, 907 options.bCalledGenes) 908 909 # get HMM file for each bin 910 markerSetParser = MarkerSetParser() 911 binIdToBinMarkerSets = markerSetParser.getMarkerSets(options.output_dir, binIds, options.marker_file) 912 913 # compare markers found in each bin 914 915 merger = Merger() 916 outputFile = merger.run(binFiles, options.output_dir, "merger.table.txt", binIdToModels, binIdToBinMarkerSets, 917 options.delta_comp, options.delta_cont, options.merged_comp, options.merged_cont) 918 919 self.logger.info('Merger information written to: ' + outputFile) 920 921 self.timeKeeper.printTimeStamp() 922 923 def outliers(self, options): 924 """Outlier command""" 925 926 self.logger.info('[CheckM - outlier] Identifying outliers in bins.') 927 928 checkDirExists(options.bin_dir) 929 checkFileExists(options.tetra_profile) 930 makeSurePathExists(os.path.dirname(options.output_file)) 931 932 binFiles = self.binFiles(options.bin_dir, options.extension) 933 934 binTools = BinTools() 935 binTools.identifyOutliers(options.output_dir, binFiles, options.tetra_profile, options.distributions, options.report_type, options.output_file) 936 937 self.logger.info('Outlier information written to: ' + options.output_file) 938 939 self.timeKeeper.printTimeStamp() 940 941 def joinTables(self, options): 942 """Join tables command""" 943 944 self.logger.info('[CheckM - join_tables] Joining tables containing bin information.') 945 946 # read all tables 947 headers = {} 948 rows = defaultdict(dict) 949 binIds = set() 950 for f in options.tables: 951 with open(f) as fin: 952 headers[f] = [x.strip() for x in fin.readline().split('\t')][1:] 953 954 for line in fin: 955 lineSplit = [x.strip() for x in line.split('\t')] 956 957 binId = lineSplit[0] 958 binIds.add(binId) 959 960 for i, header in enumerate(headers[f]): 961 rows[binId][header] = lineSplit[i + 1] 962 963 # write merge table 964 oldStdOut = reassignStdOut(options.file) 965 966 row = 'Bin Id' 967 for f in options.tables: 968 row += '\t' + '\t'.join(headers[f]) 969 print(row) 970 971 for binId in binIds: 972 row = binId 973 for f in options.tables: 974 for header in headers[f]: 975 row += '\t' + rows[binId].get(header, '') 976 print(row) 977 978 restoreStdOut(options.file, oldStdOut) 979 980 if options.file: 981 self.logger.info('Joined table written to: ' + options.file) 982 983 self.timeKeeper.printTimeStamp() 984 985 def modify(self, options): 986 """Modify command""" 987 988 self.logger.info('[CheckM - modify] Modifying sequences in bin.') 989 990 makeSurePathExists(os.path.dirname(options.output_file)) 991 992 if not (options.add or options.remove or options.outlier_file): 993 self.logger.error('No modification to bin requested.\n') 994 sys.exit(1) 995 996 if (options.add or options.remove) and options.outlier_file: 997 self.logger.error("The 'outlier_file' option cannot be specified with 'add' or 'remove'.\n") 998 sys.exit(1) 999 1000 binTools = BinTools() 1001 1002 if options.add or options.remove: 1003 binTools.modify(options.bin_file, options.seq_file, options.add, options.remove, options.output_file) 1004 elif options.outlier_file: 1005 binTools.removeOutliers(options.bin_file, options.outlier_file, options.output_file) 1006 1007 self.logger.info('Modified bin written to: ' + options.output_file) 1008 1009 self.timeKeeper.printTimeStamp() 1010 1011 def unique(self, options): 1012 """Unique command""" 1013 1014 self.logger.info('[CheckM - unique] Ensuring no sequences are assigned to multiple bins.') 1015 1016 binFiles = self.binFiles(options.bin_dir, options.extension) 1017 1018 binTools = BinTools() 1019 binTools.unique(binFiles) 1020 1021 self.timeKeeper.printTimeStamp() 1022 1023 def ssuFinder(self, options): 1024 """SSU finder command""" 1025 1026 self.logger.info('[CheckM - ssu_finder] Identifying SSU (16S/18S) rRNAs in sequences.') 1027 1028 binFiles = self.binFiles(options.bin_dir, options.extension) 1029 1030 checkFileExists(options.seq_file) 1031 makeSurePathExists(options.output_dir) 1032 1033 ssuFinder = SSU_Finder(options.threads) 1034 ssuFinder.run(options.seq_file, binFiles, options.output_dir, options.evalue, options.concatenate) 1035 1036 self.timeKeeper.printTimeStamp() 1037 1038 def binCompare(self, options): 1039 """Bin compare command""" 1040 1041 self.logger.info('[CheckM - bin_compare] Comparing two sets of bins.') 1042 1043 checkDirExists(options.bin_dir1) 1044 checkDirExists(options.bin_dir2) 1045 1046 binFiles1 = self.binFiles(options.bin_dir1, options.extension1) 1047 binFiles2 = self.binFiles(options.bin_dir2, options.extension2) 1048 1049 binComparer = BinComparer() 1050 binComparer.report(binFiles1, binFiles2, options.seq_file, options.output_file) 1051 1052 1053 self.logger.info('Detailed bin comparison written to: ' + options.output_file) 1054 1055 self.timeKeeper.printTimeStamp() 1056 1057 def binUnion(self, options): 1058 """Bin union command""" 1059 1060 self.logger.info('[CheckM - bin_union] Redundancy reduce multiple sets of bins into a single set.') 1061 1062 output_dir = options.output_dir 1063 makeSurePathExists(output_dir) 1064 1065 bin_dirs = [] 1066 checkmQaTsvs = [] 1067 for i, arg in enumerate(options.bin_or_checkm_qa_table): 1068 if i % 2 == 0: 1069 checkDirExists(arg) 1070 bin_dirs.append(arg) 1071 else: 1072 checkFileExists(arg) 1073 checkmQaTsvs.append(arg) 1074 1075 if len(bin_dirs) < 2: 1076 self.logger.error("Need to specify at least two bin folders, found %i: " % len(bin_dirs)) 1077 sys.exit(1) 1078 if len(bin_dirs) != len(checkmQaTsvs): 1079 self.logger.error("Need to specify the same number of bin folders as checkm_qa_tsv files, found %i and %i, respectively: " % (len(bin_dirs), len(checkmQaTsvs))) 1080 sys.exit(1) 1081 1082 binFileSets = [] 1083 for bin_dir in bin_dirs: 1084 self.logger.info("Reading fasta files with extension %s from bin folder %s" % (options.extension, bin_dir)) 1085 binFileSets.append(self.binFiles(bin_dir, options.extension)) 1086 1087 binUnion = BinUnion() 1088 1089 contigConflictsOutputFile = os.path.join(output_dir, 'contigConflicts.csv') 1090 unionBinOutputFile = os.path.join(output_dir, 'union.txt') 1091 binUnion.report(bin_dirs, binFileSets, checkmQaTsvs, unionBinOutputFile, contigConflictsOutputFile, options.min_completeness, options.max_contamination) 1092 1093 def test(self, options): 1094 """Quick test of CheckM""" 1095 1096 self.logger.info('[CheckM - Test] Processing E.coli K12-W3310 to verify operation of CheckM.') 1097 1098 verifyEcoli = VerifyEcoli() 1099 verifyEcoli.run(self, options.output_dir) 1100 1101 self.timeKeeper.printTimeStamp() 1102 1103 def parseOptions(self, options): 1104 """Parse user options and call the correct pipeline(s)""" 1105 1106 try: 1107 if options.file == "stdout": 1108 options.file = '' 1109 except: 1110 pass 1111 1112 1113 if(options.subparser_name == "data"): 1114 self.updateCheckM_DB(options) 1115 elif(options.subparser_name == 'tree'): 1116 self.tree(options) 1117 elif(options.subparser_name == 'tree_qa'): 1118 self.treeQA(options) 1119 elif(options.subparser_name == 'lineage_set'): 1120 self.lineageSet(options) 1121 elif(options.subparser_name == 'taxon_list'): 1122 self.taxonList(options) 1123 elif(options.subparser_name == 'taxon_set'): 1124 self.taxonSet(options) 1125 elif(options.subparser_name == 'analyze'): 1126 self.analyze(options) 1127 elif(options.subparser_name == 'qa'): 1128 self.qa(options) 1129 elif(options.subparser_name == 'lineage_wf'): 1130 options.marker_file = os.path.join(options.output_dir, 'lineage.ms') 1131 options.tree_dir = options.output_dir 1132 options.analyze_dir = options.output_dir 1133 options.out_format = 1 1134 options.bAlignTopHit = False 1135 options.exclude_markers = None 1136 options.coverage_file = None 1137 1138 self.tree(options) 1139 self.lineageSet(options) 1140 self.analyze(options) 1141 self.qa(options) 1142 elif(options.subparser_name == 'taxonomy_wf'): 1143 options.marker_file = os.path.join(options.output_dir, options.taxon + '.ms') 1144 options.analyze_dir = options.output_dir 1145 options.out_format = 1 1146 options.bAlignTopHit = False 1147 options.exclude_markers = None 1148 1149 self.taxonSet(options) 1150 self.analyze(options) 1151 self.qa(options) 1152 elif(options.subparser_name == 'gc_plot'): 1153 self.gcPlot(options) 1154 elif(options.subparser_name == 'coding_plot'): 1155 self.codingDensityPlot(options) 1156 elif(options.subparser_name == 'tetra_plot'): 1157 self.tetraDistPlot(options) 1158 elif(options.subparser_name == 'dist_plot'): 1159 self.distributionPlots(options) 1160 elif(options.subparser_name == 'nx_plot'): 1161 self.nxPlot(options) 1162 elif(options.subparser_name == 'len_plot'): 1163 self.cumulativeLengthPlot(options) 1164 elif(options.subparser_name == 'len_hist'): 1165 self.lengthHistogram(options) 1166 elif(options.subparser_name == 'marker_plot'): 1167 self.markerPlot(options) 1168 elif(options.subparser_name == 'par_plot'): 1169 self.parallelCoordPlot(options) 1170 elif(options.subparser_name == 'tetra_pca'): 1171 self.tetraPcaPlot(options) 1172 elif(options.subparser_name == 'cov_pca'): 1173 self.coveragePcaPlot(options) 1174 elif(options.subparser_name == 'gc_bias_plot'): 1175 self.gcBiasPlot(options) 1176 elif(options.subparser_name == 'bin_qa_plot'): 1177 self.binQAPlot(options) 1178 elif(options.subparser_name == 'unbinned'): 1179 self.unbinned(options) 1180 elif(options.subparser_name == 'coverage'): 1181 self.coverage(options) 1182 elif(options.subparser_name == 'tetra'): 1183 self.tetraSignatures(options) 1184 elif(options.subparser_name == 'profile'): 1185 self.profile(options) 1186 elif(options.subparser_name == 'merge'): 1187 self.merge(options) 1188 elif(options.subparser_name == 'outliers'): 1189 self.outliers(options) 1190 elif(options.subparser_name == 'join_tables'): 1191 self.joinTables(options) 1192 elif(options.subparser_name == 'modify'): 1193 self.modify(options) 1194 elif(options.subparser_name == 'unique'): 1195 self.unique(options) 1196 elif(options.subparser_name == 'ssu_finder'): 1197 self.ssuFinder(options) 1198 elif(options.subparser_name == 'bin_compare'): 1199 self.binCompare(options) 1200 elif(options.subparser_name == 'bin_union'): 1201 self.binUnion(options) 1202 elif(options.subparser_name == 'test'): 1203 options.bCalledGenes = False 1204 options.pplacer_threads = 1 1205 self.test(options) 1206 else: 1207 self.logger.error('Unknown CheckM command: ' + options.subparser_name + '\n') 1208 sys.exit(1) 1209 1210 return 0 1211