CheckM-1.0.18/bin/checkm

#!/usr/bin/env python

###############################################################################
#
# checkm - main program entry point. See checkm/main.py for internals.
#
###############################################################################
#                                                                             #
#    This program is free software: you can redistribute it and/or modify     #
#    it under the terms of the GNU General Public License as published by     #
#    the Free Software Foundation, either version 3 of the License, or        #
#    (at your option) any later version.                                      #
#                                                                             #
#    This program is distributed in the hope that it will be useful,          #
#    but WITHOUT ANY WARRANTY; without even the implied warranty of           #
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            #
#    GNU General Public License for more details.                             #
#                                                                             #
#    You should have received a copy of the GNU General Public License        #
#    along with this program. If not, see <http://www.gnu.org/licenses/>.     #
#                                                                             #
###############################################################################

__author__ = "Donovan Parks, Connor Skennerton, Michael Imelfort"
__copyright__ = "Copyright 2014"
__credits__ = ["Donovan Parks", "Connor Skennerton", "Michael Imelfort"]
__license__ = "GPL3"
__maintainer__ = "Donovan Parks"
__email__ = "donovan.parks@gmail.com"
__status__ = "Development"

import argparse
import sys
import os

from checkm import main
from checkm.defaultValues import DefaultValues
from checkm.customHelpFormatter import CustomHelpFormatter
from checkm.util.taxonomyUtils import taxonomicRanks
from checkm.logger import logger_setup
import tempfile


class ChangeTempAction(argparse.Action):
    def __call__(self, parser, namespace, values, option_string=None):
        if os.path.isdir(values):
            tempfile.tempdir = values
        else:
            raise argparse.ArgumentTypeError('The value of %s must be a valid directory' % option_string)


def version():
    import checkm
    versionFile = open(os.path.join(checkm.__path__[0], 'VERSION'))
    return versionFile.readline().strip()


def printHelp():
    print ''
    print '                ...::: CheckM v' + version() + ' :::...'''
    print '''\

  Lineage-specific marker set:
    tree         -> Place bins in the reference genome tree
    tree_qa      -> Assess phylogenetic markers found in each bin
    lineage_set  -> Infer lineage-specific marker sets for each bin

  Taxonomic-specific marker set:
    taxon_list   -> List available taxonomic-specific marker sets
    taxon_set    -> Generate taxonomic-specific marker set

  Apply marker set to genome bins:
    analyze      -> Identify marker genes in bins
    qa           -> Assess bins for contamination and completeness

  Common workflows (combines above commands):
    lineage_wf   -> Runs tree, lineage_set, analyze, qa
    taxonomy_wf  -> Runs taxon_set, analyze, qa

  Bin QA plots:
    bin_qa_plot  -> Bar plot of bin completeness, contamination, and strain heterogeneity

  Reference distribution plots:
    gc_plot      -> Create GC histogram and delta-GC plot
    coding_plot  -> Create coding density (CD) histogram and delta-CD plot
    tetra_plot   -> Create tetranucleotide distance (TD) histogram and delta-TD plot
    dist_plot    -> Create image with GC, CD, and TD distribution plots together

  General plots:
    nx_plot      -> Create Nx-plots
    len_plot     -> Cumulative sequence length plot
    len_hist     -> Sequence length histogram
    marker_plot  -> Plot position of marker genes on sequences
    par_plot     -> Parallel coordinate plot of GC and coverage
    gc_bias_plot -> Plot bin coverage as a function of GC

  Sequence subspace plots:
    cov_pca      -> PCA plot of coverage profiles
    tetra_pca    -> PCA plot of tetranucleotide signatures

  Bin exploration and modification:
    unique       -> Ensure no sequences are assigned to multiple bins
    merge        -> Identify bins with complementary sets of marker genes
    bin_compare  -> Compare two sets of bins (e.g., from alternative binning methods)
    bin_union    -> [Experimental] Merge multiple binning efforts into a single bin set
    modify       -> [Experimental] Modify sequences in a bin
    outliers     -> [Experimental] Identify outlier in bins relative to reference distributions

  Utility functions:
    unbinned     -> Identify unbinned sequences
    coverage     -> Calculate coverage of sequences
    tetra        -> Calculate tetranucleotide signature of sequences
    profile      -> Calculate percentage of reads mapped to each bin
    join_tables  -> Join tab-separated value tables containing bin information
    ssu_finder   -> Identify SSU (16S/18S) rRNAs in sequences

  Use 'checkm data setRoot <checkm_data_dir>' to specify the location of CheckM database files.

  Usage: checkm <command> -h for command specific help
    '''

if __name__ == '__main__':
    # initialize the options parser
    parser = argparse.ArgumentParser(add_help=False)
    subparsers = parser.add_subparsers(help="--", dest='subparser_name')

    data_parser = subparsers.add_parser('data',
                                        formatter_class=CustomHelpFormatter,
                                        description='Set path to the CheckM database files.',
                                        epilog='Example: checkm data setRoot')
    data_parser.add_argument('action', nargs="+",
            help='''
  setRoot <PATH>  -> set the data directory to <PATH> (requires permissions)
            ''')

    # determine placement of each genome bin in the genome tree
    tree_parser = subparsers.add_parser('tree',
                                        formatter_class=CustomHelpFormatter,
                                        description='Place bins in the genome tree.',
                                        epilog='Example: checkm tree ./bins ./output')
    tree_parser.add_argument('bin_dir', help="directory containing bins (fasta format)")
    tree_parser.add_argument('output_dir', help="directory to write output files")
    tree_parser.add_argument('-r', '--reduced_tree', dest='bReducedTree', action="store_true", help="use reduced tree (requires <16GB of memory) for determining lineage of each bin")
    tree_parser.add_argument('--ali', dest='bKeepAlignment', action="store_true", default=False, help="generate HMMER alignment file for each bin")
    tree_parser.add_argument('--nt', dest='bNucORFs', action="store_true", default=False, help="generate nucleotide gene sequences for each bin")
    tree_parser.add_argument('-g', '--genes', dest='bCalledGenes', action="store_true", default=False, help="bins contain genes as amino acids instead of nucleotide contigs")
    tree_parser.add_argument('-x', '--extension', default='fna', help="extension of bins (other files in directory are ignored)")
    tree_parser.add_argument('-t', '--threads', type=int, default=1, help="number of threads")
    tree_parser.add_argument('--pplacer_threads', type=int, default=1, help="number of threads used by pplacer (memory usage increases linearly with additional threads)")
    tree_parser.add_argument('-q', '--quiet', dest='bQuiet', action="store_true", default=False, help="suppress console output")
    tree_parser.add_argument('--tmpdir', action=ChangeTempAction, help="specify an alternative directory for temporary files")

    # do QA on phylogenetic marker genes
    tree_qa_parser = subparsers.add_parser('tree_qa',
                                        formatter_class=CustomHelpFormatter,
                                        description='Assess phylogenetic markers found in each bin.',
                                        epilog='Example: checkm tree_qa ./output')
    tree_qa_parser.add_argument('tree_dir', help="directory specified during tree command")
    tree_qa_parser.add_argument('-o', '--out_format', type=int,
                                    help='''desired output:
  1. brief summary of genome tree placement
  2. detailed summary of genome tree placement including lineage-specific statistics
  3. genome tree in Newick format decorated with IMG genome ids
  4. genome tree in Newick format decorated with taxonomy strings
  5. multiple sequence alignment of reference genomes and bins''',
                                    default=1, choices=[1, 2, 3, 4, 5])
    tree_qa_parser.add_argument('-f', '--file', default='stdout', help="print results to file")
    tree_qa_parser.add_argument('--tab_table', dest='bTabTable', action="store_true", default=False, help="print tab-separated values table")
    tree_qa_parser.add_argument('-q', '--quiet', dest='bQuiet', action="store_true", default=False, help="suppress console output")
    tree_qa_parser.add_argument('--tmpdir', action=ChangeTempAction, help="specify an alternative directory for temporary files")

    # calculate lineage-specific marker set for genome bins
    lineage_set_parser = subparsers.add_parser('lineage_set',
                                        formatter_class=CustomHelpFormatter,
                                        description='Infer lineage-specific marker sets for each bin.',
                                        epilog='Example: checkm lineage_set ./output lineage.ms')
    lineage_set_parser.add_argument('tree_dir', help="directory specified during tree command")
    lineage_set_parser.add_argument('marker_file', help="output file describing marker set for each bin")
    lineage_set_parser.add_argument('-u', '--unique', type=int, default=10, help="minimum number of unique phylogenetic markers required to use lineage-specific marker set")
    lineage_set_parser.add_argument('-m', '--multi', type=int, default=10, help="maximum number of multi-copy phylogenetic markers before defaulting to domain-level marker set")
    lineage_set_parser.add_argument('--force_domain', dest='bForceDomain', action="store_true", default=False, help="use domain-level sets for all bins")
    lineage_set_parser.add_argument('--no_refinement', dest='bNoLineageSpecificRefinement', action="store_true", default=False, help="do not perform lineage-specific marker set refinement")
    lineage_set_parser.add_argument('-q', '--quiet', dest='bQuiet', action="store_true", default=False, help="suppress console output")
    lineage_set_parser.add_argument('--tmpdir', action=ChangeTempAction, help="specify an alternative directory for temporary files")

    # list of available taxonomic-specific marker set
    taxon_list_parser = subparsers.add_parser('taxon_list',
                                        formatter_class=CustomHelpFormatter,
                                        description='List available taxonomic-specific marker sets.',
                                        epilog='Example: checkm taxon_list --rank phylum')
    taxon_list_parser.add_argument('--rank', help="restrict list to specified taxonomic rank", choices=['ALL'] + taxonomicRanks, default='ALL')
    taxon_list_parser.add_argument('--tmpdir', action=ChangeTempAction, help="specify an alternative directory for temporary files")

    # calculate taxonomic-specific marker set
    taxon_set_parser = subparsers.add_parser('taxon_set',
                                        formatter_class=CustomHelpFormatter,
                                        description='Generate taxonomic-specific marker set.',
                                        epilog='Example: checkm taxon_set domain Bacteria bacteria.ms')
    taxon_set_parser.add_argument('rank', help="taxonomic rank", choices=taxonomicRanks)
    taxon_set_parser.add_argument('taxon', help="taxon of interest")
    taxon_set_parser.add_argument('marker_file', help="output file describing taxonomic-specific marker set")
    taxon_set_parser.add_argument('-q', '--quiet', dest='bQuiet', action="store_true", default=False, help="suppress console output")
    taxon_set_parser.add_argument('--tmpdir', action=ChangeTempAction, help="specify an alternative directory for temporary files")

    # identify marker genes within binned contigs and calculate genome statistics
    analyze_parser = subparsers.add_parser('analyze',
                                        formatter_class=CustomHelpFormatter,
                                        description='Identify marker genes in bins and calculate genome statistics.',
                                        epilog='Example: checkm analyze lineage.ms ./bins ./output')
    analyze_parser.add_argument('marker_file', help="markers for assessing bins (marker set or HMM file)")
    analyze_parser.add_argument('bin_dir', help="directory containing bins (fasta format)")
    analyze_parser.add_argument('output_dir', help="directory to write output files")
    analyze_parser.add_argument('--ali', dest='bKeepAlignment', action="store_true", default=False, help="generate HMMER alignment file for each bin")
    analyze_parser.add_argument('--nt', dest='bNucORFs', action="store_true", default=False, help="generate nucleotide gene sequences for each bin")
    analyze_parser.add_argument('-g', '--genes', dest='bCalledGenes', action="store_true", default=False, help="bins contain genes as amino acids instead of nucleotide contigs")
    analyze_parser.add_argument('-x', '--extension', default='fna', help="extension of bins (other files in directory are ignored)")
    analyze_parser.add_argument('-t', '--threads', type=int, default=1, help="number of threads")
    analyze_parser.add_argument('-q', '--quiet', dest='bQuiet', action="store_true", default=False, help="suppress console output")

    analyze_parser.add_argument('--ali_top_hits', dest='bAlignTopHit', action="store_true", default=False, help=argparse.SUPPRESS)  # [hidden argument] align top marker hits (used by genome tree database)
    analyze_parser.add_argument('--tmpdir', action=ChangeTempAction, help="specify an alternative directory for temporary files")

    # do QA on pre-processed contigs
    qa_parser = subparsers.add_parser('qa',
                                        formatter_class=CustomHelpFormatter,
                                        description='Assess bins for contamination and completeness.',
                                        epilog='''
Note: lineage_wf and taxonomy_wf produce a marker file in the specified output directory. The
        lineage workflow produced a marker file called lineage.ms, while the taxonomy workflow
        produces a marker file called <taxon>.ms (e.g. Bacteria.ms).

Example: checkm qa ./output/lineage.ms ./output
                                        ''')
    qa_parser.add_argument('marker_file', help="marker file specified during analyze command")
    qa_parser.add_argument('analyze_dir', help="directory specified during analyze command")
    qa_parser.add_argument('-o', '--out_format', type=int,
                                help='''desired output:
  1. summary of bin completeness and contamination
  2. extended summary of bin statistics (includes GC, genome size, ...)
  3. summary of bin quality for increasingly basal lineage-specific marker sets
  4. list of marker genes and their counts
  5. list of bin id, marker gene id, gene id
  6. list of marker genes present multiple times in a bin
  7. list of marker genes present multiple times on the same scaffold
  8. list indicating position of each marker gene within a bin
  9. FASTA file of marker genes identified in each bin''',
  # 10. scaffold statistics: scaffold id, bin id, length, GC, ..., marker gene(s)''',
                                default=1, choices=[1, 2, 3, 4, 5, 6, 7, 8, 9])
    qa_parser.add_argument('--exclude_markers', default=None, help="file specifying markers to exclude from marker sets")
    qa_parser.add_argument('--individual_markers', dest='bIndividualMarkers', action="store_true", default=False, help="treat marker as independent (i.e., ignore co-located set structure)")
    qa_parser.add_argument('--skip_adj_correction', dest='bSkipAdjCorrection', action="store_true", default=False, help="do not exclude adjacent marker genes when estimating contamination")
    qa_parser.add_argument('--skip_pseudogene_correction', dest='bSkipPseudoGeneCorrection', action="store_true", default=False, help="skip identification and filtering of pseudogenes")
    qa_parser.add_argument('--aai_strain', type=float, default=0.9, help="AAI threshold used to identify strain heterogeneity")
    qa_parser.add_argument('-a', '--alignment_file', default=None, help="produce file showing alignment of multi-copy genes and their AAI identity")
    qa_parser.add_argument('--ignore_thresholds', dest='bIgnoreThresholds', action="store_true", default=False, help="ignore model-specific score thresholds")
    qa_parser.add_argument('-e', '--e_value', type=float, default=DefaultValues.E_VAL, help="e-value cut off")
    qa_parser.add_argument('-l', '--length', type=float, default=DefaultValues.LENGTH, help="percent overlap between target and query")
    qa_parser.add_argument('-c', '--coverage_file', default=None, help="file containing coverage of each sequence; coverage information added to table type 2 (see coverage command)")
    qa_parser.add_argument('-f', '--file', default='stdout', help="print results to file")
    qa_parser.add_argument('--tab_table', dest='bTabTable', action="store_true", default=False, help="print tab-separated values table")
    qa_parser.add_argument('-t', '--threads', type=int, default=1, help="number of threads")
    qa_parser.add_argument('-q', '--quiet', dest='bQuiet', action="store_true", default=False, help="suppress console output")
    qa_parser.add_argument('--tmpdir', action=ChangeTempAction, help="specify an alternative directory for temporary files")

    # run lineage-specific workflow
    lineage_wf_parser = subparsers.add_parser('lineage_wf',
                                        formatter_class=CustomHelpFormatter,
                                        description='Runs tree, lineage_set, analyze, qa',
                                        epilog='Example: checkm lineage_wf ./bins ./output')
    lineage_wf_parser.add_argument('bin_dir', help="directory containing bins (fasta format)")
    lineage_wf_parser.add_argument('output_dir', help="directory to write output files")
    lineage_wf_parser.add_argument('-r', '--reduced_tree', dest='bReducedTree', action="store_true", help="use reduced tree (requires <16GB of memory) for determining lineage of each bin")
    lineage_wf_parser.add_argument('--ali', dest='bKeepAlignment', action="store_true", default=False, help="generate HMMER alignment file for each bin")
    lineage_wf_parser.add_argument('--nt', dest='bNucORFs', action="store_true", default=False, help="generate nucleotide gene sequences for each bin")
    lineage_wf_parser.add_argument('-g', '--genes', dest='bCalledGenes', action="store_true", default=False, help="bins contain genes as amino acids instead of nucleotide contigs")
    lineage_wf_parser.add_argument('-u', '--unique', type=int, default=10, help="minimum number of unique phylogenetic markers required to use lineage-specific marker set")
    lineage_wf_parser.add_argument('-m', '--multi', type=int, default=10, help="maximum number of multi-copy phylogenetic markers before defaulting to domain-level marker set")
    lineage_wf_parser.add_argument('--force_domain', dest='bForceDomain', action="store_true", default=False, help="use domain-level sets for all bins")
    lineage_wf_parser.add_argument('--no_refinement', dest='bNoLineageSpecificRefinement', action="store_true", default=False, help="do not perform lineage-specific marker set refinement")
    lineage_wf_parser.add_argument('--individual_markers', dest='bIndividualMarkers', action="store_true", default=False, help="treat marker as independent (i.e., ignore co-located set structure)")
    lineage_wf_parser.add_argument('--skip_adj_correction', dest='bSkipAdjCorrection', action="store_true", default=False, help="do not exclude adjacent marker genes when estimating contamination")
    lineage_wf_parser.add_argument('--skip_pseudogene_correction', dest='bSkipPseudoGeneCorrection', action="store_true", default=False, help="skip identification and filtering of pseudogenes")
    lineage_wf_parser.add_argument('--aai_strain', type=float, default=0.9, help="AAI threshold used to identify strain heterogeneity")
    lineage_wf_parser.add_argument('-a', '--alignment_file', default=None, help="produce file showing alignment of multi-copy genes and their AAI identity")
    lineage_wf_parser.add_argument('--ignore_thresholds', dest='bIgnoreThresholds', action="store_true", default=False, help="ignore model-specific score thresholds")
    lineage_wf_parser.add_argument('-e', '--e_value', type=float, default=DefaultValues.E_VAL, help="e-value cut off")
    lineage_wf_parser.add_argument('-l', '--length', type=float, default=DefaultValues.LENGTH, help="percent overlap between target and query")
    lineage_wf_parser.add_argument('-f', '--file', default='stdout', help="print results to file")
    lineage_wf_parser.add_argument('--tab_table', dest='bTabTable', action="store_true", default=False, help="print tab-separated values table")
    lineage_wf_parser.add_argument('-x', '--extension', default='fna', help="extension of bins (other files in directory are ignored)")
    lineage_wf_parser.add_argument('-t', '--threads', type=int, default=1, help="number of threads")
    lineage_wf_parser.add_argument('--pplacer_threads', type=int, default=1, help="number of threads used by pplacer (memory usage increases linearly with additional threads)")
    lineage_wf_parser.add_argument('-q', '--quiet', dest='bQuiet', action="store_true", default=False, help="suppress console output")
    lineage_wf_parser.add_argument('--tmpdir', action=ChangeTempAction, help="specify an alternative directory for temporary files")

    # run taxonomic-specific workflow
    taxonomy_wf_parser = subparsers.add_parser('taxonomy_wf',
                                        formatter_class=CustomHelpFormatter,
                                        description='Runs taxon_set, analyze, qa',
                                        epilog='Example: checkm taxonomy_wf domain Bacteria ./bins ./output')

    taxonomy_wf_parser.add_argument('rank', help="taxonomic rank", choices=['life', 'domain', 'phylum', 'class', 'order', 'family', 'genus', 'species'])
    taxonomy_wf_parser.add_argument('taxon', help="taxon of interest")
    taxonomy_wf_parser.add_argument('bin_dir', help="directory containing bins (fasta format)")
    taxonomy_wf_parser.add_argument('output_dir', help="directory to write output files")
    taxonomy_wf_parser.add_argument('--ali', dest='bKeepAlignment', action="store_true", default=False, help="generate HMMER alignment file for each bin")
    taxonomy_wf_parser.add_argument('--nt', dest='bNucORFs', action="store_true", default=False, help="generate nucleotide gene sequences for each bin")
    taxonomy_wf_parser.add_argument('-g', '--genes', dest='bCalledGenes', action="store_true", default=False, help="bins contain genes as amino acids instead of nucleotide contigs")
    taxonomy_wf_parser.add_argument('--individual_markers', dest='bIndividualMarkers', action="store_true", default=False, help="treat marker as independent (i.e., ignore co-located set structure)")
    taxonomy_wf_parser.add_argument('--skip_adj_correction', dest='bSkipAdjCorrection', action="store_true", default=False, help="do not exclude adjacent marker genes when estimating contamination")
    taxonomy_wf_parser.add_argument('--skip_pseudogene_correction', dest='bSkipPseudoGeneCorrection', action="store_true", default=False, help="skip identification and filtering of pseudogenes")
    taxonomy_wf_parser.add_argument('--aai_strain', type=float, default=0.9, help="AAI threshold used to identify strain heterogeneity")
    taxonomy_wf_parser.add_argument('-a', '--alignment_file', default=None, help="produce file showing alignment of multi-copy genes and their AAI identity")
    taxonomy_wf_parser.add_argument('--ignore_thresholds', dest='bIgnoreThresholds', action="store_true", default=False, help="ignore model-specific score thresholds")
    taxonomy_wf_parser.add_argument('-e', '--e_value', type=float, default=DefaultValues.E_VAL, help="e-value cut off")
    taxonomy_wf_parser.add_argument('-l', '--length', type=float, default=DefaultValues.LENGTH, help="percent overlap between target and query")
    taxonomy_wf_parser.add_argument('-c', '--coverage_file', default=None, help="file containing coverage of each sequence; coverage information added to table type 2 (see coverage command)")
    taxonomy_wf_parser.add_argument('-f', '--file', default='stdout', help="print results to file")
    taxonomy_wf_parser.add_argument('--tab_table', dest='bTabTable', action="store_true", default=False, help="print tab-separated values table")
    taxonomy_wf_parser.add_argument('-x', '--extension', default='fna', help="extension of bins (other files in directory are ignored)")
    taxonomy_wf_parser.add_argument('-t', '--threads', type=int, default=1, help="number of threads")
    taxonomy_wf_parser.add_argument('-q', '--quiet', dest='bQuiet', action="store_true", default=False, help="suppress console output")
    taxonomy_wf_parser.add_argument('--tmpdir', action=ChangeTempAction, help="specify an alternative directory for temporary files")

    # generic arguments for plots
    plot_need_qa_results_parser = argparse.ArgumentParser(add_help=False)
    plot_need_qa_results_parser.add_argument('results_dir', help="directory specified during qa command")

    plot_parser = argparse.ArgumentParser(add_help=False)
    plot_parser.add_argument('bin_dir', help="directory containing bins to plot (fasta format)")
    plot_parser.add_argument('output_dir', help="directory to hold plots")
    plot_parser.add_argument('--image_type', default='png', choices=['eps', 'pdf', 'png', 'ps', 'svg'], help='desired image type')
    plot_parser.add_argument('--dpi', type=int, default=600, help='desired DPI of output image')
    plot_parser.add_argument('--font_size', type=int, default=8, help='Desired font size')
    plot_parser.add_argument('-x', '--extension', default='fna', help="extension of bins (other files in directory are ignored)")

    plot_single_parser = argparse.ArgumentParser('plot_single',
                                        parents=[plot_parser], add_help=False)
    plot_single_parser.add_argument('--width', type=float, default=6.5, help='width of output image')
    plot_single_parser.add_argument('--height', type=float, default=6.5, help='height of output image')

    plot_double_parser = argparse.ArgumentParser('plot_double',
                                        parents=[plot_parser], add_help=False)
    plot_double_parser.add_argument('--width', type=float, default=6.5, help='width of output image')
    plot_double_parser.add_argument('--height', type=float, default=3.5, help='height of output image')

    plot_rows_parser = argparse.ArgumentParser('plot_rows',
                                        parents=[plot_parser], add_help=False)
    plot_rows_parser.add_argument('--width', type=float, default=6.5, help='width of output image')
    plot_rows_parser.add_argument('--row_height', type=float, default=0.3, help='height of each row in the output image')

    # GC plot
    plot_gc_parser = subparsers.add_parser('gc_plot',
                                        formatter_class=CustomHelpFormatter,
                                        help='Create GC histogram and delta-GC plot.',
                                        parents=[plot_double_parser],
                                        description='Create GC histogram and delta-GC plot.',
                                        epilog='Example: checkm gc_plot ./bins ./plots 95')

    plot_gc_parser.add_argument('distributions', help='reference distribution(s) to plot; integer between 0 and 100', nargs='+', type=int, choices=xrange(0, 101), default=95, metavar='dist_value')
    plot_gc_parser.add_argument('-w', '--gc_window_size', help="window size used to calculate GC histogram", type=int, default=5000)
    plot_gc_parser.add_argument('-b', '--gc_bin_width', help="width of GC bars in histogram", type=float, default=0.01)
    plot_gc_parser.add_argument('-q', '--quiet', dest='bQuiet', action="store_true", default=False, help="suppress console output")

    # Coding density plot
    plot_coding_parser = subparsers.add_parser('coding_plot',
                                        formatter_class=CustomHelpFormatter,
                                        parents=[plot_need_qa_results_parser, plot_double_parser],
                                        description='Create coding density (CD) histogram and delta-CD plot.',
                                        epilog='Example: checkm coding_plot ./output ./bins ./plots 95')

    plot_coding_parser.add_argument('distributions', help='reference distribution(s) to plot; integer between 0 and 100', nargs='+', type=int, choices=xrange(0, 101), default=95, metavar='dist_value')
    plot_coding_parser.add_argument('-w', '--cd_window_size', help="window size used to calculate CD histogram", type=int, default=10000)
    plot_coding_parser.add_argument('-b', '--cd_bin_width', help="width of CD bars in histogram", type=float, default=0.01)
    plot_coding_parser.add_argument('-q', '--quiet', dest='bQuiet', action="store_true", default=False, help="suppress console output")

    # Tetranucleotide distance  plot
    plot_tetra_parser = subparsers.add_parser('tetra_plot',
                                        formatter_class=CustomHelpFormatter,
                                        parents=[plot_need_qa_results_parser, plot_double_parser],
                                        description='Create tetranucleotide distance (TD) histogram and delta-TD plot.',
                                        epilog='Example: checkm tetra_plot ./output ./bins ./plots tetra.tsv 95')
    plot_tetra_parser.add_argument('tetra_profile', help='tetranucleotide profiles for each bin (see tetra command)')
    plot_tetra_parser.add_argument('distributions', help='reference distribution(s) to plot; integer between 0 and 100', nargs='+', type=int, choices=xrange(0, 101), default=95, metavar='dist_value')
    plot_tetra_parser.add_argument('-w', '--td_window_size', help="window size used to calculate TD histogram", type=int, default=5000)
    plot_tetra_parser.add_argument('-b', '--td_bin_width', help="width of TD bars in histogram", type=float, default=0.01)
    plot_tetra_parser.add_argument('-q', '--quiet', dest='bQuiet', action="store_true", default=False, help="suppress console output")

    # Reference distribution  plot
    plot_dist_parser = subparsers.add_parser('dist_plot',
                                        formatter_class=CustomHelpFormatter,
                                        description='Create image with GC, CD, and TD distribution plots together.',
                                        epilog='Example: checkm dist_plot ./output ./bins ./plots tetra.tsv 95')
    plot_dist_parser.add_argument('results_dir', help="directory specified during analyze command")
    plot_dist_parser.add_argument('bin_dir', help="directory containing bins to plot (fasta format)")
    plot_dist_parser.add_argument('output_dir', help="directory to hold plots")
    plot_dist_parser.add_argument('tetra_profile', help='tetranucleotide profiles for each sequence (see tetra command)')
    plot_dist_parser.add_argument('distributions', help='reference distribution(s) to plot; integer between 0 and 100', nargs='+', type=int, choices=xrange(0, 101), default=95, metavar='dist_value')

    plot_dist_parser.add_argument('--image_type', default='png', choices=['eps', 'pdf', 'png', 'ps', 'svg'], help='desired image type')
    plot_dist_parser.add_argument('--dpi', type=int, default=600, help='desired DPI of output image')
    plot_dist_parser.add_argument('--font_size', type=int, default=8, help='Desired font size')
    plot_dist_parser.add_argument('-x', '--extension', default='fna', help="extension of bins (other files in directory are ignored)")
    plot_dist_parser.add_argument('--width', type=float, default=6.5, help='width of output image')
    plot_dist_parser.add_argument('--height', type=float, default=8, help='height of output image')

    plot_dist_parser.add_argument('-a', '--gc_window_size', help="window size used to calculate GC histogram", type=int, default=5000)
    plot_dist_parser.add_argument('-b', '--td_window_size', help="window size used to calculate TD histogram", type=int, default=5000)
    plot_dist_parser.add_argument('-c', '--cd_window_size', help="window size used to calculate CD histogram", type=int, default=10000)
    plot_dist_parser.add_argument('-1', '--gc_bin_width', help="width of GC bars in histogram", type=float, default=0.01)
    plot_dist_parser.add_argument('-2', '--td_bin_width', help="width of TD bars in histogram", type=float, default=0.01)
    plot_dist_parser.add_argument('-3', '--cd_bin_width', help="width of CD bars in histogram", type=float, default=0.01)
    plot_dist_parser.add_argument('-q', '--quiet', dest='bQuiet', action="store_true", default=False, help="suppress console output")

    # PCA plot of tetranucleotide signatures
    plot_tetra_pca_parser = subparsers.add_parser('tetra_pca',
                                        formatter_class=CustomHelpFormatter,
                                        parents=[plot_parser],
                                        description='PCA plot of tetranucleotide signatures.',
                                        epilog='Example: checkm tetra_pca ./bins ./plots tetra.tsv')
    plot_tetra_pca_parser.add_argument('tetra_profile', help='tetranucleotide profiles for each sequence (see tetra command)')
    plot_tetra_pca_parser.add_argument('--width', type=float, default=6.5, help='width of output image')
    plot_tetra_pca_parser.add_argument('--height', type=float, default=6.5, help='height of output image')
    plot_tetra_pca_parser.add_argument('-q', '--quiet', dest='bQuiet', action="store_true", default=False, help="suppress console output")

    # GC bias plots
    plot_gc_bias_parser = subparsers.add_parser('gc_bias_plot',
                                        formatter_class=CustomHelpFormatter,
                                        parents=[plot_double_parser],
                                        description='Plot bin coverage as a function of GC.',
                                        epilog='Example: checkm gc_bias_plot ./bins ./plots example.bam')
    plot_gc_bias_parser.add_argument('bam_file', help="BAM file to interrogate for coverage information")
    plot_gc_bias_parser.add_argument('-w', '--window_size', help="window size used to calculate plot statistics", type=int, default=5000)
    plot_gc_bias_parser.add_argument('-r', '--all_reads', action='store_true', help="use all reads to estimate coverage instead of just those in proper pairs")
    plot_gc_bias_parser.add_argument('-a', '--min_align', help='minimum alignment length as percentage of read length', type=float, default=0.98)
    plot_gc_bias_parser.add_argument('-e', '--max_edit_dist', help='maximum edit distance as percentage of read length', type=float, default=0.02)
    plot_gc_bias_parser.add_argument('-t', '--threads', type=int, default=1, help="number of threads")
    plot_gc_bias_parser.add_argument('-q', '--quiet', dest='bQuiet', action="store_true", default=False, help="suppress console output")

    # PCA plot of coverage profiles
    plot_cov_pca_parser = subparsers.add_parser('cov_pca',
                                        formatter_class=CustomHelpFormatter,
                                        parents=[plot_parser],
                                        description='PCA plot of coverage profiles.',
                                        epilog='Example: checkm cov_pca ./bins ./plots coverate.tsv')
    plot_cov_pca_parser.add_argument('coverage_file', help="file indicating coverage of each sequence (see coverage command)")
    plot_cov_pca_parser.add_argument('--width', type=float, default=6.5, help='width of output image')
    plot_cov_pca_parser.add_argument('--height', type=float, default=6.5, help='height of output image')
    plot_cov_pca_parser.add_argument('-q', '--quiet', dest='bQuiet', action="store_true", default=False, help="suppress console output")

    # Nx-plot
    plot_nx_parser = subparsers.add_parser('nx_plot',
                                        formatter_class=CustomHelpFormatter,
                                        parents=[plot_single_parser],
                                        description='Create Nx-plots.',
                                        epilog='Example: checkm nx_plot ./bins ./plots')

    plot_nx_parser.add_argument('-s', '--step_size', help="x step size for calculating Nx", type=float, default=0.05)
    plot_nx_parser.add_argument('-q', '--quiet', dest='bQuiet', action="store_true", default=False, help="suppress console output")

    # Cumulative sequence length plot
    plot_len_parser = subparsers.add_parser('len_plot',
                                        formatter_class=CustomHelpFormatter,
                                        parents=[plot_single_parser],
                                        description='Cumulative sequence length plot.',
                                        epilog='Example: checkm len_plot ./bins ./plots')

    plot_len_parser.add_argument('-q', '--quiet', dest='bQuiet', action="store_true", default=False, help="suppress console output")

    # Sequence length distribution plot
    hist_len_parser = subparsers.add_parser('len_hist',
                                        formatter_class=CustomHelpFormatter,
                                            parents=[plot_single_parser],
                                            description='Sequence length histogram.',
                                        epilog='Example: checkm len_hist ./bins ./plots')

    hist_len_parser.add_argument('-q', '--quiet', dest='bQuiet', action="store_true", default=False, help="suppress console output")

    # Marker position plot
    marker_plot_parser = subparsers.add_parser('marker_plot',
                                        formatter_class=CustomHelpFormatter,
                                        parents=[plot_need_qa_results_parser, plot_single_parser],
                                        description='Plot position of marker genes on sequences.',
                                        epilog='Example: checkm marker_plot ./output ./bins ./plots')

    marker_plot_parser.add_argument('--fig_padding', type=float, default=0.2, help='white space to place around figure (in inches)')
    marker_plot_parser.add_argument('-q', '--quiet', dest='bQuiet', action="store_true", default=False, help="suppress console output")

    # Parallel coordinate plot
    parallel_coord_plot_parser = subparsers.add_parser('par_plot',
                                        formatter_class=CustomHelpFormatter,
                                        parents=[plot_need_qa_results_parser, plot_single_parser],
                                        description='Parallel coordinate plot of GC and coverage.',
                                        epilog='Example: checkm par_plot ./output ./bins ./plots coverage.tsv')
    parallel_coord_plot_parser.add_argument('coverage_file', help="file indicating coverage of each sequence (see coverage command)")
    parallel_coord_plot_parser.add_argument('-q', '--quiet', dest='bQuiet', action="store_true", default=False, help="suppress console output")

    # Bin QA plot
    bin_qa_plot_parser = subparsers.add_parser('bin_qa_plot',
                                        formatter_class=CustomHelpFormatter,
                                        parents=[plot_need_qa_results_parser, plot_rows_parser],
                                        description='Bar plot of bin completeness, contamination, and strain heterogeneity.',
                                        epilog='Example: checkm bin_qa_plot ./output ./bins ./plots')
    bin_qa_plot_parser.add_argument('--ignore_hetero', dest='bIgnoreHetero', action="store_true", help="do not plot strain heterogeneity")
    bin_qa_plot_parser.add_argument('--aai_strain', type=float, default=0.9, help="AAI threshold used to identify strain heterogeneity")
    bin_qa_plot_parser.add_argument('-q', '--quiet', dest='bQuiet', action="store_true", default=False, help="suppress console output")

    # Identify unbinned sequences
    unbinned_parser = subparsers.add_parser('unbinned',
                                            formatter_class=CustomHelpFormatter,
                                            description='Identify unbinned sequences.',
                                            epilog='Example: checkm unbinned ./bins seqs.fna unbinned.fna unbinned_stats.tsv')
    unbinned_parser.add_argument('bin_dir', help="directory containing bins (fasta format)")
    unbinned_parser.add_argument('seq_file', help="sequences used to generate bins (fasta format)")
    unbinned_parser.add_argument('output_seq_file', help="write unbinned sequences to file")
    unbinned_parser.add_argument('output_stats_file', help="write unbinned sequence statistics to file")
    unbinned_parser.add_argument('-x', '--extension', default='fna', help="extension of bins (other files in directory are ignored)")
    unbinned_parser.add_argument('-s', '--min_seq_len', type=int, default=0, help="required length of sequence")
    unbinned_parser.add_argument('-q', '--quiet', dest='bQuiet', action="store_true", default=False, help="suppress console output")

    # Calculate coverage
    coverage_parser = subparsers.add_parser('coverage',
                                            formatter_class=CustomHelpFormatter,
                                            description='Calculate coverage of sequences.',
                                            epilog='Example: checkm coverage ./bins coverage.tsv example_1.bam example_2.bam')

    coverage_parser.add_argument('bin_dir', help="directory containing bins (fasta format)")
    coverage_parser.add_argument('output_file', help="print results to file")
    coverage_parser.add_argument('bam_files', nargs='+', help="BAM files to parse")
    coverage_parser.add_argument('-x', '--extension', default='fna', help="extension of bins (other files in directory are ignored)")
    coverage_parser.add_argument('-r', '--all_reads', action='store_true', help="use all reads to estimate coverage instead of just those in proper pairs")
    coverage_parser.add_argument('-a', '--min_align', help='minimum alignment length as percentage of read length', type=float, default=0.98)
    coverage_parser.add_argument('-e', '--max_edit_dist', help='maximum edit distance as percentage of read length', type=float, default=0.02)
    coverage_parser.add_argument('-m', '--min_qc', help='minimum quality score (in phred)', type=int, default=15)
    coverage_parser.add_argument('-t', '--threads', type=int, default=1, help="number of threads")
    coverage_parser.add_argument('-q', '--quiet', dest='bQuiet', action="store_true", default=False, help="suppress console output")

    # Calculate tetranucleotide signatures
    tetra_parser = subparsers.add_parser('tetra',
                                            formatter_class=CustomHelpFormatter,
                                            description='Calculate tetranucleotide signature of sequences.',
                                            epilog='Example: checkm tetra seqs.fna tetra.tsv')

    tetra_parser.add_argument('seq_file', help="sequences used to generate bins (fasta format)")
    tetra_parser.add_argument('output_file', help="print results to file")
    tetra_parser.add_argument('-t', '--threads', type=int, default=1, help="number of threads")
    tetra_parser.add_argument('-q', '--quiet', dest='bQuiet', action="store_true", default=False, help="suppress console output")

    # Calculate community profile
    profile_parser = subparsers.add_parser('profile',
                                            formatter_class=CustomHelpFormatter,
                                            description='Calculate percentage of reads mapped to each bin.',
                                            epilog='Example: checkm profile coverage.tsv')
    profile_parser.add_argument('coverage_file', help="file indicating coverage of each sequence (see coverage command)")
    profile_parser.add_argument('-f', '--file', default='stdout', help="print results to file")
    profile_parser.add_argument('--tab_table', dest='bTabTable', action="store_true", default=False, help="print tab-separated values table")
    profile_parser.add_argument('-q', '--quiet', dest='bQuiet', action="store_true", default=False, help="suppress console output")

    # Join tab-separated values file
    join_parser = subparsers.add_parser('join_tables',
                                            formatter_class=CustomHelpFormatter,
                                            description='Join tab-separated value tables containing bin information.',
                                            epilog='Example: checkm join_tables table1.tsv table2.tsv')
    join_parser.add_argument('tables', nargs='+', help="tab-separated table files with bin ids as their primary key")
    join_parser.add_argument('-f', '--file', default='stdout', help="print results to file")
    join_parser.add_argument('-q', '--quiet', dest='bQuiet', action="store_true", default=False, help="suppress console output")

    # Find SSU rRNAs in sequences
    ssu_finder_parser = subparsers.add_parser('ssu_finder',
                                              formatter_class=CustomHelpFormatter,
                                              description='Identify SSU (16S/18S) rRNAs in sequences.',
                                              epilog='Example: checkm ssu_finder seqs.fna ./bins ./ssu_finder')
    ssu_finder_parser.add_argument('seq_file', help="sequences used to generate bins (fasta format)")
    ssu_finder_parser.add_argument('bin_dir', help="directory containing bins (fasta format)")
    ssu_finder_parser.add_argument('output_dir', help="directory to write output files")

    ssu_finder_parser.add_argument('-x', '--extension', default='fna', help="extension of bins (other files in directory are ignored)")
    ssu_finder_parser.add_argument('-e', '--evalue', help='e-value threshold for identifying hits', type=float, default=1e-5)
    ssu_finder_parser.add_argument('-c', '--concatenate', help='concatenate hits that are within the specified number of base pairs', type=int, default=200)
    ssu_finder_parser.add_argument('-t', '--threads', help='number of threads', type=int, default=1)
    ssu_finder_parser.add_argument('-q', '--quiet', dest='bQuiet', action="store_true", default=False, help="suppress console output")

    # Compare two sets of bins (e.g., from alternative binning methods)
    bin_compare_parser = subparsers.add_parser('bin_compare',
                                               formatter_class=CustomHelpFormatter,
                                               description='Compare two sets of bins.',
                                               epilog='Example: checkm bin_compare seqs.fna ./bins1 ./bins2 bin_comparison.tsv')
    bin_compare_parser.add_argument('seq_file', help="sequences used to generate bins (fasta format)")
    bin_compare_parser.add_argument('bin_dir1', help="directory containing bins (fasta format)")
    bin_compare_parser.add_argument('bin_dir2', help="directory containing bins (fasta format)")
    bin_compare_parser.add_argument('output_file', help="output file showing overlap between bins")

    bin_compare_parser.add_argument('-x', '--extension1', default='fna', help="extension of bins in directory 1")
    bin_compare_parser.add_argument('-y', '--extension2', default='fna', help="extension of bins in directory 2")
    bin_compare_parser.add_argument('-q', '--quiet', dest='bQuiet', action="store_true", default=False, help="suppress console output")

    # Remove redundant bins from several binning attempts
    bin_union_parser = subparsers.add_parser('bin_union',
                                               formatter_class=argparse.RawDescriptionHelpFormatter,
                                               description='Create a non-redundant set of bins from multiple sets of bins',
                                               epilog='''Example: checkm bin_union_output bins1/ checkm_qa_tab_table1.tsv bins2/ checkm_qa_tab_table2.tsv

The checkm_qa_tab_table.tsv files are generated through this process for each bin.
checkm qa --tab_table ./output/lineage.ms ./output >checkm_qa_tab_table.tsv

Also note that sequences can be assigned to multiple resulting bins.
i.e. checkm unique will now fail''')
    bin_union_parser.add_argument('output_dir', help="directory for outputting")
    bin_union_parser.add_argument('bin_or_checkm_qa_table', nargs='+', help="bin directories and checkm_qa_table_tables (must have at least one of each)")
    bin_union_parser.add_argument('-x', '--extension', default='fna', help="extension of bins in bin directories")
    bin_union_parser.add_argument('--min_completeness', help="ignore bins with less completeness than this, as a percentage e.g. '70'", type=float, default=70)
    bin_union_parser.add_argument('--max_contamination', help="ignore bins with more contamination than this, as a percentage e.g. '10'", type=float, default=10)
    bin_union_parser.add_argument('-q', '--quiet', dest='bQuiet', action="store_true", default=False, help="suppress console output")

    # Identify bins with complementary marker sets
    merge_parser = subparsers.add_parser('merge',
                                            formatter_class=CustomHelpFormatter,
                                            description='Identify bins with complementary sets of marker genes.',
                                            epilog='Example: checkm merge bacteria.ms ./bins ./output')
    merge_parser.add_argument('marker_file', help="marker file to use for assessing potential bin mergers (marker set or HMM file)")
    merge_parser.add_argument('bin_dir', help="directory containing bins (fasta format)")
    merge_parser.add_argument('output_dir', help="directory to write output files")
    merge_parser.add_argument('-g', '--genes', dest='bCalledGenes', action="store_true", default=False, help="bins contain genes as amino acids instead of nucleotide contigs")
    merge_parser.add_argument('--delta_comp', help="minimum increase in completeness to report pair", type=float, default=5.0)
    merge_parser.add_argument('--delta_cont', help="maximum increase in contamination to report pair", type=float, default=10.0)
    merge_parser.add_argument('--merged_comp', help="minimum merged completeness to report pair", type=float, default=50.0)
    merge_parser.add_argument('--merged_cont', help="maximum merged contamination to report pair", type=float, default=20.0)
    merge_parser.add_argument('-x', '--extension', default='fna', help="extension of bins (other files in directory are ignored)")
    merge_parser.add_argument('-t', '--threads', type=int, default=1, help="number of threads")
    merge_parser.add_argument('-q', '--quiet', dest='bQuiet', action="store_true", default=False, help="suppress console output")

    # Identify outlier sequences
    outlier_parser = subparsers.add_parser('outliers',
                                            formatter_class=CustomHelpFormatter,
                                            parents=[plot_need_qa_results_parser],
                                            description='Identify outliers in bins relative to reference distributions.',
                                            epilog='Example: checkm outliers ./output ./bins tetra.tsv outliers.tsv')
    outlier_parser.add_argument('bin_dir', help="directory containing bins (fasta format)")
    outlier_parser.add_argument('tetra_profile', help='tetranucleotide profiles for each sequence (see tetra command)')
    outlier_parser.add_argument('output_file', help="print results to file")
    outlier_parser.add_argument('-d', '--distributions', help='reference distribution used to identify outliers; integer between 0 and 100', type=int, choices=xrange(0, 101), default=95, metavar='dist_value')
    outlier_parser.add_argument('-r', '--report_type', help="report sequences that are outliers in 'all' or 'any' reference distribution", choices=['any', 'all'], default='any')
    outlier_parser.add_argument('-x', '--extension', default='fna', help="extension of bins (other files in directory are ignored)")
    outlier_parser.add_argument('-q', '--quiet', dest='bQuiet', action="store_true", default=False, help="suppress console output")

    # Modify a bin
    modify_parser = subparsers.add_parser('modify',
                                            formatter_class=CustomHelpFormatter,
                                            description='Modify sequences in a bin.',
                                            epilog='Example: checkm modify -r seq_id1 -r seq_id2 seqs.fna bin.fna new_bin.fna')
    modify_parser.add_argument('seq_file', help="sequences used to generate bins (fasta format)")
    modify_parser.add_argument('bin_file', help="bin to be modified")
    modify_parser.add_argument('output_file', help="modified bin")
    modify_parser.add_argument('-a', '--add', action='append', help="ID of sequence to add to bin (may specify multiple times)")
    modify_parser.add_argument('-r', '--remove', action='append', help="ID of sequence to remove from bin (may specify multiple times)")
    modify_parser.add_argument('-o', '--outlier_file', help="remove all sequences marked as outliers in the bin (see outlier command)")
    modify_parser.add_argument('-q', '--quiet', dest='bQuiet', action="store_true", default=False, help="suppress console output")

    # Ensure uniqueness of bins
    unique_parser = subparsers.add_parser('unique',
                                            formatter_class=CustomHelpFormatter,
                                            description='Ensure no sequences are assigned to multiple bins.',
                                            epilog='Example: checkm unique ./bins')
    unique_parser.add_argument('bin_dir', help="directory containing bins (fasta format)")
    unique_parser.add_argument('-x', '--extension', default='fna', help="extension of bins (all other files in bin directory are ignored)")

    # Quick test of CheckM
    test_parser = subparsers.add_parser('test',
                                            formatter_class=CustomHelpFormatter,
                                            description='Test CheckM on E. coli genome.',
                                            epilog='Example: checkm test ~/checkm_test')
    test_parser.add_argument('output_dir', help="output directory for test data")
    test_parser.add_argument('--tmpdir', action=ChangeTempAction, help="specify an alternative directory for temporary files")

    # debug and development
    if False:
        debug_parser = subparsers.add_parser('debug',
                                            formatter_class=CustomHelpFormatter,
                                            description='Rogue mode for use in testing new features.')
        debug_parser.add_argument('data', help="some data")

    # get and check options
    args = None
    if(len(sys.argv) == 1 or sys.argv[1] == '-h' or sys.argv == '--help'):
        printHelp()
        sys.exit(0)
    else:
        args = parser.parse_args()

    # setup logging
    silent = False
    if hasattr(args, 'bQuiet') and args.bQuiet:
        silent = True

    try:
        logger_setup(args.output_dir, "checkm.log", "CheckM", version(), silent)
    except:
        logger_setup(None, "checkm.log", "CheckM", version(), silent)

    # do what we came here to do
    try:
        checkmParser = main.OptionsParser()
        if(False):
            # import pstats
            # p = pstats.Stats('prof')
            # p.sort_stats('cumulative').print_stats(10)
            # p.sort_stats('time').print_stats(10)
            import cProfile
            cProfile.run('checkmParser.parseOptions(args)', 'prof')
        elif False:
            import pdb
            pdb.run(checkmParser.parseOptions(args))
        else:
            checkmParser.parseOptions(args)
    except SystemExit:
        print "\n  Controlled exit resulting from an unrecoverable error or warning."
    except:
        print "\nUnexpected error:", sys.exc_info()[0]
        raise