1###############################################################################
2#
3# defaultValues.py - store default values used in many places in CheckM
4#
5###############################################################################
6#                                                                             #
7#    This program is free software: you can redistribute it and/or modify     #
8#    it under the terms of the GNU General Public License as published by     #
9#    the Free Software Foundation, either version 3 of the License, or        #
10#    (at your option) any later version.                                      #
11#                                                                             #
12#    This program is distributed in the hope that it will be useful,          #
13#    but WITHOUT ANY WARRANTY; without even the implied warranty of           #
14#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            #
15#    GNU General Public License for more details.                             #
16#                                                                             #
17#    You should have received a copy of the GNU General Public License        #
18#    along with this program. If not, see <http://www.gnu.org/licenses/>.     #
19#                                                                             #
20###############################################################################
21
22import os
23from checkm.checkmData import DBManager
24
25
class DefaultValues:
    """Shared namespace of CheckM defaults.

    Holds numeric constants, marker-file header strings, paths inside the
    CheckM data directory and the bare file names used for outputs.

    The data root is read from the DBManager configuration while this class
    body executes, so every path below is resolved once, when the class is
    defined.
    """

    # Name-mangled handle on the data manager; only its config is read here.
    __DBM = DBManager()

    # Markers regarded as unreliable: they are frequently ubiquitous,
    # single-copy genes, yet hard to annotate correctly with the PFAM and
    # TIGRFAM models.
    MARKERS_TO_EXCLUDE = {"TIGR00398", "TIGR00399"}

    # Numeric defaults. NOTE(review): presumably an e-value cutoff and
    # length fractions applied to HMM hits - confirm against callers.
    E_VAL = 1e-10
    LENGTH = 0.7
    PSEUDOGENE_LENGTH = 0.3

    # Header strings for the two marker-file flavours.
    TAXON_MARKER_FILE_HEADER = "# [Taxon Marker File]"
    LINEAGE_MARKER_FILE_HEADER = "# [Lineage Marker File]"

    # Separator token. NOTE(review): presumably joins sequence identifiers -
    # confirm against callers.
    SEQ_CONCAT_CHAR = "&&"

    # ------------------------------------------------------------------
    # Locations inside the CheckM data directory.
    # ------------------------------------------------------------------
    CHECKM_DATA_DIR = __DBM.config.values["dataRoot"]

    # HMM model files; 'phyloHMMs' is a second name bound to the same path.
    PHYLO_HMM_MODELS = os.path.join(
        CHECKM_DATA_DIR,
        "hmms",
        "phylo.hmm",
    )
    phyloHMMs = PHYLO_HMM_MODELS
    HMM_MODELS = os.path.join(
        CHECKM_DATA_DIR,
        "hmms",
        "checkm.hmm",
    )
    PFAM_CLAN_FILE = os.path.join(
        CHECKM_DATA_DIR,
        "pfam",
        "Pfam-A.hmm.dat",
    )

    IMG_METADATA_FILE = os.path.join(
        CHECKM_DATA_DIR,
        "img",
        "img_metadata.tsv",
    )
    REDUNDANT_TIGRFAM_FILE = os.path.join(
        CHECKM_DATA_DIR,
        "pfam",
        "tigrfam2pfam.tsv",
    )

    # Marker-set tables stored directly under the data root.
    SELECTED_MARKER_SETS = os.path.join(
        CHECKM_DATA_DIR,
        "selected_marker_sets.tsv",
    )
    TAXON_MARKER_SETS = os.path.join(
        CHECKM_DATA_DIR,
        "taxon_marker_sets.tsv",
    )

    # Genome-tree directory and the two reference packages inside it.
    GENOME_TREE_DIR = os.path.join(CHECKM_DATA_DIR, "genome_tree")
    PPLACER_REF_PACKAGE_FULL = os.path.join(
        GENOME_TREE_DIR,
        "genome_tree_full.refpkg",
    )
    PPLACER_REF_PACKAGE_REDUCED = os.path.join(
        GENOME_TREE_DIR,
        "genome_tree_reduced.refpkg",
    )

    # Genome-tree file names; these are bare names with no directory part.
    GENOME_TREE = "genome_tree.tre"
    GENOME_TREE_FASTA = "genome_tree.fasta"
    GENOME_TREE_DEREP = "genome_tree.derep.txt"
    GENOME_TREE_TAXONOMY = "genome_tree.taxonomy.tsv"
    GENOME_TREE_METADATA = "genome_tree.metadata.tsv"
    GENOME_TREE_MISSING_DUPLICATE = "missing_duplicate_genes_50.tsv"

    DISTRIBUTION_DIR = os.path.join(CHECKM_DATA_DIR, "distributions")

    # ------------------------------------------------------------------
    # Bare file names (no directory component).
    # ------------------------------------------------------------------

    # Gzipped pickle names for HMM model information.
    PHYLO_HMM_MODEL_INFO = "phylo_hmm_info.pkl.gz"
    CHECKM_HMM_MODEL_INFO = "checkm_hmm_info.pkl.gz"

    # HMMER result names: the '.tree' pair and the '.analyze' pair.
    HMMER_TABLE_PHYLO_OUT = "hmmer.tree.txt"
    HMMER_PHYLO_OUT = "hmmer.tree.ali.txt"

    HMMER_TABLE_OUT = "hmmer.analyze.txt"
    HMMER_OUT = "hmmer.analyze.ali.txt"

    # Prodigal gene files: amino acid, nucleotide and GFF.
    PRODIGAL_AA = "genes.faa"
    PRODIGAL_NT = "genes.fna"
    PRODIGAL_GFF = "genes.gff"

    # pplacer file names.
    PPLACER_CONCAT_SEQ_OUT = "concatenated.fasta"
    PPLACER_JSON_OUT = "concatenated.pplacer.json"
    PPLACER_OUT = "pplacer.out"
    PPLACER_TREE_OUT = "concatenated.tre"

    # Statistics tables. The matching SEQ_STATS_* names are disabled:
    # SEQ_STATS_PHYLO_OUT = 'seq_stats.tree.tsv'
    # SEQ_STATS_OUT = 'seq_stats.analyze.tsv'
    BIN_STATS_PHYLO_OUT = "bin_stats.tree.tsv"

    BIN_STATS_OUT = "bin_stats.analyze.tsv"

    BIN_STATS_EXT_OUT = "bin_stats_ext.tsv"
    MARKER_GENE_STATS = "marker_gene_stats.tsv"

    # ------------------------------------------------------------------
    # Miscellaneous constants.
    # ------------------------------------------------------------------

    # Run of ten 'N' characters.
    CONTIG_BREAK = "NNNNNNNNNN"

    UNBINNED = "unbinned"

    # NOTE(review): presumably the minimum sequence length used when
    # computing the GC standard deviation - confirm against callers.
    MIN_SEQ_LEN_GC_STD = 1000