1###############################################################################
2#
3# pplacer.py - runs pplacer and provides functions for parsing output
4#
5###############################################################################
6#                                                                             #
7#    This program is free software: you can redistribute it and/or modify     #
8#    it under the terms of the GNU General Public License as published by     #
9#    the Free Software Foundation, either version 3 of the License, or        #
10#    (at your option) any later version.                                      #
11#                                                                             #
12#    This program is distributed in the hope that it will be useful,          #
13#    but WITHOUT ANY WARRANTY; without even the implied warranty of           #
14#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            #
15#    GNU General Public License for more details.                             #
16#                                                                             #
17#    You should have received a copy of the GNU General Public License        #
18#    along with this program. If not, see <http://www.gnu.org/licenses/>.     #
19#                                                                             #
20###############################################################################
21
22import os
23import sys
24import stat
25import shutil
26import subprocess
27import logging
28from collections import defaultdict
29
30from checkm.defaultValues import DefaultValues
31
32from checkm.common import checkDirExists
33from checkm.util.seqUtils import readFasta, writeFasta
34
35
36class PplacerRunner():
37    """Wrapper for running pplacer."""
38    def __init__(self, threads):
39        self.logger = logging.getLogger('timestamp')
40        self.numThreads = threads
41
42        # make sure pplace and guppy are on the system path
43        self.__checkForPplacer()
44        self.__checkForGuppy()
45
46    def run(self, binFiles, resultsParser, outDir, bReducedTree):
47        # make sure output and tree directories exist
48        checkDirExists(outDir)
49        alignOutputDir = os.path.join(outDir, 'storage', 'tree')
50        checkDirExists(alignOutputDir)
51
52        treeFile = os.path.join(alignOutputDir, DefaultValues.PPLACER_TREE_OUT)
53        pplacerJsonOut = os.path.join(alignOutputDir, DefaultValues.PPLACER_JSON_OUT)
54        pplacerOut = os.path.join(alignOutputDir, DefaultValues.PPLACER_OUT)
55
56        # create concatenated alignment file for each bin
57        concatenatedAlignFile = self.__createConcatenatedAlignment(binFiles, resultsParser, alignOutputDir)
58
59        pplacerRefPkg = DefaultValues.PPLACER_REF_PACKAGE_FULL
60        if bReducedTree:
61            pplacerRefPkg = DefaultValues.PPLACER_REF_PACKAGE_REDUCED
62
63        # check if concatenated alignment file is empty
64        # (this can occur when all genomes have no phylogenetically informative marker genes)
65        if os.stat(concatenatedAlignFile)[stat.ST_SIZE] == 0:
66            self.logger.info('No genomes were identified that could be placed in the reference genome tree.')
67            shutil.copyfile(os.path.join(pplacerRefPkg, DefaultValues.GENOME_TREE), treeFile)
68            return
69
70        # run pplacer to place bins in reference genome tree
71        self.logger.info('Placing %d bins into the genome tree with pplacer (be patient).' % len(binFiles))
72        cmd = 'pplacer -j %d -c %s -o %s %s > %s' % (self.numThreads,
73                                                     pplacerRefPkg,
74                                                     pplacerJsonOut,
75                                                     concatenatedAlignFile,
76                                                     pplacerOut)
77        os.system(cmd)
78
79        # extract tree
80        cmd = 'guppy tog -o %s %s' % (treeFile, pplacerJsonOut)
81        os.system(cmd)
82
83    def __createConcatenatedAlignment(self, binFiles, resultsParser, alignOutputDir):
84        """Create a concatenated alignment of marker genes for each bin."""
85
86        # read alignment files
87        self.logger.info('Reading marker alignment files.')
88        alignments = defaultdict(dict)
89        files = os.listdir(alignOutputDir)
90        binIds = set()
91        for f in files:
92            if f.endswith('.masked.faa'):
93                markerId = f[0:f.find('.masked.faa')]
94                seqs = readFasta(os.path.join(alignOutputDir, f))
95
96                for seqId, seq in seqs.iteritems():
97                    binId = seqId[0:seqId.find(DefaultValues.SEQ_CONCAT_CHAR)]
98
99                    alignments[markerId][binId] = seq
100                    binIds.add(binId)
101
102        # get all markers and their lengths
103        markerIds = resultsParser.models[resultsParser.models.keys()[0]].keys()
104        markerIdLens = {}
105        for markerId in markerIds:
106            markerIdLens[markerId] = resultsParser.models[resultsParser.models.keys()[0]][markerId].leng
107
108        # create concatenated alignment
109        self.logger.info('Concatenating alignments.')
110        concatenatedSeqs = {}
111        for markerId in sorted(markerIds):
112            seqs = alignments[markerId]
113
114            for binId in binIds:
115                if binId in seqs:
116                    # append alignment
117                    concatenatedSeqs[binId] = concatenatedSeqs.get(binId, '') + seqs[binId]
118                else:
119                    # missing gene
120                    concatenatedSeqs[binId] = concatenatedSeqs.get(binId, '') + '-' * markerIdLens[markerId]
121
122        # save concatenated alignment
123        concatenatedAlignFile = os.path.join(alignOutputDir, DefaultValues.PPLACER_CONCAT_SEQ_OUT)
124        writeFasta(concatenatedSeqs, concatenatedAlignFile)
125
126        return concatenatedAlignFile
127
128    def __checkForPplacer(self):
129        """Check to see if pplacer is on the system before we try to run it."""
130
131        # Assume that a successful pplacer -h returns 0 and anything
132        # else returns something non-zero
133        try:
134            subprocess.call(['pplacer', '-h'], stdout=open(os.devnull, 'w'), stderr=subprocess.STDOUT)
135        except:
136            self.logger.error("  [Error] Make sure pplacer is on your system path.")
137            sys.exit(1)
138
139    def __checkForGuppy(self):
140        """Check to see if guppy is on the system before we try to run it."""
141
142        # Assume that a successful pplacer -h returns 0 and anything
143        # else returns something non-zero
144        try:
145            subprocess.call(['guppy', '-h'], stdout=open(os.devnull, 'w'), stderr=subprocess.STDOUT)
146        except:
147            self.logger.error("  [Error] Make sure guppy, which is part of the pplacer package, is on your system path.")
148            sys.exit(1)
149