1############################################################################### 2# 3# pplacer.py - runs pplacer and provides functions for parsing output 4# 5############################################################################### 6# # 7# This program is free software: you can redistribute it and/or modify # 8# it under the terms of the GNU General Public License as published by # 9# the Free Software Foundation, either version 3 of the License, or # 10# (at your option) any later version. # 11# # 12# This program is distributed in the hope that it will be useful, # 13# but WITHOUT ANY WARRANTY; without even the implied warranty of # 14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # 15# GNU General Public License for more details. # 16# # 17# You should have received a copy of the GNU General Public License # 18# along with this program. If not, see <http://www.gnu.org/licenses/>. # 19# # 20############################################################################### 21 22import os 23import sys 24import stat 25import shutil 26import subprocess 27import logging 28from collections import defaultdict 29 30from checkm.defaultValues import DefaultValues 31 32from checkm.common import checkDirExists 33from checkm.util.seqUtils import readFasta, writeFasta 34 35 36class PplacerRunner(): 37 """Wrapper for running pplacer.""" 38 def __init__(self, threads): 39 self.logger = logging.getLogger('timestamp') 40 self.numThreads = threads 41 42 # make sure pplace and guppy are on the system path 43 self.__checkForPplacer() 44 self.__checkForGuppy() 45 46 def run(self, binFiles, resultsParser, outDir, bReducedTree): 47 # make sure output and tree directories exist 48 checkDirExists(outDir) 49 alignOutputDir = os.path.join(outDir, 'storage', 'tree') 50 checkDirExists(alignOutputDir) 51 52 treeFile = os.path.join(alignOutputDir, DefaultValues.PPLACER_TREE_OUT) 53 pplacerJsonOut = os.path.join(alignOutputDir, DefaultValues.PPLACER_JSON_OUT) 54 pplacerOut = os.path.join(alignOutputDir, DefaultValues.PPLACER_OUT) 55 56 # create concatenated alignment file for each bin 57 concatenatedAlignFile = self.__createConcatenatedAlignment(binFiles, resultsParser, alignOutputDir) 58 59 pplacerRefPkg = DefaultValues.PPLACER_REF_PACKAGE_FULL 60 if bReducedTree: 61 pplacerRefPkg = DefaultValues.PPLACER_REF_PACKAGE_REDUCED 62 63 # check if concatenated alignment file is empty 64 # (this can occur when all genomes have no phylogenetically informative marker genes) 65 if os.stat(concatenatedAlignFile)[stat.ST_SIZE] == 0: 66 self.logger.info('No genomes were identified that could be placed in the reference genome tree.') 67 shutil.copyfile(os.path.join(pplacerRefPkg, DefaultValues.GENOME_TREE), treeFile) 68 return 69 70 # run pplacer to place bins in reference genome tree 71 self.logger.info('Placing %d bins into the genome tree with pplacer (be patient).' % len(binFiles)) 72 cmd = 'pplacer -j %d -c %s -o %s %s > %s' % (self.numThreads, 73 pplacerRefPkg, 74 pplacerJsonOut, 75 concatenatedAlignFile, 76 pplacerOut) 77 os.system(cmd) 78 79 # extract tree 80 cmd = 'guppy tog -o %s %s' % (treeFile, pplacerJsonOut) 81 os.system(cmd) 82 83 def __createConcatenatedAlignment(self, binFiles, resultsParser, alignOutputDir): 84 """Create a concatenated alignment of marker genes for each bin.""" 85 86 # read alignment files 87 self.logger.info('Reading marker alignment files.') 88 alignments = defaultdict(dict) 89 files = os.listdir(alignOutputDir) 90 binIds = set() 91 for f in files: 92 if f.endswith('.masked.faa'): 93 markerId = f[0:f.find('.masked.faa')] 94 seqs = readFasta(os.path.join(alignOutputDir, f)) 95 96 for seqId, seq in seqs.iteritems(): 97 binId = seqId[0:seqId.find(DefaultValues.SEQ_CONCAT_CHAR)] 98 99 alignments[markerId][binId] = seq 100 binIds.add(binId) 101 102 # get all markers and their lengths 103 markerIds = resultsParser.models[resultsParser.models.keys()[0]].keys() 104 markerIdLens = {} 105 for markerId in markerIds: 106 markerIdLens[markerId] = resultsParser.models[resultsParser.models.keys()[0]][markerId].leng 107 108 # create concatenated alignment 109 self.logger.info('Concatenating alignments.') 110 concatenatedSeqs = {} 111 for markerId in sorted(markerIds): 112 seqs = alignments[markerId] 113 114 for binId in binIds: 115 if binId in seqs: 116 # append alignment 117 concatenatedSeqs[binId] = concatenatedSeqs.get(binId, '') + seqs[binId] 118 else: 119 # missing gene 120 concatenatedSeqs[binId] = concatenatedSeqs.get(binId, '') + '-' * markerIdLens[markerId] 121 122 # save concatenated alignment 123 concatenatedAlignFile = os.path.join(alignOutputDir, DefaultValues.PPLACER_CONCAT_SEQ_OUT) 124 writeFasta(concatenatedSeqs, concatenatedAlignFile) 125 126 return concatenatedAlignFile 127 128 def __checkForPplacer(self): 129 """Check to see if pplacer is on the system before we try to run it.""" 130 131 # Assume that a successful pplacer -h returns 0 and anything 132 # else returns something non-zero 133 try: 134 subprocess.call(['pplacer', '-h'], stdout=open(os.devnull, 'w'), stderr=subprocess.STDOUT) 135 except: 136 self.logger.error(" [Error] Make sure pplacer is on your system path.") 137 sys.exit(1) 138 139 def __checkForGuppy(self): 140 """Check to see if guppy is on the system before we try to run it.""" 141 142 # Assume that a successful pplacer -h returns 0 and anything 143 # else returns something non-zero 144 try: 145 subprocess.call(['guppy', '-h'], stdout=open(os.devnull, 'w'), stderr=subprocess.STDOUT) 146 except: 147 self.logger.error(" [Error] Make sure guppy, which is part of the pplacer package, is on your system path.") 148 sys.exit(1) 149