1###############################################################################
2#
3# treeParser.py - parse genome tree and associated tree metadata
4#
5###############################################################################
6#                                                                             #
7#    This program is free software: you can redistribute it and/or modify     #
8#    it under the terms of the GNU General Public License as published by     #
9#    the Free Software Foundation, either version 3 of the License, or        #
10#    (at your option) any later version.                                      #
11#                                                                             #
12#    This program is distributed in the hope that it will be useful,          #
13#    but WITHOUT ANY WARRANTY; without even the implied warranty of           #
14#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            #
15#    GNU General Public License for more details.                             #
16#                                                                             #
17#    You should have received a copy of the GNU General Public License        #
18#    along with this program. If not, see <http://www.gnu.org/licenses/>.     #
19#                                                                             #
20###############################################################################
21
22import os
23import sys
24import logging
25import json
26
27import dendropy
28
29import checkm.prettytable as prettytable
30
31from checkm.defaultValues import DefaultValues
32from checkm.markerSets import MarkerSet, BinMarkerSets
33from checkm.common import checkDirExists, reassignStdOut, restoreStdOut, getBinIdsFromOutDir
34from checkm.util.seqUtils import readFasta
35from checkm.util.taxonomyUtils import taxonomicPrefixes
36
37
38class TreeParser():
39    """Parse genome tree and associated tree metadata."""
40    def __init__(self):
41        self.logger = logging.getLogger('timestamp')
42
43    def printSummary(self, outputFormat, outDir, resultsParser, bTabTable, outFile, binStats):
44        if outputFormat == 1:
45            self.reportBinTaxonomy(outDir, resultsParser, bTabTable, outFile, binStats, bLineageStatistics=False)
46        elif outputFormat == 2:
47            self.reportBinTaxonomy(outDir, resultsParser, bTabTable, outFile, binStats, bLineageStatistics=True)
48        elif outputFormat == 3:
49            self.reportNewickTree(outDir, outFile, None)
50        elif outputFormat == 4:
51            self.reportNewickTree(outDir, outFile, 'taxonomy')
52        elif outputFormat == 5:
53            self.reportFullMSA(outDir, outFile)
54        else:
55            self.logger.error("Unknown output format: %d", outputFormat)
56
57    def reportFullMSA(self, outDir, outFile):
58        """Create MSA with all reference and bin alignments."""
59
60        # write bin alignments to file
61        oldStdOut = reassignStdOut(outFile)
62        for line in open(os.path.join(outDir, 'storage', 'tree', DefaultValues.PPLACER_CONCAT_SEQ_OUT)):
63            print(line.rstrip())
64
65        # read duplicate seqs
66        duplicateNodes = self.__readDuplicateSeqs()
67
68        # write reference alignments to file
69        seqs = readFasta(os.path.join(DefaultValues.PPLACER_REF_PACKAGE_FULL, DefaultValues.GENOME_TREE_FASTA))
70        for seqId, seq in seqs.iteritems():
71            print('>' + seqId)
72            print(seq)
73
74            if seqId in duplicateNodes:
75                for dupSeqId in duplicateNodes[seqId]:
76                    print('>' + dupSeqId)
77                    print(seq)
78
79        restoreStdOut(outFile, oldStdOut)
80
81    def readPlacementFile(self, placementFile):
82        '''Read pplacer JSON placement file.'''
83        jsonData = open(placementFile)
84
85        data = json.load(jsonData)
86        binIdToPP = {}
87        for placementData in data['placements']:
88            binId = placementData['nm'][0][0]
89
90            topPP = 0
91            for pp in placementData['p']:
92                if pp[2] > topPP:
93                    topPP = pp[2]
94
95            binIdToPP[binId] = topPP
96
97        jsonData.close()
98
99        return binIdToPP
100
101    def reportNewickTree(self, outDir, outFile, leafLabels=None):
102        # read duplicate nodes
103        duplicateSeqs = self.__readDuplicateSeqs()
104
105        # read tree
106        treeFile = os.path.join(outDir, 'storage', 'tree', DefaultValues.PPLACER_TREE_OUT)
107        tree = dendropy.Tree.get_from_path(treeFile, schema='newick', rooting="force-rooted", preserve_underscores=True)
108
109        # clean up internal node labels
110        for node in tree.internal_nodes():
111            if node.label:
112                labelSplit = node.label.split('|')
113
114                label = labelSplit[0]
115                if labelSplit[1] != '':
116                    label += '|' + labelSplit[1]
117                if labelSplit[2] != '':
118                    label += '|' + labelSplit[2]
119
120                node.label = label
121
122        # insert duplicate nodes into tree
123        for leaf in tree.leaf_nodes():
124            duplicates = duplicateSeqs.get(leaf.taxon.label, None)
125            if duplicates != None:
126                newParent = leaf.parent_node.new_child(edge_length=leaf.edge_length)
127                curLeaf = leaf.parent_node.remove_child(leaf)
128                newParent.new_child(taxon=curLeaf.taxon, edge_length=0)
129                for d in duplicates:
130                    newParent.new_child(taxon=dendropy.Taxon(label=d), edge_length=0)
131
132        # append taxonomy to leaf nodes
133        if leafLabels == 'taxonomy':
134            # read taxonomy string for each IMG genome
135            taxonomy = {}
136            for line in open(os.path.join(DefaultValues.GENOME_TREE_DIR, DefaultValues.GENOME_TREE_TAXONOMY)):
137                lineSplit = line.split('\t')
138                taxonomy[lineSplit[0]] = lineSplit[1].rstrip()
139
140            # append taxonomy to leaf labels
141            for leaf in tree.leaf_nodes():
142                taxaStr = taxonomy.get(leaf.taxon.label, None)
143                if taxaStr:
144                    leaf.taxon.label += '|' + taxaStr
145
146        # write out tree
147        oldStdOut = reassignStdOut(outFile)
148        print(tree.as_string(schema='newick', suppress_rooting=True))
149        restoreStdOut(outFile, oldStdOut)
150
151    def getInsertionBranchId(self, outDir, binIds):
152        # make sure output and tree directories exist
153        checkDirExists(outDir)
154        alignOutputDir = os.path.join(outDir, 'storage', 'tree')
155        checkDirExists(alignOutputDir)
156
157        # read genome tree (if it exists)
158        binIdToUID = {}
159        treeFile = os.path.join(alignOutputDir, DefaultValues.PPLACER_TREE_OUT)
160        tree = dendropy.Tree.get_from_path(treeFile, schema='newick', rooting="force-rooted", preserve_underscores=True)
161
162        # find first parent of each bin with a taxonomic label
163        for binId in binIds:
164            node = tree.find_node_with_taxon_label(binId)
165            if node == None:
166                binIdToUID[binId] = 'NA'
167                continue
168
169            # find first node decorated with a UID string between leaf and root
170            parentNode = node.parent_node
171            while parentNode != None:
172                if parentNode.label:
173                    uid = parentNode.label.split('|')[0]
174                    break
175
176                parentNode = parentNode.parent_node
177
178            binIdToUID[binId] = uid
179
180        return binIdToUID
181
182    def getBinTaxonomy(self, outDir, binIds):
183        # make sure output and tree directories exist
184        checkDirExists(outDir)
185        alignOutputDir = os.path.join(outDir, 'storage', 'tree')
186        checkDirExists(alignOutputDir)
187
188        # read genome tree (if it exists)
189        binIdToTaxonomy = {}
190        treeFile = os.path.join(alignOutputDir, DefaultValues.PPLACER_TREE_OUT)
191        tree = dendropy.Tree.get_from_path(treeFile, schema='newick', rooting="force-rooted", preserve_underscores=True)
192
193        # find first parent of each bin with a taxonomic label
194        for binId in binIds:
195            node = tree.find_node_with_taxon_label(binId)
196            if node == None:
197                binIdToTaxonomy[binId] = 'NA'
198                continue
199
200            # find first node decorated with a taxon string between leaf and root
201            taxaStr = None
202            parentNode = node.parent_node
203            while parentNode != None:
204                if parentNode.label:
205                    tokens = parentNode.label.split('|')
206
207                    if tokens[1] != '':
208                        if taxaStr:
209                            taxaStr = tokens[1] + ';' + taxaStr
210                        else:
211                            taxaStr = tokens[1]
212
213                parentNode = parentNode.parent_node
214
215            if not taxaStr:
216                domainNode = self.__findDomainNode(node)
217                taxaStr = domainNode.label.split('|')[1] + ' (root)'
218
219            binIdToTaxonomy[node.taxon.label] = taxaStr
220
221        return binIdToTaxonomy
222
223    def __findDomainNode(self, binNode):
224        """Find node defining the domain. Assumes 'binNode' is the leaf node of a bin on either the archaeal or bacterial branch."""
225
226        # bin is either on the bacterial or archaeal branch descendant from the root,
227        # so descend tree to first internal node with a label. Note that there may
228        # be multiple bins inserted in the tree creating unlabeled internal nodes.
229        # Identical population bins can also cause internal bifurcations.
230
231        # find first parent node which contains at least one IMG genome
232        curNode = binNode.parent_node
233        while True:
234            found_ref_genome = False
235            for leaf in curNode.leaf_nodes():
236                if leaf.taxon.label.startswith('IMG_'):
237                    found_ref_genome = True
238                    break
239
240            if found_ref_genome:
241                break
242
243            curNode = curNode.parent_node
244
245        # perform depth first search to find a labeled internal node
246        queue = [curNode]
247        while queue:
248            curNode = queue.pop(0)
249
250            if curNode.label:
251                return curNode
252
253            for child in curNode.child_nodes():
254                if child.is_internal():
255                    queue.append(child)
256
257        self.logger.error('  [Error] Failed to associate bin with a domain. Please report this bug.')
258        sys.exit(1)
259
260    def getBinSisterTaxonomy(self, outDir, binIds):
261        # make sure output and tree directories exist
262        checkDirExists(outDir)
263        alignOutputDir = os.path.join(outDir, 'storage', 'tree')
264        checkDirExists(alignOutputDir)
265
266        # read genome tree
267        treeFile = os.path.join(alignOutputDir, DefaultValues.PPLACER_TREE_OUT)
268        tree = dendropy.Tree.get_from_path(treeFile, schema='newick', rooting="force-rooted", preserve_underscores=True)
269
270        # read taxonomy string for each IMG genome
271        leafIdToTaxonomy = {}
272        for line in open(os.path.join(DefaultValues.GENOME_TREE_DIR, DefaultValues.GENOME_TREE_TAXONOMY)):
273            lineSplit = line.split('\t')
274            leafIdToTaxonomy[lineSplit[0]] = lineSplit[1].rstrip()
275
276        # find LCA of all labeled node in sister lineage
277        binIdToSisterTaxonomy = {}
278        for binId in binIds:
279            node = tree.find_node_with_taxon_label(binId)
280
281            taxaStr = ''
282            if node != None:
283                # get taxonomic labels of all internal nodes in sister lineages
284                sisterNodes = node.sister_nodes()
285                internalTaxonomyLabels = set()
286                leafTaxonomyLabels = set()
287                for sn in sisterNodes:
288                    for curNode in sn.postorder_iter():
289                        if curNode.is_leaf():
290                            if curNode.taxon.label:
291                                taxonomy = leafIdToTaxonomy.get(curNode.taxon.label, None)
292                                if taxonomy != None:  # inserted bins will not have an assigned taxonomy
293                                    for taxa in taxonomy.split(';'):
294                                        leafTaxonomyLabels.add(taxa.strip())
295                        else:
296                            if curNode.label:
297                                tokens = curNode.label.split('|')
298                                if tokens[1] != '':
299                                    for taxa in tokens[1].split(';'):
300                                        internalTaxonomyLabels.add(taxa)
301
302                # find LCA of taxonomic labels in rank order;
303                # only consider leaf node labels if there were no internal labels
304                labels = internalTaxonomyLabels
305                if len(labels) == 0:
306                    labels = leafTaxonomyLabels
307
308                for prefix in taxonomicPrefixes:
309                    taxa = []
310                    for taxon in labels:
311                        if prefix in taxon:
312                            taxa.append(taxon)
313
314                    if len(taxa) == 1:
315                        # unambiguous label at this rank
316                        taxaStr += taxa[0] + ';'
317                    elif len(taxa) > 1:
318                        # unable to resolve taxonomy at this rank
319                        break
320
321            if not taxaStr:
322                taxaStr = 'unresolved'
323            binIdToSisterTaxonomy[binId] = taxaStr
324
325        return binIdToSisterTaxonomy
326
327    def __getNextNamedNode(self, node, uniqueIdToLineageStatistics):
328        """Get first parent node with taxonomy information."""
329        parentNode = node.parent_node
330        while True:
331            if parentNode == None:
332                break  # reached the root node so terminate
333
334            if parentNode.label:
335                trustedUniqueId = parentNode.label.split('|')[0]
336                trustedStats = uniqueIdToLineageStatistics[trustedUniqueId]
337                if trustedStats['taxonomy'] != '':
338                    return trustedStats['taxonomy']
339
340            parentNode = parentNode.parent_node
341
342        return 'root'
343
344    def __getMarkerSet(self, parentNode, tree, uniqueIdToLineageStatistics,
345                                    numGenomesMarkers, bootstrap,
346                                    bForceDomain, bRequireTaxonomy):
347        """Get marker set for next parent node meeting selection criteria."""
348
349        # ascend tree to root finding first node meeting all selection criteria
350        selectedParentNode = parentNode
351        taxonomyStr = 'root'
352        while True:
353            if selectedParentNode.label:  # nodes inserted by PPLACER will not have a label
354                trustedUniqueId = selectedParentNode.label.split('|')[0]
355                nodeTaxonomy = selectedParentNode.label.split('|')[1]
356
357                stats = uniqueIdToLineageStatistics[trustedUniqueId]
358                if stats['# genomes'] >= numGenomesMarkers and stats['bootstrap'] >= bootstrap:
359                    if not bForceDomain or nodeTaxonomy in ['k__Bacteria', 'k__Archaea']:
360                        if not bRequireTaxonomy or stats['taxonomy'] != '':
361                            # get closest taxonomic label
362                            taxonomyStr = stats['taxonomy']
363                            if not bRequireTaxonomy and stats['taxonomy'] == '':
364                                taxonomyStr = self.__getNextNamedNode(selectedParentNode, uniqueIdToLineageStatistics)
365
366                            # all criteria meet, so use marker set from this node
367                            break
368
369            if selectedParentNode.parent_node == None:
370                break  # reached the root node so terminate
371
372            selectedParentNode = selectedParentNode.parent_node
373
374        # get marker set meeting all criteria required for a trusted marker set
375        taxonomyStr = taxonomyStr.split(';')[-1]  # extract most specific taxonomy identifier
376        markerSet = MarkerSet(trustedUniqueId, taxonomyStr, int(stats['# genomes']), eval(stats['marker set']))
377
378        return selectedParentNode, markerSet
379
380    def __refineMarkerSet(self, markerSet, binNode, tree, uniqueIdToLineageStatistics, numGenomesRefine):
381        """Refine marker set to account for lineage-specific gene loss and duplication."""
382
383        # lineage-specific refine is done with the sister lineage to where the bin is inserted
384
385        # get lineage-specific marker set which will be used to refine the above marker set
386        curNode = binNode.sister_nodes()[0]
387        while True:
388            if curNode.label:  # nodes inserted by PPLACER will not have a label
389                uniqueId = curNode.label.split('|')[0]
390                stats = uniqueIdToLineageStatistics[uniqueId]
391
392                if stats['# genomes'] >= numGenomesRefine:
393                    break
394
395            curNode = curNode.parent_node
396            if curNode == None:
397                break  # reached the root node so terminate
398
399        # get lineage-specific marker set
400        lineageMarkerSet = eval(stats['marker set'])
401
402        # refine marker set by finding the intersection between these two sets,
403        # this removes markers that are not single-copy or ubiquitous in the
404        # specific lineage of a bin
405        # Note: co-localization information is taken from the trusted set
406
407        # get all lineage-specific marker genes
408        allLineageSpecificGenes = set()
409        for m in lineageMarkerSet:
410            for gene in m:
411                allLineageSpecificGenes.add(gene)
412
413        # remove genes not present in the lineage-specific gene set
414        finalMarkerSet = []
415        for ms in markerSet.markerSet:
416            s = set()
417            for gene in ms:
418                if gene in allLineageSpecificGenes:
419                    s.add(gene)
420
421            if s:
422                finalMarkerSet.append(s)
423
424        refinedMarkerSet = MarkerSet(markerSet.UID, markerSet.lineageStr, markerSet.numGenomes, finalMarkerSet)
425
426        return refinedMarkerSet
427
428    def __readLineageSpecificGenesToRemove(self):
429        """Get set of genes subject to lineage-specific gene loss and duplication."""
430
431        self.lineageSpecificGenesToRemove = {}
432        for line in open(os.path.join(DefaultValues.GENOME_TREE_DIR, DefaultValues.GENOME_TREE_MISSING_DUPLICATE)):
433            lineSplit = line.split('\t')
434            uid = lineSplit[0]
435            missingGenes = eval(lineSplit[1])
436            duplicateGenes = eval(lineSplit[2])
437            self.lineageSpecificGenesToRemove[uid] = missingGenes.union(duplicateGenes)
438
439    def __removeInvalidLineageMarkerGenes(self, markerSet, lineageSpecificMarkersToRemove):
440        """Refine marker set to account for lineage-specific gene loss and duplication."""
441
442        # refine marker set by removing marker genes subject to lineage-specific
443        # gene loss and duplication
444        #
445        # Note: co-localization information is taken from the trusted set
446
447        finalMarkerSet = []
448        for ms in markerSet.markerSet:
449            s = set()
450            for gene in ms:
451                geneIdToTest = gene
452                if geneIdToTest.startswith('PF'):
453                    geneIdToTest = gene.replace('PF', 'pfam')
454                    geneIdToTest = geneIdToTest[0:geneIdToTest.rfind('.')]
455
456                if geneIdToTest not in lineageSpecificMarkersToRemove:
457                    s.add(gene)
458
459            if s:
460                finalMarkerSet.append(s)
461
462        refinedMarkerSet = MarkerSet(markerSet.UID, markerSet.lineageStr, markerSet.numGenomes, finalMarkerSet)
463
464        return refinedMarkerSet
465
466    def getBinMarkerSets(self, outDir, markerFile,
467                                    numGenomesMarkers,
468                                    bootstrap, bNoLineageSpecificRefinement,
469                                    bForceDomain, bRequireTaxonomy,
470                                    resultsParser, minUnique, maxMulti):
471        """Determine marker sets for each bin."""
472
473        self.logger.info('Determining marker sets for each genome bin.')
474
475        # get all bin ids
476        binIds = getBinIdsFromOutDir(outDir)
477
478        # get statistics for internal nodes
479        uniqueIdToLineageStatistics = self.readNodeMetadata()
480
481        # determine marker set for each bin
482        treeFile = os.path.join(outDir, 'storage', 'tree', DefaultValues.PPLACER_TREE_OUT)
483        tree = dendropy.Tree.get_from_path(treeFile, schema='newick', rooting="force-rooted", preserve_underscores=True)
484        rootNode = tree.find_node(filter_fn=lambda n: n.parent_node == None)
485
486        fout = open(markerFile, 'w')
487        fout.write(DefaultValues.LINEAGE_MARKER_FILE_HEADER + '\n')
488
489        numProcessedBins = 0
490        statusStr = ''
491        for binId in binIds:
492            if self.logger.getEffectiveLevel() <= logging.INFO:
493                numProcessedBins += 1
494                sys.stderr.write(' ' * len(statusStr) + '\r')  # clear previous line
495                statusStr = '    Finished processing %d of %d (%.2f%%) bins (current: %s).' % (numProcessedBins, len(binIds), float(numProcessedBins) * 100 / len(binIds), binId)
496                sys.stderr.write('%s\r' % statusStr)
497                sys.stderr.flush()
498
499            node = tree.find_node_with_taxon_label(binId)
500            binMarkerSets = BinMarkerSets(binId, BinMarkerSets.TREE_MARKER_SET)
501            if node == None:
502                # bin is not in tree
503                node, markerSet = self.__getMarkerSet(rootNode, tree, uniqueIdToLineageStatistics,
504                                                        numGenomesMarkers, bootstrap,
505                                                        bForceDomain, bRequireTaxonomy)
506                binMarkerSets.addMarkerSet(markerSet)
507            else:
508                # special case: if node is on the bacterial or archaeal branch descendant from the root,
509                # then move down the tree to include the domain-specific marker set
510                parentNode = node.parent_node
511                while parentNode != None:
512                    if parentNode.label:
513                        bRoot = (parentNode.parent_node == None)
514                        break
515
516                    parentNode = parentNode.parent_node
517
518                if bRoot:
519                    # since the root is the first labeled node, we need to descend the
520                    # tree to incorporate the domain-specific marker set
521                    domainNode = self.__findDomainNode(node)
522                    curNode = domainNode.child_nodes()[0]
523                else:
524                    curNode = node
525
526                # get lineage specific refinement for first node with an id
527                if not bNoLineageSpecificRefinement:
528                    uniqueId = parentNode.label.split('|')[0]
529                    self.__readLineageSpecificGenesToRemove()
530                    lineageSpecificRefinement = self.lineageSpecificGenesToRemove[uniqueId]
531
532                # ascend tree to root, recording all marker sets meeting selection criteria
533                while curNode.parent_node != None:
534                    uniqueHits, multiCopyHits = resultsParser.results[binId].countUniqueHits()
535                    tempForceDomain = bForceDomain or (uniqueHits < minUnique) or (multiCopyHits > maxMulti)
536
537                    curNode, markerSet = self.__getMarkerSet(curNode.parent_node, tree, uniqueIdToLineageStatistics,
538                                                                numGenomesMarkers, bootstrap,
539                                                                tempForceDomain, bRequireTaxonomy)
540
541                    if not bNoLineageSpecificRefinement:
542                        markerSet = self.__removeInvalidLineageMarkerGenes(markerSet, lineageSpecificRefinement)
543
544                    binMarkerSets.addMarkerSet(markerSet)
545
546            binMarkerSets.write(fout)
547
548        if self.logger.getEffectiveLevel() <= logging.INFO:
549            sys.stderr.write('\n')
550
551        fout.close()
552
553    def readNodeMetadata(self):
554        """Read metadata for internal nodes."""
555
556        uniqueIdToLineageStatistics = {}
557        metadataFile = os.path.join(DefaultValues.GENOME_TREE_DIR, DefaultValues.GENOME_TREE_METADATA)
558        with open(metadataFile) as f:
559            f.readline()
560            for line in f:
561                lineSplit = line.rstrip().split('\t')
562
563                uniqueId = lineSplit[0]
564
565                d = {}
566                d['# genomes'] = int(lineSplit[1])
567                d['taxonomy'] = lineSplit[2]
568                try:
569                    d['bootstrap'] = float(lineSplit[3])
570                except:
571                    d['bootstrap'] = 'NA'
572                d['gc mean'] = float(lineSplit[4])
573                d['gc std'] = float(lineSplit[5])
574                d['genome size mean'] = float(lineSplit[6]) / 1e6
575                d['genome size std'] = float(lineSplit[7]) / 1e6
576                d['gene count mean'] = float(lineSplit[8])
577                d['gene count std'] = float(lineSplit[9])
578                d['marker set'] = lineSplit[10].rstrip()
579
580                uniqueIdToLineageStatistics[uniqueId] = d
581
582        return uniqueIdToLineageStatistics
583
584    def readLineageMetadata(self, outDir, binIds):
585        """Get metadata for each bin."""
586
587        uniqueIdToLineageStatistics = self.readNodeMetadata()
588
589        # read genome tree
590        treeFile = os.path.join(outDir, 'storage', 'tree', DefaultValues.PPLACER_TREE_OUT)
591        tree = dendropy.Tree.get_from_path(treeFile, schema='newick', rooting="force-rooted", preserve_underscores=True)
592
593        # find first parent of each bin with a label
594        binIdToLineageStatistics = {}
595        for binId in binIds:
596            node = tree.find_node_with_taxon_label(binId)
597            if node == None:
598                d = {}
599                d['# genomes'] = 'NA'
600                d['taxonomy'] = 'unresolved'
601                d['gc mean'] = 'NA'
602                d['gc std'] = 'NA'
603                d['genome size mean'] = 'NA'
604                d['genome size std'] = 'NA'
605                d['gene count mean'] = 'NA'
606                d['gene count std'] = 'NA'
607                d['marker set'] = 'NA'
608                binIdToLineageStatistics[binId] = d
609                continue
610
611            # find first labeled parent node (nodes inserted by pplacer will be unlabeled)
612            parentNode = node.parent_node
613            uniqueId = None
614            while parentNode != None:
615                if parentNode.label:
616                    uniqueId = parentNode.label.split('|')[0]
617                    break
618
619                parentNode = parentNode.parent_node
620
621            if uniqueId:
622                binIdToLineageStatistics[binId] = uniqueIdToLineageStatistics[uniqueId]
623            else:
624                self.logger.error('Failed to find lineage-specific statistics for inserted bin: ' + node.taxon.label)
625                sys.exit(0)
626
627        return binIdToLineageStatistics
628
629    def reportBinTaxonomy(self, outDir, resultsParser, bTabTable, outFile, binStats, bLineageStatistics):
630        # make sure output and tree directories exist
631        checkDirExists(outDir)
632        alignOutputDir = os.path.join(outDir, 'storage', 'tree')
633        checkDirExists(alignOutputDir)
634
635        # get all bin ids
636        binIds = getBinIdsFromOutDir(outDir)
637
638        # get taxonomy for each bin
639        binIdToTaxonomy = self.getBinTaxonomy(outDir, binIds)
640
641        # write table
642        if not bLineageStatistics:
643            self.__printSimpleSummaryTable(binIdToTaxonomy, resultsParser, bTabTable, outFile)
644        else:
645            # get taxonomy of sister lineage for each bin
646            binIdToSisterTaxonomy = self.getBinSisterTaxonomy(outDir, binIds)
647            binIdToUID = self.getInsertionBranchId(outDir, binIds)
648
649            binIdToLineageStatistics = self.readLineageMetadata(outDir, binIds)
650            self.__printFullTable(binIdToUID, binIdToTaxonomy, binIdToSisterTaxonomy, binIdToLineageStatistics, resultsParser, binStats, bTabTable, outFile)
651
652    def __printSimpleSummaryTable(self, binIdToTaxonomy, resultsParser, bTabTable, outFile):
653        # redirect output
654        oldStdOut = reassignStdOut(outFile)
655
656        arbitraryBinId = binIdToTaxonomy.keys()[0]
657        markerCountLabel = '# unique markers (of %d)' % len(resultsParser.models[arbitraryBinId])
658        header = ['Bin Id', markerCountLabel, '# multi-copy', 'Taxonomy']
659
660        if bTabTable:
661            pTable = None
662            print('\t'.join(header))
663        else:
664            pTable = prettytable.PrettyTable(header)
665            pTable.float_format = '.2'
666            pTable.align = 'c'
667            pTable.align[header[0]] = 'l'
668            pTable.align['Taxonomy'] = 'l'
669            pTable.hrules = prettytable.FRAME
670            pTable.vrules = prettytable.NONE
671
672        for binId in sorted(binIdToTaxonomy.keys()):
673            uniqueHits, multiCopyHits = resultsParser.results[binId].countUniqueHits()
674
675            row = [binId, uniqueHits, multiCopyHits, binIdToTaxonomy[binId]]
676
677            if bTabTable:
678                print('\t'.join(map(str, row)))
679            else:
680                pTable.add_row(row)
681
682        if not bTabTable:
683            print(pTable.get_string(sortby=markerCountLabel, reversesort=True))
684
685        # restore stdout
686        restoreStdOut(outFile, oldStdOut)
687
688    def __printFullTable(self, binIdToUID, binIdToTaxonomy, binIdToSisterTaxonomy, binIdToLineageStatistics, resultsParser, binStats, bTabTable, outFile):
689        # redirect output
690        oldStdOut = reassignStdOut(outFile)
691
692        arbitraryBinId = binIdToTaxonomy.keys()[0]
693        markerCountLabel = '# unique markers (of %d)' % len(resultsParser.models[arbitraryBinId])
694        header = ['Bin Id', markerCountLabel, "# multi-copy"]
695        header += ['Insertion branch UID', 'Taxonomy (contained)', 'Taxonomy (sister lineage)']
696        header += ['GC', 'Genome size (Mbp)', 'Gene count', 'Coding density', 'Translation table']
697        header += ['# descendant genomes', 'Lineage: GC mean', 'Lineage: GC std']
698        header += ['Lineage: genome size (Mbp) mean', 'Lineage: genome size (Mbp) std']
699        header += ['Lineage: gene count mean', 'Lineage: gene count std']
700
701        if bTabTable:
702            pTable = None
703            print('\t'.join(header))
704        else:
705            pTable = prettytable.PrettyTable(header)
706            pTable.float_format = '.2'
707            pTable.float_format['GC'] = '.1'
708            pTable.float_format['Lineage: GC mean'] = '.1'
709            pTable.float_format['Lineage: GC std'] = '.1'
710            pTable.float_format['Lineage: gene count mean'] = '.0'
711            pTable.float_format['Lineage: gene count std'] = '.0'
712            pTable.align = 'c'
713            pTable.align[header[0]] = 'l'
714            pTable.align['Insertion branch UID'] = 'l'
715            pTable.align['Taxonomy (contained)'] = 'l'
716            pTable.align['Taxonomy (sister lineage)'] = 'l'
717            pTable.hrules = prettytable.FRAME
718            pTable.vrules = prettytable.NONE
719
720        for binId in sorted(binIdToTaxonomy.keys()):
721            uniqueHits, multiCopyHits = resultsParser.results[binId].countUniqueHits()
722
723            truncSisterLineage = binIdToSisterTaxonomy[binId]
724            for taxa in binIdToTaxonomy[binId].split(';'):
725                truncSisterLineage = truncSisterLineage.replace(taxa + ';', '')
726
727            if len(truncSisterLineage) == 0:
728                truncSisterLineage = 'unresolved'
729            elif truncSisterLineage[-1] == ';':
730                truncSisterLineage = truncSisterLineage[0:-1]
731
732            row = [binId, uniqueHits, multiCopyHits]
733            row += [binIdToUID[binId], binIdToTaxonomy[binId], truncSisterLineage]
734            row += [binStats[binId]['GC'] * 100]
735            row += [float(binStats[binId]['Genome size']) / 1e6]
736            row += [binStats[binId]['# predicted genes']]
737            row += [binStats[binId]['Coding density']]
738            row += [binStats[binId]['Translation table']]
739            row += [binIdToLineageStatistics[binId]['# genomes']]
740            row += [binIdToLineageStatistics[binId]['gc mean']]
741            row += [binIdToLineageStatistics[binId]['gc std']]
742            row += [binIdToLineageStatistics[binId]['genome size mean']]
743            row += [binIdToLineageStatistics[binId]['genome size std']]
744            row += [binIdToLineageStatistics[binId]['gene count mean']]
745            row += [binIdToLineageStatistics[binId]['gene count std']]
746
747            if bTabTable:
748                print('\t'.join(map(str, row)))
749            else:
750                pTable.add_row(row)
751
752        if not bTabTable:
753            print(pTable.get_string(sortby=markerCountLabel, reversesort=True))
754
755        # restore stdout
756        restoreStdOut(outFile, oldStdOut)
757
758    def __readDuplicateSeqs(self):
759        """Parse file indicating duplicate sequence alignments."""
760        duplicateSeqs = {}
761        for line in open(os.path.join(DefaultValues.GENOME_TREE_DIR, DefaultValues.GENOME_TREE_DEREP)):
762            lineSplit = line.rstrip().split()
763            if len(lineSplit) > 1:
764                duplicateSeqs[lineSplit[0]] = lineSplit[1:]
765
766        return duplicateSeqs
767