1############################################################################### 2# 3# treeParser.py - parse genome tree and associated tree metadata 4# 5############################################################################### 6# # 7# This program is free software: you can redistribute it and/or modify # 8# it under the terms of the GNU General Public License as published by # 9# the Free Software Foundation, either version 3 of the License, or # 10# (at your option) any later version. # 11# # 12# This program is distributed in the hope that it will be useful, # 13# but WITHOUT ANY WARRANTY; without even the implied warranty of # 14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # 15# GNU General Public License for more details. # 16# # 17# You should have received a copy of the GNU General Public License # 18# along with this program. If not, see <http://www.gnu.org/licenses/>. # 19# # 20############################################################################### 21 22import os 23import sys 24import logging 25import json 26 27import dendropy 28 29import checkm.prettytable as prettytable 30 31from checkm.defaultValues import DefaultValues 32from checkm.markerSets import MarkerSet, BinMarkerSets 33from checkm.common import checkDirExists, reassignStdOut, restoreStdOut, getBinIdsFromOutDir 34from checkm.util.seqUtils import readFasta 35from checkm.util.taxonomyUtils import taxonomicPrefixes 36 37 38class TreeParser(): 39 """Parse genome tree and associated tree metadata.""" 40 def __init__(self): 41 self.logger = logging.getLogger('timestamp') 42 43 def printSummary(self, outputFormat, outDir, resultsParser, bTabTable, outFile, binStats): 44 if outputFormat == 1: 45 self.reportBinTaxonomy(outDir, resultsParser, bTabTable, outFile, binStats, bLineageStatistics=False) 46 elif outputFormat == 2: 47 self.reportBinTaxonomy(outDir, resultsParser, bTabTable, outFile, binStats, bLineageStatistics=True) 48 elif outputFormat == 3: 49 self.reportNewickTree(outDir, outFile, None) 50 elif outputFormat == 4: 51 self.reportNewickTree(outDir, outFile, 'taxonomy') 52 elif outputFormat == 5: 53 self.reportFullMSA(outDir, outFile) 54 else: 55 self.logger.error("Unknown output format: %d", outputFormat) 56 57 def reportFullMSA(self, outDir, outFile): 58 """Create MSA with all reference and bin alignments.""" 59 60 # write bin alignments to file 61 oldStdOut = reassignStdOut(outFile) 62 for line in open(os.path.join(outDir, 'storage', 'tree', DefaultValues.PPLACER_CONCAT_SEQ_OUT)): 63 print(line.rstrip()) 64 65 # read duplicate seqs 66 duplicateNodes = self.__readDuplicateSeqs() 67 68 # write reference alignments to file 69 seqs = readFasta(os.path.join(DefaultValues.PPLACER_REF_PACKAGE_FULL, DefaultValues.GENOME_TREE_FASTA)) 70 for seqId, seq in seqs.iteritems(): 71 print('>' + seqId) 72 print(seq) 73 74 if seqId in duplicateNodes: 75 for dupSeqId in duplicateNodes[seqId]: 76 print('>' + dupSeqId) 77 print(seq) 78 79 restoreStdOut(outFile, oldStdOut) 80 81 def readPlacementFile(self, placementFile): 82 '''Read pplacer JSON placement file.''' 83 jsonData = open(placementFile) 84 85 data = json.load(jsonData) 86 binIdToPP = {} 87 for placementData in data['placements']: 88 binId = placementData['nm'][0][0] 89 90 topPP = 0 91 for pp in placementData['p']: 92 if pp[2] > topPP: 93 topPP = pp[2] 94 95 binIdToPP[binId] = topPP 96 97 jsonData.close() 98 99 return binIdToPP 100 101 def reportNewickTree(self, outDir, outFile, leafLabels=None): 102 # read duplicate nodes 103 duplicateSeqs = self.__readDuplicateSeqs() 104 105 # read tree 106 treeFile = os.path.join(outDir, 'storage', 'tree', DefaultValues.PPLACER_TREE_OUT) 107 tree = dendropy.Tree.get_from_path(treeFile, schema='newick', rooting="force-rooted", preserve_underscores=True) 108 109 # clean up internal node labels 110 for node in tree.internal_nodes(): 111 if node.label: 112 labelSplit = node.label.split('|') 113 114 label = labelSplit[0] 115 if labelSplit[1] != '': 116 label += '|' + labelSplit[1] 117 if labelSplit[2] != '': 118 label += '|' + labelSplit[2] 119 120 node.label = label 121 122 # insert duplicate nodes into tree 123 for leaf in tree.leaf_nodes(): 124 duplicates = duplicateSeqs.get(leaf.taxon.label, None) 125 if duplicates != None: 126 newParent = leaf.parent_node.new_child(edge_length=leaf.edge_length) 127 curLeaf = leaf.parent_node.remove_child(leaf) 128 newParent.new_child(taxon=curLeaf.taxon, edge_length=0) 129 for d in duplicates: 130 newParent.new_child(taxon=dendropy.Taxon(label=d), edge_length=0) 131 132 # append taxonomy to leaf nodes 133 if leafLabels == 'taxonomy': 134 # read taxonomy string for each IMG genome 135 taxonomy = {} 136 for line in open(os.path.join(DefaultValues.GENOME_TREE_DIR, DefaultValues.GENOME_TREE_TAXONOMY)): 137 lineSplit = line.split('\t') 138 taxonomy[lineSplit[0]] = lineSplit[1].rstrip() 139 140 # append taxonomy to leaf labels 141 for leaf in tree.leaf_nodes(): 142 taxaStr = taxonomy.get(leaf.taxon.label, None) 143 if taxaStr: 144 leaf.taxon.label += '|' + taxaStr 145 146 # write out tree 147 oldStdOut = reassignStdOut(outFile) 148 print(tree.as_string(schema='newick', suppress_rooting=True)) 149 restoreStdOut(outFile, oldStdOut) 150 151 def getInsertionBranchId(self, outDir, binIds): 152 # make sure output and tree directories exist 153 checkDirExists(outDir) 154 alignOutputDir = os.path.join(outDir, 'storage', 'tree') 155 checkDirExists(alignOutputDir) 156 157 # read genome tree (if it exists) 158 binIdToUID = {} 159 treeFile = os.path.join(alignOutputDir, DefaultValues.PPLACER_TREE_OUT) 160 tree = dendropy.Tree.get_from_path(treeFile, schema='newick', rooting="force-rooted", preserve_underscores=True) 161 162 # find first parent of each bin with a taxonomic label 163 for binId in binIds: 164 node = tree.find_node_with_taxon_label(binId) 165 if node == None: 166 binIdToUID[binId] = 'NA' 167 continue 168 169 # find first node decorated with a UID string between leaf and root 170 parentNode = node.parent_node 171 while parentNode != None: 172 if parentNode.label: 173 uid = parentNode.label.split('|')[0] 174 break 175 176 parentNode = parentNode.parent_node 177 178 binIdToUID[binId] = uid 179 180 return binIdToUID 181 182 def getBinTaxonomy(self, outDir, binIds): 183 # make sure output and tree directories exist 184 checkDirExists(outDir) 185 alignOutputDir = os.path.join(outDir, 'storage', 'tree') 186 checkDirExists(alignOutputDir) 187 188 # read genome tree (if it exists) 189 binIdToTaxonomy = {} 190 treeFile = os.path.join(alignOutputDir, DefaultValues.PPLACER_TREE_OUT) 191 tree = dendropy.Tree.get_from_path(treeFile, schema='newick', rooting="force-rooted", preserve_underscores=True) 192 193 # find first parent of each bin with a taxonomic label 194 for binId in binIds: 195 node = tree.find_node_with_taxon_label(binId) 196 if node == None: 197 binIdToTaxonomy[binId] = 'NA' 198 continue 199 200 # find first node decorated with a taxon string between leaf and root 201 taxaStr = None 202 parentNode = node.parent_node 203 while parentNode != None: 204 if parentNode.label: 205 tokens = parentNode.label.split('|') 206 207 if tokens[1] != '': 208 if taxaStr: 209 taxaStr = tokens[1] + ';' + taxaStr 210 else: 211 taxaStr = tokens[1] 212 213 parentNode = parentNode.parent_node 214 215 if not taxaStr: 216 domainNode = self.__findDomainNode(node) 217 taxaStr = domainNode.label.split('|')[1] + ' (root)' 218 219 binIdToTaxonomy[node.taxon.label] = taxaStr 220 221 return binIdToTaxonomy 222 223 def __findDomainNode(self, binNode): 224 """Find node defining the domain. Assumes 'binNode' is the leaf node of a bin on either the archaeal or bacterial branch.""" 225 226 # bin is either on the bacterial or archaeal branch descendant from the root, 227 # so descend tree to first internal node with a label. Note that there may 228 # be multiple bins inserted in the tree creating unlabeled internal nodes. 229 # Identical population bins can also cause internal bifurcations. 230 231 # find first parent node which contains at least one IMG genome 232 curNode = binNode.parent_node 233 while True: 234 found_ref_genome = False 235 for leaf in curNode.leaf_nodes(): 236 if leaf.taxon.label.startswith('IMG_'): 237 found_ref_genome = True 238 break 239 240 if found_ref_genome: 241 break 242 243 curNode = curNode.parent_node 244 245 # perform depth first search to find a labeled internal node 246 queue = [curNode] 247 while queue: 248 curNode = queue.pop(0) 249 250 if curNode.label: 251 return curNode 252 253 for child in curNode.child_nodes(): 254 if child.is_internal(): 255 queue.append(child) 256 257 self.logger.error(' [Error] Failed to associate bin with a domain. Please report this bug.') 258 sys.exit(1) 259 260 def getBinSisterTaxonomy(self, outDir, binIds): 261 # make sure output and tree directories exist 262 checkDirExists(outDir) 263 alignOutputDir = os.path.join(outDir, 'storage', 'tree') 264 checkDirExists(alignOutputDir) 265 266 # read genome tree 267 treeFile = os.path.join(alignOutputDir, DefaultValues.PPLACER_TREE_OUT) 268 tree = dendropy.Tree.get_from_path(treeFile, schema='newick', rooting="force-rooted", preserve_underscores=True) 269 270 # read taxonomy string for each IMG genome 271 leafIdToTaxonomy = {} 272 for line in open(os.path.join(DefaultValues.GENOME_TREE_DIR, DefaultValues.GENOME_TREE_TAXONOMY)): 273 lineSplit = line.split('\t') 274 leafIdToTaxonomy[lineSplit[0]] = lineSplit[1].rstrip() 275 276 # find LCA of all labeled node in sister lineage 277 binIdToSisterTaxonomy = {} 278 for binId in binIds: 279 node = tree.find_node_with_taxon_label(binId) 280 281 taxaStr = '' 282 if node != None: 283 # get taxonomic labels of all internal nodes in sister lineages 284 sisterNodes = node.sister_nodes() 285 internalTaxonomyLabels = set() 286 leafTaxonomyLabels = set() 287 for sn in sisterNodes: 288 for curNode in sn.postorder_iter(): 289 if curNode.is_leaf(): 290 if curNode.taxon.label: 291 taxonomy = leafIdToTaxonomy.get(curNode.taxon.label, None) 292 if taxonomy != None: # inserted bins will not have an assigned taxonomy 293 for taxa in taxonomy.split(';'): 294 leafTaxonomyLabels.add(taxa.strip()) 295 else: 296 if curNode.label: 297 tokens = curNode.label.split('|') 298 if tokens[1] != '': 299 for taxa in tokens[1].split(';'): 300 internalTaxonomyLabels.add(taxa) 301 302 # find LCA of taxonomic labels in rank order; 303 # only consider leaf node labels if there were no internal labels 304 labels = internalTaxonomyLabels 305 if len(labels) == 0: 306 labels = leafTaxonomyLabels 307 308 for prefix in taxonomicPrefixes: 309 taxa = [] 310 for taxon in labels: 311 if prefix in taxon: 312 taxa.append(taxon) 313 314 if len(taxa) == 1: 315 # unambiguous label at this rank 316 taxaStr += taxa[0] + ';' 317 elif len(taxa) > 1: 318 # unable to resolve taxonomy at this rank 319 break 320 321 if not taxaStr: 322 taxaStr = 'unresolved' 323 binIdToSisterTaxonomy[binId] = taxaStr 324 325 return binIdToSisterTaxonomy 326 327 def __getNextNamedNode(self, node, uniqueIdToLineageStatistics): 328 """Get first parent node with taxonomy information.""" 329 parentNode = node.parent_node 330 while True: 331 if parentNode == None: 332 break # reached the root node so terminate 333 334 if parentNode.label: 335 trustedUniqueId = parentNode.label.split('|')[0] 336 trustedStats = uniqueIdToLineageStatistics[trustedUniqueId] 337 if trustedStats['taxonomy'] != '': 338 return trustedStats['taxonomy'] 339 340 parentNode = parentNode.parent_node 341 342 return 'root' 343 344 def __getMarkerSet(self, parentNode, tree, uniqueIdToLineageStatistics, 345 numGenomesMarkers, bootstrap, 346 bForceDomain, bRequireTaxonomy): 347 """Get marker set for next parent node meeting selection criteria.""" 348 349 # ascend tree to root finding first node meeting all selection criteria 350 selectedParentNode = parentNode 351 taxonomyStr = 'root' 352 while True: 353 if selectedParentNode.label: # nodes inserted by PPLACER will not have a label 354 trustedUniqueId = selectedParentNode.label.split('|')[0] 355 nodeTaxonomy = selectedParentNode.label.split('|')[1] 356 357 stats = uniqueIdToLineageStatistics[trustedUniqueId] 358 if stats['# genomes'] >= numGenomesMarkers and stats['bootstrap'] >= bootstrap: 359 if not bForceDomain or nodeTaxonomy in ['k__Bacteria', 'k__Archaea']: 360 if not bRequireTaxonomy or stats['taxonomy'] != '': 361 # get closest taxonomic label 362 taxonomyStr = stats['taxonomy'] 363 if not bRequireTaxonomy and stats['taxonomy'] == '': 364 taxonomyStr = self.__getNextNamedNode(selectedParentNode, uniqueIdToLineageStatistics) 365 366 # all criteria meet, so use marker set from this node 367 break 368 369 if selectedParentNode.parent_node == None: 370 break # reached the root node so terminate 371 372 selectedParentNode = selectedParentNode.parent_node 373 374 # get marker set meeting all criteria required for a trusted marker set 375 taxonomyStr = taxonomyStr.split(';')[-1] # extract most specific taxonomy identifier 376 markerSet = MarkerSet(trustedUniqueId, taxonomyStr, int(stats['# genomes']), eval(stats['marker set'])) 377 378 return selectedParentNode, markerSet 379 380 def __refineMarkerSet(self, markerSet, binNode, tree, uniqueIdToLineageStatistics, numGenomesRefine): 381 """Refine marker set to account for lineage-specific gene loss and duplication.""" 382 383 # lineage-specific refine is done with the sister lineage to where the bin is inserted 384 385 # get lineage-specific marker set which will be used to refine the above marker set 386 curNode = binNode.sister_nodes()[0] 387 while True: 388 if curNode.label: # nodes inserted by PPLACER will not have a label 389 uniqueId = curNode.label.split('|')[0] 390 stats = uniqueIdToLineageStatistics[uniqueId] 391 392 if stats['# genomes'] >= numGenomesRefine: 393 break 394 395 curNode = curNode.parent_node 396 if curNode == None: 397 break # reached the root node so terminate 398 399 # get lineage-specific marker set 400 lineageMarkerSet = eval(stats['marker set']) 401 402 # refine marker set by finding the intersection between these two sets, 403 # this removes markers that are not single-copy or ubiquitous in the 404 # specific lineage of a bin 405 # Note: co-localization information is taken from the trusted set 406 407 # get all lineage-specific marker genes 408 allLineageSpecificGenes = set() 409 for m in lineageMarkerSet: 410 for gene in m: 411 allLineageSpecificGenes.add(gene) 412 413 # remove genes not present in the lineage-specific gene set 414 finalMarkerSet = [] 415 for ms in markerSet.markerSet: 416 s = set() 417 for gene in ms: 418 if gene in allLineageSpecificGenes: 419 s.add(gene) 420 421 if s: 422 finalMarkerSet.append(s) 423 424 refinedMarkerSet = MarkerSet(markerSet.UID, markerSet.lineageStr, markerSet.numGenomes, finalMarkerSet) 425 426 return refinedMarkerSet 427 428 def __readLineageSpecificGenesToRemove(self): 429 """Get set of genes subject to lineage-specific gene loss and duplication.""" 430 431 self.lineageSpecificGenesToRemove = {} 432 for line in open(os.path.join(DefaultValues.GENOME_TREE_DIR, DefaultValues.GENOME_TREE_MISSING_DUPLICATE)): 433 lineSplit = line.split('\t') 434 uid = lineSplit[0] 435 missingGenes = eval(lineSplit[1]) 436 duplicateGenes = eval(lineSplit[2]) 437 self.lineageSpecificGenesToRemove[uid] = missingGenes.union(duplicateGenes) 438 439 def __removeInvalidLineageMarkerGenes(self, markerSet, lineageSpecificMarkersToRemove): 440 """Refine marker set to account for lineage-specific gene loss and duplication.""" 441 442 # refine marker set by removing marker genes subject to lineage-specific 443 # gene loss and duplication 444 # 445 # Note: co-localization information is taken from the trusted set 446 447 finalMarkerSet = [] 448 for ms in markerSet.markerSet: 449 s = set() 450 for gene in ms: 451 geneIdToTest = gene 452 if geneIdToTest.startswith('PF'): 453 geneIdToTest = gene.replace('PF', 'pfam') 454 geneIdToTest = geneIdToTest[0:geneIdToTest.rfind('.')] 455 456 if geneIdToTest not in lineageSpecificMarkersToRemove: 457 s.add(gene) 458 459 if s: 460 finalMarkerSet.append(s) 461 462 refinedMarkerSet = MarkerSet(markerSet.UID, markerSet.lineageStr, markerSet.numGenomes, finalMarkerSet) 463 464 return refinedMarkerSet 465 466 def getBinMarkerSets(self, outDir, markerFile, 467 numGenomesMarkers, 468 bootstrap, bNoLineageSpecificRefinement, 469 bForceDomain, bRequireTaxonomy, 470 resultsParser, minUnique, maxMulti): 471 """Determine marker sets for each bin.""" 472 473 self.logger.info('Determining marker sets for each genome bin.') 474 475 # get all bin ids 476 binIds = getBinIdsFromOutDir(outDir) 477 478 # get statistics for internal nodes 479 uniqueIdToLineageStatistics = self.readNodeMetadata() 480 481 # determine marker set for each bin 482 treeFile = os.path.join(outDir, 'storage', 'tree', DefaultValues.PPLACER_TREE_OUT) 483 tree = dendropy.Tree.get_from_path(treeFile, schema='newick', rooting="force-rooted", preserve_underscores=True) 484 rootNode = tree.find_node(filter_fn=lambda n: n.parent_node == None) 485 486 fout = open(markerFile, 'w') 487 fout.write(DefaultValues.LINEAGE_MARKER_FILE_HEADER + '\n') 488 489 numProcessedBins = 0 490 statusStr = '' 491 for binId in binIds: 492 if self.logger.getEffectiveLevel() <= logging.INFO: 493 numProcessedBins += 1 494 sys.stderr.write(' ' * len(statusStr) + '\r') # clear previous line 495 statusStr = ' Finished processing %d of %d (%.2f%%) bins (current: %s).' % (numProcessedBins, len(binIds), float(numProcessedBins) * 100 / len(binIds), binId) 496 sys.stderr.write('%s\r' % statusStr) 497 sys.stderr.flush() 498 499 node = tree.find_node_with_taxon_label(binId) 500 binMarkerSets = BinMarkerSets(binId, BinMarkerSets.TREE_MARKER_SET) 501 if node == None: 502 # bin is not in tree 503 node, markerSet = self.__getMarkerSet(rootNode, tree, uniqueIdToLineageStatistics, 504 numGenomesMarkers, bootstrap, 505 bForceDomain, bRequireTaxonomy) 506 binMarkerSets.addMarkerSet(markerSet) 507 else: 508 # special case: if node is on the bacterial or archaeal branch descendant from the root, 509 # then move down the tree to include the domain-specific marker set 510 parentNode = node.parent_node 511 while parentNode != None: 512 if parentNode.label: 513 bRoot = (parentNode.parent_node == None) 514 break 515 516 parentNode = parentNode.parent_node 517 518 if bRoot: 519 # since the root is the first labeled node, we need to descend the 520 # tree to incorporate the domain-specific marker set 521 domainNode = self.__findDomainNode(node) 522 curNode = domainNode.child_nodes()[0] 523 else: 524 curNode = node 525 526 # get lineage specific refinement for first node with an id 527 if not bNoLineageSpecificRefinement: 528 uniqueId = parentNode.label.split('|')[0] 529 self.__readLineageSpecificGenesToRemove() 530 lineageSpecificRefinement = self.lineageSpecificGenesToRemove[uniqueId] 531 532 # ascend tree to root, recording all marker sets meeting selection criteria 533 while curNode.parent_node != None: 534 uniqueHits, multiCopyHits = resultsParser.results[binId].countUniqueHits() 535 tempForceDomain = bForceDomain or (uniqueHits < minUnique) or (multiCopyHits > maxMulti) 536 537 curNode, markerSet = self.__getMarkerSet(curNode.parent_node, tree, uniqueIdToLineageStatistics, 538 numGenomesMarkers, bootstrap, 539 tempForceDomain, bRequireTaxonomy) 540 541 if not bNoLineageSpecificRefinement: 542 markerSet = self.__removeInvalidLineageMarkerGenes(markerSet, lineageSpecificRefinement) 543 544 binMarkerSets.addMarkerSet(markerSet) 545 546 binMarkerSets.write(fout) 547 548 if self.logger.getEffectiveLevel() <= logging.INFO: 549 sys.stderr.write('\n') 550 551 fout.close() 552 553 def readNodeMetadata(self): 554 """Read metadata for internal nodes.""" 555 556 uniqueIdToLineageStatistics = {} 557 metadataFile = os.path.join(DefaultValues.GENOME_TREE_DIR, DefaultValues.GENOME_TREE_METADATA) 558 with open(metadataFile) as f: 559 f.readline() 560 for line in f: 561 lineSplit = line.rstrip().split('\t') 562 563 uniqueId = lineSplit[0] 564 565 d = {} 566 d['# genomes'] = int(lineSplit[1]) 567 d['taxonomy'] = lineSplit[2] 568 try: 569 d['bootstrap'] = float(lineSplit[3]) 570 except: 571 d['bootstrap'] = 'NA' 572 d['gc mean'] = float(lineSplit[4]) 573 d['gc std'] = float(lineSplit[5]) 574 d['genome size mean'] = float(lineSplit[6]) / 1e6 575 d['genome size std'] = float(lineSplit[7]) / 1e6 576 d['gene count mean'] = float(lineSplit[8]) 577 d['gene count std'] = float(lineSplit[9]) 578 d['marker set'] = lineSplit[10].rstrip() 579 580 uniqueIdToLineageStatistics[uniqueId] = d 581 582 return uniqueIdToLineageStatistics 583 584 def readLineageMetadata(self, outDir, binIds): 585 """Get metadata for each bin.""" 586 587 uniqueIdToLineageStatistics = self.readNodeMetadata() 588 589 # read genome tree 590 treeFile = os.path.join(outDir, 'storage', 'tree', DefaultValues.PPLACER_TREE_OUT) 591 tree = dendropy.Tree.get_from_path(treeFile, schema='newick', rooting="force-rooted", preserve_underscores=True) 592 593 # find first parent of each bin with a label 594 binIdToLineageStatistics = {} 595 for binId in binIds: 596 node = tree.find_node_with_taxon_label(binId) 597 if node == None: 598 d = {} 599 d['# genomes'] = 'NA' 600 d['taxonomy'] = 'unresolved' 601 d['gc mean'] = 'NA' 602 d['gc std'] = 'NA' 603 d['genome size mean'] = 'NA' 604 d['genome size std'] = 'NA' 605 d['gene count mean'] = 'NA' 606 d['gene count std'] = 'NA' 607 d['marker set'] = 'NA' 608 binIdToLineageStatistics[binId] = d 609 continue 610 611 # find first labeled parent node (nodes inserted by pplacer will be unlabeled) 612 parentNode = node.parent_node 613 uniqueId = None 614 while parentNode != None: 615 if parentNode.label: 616 uniqueId = parentNode.label.split('|')[0] 617 break 618 619 parentNode = parentNode.parent_node 620 621 if uniqueId: 622 binIdToLineageStatistics[binId] = uniqueIdToLineageStatistics[uniqueId] 623 else: 624 self.logger.error('Failed to find lineage-specific statistics for inserted bin: ' + node.taxon.label) 625 sys.exit(0) 626 627 return binIdToLineageStatistics 628 629 def reportBinTaxonomy(self, outDir, resultsParser, bTabTable, outFile, binStats, bLineageStatistics): 630 # make sure output and tree directories exist 631 checkDirExists(outDir) 632 alignOutputDir = os.path.join(outDir, 'storage', 'tree') 633 checkDirExists(alignOutputDir) 634 635 # get all bin ids 636 binIds = getBinIdsFromOutDir(outDir) 637 638 # get taxonomy for each bin 639 binIdToTaxonomy = self.getBinTaxonomy(outDir, binIds) 640 641 # write table 642 if not bLineageStatistics: 643 self.__printSimpleSummaryTable(binIdToTaxonomy, resultsParser, bTabTable, outFile) 644 else: 645 # get taxonomy of sister lineage for each bin 646 binIdToSisterTaxonomy = self.getBinSisterTaxonomy(outDir, binIds) 647 binIdToUID = self.getInsertionBranchId(outDir, binIds) 648 649 binIdToLineageStatistics = self.readLineageMetadata(outDir, binIds) 650 self.__printFullTable(binIdToUID, binIdToTaxonomy, binIdToSisterTaxonomy, binIdToLineageStatistics, resultsParser, binStats, bTabTable, outFile) 651 652 def __printSimpleSummaryTable(self, binIdToTaxonomy, resultsParser, bTabTable, outFile): 653 # redirect output 654 oldStdOut = reassignStdOut(outFile) 655 656 arbitraryBinId = binIdToTaxonomy.keys()[0] 657 markerCountLabel = '# unique markers (of %d)' % len(resultsParser.models[arbitraryBinId]) 658 header = ['Bin Id', markerCountLabel, '# multi-copy', 'Taxonomy'] 659 660 if bTabTable: 661 pTable = None 662 print('\t'.join(header)) 663 else: 664 pTable = prettytable.PrettyTable(header) 665 pTable.float_format = '.2' 666 pTable.align = 'c' 667 pTable.align[header[0]] = 'l' 668 pTable.align['Taxonomy'] = 'l' 669 pTable.hrules = prettytable.FRAME 670 pTable.vrules = prettytable.NONE 671 672 for binId in sorted(binIdToTaxonomy.keys()): 673 uniqueHits, multiCopyHits = resultsParser.results[binId].countUniqueHits() 674 675 row = [binId, uniqueHits, multiCopyHits, binIdToTaxonomy[binId]] 676 677 if bTabTable: 678 print('\t'.join(map(str, row))) 679 else: 680 pTable.add_row(row) 681 682 if not bTabTable: 683 print(pTable.get_string(sortby=markerCountLabel, reversesort=True)) 684 685 # restore stdout 686 restoreStdOut(outFile, oldStdOut) 687 688 def __printFullTable(self, binIdToUID, binIdToTaxonomy, binIdToSisterTaxonomy, binIdToLineageStatistics, resultsParser, binStats, bTabTable, outFile): 689 # redirect output 690 oldStdOut = reassignStdOut(outFile) 691 692 arbitraryBinId = binIdToTaxonomy.keys()[0] 693 markerCountLabel = '# unique markers (of %d)' % len(resultsParser.models[arbitraryBinId]) 694 header = ['Bin Id', markerCountLabel, "# multi-copy"] 695 header += ['Insertion branch UID', 'Taxonomy (contained)', 'Taxonomy (sister lineage)'] 696 header += ['GC', 'Genome size (Mbp)', 'Gene count', 'Coding density', 'Translation table'] 697 header += ['# descendant genomes', 'Lineage: GC mean', 'Lineage: GC std'] 698 header += ['Lineage: genome size (Mbp) mean', 'Lineage: genome size (Mbp) std'] 699 header += ['Lineage: gene count mean', 'Lineage: gene count std'] 700 701 if bTabTable: 702 pTable = None 703 print('\t'.join(header)) 704 else: 705 pTable = prettytable.PrettyTable(header) 706 pTable.float_format = '.2' 707 pTable.float_format['GC'] = '.1' 708 pTable.float_format['Lineage: GC mean'] = '.1' 709 pTable.float_format['Lineage: GC std'] = '.1' 710 pTable.float_format['Lineage: gene count mean'] = '.0' 711 pTable.float_format['Lineage: gene count std'] = '.0' 712 pTable.align = 'c' 713 pTable.align[header[0]] = 'l' 714 pTable.align['Insertion branch UID'] = 'l' 715 pTable.align['Taxonomy (contained)'] = 'l' 716 pTable.align['Taxonomy (sister lineage)'] = 'l' 717 pTable.hrules = prettytable.FRAME 718 pTable.vrules = prettytable.NONE 719 720 for binId in sorted(binIdToTaxonomy.keys()): 721 uniqueHits, multiCopyHits = resultsParser.results[binId].countUniqueHits() 722 723 truncSisterLineage = binIdToSisterTaxonomy[binId] 724 for taxa in binIdToTaxonomy[binId].split(';'): 725 truncSisterLineage = truncSisterLineage.replace(taxa + ';', '') 726 727 if len(truncSisterLineage) == 0: 728 truncSisterLineage = 'unresolved' 729 elif truncSisterLineage[-1] == ';': 730 truncSisterLineage = truncSisterLineage[0:-1] 731 732 row = [binId, uniqueHits, multiCopyHits] 733 row += [binIdToUID[binId], binIdToTaxonomy[binId], truncSisterLineage] 734 row += [binStats[binId]['GC'] * 100] 735 row += [float(binStats[binId]['Genome size']) / 1e6] 736 row += [binStats[binId]['# predicted genes']] 737 row += [binStats[binId]['Coding density']] 738 row += [binStats[binId]['Translation table']] 739 row += [binIdToLineageStatistics[binId]['# genomes']] 740 row += [binIdToLineageStatistics[binId]['gc mean']] 741 row += [binIdToLineageStatistics[binId]['gc std']] 742 row += [binIdToLineageStatistics[binId]['genome size mean']] 743 row += [binIdToLineageStatistics[binId]['genome size std']] 744 row += [binIdToLineageStatistics[binId]['gene count mean']] 745 row += [binIdToLineageStatistics[binId]['gene count std']] 746 747 if bTabTable: 748 print('\t'.join(map(str, row))) 749 else: 750 pTable.add_row(row) 751 752 if not bTabTable: 753 print(pTable.get_string(sortby=markerCountLabel, reversesort=True)) 754 755 # restore stdout 756 restoreStdOut(outFile, oldStdOut) 757 758 def __readDuplicateSeqs(self): 759 """Parse file indicating duplicate sequence alignments.""" 760 duplicateSeqs = {} 761 for line in open(os.path.join(DefaultValues.GENOME_TREE_DIR, DefaultValues.GENOME_TREE_DEREP)): 762 lineSplit = line.rstrip().split() 763 if len(lineSplit) > 1: 764 duplicateSeqs[lineSplit[0]] = lineSplit[1:] 765 766 return duplicateSeqs 767