1#!/usr/bin/env python 2 3############################################################################### 4# # 5# This program is free software: you can redistribute it and/or modify # 6# it under the terms of the GNU General Public License as published by # 7# the Free Software Foundation, either version 3 of the License, or # 8# (at your option) any later version. # 9# # 10# This program is distributed in the hope that it will be useful, # 11# but WITHOUT ANY WARRANTY; without even the implied warranty of # 12# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # 13# GNU General Public License for more details. # 14# # 15# You should have received a copy of the GNU General Public License # 16# along with this program. If not, see <http://www.gnu.org/licenses/>. # 17# # 18############################################################################### 19 20""" 21Methods for identifying and investigating marker sets. 22""" 23 24__author__ = 'Donovan Parks' 25__copyright__ = 'Copyright 2013' 26__credits__ = ['Donovan Parks'] 27__license__ = 'GPL3' 28__version__ = '1.0.0' 29__maintainer__ = 'Donovan Parks' 30__email__ = 'donovan.parks@gmail.com' 31__status__ = 'Development' 32 33import os 34import sys 35import random 36from collections import defaultdict 37from checkm.markerSets import BinMarkerSets, MarkerSet 38 39from checkm.util.img import IMG 40 41from numpy import array 42from numpy.random import choice 43 44class MarkerSetBuilder(object): 45 def __init__(self): 46 self.img = IMG('/srv/whitlam/bio/db/checkm/img/img_metadata.tsv', '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv') 47 self.colocatedFile = './data/colocated.tsv' 48 self.duplicateSeqs = self.readDuplicateSeqs() 49 self.uniqueIdToLineageStatistics = self.__readNodeMetadata() 50 51 self.cachedGeneCountTable = None 52 53 def precomputeGenomeSeqLens(self, genomeIds): 54 """Cache the length of contigs/scaffolds for all genomes.""" 55 56 # This function is intended to speed up functions, such as img.geneDistTable(), 57 # that are called multiple times (typically during simulations) 58 self.img.precomputeGenomeSeqLens(genomeIds) 59 60 def precomputeGenomeFamilyPositions(self, genomeIds, spacingBetweenContigs): 61 """Cache position of PFAM and TIGRFAM genes in genomes.""" 62 63 # This function is intended to speed up functions, such as img.geneDistTable(), 64 # that are called multiple times (typically during simulations) 65 self.img.precomputeGenomeFamilyPositions(genomeIds, spacingBetweenContigs) 66 67 def precomputeGenomeFamilyScaffolds(self, genomeIds): 68 """Cache scaffolds of PFAM and TIGRFAM genes in genomes.""" 69 70 # This function is intended to speed up functions, such as img.geneDistTable(), 71 # that are called multiple times (typically during simulations) 72 self.img.precomputeGenomeFamilyScaffolds(genomeIds) 73 74 def getLineageMarkerGenes(self, lineage, minGenomes = 20, minMarkerSets = 20): 75 pfamIds = set() 76 tigrIds = set() 77 78 bHeader = True 79 for line in open(self.colocatedFile): 80 if bHeader: 81 bHeader = False 82 continue 83 84 lineSplit = line.split('\t') 85 curLineage = lineSplit[0] 86 numGenomes = int(lineSplit[1]) 87 numMarkerSets = int(lineSplit[3]) 88 markerSets = lineSplit[4:] 89 90 if curLineage != lineage or numGenomes < minGenomes or numMarkerSets < minMarkerSets: 91 continue 92 93 for ms in markerSets: 94 markers = ms.split(',') 95 for m in markers: 96 if 'pfam' in m: 97 pfamIds.add(m.strip()) 98 elif 'TIGR' in m: 99 tigrIds.add(m.strip()) 100 101 return pfamIds, tigrIds 102 103 def getCalculatedMarkerGenes(self, minGenomes = 20, minMarkerSets = 20): 104 pfamIds = set() 105 tigrIds = set() 106 107 bHeader = True 108 for line in open(self.colocatedFile): 109 if bHeader: 110 bHeader = False 111 continue 112 113 lineSplit = line.split('\t') 114 numGenomes = int(lineSplit[1]) 115 numMarkerSets = int(lineSplit[3]) 116 markerSets = lineSplit[4:] 117 118 if numGenomes < minGenomes or numMarkerSets < minMarkerSets: 119 continue 120 121 for ms in markerSets: 122 markers = ms.split(',') 123 for m in markers: 124 if 'pfam' in m: 125 pfamIds.add(m.strip()) 126 elif 'TIGR' in m: 127 tigrIds.add(m.strip()) 128 129 return pfamIds, tigrIds 130 131 def markerGenes(self, genomeIds, countTable, ubiquityThreshold, singleCopyThreshold): 132 if ubiquityThreshold < 1 or singleCopyThreshold < 1: 133 print '[Warning] Looks like degenerate threshold.' 134 135 # find genes meeting ubiquity and single-copy thresholds 136 markers = set() 137 for clusterId, genomeCounts in countTable.iteritems(): 138 ubiquity = 0 139 singleCopy = 0 140 141 if len(genomeCounts) < ubiquityThreshold: 142 # gene is clearly not ubiquitous 143 continue 144 145 for genomeId in genomeIds: 146 count = genomeCounts.get(genomeId, 0) 147 148 if count > 0: 149 ubiquity += 1 150 151 if count == 1: 152 singleCopy += 1 153 154 if ubiquity >= ubiquityThreshold and singleCopy >= singleCopyThreshold: 155 markers.add(clusterId) 156 157 return markers 158 159 def colocatedGenes(self, geneDistTable, distThreshold = 5000, genomeThreshold = 0.95): 160 """Identify co-located gene pairs.""" 161 162 colocatedGenes = defaultdict(int) 163 for _, clusterIdToGeneLocs in geneDistTable.iteritems(): 164 clusterIds = clusterIdToGeneLocs.keys() 165 for i, clusterId1 in enumerate(clusterIds): 166 geneLocations1 = clusterIdToGeneLocs[clusterId1] 167 168 for clusterId2 in clusterIds[i+1:]: 169 geneLocations2 = clusterIdToGeneLocs[clusterId2] 170 bColocated = False 171 for p1 in geneLocations1: 172 for p2 in geneLocations2: 173 if abs(p1[0] - p2[0]) < distThreshold: 174 bColocated = True 175 break 176 177 if bColocated: 178 break 179 180 if bColocated: 181 if clusterId1 <= clusterId2: 182 colocatedStr = clusterId1 + '-' + clusterId2 183 else: 184 colocatedStr = clusterId2 + '-' + clusterId1 185 colocatedGenes[colocatedStr] += 1 186 187 colocated = [] 188 for colocatedStr, count in colocatedGenes.iteritems(): 189 if float(count)/len(geneDistTable) > genomeThreshold: 190 colocated.append(colocatedStr) 191 192 return colocated 193 194 def colocatedSets(self, colocatedGenes, markerGenes): 195 # run through co-located genes once creating initial sets 196 sets = [] 197 for cg in colocatedGenes: 198 geneA, geneB = cg.split('-') 199 sets.append(set([geneA, geneB])) 200 201 # combine any sets with overlapping genes 202 bProcessed = [False]*len(sets) 203 finalSets = [] 204 for i in xrange(0, len(sets)): 205 if bProcessed[i]: 206 continue 207 208 curSet = sets[i] 209 bProcessed[i] = True 210 211 bUpdated = True 212 while bUpdated: 213 bUpdated = False 214 for j in xrange(i+1, len(sets)): 215 if bProcessed[j]: 216 continue 217 218 if len(curSet.intersection(sets[j])) > 0: 219 curSet.update(sets[j]) 220 bProcessed[j] = True 221 bUpdated = True 222 223 finalSets.append(curSet) 224 225 # add all singletons into colocated sets 226 for clusterId in markerGenes: 227 bFound = False 228 for cs in finalSets: 229 if clusterId in cs: 230 bFound = True 231 232 if not bFound: 233 finalSets.append(set([clusterId])) 234 235 return finalSets 236 237 def genomeCheck(self, colocatedSet, genomeId, countTable): 238 comp = 0.0 239 cont = 0.0 240 missingMarkers = set() 241 duplicateMarkers = set() 242 243 if len(colocatedSet) == 0: 244 return comp, cont, missingMarkers, duplicateMarkers 245 246 for cs in colocatedSet: 247 present = 0 248 multiCopy = 0 249 for contigId in cs: 250 count = countTable[contigId].get(genomeId, 0) 251 if count == 1: 252 present += 1 253 elif count > 1: 254 present += 1 255 multiCopy += (count-1) 256 duplicateMarkers.add(contigId) 257 elif count == 0: 258 missingMarkers.add(contigId) 259 260 comp += float(present) / len(cs) 261 cont += float(multiCopy) / len(cs) 262 263 return comp / len(colocatedSet), cont / len(colocatedSet), missingMarkers, duplicateMarkers 264 265 def uniformity(self, genomeSize, pts): 266 U = float(genomeSize) / (len(pts)+1) # distance between perfectly evenly spaced points 267 268 # calculate distance between adjacent points 269 dists = [] 270 pts = sorted(pts) 271 for i in xrange(0, len(pts)-1): 272 dists.append(pts[i+1] - pts[i]) 273 274 # calculate uniformity index 275 num = 0 276 den = 0 277 for d in dists: 278 num += abs(d - U) 279 den += max(d, U) 280 281 return 1.0 - num/den 282 283 def sampleGenome(self, genomeLen, percentComp, percentCont, contigLen): 284 """Sample a genome to simulate a given percent completion and contamination.""" 285 286 contigsInGenome = genomeLen / contigLen 287 288 # determine number of contigs to achieve desired completeness and contamination 289 contigsToSampleComp = int(contigsInGenome*percentComp + 0.5) 290 contigsToSampleCont = int(contigsInGenome*percentCont + 0.5) 291 292 # randomly sample contigs with contamination done via sampling with replacement 293 compContigs = random.sample(xrange(contigsInGenome), contigsToSampleComp) 294 contContigs = choice(xrange(contigsInGenome), contigsToSampleCont, replace=True) 295 296 # determine start of each contig 297 contigStarts = [c*contigLen for c in compContigs] 298 contigStarts += [c*contigLen for c in contContigs] 299 300 contigStarts.sort() 301 302 trueComp = float(contigsToSampleComp)*contigLen*100 / genomeLen 303 trueCont = float(contigsToSampleCont)*contigLen*100 / genomeLen 304 305 return trueComp, trueCont, contigStarts 306 307 def sampleGenomeScaffoldsInvLength(self, targetPer, seqLens, genomeSize): 308 """Sample genome comprised of several sequences with probability inversely proportional to length.""" 309 310 # calculate probability of sampling a sequences 311 seqProb = [] 312 for _, seqLen in seqLens.iteritems(): 313 prob = 1.0 / (float(seqLen) / genomeSize) 314 seqProb.append(prob) 315 316 seqProb = array(seqProb) 317 seqProb /= sum(seqProb) 318 319 # select sequence with probability proportional to length 320 selectedSeqsIds = choice(seqLens.keys(), size = len(seqLens), replace=False, p = seqProb) 321 322 sampledSeqIds = [] 323 truePer = 0.0 324 for seqId in selectedSeqsIds: 325 sampledSeqIds.append(seqId) 326 truePer += float(seqLens[seqId]) / genomeSize 327 328 if truePer >= targetPer: 329 break 330 331 return sampledSeqIds, truePer*100 332 333 def sampleGenomeScaffoldsWithoutReplacement(self, targetPer, seqLens, genomeSize): 334 """Sample genome comprised of several sequences without replacement. 335 336 Sampling is conducted randomly until the selected sequences comprise 337 greater than or equal to the desired target percentage. 338 """ 339 340 selectedSeqsIds = choice(seqLens.keys(), size = len(seqLens), replace=False) 341 342 sampledSeqIds = [] 343 truePer = 0.0 344 for seqId in selectedSeqsIds: 345 sampledSeqIds.append(seqId) 346 truePer += float(seqLens[seqId]) / genomeSize 347 348 if truePer >= targetPer: 349 break 350 351 return sampledSeqIds, truePer*100 352 353 def containedMarkerGenes(self, markerGenes, clusterIdToGenomePositions, startPartialGenomeContigs, contigLen): 354 """Determine markers contained in a set of contigs.""" 355 356 contained = {} 357 for markerGene in markerGenes: 358 positions = clusterIdToGenomePositions.get(markerGene, []) 359 360 containedPos = [] 361 for p in positions: 362 for s in startPartialGenomeContigs: 363 if (p[0] - s) >= 0 and (p[0] - s) < contigLen: 364 containedPos.append(s) 365 366 if len(containedPos) > 0: 367 contained[markerGene] = containedPos 368 369 return contained 370 371 def markerGenesOnScaffolds(self, markerGenes, genomeId, scaffoldIds, containedMarkerGenes): 372 """Determine if marker genes are found on the scaffolds of a given genome.""" 373 for markerGeneId in markerGenes: 374 scaffoldIdsWithMarker = self.img.cachedGenomeFamilyScaffolds[genomeId].get(markerGeneId, []) 375 376 for scaffoldId in scaffoldIdsWithMarker: 377 if scaffoldId in scaffoldIds: 378 containedMarkerGenes[markerGeneId] += [scaffoldId] 379 380 def readDuplicateSeqs(self): 381 """Parse file indicating duplicate sequence alignments.""" 382 duplicateSeqs = {} 383 for line in open(os.path.join('/srv/whitlam/bio/db/checkm/genome_tree', 'genome_tree.derep.txt')): 384 lineSplit = line.rstrip().split() 385 if len(lineSplit) > 1: 386 duplicateSeqs[lineSplit[0]] = lineSplit[1:] 387 388 return duplicateSeqs 389 390 def __readNodeMetadata(self): 391 """Read metadata for internal nodes.""" 392 393 uniqueIdToLineageStatistics = {} 394 metadataFile = os.path.join('/srv/whitlam/bio/db/checkm/genome_tree', 'genome_tree.metadata.tsv') 395 with open(metadataFile) as f: 396 f.readline() 397 for line in f: 398 lineSplit = line.rstrip().split('\t') 399 400 uniqueId = lineSplit[0] 401 402 d = {} 403 d['# genomes'] = int(lineSplit[1]) 404 d['taxonomy'] = lineSplit[2] 405 try: 406 d['bootstrap'] = float(lineSplit[3]) 407 except: 408 d['bootstrap'] = 'NA' 409 d['gc mean'] = float(lineSplit[4]) 410 d['gc std'] = float(lineSplit[5]) 411 d['genome size mean'] = float(lineSplit[6])/1e6 412 d['genome size std'] = float(lineSplit[7])/1e6 413 d['gene count mean'] = float(lineSplit[8]) 414 d['gene count std'] = float(lineSplit[9]) 415 d['marker set'] = lineSplit[10].rstrip() 416 417 uniqueIdToLineageStatistics[uniqueId] = d 418 419 return uniqueIdToLineageStatistics 420 421 def __getNextNamedNode(self, node, uniqueIdToLineageStatistics): 422 """Get first parent node with taxonomy information.""" 423 parentNode = node.parent_node 424 while True: 425 if parentNode == None: 426 break # reached the root node so terminate 427 428 if parentNode.label: 429 trustedUniqueId = parentNode.label.split('|')[0] 430 trustedStats = uniqueIdToLineageStatistics[trustedUniqueId] 431 if trustedStats['taxonomy'] != '': 432 return trustedStats['taxonomy'] 433 434 parentNode = parentNode.parent_node 435 436 return 'root' 437 438 def __refineMarkerSet(self, markerSet, lineageSpecificMarkerSet): 439 """Refine marker set to account for lineage-specific gene loss and duplication.""" 440 441 # refine marker set by finding the intersection between these two sets, 442 # this removes markers that are not single-copy or ubiquitous in the 443 # specific lineage of a bin 444 # Note: co-localization information is taken from the trusted set 445 446 # remove genes not present in the lineage-specific gene set 447 finalMarkerSet = [] 448 for ms in markerSet.markerSet: 449 s = set() 450 for gene in ms: 451 if gene in lineageSpecificMarkerSet.getMarkerGenes(): 452 s.add(gene) 453 454 if s: 455 finalMarkerSet.append(s) 456 457 refinedMarkerSet = MarkerSet(markerSet.UID, markerSet.lineageStr, markerSet.numGenomes, finalMarkerSet) 458 459 return refinedMarkerSet 460 461 def ____removeInvalidLineageMarkerGenes(self, markerSet, lineageSpecificMarkersToRemove): 462 """Refine marker set to account for lineage-specific gene loss and duplication.""" 463 464 # refine marker set by removing marker genes subject to lineage-specific 465 # gene loss and duplication 466 # 467 # Note: co-localization information is taken from the trusted set 468 469 finalMarkerSet = [] 470 for ms in markerSet.markerSet: 471 s = set() 472 for gene in ms: 473 if gene.startswith('PF'): 474 print 'ERROR! Expected genes to start with pfam, not PF.' 475 476 if gene not in lineageSpecificMarkersToRemove: 477 s.add(gene) 478 479 if s: 480 finalMarkerSet.append(s) 481 482 refinedMarkerSet = MarkerSet(markerSet.UID, markerSet.lineageStr, markerSet.numGenomes, finalMarkerSet) 483 484 return refinedMarkerSet 485 486 def missingGenes(self, genomeIds, markerGenes, ubiquityThreshold): 487 """Inferring consistently missing marker genes within a set of genomes.""" 488 489 if self.cachedGeneCountTable != None: 490 geneCountTable = self.cachedGeneCountTable 491 else: 492 geneCountTable = self.img.geneCountTable(genomeIds) 493 494 # find genes meeting ubiquity and single-copy thresholds 495 missing = set() 496 for clusterId, genomeCounts in geneCountTable.iteritems(): 497 if clusterId not in markerGenes: 498 continue 499 500 absence = 0 501 for genomeId in genomeIds: 502 count = genomeCounts.get(genomeId, 0) 503 504 if count == 0: 505 absence += 1 506 507 if absence >= ubiquityThreshold*len(genomeIds): 508 missing.add(clusterId) 509 510 return missing 511 512 def duplicateGenes(self, genomeIds, markerGenes, ubiquityThreshold): 513 """Inferring consistently duplicated marker genes within a set of genomes.""" 514 515 if self.cachedGeneCountTable != None: 516 geneCountTable = self.cachedGeneCountTable 517 else: 518 geneCountTable = self.img.geneCountTable(genomeIds) 519 520 # find genes meeting ubiquity and single-copy thresholds 521 duplicate = set() 522 for clusterId, genomeCounts in geneCountTable.iteritems(): 523 if clusterId not in markerGenes: 524 continue 525 526 duplicateCount = 0 527 for genomeId in genomeIds: 528 count = genomeCounts.get(genomeId, 0) 529 530 if count > 1: 531 duplicateCount += 1 532 533 if duplicateCount >= ubiquityThreshold*len(genomeIds): 534 duplicate.add(clusterId) 535 536 return duplicate 537 538 def buildMarkerGenes(self, genomeIds, ubiquityThreshold, singleCopyThreshold): 539 """Infer marker genes from specified genomes.""" 540 541 if self.cachedGeneCountTable != None: 542 geneCountTable = self.cachedGeneCountTable 543 else: 544 geneCountTable = self.img.geneCountTable(genomeIds) 545 546 #counts = [] 547 #singleCopy = 0 548 #for genomeId, count in geneCountTable['pfam01351'].iteritems(): 549 # print genomeId, count 550 # counts.append(count) 551 # if count == 1: 552 # singleCopy += 1 553 554 #print 'Ubiquity: %d of %d' % (len(counts), len(genomeIds)) 555 #print 'Single-copy: %d of %d' % (singleCopy, len(genomeIds)) 556 #print 'Mean: %.2f' % mean(counts) 557 558 markerGenes = self.markerGenes(genomeIds, geneCountTable, ubiquityThreshold*len(genomeIds), singleCopyThreshold*len(genomeIds)) 559 tigrToRemove = self.img.identifyRedundantTIGRFAMs(markerGenes) 560 markerGenes = markerGenes - tigrToRemove 561 562 return markerGenes 563 564 def buildMarkerSet(self, genomeIds, ubiquityThreshold, singleCopyThreshold, spacingBetweenContigs = 5000): 565 """Infer marker set from specified genomes.""" 566 567 markerGenes = self.buildMarkerGenes(genomeIds, ubiquityThreshold, singleCopyThreshold) 568 569 geneDistTable = self.img.geneDistTable(genomeIds, markerGenes, spacingBetweenContigs) 570 colocatedGenes = self.colocatedGenes(geneDistTable) 571 colocatedSets = self.colocatedSets(colocatedGenes, markerGenes) 572 markerSet = MarkerSet(0, 'NA', len(genomeIds), colocatedSets) 573 574 return markerSet 575 576 def readLineageSpecificGenesToRemove(self): 577 """Get set of genes subject to lineage-specific gene loss and duplication.""" 578 579 self.lineageSpecificGenesToRemove = {} 580 for line in open('/srv/whitlam/bio/db/checkm/genome_tree/missing_duplicate_genes_50.tsv'): 581 lineSplit = line.split('\t') 582 uid = lineSplit[0] 583 missingGenes = eval(lineSplit[1]) 584 duplicateGenes = eval(lineSplit[2]) 585 self.lineageSpecificGenesToRemove[uid] = missingGenes.union(duplicateGenes) 586 587 def buildBinMarkerSet(self, tree, curNode, ubiquityThreshold, singleCopyThreshold, bMarkerSet = True, genomeIdsToRemove = None): 588 """Build lineage-specific marker sets for a genome in a LOO-fashion.""" 589 590 # determine marker sets for bin 591 binMarkerSets = BinMarkerSets(curNode.label, BinMarkerSets.TREE_MARKER_SET) 592 refinedBinMarkerSet = BinMarkerSets(curNode.label, BinMarkerSets.TREE_MARKER_SET) 593 594 # ascend tree to root, recording all marker sets 595 uniqueId = curNode.label.split('|')[0] 596 lineageSpecificRefinement = self.lineageSpecificGenesToRemove[uniqueId] 597 598 while curNode != None: 599 uniqueId = curNode.label.split('|')[0] 600 stats = self.uniqueIdToLineageStatistics[uniqueId] 601 taxonomyStr = stats['taxonomy'] 602 if taxonomyStr == '': 603 taxonomyStr = self.__getNextNamedNode(curNode, self.uniqueIdToLineageStatistics) 604 605 leafNodes = curNode.leaf_nodes() 606 genomeIds = set() 607 for leaf in leafNodes: 608 genomeIds.add(leaf.taxon.label.replace('IMG_', '')) 609 610 duplicateGenomes = self.duplicateSeqs.get(leaf.taxon.label, []) 611 for dup in duplicateGenomes: 612 genomeIds.add(dup.replace('IMG_', '')) 613 614 # remove all genomes from the same taxonomic group as the genome of interest 615 if genomeIdsToRemove != None: 616 genomeIds.difference_update(genomeIdsToRemove) 617 618 if len(genomeIds) >= 2: 619 if bMarkerSet: 620 markerSet = self.buildMarkerSet(genomeIds, ubiquityThreshold, singleCopyThreshold) 621 else: 622 markerSet = MarkerSet(0, 'NA', len(genomeIds), [self.buildMarkerGenes(genomeIds, ubiquityThreshold, singleCopyThreshold)]) 623 624 markerSet.lineageStr = uniqueId + ' | ' + taxonomyStr.split(';')[-1] 625 binMarkerSets.addMarkerSet(markerSet) 626 627 #refinedMarkerSet = self.__refineMarkerSet(markerSet, lineageSpecificMarkerSet) 628 refinedMarkerSet = self.____removeInvalidLineageMarkerGenes(markerSet, lineageSpecificRefinement) 629 #print 'Refinement: %d of %d' % (len(refinedMarkerSet.getMarkerGenes()), len(markerSet.getMarkerGenes())) 630 refinedBinMarkerSet.addMarkerSet(refinedMarkerSet) 631 632 curNode = curNode.parent_node 633 634 return binMarkerSets, refinedBinMarkerSet 635 636 def buildDomainMarkerSet(self, tree, curNode, ubiquityThreshold, singleCopyThreshold, bMarkerSet = True, genomeIdsToRemove = None): 637 """Build domain-specific marker sets for a genome in a LOO-fashion.""" 638 639 # determine marker sets for bin 640 binMarkerSets = BinMarkerSets(curNode.label, BinMarkerSets.TREE_MARKER_SET) 641 refinedBinMarkerSet = BinMarkerSets(curNode.label, BinMarkerSets.TREE_MARKER_SET) 642 643 # calculate marker set for bacterial or archaeal node 644 uniqueId = curNode.label.split('|')[0] 645 lineageSpecificRefinement = self.lineageSpecificGenesToRemove[uniqueId] 646 647 while curNode != None: 648 uniqueId = curNode.label.split('|')[0] 649 if uniqueId != 'UID2' and uniqueId != 'UID203': 650 curNode = curNode.parent_node 651 continue 652 653 stats = self.uniqueIdToLineageStatistics[uniqueId] 654 taxonomyStr = stats['taxonomy'] 655 if taxonomyStr == '': 656 taxonomyStr = self.__getNextNamedNode(curNode, self.uniqueIdToLineageStatistics) 657 658 leafNodes = curNode.leaf_nodes() 659 genomeIds = set() 660 for leaf in leafNodes: 661 genomeIds.add(leaf.taxon.label.replace('IMG_', '')) 662 663 duplicateGenomes = self.duplicateSeqs.get(leaf.taxon.label, []) 664 for dup in duplicateGenomes: 665 genomeIds.add(dup.replace('IMG_', '')) 666 667 # remove all genomes from the same taxonomic group as the genome of interest 668 if genomeIdsToRemove != None: 669 genomeIds.difference_update(genomeIdsToRemove) 670 671 if len(genomeIds) >= 2: 672 if bMarkerSet: 673 markerSet = self.buildMarkerSet(genomeIds, ubiquityThreshold, singleCopyThreshold) 674 else: 675 markerSet = MarkerSet(0, 'NA', len(genomeIds), [self.buildMarkerGenes(genomeIds, ubiquityThreshold, singleCopyThreshold)]) 676 677 markerSet.lineageStr = uniqueId + ' | ' + taxonomyStr.split(';')[-1] 678 binMarkerSets.addMarkerSet(markerSet) 679 680 #refinedMarkerSet = self.__refineMarkerSet(markerSet, lineageSpecificMarkerSet) 681 refinedMarkerSet = self.____removeInvalidLineageMarkerGenes(markerSet, lineageSpecificRefinement) 682 #print 'Refinement: %d of %d' % (len(refinedMarkerSet.getMarkerGenes()), len(markerSet.getMarkerGenes())) 683 refinedBinMarkerSet.addMarkerSet(refinedMarkerSet) 684 685 curNode = curNode.parent_node 686 687 return binMarkerSets, refinedBinMarkerSet