1#!/usr/bin/env python
2
3###############################################################################
4#                                                                             #
5#    This program is free software: you can redistribute it and/or modify     #
6#    it under the terms of the GNU General Public License as published by     #
7#    the Free Software Foundation, either version 3 of the License, or        #
8#    (at your option) any later version.                                      #
9#                                                                             #
10#    This program is distributed in the hope that it will be useful,          #
11#    but WITHOUT ANY WARRANTY; without even the implied warranty of           #
12#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            #
13#    GNU General Public License for more details.                             #
14#                                                                             #
15#    You should have received a copy of the GNU General Public License        #
16#    along with this program. If not, see <http://www.gnu.org/licenses/>.     #
17#                                                                             #
18###############################################################################
19
20"""
21Methods for identifying and investigating marker sets.
22"""
23
24__author__ = 'Donovan Parks'
25__copyright__ = 'Copyright 2013'
26__credits__ = ['Donovan Parks']
27__license__ = 'GPL3'
28__version__ = '1.0.0'
29__maintainer__ = 'Donovan Parks'
30__email__ = 'donovan.parks@gmail.com'
31__status__ = 'Development'
32
33import os
34import sys
35import random
36from collections import defaultdict
37from checkm.markerSets import BinMarkerSets, MarkerSet
38
39from checkm.util.img import IMG
40
41from numpy import array
42from numpy.random import choice
43
44class MarkerSetBuilder(object):
45    def __init__(self):
46        self.img = IMG('/srv/whitlam/bio/db/checkm/img/img_metadata.tsv', '/srv/whitlam/bio/db/checkm/pfam/tigrfam2pfam.tsv')
47        self.colocatedFile = './data/colocated.tsv'
48        self.duplicateSeqs = self.readDuplicateSeqs()
49        self.uniqueIdToLineageStatistics = self.__readNodeMetadata()
50
51        self.cachedGeneCountTable = None
52
53    def precomputeGenomeSeqLens(self, genomeIds):
54        """Cache the length of contigs/scaffolds for all genomes."""
55
56        # This function is intended to speed up functions, such as img.geneDistTable(),
57        # that are called multiple times (typically during simulations)
58        self.img.precomputeGenomeSeqLens(genomeIds)
59
60    def precomputeGenomeFamilyPositions(self, genomeIds, spacingBetweenContigs):
61        """Cache position of PFAM and TIGRFAM genes in genomes."""
62
63        # This function is intended to speed up functions, such as img.geneDistTable(),
64        # that are called multiple times (typically during simulations)
65        self.img.precomputeGenomeFamilyPositions(genomeIds, spacingBetweenContigs)
66
67    def precomputeGenomeFamilyScaffolds(self, genomeIds):
68        """Cache scaffolds of PFAM and TIGRFAM genes in genomes."""
69
70        # This function is intended to speed up functions, such as img.geneDistTable(),
71        # that are called multiple times (typically during simulations)
72        self.img.precomputeGenomeFamilyScaffolds(genomeIds)
73
74    def getLineageMarkerGenes(self, lineage, minGenomes = 20, minMarkerSets = 20):
75        pfamIds = set()
76        tigrIds = set()
77
78        bHeader = True
79        for line in open(self.colocatedFile):
80            if bHeader:
81                bHeader = False
82                continue
83
84            lineSplit = line.split('\t')
85            curLineage = lineSplit[0]
86            numGenomes = int(lineSplit[1])
87            numMarkerSets = int(lineSplit[3])
88            markerSets = lineSplit[4:]
89
90            if curLineage != lineage or numGenomes < minGenomes or numMarkerSets < minMarkerSets:
91                continue
92
93            for ms in markerSets:
94                markers = ms.split(',')
95                for m in markers:
96                    if 'pfam' in m:
97                        pfamIds.add(m.strip())
98                    elif 'TIGR' in m:
99                        tigrIds.add(m.strip())
100
101        return pfamIds, tigrIds
102
103    def getCalculatedMarkerGenes(self, minGenomes = 20, minMarkerSets = 20):
104        pfamIds = set()
105        tigrIds = set()
106
107        bHeader = True
108        for line in open(self.colocatedFile):
109            if bHeader:
110                bHeader = False
111                continue
112
113            lineSplit = line.split('\t')
114            numGenomes = int(lineSplit[1])
115            numMarkerSets = int(lineSplit[3])
116            markerSets = lineSplit[4:]
117
118            if numGenomes < minGenomes or numMarkerSets < minMarkerSets:
119                continue
120
121            for ms in markerSets:
122                markers = ms.split(',')
123                for m in markers:
124                    if 'pfam' in m:
125                        pfamIds.add(m.strip())
126                    elif 'TIGR' in m:
127                        tigrIds.add(m.strip())
128
129        return pfamIds, tigrIds
130
131    def markerGenes(self, genomeIds, countTable, ubiquityThreshold, singleCopyThreshold):
132        if ubiquityThreshold < 1 or singleCopyThreshold < 1:
133            print '[Warning] Looks like degenerate threshold.'
134
135        # find genes meeting ubiquity and single-copy thresholds
136        markers = set()
137        for clusterId, genomeCounts in countTable.iteritems():
138            ubiquity = 0
139            singleCopy = 0
140
141            if len(genomeCounts) < ubiquityThreshold:
142                # gene is clearly not ubiquitous
143                continue
144
145            for genomeId in genomeIds:
146                count = genomeCounts.get(genomeId, 0)
147
148                if count > 0:
149                    ubiquity += 1
150
151                if count == 1:
152                    singleCopy += 1
153
154            if ubiquity >= ubiquityThreshold and singleCopy >= singleCopyThreshold:
155                markers.add(clusterId)
156
157        return markers
158
159    def colocatedGenes(self, geneDistTable, distThreshold = 5000, genomeThreshold = 0.95):
160        """Identify co-located gene pairs."""
161
162        colocatedGenes = defaultdict(int)
163        for _, clusterIdToGeneLocs in geneDistTable.iteritems():
164            clusterIds = clusterIdToGeneLocs.keys()
165            for i, clusterId1 in enumerate(clusterIds):
166                geneLocations1 = clusterIdToGeneLocs[clusterId1]
167
168                for clusterId2 in clusterIds[i+1:]:
169                    geneLocations2 = clusterIdToGeneLocs[clusterId2]
170                    bColocated = False
171                    for p1 in geneLocations1:
172                        for p2 in geneLocations2:
173                            if abs(p1[0] - p2[0]) < distThreshold:
174                                bColocated = True
175                                break
176
177                        if bColocated:
178                            break
179
180                    if bColocated:
181                        if clusterId1 <= clusterId2:
182                            colocatedStr = clusterId1 + '-' + clusterId2
183                        else:
184                            colocatedStr = clusterId2 + '-' + clusterId1
185                        colocatedGenes[colocatedStr] += 1
186
187        colocated = []
188        for colocatedStr, count in colocatedGenes.iteritems():
189            if float(count)/len(geneDistTable) > genomeThreshold:
190                colocated.append(colocatedStr)
191
192        return colocated
193
194    def colocatedSets(self, colocatedGenes, markerGenes):
195        # run through co-located genes once creating initial sets
196        sets = []
197        for cg in colocatedGenes:
198            geneA, geneB = cg.split('-')
199            sets.append(set([geneA, geneB]))
200
201        # combine any sets with overlapping genes
202        bProcessed = [False]*len(sets)
203        finalSets = []
204        for i in xrange(0, len(sets)):
205            if bProcessed[i]:
206                continue
207
208            curSet = sets[i]
209            bProcessed[i] = True
210
211            bUpdated = True
212            while bUpdated:
213                bUpdated = False
214                for j in xrange(i+1, len(sets)):
215                    if bProcessed[j]:
216                        continue
217
218                    if len(curSet.intersection(sets[j])) > 0:
219                        curSet.update(sets[j])
220                        bProcessed[j] = True
221                        bUpdated = True
222
223            finalSets.append(curSet)
224
225        # add all singletons into colocated sets
226        for clusterId in markerGenes:
227            bFound = False
228            for cs in finalSets:
229                if clusterId in cs:
230                    bFound = True
231
232            if not bFound:
233                finalSets.append(set([clusterId]))
234
235        return finalSets
236
237    def genomeCheck(self, colocatedSet, genomeId, countTable):
238        comp = 0.0
239        cont = 0.0
240        missingMarkers = set()
241        duplicateMarkers = set()
242
243        if len(colocatedSet) == 0:
244            return comp, cont, missingMarkers, duplicateMarkers
245
246        for cs in colocatedSet:
247            present = 0
248            multiCopy = 0
249            for contigId in cs:
250                count = countTable[contigId].get(genomeId, 0)
251                if count == 1:
252                    present += 1
253                elif count > 1:
254                    present += 1
255                    multiCopy += (count-1)
256                    duplicateMarkers.add(contigId)
257                elif count == 0:
258                    missingMarkers.add(contigId)
259
260            comp += float(present) / len(cs)
261            cont += float(multiCopy) / len(cs)
262
263        return comp / len(colocatedSet), cont / len(colocatedSet), missingMarkers, duplicateMarkers
264
265    def uniformity(self, genomeSize, pts):
266        U = float(genomeSize) / (len(pts)+1)  # distance between perfectly evenly spaced points
267
268        # calculate distance between adjacent points
269        dists = []
270        pts = sorted(pts)
271        for i in xrange(0, len(pts)-1):
272            dists.append(pts[i+1] - pts[i])
273
274        # calculate uniformity index
275        num = 0
276        den = 0
277        for d in dists:
278            num += abs(d - U)
279            den += max(d, U)
280
281        return 1.0 - num/den
282
283    def sampleGenome(self, genomeLen, percentComp, percentCont, contigLen):
284        """Sample a genome to simulate a given percent completion and contamination."""
285
286        contigsInGenome = genomeLen / contigLen
287
288        # determine number of contigs to achieve desired completeness and contamination
289        contigsToSampleComp = int(contigsInGenome*percentComp + 0.5)
290        contigsToSampleCont = int(contigsInGenome*percentCont + 0.5)
291
292        # randomly sample contigs with contamination done via sampling with replacement
293        compContigs = random.sample(xrange(contigsInGenome), contigsToSampleComp)
294        contContigs = choice(xrange(contigsInGenome), contigsToSampleCont, replace=True)
295
296        # determine start of each contig
297        contigStarts = [c*contigLen for c in compContigs]
298        contigStarts += [c*contigLen for c in contContigs]
299
300        contigStarts.sort()
301
302        trueComp = float(contigsToSampleComp)*contigLen*100 / genomeLen
303        trueCont = float(contigsToSampleCont)*contigLen*100 / genomeLen
304
305        return trueComp, trueCont, contigStarts
306
307    def sampleGenomeScaffoldsInvLength(self, targetPer, seqLens, genomeSize):
308        """Sample genome comprised of several sequences with probability inversely proportional to length."""
309
310        # calculate probability of sampling a sequences
311        seqProb = []
312        for _, seqLen in seqLens.iteritems():
313            prob = 1.0 / (float(seqLen) / genomeSize)
314            seqProb.append(prob)
315
316        seqProb = array(seqProb)
317        seqProb /= sum(seqProb)
318
319        # select sequence with probability proportional to length
320        selectedSeqsIds = choice(seqLens.keys(), size = len(seqLens), replace=False, p = seqProb)
321
322        sampledSeqIds = []
323        truePer = 0.0
324        for seqId in selectedSeqsIds:
325            sampledSeqIds.append(seqId)
326            truePer += float(seqLens[seqId]) / genomeSize
327
328            if truePer >= targetPer:
329                break
330
331        return sampledSeqIds, truePer*100
332
333    def sampleGenomeScaffoldsWithoutReplacement(self, targetPer, seqLens, genomeSize):
334        """Sample genome comprised of several sequences without replacement.
335
336          Sampling is conducted randomly until the selected sequences comprise
337          greater than or equal to the desired target percentage.
338        """
339
340        selectedSeqsIds = choice(seqLens.keys(), size = len(seqLens), replace=False)
341
342        sampledSeqIds = []
343        truePer = 0.0
344        for seqId in selectedSeqsIds:
345            sampledSeqIds.append(seqId)
346            truePer += float(seqLens[seqId]) / genomeSize
347
348            if truePer >= targetPer:
349                break
350
351        return sampledSeqIds, truePer*100
352
353    def containedMarkerGenes(self, markerGenes, clusterIdToGenomePositions, startPartialGenomeContigs, contigLen):
354        """Determine markers contained in a set of contigs."""
355
356        contained = {}
357        for markerGene in markerGenes:
358            positions = clusterIdToGenomePositions.get(markerGene, [])
359
360            containedPos = []
361            for p in positions:
362                for s in startPartialGenomeContigs:
363                    if (p[0] - s) >= 0 and (p[0] - s) < contigLen:
364                        containedPos.append(s)
365
366            if len(containedPos) > 0:
367                contained[markerGene] = containedPos
368
369        return contained
370
371    def markerGenesOnScaffolds(self, markerGenes, genomeId, scaffoldIds, containedMarkerGenes):
372        """Determine if marker genes are found on the scaffolds of a given genome."""
373        for markerGeneId in markerGenes:
374            scaffoldIdsWithMarker = self.img.cachedGenomeFamilyScaffolds[genomeId].get(markerGeneId, [])
375
376            for scaffoldId in scaffoldIdsWithMarker:
377                if scaffoldId in scaffoldIds:
378                    containedMarkerGenes[markerGeneId] += [scaffoldId]
379
380    def readDuplicateSeqs(self):
381        """Parse file indicating duplicate sequence alignments."""
382        duplicateSeqs = {}
383        for line in open(os.path.join('/srv/whitlam/bio/db/checkm/genome_tree', 'genome_tree.derep.txt')):
384            lineSplit = line.rstrip().split()
385            if len(lineSplit) > 1:
386                duplicateSeqs[lineSplit[0]] = lineSplit[1:]
387
388        return duplicateSeqs
389
390    def __readNodeMetadata(self):
391        """Read metadata for internal nodes."""
392
393        uniqueIdToLineageStatistics = {}
394        metadataFile = os.path.join('/srv/whitlam/bio/db/checkm/genome_tree', 'genome_tree.metadata.tsv')
395        with open(metadataFile) as f:
396            f.readline()
397            for line in f:
398                lineSplit = line.rstrip().split('\t')
399
400                uniqueId = lineSplit[0]
401
402                d = {}
403                d['# genomes'] = int(lineSplit[1])
404                d['taxonomy'] = lineSplit[2]
405                try:
406                    d['bootstrap'] = float(lineSplit[3])
407                except:
408                    d['bootstrap'] = 'NA'
409                d['gc mean'] = float(lineSplit[4])
410                d['gc std'] = float(lineSplit[5])
411                d['genome size mean'] = float(lineSplit[6])/1e6
412                d['genome size std'] = float(lineSplit[7])/1e6
413                d['gene count mean'] = float(lineSplit[8])
414                d['gene count std'] = float(lineSplit[9])
415                d['marker set'] = lineSplit[10].rstrip()
416
417                uniqueIdToLineageStatistics[uniqueId] = d
418
419        return uniqueIdToLineageStatistics
420
421    def __getNextNamedNode(self, node, uniqueIdToLineageStatistics):
422        """Get first parent node with taxonomy information."""
423        parentNode = node.parent_node
424        while True:
425            if parentNode == None:
426                break # reached the root node so terminate
427
428            if parentNode.label:
429                trustedUniqueId = parentNode.label.split('|')[0]
430                trustedStats = uniqueIdToLineageStatistics[trustedUniqueId]
431                if trustedStats['taxonomy'] != '':
432                    return trustedStats['taxonomy']
433
434            parentNode = parentNode.parent_node
435
436        return 'root'
437
438    def __refineMarkerSet(self, markerSet, lineageSpecificMarkerSet):
439        """Refine marker set to account for lineage-specific gene loss and duplication."""
440
441        # refine marker set by finding the intersection between these two sets,
442        # this removes markers that are not single-copy or ubiquitous in the
443        # specific lineage of a bin
444        # Note: co-localization information is taken from the trusted set
445
446        # remove genes not present in the lineage-specific gene set
447        finalMarkerSet = []
448        for ms in markerSet.markerSet:
449            s = set()
450            for gene in ms:
451                if gene in lineageSpecificMarkerSet.getMarkerGenes():
452                    s.add(gene)
453
454            if s:
455                finalMarkerSet.append(s)
456
457        refinedMarkerSet = MarkerSet(markerSet.UID, markerSet.lineageStr, markerSet.numGenomes, finalMarkerSet)
458
459        return refinedMarkerSet
460
461    def ____removeInvalidLineageMarkerGenes(self, markerSet, lineageSpecificMarkersToRemove):
462        """Refine marker set to account for lineage-specific gene loss and duplication."""
463
464        # refine marker set by removing marker genes subject to lineage-specific
465        # gene loss and duplication
466        #
467        # Note: co-localization information is taken from the trusted set
468
469        finalMarkerSet = []
470        for ms in markerSet.markerSet:
471            s = set()
472            for gene in ms:
473                if gene.startswith('PF'):
474                    print 'ERROR! Expected genes to start with pfam, not PF.'
475
476                if gene not in lineageSpecificMarkersToRemove:
477                    s.add(gene)
478
479            if s:
480                finalMarkerSet.append(s)
481
482        refinedMarkerSet = MarkerSet(markerSet.UID, markerSet.lineageStr, markerSet.numGenomes, finalMarkerSet)
483
484        return refinedMarkerSet
485
486    def missingGenes(self, genomeIds, markerGenes, ubiquityThreshold):
487        """Inferring consistently missing marker genes within a set of genomes."""
488
489        if self.cachedGeneCountTable != None:
490            geneCountTable = self.cachedGeneCountTable
491        else:
492            geneCountTable = self.img.geneCountTable(genomeIds)
493
494        # find genes meeting ubiquity and single-copy thresholds
495        missing = set()
496        for clusterId, genomeCounts in geneCountTable.iteritems():
497            if clusterId not in markerGenes:
498                continue
499
500            absence = 0
501            for genomeId in genomeIds:
502                count = genomeCounts.get(genomeId, 0)
503
504                if count == 0:
505                    absence += 1
506
507            if absence >= ubiquityThreshold*len(genomeIds):
508                missing.add(clusterId)
509
510        return missing
511
512    def duplicateGenes(self, genomeIds, markerGenes, ubiquityThreshold):
513        """Inferring consistently duplicated marker genes within a set of genomes."""
514
515        if self.cachedGeneCountTable != None:
516            geneCountTable = self.cachedGeneCountTable
517        else:
518            geneCountTable = self.img.geneCountTable(genomeIds)
519
520        # find genes meeting ubiquity and single-copy thresholds
521        duplicate = set()
522        for clusterId, genomeCounts in geneCountTable.iteritems():
523            if clusterId not in markerGenes:
524                continue
525
526            duplicateCount = 0
527            for genomeId in genomeIds:
528                count = genomeCounts.get(genomeId, 0)
529
530                if count > 1:
531                    duplicateCount += 1
532
533            if duplicateCount >= ubiquityThreshold*len(genomeIds):
534                duplicate.add(clusterId)
535
536        return duplicate
537
538    def buildMarkerGenes(self, genomeIds, ubiquityThreshold, singleCopyThreshold):
539        """Infer marker genes from specified genomes."""
540
541        if self.cachedGeneCountTable != None:
542            geneCountTable = self.cachedGeneCountTable
543        else:
544            geneCountTable = self.img.geneCountTable(genomeIds)
545
546        #counts = []
547        #singleCopy = 0
548        #for genomeId, count in geneCountTable['pfam01351'].iteritems():
549        #    print genomeId, count
550        #    counts.append(count)
551        #    if count == 1:
552        #        singleCopy += 1
553
554        #print 'Ubiquity: %d of %d' % (len(counts), len(genomeIds))
555        #print 'Single-copy: %d of %d' % (singleCopy, len(genomeIds))
556        #print 'Mean: %.2f' % mean(counts)
557
558        markerGenes = self.markerGenes(genomeIds, geneCountTable, ubiquityThreshold*len(genomeIds), singleCopyThreshold*len(genomeIds))
559        tigrToRemove = self.img.identifyRedundantTIGRFAMs(markerGenes)
560        markerGenes = markerGenes - tigrToRemove
561
562        return markerGenes
563
564    def buildMarkerSet(self, genomeIds, ubiquityThreshold, singleCopyThreshold, spacingBetweenContigs = 5000):
565        """Infer marker set from specified genomes."""
566
567        markerGenes = self.buildMarkerGenes(genomeIds, ubiquityThreshold, singleCopyThreshold)
568
569        geneDistTable = self.img.geneDistTable(genomeIds, markerGenes, spacingBetweenContigs)
570        colocatedGenes = self.colocatedGenes(geneDistTable)
571        colocatedSets = self.colocatedSets(colocatedGenes, markerGenes)
572        markerSet = MarkerSet(0, 'NA', len(genomeIds), colocatedSets)
573
574        return markerSet
575
576    def readLineageSpecificGenesToRemove(self):
577        """Get set of genes subject to lineage-specific gene loss and duplication."""
578
579        self.lineageSpecificGenesToRemove = {}
580        for line in open('/srv/whitlam/bio/db/checkm/genome_tree/missing_duplicate_genes_50.tsv'):
581            lineSplit = line.split('\t')
582            uid = lineSplit[0]
583            missingGenes = eval(lineSplit[1])
584            duplicateGenes = eval(lineSplit[2])
585            self.lineageSpecificGenesToRemove[uid] = missingGenes.union(duplicateGenes)
586
587    def buildBinMarkerSet(self, tree, curNode, ubiquityThreshold, singleCopyThreshold, bMarkerSet = True, genomeIdsToRemove = None):
588        """Build lineage-specific marker sets for a genome in a LOO-fashion."""
589
590        # determine marker sets for bin
591        binMarkerSets = BinMarkerSets(curNode.label, BinMarkerSets.TREE_MARKER_SET)
592        refinedBinMarkerSet = BinMarkerSets(curNode.label, BinMarkerSets.TREE_MARKER_SET)
593
594        # ascend tree to root, recording all marker sets
595        uniqueId = curNode.label.split('|')[0]
596        lineageSpecificRefinement = self.lineageSpecificGenesToRemove[uniqueId]
597
598        while curNode != None:
599            uniqueId = curNode.label.split('|')[0]
600            stats = self.uniqueIdToLineageStatistics[uniqueId]
601            taxonomyStr = stats['taxonomy']
602            if taxonomyStr == '':
603                taxonomyStr = self.__getNextNamedNode(curNode, self.uniqueIdToLineageStatistics)
604
605            leafNodes = curNode.leaf_nodes()
606            genomeIds = set()
607            for leaf in leafNodes:
608                genomeIds.add(leaf.taxon.label.replace('IMG_', ''))
609
610                duplicateGenomes = self.duplicateSeqs.get(leaf.taxon.label, [])
611                for dup in duplicateGenomes:
612                    genomeIds.add(dup.replace('IMG_', ''))
613
614            # remove all genomes from the same taxonomic group as the genome of interest
615            if genomeIdsToRemove != None:
616                genomeIds.difference_update(genomeIdsToRemove)
617
618            if len(genomeIds) >= 2:
619                if bMarkerSet:
620                    markerSet = self.buildMarkerSet(genomeIds, ubiquityThreshold, singleCopyThreshold)
621                else:
622                    markerSet = MarkerSet(0, 'NA', len(genomeIds), [self.buildMarkerGenes(genomeIds, ubiquityThreshold, singleCopyThreshold)])
623
624                markerSet.lineageStr = uniqueId + ' | ' + taxonomyStr.split(';')[-1]
625                binMarkerSets.addMarkerSet(markerSet)
626
627                #refinedMarkerSet = self.__refineMarkerSet(markerSet, lineageSpecificMarkerSet)
628                refinedMarkerSet = self.____removeInvalidLineageMarkerGenes(markerSet, lineageSpecificRefinement)
629                #print 'Refinement: %d of %d' % (len(refinedMarkerSet.getMarkerGenes()), len(markerSet.getMarkerGenes()))
630                refinedBinMarkerSet.addMarkerSet(refinedMarkerSet)
631
632            curNode = curNode.parent_node
633
634        return binMarkerSets, refinedBinMarkerSet
635
636    def buildDomainMarkerSet(self, tree, curNode, ubiquityThreshold, singleCopyThreshold, bMarkerSet = True, genomeIdsToRemove = None):
637        """Build domain-specific marker sets for a genome in a LOO-fashion."""
638
639        # determine marker sets for bin
640        binMarkerSets = BinMarkerSets(curNode.label, BinMarkerSets.TREE_MARKER_SET)
641        refinedBinMarkerSet = BinMarkerSets(curNode.label, BinMarkerSets.TREE_MARKER_SET)
642
643        # calculate marker set for bacterial or archaeal node
644        uniqueId = curNode.label.split('|')[0]
645        lineageSpecificRefinement = self.lineageSpecificGenesToRemove[uniqueId]
646
647        while curNode != None:
648            uniqueId = curNode.label.split('|')[0]
649            if uniqueId != 'UID2' and uniqueId != 'UID203':
650                curNode = curNode.parent_node
651                continue
652
653            stats = self.uniqueIdToLineageStatistics[uniqueId]
654            taxonomyStr = stats['taxonomy']
655            if taxonomyStr == '':
656                taxonomyStr = self.__getNextNamedNode(curNode, self.uniqueIdToLineageStatistics)
657
658            leafNodes = curNode.leaf_nodes()
659            genomeIds = set()
660            for leaf in leafNodes:
661                genomeIds.add(leaf.taxon.label.replace('IMG_', ''))
662
663                duplicateGenomes = self.duplicateSeqs.get(leaf.taxon.label, [])
664                for dup in duplicateGenomes:
665                    genomeIds.add(dup.replace('IMG_', ''))
666
667            # remove all genomes from the same taxonomic group as the genome of interest
668            if genomeIdsToRemove != None:
669                genomeIds.difference_update(genomeIdsToRemove)
670
671            if len(genomeIds) >= 2:
672                if bMarkerSet:
673                    markerSet = self.buildMarkerSet(genomeIds, ubiquityThreshold, singleCopyThreshold)
674                else:
675                    markerSet = MarkerSet(0, 'NA', len(genomeIds), [self.buildMarkerGenes(genomeIds, ubiquityThreshold, singleCopyThreshold)])
676
677                markerSet.lineageStr = uniqueId + ' | ' + taxonomyStr.split(';')[-1]
678                binMarkerSets.addMarkerSet(markerSet)
679
680                #refinedMarkerSet = self.__refineMarkerSet(markerSet, lineageSpecificMarkerSet)
681                refinedMarkerSet = self.____removeInvalidLineageMarkerGenes(markerSet, lineageSpecificRefinement)
682                #print 'Refinement: %d of %d' % (len(refinedMarkerSet.getMarkerGenes()), len(markerSet.getMarkerGenes()))
683                refinedBinMarkerSet.addMarkerSet(refinedMarkerSet)
684
685            curNode = curNode.parent_node
686
687        return binMarkerSets, refinedBinMarkerSet