1###############################################################################
2#
3# resultsParser.py - Parse and output results.
4#
5###############################################################################
6#                                                                             #
7#    This program is free software: you can redistribute it and/or modify     #
8#    it under the terms of the GNU General Public License as published by     #
9#    the Free Software Foundation, either version 3 of the License, or        #
10#    (at your option) any later version.                                      #
11#                                                                             #
12#    This program is distributed in the hope that it will be useful,          #
13#    but WITHOUT ANY WARRANTY; without even the implied warranty of           #
14#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            #
15#    GNU General Public License for more details.                             #
16#                                                                             #
17#    You should have received a copy of the GNU General Public License        #
18#    along with this program. If not, see <http://www.gnu.org/licenses/>.     #
19#                                                                             #
20###############################################################################
21
22from __future__ import print_function
23
24import sys
25import os
26import ast
27from collections import defaultdict
28import logging
29
30import checkm.prettytable as prettytable
31
32from checkm.defaultValues import DefaultValues
33from checkm.common import reassignStdOut, restoreStdOut, checkFileExists
34from checkm.coverage import Coverage
35from checkm.hmmer import HMMERParser
36
37from checkm.util.pfam import PFAM
38from checkm.util.seqUtils import readFasta
39
40
41class ResultsParser():
42    """Parse output of Prodigal+HMMER run and derived statistics."""
43    def __init__(self, binIdToModels):
44        self.logger = logging.getLogger('timestamp')
45
46        self.results = {}
47        self.models = binIdToModels
48
49    def analyseResults(self,
50                       outDir,
51                       binStatsFile,
52                       hmmTableFile,
53                       bIgnoreThresholds=False,
54                       evalueThreshold=DefaultValues.E_VAL,
55                       lengthThreshold=DefaultValues.LENGTH,
56                       bSkipPseudoGeneCorrection=False,
57                       bSkipAdjCorrection=False,
58                       ):
59        """Parse results in the output directory."""
60
61        # read bin and sequence stats into dictionaries
62        binStats = self.parseBinStats(outDir, binStatsFile)
63
64        # get hits to each bin
65        self.parseBinHits(outDir, hmmTableFile, bSkipAdjCorrection, bIgnoreThresholds, evalueThreshold, lengthThreshold, bSkipPseudoGeneCorrection, binStats)
66
67        return binStats
68
69    def cacheResults(self, outDir, binIdToBinMarkerSets, bIndividualMarkers):
70        # cache critical results to file
71        self.__writeBinStatsExt(outDir, binIdToBinMarkerSets, bIndividualMarkers)
72        self.__writeMarkerGeneStats(outDir, binIdToBinMarkerSets, bIndividualMarkers)
73
74    def parseBinHits(self, outDir,
75                     hmmTableFile,
76                     bSkipAdjCorrection=False,
77                     bIgnoreThresholds=False,
78                     evalueThreshold=DefaultValues.E_VAL,
79                     lengthThreshold=DefaultValues.LENGTH,
80                     bSkipPseudoGeneCorrection=False,
81                     binStats=None):
82        """ Parse HMM hits for each bin."""
83        if not self.models:
84            self.logger.error('  [Error] Models must be parsed before identifying HMM hits.')
85            raise
86
87        self.logger.info('Parsing HMM hits to marker genes:')
88
89        numBinsProcessed = 0
90        for binId in self.models:
91            if self.logger.getEffectiveLevel() <= logging.INFO:
92                numBinsProcessed += 1
93                statusStr = '    Finished parsing hits for %d of %d (%.2f%%) bins.' % (numBinsProcessed, len(self.models), float(numBinsProcessed) * 100 / len(self.models))
94                sys.stderr.write('%s\r' % statusStr)
95                sys.stderr.flush()
96
97            if binStats != None:
98                resultsManager = ResultsManager(binId, self.models[binId], bIgnoreThresholds, evalueThreshold, lengthThreshold, bSkipPseudoGeneCorrection, binStats[binId])
99            elif binStats == None:
100                resultsManager = ResultsManager(binId, self.models[binId], bIgnoreThresholds, evalueThreshold, lengthThreshold, bSkipPseudoGeneCorrection)
101            else:
102                self.logger.error('  [Error] Invalid parameter settings for binStats and seqStats.')
103                raise
104
105            hmmerTableFile = os.path.join(outDir, 'bins', binId, hmmTableFile)
106            self.parseHmmerResults(hmmerTableFile, resultsManager, bSkipAdjCorrection)
107            self.results[binId] = resultsManager
108
109        if self.logger.getEffectiveLevel() <= logging.INFO:
110            sys.stderr.write('\n')
111
112    def __writeBinStatsExt(self, directory, binIdToBinMarkerSets, bIndividualMarkers):
113        binStatsExtFile = os.path.join(directory, 'storage', DefaultValues.BIN_STATS_EXT_OUT)
114        fout = open(binStatsExtFile, 'w')
115
116        for binId in self.results:
117            binStatsExt = self.results[binId].getSummary(binIdToBinMarkerSets[binId], bIndividualMarkers, outputFormat=2)
118            binStatsExt.update(self.results[binId].geneCopyNumber(binIdToBinMarkerSets[binId]))
119            fout.write(binId + '\t' + str(binStatsExt) + '\n')
120        fout.close()
121
122    def __writeMarkerGeneStats(self, directory, binIdToBinMarkerSets, bIndividualMarkers):
123        markerGenesFile = os.path.join(directory, 'storage', DefaultValues.MARKER_GENE_STATS)
124        fout = open(markerGenesFile, 'w')
125
126        for binId in self.results:
127            markerGenesHits = self.results[binId].getSummary(binIdToBinMarkerSets[binId], bIndividualMarkers, outputFormat=8)
128            fout.write(binId + '\t' + str(markerGenesHits) + '\n')
129        fout.close()
130
131    def parseBinStats(self, resultsFolder, binStatsFile):
132        """Read bin statistics from file."""
133        binStatsFile = os.path.join(resultsFolder, 'storage', binStatsFile)
134
135        checkFileExists(binStatsFile)
136
137        binStats = {}
138        with open(binStatsFile, 'r') as f:
139            for line in f:
140                lineSplit = line.split('\t')
141                binStats[lineSplit[0]] = ast.literal_eval(lineSplit[1])
142
143        return binStats
144
145    def parseBinStatsExt(self, resultsFolder):
146        """Read bin statistics from file."""
147        binStatsExtFile = os.path.join(resultsFolder, 'storage', DefaultValues.BIN_STATS_EXT_OUT)
148
149        checkFileExists(binStatsExtFile)
150
151        binStatsExt = {}
152        with open(binStatsExtFile, 'r') as f:
153            for line in f:
154                lineSplit = line.split('\t')
155                binStatsExt[lineSplit[0]] = ast.literal_eval(lineSplit[1])
156
157        return binStatsExt
158
159    def parseMarkerGeneStats(self, resultsFolder):
160        """Read bin statistics from file."""
161        markerGeneStatsFile = os.path.join(resultsFolder, 'storage', DefaultValues.MARKER_GENE_STATS)
162
163        checkFileExists(markerGeneStatsFile)
164
165        markerGeneStats = {}
166        with open(markerGeneStatsFile, 'r') as f:
167            for line in f:
168                lineSplit = line.split('\t')
169                markerGeneStats[lineSplit[0]] = ast.literal_eval(lineSplit[1])
170
171        return markerGeneStats
172
173    def parseHmmerResults(self, fileName, resultsManager, bSkipAdjCorrection):
174        """Parse HMMER results."""
175        try:
176            with open(fileName, 'r') as hmmerHandle:
177                try:
178                    HP = HMMERParser(hmmerHandle)
179                except:
180                    print("Error opening HMM file: ", fileName)
181                    raise
182
183                while True:
184                    hit = HP.next()
185                    if hit is None:
186                        break
187                    resultsManager.addHit(hit)
188
189            # retain only best hit to PFAM clan
190            pfam = PFAM(DefaultValues.PFAM_CLAN_FILE)
191            resultsManager.markerHits = pfam.filterHitsFromSameClan(resultsManager.markerHits)
192
193            # correct for errors in ORF calling
194            if not bSkipAdjCorrection:
195                resultsManager.identifyAdjacentMarkerGenes()
196
197        except IOError as detail:
198            sys.stderr.write(str(detail) + "\n")
199
200    def __getHeader(self, outputFormat, binMarkerSets, coverageBinProfiles=None, table=None):
201        """Get header for requested output table."""
202
203        # keep count of single, double, triple genes etc...
204        if outputFormat == 1:
205            header = ['Bin Id', 'Marker lineage', '# genomes', '# markers', '# marker sets', '0', '1', '2', '3', '4', '5+', 'Completeness', 'Contamination', 'Strain heterogeneity']
206        elif outputFormat == 2:
207            header = ['Bin Id']
208            header += ['Marker lineage', '# genomes', '# markers', '# marker sets']
209            header += ['Completeness', 'Contamination', 'Strain heterogeneity']
210            header += ['Genome size (bp)', '# ambiguous bases', '# scaffolds', '# contigs']
211            header += ['N50 (scaffolds)', 'N50 (contigs)', 'Mean scaffold length (bp)', 'Mean contig length (bp)']
212            header += ['Longest scaffold (bp)', 'Longest contig (bp)']
213            header += ['GC', 'GC std (scaffolds > 1kbp)']
214            header += ['Coding density', 'Translation table', '# predicted genes']
215            header += ['0', '1', '2', '3', '4', '5+']
216
217            if coverageBinProfiles != None:
218                for bamId in coverageBinProfiles[coverageBinProfiles.keys()[0]]:
219                    header += ['Coverage (' + bamId + ')', 'Coverage std (' + bamId + ')']
220
221            if DefaultValues.MIN_SEQ_LEN_GC_STD != 1000:
222                self.logger.error('  [Error] Labeling error: GC std (scaffolds > 1kbp)')
223                sys.exit(1)
224        elif outputFormat == 3:
225            header = ['Bin Id', 'Node Id', 'Marker lineage', '# genomes', '# markers', '# marker sets', '0', '1', '2', '3', '4', '5+', 'Completeness', 'Contamination', 'Strain heterogeneity']
226        elif outputFormat == 4:
227            header = None
228        elif outputFormat == 5:
229            header = ['Bin Id', 'Marker Id', 'Gene Id']
230        elif outputFormat == 6:
231            header = ['Bin Id', 'Marker Id', 'Gene Ids']
232        elif outputFormat == 7:
233            header = ['Bin Id', 'Marker Id', 'Gene Ids']
234        elif outputFormat == 8:
235            header = ['Bin Id', 'Gene Id', '{Marker Id, Start position, End position}']
236        elif outputFormat == 9:
237            if table is not None:
238                header = ['Bin Id', 'Contig', 'Gene Number', 'Gene Start', 'Gene End', 'Gene Strand', 'Prot Length', 'Marker Id', 'Align Start', 'Align End', 'Sequence']
239            else:
240                header = " "
241        elif outputFormat == 10:
242            header = ['Scaffold Id', 'Bin Id', 'Length', '# contigs', 'GC', '# ORFs', 'Coding density', 'Marker Ids']
243
244        return header
245
246    def printSummary(self, outputFormat, aai, binIdToBinMarkerSets, bIndividualMarkers, coverageFile, bTabTable, outFile, anaFolder):
247        # redirect output
248        oldStdOut = reassignStdOut(outFile)
249
250        coverageBinProfiles = None
251        if coverageFile:
252            coverage = Coverage(1)
253            coverageBinProfiles = coverage.binProfiles(coverageFile)
254
255        prettyTableFormats = [1, 2, 3, 9]
256
257        header = self.__getHeader(outputFormat, binIdToBinMarkerSets[binIdToBinMarkerSets.keys()[0]], coverageBinProfiles, bTabTable)
258        if bTabTable or outputFormat not in prettyTableFormats:
259            bTabTable = True
260            pTable = None
261
262            if header != None:
263                print('\t'.join(header))
264        else:
265            pTable = prettytable.PrettyTable(header)
266            pTable.float_format = '.2'
267            pTable.align = 'c'
268            pTable.align[header[0]] = 'l'
269            pTable.hrules = prettytable.FRAME
270            pTable.vrules = prettytable.NONE
271
272        seqsReported = 0
273        for binId in sorted(self.results.keys()):
274            seqsReported += self.results[binId].printSummary(outputFormat, aai, binIdToBinMarkerSets[binId], bIndividualMarkers, coverageBinProfiles, pTable, anaFolder)
275
276        if outputFormat in [6, 7] and seqsReported == 0:
277            print('[No marker genes satisfied the reporting criteria.]')
278
279        if not bTabTable:
280            if outputFormat in [1, 2]:
281                print(pTable.get_string(sortby='Completeness', reversesort=True))
282            else:
283                # only print if there are rows
284                if pTable.get_string(print_empty=False):
285                    print(pTable.get_string(print_empty=False))
286
287        # restore stdout
288        restoreStdOut(outFile, oldStdOut)
289
290
291class ResultsManager():
292    """Store all the results for a single bin"""
293    def __init__(self, binId, models,
294                 bIgnoreThresholds=False,
295                 evalueThreshold=DefaultValues.E_VAL,
296                 lengthThreshold=DefaultValues.LENGTH,
297                 bSkipPseudoGeneCorrection=False,
298                 binStats=None):
299        self.binId = binId
300        self.markerHits = {}
301        self.bIgnoreThresholds = bIgnoreThresholds
302        self.evalueThreshold = evalueThreshold
303        self.lengthThreshold = lengthThreshold
304        self.bSkipPseudoGeneCorrection = bSkipPseudoGeneCorrection
305        self.models = models
306        self.binStats = binStats
307
308    def vetHit(self, hit):
309        """Check if hit meets required thresholds."""
310        model = self.models[hit.query_accession]
311
312        # preferentially use model specific bit score thresholds, before
313        # using the user specified e-value and length criteria
314
315        # Give preference to the gathering threshold unless the model
316        # is marked as TIGR (i.e., TIGRFAM model)
317
318        if not self.bSkipPseudoGeneCorrection:
319            alignment_length = float(hit.ali_to - hit.ali_from)
320            length_perc = alignment_length / float(hit.query_length)
321            if length_perc < DefaultValues.PSEUDOGENE_LENGTH:
322                return False
323
324        if model.nc != None and not self.bIgnoreThresholds and 'TIGR' in model.acc:
325            if model.nc[0] <= hit.full_score and model.nc[1] <= hit.dom_score:
326                return True
327        elif model.ga != None and not self.bIgnoreThresholds:
328            if model.ga[0] <= hit.full_score and model.ga[1] <= hit.dom_score:
329                return True
330        elif model.tc != None and not self.bIgnoreThresholds:
331            if model.tc[0] <= hit.full_score and model.tc[1] <= hit.dom_score:
332                return True
333        elif model.nc != None and not self.bIgnoreThresholds:
334            if model.nc[0] <= hit.full_score and model.nc[1] <= hit.dom_score:
335                return True
336        else:
337            if hit.full_e_value > self.evalueThreshold:
338                return False
339
340            alignment_length = float(hit.ali_to - hit.ali_from)
341            length_perc = alignment_length / float(hit.query_length)
342            if length_perc >= self.lengthThreshold:
343                return True
344
345        return False
346
347    def addHit(self, hit):
348        """Process hit and add it to the set of markers if it passes filtering criteria."""
349        if self.vetHit(hit):
350            if hit.query_accession in self.markerHits:
351                # retain only the best domain hit for a given marker to a specific ORF
352                previousHitToORF = None
353                for h in self.markerHits[hit.query_accession]:
354                    if h.target_name == hit.target_name:
355                        previousHitToORF = h
356                        break
357
358                if not previousHitToORF:
359                    self.markerHits[hit.query_accession].append(hit)
360                else:
361                    if previousHitToORF.dom_score < hit.dom_score:
362                        self.markerHits[hit.query_accession].append(hit)
363                        self.markerHits[hit.query_accession].remove(previousHitToORF)
364
365            else:
366                self.markerHits[hit.query_accession] = [hit]
367
368    def identifyAdjacentMarkerGenes(self):
369        """Identify adjacent marker genes and exclude these from the contamination estimate."""
370
371        # check for adjacent ORFs with hits to the same marker gene
372        for markerId, hits in self.markerHits.iteritems():
373
374            bCombined = True
375            while bCombined:
376                for i in xrange(0, len(hits)):
377                    orfI = hits[i].target_name
378                    scaffoldIdI = orfI[0:orfI.rfind('_')]
379
380                    bCombined = False
381                    for j in xrange(i + 1, len(hits)):
382                        orfJ = hits[j].target_name
383                        scaffoldIdJ = orfJ[0:orfJ.rfind('_')]
384
385                        # check if hits are on adjacent ORFs
386                        if scaffoldIdI == scaffoldIdJ:
387                            try:
388                                orfNumI = int(orfI[orfI.rfind('_') + 1:])
389                                orfNumJ = int(orfJ[orfJ.rfind('_') + 1:])
390                            except:
391                                # it appears called genes are not labeled
392                                # according to the prodigal format, so
393                                # it is not possible to perform this correction
394                                break
395
396                            if abs(orfNumI - orfNumJ) == 1:
397                                # check if hits are to different parts of the HMM
398                                sI = hits[i].hmm_from
399                                eI = hits[i].hmm_to
400
401                                sJ = hits[j].hmm_from
402                                eJ = hits[j].hmm_to
403
404                                if (sI <= sJ and eI > sJ) or (sJ <= sI and eJ > sI):
405                                    # models overlap so this could represent contamination,
406                                    # but it seems more likely that adjacent genes hitting
407                                    # the same marker represent legitimate gene duplication,
408                                    # a gene calling error, or an assembly error and thus
409                                    # should not be treated as contamination
410                                    bCombined = True
411                                    break
412                                else:
413                                    # combine the two hits
414                                    bCombined = True
415                                    break
416
417                    if bCombined:
418                        newHit = hits[i]
419
420                        # produce concatenated label indicating the two genes being combined
421                        orfA, orfB = sorted([orfI, orfJ])
422                        newHit.target_name = DefaultValues.SEQ_CONCAT_CHAR.join([orfA, orfB])
423
424                        newHit.target_length = hits[i].target_length + hits[j].target_length
425                        newHit.hmm_from = min(hits[i].hmm_from, hits[j].hmm_from)
426                        newHit.hmm_to = min(hits[i].hmm_to, hits[j].hmm_to)
427
428                        newHit.ali_from = min(hits[i].ali_from, hits[j].ali_from)
429                        newHit.ali_to = min(hits[i].ali_to, hits[j].ali_to)
430
431                        newHit.env_from = min(hits[i].env_from, hits[j].env_from)
432                        newHit.env_to = min(hits[i].env_to, hits[j].env_to)
433
434                        hits.remove(hits[j])
435                        hits.remove(hits[i])
436
437                        hits.append(newHit)
438
439                        break
440
441            self.markerHits[markerId] = hits
442
443    def countUniqueHits(self):
444        """Determine number of unique and multiple hits."""
445        uniqueHits = 0
446        multiCopyHits = 0
447        for hits in self.markerHits.values():
448            if len(hits) == 1:
449                uniqueHits += 1
450            elif len(hits) > 1:
451                multiCopyHits += 1
452
453        return uniqueHits, multiCopyHits
454
455    def hitsToMarkerGene(self, markerSet):
456        """Determine number of times each marker gene is hit."""
457        ret = dict()
458        for marker in markerSet.getMarkerGenes():
459            try:
460                ret[marker] = len(self.markerHits[marker])
461            except KeyError:
462                ret[marker] = 0
463
464        return ret
465
466    def geneCountsForSelectedMarkerSet(self, binMarkerSets, bIndividualMarkers):
467        """Report number of times each marker gene was found along with
468           a completeness and contamination estimation for the selected marker
469           set associated with a bin."""
470        geneCounts = self.geneCounts(binMarkerSets.selectedMarkerSet(), self.markerHits, bIndividualMarkers)
471
472        return geneCounts
473
474    def geneCounts(self, markerSet, markerHits, bIndividualMarkers):
475        """ Determine number of marker genes with 0-5 hits
476            as well as the total completeness and contamination."""
477        geneCounts = [0] * 6
478        multiCopyCount = 0
479        for marker in markerSet.getMarkerGenes():
480            if marker in markerHits:
481                if len(markerHits[marker]) > 5:
482                    markerCount = 5
483                else:
484                    markerCount = len(markerHits[marker])
485
486                multiCopyCount += (len(markerHits[marker]) - 1)
487            else:
488                markerCount = 0
489
490            geneCounts[markerCount] += 1
491
492        percComp, percCont = markerSet.genomeCheck(markerHits, bIndividualMarkers)
493
494        geneCounts.append(percComp)
495        geneCounts.append(percCont)
496
497        return geneCounts
498
499    def geneCopyNumber(self, binMarkerSets):
500        """ Determine number of times each marker gene is present."""
501        geneCopyNumber = {}
502        geneCopyNumber['GCN0'] = []
503        geneCopyNumber['GCN1'] = []
504        geneCopyNumber['GCN2'] = []
505        geneCopyNumber['GCN3'] = []
506        geneCopyNumber['GCN4'] = []
507        geneCopyNumber['GCN5+'] = []
508
509        markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes()
510
511        for marker in self.models:
512            if marker not in markerGenes:
513                continue
514
515            markerId = os.path.splitext(marker)[0]
516            if marker in self.markerHits:
517                if len(self.markerHits[marker]) >= 5:
518                    geneCopyNumber['GCN5+'].append(markerId)
519                else:
520                    geneCopyNumber['GCN' + str(len(self.markerHits[marker]))].append(markerId)
521            else:
522                geneCopyNumber['GCN0'].append(markerId)
523
524        return geneCopyNumber
525
526    def getSummary(self, binMarkerSets, bIndividualMarkers, outputFormat=1):
527        """Get dictionary containing information about bin."""
528        summary = {}
529
530        if outputFormat == 1:
531            selectedMarkerSet = binMarkerSets.selectedMarkerSet()
532            data = self.geneCountsForSelectedMarkerSet(binMarkerSets, bIndividualMarkers)
533
534            summary['marker lineage'] = selectedMarkerSet.lineageStr
535            summary['# genomes'] = selectedMarkerSet.numGenomes
536            summary['# markers'] = selectedMarkerSet.numMarkers()
537            summary['# marker sets'] = selectedMarkerSet.numSets()
538            summary['0'] = data[0]
539            summary['1'] = data[1]
540            summary['2'] = data[2]
541            summary['3'] = data[3]
542            summary['4'] = data[4]
543            summary['5+'] = data[5]
544            summary['Completeness'] = data[6]
545            summary['Contamination'] = data[7]
546        elif outputFormat == 2:
547            selectedMarkerSet = binMarkerSets.selectedMarkerSet()
548            data = self.geneCountsForSelectedMarkerSet(binMarkerSets, bIndividualMarkers)
549
550            summary['marker lineage'] = selectedMarkerSet.lineageStr
551            summary['# genomes'] = selectedMarkerSet.numGenomes
552            summary['# markers'] = selectedMarkerSet.numMarkers()
553            summary['# marker sets'] = selectedMarkerSet.numSets()
554            summary['0'] = data[0]
555            summary['1'] = data[1]
556            summary['2'] = data[2]
557            summary['3'] = data[3]
558            summary['4'] = data[4]
559            summary['5+'] = data[5]
560            summary['Completeness'] = data[6]
561            summary['Contamination'] = data[7]
562            summary.update(self.binStats)
563
564        elif outputFormat == 5:
565            # tabular of bin_id, marker, contig_id
566            markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes()
567
568            for marker, hit_list in self.markerHits.items():
569                if marker not in markerGenes:
570                    continue
571
572                summary[marker] = []
573                for hit in hit_list:
574                    summary[marker].append(hit.target_name)
575
576        elif outputFormat == 6:
577            markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes()
578
579            for marker, hit_list in self.markerHits.items():
580                if marker not in markerGenes:
581                    continue
582
583                if len(hit_list) >= 2:
584                    summary[marker] = []
585                    for hit in hit_list:
586                        summary[marker].append(hit.target_name)
587
588        elif outputFormat == 7:
589            # tabular - print only contigs that have more than one copy
590            # of the same marker on them
591            markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes()
592
593            contigs = defaultdict(dict)
594            for marker, hit_list in self.markerHits.items():
595                if marker not in markerGenes:
596                    continue
597
598                for hit in hit_list:
599                    try:
600                        contigs[hit.target_name][marker] += 1
601                    except KeyError:
602                        contigs[hit.target_name][marker] = 1
603
604            for contig_name, marker_counts in contigs.items():
605                for marker_name, marker_count in marker_counts.items():
606                    if marker_count > 1:
607                        if contig_name not in summary:
608                            summary[contig_name] = {}
609
610                        summary[contig_name][marker_name] = marker_count
611
612        elif outputFormat == 8:
613            # tabular - print only position of marker genes
614            markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes()
615
616            genesWithMarkers = {}
617            for marker, hit_list in self.markerHits.items():
618                if marker not in markerGenes:
619                    continue
620
621                for hit in hit_list:
622                    genesWithMarkers[hit.target_name] = genesWithMarkers.get(hit.target_name, []) + [hit]
623
624            for geneId, hits in genesWithMarkers.iteritems():
625                summary[geneId] = {}
626                for hit in hits:
627                    summary[geneId][hit.query_accession] = summary[geneId].get(hit.query_accession, []) + [[hit.ali_from, hit.ali_to]]
628        else:
629            print("Unknown output format: ", outputFormat)
630
631        return summary
632
633    def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, coverageBinProfiles=None, table=None, anaFolder=None):
634
635        """Print out information about bin."""
636        if outputFormat == 1:
637            selectedMarkerSet = binMarkerSets.selectedMarkerSet()
638
639            lineageStr = selectedMarkerSet.lineageStr
640            if selectedMarkerSet.UID != '0':
641                lineageStr += ' (' + str(selectedMarkerSet.UID) + ')'
642
643            data = self.geneCountsForSelectedMarkerSet(binMarkerSets, bIndividualMarkers)
644            row = "%s\t%s\t%d\t%d\t%d\t%s\t%0.2f\t%0.2f\t%0.2f" % (self.binId, lineageStr,
645                                                selectedMarkerSet.numGenomes, selectedMarkerSet.numMarkers(), selectedMarkerSet.numSets(),
646                                                "\t".join([str(data[i]) for i in range(6)]),
647                                                data[6],
648                                                data[7],
649                                                aai.aaiMeanBinHetero.get(self.binId, 0.0)
650                                                )
651            if table == None:
652                print(row)
653            else:
654                table.add_row([self.binId, lineageStr, selectedMarkerSet.numGenomes, selectedMarkerSet.numMarkers(), selectedMarkerSet.numSets()] + data + [aai.aaiMeanBinHetero.get(self.binId, 0.0)])
655        elif outputFormat == 2:
656            selectedMarkerSet = binMarkerSets.selectedMarkerSet()
657
658            lineageStr = selectedMarkerSet.lineageStr
659            if selectedMarkerSet.UID != '0':
660                lineageStr += ' (' + str(selectedMarkerSet.UID) + ')'
661
662            data = self.geneCountsForSelectedMarkerSet(binMarkerSets, bIndividualMarkers)
663
664            if table == None:
665                row = self.binId
666                row += '\t%s\t%d\t%d\t%d' % (lineageStr, selectedMarkerSet.numGenomes, selectedMarkerSet.numMarkers(), selectedMarkerSet.numSets())
667                row += '\t%0.2f\t%0.2f\t%0.2f' % (data[6], data[7], aai.aaiMeanBinHetero.get(self.binId, 0.0))
668                row += '\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d' % (self.binStats['Genome size'], self.binStats['# ambiguous bases'],
669                                                                         self.binStats['# scaffolds'], self.binStats['# contigs'],
670                                                                         self.binStats['N50 (scaffolds)'], self.binStats['N50 (contigs)'],
671                                                                         self.binStats['Mean scaffold length'], self.binStats['Mean contig length'],
672                                                                         self.binStats['Longest scaffold'], self.binStats['Longest contig'])
673                row += '\t%.1f\t%.2f' % (self.binStats['GC'] * 100, self.binStats['GC std'] * 100)
674                row += '\t%.2f\t%d\t%d' % (self.binStats['Coding density'] * 100, self.binStats['Translation table'], self.binStats['# predicted genes'])
675                row += '\t' + '\t'.join([str(data[i]) for i in xrange(6)])
676
677                if coverageBinProfiles:
678                    if self.binId in coverageBinProfiles:
679                        for _, coverageStats in coverageBinProfiles[self.binId].iteritems():
680                            row += '\t%.2f\t%.2f' % (coverageStats[0], coverageStats[1])
681                    else:
682                        for bamId in coverageBinProfiles[coverageBinProfiles.keys()[0]]:
683                            row += '\t%.2f\t%.2f' % (0, 0)
684                print(row)
685            else:
686                row = [self.binId, lineageStr, selectedMarkerSet.numGenomes, selectedMarkerSet.numMarkers(), selectedMarkerSet.numSets()]
687                row.extend([data[6], data[7], aai.aaiMeanBinHetero.get(self.binId, 0.0)])
688                row.extend([self.binStats['Genome size'], self.binStats['# ambiguous bases'], self.binStats['# scaffolds'],
689                                                 self.binStats['# contigs'], self.binStats['N50 (scaffolds)'], self.binStats['N50 (contigs)'],
690                                                 int(self.binStats['Mean scaffold length']), int(self.binStats['Mean contig length']),
691                                                 self.binStats['Longest scaffold'], self.binStats['Longest contig']])
692                row.extend([self.binStats['GC'] * 100, self.binStats['GC std'] * 100])
693                row.extend([self.binStats['Coding density'] * 100, self.binStats['Translation table'], self.binStats['# predicted genes']])
694                row.extend(data[0:6])
695
696                if coverageBinProfiles:
697                    if self.binId in coverageBinProfiles:
698                        for _, coverageStats in coverageBinProfiles[self.binId].iteritems():
699                            row.extend(coverageStats)
700                    else:
701                        for bamId in coverageBinProfiles[coverageBinProfiles.keys()[0]]:
702                            row.extend([0,0])
703
704                table.add_row(row)
705        elif outputFormat == 3:
706            for ms in binMarkerSets.markerSetIter():
707                data = self.geneCounts(ms, self.markerHits, bIndividualMarkers)
708                row = "%s\t%s\t%s\t%d\t%d\t%d\t%s\t%0.2f\t%0.2f\t%0.2f" % (self.binId, ms.UID, ms.lineageStr, ms.numGenomes,
709                                                    ms.numMarkers(), ms.numSets(),
710                                                    "\t".join([str(data[i]) for i in range(6)]),
711                                                    data[6],
712                                                    data[7],
713                                                    aai.aaiMeanBinHetero.get(self.binId, 0.0)
714                                                    )
715                if table == None:
716                    print(row)
717                else:
718                    table.add_row([self.binId, ms.UID, ms.lineageStr, ms.numGenomes, ms.numMarkers(), ms.numSets()] + data + [aai.aaiMeanBinHetero.get(self.binId, 0.0)])
719
720        elif outputFormat == 4:
721            selectedMarkerSet = binMarkerSets.selectedMarkerSet()
722            data = self.hitsToMarkerGene(binMarkerSets.selectedMarkerSet())
723            row = "Node Id: %s; Marker lineage: %s" % (selectedMarkerSet.UID, selectedMarkerSet.lineageStr)
724            for marker in data:
725                row += '\t' + marker
726            print(row)
727
728            row = self.binId
729            for count in data.values():
730                row += '\t' + str(count)
731            print(row)
732
733            print()
734        elif outputFormat == 5:
735            # tabular of bin_id, marker, contig_id
736            markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes()
737
738            for marker, hit_list in self.markerHits.items():
739                if marker not in markerGenes:
740                    continue
741
742                for hit in hit_list:
743                    print(self.binId, marker, hit.target_name, sep='\t', end='\n')
744
745        elif outputFormat == 6:
746            markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes()
747
748            seqsReported = 0
749            for marker, hitList in self.markerHits.items():
750                if marker not in markerGenes:
751                    continue
752
753                if len(hitList) >= 2:
754                    print(self.binId, marker, sep='\t', end='\t')
755
756                    scaffoldIds = []
757                    for hit in hitList:
758                        scaffoldIds.append(hit.target_name)
759
760                    print(','.join(sorted(scaffoldIds)), end='\n')
761
762                    seqsReported += 1
763
764            return seqsReported
765
766        elif outputFormat == 7:
767            markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes()
768
769            seqsReported = 0
770            for marker, hitList in self.markerHits.items():
771                if marker not in markerGenes:
772                    continue
773
774                if len(hitList) >= 2:
775                    scaffoldsWithMultipleHits = set()
776                    for i in xrange(0, len(hitList)):
777                        scaffoldId = hitList[i].target_name[0:hitList[i].target_name.rfind('_')]
778                        for j in xrange(i + 1, len(hitList)):
779                            if scaffoldId == hitList[j].target_name[0:hitList[j].target_name.rfind('_')]:
780                                scaffoldsWithMultipleHits.add(hitList[i].target_name)
781                                scaffoldsWithMultipleHits.add(hitList[j].target_name)
782
783                    if len(scaffoldsWithMultipleHits) >= 2:
784                        print(self.binId, marker, sep='\t', end='\t')
785                        print(','.join(sorted(list(scaffoldsWithMultipleHits))), end='\n')
786                        seqsReported += 1
787
788            return seqsReported
789
790        elif outputFormat == 8:
791            # tabular - print only position of marker genes
792            markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes()
793
794            genesWithMarkers = {}
795            for marker, hit_list in self.markerHits.items():
796                if marker not in markerGenes:
797                    continue
798
799                for hit in hit_list:
800                    genesWithMarkers[hit.target_name] = genesWithMarkers.get(hit.target_name, []) + [hit]
801
802            for geneId, hits in genesWithMarkers.iteritems():
803                rowStr = self.binId + '\t' + geneId
804                for hit in hits:
805                    rowStr += '\t' + hit.query_accession + ',' + str(hit.ali_from) + ',' + str(hit.ali_to)
806                print(rowStr)
807
808        # Hunter Cameron, May 29, 2015 - print a fasta of marker genes
809        elif outputFormat == 9:
810            # tabular of bin_id, marker, contig_id
811
812            # check for the analyze folder for later use
813            if anaFolder is None:
814                raise ValueError("AnaFolder must not be None for outputFormat 9")
815
816            # ## build a dict to link target_names with marker gene alignment information
817            markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes()
818            hitInfo = {}
819            for marker, hit_list in self.markerHits.items():
820                if marker not in markerGenes:
821                    continue
822
823                for hit in hit_list:
824                    name = hit.target_name
825                    hitInfo[name] = {
826                            "marker": marker,
827                            "ali_from": str(hit.ali_from),
828                            "ali_to": str(hit.ali_to)
829                            }
830
831            # ## Open genes.faa and print the ones that were found with some descriptive info in the header
832            path_to_genes = "/".join([anaFolder, "bins", self.binId, "genes.faa"])
833
834            # get only the seqs we need and their information as a dict
835            seqs = readFasta(path_to_genes, trimHeader=False)
836
837            filt_seqs = []
838            # remove seqs without markers
839            for header in seqs.keys():
840                gene_name = header.split(" # ")[0]
841                if gene_name in hitInfo:
842                    filt_seqs.append(header)
843
844            def sort_header(header):
845                """ sorts headers by contig and gene number """
846                name = header.split(" # ")[0]
847                ctg_name, gene_num = name.rsplit("_", 1)
848                return ctg_name, int(gene_num)
849
850            for header in sorted(filt_seqs, key=sort_header):
851                elems = header.split(" # ")
852                gene_name = elems[0]
853
854                # remove the gene number from Prodigal to get the original contig name
855                contig_name, gene_num = gene_name.rsplit("_", 1)
856
857                # parse some info about the gene from the header line
858                gene_start = elems[1]
859                gene_end = elems[2]
860                gene_strand = elems[3]
861
862                # if table output not specified, print FASTA
863                if table != None:
864                    gene_info = "geneId={};start={};end={};strand={};protlen={}".format(
865                            gene_num, gene_start, gene_end, gene_strand, str(len(seqs[header])))
866
867                    marker_info = "marker={};mstart={};mend={}".format(
868                            hitInfo[gene_name]["marker"],
869                            hitInfo[gene_name]["ali_from"],
870                            hitInfo[gene_name]["ali_to"])
871
872                    # new header will be the bin name, contig name, gene info, and marker info separated by spaces
873                    new_header = ">" + " ".join([self.binId, contig_name, gene_info, marker_info])
874
875                    print(new_header, seqs[header], sep="\n")
876                # otherwise, print a table
877                else:
878                    print("\t".join([
879                            self.binId,
880                            contig_name,
881                            gene_num,
882                            gene_start,
883                            gene_end,
884                            gene_strand,
885                            str(len(seqs[header])),
886                            hitInfo[gene_name]["marker"],
887                            hitInfo[gene_name]["ali_from"],
888                            hitInfo[gene_name]["ali_to"],
889                            seqs[header]
890                            ]))
891        else:
892            self.logger.error("Unknown output format: %d", outputFormat)
893
894        return 0
895
896        '''
897        elif outputFormat == 10:
898            markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes()
899
900            markersInScaffold = {}
901            for marker, hit_list in self.markerHits.items():
902                if marker not in markerGenes:
903                    continue
904
905                for hit in hit_list:
906                    scaffoldId = hit.target_name[0:hit.target_name.rfind('_')]
907                    markersInScaffold[scaffoldId] = markersInScaffold.get(scaffoldId, []) + [marker]
908
909            for scaffoldId, data in self.scaffoldStats.iteritems():
910                print(scaffoldId, self.binId, str(data['Length']), str(data['# contigs']),
911                      '%.3f' % (data['GC']), str(data.get('# ORFs', 0)),
912                      '%.3f' % (float(data.get('Coding bases', 0)) / data['Total contig length']),
913                      sep='\t', end='\t')
914
915                if scaffoldId in markersInScaffold:
916                    markerStr = ','.join(sorted(markersInScaffold[scaffoldId]))
917                    print(markerStr, end='\n')
918                else:
919                    print()
920        '''
921