1############################################################################### 2# 3# resultsParser.py - Parse and output results. 4# 5############################################################################### 6# # 7# This program is free software: you can redistribute it and/or modify # 8# it under the terms of the GNU General Public License as published by # 9# the Free Software Foundation, either version 3 of the License, or # 10# (at your option) any later version. # 11# # 12# This program is distributed in the hope that it will be useful, # 13# but WITHOUT ANY WARRANTY; without even the implied warranty of # 14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # 15# GNU General Public License for more details. # 16# # 17# You should have received a copy of the GNU General Public License # 18# along with this program. If not, see <http://www.gnu.org/licenses/>. # 19# # 20############################################################################### 21 22from __future__ import print_function 23 24import sys 25import os 26import ast 27from collections import defaultdict 28import logging 29 30import checkm.prettytable as prettytable 31 32from checkm.defaultValues import DefaultValues 33from checkm.common import reassignStdOut, restoreStdOut, checkFileExists 34from checkm.coverage import Coverage 35from checkm.hmmer import HMMERParser 36 37from checkm.util.pfam import PFAM 38from checkm.util.seqUtils import readFasta 39 40 41class ResultsParser(): 42 """Parse output of Prodigal+HMMER run and derived statistics.""" 43 def __init__(self, binIdToModels): 44 self.logger = logging.getLogger('timestamp') 45 46 self.results = {} 47 self.models = binIdToModels 48 49 def analyseResults(self, 50 outDir, 51 binStatsFile, 52 hmmTableFile, 53 bIgnoreThresholds=False, 54 evalueThreshold=DefaultValues.E_VAL, 55 lengthThreshold=DefaultValues.LENGTH, 56 bSkipPseudoGeneCorrection=False, 57 bSkipAdjCorrection=False, 58 ): 59 """Parse results in the output directory.""" 60 61 # read bin and sequence stats into dictionaries 62 binStats = self.parseBinStats(outDir, binStatsFile) 63 64 # get hits to each bin 65 self.parseBinHits(outDir, hmmTableFile, bSkipAdjCorrection, bIgnoreThresholds, evalueThreshold, lengthThreshold, bSkipPseudoGeneCorrection, binStats) 66 67 return binStats 68 69 def cacheResults(self, outDir, binIdToBinMarkerSets, bIndividualMarkers): 70 # cache critical results to file 71 self.__writeBinStatsExt(outDir, binIdToBinMarkerSets, bIndividualMarkers) 72 self.__writeMarkerGeneStats(outDir, binIdToBinMarkerSets, bIndividualMarkers) 73 74 def parseBinHits(self, outDir, 75 hmmTableFile, 76 bSkipAdjCorrection=False, 77 bIgnoreThresholds=False, 78 evalueThreshold=DefaultValues.E_VAL, 79 lengthThreshold=DefaultValues.LENGTH, 80 bSkipPseudoGeneCorrection=False, 81 binStats=None): 82 """ Parse HMM hits for each bin.""" 83 if not self.models: 84 self.logger.error(' [Error] Models must be parsed before identifying HMM hits.') 85 raise 86 87 self.logger.info('Parsing HMM hits to marker genes:') 88 89 numBinsProcessed = 0 90 for binId in self.models: 91 if self.logger.getEffectiveLevel() <= logging.INFO: 92 numBinsProcessed += 1 93 statusStr = ' Finished parsing hits for %d of %d (%.2f%%) bins.' % (numBinsProcessed, len(self.models), float(numBinsProcessed) * 100 / len(self.models)) 94 sys.stderr.write('%s\r' % statusStr) 95 sys.stderr.flush() 96 97 if binStats != None: 98 resultsManager = ResultsManager(binId, self.models[binId], bIgnoreThresholds, evalueThreshold, lengthThreshold, bSkipPseudoGeneCorrection, binStats[binId]) 99 elif binStats == None: 100 resultsManager = ResultsManager(binId, self.models[binId], bIgnoreThresholds, evalueThreshold, lengthThreshold, bSkipPseudoGeneCorrection) 101 else: 102 self.logger.error(' [Error] Invalid parameter settings for binStats and seqStats.') 103 raise 104 105 hmmerTableFile = os.path.join(outDir, 'bins', binId, hmmTableFile) 106 self.parseHmmerResults(hmmerTableFile, resultsManager, bSkipAdjCorrection) 107 self.results[binId] = resultsManager 108 109 if self.logger.getEffectiveLevel() <= logging.INFO: 110 sys.stderr.write('\n') 111 112 def __writeBinStatsExt(self, directory, binIdToBinMarkerSets, bIndividualMarkers): 113 binStatsExtFile = os.path.join(directory, 'storage', DefaultValues.BIN_STATS_EXT_OUT) 114 fout = open(binStatsExtFile, 'w') 115 116 for binId in self.results: 117 binStatsExt = self.results[binId].getSummary(binIdToBinMarkerSets[binId], bIndividualMarkers, outputFormat=2) 118 binStatsExt.update(self.results[binId].geneCopyNumber(binIdToBinMarkerSets[binId])) 119 fout.write(binId + '\t' + str(binStatsExt) + '\n') 120 fout.close() 121 122 def __writeMarkerGeneStats(self, directory, binIdToBinMarkerSets, bIndividualMarkers): 123 markerGenesFile = os.path.join(directory, 'storage', DefaultValues.MARKER_GENE_STATS) 124 fout = open(markerGenesFile, 'w') 125 126 for binId in self.results: 127 markerGenesHits = self.results[binId].getSummary(binIdToBinMarkerSets[binId], bIndividualMarkers, outputFormat=8) 128 fout.write(binId + '\t' + str(markerGenesHits) + '\n') 129 fout.close() 130 131 def parseBinStats(self, resultsFolder, binStatsFile): 132 """Read bin statistics from file.""" 133 binStatsFile = os.path.join(resultsFolder, 'storage', binStatsFile) 134 135 checkFileExists(binStatsFile) 136 137 binStats = {} 138 with open(binStatsFile, 'r') as f: 139 for line in f: 140 lineSplit = line.split('\t') 141 binStats[lineSplit[0]] = ast.literal_eval(lineSplit[1]) 142 143 return binStats 144 145 def parseBinStatsExt(self, resultsFolder): 146 """Read bin statistics from file.""" 147 binStatsExtFile = os.path.join(resultsFolder, 'storage', DefaultValues.BIN_STATS_EXT_OUT) 148 149 checkFileExists(binStatsExtFile) 150 151 binStatsExt = {} 152 with open(binStatsExtFile, 'r') as f: 153 for line in f: 154 lineSplit = line.split('\t') 155 binStatsExt[lineSplit[0]] = ast.literal_eval(lineSplit[1]) 156 157 return binStatsExt 158 159 def parseMarkerGeneStats(self, resultsFolder): 160 """Read bin statistics from file.""" 161 markerGeneStatsFile = os.path.join(resultsFolder, 'storage', DefaultValues.MARKER_GENE_STATS) 162 163 checkFileExists(markerGeneStatsFile) 164 165 markerGeneStats = {} 166 with open(markerGeneStatsFile, 'r') as f: 167 for line in f: 168 lineSplit = line.split('\t') 169 markerGeneStats[lineSplit[0]] = ast.literal_eval(lineSplit[1]) 170 171 return markerGeneStats 172 173 def parseHmmerResults(self, fileName, resultsManager, bSkipAdjCorrection): 174 """Parse HMMER results.""" 175 try: 176 with open(fileName, 'r') as hmmerHandle: 177 try: 178 HP = HMMERParser(hmmerHandle) 179 except: 180 print("Error opening HMM file: ", fileName) 181 raise 182 183 while True: 184 hit = HP.next() 185 if hit is None: 186 break 187 resultsManager.addHit(hit) 188 189 # retain only best hit to PFAM clan 190 pfam = PFAM(DefaultValues.PFAM_CLAN_FILE) 191 resultsManager.markerHits = pfam.filterHitsFromSameClan(resultsManager.markerHits) 192 193 # correct for errors in ORF calling 194 if not bSkipAdjCorrection: 195 resultsManager.identifyAdjacentMarkerGenes() 196 197 except IOError as detail: 198 sys.stderr.write(str(detail) + "\n") 199 200 def __getHeader(self, outputFormat, binMarkerSets, coverageBinProfiles=None, table=None): 201 """Get header for requested output table.""" 202 203 # keep count of single, double, triple genes etc... 204 if outputFormat == 1: 205 header = ['Bin Id', 'Marker lineage', '# genomes', '# markers', '# marker sets', '0', '1', '2', '3', '4', '5+', 'Completeness', 'Contamination', 'Strain heterogeneity'] 206 elif outputFormat == 2: 207 header = ['Bin Id'] 208 header += ['Marker lineage', '# genomes', '# markers', '# marker sets'] 209 header += ['Completeness', 'Contamination', 'Strain heterogeneity'] 210 header += ['Genome size (bp)', '# ambiguous bases', '# scaffolds', '# contigs'] 211 header += ['N50 (scaffolds)', 'N50 (contigs)', 'Mean scaffold length (bp)', 'Mean contig length (bp)'] 212 header += ['Longest scaffold (bp)', 'Longest contig (bp)'] 213 header += ['GC', 'GC std (scaffolds > 1kbp)'] 214 header += ['Coding density', 'Translation table', '# predicted genes'] 215 header += ['0', '1', '2', '3', '4', '5+'] 216 217 if coverageBinProfiles != None: 218 for bamId in coverageBinProfiles[coverageBinProfiles.keys()[0]]: 219 header += ['Coverage (' + bamId + ')', 'Coverage std (' + bamId + ')'] 220 221 if DefaultValues.MIN_SEQ_LEN_GC_STD != 1000: 222 self.logger.error(' [Error] Labeling error: GC std (scaffolds > 1kbp)') 223 sys.exit(1) 224 elif outputFormat == 3: 225 header = ['Bin Id', 'Node Id', 'Marker lineage', '# genomes', '# markers', '# marker sets', '0', '1', '2', '3', '4', '5+', 'Completeness', 'Contamination', 'Strain heterogeneity'] 226 elif outputFormat == 4: 227 header = None 228 elif outputFormat == 5: 229 header = ['Bin Id', 'Marker Id', 'Gene Id'] 230 elif outputFormat == 6: 231 header = ['Bin Id', 'Marker Id', 'Gene Ids'] 232 elif outputFormat == 7: 233 header = ['Bin Id', 'Marker Id', 'Gene Ids'] 234 elif outputFormat == 8: 235 header = ['Bin Id', 'Gene Id', '{Marker Id, Start position, End position}'] 236 elif outputFormat == 9: 237 if table is not None: 238 header = ['Bin Id', 'Contig', 'Gene Number', 'Gene Start', 'Gene End', 'Gene Strand', 'Prot Length', 'Marker Id', 'Align Start', 'Align End', 'Sequence'] 239 else: 240 header = " " 241 elif outputFormat == 10: 242 header = ['Scaffold Id', 'Bin Id', 'Length', '# contigs', 'GC', '# ORFs', 'Coding density', 'Marker Ids'] 243 244 return header 245 246 def printSummary(self, outputFormat, aai, binIdToBinMarkerSets, bIndividualMarkers, coverageFile, bTabTable, outFile, anaFolder): 247 # redirect output 248 oldStdOut = reassignStdOut(outFile) 249 250 coverageBinProfiles = None 251 if coverageFile: 252 coverage = Coverage(1) 253 coverageBinProfiles = coverage.binProfiles(coverageFile) 254 255 prettyTableFormats = [1, 2, 3, 9] 256 257 header = self.__getHeader(outputFormat, binIdToBinMarkerSets[binIdToBinMarkerSets.keys()[0]], coverageBinProfiles, bTabTable) 258 if bTabTable or outputFormat not in prettyTableFormats: 259 bTabTable = True 260 pTable = None 261 262 if header != None: 263 print('\t'.join(header)) 264 else: 265 pTable = prettytable.PrettyTable(header) 266 pTable.float_format = '.2' 267 pTable.align = 'c' 268 pTable.align[header[0]] = 'l' 269 pTable.hrules = prettytable.FRAME 270 pTable.vrules = prettytable.NONE 271 272 seqsReported = 0 273 for binId in sorted(self.results.keys()): 274 seqsReported += self.results[binId].printSummary(outputFormat, aai, binIdToBinMarkerSets[binId], bIndividualMarkers, coverageBinProfiles, pTable, anaFolder) 275 276 if outputFormat in [6, 7] and seqsReported == 0: 277 print('[No marker genes satisfied the reporting criteria.]') 278 279 if not bTabTable: 280 if outputFormat in [1, 2]: 281 print(pTable.get_string(sortby='Completeness', reversesort=True)) 282 else: 283 # only print if there are rows 284 if pTable.get_string(print_empty=False): 285 print(pTable.get_string(print_empty=False)) 286 287 # restore stdout 288 restoreStdOut(outFile, oldStdOut) 289 290 291class ResultsManager(): 292 """Store all the results for a single bin""" 293 def __init__(self, binId, models, 294 bIgnoreThresholds=False, 295 evalueThreshold=DefaultValues.E_VAL, 296 lengthThreshold=DefaultValues.LENGTH, 297 bSkipPseudoGeneCorrection=False, 298 binStats=None): 299 self.binId = binId 300 self.markerHits = {} 301 self.bIgnoreThresholds = bIgnoreThresholds 302 self.evalueThreshold = evalueThreshold 303 self.lengthThreshold = lengthThreshold 304 self.bSkipPseudoGeneCorrection = bSkipPseudoGeneCorrection 305 self.models = models 306 self.binStats = binStats 307 308 def vetHit(self, hit): 309 """Check if hit meets required thresholds.""" 310 model = self.models[hit.query_accession] 311 312 # preferentially use model specific bit score thresholds, before 313 # using the user specified e-value and length criteria 314 315 # Give preference to the gathering threshold unless the model 316 # is marked as TIGR (i.e., TIGRFAM model) 317 318 if not self.bSkipPseudoGeneCorrection: 319 alignment_length = float(hit.ali_to - hit.ali_from) 320 length_perc = alignment_length / float(hit.query_length) 321 if length_perc < DefaultValues.PSEUDOGENE_LENGTH: 322 return False 323 324 if model.nc != None and not self.bIgnoreThresholds and 'TIGR' in model.acc: 325 if model.nc[0] <= hit.full_score and model.nc[1] <= hit.dom_score: 326 return True 327 elif model.ga != None and not self.bIgnoreThresholds: 328 if model.ga[0] <= hit.full_score and model.ga[1] <= hit.dom_score: 329 return True 330 elif model.tc != None and not self.bIgnoreThresholds: 331 if model.tc[0] <= hit.full_score and model.tc[1] <= hit.dom_score: 332 return True 333 elif model.nc != None and not self.bIgnoreThresholds: 334 if model.nc[0] <= hit.full_score and model.nc[1] <= hit.dom_score: 335 return True 336 else: 337 if hit.full_e_value > self.evalueThreshold: 338 return False 339 340 alignment_length = float(hit.ali_to - hit.ali_from) 341 length_perc = alignment_length / float(hit.query_length) 342 if length_perc >= self.lengthThreshold: 343 return True 344 345 return False 346 347 def addHit(self, hit): 348 """Process hit and add it to the set of markers if it passes filtering criteria.""" 349 if self.vetHit(hit): 350 if hit.query_accession in self.markerHits: 351 # retain only the best domain hit for a given marker to a specific ORF 352 previousHitToORF = None 353 for h in self.markerHits[hit.query_accession]: 354 if h.target_name == hit.target_name: 355 previousHitToORF = h 356 break 357 358 if not previousHitToORF: 359 self.markerHits[hit.query_accession].append(hit) 360 else: 361 if previousHitToORF.dom_score < hit.dom_score: 362 self.markerHits[hit.query_accession].append(hit) 363 self.markerHits[hit.query_accession].remove(previousHitToORF) 364 365 else: 366 self.markerHits[hit.query_accession] = [hit] 367 368 def identifyAdjacentMarkerGenes(self): 369 """Identify adjacent marker genes and exclude these from the contamination estimate.""" 370 371 # check for adjacent ORFs with hits to the same marker gene 372 for markerId, hits in self.markerHits.iteritems(): 373 374 bCombined = True 375 while bCombined: 376 for i in xrange(0, len(hits)): 377 orfI = hits[i].target_name 378 scaffoldIdI = orfI[0:orfI.rfind('_')] 379 380 bCombined = False 381 for j in xrange(i + 1, len(hits)): 382 orfJ = hits[j].target_name 383 scaffoldIdJ = orfJ[0:orfJ.rfind('_')] 384 385 # check if hits are on adjacent ORFs 386 if scaffoldIdI == scaffoldIdJ: 387 try: 388 orfNumI = int(orfI[orfI.rfind('_') + 1:]) 389 orfNumJ = int(orfJ[orfJ.rfind('_') + 1:]) 390 except: 391 # it appears called genes are not labeled 392 # according to the prodigal format, so 393 # it is not possible to perform this correction 394 break 395 396 if abs(orfNumI - orfNumJ) == 1: 397 # check if hits are to different parts of the HMM 398 sI = hits[i].hmm_from 399 eI = hits[i].hmm_to 400 401 sJ = hits[j].hmm_from 402 eJ = hits[j].hmm_to 403 404 if (sI <= sJ and eI > sJ) or (sJ <= sI and eJ > sI): 405 # models overlap so this could represent contamination, 406 # but it seems more likely that adjacent genes hitting 407 # the same marker represent legitimate gene duplication, 408 # a gene calling error, or an assembly error and thus 409 # should not be treated as contamination 410 bCombined = True 411 break 412 else: 413 # combine the two hits 414 bCombined = True 415 break 416 417 if bCombined: 418 newHit = hits[i] 419 420 # produce concatenated label indicating the two genes being combined 421 orfA, orfB = sorted([orfI, orfJ]) 422 newHit.target_name = DefaultValues.SEQ_CONCAT_CHAR.join([orfA, orfB]) 423 424 newHit.target_length = hits[i].target_length + hits[j].target_length 425 newHit.hmm_from = min(hits[i].hmm_from, hits[j].hmm_from) 426 newHit.hmm_to = min(hits[i].hmm_to, hits[j].hmm_to) 427 428 newHit.ali_from = min(hits[i].ali_from, hits[j].ali_from) 429 newHit.ali_to = min(hits[i].ali_to, hits[j].ali_to) 430 431 newHit.env_from = min(hits[i].env_from, hits[j].env_from) 432 newHit.env_to = min(hits[i].env_to, hits[j].env_to) 433 434 hits.remove(hits[j]) 435 hits.remove(hits[i]) 436 437 hits.append(newHit) 438 439 break 440 441 self.markerHits[markerId] = hits 442 443 def countUniqueHits(self): 444 """Determine number of unique and multiple hits.""" 445 uniqueHits = 0 446 multiCopyHits = 0 447 for hits in self.markerHits.values(): 448 if len(hits) == 1: 449 uniqueHits += 1 450 elif len(hits) > 1: 451 multiCopyHits += 1 452 453 return uniqueHits, multiCopyHits 454 455 def hitsToMarkerGene(self, markerSet): 456 """Determine number of times each marker gene is hit.""" 457 ret = dict() 458 for marker in markerSet.getMarkerGenes(): 459 try: 460 ret[marker] = len(self.markerHits[marker]) 461 except KeyError: 462 ret[marker] = 0 463 464 return ret 465 466 def geneCountsForSelectedMarkerSet(self, binMarkerSets, bIndividualMarkers): 467 """Report number of times each marker gene was found along with 468 a completeness and contamination estimation for the selected marker 469 set associated with a bin.""" 470 geneCounts = self.geneCounts(binMarkerSets.selectedMarkerSet(), self.markerHits, bIndividualMarkers) 471 472 return geneCounts 473 474 def geneCounts(self, markerSet, markerHits, bIndividualMarkers): 475 """ Determine number of marker genes with 0-5 hits 476 as well as the total completeness and contamination.""" 477 geneCounts = [0] * 6 478 multiCopyCount = 0 479 for marker in markerSet.getMarkerGenes(): 480 if marker in markerHits: 481 if len(markerHits[marker]) > 5: 482 markerCount = 5 483 else: 484 markerCount = len(markerHits[marker]) 485 486 multiCopyCount += (len(markerHits[marker]) - 1) 487 else: 488 markerCount = 0 489 490 geneCounts[markerCount] += 1 491 492 percComp, percCont = markerSet.genomeCheck(markerHits, bIndividualMarkers) 493 494 geneCounts.append(percComp) 495 geneCounts.append(percCont) 496 497 return geneCounts 498 499 def geneCopyNumber(self, binMarkerSets): 500 """ Determine number of times each marker gene is present.""" 501 geneCopyNumber = {} 502 geneCopyNumber['GCN0'] = [] 503 geneCopyNumber['GCN1'] = [] 504 geneCopyNumber['GCN2'] = [] 505 geneCopyNumber['GCN3'] = [] 506 geneCopyNumber['GCN4'] = [] 507 geneCopyNumber['GCN5+'] = [] 508 509 markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes() 510 511 for marker in self.models: 512 if marker not in markerGenes: 513 continue 514 515 markerId = os.path.splitext(marker)[0] 516 if marker in self.markerHits: 517 if len(self.markerHits[marker]) >= 5: 518 geneCopyNumber['GCN5+'].append(markerId) 519 else: 520 geneCopyNumber['GCN' + str(len(self.markerHits[marker]))].append(markerId) 521 else: 522 geneCopyNumber['GCN0'].append(markerId) 523 524 return geneCopyNumber 525 526 def getSummary(self, binMarkerSets, bIndividualMarkers, outputFormat=1): 527 """Get dictionary containing information about bin.""" 528 summary = {} 529 530 if outputFormat == 1: 531 selectedMarkerSet = binMarkerSets.selectedMarkerSet() 532 data = self.geneCountsForSelectedMarkerSet(binMarkerSets, bIndividualMarkers) 533 534 summary['marker lineage'] = selectedMarkerSet.lineageStr 535 summary['# genomes'] = selectedMarkerSet.numGenomes 536 summary['# markers'] = selectedMarkerSet.numMarkers() 537 summary['# marker sets'] = selectedMarkerSet.numSets() 538 summary['0'] = data[0] 539 summary['1'] = data[1] 540 summary['2'] = data[2] 541 summary['3'] = data[3] 542 summary['4'] = data[4] 543 summary['5+'] = data[5] 544 summary['Completeness'] = data[6] 545 summary['Contamination'] = data[7] 546 elif outputFormat == 2: 547 selectedMarkerSet = binMarkerSets.selectedMarkerSet() 548 data = self.geneCountsForSelectedMarkerSet(binMarkerSets, bIndividualMarkers) 549 550 summary['marker lineage'] = selectedMarkerSet.lineageStr 551 summary['# genomes'] = selectedMarkerSet.numGenomes 552 summary['# markers'] = selectedMarkerSet.numMarkers() 553 summary['# marker sets'] = selectedMarkerSet.numSets() 554 summary['0'] = data[0] 555 summary['1'] = data[1] 556 summary['2'] = data[2] 557 summary['3'] = data[3] 558 summary['4'] = data[4] 559 summary['5+'] = data[5] 560 summary['Completeness'] = data[6] 561 summary['Contamination'] = data[7] 562 summary.update(self.binStats) 563 564 elif outputFormat == 5: 565 # tabular of bin_id, marker, contig_id 566 markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes() 567 568 for marker, hit_list in self.markerHits.items(): 569 if marker not in markerGenes: 570 continue 571 572 summary[marker] = [] 573 for hit in hit_list: 574 summary[marker].append(hit.target_name) 575 576 elif outputFormat == 6: 577 markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes() 578 579 for marker, hit_list in self.markerHits.items(): 580 if marker not in markerGenes: 581 continue 582 583 if len(hit_list) >= 2: 584 summary[marker] = [] 585 for hit in hit_list: 586 summary[marker].append(hit.target_name) 587 588 elif outputFormat == 7: 589 # tabular - print only contigs that have more than one copy 590 # of the same marker on them 591 markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes() 592 593 contigs = defaultdict(dict) 594 for marker, hit_list in self.markerHits.items(): 595 if marker not in markerGenes: 596 continue 597 598 for hit in hit_list: 599 try: 600 contigs[hit.target_name][marker] += 1 601 except KeyError: 602 contigs[hit.target_name][marker] = 1 603 604 for contig_name, marker_counts in contigs.items(): 605 for marker_name, marker_count in marker_counts.items(): 606 if marker_count > 1: 607 if contig_name not in summary: 608 summary[contig_name] = {} 609 610 summary[contig_name][marker_name] = marker_count 611 612 elif outputFormat == 8: 613 # tabular - print only position of marker genes 614 markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes() 615 616 genesWithMarkers = {} 617 for marker, hit_list in self.markerHits.items(): 618 if marker not in markerGenes: 619 continue 620 621 for hit in hit_list: 622 genesWithMarkers[hit.target_name] = genesWithMarkers.get(hit.target_name, []) + [hit] 623 624 for geneId, hits in genesWithMarkers.iteritems(): 625 summary[geneId] = {} 626 for hit in hits: 627 summary[geneId][hit.query_accession] = summary[geneId].get(hit.query_accession, []) + [[hit.ali_from, hit.ali_to]] 628 else: 629 print("Unknown output format: ", outputFormat) 630 631 return summary 632 633 def printSummary(self, outputFormat, aai, binMarkerSets, bIndividualMarkers, coverageBinProfiles=None, table=None, anaFolder=None): 634 635 """Print out information about bin.""" 636 if outputFormat == 1: 637 selectedMarkerSet = binMarkerSets.selectedMarkerSet() 638 639 lineageStr = selectedMarkerSet.lineageStr 640 if selectedMarkerSet.UID != '0': 641 lineageStr += ' (' + str(selectedMarkerSet.UID) + ')' 642 643 data = self.geneCountsForSelectedMarkerSet(binMarkerSets, bIndividualMarkers) 644 row = "%s\t%s\t%d\t%d\t%d\t%s\t%0.2f\t%0.2f\t%0.2f" % (self.binId, lineageStr, 645 selectedMarkerSet.numGenomes, selectedMarkerSet.numMarkers(), selectedMarkerSet.numSets(), 646 "\t".join([str(data[i]) for i in range(6)]), 647 data[6], 648 data[7], 649 aai.aaiMeanBinHetero.get(self.binId, 0.0) 650 ) 651 if table == None: 652 print(row) 653 else: 654 table.add_row([self.binId, lineageStr, selectedMarkerSet.numGenomes, selectedMarkerSet.numMarkers(), selectedMarkerSet.numSets()] + data + [aai.aaiMeanBinHetero.get(self.binId, 0.0)]) 655 elif outputFormat == 2: 656 selectedMarkerSet = binMarkerSets.selectedMarkerSet() 657 658 lineageStr = selectedMarkerSet.lineageStr 659 if selectedMarkerSet.UID != '0': 660 lineageStr += ' (' + str(selectedMarkerSet.UID) + ')' 661 662 data = self.geneCountsForSelectedMarkerSet(binMarkerSets, bIndividualMarkers) 663 664 if table == None: 665 row = self.binId 666 row += '\t%s\t%d\t%d\t%d' % (lineageStr, selectedMarkerSet.numGenomes, selectedMarkerSet.numMarkers(), selectedMarkerSet.numSets()) 667 row += '\t%0.2f\t%0.2f\t%0.2f' % (data[6], data[7], aai.aaiMeanBinHetero.get(self.binId, 0.0)) 668 row += '\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d' % (self.binStats['Genome size'], self.binStats['# ambiguous bases'], 669 self.binStats['# scaffolds'], self.binStats['# contigs'], 670 self.binStats['N50 (scaffolds)'], self.binStats['N50 (contigs)'], 671 self.binStats['Mean scaffold length'], self.binStats['Mean contig length'], 672 self.binStats['Longest scaffold'], self.binStats['Longest contig']) 673 row += '\t%.1f\t%.2f' % (self.binStats['GC'] * 100, self.binStats['GC std'] * 100) 674 row += '\t%.2f\t%d\t%d' % (self.binStats['Coding density'] * 100, self.binStats['Translation table'], self.binStats['# predicted genes']) 675 row += '\t' + '\t'.join([str(data[i]) for i in xrange(6)]) 676 677 if coverageBinProfiles: 678 if self.binId in coverageBinProfiles: 679 for _, coverageStats in coverageBinProfiles[self.binId].iteritems(): 680 row += '\t%.2f\t%.2f' % (coverageStats[0], coverageStats[1]) 681 else: 682 for bamId in coverageBinProfiles[coverageBinProfiles.keys()[0]]: 683 row += '\t%.2f\t%.2f' % (0, 0) 684 print(row) 685 else: 686 row = [self.binId, lineageStr, selectedMarkerSet.numGenomes, selectedMarkerSet.numMarkers(), selectedMarkerSet.numSets()] 687 row.extend([data[6], data[7], aai.aaiMeanBinHetero.get(self.binId, 0.0)]) 688 row.extend([self.binStats['Genome size'], self.binStats['# ambiguous bases'], self.binStats['# scaffolds'], 689 self.binStats['# contigs'], self.binStats['N50 (scaffolds)'], self.binStats['N50 (contigs)'], 690 int(self.binStats['Mean scaffold length']), int(self.binStats['Mean contig length']), 691 self.binStats['Longest scaffold'], self.binStats['Longest contig']]) 692 row.extend([self.binStats['GC'] * 100, self.binStats['GC std'] * 100]) 693 row.extend([self.binStats['Coding density'] * 100, self.binStats['Translation table'], self.binStats['# predicted genes']]) 694 row.extend(data[0:6]) 695 696 if coverageBinProfiles: 697 if self.binId in coverageBinProfiles: 698 for _, coverageStats in coverageBinProfiles[self.binId].iteritems(): 699 row.extend(coverageStats) 700 else: 701 for bamId in coverageBinProfiles[coverageBinProfiles.keys()[0]]: 702 row.extend([0,0]) 703 704 table.add_row(row) 705 elif outputFormat == 3: 706 for ms in binMarkerSets.markerSetIter(): 707 data = self.geneCounts(ms, self.markerHits, bIndividualMarkers) 708 row = "%s\t%s\t%s\t%d\t%d\t%d\t%s\t%0.2f\t%0.2f\t%0.2f" % (self.binId, ms.UID, ms.lineageStr, ms.numGenomes, 709 ms.numMarkers(), ms.numSets(), 710 "\t".join([str(data[i]) for i in range(6)]), 711 data[6], 712 data[7], 713 aai.aaiMeanBinHetero.get(self.binId, 0.0) 714 ) 715 if table == None: 716 print(row) 717 else: 718 table.add_row([self.binId, ms.UID, ms.lineageStr, ms.numGenomes, ms.numMarkers(), ms.numSets()] + data + [aai.aaiMeanBinHetero.get(self.binId, 0.0)]) 719 720 elif outputFormat == 4: 721 selectedMarkerSet = binMarkerSets.selectedMarkerSet() 722 data = self.hitsToMarkerGene(binMarkerSets.selectedMarkerSet()) 723 row = "Node Id: %s; Marker lineage: %s" % (selectedMarkerSet.UID, selectedMarkerSet.lineageStr) 724 for marker in data: 725 row += '\t' + marker 726 print(row) 727 728 row = self.binId 729 for count in data.values(): 730 row += '\t' + str(count) 731 print(row) 732 733 print() 734 elif outputFormat == 5: 735 # tabular of bin_id, marker, contig_id 736 markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes() 737 738 for marker, hit_list in self.markerHits.items(): 739 if marker not in markerGenes: 740 continue 741 742 for hit in hit_list: 743 print(self.binId, marker, hit.target_name, sep='\t', end='\n') 744 745 elif outputFormat == 6: 746 markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes() 747 748 seqsReported = 0 749 for marker, hitList in self.markerHits.items(): 750 if marker not in markerGenes: 751 continue 752 753 if len(hitList) >= 2: 754 print(self.binId, marker, sep='\t', end='\t') 755 756 scaffoldIds = [] 757 for hit in hitList: 758 scaffoldIds.append(hit.target_name) 759 760 print(','.join(sorted(scaffoldIds)), end='\n') 761 762 seqsReported += 1 763 764 return seqsReported 765 766 elif outputFormat == 7: 767 markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes() 768 769 seqsReported = 0 770 for marker, hitList in self.markerHits.items(): 771 if marker not in markerGenes: 772 continue 773 774 if len(hitList) >= 2: 775 scaffoldsWithMultipleHits = set() 776 for i in xrange(0, len(hitList)): 777 scaffoldId = hitList[i].target_name[0:hitList[i].target_name.rfind('_')] 778 for j in xrange(i + 1, len(hitList)): 779 if scaffoldId == hitList[j].target_name[0:hitList[j].target_name.rfind('_')]: 780 scaffoldsWithMultipleHits.add(hitList[i].target_name) 781 scaffoldsWithMultipleHits.add(hitList[j].target_name) 782 783 if len(scaffoldsWithMultipleHits) >= 2: 784 print(self.binId, marker, sep='\t', end='\t') 785 print(','.join(sorted(list(scaffoldsWithMultipleHits))), end='\n') 786 seqsReported += 1 787 788 return seqsReported 789 790 elif outputFormat == 8: 791 # tabular - print only position of marker genes 792 markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes() 793 794 genesWithMarkers = {} 795 for marker, hit_list in self.markerHits.items(): 796 if marker not in markerGenes: 797 continue 798 799 for hit in hit_list: 800 genesWithMarkers[hit.target_name] = genesWithMarkers.get(hit.target_name, []) + [hit] 801 802 for geneId, hits in genesWithMarkers.iteritems(): 803 rowStr = self.binId + '\t' + geneId 804 for hit in hits: 805 rowStr += '\t' + hit.query_accession + ',' + str(hit.ali_from) + ',' + str(hit.ali_to) 806 print(rowStr) 807 808 # Hunter Cameron, May 29, 2015 - print a fasta of marker genes 809 elif outputFormat == 9: 810 # tabular of bin_id, marker, contig_id 811 812 # check for the analyze folder for later use 813 if anaFolder is None: 814 raise ValueError("AnaFolder must not be None for outputFormat 9") 815 816 # ## build a dict to link target_names with marker gene alignment information 817 markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes() 818 hitInfo = {} 819 for marker, hit_list in self.markerHits.items(): 820 if marker not in markerGenes: 821 continue 822 823 for hit in hit_list: 824 name = hit.target_name 825 hitInfo[name] = { 826 "marker": marker, 827 "ali_from": str(hit.ali_from), 828 "ali_to": str(hit.ali_to) 829 } 830 831 # ## Open genes.faa and print the ones that were found with some descriptive info in the header 832 path_to_genes = "/".join([anaFolder, "bins", self.binId, "genes.faa"]) 833 834 # get only the seqs we need and their information as a dict 835 seqs = readFasta(path_to_genes, trimHeader=False) 836 837 filt_seqs = [] 838 # remove seqs without markers 839 for header in seqs.keys(): 840 gene_name = header.split(" # ")[0] 841 if gene_name in hitInfo: 842 filt_seqs.append(header) 843 844 def sort_header(header): 845 """ sorts headers by contig and gene number """ 846 name = header.split(" # ")[0] 847 ctg_name, gene_num = name.rsplit("_", 1) 848 return ctg_name, int(gene_num) 849 850 for header in sorted(filt_seqs, key=sort_header): 851 elems = header.split(" # ") 852 gene_name = elems[0] 853 854 # remove the gene number from Prodigal to get the original contig name 855 contig_name, gene_num = gene_name.rsplit("_", 1) 856 857 # parse some info about the gene from the header line 858 gene_start = elems[1] 859 gene_end = elems[2] 860 gene_strand = elems[3] 861 862 # if table output not specified, print FASTA 863 if table != None: 864 gene_info = "geneId={};start={};end={};strand={};protlen={}".format( 865 gene_num, gene_start, gene_end, gene_strand, str(len(seqs[header]))) 866 867 marker_info = "marker={};mstart={};mend={}".format( 868 hitInfo[gene_name]["marker"], 869 hitInfo[gene_name]["ali_from"], 870 hitInfo[gene_name]["ali_to"]) 871 872 # new header will be the bin name, contig name, gene info, and marker info separated by spaces 873 new_header = ">" + " ".join([self.binId, contig_name, gene_info, marker_info]) 874 875 print(new_header, seqs[header], sep="\n") 876 # otherwise, print a table 877 else: 878 print("\t".join([ 879 self.binId, 880 contig_name, 881 gene_num, 882 gene_start, 883 gene_end, 884 gene_strand, 885 str(len(seqs[header])), 886 hitInfo[gene_name]["marker"], 887 hitInfo[gene_name]["ali_from"], 888 hitInfo[gene_name]["ali_to"], 889 seqs[header] 890 ])) 891 else: 892 self.logger.error("Unknown output format: %d", outputFormat) 893 894 return 0 895 896 ''' 897 elif outputFormat == 10: 898 markerGenes = binMarkerSets.selectedMarkerSet().getMarkerGenes() 899 900 markersInScaffold = {} 901 for marker, hit_list in self.markerHits.items(): 902 if marker not in markerGenes: 903 continue 904 905 for hit in hit_list: 906 scaffoldId = hit.target_name[0:hit.target_name.rfind('_')] 907 markersInScaffold[scaffoldId] = markersInScaffold.get(scaffoldId, []) + [marker] 908 909 for scaffoldId, data in self.scaffoldStats.iteritems(): 910 print(scaffoldId, self.binId, str(data['Length']), str(data['# contigs']), 911 '%.3f' % (data['GC']), str(data.get('# ORFs', 0)), 912 '%.3f' % (float(data.get('Coding bases', 0)) / data['Total contig length']), 913 sep='\t', end='\t') 914 915 if scaffoldId in markersInScaffold: 916 markerStr = ','.join(sorted(markersInScaffold[scaffoldId])) 917 print(markerStr, end='\n') 918 else: 919 print() 920 ''' 921