1#!/usr/bin/env python 2 3""" MultiQC module to parse output from Tophat """ 4 5from __future__ import print_function 6from collections import OrderedDict 7import logging 8import os 9import re 10 11from multiqc import config 12from multiqc.plots import bargraph 13from multiqc.modules.base_module import BaseMultiqcModule 14 15# Initialise the logger 16log = logging.getLogger(__name__) 17 18 19class MultiqcModule(BaseMultiqcModule): 20 def __init__(self): 21 22 # Initialise the parent object 23 super(MultiqcModule, self).__init__( 24 name="Tophat", 25 anchor="tophat", 26 href="https://ccb.jhu.edu/software/tophat/", 27 info="is a fast splice junction mapper for RNA-Seq reads. " 28 "It aligns RNA-Seq reads to mammalian-sized genomes.", 29 ) 30 31 # Find and load any Tophat reports 32 self.tophat_data = dict() 33 for f in self.find_log_files("tophat"): 34 parsed_data = self.parse_tophat_log(f["f"]) 35 if parsed_data is not None: 36 if f["s_name"] == "align" or f["s_name"] == "align_summary.txt": 37 s_name = os.path.basename(f["root"]) 38 else: 39 s_name = f["s_name"].split("align_summary.txt", 1)[0] 40 s_name = self.clean_s_name(s_name, f["root"]) 41 if s_name in self.tophat_data: 42 log.debug("Duplicate sample name found! Overwriting: {}".format(s_name)) 43 self.add_data_source(f, s_name) 44 self.tophat_data[s_name] = parsed_data 45 46 # Filter to strip out ignored sample names 47 self.tophat_data = self.ignore_samples(self.tophat_data) 48 49 if len(self.tophat_data) == 0: 50 raise UserWarning 51 52 log.info("Found {} reports".format(len(self.tophat_data))) 53 54 # Write parsed report data to a file 55 self.write_data_file(self.tophat_data, "multiqc_tophat.txt") 56 57 # Basic Stats Table 58 self.tophat_general_stats_table() 59 60 # Alignment Rate Plot 61 self.tophat_alignment_plot() 62 63 def parse_tophat_log(self, raw_data): 64 """ Parse the Tophat alignment log file. """ 65 66 if "Aligned pairs" in raw_data: 67 # Paired end data 68 regexes = { 69 "overall_aligned_percent": r"([\d\.]+)% overall read mapping rate.", 70 "concordant_aligned_percent": r"([\d\.]+)% concordant pair alignment rate.", 71 "aligned_total": r"Aligned pairs:\s+(\d+)", 72 "aligned_multimap": r"Aligned pairs:\s+\d+\n\s+of these:\s+(\d+)", 73 "aligned_discordant": r"(\d+) \([\s\d\.]+%\) are discordant alignments", 74 "total_reads": r"[Rr]eads:\n\s+Input\s*:\s+(\d+)", 75 } 76 else: 77 # Single end data 78 regexes = { 79 "total_reads": r"[Rr]eads:\n\s+Input\s*:\s+(\d+)", 80 "aligned_total": r"Mapped\s*:\s+(\d+)", 81 "aligned_multimap": r"of these\s*:\s+(\d+)", 82 "overall_aligned_percent": r"([\d\.]+)% overall read mapping rate.", 83 } 84 85 parsed_data = {} 86 for k, r in regexes.items(): 87 r_search = re.search(r, raw_data, re.MULTILINE) 88 if r_search: 89 parsed_data[k] = float(r_search.group(1)) 90 91 # Exit if we didn't manage to parse enough fields - probably not a TopHat log 92 # Note that Bowtie2 / HiSAT2 logs contain some but not all of these strings 93 if len(parsed_data) < 4: 94 return None 95 96 parsed_data["concordant_aligned_percent"] = parsed_data.get("concordant_aligned_percent", 0) 97 parsed_data["aligned_total"] = parsed_data.get("aligned_total", 0) 98 parsed_data["aligned_multimap"] = parsed_data.get("aligned_multimap", 0) 99 parsed_data["aligned_discordant"] = parsed_data.get("aligned_discordant", 0) 100 parsed_data["unaligned_total"] = parsed_data["total_reads"] - parsed_data["aligned_total"] 101 parsed_data["aligned_not_multimapped_discordant"] = ( 102 parsed_data["aligned_total"] - parsed_data["aligned_multimap"] - parsed_data["aligned_discordant"] 103 ) 104 return parsed_data 105 106 def tophat_general_stats_table(self): 107 """Take the parsed stats from the Tophat report and add it to the 108 basic stats table at the top of the report""" 109 110 headers = OrderedDict() 111 headers["overall_aligned_percent"] = { 112 "title": "% Aligned", 113 "description": "overall read mapping rate", 114 "max": 100, 115 "min": 0, 116 "suffix": "%", 117 "scale": "YlGn", 118 } 119 headers["aligned_not_multimapped_discordant"] = { 120 "title": "{} Aligned".format(config.read_count_prefix), 121 "description": "Aligned reads, not multimapped or discordant ({})".format(config.read_count_desc), 122 "min": 0, 123 "scale": "PuRd", 124 "modify": lambda x: x * config.read_count_multiplier, 125 "shared_key": "read_count", 126 } 127 self.general_stats_addcols(self.tophat_data, headers) 128 129 def tophat_alignment_plot(self): 130 """ Make the HighCharts HTML to plot the alignment rates """ 131 132 # Specify the order of the different possible categories 133 keys = OrderedDict() 134 keys["aligned_not_multimapped_discordant"] = {"color": "#437bb1", "name": "Aligned"} 135 keys["aligned_multimap"] = {"color": "#f7a35c", "name": "Multimapped"} 136 keys["aligned_discordant"] = {"color": "#e63491", "name": "Discordant mappings"} 137 keys["unaligned_total"] = {"color": "#7f0000", "name": "Not aligned"} 138 139 # Config for the plot 140 config = { 141 "id": "tophat_alignment", 142 "title": "Tophat: Alignment Scores", 143 "ylab": "# Reads", 144 "cpswitch_counts_label": "Number of Reads", 145 } 146 147 self.add_section(plot=bargraph.plot(self.tophat_data, keys, config)) 148