1#!/usr/bin/env python
2
3""" MultiQC module to parse output from Tophat """
4
5from __future__ import print_function
6from collections import OrderedDict
7import logging
8import os
9import re
10
11from multiqc import config
12from multiqc.plots import bargraph
13from multiqc.modules.base_module import BaseMultiqcModule
14
15# Initialise the logger
16log = logging.getLogger(__name__)
17
18
19class MultiqcModule(BaseMultiqcModule):
20    def __init__(self):
21
22        # Initialise the parent object
23        super(MultiqcModule, self).__init__(
24            name="Tophat",
25            anchor="tophat",
26            href="https://ccb.jhu.edu/software/tophat/",
27            info="is a fast splice junction mapper for RNA-Seq reads. "
28            "It aligns RNA-Seq reads to mammalian-sized genomes.",
29        )
30
31        # Find and load any Tophat reports
32        self.tophat_data = dict()
33        for f in self.find_log_files("tophat"):
34            parsed_data = self.parse_tophat_log(f["f"])
35            if parsed_data is not None:
36                if f["s_name"] == "align" or f["s_name"] == "align_summary.txt":
37                    s_name = os.path.basename(f["root"])
38                else:
39                    s_name = f["s_name"].split("align_summary.txt", 1)[0]
40                s_name = self.clean_s_name(s_name, f["root"])
41                if s_name in self.tophat_data:
42                    log.debug("Duplicate sample name found! Overwriting: {}".format(s_name))
43                self.add_data_source(f, s_name)
44                self.tophat_data[s_name] = parsed_data
45
46        # Filter to strip out ignored sample names
47        self.tophat_data = self.ignore_samples(self.tophat_data)
48
49        if len(self.tophat_data) == 0:
50            raise UserWarning
51
52        log.info("Found {} reports".format(len(self.tophat_data)))
53
54        # Write parsed report data to a file
55        self.write_data_file(self.tophat_data, "multiqc_tophat.txt")
56
57        # Basic Stats Table
58        self.tophat_general_stats_table()
59
60        # Alignment Rate Plot
61        self.tophat_alignment_plot()
62
63    def parse_tophat_log(self, raw_data):
64        """ Parse the Tophat alignment log file. """
65
66        if "Aligned pairs" in raw_data:
67            # Paired end data
68            regexes = {
69                "overall_aligned_percent": r"([\d\.]+)% overall read mapping rate.",
70                "concordant_aligned_percent": r"([\d\.]+)% concordant pair alignment rate.",
71                "aligned_total": r"Aligned pairs:\s+(\d+)",
72                "aligned_multimap": r"Aligned pairs:\s+\d+\n\s+of these:\s+(\d+)",
73                "aligned_discordant": r"(\d+) \([\s\d\.]+%\) are discordant alignments",
74                "total_reads": r"[Rr]eads:\n\s+Input\s*:\s+(\d+)",
75            }
76        else:
77            # Single end data
78            regexes = {
79                "total_reads": r"[Rr]eads:\n\s+Input\s*:\s+(\d+)",
80                "aligned_total": r"Mapped\s*:\s+(\d+)",
81                "aligned_multimap": r"of these\s*:\s+(\d+)",
82                "overall_aligned_percent": r"([\d\.]+)% overall read mapping rate.",
83            }
84
85        parsed_data = {}
86        for k, r in regexes.items():
87            r_search = re.search(r, raw_data, re.MULTILINE)
88            if r_search:
89                parsed_data[k] = float(r_search.group(1))
90
91        # Exit if we didn't manage to parse enough fields - probably not a TopHat log
92        # Note that Bowtie2 / HiSAT2 logs contain some but not all of these strings
93        if len(parsed_data) < 4:
94            return None
95
96        parsed_data["concordant_aligned_percent"] = parsed_data.get("concordant_aligned_percent", 0)
97        parsed_data["aligned_total"] = parsed_data.get("aligned_total", 0)
98        parsed_data["aligned_multimap"] = parsed_data.get("aligned_multimap", 0)
99        parsed_data["aligned_discordant"] = parsed_data.get("aligned_discordant", 0)
100        parsed_data["unaligned_total"] = parsed_data["total_reads"] - parsed_data["aligned_total"]
101        parsed_data["aligned_not_multimapped_discordant"] = (
102            parsed_data["aligned_total"] - parsed_data["aligned_multimap"] - parsed_data["aligned_discordant"]
103        )
104        return parsed_data
105
106    def tophat_general_stats_table(self):
107        """Take the parsed stats from the Tophat report and add it to the
108        basic stats table at the top of the report"""
109
110        headers = OrderedDict()
111        headers["overall_aligned_percent"] = {
112            "title": "% Aligned",
113            "description": "overall read mapping rate",
114            "max": 100,
115            "min": 0,
116            "suffix": "%",
117            "scale": "YlGn",
118        }
119        headers["aligned_not_multimapped_discordant"] = {
120            "title": "{} Aligned".format(config.read_count_prefix),
121            "description": "Aligned reads, not multimapped or discordant ({})".format(config.read_count_desc),
122            "min": 0,
123            "scale": "PuRd",
124            "modify": lambda x: x * config.read_count_multiplier,
125            "shared_key": "read_count",
126        }
127        self.general_stats_addcols(self.tophat_data, headers)
128
129    def tophat_alignment_plot(self):
130        """ Make the HighCharts HTML to plot the alignment rates """
131
132        # Specify the order of the different possible categories
133        keys = OrderedDict()
134        keys["aligned_not_multimapped_discordant"] = {"color": "#437bb1", "name": "Aligned"}
135        keys["aligned_multimap"] = {"color": "#f7a35c", "name": "Multimapped"}
136        keys["aligned_discordant"] = {"color": "#e63491", "name": "Discordant mappings"}
137        keys["unaligned_total"] = {"color": "#7f0000", "name": "Not aligned"}
138
139        # Config for the plot
140        config = {
141            "id": "tophat_alignment",
142            "title": "Tophat: Alignment Scores",
143            "ylab": "# Reads",
144            "cpswitch_counts_label": "Number of Reads",
145        }
146
147        self.add_section(plot=bargraph.plot(self.tophat_data, keys, config))
148