1#!/usr/bin/env python
2
3""" MultiQC module to parse output from featureCounts """
4
5from __future__ import print_function
6from collections import OrderedDict
7import logging
8import re
9
10from multiqc import config
11from multiqc.plots import bargraph
12from multiqc.modules.base_module import BaseMultiqcModule
13
14# Initialise the logger
15log = logging.getLogger(__name__)
16
17
18class MultiqcModule(BaseMultiqcModule):
19    def __init__(self):
20
21        # Initialise the parent object
22        super(MultiqcModule, self).__init__(
23            name="featureCounts",
24            anchor="featurecounts",
25            target="Subread featureCounts",
26            href="http://bioinf.wehi.edu.au/featureCounts/",
27            info="is a highly efficient general-purpose read summarization program"
28            " that counts mapped reads for genomic features such as genes, exons,"
29            " promoter, gene bodies, genomic bins and chromosomal locations.",
30        )
31
32        # Find and load any featureCounts reports
33        self.featurecounts_data = dict()
34        self.featurecounts_keys = list()
35        for f in self.find_log_files("featurecounts"):
36            self.parse_featurecounts_report(f)
37
38        # Filter to strip out ignored sample names
39        self.featurecounts_data = self.ignore_samples(self.featurecounts_data)
40
41        if len(self.featurecounts_data) == 0:
42            raise UserWarning
43
44        log.info("Found {} reports".format(len(self.featurecounts_data)))
45
46        # Write parsed report data to a file
47        self.write_data_file(self.featurecounts_data, "multiqc_featureCounts")
48
49        # Basic Stats Table
50        self.featurecounts_stats_table()
51
52        # Assignment bar plot
53        self.add_section(plot=self.featureCounts_chart())
54
55    def parse_featurecounts_report(self, f):
56        """ Parse the featureCounts log file. """
57
58        file_names = list()
59        parsed_data = dict()
60        split_sep = "\t"
61        for l in f["f"].splitlines():
62            thisrow = list()
63
64            # If this is from Rsubread then the formatting can be very variable
65            # Default search pattern is quite generic, so f
66            if len(file_names) == 0 and len(l.split(split_sep)) < 2:
67                # Split by whitespace and strip quote marks
68                # NB: Will break if sample names have whitespace. RSubread output is so variable that this is difficult to avoid
69                split_sep = None
70
71            s = l.split(split_sep)
72            s = [sv.strip('"') for sv in s]
73
74            if len(s) < 2:
75                continue
76            if s[0] == "Status":
77                for f_name in s[1:]:
78                    file_names.append(f_name)
79            elif len(file_names) + 1 == len(s):
80                k = s[0]
81                if k not in self.featurecounts_keys:
82                    self.featurecounts_keys.append(k)
83                for val in s[1:]:
84                    try:
85                        thisrow.append(int(val))
86                    except ValueError:
87                        pass
88            if len(thisrow) > 0:
89                parsed_data[k] = thisrow
90
91        # Check that this actually is a featureCounts file, as format and parsing is quite general
92        if "Assigned" not in parsed_data.keys():
93            return None
94
95        for idx, f_name in enumerate(file_names):
96
97            # Clean up sample name
98            s_name = self.clean_s_name(f_name, f["root"])
99
100            # Reorganised parsed data for this sample
101            # Collect total count number
102            data = dict()
103            data["Total"] = 0
104            for k in parsed_data:
105                data[k] = parsed_data[k][idx]
106                data["Total"] += parsed_data[k][idx]
107
108            # Calculate the percent aligned if we can
109            try:
110                data["percent_assigned"] = (float(data["Assigned"]) / float(data["Total"])) * 100.0
111            except (KeyError, ZeroDivisionError):
112                pass
113
114            # Add to the main dictionary
115            if len(data) > 1:
116                if s_name in self.featurecounts_data:
117                    log.debug("Duplicate sample name found! Overwriting: {}".format(s_name))
118                self.add_data_source(f, s_name)
119                self.featurecounts_data[s_name] = data
120
121    def featurecounts_stats_table(self):
122        """Take the parsed stats from the featureCounts report and add them to the
123        basic stats table at the top of the report"""
124
125        headers = OrderedDict()
126        headers["percent_assigned"] = {
127            "title": "% Assigned",
128            "description": "% Assigned reads",
129            "max": 100,
130            "min": 0,
131            "suffix": "%",
132            "scale": "RdYlGn",
133        }
134        headers["Assigned"] = {
135            "title": "{} Assigned".format(config.read_count_prefix),
136            "description": "Assigned reads ({})".format(config.read_count_desc),
137            "min": 0,
138            "scale": "PuBu",
139            "modify": lambda x: float(x) * config.read_count_multiplier,
140            "shared_key": "read_count",
141        }
142        self.general_stats_addcols(self.featurecounts_data, headers)
143
144    def featureCounts_chart(self):
145        """ Make the featureCounts assignment rates plot """
146
147        headers = OrderedDict()
148        for h in self.featurecounts_keys:
149            nice_name = h.replace("Unassigned_", "Unassigned: ").replace("_", " ")
150            nice_name = re.sub(r"([a-z])([A-Z])", "\g<1> \g<2>", nice_name)
151            headers[h] = {"name": nice_name}
152
153        # Config for the plot
154        config = {
155            "id": "featureCounts_assignment_plot",
156            "title": "featureCounts: Assignments",
157            "ylab": "# Reads",
158            "cpswitch_counts_label": "Number of Reads",
159        }
160
161        return bargraph.plot(self.featurecounts_data, headers, config)
162