1#!/usr/bin/env python
2
3""" MultiQC module to parse output from HTSeq Count """
4
5from __future__ import print_function
6from collections import OrderedDict
7import logging
8
9from multiqc import config
10from multiqc.plots import bargraph
11from multiqc.modules.base_module import BaseMultiqcModule
12
13# Initialise the logger
14log = logging.getLogger(__name__)
15
16
17class MultiqcModule(BaseMultiqcModule):
18    def __init__(self):
19
20        # Initialise the parent object
21        super(MultiqcModule, self).__init__(
22            name="HTSeq Count",
23            anchor="htseq",
24            target="HTSeq Count",
25            href="http://www-huber.embl.de/HTSeq/doc/count.html",
26            info=" is part of the HTSeq Python package - it takes a file with aligned sequencing "
27            "reads, plus a list of genomic features and counts how many reads map to each feature.",
28        )
29
30        # Find and load any HTSeq Count reports
31        self.htseq_data = dict()
32        self.htseq_keys = list()
33        for f in self.find_log_files("htseq", filehandles=True):
34            parsed_data = self.parse_htseq_report(f)
35            if parsed_data is not None:
36                self.htseq_data[f["s_name"]] = parsed_data
37
38        # Filter to strip out ignored sample names
39        self.htseq_data = self.ignore_samples(self.htseq_data)
40
41        if len(self.htseq_data) == 0:
42            raise UserWarning
43
44        log.info("Found {} reports".format(len(self.htseq_data)))
45
46        # Write parsed report data to a file
47        self.write_data_file(self.htseq_data, "multiqc_htseq")
48
49        # Basic Stats Table
50        self.htseq_stats_table()
51
52        # Assignment bar plot
53        self.add_section(plot=self.htseq_counts_chart())
54
55    def parse_htseq_report(self, f):
56        """ Parse the HTSeq Count log file. """
57        keys = ["__no_feature", "__ambiguous", "__too_low_aQual", "__not_aligned", "__alignment_not_unique"]
58        parsed_data = dict()
59        assigned_counts = 0
60        for l in f["f"]:
61            s = l.split("\t")
62            if s[0] in keys:
63                parsed_data[s[0][2:]] = int(s[-1])
64            else:
65                try:
66                    assigned_counts += int(s[-1])
67                except (ValueError, IndexError):
68                    pass
69        if len(parsed_data) > 0:
70            parsed_data["assigned"] = assigned_counts
71            parsed_data["total_count"] = sum([v for v in parsed_data.values()])
72            try:
73                parsed_data["percent_assigned"] = (
74                    float(parsed_data["assigned"]) / float(parsed_data["total_count"])
75                ) * 100.0
76            except ZeroDivisionError:
77                parsed_data["percent_assigned"] = 0
78            return parsed_data
79        return None
80
81    def htseq_stats_table(self):
82        """Take the parsed stats from the HTSeq Count report and add them to the
83        basic stats table at the top of the report"""
84
85        headers = OrderedDict()
86        headers["percent_assigned"] = {
87            "title": "% Assigned",
88            "description": "% Assigned reads",
89            "max": 100,
90            "min": 0,
91            "suffix": "%",
92            "scale": "RdYlGn",
93        }
94        headers["assigned"] = {
95            "title": "{} Assigned".format(config.read_count_prefix),
96            "description": "Assigned Reads ({})".format(config.read_count_desc),
97            "min": 0,
98            "scale": "PuBu",
99            "modify": lambda x: float(x) * config.read_count_multiplier,
100            "shared_key": "read_count",
101        }
102        self.general_stats_addcols(self.htseq_data, headers)
103
104    def htseq_counts_chart(self):
105        """ Make the HTSeq Count assignment rates plot """
106        cats = OrderedDict()
107        cats["assigned"] = {"name": "Assigned"}
108        cats["ambiguous"] = {"name": "Ambiguous"}
109        cats["alignment_not_unique"] = {"name": "Alignment Not Unique"}
110        cats["no_feature"] = {"name": "No Feature"}
111        cats["too_low_aQual"] = {"name": "Too Low aQual"}
112        cats["not_aligned"] = {"name": "Not Aligned"}
113        config = {
114            "id": "htseq_assignment_plot",
115            "title": "HTSeq: Count Assignments",
116            "ylab": "# Reads",
117            "hide_zero_cats": False,
118            "cpswitch_counts_label": "Number of Reads",
119        }
120        return bargraph.plot(self.htseq_data, cats, config)
121