1#!/usr/bin/env python
2
3""" MultiQC module to parse output from sargasso """
4
5from __future__ import print_function
6from collections import OrderedDict
7import logging
8
9from multiqc import config
10from multiqc.plots import bargraph
11from multiqc.modules.base_module import BaseMultiqcModule
12
13# Initialise the logger
14log = logging.getLogger(__name__)
15
16
class MultiqcModule(BaseMultiqcModule):
    """MultiQC module that summarises Sargasso filtering summaries.

    Each data row of a Sargasso summary is split into one MultiQC sample per
    species (named ``<sample>_<species>``).  The parsed counts feed the
    general statistics table and an "Assigned Reads" bar plot.
    """

    def __init__(self):
        """Find, parse and report all Sargasso summaries.

        Raises:
            UserWarning: if no Sargasso samples remain after parsing and
                applying the user's ignore-samples filter.
        """

        # Initialise the parent object
        super(MultiqcModule, self).__init__(
            name="sargasso",
            anchor="sargasso",
            href="http://statbio.github.io/Sargasso/",
            # The two literals are concatenated, so the first one must end
            # with a space to keep the sentence readable.
            info="is a tool to separate mixed-species RNA-seq reads "
            "according to their species of origin.",
        )

        # Parsed counts, keyed by "<sample>_<species>"
        self.sargasso_data = dict()
        # Log files that actually yielded Sargasso samples
        self.sargasso_files = list()
        # Names of the counted quantities taken from the header
        self.sargasso_keys = list()

        for f in self.find_log_files("sargasso"):
            # Only count files that were recognised and produced data, so the
            # "Found N reports" message below is accurate.
            if self.parse_sargasso_logs(f):
                self.sargasso_files.append(f)

        self.sargasso_data = self.ignore_samples(self.sargasso_data)

        if not self.sargasso_data:
            raise UserWarning

        log.info("Found {} reports".format(len(self.sargasso_files)))

        # Write parsed report data to a file
        self.write_data_file(self.sargasso_data, "multiqc_sargasso")

        # Basic Stats Table
        self.sargasso_stats_table()

        # Assignment bar plot
        self.add_section(plot=self.sargasso_chart())

    def parse_sargasso_logs(self, f):
        """Parse one Sargasso summary file into ``self.sargasso_data``.

        The file is comma separated.  The header row must start with
        ``Sample``; every other header column is split on ``-``: the last
        part is taken as the species and the two parts before it as the name
        of the counted quantity.  Rows with fewer than seven columns are
        skipped.

        For every stored sample a ``Total`` (sum of all ``*Reads`` counts)
        and, where possible, ``sargasso_percent_assigned`` are added.

        Args:
            f: MultiQC log-file dict; ``f["f"]`` holds the file contents and
                ``f["root"]`` is passed on to ``clean_s_name``.

        Returns:
            bool: ``True`` if at least one sample was stored from this file,
            ``False`` otherwise (including files that are not Sargasso
            summaries).
        """
        species_names = []
        items = []
        parsed_samples = []
        header_seen = False

        for line in f["f"].splitlines():
            fields = line.split(",")

            # Until a header has been accepted, every line must look like one;
            # otherwise this is not a Sargasso file.
            if not header_seen and fields[0] != "Sample":
                return False

            if len(fields) < 7:
                continue

            if not header_seen:
                header_seen = True
                for column in fields[1:]:
                    parts = column.split("-")
                    # Which species are included
                    species = parts[-1]
                    if species not in species_names:
                        species_names.append(species)
                    # What is being counted
                    item = "-".join(parts[-3:-1])
                    if item not in items:
                        items.append(item)
                continue

            # Sample line: one group of len(items) values per species.
            sample_name = fields[0]
            values = fields[1:]
            step = len(items)
            chunks = [values[i : i + step] for i in range(0, len(values), step)]

            # zip() stops at the shorter sequence, so a malformed row with
            # surplus columns (e.g. a trailing comma) cannot index past the
            # known species.
            for species, chunk in zip(species_names, chunks):
                # Add the species to the sample name for easy interpretation
                new_sample_name = "_".join([sample_name, species])
                new_sample_name = self.clean_s_name(new_sample_name, f["root"])

                try:
                    counts = [int(value) for value in chunk]
                except ValueError:
                    log.debug("Skipping sample with non-integer counts: {}".format(new_sample_name))
                    continue

                if new_sample_name in self.sargasso_data:
                    log.debug("Duplicate sample name found! Overwriting: {}".format(new_sample_name))

                self.sargasso_data[new_sample_name] = dict(zip(items, counts))
                parsed_samples.append(new_sample_name)

        # Do not let an empty or header-less file wipe keys found earlier.
        if items:
            self.sargasso_keys = items

        # Derived values are only needed for samples stored from this file.
        for s_name in parsed_samples:
            data = self.sargasso_data[s_name]

            # Total READ count across all categories
            data["Total"] = sum(value for key, value in data.items() if key.endswith("Reads"))

            # Percent assigned is best-effort: skipped when the column is
            # missing or the total is zero.
            try:
                data["sargasso_percent_assigned"] = (
                    float(data["Assigned-Reads"]) / float(data["Total"])
                ) * 100.0
            except (KeyError, ZeroDivisionError):
                pass

        return bool(parsed_samples)

    def sargasso_stats_table(self):
        """Take the parsed stats from the sargasso report and add them to the
        basic stats table at the top of the report"""

        headers = OrderedDict()
        headers["sargasso_percent_assigned"] = {
            "title": "% Assigned",
            "description": "Sargasso % Assigned reads",
            "max": 100,
            "min": 0,
            "suffix": "%",
            "scale": "RdYlGn",
        }
        headers["Assigned-Reads"] = {
            "title": "{} Assigned".format(config.read_count_prefix),
            "description": "Sargasso Assigned reads ({})".format(config.read_count_desc),
            "min": 0,
            "scale": "PuBu",
            # Scale raw counts by the globally configured read-count multiplier
            "modify": lambda x: float(x) * config.read_count_multiplier,
            "shared_key": "read_count",
        }
        self.general_stats_addcols(self.sargasso_data, headers)

    def sargasso_chart(self):
        """Build the bar plot of read counts per sample.

        Returns:
            The result of ``bargraph.plot`` for all parsed samples, with one
            category for every header key containing ``"Reads"``.
        """

        # Plot config; named pconfig so it does not shadow the imported
        # multiqc ``config`` module.
        pconfig = {
            "id": "sargasso_assignment_plot",
            "title": "Sargasso: Assigned Reads",
            "ylab": "# Reads",
            "cpswitch_counts_label": "Number of Reads",
        }

        # We only want to plot the READs at the moment
        read_keys = [name for name in self.sargasso_keys if "Reads" in name]
        return bargraph.plot(self.sargasso_data, read_keys, pconfig)
158