1#!/usr/bin/env python
2
3""" MultiQC module to parse output from FastQ Screen """
4
5from __future__ import print_function
6from collections import OrderedDict
7import json
8import logging
9import re
10
11from multiqc import config
12from multiqc.plots import bargraph
13from multiqc.modules.base_module import BaseMultiqcModule
14from multiqc.utils import report
15
16# Initialise the logger
17log = logging.getLogger(__name__)
18
19
20class MultiqcModule(BaseMultiqcModule):
21    def __init__(self):
22
23        # Initialise the parent object
24        super(MultiqcModule, self).__init__(
25            name="FastQ Screen",
26            anchor="fastq_screen",
27            href="http://www.bioinformatics.babraham.ac.uk/projects/fastq_screen/",
28            info="allows you to screen a library of sequences in FastQ format against"
29            " a set of sequence databases so you can see if the composition of the"
30            " library matches with what you expect.",
31        )
32
33        # Find and load any FastQ Screen reports
34        self.fq_screen_data = dict()
35        self.num_orgs = 0
36        for f in self.find_log_files("fastq_screen", filehandles=True):
37            parsed_data = self.parse_fqscreen(f)
38            if parsed_data is not None:
39                if f["s_name"] in self.fq_screen_data:
40                    log.debug("Duplicate sample name found! Overwriting: {}".format(f["s_name"]))
41                self.add_data_source(f)
42                self.fq_screen_data[f["s_name"]] = parsed_data
43
44        # Filter to strip out ignored sample names
45        self.fq_screen_data = self.ignore_samples(self.fq_screen_data)
46
47        if len(self.fq_screen_data) == 0:
48            raise UserWarning
49
50        log.info("Found {} reports".format(len(self.fq_screen_data)))
51
52        # Check whether we have a consistent number of organisms across all samples
53        num_orgs = set([len(orgs) for orgs in self.fq_screen_data.values()])
54
55        # Section 1 - Alignment Profiles
56        # Posh plot only works for around 20 samples, 8 organisms. If all samples have the same number of organisms.
57        if (
58            len(num_orgs) == 1
59            and len(self.fq_screen_data) * self.num_orgs <= 160
60            and not config.plots_force_flat
61            and not getattr(config, "fastqscreen_simpleplot", False)
62        ):
63            self.add_section(name="Mapped Reads", anchor="fastq_screen_mapped_reads", content=self.fqscreen_plot())
64        # Use simpler plot that works with many samples
65        else:
66            self.add_section(name="Mapped Reads", anchor="fastq_screen_mapped_reads", plot=self.fqscreen_simple_plot())
67
68        # Section 2 - Optional bisfulfite plot
69        self.fqscreen_bisulfite_plot()
70
71        # Write the total counts and percentages to files
72        self.write_data_file(self.parse_csv(), "multiqc_fastq_screen")
73
74    def parse_fqscreen(self, f):
75        """ Parse the FastQ Screen output into a 3D dict """
76        parsed_data = OrderedDict()
77        nohits_pct = None
78        headers = None
79        bs_headers = None
80        for l in f["f"]:
81            # Skip comment lines
82            if l.startswith("#"):
83                continue
84            if l.startswith("%Hit_no_genomes:") or l.startswith("%Hit_no_libraries:"):
85                nohits_pct = float(l.split(":", 1)[1])
86                parsed_data["No hits"] = {"percentages": {"one_hit_one_library": nohits_pct}}
87            else:
88                s = l.strip().split("\t")
89
90                # Regular FastQ Screen table section
91                if len(s) == 12:
92                    if headers is None:
93                        headers = s
94                    else:
95                        # Can't use #Reads in subset as varies. #Reads_processed should be same for all orgs in a sample
96                        parsed_data["total_reads"] = int(s[1])
97                        # Loop through all columns
98                        parsed_data[s[0]] = {"percentages": {}, "counts": {}}
99                        for idx, h in enumerate(headers):
100                            if idx == 0:
101                                continue
102                            dtype = "percentages" if h.startswith("%") else "counts"
103                            field = (
104                                h.replace("%", "")
105                                .replace("#", "")
106                                .replace("genomes", "libraries")
107                                .replace("genome", "library")
108                                .lower()
109                            )
110                            parsed_data[s[0]][dtype][field] = float(s[idx])
111
112                # Optional Bisulfite table section
113                elif len(s) == 9:
114                    if bs_headers is None:
115                        bs_headers = s
116                    else:
117                        # Loop through all columns
118                        parsed_data[s[0]]["bisulfite_percentages"] = {}
119                        parsed_data[s[0]]["bisulfite_counts"] = {}
120                        for idx, h in enumerate(bs_headers):
121                            if idx == 0:
122                                continue
123                            dtype = "bisulfite_percentages" if h.startswith("%") else "bisulfite_counts"
124                            field = h.replace("%", "").replace("#", "").lower()
125                            parsed_data[s[0]][dtype][field] = float(s[idx])
126
127        if len(parsed_data) == 0:
128            return None
129
130        # Calculate no hits counts
131        if parsed_data.get("total_reads") is not None and nohits_pct is not None:
132            parsed_data["No hits"]["counts"] = {
133                "one_hit_one_library": int((nohits_pct / 100.0) * float(parsed_data["total_reads"]))
134            }
135        else:
136            log.warning("Couldn't find number of reads with no hits for '{}'".format(f["s_name"]))
137
138        self.num_orgs = max(len(parsed_data), self.num_orgs)
139        return parsed_data
140
141    def parse_csv(self):
142        totals = OrderedDict()
143        for s in sorted(self.fq_screen_data.keys()):
144            totals[s] = OrderedDict()
145            for org in self.fq_screen_data[s]:
146                if org == "total_reads":
147                    totals[s]["total_reads"] = self.fq_screen_data[s][org]
148                    continue
149                try:
150                    k = "{} counts".format(org)
151                    totals[s][k] = self.fq_screen_data[s][org]["counts"]["one_hit_one_library"]
152                    totals[s][k] += self.fq_screen_data[s][org]["counts"].get("multiple_hits_one_library", 0)
153                    totals[s][k] += self.fq_screen_data[s][org]["counts"].get("one_hit_multiple_libraries", 0)
154                    totals[s][k] += self.fq_screen_data[s][org]["counts"].get("multiple_hits_multiple_libraries", 0)
155                except KeyError:
156                    pass
157                try:
158                    k = "{} percentage".format(org)
159                    totals[s][k] = self.fq_screen_data[s][org]["percentages"]["one_hit_one_library"]
160                    totals[s][k] += self.fq_screen_data[s][org]["percentages"].get("multiple_hits_one_library", 0)
161                    totals[s][k] += self.fq_screen_data[s][org]["percentages"].get("one_hit_multiple_libraries", 0)
162                    totals[s][k] += self.fq_screen_data[s][org]["percentages"].get(
163                        "multiple_hits_multiple_libraries", 0
164                    )
165                except KeyError:
166                    pass
167        return totals
168
169    def fqscreen_plot(self):
170        """Makes a fancy custom plot which replicates the plot seen in the main
171        FastQ Screen program. Not useful if lots of samples as gets too wide."""
172
173        categories = list()
174        getCats = True
175        data = list()
176        p_types = OrderedDict()
177        p_types["multiple_hits_multiple_libraries"] = {"col": "#7f0000", "name": "Multiple Hits, Multiple Genomes"}
178        p_types["one_hit_multiple_libraries"] = {"col": "#ff0000", "name": "One Hit, Multiple Genomes"}
179        p_types["multiple_hits_one_library"] = {"col": "#00007f", "name": "Multiple Hits, One Genome"}
180        p_types["one_hit_one_library"] = {"col": "#0000ff", "name": "One Hit, One Genome"}
181        for k, t in p_types.items():
182            first = True
183            for s in sorted(self.fq_screen_data.keys()):
184                thisdata = list()
185                if len(categories) > 0:
186                    getCats = False
187                for org in sorted(self.fq_screen_data[s]):
188                    if org == "total_reads":
189                        continue
190                    try:
191                        thisdata.append(self.fq_screen_data[s][org]["percentages"][k])
192                    except KeyError:
193                        thisdata.append(None)
194                    if getCats:
195                        categories.append(org)
196                td = {"name": t["name"], "stack": s, "data": thisdata, "color": t["col"]}
197                if first:
198                    first = False
199                else:
200                    td["linkedTo"] = ":previous"
201                data.append(td)
202
203        plot_id = report.save_htmlid("fq_screen_plot")
204        html = """<div id={plot_id} class="fq_screen_plot hc-plot"></div>
205        <script type="application/json" class="fq_screen_dict">{dict}</script>
206        """.format(
207            plot_id=json.dumps(plot_id),
208            dict=json.dumps({"plot_id": plot_id, "data": data, "categories": categories}),
209        )
210
211        html += """<script type="text/javascript">
212            fq_screen_dict = { }; // { <plot_id>: data, categories }
213            $('.fq_screen_dict').each(function (i, elem) {
214                var dict = JSON.parse(elem.innerHTML);
215                fq_screen_dict[dict.plot_id] = dict;
216            });
217
218            $(function () {
219                // In case of repeated modules: #fq_screen_plot, #fq_screen_plot-1, ..
220                $(".fq_screen_plot").each(function () {
221                    var plot_id = $(this).attr('id');
222
223                    $(this).highcharts({
224                        chart: { type: "column", backgroundColor: null },
225                        title: { text: "FastQ Screen Results" },
226                        xAxis: { categories: fq_screen_dict[plot_id].categories },
227                        yAxis: {
228                            max: 100,
229                            min: 0,
230                            title: { text: "Percentage Aligned" }
231                        },
232                        tooltip: {
233                            formatter: function () {
234                                return "<b>" + this.series.stackKey.replace("column","") + " - " + this.x + "</b><br/>" +
235                                    this.series.name + ": " + this.y + "%<br/>" +
236                                    "Total Alignment: " + this.point.stackTotal + "%";
237                            },
238                        },
239                        plotOptions: {
240                            column: {
241                                pointPadding: 0,
242                                groupPadding: 0.02,
243                                stacking: "normal"
244                            }
245                        },
246                        series: fq_screen_dict[plot_id].data
247                    });
248                });
249            });
250        </script>"""
251
252        return html
253
254    def fqscreen_simple_plot(self):
255        """Makes a simple bar plot with summed alignment counts for
256        each species, stacked."""
257
258        # First, sum the different types of alignment counts
259        data = {}
260        org_counts = {}
261        for s_name in sorted(self.fq_screen_data):
262            data[s_name] = OrderedDict()
263            sum_alignments = 0
264            for org in self.fq_screen_data[s_name]:
265                if org == "total_reads":
266                    continue
267                try:
268                    data[s_name][org] = self.fq_screen_data[s_name][org]["counts"]["one_hit_one_library"]
269                except KeyError:
270                    log.error(
271                        "No counts found for '{}' ('{}'). Could be malformed or very old FastQ Screen results.".format(
272                            org, s_name
273                        )
274                    )
275                    continue
276                try:
277                    data[s_name][org] += self.fq_screen_data[s_name][org]["counts"]["multiple_hits_one_library"]
278                except KeyError:
279                    pass
280                sum_alignments += data[s_name][org]
281                org_counts[org] = org_counts.get(org, 0) + data[s_name][org]
282
283            # Calculate hits in multiple genomes
284            if "total_reads" in self.fq_screen_data[s_name]:
285                data[s_name]["Multiple Genomes"] = self.fq_screen_data[s_name]["total_reads"] - sum_alignments
286
287        # Sort the categories by the total read counts
288        cats = OrderedDict()
289        for org in sorted(org_counts, key=org_counts.get, reverse=True):
290            if org not in cats and org != "No hits":
291                cats[org] = {"name": org}
292
293        # Strip empty dicts
294        for s_name in list(data.keys()):
295            if len(data[s_name]) == 0:
296                del data[s_name]
297
298        pconfig = {
299            "id": "fastq_screen_plot",
300            "title": "FastQ Screen: Mapped Reads",
301            "cpswitch_c_active": False,
302            "ylab": "Percentages",
303        }
304        cats["Multiple Genomes"] = {"name": "Multiple Genomes", "color": "#820000"}
305        cats["No hits"] = {"name": "No hits", "color": "#cccccc"}
306
307        return bargraph.plot(data, cats, pconfig)
308
309    def fqscreen_bisulfite_plot(self):
310        """ Make a stacked barplot for the bisulfite data, if we have any """
311
312        pconfig = {
313            "id": "fastq_screen_bisulfite_plot",
314            "title": "FastQ Screen: Bisulfite Mapping Strand Orientation",
315            "hide_zero_cats": False,
316            "ylab": "Percentages",
317            "data_labels": [],
318        }
319
320        cats = OrderedDict()
321        cats["original_top_strand"] = {"name": "Original top strand", "color": "#80cdc1"}
322        cats["complementary_to_original_top_strand"] = {
323            "name": "Complementary to original top strand",
324            "color": "#018571",
325        }
326        cats["complementary_to_original_bottom_strand"] = {
327            "name": "Complementary to original bottom strand",
328            "color": "#a6611a",
329        }
330        cats["original_bottom_strand"] = {"name": "Original bottom strand", "color": "#dfc27d"}
331
332        # Pull out the data that we want
333        pdata_unsorted = {}
334        org_counts = {}
335        max_count = 0
336        for s_name, d in self.fq_screen_data.items():
337            for org, dd in d.items():
338                if org not in pdata_unsorted:
339                    pdata_unsorted[org] = {}
340                    org_counts[org] = 0
341                try:
342                    pdata_unsorted[org][s_name] = dd["bisulfite_counts"]
343                    total_counts = sum([c for c in dd["bisulfite_counts"].values()])
344                    org_counts[org] += total_counts
345                    max_count = max(max_count, total_counts)
346                except (TypeError, KeyError):
347                    pass
348
349        # Consistent y-max across all genomes
350        pconfig["ymax"] = max_count
351
352        # Show tabbed bar plots, sorted by total read count
353        pdata = []
354        pcats = []
355        for org in sorted(org_counts, key=org_counts.get, reverse=True):
356            if org_counts[org] > 0:
357                pconfig["data_labels"].append(org)
358                pdata.append(pdata_unsorted[org])
359                pcats.append(cats)
360
361        if len(pdata) == 0:
362            return None
363
364        self.add_section(
365            name="Bisulfite Reads", anchor="fastq_screen_bisulfite", plot=bargraph.plot(pdata, pcats, pconfig)
366        )
367