1#!/usr/bin/env python
2
3""" MultiQC module to parse logs from SnpEff """
4
5from __future__ import print_function
6
7from collections import OrderedDict
8import logging
9from multiqc.plots import bargraph, linegraph
10from multiqc.modules.base_module import BaseMultiqcModule
11
12# Initialise the logger
13log = logging.getLogger(__name__)
14
15
16class MultiqcModule(BaseMultiqcModule):
17    """ SnpEff """
18
19    def __init__(self):
20        # Initialise the parent object
21        super(MultiqcModule, self).__init__(
22            name="SnpEff",
23            anchor="snpeff",
24            href="http://snpeff.sourceforge.net/",
25            info=" is a genetic variant annotation and effect prediction "
26            "toolbox. It annotates and predicts the effects of variants "
27            "on genes (such as amino acid changes). ",
28        )
29
30        self.snpeff_data = dict()
31        self.snpeff_section_totals = dict()
32        self.snpeff_qualities = dict()
33
34        for f in self.find_log_files("snpeff", filehandles=True):
35            self.parse_snpeff_log(f)
36
37        # Filter to strip out ignored sample names
38        self.snpeff_data = self.ignore_samples(self.snpeff_data)
39
40        if len(self.snpeff_data) == 0:
41            raise UserWarning
42
43        log.info("Found {} reports".format(len(self.snpeff_data)))
44
45        # Write parsed report data to a file
46        self.write_data_file(self.snpeff_data, "multiqc_snpeff")
47
48        # General stats table
49        self.general_stats()
50
51        # Report sections
52        self.add_section(
53            name="Variants by Genomic Region",
54            description=(
55                """
56                The stacked bar plot shows locations of detected variants in
57                the genome and the number of variants for each location.
58                """
59            ),
60            helptext=(
61                """
62                The upstream and downstream interval size to detect these
63                genomic regions is 5000bp by default.
64                """
65            ),
66            anchor="snpeff-genomic-regions",
67            plot=self.count_genomic_region_plot(),
68        )
69        self.add_section(
70            name="Variant Effects by Impact",
71            description=(
72                """
73                The stacked bar plot shows the putative impact of detected
74                variants and the number of variants for each impact.
75                """
76            ),
77            helptext=(
78                """
79                There are four levels of impacts predicted by SnpEff:
80
81                * **High**: High impact (like stop codon)
82                * **Moderate**: Middle impact (like same type of amino acid substitution)
83                * **Low**: Low impact (ie silence mutation)
84                * **Modifier**: No impact
85                """
86            ),
87            anchor="snpeff-effects-impact",
88            plot=self.effects_impact_plot(),
89        )
90        self.add_section(
91            name="Variants by Effect Types",
92            description=(
93                """
94                The stacked bar plot shows the effect of variants at protein
95                level and the number of variants for each effect type.
96                """
97            ),
98            helptext=(
99                """
100                This plot shows the effect of variants with respect to
101                the mRNA.
102                """
103            ),
104            anchor="snpeff-effects",
105            plot=self.effects_plot(),
106        )
107        self.add_section(
108            name="Variants by Functional Class",
109            description=(
110                """
111                The stacked bar plot shows the effect of variants and
112                the number of variants for each effect type.
113                """
114            ),
115            helptext=(
116                """
117                This plot shows the effect of variants on the translation of
118                the mRNA as protein. There are three possible cases:
119
120                * **Silent**: The amino acid does not change.
121                * **Missense**: The amino acid is different.
122                * **Nonsense**: The variant generates a stop codon.
123                """
124            ),
125            anchor="snpeff-functional-class",
126            plot=self.effects_function_plot(),
127        )
128        if len(self.snpeff_qualities) > 0:
129            self.add_section(
130                name="Variant Qualities",
131                description=(
132                    """
133                    The line plot shows the quantity as function of the
134                    variant quality score.
135                    """
136                ),
137                helptext=(
138                    """
139                    The quality score corresponds to the QUAL column of the
140                    VCF file. This score is set by the variant caller.
141                    """
142                ),
143                anchor="snpeff-qualities",
144                plot=self.qualities_plot(),
145            )
146
147    def parse_snpeff_log(self, f):
148        """ Go through log file looking for snpeff output """
149
150        keys = {
151            "# Summary table": [
152                "Genome",
153                "Number_of_variants_before_filter",
154                "Number_of_known_variants",
155                "Number_of_effects",
156                "Genome_total_length",
157                "Change_rate",
158            ],
159            "# Effects by impact": ["HIGH", "LOW", "MODERATE", "MODIFIER"],
160            "# Effects by functional class": ["MISSENSE", "NONSENSE", "SILENT", "Missense_Silent_ratio"],
161            "# Hom/Het table": ["Het", "Hom", "Missing"],
162            "# Ts/Tv summary": ["Transitions", "Transversions", "Ts_Tv_ratio"],
163            "# Count by effects": "all",
164            "# Count by genomic region": "all",
165        }
166        parsed_data = {}
167        section = None
168        for l in f["f"]:
169            l = l.strip()
170            if l[:1] == "#":
171                section = l
172                self.snpeff_section_totals[section] = dict()
173                continue
174            s = l.split(",")
175
176            # Quality values / counts
177            if section == "# Quality":
178                quals = OrderedDict()
179                if l.startswith("Values"):
180                    values = [int(c) for c in l.split(",")[1:]]
181                    counts = f["f"].readline()
182                    counts = [int(c) for c in counts.split(",")[1:]]
183                    c = 0
184                    total = sum(counts)
185                    for i, v in enumerate(values):
186                        if c < (total * 0.995):
187                            quals[v] = counts[i]
188                            c += counts[i]
189                if len(quals) > 0:
190                    self.snpeff_qualities[f["s_name"]] = quals
191
192            # Everything else
193            elif section in keys:
194                if keys[section] == "all" or any([k in s[0].strip() for k in keys[section]]):
195                    try:
196                        parsed_data[s[0].strip()] = float(s[1].strip())
197                    except ValueError:
198                        parsed_data[s[0].strip()] = s[1].strip()
199                    except IndexError:
200                        pass
201                    else:
202                        # Parsing the number worked - add to totals
203                        try:
204                            self.snpeff_section_totals[section][s[0].strip()] += parsed_data[s[0].strip()]
205                        except KeyError:
206                            self.snpeff_section_totals[section][s[0].strip()] = parsed_data[s[0].strip()]
207                    if len(s) > 2 and s[2][-1:] == "%":
208                        parsed_data["{}_percent".format(s[0].strip())] = float(s[2][:-1])
209
210        if len(parsed_data) > 0:
211            if f["s_name"] in self.snpeff_data:
212                log.debug("Duplicate sample name found! Overwriting: {}".format(f["s_name"]))
213            self.add_data_source(f)
214            self.snpeff_data[f["s_name"]] = parsed_data
215
216    def general_stats(self):
217        """ Add key SnpEff stats to the general stats table """
218
219        headers = OrderedDict()
220        headers["Change_rate"] = {"title": "Change rate", "scale": "RdYlBu-rev", "min": 0, "format": "{:,.0f}"}
221        headers["Ts_Tv_ratio"] = {
222            "title": "Ts/Tv",
223            "description": "Transitions / Transversions ratio",
224            "format": "{:,.3f}",
225        }
226        headers["Number_of_variants_before_filter"] = {
227            "title": "M Variants",
228            "description": "Number of variants before filter (millions)",
229            "scale": "PuRd",
230            "modify": lambda x: x / 1000000,
231            "min": 0,
232            "format": "{:,.2f}",
233        }
234        self.general_stats_addcols(self.snpeff_data, headers)
235
236    def count_genomic_region_plot(self):
237        """ Generate the SnpEff Counts by Genomic Region plot """
238
239        # Sort the keys based on the total counts
240        keys = self.snpeff_section_totals["# Count by genomic region"]
241        sorted_keys = sorted(keys, reverse=True, key=keys.get)
242
243        # Make nicer label names
244        pkeys = OrderedDict()
245        for k in sorted_keys:
246            pkeys[k] = {"name": k.replace("_", " ").title().replace("Utr", "UTR")}
247
248        # Config for the plot
249        pconfig = {
250            "id": "snpeff_variant_effects_region",
251            "title": "SnpEff: Counts by Genomic Region",
252            "ylab": "# Reads",
253            "logswitch": True,
254        }
255
256        return bargraph.plot(self.snpeff_data, pkeys, pconfig)
257
258    def effects_plot(self):
259        """ Generate the SnpEff Counts by Effects plot """
260
261        # Sort the keys based on the total counts
262        keys = self.snpeff_section_totals["# Count by effects"]
263        sorted_keys = sorted(keys, reverse=True, key=keys.get)
264
265        # Make nicer label names
266        pkeys = OrderedDict()
267        for k in sorted_keys:
268            pkeys[k] = {"name": k.replace("_", " ").title().replace("Utr", "UTR")}
269
270        # Config for the plot
271        pconfig = {
272            "id": "snpeff_effects",
273            "title": "SnpEff: Counts by Effect Types",
274            "ylab": "# Reads",
275            "logswitch": False,
276        }
277        return bargraph.plot(self.snpeff_data, pkeys, pconfig)
278
279    def effects_impact_plot(self):
280        """ Generate the SnpEff Counts by Effects Impact plot """
281
282        # Put keys in a more logical order
283        keys = ["MODIFIER", "LOW", "MODERATE", "HIGH"]
284
285        # Make nicer label names
286        pkeys = OrderedDict()
287        for k in keys:
288            pkeys[k] = {"name": k.title()}
289
290        # Config for the plot
291        pconfig = {
292            "id": "snpeff_variant_effects_impact",
293            "title": "SnpEff: Counts by Effects Impact",
294            "ylab": "# Reads",
295            "logswitch": True,
296        }
297
298        return bargraph.plot(self.snpeff_data, pkeys, pconfig)
299
300    def effects_function_plot(self):
301        """ Generate the SnpEff Counts by Functional Class plot """
302
303        # Cats to plot in a sensible order
304        keys = ["SILENT", "MISSENSE", "NONSENSE"]
305
306        # Make nicer label names
307        pkeys = OrderedDict()
308        for k in keys:
309            pkeys[k] = {"name": k.title()}
310
311        # Config for the plot
312        pconfig = {
313            "id": "snpeff_variant_effects_class",
314            "title": "SnpEff: Counts by Functional Class",
315            "ylab": "# Reads",
316            "logswitch": True,
317        }
318
319        return bargraph.plot(self.snpeff_data, pkeys, pconfig)
320
321    def qualities_plot(self):
322        """ Generate the qualities plot """
323
324        pconfig = {
325            "smooth_points": 200,
326            "id": "snpeff_qualities",
327            "title": "SnpEff: Qualities",
328            "ylab": "Count",
329            "xlab": "Values",
330            "xDecimals": False,
331            "ymin": 0,
332        }
333
334        return linegraph.plot(self.snpeff_qualities, pconfig)
335