1#!/usr/bin/env python
2
3""" MultiQC submodule to parse output from Picard VariantCallingMetrics """
4
5import logging
6from collections import OrderedDict
7from multiqc.plots import bargraph
8
9# Initialise the logger
10log = logging.getLogger(__name__)
11
12
13def parse_reports(parent_module):
14    """ Find Picard VariantCallingMetrics reports and process their data """
15
16    # get data
17    data = collect_data(parent_module)
18
19    # Filter to strip out ignored sample names
20    data = parent_module.ignore_samples(data)
21
22    # Reference data in parent module
23    parent_module.picard_variantCalling_data = data
24
25    if len(data) > 0:
26        derive_data(data)
27
28        # Write parsed data to a file
29        parent_module.write_data_file(data, "multiqc_picard_variantCalling")
30
31        parent_module.general_stats_headers["DBSNP_TITV"] = {
32            "title": "TiTV ratio (known)",
33            "description": "The Transition/Transversion ratio of the passing bi-allelic SNP calls made at SNP-database sites.",
34            "min": 0,
35            "scale": "Blues",
36            "shared_key": "titv_ratio",
37        }
38
39        parent_module.general_stats_headers["NOVEL_TITV"] = {
40            "title": "TiTV ratio (novel)",
41            "description": "The Transition/Transversion ratio of the passing bi-allelic SNP calls made at non-SNP-database sites.",
42            "min": 0,
43            "scale": "Blues",
44            "shared_key": "titv_ratio",
45        }
46
47        parent_module.general_stats_headers["DBSNP_INS_DEL_RATIO"] = {
48            "title": "InDel ratio (known)",
49            "description": "The Insertion / Deletion ratio of the passing bi-allelic SNP calls made at SNP-database sites.",
50            "min": 0,
51            "scale": "Greens",
52            "shared_key": "indel_ratio",
53            "hidden": True,
54        }
55
56        parent_module.general_stats_headers["NOVEL_INS_DEL_RATIO"] = {
57            "title": "InDel ratio (novel)",
58            "description": "The Insertion / Deletion ratio of the passing bi-allelic SNP calls made at non-SNP-database sites.",
59            "min": 0,
60            "scale": "Greens",
61            "shared_key": "indel_ratio",
62            "hidden": True,
63        }
64
65        parent_module.general_stats_headers["total_called_variants_known"] = {
66            "title": "Called Variants (known)",
67            "description": "Total counts of variants in SNP-database sites.",
68            "shared_key": "variant_count",
69            "min": 0,
70            "format": "{0:,.0f}",
71            "hidden": True,
72        }
73
74        parent_module.general_stats_headers["total_called_variants_novel"] = {
75            "title": "Called Variants (novel)",
76            "description": "Total counts of variants in non-SNP-database sites.",
77            "shared_key": "variant_count",
78            "min": 0,
79            "format": "{0:,.0f}",
80            "hidden": True,
81        }
82
83        for s_name in data:
84            if s_name not in parent_module.general_stats_data:
85                parent_module.general_stats_data[s_name] = dict()
86            parent_module.general_stats_data[s_name].update(data[s_name])
87
88        # Variant Counts Bargraph
89        parent_module.add_section(
90            name="Variant Types",
91            anchor="picard-variants-types",
92            description="Variants that have been called, looking at variant types. Optionally filtered on label.",
93            helptext="""
94            Only passing variants are shown (i.e. non-filtered).\n
95            SNPs are bi-allelic.\n
96            Complex InDels are both an insertion and a deletion.
97            """,
98            plot=compare_variant_type_plot(data),
99        )
100
101        # Variant Counts Table
102        parent_module.add_section(
103            name="Variant Labels",
104            anchor="picard-variants-labels",
105            description="Variants that have been called, comparing with known variant sites.",
106            helptext="""
107            Only passing variants are shown (i.e. non-filtered).\n
108            Variants contain bi-allelic SNPs, multi-allelic SNPs, simple and complex inserts and deletions.
109            """,
110            plot=compare_variants_label_plot(data),
111        )
112
113    return len(data)
114
115
116def collect_data(parent_module):
117    """ Find Picard VariantCallingMetrics reports and parse their data """
118
119    data = dict()
120    for file_meta in parent_module.find_log_files("picard/variant_calling_metrics", filehandles=True):
121        s_name = None
122        for header, value in table_in(file_meta["f"], pre_header_string="## METRICS CLASS"):
123            if header == "SAMPLE_ALIAS":
124                s_name = value
125                if s_name in data:
126                    log.debug("Duplicate sample name found in {}! Overwriting: {}".format(file_meta["fn"], s_name))
127                data[s_name] = OrderedDict()
128            else:
129                data[s_name][header] = value
130    return data
131
132
133def table_in(filehandle, pre_header_string):
134    """ Generator that assumes a table starts the line after a given string """
135
136    in_histogram = False
137    next_is_header = False
138    headers = list()
139    for line in stripped(filehandle):
140        if not in_histogram and line.startswith(pre_header_string):
141            in_histogram = True
142            next_is_header = True
143        elif in_histogram and next_is_header:
144            next_is_header = False
145            headers = line.split("\t")
146        elif in_histogram:
147            values = line.split("\t")
148            if values != [""]:
149                for couple in zip(headers, values):
150                    yield couple
151
152
153def derive_data(data):
154    """ Based on the data derive additional data """
155
156    for s_name, values in data.items():
157        # setup holding variable
158
159        # Sum all variants that have been called
160        total_called_variants = 0
161        for value_name in ["TOTAL_SNPS", "TOTAL_COMPLEX_INDELS", "TOTAL_MULTIALLELIC_SNPS", "TOTAL_INDELS"]:
162            total_called_variants = total_called_variants + int(values[value_name])
163        values["total_called_variants"] = total_called_variants
164
165        # Sum all variants that have been called and are known
166        total_called_variants_known = 0
167        for value_name in ["NUM_IN_DB_SNP", "NUM_IN_DB_SNP_COMPLEX_INDELS", "NUM_IN_DB_SNP_MULTIALLELIC"]:
168            total_called_variants_known = total_called_variants_known + int(values[value_name])
169        total_called_variants_known = (
170            total_called_variants_known + int(values["TOTAL_INDELS"]) - int(values["NOVEL_INDELS"])
171        )
172        values["total_called_variants_known"] = total_called_variants_known
173
174        # Extrapolate the total novel variants
175        values["total_called_variants_novel"] = total_called_variants - total_called_variants_known
176
177
178def stripped(iterator):
179    """ Generator to strip string of whitespace """
180    for item in iterator:
181        yield item.strip()
182
183
184def compare_variant_type_plot(data):
185    """ Return HTML for the Variant Counts barplot """
186    keys = OrderedDict()
187    keys["snps"] = {"name": "SNPs", "color": "#7cb5ec"}
188    keys["indels"] = {"name": "InDels", "color": "#90ed7d"}
189    keys["multiallelic_snps"] = {"name": "multi-allelic SNP", "color": "orange"}
190    keys["complex_indels"] = {"name": "Complex InDels", "color": "#8085e9"}
191
192    total_variants = dict()
193    known_variants = dict()
194    novel_variants = dict()
195    for s_name, values in data.items():
196        total_variants[s_name] = {
197            "snps": values["TOTAL_SNPS"],
198            "indels": values["TOTAL_INDELS"],
199            "multiallelic_snps": values["TOTAL_MULTIALLELIC_SNPS"],
200            "complex_indels": values["TOTAL_COMPLEX_INDELS"],
201        }
202
203        known_variants[s_name] = {
204            "snps": values["NUM_IN_DB_SNP"],
205            "indels": int(values["TOTAL_INDELS"]) - int(values["NOVEL_INDELS"]),
206            "multiallelic_snps": values["NUM_IN_DB_SNP_MULTIALLELIC"],
207            "complex_indels": values["NUM_IN_DB_SNP_COMPLEX_INDELS"],
208        }
209
210        novel_variants[s_name] = {
211            "snps": values["NOVEL_SNPS"],
212            "indels": values["NOVEL_INDELS"],
213            "multiallelic_snps": int(values["TOTAL_MULTIALLELIC_SNPS"]) - int(values["NUM_IN_DB_SNP_MULTIALLELIC"]),
214            "complex_indels": int(values["TOTAL_COMPLEX_INDELS"]) - int(values["NUM_IN_DB_SNP_COMPLEX_INDELS"]),
215        }
216
217    plot_conf = {
218        "id": "picard_variantCallingMetrics_variant_type",
219        "title": "Picard: Variants Called",
220        "ylab": "Counts of Variants",
221        "hide_zero_cats": False,
222        "data_labels": [{"name": "Total"}, {"name": "Known"}, {"name": "Novel"}],
223    }
224    return bargraph.plot(
225        data=[total_variants, known_variants, novel_variants], cats=[keys, keys, keys], pconfig=plot_conf
226    )
227
228
229def compare_variants_label_plot(data):
230    """ Return HTML for the Compare variants plot"""
231    keys = OrderedDict()
232
233    keys["total_called_variants_known"] = {"name": "Known Variants"}
234    keys["total_called_variants_novel"] = {"name": "Novel Variants"}
235
236    pconfig = {
237        "id": "picard_variantCallingMetrics_variant_label",
238        "title": "Picard: Variants Called",
239        "ylab": "Counts of Variants",
240    }
241
242    return bargraph.plot(data, cats=keys, pconfig=pconfig)
243
244
245def merge_two_dicts(x, y):
246    z = x.copy()  # start with x's keys and values
247    z.update(y)  # modifies z with y's keys and values & returns None
248    return z
249