#!/usr/bin/env python
2
3""" MultiQC module to parse output from DeDup """
4
5from __future__ import print_function
6from collections import OrderedDict
7import logging
8import json
9
10from multiqc.plots import bargraph
11from multiqc.utils import config
12from multiqc.modules.base_module import BaseMultiqcModule
13
14# Initialise the logger
15log = logging.getLogger(__name__)
16
17
class MultiqcModule(BaseMultiqcModule):
    """DeDup module.

    Parses the JSON report emitted by DeDup (duplicate removal for
    merged/collapsed reads in ancient DNA analysis), adds summary columns
    to the general statistics table and draws a bar graph of read categories.
    """

    def __init__(self):

        # Initialise the parent object
        super(MultiqcModule, self).__init__(
            name="DeDup",
            anchor="dedup",
            href="http://www.github.com/apeltzer/DeDup",
            info="is a tool for duplicate removal for merged/collapsed reads in ancient DNA analysis.",
        )

        # Find and load any DeDup reports
        self.dedup_data = dict()

        for f in self.find_log_files("dedup", filehandles=True):
            try:
                self.parseJSON(f)
            except KeyError:
                # Bug fix: use the module-level logger, not the root logger,
                # so the message carries this module's name/formatting.
                log.warning("Error loading file {}".format(f["fn"]))

        # Filter to strip out ignored sample names
        self.dedup_data = self.ignore_samples(self.dedup_data)

        if len(self.dedup_data) == 0:
            # MultiQC convention: raising UserWarning tells the core that
            # no samples were found and the module should be skipped.
            raise UserWarning

        log.info("Found {} reports".format(len(self.dedup_data)))

        # Write parsed report data to a file
        self.write_data_file(self.dedup_data, "multiqc_dedup")

        # Basic Stats Table
        self.dedup_general_stats_table()

        # Alignment Rate Plot
        self.add_section(
            description="This plot shows read categories that were either not removed (unique reads) or removed (duplicates).",
            plot=self.dedup_alignment_plot(),
        )

    # Parse our nice little JSON file
    def parseJSON(self, f):
        """Parse the JSON output from DeDup and save the summary statistics.

        Stores the per-sample metrics in ``self.dedup_data``. Returns None
        (skipping the file) when the JSON is invalid or missing the required
        "metrics"/"metadata" keys.
        """
        try:
            parsed_json = json.load(f["f"])
            # Check for Keys existing
            if "metrics" not in parsed_json or "metadata" not in parsed_json:
                log.debug("DeDup JSON missing essential keys - skipping sample: '{}'".format(f["fn"]))
                return None
        # Bug fix: JSONDecodeError lives in the json module - the bare name
        # was never imported, so a malformed file raised NameError here.
        except json.JSONDecodeError as e:
            log.debug("Could not parse DeDup JSON: '{}'".format(f["fn"]))
            log.debug(e)
            return None

        # Get sample name from JSON first
        s_name = self.clean_s_name(parsed_json["metadata"]["sample_name"], f["root"])
        self.add_data_source(f, s_name)

        metrics_dict = parsed_json["metrics"]

        # Coerce every metric to float so the arithmetic below is uniform
        for k in metrics_dict:
            metrics_dict[k] = float(metrics_dict[k])

        # Compute (not) removed _mapped_ reads from given values as dedup only affects mapped reads
        # Keep legacy behaviour in case "mapped_reads" cannot be found for <= v0.12.6
        if "mapped_reads" in metrics_dict:
            metrics_dict["mapped_after_dedup"] = (
                metrics_dict["mapped_reads"]
                - metrics_dict["reverse_removed"]
                - metrics_dict["forward_removed"]
                - metrics_dict["merged_removed"]
            )
        else:
            metrics_dict["not_removed"] = (
                metrics_dict["total_reads"]
                - metrics_dict["reverse_removed"]
                - metrics_dict["forward_removed"]
                - metrics_dict["merged_removed"]
            )
        metrics_dict["reads_removed"] = (
            metrics_dict["reverse_removed"] + metrics_dict["forward_removed"] + metrics_dict["merged_removed"]
        )

        # Add all in the main data_table
        self.dedup_data[s_name] = metrics_dict

    def dedup_general_stats_table(self):
        """Take the parsed stats from the DeDup report and add it to the
        basic stats table at the top of the report"""

        # Read-count scaling is user-configurable; default to thousands (K)
        ancient_read_count_prefix = getattr(config, "ancient_read_count_prefix", "K")
        ancient_read_count_desc = getattr(config, "ancient_read_count_desc", "thousands")
        ancient_read_count_multiplier = getattr(config, "ancient_read_count_multiplier", 0.001)

        headers = OrderedDict()
        headers["dup_rate"] = {
            "title": "Duplication Rate",
            "description": "Percentage of reads categorised as a technical duplicate",
            "min": 0,
            "max": 100,
            "suffix": "%",
            "scale": "OrRd",
            "format": "{:,.0f}",
            # dup_rate is reported as a fraction; display as a percentage
            "modify": lambda x: x * 100.0,
        }
        headers["clusterfactor"] = {
            "title": "ClusterFactor",
            "description": "CF~1 means high library complexity. Large CF means not worth sequencing deeper.",
            "min": 1,
            "max": 100,
            "scale": "OrRd",
            "format": "{:,.2f}",
        }
        headers["reads_removed"] = {
            "title": "{} Reads Removed".format(ancient_read_count_prefix),
            "description": "Non-unique reads removed after deduplication ({})".format(ancient_read_count_desc),
            "modify": lambda x: x * ancient_read_count_multiplier,
            "shared_key": "read_count",
            "min": 0,
            "hidden": True,
        }
        headers["mapped_after_dedup"] = {
            "title": "{} Post-DeDup Mapped Reads".format(ancient_read_count_prefix),
            "description": "Unique mapping reads after deduplication ({})".format(ancient_read_count_desc),
            "modify": lambda x: x * ancient_read_count_multiplier,
            "shared_key": "read_count",
            "min": 0,
        }
        self.general_stats_addcols(self.dedup_data, headers)

    def dedup_alignment_plot(self):
        """Make the HighCharts HTML to plot the duplication rates."""

        # Specify the order of the different possible categories
        keys = OrderedDict()
        keys["mapped_after_dedup"] = {"name": "Unique Retained"}
        keys["not_removed"] = {"name": "Not Removed"}
        keys["reverse_removed"] = {"name": "Reverse Removed"}
        keys["forward_removed"] = {"name": "Forward Removed"}
        keys["merged_removed"] = {"name": "Merged Removed"}

        # Config for the plot - named pconfig so it does not shadow the
        # imported multiqc.utils.config module.
        pconfig = {
            "id": "dedup_rates",
            "title": "DeDup: Deduplicated Reads",
            "ylab": "# Reads",
            "cpswitch_counts_label": "Number of Reads",
        }

        return bargraph.plot(self.dedup_data, keys, pconfig)
170