#!/usr/bin/env python

""" MultiQC module to parse output from DeDup """

from __future__ import print_function
from collections import OrderedDict
import logging
import json

from multiqc.plots import bargraph
from multiqc.utils import config
from multiqc.modules.base_module import BaseMultiqcModule

# Initialise the logger
log = logging.getLogger(__name__)


class MultiqcModule(BaseMultiqcModule):
    """DeDup module.

    Collects the JSON reports found under the "dedup" search key, stores the
    per-sample metrics in ``self.dedup_data`` and adds general-stats columns
    plus a bar graph section to the report.
    """

    def __init__(self):
        """Find and parse DeDup reports, then register the table and plot.

        Raises:
            UserWarning: if no usable report remains after parsing and after
                filtering out ignored sample names.
        """

        # Initialise the parent object
        super(MultiqcModule, self).__init__(
            name="DeDup",
            anchor="dedup",
            href="http://www.github.com/apeltzer/DeDup",
            info="is a tool for duplicate removal for merged/collapsed reads in ancient DNA analysis.",
        )

        # Find and load any DeDup reports (sample name -> metrics dict)
        self.dedup_data = dict()

        for f in self.find_log_files("dedup", filehandles=True):
            try:
                self.parseJSON(f)
            except KeyError:
                # A key expected in the report was absent; report it through
                # the module logger (not the root logger) and carry on.
                log.warning("Error loading file {}".format(f["fn"]))

        # Filter to strip out ignored sample names
        self.dedup_data = self.ignore_samples(self.dedup_data)

        if len(self.dedup_data) == 0:
            raise UserWarning

        log.info("Found {} reports".format(len(self.dedup_data)))

        # Write parsed report data to a file
        self.write_data_file(self.dedup_data, "multiqc_dedup")

        # Basic Stats Table
        self.dedup_general_stats_table()

        # Alignment Rate Plot
        self.add_section(
            description="This plot shows read categories that were either not removed (unique reads) or removed (duplicates).",
            plot=self.dedup_alignment_plot(),
        )

    # Parse our nice little JSON file
    def parseJSON(self, f):
        """Parse the JSON output from DeDup and save the summary statistics.

        Args:
            f: file record as yielded by ``find_log_files``; this method reads
                ``f["f"]`` (open file handle), ``f["fn"]`` and ``f["root"]``.

        Returns:
            None. On success the metrics are stored in
            ``self.dedup_data[sample_name]``. Files that are not valid JSON, or
            that lack the top-level "metrics" / "metadata" keys, are skipped
            with a debug message.

        Raises:
            KeyError: if a key required further down (for example
                ``metadata.sample_name`` or one of the ``*_removed`` metrics)
                is missing.
        """
        try:
            parsed_json = json.load(f["f"])
        except ValueError as e:
            # json.JSONDecodeError is a ValueError subclass on Python 3, and
            # Python 2 raises a plain ValueError, so this covers both.
            log.debug("Could not parse DeDup JSON: '{}'".format(f["fn"]))
            log.debug(e)
            return None

        # Check for Keys existing. The isinstance guard avoids a TypeError on
        # the membership tests when the document is a bare scalar.
        if not isinstance(parsed_json, dict) or "metrics" not in parsed_json or "metadata" not in parsed_json:
            log.debug("DeDup JSON missing essential keys - skipping sample: '{}'".format(f["fn"]))
            return None

        # Get sample name from JSON first
        s_name = self.clean_s_name(parsed_json["metadata"]["sample_name"], f["root"])
        self.add_data_source(f, s_name)

        # Every metric is coerced to float so the arithmetic below is uniform
        metrics_dict = {key: float(value) for key, value in parsed_json["metrics"].items()}

        reads_removed = (
            metrics_dict["reverse_removed"] + metrics_dict["forward_removed"] + metrics_dict["merged_removed"]
        )

        # Compute (not) removed _mapped_ reads from given values as dedup only affects mapped reads
        # Keep legacy behaviour in case "mapped_reads" cannot be found for <= v0.12.6
        if "mapped_reads" in metrics_dict:
            metrics_dict["mapped_after_dedup"] = (
                metrics_dict["mapped_reads"]
                - metrics_dict["reverse_removed"]
                - metrics_dict["forward_removed"]
                - metrics_dict["merged_removed"]
            )
        else:
            metrics_dict["not_removed"] = (
                metrics_dict["total_reads"]
                - metrics_dict["reverse_removed"]
                - metrics_dict["forward_removed"]
                - metrics_dict["merged_removed"]
            )
        metrics_dict["reads_removed"] = reads_removed

        # Add all in the main data_table
        self.dedup_data[s_name] = metrics_dict

    def dedup_general_stats_table(self):
        """Take the parsed stats from the DeDup report and add it to the
        basic stats table at the top of the report.

        Read-count columns are scaled by ``config.ancient_read_count_multiplier``
        and labelled with ``config.ancient_read_count_prefix`` /
        ``config.ancient_read_count_desc`` when those settings exist; otherwise
        the defaults below (thousands) are used.
        """

        ancient_read_count_prefix = getattr(config, "ancient_read_count_prefix", "K")
        ancient_read_count_desc = getattr(config, "ancient_read_count_desc", "thousands")
        ancient_read_count_multiplier = getattr(config, "ancient_read_count_multiplier", 0.001)

        headers = OrderedDict()
        headers["dup_rate"] = {
            "title": "Duplication Rate",
            "description": "Percentage of reads categorised as a technical duplicate",
            "min": 0,
            "max": 100,
            "suffix": "%",
            "scale": "OrRd",
            "format": "{:,.0f}",
            # Scaled by 100 for display; presumably the report stores a
            # fraction in [0, 1] - confirm against DeDup output.
            "modify": lambda x: x * 100.0,
        }
        headers["clusterfactor"] = {
            "title": "ClusterFactor",
            "description": "CF~1 means high library complexity. Large CF means not worth sequencing deeper.",
            "min": 1,
            "max": 100,
            "scale": "OrRd",
            "format": "{:,.2f}",
        }
        headers["reads_removed"] = {
            "title": "{} Reads Removed".format(ancient_read_count_prefix),
            "description": "Non-unique reads removed after deduplication ({})".format(ancient_read_count_desc),
            "modify": lambda x: x * ancient_read_count_multiplier,
            "shared_key": "read_count",
            "min": 0,
            "hidden": True,
        }
        headers["mapped_after_dedup"] = {
            "title": "{} Post-DeDup Mapped Reads".format(ancient_read_count_prefix),
            "description": "Unique mapping reads after deduplication ({})".format(ancient_read_count_desc),
            "modify": lambda x: x * ancient_read_count_multiplier,
            "shared_key": "read_count",
            "min": 0,
        }
        self.general_stats_addcols(self.dedup_data, headers)

    def dedup_alignment_plot(self):
        """Make the HighCharts HTML to plot the duplication rates.

        Returns:
            Whatever ``bargraph.plot`` returns for ``self.dedup_data`` with the
            category order and plot settings defined below.
        """

        # Specify the order of the different possible categories
        keys = OrderedDict()
        keys["mapped_after_dedup"] = {"name": "Unique Retained"}
        keys["not_removed"] = {"name": "Not Removed"}
        keys["reverse_removed"] = {"name": "Reverse Removed"}
        keys["forward_removed"] = {"name": "Forward Removed"}
        keys["merged_removed"] = {"name": "Merged Removed"}

        # Config for the plot. Named pconfig so it does not shadow the
        # imported multiqc ``config`` module inside this method.
        pconfig = {
            "id": "dedup_rates",
            "title": "DeDup: Deduplicated Reads",
            "ylab": "# Reads",
            "cpswitch_counts_label": "Number of Reads",
        }

        return bargraph.plot(self.dedup_data, keys, pconfig)