1#!/usr/bin/env python 2 3""" MultiQC module to parse logs from SnpEff """ 4 5from __future__ import print_function 6 7from collections import OrderedDict 8import logging 9from multiqc.plots import bargraph, linegraph 10from multiqc.modules.base_module import BaseMultiqcModule 11 12# Initialise the logger 13log = logging.getLogger(__name__) 14 15 16class MultiqcModule(BaseMultiqcModule): 17 """ SnpEff """ 18 19 def __init__(self): 20 # Initialise the parent object 21 super(MultiqcModule, self).__init__( 22 name="SnpEff", 23 anchor="snpeff", 24 href="http://snpeff.sourceforge.net/", 25 info=" is a genetic variant annotation and effect prediction " 26 "toolbox. It annotates and predicts the effects of variants " 27 "on genes (such as amino acid changes). ", 28 ) 29 30 self.snpeff_data = dict() 31 self.snpeff_section_totals = dict() 32 self.snpeff_qualities = dict() 33 34 for f in self.find_log_files("snpeff", filehandles=True): 35 self.parse_snpeff_log(f) 36 37 # Filter to strip out ignored sample names 38 self.snpeff_data = self.ignore_samples(self.snpeff_data) 39 40 if len(self.snpeff_data) == 0: 41 raise UserWarning 42 43 log.info("Found {} reports".format(len(self.snpeff_data))) 44 45 # Write parsed report data to a file 46 self.write_data_file(self.snpeff_data, "multiqc_snpeff") 47 48 # General stats table 49 self.general_stats() 50 51 # Report sections 52 self.add_section( 53 name="Variants by Genomic Region", 54 description=( 55 """ 56 The stacked bar plot shows locations of detected variants in 57 the genome and the number of variants for each location. 58 """ 59 ), 60 helptext=( 61 """ 62 The upstream and downstream interval size to detect these 63 genomic regions is 5000bp by default. 64 """ 65 ), 66 anchor="snpeff-genomic-regions", 67 plot=self.count_genomic_region_plot(), 68 ) 69 self.add_section( 70 name="Variant Effects by Impact", 71 description=( 72 """ 73 The stacked bar plot shows the putative impact of detected 74 variants and the number of variants for each impact. 75 """ 76 ), 77 helptext=( 78 """ 79 There are four levels of impacts predicted by SnpEff: 80 81 * **High**: High impact (like stop codon) 82 * **Moderate**: Middle impact (like same type of amino acid substitution) 83 * **Low**: Low impact (ie silence mutation) 84 * **Modifier**: No impact 85 """ 86 ), 87 anchor="snpeff-effects-impact", 88 plot=self.effects_impact_plot(), 89 ) 90 self.add_section( 91 name="Variants by Effect Types", 92 description=( 93 """ 94 The stacked bar plot shows the effect of variants at protein 95 level and the number of variants for each effect type. 96 """ 97 ), 98 helptext=( 99 """ 100 This plot shows the effect of variants with respect to 101 the mRNA. 102 """ 103 ), 104 anchor="snpeff-effects", 105 plot=self.effects_plot(), 106 ) 107 self.add_section( 108 name="Variants by Functional Class", 109 description=( 110 """ 111 The stacked bar plot shows the effect of variants and 112 the number of variants for each effect type. 113 """ 114 ), 115 helptext=( 116 """ 117 This plot shows the effect of variants on the translation of 118 the mRNA as protein. There are three possible cases: 119 120 * **Silent**: The amino acid does not change. 121 * **Missense**: The amino acid is different. 122 * **Nonsense**: The variant generates a stop codon. 123 """ 124 ), 125 anchor="snpeff-functional-class", 126 plot=self.effects_function_plot(), 127 ) 128 if len(self.snpeff_qualities) > 0: 129 self.add_section( 130 name="Variant Qualities", 131 description=( 132 """ 133 The line plot shows the quantity as function of the 134 variant quality score. 135 """ 136 ), 137 helptext=( 138 """ 139 The quality score corresponds to the QUAL column of the 140 VCF file. This score is set by the variant caller. 141 """ 142 ), 143 anchor="snpeff-qualities", 144 plot=self.qualities_plot(), 145 ) 146 147 def parse_snpeff_log(self, f): 148 """ Go through log file looking for snpeff output """ 149 150 keys = { 151 "# Summary table": [ 152 "Genome", 153 "Number_of_variants_before_filter", 154 "Number_of_known_variants", 155 "Number_of_effects", 156 "Genome_total_length", 157 "Change_rate", 158 ], 159 "# Effects by impact": ["HIGH", "LOW", "MODERATE", "MODIFIER"], 160 "# Effects by functional class": ["MISSENSE", "NONSENSE", "SILENT", "Missense_Silent_ratio"], 161 "# Hom/Het table": ["Het", "Hom", "Missing"], 162 "# Ts/Tv summary": ["Transitions", "Transversions", "Ts_Tv_ratio"], 163 "# Count by effects": "all", 164 "# Count by genomic region": "all", 165 } 166 parsed_data = {} 167 section = None 168 for l in f["f"]: 169 l = l.strip() 170 if l[:1] == "#": 171 section = l 172 self.snpeff_section_totals[section] = dict() 173 continue 174 s = l.split(",") 175 176 # Quality values / counts 177 if section == "# Quality": 178 quals = OrderedDict() 179 if l.startswith("Values"): 180 values = [int(c) for c in l.split(",")[1:]] 181 counts = f["f"].readline() 182 counts = [int(c) for c in counts.split(",")[1:]] 183 c = 0 184 total = sum(counts) 185 for i, v in enumerate(values): 186 if c < (total * 0.995): 187 quals[v] = counts[i] 188 c += counts[i] 189 if len(quals) > 0: 190 self.snpeff_qualities[f["s_name"]] = quals 191 192 # Everything else 193 elif section in keys: 194 if keys[section] == "all" or any([k in s[0].strip() for k in keys[section]]): 195 try: 196 parsed_data[s[0].strip()] = float(s[1].strip()) 197 except ValueError: 198 parsed_data[s[0].strip()] = s[1].strip() 199 except IndexError: 200 pass 201 else: 202 # Parsing the number worked - add to totals 203 try: 204 self.snpeff_section_totals[section][s[0].strip()] += parsed_data[s[0].strip()] 205 except KeyError: 206 self.snpeff_section_totals[section][s[0].strip()] = parsed_data[s[0].strip()] 207 if len(s) > 2 and s[2][-1:] == "%": 208 parsed_data["{}_percent".format(s[0].strip())] = float(s[2][:-1]) 209 210 if len(parsed_data) > 0: 211 if f["s_name"] in self.snpeff_data: 212 log.debug("Duplicate sample name found! Overwriting: {}".format(f["s_name"])) 213 self.add_data_source(f) 214 self.snpeff_data[f["s_name"]] = parsed_data 215 216 def general_stats(self): 217 """ Add key SnpEff stats to the general stats table """ 218 219 headers = OrderedDict() 220 headers["Change_rate"] = {"title": "Change rate", "scale": "RdYlBu-rev", "min": 0, "format": "{:,.0f}"} 221 headers["Ts_Tv_ratio"] = { 222 "title": "Ts/Tv", 223 "description": "Transitions / Transversions ratio", 224 "format": "{:,.3f}", 225 } 226 headers["Number_of_variants_before_filter"] = { 227 "title": "M Variants", 228 "description": "Number of variants before filter (millions)", 229 "scale": "PuRd", 230 "modify": lambda x: x / 1000000, 231 "min": 0, 232 "format": "{:,.2f}", 233 } 234 self.general_stats_addcols(self.snpeff_data, headers) 235 236 def count_genomic_region_plot(self): 237 """ Generate the SnpEff Counts by Genomic Region plot """ 238 239 # Sort the keys based on the total counts 240 keys = self.snpeff_section_totals["# Count by genomic region"] 241 sorted_keys = sorted(keys, reverse=True, key=keys.get) 242 243 # Make nicer label names 244 pkeys = OrderedDict() 245 for k in sorted_keys: 246 pkeys[k] = {"name": k.replace("_", " ").title().replace("Utr", "UTR")} 247 248 # Config for the plot 249 pconfig = { 250 "id": "snpeff_variant_effects_region", 251 "title": "SnpEff: Counts by Genomic Region", 252 "ylab": "# Reads", 253 "logswitch": True, 254 } 255 256 return bargraph.plot(self.snpeff_data, pkeys, pconfig) 257 258 def effects_plot(self): 259 """ Generate the SnpEff Counts by Effects plot """ 260 261 # Sort the keys based on the total counts 262 keys = self.snpeff_section_totals["# Count by effects"] 263 sorted_keys = sorted(keys, reverse=True, key=keys.get) 264 265 # Make nicer label names 266 pkeys = OrderedDict() 267 for k in sorted_keys: 268 pkeys[k] = {"name": k.replace("_", " ").title().replace("Utr", "UTR")} 269 270 # Config for the plot 271 pconfig = { 272 "id": "snpeff_effects", 273 "title": "SnpEff: Counts by Effect Types", 274 "ylab": "# Reads", 275 "logswitch": False, 276 } 277 return bargraph.plot(self.snpeff_data, pkeys, pconfig) 278 279 def effects_impact_plot(self): 280 """ Generate the SnpEff Counts by Effects Impact plot """ 281 282 # Put keys in a more logical order 283 keys = ["MODIFIER", "LOW", "MODERATE", "HIGH"] 284 285 # Make nicer label names 286 pkeys = OrderedDict() 287 for k in keys: 288 pkeys[k] = {"name": k.title()} 289 290 # Config for the plot 291 pconfig = { 292 "id": "snpeff_variant_effects_impact", 293 "title": "SnpEff: Counts by Effects Impact", 294 "ylab": "# Reads", 295 "logswitch": True, 296 } 297 298 return bargraph.plot(self.snpeff_data, pkeys, pconfig) 299 300 def effects_function_plot(self): 301 """ Generate the SnpEff Counts by Functional Class plot """ 302 303 # Cats to plot in a sensible order 304 keys = ["SILENT", "MISSENSE", "NONSENSE"] 305 306 # Make nicer label names 307 pkeys = OrderedDict() 308 for k in keys: 309 pkeys[k] = {"name": k.title()} 310 311 # Config for the plot 312 pconfig = { 313 "id": "snpeff_variant_effects_class", 314 "title": "SnpEff: Counts by Functional Class", 315 "ylab": "# Reads", 316 "logswitch": True, 317 } 318 319 return bargraph.plot(self.snpeff_data, pkeys, pconfig) 320 321 def qualities_plot(self): 322 """ Generate the qualities plot """ 323 324 pconfig = { 325 "smooth_points": 200, 326 "id": "snpeff_qualities", 327 "title": "SnpEff: Qualities", 328 "ylab": "Count", 329 "xlab": "Values", 330 "xDecimals": False, 331 "ymin": 0, 332 } 333 334 return linegraph.plot(self.snpeff_qualities, pconfig) 335