1#!/usr/bin/env python 2 3""" MultiQC submodule to parse output from Picard VariantCallingMetrics """ 4 5import logging 6from collections import OrderedDict 7from multiqc.plots import bargraph 8 9# Initialise the logger 10log = logging.getLogger(__name__) 11 12 13def parse_reports(parent_module): 14 """ Find Picard VariantCallingMetrics reports and process their data """ 15 16 # get data 17 data = collect_data(parent_module) 18 19 # Filter to strip out ignored sample names 20 data = parent_module.ignore_samples(data) 21 22 # Reference data in parent module 23 parent_module.picard_variantCalling_data = data 24 25 if len(data) > 0: 26 derive_data(data) 27 28 # Write parsed data to a file 29 parent_module.write_data_file(data, "multiqc_picard_variantCalling") 30 31 parent_module.general_stats_headers["DBSNP_TITV"] = { 32 "title": "TiTV ratio (known)", 33 "description": "The Transition/Transversion ratio of the passing bi-allelic SNP calls made at SNP-database sites.", 34 "min": 0, 35 "scale": "Blues", 36 "shared_key": "titv_ratio", 37 } 38 39 parent_module.general_stats_headers["NOVEL_TITV"] = { 40 "title": "TiTV ratio (novel)", 41 "description": "The Transition/Transversion ratio of the passing bi-allelic SNP calls made at non-SNP-database sites.", 42 "min": 0, 43 "scale": "Blues", 44 "shared_key": "titv_ratio", 45 } 46 47 parent_module.general_stats_headers["DBSNP_INS_DEL_RATIO"] = { 48 "title": "InDel ratio (known)", 49 "description": "The Insertion / Deletion ratio of the passing bi-allelic SNP calls made at SNP-database sites.", 50 "min": 0, 51 "scale": "Greens", 52 "shared_key": "indel_ratio", 53 "hidden": True, 54 } 55 56 parent_module.general_stats_headers["NOVEL_INS_DEL_RATIO"] = { 57 "title": "InDel ratio (novel)", 58 "description": "The Insertion / Deletion ratio of the passing bi-allelic SNP calls made at non-SNP-database sites.", 59 "min": 0, 60 "scale": "Greens", 61 "shared_key": "indel_ratio", 62 "hidden": True, 63 } 64 65 parent_module.general_stats_headers["total_called_variants_known"] = { 66 "title": "Called Variants (known)", 67 "description": "Total counts of variants in SNP-database sites.", 68 "shared_key": "variant_count", 69 "min": 0, 70 "format": "{0:,.0f}", 71 "hidden": True, 72 } 73 74 parent_module.general_stats_headers["total_called_variants_novel"] = { 75 "title": "Called Variants (novel)", 76 "description": "Total counts of variants in non-SNP-database sites.", 77 "shared_key": "variant_count", 78 "min": 0, 79 "format": "{0:,.0f}", 80 "hidden": True, 81 } 82 83 for s_name in data: 84 if s_name not in parent_module.general_stats_data: 85 parent_module.general_stats_data[s_name] = dict() 86 parent_module.general_stats_data[s_name].update(data[s_name]) 87 88 # Variant Counts Bargraph 89 parent_module.add_section( 90 name="Variant Types", 91 anchor="picard-variants-types", 92 description="Variants that have been called, looking at variant types. Optionally filtered on label.", 93 helptext=""" 94 Only passing variants are shown (i.e. non-filtered).\n 95 SNPs are bi-allelic.\n 96 Complex InDels are both an insertion and a deletion. 97 """, 98 plot=compare_variant_type_plot(data), 99 ) 100 101 # Variant Counts Table 102 parent_module.add_section( 103 name="Variant Labels", 104 anchor="picard-variants-labels", 105 description="Variants that have been called, comparing with known variant sites.", 106 helptext=""" 107 Only passing variants are shown (i.e. non-filtered).\n 108 Variants contain bi-allelic SNPs, multi-allelic SNPs, simple and complex inserts and deletions. 109 """, 110 plot=compare_variants_label_plot(data), 111 ) 112 113 return len(data) 114 115 116def collect_data(parent_module): 117 """ Find Picard VariantCallingMetrics reports and parse their data """ 118 119 data = dict() 120 for file_meta in parent_module.find_log_files("picard/variant_calling_metrics", filehandles=True): 121 s_name = None 122 for header, value in table_in(file_meta["f"], pre_header_string="## METRICS CLASS"): 123 if header == "SAMPLE_ALIAS": 124 s_name = value 125 if s_name in data: 126 log.debug("Duplicate sample name found in {}! Overwriting: {}".format(file_meta["fn"], s_name)) 127 data[s_name] = OrderedDict() 128 else: 129 data[s_name][header] = value 130 return data 131 132 133def table_in(filehandle, pre_header_string): 134 """ Generator that assumes a table starts the line after a given string """ 135 136 in_histogram = False 137 next_is_header = False 138 headers = list() 139 for line in stripped(filehandle): 140 if not in_histogram and line.startswith(pre_header_string): 141 in_histogram = True 142 next_is_header = True 143 elif in_histogram and next_is_header: 144 next_is_header = False 145 headers = line.split("\t") 146 elif in_histogram: 147 values = line.split("\t") 148 if values != [""]: 149 for couple in zip(headers, values): 150 yield couple 151 152 153def derive_data(data): 154 """ Based on the data derive additional data """ 155 156 for s_name, values in data.items(): 157 # setup holding variable 158 159 # Sum all variants that have been called 160 total_called_variants = 0 161 for value_name in ["TOTAL_SNPS", "TOTAL_COMPLEX_INDELS", "TOTAL_MULTIALLELIC_SNPS", "TOTAL_INDELS"]: 162 total_called_variants = total_called_variants + int(values[value_name]) 163 values["total_called_variants"] = total_called_variants 164 165 # Sum all variants that have been called and are known 166 total_called_variants_known = 0 167 for value_name in ["NUM_IN_DB_SNP", "NUM_IN_DB_SNP_COMPLEX_INDELS", "NUM_IN_DB_SNP_MULTIALLELIC"]: 168 total_called_variants_known = total_called_variants_known + int(values[value_name]) 169 total_called_variants_known = ( 170 total_called_variants_known + int(values["TOTAL_INDELS"]) - int(values["NOVEL_INDELS"]) 171 ) 172 values["total_called_variants_known"] = total_called_variants_known 173 174 # Extrapolate the total novel variants 175 values["total_called_variants_novel"] = total_called_variants - total_called_variants_known 176 177 178def stripped(iterator): 179 """ Generator to strip string of whitespace """ 180 for item in iterator: 181 yield item.strip() 182 183 184def compare_variant_type_plot(data): 185 """ Return HTML for the Variant Counts barplot """ 186 keys = OrderedDict() 187 keys["snps"] = {"name": "SNPs", "color": "#7cb5ec"} 188 keys["indels"] = {"name": "InDels", "color": "#90ed7d"} 189 keys["multiallelic_snps"] = {"name": "multi-allelic SNP", "color": "orange"} 190 keys["complex_indels"] = {"name": "Complex InDels", "color": "#8085e9"} 191 192 total_variants = dict() 193 known_variants = dict() 194 novel_variants = dict() 195 for s_name, values in data.items(): 196 total_variants[s_name] = { 197 "snps": values["TOTAL_SNPS"], 198 "indels": values["TOTAL_INDELS"], 199 "multiallelic_snps": values["TOTAL_MULTIALLELIC_SNPS"], 200 "complex_indels": values["TOTAL_COMPLEX_INDELS"], 201 } 202 203 known_variants[s_name] = { 204 "snps": values["NUM_IN_DB_SNP"], 205 "indels": int(values["TOTAL_INDELS"]) - int(values["NOVEL_INDELS"]), 206 "multiallelic_snps": values["NUM_IN_DB_SNP_MULTIALLELIC"], 207 "complex_indels": values["NUM_IN_DB_SNP_COMPLEX_INDELS"], 208 } 209 210 novel_variants[s_name] = { 211 "snps": values["NOVEL_SNPS"], 212 "indels": values["NOVEL_INDELS"], 213 "multiallelic_snps": int(values["TOTAL_MULTIALLELIC_SNPS"]) - int(values["NUM_IN_DB_SNP_MULTIALLELIC"]), 214 "complex_indels": int(values["TOTAL_COMPLEX_INDELS"]) - int(values["NUM_IN_DB_SNP_COMPLEX_INDELS"]), 215 } 216 217 plot_conf = { 218 "id": "picard_variantCallingMetrics_variant_type", 219 "title": "Picard: Variants Called", 220 "ylab": "Counts of Variants", 221 "hide_zero_cats": False, 222 "data_labels": [{"name": "Total"}, {"name": "Known"}, {"name": "Novel"}], 223 } 224 return bargraph.plot( 225 data=[total_variants, known_variants, novel_variants], cats=[keys, keys, keys], pconfig=plot_conf 226 ) 227 228 229def compare_variants_label_plot(data): 230 """ Return HTML for the Compare variants plot""" 231 keys = OrderedDict() 232 233 keys["total_called_variants_known"] = {"name": "Known Variants"} 234 keys["total_called_variants_novel"] = {"name": "Novel Variants"} 235 236 pconfig = { 237 "id": "picard_variantCallingMetrics_variant_label", 238 "title": "Picard: Variants Called", 239 "ylab": "Counts of Variants", 240 } 241 242 return bargraph.plot(data, cats=keys, pconfig=pconfig) 243 244 245def merge_two_dicts(x, y): 246 z = x.copy() # start with x's keys and values 247 z.update(y) # modifies z with y's keys and values & returns None 248 return z 249