1#!/usr/bin/env python 2 3""" MultiQC module to parse output from FastQ Screen """ 4 5from __future__ import print_function 6from collections import OrderedDict 7import json 8import logging 9import re 10 11from multiqc import config 12from multiqc.plots import bargraph 13from multiqc.modules.base_module import BaseMultiqcModule 14from multiqc.utils import report 15 16# Initialise the logger 17log = logging.getLogger(__name__) 18 19 20class MultiqcModule(BaseMultiqcModule): 21 def __init__(self): 22 23 # Initialise the parent object 24 super(MultiqcModule, self).__init__( 25 name="FastQ Screen", 26 anchor="fastq_screen", 27 href="http://www.bioinformatics.babraham.ac.uk/projects/fastq_screen/", 28 info="allows you to screen a library of sequences in FastQ format against" 29 " a set of sequence databases so you can see if the composition of the" 30 " library matches with what you expect.", 31 ) 32 33 # Find and load any FastQ Screen reports 34 self.fq_screen_data = dict() 35 self.num_orgs = 0 36 for f in self.find_log_files("fastq_screen", filehandles=True): 37 parsed_data = self.parse_fqscreen(f) 38 if parsed_data is not None: 39 if f["s_name"] in self.fq_screen_data: 40 log.debug("Duplicate sample name found! Overwriting: {}".format(f["s_name"])) 41 self.add_data_source(f) 42 self.fq_screen_data[f["s_name"]] = parsed_data 43 44 # Filter to strip out ignored sample names 45 self.fq_screen_data = self.ignore_samples(self.fq_screen_data) 46 47 if len(self.fq_screen_data) == 0: 48 raise UserWarning 49 50 log.info("Found {} reports".format(len(self.fq_screen_data))) 51 52 # Check whether we have a consistent number of organisms across all samples 53 num_orgs = set([len(orgs) for orgs in self.fq_screen_data.values()]) 54 55 # Section 1 - Alignment Profiles 56 # Posh plot only works for around 20 samples, 8 organisms. If all samples have the same number of organisms. 57 if ( 58 len(num_orgs) == 1 59 and len(self.fq_screen_data) * self.num_orgs <= 160 60 and not config.plots_force_flat 61 and not getattr(config, "fastqscreen_simpleplot", False) 62 ): 63 self.add_section(name="Mapped Reads", anchor="fastq_screen_mapped_reads", content=self.fqscreen_plot()) 64 # Use simpler plot that works with many samples 65 else: 66 self.add_section(name="Mapped Reads", anchor="fastq_screen_mapped_reads", plot=self.fqscreen_simple_plot()) 67 68 # Section 2 - Optional bisfulfite plot 69 self.fqscreen_bisulfite_plot() 70 71 # Write the total counts and percentages to files 72 self.write_data_file(self.parse_csv(), "multiqc_fastq_screen") 73 74 def parse_fqscreen(self, f): 75 """ Parse the FastQ Screen output into a 3D dict """ 76 parsed_data = OrderedDict() 77 nohits_pct = None 78 headers = None 79 bs_headers = None 80 for l in f["f"]: 81 # Skip comment lines 82 if l.startswith("#"): 83 continue 84 if l.startswith("%Hit_no_genomes:") or l.startswith("%Hit_no_libraries:"): 85 nohits_pct = float(l.split(":", 1)[1]) 86 parsed_data["No hits"] = {"percentages": {"one_hit_one_library": nohits_pct}} 87 else: 88 s = l.strip().split("\t") 89 90 # Regular FastQ Screen table section 91 if len(s) == 12: 92 if headers is None: 93 headers = s 94 else: 95 # Can't use #Reads in subset as varies. #Reads_processed should be same for all orgs in a sample 96 parsed_data["total_reads"] = int(s[1]) 97 # Loop through all columns 98 parsed_data[s[0]] = {"percentages": {}, "counts": {}} 99 for idx, h in enumerate(headers): 100 if idx == 0: 101 continue 102 dtype = "percentages" if h.startswith("%") else "counts" 103 field = ( 104 h.replace("%", "") 105 .replace("#", "") 106 .replace("genomes", "libraries") 107 .replace("genome", "library") 108 .lower() 109 ) 110 parsed_data[s[0]][dtype][field] = float(s[idx]) 111 112 # Optional Bisulfite table section 113 elif len(s) == 9: 114 if bs_headers is None: 115 bs_headers = s 116 else: 117 # Loop through all columns 118 parsed_data[s[0]]["bisulfite_percentages"] = {} 119 parsed_data[s[0]]["bisulfite_counts"] = {} 120 for idx, h in enumerate(bs_headers): 121 if idx == 0: 122 continue 123 dtype = "bisulfite_percentages" if h.startswith("%") else "bisulfite_counts" 124 field = h.replace("%", "").replace("#", "").lower() 125 parsed_data[s[0]][dtype][field] = float(s[idx]) 126 127 if len(parsed_data) == 0: 128 return None 129 130 # Calculate no hits counts 131 if parsed_data.get("total_reads") is not None and nohits_pct is not None: 132 parsed_data["No hits"]["counts"] = { 133 "one_hit_one_library": int((nohits_pct / 100.0) * float(parsed_data["total_reads"])) 134 } 135 else: 136 log.warning("Couldn't find number of reads with no hits for '{}'".format(f["s_name"])) 137 138 self.num_orgs = max(len(parsed_data), self.num_orgs) 139 return parsed_data 140 141 def parse_csv(self): 142 totals = OrderedDict() 143 for s in sorted(self.fq_screen_data.keys()): 144 totals[s] = OrderedDict() 145 for org in self.fq_screen_data[s]: 146 if org == "total_reads": 147 totals[s]["total_reads"] = self.fq_screen_data[s][org] 148 continue 149 try: 150 k = "{} counts".format(org) 151 totals[s][k] = self.fq_screen_data[s][org]["counts"]["one_hit_one_library"] 152 totals[s][k] += self.fq_screen_data[s][org]["counts"].get("multiple_hits_one_library", 0) 153 totals[s][k] += self.fq_screen_data[s][org]["counts"].get("one_hit_multiple_libraries", 0) 154 totals[s][k] += self.fq_screen_data[s][org]["counts"].get("multiple_hits_multiple_libraries", 0) 155 except KeyError: 156 pass 157 try: 158 k = "{} percentage".format(org) 159 totals[s][k] = self.fq_screen_data[s][org]["percentages"]["one_hit_one_library"] 160 totals[s][k] += self.fq_screen_data[s][org]["percentages"].get("multiple_hits_one_library", 0) 161 totals[s][k] += self.fq_screen_data[s][org]["percentages"].get("one_hit_multiple_libraries", 0) 162 totals[s][k] += self.fq_screen_data[s][org]["percentages"].get( 163 "multiple_hits_multiple_libraries", 0 164 ) 165 except KeyError: 166 pass 167 return totals 168 169 def fqscreen_plot(self): 170 """Makes a fancy custom plot which replicates the plot seen in the main 171 FastQ Screen program. Not useful if lots of samples as gets too wide.""" 172 173 categories = list() 174 getCats = True 175 data = list() 176 p_types = OrderedDict() 177 p_types["multiple_hits_multiple_libraries"] = {"col": "#7f0000", "name": "Multiple Hits, Multiple Genomes"} 178 p_types["one_hit_multiple_libraries"] = {"col": "#ff0000", "name": "One Hit, Multiple Genomes"} 179 p_types["multiple_hits_one_library"] = {"col": "#00007f", "name": "Multiple Hits, One Genome"} 180 p_types["one_hit_one_library"] = {"col": "#0000ff", "name": "One Hit, One Genome"} 181 for k, t in p_types.items(): 182 first = True 183 for s in sorted(self.fq_screen_data.keys()): 184 thisdata = list() 185 if len(categories) > 0: 186 getCats = False 187 for org in sorted(self.fq_screen_data[s]): 188 if org == "total_reads": 189 continue 190 try: 191 thisdata.append(self.fq_screen_data[s][org]["percentages"][k]) 192 except KeyError: 193 thisdata.append(None) 194 if getCats: 195 categories.append(org) 196 td = {"name": t["name"], "stack": s, "data": thisdata, "color": t["col"]} 197 if first: 198 first = False 199 else: 200 td["linkedTo"] = ":previous" 201 data.append(td) 202 203 plot_id = report.save_htmlid("fq_screen_plot") 204 html = """<div id={plot_id} class="fq_screen_plot hc-plot"></div> 205 <script type="application/json" class="fq_screen_dict">{dict}</script> 206 """.format( 207 plot_id=json.dumps(plot_id), 208 dict=json.dumps({"plot_id": plot_id, "data": data, "categories": categories}), 209 ) 210 211 html += """<script type="text/javascript"> 212 fq_screen_dict = { }; // { <plot_id>: data, categories } 213 $('.fq_screen_dict').each(function (i, elem) { 214 var dict = JSON.parse(elem.innerHTML); 215 fq_screen_dict[dict.plot_id] = dict; 216 }); 217 218 $(function () { 219 // In case of repeated modules: #fq_screen_plot, #fq_screen_plot-1, .. 220 $(".fq_screen_plot").each(function () { 221 var plot_id = $(this).attr('id'); 222 223 $(this).highcharts({ 224 chart: { type: "column", backgroundColor: null }, 225 title: { text: "FastQ Screen Results" }, 226 xAxis: { categories: fq_screen_dict[plot_id].categories }, 227 yAxis: { 228 max: 100, 229 min: 0, 230 title: { text: "Percentage Aligned" } 231 }, 232 tooltip: { 233 formatter: function () { 234 return "<b>" + this.series.stackKey.replace("column","") + " - " + this.x + "</b><br/>" + 235 this.series.name + ": " + this.y + "%<br/>" + 236 "Total Alignment: " + this.point.stackTotal + "%"; 237 }, 238 }, 239 plotOptions: { 240 column: { 241 pointPadding: 0, 242 groupPadding: 0.02, 243 stacking: "normal" 244 } 245 }, 246 series: fq_screen_dict[plot_id].data 247 }); 248 }); 249 }); 250 </script>""" 251 252 return html 253 254 def fqscreen_simple_plot(self): 255 """Makes a simple bar plot with summed alignment counts for 256 each species, stacked.""" 257 258 # First, sum the different types of alignment counts 259 data = {} 260 org_counts = {} 261 for s_name in sorted(self.fq_screen_data): 262 data[s_name] = OrderedDict() 263 sum_alignments = 0 264 for org in self.fq_screen_data[s_name]: 265 if org == "total_reads": 266 continue 267 try: 268 data[s_name][org] = self.fq_screen_data[s_name][org]["counts"]["one_hit_one_library"] 269 except KeyError: 270 log.error( 271 "No counts found for '{}' ('{}'). Could be malformed or very old FastQ Screen results.".format( 272 org, s_name 273 ) 274 ) 275 continue 276 try: 277 data[s_name][org] += self.fq_screen_data[s_name][org]["counts"]["multiple_hits_one_library"] 278 except KeyError: 279 pass 280 sum_alignments += data[s_name][org] 281 org_counts[org] = org_counts.get(org, 0) + data[s_name][org] 282 283 # Calculate hits in multiple genomes 284 if "total_reads" in self.fq_screen_data[s_name]: 285 data[s_name]["Multiple Genomes"] = self.fq_screen_data[s_name]["total_reads"] - sum_alignments 286 287 # Sort the categories by the total read counts 288 cats = OrderedDict() 289 for org in sorted(org_counts, key=org_counts.get, reverse=True): 290 if org not in cats and org != "No hits": 291 cats[org] = {"name": org} 292 293 # Strip empty dicts 294 for s_name in list(data.keys()): 295 if len(data[s_name]) == 0: 296 del data[s_name] 297 298 pconfig = { 299 "id": "fastq_screen_plot", 300 "title": "FastQ Screen: Mapped Reads", 301 "cpswitch_c_active": False, 302 "ylab": "Percentages", 303 } 304 cats["Multiple Genomes"] = {"name": "Multiple Genomes", "color": "#820000"} 305 cats["No hits"] = {"name": "No hits", "color": "#cccccc"} 306 307 return bargraph.plot(data, cats, pconfig) 308 309 def fqscreen_bisulfite_plot(self): 310 """ Make a stacked barplot for the bisulfite data, if we have any """ 311 312 pconfig = { 313 "id": "fastq_screen_bisulfite_plot", 314 "title": "FastQ Screen: Bisulfite Mapping Strand Orientation", 315 "hide_zero_cats": False, 316 "ylab": "Percentages", 317 "data_labels": [], 318 } 319 320 cats = OrderedDict() 321 cats["original_top_strand"] = {"name": "Original top strand", "color": "#80cdc1"} 322 cats["complementary_to_original_top_strand"] = { 323 "name": "Complementary to original top strand", 324 "color": "#018571", 325 } 326 cats["complementary_to_original_bottom_strand"] = { 327 "name": "Complementary to original bottom strand", 328 "color": "#a6611a", 329 } 330 cats["original_bottom_strand"] = {"name": "Original bottom strand", "color": "#dfc27d"} 331 332 # Pull out the data that we want 333 pdata_unsorted = {} 334 org_counts = {} 335 max_count = 0 336 for s_name, d in self.fq_screen_data.items(): 337 for org, dd in d.items(): 338 if org not in pdata_unsorted: 339 pdata_unsorted[org] = {} 340 org_counts[org] = 0 341 try: 342 pdata_unsorted[org][s_name] = dd["bisulfite_counts"] 343 total_counts = sum([c for c in dd["bisulfite_counts"].values()]) 344 org_counts[org] += total_counts 345 max_count = max(max_count, total_counts) 346 except (TypeError, KeyError): 347 pass 348 349 # Consistent y-max across all genomes 350 pconfig["ymax"] = max_count 351 352 # Show tabbed bar plots, sorted by total read count 353 pdata = [] 354 pcats = [] 355 for org in sorted(org_counts, key=org_counts.get, reverse=True): 356 if org_counts[org] > 0: 357 pconfig["data_labels"].append(org) 358 pdata.append(pdata_unsorted[org]) 359 pcats.append(cats) 360 361 if len(pdata) == 0: 362 return None 363 364 self.add_section( 365 name="Bisulfite Reads", anchor="fastq_screen_bisulfite", plot=bargraph.plot(pdata, pcats, pconfig) 366 ) 367