1#!/usr/bin/env python 2 3""" MultiQC module to parse output from sargasso """ 4 5from __future__ import print_function 6from collections import OrderedDict 7import logging 8 9from multiqc import config 10from multiqc.plots import bargraph 11from multiqc.modules.base_module import BaseMultiqcModule 12 13# Initialise the logger 14log = logging.getLogger(__name__) 15 16 17class MultiqcModule(BaseMultiqcModule): 18 def __init__(self): 19 20 # Initialise the parent object 21 super(MultiqcModule, self).__init__( 22 name="sargasso", 23 anchor="sargasso", 24 href="http://statbio.github.io/Sargasso/", 25 info="is a tool to separate mixed-species RNA-seq reads" "according to their species of origin.", 26 ) 27 28 # Find and load any Sargasso reports 29 self.sargasso_data = dict() 30 self.sargasso_files = list() 31 self.sargasso_keys = list() # header keys 32 33 for f in self.find_log_files("sargasso"): 34 self.parse_sargasso_logs(f) 35 self.sargasso_files.append(f) 36 37 # log.info('Removing ignored samples...') 38 self.sargasso_data = self.ignore_samples(self.sargasso_data) 39 40 if len(self.sargasso_data) == 0: 41 raise UserWarning 42 43 log.info("Found {} reports".format(len(self.sargasso_files))) 44 45 # Write parsed report data to a file 46 self.write_data_file(self.sargasso_data, "multiqc_sargasso") 47 48 # Basic Stats Table 49 self.sargasso_stats_table() 50 51 # Assignment bar plot 52 self.add_section(plot=self.sargasso_chart()) 53 54 # log.info('done') 55 56 def parse_sargasso_logs(self, f): 57 """ Parse the sargasso log file. """ 58 species_name = list() 59 items = list() 60 header = list() 61 is_first_line = True 62 for l in f["f"].splitlines(): 63 s = l.split(",") 64 # Check that this actually is a Sargasso file 65 if is_first_line and s[0] != "Sample": 66 return None 67 68 if len(s) < 7: 69 continue 70 if is_first_line: 71 # prepare header 72 is_first_line = False 73 header = s 74 for i in header[1:]: 75 # find out what species included 76 sname = i.split("-")[-1] 77 if sname not in species_name: 78 species_name.append(sname) 79 # find out what is being counted 80 kname = "-".join(i.split("-")[-3:-1]) 81 if kname not in items: 82 items.append(kname) 83 else: 84 # start sample lines. 85 sample_name = s.pop(0) 86 87 chunk_by_species = [s[i : i + len(items)] for i in range(0, len(s), len(items))] 88 for idx, v in enumerate(chunk_by_species): 89 # adding species name to the sample name for easy interpretation 90 new_sample_name = "_".join([sample_name, species_name[idx]]) 91 92 # Clean up sample name 93 new_sample_name = self.clean_s_name(new_sample_name, f["root"]) 94 95 if new_sample_name in self.sargasso_data.keys(): 96 log.debug("Duplicate sample name found! Overwriting: {}".format(new_sample_name)) 97 98 try: 99 self.sargasso_data[new_sample_name] = dict(zip(items, map(int, v))) 100 except ValueError: 101 pass 102 103 self.sargasso_keys = items 104 105 for idx, f_name in enumerate(self.sargasso_data.keys()): 106 107 # Reorganised parsed data for this sample 108 # Collect total READ count number 109 self.sargasso_data[f_name]["Total"] = 0 110 for key, value in list(self.sargasso_data[f_name].items()): # iter on both keys and values 111 if key.endswith("Reads"): 112 self.sargasso_data[f_name]["Total"] += value 113 114 # Calculate the percent aligned if we can 115 try: 116 self.sargasso_data[f_name]["sargasso_percent_assigned"] = ( 117 float(self.sargasso_data[f_name]["Assigned-Reads"]) / float(self.sargasso_data[f_name]["Total"]) 118 ) * 100.0 119 except (KeyError, ZeroDivisionError): 120 pass 121 122 def sargasso_stats_table(self): 123 """Take the parsed stats from the sargasso report and add them to the 124 basic stats table at the top of the report""" 125 126 headers = OrderedDict() 127 headers["sargasso_percent_assigned"] = { 128 "title": "% Assigned", 129 "description": "Sargasso % Assigned reads", 130 "max": 100, 131 "min": 0, 132 "suffix": "%", 133 "scale": "RdYlGn", 134 } 135 headers["Assigned-Reads"] = { 136 "title": "{} Assigned".format(config.read_count_prefix), 137 "description": "Sargasso Assigned reads ({})".format(config.read_count_desc), 138 "min": 0, 139 "scale": "PuBu", 140 "modify": lambda x: float(x) * config.read_count_multiplier, 141 "shared_key": "read_count", 142 } 143 self.general_stats_addcols(self.sargasso_data, headers) 144 145 def sargasso_chart(self): 146 """ Make the sargasso plot """ 147 148 # Config for the plot 149 config = { 150 "id": "sargasso_assignment_plot", 151 "title": "Sargasso: Assigned Reads", 152 "ylab": "# Reads", 153 "cpswitch_counts_label": "Number of Reads", 154 } 155 156 # We only want to plot the READs at the moment 157 return bargraph.plot(self.sargasso_data, [name for name in self.sargasso_keys if "Reads" in name], config) 158