1#!/usr/bin/env python 2 3""" MultiQC module to parse output from featureCounts """ 4 5from __future__ import print_function 6from collections import OrderedDict 7import logging 8import re 9 10from multiqc import config 11from multiqc.plots import bargraph 12from multiqc.modules.base_module import BaseMultiqcModule 13 14# Initialise the logger 15log = logging.getLogger(__name__) 16 17 18class MultiqcModule(BaseMultiqcModule): 19 def __init__(self): 20 21 # Initialise the parent object 22 super(MultiqcModule, self).__init__( 23 name="featureCounts", 24 anchor="featurecounts", 25 target="Subread featureCounts", 26 href="http://bioinf.wehi.edu.au/featureCounts/", 27 info="is a highly efficient general-purpose read summarization program" 28 " that counts mapped reads for genomic features such as genes, exons," 29 " promoter, gene bodies, genomic bins and chromosomal locations.", 30 ) 31 32 # Find and load any featureCounts reports 33 self.featurecounts_data = dict() 34 self.featurecounts_keys = list() 35 for f in self.find_log_files("featurecounts"): 36 self.parse_featurecounts_report(f) 37 38 # Filter to strip out ignored sample names 39 self.featurecounts_data = self.ignore_samples(self.featurecounts_data) 40 41 if len(self.featurecounts_data) == 0: 42 raise UserWarning 43 44 log.info("Found {} reports".format(len(self.featurecounts_data))) 45 46 # Write parsed report data to a file 47 self.write_data_file(self.featurecounts_data, "multiqc_featureCounts") 48 49 # Basic Stats Table 50 self.featurecounts_stats_table() 51 52 # Assignment bar plot 53 self.add_section(plot=self.featureCounts_chart()) 54 55 def parse_featurecounts_report(self, f): 56 """ Parse the featureCounts log file. """ 57 58 file_names = list() 59 parsed_data = dict() 60 split_sep = "\t" 61 for l in f["f"].splitlines(): 62 thisrow = list() 63 64 # If this is from Rsubread then the formatting can be very variable 65 # Default search pattern is quite generic, so f 66 if len(file_names) == 0 and len(l.split(split_sep)) < 2: 67 # Split by whitespace and strip quote marks 68 # NB: Will break if sample names have whitespace. RSubread output is so variable that this is difficult to avoid 69 split_sep = None 70 71 s = l.split(split_sep) 72 s = [sv.strip('"') for sv in s] 73 74 if len(s) < 2: 75 continue 76 if s[0] == "Status": 77 for f_name in s[1:]: 78 file_names.append(f_name) 79 elif len(file_names) + 1 == len(s): 80 k = s[0] 81 if k not in self.featurecounts_keys: 82 self.featurecounts_keys.append(k) 83 for val in s[1:]: 84 try: 85 thisrow.append(int(val)) 86 except ValueError: 87 pass 88 if len(thisrow) > 0: 89 parsed_data[k] = thisrow 90 91 # Check that this actually is a featureCounts file, as format and parsing is quite general 92 if "Assigned" not in parsed_data.keys(): 93 return None 94 95 for idx, f_name in enumerate(file_names): 96 97 # Clean up sample name 98 s_name = self.clean_s_name(f_name, f["root"]) 99 100 # Reorganised parsed data for this sample 101 # Collect total count number 102 data = dict() 103 data["Total"] = 0 104 for k in parsed_data: 105 data[k] = parsed_data[k][idx] 106 data["Total"] += parsed_data[k][idx] 107 108 # Calculate the percent aligned if we can 109 try: 110 data["percent_assigned"] = (float(data["Assigned"]) / float(data["Total"])) * 100.0 111 except (KeyError, ZeroDivisionError): 112 pass 113 114 # Add to the main dictionary 115 if len(data) > 1: 116 if s_name in self.featurecounts_data: 117 log.debug("Duplicate sample name found! Overwriting: {}".format(s_name)) 118 self.add_data_source(f, s_name) 119 self.featurecounts_data[s_name] = data 120 121 def featurecounts_stats_table(self): 122 """Take the parsed stats from the featureCounts report and add them to the 123 basic stats table at the top of the report""" 124 125 headers = OrderedDict() 126 headers["percent_assigned"] = { 127 "title": "% Assigned", 128 "description": "% Assigned reads", 129 "max": 100, 130 "min": 0, 131 "suffix": "%", 132 "scale": "RdYlGn", 133 } 134 headers["Assigned"] = { 135 "title": "{} Assigned".format(config.read_count_prefix), 136 "description": "Assigned reads ({})".format(config.read_count_desc), 137 "min": 0, 138 "scale": "PuBu", 139 "modify": lambda x: float(x) * config.read_count_multiplier, 140 "shared_key": "read_count", 141 } 142 self.general_stats_addcols(self.featurecounts_data, headers) 143 144 def featureCounts_chart(self): 145 """ Make the featureCounts assignment rates plot """ 146 147 headers = OrderedDict() 148 for h in self.featurecounts_keys: 149 nice_name = h.replace("Unassigned_", "Unassigned: ").replace("_", " ") 150 nice_name = re.sub(r"([a-z])([A-Z])", "\g<1> \g<2>", nice_name) 151 headers[h] = {"name": nice_name} 152 153 # Config for the plot 154 config = { 155 "id": "featureCounts_assignment_plot", 156 "title": "featureCounts: Assignments", 157 "ylab": "# Reads", 158 "cpswitch_counts_label": "Number of Reads", 159 } 160 161 return bargraph.plot(self.featurecounts_data, headers, config) 162