1#!/usr/bin/env python 2 3""" MultiQC module to parse output from HTSeq Count """ 4 5from __future__ import print_function 6from collections import OrderedDict 7import logging 8 9from multiqc import config 10from multiqc.plots import bargraph 11from multiqc.modules.base_module import BaseMultiqcModule 12 13# Initialise the logger 14log = logging.getLogger(__name__) 15 16 17class MultiqcModule(BaseMultiqcModule): 18 def __init__(self): 19 20 # Initialise the parent object 21 super(MultiqcModule, self).__init__( 22 name="HTSeq Count", 23 anchor="htseq", 24 target="HTSeq Count", 25 href="http://www-huber.embl.de/HTSeq/doc/count.html", 26 info=" is part of the HTSeq Python package - it takes a file with aligned sequencing " 27 "reads, plus a list of genomic features and counts how many reads map to each feature.", 28 ) 29 30 # Find and load any HTSeq Count reports 31 self.htseq_data = dict() 32 self.htseq_keys = list() 33 for f in self.find_log_files("htseq", filehandles=True): 34 parsed_data = self.parse_htseq_report(f) 35 if parsed_data is not None: 36 self.htseq_data[f["s_name"]] = parsed_data 37 38 # Filter to strip out ignored sample names 39 self.htseq_data = self.ignore_samples(self.htseq_data) 40 41 if len(self.htseq_data) == 0: 42 raise UserWarning 43 44 log.info("Found {} reports".format(len(self.htseq_data))) 45 46 # Write parsed report data to a file 47 self.write_data_file(self.htseq_data, "multiqc_htseq") 48 49 # Basic Stats Table 50 self.htseq_stats_table() 51 52 # Assignment bar plot 53 self.add_section(plot=self.htseq_counts_chart()) 54 55 def parse_htseq_report(self, f): 56 """ Parse the HTSeq Count log file. """ 57 keys = ["__no_feature", "__ambiguous", "__too_low_aQual", "__not_aligned", "__alignment_not_unique"] 58 parsed_data = dict() 59 assigned_counts = 0 60 for l in f["f"]: 61 s = l.split("\t") 62 if s[0] in keys: 63 parsed_data[s[0][2:]] = int(s[-1]) 64 else: 65 try: 66 assigned_counts += int(s[-1]) 67 except (ValueError, IndexError): 68 pass 69 if len(parsed_data) > 0: 70 parsed_data["assigned"] = assigned_counts 71 parsed_data["total_count"] = sum([v for v in parsed_data.values()]) 72 try: 73 parsed_data["percent_assigned"] = ( 74 float(parsed_data["assigned"]) / float(parsed_data["total_count"]) 75 ) * 100.0 76 except ZeroDivisionError: 77 parsed_data["percent_assigned"] = 0 78 return parsed_data 79 return None 80 81 def htseq_stats_table(self): 82 """Take the parsed stats from the HTSeq Count report and add them to the 83 basic stats table at the top of the report""" 84 85 headers = OrderedDict() 86 headers["percent_assigned"] = { 87 "title": "% Assigned", 88 "description": "% Assigned reads", 89 "max": 100, 90 "min": 0, 91 "suffix": "%", 92 "scale": "RdYlGn", 93 } 94 headers["assigned"] = { 95 "title": "{} Assigned".format(config.read_count_prefix), 96 "description": "Assigned Reads ({})".format(config.read_count_desc), 97 "min": 0, 98 "scale": "PuBu", 99 "modify": lambda x: float(x) * config.read_count_multiplier, 100 "shared_key": "read_count", 101 } 102 self.general_stats_addcols(self.htseq_data, headers) 103 104 def htseq_counts_chart(self): 105 """ Make the HTSeq Count assignment rates plot """ 106 cats = OrderedDict() 107 cats["assigned"] = {"name": "Assigned"} 108 cats["ambiguous"] = {"name": "Ambiguous"} 109 cats["alignment_not_unique"] = {"name": "Alignment Not Unique"} 110 cats["no_feature"] = {"name": "No Feature"} 111 cats["too_low_aQual"] = {"name": "Too Low aQual"} 112 cats["not_aligned"] = {"name": "Not Aligned"} 113 config = { 114 "id": "htseq_assignment_plot", 115 "title": "HTSeq: Count Assignments", 116 "ylab": "# Reads", 117 "hide_zero_cats": False, 118 "cpswitch_counts_label": "Number of Reads", 119 } 120 return bargraph.plot(self.htseq_data, cats, config) 121