#!/usr/bin/env python """ MultiQC module to parse output from HTSeq Count """ from __future__ import print_function from collections import OrderedDict import logging from multiqc import config from multiqc.plots import bargraph from multiqc.modules.base_module import BaseMultiqcModule # Initialise the logger log = logging.getLogger(__name__) class MultiqcModule(BaseMultiqcModule): def __init__(self): # Initialise the parent object super(MultiqcModule, self).__init__( name="HTSeq Count", anchor="htseq", target="HTSeq Count", href="http://www-huber.embl.de/HTSeq/doc/count.html", info=" is part of the HTSeq Python package - it takes a file with aligned sequencing " "reads, plus a list of genomic features and counts how many reads map to each feature.", ) # Find and load any HTSeq Count reports self.htseq_data = dict() self.htseq_keys = list() for f in self.find_log_files("htseq", filehandles=True): parsed_data = self.parse_htseq_report(f) if parsed_data is not None: self.htseq_data[f["s_name"]] = parsed_data # Filter to strip out ignored sample names self.htseq_data = self.ignore_samples(self.htseq_data) if len(self.htseq_data) == 0: raise UserWarning log.info("Found {} reports".format(len(self.htseq_data))) # Write parsed report data to a file self.write_data_file(self.htseq_data, "multiqc_htseq") # Basic Stats Table self.htseq_stats_table() # Assignment bar plot self.add_section(plot=self.htseq_counts_chart()) def parse_htseq_report(self, f): """ Parse the HTSeq Count log file. """ keys = ["__no_feature", "__ambiguous", "__too_low_aQual", "__not_aligned", "__alignment_not_unique"] parsed_data = dict() assigned_counts = 0 for l in f["f"]: s = l.split("\t") if s[0] in keys: parsed_data[s[0][2:]] = int(s[-1]) else: try: assigned_counts += int(s[-1]) except (ValueError, IndexError): pass if len(parsed_data) > 0: parsed_data["assigned"] = assigned_counts parsed_data["total_count"] = sum([v for v in parsed_data.values()]) try: parsed_data["percent_assigned"] = ( float(parsed_data["assigned"]) / float(parsed_data["total_count"]) ) * 100.0 except ZeroDivisionError: parsed_data["percent_assigned"] = 0 return parsed_data return None def htseq_stats_table(self): """Take the parsed stats from the HTSeq Count report and add them to the basic stats table at the top of the report""" headers = OrderedDict() headers["percent_assigned"] = { "title": "% Assigned", "description": "% Assigned reads", "max": 100, "min": 0, "suffix": "%", "scale": "RdYlGn", } headers["assigned"] = { "title": "{} Assigned".format(config.read_count_prefix), "description": "Assigned Reads ({})".format(config.read_count_desc), "min": 0, "scale": "PuBu", "modify": lambda x: float(x) * config.read_count_multiplier, "shared_key": "read_count", } self.general_stats_addcols(self.htseq_data, headers) def htseq_counts_chart(self): """ Make the HTSeq Count assignment rates plot """ cats = OrderedDict() cats["assigned"] = {"name": "Assigned"} cats["ambiguous"] = {"name": "Ambiguous"} cats["alignment_not_unique"] = {"name": "Alignment Not Unique"} cats["no_feature"] = {"name": "No Feature"} cats["too_low_aQual"] = {"name": "Too Low aQual"} cats["not_aligned"] = {"name": "Not Aligned"} config = { "id": "htseq_assignment_plot", "title": "HTSeq: Count Assignments", "ylab": "# Reads", "hide_zero_cats": False, "cpswitch_counts_label": "Number of Reads", } return bargraph.plot(self.htseq_data, cats, config)