#!/usr/bin/env python """ MultiQC module to parse output from Flexbar """ from __future__ import print_function from collections import OrderedDict import logging import re from multiqc.plots import bargraph from multiqc.modules.base_module import BaseMultiqcModule # Initialise the logger log = logging.getLogger(__name__) class MultiqcModule(BaseMultiqcModule): def __init__(self): # Initialise the parent object super(MultiqcModule, self).__init__( name="Flexbar", anchor="flexbar", href="https://github.com/seqan/flexbar", info="is a barcode and adapter removal tool.", ) # Parse logs self.flexbar_data = dict() for f in self.find_log_files("flexbar", filehandles=True): self.parse_flexbar(f) # Filter to strip out ignored sample names self.flexbar_data = self.ignore_samples(self.flexbar_data) if len(self.flexbar_data) == 0: raise UserWarning log.info("Found {} logs".format(len(self.flexbar_data))) self.write_data_file(self.flexbar_data, "multiqc_flexbar") # Add drop rate to the general stats table headers = {} headers["removed_bases_pct"] = { "title": "% bp Trimmed", "description": "% Total Base Pairs removed", "max": 100, "min": 0, "suffix": "%", "scale": "YlOrRd", } self.general_stats_addcols(self.flexbar_data, headers) # Make barplot self.flexbar_barplot() def parse_flexbar(self, f): def _save_data(parsed_data): if len(parsed_data) > 0: # Calculate removed_bases if "processed_bases" in parsed_data and "remaining_bases" in parsed_data: parsed_data["removed_bases"] = parsed_data["processed_bases"] - parsed_data["remaining_bases"] parsed_data["removed_bases_pct"] = ( float(parsed_data["removed_bases"]) / float(parsed_data["processed_bases"]) ) * 100.0 if s_name in self.flexbar_data: log.debug("Duplicate sample name found! Overwriting: {}".format(s_name)) self.flexbar_data[s_name] = parsed_data regexes = { "output_filename": r"Read file:\s+(.+)$", "processed_reads": r"Processed reads\s+(\d+)", "skipped_due_to_uncalled_bases": r"skipped due to uncalled bases\s+(\d+)", "short_prior_to_adapter_removal": r"short prior to adapter removal\s+(\d+)", "finally_skipped_short_reads": r"finally skipped short reads\s+(\d+)", "discarded_reads_overall": r"Discarded reads overall\s+(\d+)", "remaining_reads": r"Remaining reads\s+(\d+)", "processed_bases": r"Processed bases:?\s+(\d+)", "remaining_bases": r"Remaining bases:?\s+(\d+)", } s_name = f["s_name"] parsed_data = dict() for l in f["f"]: for k, r in regexes.items(): match = re.search(r, l) if match: if k == "output_filename": s_name = self.clean_s_name(match.group(1), f["root"]) else: parsed_data[k] = int(match.group(1)) # End of log output. Save and reset in case of more logs. if "Flexbar completed" in l: _save_data(parsed_data) s_name = f["s_name"] parsed_data = dict() # Pick up any partial logs _save_data(parsed_data) def flexbar_barplot(self): """ Make the HighCharts HTML to plot the flexbar rates """ # Specify the order of the different possible categories keys = OrderedDict() keys["remaining_reads"] = {"color": "#437bb1", "name": "Remaining reads"} keys["skipped_due_to_uncalled_bases"] = {"color": "#e63491", "name": "Skipped due to uncalled bases"} keys["short_prior_to_adapter_removal"] = {"color": "#b1084c", "name": "Short prior to adapter removal"} keys["finally_skipped_short_reads"] = {"color": "#7f0000", "name": "Finally skipped short reads"} # Config for the plot pconfig = { "id": "flexbar_plot", "title": "Flexbar: Processed Reads", "ylab": "# Reads", "cpswitch_counts_label": "Number of Reads", "hide_zero_cats": False, } self.add_section(plot=bargraph.plot(self.flexbar_data, keys, pconfig))