1#!/usr/bin/env python 2 3""" MultiQC module to parse output from Samblaster """ 4 5from __future__ import print_function 6import os 7from collections import OrderedDict 8import logging 9import re 10from multiqc.plots import bargraph 11from multiqc.modules.base_module import BaseMultiqcModule 12 13# Initialise the logger 14log = logging.getLogger(__name__) 15 16 17class MultiqcModule(BaseMultiqcModule): 18 """ Samblaster """ 19 20 def __init__(self): 21 # Initialise the parent object 22 super(MultiqcModule, self).__init__( 23 name="Samblaster", 24 anchor="samblaster", 25 href="https://github.com/GregoryFaust/samblaster", 26 info="is a tool to mark duplicates and extract discordant and split reads from sam files.", 27 ) 28 29 self.samblaster_data = dict() 30 for f in self.find_log_files("samblaster", filehandles=True): 31 self.parse_samblaster(f) 32 33 # Filter to strip out ignored sample names 34 self.samblaster_data = self.ignore_samples(self.samblaster_data) 35 36 if len(self.samblaster_data) == 0: 37 raise UserWarning 38 39 headers = OrderedDict() 40 headers["pct_dups"] = { 41 "title": "% Dups", 42 "description": "Percent Duplication", 43 "max": 100, 44 "min": 0, 45 "suffix": "%", 46 "scale": "OrRd", 47 } 48 49 self.general_stats_addcols(self.samblaster_data, headers) 50 51 # Write parsed report data to a file 52 self.write_data_file(self.samblaster_data, "multiqc_samblaster") 53 54 log.info("Found {} reports".format(len(self.samblaster_data))) 55 56 self.add_barplot() 57 58 def add_barplot(self): 59 """ Generate the Samblaster bar plot. """ 60 cats = OrderedDict() 61 cats["n_nondups"] = {"name": "Non-duplicates"} 62 cats["n_dups"] = {"name": "Duplicates"} 63 64 pconfig = { 65 "id": "samblaster_duplicates", 66 "title": "Samblaster: Number of duplicate reads", 67 "ylab": "Number of reads", 68 } 69 self.add_section(plot=bargraph.plot(self.samblaster_data, cats, pconfig)) 70 71 def parse_samblaster(self, f): 72 """Go through log file looking for samblaster output. 73 If the 74 Grab the name from the RG tag of the preceding bwa command""" 75 76 # Should capture the following: 77 # samblaster: Marked 1134898 of 43791982 (2.592%) total read ids as duplicates using 753336k \ 78 # memory in 1M1S(60.884S) CPU seconds and 3M53S(233S) wall time. 79 dups_regex = ( 80 r"samblaster: (Removed|Marked)\s+(\d+)\s+of\s+(\d+) \((\d+.\d+)%\)\s*(total)?\s*read ids as duplicates" 81 ) 82 83 input_file_regex = "samblaster: Opening (\S+) for read." 84 rgtag_name_regex = "\\\\tID:(\S*?)\\\\t" 85 data = {} 86 s_name = None 87 fh = f["f"] 88 for l in fh: 89 # try to find name from RG-tag. If bwa mem is used upstream samblaster with pipes, then the bwa mem command 90 # including the read group will be written in the log 91 match = re.search(rgtag_name_regex, l) 92 if match: 93 s_name = self.clean_s_name(match.group(1), f["root"]) 94 95 # try to find name from the input file name, if used 96 match = re.search(input_file_regex, l) 97 if match: 98 basefn = os.path.basename(match.group(1)) 99 fname, ext = os.path.splitext(basefn) 100 # if it's stdin, then try bwa RG-tag instead 101 if fname != "stdin": 102 s_name = self.clean_s_name(fname, f["root"]) 103 104 match = re.search(dups_regex, l) 105 if match: 106 data["n_dups"] = int(match.group(2)) 107 data["n_tot"] = int(match.group(3)) 108 data["n_nondups"] = data["n_tot"] - data["n_dups"] 109 data["pct_dups"] = float(match.group(4)) 110 111 if s_name is None: 112 s_name = f["s_name"] 113 114 if len(data) > 0: 115 if s_name in self.samblaster_data: 116 log.debug("Duplicate sample name found in {}! Overwriting: {}".format(f["fn"], s_name)) 117 self.add_data_source(f, s_name) 118 self.samblaster_data[s_name] = data 119