1#!/usr/bin/env python
2
3""" MultiQC module to parse output from Samblaster """
4
5from __future__ import print_function
6import os
7from collections import OrderedDict
8import logging
9import re
10from multiqc.plots import bargraph
11from multiqc.modules.base_module import BaseMultiqcModule
12
13# Initialise the logger
14log = logging.getLogger(__name__)
15
16
17class MultiqcModule(BaseMultiqcModule):
18    """ Samblaster """
19
20    def __init__(self):
21        # Initialise the parent object
22        super(MultiqcModule, self).__init__(
23            name="Samblaster",
24            anchor="samblaster",
25            href="https://github.com/GregoryFaust/samblaster",
26            info="is a tool to mark duplicates and extract discordant and split reads from sam files.",
27        )
28
29        self.samblaster_data = dict()
30        for f in self.find_log_files("samblaster", filehandles=True):
31            self.parse_samblaster(f)
32
33        # Filter to strip out ignored sample names
34        self.samblaster_data = self.ignore_samples(self.samblaster_data)
35
36        if len(self.samblaster_data) == 0:
37            raise UserWarning
38
39        headers = OrderedDict()
40        headers["pct_dups"] = {
41            "title": "% Dups",
42            "description": "Percent Duplication",
43            "max": 100,
44            "min": 0,
45            "suffix": "%",
46            "scale": "OrRd",
47        }
48
49        self.general_stats_addcols(self.samblaster_data, headers)
50
51        # Write parsed report data to a file
52        self.write_data_file(self.samblaster_data, "multiqc_samblaster")
53
54        log.info("Found {} reports".format(len(self.samblaster_data)))
55
56        self.add_barplot()
57
58    def add_barplot(self):
59        """ Generate the Samblaster bar plot. """
60        cats = OrderedDict()
61        cats["n_nondups"] = {"name": "Non-duplicates"}
62        cats["n_dups"] = {"name": "Duplicates"}
63
64        pconfig = {
65            "id": "samblaster_duplicates",
66            "title": "Samblaster: Number of duplicate reads",
67            "ylab": "Number of reads",
68        }
69        self.add_section(plot=bargraph.plot(self.samblaster_data, cats, pconfig))
70
71    def parse_samblaster(self, f):
72        """Go through log file looking for samblaster output.
73        If the
74        Grab the name from the RG tag of the preceding bwa command"""
75
76        # Should capture the following:
77        # samblaster: Marked     1134898 of   43791982 (2.592%) total read ids as duplicates using 753336k \
78        # memory in 1M1S(60.884S) CPU seconds and 3M53S(233S) wall time.
79        dups_regex = (
80            r"samblaster: (Removed|Marked)\s+(\d+)\s+of\s+(\d+) \((\d+.\d+)%\)\s*(total)?\s*read ids as duplicates"
81        )
82
83        input_file_regex = "samblaster: Opening (\S+) for read."
84        rgtag_name_regex = "\\\\tID:(\S*?)\\\\t"
85        data = {}
86        s_name = None
87        fh = f["f"]
88        for l in fh:
89            # try to find name from RG-tag. If bwa mem is used upstream samblaster with pipes, then the bwa mem command
90            # including the read group will be written in the log
91            match = re.search(rgtag_name_regex, l)
92            if match:
93                s_name = self.clean_s_name(match.group(1), f["root"])
94
95            # try to find name from the input file name, if used
96            match = re.search(input_file_regex, l)
97            if match:
98                basefn = os.path.basename(match.group(1))
99                fname, ext = os.path.splitext(basefn)
100                # if it's stdin, then try bwa RG-tag instead
101                if fname != "stdin":
102                    s_name = self.clean_s_name(fname, f["root"])
103
104            match = re.search(dups_regex, l)
105            if match:
106                data["n_dups"] = int(match.group(2))
107                data["n_tot"] = int(match.group(3))
108                data["n_nondups"] = data["n_tot"] - data["n_dups"]
109                data["pct_dups"] = float(match.group(4))
110
111        if s_name is None:
112            s_name = f["s_name"]
113
114        if len(data) > 0:
115            if s_name in self.samblaster_data:
116                log.debug("Duplicate sample name found in {}! Overwriting: {}".format(f["fn"], s_name))
117            self.add_data_source(f, s_name)
118            self.samblaster_data[s_name] = data
119