1#!/usr/bin/env python
2
3""" MultiQC module to parse output from Longranger """
4
5from __future__ import print_function
6from collections import OrderedDict
7import logging
8import re
9import os
10
11from multiqc.modules.base_module import BaseMultiqcModule
12from multiqc.plots import table, bargraph
13
14# Initialise the logger
15log = logging.getLogger(__name__)
16
17
18class MultiqcModule(BaseMultiqcModule):
19    """ Longranger module """
20
21    def __init__(self):
22
23        # Initialise the parent object
24        super(MultiqcModule, self).__init__(
25            name="Long Ranger",
26            anchor="longranger",
27            href="https://www.10xgenomics.com/",
28            info="A set of analysis pipelines that perform sample demultiplexing, "
29            "barcode processing, alignment, quality control, variant calling, phasing, "
30            "and structural variant calling.",
31        )
32
33        def try_float_lambda(x, func, base):
34            try:
35                if func == "*":
36                    return float(x) * base
37                elif func == "/":
38                    return float(x) / base
39                else:
40                    return x
41            except:
42                return x
43
44        self.headers = OrderedDict()
45        self.headers["large_sv_calls"] = {
46            "title": "Large SVs",
47            "description": "Large structural variants called by Longranger. Not including blacklisted regions.",
48            "format": "{:,.0f}",
49            "scale": "PuRd",
50        }
51        self.headers["short_deletion_calls"] = {
52            "title": "Short dels",
53            "description": "Short deletions called by Longranger.",
54            "format": "{:,.0f}",
55            "scale": "PuRd",
56            "hidden": True,
57        }
58        self.headers["genes_phased_lt_100kb"] = {
59            "title": "genes phased < 100kb",
60            "description": "Percentage of genes shorter than 100kb with >1 heterozygous SNP that are phased into a single phase block.",
61            "modify": lambda x: try_float_lambda(x, "*", 100.0),
62            "suffix": "%",
63            "scale": "YlOrRd",
64            "hidden": True,
65        }
66        self.headers["longest_phase_block"] = {
67            "title": "Longest phased",
68            "description": "Size of the longest phase block, in base pairs",
69            "scale": "YlOrRd",
70            "modify": lambda x: try_float_lambda(x, "/", 1000000.0),
71            "suffix": "Mbp",
72            "hidden": True,
73        }
74        self.headers["n50_phase_block"] = {
75            "title": "N50 phased",
76            "description": "N50 length of the called phase blocks, in base pairs.",
77            "modify": lambda x: try_float_lambda(x, "/", 1000000.0),
78            "suffix": "Mbp",
79            "scale": "YlOrRd",
80            "hidden": True,
81        }
82        self.headers["snps_phased"] = {
83            "title": "SNPs phased",
84            "description": "Percentage of called SNPs that were phased.",
85            "modify": lambda x: try_float_lambda(x, "*", 100.0),
86            "suffix": "%",
87            "scale": "PuRd",
88            "hidden": True,
89        }
90        self.headers["median_insert_size"] = {
91            "title": "Insert size",
92            "description": "Median insert size of aligned read pairs.",
93            "format": "{:,.0f}",
94            "suffix": "bp",
95            "scale": "PuBu",
96            "hidden": True,
97        }
98        self.headers["on_target_bases"] = {
99            "title": "On target",
100            "description": "Percentage of aligned bases mapped with the target regions in targeted mode. Only bases inside the intervals of target BED file are counted.",
101            "suffix": "%",
102            "modify": lambda x: try_float_lambda(x, "*", 100.0),
103            "scale": "Greens",
104        }
105        self.headers["zero_coverage"] = {
106            "title": "Zero cov",
107            "description": "Percentage of non-N bases in the genome with zero coverage.",
108            "modify": lambda x: try_float_lambda(x, "*", 100.0),
109            "suffix": "%",
110            "max": 100.0,
111            "min": 0.0,
112            "scale": "RdGy-rev",
113        }
114        self.headers["mean_depth"] = {
115            "title": "Depth",
116            "description": "Mean read depth, including PCR duplicate reads. In WGS mode, this is measured across the genome; in targeted mode, this is the measure inside targeted regions.",
117            "suffix": "X",
118            "scale": "PuBu",
119        }
120        self.headers["pcr_duplication"] = {
121            "title": "PCR Dup",
122            "description": "Percentage of reads marked as PCR duplicates. To be marked as PCR duplicates, reads must have the same mapping extents on the genome and the same 10x barcode.",
123            "suffix": "%",
124            "min": 15.0,
125            "modify": lambda x: try_float_lambda(x, "*", 100.0),
126            "scale": "RdGy-rev",
127            "hidden": True,
128        }
129        self.headers["mapped_reads"] = {
130            "title": "Mapped",
131            "modify": lambda x: try_float_lambda(x, "*", 100.0),
132            "suffix": "%",
133            "description": "Percentage of input reads that were mapped to the reference genome.",
134            "scale": "PuBu",
135            "hidden": True,
136        }
137        self.headers["number_reads"] = {
138            "title": "M Reads",
139            "modify": lambda x: try_float_lambda(x, "/", 1000000.0),
140            "description": "Total number of reads supplied to Long Ranger. (millions)",
141            "scale": "PuBu",
142            "hidden": True,
143        }
144        self.headers["molecule_length_mean"] = {
145            "title": "Mol size",
146            "description": "The length-weighted mean input DNA length in base pairs.",
147            "modify": lambda x: try_float_lambda(x, "/", 1000.0),
148            "suffix": "Kbp",
149            "scale": "YlGn",
150        }
151        self.headers["molecule_length_stddev"] = {
152            "title": "Mol stddev",
153            "description": "The length-weighted standard deviation of the input DNA length distribution in base pairs.",
154            "modify": lambda x: try_float_lambda(x, "/", 1000.0),
155            "suffix": "Kbp",
156            "scale": "YlGn",
157            "hidden": True,
158        }
159        self.headers["n50_linked_reads_per_molecule"] = {
160            "title": "N50 read per mol.",
161            "description": "The N50 number of read-pairs per input DNA molecule. Half of read-pairs came from molecules with this many or greater read-pairs.",
162            "scale": "BuGn",
163            "hidden": True,
164        }
165        self.headers["r1_q30_bases_fract"] = {
166            "title": "% R1 >= Q30",
167            "description": "Percentage of bases in R1 with base quality >= 30.",
168            "hidden": True,
169            "suffix": "%",
170            "modify": lambda x: try_float_lambda(x, "*", 100.0),
171            "scale": "Purples",
172        }
173        self.headers["r2_q30_bases_fract"] = {
174            "title": "% R2 >= Q30",
175            "description": "Percentage of bases in R2 with base quality >= 30.",
176            "suffix": "%",
177            "modify": lambda x: try_float_lambda(x, "*", 100.0),
178            "scale": "Purples",
179            "hidden": True,
180        }
181        self.headers["bc_on_whitelist"] = {
182            "title": "Valid BCs",
183            "description": "The Percentage of reads that carried a valid 10x barcode sequence.",
184            "modify": lambda x: try_float_lambda(x, "*", 100.0),
185            "suffix": "%",
186            "scale": "BuPu",
187            "hidden": True,
188        }
189        self.headers["bc_q30_bases_fract"] = {
190            "title": "BC Q30",
191            "description": "Percentage of bases in the barcode with base quality >= 30.",
192            "suffix": "%",
193            "modify": lambda x: try_float_lambda(x, "*", 100.0),
194            "scale": "Purples",
195            "hidden": True,
196        }
197        self.headers["bc_mean_qscore"] = {
198            "title": "BC Qscore",
199            "description": "The mean base quality value on the barcode bases.",
200            "scale": "BuPu",
201            "hidden": True,
202        }
203        self.headers["mean_dna_per_gem"] = {
204            "title": "DNA per gem",
205            "description": "The average number of base pairs of genomic DNA loaded into each GEM. This metric is based on the observed extents of read-pairs on each molecule.",
206            "modify": lambda x: try_float_lambda(x, "/", 1000000.0),
207            "suffix": "Mbp",
208            "scale": "OrRd",
209            "hidden": True,
210        }
211        self.headers["gems_detected"] = {
212            "title": "M Gems",
213            "description": "The number of Chromium GEMs that were collected and which generated a non-trivial number of read-pairs. (millions)",
214            "modify": lambda x: try_float_lambda(x, "/", 1000000.0),
215            "scale": "OrRd",
216        }
217        self.headers["corrected_loaded_mass_ng"] = {
218            "title": "Loaded (corrected)",
219            "description": "The estimated number of nanograms of DNA loaded into the input well of the Chromium chip. This metric is calculated by measuring the mean amount of DNA covered by input molecules in each GEM, then multiplying by the ratio of the chip input to the sample volume in each GEM.",
220            "suffix": "ng",
221            "scale": "RdYlGn",
222        }
223        self.headers["loaded_mass_ng"] = {
224            "title": "Loaded",
225            "description": "This metric was found to overestimate the true loading by a factor of 1.6, due primarily to denaturation of the input DNA.",
226            "suffix": "ng",
227            "scale": "RdYlGn",
228        }
229        self.headers["instrument_ids"] = {
230            "title": "Instrument ID",
231            "description": "The list of instrument IDs used to generate the input reads.",
232            "scale": False,
233            "hidden": True,
234        }
235        self.headers["longranger_version"] = {
236            "title": "Long Ranger Version",
237            "description": "The version of the Longranger software used to generate the results.",
238            "scale": False,
239        }
240
241        ### Parse the data
242        self.longranger_data = dict()
243        self.paths_dict = dict()
244        for f in self.find_log_files("longranger/invocation"):
245            sid = self.parse_invocation(f["f"])
246            self.paths_dict[os.path.basename(f["root"])] = sid
247
248        running_name = 1
249        for f in self.find_log_files("longranger/summary"):
250            data = self.parse_summary(f["f"])
251            updir, _ = os.path.split(f["root"])
252            base_updir = os.path.basename(updir)
253            sid = "longranger#{}".format(running_name)
254            if base_updir in self.paths_dict.keys():
255                sid = self.paths_dict[base_updir]
256            else:
257                log.debug("Did not find _invocation file: {}".format(f["fn"]))
258                running_name += 1
259
260            self.longranger_data[sid] = data
261
262        # Filter to strip out ignored sample names
263        self.longranger_data = self.ignore_samples(self.longranger_data)
264
265        if len(self.longranger_data) == 0:
266            raise UserWarning
267        log.info("Found {} reports".format(len(self.longranger_data.keys())))
268
269        # Write parsed report data to a file
270        self.write_data_file(self.longranger_data, "multiqc_longranger")
271
272        # Add a longranger versions column if not all the same
273        longranger_versions = set([d["longranger_version"] for d in self.longranger_data.values()])
274        version_str = ""
275        if len(longranger_versions) == 1:
276            version_str = " All samples were processed using Longranger version {}".format(list(longranger_versions)[0])
277            del self.headers["longranger_version"]
278
279        ### Write the table
280        config_table = {"id": "longranger_table", "namespace": "longranger"}
281        self.add_section(
282            name="Run stats",
283            anchor="longranger-run-stats",
284            description="Statistics gathered from Longranger reports. "
285            "There are more columns available but they are hidden by default." + version_str,
286            helptext="""Parses the files `summary.csv` and `_invocation` found in the
287                    output directory of Longranger. If `_invocation` is not found
288                    the sample IDs will be missing and they will be given a running
289                    number. E.g., `longranger#1` and `longranger#2`.""",
290            plot=table.plot(self.longranger_data, self.headers, config_table),
291        )
292
293        ### Bar plot of phasing stats
294        phase_pdata = {}
295        snps_phased_pct = {}
296        genes_phased_pct = {}
297        for s_name in self.longranger_data:
298            try:
299                phase_pdata[s_name] = {
300                    "longest_phase_block": float(self.longranger_data[s_name]["longest_phase_block"]),
301                    "n50_phase_block": float(self.longranger_data[s_name]["n50_phase_block"]),
302                }
303            except:
304                pass
305            try:
306                snps_phased_pct[s_name] = {
307                    "snps_phased_pct": float(self.longranger_data[s_name]["snps_phased"]) * 100.0
308                }
309            except:
310                pass
311            try:
312                genes_phased_pct[s_name] = {
313                    "genes_phased_pct": float(self.longranger_data[s_name]["genes_phased_lt_100kb"]) * 100.0
314                }
315            except:
316                pass
317        phase_plot_cats = [OrderedDict(), OrderedDict(), OrderedDict()]
318        phase_plot_cats[0]["longest_phase_block"] = {"name": "Longest Phase Block"}
319        phase_plot_cats[0]["n50_phase_block"] = {"name": "N50 of Phase Blocks"}
320        phase_plot_cats[1]["snps_phased_pct"] = {"name": "% SNPs Phased"}
321        phase_plot_cats[2]["genes_phased_pct"] = {"name": "% Genes < 100kbp in a single phase block"}
322        if len(phase_pdata) > 0:
323            self.add_section(
324                name="Phasing",
325                anchor="longranger-phasing",
326                description="Phasing performance from Long Ranger. Genes are only considered if &le; 100kbp in length and with at least one heterozygous SNP.",
327                helptext="""
328                        * Longest phased
329                            * Size of the longest phase block, in base pairs
330                        * N50 phased
331                            * N50 length of the called phase blocks, in base pairs.
332                        * % SNPs phased
333                            * Percentage of called SNPs that were phased.
334                        * % Genes Phased
335                            * Percentage of genes shorter than 100kb with >1 heterozygous SNP that are phased into a single phase block.
336                        """,
337                plot=bargraph.plot(
338                    [phase_pdata, snps_phased_pct, genes_phased_pct],
339                    phase_plot_cats,
340                    {
341                        "id": "longranger-phasing-plot",
342                        "title": "Long Ranger: Phasing Statistics",
343                        "data_labels": [
344                            {"name": "N50 Phased", "ylab": "N50 of called phase blocks (bp)"},
345                            {"name": "% SNPs Phased", "ylab": "% SNPs Phased", "ymax": 100},
346                            {"name": "% Genes Phased", "ylab": "% Genes Phased", "ymax": 100},
347                        ],
348                        "cpswitch": False,
349                        "stacking": None,
350                        "ylab": "N50 of called phase blocks (bp)",
351                    },
352                ),
353            )
354
355        ### Bar plot of mapping statistics
356        mapping_counts_data = {}
357        for s_name in self.longranger_data:
358            mapped_reads = float(self.longranger_data[s_name]["number_reads"]) * float(
359                self.longranger_data[s_name]["mapped_reads"]
360            )
361            unmapped_reads = float(self.longranger_data[s_name]["number_reads"]) - mapped_reads
362            dup_reads = mapped_reads * float(self.longranger_data[s_name]["pcr_duplication"])
363            unique_reads = mapped_reads - dup_reads
364            mapping_counts_data[s_name] = {
365                "unique_reads": unique_reads,
366                "dup_reads": dup_reads,
367                "unmapped_reads": unmapped_reads,
368            }
369        mapping_counts_cats = OrderedDict()
370        mapping_counts_cats["unique_reads"] = {"name": "Uniquely Aligned Reads", "color": "#437bb1"}
371        mapping_counts_cats["dup_reads"] = {"name": "PCR Duplicate Aligned Reads", "color": "#7cb5ec"}
372        mapping_counts_cats["unmapped_reads"] = {"name": "Unaligned Reads", "color": "#7f0000"}
373        self.add_section(
374            name="Alignment",
375            anchor="longranger-alignment",
376            description="Long Ranger alignment against the reference genome. To be marked as PCR duplicates, reads must have the same mapping extents on the genome and the same 10x barcode.",
377            plot=bargraph.plot(
378                mapping_counts_data,
379                mapping_counts_cats,
380                {
381                    "id": "longranger-alignment-plot",
382                    "title": "Long Ranger: Alignment Statistics",
383                    "ylab": "Reads Counts",
384                    "cpswitch_counts_label": "Read Counts",
385                },
386            ),
387        )
388
389    def parse_invocation(self, content):
390        sid_pat = re.compile('    sample_id = "(.*)",')
391
392        sid = None
393        for l in content.splitlines():
394            sid_m = re.match(sid_pat, l)
395            if sid_m is not None:
396                sid = sid_m.groups()[0]
397        return sid
398
399    def parse_summary(self, content):
400
401        out_dict = OrderedDict()
402        lines = content.splitlines()
403        data = list(zip(lines[0].strip().split(","), lines[1].strip().split(",")))
404        for i, j in data:
405            if j != "":
406                try:
407                    out_dict[i] = float(j)
408                except ValueError:
409                    out_dict[i] = j
410
411        return out_dict
412