1#!/usr/bin/env python 2 3""" MultiQC module to parse output from Longranger """ 4 5from __future__ import print_function 6from collections import OrderedDict 7import logging 8import re 9import os 10 11from multiqc.modules.base_module import BaseMultiqcModule 12from multiqc.plots import table, bargraph 13 14# Initialise the logger 15log = logging.getLogger(__name__) 16 17 18class MultiqcModule(BaseMultiqcModule): 19 """ Longranger module """ 20 21 def __init__(self): 22 23 # Initialise the parent object 24 super(MultiqcModule, self).__init__( 25 name="Long Ranger", 26 anchor="longranger", 27 href="https://www.10xgenomics.com/", 28 info="A set of analysis pipelines that perform sample demultiplexing, " 29 "barcode processing, alignment, quality control, variant calling, phasing, " 30 "and structural variant calling.", 31 ) 32 33 def try_float_lambda(x, func, base): 34 try: 35 if func == "*": 36 return float(x) * base 37 elif func == "/": 38 return float(x) / base 39 else: 40 return x 41 except: 42 return x 43 44 self.headers = OrderedDict() 45 self.headers["large_sv_calls"] = { 46 "title": "Large SVs", 47 "description": "Large structural variants called by Longranger. Not including blacklisted regions.", 48 "format": "{:,.0f}", 49 "scale": "PuRd", 50 } 51 self.headers["short_deletion_calls"] = { 52 "title": "Short dels", 53 "description": "Short deletions called by Longranger.", 54 "format": "{:,.0f}", 55 "scale": "PuRd", 56 "hidden": True, 57 } 58 self.headers["genes_phased_lt_100kb"] = { 59 "title": "genes phased < 100kb", 60 "description": "Percentage of genes shorter than 100kb with >1 heterozygous SNP that are phased into a single phase block.", 61 "modify": lambda x: try_float_lambda(x, "*", 100.0), 62 "suffix": "%", 63 "scale": "YlOrRd", 64 "hidden": True, 65 } 66 self.headers["longest_phase_block"] = { 67 "title": "Longest phased", 68 "description": "Size of the longest phase block, in base pairs", 69 "scale": "YlOrRd", 70 "modify": lambda x: try_float_lambda(x, "/", 1000000.0), 71 "suffix": "Mbp", 72 "hidden": True, 73 } 74 self.headers["n50_phase_block"] = { 75 "title": "N50 phased", 76 "description": "N50 length of the called phase blocks, in base pairs.", 77 "modify": lambda x: try_float_lambda(x, "/", 1000000.0), 78 "suffix": "Mbp", 79 "scale": "YlOrRd", 80 "hidden": True, 81 } 82 self.headers["snps_phased"] = { 83 "title": "SNPs phased", 84 "description": "Percentage of called SNPs that were phased.", 85 "modify": lambda x: try_float_lambda(x, "*", 100.0), 86 "suffix": "%", 87 "scale": "PuRd", 88 "hidden": True, 89 } 90 self.headers["median_insert_size"] = { 91 "title": "Insert size", 92 "description": "Median insert size of aligned read pairs.", 93 "format": "{:,.0f}", 94 "suffix": "bp", 95 "scale": "PuBu", 96 "hidden": True, 97 } 98 self.headers["on_target_bases"] = { 99 "title": "On target", 100 "description": "Percentage of aligned bases mapped with the target regions in targeted mode. Only bases inside the intervals of target BED file are counted.", 101 "suffix": "%", 102 "modify": lambda x: try_float_lambda(x, "*", 100.0), 103 "scale": "Greens", 104 } 105 self.headers["zero_coverage"] = { 106 "title": "Zero cov", 107 "description": "Percentage of non-N bases in the genome with zero coverage.", 108 "modify": lambda x: try_float_lambda(x, "*", 100.0), 109 "suffix": "%", 110 "max": 100.0, 111 "min": 0.0, 112 "scale": "RdGy-rev", 113 } 114 self.headers["mean_depth"] = { 115 "title": "Depth", 116 "description": "Mean read depth, including PCR duplicate reads. In WGS mode, this is measured across the genome; in targeted mode, this is the measure inside targeted regions.", 117 "suffix": "X", 118 "scale": "PuBu", 119 } 120 self.headers["pcr_duplication"] = { 121 "title": "PCR Dup", 122 "description": "Percentage of reads marked as PCR duplicates. To be marked as PCR duplicates, reads must have the same mapping extents on the genome and the same 10x barcode.", 123 "suffix": "%", 124 "min": 15.0, 125 "modify": lambda x: try_float_lambda(x, "*", 100.0), 126 "scale": "RdGy-rev", 127 "hidden": True, 128 } 129 self.headers["mapped_reads"] = { 130 "title": "Mapped", 131 "modify": lambda x: try_float_lambda(x, "*", 100.0), 132 "suffix": "%", 133 "description": "Percentage of input reads that were mapped to the reference genome.", 134 "scale": "PuBu", 135 "hidden": True, 136 } 137 self.headers["number_reads"] = { 138 "title": "M Reads", 139 "modify": lambda x: try_float_lambda(x, "/", 1000000.0), 140 "description": "Total number of reads supplied to Long Ranger. (millions)", 141 "scale": "PuBu", 142 "hidden": True, 143 } 144 self.headers["molecule_length_mean"] = { 145 "title": "Mol size", 146 "description": "The length-weighted mean input DNA length in base pairs.", 147 "modify": lambda x: try_float_lambda(x, "/", 1000.0), 148 "suffix": "Kbp", 149 "scale": "YlGn", 150 } 151 self.headers["molecule_length_stddev"] = { 152 "title": "Mol stddev", 153 "description": "The length-weighted standard deviation of the input DNA length distribution in base pairs.", 154 "modify": lambda x: try_float_lambda(x, "/", 1000.0), 155 "suffix": "Kbp", 156 "scale": "YlGn", 157 "hidden": True, 158 } 159 self.headers["n50_linked_reads_per_molecule"] = { 160 "title": "N50 read per mol.", 161 "description": "The N50 number of read-pairs per input DNA molecule. Half of read-pairs came from molecules with this many or greater read-pairs.", 162 "scale": "BuGn", 163 "hidden": True, 164 } 165 self.headers["r1_q30_bases_fract"] = { 166 "title": "% R1 >= Q30", 167 "description": "Percentage of bases in R1 with base quality >= 30.", 168 "hidden": True, 169 "suffix": "%", 170 "modify": lambda x: try_float_lambda(x, "*", 100.0), 171 "scale": "Purples", 172 } 173 self.headers["r2_q30_bases_fract"] = { 174 "title": "% R2 >= Q30", 175 "description": "Percentage of bases in R2 with base quality >= 30.", 176 "suffix": "%", 177 "modify": lambda x: try_float_lambda(x, "*", 100.0), 178 "scale": "Purples", 179 "hidden": True, 180 } 181 self.headers["bc_on_whitelist"] = { 182 "title": "Valid BCs", 183 "description": "The Percentage of reads that carried a valid 10x barcode sequence.", 184 "modify": lambda x: try_float_lambda(x, "*", 100.0), 185 "suffix": "%", 186 "scale": "BuPu", 187 "hidden": True, 188 } 189 self.headers["bc_q30_bases_fract"] = { 190 "title": "BC Q30", 191 "description": "Percentage of bases in the barcode with base quality >= 30.", 192 "suffix": "%", 193 "modify": lambda x: try_float_lambda(x, "*", 100.0), 194 "scale": "Purples", 195 "hidden": True, 196 } 197 self.headers["bc_mean_qscore"] = { 198 "title": "BC Qscore", 199 "description": "The mean base quality value on the barcode bases.", 200 "scale": "BuPu", 201 "hidden": True, 202 } 203 self.headers["mean_dna_per_gem"] = { 204 "title": "DNA per gem", 205 "description": "The average number of base pairs of genomic DNA loaded into each GEM. This metric is based on the observed extents of read-pairs on each molecule.", 206 "modify": lambda x: try_float_lambda(x, "/", 1000000.0), 207 "suffix": "Mbp", 208 "scale": "OrRd", 209 "hidden": True, 210 } 211 self.headers["gems_detected"] = { 212 "title": "M Gems", 213 "description": "The number of Chromium GEMs that were collected and which generated a non-trivial number of read-pairs. (millions)", 214 "modify": lambda x: try_float_lambda(x, "/", 1000000.0), 215 "scale": "OrRd", 216 } 217 self.headers["corrected_loaded_mass_ng"] = { 218 "title": "Loaded (corrected)", 219 "description": "The estimated number of nanograms of DNA loaded into the input well of the Chromium chip. This metric is calculated by measuring the mean amount of DNA covered by input molecules in each GEM, then multiplying by the ratio of the chip input to the sample volume in each GEM.", 220 "suffix": "ng", 221 "scale": "RdYlGn", 222 } 223 self.headers["loaded_mass_ng"] = { 224 "title": "Loaded", 225 "description": "This metric was found to overestimate the true loading by a factor of 1.6, due primarily to denaturation of the input DNA.", 226 "suffix": "ng", 227 "scale": "RdYlGn", 228 } 229 self.headers["instrument_ids"] = { 230 "title": "Instrument ID", 231 "description": "The list of instrument IDs used to generate the input reads.", 232 "scale": False, 233 "hidden": True, 234 } 235 self.headers["longranger_version"] = { 236 "title": "Long Ranger Version", 237 "description": "The version of the Longranger software used to generate the results.", 238 "scale": False, 239 } 240 241 ### Parse the data 242 self.longranger_data = dict() 243 self.paths_dict = dict() 244 for f in self.find_log_files("longranger/invocation"): 245 sid = self.parse_invocation(f["f"]) 246 self.paths_dict[os.path.basename(f["root"])] = sid 247 248 running_name = 1 249 for f in self.find_log_files("longranger/summary"): 250 data = self.parse_summary(f["f"]) 251 updir, _ = os.path.split(f["root"]) 252 base_updir = os.path.basename(updir) 253 sid = "longranger#{}".format(running_name) 254 if base_updir in self.paths_dict.keys(): 255 sid = self.paths_dict[base_updir] 256 else: 257 log.debug("Did not find _invocation file: {}".format(f["fn"])) 258 running_name += 1 259 260 self.longranger_data[sid] = data 261 262 # Filter to strip out ignored sample names 263 self.longranger_data = self.ignore_samples(self.longranger_data) 264 265 if len(self.longranger_data) == 0: 266 raise UserWarning 267 log.info("Found {} reports".format(len(self.longranger_data.keys()))) 268 269 # Write parsed report data to a file 270 self.write_data_file(self.longranger_data, "multiqc_longranger") 271 272 # Add a longranger versions column if not all the same 273 longranger_versions = set([d["longranger_version"] for d in self.longranger_data.values()]) 274 version_str = "" 275 if len(longranger_versions) == 1: 276 version_str = " All samples were processed using Longranger version {}".format(list(longranger_versions)[0]) 277 del self.headers["longranger_version"] 278 279 ### Write the table 280 config_table = {"id": "longranger_table", "namespace": "longranger"} 281 self.add_section( 282 name="Run stats", 283 anchor="longranger-run-stats", 284 description="Statistics gathered from Longranger reports. " 285 "There are more columns available but they are hidden by default." + version_str, 286 helptext="""Parses the files `summary.csv` and `_invocation` found in the 287 output directory of Longranger. If `_invocation` is not found 288 the sample IDs will be missing and they will be given a running 289 number. E.g., `longranger#1` and `longranger#2`.""", 290 plot=table.plot(self.longranger_data, self.headers, config_table), 291 ) 292 293 ### Bar plot of phasing stats 294 phase_pdata = {} 295 snps_phased_pct = {} 296 genes_phased_pct = {} 297 for s_name in self.longranger_data: 298 try: 299 phase_pdata[s_name] = { 300 "longest_phase_block": float(self.longranger_data[s_name]["longest_phase_block"]), 301 "n50_phase_block": float(self.longranger_data[s_name]["n50_phase_block"]), 302 } 303 except: 304 pass 305 try: 306 snps_phased_pct[s_name] = { 307 "snps_phased_pct": float(self.longranger_data[s_name]["snps_phased"]) * 100.0 308 } 309 except: 310 pass 311 try: 312 genes_phased_pct[s_name] = { 313 "genes_phased_pct": float(self.longranger_data[s_name]["genes_phased_lt_100kb"]) * 100.0 314 } 315 except: 316 pass 317 phase_plot_cats = [OrderedDict(), OrderedDict(), OrderedDict()] 318 phase_plot_cats[0]["longest_phase_block"] = {"name": "Longest Phase Block"} 319 phase_plot_cats[0]["n50_phase_block"] = {"name": "N50 of Phase Blocks"} 320 phase_plot_cats[1]["snps_phased_pct"] = {"name": "% SNPs Phased"} 321 phase_plot_cats[2]["genes_phased_pct"] = {"name": "% Genes < 100kbp in a single phase block"} 322 if len(phase_pdata) > 0: 323 self.add_section( 324 name="Phasing", 325 anchor="longranger-phasing", 326 description="Phasing performance from Long Ranger. Genes are only considered if ≤ 100kbp in length and with at least one heterozygous SNP.", 327 helptext=""" 328 * Longest phased 329 * Size of the longest phase block, in base pairs 330 * N50 phased 331 * N50 length of the called phase blocks, in base pairs. 332 * % SNPs phased 333 * Percentage of called SNPs that were phased. 334 * % Genes Phased 335 * Percentage of genes shorter than 100kb with >1 heterozygous SNP that are phased into a single phase block. 336 """, 337 plot=bargraph.plot( 338 [phase_pdata, snps_phased_pct, genes_phased_pct], 339 phase_plot_cats, 340 { 341 "id": "longranger-phasing-plot", 342 "title": "Long Ranger: Phasing Statistics", 343 "data_labels": [ 344 {"name": "N50 Phased", "ylab": "N50 of called phase blocks (bp)"}, 345 {"name": "% SNPs Phased", "ylab": "% SNPs Phased", "ymax": 100}, 346 {"name": "% Genes Phased", "ylab": "% Genes Phased", "ymax": 100}, 347 ], 348 "cpswitch": False, 349 "stacking": None, 350 "ylab": "N50 of called phase blocks (bp)", 351 }, 352 ), 353 ) 354 355 ### Bar plot of mapping statistics 356 mapping_counts_data = {} 357 for s_name in self.longranger_data: 358 mapped_reads = float(self.longranger_data[s_name]["number_reads"]) * float( 359 self.longranger_data[s_name]["mapped_reads"] 360 ) 361 unmapped_reads = float(self.longranger_data[s_name]["number_reads"]) - mapped_reads 362 dup_reads = mapped_reads * float(self.longranger_data[s_name]["pcr_duplication"]) 363 unique_reads = mapped_reads - dup_reads 364 mapping_counts_data[s_name] = { 365 "unique_reads": unique_reads, 366 "dup_reads": dup_reads, 367 "unmapped_reads": unmapped_reads, 368 } 369 mapping_counts_cats = OrderedDict() 370 mapping_counts_cats["unique_reads"] = {"name": "Uniquely Aligned Reads", "color": "#437bb1"} 371 mapping_counts_cats["dup_reads"] = {"name": "PCR Duplicate Aligned Reads", "color": "#7cb5ec"} 372 mapping_counts_cats["unmapped_reads"] = {"name": "Unaligned Reads", "color": "#7f0000"} 373 self.add_section( 374 name="Alignment", 375 anchor="longranger-alignment", 376 description="Long Ranger alignment against the reference genome. To be marked as PCR duplicates, reads must have the same mapping extents on the genome and the same 10x barcode.", 377 plot=bargraph.plot( 378 mapping_counts_data, 379 mapping_counts_cats, 380 { 381 "id": "longranger-alignment-plot", 382 "title": "Long Ranger: Alignment Statistics", 383 "ylab": "Reads Counts", 384 "cpswitch_counts_label": "Read Counts", 385 }, 386 ), 387 ) 388 389 def parse_invocation(self, content): 390 sid_pat = re.compile(' sample_id = "(.*)",') 391 392 sid = None 393 for l in content.splitlines(): 394 sid_m = re.match(sid_pat, l) 395 if sid_m is not None: 396 sid = sid_m.groups()[0] 397 return sid 398 399 def parse_summary(self, content): 400 401 out_dict = OrderedDict() 402 lines = content.splitlines() 403 data = list(zip(lines[0].strip().split(","), lines[1].strip().split(","))) 404 for i, j in data: 405 if j != "": 406 try: 407 out_dict[i] = float(j) 408 except ValueError: 409 out_dict[i] = j 410 411 return out_dict 412