1#!/usr/bin/env Rscript 2### 3# The MIT License 4# 5# Copyright (C) 2017-2021 Giulio Genovese 6# 7# Author: Giulio Genovese <giulio.genovese@gmail.com> 8# 9# Permission is hereby granted, free of charge, to any person obtaining a copy 10# of this software and associated documentation files (the "Software"), to deal 11# in the Software without restriction, including without limitation the rights 12# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 13# copies of the Software, and to permit persons to whom the Software is 14# furnished to do so, subject to the following conditions: 15# 16# The above copyright notice and this permission notice shall be included in 17# all copies or substantial portions of the Software. 18# 19# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 22# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 25# THE SOFTWARE. 26### 27 28summary_plot_version <- '2021-10-15' 29 30library(optparse) 31library(ggplot2) 32options(bitmapType = 'cairo') 33 34parser <- OptionParser('usage: summary_plot.R [options] --stats <file.tsv> --calls <file.tsv> --pdf <file.pdf>') 35parser <- add_option(parser, c('--stats'), type = 'character', help = 'input MoChA stats file', metavar = '<file.tsv>') 36parser <- add_option(parser, c('--calls'), type = 'character', help = 'input MoChA calls file', metavar = '<file.tsv>') 37parser <- add_option(parser, c('--call-rate-thr'), type = 'double', default = 0.97, help = 'minimum call rate threshold [0.97]', metavar = '<float>') 38parser <- add_option(parser, c('--baf-auto-thr'), type = 'double', default = 0.03, help = 'maximum BAF autocorrelation threshold [0.03]', metavar = '<float>') 39parser <- add_option(parser, c('--pdf'), type = 'character', help = 'output PDF file', metavar = '<file.pdf>') 40parser <- add_option(parser, c('--width'), type = 'integer', default = 7, help = 'inches width of the output file [7]', metavar = '<integer>') 41parser <- add_option(parser, c('--height'), type = 'integer', default = 7, help = 'inches height of the output file [7]', metavar = '<integer>') 42parser <- add_option(parser, c('--fontsize'), type = 'integer', default = 16, help = 'font size [16]', metavar = '<integer>') 43args <- parse_args(parser, commandArgs(trailingOnly = TRUE), convert_hyphens_to_underscores = TRUE) 44 45write(paste('summary_plot.R', summary_plot_version, 'https://github.com/freeseek/mocha'), stderr()) 46 47if (is.null(args$stats)) {print_help(parser); stop('option --stats is required')} 48if (is.null(args$calls)) {print_help(parser); stop('option --calls is required')} 49if (is.null(args$pdf)) {print_help(parser); stop('option --pdf is required')} 50 51df_stats <- read.table(args$stats, sep = '\t', header = TRUE) 52df_calls <- read.table(args$calls, sep = '\t', header = TRUE) 53df_calls$chrom <- as.factor(gsub('^chr', '', gsub('^chrM', 'MT', df_calls$chrom))) 54ord <- order(as.numeric(gsub('MT', '26', gsub('Y', '24', gsub('X', '23', levels(df_calls$chrom)))))) 55df_calls$chrom <- factor(df_calls$chrom, levels(df_calls$chrom)[ord]) 56 57if ('lrr_median' %in% colnames(df_stats)) { 58 model <- 'array' 59 df_stats$x_nonpar_adj_lrr_median <- df_stats$x_nonpar_lrr_median - df_stats$lrr_median 60 df_stats$y_nonpar_adj_lrr_median <- df_stats$y_nonpar_lrr_median - df_stats$lrr_median 61 df_stats$mt_adj_lrr_median <- df_stats$mt_lrr_median - df_stats$lrr_median 62} else { 63 model <- 'wgs' 64 df_stats$x_nonpar_adj_lrr_median <- log(df_stats$x_nonpar_cov_median) - log(df_stats$cov_median) 65 df_stats$y_nonpar_adj_lrr_median <- log(df_stats$y_nonpar_cov_median) - log(df_stats$cov_median) 66 df_stats$mt_adj_lrr_median <- log(df_stats$mt_cov_median) - log(df_stats$cov_median) 67} 68 69df_calls$sv <- factor(df_calls$chrom, levels = c('<0.5 Mbp', '0.5-5 Mbp', '5-50 Mbp', '>50 Mbp')) 70df_calls$sv[df_calls$length <= 5e8] <- '>50 Mbp' 71df_calls$sv[df_calls$length <= 5e7] <- '5-50 Mbp' 72df_calls$sv[df_calls$length <= 5e6] <- '0.5-5 Mbp' 73df_calls$sv[df_calls$length <= 5e5] <- '<0.5 Mbp' 74 75# avoid plotting BDEV for calls with not enough heterozygous sites 76df_calls$bdev[df_calls$n_hets < 5] <- NaN 77df_calls$bdev[is.na(df_calls$bdev)] <- 0 78 79pdf(args$pdf, width = args$width, height = args$height) 80 81idx <- !( df_calls$sample_id %in% df_stats$sample_id[df_stats$call_rate < args$call_rate_thr | df_stats$baf_auto > args$baf_auto_thr] | 82 df_calls$chrom %in% c('X', 'Y', 'MT') | grepl('^CNP', df_calls$type) ) 83 84if (sum(idx) > 0) { 85 p <- ggplot(df_calls[idx,], aes(x = bdev, y = rel_cov, color = type)) + 86 geom_hline(yintercept = c(1.0, 2.0, 3.0), color = 'gray', size = .5, linetype = 'dashed') + 87 geom_segment(aes(x = 0.0, y = 2.0, xend = 1.0/6.0, yend = 3.0), color = 'gray', size = .5, linetype = 'dashed') + 88 geom_segment(aes(x = 0.0, y = 2.0, xend = 1.0/6.0, yend = 1.5), color = 'gray', size = .5, linetype = 'dashed') + 89 geom_point(shape = 20, size = .5, alpha = 1/2) + 90 scale_color_manual('', values = c('CN-LOH' = 'orange', 'Loss' = 'blue', 'Gain' = 'red', 'Undetermined' = 'gray50')) + 91 theme_bw(base_size = args$fontsize) + 92 theme(strip.background = element_rect(color = NA, fill = NA), legend.position = 'bottom', legend.box = 'horizontal') + 93 facet_wrap(~sv) 94 print(p + scale_x_continuous('BAF deviation (Bdev)', breaks = c(0.00, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30)) + scale_y_continuous(expression(paste('Relative coverage (rescaled ', 2^LRR, ')')), breaks = c(0, 1, 2, 3, 4)) + coord_cartesian(xlim = c(0.00, 0.30), ylim = c(0, 4))) 95 print(p + scale_x_continuous('BAF deviation (Bdev)', breaks = c(0.00, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06)) + scale_y_continuous(expression(paste('Relative coverage (rescaled ', 2^LRR, ')')), breaks = c(1.8, 1.9, 2.0, 2.1, 2.2)) + coord_cartesian(xlim = c(0.00, 0.06), ylim = c(1.8, 2.2))) 96} 97 98if (sum(idx & df_calls$type == 'CN-LOH') > 0) { 99 p <- ggplot(df_calls[idx & df_calls$type == 'CN-LOH',], aes(x = bdev, y = n50_hets, color = type)) + 100 geom_point(shape = 20, size = .5, alpha = 1/2) + 101 scale_x_continuous('BAF deviation (Bdev)') + 102 scale_y_log10('N50 heterozygous sites distance') + 103 scale_color_manual('', values = c('CN-LOH' = 'orange')) + 104 theme_bw(base_size = args$fontsize) + 105 theme(strip.background = element_rect(color = NA, fill = NA), legend.position = 'bottom', legend.box = 'horizontal') + 106 facet_wrap(~sv) 107 print(p) 108} 109 110p <- ggplot(df_stats, aes(x = 1 - call_rate, y = baf_auto, color = computed_gender)) + 111 geom_vline(xintercept = 1 - args$call_rate_thr, color = 'black', size = .5, alpha = 1/2) + 112 geom_hline(yintercept = args$baf_auto_thr, color = 'black', size = .5, alpha = 1/2) + 113 geom_point(data = df_stats[df_stats$call_rate < args$call_rate_thr | df_stats$baf_auto > args$baf_auto_thr,], color = 'black', shape = 1, size = .5, alpha = 1/2) + 114 geom_point(shape = 20, size = .5, alpha = 1/2) + 115 scale_x_log10('1 - call rate') + 116 scale_y_continuous('BAF auto-correlation') + 117 scale_color_manual('', values = c('M' = 'blue', 'F' = 'orchid', 'U' = 'gray'), labels = c('M' = 'Male', 'F' = 'Female', 'U' = 'Undetermined')) + 118 theme_bw(base_size = args$fontsize) + 119 theme(legend.position = 'bottom', legend.box = 'horizontal') 120print(p) 121 122p <- ggplot(df_stats, aes(x = x_nonpar_adj_lrr_median, y = y_nonpar_adj_lrr_median, color = computed_gender)) + 123 geom_point(data = df_stats[df_stats$call_rate < args$call_rate_thr | df_stats$baf_auto > args$baf_auto_thr,], color = 'black', shape = 1, size = .5, alpha = 1/2) + 124 geom_point(shape = 20, size = .5, alpha = 1/2) + 125 scale_x_continuous('X nonPAR median LRR (autosome corrected)') + 126 scale_y_continuous('Y nonPAR median LRR (autosome corrected)') + 127 scale_color_manual('', values = c('M' = 'blue', 'F' = 'orchid', 'U' = 'gray'), labels = c('M' = 'Male', 'F' = 'Female', 'U' = 'Undetermined')) + 128 theme_bw(base_size = args$fontsize) + 129 theme(legend.position = 'bottom', legend.box = 'horizontal') 130print(p) 131 132if ('baf_sd' %in% colnames(df_stats)) { col_x <- 'baf_sd'; lbl_x <- 'Standard deviation BAF' 133} else if ('baf_corr' %in% colnames(df_stats)) { col_x <- 'baf_corr'; lbl_x <- 'Beta-binomial correlation BAF' } 134if ('lrr_sd' %in% colnames(df_stats)) { col_y <- 'lrr_sd'; lbl_y <- 'Standard deviation LRR' 135} else if ('cov_sd' %in% colnames(df_stats)) { col_y <- 'cov_sd'; lbl_y <- 'Standard deviation coverage' } 136p <- ggplot(df_stats, aes_string(x = col_x, y = col_y, color = 'computed_gender')) + 137 geom_point(data = df_stats[df_stats$call_rate < args$call_rate_thr | df_stats$baf_auto > args$baf_auto_thr,], color = 'black', shape = 1, size = .5, alpha = 1/2) + 138 geom_point(shape = 20, size = .5, alpha = 1/2) + 139 scale_x_continuous(lbl_x) + 140 scale_y_continuous(lbl_y) + 141 scale_color_manual('', values = c('M' = 'blue', 'F' = 'orchid', 'U' = 'gray'), labels = c('M' = 'Male', 'F' = 'Female', 'U' = 'Undetermined')) + 142 theme_bw(base_size = args$fontsize) + 143 theme(legend.position = 'bottom', legend.box = 'horizontal') 144print(p) 145 146idx_mlox <- df_calls$computed_gender == 'F' & df_calls$chrom == 'X' & df_calls$length > 1e8 147if (sum(idx_mlox) > 0) { 148 df_merge <- merge(df_stats[df_stats$computed_gender == 'F' & df_stats$call_rate >= args$call_rate_thr & df_stats$baf_auto <= args$baf_auto_thr, c('sample_id', 'x_nonpar_adj_lrr_median')], 149 df_calls[idx_mlox, c('sample_id', 'bdev')], all.x = TRUE) 150 df_merge$mlox <- !is.na(df_merge$bdev) 151 df_merge$bdev[is.na(df_merge$bdev)] <- 0 152 p <- ggplot(df_merge, aes(x = bdev, y = x_nonpar_adj_lrr_median, color = mlox)) + 153 geom_point(shape = 20, size = .5, alpha = 1/2) + 154 scale_x_continuous('X BAF deviation') + 155 scale_y_continuous('X nonPAR median LRR (autosome corrected)') + 156 scale_color_manual(paste(sum(df_merge$mlox), 'mLOX'), values = c('FALSE' = 'gray', 'TRUE' = 'red'), labels = c('FALSE' = 'no', 'TRUE' = 'yes')) + 157 theme_bw(base_size = args$fontsize) + 158 theme(legend.position = 'bottom', legend.box = 'horizontal') 159 print(p) 160} 161 162idx_mloy <- df_calls$computed_gender == 'M' & df_calls$chrom == 'X' & df_calls$length > 2e6 163if (sum(idx_mloy) > 0) { 164 df_merge <- merge(df_stats[df_stats$computed_gender == 'M' & df_stats$call_rate >= args$call_rate_thr & df_stats$baf_auto <= args$baf_auto_thr, c('sample_id', 'y_nonpar_adj_lrr_median')], 165 df_calls[idx_mloy, c('sample_id', 'bdev')], all.x = TRUE) 166 df_merge$mloy <- !is.na(df_merge$bdev) 167 df_merge$bdev[is.na(df_merge$bdev)] <- 0 168 p <- ggplot(df_merge, aes(x = bdev, y = y_nonpar_adj_lrr_median, color = mloy)) + 169 geom_point(shape = 20, size = .5, alpha = 1/2) + 170 scale_x_continuous('PAR1/PAR2 BAF deviation') + 171 scale_y_continuous('Y nonPAR median LRR (autosome corrected)') + 172 scale_color_manual(paste(sum(df_merge$mloy), 'mLOY'), values = c('FALSE' = 'gray', 'TRUE' = 'red'), labels = c('FALSE' = 'no', 'TRUE' = 'yes')) + 173 theme_bw(base_size = args$fontsize) + 174 theme(legend.position = 'bottom', legend.box = 'horizontal') 175 print(p) 176} 177 178if (sum(idx_mlox | idx_mloy) > 0) { 179 p <- ggplot(df_calls[idx_mlox | idx_mloy,], aes(x = bdev)) + 180 geom_histogram(binwidth = .001, color = 'black', fill = 'transparent') + 181 scale_x_continuous('mLOX/mLOY BAF deviation', limits = c(0.0, 0.05), expand = c(0, 0)) + 182 theme_bw(base_size = args$fontsize) + 183 facet_grid(computed_gender ~ ., scales = 'free_y') 184 print(p) 185} 186 187invisible(dev.off()) 188