1#!/usr/bin/env Rscript
2###
3#  The MIT License
4#
5#  Copyright (C) 2017-2021 Giulio Genovese
6#
7#  Author: Giulio Genovese <giulio.genovese@gmail.com>
8#
9#  Permission is hereby granted, free of charge, to any person obtaining a copy
10#  of this software and associated documentation files (the "Software"), to deal
11#  in the Software without restriction, including without limitation the rights
12#  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13#  copies of the Software, and to permit persons to whom the Software is
14#  furnished to do so, subject to the following conditions:
15#
16#  The above copyright notice and this permission notice shall be included in
17#  all copies or substantial portions of the Software.
18#
19#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20#  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21#  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22#  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23#  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24#  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25#  THE SOFTWARE.
26###
27
28summary_plot_version <- '2021-10-15'
29
30library(optparse)
31library(ggplot2)
32options(bitmapType = 'cairo')
33
34parser <- OptionParser('usage: summary_plot.R [options] --stats <file.tsv> --calls <file.tsv> --pdf <file.pdf>')
35parser <- add_option(parser, c('--stats'), type = 'character', help = 'input MoChA stats file', metavar = '<file.tsv>')
36parser <- add_option(parser, c('--calls'), type = 'character', help = 'input MoChA calls file', metavar = '<file.tsv>')
37parser <- add_option(parser, c('--call-rate-thr'), type = 'double', default = 0.97, help = 'minimum call rate threshold [0.97]', metavar = '<float>')
38parser <- add_option(parser, c('--baf-auto-thr'), type = 'double', default = 0.03, help = 'maximum BAF autocorrelation threshold [0.03]', metavar = '<float>')
39parser <- add_option(parser, c('--pdf'), type = 'character', help = 'output PDF file', metavar = '<file.pdf>')
40parser <- add_option(parser, c('--width'), type = 'integer', default = 7, help = 'inches width of the output file [7]', metavar = '<integer>')
41parser <- add_option(parser, c('--height'), type = 'integer', default = 7, help = 'inches height of the output file [7]', metavar = '<integer>')
42parser <- add_option(parser, c('--fontsize'), type = 'integer', default = 16, help = 'font size [16]', metavar = '<integer>')
43args <- parse_args(parser, commandArgs(trailingOnly = TRUE), convert_hyphens_to_underscores = TRUE)
44
45write(paste('summary_plot.R', summary_plot_version, 'https://github.com/freeseek/mocha'), stderr())
46
47if (is.null(args$stats)) {print_help(parser); stop('option --stats is required')}
48if (is.null(args$calls)) {print_help(parser); stop('option --calls is required')}
49if (is.null(args$pdf)) {print_help(parser); stop('option --pdf is required')}
50
51df_stats <- read.table(args$stats, sep = '\t', header = TRUE)
52df_calls <- read.table(args$calls, sep = '\t', header = TRUE)
53df_calls$chrom <- as.factor(gsub('^chr', '', gsub('^chrM', 'MT', df_calls$chrom)))
54ord <- order(as.numeric(gsub('MT', '26', gsub('Y', '24', gsub('X', '23', levels(df_calls$chrom))))))
55df_calls$chrom <- factor(df_calls$chrom, levels(df_calls$chrom)[ord])
56
57if ('lrr_median' %in% colnames(df_stats)) {
58  model <- 'array'
59  df_stats$x_nonpar_adj_lrr_median <- df_stats$x_nonpar_lrr_median - df_stats$lrr_median
60  df_stats$y_nonpar_adj_lrr_median <- df_stats$y_nonpar_lrr_median - df_stats$lrr_median
61  df_stats$mt_adj_lrr_median <- df_stats$mt_lrr_median - df_stats$lrr_median
62} else {
63  model <- 'wgs'
64  df_stats$x_nonpar_adj_lrr_median <- log(df_stats$x_nonpar_cov_median) - log(df_stats$cov_median)
65  df_stats$y_nonpar_adj_lrr_median <- log(df_stats$y_nonpar_cov_median) - log(df_stats$cov_median)
66  df_stats$mt_adj_lrr_median <- log(df_stats$mt_cov_median) - log(df_stats$cov_median)
67}
68
69df_calls$sv <- factor(df_calls$chrom, levels = c('<0.5 Mbp', '0.5-5 Mbp', '5-50 Mbp', '>50 Mbp'))
70df_calls$sv[df_calls$length <= 5e8] <- '>50 Mbp'
71df_calls$sv[df_calls$length <= 5e7] <- '5-50 Mbp'
72df_calls$sv[df_calls$length <= 5e6] <- '0.5-5 Mbp'
73df_calls$sv[df_calls$length <= 5e5] <- '<0.5 Mbp'
74
75# avoid plotting BDEV for calls with not enough heterozygous sites
76df_calls$bdev[df_calls$n_hets < 5] <- NaN
77df_calls$bdev[is.na(df_calls$bdev)] <- 0
78
79pdf(args$pdf, width = args$width, height = args$height)
80
81idx <- !( df_calls$sample_id %in% df_stats$sample_id[df_stats$call_rate < args$call_rate_thr | df_stats$baf_auto > args$baf_auto_thr] |
82          df_calls$chrom %in% c('X', 'Y', 'MT') | grepl('^CNP', df_calls$type) )
83
84if (sum(idx) > 0) {
85  p <- ggplot(df_calls[idx,], aes(x = bdev, y = rel_cov, color = type)) +
86    geom_hline(yintercept = c(1.0, 2.0, 3.0), color = 'gray', size = .5, linetype = 'dashed') +
87    geom_segment(aes(x = 0.0, y = 2.0, xend = 1.0/6.0, yend = 3.0), color = 'gray', size = .5, linetype = 'dashed') +
88    geom_segment(aes(x = 0.0, y = 2.0, xend = 1.0/6.0, yend = 1.5), color = 'gray', size = .5, linetype = 'dashed') +
89    geom_point(shape = 20, size = .5, alpha = 1/2) +
90    scale_color_manual('', values = c('CN-LOH' = 'orange', 'Loss' = 'blue', 'Gain' = 'red', 'Undetermined' = 'gray50')) +
91    theme_bw(base_size = args$fontsize) +
92    theme(strip.background = element_rect(color = NA, fill = NA), legend.position = 'bottom', legend.box = 'horizontal') +
93    facet_wrap(~sv)
94  print(p + scale_x_continuous('BAF deviation (Bdev)', breaks = c(0.00, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30)) + scale_y_continuous(expression(paste('Relative coverage (rescaled ', 2^LRR, ')')), breaks = c(0, 1, 2, 3, 4)) + coord_cartesian(xlim = c(0.00, 0.30), ylim = c(0, 4)))
95  print(p + scale_x_continuous('BAF deviation (Bdev)', breaks = c(0.00, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06)) + scale_y_continuous(expression(paste('Relative coverage (rescaled ', 2^LRR, ')')), breaks = c(1.8, 1.9, 2.0, 2.1, 2.2)) + coord_cartesian(xlim = c(0.00, 0.06), ylim = c(1.8, 2.2)))
96}
97
98if (sum(idx & df_calls$type == 'CN-LOH') > 0) {
99  p <- ggplot(df_calls[idx & df_calls$type == 'CN-LOH',], aes(x = bdev, y = n50_hets, color = type)) +
100    geom_point(shape = 20, size = .5, alpha = 1/2) +
101    scale_x_continuous('BAF deviation (Bdev)') +
102    scale_y_log10('N50 heterozygous sites distance') +
103    scale_color_manual('', values = c('CN-LOH' = 'orange')) +
104    theme_bw(base_size = args$fontsize) +
105    theme(strip.background = element_rect(color = NA, fill = NA), legend.position = 'bottom', legend.box = 'horizontal') +
106    facet_wrap(~sv)
107  print(p)
108}
109
110p <- ggplot(df_stats, aes(x = 1 - call_rate, y = baf_auto, color = computed_gender)) +
111  geom_vline(xintercept = 1 - args$call_rate_thr, color = 'black', size = .5, alpha = 1/2) +
112  geom_hline(yintercept = args$baf_auto_thr, color = 'black', size = .5, alpha = 1/2) +
113  geom_point(data = df_stats[df_stats$call_rate < args$call_rate_thr | df_stats$baf_auto > args$baf_auto_thr,], color = 'black', shape = 1, size = .5, alpha = 1/2) +
114  geom_point(shape = 20, size = .5, alpha = 1/2) +
115  scale_x_log10('1 - call rate') +
116  scale_y_continuous('BAF auto-correlation') +
117  scale_color_manual('', values = c('M' = 'blue', 'F' = 'orchid', 'U' = 'gray'), labels = c('M' = 'Male', 'F' = 'Female', 'U' = 'Undetermined')) +
118  theme_bw(base_size = args$fontsize) +
119  theme(legend.position = 'bottom', legend.box = 'horizontal')
120print(p)
121
122p <- ggplot(df_stats, aes(x = x_nonpar_adj_lrr_median, y = y_nonpar_adj_lrr_median, color = computed_gender)) +
123  geom_point(data = df_stats[df_stats$call_rate < args$call_rate_thr | df_stats$baf_auto > args$baf_auto_thr,], color = 'black', shape = 1, size = .5, alpha = 1/2) +
124  geom_point(shape = 20, size = .5, alpha = 1/2) +
125  scale_x_continuous('X nonPAR median LRR (autosome corrected)') +
126  scale_y_continuous('Y nonPAR median LRR (autosome corrected)') +
127  scale_color_manual('', values = c('M' = 'blue', 'F' = 'orchid', 'U' = 'gray'), labels = c('M' = 'Male', 'F' = 'Female', 'U' = 'Undetermined')) +
128  theme_bw(base_size = args$fontsize) +
129  theme(legend.position = 'bottom', legend.box = 'horizontal')
130print(p)
131
132if ('baf_sd' %in% colnames(df_stats)) { col_x <- 'baf_sd'; lbl_x <- 'Standard deviation BAF'
133} else if ('baf_corr' %in% colnames(df_stats)) { col_x <- 'baf_corr'; lbl_x <- 'Beta-binomial correlation BAF' }
134if ('lrr_sd' %in% colnames(df_stats)) { col_y <- 'lrr_sd'; lbl_y <- 'Standard deviation LRR'
135} else if ('cov_sd' %in% colnames(df_stats)) { col_y <- 'cov_sd'; lbl_y <- 'Standard deviation coverage' }
136p <- ggplot(df_stats, aes_string(x = col_x, y = col_y, color = 'computed_gender')) +
137  geom_point(data = df_stats[df_stats$call_rate < args$call_rate_thr | df_stats$baf_auto > args$baf_auto_thr,], color = 'black', shape = 1, size = .5, alpha = 1/2) +
138  geom_point(shape = 20, size = .5, alpha = 1/2) +
139  scale_x_continuous(lbl_x) +
140  scale_y_continuous(lbl_y) +
141  scale_color_manual('', values = c('M' = 'blue', 'F' = 'orchid', 'U' = 'gray'), labels = c('M' = 'Male', 'F' = 'Female', 'U' = 'Undetermined')) +
142  theme_bw(base_size = args$fontsize) +
143  theme(legend.position = 'bottom', legend.box = 'horizontal')
144print(p)
145
146idx_mlox <- df_calls$computed_gender == 'F' & df_calls$chrom == 'X' & df_calls$length > 1e8
147if (sum(idx_mlox) > 0) {
148  df_merge <- merge(df_stats[df_stats$computed_gender == 'F' & df_stats$call_rate >= args$call_rate_thr & df_stats$baf_auto <= args$baf_auto_thr, c('sample_id', 'x_nonpar_adj_lrr_median')],
149                    df_calls[idx_mlox, c('sample_id', 'bdev')], all.x = TRUE)
150  df_merge$mlox <- !is.na(df_merge$bdev)
151  df_merge$bdev[is.na(df_merge$bdev)] <- 0
152  p <- ggplot(df_merge, aes(x = bdev, y = x_nonpar_adj_lrr_median, color = mlox)) +
153    geom_point(shape = 20, size = .5, alpha = 1/2) +
154    scale_x_continuous('X BAF deviation') +
155    scale_y_continuous('X nonPAR median LRR (autosome corrected)') +
156    scale_color_manual(paste(sum(df_merge$mlox), 'mLOX'), values = c('FALSE' = 'gray', 'TRUE' = 'red'), labels = c('FALSE' = 'no', 'TRUE' = 'yes')) +
157    theme_bw(base_size = args$fontsize) +
158    theme(legend.position = 'bottom', legend.box = 'horizontal')
159  print(p)
160}
161
162idx_mloy <- df_calls$computed_gender == 'M' & df_calls$chrom == 'X' & df_calls$length > 2e6
163if (sum(idx_mloy) > 0) {
164  df_merge <- merge(df_stats[df_stats$computed_gender == 'M' & df_stats$call_rate >= args$call_rate_thr & df_stats$baf_auto <= args$baf_auto_thr, c('sample_id', 'y_nonpar_adj_lrr_median')],
165                    df_calls[idx_mloy, c('sample_id', 'bdev')], all.x = TRUE)
166  df_merge$mloy <- !is.na(df_merge$bdev)
167  df_merge$bdev[is.na(df_merge$bdev)] <- 0
168  p <- ggplot(df_merge, aes(x = bdev, y = y_nonpar_adj_lrr_median, color = mloy)) +
169    geom_point(shape = 20, size = .5, alpha = 1/2) +
170    scale_x_continuous('PAR1/PAR2 BAF deviation') +
171    scale_y_continuous('Y nonPAR median LRR (autosome corrected)') +
172    scale_color_manual(paste(sum(df_merge$mloy), 'mLOY'), values = c('FALSE' = 'gray', 'TRUE' = 'red'), labels = c('FALSE' = 'no', 'TRUE' = 'yes')) +
173    theme_bw(base_size = args$fontsize) +
174    theme(legend.position = 'bottom', legend.box = 'horizontal')
175  print(p)
176}
177
178if (sum(idx_mlox | idx_mloy) > 0) {
179  p <- ggplot(df_calls[idx_mlox | idx_mloy,], aes(x = bdev)) +
180    geom_histogram(binwidth = .001, color = 'black', fill = 'transparent') +
181    scale_x_continuous('mLOX/mLOY BAF deviation', limits = c(0.0, 0.05), expand = c(0, 0)) +
182    theme_bw(base_size = args$fontsize) +
183    facet_grid(computed_gender ~ ., scales = 'free_y')
184  print(p)
185}
186
187invisible(dev.off())
188