1#!/usr/bin/env Rscript
2
3library(tidyr)
4library(dplyr)
5library(ggplot2)
6library(reshape2)
7
8# ./gatk VariantsToTable -V /dsde/data/deep/vqsr/vcfs/illumina_na12878_platinum_scored_chr2.vcf.gz -F CHROM -F POS -F REF -F ALT -F FILTER -F G947_SITE_LABELLED_RRAB -F EVENTLENGTH -F AC -F MULTI-ALLELIC -F TRANSITION -F TYPE -O ~/Documents/illumin_chr2.table
9#d <- read.table("illumin_chr2.table", header=TRUE)
10#score_key <- "G947_SITE_LABELLED_RRAB"
11#d <- read.table("g94982_chr20.table", header=TRUE)
12#score_key <- "CNN_2D"
13#d <- read.table("new_gnomad_22.table", header=TRUE)
14#score_key <- "CNN_1D"
15
16args = commandArgs(trailingOnly=TRUE)
17if (length(args) != 2) {
18  stop("We need 2 arguments: call_vcf_table  score_key")
19}
20
21print("try to load VCF table.")
22d <- read.table(args[1], header=TRUE)
23score_key <- args[2]
24score_label <- paste(score_key, " LOD Score")
25plot_title <- gsub(".vcf.gz.table", "", basename(args[1]))
26num_bins <- 50
27bin_by_quantile <- FALSE
28
29get_proportion <- function(d, num_bins, column_to_sum, quality_column) {
30  x <- rowsum(column_to_sum, quality_column, na.rm =T)
31  idx <- row.names(x)
32
33  for (i in 1:num_bins) {
34    qsum <- sum(quality_column==as.numeric(idx[i]))
35    if (!is.na(x[i]) && qsum>0) {
36      x[i] <- x[i] / qsum
37    }
38  }
39  return(x[quality_column])
40}
41
42d$SNP <- d$EVENTLENGTH == 0
43d$ONE <- 1
44x <- rowsum(d$ONE, d$EVENTLENGTH)
45d$EVENTLENGTH_SUM <- x[as.factor(d$EVENTLENGTH)]
46d$Unfiltered <- d$FILTER == "PASS" | d$FILTER == "."
47d$Variant_Type <- paste(d$TYPE, as.factor(d$EVENTLENGTH<0))
48
49
50# All variant plots
51print("Make all variant plots.")
52p1 <- ggplot(d, aes(get(score_key), color=SNP, fill=SNP)) +
53  scale_fill_discrete(name="Variant\nType", breaks=c("TRUE", "FALSE"), labels=c("SNPs", "INDELs")) +
54  geom_density(alpha=0.55) +
55  ggtitle(plot_title) +
56  xlab(score_label) +
57  guides(color=FALSE)
58
59p2 <- ggplot(d, aes(x=EVENTLENGTH, y=get(score_key), group=EVENTLENGTH, color=Unfiltered, shape=Variant_Type)) +
60  scale_color_discrete(name='', breaks=c("TRUE", "FALSE"), labels=c("Passed", "Filtered")) +
61  scale_shape_discrete(name='', breaks=c("INDEL TRUE", "INDEL FALSE", "SNP FALSE"), labels=c("Deletion", "Insertion", "SNP")) +
62  geom_jitter(height = 0, width = 0.2, alpha=0.6) +
63  ggtitle(plot_title) +
64  ylab(score_label) +
65  xlab("Event Length: - Deletions, 0 SNPs, + Insertions")
66
67p3 <- ggplot(d, aes(x=EVENTLENGTH, y=get(score_key), group=EVENTLENGTH, color=Unfiltered)) + xlim(-20, 20) +
68  scale_color_discrete(name='', breaks=c("TRUE", "FALSE"), labels=c("Passed", "Filtered")) +
69  geom_jitter(height = 0, width = 0.15, alpha=0.4) +
70  geom_violin(color="grey", alpha=0) +
71  geom_text(aes(x=EVENTLENGTH, y=14, label=EVENTLENGTH_SUM), color="grey30", size=2, angle=60) +
72  ggtitle(plot_title) +
73  ylab(score_label) +
74  xlab("Event Length: - Deletions, 0 SNPs, + Insertions")
75
76p4 <- ggplot(d, aes(x=EVENTLENGTH, y=get(score_key), group=EVENTLENGTH, color=Unfiltered)) + xlim(-10, 10) +
77  scale_color_discrete(name='', breaks=c("TRUE", "FALSE"), labels=c("Passed", "Filtered")) +
78  geom_jitter(height = 0, width = 0.2, alpha=0.4) +
79  geom_violin(color="grey", alpha=0) +
80  geom_text(aes(x=EVENTLENGTH, y=14, label=EVENTLENGTH_SUM), color="grey30", size=3, angle=30) +
81  ggtitle(plot_title) +
82  ylab(score_label) +
83  xlab("Event Length: - Deletions, 0 SNPs, + Insertions")
84
85p5 <- ggplot(d, aes(x=EVENTLENGTH, y=get(score_key), group=EVENTLENGTH, color=Unfiltered)) +
86  scale_color_discrete(name='', breaks=c("TRUE", "FALSE"), labels=c("Passed", "Filtered")) +
87  xlim(-5, 5) +
88  geom_jitter(height = 0, width = 0.35, alpha=0.4) +
89  geom_violin(color="grey", alpha=0.0) +
90  geom_text(aes(x=EVENTLENGTH, y=14, label=EVENTLENGTH_SUM), color="grey30", size=4, angle=30) +
91  ggtitle(plot_title) +
92  ylab(score_label) +
93  xlab("Event Length: - Deletions, 0 SNPs, + Insertions")
94
95
96# SNP specific plots
97print("Make SNP plots.")
98snps <- subset(d, EVENTLENGTH == 0)
99my_breaks <- ifelse(bin_by_quantile, quantile(snps[[score_key]], probs = seq(0, 1, 1.0/num_bins), na.rm=T), num_bins)
100snps$QUALITY_BIN <- cut(snps[[score_key]], breaks=my_breaks, include.lowest=T, labels=F)
101snps$QUALITY_BIN_RANGE <- cut(snps[[score_key]], breaks=my_breaks, include.lowest=T)
102mine <- lapply(strsplit(sapply(levels(snps$QUALITY_BIN_RANGE), function(x) substr(x, 2, nchar(x)-1)), ","), as.numeric)
103df <- data.frame(matrix(unlist(mine), nrow=num_bins, byrow=T))
104q_means <- rowMeans(df)
105snps$QUALITY_LOD <- q_means[snps$QUALITY_BIN]
106
107x <- rowsum(snps$ONE, snps$QUALITY_BIN)
108snps$BIN_SUM <- x[snps$QUALITY_BIN]
109snps$TRANSVERSION <- as.numeric(abs(snps$TRANSITION)==0)
110ti <- get_proportion(snps, num_bins, snps$TRANSITION, snps$QUALITY_BIN)
111tv <- get_proportion(snps, num_bins, snps$TRANSVERSION, snps$QUALITY_BIN)
112snps$TI_TV <- ti/tv
113
114p6 <- ggplot(snps, aes(x=get(score_key), y=TI_TV, group=QUALITY_BIN, color=Unfiltered, shape=TRANSITION==1)) +
115  scale_color_discrete(name='', breaks=c("TRUE", "FALSE"), labels=c("Passed", "Filtered")) +
116  scale_shape_discrete(name='', breaks=c("TRUE", "FALSE"), labels=c("Transition", "Transversion")) +
117  geom_point() +
118  geom_line(color="grey") +
119  xlab(score_label) +
120  ggtitle("Transition Transversion Ratio per score bin") +
121  ylim(0, 4)
122
123
124# INDEL specific plots
125print("Make INDEL plots.")
126indels <- subset(d, EVENTLENGTH != 0)
127my_breaks <- ifelse(bin_by_quantile, quantile(indels[[score_key]], probs = seq(0, 1, 1.0/num_bins), na.rm=T), num_bins)
128indels$QUALITY_BIN <- cut(indels[[score_key]], breaks=my_breaks, include.lowest=T, labels=F)
129indels$QUALITY_BIN_RANGE <- cut(indels[[score_key]], breaks=my_breaks, include.lowest=T)
130mine <- lapply(strsplit(sapply(levels(indels$QUALITY_BIN_RANGE), function(x) substr(x, 2, nchar(x)-1)), ","), as.numeric)
131df <- data.frame(matrix(unlist(mine), nrow=num_bins, byrow=T))
132q_means <- rowMeans(df)
133indels$QUALITY_LOD <- q_means[indels$QUALITY_BIN]
134x <- rowsum(indels$ONE, indels$QUALITY_BIN)
135indels$BIN_SUM <- x[indels$QUALITY_BIN]
136indels$ONEBP <- as.numeric(abs(indels$EVENTLENGTH)==1)
137indels$PROPORTION_ONEBP <- get_proportion(indels, num_bins, indels$ONEBP, indels$QUALITY_BIN)
138
139p7 <- ggplot(indels, aes(x=get(score_key), y=PROPORTION_ONEBP, group=QUALITY_BIN, color=Unfiltered, shape=EVENTLENGTH<0)) +
140  scale_color_discrete(name='', breaks=c("TRUE", "FALSE"), labels=c("Passed", "Filtered")) +
141  scale_shape_discrete(name='', breaks=c("TRUE", "FALSE"), labels=c("Deletion", "Insertion")) +
142  geom_jitter(height = 0.005, width = 0.0, alpha=0.6) +
143  geom_line(color="grey") +
144  ggtitle("Proportion of 1bp INDELs per score bin") +
145  xlab(score_label)
146
147# Multiple plot function
148#
149# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
150# - cols:   Number of columns in layout
151# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
152#
153# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
154# then plot 1 will go in the upper left, 2 will go in the upper right, and
155# 3 will go all the way across the bottom.
156#
157multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
158  library(grid)
159
160  # Make a list from the ... arguments and plotlist
161  plots <- c(list(...), plotlist)
162
163  numPlots = length(plots)
164
165  # If layout is NULL, then use 'cols' to determine layout
166  if (is.null(layout)) {
167    # Make the panel
168    # ncol: Number of columns of plots
169    # nrow: Number of rows needed, calculated from # of cols
170    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
171                     ncol = cols, nrow = ceiling(numPlots/cols))
172  }
173
174  if (numPlots==1) {
175    print(plots[[1]])
176
177  } else {
178    # Set up the page
179    grid.newpage()
180    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
181
182    # Make each plot, in the correct location
183    for (i in 1:numPlots) {
184      # Get the i,j matrix positions of the regions that contain this subplot
185      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
186
187      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
188                                      layout.pos.col = matchidx$col))
189    }
190  }
191}
192ggsave(plot=multiplot(p1,p2,p3,p4,p5,p6,p7, cols=2), filename = paste(plot_title, "_plots.png", sep=""), width=16, height=20)
193
194