1#!/usr/bin/env Rscript 2 3library(tidyr) 4library(dplyr) 5library(ggplot2) 6library(reshape2) 7 8# ./gatk VariantsToTable -V /dsde/data/deep/vqsr/vcfs/illumina_na12878_platinum_scored_chr2.vcf.gz -F CHROM -F POS -F REF -F ALT -F FILTER -F G947_SITE_LABELLED_RRAB -F EVENTLENGTH -F AC -F MULTI-ALLELIC -F TRANSITION -F TYPE -O ~/Documents/illumin_chr2.table 9#d <- read.table("illumin_chr2.table", header=TRUE) 10#score_key <- "G947_SITE_LABELLED_RRAB" 11#d <- read.table("g94982_chr20.table", header=TRUE) 12#score_key <- "CNN_2D" 13#d <- read.table("new_gnomad_22.table", header=TRUE) 14#score_key <- "CNN_1D" 15 16args = commandArgs(trailingOnly=TRUE) 17if (length(args) != 2) { 18 stop("We need 2 arguments: call_vcf_table score_key") 19} 20 21print("try to load VCF table.") 22d <- read.table(args[1], header=TRUE) 23score_key <- args[2] 24score_label <- paste(score_key, " LOD Score") 25plot_title <- gsub(".vcf.gz.table", "", basename(args[1])) 26num_bins <- 50 27bin_by_quantile <- FALSE 28 29get_proportion <- function(d, num_bins, column_to_sum, quality_column) { 30 x <- rowsum(column_to_sum, quality_column, na.rm =T) 31 idx <- row.names(x) 32 33 for (i in 1:num_bins) { 34 qsum <- sum(quality_column==as.numeric(idx[i])) 35 if (!is.na(x[i]) && qsum>0) { 36 x[i] <- x[i] / qsum 37 } 38 } 39 return(x[quality_column]) 40} 41 42d$SNP <- d$EVENTLENGTH == 0 43d$ONE <- 1 44x <- rowsum(d$ONE, d$EVENTLENGTH) 45d$EVENTLENGTH_SUM <- x[as.factor(d$EVENTLENGTH)] 46d$Unfiltered <- d$FILTER == "PASS" | d$FILTER == "." 47d$Variant_Type <- paste(d$TYPE, as.factor(d$EVENTLENGTH<0)) 48 49 50# All variant plots 51print("Make all variant plots.") 52p1 <- ggplot(d, aes(get(score_key), color=SNP, fill=SNP)) + 53 scale_fill_discrete(name="Variant\nType", breaks=c("TRUE", "FALSE"), labels=c("SNPs", "INDELs")) + 54 geom_density(alpha=0.55) + 55 ggtitle(plot_title) + 56 xlab(score_label) + 57 guides(color=FALSE) 58 59p2 <- ggplot(d, aes(x=EVENTLENGTH, y=get(score_key), group=EVENTLENGTH, color=Unfiltered, shape=Variant_Type)) + 60 scale_color_discrete(name='', breaks=c("TRUE", "FALSE"), labels=c("Passed", "Filtered")) + 61 scale_shape_discrete(name='', breaks=c("INDEL TRUE", "INDEL FALSE", "SNP FALSE"), labels=c("Deletion", "Insertion", "SNP")) + 62 geom_jitter(height = 0, width = 0.2, alpha=0.6) + 63 ggtitle(plot_title) + 64 ylab(score_label) + 65 xlab("Event Length: - Deletions, 0 SNPs, + Insertions") 66 67p3 <- ggplot(d, aes(x=EVENTLENGTH, y=get(score_key), group=EVENTLENGTH, color=Unfiltered)) + xlim(-20, 20) + 68 scale_color_discrete(name='', breaks=c("TRUE", "FALSE"), labels=c("Passed", "Filtered")) + 69 geom_jitter(height = 0, width = 0.15, alpha=0.4) + 70 geom_violin(color="grey", alpha=0) + 71 geom_text(aes(x=EVENTLENGTH, y=14, label=EVENTLENGTH_SUM), color="grey30", size=2, angle=60) + 72 ggtitle(plot_title) + 73 ylab(score_label) + 74 xlab("Event Length: - Deletions, 0 SNPs, + Insertions") 75 76p4 <- ggplot(d, aes(x=EVENTLENGTH, y=get(score_key), group=EVENTLENGTH, color=Unfiltered)) + xlim(-10, 10) + 77 scale_color_discrete(name='', breaks=c("TRUE", "FALSE"), labels=c("Passed", "Filtered")) + 78 geom_jitter(height = 0, width = 0.2, alpha=0.4) + 79 geom_violin(color="grey", alpha=0) + 80 geom_text(aes(x=EVENTLENGTH, y=14, label=EVENTLENGTH_SUM), color="grey30", size=3, angle=30) + 81 ggtitle(plot_title) + 82 ylab(score_label) + 83 xlab("Event Length: - Deletions, 0 SNPs, + Insertions") 84 85p5 <- ggplot(d, aes(x=EVENTLENGTH, y=get(score_key), group=EVENTLENGTH, color=Unfiltered)) + 86 scale_color_discrete(name='', breaks=c("TRUE", "FALSE"), labels=c("Passed", "Filtered")) + 87 xlim(-5, 5) + 88 geom_jitter(height = 0, width = 0.35, alpha=0.4) + 89 geom_violin(color="grey", alpha=0.0) + 90 geom_text(aes(x=EVENTLENGTH, y=14, label=EVENTLENGTH_SUM), color="grey30", size=4, angle=30) + 91 ggtitle(plot_title) + 92 ylab(score_label) + 93 xlab("Event Length: - Deletions, 0 SNPs, + Insertions") 94 95 96# SNP specific plots 97print("Make SNP plots.") 98snps <- subset(d, EVENTLENGTH == 0) 99my_breaks <- ifelse(bin_by_quantile, quantile(snps[[score_key]], probs = seq(0, 1, 1.0/num_bins), na.rm=T), num_bins) 100snps$QUALITY_BIN <- cut(snps[[score_key]], breaks=my_breaks, include.lowest=T, labels=F) 101snps$QUALITY_BIN_RANGE <- cut(snps[[score_key]], breaks=my_breaks, include.lowest=T) 102mine <- lapply(strsplit(sapply(levels(snps$QUALITY_BIN_RANGE), function(x) substr(x, 2, nchar(x)-1)), ","), as.numeric) 103df <- data.frame(matrix(unlist(mine), nrow=num_bins, byrow=T)) 104q_means <- rowMeans(df) 105snps$QUALITY_LOD <- q_means[snps$QUALITY_BIN] 106 107x <- rowsum(snps$ONE, snps$QUALITY_BIN) 108snps$BIN_SUM <- x[snps$QUALITY_BIN] 109snps$TRANSVERSION <- as.numeric(abs(snps$TRANSITION)==0) 110ti <- get_proportion(snps, num_bins, snps$TRANSITION, snps$QUALITY_BIN) 111tv <- get_proportion(snps, num_bins, snps$TRANSVERSION, snps$QUALITY_BIN) 112snps$TI_TV <- ti/tv 113 114p6 <- ggplot(snps, aes(x=get(score_key), y=TI_TV, group=QUALITY_BIN, color=Unfiltered, shape=TRANSITION==1)) + 115 scale_color_discrete(name='', breaks=c("TRUE", "FALSE"), labels=c("Passed", "Filtered")) + 116 scale_shape_discrete(name='', breaks=c("TRUE", "FALSE"), labels=c("Transition", "Transversion")) + 117 geom_point() + 118 geom_line(color="grey") + 119 xlab(score_label) + 120 ggtitle("Transition Transversion Ratio per score bin") + 121 ylim(0, 4) 122 123 124# INDEL specific plots 125print("Make INDEL plots.") 126indels <- subset(d, EVENTLENGTH != 0) 127my_breaks <- ifelse(bin_by_quantile, quantile(indels[[score_key]], probs = seq(0, 1, 1.0/num_bins), na.rm=T), num_bins) 128indels$QUALITY_BIN <- cut(indels[[score_key]], breaks=my_breaks, include.lowest=T, labels=F) 129indels$QUALITY_BIN_RANGE <- cut(indels[[score_key]], breaks=my_breaks, include.lowest=T) 130mine <- lapply(strsplit(sapply(levels(indels$QUALITY_BIN_RANGE), function(x) substr(x, 2, nchar(x)-1)), ","), as.numeric) 131df <- data.frame(matrix(unlist(mine), nrow=num_bins, byrow=T)) 132q_means <- rowMeans(df) 133indels$QUALITY_LOD <- q_means[indels$QUALITY_BIN] 134x <- rowsum(indels$ONE, indels$QUALITY_BIN) 135indels$BIN_SUM <- x[indels$QUALITY_BIN] 136indels$ONEBP <- as.numeric(abs(indels$EVENTLENGTH)==1) 137indels$PROPORTION_ONEBP <- get_proportion(indels, num_bins, indels$ONEBP, indels$QUALITY_BIN) 138 139p7 <- ggplot(indels, aes(x=get(score_key), y=PROPORTION_ONEBP, group=QUALITY_BIN, color=Unfiltered, shape=EVENTLENGTH<0)) + 140 scale_color_discrete(name='', breaks=c("TRUE", "FALSE"), labels=c("Passed", "Filtered")) + 141 scale_shape_discrete(name='', breaks=c("TRUE", "FALSE"), labels=c("Deletion", "Insertion")) + 142 geom_jitter(height = 0.005, width = 0.0, alpha=0.6) + 143 geom_line(color="grey") + 144 ggtitle("Proportion of 1bp INDELs per score bin") + 145 xlab(score_label) 146 147# Multiple plot function 148# 149# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects) 150# - cols: Number of columns in layout 151# - layout: A matrix specifying the layout. If present, 'cols' is ignored. 152# 153# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE), 154# then plot 1 will go in the upper left, 2 will go in the upper right, and 155# 3 will go all the way across the bottom. 156# 157multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) { 158 library(grid) 159 160 # Make a list from the ... arguments and plotlist 161 plots <- c(list(...), plotlist) 162 163 numPlots = length(plots) 164 165 # If layout is NULL, then use 'cols' to determine layout 166 if (is.null(layout)) { 167 # Make the panel 168 # ncol: Number of columns of plots 169 # nrow: Number of rows needed, calculated from # of cols 170 layout <- matrix(seq(1, cols * ceiling(numPlots/cols)), 171 ncol = cols, nrow = ceiling(numPlots/cols)) 172 } 173 174 if (numPlots==1) { 175 print(plots[[1]]) 176 177 } else { 178 # Set up the page 179 grid.newpage() 180 pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout)))) 181 182 # Make each plot, in the correct location 183 for (i in 1:numPlots) { 184 # Get the i,j matrix positions of the regions that contain this subplot 185 matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE)) 186 187 print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row, 188 layout.pos.col = matchidx$col)) 189 } 190 } 191} 192ggsave(plot=multiplot(p1,p2,p3,p4,p5,p6,p7, cols=2), filename = paste(plot_title, "_plots.png", sep=""), width=16, height=20) 193 194