#!/bin/bash
#
# Remove individuals with too much missing data from a VCF file.
#
# Usage: <script> <input.vcf> <output_prefix>
#
#   $1  input VCF file, passed to `vcftools --vcf`
#   $2  output prefix, passed to `vcftools --out`
#
# Steps:
#   1. Run `vcftools --missing` and read the per-individual report
#      "<output_prefix>.imiss" (column 1 = individual, column 5 = fraction
#      of missing data).
#   2. Write the missing-data fractions to ./totalmissing and draw an ASCII
#      histogram of them with gnuplot.
#   3. Suggest a cutoff taken from the sorted fractions and let the user
#      replace it interactively.
#   4. Write every individual whose fraction exceeds the cutoff to
#      ./lowDP.indv and re-run vcftools with `--remove lowDP.indv --recode`.
#
# Files written in the current directory: totalmissing, lowDP.indv.
# Exit status: 0 on success, 1 on a processing error, 2 on bad usage.

if [[ $# -lt 2 ]]; then
  printf 'Usage: %s <input.vcf> <output_prefix>\n' "${0##*/}" >&2
  exit 2
fi

vcf=$1
out_prefix=$2
imiss_file="${out_prefix}.imiss"

# Accepts plain decimals and scientific notation (e.g. 0.25, .5, 1e-3).
number_re='^[+-]?([0-9]+\.?[0-9]*|\.[0-9]+)([eE][+-]?[0-9]+)?$'

# NOTE(review): newer vcftools releases may name this option --missing-indv;
# confirm against the installed version before changing it.
if ! vcftools --vcf "$vcf" --missing --out "$out_prefix"; then
  printf 'error: vcftools --missing failed for %s\n' "$vcf" >&2
  exit 1
fi

# Skip the header by position (NR > 1) rather than by pattern, so that
# individuals whose names happen to contain "IN" are not dropped.
if ! mawk -F'\t' 'NR > 1 { print $5 }' "$imiss_file" > totalmissing; then
  printf 'error: could not read %s\n' "$imiss_file" >&2
  exit 1
fi

if [[ ! -s totalmissing ]]; then
  printf 'error: no individuals found in %s\n' "$imiss_file" >&2
  exit 1
fi

# Values are sorted from most to least missing; the suggested cutoff is the
# entry 14% of the way down that list.
cutoff=$(sort -rn totalmissing | perl -e '$d=.14;@l=<>;print $l[int($d*$#l)]')

# The histogram is only a visual aid, so a gnuplot failure is not fatal.
# The quoted delimiter keeps $1 literal for gnuplot's column reference.
gnuplot << 'EOF' || printf 'warning: gnuplot failed; histogram skipped\n' >&2
set terminal dumb size 120, 30
set autoscale
unset label
set title "Histogram of % missing data per individual"
set ylabel "Number of Occurrences"
set xlabel "% of missing data"
#set yr [0:100000]
binwidth=0.01
bin(x,width)=width*floor(x/width) + binwidth/2.0
plot 'totalmissing' using (bin($1,binwidth)):(1.0) smooth freq with boxes
pause -1
EOF

printf 'The 85%% cutoff would be %s\n' "$cutoff"
echo "Would you like to set a different cutoff, yes or no"

# Any answer other than "yes" (including end-of-input) keeps the suggestion.
answer=
read -r answer

if [[ "$answer" == "yes" ]]; then
  echo "Please enter new cutoff"
  cutoff=
  read -r cutoff
  if [[ ! "$cutoff" =~ $number_re ]]; then
    printf 'error: cutoff must be a number, got "%s"\n' "$cutoff" >&2
    exit 1
  fi
fi

# "+ 0" forces a numeric comparison; NR > 1 keeps the header row out of the
# removal list.
if ! mawk -F'\t' -v x="$cutoff" 'NR > 1 && $5 + 0 > x + 0 { print $1 }' \
    "$imiss_file" > lowDP.indv; then
  printf 'error: could not build lowDP.indv from %s\n' "$imiss_file" >&2
  exit 1
fi

if ! vcftools --vcf "$vcf" --remove lowDP.indv --recode --recode-INFO-all \
    --out "$out_prefix"; then
  printf 'error: vcftools --recode failed for %s\n' "$vcf" >&2
  exit 1
fi