1#!/usr/local/bin/bash
2export LC_ALL=en_US.UTF-8
3
4#check for vcftools version
5VCFTV=$(vcftools | grep VCF | grep -oh '[0-9]*[a-z]*)$' | sed 's/[a-z)]//')
6        if [ "$VCFTV" -lt "10" ]; then
7                echo "The version of VCFtools installed in your" '$PATH' "is not optimized for dDocent."
8                echo "Please install at least version 0.1.11"
9                exit 1
10        elif [ "$VCFTV" -lt "13" ]; then
11                VCFMISSINGFLAG="--missing"
12        elif [ "$VCFTV" -ge "13" ]; then
13                VCFMISSINGFLAG="--missing-indv"
14        fi
15
16if [[ -z "$2" ]]; then
17echo "Usuage is filter_missing_ind.sh vcf_file name_prefix_for_new_vcf_file"
18exit 1
19fi
20
21vcftools --vcf $1 $VCFMISSINGFLAG --out $2
22
23CUTOFF=$(mawk '!/IN/' $2.imiss | cut -f5 | sort -rn | perl -e '$d=.14;@l=<>;print $l[int($d*$#l)]')
24
25mawk '!/IN/' $2.imiss | cut -f5 > totalmissing
26
27gnuplot << \EOF
28set terminal dumb size 120, 30
29set autoscale
30unset label
31set title "Histogram of % missing data per individual"
32set ylabel "Number of Occurrences"
33set xlabel "% of missing data"
34#set yr [0:100000]
35binwidth=0.01
36bin(x,width)=width*floor(x/width) + binwidth/2.0
37set lmargin 10
38plot 'totalmissing' using (bin($1,binwidth)):(1.0) smooth freq with boxes
39pause -1
40EOF
41
42if [[ -z "$3" ]]; then
43	echo "The 85% cutoff would be" $CUTOFF
44	echo "Would you like to set a different cutoff, yes or no"
45
46	read NEWCUTOFF
47else
48	NEWCUTOFF=$3
49fi
50
51if [ "$NEWCUTOFF" != "yes" ]; then
52
53	mawk -v x=$CUTOFF '$5 > x' $2.imiss | cut -f1 > lowDP.indv
54
55	vcftools --vcf $1 --remove lowDP.indv --recode --recode-INFO-all --out $2
56
57else
58	if [[ -z "$4" ]]; then
59		echo "Please enter new cutoff"
60		read CUTOFF2
61
62	else
63		CUTOFF2=$4
64	fi
65	CUTPRINT=$(python -c "print($CUTOFF2 * 100)")
66	echo "All individuals with more than" $CUTPRINT"% missing data will be removed."
67	mawk -v x=$CUTOFF2 '$5 > x' $2.imiss | cut -f1 > lowDP.indv
68
69vcftools --vcf $1 --remove lowDP.indv --recode --recode-INFO-all --out $2
70fi
71