1#!/usr/local/bin/bash 2export LC_ALL=en_US.UTF-8 3 4#check for vcftools version 5VCFTV=$(vcftools | grep VCF | grep -oh '[0-9]*[a-z]*)$' | sed 's/[a-z)]//') 6 if [ "$VCFTV" -lt "10" ]; then 7 echo "The version of VCFtools installed in your" '$PATH' "is not optimized for dDocent." 8 echo "Please install at least version 0.1.11" 9 exit 1 10 elif [ "$VCFTV" -lt "13" ]; then 11 VCFMISSINGFLAG="--missing" 12 elif [ "$VCFTV" -ge "13" ]; then 13 VCFMISSINGFLAG="--missing-indv" 14 fi 15 16if [[ -z "$2" ]]; then 17echo "Usuage is filter_missing_ind.sh vcf_file name_prefix_for_new_vcf_file" 18exit 1 19fi 20 21vcftools --vcf $1 $VCFMISSINGFLAG --out $2 22 23CUTOFF=$(mawk '!/IN/' $2.imiss | cut -f5 | sort -rn | perl -e '$d=.14;@l=<>;print $l[int($d*$#l)]') 24 25mawk '!/IN/' $2.imiss | cut -f5 > totalmissing 26 27gnuplot << \EOF 28set terminal dumb size 120, 30 29set autoscale 30unset label 31set title "Histogram of % missing data per individual" 32set ylabel "Number of Occurrences" 33set xlabel "% of missing data" 34#set yr [0:100000] 35binwidth=0.01 36bin(x,width)=width*floor(x/width) + binwidth/2.0 37set lmargin 10 38plot 'totalmissing' using (bin($1,binwidth)):(1.0) smooth freq with boxes 39pause -1 40EOF 41 42if [[ -z "$3" ]]; then 43 echo "The 85% cutoff would be" $CUTOFF 44 echo "Would you like to set a different cutoff, yes or no" 45 46 read NEWCUTOFF 47else 48 NEWCUTOFF=$3 49fi 50 51if [ "$NEWCUTOFF" != "yes" ]; then 52 53 mawk -v x=$CUTOFF '$5 > x' $2.imiss | cut -f1 > lowDP.indv 54 55 vcftools --vcf $1 --remove lowDP.indv --recode --recode-INFO-all --out $2 56 57else 58 if [[ -z "$4" ]]; then 59 echo "Please enter new cutoff" 60 read CUTOFF2 61 62 else 63 CUTOFF2=$4 64 fi 65 CUTPRINT=$(python -c "print($CUTOFF2 * 100)") 66 echo "All individuals with more than" $CUTPRINT"% missing data will be removed." 67 mawk -v x=$CUTOFF2 '$5 > x' $2.imiss | cut -f1 > lowDP.indv 68 69vcftools --vcf $1 --remove lowDP.indv --recode --recode-INFO-all --out $2 70fi 71