1#!/usr/local/bin/bash 2 3usage="\ 4Usage: 5 $(basename "$0") -P stacks_dir -B bam_file -O out_dir 6 7Extracts the coordinates of the RAD loci from the given BAM file into a 8'locus_coordinates.tsv' table, then rewrites the 'catalog.fa.gz' and 9'catalog.calls' files so that they include the genomic coordinates given in the 10input BAM file. 11" 12 13version="_VERSION_" 14 15# Check dependencies 16# ========== 17 18command -v python3 &>/dev/null || { python3; echo "Error: Python 3 is required." >&2; exit 1; } 19command -v samtools &>/dev/null || { samtools; echo "Error: Samtools is required." >&2; exit 1; } 20 21# Parse arguments 22# ========== 23 24# If there aren't any arguments, print the help. 25[[ $# -gt 0 ]] || { echo -n "$usage"; exit 1; } 26 27# Check that there aren't weird characters in the input. 28for arg in "$@" ;do 29 if [[ "$arg" =~ [^[:alnum:]_.,/~+=:-] ]] ;then 30 echo "Error: illegal character in argument '$arg'." >&2 31 exit 1 32 fi 33done 34 35# Parse arguments. 36while getopts 'P:B:O:hv' opt ;do 37 case "$opt" in 38 P) stacks_d="${OPTARG/%\//}" ;; 39 B) bam_f="${OPTARG/%\//}" ;; 40 O) out_d="${OPTARG/%\//}" ;; 41 h) echo -n "$usage" >&2; exit 1 ;; 42 v) echo "$version" >&2; exit 1 ;; 43 *) echo "Error: Bad arguments (-h for help)." >&2; exit 1 ;; 44 esac 45done 46shift $((OPTIND-1)) 47if [[ $# -ne 0 ]] ;then 48 echo "Error: '$1' is seen as a positional argument (expected no positional arguments)." >&2 49 exit 1 50fi 51 52# Check them. 53[[ "$stacks_d" ]] || { echo -n "Error: -P is required." >&2; exit 1; } 54[[ "$bam_f" ]] || { echo -n "Error: -B is required." >&2; exit 1; } 55[[ "$out_d" ]] || { echo -n "Error: -O is required." >&2; exit 1; } 56 57[[ -e "$stacks_d" ]] || { ls -- "$stacks_d"; exit 1; } 58[[ -e "$bam_f" ]] || { ls -- "$bam_f"; exit 1; } 59 60fa="$stacks_d/catalog.fa.gz" 61vcf="$stacks_d/catalog.calls" 62[[ -e "$fa" ]] || { ls -- "$fa"; exit 1; } 63[[ -e "$vcf" ]] || { ls -- "$vcf"; exit 1; } 64 65o_coords="$out_d/locus_coordinates.tsv" 66o_fa="$out_d/catalog.fa.gz" 67o_vcf="$out_d/catalog.calls" 68[[ ! -e "$o_coords" ]] || { echo "Error: Refusing to overwrite '$o_coords'." >&2; exit 1; } 69[[ ! -e "$o_fa" ]] || { echo "Error: Refusing to overwrite '$o_fa'." >&2; exit 1; } 70[[ ! -e "$o_vcf" ]] || { echo "Error: Refusing to overwrite '$o_vcf'." >&2; exit 1; } 71 72# If something goes wrong, stop & print an error. 73set -e -o pipefail 74trap "echo 'Error: aborted.' >&2" EXIT 75 76# Make sure we have write permissions. 77mkdir -p "$out_d" 78touch "$o_fa" "$o_vcf" 79 80# Retrieve the highest ID in the catalog. 81# ========== 82 83echo "Finding the highest current locus ID..." 84id=$(gzip -cd "$fa" | tail -n2 | head -n1 | awk '{print $1}') 85if [[ ! "$id" =~ ^\>[0-9]+$ ]] ;then 86 echo "Error: Unexpected format in '$fa'; the second to last line was:" >&2 87 gzip -cd "$fa" | tail -n2 | head -n1 88 exit 1 89fi 90id=$(echo "$id" | cut -c2-) 91echo "$id" 92echo 93 94# Write the old_id : new_id : pos file. 95# ========== 96 97echo "Extracting locus coordinates..." 98samtools view -F 0x904 "$bam_f" | 99 # Convert SAM to `LOCUS_ID \t CHR \t POS \t STRAND`. 100 awk -F '\t' -v OFS='\t' '{ 101 if (int( $2 % (2*16) / 16 )) { 102 # Negative strand. 103 pos = $4 - 1 104 gsub(/[0-9]+[ISH]/, "", $6) 105 gsub(/[A-Z]/, ",", $6) 106 n = split(substr($6, 1, length($6)-1), cig, ",") 107 for (i=1; i<=n; ++i) { pos += cig[i] } 108 print $1, $3, pos, "-" 109 } else { 110 print $1, $3, $4, "+" 111 } 112 }' | 113 # Sort by chromosome. 114 sort -k2,2 | 115 # Pull the reference chromosome order from the BAM header. 116 join -t $'\t' -12 -22 -o "1.1 1.2 2.1 1.3 1.4" \ 117 - \ 118 <(samtools view -H "$bam_f" | 119 grep '^@SQ' | grep -oE 'SN:[^\t]+' | cut -c4- | 120 awk '{printf NR"\t"$1"\n"}' | 121 sort -k2,2 122 ) | 123 # Sort by chromosome index, bp & strand, then remove the index. 124 sort -k3,3n -k4,4n -k5,5r | cut -f 1,2,4,5 | 125 # Create the new IDs (we start at the power of 10 above the current max). 126 awk 'BEGIN{id = '"$id"' + 1} {printf id++ "\t" $1 "\t" $2 ":" $3 ":" $4 "\n"}' | 127 # Add a header. 128 { echo -e 'id_new\tid_old\taln_pos'; cat; } \ 129 > "$o_coords" 130 131[[ $(wc -l < "$o_coords") -gt 1 ]] || { echo "Error: Extraction of coordinates failed; check BAM file." >&2; exit 1; } 132echo "Wrote '$o_coords'." 133echo 134 135# Write the new FASTA file. 136# ========== 137 138echo "Rewriting locus sequences and information..." 139gzip -cd "$fa" | 140 tr -d '\r' | paste -d '\r' - - | sed 's/^>//' | 141 awk -F' ' -v OFS=' ' ' 142 BEGIN{ 143 while(getline < "'"$o_coords"'") { 144 if (! /^id_new/) { 145 old2new[$2] = $1 146 old2pos[$2] = $3 147 } 148 } 149 } { 150 old_id = $1 151 if (old_id in old2new) { 152 $1 = old2new[old_id] " pos=" old2pos[old_id] 153 print 154 } 155 }' | 156 sort -k1,1n | 157 sed 's/^/>/' | tr '\r' '\n' | 158 gzip \ 159 > "$o_fa" 160 161echo "Wrote '$o_fa'." 162echo 163 164# Write the new VCF file. 165# ========== 166 167echo "Rewriting variants..." 168{ 169# Copy the header. 170{ gzip -cd "$vcf" 2>/dev/null || true; } | sed '/^#CHROM/ q' # (gzip unhappy about the closed pipe). 171 172# Update & resort the records. 173gzip -cd "$vcf" | grep -v '^#' | 174 awk -F'\t' -v OFS='\t' ' 175 BEGIN{ 176 while(getline < "'"$o_coords"'") { 177 if (! /^id_new/) { 178 old2new[$2] = $1 179 } 180 } 181 } { 182 old_id = $1 183 if (old_id in old2new) { 184 $1 = old2new[old_id] 185 print 186 } 187 }' | 188 sort -k1,1n -k2,2n 189} | gzip \ 190> "$o_vcf" 191 192echo "Wrote '$o_vcf'." 193echo 194 195trap - EXIT 196echo "$(basename "$0") is done." 197