1#!/usr/local/bin/bash
2
3usage="\
4Usage:
5  $(basename "$0") -P stacks_dir -B bam_file -O out_dir
6
7Extracts the coordinates of the RAD loci from the given BAM file into a
8'locus_coordinates.tsv' table, then rewrites the 'catalog.fa.gz' and
9'catalog.calls' files so that they include the genomic coordinates given in the
10input BAM file.
11"
12
13version="_VERSION_"
14
15# Check dependencies
16# ==========
17
18command -v python3 &>/dev/null  || { python3; echo "Error: Python 3 is required." >&2; exit 1; }
19command -v samtools &>/dev/null || { samtools; echo "Error: Samtools is required." >&2; exit 1; }
20
21# Parse arguments
22# ==========
23
24# If there aren't any arguments, print the help.
25[[ $# -gt 0 ]] || { echo -n "$usage"; exit 1; }
26
27# Check that there aren't weird characters in the input.
28for arg in "$@" ;do
29    if [[ "$arg" =~ [^[:alnum:]_.,/~+=:-] ]] ;then
30        echo "Error: illegal character in argument '$arg'." >&2
31        exit 1
32    fi
33done
34
35# Parse arguments.
36while getopts 'P:B:O:hv' opt ;do
37    case "$opt" in
38    P) stacks_d="${OPTARG/%\//}" ;;
39    B) bam_f="${OPTARG/%\//}" ;;
40    O) out_d="${OPTARG/%\//}" ;;
41    h) echo -n "$usage" >&2; exit 1 ;;
42    v) echo "$version" >&2; exit 1 ;;
43    *) echo "Error: Bad arguments (-h for help)." >&2; exit 1 ;;
44    esac
45done
46shift $((OPTIND-1))
47if [[ $# -ne 0 ]] ;then
48    echo "Error: '$1' is seen as a positional argument (expected no positional arguments)." >&2
49    exit 1
50fi
51
52# Check them.
53[[ "$stacks_d" ]] || { echo -n "Error: -P is required." >&2; exit 1; }
54[[ "$bam_f" ]]    || { echo -n "Error: -B is required." >&2; exit 1; }
55[[ "$out_d" ]]    || { echo -n "Error: -O is required." >&2; exit 1; }
56
57[[ -e "$stacks_d" ]] || { ls -- "$stacks_d"; exit 1; }
58[[ -e "$bam_f" ]]    || { ls -- "$bam_f"; exit 1; }
59
60fa="$stacks_d/catalog.fa.gz"
61vcf="$stacks_d/catalog.calls"
62[[ -e "$fa" ]]  || { ls -- "$fa"; exit 1; }
63[[ -e "$vcf" ]] || { ls -- "$vcf"; exit 1; }
64
65o_coords="$out_d/locus_coordinates.tsv"
66o_fa="$out_d/catalog.fa.gz"
67o_vcf="$out_d/catalog.calls"
68[[ ! -e "$o_coords" ]]  || { echo "Error: Refusing to overwrite '$o_coords'." >&2; exit 1; }
69[[ ! -e "$o_fa" ]]  || { echo "Error: Refusing to overwrite '$o_fa'." >&2; exit 1; }
70[[ ! -e "$o_vcf" ]] || { echo "Error: Refusing to overwrite '$o_vcf'." >&2; exit 1; }
71
72# If something goes wrong, stop & print an error.
73set -e -o pipefail
74trap "echo 'Error: aborted.' >&2" EXIT
75
76# Make sure we have write permissions.
77mkdir -p "$out_d"
78touch "$o_fa" "$o_vcf"
79
80# Retrieve the highest ID in the catalog.
81# ==========
82
83echo "Finding the highest current locus ID..."
84id=$(gzip -cd "$fa" | tail -n2 | head -n1 | awk '{print $1}')
85if [[ ! "$id" =~ ^\>[0-9]+$ ]] ;then
86    echo "Error: Unexpected format in '$fa'; the second to last line was:" >&2
87    gzip -cd "$fa" | tail -n2 | head -n1
88    exit 1
89fi
90id=$(echo "$id" | cut -c2-)
91echo "$id"
92echo
93
94# Write the old_id : new_id : pos file.
95# ==========
96
97echo "Extracting locus coordinates..."
98samtools view -F 0x904 "$bam_f" |
99    # Convert SAM to `LOCUS_ID \t CHR \t POS \t STRAND`.
100    awk -F '\t' -v OFS='\t' '{
101            if (int( $2 % (2*16) / 16 )) {
102                # Negative strand.
103                pos = $4 - 1
104                gsub(/[0-9]+[ISH]/, "", $6)
105                gsub(/[A-Z]/, ",", $6)
106                n = split(substr($6, 1, length($6)-1), cig, ",")
107                for (i=1; i<=n; ++i) { pos += cig[i] }
108                print $1, $3, pos, "-"
109            } else {
110                print $1, $3, $4, "+"
111            }
112        }' |
113    # Sort by chromosome.
114    sort -k2,2 |
115    # Pull the reference chromosome order from the BAM header.
116    join -t $'\t' -12 -22 -o "1.1 1.2 2.1 1.3 1.4" \
117        - \
118        <(samtools view -H "$bam_f" |
119            grep '^@SQ' | grep -oE 'SN:[^\t]+' | cut -c4- |
120            awk '{printf NR"\t"$1"\n"}' |
121            sort -k2,2
122        ) |
123    # Sort by chromosome index, bp & strand, then remove the index.
124    sort -k3,3n -k4,4n -k5,5r | cut -f 1,2,4,5 |
125    # Create the new IDs (we start at the power of 10 above the current max).
126    awk 'BEGIN{id = '"$id"' + 1} {printf id++ "\t" $1 "\t" $2 ":" $3 ":" $4 "\n"}' |
127    # Add a header.
128    { echo -e 'id_new\tid_old\taln_pos'; cat; } \
129    > "$o_coords"
130
131[[ $(wc -l < "$o_coords") -gt 1 ]] || { echo "Error: Extraction of coordinates failed; check BAM file." >&2; exit 1; }
132echo "Wrote '$o_coords'."
133echo
134
135# Write the new FASTA file.
136# ==========
137
138echo "Rewriting locus sequences and information..."
139gzip -cd "$fa" |
140    tr -d '\r' | paste -d '\r' - - | sed 's/^>//' |
141    awk -F' ' -v OFS=' ' '
142        BEGIN{
143            while(getline < "'"$o_coords"'") {
144                if (! /^id_new/) {
145                    old2new[$2] = $1
146                    old2pos[$2] = $3
147                }
148            }
149        } {
150            old_id = $1
151            if (old_id in old2new) {
152                $1 = old2new[old_id] " pos=" old2pos[old_id]
153                print
154            }
155        }' |
156    sort -k1,1n |
157    sed 's/^/>/' | tr '\r' '\n' |
158    gzip \
159    > "$o_fa"
160
161echo "Wrote '$o_fa'."
162echo
163
164# Write the new VCF file.
165# ==========
166
167echo "Rewriting variants..."
168{
169# Copy the header.
170{ gzip -cd "$vcf" 2>/dev/null || true; } | sed '/^#CHROM/ q' # (gzip unhappy about the closed pipe).
171
172# Update & resort the records.
173gzip -cd "$vcf" | grep -v '^#' |
174    awk -F'\t' -v OFS='\t' '
175        BEGIN{
176            while(getline < "'"$o_coords"'") {
177                if (! /^id_new/) {
178                    old2new[$2] = $1
179                }
180            }
181        } {
182            old_id = $1
183            if (old_id in old2new) {
184                $1 = old2new[old_id]
185                print
186            }
187       }' |
188    sort -k1,1n -k2,2n
189} | gzip \
190> "$o_vcf"
191
192echo "Wrote '$o_vcf'."
193echo
194
195trap - EXIT
196echo "$(basename "$0") is done."
197