1#!/usr/bin/env bash
2
3# Runs performance tests - compares Edlib with other aligners (Parasail, Seqan, Myers).
4
5EDLIB=~/git/edlib/build/bin/edlib-aligner
6
7PARASAIL=~/git/parasail/apps/parasail_aligner
8PARASAIL_FLAGS="-t 1 -d -e 1 -o 1 -M 0 -X 1"
9
10# Here I am using Seqan through modified edlib aligner. It can be found in seqan-test branch of edlib repo.
11SEQAN=~/git/edlib-seqan/src/aligner
12
13MYERS=~/Dropbox/Mile/SIMD_alignment/Myers/myers_98_martin/myers
14
15TEST_DATA=.
16
17########################## TEST RUNNERS ########################
18
19function edlib {
20    mode=$1
21    query=$2
22    target=$3
23    num_tests=$4
24    k=$5
25
26    time_sum=0
27    for i in $(seq $num_tests); do
28        sleep 1
29        output=$($EDLIB -m $mode -k $k $query $target)
30        time=$(echo "$output" | grep "Cpu time of searching" | cut -d " " -f5)
31        score=$(echo "$output" | grep "#0:" | cut -d " " -f2)
32        time_sum=$(python -c "print($time_sum + $time)")
33        echo ">" "#"$i $score $time
34    done
35    avg_time=$(python -c "print($time_sum / $num_tests)")
36    echo "Edlib:" $avg_time $score
37}
38
39function edlib_path {
40    mode=$1
41    query=$2
42    target=$3
43    num_tests=$4
44
45    time_sum=0
46    for i in $(seq $num_tests); do
47        sleep 1
48        output=$($EDLIB -m $mode -p -s $query $target)
49        time=$(echo "$output" | grep "Cpu time of searching" | cut -d " " -f5)
50        time_sum=$(python -c "print($time_sum + $time)")
51        echo ">" "#"$i $time
52    done
53    avg_time=$(python -c "print($time_sum / $num_tests)")
54    echo "Edlib (path):" $avg_time
55}
56
57function seqan {
58    mode=$1
59    query=$2
60    target=$3
61    num_tests=$4
62
63    time_sum=0
64    for i in $(seq $num_tests); do
65        sleep 1
66        output=$($SEQAN -m $mode -t $query $target)
67        time=$(echo "$output" | grep "Cpu time of searching" | cut -d " " -f5)
68        score=$(($(echo "$output" | grep "Seqan Score:" | cut -d " " -f4) * -1))
69        time_sum=$(python -c "print($time_sum + $time)")
70        echo ">" "#"$i $score $time
71    done
72    avg_time=$(python -c "print($time_sum / $num_tests)")
73    echo "Seqan:" $avg_time $score
74}
75
76function seqan_path {
77    mode=$1
78    query=$2
79    target=$3
80    num_tests=$4
81
82    time_sum=0
83    for i in $(seq $num_tests); do
84        sleep 1
85        output=$($SEQAN -m $mode -t -p -s $query $target)
86        time=$(echo "$output" | grep "Cpu time of searching" | cut -d " " -f5)
87        score=$(($(echo "$output" | grep "Seqan Score:" | cut -d " " -f4) * -1))
88        time_sum=$(python -c "print($time_sum + $time)")
89        echo ">" "#"$i $score $time
90    done
91    avg_time=$(python -c "print($time_sum / $num_tests)")
92    echo "Seqan (path):" $score $avg_time
93}
94
95function parasail {
96    query=$1
97    target=$2
98    num_tests=$3
99
100    time_sum=0
101    for i in $(seq $num_tests); do
102        sleep 1
103        output=$($PARASAIL $PARASAIL_FLAGS -a nw_striped_32 -f $target -q $query)
104        time=$(echo "$output" | grep "alignment time" | cut -d ":" -f2 | cut -d " " -f2)
105        score=$(($(head -n 1 parasail.csv | cut -d "," -f5) * -1))
106        rm parasail.csv
107        time_sum=$(python -c "print($time_sum + $time)")
108        echo ">" "#"$i $score $time
109    done
110    avg_time=$(python -c "print($time_sum / $num_tests)")
111    echo "Parasail:" $avg_time $score
112}
113
114function myers {
115    query=$1
116    target=$2
117    num_tests=$3
118    k=$4
119
120    time_sum=0
121    for i in $(seq $num_tests); do
122        sleep 1
123        tail -n +2 $query | tr -d '\n' > queryMyers.fasta
124        tail -n +2 $target | tr -d '\n' > targetMyers.fasta
125        output=$({ time -p $MYERS $(cat queryMyers.fasta) $k targetMyers.fasta; } 2>&1)
126        rm queryMyers.fasta targetMyers.fasta
127        time=$(echo "$output" | grep "real" | cut -d " " -f2)
128        time_sum=$(python -c "print($time_sum + $time)")
129        echo ">" "#"$i $time
130    done
131    avg_time=$(python -c "print($time_sum / $num_tests)")
132    echo "Myers:" $avg_time
133
134}
135
136
137############################ TESTS #############################
138
139
140#Enterobacteria
141echo -e "\nEnterobacteria, NW"
142target=$TEST_DATA/Enterobacteria_Phage_1/Enterobacteria_phage_1.fasta
143for query in $(ls $TEST_DATA/Enterobacteria_Phage_1/mutated_*_perc.fasta); do
144    echo $query
145
146    edlib NW $query $target 3 -1
147    edlib_path NW $query $target 3
148    seqan NW $query $target 3
149    seqan_path NW $query $target 3
150    parasail $query $target 3
151done
152
153
154#E coli and its illumina read, HW
155echo -e "\nE coli and its illumina read, HW"
156target=$TEST_DATA/E_coli_DH1/e_coli_DH1.fasta
157for query in $(ls $TEST_DATA/E_coli_DH1/mason_illumina_reads/10kbp/*.fasta); do
158    echo $query
159
160    edlib HW $query $target 3 -1
161    edlib_path HW $query $target 3
162    seqan HW $query $target 3
163#    seqan_path HW $query $target 3   # Fails because it allocates too much memory.
164done
165
166
167#E coli and its prefix, SHW
168echo -e "\nE coli and its prefix, SHW"
169target=$TEST_DATA/E_coli_DH1/e_coli_DH1.fasta
170for query in $(ls $TEST_DATA/E_coli_DH1/prefixes/10kbp/mutated_*_perc.fasta); do
171    echo $query
172
173    edlib SHW $query $target 3 -1
174    edlib_path SHW $query $target 3
175    seqan SHW $query $target 3
176#    seqan_path SHW $query $target 3   # Fails because it allocates too much memory.
177done
178
179
180#Chromosome
181echo -e "\nChromosome, NW"
182target=$TEST_DATA/Chromosome_2890043_3890042_0/Chromosome_2890043_3890042_0.fasta
183for query in $(ls $TEST_DATA/Chromosome_2890043_3890042_0/mutated_*_perc.fasta); do
184    echo $query
185
186    edlib NW $query $target 3 -1
187    edlib_path NW $query $target 3
188    seqan NW $query $target 3
189    seqan_path NW $query $target 3
190    parasail $query $target 3
191done
192
193
194################### Myers #####################
195echo -e "\nMyers"
196target=$TEST_DATA/E_coli_DH1/e_coli_DH1.fasta
197
198k=100
199for query_file in e_coli_DH1_illumina_1x10000.fasta; do
200    query=$TEST_DATA/E_coli_DH1/mason_illumina_reads/10kbp/$query_file
201    echo $query $k
202    edlib HW $query $target 3 $k
203    myers $query $target 3 $k
204done
205
206k=1000
207for query_file in e_coli_DH1_illumina_1x10000.fasta mutated_97_perc.fasta mutated_94_perc.fasta mutated_90_perc.fasta; do
208    query=$TEST_DATA/E_coli_DH1/mason_illumina_reads/10kbp/$query_file
209    echo $query $k
210    edlib HW $query $target 3 $k
211    myers $query $target 3 $k
212done
213
214k=10000
215for query in $(ls $TEST_DATA/E_coli_DH1/mason_illumina_reads/10kbp/*); do
216    echo $query $k
217    edlib HW $query $target 3 $k
218    myers $query $target 3 $k
219done
220