1
2 /* Implementation of the Wozniak "anti-diagonal" vectorization
3 strategy for Smith-Waterman comparison, Wozniak (1997) Comp.
4 Appl. Biosci. 13:145-150
5
6 November, 2004
7 */
8
9 /*
10 Written by Erik Lindahl, Stockholm Bioinformatics Center, 2004.
11 Please send bug reports and/or suggestions to lindahl@sbc.su.se.
12 */
13
14 #include <stdio.h>
15
16 #include "defs.h"
17 #include "param.h"
18 #include "dropgsw2.h"
19
20 #ifdef SW_ALTIVEC
21
22 int
smith_waterman_altivec_word(unsigned char * query_sequence,unsigned short * query_profile_word,int query_length,unsigned char * db_sequence,int db_length,unsigned short bias,unsigned short gap_open,unsigned short gap_extend,struct f_struct * f_str)23 smith_waterman_altivec_word(unsigned char * query_sequence,
24 unsigned short * query_profile_word,
25 int query_length,
26 unsigned char * db_sequence,
27 int db_length,
28 unsigned short bias,
29 unsigned short gap_open,
30 unsigned short gap_extend,
31 struct f_struct * f_str)
32 {
33 int i,j,k;
34 unsigned short * p;
35 unsigned short score;
36 unsigned char * p_dbseq;
37 int alphabet_size = f_str->alphabet_size;
38 unsigned short * workspace = (unsigned short *)f_str->workspace;
39
40 vector unsigned short Fup,Hup1,Hup2,E,F,H,tmp;
41 vector unsigned char perm;
42 vector unsigned short v_maxscore;
43 vector unsigned short v_bias,v_gapopen,v_gapextend;
44 vector unsigned short v_score;
45 vector unsigned short v_score_q1;
46 vector unsigned short v_score_q2;
47 vector unsigned short v_score_q3;
48 vector unsigned short v_score_load;
49 vector unsigned char queue1_to_score = (vector unsigned char)(16,17,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
50 vector unsigned char queue2_to_queue1 = (vector unsigned char)(0,1,18,19,4,5,6,7,8,9,10,11,12,13,14,15);
51 vector unsigned char queue3_to_queue2 = (vector unsigned char)(16,16,16,16,16,21,16,0,16,1,16,2,16,3,16,4);
52 vector unsigned char queue3_with_load = (vector unsigned char)(23,5,6,7,8,25,9,10,11,27,12,13,29,14,31,16);
53
54 /* Load the bias to all elements of a constant */
55 v_bias = vec_lde(0,&bias);
56 perm = vec_lvsl(0,&bias);
57 v_bias = vec_perm(v_bias,v_bias,perm);
58 v_bias = vec_splat(v_bias,0);
59
60 /* Load gap opening penalty to all elements of a constant */
61 v_gapopen = vec_lde(0,&gap_open);
62 perm = vec_lvsl(0,&gap_open);
63 v_gapopen = vec_perm(v_gapopen,v_gapopen,perm);
64 v_gapopen = vec_splat(v_gapopen,0);
65
66 /* Load gap extension penalty to all elements of a constant */
67 v_gapextend = vec_lde(0,&gap_extend);
68 perm = vec_lvsl(0,&gap_extend);
69 v_gapextend = vec_perm(v_gapextend,v_gapextend,perm);
70 v_gapextend = vec_splat(v_gapextend,0);
71
72 v_maxscore = vec_xor(v_maxscore,v_maxscore);
73
74 // Zero out the storage vector
75 k = 2*(db_length+7);
76
77 for(i=0,j=0;i<k;i++,j+=16)
78 {
79 // borrow the zero value in v_maxscore to have something to store
80 vec_st(v_maxscore,j,workspace);
81 }
82
83 for(i=0;i<query_length;i+=8)
84 {
85 // fetch first data asap.
86 p_dbseq = db_sequence;
87 k = *p_dbseq++;
88 v_score_load = vec_ld(16*k,query_profile_word);
89
90 // zero lots of stuff.
91 // We use both the VPERM and VSIU unit to knock off some cycles.
92
93 E = vec_splat_u16(0);
94 F = vec_xor(F,F);
95 H = vec_splat_u16(0);
96 Hup2 = vec_xor(Hup2,Hup2);
97 v_score_q1 = vec_splat_u16(0);
98 v_score_q2 = vec_xor(v_score_q2,v_score_q2);
99 v_score_q3 = vec_splat_u16(0);
100
101 // reset pointers to the start of the saved data from the last row
102 p = workspace;
103
104 // PROLOGUE 1
105 // prefetch next residue
106 k = *p_dbseq++;
107
108 // Create the actual diagonal score vector
109 // and update the queue of incomplete score vectors
110
111 v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
112 v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
113 v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
114 v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
115
116 // prefetch score for next step
117 v_score_load = vec_ld(16*k,query_profile_word);
118
119 // load values of F and H from previous row (one unit up)
120 Fup = vec_ld(0, p);
121 Hup1 = vec_ld(16, p);
122 p += 16; // move ahead 32 bytes
123
124 // shift into place so we have complete F and H vectors
125 // that refer to the values one unit up from each cell
126 // that we are currently working on.
127 Fup = vec_sld(Fup,F,14);
128 Hup1 = vec_sld(Hup1,H,14);
129
130 // do the dynamic programming
131
132 // update E value
133 E = vec_subs(E,v_gapextend);
134 tmp = vec_subs(H,v_gapopen);
135 E = vec_max(E,tmp);
136
137 // update F value
138 F = vec_subs(Fup,v_gapextend);
139 tmp = vec_subs(Hup1,v_gapopen);
140 F = vec_max(F,tmp);
141
142 // add score to H
143 H = vec_adds(Hup2,v_score);
144 H = vec_subs(H,v_bias);
145
146 // set H to max of H,E,F
147 H = vec_max(H,E);
148 H = vec_max(H,F);
149
150 // Save value to use for next diagonal H
151 Hup2 = Hup1;
152
153 // Update highest score encountered this far
154 v_maxscore = vec_max(v_maxscore,H);
155
156
157 // PROLOGUE 2
158 // prefetch next residue
159 k = *p_dbseq++;
160
161 // Create the actual diagonal score vector
162 // and update the queue of incomplete score vectors
163
164 v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
165 v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
166 v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
167 v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
168
169 // prefetch score for next step
170 v_score_load = vec_ld(16*k,query_profile_word);
171
172 // load values of F and H from previous row (one unit up)
173 Fup = vec_ld(0, p);
174 Hup1 = vec_ld(16, p);
175 p += 16; // move ahead 32 bytes
176
177 // shift into place so we have complete F and H vectors
178 // that refer to the values one unit up from each cell
179 // that we are currently working on.
180 Fup = vec_sld(Fup,F,14);
181 Hup1 = vec_sld(Hup1,H,14);
182
183 // do the dynamic programming
184
185 // update E value
186 E = vec_subs(E,v_gapextend);
187 tmp = vec_subs(H,v_gapopen);
188 E = vec_max(E,tmp);
189
190 // update F value
191 F = vec_subs(Fup,v_gapextend);
192 tmp = vec_subs(Hup1,v_gapopen);
193 F = vec_max(F,tmp);
194
195 // add score to H
196 H = vec_adds(Hup2,v_score);
197 H = vec_subs(H,v_bias);
198
199 // set H to max of H,E,F
200 H = vec_max(H,E);
201 H = vec_max(H,F);
202
203 // Save value to use for next diagonal H
204 Hup2 = Hup1;
205
206 // Update highest score encountered this far
207 v_maxscore = vec_max(v_maxscore,H);
208
209
210 // PROLOGUE 3
211 // prefetch next residue
212 k = *p_dbseq++;
213
214 // Create the actual diagonal score vector
215 // and update the queue of incomplete score vectors
216
217 v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
218 v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
219 v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
220 v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
221
222 // prefetch score for next step
223 v_score_load = vec_ld(16*k,query_profile_word);
224
225 // load values of F and H from previous row (one unit up)
226 Fup = vec_ld(0, p);
227 Hup1 = vec_ld(16, p);
228 p += 16; // move ahead 32 bytes
229
230 // shift into place so we have complete F and H vectors
231 // that refer to the values one unit up from each cell
232 // that we are currently working on.
233 Fup = vec_sld(Fup,F,14);
234 Hup1 = vec_sld(Hup1,H,14);
235
236 // do the dynamic programming
237
238 // update E value
239 E = vec_subs(E,v_gapextend);
240 tmp = vec_subs(H,v_gapopen);
241 E = vec_max(E,tmp);
242
243 // update F value
244 F = vec_subs(Fup,v_gapextend);
245 tmp = vec_subs(Hup1,v_gapopen);
246 F = vec_max(F,tmp);
247
248 // add score to H
249 H = vec_adds(Hup2,v_score);
250 H = vec_subs(H,v_bias);
251
252 // set H to max of H,E,F
253 H = vec_max(H,E);
254 H = vec_max(H,F);
255
256 // Save value to use for next diagonal H
257 Hup2 = Hup1;
258
259 // Update highest score encountered this far
260 v_maxscore = vec_max(v_maxscore,H);
261
262
263 // PROLOGUE 4
264 // prefetch next residue
265 k = *p_dbseq++;
266
267 // Create the actual diagonal score vector
268 // and update the queue of incomplete score vectors
269
270 v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
271 v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
272 v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
273 v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
274
275 // prefetch score for next step
276 v_score_load = vec_ld(16*k,query_profile_word);
277
278 // load values of F and H from previous row (one unit up)
279 Fup = vec_ld(0, p);
280 Hup1 = vec_ld(16, p);
281 p += 16; // move ahead 32 bytes
282
283 // shift into place so we have complete F and H vectors
284 // that refer to the values one unit up from each cell
285 // that we are currently working on.
286 Fup = vec_sld(Fup,F,14);
287 Hup1 = vec_sld(Hup1,H,14);
288
289 // do the dynamic programming
290
291 // update E value
292 E = vec_subs(E,v_gapextend);
293 tmp = vec_subs(H,v_gapopen);
294 E = vec_max(E,tmp);
295
296 // update F value
297 F = vec_subs(Fup,v_gapextend);
298 tmp = vec_subs(Hup1,v_gapopen);
299 F = vec_max(F,tmp);
300
301 // add score to H
302 H = vec_adds(Hup2,v_score);
303 H = vec_subs(H,v_bias);
304
305 // set H to max of H,E,F
306 H = vec_max(H,E);
307 H = vec_max(H,F);
308
309 // Save value to use for next diagonal H
310 Hup2 = Hup1;
311
312 // Update highest score encountered this far
313 v_maxscore = vec_max(v_maxscore,H);
314
315
316 // PROLOGUE 5
317 // prefetch next residue
318 k = *p_dbseq++;
319
320 // Create the actual diagonal score vector
321 // and update the queue of incomplete score vectors
322
323 v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
324 v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
325 v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
326 v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
327
328 // prefetch score for next step
329 v_score_load = vec_ld(16*k,query_profile_word);
330
331 // load values of F and H from previous row (one unit up)
332 Fup = vec_ld(0, p);
333 Hup1 = vec_ld(16, p);
334 p += 16; // move ahead 32 bytes
335
336 // shift into place so we have complete F and H vectors
337 // that refer to the values one unit up from each cell
338 // that we are currently working on.
339 Fup = vec_sld(Fup,F,14);
340 Hup1 = vec_sld(Hup1,H,14);
341
342 // do the dynamic programming
343
344 // update E value
345 E = vec_subs(E,v_gapextend);
346 tmp = vec_subs(H,v_gapopen);
347 E = vec_max(E,tmp);
348
349 // update F value
350 F = vec_subs(Fup,v_gapextend);
351 tmp = vec_subs(Hup1,v_gapopen);
352 F = vec_max(F,tmp);
353
354 // add score to H
355 H = vec_adds(Hup2,v_score);
356 H = vec_subs(H,v_bias);
357
358 // set H to max of H,E,F
359 H = vec_max(H,E);
360 H = vec_max(H,F);
361
362 // Save value to use for next diagonal H
363 Hup2 = Hup1;
364
365 // Update highest score encountered this far
366 v_maxscore = vec_max(v_maxscore,H);
367
368
369 // PROLOGUE 6
370 // prefetch next residue
371 k = *p_dbseq++;
372
373 // Create the actual diagonal score vector
374 // and update the queue of incomplete score vectors
375
376 v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
377 v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
378 v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
379 v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
380
381 // prefetch score for next step
382 v_score_load = vec_ld(16*k,query_profile_word);
383
384 // load values of F and H from previous row (one unit up)
385 Fup = vec_ld(0, p);
386 Hup1 = vec_ld(16, p);
387 p += 16; // move ahead 32 bytes
388
389 // shift into place so we have complete F and H vectors
390 // that refer to the values one unit up from each cell
391 // that we are currently working on.
392 Fup = vec_sld(Fup,F,14);
393 Hup1 = vec_sld(Hup1,H,14);
394
395 // do the dynamic programming
396
397 // update E value
398 E = vec_subs(E,v_gapextend);
399 tmp = vec_subs(H,v_gapopen);
400 E = vec_max(E,tmp);
401
402 // update F value
403 F = vec_subs(Fup,v_gapextend);
404 tmp = vec_subs(Hup1,v_gapopen);
405 F = vec_max(F,tmp);
406
407 // add score to H
408 H = vec_adds(Hup2,v_score);
409 H = vec_subs(H,v_bias);
410
411 // set H to max of H,E,F
412 H = vec_max(H,E);
413 H = vec_max(H,F);
414
415 // Save value to use for next diagonal H
416 Hup2 = Hup1;
417
418 // Update highest score encountered this far
419 v_maxscore = vec_max(v_maxscore,H);
420
421
422 // PROLOGUE 7
423 // prefetch next residue
424 k = *p_dbseq++;
425
426 // Create the actual diagonal score vector
427 // and update the queue of incomplete score vectors
428
429 v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
430 v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
431 v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
432 v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
433
434 // prefetch score for next step
435 v_score_load = vec_ld(16*k,query_profile_word);
436
437 // load values of F and H from previous row (one unit up)
438 Fup = vec_ld(0, p);
439 Hup1 = vec_ld(16, p);
440 p += 16; // move ahead 32 bytes
441
442 // shift into place so we have complete F and H vectors
443 // that refer to the values one unit up from each cell
444 // that we are currently working on.
445 Fup = vec_sld(Fup,F,14);
446 Hup1 = vec_sld(Hup1,H,14);
447
448 // do the dynamic programming
449
450 // update E value
451 E = vec_subs(E,v_gapextend);
452 tmp = vec_subs(H,v_gapopen);
453 E = vec_max(E,tmp);
454
455 // update F value
456 F = vec_subs(Fup,v_gapextend);
457 tmp = vec_subs(Hup1,v_gapopen);
458 F = vec_max(F,tmp);
459
460 // add score to H
461 H = vec_adds(Hup2,v_score);
462 H = vec_subs(H,v_bias);
463
464 // set H to max of H,E,F
465 H = vec_max(H,E);
466 H = vec_max(H,F);
467
468 // Save value to use for next diagonal H
469 Hup2 = Hup1;
470
471 // Update highest score encountered this far
472 v_maxscore = vec_max(v_maxscore,H);
473
474
475 // PROLOGUE 8
476 // prefetch next residue
477 k = *p_dbseq++;
478
479 // Create the actual diagonal score vector
480 // and update the queue of incomplete score vectors
481
482 v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
483 v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
484 v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
485 v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
486
487 // prefetch score for next step
488 v_score_load = vec_ld(16*k,query_profile_word);
489
490 // load values of F and H from previous row (one unit up)
491 Fup = vec_ld(0, p);
492 Hup1 = vec_ld(16, p);
493 p += 16; // move ahead 32 bytes
494
495 // shift into place so we have complete F and H vectors
496 // that refer to the values one unit up from each cell
497 // that we are currently working on.
498 Fup = vec_sld(Fup,F,14);
499 Hup1 = vec_sld(Hup1,H,14);
500
501 // do the dynamic programming
502
503 // update E value
504 E = vec_subs(E,v_gapextend);
505 tmp = vec_subs(H,v_gapopen);
506 E = vec_max(E,tmp);
507
508 // update F value
509 F = vec_subs(Fup,v_gapextend);
510 tmp = vec_subs(Hup1,v_gapopen);
511 F = vec_max(F,tmp);
512
513 // add score to H
514 H = vec_adds(Hup2,v_score);
515 H = vec_subs(H,v_bias);
516
517 // set H to max of H,E,F
518 H = vec_max(H,E);
519 H = vec_max(H,F);
520
521 // Save value to use for next diagonal H
522 Hup2 = Hup1;
523
524 // Update highest score encountered this far
525 v_maxscore = vec_max(v_maxscore,H);
526
527
528 // reset pointers to the start of the saved data from the last row
529 p = workspace;
530
531 for(j=8;j<db_length;j+=8)
532 {
533 // STEP 1
534
535 // prefetch next residue
536 k = *p_dbseq++;
537
538 // Create the actual diagonal score vector
539 // and update the queue of incomplete score vectors
540
541 v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
542 v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
543 v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
544 v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
545
546 // prefetch score for next step
547 v_score_load = vec_ld(16*k,query_profile_word);
548
549 // load values of F and H from previous row (one unit up)
550 Fup = vec_ld(256, p);
551 Hup1 = vec_ld(272, p);
552
553 // save old values of F and H to use on next row
554 vec_st(F, 0, p);
555 vec_st(H, 16, p);
556 p += 16; // move ahead 32 bytes
557
558 // shift into place so we have complete F and H vectors
559 // that refer to the values one unit up from each cell
560 // that we are currently working on.
561 Fup = vec_sld(Fup,F,14);
562 Hup1 = vec_sld(Hup1,H,14);
563
564 // do the dynamic programming
565
566 // update E value
567 E = vec_subs(E,v_gapextend);
568 tmp = vec_subs(H,v_gapopen);
569 E = vec_max(E,tmp);
570
571 // update F value
572 F = vec_subs(Fup,v_gapextend);
573 tmp = vec_subs(Hup1,v_gapopen);
574 F = vec_max(F,tmp);
575
576 // add score to H
577 H = vec_adds(Hup2,v_score);
578 H = vec_subs(H,v_bias);
579
580 // set H to max of H,E,F
581 H = vec_max(H,E);
582 H = vec_max(H,F);
583
584
585 // Update highest score encountered this far
586 v_maxscore = vec_max(v_maxscore,H);
587
588
589
590 // STEP 2
591
592 // prefetch next residue
593 k = *p_dbseq++;
594
595 // Create the actual diagonal score vector
596 // and update the queue of incomplete score vectors
597
598 v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
599 v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
600 v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
601 v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
602
603 // prefetch score for next step
604 v_score_load = vec_ld(16*k,query_profile_word);
605
606 // load values of F and H from previous row (one unit up)
607 Fup = vec_ld(256, p);
608 Hup2 = vec_ld(272, p);
609
610 // save old values of F and H to use on next row
611 vec_st(F, 0, p);
612 vec_st(H, 16, p);
613 p += 16; // move ahead 32 bytes
614
615 // shift into place so we have complete F and H vectors
616 // that refer to the values one unit up from each cell
617 // that we are currently working on.
618 Fup = vec_sld(Fup,F,14);
619 Hup2 = vec_sld(Hup2,H,14);
620
621 // do the dynamic programming
622
623 // update E value
624 E = vec_subs(E,v_gapextend);
625 tmp = vec_subs(H,v_gapopen);
626 E = vec_max(E,tmp);
627
628 // update F value
629 F = vec_subs(Fup,v_gapextend);
630 tmp = vec_subs(Hup2,v_gapopen);
631 F = vec_max(F,tmp);
632
633 // add score to H
634 H = vec_adds(Hup1,v_score);
635 H = vec_subs(H,v_bias);
636
637 // set H to max of H,E,F
638 H = vec_max(H,E);
639 H = vec_max(H,F);
640
641
642 // Update highest score encountered this far
643 v_maxscore = vec_max(v_maxscore,H);
644
645
646
647 // STEP 3
648
649 // prefetch next residue
650 k = *p_dbseq++;
651
652 // Create the actual diagonal score vector
653 // and update the queue of incomplete score vectors
654
655 v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
656 v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
657 v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
658 v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
659
660 // prefetch score for next step
661 v_score_load = vec_ld(16*k,query_profile_word);
662
663 // load values of F and H from previous row (one unit up)
664 Fup = vec_ld(256, p);
665 Hup1 = vec_ld(272, p);
666
667 // save old values of F and H to use on next row
668 vec_st(F, 0, p);
669 vec_st(H, 16, p);
670 p += 16; // move ahead 32 bytes
671
672 // shift into place so we have complete F and H vectors
673 // that refer to the values one unit up from each cell
674 // that we are currently working on.
675 Fup = vec_sld(Fup,F,14);
676 Hup1 = vec_sld(Hup1,H,14);
677
678 // do the dynamic programming
679
680 // update E value
681 E = vec_subs(E,v_gapextend);
682 tmp = vec_subs(H,v_gapopen);
683 E = vec_max(E,tmp);
684
685 // update F value
686 F = vec_subs(Fup,v_gapextend);
687 tmp = vec_subs(Hup1,v_gapopen);
688 F = vec_max(F,tmp);
689
690 // add score to H
691 H = vec_adds(Hup2,v_score);
692 H = vec_subs(H,v_bias);
693
694 // set H to max of H,E,F
695 H = vec_max(H,E);
696 H = vec_max(H,F);
697
698
699
700 // Update highest score encountered this far
701 v_maxscore = vec_max(v_maxscore,H);
702
703
704
705 // STEP 4
706
707 // prefetch next residue
708 k = *p_dbseq++;
709
710 // Create the actual diagonal score vector
711 // and update the queue of incomplete score vectors
712
713 v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
714 v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
715 v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
716 v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
717
718 // prefetch score for next step
719 v_score_load = vec_ld(16*k,query_profile_word);
720
721 // load values of F and H from previous row (one unit up)
722 Fup = vec_ld(256, p);
723 Hup2 = vec_ld(272, p);
724
725 // save old values of F and H to use on next row
726 vec_st(F, 0, p);
727 vec_st(H, 16, p);
728 p += 16; // move ahead 32 bytes
729
730 // shift into place so we have complete F and H vectors
731 // that refer to the values one unit up from each cell
732 // that we are currently working on.
733 Fup = vec_sld(Fup,F,14);
734 Hup2 = vec_sld(Hup2,H,14);
735
736 // do the dynamic programming
737
738 // update E value
739 E = vec_subs(E,v_gapextend);
740 tmp = vec_subs(H,v_gapopen);
741 E = vec_max(E,tmp);
742
743 // update F value
744 F = vec_subs(Fup,v_gapextend);
745 tmp = vec_subs(Hup2,v_gapopen);
746 F = vec_max(F,tmp);
747
748 // add score to H
749 H = vec_adds(Hup1,v_score);
750 H = vec_subs(H,v_bias);
751
752 // set H to max of H,E,F
753 H = vec_max(H,E);
754 H = vec_max(H,F);
755
756
757 // Update highest score encountered this far
758 v_maxscore = vec_max(v_maxscore,H);
759
760
761
762 // STEP 5
763
764 // prefetch next residue
765 k = *p_dbseq++;
766
767 // Create the actual diagonal score vector
768 // and update the queue of incomplete score vectors
769
770 v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
771 v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
772 v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
773 v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
774
775 // prefetch score for next step
776 v_score_load = vec_ld(16*k,query_profile_word);
777
778 // load values of F and H from previous row (one unit up)
779 Fup = vec_ld(256, p);
780 Hup1 = vec_ld(272, p);
781
782 // save old values of F and H to use on next row
783 vec_st(F, 0, p);
784 vec_st(H, 16, p);
785 p += 16; // move ahead 32 bytes
786
787 // shift into place so we have complete F and H vectors
788 // that refer to the values one unit up from each cell
789 // that we are currently working on.
790 Fup = vec_sld(Fup,F,14);
791 Hup1 = vec_sld(Hup1,H,14);
792
793 // do the dynamic programming
794
795 // update E value
796 E = vec_subs(E,v_gapextend);
797 tmp = vec_subs(H,v_gapopen);
798 E = vec_max(E,tmp);
799
800 // update F value
801 F = vec_subs(Fup,v_gapextend);
802 tmp = vec_subs(Hup1,v_gapopen);
803 F = vec_max(F,tmp);
804
805 // add score to H
806 H = vec_adds(Hup2,v_score);
807 H = vec_subs(H,v_bias);
808
809 // set H to max of H,E,F
810 H = vec_max(H,E);
811 H = vec_max(H,F);
812
813
814 // Update highest score encountered this far
815 v_maxscore = vec_max(v_maxscore,H);
816
817
818
819 // STEP 6
820
821 // prefetch next residue
822 k = *p_dbseq++;
823
824 // Create the actual diagonal score vector
825 // and update the queue of incomplete score vectors
826
827 v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
828 v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
829 v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
830 v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
831
832 // prefetch score for next step
833 v_score_load = vec_ld(16*k,query_profile_word);
834
835 // load values of F and H from previous row (one unit up)
836 Fup = vec_ld(256, p);
837 Hup2 = vec_ld(272, p);
838
839 // save old values of F and H to use on next row
840 vec_st(F, 0, p);
841 vec_st(H, 16, p);
842 p += 16; // move ahead 32 bytes
843
844 // shift into place so we have complete F and H vectors
845 // that refer to the values one unit up from each cell
846 // that we are currently working on.
847 Fup = vec_sld(Fup,F,14);
848 Hup2 = vec_sld(Hup2,H,14);
849
850 // do the dynamic programming
851
852 // update E value
853 E = vec_subs(E,v_gapextend);
854 tmp = vec_subs(H,v_gapopen);
855 E = vec_max(E,tmp);
856
857 // update F value
858 F = vec_subs(Fup,v_gapextend);
859 tmp = vec_subs(Hup2,v_gapopen);
860 F = vec_max(F,tmp);
861
862 // add score to H
863 H = vec_adds(Hup1,v_score);
864 H = vec_subs(H,v_bias);
865
866 // set H to max of H,E,F
867 H = vec_max(H,E);
868 H = vec_max(H,F);
869
870
871
872 // Update highest score encountered this far
873 v_maxscore = vec_max(v_maxscore,H);
874
875
876
877 // STEP 7
878
879 // prefetch next residue
880 k = *p_dbseq++;
881
882 // Create the actual diagonal score vector
883 // and update the queue of incomplete score vectors
884
885 v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
886 v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
887 v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
888 v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
889
890 // prefetch score for next step
891 v_score_load = vec_ld(16*k,query_profile_word);
892
893 // load values of F and H from previous row (one unit up)
894 Fup = vec_ld(256, p);
895 Hup1 = vec_ld(272, p);
896
897 // save old values of F and H to use on next row
898 vec_st(F, 0, p);
899 vec_st(H, 16, p);
900 p += 16; // move ahead 32 bytes
901
902 // shift into place so we have complete F and H vectors
903 // that refer to the values one unit up from each cell
904 // that we are currently working on.
905 Fup = vec_sld(Fup,F,14);
906 Hup1 = vec_sld(Hup1,H,14);
907
908 // do the dynamic programming
909
910 // update E value
911 E = vec_subs(E,v_gapextend);
912 tmp = vec_subs(H,v_gapopen);
913 E = vec_max(E,tmp);
914
915 // update F value
916 F = vec_subs(Fup,v_gapextend);
917 tmp = vec_subs(Hup1,v_gapopen);
918 F = vec_max(F,tmp);
919
920 // add score to H
921 H = vec_adds(Hup2,v_score);
922 H = vec_subs(H,v_bias);
923
924 // set H to max of H,E,F
925 H = vec_max(H,E);
926 H = vec_max(H,F);
927
928
929
930 // Update highest score encountered this far
931 v_maxscore = vec_max(v_maxscore,H);
932
933
934
935 // STEP 8
936
937 // prefetch next residue
938 k = *p_dbseq++;
939
940 // Create the actual diagonal score vector
941 // and update the queue of incomplete score vectors
942
943 v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
944 v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
945 v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
946 v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
947
948 // prefetch score for next step
949 v_score_load = vec_ld(16*k,query_profile_word);
950
951 // load values of F and H from previous row (one unit up)
952 Fup = vec_ld(256, p);
953 Hup2 = vec_ld(272, p);
954
955 // save old values of F and H to use on next row
956 vec_st(F, 0, p);
957 vec_st(H, 16, p);
958 p += 16; // move ahead 32 bytes
959
960 // shift into place so we have complete F and H vectors
961 // that refer to the values one unit up from each cell
962 // that we are currently working on.
963 Fup = vec_sld(Fup,F,14);
964 Hup2 = vec_sld(Hup2,H,14);
965
966 // do the dynamic programming
967
968 // update E value
969 E = vec_subs(E,v_gapextend);
970 tmp = vec_subs(H,v_gapopen);
971 E = vec_max(E,tmp);
972
973 // update F value
974 F = vec_subs(Fup,v_gapextend);
975 tmp = vec_subs(Hup2,v_gapopen);
976 F = vec_max(F,tmp);
977
978 // add score to H
979 H = vec_adds(Hup1,v_score);
980 H = vec_subs(H,v_bias);
981
982 // set H to max of H,E,F
983 H = vec_max(H,E);
984 H = vec_max(H,F);
985
986
987 // Update highest score encountered this far
988 v_maxscore = vec_max(v_maxscore,H);
989 }
990
991 v_score_load = vec_splat_u16(0);
992
993 for(;j<db_length+7;j++)
994 {
995 // Create the actual diagonal score vector
996 // and update the queue of incomplete score vectors
997 //
998 // This could of course be done with only vec_perm or vec_sel,
999 // but since they use different execution units we have found
1000 // it to be slightly faster to mix them.
1001 v_score = vec_perm(v_score_q1, v_score_load, queue1_to_score);
1002 v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
1003 v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
1004 v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
1005
1006 // save old values of F and H to use on next row
1007 vec_st(F, 0, p);
1008 vec_st(H, 16, p);
1009 p += 16; // move ahead 32 bytes
1010
1011 // v_score_load contains all zeros
1012 Fup = vec_sld(v_score_load,F,14);
1013 Hup1 = vec_sld(v_score_load,H,14);
1014
1015 // do the dynamic programming
1016
1017 // update E value
1018 E = vec_subs(E,v_gapextend);
1019 tmp = vec_subs(H,v_gapopen);
1020 E = vec_max(E,tmp);
1021
1022 // update F value
1023 F = vec_subs(Fup,v_gapextend);
1024 tmp = vec_subs(Hup1,v_gapopen);
1025 F = vec_max(F,tmp);
1026
1027 // add score to H
1028 H = vec_adds(Hup2,v_score);
1029 H = vec_subs(H,v_bias);
1030
1031 // set H to max of H,E,F
1032 H = vec_max(H,E);
1033 H = vec_max(H,F);
1034
1035 // Save value to use for next diagonal H
1036 Hup2 = Hup1;
1037
1038 // Update highest score encountered this far
1039 v_maxscore = vec_max(v_maxscore,H);
1040 }
1041 vec_st(F, 0, p);
1042 vec_st(H, 16, p);
1043
1044 query_profile_word += 8*alphabet_size;
1045 }
1046
1047 // find largest score in the v_maxscore vector
1048 tmp = vec_sld(v_maxscore,v_maxscore,8);
1049 v_maxscore = vec_max(v_maxscore,tmp);
1050 tmp = vec_sld(v_maxscore,v_maxscore,4);
1051 v_maxscore = vec_max(v_maxscore,tmp);
1052 tmp = vec_sld(v_maxscore,v_maxscore,2);
1053 v_maxscore = vec_max(v_maxscore,tmp);
1054
1055 // store in temporary variable
1056 vec_ste(v_maxscore,0,&score);
1057
1058 // return largest score
1059 return score;
1060 }
1061
1062 int
smith_waterman_altivec_byte(unsigned char * query_sequence,unsigned char * query_profile_byte,int query_length,unsigned char * db_sequence,int db_length,unsigned char bias,unsigned char gap_open,unsigned char gap_extend,struct f_struct * f_str)1063 smith_waterman_altivec_byte(unsigned char * query_sequence,
1064 unsigned char * query_profile_byte,
1065 int query_length,
1066 unsigned char * db_sequence,
1067 int db_length,
1068 unsigned char bias,
1069 unsigned char gap_open,
1070 unsigned char gap_extend,
1071 struct f_struct * f_str)
1072 {
1073 int i,j,k,k8;
1074 int overflow;
1075 unsigned char * p;
1076 unsigned char score;
1077 int alphabet_size = f_str->alphabet_size;
1078 unsigned char * workspace = (unsigned char *)f_str->workspace;
1079
1080 vector unsigned char Fup,Hup1,Hup2,E,F,H,tmp;
1081 vector unsigned char perm;
1082 vector unsigned char v_maxscore;
1083 vector unsigned char v_bias,v_gapopen,v_gapextend;
1084 vector unsigned char v_score;
1085 vector unsigned char v_score_q1;
1086 vector unsigned char v_score_q2;
1087 vector unsigned char v_score_q3;
1088 vector unsigned char v_score_q4;
1089 vector unsigned char v_score_q5;
1090 vector unsigned char v_score_load1;
1091 vector unsigned char v_score_load2;
1092 vector unsigned char v_zero;
1093
1094 vector unsigned char queue1_to_score = (vector unsigned char)(16,1,2,3,4,5,6,7,24,9,10,11,12,13,14,15);
1095 vector unsigned char queue2_to_queue1 = (vector unsigned char)(16,17,2,3,4,5,6,7,24,25,10,11,12,13,14,15);
1096 vector unsigned char queue3_to_queue2 = (vector unsigned char)(16,17,18,3,4,5,6,7,24,25,26,11,12,13,14,15);
1097 vector unsigned char queue4_to_queue3 = (vector unsigned char)(16,17,18,19,4,5,6,7,24,25,26,27,12,13,14,15);
1098 vector unsigned char queue5_to_queue4 = (vector unsigned char)(16,17,18,19,20,2,3,4,24,25,26,27,28,10,11,12);
1099 vector unsigned char queue5_with_load = (vector unsigned char)(19,20,21,5,6,22,7,23,27,28,29,13,14,30,15,31);
1100 vector unsigned char merge_score_load = (vector unsigned char)(0,1,2,3,4,5,6,7,24,25,26,27,28,29,30,31);
1101
1102 v_zero = vec_splat_u8(0);
1103
1104 /* Load the bias to all elements of a constant */
1105 v_bias = vec_lde(0,&bias);
1106 perm = vec_lvsl(0,&bias);
1107 v_bias = vec_perm(v_bias,v_bias,perm);
1108 v_bias = vec_splat(v_bias,0);
1109
1110 /* Load gap opening penalty to all elements of a constant */
1111 v_gapopen = vec_lde(0,&gap_open);
1112 perm = vec_lvsl(0,&gap_open);
1113 v_gapopen = vec_perm(v_gapopen,v_gapopen,perm);
1114 v_gapopen = vec_splat(v_gapopen,0);
1115
1116 /* Load gap extension penalty to all elements of a constant */
1117 v_gapextend = vec_lde(0,&gap_extend);
1118 perm = vec_lvsl(0,&gap_extend);
1119 v_gapextend = vec_perm(v_gapextend,v_gapextend,perm);
1120 v_gapextend = vec_splat(v_gapextend,0);
1121
1122 v_maxscore = vec_xor(v_maxscore,v_maxscore);
1123
1124 // Zero out the storage vector
1125 k = (db_length+15);
1126 for(i=0,j=0;i<k;i++,j+=32)
1127 {
1128 // borrow the zero value in v_maxscore to have something to store
1129 vec_st(v_maxscore,j,workspace);
1130 vec_st(v_maxscore,j+16,workspace);
1131 }
1132
1133 for(i=0;i<query_length;i+=16)
1134 {
1135 // zero lots of stuff.
1136 // We use both the VPERM and VSIU unit to knock off some cycles.
1137
1138 E = vec_splat_u8(0);
1139 F = vec_xor(F,F);
1140 H = vec_splat_u8(0);
1141 Hup2 = vec_xor(Hup2,Hup2);
1142 v_score_q1 = vec_splat_u8(0);
1143 v_score_q2 = vec_xor(v_score_q2,v_score_q2);
1144 v_score_q3 = vec_splat_u8(0);
1145 v_score_q4 = vec_xor(v_score_q4,v_score_q4);
1146 v_score_q5 = vec_splat_u8(0);
1147
1148 // reset pointers to the start of the saved data from the last row
1149 p = workspace;
1150
1151 // start directly and prefetch score column
1152 k = db_sequence[0];
1153 k8 = k;
1154 v_score_load1 = vec_ld(16*k,query_profile_byte);
1155 v_score_load2 = v_score_load1;
1156 v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
1157
1158 // PROLOGUE 1
1159 // prefetch next residue
1160 k = db_sequence[1];
1161
1162 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
1163 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
1164 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
1165 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
1166 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
1167 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
1168
1169 // prefetch score for next step
1170 v_score_load1 = vec_ld(16*k,query_profile_byte);
1171
1172 // load values of F and H from previous row (one unit up)
1173 Fup = vec_ld(0, p);
1174 Hup1 = vec_ld(16, p);
1175 p += 32; // move ahead 32 bytes
1176
1177 // shift into place so we have complete F and H vectors
1178 // that refer to the values one unit up from each cell
1179 // that we are currently working on.
1180 Fup = vec_sld(Fup,F,15);
1181 Hup1 = vec_sld(Hup1,H,15);
1182
1183 // do the dynamic programming
1184
1185 // update E value
1186 E = vec_subs(E,v_gapextend);
1187 tmp = vec_subs(H,v_gapopen);
1188 E = vec_max(E,tmp);
1189
1190 // update F value
1191 F = vec_subs(Fup,v_gapextend);
1192 tmp = vec_subs(Hup1,v_gapopen);
1193 F = vec_max(F,tmp);
1194
1195 v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
1196
1197 // add score to H
1198 H = vec_adds(Hup2,v_score);
1199 H = vec_subs(H,v_bias);
1200
1201 // set H to max of H,E,F
1202 H = vec_max(H,E);
1203 H = vec_max(H,F);
1204
1205 // Update highest score encountered this far
1206 v_maxscore = vec_max(v_maxscore,H);
1207
1208
1209
1210
1211 // PROLOGUE 2
1212 // prefetch next residue
1213 k = db_sequence[2];
1214
1215 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
1216 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
1217 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
1218 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
1219 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
1220 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
1221
1222
1223 // prefetch score for next step
1224 v_score_load1 = vec_ld(16*k,query_profile_byte);
1225
1226 // load values of F and H from previous row (one unit up)
1227 Fup = vec_ld(0, p);
1228 Hup2 = vec_ld(16, p);
1229 p += 32; // move ahead 32 bytes
1230
1231 // shift into place so we have complete F and H vectors
1232 // that refer to the values one unit up from each cell
1233 // that we are currently working on.
1234 Fup = vec_sld(Fup,F,15);
1235 Hup2 = vec_sld(Hup2,H,15);
1236
1237 // do the dynamic programming
1238
1239 // update E value
1240 E = vec_subs(E,v_gapextend);
1241 tmp = vec_subs(H,v_gapopen);
1242 E = vec_max(E,tmp);
1243
1244 // update F value
1245 F = vec_subs(Fup,v_gapextend);
1246 tmp = vec_subs(Hup2,v_gapopen);
1247 F = vec_max(F,tmp);
1248
1249 v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
1250
1251 // add score to H
1252 H = vec_adds(Hup1,v_score);
1253 H = vec_subs(H,v_bias);
1254
1255 // set H to max of H,E,F
1256 H = vec_max(H,E);
1257 H = vec_max(H,F);
1258
1259 // Update highest score encountered this far
1260 v_maxscore = vec_max(v_maxscore,H);
1261
1262
1263 // PROLOGUE 3
1264 // prefetch next residue
1265 k = db_sequence[3];
1266
1267 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
1268 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
1269 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
1270 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
1271 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
1272 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
1273
1274
1275 // prefetch score for next step
1276 v_score_load1 = vec_ld(16*k,query_profile_byte);
1277
1278 // load values of F and H from previous row (one unit up)
1279 Fup = vec_ld(0, p);
1280 Hup1 = vec_ld(16, p);
1281 p += 32; // move ahead 32 bytes
1282
1283 // shift into place so we have complete F and H vectors
1284 // that refer to the values one unit up from each cell
1285 // that we are currently working on.
1286 Fup = vec_sld(Fup,F,15);
1287 Hup1 = vec_sld(Hup1,H,15);
1288
1289 // do the dynamic programming
1290
1291 // update E value
1292 E = vec_subs(E,v_gapextend);
1293 tmp = vec_subs(H,v_gapopen);
1294 E = vec_max(E,tmp);
1295
1296 // update F value
1297 F = vec_subs(Fup,v_gapextend);
1298 tmp = vec_subs(Hup1,v_gapopen);
1299 F = vec_max(F,tmp);
1300
1301 v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
1302
1303 // add score to H
1304 H = vec_adds(Hup2,v_score);
1305 H = vec_subs(H,v_bias);
1306
1307 // set H to max of H,E,F
1308 H = vec_max(H,E);
1309 H = vec_max(H,F);
1310
1311 // Update highest score encountered this far
1312 v_maxscore = vec_max(v_maxscore,H);
1313
1314
1315 // PROLOGUE 4
1316 // prefetch next residue
1317 k = db_sequence[4];
1318
1319 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
1320 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
1321 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
1322 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
1323 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
1324 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
1325
1326
1327 // prefetch score for next step
1328 v_score_load1 = vec_ld(16*k,query_profile_byte);
1329
1330 // load values of F and H from previous row (one unit up)
1331 Fup = vec_ld(0, p);
1332 Hup2 = vec_ld(16, p);
1333 p += 32; // move ahead 32 bytes
1334
1335 // shift into place so we have complete F and H vectors
1336 // that refer to the values one unit up from each cell
1337 // that we are currently working on.
1338 Fup = vec_sld(Fup,F,15);
1339 Hup2 = vec_sld(Hup2,H,15);
1340
1341 // do the dynamic programming
1342
1343 // update E value
1344 E = vec_subs(E,v_gapextend);
1345 tmp = vec_subs(H,v_gapopen);
1346 E = vec_max(E,tmp);
1347
1348 // update F value
1349 F = vec_subs(Fup,v_gapextend);
1350 tmp = vec_subs(Hup2,v_gapopen);
1351 F = vec_max(F,tmp);
1352
1353 v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
1354
1355 // add score to H
1356 H = vec_adds(Hup1,v_score);
1357 H = vec_subs(H,v_bias);
1358
1359 // set H to max of H,E,F
1360 H = vec_max(H,E);
1361 H = vec_max(H,F);
1362
1363 // Update highest score encountered this far
1364 v_maxscore = vec_max(v_maxscore,H);
1365
1366
1367 // PROLOGUE 5
1368 // prefetch next residue
1369 k = db_sequence[5];
1370
1371 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
1372 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
1373 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
1374 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
1375 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
1376 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
1377
1378
1379 // prefetch score for next step
1380 v_score_load1 = vec_ld(16*k,query_profile_byte);
1381
1382 // load values of F and H from previous row (one unit up)
1383 Fup = vec_ld(0, p);
1384 Hup1 = vec_ld(16, p);
1385 p += 32; // move ahead 32 bytes
1386
1387 // shift into place so we have complete F and H vectors
1388 // that refer to the values one unit up from each cell
1389 // that we are currently working on.
1390 Fup = vec_sld(Fup,F,15);
1391 Hup1 = vec_sld(Hup1,H,15);
1392
1393 // do the dynamic programming
1394
1395 // update E value
1396 E = vec_subs(E,v_gapextend);
1397 tmp = vec_subs(H,v_gapopen);
1398 E = vec_max(E,tmp);
1399
1400 // update F value
1401 F = vec_subs(Fup,v_gapextend);
1402 tmp = vec_subs(Hup1,v_gapopen);
1403 F = vec_max(F,tmp);
1404
1405 v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
1406
1407 // add score to H
1408 H = vec_adds(Hup2,v_score);
1409 H = vec_subs(H,v_bias);
1410
1411 // set H to max of H,E,F
1412 H = vec_max(H,E);
1413 H = vec_max(H,F);
1414
1415 // Update highest score encountered this far
1416 v_maxscore = vec_max(v_maxscore,H);
1417
1418
1419 // PROLOGUE 6
1420 // prefetch next residue
1421 k = db_sequence[6];
1422
1423 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
1424 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
1425 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
1426 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
1427 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
1428 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
1429
1430
1431 // prefetch score for next step
1432 v_score_load1 = vec_ld(16*k,query_profile_byte);
1433
1434 // load values of F and H from previous row (one unit up)
1435 Fup = vec_ld(0, p);
1436 Hup2 = vec_ld(16, p);
1437 p += 32; // move ahead 32 bytes
1438
1439 // shift into place so we have complete F and H vectors
1440 // that refer to the values one unit up from each cell
1441 // that we are currently working on.
1442 Fup = vec_sld(Fup,F,15);
1443 Hup2 = vec_sld(Hup2,H,15);
1444
1445 // do the dynamic programming
1446
1447 // update E value
1448 E = vec_subs(E,v_gapextend);
1449 tmp = vec_subs(H,v_gapopen);
1450 E = vec_max(E,tmp);
1451
1452 // update F value
1453 F = vec_subs(Fup,v_gapextend);
1454 tmp = vec_subs(Hup2,v_gapopen);
1455 F = vec_max(F,tmp);
1456
1457 v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
1458
1459 // add score to H
1460 H = vec_adds(Hup1,v_score);
1461 H = vec_subs(H,v_bias);
1462
1463 // set H to max of H,E,F
1464 H = vec_max(H,E);
1465 H = vec_max(H,F);
1466
1467 // Update highest score encountered this far
1468 v_maxscore = vec_max(v_maxscore,H);
1469
1470
1471
1472 // PROLOGUE 7
1473 // prefetch next residue
1474 k = db_sequence[7];
1475
1476 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
1477 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
1478 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
1479 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
1480 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
1481 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
1482
1483
1484 // prefetch score for next step
1485 v_score_load1 = vec_ld(16*k,query_profile_byte);
1486
1487 // load values of F and H from previous row (one unit up)
1488 Fup = vec_ld(0, p);
1489 Hup1 = vec_ld(16, p);
1490 p += 32; // move ahead 32 bytes
1491
1492 // shift into place so we have complete F and H vectors
1493 // that refer to the values one unit up from each cell
1494 // that we are currently working on.
1495 Fup = vec_sld(Fup,F,15);
1496 Hup1 = vec_sld(Hup1,H,15);
1497
1498 // do the dynamic programming
1499
1500 // update E value
1501 E = vec_subs(E,v_gapextend);
1502 tmp = vec_subs(H,v_gapopen);
1503 E = vec_max(E,tmp);
1504
1505 // update F value
1506 F = vec_subs(Fup,v_gapextend);
1507 tmp = vec_subs(Hup1,v_gapopen);
1508 F = vec_max(F,tmp);
1509
1510 v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
1511
1512 // add score to H
1513 H = vec_adds(Hup2,v_score);
1514 H = vec_subs(H,v_bias);
1515
1516 // set H to max of H,E,F
1517 H = vec_max(H,E);
1518 H = vec_max(H,F);
1519
1520 // Update highest score encountered this far
1521 v_maxscore = vec_max(v_maxscore,H);
1522
1523
1524
1525 // PROLOGUE 8
1526 // prefetch next residue
1527 k = db_sequence[8];
1528
1529 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
1530 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
1531 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
1532 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
1533 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
1534 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
1535
1536
1537 // prefetch score for next step
1538 v_score_load1 = vec_ld(16*k,query_profile_byte);
1539
1540 // load values of F and H from previous row (one unit up)
1541 Fup = vec_ld(0, p);
1542 Hup2 = vec_ld(16, p);
1543 p += 32; // move ahead 32 bytes
1544
1545 // shift into place so we have complete F and H vectors
1546 // that refer to the values one unit up from each cell
1547 // that we are currently working on.
1548 Fup = vec_sld(Fup,F,15);
1549 Hup2 = vec_sld(Hup2,H,15);
1550
1551 // do the dynamic programming
1552
1553 // update E value
1554 E = vec_subs(E,v_gapextend);
1555 tmp = vec_subs(H,v_gapopen);
1556 E = vec_max(E,tmp);
1557
1558 // update F value
1559 F = vec_subs(Fup,v_gapextend);
1560 tmp = vec_subs(Hup2,v_gapopen);
1561 F = vec_max(F,tmp);
1562
1563 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
1564
1565 // add score to H
1566 H = vec_adds(Hup1,v_score);
1567 H = vec_subs(H,v_bias);
1568
1569 // set H to max of H,E,F
1570 H = vec_max(H,E);
1571 H = vec_max(H,F);
1572
1573 // Update highest score encountered this far
1574 v_maxscore = vec_max(v_maxscore,H);
1575
1576
1577
1578
1579 // PROLOGUE 9
1580 // prefetch next residue
1581 k = db_sequence[9];
1582 k8 = db_sequence[1];
1583
1584 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
1585 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
1586 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
1587 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
1588 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
1589 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
1590
1591
1592 // prefetch score for next step
1593 v_score_load1 = vec_ld(16*k,query_profile_byte);
1594 v_score_load2 = vec_ld(16*k8,query_profile_byte);
1595
1596 // load values of F and H from previous row (one unit up)
1597 Fup = vec_ld(0, p);
1598 Hup1 = vec_ld(16, p);
1599 p += 32; // move ahead 32 bytes
1600
1601 // shift into place so we have complete F and H vectors
1602 // that refer to the values one unit up from each cell
1603 // that we are currently working on.
1604 Fup = vec_sld(Fup,F,15);
1605 Hup1 = vec_sld(Hup1,H,15);
1606
1607 // do the dynamic programming
1608
1609 // update E value
1610 E = vec_subs(E,v_gapextend);
1611 tmp = vec_subs(H,v_gapopen);
1612 E = vec_max(E,tmp);
1613
1614 // update F value
1615 F = vec_subs(Fup,v_gapextend);
1616 tmp = vec_subs(Hup1,v_gapopen);
1617 F = vec_max(F,tmp);
1618
1619 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
1620
1621 // add score to H
1622 H = vec_adds(Hup2,v_score);
1623 H = vec_subs(H,v_bias);
1624
1625 // set H to max of H,E,F
1626 H = vec_max(H,E);
1627 H = vec_max(H,F);
1628
1629 // Update highest score encountered this far
1630 v_maxscore = vec_max(v_maxscore,H);
1631
1632
1633
1634 // PROLOGUE 10
1635 // prefetch next residue
1636 k = db_sequence[10];
1637 k8 = db_sequence[2];
1638
1639 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
1640 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
1641 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
1642 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
1643 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
1644 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
1645
1646
1647 // prefetch score for next step
1648 v_score_load1 = vec_ld(16*k,query_profile_byte);
1649 v_score_load2 = vec_ld(16*k8,query_profile_byte);
1650
1651 // load values of F and H from previous row (one unit up)
1652 Fup = vec_ld(0, p);
1653 Hup2 = vec_ld(16, p);
1654 p += 32; // move ahead 32 bytes
1655
1656 // shift into place so we have complete F and H vectors
1657 // that refer to the values one unit up from each cell
1658 // that we are currently working on.
1659 Fup = vec_sld(Fup,F,15);
1660 Hup2 = vec_sld(Hup2,H,15);
1661
1662 // do the dynamic programming
1663
1664 // update E value
1665 E = vec_subs(E,v_gapextend);
1666 tmp = vec_subs(H,v_gapopen);
1667 E = vec_max(E,tmp);
1668
1669 // update F value
1670 F = vec_subs(Fup,v_gapextend);
1671 tmp = vec_subs(Hup2,v_gapopen);
1672 F = vec_max(F,tmp);
1673
1674 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
1675
1676 // add score to H
1677 H = vec_adds(Hup1,v_score);
1678 H = vec_subs(H,v_bias);
1679
1680 // set H to max of H,E,F
1681 H = vec_max(H,E);
1682 H = vec_max(H,F);
1683
1684 // Update highest score encountered this far
1685 v_maxscore = vec_max(v_maxscore,H);
1686
1687
1688
1689
1690 // PROLOGUE 11
1691 // prefetch next residue
1692 k = db_sequence[11];
1693 k8 = db_sequence[3];
1694
1695 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
1696 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
1697 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
1698 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
1699 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
1700 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
1701
1702
1703 // prefetch score for next step
1704 v_score_load1 = vec_ld(16*k,query_profile_byte);
1705 v_score_load2 = vec_ld(16*k8,query_profile_byte);
1706
1707 // load values of F and H from previous row (one unit up)
1708 Fup = vec_ld(0, p);
1709 Hup1 = vec_ld(16, p);
1710 p += 32; // move ahead 32 bytes
1711
1712 // shift into place so we have complete F and H vectors
1713 // that refer to the values one unit up from each cell
1714 // that we are currently working on.
1715 Fup = vec_sld(Fup,F,15);
1716 Hup1 = vec_sld(Hup1,H,15);
1717
1718 // do the dynamic programming
1719
1720 // update E value
1721 E = vec_subs(E,v_gapextend);
1722 tmp = vec_subs(H,v_gapopen);
1723 E = vec_max(E,tmp);
1724
1725 // update F value
1726 F = vec_subs(Fup,v_gapextend);
1727 tmp = vec_subs(Hup1,v_gapopen);
1728 F = vec_max(F,tmp);
1729
1730 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
1731
1732 // add score to H
1733 H = vec_adds(Hup2,v_score);
1734 H = vec_subs(H,v_bias);
1735
1736 // set H to max of H,E,F
1737 H = vec_max(H,E);
1738 H = vec_max(H,F);
1739
1740 // Update highest score encountered this far
1741 v_maxscore = vec_max(v_maxscore,H);
1742
1743
1744
1745 // PROLOGUE 12
1746 // prefetch next residue
1747 k = db_sequence[12];
1748 k8 = db_sequence[4];
1749
1750 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
1751 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
1752 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
1753 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
1754 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
1755 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
1756
1757
1758 // prefetch score for next step
1759 v_score_load1 = vec_ld(16*k,query_profile_byte);
1760 v_score_load2 = vec_ld(16*k8,query_profile_byte);
1761
1762 // load values of F and H from previous row (one unit up)
1763 Fup = vec_ld(0, p);
1764 Hup2 = vec_ld(16, p);
1765 p += 32; // move ahead 32 bytes
1766
1767 // shift into place so we have complete F and H vectors
1768 // that refer to the values one unit up from each cell
1769 // that we are currently working on.
1770 Fup = vec_sld(Fup,F,15);
1771 Hup2 = vec_sld(Hup2,H,15);
1772
1773 // do the dynamic programming
1774
1775 // update E value
1776 E = vec_subs(E,v_gapextend);
1777 tmp = vec_subs(H,v_gapopen);
1778 E = vec_max(E,tmp);
1779
1780 // update F value
1781 F = vec_subs(Fup,v_gapextend);
1782 tmp = vec_subs(Hup2,v_gapopen);
1783 F = vec_max(F,tmp);
1784
1785 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
1786
1787 // add score to H
1788 H = vec_adds(Hup1,v_score);
1789 H = vec_subs(H,v_bias);
1790
1791 // set H to max of H,E,F
1792 H = vec_max(H,E);
1793 H = vec_max(H,F);
1794
1795 // Update highest score encountered this far
1796 v_maxscore = vec_max(v_maxscore,H);
1797
1798
1799
1800
1801 // PROLOGUE 13
1802 // prefetch next residue
1803 k = db_sequence[13];
1804 k8 = db_sequence[5];
1805
1806 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
1807 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
1808 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
1809 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
1810 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
1811 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
1812
1813
1814 // prefetch score for next step
1815 v_score_load1 = vec_ld(16*k,query_profile_byte);
1816 v_score_load2 = vec_ld(16*k8,query_profile_byte);
1817
1818 // load values of F and H from previous row (one unit up)
1819 Fup = vec_ld(0, p);
1820 Hup1 = vec_ld(16, p);
1821 p += 32; // move ahead 32 bytes
1822
1823 // shift into place so we have complete F and H vectors
1824 // that refer to the values one unit up from each cell
1825 // that we are currently working on.
1826 Fup = vec_sld(Fup,F,15);
1827 Hup1 = vec_sld(Hup1,H,15);
1828
1829 // do the dynamic programming
1830
1831 // update E value
1832 E = vec_subs(E,v_gapextend);
1833 tmp = vec_subs(H,v_gapopen);
1834 E = vec_max(E,tmp);
1835
1836 // update F value
1837 F = vec_subs(Fup,v_gapextend);
1838 tmp = vec_subs(Hup1,v_gapopen);
1839 F = vec_max(F,tmp);
1840
1841 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
1842
1843 // add score to H
1844 H = vec_adds(Hup2,v_score);
1845 H = vec_subs(H,v_bias);
1846
1847 // set H to max of H,E,F
1848 H = vec_max(H,E);
1849 H = vec_max(H,F);
1850
1851 // Update highest score encountered this far
1852 v_maxscore = vec_max(v_maxscore,H);
1853
1854
1855
1856 // PROLOGUE 14
1857 // prefetch next residue
1858 k = db_sequence[14];
1859 k8 = db_sequence[6];
1860
1861 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
1862 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
1863 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
1864 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
1865 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
1866 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
1867
1868
1869 // prefetch score for next step
1870 v_score_load1 = vec_ld(16*k,query_profile_byte);
1871 v_score_load2 = vec_ld(16*k8,query_profile_byte);
1872
1873 // load values of F and H from previous row (one unit up)
1874 Fup = vec_ld(0, p);
1875 Hup2 = vec_ld(16, p);
1876 p += 32; // move ahead 32 bytes
1877
1878 // shift into place so we have complete F and H vectors
1879 // that refer to the values one unit up from each cell
1880 // that we are currently working on.
1881 Fup = vec_sld(Fup,F,15);
1882 Hup2 = vec_sld(Hup2,H,15);
1883
1884 // do the dynamic programming
1885
1886 // update E value
1887 E = vec_subs(E,v_gapextend);
1888 tmp = vec_subs(H,v_gapopen);
1889 E = vec_max(E,tmp);
1890
1891 // update F value
1892 F = vec_subs(Fup,v_gapextend);
1893 tmp = vec_subs(Hup2,v_gapopen);
1894 F = vec_max(F,tmp);
1895
1896 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
1897
1898 // add score to H
1899 H = vec_adds(Hup1,v_score);
1900 H = vec_subs(H,v_bias);
1901
1902 // set H to max of H,E,F
1903 H = vec_max(H,E);
1904 H = vec_max(H,F);
1905
1906 // Update highest score encountered this far
1907 v_maxscore = vec_max(v_maxscore,H);
1908
1909
1910
1911 // PROLOGUE 15
1912 // prefetch next residue
1913 k = db_sequence[15];
1914 k8 = db_sequence[7];
1915
1916 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
1917 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
1918 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
1919 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
1920 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
1921 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
1922
1923
1924 // prefetch score for next step
1925 v_score_load1 = vec_ld(16*k,query_profile_byte);
1926 v_score_load2 = vec_ld(16*k8,query_profile_byte);
1927
1928 // load values of F and H from previous row (one unit up)
1929 Fup = vec_ld(0, p);
1930 Hup1 = vec_ld(16, p);
1931 p += 32; // move ahead 32 bytes
1932
1933 // shift into place so we have complete F and H vectors
1934 // that refer to the values one unit up from each cell
1935 // that we are currently working on.
1936 Fup = vec_sld(Fup,F,15);
1937 Hup1 = vec_sld(Hup1,H,15);
1938
1939 // do the dynamic programming
1940
1941 // update E value
1942 E = vec_subs(E,v_gapextend);
1943 tmp = vec_subs(H,v_gapopen);
1944 E = vec_max(E,tmp);
1945
1946 // update F value
1947 F = vec_subs(Fup,v_gapextend);
1948 tmp = vec_subs(Hup1,v_gapopen);
1949 F = vec_max(F,tmp);
1950
1951 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
1952
1953 // add score to H
1954 H = vec_adds(Hup2,v_score);
1955 H = vec_subs(H,v_bias);
1956
1957 // set H to max of H,E,F
1958 H = vec_max(H,E);
1959 H = vec_max(H,F);
1960
1961 // Update highest score encountered this far
1962 v_maxscore = vec_max(v_maxscore,H);
1963
1964
1965
1966 // PROLOGUE 16
1967 // prefetch next residue
1968 k = db_sequence[16];
1969 k8 = db_sequence[8];
1970
1971 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
1972 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
1973 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
1974 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
1975 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
1976 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
1977
1978
1979 // prefetch score for next step
1980 v_score_load1 = vec_ld(16*k,query_profile_byte);
1981 v_score_load2 = vec_ld(16*k8,query_profile_byte);
1982
1983 // load values of F and H from previous row (one unit up)
1984 Fup = vec_ld(0, p);
1985 Hup2 = vec_ld(16, p);
1986 p += 32; // move ahead 32 bytes
1987
1988 // shift into place so we have complete F and H vectors
1989 // that refer to the values one unit up from each cell
1990 // that we are currently working on.
1991 Fup = vec_sld(Fup,F,15);
1992 Hup2 = vec_sld(Hup2,H,15);
1993
1994 // do the dynamic programming
1995
1996 // update E value
1997 E = vec_subs(E,v_gapextend);
1998 tmp = vec_subs(H,v_gapopen);
1999 E = vec_max(E,tmp);
2000
2001 // update F value
2002 F = vec_subs(Fup,v_gapextend);
2003 tmp = vec_subs(Hup2,v_gapopen);
2004 F = vec_max(F,tmp);
2005
2006 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2007
2008 // add score to H
2009 H = vec_adds(Hup1,v_score);
2010 H = vec_subs(H,v_bias);
2011
2012 // set H to max of H,E,F
2013 H = vec_max(H,E);
2014 H = vec_max(H,F);
2015
2016 // Update highest score encountered this far
2017 v_maxscore = vec_max(v_maxscore,H);
2018
2019 p = workspace;
2020
2021 for(j=16;j<db_length;j+=16)
2022 {
2023 // STEP 1
2024
2025 // prefetch next residue
2026 k = db_sequence[j+1];
2027 k8 = db_sequence[j-7];
2028
2029 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
2030 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
2031 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
2032 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
2033 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
2034 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
2035
2036 // prefetch scores for next step
2037 v_score_load1 = vec_ld(16*k,query_profile_byte);
2038 v_score_load2 = vec_ld(16*k8,query_profile_byte);
2039
2040 // load values of F and H from previous row (one unit up)
2041 Fup = vec_ld(512, p);
2042 Hup1 = vec_ld(528, p);
2043
2044 // save old values of F and H to use on next row
2045 vec_st(F, 0, p);
2046 vec_st(H, 16, p);
2047 p += 32;
2048
2049 // shift into place so we have complete F and H vectors
2050 // that refer to the values one unit up from each cell
2051 // that we are currently working on.
2052 Fup = vec_sld(Fup,F,15);
2053 Hup1 = vec_sld(Hup1,H,15);
2054
2055 // do the dynamic programming
2056
2057 // update E value
2058 E = vec_subs(E,v_gapextend);
2059 tmp = vec_subs(H,v_gapopen);
2060 E = vec_max(E,tmp);
2061
2062 // update F value
2063 F = vec_subs(Fup,v_gapextend);
2064 tmp = vec_subs(Hup1,v_gapopen);
2065 F = vec_max(F,tmp);
2066
2067 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2068
2069 // add score to H
2070 H = vec_adds(Hup2,v_score);
2071 H = vec_subs(H,v_bias);
2072
2073 // set H to max of H,E,F
2074 H = vec_max(H,E);
2075 H = vec_max(H,F);
2076
2077
2078
2079 // Update highest score encountered this far
2080 v_maxscore = vec_max(v_maxscore,H);
2081
2082
2083
2084
2085
2086 // STEP 2
2087
2088 // prefetch next residue
2089 k = db_sequence[j+2];
2090 k8 = db_sequence[j-6];
2091
2092 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
2093 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
2094 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
2095 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
2096 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
2097 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
2098
2099
2100 // prefetch scores for next step
2101 v_score_load1 = vec_ld(16*k,query_profile_byte);
2102 v_score_load2 = vec_ld(16*k8,query_profile_byte);
2103
2104 // load values of F and H from previous row (one unit up)
2105 Fup = vec_ld(512, p);
2106 Hup2 = vec_ld(528, p);
2107
2108 // save old values of F and H to use on next row
2109 vec_st(F, 0, p);
2110 vec_st(H, 16, p);
2111 p += 32;
2112
2113 // shift into place so we have complete F and H vectors
2114 // that refer to the values one unit up from each cell
2115 // that we are currently working on.
2116 Fup = vec_sld(Fup,F,15);
2117 Hup2 = vec_sld(Hup2,H,15);
2118
2119 // do the dynamic programming
2120
2121 // update E value
2122 E = vec_subs(E,v_gapextend);
2123 tmp = vec_subs(H,v_gapopen);
2124 E = vec_max(E,tmp);
2125
2126 // update F value
2127 F = vec_subs(Fup,v_gapextend);
2128 tmp = vec_subs(Hup2,v_gapopen);
2129 F = vec_max(F,tmp);
2130
2131 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2132
2133 // add score to H
2134 H = vec_adds(Hup1,v_score);
2135 H = vec_subs(H,v_bias);
2136
2137 // set H to max of H,E,F
2138 H = vec_max(H,E);
2139 H = vec_max(H,F);
2140
2141
2142 // Update highest score encountered this far
2143 v_maxscore = vec_max(v_maxscore,H);
2144
2145
2146
2147
2148
2149
2150 // STEP 3
2151
2152 // prefetch next residue
2153 k = db_sequence[j+3];
2154 k8 = db_sequence[j-5];
2155
2156 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
2157 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
2158 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
2159 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
2160 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
2161 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
2162
2163
2164 // prefetch scores for next step
2165 v_score_load1 = vec_ld(16*k,query_profile_byte);
2166 v_score_load2 = vec_ld(16*k8,query_profile_byte);
2167
2168 // load values of F and H from previous row (one unit up)
2169 Fup = vec_ld(512, p);
2170 Hup1 = vec_ld(528, p);
2171
2172 // save old values of F and H to use on next row
2173 vec_st(F, 0, p);
2174 vec_st(H, 16, p);
2175 p += 32;
2176
2177 // shift into place so we have complete F and H vectors
2178 // that refer to the values one unit up from each cell
2179 // that we are currently working on.
2180 Fup = vec_sld(Fup,F,15);
2181 Hup1 = vec_sld(Hup1,H,15);
2182
2183 // do the dynamic programming
2184
2185 // update E value
2186 E = vec_subs(E,v_gapextend);
2187 tmp = vec_subs(H,v_gapopen);
2188 E = vec_max(E,tmp);
2189
2190 // update F value
2191 F = vec_subs(Fup,v_gapextend);
2192 tmp = vec_subs(Hup1,v_gapopen);
2193 F = vec_max(F,tmp);
2194
2195 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2196
2197 // add score to H
2198 H = vec_adds(Hup2,v_score);
2199 H = vec_subs(H,v_bias);
2200
2201 // set H to max of H,E,F
2202 H = vec_max(H,E);
2203 H = vec_max(H,F);
2204
2205 // Update highest score encountered this far
2206 v_maxscore = vec_max(v_maxscore,H);
2207
2208
2209
2210
2211
2212
2213 // STEP 4
2214
2215 // prefetch next residue
2216 k = db_sequence[j+4];
2217 k8 = db_sequence[j-4];
2218
2219 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
2220 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
2221 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
2222 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
2223 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
2224 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
2225
2226
2227 // prefetch scores for next step
2228 v_score_load1 = vec_ld(16*k,query_profile_byte);
2229 v_score_load2 = vec_ld(16*k8,query_profile_byte);
2230
2231 // load values of F and H from previous row (one unit up)
2232 Fup = vec_ld(512, p);
2233 Hup2 = vec_ld(528, p);
2234
2235 // save old values of F and H to use on next row
2236 vec_st(F, 0, p);
2237 vec_st(H, 16, p);
2238 p += 32;
2239
2240 // shift into place so we have complete F and H vectors
2241 // that refer to the values one unit up from each cell
2242 // that we are currently working on.
2243 Fup = vec_sld(Fup,F,15);
2244 Hup2 = vec_sld(Hup2,H,15);
2245
2246 // do the dynamic programming
2247
2248 // update E value
2249 E = vec_subs(E,v_gapextend);
2250 tmp = vec_subs(H,v_gapopen);
2251 E = vec_max(E,tmp);
2252
2253 // update F value
2254 F = vec_subs(Fup,v_gapextend);
2255 tmp = vec_subs(Hup2,v_gapopen);
2256 F = vec_max(F,tmp);
2257
2258 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2259
2260 // add score to H
2261 H = vec_adds(Hup1,v_score);
2262 H = vec_subs(H,v_bias);
2263
2264 // set H to max of H,E,F
2265 H = vec_max(H,E);
2266 H = vec_max(H,F);
2267
2268 // Update highest score encountered this far
2269 v_maxscore = vec_max(v_maxscore,H);
2270
2271
2272
2273
2274
2275
2276 // STEP 5
2277
2278 // prefetch next residue
2279 k = db_sequence[j+5];
2280 k8 = db_sequence[j-3];
2281
2282 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
2283 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
2284 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
2285 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
2286 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
2287 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
2288
2289
2290 // prefetch scores for next step
2291 v_score_load1 = vec_ld(16*k,query_profile_byte);
2292 v_score_load2 = vec_ld(16*k8,query_profile_byte);
2293
2294 // load values of F and H from previous row (one unit up)
2295 Fup = vec_ld(512, p);
2296 Hup1 = vec_ld(528, p);
2297
2298 // save old values of F and H to use on next row
2299 vec_st(F, 0, p);
2300 vec_st(H, 16, p);
2301 p += 32;
2302
2303 // shift into place so we have complete F and H vectors
2304 // that refer to the values one unit up from each cell
2305 // that we are currently working on.
2306 Fup = vec_sld(Fup,F,15);
2307 Hup1 = vec_sld(Hup1,H,15);
2308
2309 // do the dynamic programming
2310
2311 // update E value
2312 E = vec_subs(E,v_gapextend);
2313 tmp = vec_subs(H,v_gapopen);
2314 E = vec_max(E,tmp);
2315
2316 // update F value
2317 F = vec_subs(Fup,v_gapextend);
2318 tmp = vec_subs(Hup1,v_gapopen);
2319 F = vec_max(F,tmp);
2320
2321 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2322
2323 // add score to H
2324 H = vec_adds(Hup2,v_score);
2325 H = vec_subs(H,v_bias);
2326
2327 // set H to max of H,E,F
2328 H = vec_max(H,E);
2329 H = vec_max(H,F);
2330
2331 // Update highest score encountered this far
2332 v_maxscore = vec_max(v_maxscore,H);
2333
2334
2335
2336
2337
2338
2339 // STEP 6
2340
2341 // prefetch next residue
2342 k = db_sequence[j+6];
2343 k8 = db_sequence[j-2];
2344
2345 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
2346 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
2347 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
2348 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
2349 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
2350 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
2351
2352
2353 // prefetch scores for next step
2354 v_score_load1 = vec_ld(16*k,query_profile_byte);
2355 v_score_load2 = vec_ld(16*k8,query_profile_byte);
2356
2357 // load values of F and H from previous row (one unit up)
2358 Fup = vec_ld(512, p);
2359 Hup2 = vec_ld(528, p);
2360
2361 // save old values of F and H to use on next row
2362 vec_st(F, 0, p);
2363 vec_st(H, 16, p);
2364 p += 32;
2365
2366 // shift into place so we have complete F and H vectors
2367 // that refer to the values one unit up from each cell
2368 // that we are currently working on.
2369 Fup = vec_sld(Fup,F,15);
2370 Hup2 = vec_sld(Hup2,H,15);
2371
2372 // do the dynamic programming
2373
2374 // update E value
2375 E = vec_subs(E,v_gapextend);
2376 tmp = vec_subs(H,v_gapopen);
2377 E = vec_max(E,tmp);
2378
2379 // update F value
2380 F = vec_subs(Fup,v_gapextend);
2381 tmp = vec_subs(Hup2,v_gapopen);
2382 F = vec_max(F,tmp);
2383
2384 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2385
2386 // add score to H
2387 H = vec_adds(Hup1,v_score);
2388 H = vec_subs(H,v_bias);
2389
2390 // set H to max of H,E,F
2391 H = vec_max(H,E);
2392 H = vec_max(H,F);
2393
2394 // Update highest score encountered this far
2395 v_maxscore = vec_max(v_maxscore,H);
2396
2397
2398
2399
2400
2401
2402 // STEP 7
2403
2404 // prefetch next residue
2405 k = db_sequence[j+7];
2406 k8 = db_sequence[j-1];
2407
2408 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
2409 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
2410 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
2411 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
2412 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
2413 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
2414
2415
2416 // prefetch scores for next step
2417 v_score_load1 = vec_ld(16*k,query_profile_byte);
2418 v_score_load2 = vec_ld(16*k8,query_profile_byte);
2419
2420 // load values of F and H from previous row (one unit up)
2421 Fup = vec_ld(512, p);
2422 Hup1 = vec_ld(528, p);
2423
2424 // save old values of F and H to use on next row
2425 vec_st(F, 0, p);
2426 vec_st(H, 16, p);
2427 p += 32;
2428
2429 // shift into place so we have complete F and H vectors
2430 // that refer to the values one unit up from each cell
2431 // that we are currently working on.
2432 Fup = vec_sld(Fup,F,15);
2433 Hup1 = vec_sld(Hup1,H,15);
2434
2435 // do the dynamic programming
2436
2437 // update E value
2438 E = vec_subs(E,v_gapextend);
2439 tmp = vec_subs(H,v_gapopen);
2440 E = vec_max(E,tmp);
2441
2442 // update F value
2443 F = vec_subs(Fup,v_gapextend);
2444 tmp = vec_subs(Hup1,v_gapopen);
2445 F = vec_max(F,tmp);
2446
2447 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2448
2449 // add score to H
2450 H = vec_adds(Hup2,v_score);
2451 H = vec_subs(H,v_bias);
2452
2453 // set H to max of H,E,F
2454 H = vec_max(H,E);
2455 H = vec_max(H,F);
2456
2457 // Update highest score encountered this far
2458 v_maxscore = vec_max(v_maxscore,H);
2459
2460
2461
2462
2463
2464
2465 // STEP 8
2466
2467 // prefetch next residue
2468 k = db_sequence[j+8];
2469 k8 = db_sequence[j];
2470
2471 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
2472 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
2473 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
2474 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
2475 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
2476 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
2477
2478
2479 // prefetch scores for next step
2480 v_score_load1 = vec_ld(16*k,query_profile_byte);
2481 v_score_load2 = vec_ld(16*k8,query_profile_byte);
2482
2483 // load values of F and H from previous row (one unit up)
2484 Fup = vec_ld(512, p);
2485 Hup2 = vec_ld(528, p);
2486
2487 // save old values of F and H to use on next row
2488 vec_st(F, 0, p);
2489 vec_st(H, 16, p);
2490 p += 32;
2491
2492 // shift into place so we have complete F and H vectors
2493 // that refer to the values one unit up from each cell
2494 // that we are currently working on.
2495 Fup = vec_sld(Fup,F,15);
2496 Hup2 = vec_sld(Hup2,H,15);
2497
2498 // do the dynamic programming
2499
2500 // update E value
2501 E = vec_subs(E,v_gapextend);
2502 tmp = vec_subs(H,v_gapopen);
2503 E = vec_max(E,tmp);
2504
2505 // update F value
2506 F = vec_subs(Fup,v_gapextend);
2507 tmp = vec_subs(Hup2,v_gapopen);
2508 F = vec_max(F,tmp);
2509
2510 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2511
2512 // add score to H
2513 H = vec_adds(Hup1,v_score);
2514 H = vec_subs(H,v_bias);
2515
2516 // set H to max of H,E,F
2517 H = vec_max(H,E);
2518 H = vec_max(H,F);
2519
2520 // Update highest score encountered this far
2521 v_maxscore = vec_max(v_maxscore,H);
2522
2523
2524
2525
2526
2527
2528 // STEP 9
2529
2530 // prefetch next residue
2531 k = db_sequence[j+9];
2532 k8 = db_sequence[j+1];
2533
2534 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
2535 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
2536 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
2537 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
2538 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
2539 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
2540
2541
2542 // prefetch scores for next step
2543 v_score_load1 = vec_ld(16*k,query_profile_byte);
2544 v_score_load2 = vec_ld(16*k8,query_profile_byte);
2545
2546 // load values of F and H from previous row (one unit up)
2547 Fup = vec_ld(512, p);
2548 Hup1 = vec_ld(528, p);
2549
2550 // save old values of F and H to use on next row
2551 vec_st(F, 0, p);
2552 vec_st(H, 16, p);
2553 p += 32;
2554
2555 // shift into place so we have complete F and H vectors
2556 // that refer to the values one unit up from each cell
2557 // that we are currently working on.
2558 Fup = vec_sld(Fup,F,15);
2559 Hup1 = vec_sld(Hup1,H,15);
2560
2561 // do the dynamic programming
2562
2563 // update E value
2564 E = vec_subs(E,v_gapextend);
2565 tmp = vec_subs(H,v_gapopen);
2566 E = vec_max(E,tmp);
2567
2568 // update F value
2569 F = vec_subs(Fup,v_gapextend);
2570 tmp = vec_subs(Hup1,v_gapopen);
2571 F = vec_max(F,tmp);
2572
2573 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2574
2575 // add score to H
2576 H = vec_adds(Hup2,v_score);
2577 H = vec_subs(H,v_bias);
2578
2579 // set H to max of H,E,F
2580 H = vec_max(H,E);
2581 H = vec_max(H,F);
2582
2583 // Update highest score encountered this far
2584 v_maxscore = vec_max(v_maxscore,H);
2585
2586 // STEP 10
2587
2588 // prefetch next residue
2589 k = db_sequence[j+10];
2590 k8 = db_sequence[j+2];
2591
2592 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
2593 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
2594 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
2595 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
2596 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
2597 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
2598
2599
2600 // prefetch scores for next step
2601 v_score_load1 = vec_ld(16*k,query_profile_byte);
2602 v_score_load2 = vec_ld(16*k8,query_profile_byte);
2603
2604 // load values of F and H from previous row (one unit up)
2605 Fup = vec_ld(512, p);
2606 Hup2 = vec_ld(528, p);
2607
2608 // save old values of F and H to use on next row
2609 vec_st(F, 0, p);
2610 vec_st(H, 16, p);
2611 p += 32;
2612
2613 // shift into place so we have complete F and H vectors
2614 // that refer to the values one unit up from each cell
2615 // that we are currently working on.
2616 Fup = vec_sld(Fup,F,15);
2617 Hup2 = vec_sld(Hup2,H,15);
2618
2619 // do the dynamic programming
2620
2621 // update E value
2622 E = vec_subs(E,v_gapextend);
2623 tmp = vec_subs(H,v_gapopen);
2624 E = vec_max(E,tmp);
2625
2626 // update F value
2627 F = vec_subs(Fup,v_gapextend);
2628 tmp = vec_subs(Hup2,v_gapopen);
2629 F = vec_max(F,tmp);
2630
2631 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2632
2633 // add score to H
2634 H = vec_adds(Hup1,v_score);
2635 H = vec_subs(H,v_bias);
2636
2637 // set H to max of H,E,F
2638 H = vec_max(H,E);
2639 H = vec_max(H,F);
2640
2641 // Update highest score encountered this far
2642 v_maxscore = vec_max(v_maxscore,H);
2643
2644 // STEP 11
2645
2646 // prefetch next residue
2647 k = db_sequence[j+11];
2648 k8 = db_sequence[j+3];
2649
2650 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
2651 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
2652 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
2653 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
2654 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
2655 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
2656
2657
2658 // prefetch scores for next step
2659 v_score_load1 = vec_ld(16*k,query_profile_byte);
2660 v_score_load2 = vec_ld(16*k8,query_profile_byte);
2661
2662 // load values of F and H from previous row (one unit up)
2663 Fup = vec_ld(512, p);
2664 Hup1 = vec_ld(528, p);
2665
2666 // save old values of F and H to use on next row
2667 vec_st(F, 0, p);
2668 vec_st(H, 16, p);
2669 p += 32;
2670
2671 // shift into place so we have complete F and H vectors
2672 // that refer to the values one unit up from each cell
2673 // that we are currently working on.
2674 Fup = vec_sld(Fup,F,15);
2675 Hup1 = vec_sld(Hup1,H,15);
2676
2677 // do the dynamic programming
2678
2679 // update E value
2680 E = vec_subs(E,v_gapextend);
2681 tmp = vec_subs(H,v_gapopen);
2682 E = vec_max(E,tmp);
2683
2684 // update F value
2685 F = vec_subs(Fup,v_gapextend);
2686 tmp = vec_subs(Hup1,v_gapopen);
2687 F = vec_max(F,tmp);
2688
2689 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2690
2691 // add score to H
2692 H = vec_adds(Hup2,v_score);
2693 H = vec_subs(H,v_bias);
2694
2695 // set H to max of H,E,F
2696 H = vec_max(H,E);
2697 H = vec_max(H,F);
2698
2699 // Update highest score encountered this far
2700 v_maxscore = vec_max(v_maxscore,H);
2701
2702 // STEP 12
2703
2704 // prefetch next residue
2705 k = db_sequence[j+12];
2706 k8 = db_sequence[j+4];
2707
2708 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
2709 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
2710 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
2711 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
2712 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
2713 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
2714
2715
2716 // prefetch scores for next step
2717 v_score_load1 = vec_ld(16*k,query_profile_byte);
2718 v_score_load2 = vec_ld(16*k8,query_profile_byte);
2719
2720 // load values of F and H from previous row (one unit up)
2721 Fup = vec_ld(512, p);
2722 Hup2 = vec_ld(528, p);
2723
2724 // save old values of F and H to use on next row
2725 vec_st(F, 0, p);
2726 vec_st(H, 16, p);
2727 p += 32;
2728
2729 // shift into place so we have complete F and H vectors
2730 // that refer to the values one unit up from each cell
2731 // that we are currently working on.
2732 Fup = vec_sld(Fup,F,15);
2733 Hup2 = vec_sld(Hup2,H,15);
2734
2735 // do the dynamic programming
2736
2737 // update E value
2738 E = vec_subs(E,v_gapextend);
2739 tmp = vec_subs(H,v_gapopen);
2740 E = vec_max(E,tmp);
2741
2742 // update F value
2743 F = vec_subs(Fup,v_gapextend);
2744 tmp = vec_subs(Hup2,v_gapopen);
2745 F = vec_max(F,tmp);
2746
2747 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2748
2749 // add score to H
2750 H = vec_adds(Hup1,v_score);
2751 H = vec_subs(H,v_bias);
2752
2753 // set H to max of H,E,F
2754 H = vec_max(H,E);
2755 H = vec_max(H,F);
2756
2757 // Update highest score encountered this far
2758 v_maxscore = vec_max(v_maxscore,H);
2759
2760 // STEP 13
2761
2762 // prefetch next residue
2763 k = db_sequence[j+13];
2764 k8 = db_sequence[j+5];
2765
2766 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
2767 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
2768 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
2769 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
2770 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
2771 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
2772
2773
2774 // prefetch scores for next step
2775 v_score_load1 = vec_ld(16*k,query_profile_byte);
2776 v_score_load2 = vec_ld(16*k8,query_profile_byte);
2777
2778 // load values of F and H from previous row (one unit up)
2779 Fup = vec_ld(512, p);
2780 Hup1 = vec_ld(528, p);
2781
2782 // save old values of F and H to use on next row
2783 vec_st(F, 0, p);
2784 vec_st(H, 16, p);
2785 p += 32;
2786
2787 // shift into place so we have complete F and H vectors
2788 // that refer to the values one unit up from each cell
2789 // that we are currently working on.
2790 Fup = vec_sld(Fup,F,15);
2791 Hup1 = vec_sld(Hup1,H,15);
2792
2793 // do the dynamic programming
2794
2795 // update E value
2796 E = vec_subs(E,v_gapextend);
2797 tmp = vec_subs(H,v_gapopen);
2798 E = vec_max(E,tmp);
2799
2800 // update F value
2801 F = vec_subs(Fup,v_gapextend);
2802 tmp = vec_subs(Hup1,v_gapopen);
2803 F = vec_max(F,tmp);
2804
2805 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2806
2807 // add score to H
2808 H = vec_adds(Hup2,v_score);
2809 H = vec_subs(H,v_bias);
2810
2811 // set H to max of H,E,F
2812 H = vec_max(H,E);
2813 H = vec_max(H,F);
2814
2815 // Update highest score encountered this far
2816 v_maxscore = vec_max(v_maxscore,H);
2817
2818 // STEP 14
2819
2820 // prefetch next residue
2821 k = db_sequence[j+14];
2822 k8 = db_sequence[j+6];
2823
2824 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
2825 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
2826 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
2827 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
2828 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
2829 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
2830
2831
2832 // prefetch scores for next step
2833 v_score_load1 = vec_ld(16*k,query_profile_byte);
2834 v_score_load2 = vec_ld(16*k8,query_profile_byte);
2835
2836 // load values of F and H from previous row (one unit up)
2837 Fup = vec_ld(512, p);
2838 Hup2 = vec_ld(528, p);
2839
2840 // save old values of F and H to use on next row
2841 vec_st(F, 0, p);
2842 vec_st(H, 16, p);
2843 p += 32;
2844
2845 // shift into place so we have complete F and H vectors
2846 // that refer to the values one unit up from each cell
2847 // that we are currently working on.
2848 Fup = vec_sld(Fup,F,15);
2849 Hup2 = vec_sld(Hup2,H,15);
2850
2851 // do the dynamic programming
2852
2853 // update E value
2854 E = vec_subs(E,v_gapextend);
2855 tmp = vec_subs(H,v_gapopen);
2856 E = vec_max(E,tmp);
2857
2858 // update F value
2859 F = vec_subs(Fup,v_gapextend);
2860 tmp = vec_subs(Hup2,v_gapopen);
2861 F = vec_max(F,tmp);
2862
2863 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2864
2865 // add score to H
2866 H = vec_adds(Hup1,v_score);
2867 H = vec_subs(H,v_bias);
2868
2869 // set H to max of H,E,F
2870 H = vec_max(H,E);
2871 H = vec_max(H,F);
2872
2873 // Update highest score encountered this far
2874 v_maxscore = vec_max(v_maxscore,H);
2875
2876 // STEP 15
2877
2878 // prefetch next residue
2879 k = db_sequence[j+15];
2880 k8 = db_sequence[j+7];
2881
2882 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
2883 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
2884 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
2885 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
2886 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
2887 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
2888
2889 // prefetch scores for next step
2890 v_score_load1 = vec_ld(16*k,query_profile_byte);
2891 v_score_load2 = vec_ld(16*k8,query_profile_byte);
2892
2893 // load values of F and H from previous row (one unit up)
2894 Fup = vec_ld(512, p);
2895 Hup1 = vec_ld(528, p);
2896
2897 // save old values of F and H to use on next row
2898 vec_st(F, 0, p);
2899 vec_st(H, 16, p);
2900 p += 32;
2901
2902 // shift into place so we have complete F and H vectors
2903 // that refer to the values one unit up from each cell
2904 // that we are currently working on.
2905 Fup = vec_sld(Fup,F,15);
2906 Hup1 = vec_sld(Hup1,H,15);
2907
2908 // do the dynamic programming
2909
2910 // update E value
2911 E = vec_subs(E,v_gapextend);
2912 tmp = vec_subs(H,v_gapopen);
2913 E = vec_max(E,tmp);
2914
2915 // update F value
2916 F = vec_subs(Fup,v_gapextend);
2917 tmp = vec_subs(Hup1,v_gapopen);
2918 F = vec_max(F,tmp);
2919
2920 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2921
2922 // add score to H
2923 H = vec_adds(Hup2,v_score);
2924 H = vec_subs(H,v_bias);
2925
2926 // set H to max of H,E,F
2927 H = vec_max(H,E);
2928 H = vec_max(H,F);
2929
2930 // Update highest score encountered this far
2931 v_maxscore = vec_max(v_maxscore,H);
2932
2933 // STEP 16
2934
2935 // prefetch next residue
2936 k = db_sequence[j+16];
2937 k8 = db_sequence[j+8];
2938
2939 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
2940 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
2941 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
2942 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
2943 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
2944 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
2945
2946
2947 // prefetch scores for next step
2948 v_score_load1 = vec_ld(16*k,query_profile_byte);
2949 v_score_load2 = vec_ld(16*k8,query_profile_byte);
2950
2951 // load values of F and H from previous row (one unit up)
2952 Fup = vec_ld(512, p);
2953 Hup2 = vec_ld(528, p);
2954
2955 // save old values of F and H to use on next row
2956 vec_st(F, 0, p);
2957 vec_st(H, 16, p);
2958 p += 32;
2959
2960 // shift into place so we have complete F and H vectors
2961 // that refer to the values one unit up from each cell
2962 // that we are currently working on.
2963 Fup = vec_sld(Fup,F,15);
2964 Hup2 = vec_sld(Hup2,H,15);
2965
2966 // do the dynamic programming
2967
2968 // update E value
2969 E = vec_subs(E,v_gapextend);
2970 tmp = vec_subs(H,v_gapopen);
2971 E = vec_max(E,tmp);
2972
2973 // update F value
2974 F = vec_subs(Fup,v_gapextend);
2975 tmp = vec_subs(Hup2,v_gapopen);
2976 F = vec_max(F,tmp);
2977
2978 v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2979
2980 // add score to H
2981 H = vec_adds(Hup1,v_score);
2982 H = vec_subs(H,v_bias);
2983
2984 // set H to max of H,E,F
2985 H = vec_max(H,E);
2986 H = vec_max(H,F);
2987
2988 // Update highest score encountered this far
2989 v_maxscore = vec_max(v_maxscore,H);
2990
2991 }
2992
2993 for(;j<db_length+15;j++)
2994 {
2995 k8 = db_sequence[j-7];
2996
2997 v_score = vec_perm(v_score_q1, v_score_load1, queue1_to_score);
2998 v_score_q1 = vec_perm(v_score_q2, v_score_load1, queue2_to_queue1);
2999 v_score_q2 = vec_perm(v_score_q3, v_score_load1, queue3_to_queue2);
3000 v_score_q3 = vec_perm(v_score_q4, v_score_load1, queue4_to_queue3);
3001 v_score_q4 = vec_perm(v_score_q5, v_score_load1, queue5_to_queue4);
3002 v_score_q5 = vec_perm(v_score_q5, v_score_load1, queue5_with_load);
3003
3004
3005 // prefetch scores for next step
3006 v_score_load2 = vec_ld(16*k8,query_profile_byte);
3007 v_score_load1 = vec_perm(v_zero,v_score_load2,merge_score_load);
3008
3009 // save old values of F and H to use on next row
3010 vec_st(F, 0, p);
3011 vec_st(H, 16, p);
3012 p += 32; // move ahead 32 bytes
3013
3014 Fup = vec_sld(v_zero,F,15);
3015 Hup1 = vec_sld(v_zero,H,15);
3016
3017 // do the dynamic programming
3018
3019 // update E value
3020 E = vec_subs(E,v_gapextend);
3021 tmp = vec_subs(H,v_gapopen);
3022 E = vec_max(E,tmp);
3023
3024 // update F value
3025 F = vec_subs(Fup,v_gapextend);
3026 tmp = vec_subs(Hup1,v_gapopen);
3027 F = vec_max(F,tmp);
3028
3029 // add score to H
3030 H = vec_adds(Hup2,v_score);
3031 H = vec_subs(H,v_bias);
3032
3033 // set H to max of H,E,F
3034 H = vec_max(H,E);
3035 H = vec_max(H,F);
3036
3037 // Save value to use for next diagonal H
3038 Hup2 = Hup1;
3039
3040 // Update highest score encountered this far
3041 v_maxscore = vec_max(v_maxscore,H);
3042 }
3043 vec_st(F, 512, p);
3044 vec_st(H, 528, p);
3045
3046 query_profile_byte += 16*alphabet_size;
3047
3048 // End of this row (actually 16 rows due to SIMD).
3049 // Before we continue, check for overflow.
3050 tmp = vec_subs(vec_splat_u8(-1),v_bias);
3051 overflow = vec_any_ge(v_maxscore,tmp);
3052
3053
3054 }
3055
3056 if(overflow)
3057 {
3058 return 255;
3059 }
3060 else
3061 {
3062 // find largest score in the v_maxscore vector
3063 tmp = vec_sld(v_maxscore,v_maxscore,8);
3064 v_maxscore = vec_max(v_maxscore,tmp);
3065 tmp = vec_sld(v_maxscore,v_maxscore,4);
3066 v_maxscore = vec_max(v_maxscore,tmp);
3067 tmp = vec_sld(v_maxscore,v_maxscore,2);
3068 v_maxscore = vec_max(v_maxscore,tmp);
3069 tmp = vec_sld(v_maxscore,v_maxscore,1);
3070 v_maxscore = vec_max(v_maxscore,tmp);
3071
3072 // store in temporary variable
3073 vec_ste(v_maxscore,0,&score);
3074
3075 // return largest score
3076 return score;
3077 }}
3078
3079
3080 #else
3081
3082 /* Fallback for builds where SW_ALTIVEC is not defined: the AltiVec routines
 * guarded by the #ifdef above are compiled out, so define a placeholder
 * global to keep this object file from being empty, which some compilers
 * complain about. Nothing in the visible part of this file reads it. */
3083
3084 int sw_dummy;
3085
3086 #endif
3087