1 
2 /* Implementation of the Wozniak "anti-diagonal" vectorization
3    strategy for Smith-Waterman comparison, Wozniak (1997) Comp.
4    Appl. Biosci. 13:145-150
5 
6    November, 2004
7 */
8 
9 /*
10   Written by Erik Lindahl, Stockholm Bioinformatics Center, 2004.
11   Please send bug reports and/or suggestions to lindahl@sbc.su.se.
12 */
13 
14 #include <stdio.h>
15 
16 #include "defs.h"
17 #include "param.h"
18 #include "dropgsw2.h"
19 
20 #ifdef SW_ALTIVEC
21 
22 int
smith_waterman_altivec_word(unsigned char * query_sequence,unsigned short * query_profile_word,int query_length,unsigned char * db_sequence,int db_length,unsigned short bias,unsigned short gap_open,unsigned short gap_extend,struct f_struct * f_str)23 smith_waterman_altivec_word(unsigned char *     query_sequence,
24                             unsigned short *    query_profile_word,
25                             int                 query_length,
26                             unsigned char *     db_sequence,
27                             int                 db_length,
28                             unsigned short      bias,
29                             unsigned short      gap_open,
30                             unsigned short      gap_extend,
31                             struct f_struct *   f_str)
32 {
33     int                     i,j,k;
34     unsigned short *        p;
35     unsigned short          score;
36     unsigned char *         p_dbseq;
37     int                     alphabet_size = f_str->alphabet_size;
38     unsigned short *        workspace     = (unsigned short *)f_str->workspace;
39 
40     vector unsigned short   Fup,Hup1,Hup2,E,F,H,tmp;
41     vector unsigned char    perm;
42     vector unsigned short   v_maxscore;
43     vector unsigned short   v_bias,v_gapopen,v_gapextend;
44     vector unsigned short   v_score;
45     vector unsigned short   v_score_q1;
46     vector unsigned short   v_score_q2;
47     vector unsigned short   v_score_q3;
48     vector unsigned short   v_score_load;
49     vector unsigned char    queue1_to_score  = (vector unsigned char)(16,17,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
50     vector unsigned char    queue2_to_queue1 = (vector unsigned char)(0,1,18,19,4,5,6,7,8,9,10,11,12,13,14,15);
51     vector unsigned char    queue3_to_queue2 = (vector unsigned char)(16,16,16,16,16,21,16,0,16,1,16,2,16,3,16,4);
52     vector unsigned char    queue3_with_load = (vector unsigned char)(23,5,6,7,8,25,9,10,11,27,12,13,29,14,31,16);
53 
54     /* Load the bias to all elements of a constant */
55     v_bias           = vec_lde(0,&bias);
56     perm             = vec_lvsl(0,&bias);
57     v_bias           = vec_perm(v_bias,v_bias,perm);
58     v_bias           = vec_splat(v_bias,0);
59 
60     /* Load gap opening penalty to all elements of a constant */
61     v_gapopen        = vec_lde(0,&gap_open);
62     perm             = vec_lvsl(0,&gap_open);
63     v_gapopen        = vec_perm(v_gapopen,v_gapopen,perm);
64     v_gapopen        = vec_splat(v_gapopen,0);
65 
66     /* Load gap extension penalty to all elements of a constant */
67     v_gapextend      = vec_lde(0,&gap_extend);
68     perm             = vec_lvsl(0,&gap_extend);
69     v_gapextend      = vec_perm(v_gapextend,v_gapextend,perm);
70     v_gapextend      = vec_splat(v_gapextend,0);
71 
72     v_maxscore = vec_xor(v_maxscore,v_maxscore);
73 
74     // Zero out the storage vector
75     k = 2*(db_length+7);
76 
77     for(i=0,j=0;i<k;i++,j+=16)
78     {
79         // borrow the zero value in v_maxscore to have something to store
80         vec_st(v_maxscore,j,workspace);
81     }
82 
83     for(i=0;i<query_length;i+=8)
84     {
85         // fetch first data asap.
86         p_dbseq    = db_sequence;
87         k          = *p_dbseq++;
88         v_score_load = vec_ld(16*k,query_profile_word);
89 
90         // zero lots of stuff.
91         // We use both the VPERM and VSIU unit to knock off some cycles.
92 
93         E          = vec_splat_u16(0);
94         F          = vec_xor(F,F);
95         H          = vec_splat_u16(0);
96         Hup2       = vec_xor(Hup2,Hup2);
97         v_score_q1 = vec_splat_u16(0);
98         v_score_q2 = vec_xor(v_score_q2,v_score_q2);
99         v_score_q3 = vec_splat_u16(0);
100 
101         // reset pointers to the start of the saved data from the last row
102         p = workspace;
103 
104         // PROLOGUE 1
105         // prefetch next residue
106         k          = *p_dbseq++;
107 
108         // Create the actual diagonal score vector
109         // and update the queue of incomplete score vectors
110 
111         v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
112         v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
113         v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
114         v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
115 
116         // prefetch score for next step
117         v_score_load = vec_ld(16*k,query_profile_word);
118 
119         // load values of F and H from previous row (one unit up)
120         Fup    = vec_ld(0,  p);
121         Hup1   = vec_ld(16, p);
122         p += 16; // move ahead 32 bytes
123 
124         // shift into place so we have complete F and H vectors
125         // that refer to the values one unit up from each cell
126         // that we are currently working on.
127         Fup    = vec_sld(Fup,F,14);
128         Hup1   = vec_sld(Hup1,H,14);
129 
130         // do the dynamic programming
131 
132         // update E value
133         E   = vec_subs(E,v_gapextend);
134         tmp = vec_subs(H,v_gapopen);
135         E   = vec_max(E,tmp);
136 
137         // update F value
138         F   = vec_subs(Fup,v_gapextend);
139         tmp = vec_subs(Hup1,v_gapopen);
140         F   = vec_max(F,tmp);
141 
142         // add score to H
143         H   = vec_adds(Hup2,v_score);
144         H   = vec_subs(H,v_bias);
145 
146         // set H to max of H,E,F
147         H   = vec_max(H,E);
148         H   = vec_max(H,F);
149 
150         // Save value to use for next diagonal H
151         Hup2 = Hup1;
152 
153         // Update highest score encountered this far
154         v_maxscore = vec_max(v_maxscore,H);
155 
156 
157         // PROLOGUE 2
158         // prefetch next residue
159         k          = *p_dbseq++;
160 
161         // Create the actual diagonal score vector
162         // and update the queue of incomplete score vectors
163 
164         v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
165         v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
166         v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
167         v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
168 
169         // prefetch score for next step
170         v_score_load = vec_ld(16*k,query_profile_word);
171 
172         // load values of F and H from previous row (one unit up)
173         Fup    = vec_ld(0,  p);
174         Hup1   = vec_ld(16, p);
175         p += 16; // move ahead 32 bytes
176 
177         // shift into place so we have complete F and H vectors
178         // that refer to the values one unit up from each cell
179         // that we are currently working on.
180         Fup    = vec_sld(Fup,F,14);
181         Hup1   = vec_sld(Hup1,H,14);
182 
183         // do the dynamic programming
184 
185         // update E value
186         E   = vec_subs(E,v_gapextend);
187         tmp = vec_subs(H,v_gapopen);
188         E   = vec_max(E,tmp);
189 
190         // update F value
191         F   = vec_subs(Fup,v_gapextend);
192         tmp = vec_subs(Hup1,v_gapopen);
193         F   = vec_max(F,tmp);
194 
195         // add score to H
196         H   = vec_adds(Hup2,v_score);
197         H   = vec_subs(H,v_bias);
198 
199         // set H to max of H,E,F
200         H   = vec_max(H,E);
201         H   = vec_max(H,F);
202 
203         // Save value to use for next diagonal H
204         Hup2 = Hup1;
205 
206         // Update highest score encountered this far
207         v_maxscore = vec_max(v_maxscore,H);
208 
209 
210         // PROLOGUE 3
211         // prefetch next residue
212         k          = *p_dbseq++;
213 
214         // Create the actual diagonal score vector
215         // and update the queue of incomplete score vectors
216 
217         v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
218         v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
219         v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
220         v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
221 
222         // prefetch score for next step
223         v_score_load = vec_ld(16*k,query_profile_word);
224 
225         // load values of F and H from previous row (one unit up)
226         Fup    = vec_ld(0,  p);
227         Hup1   = vec_ld(16, p);
228         p += 16; // move ahead 32 bytes
229 
230         // shift into place so we have complete F and H vectors
231         // that refer to the values one unit up from each cell
232         // that we are currently working on.
233         Fup    = vec_sld(Fup,F,14);
234         Hup1   = vec_sld(Hup1,H,14);
235 
236         // do the dynamic programming
237 
238         // update E value
239         E   = vec_subs(E,v_gapextend);
240         tmp = vec_subs(H,v_gapopen);
241         E   = vec_max(E,tmp);
242 
243         // update F value
244         F   = vec_subs(Fup,v_gapextend);
245         tmp = vec_subs(Hup1,v_gapopen);
246         F   = vec_max(F,tmp);
247 
248         // add score to H
249         H   = vec_adds(Hup2,v_score);
250         H   = vec_subs(H,v_bias);
251 
252         // set H to max of H,E,F
253         H   = vec_max(H,E);
254         H   = vec_max(H,F);
255 
256         // Save value to use for next diagonal H
257         Hup2 = Hup1;
258 
259         // Update highest score encountered this far
260         v_maxscore = vec_max(v_maxscore,H);
261 
262 
263         // PROLOGUE 4
264         // prefetch next residue
265         k          = *p_dbseq++;
266 
267         // Create the actual diagonal score vector
268         // and update the queue of incomplete score vectors
269 
270         v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
271         v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
272         v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
273         v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
274 
275         // prefetch score for next step
276         v_score_load = vec_ld(16*k,query_profile_word);
277 
278         // load values of F and H from previous row (one unit up)
279         Fup    = vec_ld(0,  p);
280         Hup1   = vec_ld(16, p);
281         p += 16; // move ahead 32 bytes
282 
283         // shift into place so we have complete F and H vectors
284         // that refer to the values one unit up from each cell
285         // that we are currently working on.
286         Fup    = vec_sld(Fup,F,14);
287         Hup1   = vec_sld(Hup1,H,14);
288 
289         // do the dynamic programming
290 
291         // update E value
292         E   = vec_subs(E,v_gapextend);
293         tmp = vec_subs(H,v_gapopen);
294         E   = vec_max(E,tmp);
295 
296         // update F value
297         F   = vec_subs(Fup,v_gapextend);
298         tmp = vec_subs(Hup1,v_gapopen);
299         F   = vec_max(F,tmp);
300 
301         // add score to H
302         H   = vec_adds(Hup2,v_score);
303         H   = vec_subs(H,v_bias);
304 
305         // set H to max of H,E,F
306         H   = vec_max(H,E);
307         H   = vec_max(H,F);
308 
309         // Save value to use for next diagonal H
310         Hup2 = Hup1;
311 
312         // Update highest score encountered this far
313         v_maxscore = vec_max(v_maxscore,H);
314 
315 
316         // PROLOGUE 5
317         // prefetch next residue
318         k          = *p_dbseq++;
319 
320         // Create the actual diagonal score vector
321         // and update the queue of incomplete score vectors
322 
323         v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
324         v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
325         v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
326         v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
327 
328         // prefetch score for next step
329         v_score_load = vec_ld(16*k,query_profile_word);
330 
331         // load values of F and H from previous row (one unit up)
332         Fup    = vec_ld(0,  p);
333         Hup1   = vec_ld(16, p);
334         p += 16; // move ahead 32 bytes
335 
336         // shift into place so we have complete F and H vectors
337         // that refer to the values one unit up from each cell
338         // that we are currently working on.
339         Fup    = vec_sld(Fup,F,14);
340         Hup1   = vec_sld(Hup1,H,14);
341 
342         // do the dynamic programming
343 
344         // update E value
345         E   = vec_subs(E,v_gapextend);
346         tmp = vec_subs(H,v_gapopen);
347         E   = vec_max(E,tmp);
348 
349         // update F value
350         F   = vec_subs(Fup,v_gapextend);
351         tmp = vec_subs(Hup1,v_gapopen);
352         F   = vec_max(F,tmp);
353 
354         // add score to H
355         H   = vec_adds(Hup2,v_score);
356         H   = vec_subs(H,v_bias);
357 
358         // set H to max of H,E,F
359         H   = vec_max(H,E);
360         H   = vec_max(H,F);
361 
362         // Save value to use for next diagonal H
363         Hup2 = Hup1;
364 
365         // Update highest score encountered this far
366         v_maxscore = vec_max(v_maxscore,H);
367 
368 
369         // PROLOGUE 6
370         // prefetch next residue
371         k          = *p_dbseq++;
372 
373         // Create the actual diagonal score vector
374         // and update the queue of incomplete score vectors
375 
376         v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
377         v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
378         v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
379         v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
380 
381         // prefetch score for next step
382         v_score_load = vec_ld(16*k,query_profile_word);
383 
384         // load values of F and H from previous row (one unit up)
385         Fup    = vec_ld(0,  p);
386         Hup1   = vec_ld(16, p);
387         p += 16; // move ahead 32 bytes
388 
389         // shift into place so we have complete F and H vectors
390         // that refer to the values one unit up from each cell
391         // that we are currently working on.
392         Fup    = vec_sld(Fup,F,14);
393         Hup1   = vec_sld(Hup1,H,14);
394 
395         // do the dynamic programming
396 
397         // update E value
398         E   = vec_subs(E,v_gapextend);
399         tmp = vec_subs(H,v_gapopen);
400         E   = vec_max(E,tmp);
401 
402         // update F value
403         F   = vec_subs(Fup,v_gapextend);
404         tmp = vec_subs(Hup1,v_gapopen);
405         F   = vec_max(F,tmp);
406 
407         // add score to H
408         H   = vec_adds(Hup2,v_score);
409         H   = vec_subs(H,v_bias);
410 
411         // set H to max of H,E,F
412         H   = vec_max(H,E);
413         H   = vec_max(H,F);
414 
415         // Save value to use for next diagonal H
416         Hup2 = Hup1;
417 
418         // Update highest score encountered this far
419         v_maxscore = vec_max(v_maxscore,H);
420 
421 
422         // PROLOGUE 7
423         // prefetch next residue
424         k          = *p_dbseq++;
425 
426         // Create the actual diagonal score vector
427         // and update the queue of incomplete score vectors
428 
429         v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
430         v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
431         v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
432         v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
433 
434         // prefetch score for next step
435         v_score_load = vec_ld(16*k,query_profile_word);
436 
437         // load values of F and H from previous row (one unit up)
438         Fup    = vec_ld(0,  p);
439         Hup1   = vec_ld(16, p);
440         p += 16; // move ahead 32 bytes
441 
442         // shift into place so we have complete F and H vectors
443         // that refer to the values one unit up from each cell
444         // that we are currently working on.
445         Fup    = vec_sld(Fup,F,14);
446         Hup1   = vec_sld(Hup1,H,14);
447 
448         // do the dynamic programming
449 
450         // update E value
451         E   = vec_subs(E,v_gapextend);
452         tmp = vec_subs(H,v_gapopen);
453         E   = vec_max(E,tmp);
454 
455         // update F value
456         F   = vec_subs(Fup,v_gapextend);
457         tmp = vec_subs(Hup1,v_gapopen);
458         F   = vec_max(F,tmp);
459 
460         // add score to H
461         H   = vec_adds(Hup2,v_score);
462         H   = vec_subs(H,v_bias);
463 
464         // set H to max of H,E,F
465         H   = vec_max(H,E);
466         H   = vec_max(H,F);
467 
468         // Save value to use for next diagonal H
469         Hup2 = Hup1;
470 
471         // Update highest score encountered this far
472         v_maxscore = vec_max(v_maxscore,H);
473 
474 
475         // PROLOGUE 8
476         // prefetch next residue
477         k          = *p_dbseq++;
478 
479         // Create the actual diagonal score vector
480         // and update the queue of incomplete score vectors
481 
482         v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
483         v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
484         v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
485         v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
486 
487         // prefetch score for next step
488         v_score_load = vec_ld(16*k,query_profile_word);
489 
490         // load values of F and H from previous row (one unit up)
491         Fup    = vec_ld(0,  p);
492         Hup1   = vec_ld(16, p);
493         p += 16; // move ahead 32 bytes
494 
495         // shift into place so we have complete F and H vectors
496         // that refer to the values one unit up from each cell
497         // that we are currently working on.
498         Fup    = vec_sld(Fup,F,14);
499         Hup1   = vec_sld(Hup1,H,14);
500 
501         // do the dynamic programming
502 
503         // update E value
504         E   = vec_subs(E,v_gapextend);
505         tmp = vec_subs(H,v_gapopen);
506         E   = vec_max(E,tmp);
507 
508         // update F value
509         F   = vec_subs(Fup,v_gapextend);
510         tmp = vec_subs(Hup1,v_gapopen);
511         F   = vec_max(F,tmp);
512 
513         // add score to H
514         H   = vec_adds(Hup2,v_score);
515         H   = vec_subs(H,v_bias);
516 
517         // set H to max of H,E,F
518         H   = vec_max(H,E);
519         H   = vec_max(H,F);
520 
521         // Save value to use for next diagonal H
522         Hup2 = Hup1;
523 
524         // Update highest score encountered this far
525         v_maxscore = vec_max(v_maxscore,H);
526 
527 
528         // reset pointers to the start of the saved data from the last row
529         p = workspace;
530 
531         for(j=8;j<db_length;j+=8)
532         {
533             // STEP 1
534 
535             // prefetch next residue
536             k          = *p_dbseq++;
537 
538             // Create the actual diagonal score vector
539             // and update the queue of incomplete score vectors
540 
541             v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
542             v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
543             v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
544             v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
545 
546             // prefetch score for next step
547             v_score_load = vec_ld(16*k,query_profile_word);
548 
549             // load values of F and H from previous row (one unit up)
550             Fup    = vec_ld(256, p);
551             Hup1   = vec_ld(272, p);
552 
553             // save old values of F and H to use on next row
554             vec_st(F, 0,  p);
555             vec_st(H, 16, p);
556             p += 16; // move ahead 32 bytes
557 
558             // shift into place so we have complete F and H vectors
559             // that refer to the values one unit up from each cell
560             // that we are currently working on.
561             Fup    = vec_sld(Fup,F,14);
562             Hup1   = vec_sld(Hup1,H,14);
563 
564             // do the dynamic programming
565 
566             // update E value
567             E   = vec_subs(E,v_gapextend);
568             tmp = vec_subs(H,v_gapopen);
569             E   = vec_max(E,tmp);
570 
571             // update F value
572             F   = vec_subs(Fup,v_gapextend);
573             tmp = vec_subs(Hup1,v_gapopen);
574             F   = vec_max(F,tmp);
575 
576             // add score to H
577             H   = vec_adds(Hup2,v_score);
578             H   = vec_subs(H,v_bias);
579 
580             // set H to max of H,E,F
581             H   = vec_max(H,E);
582             H   = vec_max(H,F);
583 
584 
585             // Update highest score encountered this far
586             v_maxscore = vec_max(v_maxscore,H);
587 
588 
589 
590             // STEP 2
591 
592             // prefetch next residue
593             k          = *p_dbseq++;
594 
595             // Create the actual diagonal score vector
596             // and update the queue of incomplete score vectors
597 
598             v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
599             v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
600             v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
601             v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
602 
603             // prefetch score for next step
604             v_score_load = vec_ld(16*k,query_profile_word);
605 
606             // load values of F and H from previous row (one unit up)
607             Fup    = vec_ld(256, p);
608             Hup2   = vec_ld(272, p);
609 
610             // save old values of F and H to use on next row
611             vec_st(F, 0,  p);
612             vec_st(H, 16, p);
613             p += 16; // move ahead 32 bytes
614 
615             // shift into place so we have complete F and H vectors
616             // that refer to the values one unit up from each cell
617             // that we are currently working on.
618             Fup    = vec_sld(Fup,F,14);
619             Hup2   = vec_sld(Hup2,H,14);
620 
621             // do the dynamic programming
622 
623             // update E value
624             E   = vec_subs(E,v_gapextend);
625             tmp = vec_subs(H,v_gapopen);
626             E   = vec_max(E,tmp);
627 
628             // update F value
629             F   = vec_subs(Fup,v_gapextend);
630             tmp = vec_subs(Hup2,v_gapopen);
631             F   = vec_max(F,tmp);
632 
633             // add score to H
634             H   = vec_adds(Hup1,v_score);
635             H   = vec_subs(H,v_bias);
636 
637             // set H to max of H,E,F
638             H   = vec_max(H,E);
639             H   = vec_max(H,F);
640 
641 
642             // Update highest score encountered this far
643             v_maxscore = vec_max(v_maxscore,H);
644 
645 
646 
647             // STEP 3
648 
649             // prefetch next residue
650             k          = *p_dbseq++;
651 
652             // Create the actual diagonal score vector
653             // and update the queue of incomplete score vectors
654 
655             v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
656             v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
657             v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
658             v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
659 
660             // prefetch score for next step
661             v_score_load = vec_ld(16*k,query_profile_word);
662 
663             // load values of F and H from previous row (one unit up)
664             Fup    = vec_ld(256, p);
665             Hup1   = vec_ld(272, p);
666 
667             // save old values of F and H to use on next row
668             vec_st(F, 0,  p);
669             vec_st(H, 16, p);
670             p += 16; // move ahead 32 bytes
671 
672             // shift into place so we have complete F and H vectors
673             // that refer to the values one unit up from each cell
674             // that we are currently working on.
675             Fup    = vec_sld(Fup,F,14);
676             Hup1   = vec_sld(Hup1,H,14);
677 
678             // do the dynamic programming
679 
680             // update E value
681             E   = vec_subs(E,v_gapextend);
682             tmp = vec_subs(H,v_gapopen);
683             E   = vec_max(E,tmp);
684 
685             // update F value
686             F   = vec_subs(Fup,v_gapextend);
687             tmp = vec_subs(Hup1,v_gapopen);
688             F   = vec_max(F,tmp);
689 
690             // add score to H
691             H   = vec_adds(Hup2,v_score);
692             H   = vec_subs(H,v_bias);
693 
694             // set H to max of H,E,F
695             H   = vec_max(H,E);
696             H   = vec_max(H,F);
697 
698 
699 
700             // Update highest score encountered this far
701             v_maxscore = vec_max(v_maxscore,H);
702 
703 
704 
705             // STEP 4
706 
707             // prefetch next residue
708             k          = *p_dbseq++;
709 
710             // Create the actual diagonal score vector
711             // and update the queue of incomplete score vectors
712 
713             v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
714             v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
715             v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
716             v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
717 
718             // prefetch score for next step
719             v_score_load = vec_ld(16*k,query_profile_word);
720 
721             // load values of F and H from previous row (one unit up)
722             Fup    = vec_ld(256, p);
723             Hup2   = vec_ld(272, p);
724 
725             // save old values of F and H to use on next row
726             vec_st(F, 0,  p);
727             vec_st(H, 16, p);
728             p += 16; // move ahead 32 bytes
729 
730             // shift into place so we have complete F and H vectors
731             // that refer to the values one unit up from each cell
732             // that we are currently working on.
733             Fup    = vec_sld(Fup,F,14);
734             Hup2   = vec_sld(Hup2,H,14);
735 
736             // do the dynamic programming
737 
738             // update E value
739             E   = vec_subs(E,v_gapextend);
740             tmp = vec_subs(H,v_gapopen);
741             E   = vec_max(E,tmp);
742 
743             // update F value
744             F   = vec_subs(Fup,v_gapextend);
745             tmp = vec_subs(Hup2,v_gapopen);
746             F   = vec_max(F,tmp);
747 
748             // add score to H
749             H   = vec_adds(Hup1,v_score);
750             H   = vec_subs(H,v_bias);
751 
752             // set H to max of H,E,F
753             H   = vec_max(H,E);
754             H   = vec_max(H,F);
755 
756 
757             // Update highest score encountered this far
758             v_maxscore = vec_max(v_maxscore,H);
759 
760 
761 
762             // STEP 5
763 
764             // prefetch next residue
765             k          = *p_dbseq++;
766 
767             // Create the actual diagonal score vector
768             // and update the queue of incomplete score vectors
769 
770             v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
771             v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
772             v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
773             v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
774 
775             // prefetch score for next step
776             v_score_load = vec_ld(16*k,query_profile_word);
777 
778             // load values of F and H from previous row (one unit up)
779             Fup    = vec_ld(256, p);
780             Hup1   = vec_ld(272, p);
781 
782             // save old values of F and H to use on next row
783             vec_st(F, 0,  p);
784             vec_st(H, 16, p);
785             p += 16; // move ahead 32 bytes
786 
787             // shift into place so we have complete F and H vectors
788             // that refer to the values one unit up from each cell
789             // that we are currently working on.
790             Fup    = vec_sld(Fup,F,14);
791             Hup1   = vec_sld(Hup1,H,14);
792 
793             // do the dynamic programming
794 
795             // update E value
796             E   = vec_subs(E,v_gapextend);
797             tmp = vec_subs(H,v_gapopen);
798             E   = vec_max(E,tmp);
799 
800             // update F value
801             F   = vec_subs(Fup,v_gapextend);
802             tmp = vec_subs(Hup1,v_gapopen);
803             F   = vec_max(F,tmp);
804 
805             // add score to H
806             H   = vec_adds(Hup2,v_score);
807             H   = vec_subs(H,v_bias);
808 
809             // set H to max of H,E,F
810             H   = vec_max(H,E);
811             H   = vec_max(H,F);
812 
813 
814             // Update highest score encountered this far
815             v_maxscore = vec_max(v_maxscore,H);
816 
817 
818 
819             // STEP 6
820 
821             // prefetch next residue
822             k          = *p_dbseq++;
823 
824             // Create the actual diagonal score vector
825             // and update the queue of incomplete score vectors
826 
827             v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
828             v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
829             v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
830             v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
831 
832             // prefetch score for next step
833             v_score_load = vec_ld(16*k,query_profile_word);
834 
835             // load values of F and H from previous row (one unit up)
836             Fup    = vec_ld(256, p);
837             Hup2   = vec_ld(272, p);
838 
839             // save old values of F and H to use on next row
840             vec_st(F, 0,  p);
841             vec_st(H, 16, p);
842             p += 16; // move ahead 32 bytes
843 
844             // shift into place so we have complete F and H vectors
845             // that refer to the values one unit up from each cell
846             // that we are currently working on.
847             Fup    = vec_sld(Fup,F,14);
848             Hup2   = vec_sld(Hup2,H,14);
849 
850             // do the dynamic programming
851 
852             // update E value
853             E   = vec_subs(E,v_gapextend);
854             tmp = vec_subs(H,v_gapopen);
855             E   = vec_max(E,tmp);
856 
857             // update F value
858             F   = vec_subs(Fup,v_gapextend);
859             tmp = vec_subs(Hup2,v_gapopen);
860             F   = vec_max(F,tmp);
861 
862             // add score to H
863             H   = vec_adds(Hup1,v_score);
864             H   = vec_subs(H,v_bias);
865 
866             // set H to max of H,E,F
867             H   = vec_max(H,E);
868             H   = vec_max(H,F);
869 
870 
871 
872             // Update highest score encountered this far
873             v_maxscore = vec_max(v_maxscore,H);
874 
875 
876 
877             // STEP 7
878 
879             // prefetch next residue
880             k          = *p_dbseq++;
881 
882             // Create the actual diagonal score vector
883             // and update the queue of incomplete score vectors
884 
885             v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
886             v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
887             v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
888             v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
889 
890             // prefetch score for next step
891             v_score_load = vec_ld(16*k,query_profile_word);
892 
893             // load values of F and H from previous row (one unit up)
894             Fup    = vec_ld(256, p);
895             Hup1   = vec_ld(272, p);
896 
897             // save old values of F and H to use on next row
898             vec_st(F, 0,  p);
899             vec_st(H, 16, p);
900             p += 16; // move ahead 32 bytes
901 
902             // shift into place so we have complete F and H vectors
903             // that refer to the values one unit up from each cell
904             // that we are currently working on.
905             Fup    = vec_sld(Fup,F,14);
906             Hup1   = vec_sld(Hup1,H,14);
907 
908             // do the dynamic programming
909 
910             // update E value
911             E   = vec_subs(E,v_gapextend);
912             tmp = vec_subs(H,v_gapopen);
913             E   = vec_max(E,tmp);
914 
915             // update F value
916             F   = vec_subs(Fup,v_gapextend);
917             tmp = vec_subs(Hup1,v_gapopen);
918             F   = vec_max(F,tmp);
919 
920             // add score to H
921             H   = vec_adds(Hup2,v_score);
922             H   = vec_subs(H,v_bias);
923 
924             // set H to max of H,E,F
925             H   = vec_max(H,E);
926             H   = vec_max(H,F);
927 
928 
929 
930             // Update highest score encountered this far
931             v_maxscore = vec_max(v_maxscore,H);
932 
933 
934 
935             // STEP 8
936 
937             // prefetch next residue
938             k          = *p_dbseq++;
939 
940             // Create the actual diagonal score vector
941             // and update the queue of incomplete score vectors
942 
943             v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
944             v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
945             v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
946             v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
947 
948             // prefetch score for next step
949             v_score_load = vec_ld(16*k,query_profile_word);
950 
951             // load values of F and H from previous row (one unit up)
952             Fup    = vec_ld(256, p);
953             Hup2   = vec_ld(272, p);
954 
955             // save old values of F and H to use on next row
956             vec_st(F, 0,  p);
957             vec_st(H, 16, p);
958             p += 16; // move ahead 32 bytes
959 
960             // shift into place so we have complete F and H vectors
961             // that refer to the values one unit up from each cell
962             // that we are currently working on.
963             Fup    = vec_sld(Fup,F,14);
964             Hup2   = vec_sld(Hup2,H,14);
965 
966             // do the dynamic programming
967 
968             // update E value
969             E   = vec_subs(E,v_gapextend);
970             tmp = vec_subs(H,v_gapopen);
971             E   = vec_max(E,tmp);
972 
973             // update F value
974             F   = vec_subs(Fup,v_gapextend);
975             tmp = vec_subs(Hup2,v_gapopen);
976             F   = vec_max(F,tmp);
977 
978             // add score to H
979             H   = vec_adds(Hup1,v_score);
980             H   = vec_subs(H,v_bias);
981 
982             // set H to max of H,E,F
983             H   = vec_max(H,E);
984             H   = vec_max(H,F);
985 
986 
987             // Update highest score encountered this far
988             v_maxscore = vec_max(v_maxscore,H);
989         }
990 
991         v_score_load = vec_splat_u16(0);
992 
993         for(;j<db_length+7;j++)
994         {
995             // Create the actual diagonal score vector
996             // and update the queue of incomplete score vectors
997             //
998             // This could of course be done with only vec_perm or vec_sel,
999             // but since they use different execution units we have found
1000             // it to be slightly faster to mix them.
1001             v_score    = vec_perm(v_score_q1, v_score_load, queue1_to_score);
1002             v_score_q1 = vec_perm(v_score_q2, v_score_load, queue2_to_queue1);
1003             v_score_q2 = vec_perm(v_score_q3, v_score_load, queue3_to_queue2);
1004             v_score_q3 = vec_perm(v_score_q3, v_score_load, queue3_with_load);
1005 
1006             // save old values of F and H to use on next row
1007             vec_st(F, 0,  p);
1008             vec_st(H, 16, p);
1009             p += 16; // move ahead 32 bytes
1010 
1011             // v_score_load contains all zeros
1012             Fup    = vec_sld(v_score_load,F,14);
1013             Hup1   = vec_sld(v_score_load,H,14);
1014 
1015             // do the dynamic programming
1016 
1017             // update E value
1018             E   = vec_subs(E,v_gapextend);
1019             tmp = vec_subs(H,v_gapopen);
1020             E   = vec_max(E,tmp);
1021 
1022             // update F value
1023             F   = vec_subs(Fup,v_gapextend);
1024             tmp = vec_subs(Hup1,v_gapopen);
1025             F   = vec_max(F,tmp);
1026 
1027             // add score to H
1028             H   = vec_adds(Hup2,v_score);
1029             H   = vec_subs(H,v_bias);
1030 
1031             // set H to max of H,E,F
1032             H   = vec_max(H,E);
1033             H   = vec_max(H,F);
1034 
1035             // Save value to use for next diagonal H
1036             Hup2 = Hup1;
1037 
1038             // Update highest score encountered this far
1039             v_maxscore = vec_max(v_maxscore,H);
1040         }
1041         vec_st(F, 0,  p);
1042         vec_st(H, 16, p);
1043 
1044         query_profile_word += 8*alphabet_size;
1045     }
1046 
1047     // find largest score in the v_maxscore vector
1048     tmp = vec_sld(v_maxscore,v_maxscore,8);
1049     v_maxscore = vec_max(v_maxscore,tmp);
1050     tmp = vec_sld(v_maxscore,v_maxscore,4);
1051     v_maxscore = vec_max(v_maxscore,tmp);
1052     tmp = vec_sld(v_maxscore,v_maxscore,2);
1053     v_maxscore = vec_max(v_maxscore,tmp);
1054 
1055     // store in temporary variable
1056     vec_ste(v_maxscore,0,&score);
1057 
1058     // return largest score
1059     return score;
1060 }
1061 
1062 int
1063 smith_waterman_altivec_byte(unsigned char *     query_sequence,
1064                             unsigned char *     query_profile_byte,
1065                             int                 query_length,
1066                             unsigned char *     db_sequence,
1067                             int                 db_length,
1068                             unsigned char       bias,
1069                             unsigned char       gap_open,
1070                             unsigned char       gap_extend,
1071                             struct f_struct *   f_str)
1072 {
1073     int                     i,j,k,k8;
1074     int                     overflow;
1075     unsigned char *         p;
1076     unsigned char           score;
1077     int                     alphabet_size = f_str->alphabet_size;
1078     unsigned char *         workspace     = (unsigned char *)f_str->workspace;
1079 
1080     vector unsigned char    Fup,Hup1,Hup2,E,F,H,tmp;
1081     vector unsigned char    perm;
1082     vector unsigned char    v_maxscore;
1083     vector unsigned char    v_bias,v_gapopen,v_gapextend;
1084     vector unsigned char    v_score;
1085     vector unsigned char    v_score_q1;
1086     vector unsigned char    v_score_q2;
1087     vector unsigned char    v_score_q3;
1088     vector unsigned char    v_score_q4;
1089     vector unsigned char    v_score_q5;
1090     vector unsigned char    v_score_load1;
1091     vector unsigned char    v_score_load2;
1092     vector unsigned char    v_zero;
1093 
1094     vector unsigned char    queue1_to_score  = (vector unsigned char)(16,1,2,3,4,5,6,7,24,9,10,11,12,13,14,15);
1095     vector unsigned char    queue2_to_queue1 = (vector unsigned char)(16,17,2,3,4,5,6,7,24,25,10,11,12,13,14,15);
1096     vector unsigned char    queue3_to_queue2 = (vector unsigned char)(16,17,18,3,4,5,6,7,24,25,26,11,12,13,14,15);
1097     vector unsigned char    queue4_to_queue3 = (vector unsigned char)(16,17,18,19,4,5,6,7,24,25,26,27,12,13,14,15);
1098     vector unsigned char    queue5_to_queue4 = (vector unsigned char)(16,17,18,19,20,2,3,4,24,25,26,27,28,10,11,12);
1099     vector unsigned char    queue5_with_load = (vector unsigned char)(19,20,21,5,6,22,7,23,27,28,29,13,14,30,15,31);
1100     vector unsigned char    merge_score_load = (vector unsigned char)(0,1,2,3,4,5,6,7,24,25,26,27,28,29,30,31);
1101 
1102     v_zero           = vec_splat_u8(0);
1103 
1104     /* Load the bias to all elements of a constant */
1105     v_bias           = vec_lde(0,&bias);
1106     perm             = vec_lvsl(0,&bias);
1107     v_bias           = vec_perm(v_bias,v_bias,perm);
1108     v_bias           = vec_splat(v_bias,0);
1109 
1110     /* Load gap opening penalty to all elements of a constant */
1111     v_gapopen        = vec_lde(0,&gap_open);
1112     perm             = vec_lvsl(0,&gap_open);
1113     v_gapopen        = vec_perm(v_gapopen,v_gapopen,perm);
1114     v_gapopen        = vec_splat(v_gapopen,0);
1115 
1116     /* Load gap extension penalty to all elements of a constant */
1117     v_gapextend      = vec_lde(0,&gap_extend);
1118     perm             = vec_lvsl(0,&gap_extend);
1119     v_gapextend      = vec_perm(v_gapextend,v_gapextend,perm);
1120     v_gapextend      = vec_splat(v_gapextend,0);
1121 
1122     v_maxscore = vec_xor(v_maxscore,v_maxscore);
1123 
1124     // Zero out the storage vector
1125     k = (db_length+15);
1126     for(i=0,j=0;i<k;i++,j+=32)
1127     {
1128         // borrow the zero value in v_maxscore to have something to store
1129         vec_st(v_maxscore,j,workspace);
1130         vec_st(v_maxscore,j+16,workspace);
1131     }
1132 
1133     for(i=0;i<query_length;i+=16)
1134     {
1135         // zero lots of stuff.
1136         // We use both the VPERM and VSIU unit to knock off some cycles.
1137 
1138         E          = vec_splat_u8(0);
1139         F          = vec_xor(F,F);
1140         H          = vec_splat_u8(0);
1141         Hup2      = vec_xor(Hup2,Hup2);
1142         v_score_q1 = vec_splat_u8(0);
1143         v_score_q2 = vec_xor(v_score_q2,v_score_q2);
1144         v_score_q3 = vec_splat_u8(0);
1145         v_score_q4 = vec_xor(v_score_q4,v_score_q4);
1146         v_score_q5 = vec_splat_u8(0);
1147 
1148         // reset pointers to the start of the saved data from the last row
1149         p = workspace;
1150 
1151         // start directly and prefetch score column
1152         k             = db_sequence[0];
1153         k8            = k;
1154         v_score_load1 = vec_ld(16*k,query_profile_byte);
1155         v_score_load2 = v_score_load1;
1156         v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
1157 
1158         // PROLOGUE 1
1159         // prefetch next residue
1160         k                = db_sequence[1];
1161 
1162         v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
1163         v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
1164         v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
1165         v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
1166         v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
1167         v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
1168 
1169         // prefetch score for next step
1170         v_score_load1 = vec_ld(16*k,query_profile_byte);
1171 
1172         // load values of F and H from previous row (one unit up)
1173         Fup    = vec_ld(0,  p);
1174         Hup1   = vec_ld(16, p);
1175         p += 32; // move ahead 32 bytes
1176 
1177         // shift into place so we have complete F and H vectors
1178         // that refer to the values one unit up from each cell
1179         // that we are currently working on.
1180         Fup    = vec_sld(Fup,F,15);
1181         Hup1    = vec_sld(Hup1,H,15);
1182 
1183         // do the dynamic programming
1184 
1185         // update E value
1186         E   = vec_subs(E,v_gapextend);
1187         tmp = vec_subs(H,v_gapopen);
1188         E   = vec_max(E,tmp);
1189 
1190         // update F value
1191         F   = vec_subs(Fup,v_gapextend);
1192         tmp = vec_subs(Hup1,v_gapopen);
1193         F   = vec_max(F,tmp);
1194 
1195         v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
1196 
1197         // add score to H
1198         H   = vec_adds(Hup2,v_score);
1199         H   = vec_subs(H,v_bias);
1200 
1201         // set H to max of H,E,F
1202         H   = vec_max(H,E);
1203         H   = vec_max(H,F);
1204 
1205         // Update highest score encountered this far
1206         v_maxscore = vec_max(v_maxscore,H);
1207 
1208 
1209 
1210 
1211         // PROLOGUE 2
1212         // prefetch next residue
1213         k                = db_sequence[2];
1214 
1215         v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
1216         v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
1217         v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
1218         v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
1219         v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
1220         v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
1221 
1222 
1223         // prefetch score for next step
1224         v_score_load1 = vec_ld(16*k,query_profile_byte);
1225 
1226         // load values of F and H from previous row (one unit up)
1227         Fup    = vec_ld(0,  p);
1228         Hup2   = vec_ld(16, p);
1229         p += 32; // move ahead 32 bytes
1230 
1231         // shift into place so we have complete F and H vectors
1232         // that refer to the values one unit up from each cell
1233         // that we are currently working on.
1234         Fup    = vec_sld(Fup,F,15);
1235         Hup2   = vec_sld(Hup2,H,15);
1236 
1237         // do the dynamic programming
1238 
1239         // update E value
1240         E   = vec_subs(E,v_gapextend);
1241         tmp = vec_subs(H,v_gapopen);
1242         E   = vec_max(E,tmp);
1243 
1244         // update F value
1245         F   = vec_subs(Fup,v_gapextend);
1246         tmp = vec_subs(Hup2,v_gapopen);
1247         F   = vec_max(F,tmp);
1248 
1249         v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
1250 
1251         // add score to H
1252         H   = vec_adds(Hup1,v_score);
1253         H   = vec_subs(H,v_bias);
1254 
1255         // set H to max of H,E,F
1256         H   = vec_max(H,E);
1257         H   = vec_max(H,F);
1258 
1259         // Update highest score encountered this far
1260         v_maxscore = vec_max(v_maxscore,H);
1261 
1262 
1263         // PROLOGUE 3
1264         // prefetch next residue
1265         k                = db_sequence[3];
1266 
1267         v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
1268         v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
1269         v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
1270         v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
1271         v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
1272         v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
1273 
1274 
1275         // prefetch score for next step
1276         v_score_load1 = vec_ld(16*k,query_profile_byte);
1277 
1278         // load values of F and H from previous row (one unit up)
1279         Fup    = vec_ld(0,  p);
1280         Hup1   = vec_ld(16, p);
1281         p += 32; // move ahead 32 bytes
1282 
1283         // shift into place so we have complete F and H vectors
1284         // that refer to the values one unit up from each cell
1285         // that we are currently working on.
1286         Fup    = vec_sld(Fup,F,15);
1287         Hup1    = vec_sld(Hup1,H,15);
1288 
1289         // do the dynamic programming
1290 
1291         // update E value
1292         E   = vec_subs(E,v_gapextend);
1293         tmp = vec_subs(H,v_gapopen);
1294         E   = vec_max(E,tmp);
1295 
1296         // update F value
1297         F   = vec_subs(Fup,v_gapextend);
1298         tmp = vec_subs(Hup1,v_gapopen);
1299         F   = vec_max(F,tmp);
1300 
1301         v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
1302 
1303         // add score to H
1304         H   = vec_adds(Hup2,v_score);
1305         H   = vec_subs(H,v_bias);
1306 
1307         // set H to max of H,E,F
1308         H   = vec_max(H,E);
1309         H   = vec_max(H,F);
1310 
1311         // Update highest score encountered this far
1312         v_maxscore = vec_max(v_maxscore,H);
1313 
1314 
1315         // PROLOGUE 4
1316         // prefetch next residue
1317         k                = db_sequence[4];
1318 
1319         v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
1320         v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
1321         v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
1322         v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
1323         v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
1324         v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
1325 
1326 
1327         // prefetch score for next step
1328         v_score_load1 = vec_ld(16*k,query_profile_byte);
1329 
1330         // load values of F and H from previous row (one unit up)
1331         Fup    = vec_ld(0,  p);
1332         Hup2   = vec_ld(16, p);
1333         p += 32; // move ahead 32 bytes
1334 
1335         // shift into place so we have complete F and H vectors
1336         // that refer to the values one unit up from each cell
1337         // that we are currently working on.
1338         Fup    = vec_sld(Fup,F,15);
1339         Hup2   = vec_sld(Hup2,H,15);
1340 
1341         // do the dynamic programming
1342 
1343         // update E value
1344         E   = vec_subs(E,v_gapextend);
1345         tmp = vec_subs(H,v_gapopen);
1346         E   = vec_max(E,tmp);
1347 
1348         // update F value
1349         F   = vec_subs(Fup,v_gapextend);
1350         tmp = vec_subs(Hup2,v_gapopen);
1351         F   = vec_max(F,tmp);
1352 
1353         v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
1354 
1355         // add score to H
1356         H   = vec_adds(Hup1,v_score);
1357         H   = vec_subs(H,v_bias);
1358 
1359         // set H to max of H,E,F
1360         H   = vec_max(H,E);
1361         H   = vec_max(H,F);
1362 
1363         // Update highest score encountered this far
1364         v_maxscore = vec_max(v_maxscore,H);
1365 
1366 
1367         // PROLOGUE 5
1368         // prefetch next residue
1369         k                = db_sequence[5];
1370 
1371         v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
1372         v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
1373         v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
1374         v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
1375         v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
1376         v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
1377 
1378 
1379         // prefetch score for next step
1380         v_score_load1 = vec_ld(16*k,query_profile_byte);
1381 
1382         // load values of F and H from previous row (one unit up)
1383         Fup    = vec_ld(0,  p);
1384         Hup1   = vec_ld(16, p);
1385         p += 32; // move ahead 32 bytes
1386 
1387         // shift into place so we have complete F and H vectors
1388         // that refer to the values one unit up from each cell
1389         // that we are currently working on.
1390         Fup    = vec_sld(Fup,F,15);
1391         Hup1    = vec_sld(Hup1,H,15);
1392 
1393         // do the dynamic programming
1394 
1395         // update E value
1396         E   = vec_subs(E,v_gapextend);
1397         tmp = vec_subs(H,v_gapopen);
1398         E   = vec_max(E,tmp);
1399 
1400         // update F value
1401         F   = vec_subs(Fup,v_gapextend);
1402         tmp = vec_subs(Hup1,v_gapopen);
1403         F   = vec_max(F,tmp);
1404 
1405         v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
1406 
1407         // add score to H
1408         H   = vec_adds(Hup2,v_score);
1409         H   = vec_subs(H,v_bias);
1410 
1411         // set H to max of H,E,F
1412         H   = vec_max(H,E);
1413         H   = vec_max(H,F);
1414 
1415         // Update highest score encountered this far
1416         v_maxscore = vec_max(v_maxscore,H);
1417 
1418 
1419         // PROLOGUE 6
1420         // prefetch next residue
1421         k                = db_sequence[6];
1422 
1423         v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
1424         v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
1425         v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
1426         v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
1427         v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
1428         v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
1429 
1430 
1431         // prefetch score for next step
1432         v_score_load1 = vec_ld(16*k,query_profile_byte);
1433 
1434         // load values of F and H from previous row (one unit up)
1435         Fup    = vec_ld(0,  p);
1436         Hup2   = vec_ld(16, p);
1437         p += 32; // move ahead 32 bytes
1438 
1439         // shift into place so we have complete F and H vectors
1440         // that refer to the values one unit up from each cell
1441         // that we are currently working on.
1442         Fup    = vec_sld(Fup,F,15);
1443         Hup2   = vec_sld(Hup2,H,15);
1444 
1445         // do the dynamic programming
1446 
1447         // update E value
1448         E   = vec_subs(E,v_gapextend);
1449         tmp = vec_subs(H,v_gapopen);
1450         E   = vec_max(E,tmp);
1451 
1452         // update F value
1453         F   = vec_subs(Fup,v_gapextend);
1454         tmp = vec_subs(Hup2,v_gapopen);
1455         F   = vec_max(F,tmp);
1456 
1457         v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
1458 
1459         // add score to H
1460         H   = vec_adds(Hup1,v_score);
1461         H   = vec_subs(H,v_bias);
1462 
1463         // set H to max of H,E,F
1464         H   = vec_max(H,E);
1465         H   = vec_max(H,F);
1466 
1467         // Update highest score encountered this far
1468         v_maxscore = vec_max(v_maxscore,H);
1469 
1470 
1471 
1472         // PROLOGUE 7
1473         // prefetch next residue
1474         k                = db_sequence[7];
1475 
1476         v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
1477         v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
1478         v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
1479         v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
1480         v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
1481         v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
1482 
1483 
1484         // prefetch score for next step
1485         v_score_load1 = vec_ld(16*k,query_profile_byte);
1486 
1487         // load values of F and H from previous row (one unit up)
1488         Fup    = vec_ld(0,  p);
1489         Hup1   = vec_ld(16, p);
1490         p += 32; // move ahead 32 bytes
1491 
1492         // shift into place so we have complete F and H vectors
1493         // that refer to the values one unit up from each cell
1494         // that we are currently working on.
1495         Fup    = vec_sld(Fup,F,15);
1496         Hup1    = vec_sld(Hup1,H,15);
1497 
1498         // do the dynamic programming
1499 
1500         // update E value
1501         E   = vec_subs(E,v_gapextend);
1502         tmp = vec_subs(H,v_gapopen);
1503         E   = vec_max(E,tmp);
1504 
1505         // update F value
1506         F   = vec_subs(Fup,v_gapextend);
1507         tmp = vec_subs(Hup1,v_gapopen);
1508         F   = vec_max(F,tmp);
1509 
1510         v_score_load1 = vec_perm(v_score_load1,v_zero,merge_score_load);
1511 
1512         // add score to H
1513         H   = vec_adds(Hup2,v_score);
1514         H   = vec_subs(H,v_bias);
1515 
1516         // set H to max of H,E,F
1517         H   = vec_max(H,E);
1518         H   = vec_max(H,F);
1519 
1520         // Update highest score encountered this far
1521         v_maxscore = vec_max(v_maxscore,H);
1522 
1523 
1524 
1525         // PROLOGUE 8
1526         // prefetch next residue
1527         k                = db_sequence[8];
1528 
1529         v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
1530         v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
1531         v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
1532         v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
1533         v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
1534         v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
1535 
1536 
1537         // prefetch score for next step
1538         v_score_load1 = vec_ld(16*k,query_profile_byte);
1539 
1540         // load values of F and H from previous row (one unit up)
1541         Fup    = vec_ld(0,  p);
1542         Hup2   = vec_ld(16, p);
1543         p += 32; // move ahead 32 bytes
1544 
1545         // shift into place so we have complete F and H vectors
1546         // that refer to the values one unit up from each cell
1547         // that we are currently working on.
1548         Fup    = vec_sld(Fup,F,15);
1549         Hup2   = vec_sld(Hup2,H,15);
1550 
1551         // do the dynamic programming
1552 
1553         // update E value
1554         E   = vec_subs(E,v_gapextend);
1555         tmp = vec_subs(H,v_gapopen);
1556         E   = vec_max(E,tmp);
1557 
1558         // update F value
1559         F   = vec_subs(Fup,v_gapextend);
1560         tmp = vec_subs(Hup2,v_gapopen);
1561         F   = vec_max(F,tmp);
1562 
1563         v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
1564 
1565         // add score to H
1566         H   = vec_adds(Hup1,v_score);
1567         H   = vec_subs(H,v_bias);
1568 
1569         // set H to max of H,E,F
1570         H   = vec_max(H,E);
1571         H   = vec_max(H,F);
1572 
1573         // Update highest score encountered this far
1574         v_maxscore = vec_max(v_maxscore,H);
1575 
1576 
1577 
1578 
1579         // PROLOGUE 9
1580         // prefetch next residue
1581         k                = db_sequence[9];
1582         k8               = db_sequence[1];
1583 
1584         v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
1585         v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
1586         v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
1587         v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
1588         v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
1589         v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
1590 
1591 
1592         // prefetch score for next step
1593         v_score_load1 = vec_ld(16*k,query_profile_byte);
1594         v_score_load2 = vec_ld(16*k8,query_profile_byte);
1595 
1596         // load values of F and H from previous row (one unit up)
1597         Fup    = vec_ld(0,  p);
1598         Hup1    = vec_ld(16, p);
1599         p += 32; // move ahead 32 bytes
1600 
1601         // shift into place so we have complete F and H vectors
1602         // that refer to the values one unit up from each cell
1603         // that we are currently working on.
1604         Fup    = vec_sld(Fup,F,15);
1605         Hup1    = vec_sld(Hup1,H,15);
1606 
1607         // do the dynamic programming
1608 
1609         // update E value
1610         E   = vec_subs(E,v_gapextend);
1611         tmp = vec_subs(H,v_gapopen);
1612         E   = vec_max(E,tmp);
1613 
1614         // update F value
1615         F   = vec_subs(Fup,v_gapextend);
1616         tmp = vec_subs(Hup1,v_gapopen);
1617         F   = vec_max(F,tmp);
1618 
1619         v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
1620 
1621         // add score to H
1622         H   = vec_adds(Hup2,v_score);
1623         H   = vec_subs(H,v_bias);
1624 
1625         // set H to max of H,E,F
1626         H   = vec_max(H,E);
1627         H   = vec_max(H,F);
1628 
1629         // Update highest score encountered this far
1630         v_maxscore = vec_max(v_maxscore,H);
1631 
1632 
1633 
1634         // PROLOGUE 10
1635         // prefetch next residue
1636         k                = db_sequence[10];
1637         k8               = db_sequence[2];
1638 
1639         v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
1640         v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
1641         v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
1642         v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
1643         v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
1644         v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
1645 
1646 
1647         // prefetch score for next step
1648         v_score_load1 = vec_ld(16*k,query_profile_byte);
1649         v_score_load2 = vec_ld(16*k8,query_profile_byte);
1650 
1651         // load values of F and H from previous row (one unit up)
1652         Fup    = vec_ld(0,  p);
1653         Hup2   = vec_ld(16, p);
1654         p += 32; // move ahead 32 bytes
1655 
1656         // shift into place so we have complete F and H vectors
1657         // that refer to the values one unit up from each cell
1658         // that we are currently working on.
1659         Fup    = vec_sld(Fup,F,15);
1660         Hup2   = vec_sld(Hup2,H,15);
1661 
1662         // do the dynamic programming
1663 
1664         // update E value
1665         E   = vec_subs(E,v_gapextend);
1666         tmp = vec_subs(H,v_gapopen);
1667         E   = vec_max(E,tmp);
1668 
1669         // update F value
1670         F   = vec_subs(Fup,v_gapextend);
1671         tmp = vec_subs(Hup2,v_gapopen);
1672         F   = vec_max(F,tmp);
1673 
1674         v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
1675 
1676         // add score to H
1677         H   = vec_adds(Hup1,v_score);
1678         H   = vec_subs(H,v_bias);
1679 
1680         // set H to max of H,E,F
1681         H   = vec_max(H,E);
1682         H   = vec_max(H,F);
1683 
1684         // Update highest score encountered this far
1685         v_maxscore = vec_max(v_maxscore,H);
1686 
1687 
1688 
1689 
1690         // PROLOGUE 11
1691         // prefetch next residue
1692         k                = db_sequence[11];
1693         k8               = db_sequence[3];
1694 
1695         v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
1696         v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
1697         v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
1698         v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
1699         v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
1700         v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
1701 
1702 
1703         // prefetch score for next step
1704         v_score_load1 = vec_ld(16*k,query_profile_byte);
1705         v_score_load2 = vec_ld(16*k8,query_profile_byte);
1706 
1707         // load values of F and H from previous row (one unit up)
1708         Fup    = vec_ld(0,  p);
1709         Hup1    = vec_ld(16, p);
1710         p += 32; // move ahead 32 bytes
1711 
1712         // shift into place so we have complete F and H vectors
1713         // that refer to the values one unit up from each cell
1714         // that we are currently working on.
1715         Fup    = vec_sld(Fup,F,15);
1716         Hup1    = vec_sld(Hup1,H,15);
1717 
1718         // do the dynamic programming
1719 
1720         // update E value
1721         E   = vec_subs(E,v_gapextend);
1722         tmp = vec_subs(H,v_gapopen);
1723         E   = vec_max(E,tmp);
1724 
1725         // update F value
1726         F   = vec_subs(Fup,v_gapextend);
1727         tmp = vec_subs(Hup1,v_gapopen);
1728         F   = vec_max(F,tmp);
1729 
1730         v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
1731 
1732         // add score to H
1733         H   = vec_adds(Hup2,v_score);
1734         H   = vec_subs(H,v_bias);
1735 
1736         // set H to max of H,E,F
1737         H   = vec_max(H,E);
1738         H   = vec_max(H,F);
1739 
1740         // Update highest score encountered this far
1741         v_maxscore = vec_max(v_maxscore,H);
1742 
1743 
1744 
1745         // PROLOGUE 12
1746         // prefetch next residue
1747         k                = db_sequence[12];
1748         k8               = db_sequence[4];
1749 
1750         v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
1751         v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
1752         v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
1753         v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
1754         v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
1755         v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
1756 
1757 
1758         // prefetch score for next step
1759         v_score_load1 = vec_ld(16*k,query_profile_byte);
1760         v_score_load2 = vec_ld(16*k8,query_profile_byte);
1761 
1762         // load values of F and H from previous row (one unit up)
1763         Fup    = vec_ld(0,  p);
1764         Hup2   = vec_ld(16, p);
1765         p += 32; // move ahead 32 bytes
1766 
1767         // shift into place so we have complete F and H vectors
1768         // that refer to the values one unit up from each cell
1769         // that we are currently working on.
1770         Fup    = vec_sld(Fup,F,15);
1771         Hup2   = vec_sld(Hup2,H,15);
1772 
1773         // do the dynamic programming
1774 
1775         // update E value
1776         E   = vec_subs(E,v_gapextend);
1777         tmp = vec_subs(H,v_gapopen);
1778         E   = vec_max(E,tmp);
1779 
1780         // update F value
1781         F   = vec_subs(Fup,v_gapextend);
1782         tmp = vec_subs(Hup2,v_gapopen);
1783         F   = vec_max(F,tmp);
1784 
1785         v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
1786 
1787         // add score to H
1788         H   = vec_adds(Hup1,v_score);
1789         H   = vec_subs(H,v_bias);
1790 
1791         // set H to max of H,E,F
1792         H   = vec_max(H,E);
1793         H   = vec_max(H,F);
1794 
1795         // Update highest score encountered this far
1796         v_maxscore = vec_max(v_maxscore,H);
1797 
1798 
1799 
1800 
1801         // PROLOGUE 13
1802         // prefetch next residue
1803         k                = db_sequence[13];
1804         k8               = db_sequence[5];
1805 
1806         v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
1807         v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
1808         v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
1809         v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
1810         v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
1811         v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
1812 
1813 
1814         // prefetch score for next step
1815         v_score_load1 = vec_ld(16*k,query_profile_byte);
1816         v_score_load2 = vec_ld(16*k8,query_profile_byte);
1817 
1818         // load values of F and H from previous row (one unit up)
1819         Fup    = vec_ld(0,  p);
1820         Hup1    = vec_ld(16, p);
1821         p += 32; // move ahead 32 bytes
1822 
1823         // shift into place so we have complete F and H vectors
1824         // that refer to the values one unit up from each cell
1825         // that we are currently working on.
1826         Fup    = vec_sld(Fup,F,15);
1827         Hup1    = vec_sld(Hup1,H,15);
1828 
1829         // do the dynamic programming
1830 
1831         // update E value
1832         E   = vec_subs(E,v_gapextend);
1833         tmp = vec_subs(H,v_gapopen);
1834         E   = vec_max(E,tmp);
1835 
1836         // update F value
1837         F   = vec_subs(Fup,v_gapextend);
1838         tmp = vec_subs(Hup1,v_gapopen);
1839         F   = vec_max(F,tmp);
1840 
1841         v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
1842 
1843         // add score to H
1844         H   = vec_adds(Hup2,v_score);
1845         H   = vec_subs(H,v_bias);
1846 
1847         // set H to max of H,E,F
1848         H   = vec_max(H,E);
1849         H   = vec_max(H,F);
1850 
1851         // Update highest score encountered this far
1852         v_maxscore = vec_max(v_maxscore,H);
1853 
1854 
1855 
1856         // PROLOGUE 14
1857         // prefetch next residue
1858         k                = db_sequence[14];
1859         k8               = db_sequence[6];
1860 
1861         v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
1862         v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
1863         v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
1864         v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
1865         v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
1866         v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
1867 
1868 
1869         // prefetch score for next step
1870         v_score_load1 = vec_ld(16*k,query_profile_byte);
1871         v_score_load2 = vec_ld(16*k8,query_profile_byte);
1872 
1873         // load values of F and H from previous row (one unit up)
1874         Fup    = vec_ld(0,  p);
1875         Hup2   = vec_ld(16, p);
1876         p += 32; // move ahead 32 bytes
1877 
1878         // shift into place so we have complete F and H vectors
1879         // that refer to the values one unit up from each cell
1880         // that we are currently working on.
1881         Fup    = vec_sld(Fup,F,15);
1882         Hup2   = vec_sld(Hup2,H,15);
1883 
1884         // do the dynamic programming
1885 
1886         // update E value
1887         E   = vec_subs(E,v_gapextend);
1888         tmp = vec_subs(H,v_gapopen);
1889         E   = vec_max(E,tmp);
1890 
1891         // update F value
1892         F   = vec_subs(Fup,v_gapextend);
1893         tmp = vec_subs(Hup2,v_gapopen);
1894         F   = vec_max(F,tmp);
1895 
1896         v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
1897 
1898         // add score to H
1899         H   = vec_adds(Hup1,v_score);
1900         H   = vec_subs(H,v_bias);
1901 
1902         // set H to max of H,E,F
1903         H   = vec_max(H,E);
1904         H   = vec_max(H,F);
1905 
1906         // Update highest score encountered this far
1907         v_maxscore = vec_max(v_maxscore,H);
1908 
1909 
1910 
1911         // PROLOGUE 15
1912         // prefetch next residue
1913         k                = db_sequence[15];
1914         k8               = db_sequence[7];
1915 
1916         v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
1917         v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
1918         v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
1919         v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
1920         v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
1921         v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
1922 
1923 
1924         // prefetch score for next step
1925         v_score_load1 = vec_ld(16*k,query_profile_byte);
1926         v_score_load2 = vec_ld(16*k8,query_profile_byte);
1927 
1928         // load values of F and H from previous row (one unit up)
1929         Fup    = vec_ld(0,  p);
1930         Hup1    = vec_ld(16, p);
1931         p += 32; // move ahead 32 bytes
1932 
1933         // shift into place so we have complete F and H vectors
1934         // that refer to the values one unit up from each cell
1935         // that we are currently working on.
1936         Fup    = vec_sld(Fup,F,15);
1937         Hup1    = vec_sld(Hup1,H,15);
1938 
1939         // do the dynamic programming
1940 
1941         // update E value
1942         E   = vec_subs(E,v_gapextend);
1943         tmp = vec_subs(H,v_gapopen);
1944         E   = vec_max(E,tmp);
1945 
1946         // update F value
1947         F   = vec_subs(Fup,v_gapextend);
1948         tmp = vec_subs(Hup1,v_gapopen);
1949         F   = vec_max(F,tmp);
1950 
1951         v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
1952 
1953         // add score to H
1954         H   = vec_adds(Hup2,v_score);
1955         H   = vec_subs(H,v_bias);
1956 
1957         // set H to max of H,E,F
1958         H   = vec_max(H,E);
1959         H   = vec_max(H,F);
1960 
1961         // Update highest score encountered this far
1962         v_maxscore = vec_max(v_maxscore,H);
1963 
1964 
1965 
1966         // PROLOGUE 16
1967         // prefetch next residue
1968         k                = db_sequence[16];
1969         k8               = db_sequence[8];
1970 
1971         v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
1972         v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
1973         v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
1974         v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
1975         v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
1976         v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
1977 
1978 
1979         // prefetch score for next step
1980         v_score_load1 = vec_ld(16*k,query_profile_byte);
1981         v_score_load2 = vec_ld(16*k8,query_profile_byte);
1982 
1983         // load values of F and H from previous row (one unit up)
1984         Fup    = vec_ld(0,  p);
1985         Hup2   = vec_ld(16, p);
1986         p += 32; // move ahead 32 bytes
1987 
1988         // shift into place so we have complete F and H vectors
1989         // that refer to the values one unit up from each cell
1990         // that we are currently working on.
1991         Fup    = vec_sld(Fup,F,15);
1992         Hup2   = vec_sld(Hup2,H,15);
1993 
1994         // do the dynamic programming
1995 
1996         // update E value
1997         E   = vec_subs(E,v_gapextend);
1998         tmp = vec_subs(H,v_gapopen);
1999         E   = vec_max(E,tmp);
2000 
2001         // update F value
2002         F   = vec_subs(Fup,v_gapextend);
2003         tmp = vec_subs(Hup2,v_gapopen);
2004         F   = vec_max(F,tmp);
2005 
2006         v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2007 
2008         // add score to H
2009         H   = vec_adds(Hup1,v_score);
2010         H   = vec_subs(H,v_bias);
2011 
2012         // set H to max of H,E,F
2013         H   = vec_max(H,E);
2014         H   = vec_max(H,F);
2015 
2016         // Update highest score encountered this far
2017         v_maxscore = vec_max(v_maxscore,H);
2018 
2019         p = workspace;
2020 
2021         for(j=16;j<db_length;j+=16)
2022         {
2023             // STEP 1
2024 
2025             // prefetch next residue
2026             k                = db_sequence[j+1];
2027             k8               = db_sequence[j-7];
2028 
2029             v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
2030             v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
2031             v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
2032             v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
2033             v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
2034             v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
2035 
2036             // prefetch scores for next step
2037             v_score_load1 = vec_ld(16*k,query_profile_byte);
2038             v_score_load2 = vec_ld(16*k8,query_profile_byte);
2039 
2040             // load values of F and H from previous row (one unit up)
2041             Fup    = vec_ld(512, p);
2042             Hup1   = vec_ld(528, p);
2043 
2044             // save old values of F and H to use on next row
2045             vec_st(F, 0,  p);
2046             vec_st(H, 16, p);
2047             p += 32;
2048 
2049             // shift into place so we have complete F and H vectors
2050             // that refer to the values one unit up from each cell
2051             // that we are currently working on.
2052             Fup    = vec_sld(Fup,F,15);
2053             Hup1    = vec_sld(Hup1,H,15);
2054 
2055             // do the dynamic programming
2056 
2057             // update E value
2058             E   = vec_subs(E,v_gapextend);
2059             tmp = vec_subs(H,v_gapopen);
2060             E   = vec_max(E,tmp);
2061 
2062             // update F value
2063             F   = vec_subs(Fup,v_gapextend);
2064             tmp = vec_subs(Hup1,v_gapopen);
2065             F   = vec_max(F,tmp);
2066 
2067             v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2068 
2069             // add score to H
2070             H   = vec_adds(Hup2,v_score);
2071             H   = vec_subs(H,v_bias);
2072 
2073             // set H to max of H,E,F
2074             H   = vec_max(H,E);
2075             H   = vec_max(H,F);
2076 
2077 
2078 
2079             // Update highest score encountered this far
2080             v_maxscore = vec_max(v_maxscore,H);
2081 
2082 
2083 
2084 
2085 
2086             // STEP 2
2087 
2088             // prefetch next residue
2089             k                = db_sequence[j+2];
2090             k8               = db_sequence[j-6];
2091 
2092             v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
2093             v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
2094             v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
2095             v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
2096             v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
2097             v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
2098 
2099 
2100             // prefetch scores for next step
2101             v_score_load1 = vec_ld(16*k,query_profile_byte);
2102             v_score_load2 = vec_ld(16*k8,query_profile_byte);
2103 
2104             // load values of F and H from previous row (one unit up)
2105             Fup    = vec_ld(512, p);
2106             Hup2   = vec_ld(528, p);
2107 
2108             // save old values of F and H to use on next row
2109             vec_st(F, 0,  p);
2110             vec_st(H, 16, p);
2111             p += 32;
2112 
2113             // shift into place so we have complete F and H vectors
2114             // that refer to the values one unit up from each cell
2115             // that we are currently working on.
2116             Fup    = vec_sld(Fup,F,15);
2117             Hup2   = vec_sld(Hup2,H,15);
2118 
2119             // do the dynamic programming
2120 
2121             // update E value
2122             E   = vec_subs(E,v_gapextend);
2123             tmp = vec_subs(H,v_gapopen);
2124             E   = vec_max(E,tmp);
2125 
2126             // update F value
2127             F   = vec_subs(Fup,v_gapextend);
2128             tmp = vec_subs(Hup2,v_gapopen);
2129             F   = vec_max(F,tmp);
2130 
2131             v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2132 
2133             // add score to H
2134             H   = vec_adds(Hup1,v_score);
2135             H   = vec_subs(H,v_bias);
2136 
2137             // set H to max of H,E,F
2138             H   = vec_max(H,E);
2139             H   = vec_max(H,F);
2140 
2141 
2142             // Update highest score encountered this far
2143             v_maxscore = vec_max(v_maxscore,H);
2144 
2145 
2146 
2147 
2148 
2149 
2150             // STEP 3
2151 
2152             // prefetch next residue
2153             k                = db_sequence[j+3];
2154             k8               = db_sequence[j-5];
2155 
2156             v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
2157             v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
2158             v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
2159             v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
2160             v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
2161             v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
2162 
2163 
2164             // prefetch scores for next step
2165             v_score_load1 = vec_ld(16*k,query_profile_byte);
2166             v_score_load2 = vec_ld(16*k8,query_profile_byte);
2167 
2168             // load values of F and H from previous row (one unit up)
2169             Fup    = vec_ld(512, p);
2170             Hup1   = vec_ld(528, p);
2171 
2172             // save old values of F and H to use on next row
2173             vec_st(F, 0,  p);
2174             vec_st(H, 16, p);
2175             p += 32;
2176 
2177             // shift into place so we have complete F and H vectors
2178             // that refer to the values one unit up from each cell
2179             // that we are currently working on.
2180             Fup    = vec_sld(Fup,F,15);
2181             Hup1    = vec_sld(Hup1,H,15);
2182 
2183             // do the dynamic programming
2184 
2185             // update E value
2186             E   = vec_subs(E,v_gapextend);
2187             tmp = vec_subs(H,v_gapopen);
2188             E   = vec_max(E,tmp);
2189 
2190             // update F value
2191             F   = vec_subs(Fup,v_gapextend);
2192             tmp = vec_subs(Hup1,v_gapopen);
2193             F   = vec_max(F,tmp);
2194 
2195             v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2196 
2197             // add score to H
2198             H   = vec_adds(Hup2,v_score);
2199             H   = vec_subs(H,v_bias);
2200 
2201             // set H to max of H,E,F
2202             H   = vec_max(H,E);
2203             H   = vec_max(H,F);
2204 
2205             // Update highest score encountered this far
2206             v_maxscore = vec_max(v_maxscore,H);
2207 
2208 
2209 
2210 
2211 
2212 
2213             // STEP 4
2214 
2215             // prefetch next residue
2216             k                = db_sequence[j+4];
2217             k8               = db_sequence[j-4];
2218 
2219             v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
2220             v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
2221             v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
2222             v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
2223             v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
2224             v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
2225 
2226 
2227             // prefetch scores for next step
2228             v_score_load1 = vec_ld(16*k,query_profile_byte);
2229             v_score_load2 = vec_ld(16*k8,query_profile_byte);
2230 
2231             // load values of F and H from previous row (one unit up)
2232             Fup    = vec_ld(512, p);
2233             Hup2   = vec_ld(528, p);
2234 
2235             // save old values of F and H to use on next row
2236             vec_st(F, 0,  p);
2237             vec_st(H, 16, p);
2238             p += 32;
2239 
2240             // shift into place so we have complete F and H vectors
2241             // that refer to the values one unit up from each cell
2242             // that we are currently working on.
2243             Fup    = vec_sld(Fup,F,15);
2244             Hup2   = vec_sld(Hup2,H,15);
2245 
2246             // do the dynamic programming
2247 
2248             // update E value
2249             E   = vec_subs(E,v_gapextend);
2250             tmp = vec_subs(H,v_gapopen);
2251             E   = vec_max(E,tmp);
2252 
2253             // update F value
2254             F   = vec_subs(Fup,v_gapextend);
2255             tmp = vec_subs(Hup2,v_gapopen);
2256             F   = vec_max(F,tmp);
2257 
2258             v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2259 
2260             // add score to H
2261             H   = vec_adds(Hup1,v_score);
2262             H   = vec_subs(H,v_bias);
2263 
2264             // set H to max of H,E,F
2265             H   = vec_max(H,E);
2266             H   = vec_max(H,F);
2267 
2268             // Update highest score encountered this far
2269             v_maxscore = vec_max(v_maxscore,H);
2270 
2271 
2272 
2273 
2274 
2275 
2276             // STEP 5
2277 
2278             // prefetch next residue
2279             k                = db_sequence[j+5];
2280             k8               = db_sequence[j-3];
2281 
2282             v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
2283             v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
2284             v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
2285             v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
2286             v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
2287             v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
2288 
2289 
2290             // prefetch scores for next step
2291             v_score_load1 = vec_ld(16*k,query_profile_byte);
2292             v_score_load2 = vec_ld(16*k8,query_profile_byte);
2293 
2294             // load values of F and H from previous row (one unit up)
2295             Fup    = vec_ld(512, p);
2296             Hup1    = vec_ld(528, p);
2297 
2298             // save old values of F and H to use on next row
2299             vec_st(F, 0,  p);
2300             vec_st(H, 16, p);
2301             p += 32;
2302 
2303             // shift into place so we have complete F and H vectors
2304             // that refer to the values one unit up from each cell
2305             // that we are currently working on.
2306             Fup    = vec_sld(Fup,F,15);
2307             Hup1   = vec_sld(Hup1,H,15);
2308 
2309             // do the dynamic programming
2310 
2311             // update E value
2312             E   = vec_subs(E,v_gapextend);
2313             tmp = vec_subs(H,v_gapopen);
2314             E   = vec_max(E,tmp);
2315 
2316             // update F value
2317             F   = vec_subs(Fup,v_gapextend);
2318             tmp = vec_subs(Hup1,v_gapopen);
2319             F   = vec_max(F,tmp);
2320 
2321             v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2322 
2323             // add score to H
2324             H   = vec_adds(Hup2,v_score);
2325             H   = vec_subs(H,v_bias);
2326 
2327             // set H to max of H,E,F
2328             H   = vec_max(H,E);
2329             H   = vec_max(H,F);
2330 
2331             // Update highest score encountered this far
2332             v_maxscore = vec_max(v_maxscore,H);
2333 
2334 
2335 
2336 
2337 
2338 
2339             // STEP 6
2340 
2341             // prefetch next residue
2342             k                = db_sequence[j+6];
2343             k8               = db_sequence[j-2];
2344 
2345             v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
2346             v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
2347             v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
2348             v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
2349             v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
2350             v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
2351 
2352 
2353             // prefetch scores for next step
2354             v_score_load1 = vec_ld(16*k,query_profile_byte);
2355             v_score_load2 = vec_ld(16*k8,query_profile_byte);
2356 
2357             // load values of F and H from previous row (one unit up)
2358             Fup    = vec_ld(512, p);
2359             Hup2   = vec_ld(528, p);
2360 
2361             // save old values of F and H to use on next row
2362             vec_st(F, 0,  p);
2363             vec_st(H, 16, p);
2364             p += 32;
2365 
2366             // shift into place so we have complete F and H vectors
2367             // that refer to the values one unit up from each cell
2368             // that we are currently working on.
2369             Fup    = vec_sld(Fup,F,15);
2370             Hup2   = vec_sld(Hup2,H,15);
2371 
2372             // do the dynamic programming
2373 
2374             // update E value
2375             E   = vec_subs(E,v_gapextend);
2376             tmp = vec_subs(H,v_gapopen);
2377             E   = vec_max(E,tmp);
2378 
2379             // update F value
2380             F   = vec_subs(Fup,v_gapextend);
2381             tmp = vec_subs(Hup2,v_gapopen);
2382             F   = vec_max(F,tmp);
2383 
2384             v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2385 
2386             // add score to H
2387             H   = vec_adds(Hup1,v_score);
2388             H   = vec_subs(H,v_bias);
2389 
2390             // set H to max of H,E,F
2391             H   = vec_max(H,E);
2392             H   = vec_max(H,F);
2393 
2394             // Update highest score encountered this far
2395             v_maxscore = vec_max(v_maxscore,H);
2396 
2397 
2398 
2399 
2400 
2401 
2402             // STEP 7
2403 
2404             // prefetch next residue
2405             k                = db_sequence[j+7];
2406             k8               = db_sequence[j-1];
2407 
2408             v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
2409             v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
2410             v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
2411             v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
2412             v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
2413             v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
2414 
2415 
2416             // prefetch scores for next step
2417             v_score_load1 = vec_ld(16*k,query_profile_byte);
2418             v_score_load2 = vec_ld(16*k8,query_profile_byte);
2419 
2420             // load values of F and H from previous row (one unit up)
2421             Fup    = vec_ld(512, p);
2422             Hup1    = vec_ld(528, p);
2423 
2424             // save old values of F and H to use on next row
2425             vec_st(F, 0,  p);
2426             vec_st(H, 16, p);
2427             p += 32;
2428 
2429             // shift into place so we have complete F and H vectors
2430             // that refer to the values one unit up from each cell
2431             // that we are currently working on.
2432             Fup    = vec_sld(Fup,F,15);
2433             Hup1    = vec_sld(Hup1,H,15);
2434 
2435             // do the dynamic programming
2436 
2437             // update E value
2438             E   = vec_subs(E,v_gapextend);
2439             tmp = vec_subs(H,v_gapopen);
2440             E   = vec_max(E,tmp);
2441 
2442             // update F value
2443             F   = vec_subs(Fup,v_gapextend);
2444             tmp = vec_subs(Hup1,v_gapopen);
2445             F   = vec_max(F,tmp);
2446 
2447             v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2448 
2449             // add score to H
2450             H   = vec_adds(Hup2,v_score);
2451             H   = vec_subs(H,v_bias);
2452 
2453             // set H to max of H,E,F
2454             H   = vec_max(H,E);
2455             H   = vec_max(H,F);
2456 
2457             // Update highest score encountered this far
2458             v_maxscore = vec_max(v_maxscore,H);
2459 
2460 
2461 
2462 
2463 
2464 
2465             // STEP 8
2466 
2467             // prefetch next residue
2468             k                = db_sequence[j+8];
2469             k8               = db_sequence[j];
2470 
2471             v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
2472             v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
2473             v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
2474             v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
2475             v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
2476             v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
2477 
2478 
2479             // prefetch scores for next step
2480             v_score_load1 = vec_ld(16*k,query_profile_byte);
2481             v_score_load2 = vec_ld(16*k8,query_profile_byte);
2482 
2483             // load values of F and H from previous row (one unit up)
2484             Fup    = vec_ld(512, p);
2485             Hup2   = vec_ld(528, p);
2486 
2487             // save old values of F and H to use on next row
2488             vec_st(F, 0,  p);
2489             vec_st(H, 16, p);
2490             p += 32;
2491 
2492             // shift into place so we have complete F and H vectors
2493             // that refer to the values one unit up from each cell
2494             // that we are currently working on.
2495             Fup    = vec_sld(Fup,F,15);
2496             Hup2   = vec_sld(Hup2,H,15);
2497 
2498             // do the dynamic programming
2499 
2500             // update E value
2501             E   = vec_subs(E,v_gapextend);
2502             tmp = vec_subs(H,v_gapopen);
2503             E   = vec_max(E,tmp);
2504 
2505             // update F value
2506             F   = vec_subs(Fup,v_gapextend);
2507             tmp = vec_subs(Hup2,v_gapopen);
2508             F   = vec_max(F,tmp);
2509 
2510             v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2511 
2512             // add score to H
2513             H   = vec_adds(Hup1,v_score);
2514             H   = vec_subs(H,v_bias);
2515 
2516             // set H to max of H,E,F
2517             H   = vec_max(H,E);
2518             H   = vec_max(H,F);
2519 
2520             // Update highest score encountered this far
2521             v_maxscore = vec_max(v_maxscore,H);
2522 
2523 
2524 
2525 
2526 
2527 
2528             // STEP 9
2529 
2530             // prefetch next residue
2531             k                = db_sequence[j+9];
2532             k8               = db_sequence[j+1];
2533 
2534             v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
2535             v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
2536             v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
2537             v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
2538             v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
2539             v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
2540 
2541 
2542             // prefetch scores for next step
2543             v_score_load1 = vec_ld(16*k,query_profile_byte);
2544             v_score_load2 = vec_ld(16*k8,query_profile_byte);
2545 
2546             // load values of F and H from previous row (one unit up)
2547             Fup    = vec_ld(512, p);
2548             Hup1   = vec_ld(528, p);
2549 
2550             // save old values of F and H to use on next row
2551             vec_st(F, 0,  p);
2552             vec_st(H, 16, p);
2553             p += 32;
2554 
2555             // shift into place so we have complete F and H vectors
2556             // that refer to the values one unit up from each cell
2557             // that we are currently working on.
2558             Fup    = vec_sld(Fup,F,15);
2559             Hup1   = vec_sld(Hup1,H,15);
2560 
2561             // do the dynamic programming
2562 
2563             // update E value
2564             E   = vec_subs(E,v_gapextend);
2565             tmp = vec_subs(H,v_gapopen);
2566             E   = vec_max(E,tmp);
2567 
2568             // update F value
2569             F   = vec_subs(Fup,v_gapextend);
2570             tmp = vec_subs(Hup1,v_gapopen);
2571             F   = vec_max(F,tmp);
2572 
2573             v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2574 
2575             // add score to H
2576             H   = vec_adds(Hup2,v_score);
2577             H   = vec_subs(H,v_bias);
2578 
2579             // set H to max of H,E,F
2580             H   = vec_max(H,E);
2581             H   = vec_max(H,F);
2582 
2583             // Update highest score encountered this far
2584             v_maxscore = vec_max(v_maxscore,H);
2585 
2586             // STEP 10
2587 
2588             // prefetch next residue
2589             k                = db_sequence[j+10];
2590             k8               = db_sequence[j+2];
2591 
2592             v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
2593             v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
2594             v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
2595             v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
2596             v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
2597             v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
2598 
2599 
2600             // prefetch scores for next step
2601             v_score_load1 = vec_ld(16*k,query_profile_byte);
2602             v_score_load2 = vec_ld(16*k8,query_profile_byte);
2603 
2604             // load values of F and H from previous row (one unit up)
2605             Fup    = vec_ld(512, p);
2606             Hup2   = vec_ld(528, p);
2607 
2608             // save old values of F and H to use on next row
2609             vec_st(F, 0,  p);
2610             vec_st(H, 16, p);
2611             p += 32;
2612 
2613             // shift into place so we have complete F and H vectors
2614             // that refer to the values one unit up from each cell
2615             // that we are currently working on.
2616             Fup    = vec_sld(Fup,F,15);
2617             Hup2   = vec_sld(Hup2,H,15);
2618 
2619             // do the dynamic programming
2620 
2621             // update E value
2622             E   = vec_subs(E,v_gapextend);
2623             tmp = vec_subs(H,v_gapopen);
2624             E   = vec_max(E,tmp);
2625 
2626             // update F value
2627             F   = vec_subs(Fup,v_gapextend);
2628             tmp = vec_subs(Hup2,v_gapopen);
2629             F   = vec_max(F,tmp);
2630 
2631             v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2632 
2633             // add score to H
2634             H   = vec_adds(Hup1,v_score);
2635             H   = vec_subs(H,v_bias);
2636 
2637             // set H to max of H,E,F
2638             H   = vec_max(H,E);
2639             H   = vec_max(H,F);
2640 
2641             // Update highest score encountered this far
2642             v_maxscore = vec_max(v_maxscore,H);
2643 
2644             // STEP 11
2645 
2646             // prefetch next residue
2647             k                = db_sequence[j+11];
2648             k8               = db_sequence[j+3];
2649 
2650             v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
2651             v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
2652             v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
2653             v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
2654             v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
2655             v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
2656 
2657 
2658             // prefetch scores for next step
2659             v_score_load1 = vec_ld(16*k,query_profile_byte);
2660             v_score_load2 = vec_ld(16*k8,query_profile_byte);
2661 
2662             // load values of F and H from previous row (one unit up)
2663             Fup    = vec_ld(512, p);
2664             Hup1   = vec_ld(528, p);
2665 
2666             // save old values of F and H to use on next row
2667             vec_st(F, 0,  p);
2668             vec_st(H, 16, p);
2669             p += 32;
2670 
2671             // shift into place so we have complete F and H vectors
2672             // that refer to the values one unit up from each cell
2673             // that we are currently working on.
2674             Fup    = vec_sld(Fup,F,15);
2675             Hup1   = vec_sld(Hup1,H,15);
2676 
2677             // do the dynamic programming
2678 
2679             // update E value
2680             E   = vec_subs(E,v_gapextend);
2681             tmp = vec_subs(H,v_gapopen);
2682             E   = vec_max(E,tmp);
2683 
2684             // update F value
2685             F   = vec_subs(Fup,v_gapextend);
2686             tmp = vec_subs(Hup1,v_gapopen);
2687             F   = vec_max(F,tmp);
2688 
2689             v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2690 
2691             // add score to H
2692             H   = vec_adds(Hup2,v_score);
2693             H   = vec_subs(H,v_bias);
2694 
2695             // set H to max of H,E,F
2696             H   = vec_max(H,E);
2697             H   = vec_max(H,F);
2698 
2699             // Update highest score encountered this far
2700             v_maxscore = vec_max(v_maxscore,H);
2701 
2702             // STEP 12
2703 
2704             // prefetch next residue
2705             k                = db_sequence[j+12];
2706             k8               = db_sequence[j+4];
2707 
2708             v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
2709             v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
2710             v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
2711             v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
2712             v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
2713             v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
2714 
2715 
2716             // prefetch scores for next step
2717             v_score_load1 = vec_ld(16*k,query_profile_byte);
2718             v_score_load2 = vec_ld(16*k8,query_profile_byte);
2719 
2720             // load values of F and H from previous row (one unit up)
2721             Fup    = vec_ld(512, p);
2722             Hup2   = vec_ld(528, p);
2723 
2724             // save old values of F and H to use on next row
2725             vec_st(F, 0,  p);
2726             vec_st(H, 16, p);
2727             p += 32;
2728 
2729             // shift into place so we have complete F and H vectors
2730             // that refer to the values one unit up from each cell
2731             // that we are currently working on.
2732             Fup    = vec_sld(Fup,F,15);
2733             Hup2   = vec_sld(Hup2,H,15);
2734 
2735             // do the dynamic programming
2736 
2737             // update E value
2738             E   = vec_subs(E,v_gapextend);
2739             tmp = vec_subs(H,v_gapopen);
2740             E   = vec_max(E,tmp);
2741 
2742             // update F value
2743             F   = vec_subs(Fup,v_gapextend);
2744             tmp = vec_subs(Hup2,v_gapopen);
2745             F   = vec_max(F,tmp);
2746 
2747             v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2748 
2749             // add score to H
2750             H   = vec_adds(Hup1,v_score);
2751             H   = vec_subs(H,v_bias);
2752 
2753             // set H to max of H,E,F
2754             H   = vec_max(H,E);
2755             H   = vec_max(H,F);
2756 
2757             // Update highest score encountered this far
2758             v_maxscore = vec_max(v_maxscore,H);
2759 
2760             // STEP 13
2761 
2762             // prefetch next residue
2763             k                = db_sequence[j+13];
2764             k8               = db_sequence[j+5];
2765 
2766             v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
2767             v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
2768             v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
2769             v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
2770             v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
2771             v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
2772 
2773 
2774             // prefetch scores for next step
2775             v_score_load1 = vec_ld(16*k,query_profile_byte);
2776             v_score_load2 = vec_ld(16*k8,query_profile_byte);
2777 
2778             // load values of F and H from previous row (one unit up)
2779             Fup    = vec_ld(512, p);
2780             Hup1   = vec_ld(528, p);
2781 
2782             // save old values of F and H to use on next row
2783             vec_st(F, 0,  p);
2784             vec_st(H, 16, p);
2785             p += 32;
2786 
2787             // shift into place so we have complete F and H vectors
2788             // that refer to the values one unit up from each cell
2789             // that we are currently working on.
2790             Fup    = vec_sld(Fup,F,15);
2791             Hup1   = vec_sld(Hup1,H,15);
2792 
2793             // do the dynamic programming
2794 
2795             // update E value
2796             E   = vec_subs(E,v_gapextend);
2797             tmp = vec_subs(H,v_gapopen);
2798             E   = vec_max(E,tmp);
2799 
2800             // update F value
2801             F   = vec_subs(Fup,v_gapextend);
2802             tmp = vec_subs(Hup1,v_gapopen);
2803             F   = vec_max(F,tmp);
2804 
2805             v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2806 
2807             // add score to H
2808             H   = vec_adds(Hup2,v_score);
2809             H   = vec_subs(H,v_bias);
2810 
2811             // set H to max of H,E,F
2812             H   = vec_max(H,E);
2813             H   = vec_max(H,F);
2814 
2815             // Update highest score encountered this far
2816             v_maxscore = vec_max(v_maxscore,H);
2817 
2818             // STEP 14
2819 
2820             // prefetch next residue
2821             k                = db_sequence[j+14];
2822             k8               = db_sequence[j+6];
2823 
2824             v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
2825             v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
2826             v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
2827             v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
2828             v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
2829             v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
2830 
2831 
2832             // prefetch scores for next step
2833             v_score_load1 = vec_ld(16*k,query_profile_byte);
2834             v_score_load2 = vec_ld(16*k8,query_profile_byte);
2835 
2836             // load values of F and H from previous row (one unit up)
2837             Fup    = vec_ld(512, p);
2838             Hup2   = vec_ld(528, p);
2839 
2840             // save old values of F and H to use on next row
2841             vec_st(F, 0,  p);
2842             vec_st(H, 16, p);
2843             p += 32;
2844 
2845             // shift into place so we have complete F and H vectors
2846             // that refer to the values one unit up from each cell
2847             // that we are currently working on.
2848             Fup    = vec_sld(Fup,F,15);
2849             Hup2   = vec_sld(Hup2,H,15);
2850 
2851             // do the dynamic programming
2852 
2853             // update E value
2854             E   = vec_subs(E,v_gapextend);
2855             tmp = vec_subs(H,v_gapopen);
2856             E   = vec_max(E,tmp);
2857 
2858             // update F value
2859             F   = vec_subs(Fup,v_gapextend);
2860             tmp = vec_subs(Hup2,v_gapopen);
2861             F   = vec_max(F,tmp);
2862 
2863             v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2864 
2865             // add score to H
2866             H   = vec_adds(Hup1,v_score);
2867             H   = vec_subs(H,v_bias);
2868 
2869             // set H to max of H,E,F
2870             H   = vec_max(H,E);
2871             H   = vec_max(H,F);
2872 
2873             // Update highest score encountered this far
2874             v_maxscore = vec_max(v_maxscore,H);
2875 
2876             // STEP 15
2877 
2878             // prefetch next residue
2879             k                = db_sequence[j+15];
2880             k8               = db_sequence[j+7];
2881 
2882             v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
2883             v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
2884             v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
2885             v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
2886             v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
2887             v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
2888 
2889             // prefetch scores for next step
2890             v_score_load1 = vec_ld(16*k,query_profile_byte);
2891             v_score_load2 = vec_ld(16*k8,query_profile_byte);
2892 
2893             // load values of F and H from previous row (one unit up)
2894             Fup    = vec_ld(512, p);
2895             Hup1   = vec_ld(528, p);
2896 
2897             // save old values of F and H to use on next row
2898             vec_st(F, 0,  p);
2899             vec_st(H, 16, p);
2900             p += 32;
2901 
2902             // shift into place so we have complete F and H vectors
2903             // that refer to the values one unit up from each cell
2904             // that we are currently working on.
2905             Fup    = vec_sld(Fup,F,15);
2906             Hup1   = vec_sld(Hup1,H,15);
2907 
2908             // do the dynamic programming
2909 
2910             // update E value
2911             E   = vec_subs(E,v_gapextend);
2912             tmp = vec_subs(H,v_gapopen);
2913             E   = vec_max(E,tmp);
2914 
2915             // update F value
2916             F   = vec_subs(Fup,v_gapextend);
2917             tmp = vec_subs(Hup1,v_gapopen);
2918             F   = vec_max(F,tmp);
2919 
2920             v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2921 
2922             // add score to H
2923             H   = vec_adds(Hup2,v_score);
2924             H   = vec_subs(H,v_bias);
2925 
2926             // set H to max of H,E,F
2927             H   = vec_max(H,E);
2928             H   = vec_max(H,F);
2929 
2930             // Update highest score encountered this far
2931             v_maxscore = vec_max(v_maxscore,H);
2932 
2933             // STEP 16
2934 
2935             // prefetch next residue
2936             k                = db_sequence[j+16];
2937             k8               = db_sequence[j+8];
2938 
2939             v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
2940             v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
2941             v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
2942             v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
2943             v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
2944             v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
2945 
2946 
2947             // prefetch scores for next step
2948             v_score_load1 = vec_ld(16*k,query_profile_byte);
2949             v_score_load2 = vec_ld(16*k8,query_profile_byte);
2950 
2951             // load values of F and H from previous row (one unit up)
2952             Fup    = vec_ld(512, p);
2953             Hup2   = vec_ld(528, p);
2954 
2955             // save old values of F and H to use on next row
2956             vec_st(F, 0,  p);
2957             vec_st(H, 16, p);
2958             p += 32;
2959 
2960             // shift into place so we have complete F and H vectors
2961             // that refer to the values one unit up from each cell
2962             // that we are currently working on.
2963             Fup    = vec_sld(Fup,F,15);
2964             Hup2   = vec_sld(Hup2,H,15);
2965 
2966             // do the dynamic programming
2967 
2968             // update E value
2969             E   = vec_subs(E,v_gapextend);
2970             tmp = vec_subs(H,v_gapopen);
2971             E   = vec_max(E,tmp);
2972 
2973             // update F value
2974             F   = vec_subs(Fup,v_gapextend);
2975             tmp = vec_subs(Hup2,v_gapopen);
2976             F   = vec_max(F,tmp);
2977 
2978             v_score_load1 = vec_perm(v_score_load1,v_score_load2,merge_score_load);
2979 
2980             // add score to H
2981             H   = vec_adds(Hup1,v_score);
2982             H   = vec_subs(H,v_bias);
2983 
2984             // set H to max of H,E,F
2985             H   = vec_max(H,E);
2986             H   = vec_max(H,F);
2987 
2988             // Update highest score encountered this far
2989             v_maxscore = vec_max(v_maxscore,H);
2990 
2991         }
2992 
2993         for(;j<db_length+15;j++)
2994         {
2995             k8               = db_sequence[j-7];
2996 
2997             v_score     = vec_perm(v_score_q1,  v_score_load1,  queue1_to_score);
2998             v_score_q1  = vec_perm(v_score_q2,  v_score_load1,  queue2_to_queue1);
2999             v_score_q2  = vec_perm(v_score_q3,  v_score_load1,  queue3_to_queue2);
3000             v_score_q3  = vec_perm(v_score_q4,  v_score_load1,  queue4_to_queue3);
3001             v_score_q4  = vec_perm(v_score_q5,  v_score_load1,  queue5_to_queue4);
3002             v_score_q5  = vec_perm(v_score_q5,  v_score_load1,  queue5_with_load);
3003 
3004 
3005             // prefetch scores for next step
3006             v_score_load2 = vec_ld(16*k8,query_profile_byte);
3007             v_score_load1 = vec_perm(v_zero,v_score_load2,merge_score_load);
3008 
3009             // save old values of F and H to use on next row
3010             vec_st(F, 0,  p);
3011             vec_st(H, 16, p);
3012             p += 32; // move ahead 32 bytes
3013 
3014             Fup    = vec_sld(v_zero,F,15);
3015             Hup1   = vec_sld(v_zero,H,15);
3016 
3017             // do the dynamic programming
3018 
3019             // update E value
3020             E   = vec_subs(E,v_gapextend);
3021             tmp = vec_subs(H,v_gapopen);
3022             E   = vec_max(E,tmp);
3023 
3024             // update F value
3025             F   = vec_subs(Fup,v_gapextend);
3026             tmp = vec_subs(Hup1,v_gapopen);
3027             F   = vec_max(F,tmp);
3028 
3029             // add score to H
3030             H   = vec_adds(Hup2,v_score);
3031             H   = vec_subs(H,v_bias);
3032 
3033             // set H to max of H,E,F
3034             H   = vec_max(H,E);
3035             H   = vec_max(H,F);
3036 
3037             // Save value to use for next diagonal H
3038             Hup2 = Hup1;
3039 
3040             // Update highest score encountered this far
3041             v_maxscore = vec_max(v_maxscore,H);
3042         }
3043         vec_st(F, 512, p);
3044         vec_st(H, 528, p);
3045 
3046         query_profile_byte += 16*alphabet_size;
3047 
3048         // End of this row (actually 16 rows due to SIMD).
3049         // Before we continue, check for overflow.
3050         tmp      = vec_subs(vec_splat_u8(-1),v_bias);
3051         overflow = vec_any_ge(v_maxscore,tmp);
3052 
3053 
3054     }
3055 
3056     if(overflow)
3057     {
3058         return 255;
3059     }
3060     else
3061     {
3062         // find largest score in the v_maxscore vector
3063         tmp = vec_sld(v_maxscore,v_maxscore,8);
3064         v_maxscore = vec_max(v_maxscore,tmp);
3065         tmp = vec_sld(v_maxscore,v_maxscore,4);
3066         v_maxscore = vec_max(v_maxscore,tmp);
3067         tmp = vec_sld(v_maxscore,v_maxscore,2);
3068         v_maxscore = vec_max(v_maxscore,tmp);
3069         tmp = vec_sld(v_maxscore,v_maxscore,1);
3070         v_maxscore = vec_max(v_maxscore,tmp);
3071 
3072         // store in temporary variable
3073         vec_ste(v_maxscore,0,&score);
3074 
3075         // return largest score
3076         return score;
3077     }}
3078 
3079 
3080 #else
3081 
3082 /* No AltiVec support: define a dummy global so that this object file is not empty, which some compilers complain about. */
3083 
3084 int sw_dummy;
3085 
3086 #endif
3087