/*****************************************************************
 * Unipro UGENE - Genome analysis suite
 * Copyright (C) 2008 Unipro, Russia (http://ugene.net)
 * All Rights Reserved
 *
 *     This source code is distributed under the terms of the
 *     GNU General Public License. See the files COPYING and LICENSE
 *     for details.
 *****************************************************************/
/************************************************************
 * HMMER - Biological sequence analysis with profile HMMs
 * Copyright (C) 1992-2003 Washington University School of Medicine
 * All Rights Reserved
 *
 *     This source code is distributed under the terms of the
 *     GNU General Public License. See the files COPYING and LICENSE
 *     for details.
 ************************************************************/

/* fast_algorithms.c
 * SRE, Sun Nov 10 08:54:48 2002 [AA 3080, Denver to StL]
 * CVS $Id: fast_algorithms.c,v 1.1.1.1 2008/01/17 02:32:19 jizhu Exp $
 *
 * Optimized routines to replace slower implementations in core_algorithms.c.
 *
 * The routines in core_algorithms.c are designed for clarity
 * and maintainability, not for speed. Implementations here
 * are designed for speed, not clarity. If you're trying to
 * understand the code, or optimize for a specific platform,
 * you are probably better off looking at core_algorithms.c.
 *
 * P7Viterbi() is the key function to target for optimization.
 * The implementation in core_algorithms.c is currently ifdef'ed
 * out of the code. The implementation that is used by default
 * is here, in fast_algorithms.c. A third implementation, from
 * Erik Lindahl at Stanford, is Mac/Altivec specific.
 *
 * Which implementation is used is controlled by ifdefs. The
 * default implementation uses a fast implementation of
 * P7Viterbi() from here. Other options (mutually exclusive):
 *
 * -DSLOW
 *   enable original core_algorithms.c code: slower than default,
 *   but might be easier to follow for someone trying
 *   to understand the DP code.
 * -DALTIVEC
 *   enable Erik Lindahl's Altivec code for Macintosh OSX
 */
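/* For illustration only: the choice is purely compile-time, so a build might
 * look like the following (a hedged sketch; the exact flags are assumptions,
 * not taken from the UGENE build files, and in this copy the Altivec path
 * below is additionally guarded by #ifdef ALTIVEC):
 *
 *   cc -DALTIVEC -maltivec -c fast_algorithms.c    (PowerPC, vectorized)
 *   cc -DSLOW -c fast_algorithms.c                 (portable reference code)
 */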
//#if !defined ALTIVEC && !defined SLOW
#ifdef ALTIVEC
#include "config.h"
//#include "squidconf.h"

#include "structs.h"
#include "funcs.h"
//#include "squid.h"
#include <altivec.h>
#include <stdio.h>
#include <malloc.h>
//
// /* the DEFAULT P7Viterbi() is portably optimized; code follows:
//  */
//
// /* Function: P7Viterbi() - portably optimized version
//  * Incept:   SRE, Fri Nov 15 13:14:33 2002 [St. Louis]
//  *
//  * Purpose:  The Viterbi dynamic programming algorithm.
//  *           Derived from core_algorithms.c:P7Viterbi().
//  *
//  * Args:     dsq    - sequence in digitized form
//  *           L      - length of dsq
//  *           hmm    - the model
//  *           mx     - re-used DP matrix
//  *           ret_tr - RETURN: traceback; pass NULL if it's not wanted
//  *
//  * Return:   log P(S|M)/P(S|R), as a bit score
//  */
// float
// P7Viterbi(unsigned char *dsq, int L, struct plan7_s *hmm, struct dpmatrix_s *mx, struct p7trace_s **ret_tr)
// {
//   struct p7trace_s  *tr;
//   int **xmx;
//   int **mmx;
//   int **imx;
//   int **dmx;
//   int   i,k;
//   int   sc;
//   int  *mc, *dc, *ic;        /* pointers to rows of mmx, dmx, imx */
//   int  *ms, *is;             /* pointers to msc[i], isc[i] */
//   int  *mpp, *mpc, *ip;      /* ptrs to mmx[i-1], mmx[i], imx[i-1] */
//   int  *bp;                  /* ptr into bsc[] */
//   int  *ep;                  /* ptr into esc[] */
//   int   xmb;                 /* value of xmx[i-1][XMB] */
//   int   xme;                 /* max for xmx[i][XME] */
//   int  *dpp;                 /* ptr into dmx[i-1] (previous row) */
//   int  *tpmm, *tpmi, *tpmd, *tpim, *tpii, *tpdm, *tpdd; /* ptrs into tsc */
//   int   M;
//
//   /* Make sure we have space for a DP matrix with 0..L rows, 0..M columns.
//    */
//   ResizePlan7Matrix(mx, L, hmm->M, &xmx, &mmx, &imx, &dmx);
//
//   /* Initialization of the zero row.
//    */
//   xmx[0][XMN] = 0;		                     /* S->N, p=1            */
//   xmx[0][XMB] = hmm->xsc[XTN][MOVE];                 /* S->N->B, no N-tail   */
//   xmx[0][XME] = xmx[0][XMC] = xmx[0][XMJ] = -INFTY;  /* need seq to get here */
//   for (k = 0; k <= hmm->M; k++)
//     mmx[0][k] = imx[0][k] = dmx[0][k] = -INFTY;      /* need seq to get here */
//
//   /* Initializations that help icc vectorize.
//    */
//   M        = hmm->M;
//
//   /* Recursion. Done as a pull.
//    * Note some slightly wasteful boundary conditions:
//    *    tsc[0] = -INFTY for all seven transitions (no node 0)
//    *    D_M and I_M are wastefully calculated (they don't exist)
//    */
//
//   tpmm  = hmm->tsc[TMM];
//   tpim  = hmm->tsc[TIM];
//   tpdm  = hmm->tsc[TDM];
//   tpmd  = hmm->tsc[TMD];
//   tpdd  = hmm->tsc[TDD];
//   tpmi  = hmm->tsc[TMI];
//   tpii  = hmm->tsc[TII];
//   bp    = hmm->bsc;
//   for (i = 1; i <= L; i++) {
//     mc    = mmx[i];
//     dc    = dmx[i];
//     ic    = imx[i];
//     mpp   = mmx[i-1];
//     dpp   = dmx[i-1];
//     ip    = imx[i-1];
//     xmb   = xmx[i-1][XMB];
//     ms    = hmm->msc[dsq[i]];
//     is    = hmm->isc[dsq[i]];
//     mc[0] = -INFTY;
//     dc[0] = -INFTY;
//     ic[0] = -INFTY;
//
//     for (k = 1; k <= M; k++) {
//       mc[k] = mpp[k-1]   + tpmm[k-1];
//       if ((sc = ip[k-1]  + tpim[k-1]) > mc[k])  mc[k] = sc;
//       if ((sc = dpp[k-1] + tpdm[k-1]) > mc[k])  mc[k] = sc;
//       if ((sc = xmb  + bp[k])         > mc[k])  mc[k] = sc;
//       mc[k] += ms[k];
//       if (mc[k] < -INFTY) mc[k] = -INFTY;
//
//       dc[k] = dc[k-1] + tpdd[k-1];
//       if ((sc = mc[k-1] + tpmd[k-1]) > dc[k]) dc[k] = sc;
//       if (dc[k] < -INFTY) dc[k] = -INFTY;
//
//       if (k < M) {
// 	ic[k] = mpp[k] + tpmi[k];
// 	if ((sc = ip[k] + tpii[k]) > ic[k]) ic[k] = sc;
// 	ic[k] += is[k];
// 	if (ic[k] < -INFTY) ic[k] = -INFTY;
//       }
//     }
//
//     /* Now the special states. Order is important here.
//      * Remember, C and J emissions are zero score by definition.
//      */
// 				/* N state */
//     xmx[i][XMN] = -INFTY;
//     if ((sc = xmx[i-1][XMN] + hmm->xsc[XTN][LOOP]) > -INFTY)
//       xmx[i][XMN] = sc;
//
// 				/* E state */
//     xme = -INFTY;
//     mpc = mmx[i];
//     ep  = hmm->esc;
//     for (k = 1; k <= hmm->M; k++)
//       if ((sc =  mpc[k] + ep[k]) > xme) xme = sc;
//     xmx[i][XME] = xme;
// 				/* J state */
//     xmx[i][XMJ] = -INFTY;
//     if ((sc = xmx[i-1][XMJ] + hmm->xsc[XTJ][LOOP]) > -INFTY)
//       xmx[i][XMJ] = sc;
//     if ((sc = xmx[i][XME]   + hmm->xsc[XTE][LOOP]) > xmx[i][XMJ])
//       xmx[i][XMJ] = sc;
//
// 				/* B state */
//     xmx[i][XMB] = -INFTY;
//     if ((sc = xmx[i][XMN] + hmm->xsc[XTN][MOVE]) > -INFTY)
//       xmx[i][XMB] = sc;
//     if ((sc = xmx[i][XMJ] + hmm->xsc[XTJ][MOVE]) > xmx[i][XMB])
//       xmx[i][XMB] = sc;
//
// 				/* C state */
//     xmx[i][XMC] = -INFTY;
//     if ((sc = xmx[i-1][XMC] + hmm->xsc[XTC][LOOP]) > -INFTY)
//       xmx[i][XMC] = sc;
//     if ((sc = xmx[i][XME] + hmm->xsc[XTE][MOVE]) > xmx[i][XMC])
//       xmx[i][XMC] = sc;
//   }
// 				/* T state (not stored) */
//   sc = xmx[L][XMC] + hmm->xsc[XTC][MOVE];
//
//   if (ret_tr != NULL) {
//     P7ViterbiTrace(hmm, dsq, L, mx, &tr);
//     *ret_tr = tr;
//   }
//
//   return Scorify(sc);		/* the total Viterbi score. */
// }
// #endif /*default P7Viterbi, used when ALTIVEC and SLOW are not defined*/


//#ifdef ALTIVEC
/*################################################################
 * The Altivec port, for Macintosh PowerPC.
 * Erik Lindahl, Stanford University, 2002.
 *
 * Replaces the following functions:
 *    AllocPlan7Body()      plan7.c                (data alignment on 16-byte boundaries)
 *    CreatePlan7Matrix()   core_algorithms.c      (data alignment on 16-byte boundaries)
 *    ResizePlan7Matrix()   core_algorithms.c      (data alignment on 16-byte boundaries)
 *    P7Viterbi()           core_algorithms.c      (total recode, w/ Altivec instructions)
 ################################################################*/
void
AllocPlan7Body(struct plan7_s *hmm, int M)
{
  int k, x;

  hmm->M = M;

  hmm->rf     = (char *)MallocOrDie ((M+2) * sizeof(char));
  hmm->cs     = (char *)MallocOrDie ((M+2) * sizeof(char));
  hmm->ca     = (char *)MallocOrDie ((M+2) * sizeof(char));
  hmm->map    = (int  *)MallocOrDie ((M+1) * sizeof(int));

  hmm->t      = (float **)MallocOrDie (M     *           sizeof(float *));
  hmm->tsc    = (int **)MallocOrDie (7     *           sizeof(int *));
  hmm->mat    = (float **)MallocOrDie ((M+1) *           sizeof(float *));
  hmm->ins    = (float **)MallocOrDie (M     *           sizeof(float *));
  hmm->msc    = (int **)MallocOrDie (MAXCODE   *       sizeof(int *));
  hmm->isc    = (int **)MallocOrDie (MAXCODE   *       sizeof(int *));

  hmm->t[0]   = (float *)MallocOrDie ((7*M)     *       sizeof(float));

  /* Allocate extra memory so tsc[TMM,TIM,TDM,TMD,TDD] start on the
   * 16-byte cache boundary, and tsc[TMI,TII] start
   * 12 bytes offset from the boundary.
   */
  //hmm->tsc_mem = MallocOrDie (((7*(M+16)))  *   sizeof(int));
  hmm->tsc_mem = (int *)memalign(16, (((7*(M+16)))  *   sizeof(int)));
  memset( hmm->tsc_mem, 0, ((7*(M+16)))  *   sizeof(int) );
  hmm->mat[0] = (float *)MallocOrDie ((MAXABET*(M+1)) * sizeof(float));
  hmm->ins[0] = (float *)MallocOrDie ((MAXABET*M) *     sizeof(float));
  /* Allocate extra mem. to make sure all members of msc,isc start
   * on 12-byte offsets from the cache boundary.
   */
  hmm->msc_mem = (int *)memalign(128,((MAXCODE*(M+1+16)) * sizeof(int)));
  memset( hmm->msc_mem, 0, (MAXCODE*(M+1+16)) * sizeof(int) );
  hmm->isc_mem = (int *)memalign(128,(((MAXCODE*(M+16)) *   sizeof(int))));
  memset( hmm->isc_mem, 0, (MAXCODE*(M+16)) *   sizeof(int) );
  //hmm->msc_mem = (int *)MallocOrDie((MAXCODE*(M+1+16)) * sizeof(int));
  //hmm->isc_mem = (int *)MallocOrDie((MAXCODE*(M+16)) *   sizeof(int));

  /* note allocation strategy for important 2D arrays -- trying
   * to keep locality as much as possible, cache efficiency etc.
   */
  for (k = 1; k <= M; k++) {
    hmm->mat[k] = hmm->mat[0] + k * MAXABET;
    if (k < M) {
      hmm->ins[k] = hmm->ins[0] + k * MAXABET;
      hmm->t[k]   = hmm->t[0]   + k * 7;
    }
  }

  /* align tsc pointers */
  hmm->tsc[TMM] = (int *) (((((unsigned long int) hmm->tsc_mem) + 15) & (~0xf)));
  hmm->tsc[TMI] = (int *) (((((unsigned long int) hmm->tsc_mem) + (M+12)*sizeof(int) + 15) & (~0xf)) + 12);
  hmm->tsc[TMD] = (int *) (((((unsigned long int) hmm->tsc_mem) + 2*(M+12)*sizeof(int) + 15) & (~0xf)));
  hmm->tsc[TIM] = (int *) (((((unsigned long int) hmm->tsc_mem) + 3*(M+12)*sizeof(int) + 15) & (~0xf)));
  hmm->tsc[TII] = (int *) (((((unsigned long int) hmm->tsc_mem) + 4*(M+12)*sizeof(int) + 15) & (~0xf)) + 12);
  hmm->tsc[TDM] = (int *) (((((unsigned long int) hmm->tsc_mem) + 5*(M+12)*sizeof(int) + 15) & (~0xf)));
  hmm->tsc[TDD] = (int *) (((((unsigned long int) hmm->tsc_mem) + 6*(M+12)*sizeof(int) + 15) & (~0xf)));

  for (x = 0; x < MAXCODE; x++) {
    hmm->msc[x] = (int *) (((((unsigned long int)hmm->msc_mem) + x*(M+1+12)*sizeof(int) + 15) & (~0xf)) + 12);
    hmm->isc[x] = (int *) (((((unsigned long int)hmm->isc_mem) + x*(M+12)*sizeof(int) + 15) & (~0xf)) + 12);
  }
  /* tsc[0] is used as a boundary condition sometimes [Viterbi()],
   * so set to -inf always.
   */
  for (x = 0; x < 7; x++)
    hmm->tsc[x][0] = -INFTY;

  hmm->begin  = (float *)MallocOrDie ((M+1) * sizeof(float));
  hmm->bsc_mem= (int *)memalign(128,((M+1+12) * sizeof(int)));
  memset( hmm->bsc_mem, 0, (M+1+12) * sizeof(int) );
  //hmm->bsc_mem= (int *)MallocOrDie ((M+1+12) * sizeof(int));
  hmm->end    = (float *)MallocOrDie ((M+1) * sizeof(float));
  //hmm->esc_mem= (int *)MallocOrDie ((M+1+12) * sizeof(int));
  hmm->esc_mem= (int *)memalign(128, ((M+1+12) * sizeof(int)));
  memset( hmm->esc_mem, 0, (M+1+12) * sizeof(int) );

  hmm->bsc = (int *) (((((unsigned long int) hmm->bsc_mem) + 15) & (~0xf)) + 12);
  hmm->esc = (int *) (((((unsigned long int) hmm->esc_mem) + 15) & (~0xf)) + 12);

  return;
}
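
/* A minimal sketch of the alignment idiom used throughout this file
 * (illustrative only; `mem` is a hypothetical raw allocation):
 *
 *   int *p = (int *) ((((unsigned long int) mem + 15) & ~0xf) + 12);
 *
 * (mem + 15) & ~0xf rounds mem up to the next 16-byte boundary, and the
 * extra +12 parks p four bytes before the boundary after that, so element
 * p[1] is 16-byte aligned. For example, mem = 0x1004 rounds up to 0x1010,
 * p = 0x101c, and &p[1] = 0x1020, a multiple of 16.
 */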


/* Function: CreatePlan7Matrix()
 *
 * Purpose:  Create a dynamic programming matrix for standard Forward,
 *           Backward, or Viterbi, with scores kept as scaled log-odds
 *           integers. Keeps 2D arrays compact in RAM in an attempt
 *           to maximize cache hits.
 *
 *           The mx structure can be dynamically grown, if a new
 *           HMM or seq exceeds the currently allocated size. Dynamic
 *           growing is more efficient than an alloc/free of a whole
 *           matrix for every new target. The ResizePlan7Matrix()
 *           call does this reallocation, if needed. Here, in the
 *           creation step, we set up some pads - to inform the resizing
 *           call how much to overallocate when it realloc's. If a pad
 *           is zero, we will not resize in that dimension.
 *
 * Args:     N     - N+1 rows are allocated, for sequence.
 *           M     - size of model in nodes
 *           padN  - over-realloc in seq/row dimension, or 0
 *           padM  - over-realloc in HMM/column dimension, or 0
 *
 * Return:   mx
 *           mx is allocated here. Caller frees with FreePlan7Matrix(mx).
 */
struct dpmatrix_s *
CreatePlan7Matrix(int N, int M, int padN, int padM)
{
  struct dpmatrix_s *mx;
  int i,n;

  mx         = (struct dpmatrix_s *) MallocOrDie (sizeof(struct dpmatrix_s));
  mx->xmx    = (int **) MallocOrDie (sizeof(int *) * (N+1));
  mx->mmx    = (int **) MallocOrDie (sizeof(int *) * (N+1));
  mx->imx    = (int **) MallocOrDie (sizeof(int *) * (N+1));
  mx->dmx    = (int **) MallocOrDie (sizeof(int *) * (N+1));

  /* For the memory accessed by the altivec routines, we want to have
   * accesses aligned to 16-byte boundaries as far as possible.
   * To accomplish this, we align extra memory and then set the first
   * pointer on each row to point to 4 bytes before a boundary.
   * This means element 1, which is the first one we work on, will be
   * on a 16-byte boundary. We still make sure we 'own' the 12 bytes
   * before, though, so we can load the vector with element 0 cache-aligned too.
   * The real pointers to memory are kept in xmx_mem, mmx_mem, imx_mem, dmx_mem.
   */
  mx->xmx_mem = (void *) MallocOrDie (sizeof(int) * (N+1)*(5 + 16));
  mx->mmx_mem = (void *) MallocOrDie (sizeof(int) * (N+1)*(M+2+16));
  mx->imx_mem = (void *) MallocOrDie (sizeof(int) * (N+1)*(M+2+16));
  mx->dmx_mem = (void *) MallocOrDie (sizeof(int) * (N+1)*(M+2+16));

  mx->xmx[0] = (int *) (((((unsigned long int) mx->xmx_mem) + 15) & (~0xf)) + 12);
  mx->mmx[0] = (int *) (((((unsigned long int) mx->mmx_mem) + 15) & (~0xf)) + 12);
  mx->imx[0] = (int *) (((((unsigned long int) mx->imx_mem) + 15) & (~0xf)) + 12);
  mx->dmx[0] = (int *) (((((unsigned long int) mx->dmx_mem) + 15) & (~0xf)) + 12);

  /* And make sure the beginning of each row is aligned the same way */
  for (i = 1; i <= N; i++)
    {
      mx->xmx[i] = mx->xmx[0] + i*(5+11); /* add 11 ints per row, making the stride (16 ints) divisible by 4 */
      n = 12 - (M+2)%4;
      mx->mmx[i] = mx->mmx[0] + i*(M+2+n);
      mx->imx[i] = mx->imx[0] + i*(M+2+n);
      mx->dmx[i] = mx->dmx[0] + i*(M+2+n);
    }

  mx->maxN = N;
  mx->maxM = M;
  mx->padN = padN;
  mx->padM = padM;

  return mx;
}
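
/* A hedged usage sketch (the pad values are illustrative, not prescribed by
 * this file): allocate once, let P7Viterbi() grow the matrix as needed via
 * ResizePlan7Matrix(), and free it only after the final search.
 *
 *   struct dpmatrix_s *mx = CreatePlan7Matrix(1, hmm->M, 25, 0);
 *   sc = P7Viterbi(dsq, L, hmm, mx, &tr);
 *   FreePlan7Matrix(mx);
 */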


struct dpmatrix_s *
CreatePlan7MatrixHP(int N, int M, int padN, int padM, char *memaddr)
{
  struct dpmatrix_s *mx;
  int i,n;
  mx         = (struct dpmatrix_s *) (memaddr + 0x060000);
  mx->xmx    = (int **) (memaddr + 0x065000);
  mx->mmx    = (int **) (memaddr + 0x070000);
  mx->imx    = (int **) (memaddr + 0x075000);
  mx->dmx    = (int **) (memaddr + 0x080000);
  mx->xmx_mem = (void *) (memaddr + 0x100000);
  mx->mmx_mem = (void *) (memaddr + 0x150000);
  mx->imx_mem = (void *) (memaddr + 0x200000);
  mx->dmx_mem = (void *) (memaddr + 0x250000);

  mx->xmx[0] = (int *) (((((unsigned long int) mx->xmx_mem) + 15) & (~0xf)) + 12);
  mx->mmx[0] = (int *) (((((unsigned long int) mx->mmx_mem) + 15) & (~0xf)) + 12);
  mx->imx[0] = (int *) (((((unsigned long int) mx->imx_mem) + 15) & (~0xf)) + 12);
  mx->dmx[0] = (int *) (((((unsigned long int) mx->dmx_mem) + 15) & (~0xf)) + 12);

  /* And make sure the beginning of each row is aligned the same way */
  for (i = 1; i <= N; i++)
    {
      mx->xmx[i] = mx->xmx[0] + i*(5+11); /* add 11 ints per row, making the stride (16 ints) divisible by 4 */
      n = 12 - (M+2)%4;
      mx->mmx[i] = mx->mmx[0] + i*(M+2+n);
      mx->imx[i] = mx->imx[0] + i*(M+2+n);
      mx->dmx[i] = mx->dmx[0] + i*(M+2+n);
    }

  mx->maxN = N;
  mx->maxM = M;
  mx->padN = padN;
  mx->padM = padM;

  return mx;
}
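
/* CreatePlan7MatrixHP() carves the matrix out of a caller-supplied region at
 * the fixed offsets above (0x060000 .. 0x250000), so memaddr must point to a
 * block covering the highest offset plus the dmx rows. A hedged sketch of a
 * call (the size below is an assumption, not derived from this file):
 *
 *   char *region = (char *) MallocOrDie(0x400000);
 *   struct dpmatrix_s *mx = CreatePlan7MatrixHP(N, M, 0, 0, region);
 */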


/* Function: ResizePlan7Matrix()
 *
 * Purpose:  Reallocate a dynamic programming matrix, if necessary,
 *           for a problem of NxM: sequence length N, model size M.
 *           (N=1 for small memory score-only variants; we allocate
 *           N+1 rows in the DP matrix.)
 *
 *           See additional comments in
 *           core_algorithms.c:ResizePlan7Matrix(), the normal version
 *           of this function. This version is only used in the
 *           Altivec (--enable-altivec) port.
 *
 *           Returns individual ptrs to the four matrix components
 *           as a convenience.
 *
 * Args:     mx    - an already allocated matrix to grow.
 *           N     - seq length to allocate for; N+1 rows
 *           M     - size of model
 *           xmx, mmx, imx, dmx
 *                 - RETURN: ptrs to four mx components as a convenience
 *
 * Return:   (void)
 *           mx is (re)allocated here.
 */
void
ResizePlan7Matrix(struct dpmatrix_s *mx, int N, int M,
		  int ***xmx, int ***mmx, int ***imx, int ***dmx)
{
  int i,n;

  if (N <= mx->maxN && M <= mx->maxM)
    {
      if (xmx != NULL) *xmx = mx->xmx;
      if (mmx != NULL) *mmx = mx->mmx;
      if (imx != NULL) *imx = mx->imx;
      if (dmx != NULL) *dmx = mx->dmx;
      return;
    }

  if (N > mx->maxN) {
    N          += mx->padN;
    mx->maxN    = N;
    mx->xmx     = (int **) ReallocOrDie (mx->xmx, sizeof(int *) * (mx->maxN+1));
    mx->mmx     = (int **) ReallocOrDie (mx->mmx, sizeof(int *) * (mx->maxN+1));
    mx->imx     = (int **) ReallocOrDie (mx->imx, sizeof(int *) * (mx->maxN+1));
    mx->dmx     = (int **) ReallocOrDie (mx->dmx, sizeof(int *) * (mx->maxN+1));
  }

  if (M > mx->maxM) {
    M += mx->padM;
    mx->maxM = M;
  }

  mx->xmx_mem = ReallocOrDie (mx->xmx_mem, sizeof(int) * (mx->maxN+1)*(5 + 16));
  mx->mmx_mem = ReallocOrDie (mx->mmx_mem, sizeof(int) * (mx->maxN+1)*(mx->maxM+2+16));
  mx->imx_mem = ReallocOrDie (mx->imx_mem, sizeof(int) * (mx->maxN+1)*(mx->maxM+2+16));
  mx->dmx_mem = ReallocOrDie (mx->dmx_mem, sizeof(int) * (mx->maxN+1)*(mx->maxM+2+16));

  mx->xmx[0] = (int *) (((((unsigned long int) mx->xmx_mem) + 15) & (~0xf)) + 12);
  mx->mmx[0] = (int *) (((((unsigned long int) mx->mmx_mem) + 15) & (~0xf)) + 12);
  mx->imx[0] = (int *) (((((unsigned long int) mx->imx_mem) + 15) & (~0xf)) + 12);
  mx->dmx[0] = (int *) (((((unsigned long int) mx->dmx_mem) + 15) & (~0xf)) + 12);

  /* And make sure the beginning of each row is aligned the same way */
  for (i = 1; i <= mx->maxN; i++)
    {
      mx->xmx[i] = mx->xmx[0] + i*(5+11); /* add 11 ints per row, making the stride (16 ints) divisible by 4 */
      n = 12 - (mx->maxM+2)%4;
      mx->mmx[i] = mx->mmx[0] + i*(mx->maxM+2+n);
      mx->imx[i] = mx->imx[0] + i*(mx->maxM+2+n);
      mx->dmx[i] = mx->dmx[0] + i*(mx->maxM+2+n);
    }

  if (xmx != NULL) *xmx = mx->xmx;
  if (mmx != NULL) *mmx = mx->mmx;
  if (imx != NULL) *imx = mx->imx;
  if (dmx != NULL) *dmx = mx->dmx;
}
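
/* Growth semantics, as a hedged example with illustrative numbers: if mx was
 * created with padN = 25 and a sequence of length N > maxN arrives, the call
 * below reallocates to N+25 rows, so the next few slightly longer sequences
 * will not trigger another realloc.
 *
 *   ResizePlan7Matrix(mx, L, hmm->M, &xmx, &mmx, &imx, &dmx);
 */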


/* Function: P7Viterbi()
 *
 * Purpose:  The Viterbi dynamic programming algorithm; Altivec implementation
 *           by Erik Lindahl, Stanford University, 2002.
 *
 * Args:     dsq    - sequence in digitized form
 *           L      - length of dsq
 *           hmm    - the model
 *           mx     - DP matrix (may get grown here)
 *           ret_tr - RETURN: traceback; pass NULL if it's not wanted
 *
 * Return:   log P(S|M)/P(S|R), as a bit score
 */
/* This first version of P7Viterbi has been accelerated with Altivec vectorization.
 * On Apple hardware, it is up to a factor of 10 faster than the old non-altivec version.
 */
typedef union {
  vector signed int v;
  int i[4];
} ivector;
void printivec(vector signed int z)
{
    ivector q;
    q.v=z;
    printf("%d  %d  %d  %d\n",q.i[0],q.i[1],q.i[2],q.i[3]);
}
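/* (debug helper; e.g. printivec(v_lowscore) would print the four lanes of a
 * vector - an illustrative call, not used by the code below) */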
float
P7Viterbi(unsigned char *dsq, int L, struct plan7_s *hmm, struct dpmatrix_s *mx, struct p7trace_s **ret_tr)
{
  struct p7trace_s  *tr;
  int **xmx;
  int **mmx;
  int **imx;
  int **dmx;
  int  *mmxi,*xmxi,*imxi,*dmxi;
  int  *lmmxi,*lxmxi,*limxi,*ldmxi;
  int  *p_tmm,*p_tim,*p_tdm,*p_bsc,*p_msc,*p_tmd,*p_tdd,*p_tmi,*p_tii,*p_isc,*p_esc;
  int   i,n,k;
  int   sc;
  /* gcc and motorola use different syntax for initializing vectors,
   * so we use a dummy variable instead to get an address to load from...
   */
  int t_lowscore = -INFTY;

  /* vector variables. We avoid the stupid gcc register spill/fill code by
   * limiting ourselves to 32 generic variables and making them all registers.
   * (This reuse is the reason for the generic variable names.)
   */
  register vector signed int v_lowscore;
  register vector signed int max_mmxesc;
  register vector signed int v_xmb;
  register vector unsigned int mask1;
  register vector unsigned int mask2;
  register vector unsigned int mask3;
  register vector unsigned int mask4;
  register vector signed int v_lmmx1;
  register vector signed int v_lmmx2;
  register vector signed int v_limx1;
  register vector signed int v_limx2;
  register vector signed int v_save_lmmx;
  register vector signed int v_save_ldmx;
  register vector signed int v_save_limx;
  register vector signed int v_save_mmx;
  register vector signed int v_save_dmx;
  register vector signed int v1;
  register vector signed int v2;
  register vector signed int v3;
  register vector signed int v4;
  register vector signed int v5;
  register vector signed int v6;
  register vector signed int v7;
  register vector signed int v8;
  register vector signed int v9;
  register vector signed int v10;
  register vector signed int v11;
  register vector signed int v12;
  register vector signed int v13;
  register vector signed int v14;
  register vector signed int v15;

  /* load (-infinity) to all four elements in v_lowscore */
  v_lowscore      = vec_lde(0, &t_lowscore );
  mask1           = (vector unsigned int)vec_lvsl(0,&t_lowscore);
  v_lowscore      = vec_perm(v_lowscore,v_lowscore,(vector unsigned char)mask1);
  v_lowscore      = vec_splat(v_lowscore,0);

  v1 = vec_splat_s32(-1);
  v2 = vec_splat_s32(0);
  mask1 = (vector unsigned int)vec_sld(v1,v2,12); /* FF in first pos, rest are 00 */
  mask2 = vec_sld(mask1,mask1,12);
  mask3 = vec_sld(mask1,mask1,8);
  mask4 = vec_sld(mask1,mask1,4);
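
  /* the four masks now each select a single 32-bit lane:
   * mask1 = {~0,0,0,0}, mask2 = {0,~0,0,0}, mask3 = {0,0,~0,0}, mask4 = {0,0,0,~0}
   */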

  /* Make sure our DP matrix has 0..L rows, 0..M columns; grow it if needed.
   */
  ResizePlan7Matrix(mx, L, hmm->M, &xmx, &mmx, &imx, &dmx);

  /* Initialization of the zero row.
   */
  xmx[0][XMN] = 0;		                     /* S->N, p=1            */
  xmx[0][XMB] = hmm->xsc[XTN][MOVE];                 /* S->N->B, no N-tail   */
  xmx[0][XME] = xmx[0][XMC] = xmx[0][XMJ] = -INFTY;  /* need seq to get here */

  mmxi=mmx[0];
  imxi=imx[0];
  dmxi=dmx[0];
  xmxi=xmx[0];

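  /* store -INFTY across row 0 of mmx/imx/dmx; vec_st writes four ints at a
   * time and takes a byte offset, hence the n*4 */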
  for (n = 0; n < 5+hmm->M; n+=4) {
    vec_st(v_lowscore, n*4, mmxi);
    vec_st(v_lowscore, n*4, imxi);
    vec_st(v_lowscore, n*4, dmxi);
  }

  /* Fill data beyond M with -INFTY, so we can take the maximum including
   * elements with k>M.
   */
  for(k=1+hmm->M;k<=(3+hmm->M);k++) {
      hmm->esc[k]=-INFTY;
      hmm->bsc[k]=-INFTY;
      for(i=0;i<7;i++)
	hmm->tsc[i][k]=-INFTY;
      for(i=0;i<MAXCODE;i++) {
	hmm->msc[i][k]=-INFTY;
	hmm->isc[i][k]=-INFTY;
      }
  }
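  /* (these k>M entries land in the extra padding AllocPlan7Body() reserved
   * past position M, so the writes stay in bounds) */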

  /* Recursion. Done as a pull.
   * Note some slightly wasteful boundary conditions:
   *    tsc[0] = -INFTY for all seven transitions (no node 0)
   *    D_M and I_M are wastefully calculated (they don't exist)
   */

  for (i = 1; i <= L; i++) {
    /* pointers to last (i-1) row */
    lmmxi=mmxi;
    limxi=imxi;
    ldmxi=dmxi;
    lxmxi=xmxi;

    /* get pointers to this row */
    mmxi=mmx[i];
    imxi=imx[i];
    dmxi=dmx[i];
    xmxi=xmx[i];

    /* Set everything that doesn't depend on k here */

    /* load and splat (spread to all elements) XMX[i-1][XMB] */
    v13   = vec_lde(0,&(xmx[i-1][XMB]));
    v14   = (vector signed int)vec_lvsl(0,&(xmx[i-1][XMB]));
    v13   = vec_perm(v13,v13,(vector unsigned char)v14);
    v_xmb = vec_splat(v13,0);
    p_tmm = hmm->tsc[TMM];
    p_tim = hmm->tsc[TIM];
    p_tdm = hmm->tsc[TDM];
    p_bsc = hmm->bsc;
    k = dsq[i];
    p_msc = hmm->msc[k];
    p_isc = hmm->isc[k];
    p_tmd = hmm->tsc[TMD];
    p_tdd = hmm->tsc[TDD];
    p_tmi = hmm->tsc[TMI];
    p_tii = hmm->tsc[TII];
    p_esc = hmm->esc;
    max_mmxesc = v_lowscore;

    /* element 0 of each row sits 12 bytes past a 16-byte boundary, so a
     * single vec_st at offset 0 (which rounds down to that boundary)
     * initializes element 0 and the padding before it.
     */
    vec_st(v_lowscore,0,mmxi);
    vec_st(v_lowscore,0,imxi);
    vec_st(v_lowscore,0,dmxi);

    /* Load the first (i.e. 'previous') vector on the last row for mmx,imx,dmx,
     * and load the first vector of dmx and mmx on this row.
     */
    v_save_lmmx = vec_ld(-12, lmmxi);
    v_save_limx = vec_ld(-12, limxi);
    v_save_ldmx = vec_ld(-12, ldmxi);
    v_save_mmx = vec_ld(-12, mmxi);
    v_save_dmx = vec_ld(-12, dmxi);

    /* we have allocated extra memory, so it is perfectly OK
     * to do the calculations for a couple of extra cells where
     * k>hmm->M. These cells just won't be used.
     */
    for (n = 4, k=1; k < (hmm->M-3) ; n+=32, k+=8) {
	/* match state */

	/* 1: check which of mmx[i-1][k-1]+TMM and imx[i-1][k-1]+TIM is better,
	 * but do it for 8 elements in parallel.
	 * Since we are comparing with data on the previous row but one position
	 * earlier, we have to shift stuff. Load two new vectors each round,
	 * and use the saved register from last round.
	 */
	/* load mmx data */
	v_lmmx1 = vec_ld(n,    lmmxi);
	v_lmmx2 = vec_ld(n+16, lmmxi);

	/* load imx data */
	v_limx1 = vec_ld(n,    limxi);
	v_limx2 = vec_ld(n+16, limxi);

	v5    = vec_ld(n,    ldmxi);	/* Load dmx data */
	v10   = vec_ld(n+16, ldmxi);

	/* shift mmx, imx & dmx data */
	v1    = vec_sld(v_save_lmmx,v_lmmx1,12);
	v3    = vec_sld(v_save_limx,v_limx1,12);
	v9    = vec_sld(v_save_ldmx,v5,12);

	/* shift mmx, imx & dmx data */
	v2    = vec_sld(v_lmmx1,v_lmmx2,12);
	v4    = vec_sld(v_limx1,v_limx2,12);
	v_save_ldmx = v10;
	v10   = vec_sld(v5,v10,12);

	v_save_lmmx = v_lmmx2;
	v_save_limx = v_limx2;

	/* v1,v2 now contain 8 elements with mmx[i-1][k-1],
	 * v3,v4 contain 8 elements with imx[i-1][k-1],
	 * and v9,v10 contain 8 elements with dmx[i-1][k-1].
	 */
	/* load TMM, TIM & TDM entries from the HMM - these are aligned in memory */
        v5    = vec_ld(n-4, p_tmm);
        v6    = vec_ld(n+12, p_tmm);
        v7    = vec_ld(n-4, p_tim);
        v8    = vec_ld(n+12, p_tim);
        v11   = vec_ld(n-4, p_tdm);
        v12   = vec_ld(n+12, p_tdm);
	/* load bsc[k] */
	v14   = vec_ld(n, p_bsc);
	v15   = vec_ld(n+16, p_bsc);

	/* calc mmx+TMM, imx+TIM, dmx+TDM and XMX+bsc with saturated arithmetic, so
	 * we don't wrap around when we add the large negative numbers used for -infinity.
	 */
	v1    = vec_adds(v1,v5);
	v2    = vec_adds(v2,v6);
	v3    = vec_adds(v3,v7);
	v4    = vec_adds(v4,v8);
	v9    = vec_adds(v9,v11);
	v10   = vec_adds(v10,v12);
	v14   = vec_adds(v14,v_xmb);
	v15   = vec_adds(v15,v_xmb);
	/* Select max of mmx+TMM and imx+TIM in each element */
	v1    = vec_max(v1,v3);
	v2    = vec_max(v2,v4);
	/* select max of dmx+TDM and XMX+bsc */
	v9    = vec_max(v9,v14);
	v10   = vec_max(v10,v15);
        /* select max of the four alternatives */
	v1    = vec_max(v1,v9);
	v2    = vec_max(v2,v10);
	/* v1,v2 now contain the max values for the new mmx;
	 * check if we should add msc.
	 */

	//printf("L=%d M=%d n=%d k=%d i=%d dsq[i]=%d p_msc=hmm->msc[dsq[i]]=%d hmm->msc[dsq[i]]=%d\n",
	//     L,hmm->M,n,k,i,dsq[i],p_msc,hmm->msc[dsq[i]]);
        v3    = vec_ld(n,    p_msc);
        v4    = vec_ld(n+16, p_msc);

        v5    = (vector signed int)vec_cmpgt(v3,v_lowscore);
        v6    = (vector signed int)vec_cmpgt(v4,v_lowscore);

	/* load esc[k] */
	v9    = vec_ld(n, p_esc);
	v10   = vec_ld(n+16, p_esc);

	v1    = vec_adds(v1,v3);
	v2    = vec_adds(v2,v4);
        v1    = vec_sel(v3,v1,(vector unsigned int)v5);
        v2    = vec_sel(v4,v2,(vector unsigned int)v6);

	/* have final values for mmx on this row in v1,v2 - store it */
	vec_st(v1, n,    mmxi);
	vec_st(v2, n+16, mmxi);

	v9    = vec_adds(v9,v1);
	v10   = vec_adds(v10,v2);
	v9    = vec_max(v9,v10);
	max_mmxesc = vec_max(v9,max_mmxesc);

	/* OK. We finished the match state. Normally we could now first
	 * do the delete and then the insertion. The problem is that the
	 * delete is a pain in the ass to vectorize, since each element
	 * depends on the previous one in the vector. This means I have
	 * to write this relatively simple operation as eight independent
	 * ones, just as we would have done in a non-vectorized code.
	 * Since this is independent of the insertion state changes, I
	 * try to hide the latencies by doing the delete and insert
	 * calculations in parallel.
         * To make things easier I add 'del' in a comment on each
	 * line for calculations that are on the delete state, and 'ins'
	 * for the calculations for the insertion. Hang on...
	 */

	/* We already have the match data on this row from the previous
	 * iteration in v_save_mmx. Rotate it so the element that was in
	 * pos 4 last iteration is in position 1, and pos 2-4 contain
	 * the first three elements of mmx this iteration.
         * And do the same type of rotation for v1/v2...
	 */

	v4 = vec_sld(v1,v2,12); /* del */
        v3 = vec_sld(v_save_mmx,v1,12); /* del */
	v_save_mmx = v2; /* Save for next iteration */

	/* Rotate last dmx data so we have the fourth element in pos 1. */
	v_save_dmx = vec_sld(v_save_dmx,v_save_dmx,12); /* del */

        /* load TMD & TDD data */
        v5   = vec_ld(n-4, p_tmd); /* del */
	v6   = vec_ld(n+12, p_tmd); /* del */
        v7   = vec_ld(n-4, p_tdd); /* del */
	v8   = vec_ld(n+12, p_tdd); /* del */

	/* calculate mmx+TMD */
	v3   = vec_adds(v3,v5); /* del */
	v4   = vec_adds(v4,v6); /* del */

	/* Start the ugly stuff. We have rotated last dmx data. Add TDD to
	 * it and compare with v3/v4 (data from the mmx+TMD alternative), but
	 * we only compare & replace the first position, so we use a mask!
	 */

        /* First position: Add TDD to v_save_dmx */
	v_save_dmx = vec_adds(v_save_dmx,v7); /* del */
	/* Select max of this and mmx+TMD and save it temporarily in v_save_dmx */
	v_save_dmx = vec_max(v_save_dmx,v3); /* del */
	/* use a mask to select only the first element from v_save_dmx, rest from v3. */
	v3     = vec_sel(v3,v_save_dmx,mask1); /* del */

	/* Start doing the insertion calculations in parallel.
	 * Load the TMI data.
	 */
        v9    = vec_ld(n   , p_tmi); /* ins */
	v10   = vec_ld(n+16, p_tmi); /* ins */
	/* Deletion:
         * Now we have an accurate pos 1 in v3. Continue with pos 2 the same
         * way. Rotate into a temporary, add TDD, compare, and use a mask
         * to write back to only position 2.
	 */
	v_save_dmx = vec_sld(v3,v3,12); /* del */
	v_save_dmx = vec_adds(v_save_dmx,v7); /* del */
	v_save_dmx = vec_max(v_save_dmx,v3); /* del */
	v3     = vec_sel(v3,v_save_dmx,mask2); /* del */

	/* More insertion stuff - load TII data */
	v11   = vec_ld(n   , p_tii); /* ins */
	v12   = vec_ld(n+16, p_tii); /* ins */

	/* Deletion, position 3... */
	v_save_dmx = vec_sld(v3,v3,12); /* del */
	v_save_dmx = vec_adds(v_save_dmx,v7); /* del */
	v_save_dmx = vec_max(v_save_dmx,v3); /* del */
	v3     = vec_sel(v3,v_save_dmx,mask3); /* del */

	/* insertion stuff: calculate mmx+TMI */
	v9     = vec_adds(v_lmmx1,v9); /* ins */
	v10    = vec_adds(v_lmmx2,v10); /* ins */

	/* Deletion, position 4 */
	v_save_dmx = vec_sld(v3,v3,12); /* del */
	v_save_dmx = vec_adds(v_save_dmx,v7); /* del */
	v_save_dmx = vec_max(v_save_dmx,v3); /* del */
	v3     = vec_sel(v3,v_save_dmx,mask4); /* del */

	/* insertion stuff: calculate imx+TII */
	v11    = vec_adds(v_limx1,v11); /* ins */
	v12    = vec_adds(v_limx2,v12); /* ins */

	/* That was the first deletion vector, but we are unrolling.
	 * The next step is position '5', i.e. the first in vector 2.
	 * This one depends on the last position of vector 1, which we just finished.
	 */
	v_save_dmx = vec_sld(v3,v3,12); /* del */
	v_save_dmx = vec_adds(v_save_dmx,v8); /* del */
	v_save_dmx = vec_max(v_save_dmx,v4); /* del */
	v4     = vec_sel(v4,v_save_dmx,mask1); /* del */

	/* insertion stuff: select max of mmx+TMI and imx+TII */
	v9     = vec_max(v9,v11); /* ins */
	v10    = vec_max(v10,v12); /* ins */
	/* insertion stuff: load data from hmm->isc[dsq[i]] (p_isc) */
	v11    = vec_ld(n, p_isc); /* ins */
	v12    = vec_ld(n+16, p_isc); /* ins */

	/* position 6 (2 in vector 2) */
	v_save_dmx = vec_sld(v4,v4,12); /* del */
	v_save_dmx = vec_adds(v_save_dmx,v8); /* del */
	v_save_dmx = vec_max(v_save_dmx,v4); /* del */
	v4     = vec_sel(v4,v_save_dmx,mask2); /* del */

	/* insertion: mask of positions where isc is above -INFTY (guards the add below) */
	v13    = (vector signed int)vec_cmpgt(v11,v_lowscore);
	v14    = (vector signed int)vec_cmpgt(v12,v_lowscore);

	/* position 7 (3 in vector 2) */
	v_save_dmx = vec_sld(v4,v4,12); /* del */
	v_save_dmx = vec_adds(v_save_dmx,v8); /* del */
	v_save_dmx = vec_max(v_save_dmx,v4); /* del */
	v4     = vec_sel(v4,v_save_dmx,mask3); /* del */

	v9     = vec_adds(v9,v11);
	v10    = vec_adds(v10,v12);
	v9     = vec_sel(v11,v9,(vector unsigned int)v13);
	v10    = vec_sel(v12,v10,(vector unsigned int)v14);

	/* position 8 (4 in vector 2) */
	v_save_dmx = vec_sld(v4,v4,12); /* del */
	v_save_dmx = vec_adds(v_save_dmx,v8); /* del */
	v_save_dmx = vec_max(v_save_dmx,v4); /* del */
	v_save_dmx = vec_sel(v4,v_save_dmx,mask4); /* del */

	/* Puh! that was the deletion... v3/v_save_dmx now contain the updated dmx data;
	 * save it to memory. (v_save_dmx will be used next iteration)
	 */
	vec_st(v3,        n, dmxi); /* del */
	vec_st(v_save_dmx, n+16, dmxi); /* del */

	/* save insertion too */
	vec_st(v9, n, imxi);
	vec_st(v10,n+16,imxi);

     }
    /* odd loop */
    if(k < (1+hmm->M)) {
	/* match state */
	/* 1: check which of mmx[i-1][k-1]+TMM and imx[i-1][k-1]+TIM is better,
	 * but do it for 4 elements in parallel.
	 * Since we are comparing with data on the previous row but one position
	 * earlier, we have to shift stuff. Load one new vector this round,
	 * and use the saved register from the last round.
	 */
	/* load mmx data */
	v_lmmx1 = vec_ld(n,    lmmxi);

	/* load imx data */
	v_limx1 = vec_ld(n,    limxi);

	v5    = vec_ld(n,    ldmxi);	/* Load dmx data */

	/* shift mmx, imx & dmx data */
	v1    = vec_sld(v_save_lmmx,v_lmmx1,12);
	v3    = vec_sld(v_save_limx,v_limx1,12);
	v9    = vec_sld(v_save_ldmx,v5,12);

	/* v1 now contains 4 elements with mmx[i-1][k-1],
	 * v3 contains 4 elements with imx[i-1][k-1],
	 * and v9 contains 4 elements with dmx[i-1][k-1].
	 */
	/* load TMM, TIM & TDM entries from the HMM - these are aligned in memory */
        v5    = vec_ld(n-4, p_tmm);
        v7    = vec_ld(n-4, p_tim);
        v11   = vec_ld(n-4, p_tdm);
	/* load bsc[k] */
	v14   = vec_ld(n, p_bsc);

	/* calc mmx+TMM, imx+TIM, dmx+TDM and XMX+bsc with saturated arithmetic, so
	 * we don't wrap around when we add the large negative numbers used for -infinity.
	 */
	v1    = vec_adds(v1,v5);
	v3    = vec_adds(v3,v7);
	v9    = vec_adds(v9,v11);
	v14   = vec_adds(v14,v_xmb);
	/* Select max of mmx+TMM and imx+TIM in each element */
	v1    = vec_max(v1,v3);
	/* select max of dmx+TDM and XMX+bsc */
	v9    = vec_max(v9,v14);
        /* select max of the four alternatives */
	v1    = vec_max(v1,v9);
	/* v1 now contains the max values for the new mmx;
	 * check if we should add msc.
	 */

        v3    = vec_ld(n,    p_msc);

        v5    = (vector signed int)vec_cmpgt(v3,v_lowscore);

	/* load esc[k] */
	v9    = vec_ld(n, p_esc);

	v1    = vec_adds(v1,v3);
        v1    = vec_sel(v3,v1,(vector unsigned int)v5);

	/* have final values for mmx on this row in v1 - store it */
	vec_st(v1, n,    mmxi);

	v9    = vec_adds(v9,v1);
	max_mmxesc = vec_max(v9,max_mmxesc);

	/* OK. We finished the match state. Normally we could now first
	 * do the delete and then the insertion. The problem is that the
	 * delete is a pain in the ass to vectorize, since each element
	 * depends on the previous one in the vector. This means I have
	 * to write this relatively simple operation as four independent
	 * ones, just as we would have done in a non-vectorized code.
	 * Since this is independent of the insertion state changes, I
	 * try to hide the latencies by doing the delete and insert
	 * calculations in parallel.
	 * To make things easier I add 'del' in a comment on each
	 * line for calculations that are on the delete state, and 'ins'
	 * for the calculations for the insertion. Hang on...
	 */

	/* We already have the match data on this row from the previous
	 * iteration in v_save_mmx. Rotate it so the element that was in
	 * pos 4 last iteration is in position 1, and pos 2-4 contain
	 * the first three elements of mmx this iteration.
	 */

        v3 = vec_sld(v_save_mmx,v1,12); /* del */

	/* Rotate last dmx data so we have the fourth element in pos 1. */
	v_save_dmx = vec_sld(v_save_dmx,v_save_dmx,12); /* del */

        /* load TMD & TDD data */
        v5   = vec_ld(n-4, p_tmd); /* del */
        v7   = vec_ld(n-4, p_tdd); /* del */

	/* calculate mmx+TMD */
	v3   = vec_adds(v3,v5); /* del */

	/* Start the ugly stuff. We have rotated last dmx data. Add TDD to
	 * it and compare with v3 (data from the mmx+TMD alternative), but
	 * we only compare & replace the first position, so we use a mask!
	 */

        /* First position: Add TDD to v_save_dmx */
	v_save_dmx = vec_adds(v_save_dmx,v7); /* del */
	/* Select max of this and mmx+TMD and save it temporarily in v_save_dmx */
	v_save_dmx = vec_max(v_save_dmx,v3); /* del */
	/* use a mask to select only the first element from v_save_dmx, rest from v3. */
	v3     = vec_sel(v3,v_save_dmx,mask1); /* del */

	/* Start doing the insertion calculations in parallel.
	 * Load the TMI data.
	 */
        v9    = vec_ld(n   , p_tmi); /* ins */
	/* Deletion:
	 * Now we have an accurate pos 1 in v3. Continue with pos 2 the same
	 * way. Rotate into a temporary, add TDD, compare, and use a mask
	 * to write back to only position 2.
	 */
	v_save_dmx = vec_sld(v3,v3,12); /* del */
	v_save_dmx = vec_adds(v_save_dmx,v7); /* del */
	v_save_dmx = vec_max(v_save_dmx,v3); /* del */
	v3     = vec_sel(v3,v_save_dmx,mask2); /* del */

	/* More insertion stuff - load TII data */
	v11   = vec_ld(n   , p_tii); /* ins */

	/* Deletion, position 3... */
	v_save_dmx = vec_sld(v3,v3,12); /* del */
	v_save_dmx = vec_adds(v_save_dmx,v7); /* del */
	v_save_dmx = vec_max(v_save_dmx,v3); /* del */
	v3     = vec_sel(v3,v_save_dmx,mask3); /* del */

	/* insertion stuff: calculate mmx+TMI */
	v9     = vec_adds(v_lmmx1,v9); /* ins */

	/* Deletion, position 4 */
	v_save_dmx = vec_sld(v3,v3,12); /* del */
	v_save_dmx = vec_adds(v_save_dmx,v7); /* del */
	v_save_dmx = vec_max(v_save_dmx,v3); /* del */
	v3     = vec_sel(v3,v_save_dmx,mask4); /* del */

	/* insertion stuff: calculate imx+TII */
	v11    = vec_adds(v_limx1,v11); /* ins */

	/* insertion stuff: select max of mmx+TMI and imx+TII */
	v9     = vec_max(v9,v11); /* ins */
	/* insertion stuff: load data from hmm->isc[dsq[i]] (p_isc) */
	v11    = vec_ld(n, p_isc); /* ins */

	/* insertion: mask of positions where isc is above -INFTY (guards the add below) */
	v13    = (vector signed int)vec_cmpgt(v11,v_lowscore);

	v9     = vec_adds(v9,v11);
	v9     = vec_sel(v11,v9,(vector unsigned int)v13);

	/* Puh! that was the deletion... v3 now contains the updated dmx data;
	 * save it to memory.
	 */
	vec_st(v3,        n, dmxi); /* del */

	/* save insertion too */
	vec_st(v9, n, imxi);
    }
    /* end of k loops */

    /* Now the special states. Order is important here;
     * remember, C and J emissions are zero score by definition.
     */
				/* N state */
    xmx[i][XMN] = -INFTY;
    if ((sc = xmx[i-1][XMN] + hmm->xsc[XTN][LOOP]) > -INFTY)
      xmx[i][XMN] = sc;

				/* E state */
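    /* horizontal max over the four lanes of max_mmxesc: rotate by 8 bytes and
     * take the max, then by 4 bytes, leaving the overall max in every lane;
     * vec_ste then stores one lane of the result at xmxi[XME]. */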
    v2 = vec_sld(max_mmxesc,max_mmxesc,8);
    v2 = vec_max(v2,max_mmxesc);
    v1 = vec_sld(v2,v2,4);
    v1 = vec_max(v1,v2);
    vec_ste(v1,XME*4,xmxi);

				/* J state */
    xmxi[XMJ] = -INFTY;
    if ((sc = lxmxi[XMJ] + hmm->xsc[XTJ][LOOP]) > -INFTY)
      xmxi[XMJ] = sc;
    if ((sc = xmxi[XME]   + hmm->xsc[XTE][LOOP]) > xmxi[XMJ])
      xmxi[XMJ] = sc;

				/* B state */
    xmxi[XMB] = -INFTY;
    if ((sc = xmxi[XMN] + hmm->xsc[XTN][MOVE]) > -INFTY)
      xmxi[XMB] = sc;
    if ((sc = xmxi[XMJ] + hmm->xsc[XTJ][MOVE]) > xmxi[XMB])
      xmxi[XMB] = sc;

				/* C state */
    xmxi[XMC] = -INFTY;
    if ((sc = lxmxi[XMC] + hmm->xsc[XTC][LOOP]) > -INFTY)
      xmxi[XMC] = sc;
    if ((sc = xmxi[XME] + hmm->xsc[XTE][MOVE]) > xmxi[XMC])
      xmxi[XMC] = sc;
  }
  /* T state (not stored) */
  sc = xmx[L][XMC] + hmm->xsc[XTC][MOVE];

  if (ret_tr != NULL) {
    P7ViterbiTrace(hmm, dsq, L, mx, &tr);
    *ret_tr = tr;
  }

  /* Note (Lindahl): we do NOT free the dpmatrix here anymore - the code was
   * spending 30% of the runtime allocating/freeing memory.
   * Provide a pointer to a dpmatrix_s structure to this routine,
   * and we try to reuse it. After the final call to P7Viterbi,
   * free it with FreePlan7Matrix.
   */
  return Scorify(sc);		/* the total Viterbi score. */
}
#endif /*the ALTIVEC port*/