/*****************************************************************
 * Unipro UGENE - Genome analysis suite
 * Copyright (C) 2008 Unipro, Russia (http://ugene.net)
 * All Rights Reserved
 *
 * This source code is distributed under the terms of the
 * GNU General Public License. See the files COPYING and LICENSE
 * for details.
 *****************************************************************/
/************************************************************
 * HMMER - Biological sequence analysis with profile HMMs
 * Copyright (C) 1992-2003 Washington University School of Medicine
 * All Rights Reserved
 *
 * This source code is distributed under the terms of the
 * GNU General Public License. See the files COPYING and LICENSE
 * for details.
 ************************************************************/

/* fast_algorithms.c
 * SRE, Sun Nov 10 08:54:48 2002 [AA 3080, Denver to StL]
 * CVS $Id: fast_algorithms.c,v 1.1.1.1 2008/01/17 02:32:19 jizhu Exp $
 *
 * Optimized routines to replace slower implementations in core_algorithms.c.
 *
 * The routines in core_algorithms.c are designed for clarity
 * and maintainability, not for speed. Implementations here
 * are designed for speed, not clarity. If you're trying to
 * understand the code, or optimize for a specific platform,
 * you are probably better off looking at core_algorithms.c.
 *
 * P7Viterbi() is the key function to target optimization to.
 * The implementation in core_algorithms.c is currently ifdef'ed
 * out of the code. The implementation that is used by default
 * is here, in fast_algorithms.c. A third implementation, from
 * Erik Lindahl at Stanford, is Mac/Altivec specific.
 *
 * Which implementation is used is controlled by ifdef's. The
 * default implementation uses a fast implementation of
 * P7Viterbi() from here. Other options (mutually exclusive):
 *
 * -DSLOW
 *   enable original core_algorithms.c code: slower than default,
 *   but might be easier to follow, for someone trying
 *   to understand the DP code.
 * -DALTIVEC
 *   enable Erik Lindahl's Altivec code for Macintosh OSX
 */
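/* Illustrative build lines only (the real flags come from the build system;
 * HMMER's configure uses --enable-altivec for the Altivec path):
 *   cc -O2 -DALTIVEC -maltivec -c fast_algorithms.c
 *   cc -O2 -DSLOW -c fast_algorithms.c
 */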
//#if !defined ALTIVEC && !defined SLOW
#ifdef ALTIVEC
#include "config.h"
//#include "squidconf.h"

#include "structs.h"
#include "funcs.h"
//#include "squid.h"
#include <altivec.h>
#include <stdio.h>
#include <malloc.h>
//
// /* the DEFAULT P7Viterbi() is portably optimized; code follows:
//  */
//
// /* Function: P7Viterbi() - portably optimized version
//  * Incept:   SRE, Fri Nov 15 13:14:33 2002 [St. Louis]
//  *
//  * Purpose:  The Viterbi dynamic programming algorithm.
//  *           Derived from core_algorithms.c:P7Viterbi().
//  *
//  * Args:     dsq    - sequence in digitized form
//  *           L      - length of dsq
//  *           hmm    - the model
//  *           mx     - re-used DP matrix
//  *           ret_tr - RETURN: traceback; pass NULL if it's not wanted
//  *
//  * Return:   log P(S|M)/P(S|R), as a bit score
//  */
// float
// P7Viterbi(unsigned char *dsq, int L, struct plan7_s *hmm, struct dpmatrix_s *mx, struct p7trace_s **ret_tr)
// {
//   struct p7trace_s *tr;
//   int **xmx;
//   int **mmx;
//   int **imx;
//   int **dmx;
//   int   i,k;
//   int   sc;
//   int  *mc, *dc, *ic;          /* pointers to rows of mmx, dmx, imx   */
//   int  *ms, *is;               /* pointers to msc[i], isc[i]          */
//   int  *mpp, *mpc, *ip;        /* ptrs to mmx[i-1], mmx[i], imx[i-1]  */
//   int  *bp;                    /* ptr into bsc[]                      */
//   int  *ep;                    /* ptr into esc[]                      */
//   int   xmb;                   /* value of xmx[i-1][XMB]              */
//   int   xme;                   /* max for xmx[i][XME]                 */
//   int  *dpp;                   /* ptr into dmx[i-1] (previous row)    */
//   int  *tpmm, *tpmi, *tpmd, *tpim, *tpii, *tpdm, *tpdd; /* ptrs into tsc */
//   int   M;
//
//   /* Make sure we have space for a DP matrix with 0..L rows, 0..M-1 columns.
//    */
//   ResizePlan7Matrix(mx, L, hmm->M, &xmx, &mmx, &imx, &dmx);
//
//   /* Initialization of the zero row.
//    */
//   xmx[0][XMN] = 0;                                     /* S->N, p=1            */
//   xmx[0][XMB] = hmm->xsc[XTN][MOVE];                   /* S->N->B, no N-tail   */
//   xmx[0][XME] = xmx[0][XMC] = xmx[0][XMJ] = -INFTY;    /* need seq to get here */
//   for (k = 0; k <= hmm->M; k++)
//     mmx[0][k] = imx[0][k] = dmx[0][k] = -INFTY;        /* need seq to get here */
//
//   /* Initializations that help icc vectorize.
//    */
//   M = hmm->M;
//
//   /* Recursion. Done as a pull.
//    * Note some slightly wasteful boundary conditions:
//    *   tsc[0] = -INFTY for all seven transitions (no node 0)
//    *   D_M and I_M are wastefully calculated (they don't exist)
//    */
//
//   tpmm = hmm->tsc[TMM];
//   tpim = hmm->tsc[TIM];
//   tpdm = hmm->tsc[TDM];
//   tpmd = hmm->tsc[TMD];
//   tpdd = hmm->tsc[TDD];
//   tpmi = hmm->tsc[TMI];
//   tpii = hmm->tsc[TII];
//   bp   = hmm->bsc;
//   for (i = 1; i <= L; i++) {
//     mc  = mmx[i];
//     dc  = dmx[i];
//     ic  = imx[i];
//     mpp = mmx[i-1];
//     dpp = dmx[i-1];
//     ip  = imx[i-1];
//     xmb = xmx[i-1][XMB];
//     ms  = hmm->msc[dsq[i]];
//     is  = hmm->isc[dsq[i]];
//     mc[0] = -INFTY;
//     dc[0] = -INFTY;
//     ic[0] = -INFTY;
//
//     for (k = 1; k <= M; k++) {
//       mc[k] = mpp[k-1] + tpmm[k-1];
//       if ((sc = ip[k-1] + tpim[k-1]) > mc[k]) mc[k] = sc;
//       if ((sc = dpp[k-1] + tpdm[k-1]) > mc[k]) mc[k] = sc;
//       if ((sc = xmb + bp[k]) > mc[k]) mc[k] = sc;
//       mc[k] += ms[k];
//       if (mc[k] < -INFTY) mc[k] = -INFTY;
//
//       dc[k] = dc[k-1] + tpdd[k-1];
//       if ((sc = mc[k-1] + tpmd[k-1]) > dc[k]) dc[k] = sc;
//       if (dc[k] < -INFTY) dc[k] = -INFTY;
//
//       if (k < M) {
//         ic[k] = mpp[k] + tpmi[k];
//         if ((sc = ip[k] + tpii[k]) > ic[k]) ic[k] = sc;
//         ic[k] += is[k];
//         if (ic[k] < -INFTY) ic[k] = -INFTY;
//       }
//     }
//
//     /* Now the special states. Order is important here.
//      * Remember, C and J emissions are zero score by definition.
//      */
//     /* N state */
//     xmx[i][XMN] = -INFTY;
//     if ((sc = xmx[i-1][XMN] + hmm->xsc[XTN][LOOP]) > -INFTY)
//       xmx[i][XMN] = sc;
//
//     /* E state */
//     xme = -INFTY;
//     mpc = mmx[i];
//     ep  = hmm->esc;
//     for (k = 1; k <= hmm->M; k++)
//       if ((sc = mpc[k] + ep[k]) > xme) xme = sc;
//     xmx[i][XME] = xme;
//     /* J state */
//     xmx[i][XMJ] = -INFTY;
//     if ((sc = xmx[i-1][XMJ] + hmm->xsc[XTJ][LOOP]) > -INFTY)
//       xmx[i][XMJ] = sc;
//     if ((sc = xmx[i][XME] + hmm->xsc[XTE][LOOP]) > xmx[i][XMJ])
//       xmx[i][XMJ] = sc;
//
//     /* B state */
//     xmx[i][XMB] = -INFTY;
//     if ((sc = xmx[i][XMN] + hmm->xsc[XTN][MOVE]) > -INFTY)
//       xmx[i][XMB] = sc;
//     if ((sc = xmx[i][XMJ] + hmm->xsc[XTJ][MOVE]) > xmx[i][XMB])
//       xmx[i][XMB] = sc;
//
//     /* C state */
//     xmx[i][XMC] = -INFTY;
//     if ((sc = xmx[i-1][XMC] + hmm->xsc[XTC][LOOP]) > -INFTY)
//       xmx[i][XMC] = sc;
//     if ((sc = xmx[i][XME] + hmm->xsc[XTE][MOVE]) > xmx[i][XMC])
//       xmx[i][XMC] = sc;
//   }
//   /* T state (not stored) */
//   sc = xmx[L][XMC] + hmm->xsc[XTC][MOVE];
//
//   if (ret_tr != NULL) {
//     P7ViterbiTrace(hmm, dsq, L, mx, &tr);
//     *ret_tr = tr;
//   }
//
//   return Scorify(sc);    /* the total Viterbi score. */
// }
// #endif /*default P7Viterbi, used when ALTIVEC and SLOW are not defined*/


//#ifdef ALTIVEC
/*################################################################
 * The Altivec port, for Macintosh PowerPC.
 * Erik Lindahl, Stanford University, 2002.
 *
 * Replaces the following functions:
 *    AllocPlan7Body()      plan7.c            (data alignment on 16-byte boundaries)
 *    CreatePlan7Matrix()   core_algorithms.c  (data alignment on 16-byte boundaries)
 *    ResizePlan7Matrix()   core_algorithms.c  (data alignment on 16-byte boundaries)
 *    P7Viterbi()           core_algorithms.c  (total recode, w/ Altivec instructions)
 ################################################################*/
void
AllocPlan7Body(struct plan7_s *hmm, int M)
{
  int k, x;

  hmm->M = M;

  hmm->rf  = (char *)MallocOrDie ((M+2) * sizeof(char));
  hmm->cs  = (char *)MallocOrDie ((M+2) * sizeof(char));
  hmm->ca  = (char *)MallocOrDie ((M+2) * sizeof(char));
  hmm->map = (int *)MallocOrDie ((M+1) * sizeof(int));

  hmm->t   = (float **)MallocOrDie (M * sizeof(float *));
  hmm->tsc = (int **)MallocOrDie (7 * sizeof(int *));
  hmm->mat = (float **)MallocOrDie ((M+1) * sizeof(float *));
  hmm->ins = (float **)MallocOrDie (M * sizeof(float *));
  hmm->msc = (int **)MallocOrDie (MAXCODE * sizeof(int *));
  hmm->isc = (int **)MallocOrDie (MAXCODE * sizeof(int *));

  hmm->t[0] = (float *)MallocOrDie ((7*M) * sizeof(float));

  /* Allocate extra memory so tsc[TMM,TIM,TDM,TMD,TDD] start on the
   * 16-byte cache boundary, and tsc[TMI,TII] start
   * 12 bytes offset from the boundary.
   */
  //hmm->tsc_mem = MallocOrDie (((7*(M+16))) * sizeof(int));
  hmm->tsc_mem = (int *)memalign(16, (((7*(M+16))) * sizeof(int)));
  memset( hmm->tsc_mem, 0, ((7*(M+16))) * sizeof(int) );
  hmm->mat[0] = (float *)MallocOrDie ((MAXABET*(M+1)) * sizeof(float));
  hmm->ins[0] = (float *)MallocOrDie ((MAXABET*M) * sizeof(float));
  /* Allocate extra mem. to make sure all members of msc,isc start
   * on 12-byte offsets from the cache boundary.
   */
  hmm->msc_mem = (int *)memalign(128,((MAXCODE*(M+1+16)) * sizeof(int)));
  memset( hmm->msc_mem, 0, (MAXCODE*(M+1+16)) * sizeof(int) );
  hmm->isc_mem = (int *)memalign(128,(((MAXCODE*(M+16)) * sizeof(int))));
  memset( hmm->isc_mem, 0, (MAXCODE*(M+16)) * sizeof(int) );
  //hmm->msc_mem = (int *)MallocOrDie((MAXCODE*(M+1+16)) * sizeof(int));
  //hmm->isc_mem = (int *)MallocOrDie((MAXCODE*(M+16)) * sizeof(int));

  /* note allocation strategy for important 2D arrays -- trying
   * to keep locality as much as possible, cache efficiency etc.
   */
  for (k = 1; k <= M; k++) {
    hmm->mat[k] = hmm->mat[0] + k * MAXABET;
    if (k < M) {
      hmm->ins[k] = hmm->ins[0] + k * MAXABET;
      hmm->t[k]   = hmm->t[0] + k * 7;
    }
  }

  /* align tsc pointers */
  hmm->tsc[TMM] = (int *) (((((unsigned long int) hmm->tsc_mem) + 15) & (~0xf)));
  hmm->tsc[TMI] = (int *) (((((unsigned long int) hmm->tsc_mem) + (M+12)*sizeof(int) + 15) & (~0xf)) + 12);
  hmm->tsc[TMD] = (int *) (((((unsigned long int) hmm->tsc_mem) + 2*(M+12)*sizeof(int) + 15) & (~0xf)));
  hmm->tsc[TIM] = (int *) (((((unsigned long int) hmm->tsc_mem) + 3*(M+12)*sizeof(int) + 15) & (~0xf)));
  hmm->tsc[TII] = (int *) (((((unsigned long int) hmm->tsc_mem) + 4*(M+12)*sizeof(int) + 15) & (~0xf)) + 12);
  hmm->tsc[TDM] = (int *) (((((unsigned long int) hmm->tsc_mem) + 5*(M+12)*sizeof(int) + 15) & (~0xf)));
  hmm->tsc[TDD] = (int *) (((((unsigned long int) hmm->tsc_mem) + 6*(M+12)*sizeof(int) + 15) & (~0xf)));
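  /* Worked example of the alignment arithmetic (illustrative):
   * ((p + 15) & ~0xf) rounds p up to the next 16-byte boundary, and the
   * trailing "+ 12" used for TMI/TII (and msc/isc/bsc/esc below) puts
   * element 0 twelve bytes past that boundary, so element 1 - the first
   * element the DP loops touch - is itself 16-byte aligned.
   * E.g. p = 0x1004: (0x1004+15) & ~0xf = 0x1010, +12 gives element 0
   * at 0x101c and element 1 at 0x1020, a 16-byte boundary.
   */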

  for (x = 0; x < MAXCODE; x++) {
    hmm->msc[x] = (int *) (((((unsigned long int)hmm->msc_mem) + x*(M+1+12)*sizeof(int) + 15) & (~0xf)) + 12);
    hmm->isc[x] = (int *) (((((unsigned long int)hmm->isc_mem) + x*(M+12)*sizeof(int) + 15) & (~0xf)) + 12);
  }
  /* tsc[0] is used as a boundary condition sometimes [Viterbi()],
   * so set to -inf always.
   */
  for (x = 0; x < 7; x++)
    hmm->tsc[x][0] = -INFTY;

  hmm->begin   = (float *)MallocOrDie ((M+1) * sizeof(float));
  hmm->bsc_mem = (int *)memalign(128,((M+1+12) * sizeof(int)));
  memset( hmm->bsc_mem, 0, (M+1+12) * sizeof(int) );
  //hmm->bsc_mem= (int *)MallocOrDie ((M+1+12) * sizeof(int));
  hmm->end     = (float *)MallocOrDie ((M+1) * sizeof(float));
  //hmm->esc_mem= (int *)MallocOrDie ((M+1+12) * sizeof(int));
  hmm->esc_mem = (int *)memalign(128, ((M+1+12) * sizeof(int)));
  memset( hmm->esc_mem, 0, (M+1+12) * sizeof(int) );

  hmm->bsc = (int *) (((((unsigned long int) hmm->bsc_mem) + 15) & (~0xf)) + 12);
  hmm->esc = (int *) (((((unsigned long int) hmm->esc_mem) + 15) & (~0xf)) + 12);

  return;
}


/* Function: CreatePlan7Matrix()
 *
 * Purpose:  Create a dynamic programming matrix for standard Forward,
 *           Backward, or Viterbi, with scores kept as scaled log-odds
 *           integers. Keeps 2D arrays compact in RAM in an attempt
 *           to maximize cache hits.
 *
 *           The mx structure can be dynamically grown, if a new
 *           HMM or seq exceeds the currently allocated size. Dynamic
 *           growing is more efficient than an alloc/free of a whole
 *           matrix for every new target. The ResizePlan7Matrix()
 *           call does this reallocation, if needed. Here, in the
 *           creation step, we set up some pads - to inform the resizing
 *           call how much to overallocate when it realloc's. If a pad
 *           is zero, we will not resize in that dimension.
 *
 * Args:     N     - N+1 rows are allocated, for sequence.
 *           M     - size of model in nodes
 *           padN  - over-realloc in seq/row dimension, or 0
 *           padM  - over-realloc in HMM/column dimension, or 0
 *
 * Return:   mx
 *           mx is allocated here. Caller frees with FreePlan7Matrix(mx).
 */
struct dpmatrix_s *
CreatePlan7Matrix(int N, int M, int padN, int padM)
{
  struct dpmatrix_s *mx;
  int i,n;

  mx      = (struct dpmatrix_s *) MallocOrDie (sizeof(struct dpmatrix_s));
  mx->xmx = (int **) MallocOrDie (sizeof(int *) * (N+1));
  mx->mmx = (int **) MallocOrDie (sizeof(int *) * (N+1));
  mx->imx = (int **) MallocOrDie (sizeof(int *) * (N+1));
  mx->dmx = (int **) MallocOrDie (sizeof(int *) * (N+1));

  /* For the memory accessed by the altivec routines, we want to have
   * accesses aligned to 16-byte boundaries as far as possible.
   * To accomplish this, we align extra memory and then set the first
   * pointer on each row to point to 4 bytes before a boundary.
   * This means element 1, which is the first one we work on, will be
   * on a 16-byte boundary. We still make sure we 'own' the three ints
   * (12 bytes) before, though, so we can load the vector with element 0
   * cache-aligned too.
   * The real pointers to memory are kept in xmx_mem,mmx_mem,imx_mem,dmx_mem.
   */
  mx->xmx_mem = (void *) MallocOrDie (sizeof(int) * (N+1)*(5 + 16));
  mx->mmx_mem = (void *) MallocOrDie (sizeof(int) * (N+1)*(M+2+16));
  mx->imx_mem = (void *) MallocOrDie (sizeof(int) * (N+1)*(M+2+16));
  mx->dmx_mem = (void *) MallocOrDie (sizeof(int) * (N+1)*(M+2+16));

  mx->xmx[0] = (int *) (((((unsigned long int) mx->xmx_mem) + 15) & (~0xf)) + 12);
  mx->mmx[0] = (int *) (((((unsigned long int) mx->mmx_mem) + 15) & (~0xf)) + 12);
  mx->imx[0] = (int *) (((((unsigned long int) mx->imx_mem) + 15) & (~0xf)) + 12);
  mx->dmx[0] = (int *) (((((unsigned long int) mx->dmx_mem) + 15) & (~0xf)) + 12);

  /* And make sure the beginning of each row is aligned the same way */
  for (i = 1; i <= N; i++)
    {
      mx->xmx[i] = mx->xmx[0] + i*(5+11);    /* 16 ints per row keeps the alignment */
      n = 12 - (M+2)%4;
      mx->mmx[i] = mx->mmx[0] + i*(M+2+n);
      mx->imx[i] = mx->imx[0] + i*(M+2+n);
      mx->dmx[i] = mx->dmx[0] + i*(M+2+n);
    }
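  /* Row stride arithmetic (illustrative): n = 12 - (M+2)%4 pads each
   * mmx/imx/dmx row to M+2+n ints, always a multiple of 4 ints (16 bytes),
   * so every row keeps the same alignment as row 0. E.g. M = 5:
   * (5+2)%4 = 3, n = 9, stride = 16 ints. The xmx rows use a fixed
   * stride of 5+11 = 16 ints for the same reason.
   */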

  mx->maxN = N;
  mx->maxM = M;
  mx->padN = padN;
  mx->padM = padM;

  return mx;
}

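/* CreatePlan7MatrixHP() lays out the same dpmatrix_s structure as
 * CreatePlan7Matrix() above, but inside a caller-provided memory block at
 * fixed offsets (0x060000, 0x065000, ...) instead of heap allocations;
 * the caller is presumably responsible for sizing the block for N and M.
 */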
struct dpmatrix_s *
CreatePlan7MatrixHP(int N, int M, int padN, int padM, char* memaddr)
{
  struct dpmatrix_s *mx;
  int i,n;
  mx          = (struct dpmatrix_s *) (memaddr + 0x060000);
  mx->xmx     = (int **) (memaddr + 0x065000);
  mx->mmx     = (int **) (memaddr + 0x070000);
  mx->imx     = (int **) (memaddr + 0x075000);
  mx->dmx     = (int **) (memaddr + 0x080000);
  mx->xmx_mem = (void *) (memaddr + 0x100000);
  mx->mmx_mem = (void *) (memaddr + 0x150000);
  mx->imx_mem = (void *) (memaddr + 0x200000);
  mx->dmx_mem = (void *) (memaddr + 0x250000);

  mx->xmx[0] = (int *) (((((unsigned long int) mx->xmx_mem) + 15) & (~0xf)) + 12);
  mx->mmx[0] = (int *) (((((unsigned long int) mx->mmx_mem) + 15) & (~0xf)) + 12);
  mx->imx[0] = (int *) (((((unsigned long int) mx->imx_mem) + 15) & (~0xf)) + 12);
  mx->dmx[0] = (int *) (((((unsigned long int) mx->dmx_mem) + 15) & (~0xf)) + 12);

  /* And make sure the beginning of each row is aligned the same way */
  for (i = 1; i <= N; i++)
    {
      mx->xmx[i] = mx->xmx[0] + i*(5+11);    /* 16 ints per row keeps the alignment */
      n = 12 - (M+2)%4;
      mx->mmx[i] = mx->mmx[0] + i*(M+2+n);
      mx->imx[i] = mx->imx[0] + i*(M+2+n);
      mx->dmx[i] = mx->dmx[0] + i*(M+2+n);
    }

  mx->maxN = N;
  mx->maxM = M;
  mx->padN = padN;
  mx->padM = padM;

  return mx;
}




/* Function: ResizePlan7Matrix()
 *
 * Purpose:  Reallocate a dynamic programming matrix, if necessary,
 *           for a problem of NxM: sequence length N, model size M.
 *           (N=1 for small memory score-only variants; we allocate
 *           N+1 rows in the DP matrix.)
 *
 *           See additional comments in
 *           core_algorithms.c:ResizePlan7Matrix(), the normal version
 *           of this function. This version is only used in the
 *           Altivec (--enable-altivec) port.
 *
 *           Returns individual ptrs to the four matrix components
 *           as a convenience.
 *
 * Args:     mx    - an already allocated matrix to grow.
 *           N     - seq length to allocate for; N+1 rows
 *           M     - size of model
 *           xmx, mmx, imx, dmx
 *                 - RETURN: ptrs to four mx components as a convenience
 *
 * Return:   (void)
 *           mx is (re)allocated here.
 */
void
ResizePlan7Matrix(struct dpmatrix_s *mx, int N, int M,
                  int ***xmx, int ***mmx, int ***imx, int ***dmx)
{
  int i,n;

  if (N <= mx->maxN && M <= mx->maxM)
    {
      if (xmx != NULL) *xmx = mx->xmx;
      if (mmx != NULL) *mmx = mx->mmx;
      if (imx != NULL) *imx = mx->imx;
      if (dmx != NULL) *dmx = mx->dmx;
      return;
    }

  if (N > mx->maxN) {
    N += mx->padN;
    mx->maxN = N;
    mx->xmx = (int **) ReallocOrDie (mx->xmx, sizeof(int *) * (mx->maxN+1));
    mx->mmx = (int **) ReallocOrDie (mx->mmx, sizeof(int *) * (mx->maxN+1));
    mx->imx = (int **) ReallocOrDie (mx->imx, sizeof(int *) * (mx->maxN+1));
    mx->dmx = (int **) ReallocOrDie (mx->dmx, sizeof(int *) * (mx->maxN+1));
  }

  if (M > mx->maxM) {
    M += mx->padM;
    mx->maxM = M;
  }

  mx->xmx_mem = ReallocOrDie (mx->xmx_mem, sizeof(int) * (mx->maxN+1)*(5 + 16));
  mx->mmx_mem = ReallocOrDie (mx->mmx_mem, sizeof(int) * (mx->maxN+1)*(mx->maxM+2+16));
  mx->imx_mem = ReallocOrDie (mx->imx_mem, sizeof(int) * (mx->maxN+1)*(mx->maxM+2+16));
  mx->dmx_mem = ReallocOrDie (mx->dmx_mem, sizeof(int) * (mx->maxN+1)*(mx->maxM+2+16));

  mx->xmx[0] = (int *) (((((unsigned long int) mx->xmx_mem) + 15) & (~0xf)) + 12);
  mx->mmx[0] = (int *) (((((unsigned long int) mx->mmx_mem) + 15) & (~0xf)) + 12);
  mx->imx[0] = (int *) (((((unsigned long int) mx->imx_mem) + 15) & (~0xf)) + 12);
  mx->dmx[0] = (int *) (((((unsigned long int) mx->dmx_mem) + 15) & (~0xf)) + 12);

  /* And make sure the beginning of each row is aligned the same way */
  for (i = 1; i <= mx->maxN; i++)
    {
      mx->xmx[i] = mx->xmx[0] + i*(5+11);    /* 16 ints per row keeps the alignment */
      n = 12 - (mx->maxM+2)%4;
      mx->mmx[i] = mx->mmx[0] + i*(mx->maxM+2+n);
      mx->imx[i] = mx->imx[0] + i*(mx->maxM+2+n);
      mx->dmx[i] = mx->dmx[0] + i*(mx->maxM+2+n);
    }

  if (xmx != NULL) *xmx = mx->xmx;
  if (mmx != NULL) *mmx = mx->mmx;
  if (imx != NULL) *imx = mx->imx;
  if (dmx != NULL) *dmx = mx->dmx;
}


/* Function: P7Viterbi()
 *
 * Purpose:  The Viterbi dynamic programming algorithm; Altivec implementation
 *           by Erik Lindahl, Stanford University, 2002.
 *
 * Args:     dsq    - sequence in digitized form
 *           L      - length of dsq
 *           hmm    - the model
 *           mx     - DP matrix (may get grown here)
 *           ret_tr - RETURN: traceback; pass NULL if it's not wanted
 *
 * Return:   log P(S|M)/P(S|R), as a bit score
 */
/* This first version of P7Viterbi has been accelerated with Altivec vectorization.
 * On Apple hardware, it is up to a factor 10 faster than the old non-altivec version.
 */
typedef union {
  vector signed int v;
  int i[4];
} ivector;
void printivec(vector signed int z)
{
  ivector q;
  q.v = z;
  printf("%d %d %d %d\n", q.i[0], q.i[1], q.i[2], q.i[3]);
}
float
P7Viterbi(unsigned char *dsq, int L, struct plan7_s *hmm, struct dpmatrix_s *mx, struct p7trace_s **ret_tr)
{
  struct p7trace_s *tr;
  int **xmx;
  int **mmx;
  int **imx;
  int **dmx;
  int  *mmxi,*xmxi,*imxi,*dmxi;
  int  *lmmxi,*lxmxi,*limxi,*ldmxi;
  int  *p_tmm,*p_tim,*p_tdm,*p_bsc,*p_msc,*p_tmd,*p_tdd,*p_tmi,*p_tii,*p_isc,*p_esc;
  int   i,n,k;
  int   sc;
  /* gcc and motorola use different syntax for initializing vectors,
   * so we use a dummy variable instead to get an address to load from...
   */
  int t_lowscore = -INFTY;

  /* vector variables. We avoid the stupid gcc register spill/fill code by
   * limiting ourselves to 32 generic variables and making them all registers.
   * (This reuse is the reason for the generic variable names.)
   */
  register vector signed int v_lowscore;
  register vector signed int max_mmxesc;
  register vector signed int v_xmb;
  register vector unsigned int mask1;
  register vector unsigned int mask2;
  register vector unsigned int mask3;
  register vector unsigned int mask4;
  register vector signed int v_lmmx1;
  register vector signed int v_lmmx2;
  register vector signed int v_limx1;
  register vector signed int v_limx2;
  register vector signed int v_save_lmmx;
  register vector signed int v_save_ldmx;
  register vector signed int v_save_limx;
  register vector signed int v_save_mmx;
  register vector signed int v_save_dmx;
  register vector signed int v1;
  register vector signed int v2;
  register vector signed int v3;
  register vector signed int v4;
  register vector signed int v5;
  register vector signed int v6;
  register vector signed int v7;
  register vector signed int v8;
  register vector signed int v9;
  register vector signed int v10;
  register vector signed int v11;
  register vector signed int v12;
  register vector signed int v13;
  register vector signed int v14;
  register vector signed int v15;

  /* load (-infinity) to all four elements in v_lowscore */
  v_lowscore = vec_lde(0, &t_lowscore);
  mask1      = (vector unsigned int)vec_lvsl(0, &t_lowscore);
  v_lowscore = vec_perm(v_lowscore, v_lowscore, (vector unsigned char)mask1);
  v_lowscore = vec_splat(v_lowscore, 0);

  v1 = vec_splat_s32(-1);
  v2 = vec_splat_s32(0);
  mask1 = (vector unsigned int)vec_sld(v1, v2, 12);    /* FF in first pos, rest are 00 */
  mask2 = vec_sld(mask1, mask1, 12);
  mask3 = vec_sld(mask1, mask1, 8);
  mask4 = vec_sld(mask1, mask1, 4);
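  /* mask1..mask4 now each have all-ones in exactly one element (positions
   * 1..4 respectively) and zeros elsewhere; vec_sel() uses them below to
   * update a single lane at a time in the serialized delete-state updates.
   */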

  /* Make sure our DP matrix has 0..L rows, 0..M columns; grow it if needed.
   */
  ResizePlan7Matrix(mx, L, hmm->M, &xmx, &mmx, &imx, &dmx);

  /* Initialization of the zero row.
   */
  xmx[0][XMN] = 0;                                     /* S->N, p=1            */
  xmx[0][XMB] = hmm->xsc[XTN][MOVE];                   /* S->N->B, no N-tail   */
  xmx[0][XME] = xmx[0][XMC] = xmx[0][XMJ] = -INFTY;    /* need seq to get here */

  mmxi = mmx[0];
  imxi = imx[0];
  dmxi = dmx[0];
  xmxi = xmx[0];


  for (n = 0; n < 5+hmm->M; n += 4) {
    vec_st(v_lowscore, n*4, mmxi);
    vec_st(v_lowscore, n*4, imxi);
    vec_st(v_lowscore, n*4, dmxi);
  }
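  /* Note: vec_st() ignores the low four bits of its effective address, so
   * the stores above land on the 16-byte boundary 12 bytes before element 0
   * of each row; the pad ints in front of the row get set to -INFTY too.
   */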


  /* Fill data beyond M with -INFTY, so we can take the maximum including
   * elements with k>M.
   */
  for (k = 1+hmm->M; k <= (3+hmm->M); k++) {
    hmm->esc[k] = -INFTY;
    hmm->bsc[k] = -INFTY;
    for (i = 0; i < 7; i++)
      hmm->tsc[i][k] = -INFTY;
    for (i = 0; i < MAXCODE; i++) {
      hmm->msc[i][k] = -INFTY;
      hmm->isc[i][k] = -INFTY;
    }
  }

  /* Recursion. Done as a pull.
   * Note some slightly wasteful boundary conditions:
   *   tsc[0] = -INFTY for all seven transitions (no node 0)
   *   D_M and I_M are wastefully calculated (they don't exist)
   */

  for (i = 1; i <= L; i++) {
    /* pointers to last (i-1) row */
    lmmxi = mmxi;
    limxi = imxi;
    ldmxi = dmxi;
    lxmxi = xmxi;

    /* get pointers to this row */
    mmxi = mmx[i];
    imxi = imx[i];
    dmxi = dmx[i];
    xmxi = xmx[i];

    /* Set everything that doesn't depend on k here */

    /* load and splat (spread to all elements) XMX[i-1][XMB] */
    v13 = vec_lde(0, &(xmx[i-1][XMB]));
    v14 = (vector signed int)vec_lvsl(0, &(xmx[i-1][XMB]));
    v13 = vec_perm(v13, v13, (vector unsigned char)v14);
    v_xmb = vec_splat(v13, 0);
    p_tmm = hmm->tsc[TMM];
    p_tim = hmm->tsc[TIM];
    p_tdm = hmm->tsc[TDM];
    p_bsc = hmm->bsc;
    k = dsq[i];
    p_msc = hmm->msc[k];
    p_isc = hmm->isc[k];
    p_tmd = hmm->tsc[TMD];
    p_tdd = hmm->tsc[TDD];
    p_tmi = hmm->tsc[TMI];
    p_tii = hmm->tsc[TII];
    p_esc = hmm->esc;
    max_mmxesc = v_lowscore;

    /* Element 0 of each row sits 12 bytes past a 16-byte boundary, so one
     * aligned store fills the three pad ints plus element 0.
     */
    vec_st(v_lowscore, 0, mmxi);
    vec_st(v_lowscore, 0, imxi);
    vec_st(v_lowscore, 0, dmxi);

    /* Load the first (i.e. 'previous') vector on the last row for mmx,imx,dmx,
     * and load the first vector of dmx and mmx on this row.
     */
    v_save_lmmx = vec_ld(-12, lmmxi);
    v_save_limx = vec_ld(-12, limxi);
    v_save_ldmx = vec_ld(-12, ldmxi);
    v_save_mmx  = vec_ld(-12, mmxi);
    v_save_dmx  = vec_ld(-12, dmxi);
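    /* vec_ld() likewise truncates to 16-byte alignment, so the -12 offset
     * loads above fetch the aligned quadword holding the row pad plus
     * element 0, leaving element 0 in the last lane - exactly where the
     * vec_sld(saved, next, 12) shifts below expect it.
     */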

    /* we have allocated extra memory, so it is perfectly OK
     * to do the calculations for a couple of extra cells where
     * k>hmm->M. These cells just won't be used.
     */
    for (n = 4, k = 1; k < (hmm->M-3); n += 32, k += 8) {
      /* match state */

      /* 1: check which of mmx[i-1][k-1]+TMM and imx[i-1][k-1]+TIM is better,
       * but do it for 8 elements in parallel.
       * Since we are comparing with data on the previous row but one position
       * earlier, we have to shift stuff. Load two new vectors each round,
       * and use the saved register from last round.
       */
      /* load mmx data */
      v_lmmx1 = vec_ld(n,    lmmxi);
      v_lmmx2 = vec_ld(n+16, lmmxi);

      /* load imx data */
      v_limx1 = vec_ld(n,    limxi);
      v_limx2 = vec_ld(n+16, limxi);

      v5  = vec_ld(n,    ldmxi);    /* Load dmx data */
      v10 = vec_ld(n+16, ldmxi);

      /* shift mmx, imx & dmx data */
      v1 = vec_sld(v_save_lmmx, v_lmmx1, 12);
      v3 = vec_sld(v_save_limx, v_limx1, 12);
      v9 = vec_sld(v_save_ldmx, v5, 12);

      /* shift mmx, imx & dmx data */
      v2 = vec_sld(v_lmmx1, v_lmmx2, 12);
      v4 = vec_sld(v_limx1, v_limx2, 12);
      v_save_ldmx = v10;
      v10 = vec_sld(v5, v10, 12);

      v_save_lmmx = v_lmmx2;
      v_save_limx = v_limx2;

      /* v1,v2 now contain 8 elements with mmx[i-1][k-1],
       * v3,v4 contain 8 elements with imx[i-1][k-1],
       * and v9,v10 contain 8 elements with dmx[i-1][k-1].
       */
      /* load TMM, TIM & TDM entries from the HMM - these are aligned in memory */
      v5  = vec_ld(n-4,  p_tmm);
      v6  = vec_ld(n+12, p_tmm);
      v7  = vec_ld(n-4,  p_tim);
      v8  = vec_ld(n+12, p_tim);
      v11 = vec_ld(n-4,  p_tdm);
      v12 = vec_ld(n+12, p_tdm);
      /* load bsc[k] */
      v14 = vec_ld(n,    p_bsc);
      v15 = vec_ld(n+16, p_bsc);

      /* calc mmx+TMM, imx+TIM, dmx+TDM and XMX+bsc with saturated arithmetic, so
       * we don't wrap around when we add the large negative numbers used for -infinity.
       */
      v1  = vec_adds(v1, v5);
      v2  = vec_adds(v2, v6);
      v3  = vec_adds(v3, v7);
      v4  = vec_adds(v4, v8);
      v9  = vec_adds(v9, v11);
      v10 = vec_adds(v10, v12);
      v14 = vec_adds(v14, v_xmb);
      v15 = vec_adds(v15, v_xmb);
      /* Select max of mmx+TMM and imx+TIM in each element */
      v1 = vec_max(v1, v3);
      v2 = vec_max(v2, v4);
      /* select max of dmx+TDM and XMX+bsc */
      v9  = vec_max(v9, v14);
      v10 = vec_max(v10, v15);
      /* select max of the four alternatives */
      v1 = vec_max(v1, v9);
      v2 = vec_max(v2, v10);
      /* v1,v2 now contain the max values for the new mmx;
       * check if we should add msc.
       */

      //printf("L=%d M=%d n=%d k=%d i=%d dsq[i]=%d p_msc=hmm->msc[dsq[i]]=%d hmm->msc[dsq[i]]=%d\n",
      //       L,hmm->M,n,k,i,dsq[i],p_msc,hmm->msc[dsq[i]]);
      v3 = vec_ld(n,    p_msc);
      v4 = vec_ld(n+16, p_msc);

      v5 = (vector signed int)vec_cmpgt(v3, v_lowscore);
      v6 = (vector signed int)vec_cmpgt(v4, v_lowscore);

      /* load esc[k] */
      v9  = vec_ld(n,    p_esc);
      v10 = vec_ld(n+16, p_esc);

      v1 = vec_adds(v1, v3);
      v2 = vec_adds(v2, v4);
      v1 = vec_sel(v3, v1, (vector unsigned int)v5);
      v2 = vec_sel(v4, v2, (vector unsigned int)v6);

      /* have final values for mmx on this row in v1,v2 - store them */
      vec_st(v1, n,    mmxi);
      vec_st(v2, n+16, mmxi);

      v9  = vec_adds(v9, v1);
      v10 = vec_adds(v10, v2);
      v9  = vec_max(v9, v10);
      max_mmxesc = vec_max(v9, max_mmxesc);

      /* OK. We finished the match state. Normally we could now first
       * do the delete and then the insertion. The problem is that the
       * delete is a pain in the ass to vectorize, since each element
       * depends on the previous one in the vector. This means I have
       * to write this relatively simple operation as eight independent
       * ones, just as we would have done in a non-vectorized code.
       * Since this is independent of the insertion state changes, I
       * try to hide the latencies by doing the delete and insert
       * calculations in parallel.
       * To make things easier I add 'del' in a comment on each
       * line for calculations that are on the delete state, and 'ins'
       * for the calculations for the insertion. Hang on...
       */
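      /* For reference, the scalar recurrences being vectorized below
       * (cf. the commented-out P7Viterbi above):
       *   dc[k] = max(dc[k-1] + tsc[TDD][k-1], mc[k-1] + tsc[TMD][k-1])
       *   ic[k] = max(mpp[k] + tsc[TMI][k], ip[k] + tsc[TII][k]) + is[k]
       * The dc[k-1] dependence is what forces the lane-by-lane 'del' updates.
       */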

      /* We already have the match data on this row from the previous
       * iteration in v_save_mmx. Rotate it so the element that was in
       * pos 4 last iteration is in position 1, and pos 2-4 contain
       * the first three elements of mmx from this iteration.
       * And do the same type of rotation for v1/v2...
       */
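      /* E.g. v3 = vec_sld(v_save_mmx, v1, 12) yields
       * { v_save_mmx[3], v1[0], v1[1], v1[2] }, i.e. the mmx[i][k-1] values
       * needed for the mmx+TMD alternative of the delete state.
       */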

      v4 = vec_sld(v1, v2, 12);            /* del */
      v3 = vec_sld(v_save_mmx, v1, 12);    /* del */
      v_save_mmx = v2;                     /* Save for next iteration */

      /* Rotate last dmx data so we have the fourth element in pos 1. */
      v_save_dmx = vec_sld(v_save_dmx, v_save_dmx, 12);    /* del */

      /* load TMD & TDD data */
      v5 = vec_ld(n-4,  p_tmd);    /* del */
      v6 = vec_ld(n+12, p_tmd);    /* del */
      v7 = vec_ld(n-4,  p_tdd);    /* del */
      v8 = vec_ld(n+12, p_tdd);    /* del */

      /* calculate mmx+TMD */
      v3 = vec_adds(v3, v5);    /* del */
      v4 = vec_adds(v4, v6);    /* del */

      /* Start the ugly stuff. We have rotated last dmx data. Add TDD to
       * it and compare with v3/v4 (data from the mmx+TMD alternative), but
       * we only compare & replace the first position, so we use a mask!
       */

      /* First position: Add TDD to v_save_dmx */
      v_save_dmx = vec_adds(v_save_dmx, v7);    /* del */
      /* Select max of this and mmx+TMD and save it temporarily to v_save_dmx */
      v_save_dmx = vec_max(v_save_dmx, v3);     /* del */
      /* use a mask to select only the first element from v_save_dmx, rest from v3. */
      v3 = vec_sel(v3, v_save_dmx, mask1);      /* del */

      /* Start doing the insertion calculations in parallel.
       * Load the TMI data.
       */
      v9  = vec_ld(n,    p_tmi);    /* ins */
      v10 = vec_ld(n+16, p_tmi);    /* ins */
      /* Deletion:
       * Now we have an accurate pos 1 in v3. Continue with pos 2 the same
       * way. Rotate to a temporary array, add to TDD, compare and use a mask
       * to write back to only position 2.
       */
      v_save_dmx = vec_sld(v3, v3, 12);         /* del */
      v_save_dmx = vec_adds(v_save_dmx, v7);    /* del */
      v_save_dmx = vec_max(v_save_dmx, v3);     /* del */
      v3 = vec_sel(v3, v_save_dmx, mask2);      /* del */

      /* More insertion stuff - load TII data */
      v11 = vec_ld(n,    p_tii);    /* ins */
      v12 = vec_ld(n+16, p_tii);    /* ins */

      /* Deletion, position 3... */
      v_save_dmx = vec_sld(v3, v3, 12);         /* del */
      v_save_dmx = vec_adds(v_save_dmx, v7);    /* del */
      v_save_dmx = vec_max(v_save_dmx, v3);     /* del */
      v3 = vec_sel(v3, v_save_dmx, mask3);      /* del */

      /* insertion stuff: calculate mmx+TMI */
      v9  = vec_adds(v_lmmx1, v9);     /* ins */
      v10 = vec_adds(v_lmmx2, v10);    /* ins */

      /* Deletion, position 4 */
      v_save_dmx = vec_sld(v3, v3, 12);         /* del */
      v_save_dmx = vec_adds(v_save_dmx, v7);    /* del */
      v_save_dmx = vec_max(v_save_dmx, v3);     /* del */
      v3 = vec_sel(v3, v_save_dmx, mask4);      /* del */

      /* insertion stuff: calculate imx+TII */
      v11 = vec_adds(v_limx1, v11);    /* ins */
      v12 = vec_adds(v_limx2, v12);    /* ins */

      /* That was the first deletion vector, but we are unrolling.
       * The next step is position '5', i.e. the first in vector 2.
       * This one depends on the last position of vector 1, which we just finished.
       */
      v_save_dmx = vec_sld(v3, v3, 12);         /* del */
      v_save_dmx = vec_adds(v_save_dmx, v8);    /* del */
      v_save_dmx = vec_max(v_save_dmx, v4);     /* del */
      v4 = vec_sel(v4, v_save_dmx, mask1);      /* del */

      /* insertion stuff: select max of mmx+TMI and imx+TII */
      v9  = vec_max(v9, v11);     /* ins */
      v10 = vec_max(v10, v12);    /* ins */
      /* insertion stuff: load data from hmm->isc[tmpidx] */
      v11 = vec_ld(n,    p_isc);    /* ins */
      v12 = vec_ld(n+16, p_isc);    /* ins */

      /* position 6 (2 in vector 2) */
      v_save_dmx = vec_sld(v4, v4, 12);         /* del */
      v_save_dmx = vec_adds(v_save_dmx, v8);    /* del */
      v_save_dmx = vec_max(v_save_dmx, v4);     /* del */
      v4 = vec_sel(v4, v_save_dmx, mask2);      /* del */

      /* insertion: compare max of mmx+TMI, imx+TII with hmm->isc */
      v13 = (vector signed int)vec_cmpgt(v11, v_lowscore);
      v14 = (vector signed int)vec_cmpgt(v12, v_lowscore);

      /* position 7 (3 in vector 2) */
      v_save_dmx = vec_sld(v4, v4, 12);         /* del */
      v_save_dmx = vec_adds(v_save_dmx, v8);    /* del */
      v_save_dmx = vec_max(v_save_dmx, v4);     /* del */
      v4 = vec_sel(v4, v_save_dmx, mask3);      /* del */

      v9  = vec_adds(v9, v11);
      v10 = vec_adds(v10, v12);
      v9  = vec_sel(v11, v9, (vector unsigned int)v13);
      v10 = vec_sel(v12, v10, (vector unsigned int)v14);

      /* position 8 (4 in vector 2) */
      v_save_dmx = vec_sld(v4, v4, 12);             /* del */
      v_save_dmx = vec_adds(v_save_dmx, v8);        /* del */
      v_save_dmx = vec_max(v_save_dmx, v4);         /* del */
      v_save_dmx = vec_sel(v4, v_save_dmx, mask4);  /* del */

      /* Phew! That was the deletion... v3/v_save_dmx now contain the updated
       * dmx data; save it to memory. (v_save_dmx will be used next iteration.)
       */
      vec_st(v3, n, dmxi);               /* del */
      vec_st(v_save_dmx, n+16, dmxi);    /* del */

      /* save insertion too */
      vec_st(v9,  n,    imxi);
      vec_st(v10, n+16, imxi);

    }
    /* odd loop */
    if (k < (1+hmm->M)) {
      /* match state */
      /* 1: check which of mmx[i-1][k-1]+TMM and imx[i-1][k-1]+TIM is better,
       * but do it for 4 elements in parallel.
       * Since we are comparing with data on the previous row but one position
       * earlier, we have to shift stuff. Load one new vector each round,
       * and use the saved register from last round.
       */
      /* load mmx data */
      v_lmmx1 = vec_ld(n, lmmxi);

      /* load imx data */
      v_limx1 = vec_ld(n, limxi);

      v5 = vec_ld(n, ldmxi);    /* Load dmx data */

      /* shift mmx, imx & dmx data */
      v1 = vec_sld(v_save_lmmx, v_lmmx1, 12);
      v3 = vec_sld(v_save_limx, v_limx1, 12);
      v9 = vec_sld(v_save_ldmx, v5, 12);

      /* v1 now contains 4 elements with mmx[i-1][k-1],
       * v3 contains 4 elements with imx[i-1][k-1],
       * and v9 contains 4 elements with dmx[i-1][k-1].
       */
      /* load TMM, TIM & TDM entries from the HMM - these are aligned in memory */
      v5  = vec_ld(n-4, p_tmm);
      v7  = vec_ld(n-4, p_tim);
      v11 = vec_ld(n-4, p_tdm);
      /* load bsc[k] */
      v14 = vec_ld(n, p_bsc);

      /* calc mmx+TMM, imx+TIM, dmx+TDM and XMX+bsc with saturated arithmetic, so
       * we don't wrap around when we add the large negative numbers used for -infinity.
       */
      v1  = vec_adds(v1, v5);
      v3  = vec_adds(v3, v7);
      v9  = vec_adds(v9, v11);
      v14 = vec_adds(v14, v_xmb);
      /* Select max of mmx+TMM and imx+TIM in each element */
      v1 = vec_max(v1, v3);
      /* select max of dmx+TDM and XMX+bsc */
      v9 = vec_max(v9, v14);
      /* select max of the four alternatives */
      v1 = vec_max(v1, v9);
      /* v1 now contains the max values for the new mmx;
       * check if we should add msc.
       */

      v3 = vec_ld(n, p_msc);

      v5 = (vector signed int)vec_cmpgt(v3, v_lowscore);

      /* load esc[k] */
      v9 = vec_ld(n, p_esc);

      v1 = vec_adds(v1, v3);
      v1 = vec_sel(v3, v1, (vector unsigned int)v5);

      /* have final values for mmx on this row in v1 - store it */
      vec_st(v1, n, mmxi);

      v9 = vec_adds(v9, v1);
      max_mmxesc = vec_max(v9, max_mmxesc);

      /* OK. We finished the match state. Normally we could now first
       * do the delete and then the insertion. The problem is that the
       * delete is a pain in the ass to vectorize, since each element
       * depends on the previous one in the vector. This means I have
       * to write this relatively simple operation as four independent
       * ones, just as we would have done in a non-vectorized code.
       * Since this is independent of the insertion state changes, I
       * try to hide the latencies by doing the delete and insert
       * calculations in parallel.
       * To make things easier I add 'del' in a comment on each
       * line for calculations that are on the delete state, and 'ins'
       * for the calculations for the insertion. Hang on...
       */

      /* We already have the match data on this row from the previous
       * iteration in v_save_mmx. Rotate it so the element that was in
       * pos 4 last iteration is in position 1, and pos 2-4 contain
       * the first three elements of mmx from this iteration.
       */

      v3 = vec_sld(v_save_mmx, v1, 12);    /* del */

      /* Rotate last dmx data so we have the fourth element in pos 1. */
      v_save_dmx = vec_sld(v_save_dmx, v_save_dmx, 12);    /* del */

      /* load TMD & TDD data */
      v5 = vec_ld(n-4, p_tmd);    /* del */
      v7 = vec_ld(n-4, p_tdd);    /* del */

      /* calculate mmx+TMD */
      v3 = vec_adds(v3, v5);    /* del */

      /* Start the ugly stuff. We have rotated last dmx data. Add TDD to
       * it and compare with v3 (data from the mmx+TMD alternative), but
       * we only compare & replace the first position, so we use a mask!
       */

      /* First position: Add TDD to v_save_dmx */
      v_save_dmx = vec_adds(v_save_dmx, v7);    /* del */
      /* Select max of this and mmx+TMD and save it temporarily to v_save_dmx */
      v_save_dmx = vec_max(v_save_dmx, v3);     /* del */
      /* use a mask to select only the first element from v_save_dmx, rest from v3. */
      v3 = vec_sel(v3, v_save_dmx, mask1);      /* del */

      /* Start doing the insertion calculations in parallel.
       * Load the TMI data.
       */
      v9 = vec_ld(n, p_tmi);    /* ins */
      /* Deletion:
       * Now we have an accurate pos 1 in v3. Continue with pos 2 the same
       * way. Rotate to a temporary array, add to TDD, compare and use a mask
       * to write back to only position 2.
       */
      v_save_dmx = vec_sld(v3, v3, 12);         /* del */
      v_save_dmx = vec_adds(v_save_dmx, v7);    /* del */
      v_save_dmx = vec_max(v_save_dmx, v3);     /* del */
      v3 = vec_sel(v3, v_save_dmx, mask2);      /* del */

      /* More insertion stuff - load TII data */
      v11 = vec_ld(n, p_tii);    /* ins */

      /* Deletion, position 3... */
      v_save_dmx = vec_sld(v3, v3, 12);         /* del */
      v_save_dmx = vec_adds(v_save_dmx, v7);    /* del */
      v_save_dmx = vec_max(v_save_dmx, v3);     /* del */
      v3 = vec_sel(v3, v_save_dmx, mask3);      /* del */

      /* insertion stuff: calculate mmx+TMI */
      v9 = vec_adds(v_lmmx1, v9);    /* ins */

      /* Deletion, position 4 */
      v_save_dmx = vec_sld(v3, v3, 12);         /* del */
      v_save_dmx = vec_adds(v_save_dmx, v7);    /* del */
      v_save_dmx = vec_max(v_save_dmx, v3);     /* del */
      v3 = vec_sel(v3, v_save_dmx, mask4);      /* del */

      /* insertion stuff: calculate imx+TII */
      v11 = vec_adds(v_limx1, v11);    /* ins */

      /* insertion stuff: select max of mmx+TMI and imx+TII */
      v9 = vec_max(v9, v11);     /* ins */
      /* insertion stuff: load data from hmm->isc[tmpidx] */
      v11 = vec_ld(n, p_isc);    /* ins */

      /* insertion: compare max of mmx+TMI, imx+TII with hmm->isc */
      v13 = (vector signed int)vec_cmpgt(v11, v_lowscore);

      v9 = vec_adds(v9, v11);
      v9 = vec_sel(v11, v9, (vector unsigned int)v13);

      /* Phew! That was the deletion... v3 now contains the updated dmx data;
       * save it to memory.
       */
      vec_st(v3, n, dmxi);    /* del */

      /* save insertion too */
      vec_st(v9, n, imxi);
    }
    /* end of k loops */



    /* Now the special states. Order is important here.
     * Remember, C and J emissions are zero score by definition.
     */
    /* N state */
    xmx[i][XMN] = -INFTY;
    if ((sc = xmx[i-1][XMN] + hmm->xsc[XTN][LOOP]) > -INFTY)
      xmx[i][XMN] = sc;

    /* E state */
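    /* max_mmxesc holds four running maxima of mmx[i][k]+esc[k]; fold them
     * down to a single horizontal max and store it directly into xmx[i][XME].
     */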
    v2 = vec_sld(max_mmxesc, max_mmxesc, 8);
    v2 = vec_max(v2, max_mmxesc);
    v1 = vec_sld(v2, v2, 4);
    v1 = vec_max(v1, v2);
    vec_ste(v1, XME*4, xmxi);

    /* J state */
    xmxi[XMJ] = -INFTY;
    if ((sc = lxmxi[XMJ] + hmm->xsc[XTJ][LOOP]) > -INFTY)
      xmxi[XMJ] = sc;
    if ((sc = xmxi[XME] + hmm->xsc[XTE][LOOP]) > xmxi[XMJ])
      xmxi[XMJ] = sc;

    /* B state */
    xmxi[XMB] = -INFTY;
    if ((sc = xmxi[XMN] + hmm->xsc[XTN][MOVE]) > -INFTY)
      xmxi[XMB] = sc;
    if ((sc = xmxi[XMJ] + hmm->xsc[XTJ][MOVE]) > xmxi[XMB])
      xmxi[XMB] = sc;

    /* C state */
    xmxi[XMC] = -INFTY;
    if ((sc = lxmxi[XMC] + hmm->xsc[XTC][LOOP]) > -INFTY)
      xmxi[XMC] = sc;
    if ((sc = xmxi[XME] + hmm->xsc[XTE][MOVE]) > xmxi[XMC])
      xmxi[XMC] = sc;
  }
  /* T state (not stored) */
  sc = xmx[L][XMC] + hmm->xsc[XTC][MOVE];

  if (ret_tr != NULL) {
    P7ViterbiTrace(hmm, dsq, L, mx, &tr);
    *ret_tr = tr;
  }

  /* Note (Lindahl): we do NOT free the dpmatrix here anymore - the code was
   * spending 30% of the runtime allocating/freeing memory.
   * Provide a pointer to a dpmatrix_s structure to this routine,
   * and we try to reuse it. After the final call to P7Viterbi,
   * free it with FreePlan7Matrix.
   */
  return Scorify(sc);    /* the total Viterbi score. */
}
#endif /* the ALTIVEC port */