/*****************************************************************
 * Unipro UGENE - Genome analysis suite
 * Copyright (C) 2008 Unipro, Russia (http://ugene.net)
 * All Rights Reserved
 *
 *     This source code is distributed under the terms of the
 *     GNU General Public License. See the files COPYING and LICENSE
 *     for details.
 *****************************************************************/
/************************************************************
 * HMMER - Biological sequence analysis with profile HMMs
 * Copyright (C) 1992-2003 Washington University School of Medicine
 * All Rights Reserved
 *
 *     This source code is distributed under the terms of the
 *     GNU General Public License. See the files COPYING and LICENSE
 *     for details.
 ************************************************************/

/* fast_algorithms.c
 * SRE, Sun Nov 10 08:54:48 2002 [AA 3080, Denver to StL]
 * CVS $Id: fast_algorithms.c,v 1.1.1.1 2008/01/17 02:32:19 jizhu Exp $
 *
 * Optimized routines to replace slower implementations in core_algorithms.c.
 *
 * The routines in core_algorithms.c are designed for clarity
 * and maintainability, not for speed. Implementations here
 * are designed for speed, not clarity. If you're trying to
 * understand the code, or optimize for a specific platform,
 * you are probably better off looking at core_algorithms.c.
 *
 * P7Viterbi() is the key function to target for optimization.
 * The implementation in core_algorithms.c is currently ifdef'ed
 * out of the code. The implementation that is used by default
 * is here, in fast_algorithms.c. A third implementation, from
 * Erik Lindahl at Stanford, is Mac/Altivec specific.
 *
 * Which implementation is used is controlled by ifdefs. The
 * default implementation uses a fast implementation of
 * P7Viterbi() from here. Other options (mutually exclusive):
 *
 * -DSLOW
 *   enable original core_algorithms.c code: slower than default,
 *   but might be easier to follow for someone trying
 *   to understand the DP code.
 * -DALTIVEC
 *   enable Erik Lindahl's Altivec code for Macintosh OSX
 */
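/* For illustration only: the choice is purely compile-time, so a build might
 * look like the following (a hedged sketch; the exact flags are assumptions,
 * not taken from the UGENE build files, and in this copy the Altivec path
 * below is additionally guarded by #ifdef ALTIVEC):
 *
 *   cc -DALTIVEC -maltivec -c fast_algorithms.c    (PowerPC, vectorized)
 *   cc -DSLOW -c fast_algorithms.c                 (portable reference code)
 */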
//#if !defined ALTIVEC && !defined SLOW
#ifdef ALTIVEC
#include "config.h"
//#include "squidconf.h"

#include "structs.h"
#include "funcs.h"
//#include "squid.h"
#include <altivec.h>
#include <stdio.h>
#include <malloc.h>
//
// /* the DEFAULT P7Viterbi() is portably optimized; code follows:
//  */
//
// /* Function: P7Viterbi() - portably optimized version
//  * Incept:   SRE, Fri Nov 15 13:14:33 2002 [St. Louis]
//  *
//  * Purpose:  The Viterbi dynamic programming algorithm.
//  *           Derived from core_algorithms.c:P7Viterbi().
//  *
//  * Args:     dsq    - sequence in digitized form
//  *           L      - length of dsq
//  *           hmm    - the model
//  *           mx     - re-used DP matrix
//  *           ret_tr - RETURN: traceback; pass NULL if it's not wanted
//  *
//  * Return:   log P(S|M)/P(S|R), as a bit score
//  */
// float
// P7Viterbi(unsigned char *dsq, int L, struct plan7_s *hmm, struct dpmatrix_s *mx, struct p7trace_s **ret_tr)
// {
//   struct p7trace_s  *tr;
//   int **xmx;
//   int **mmx;
//   int **imx;
//   int **dmx;
//   int   i,k;
//   int   sc;
//   int  *mc, *dc, *ic;        /* pointers to rows of mmx, dmx, imx */
//   int  *ms, *is;             /* pointers to msc[i], isc[i] */
//   int  *mpp, *mpc, *ip;      /* ptrs to mmx[i-1], mmx[i], imx[i-1] */
//   int  *bp;                  /* ptr into bsc[] */
//   int  *ep;                  /* ptr into esc[] */
//   int   xmb;                 /* value of xmx[i-1][XMB] */
//   int   xme;                 /* max for xmx[i][XME] */
//   int  *dpp;                 /* ptr into dmx[i-1] (previous row) */
//   int  *tpmm, *tpmi, *tpmd, *tpim, *tpii, *tpdm, *tpdd; /* ptrs into tsc */
//   int   M;
//
//   /* Make sure we have space for a DP matrix with 0..L rows, 0..M columns.
//    */
//   ResizePlan7Matrix(mx, L, hmm->M, &xmx, &mmx, &imx, &dmx);
//
//   /* Initialization of the zero row.
//    */
//   xmx[0][XMN] = 0;		                     /* S->N, p=1            */
//   xmx[0][XMB] = hmm->xsc[XTN][MOVE];                 /* S->N->B, no N-tail   */
//   xmx[0][XME] = xmx[0][XMC] = xmx[0][XMJ] = -INFTY;  /* need seq to get here */
//   for (k = 0; k <= hmm->M; k++)
//     mmx[0][k] = imx[0][k] = dmx[0][k] = -INFTY;      /* need seq to get here */
//
//   /* Initializations that help icc vectorize.
//    */
//   M        = hmm->M;
//
//   /* Recursion. Done as a pull.
//    * Note some slightly wasteful boundary conditions:
//    *    tsc[0] = -INFTY for all seven transitions (no node 0)
//    *    D_M and I_M are wastefully calculated (they don't exist)
//    */
//
//   tpmm  = hmm->tsc[TMM];
//   tpim  = hmm->tsc[TIM];
//   tpdm  = hmm->tsc[TDM];
//   tpmd  = hmm->tsc[TMD];
//   tpdd  = hmm->tsc[TDD];
//   tpmi  = hmm->tsc[TMI];
//   tpii  = hmm->tsc[TII];
//   bp    = hmm->bsc;
//   for (i = 1; i <= L; i++) {
//     mc    = mmx[i];
//     dc    = dmx[i];
//     ic    = imx[i];
//     mpp   = mmx[i-1];
//     dpp   = dmx[i-1];
//     ip    = imx[i-1];
//     xmb   = xmx[i-1][XMB];
//     ms    = hmm->msc[dsq[i]];
//     is    = hmm->isc[dsq[i]];
//     mc[0] = -INFTY;
//     dc[0] = -INFTY;
//     ic[0] = -INFTY;
//
//     for (k = 1; k <= M; k++) {
//       mc[k] = mpp[k-1]   + tpmm[k-1];
//       if ((sc = ip[k-1]  + tpim[k-1]) > mc[k])  mc[k] = sc;
//       if ((sc = dpp[k-1] + tpdm[k-1]) > mc[k])  mc[k] = sc;
//       if ((sc = xmb  + bp[k])         > mc[k])  mc[k] = sc;
//       mc[k] += ms[k];
//       if (mc[k] < -INFTY) mc[k] = -INFTY;
//
//       dc[k] = dc[k-1] + tpdd[k-1];
//       if ((sc = mc[k-1] + tpmd[k-1]) > dc[k]) dc[k] = sc;
//       if (dc[k] < -INFTY) dc[k] = -INFTY;
//
//       if (k < M) {
// 	ic[k] = mpp[k] + tpmi[k];
// 	if ((sc = ip[k] + tpii[k]) > ic[k]) ic[k] = sc;
// 	ic[k] += is[k];
// 	if (ic[k] < -INFTY) ic[k] = -INFTY;
//       }
//     }
//
//     /* Now the special states. Order is important here.
//      * Remember, C and J emissions are zero score by definition.
//      */
// 				/* N state */
//     xmx[i][XMN] = -INFTY;
//     if ((sc = xmx[i-1][XMN] + hmm->xsc[XTN][LOOP]) > -INFTY)
//       xmx[i][XMN] = sc;
//
// 				/* E state */
//     xme = -INFTY;
//     mpc = mmx[i];
//     ep  = hmm->esc;
//     for (k = 1; k <= hmm->M; k++)
//       if ((sc =  mpc[k] + ep[k]) > xme) xme = sc;
//     xmx[i][XME] = xme;
// 				/* J state */
//     xmx[i][XMJ] = -INFTY;
//     if ((sc = xmx[i-1][XMJ] + hmm->xsc[XTJ][LOOP]) > -INFTY)
//       xmx[i][XMJ] = sc;
//     if ((sc = xmx[i][XME]   + hmm->xsc[XTE][LOOP]) > xmx[i][XMJ])
//       xmx[i][XMJ] = sc;
//
// 				/* B state */
//     xmx[i][XMB] = -INFTY;
//     if ((sc = xmx[i][XMN] + hmm->xsc[XTN][MOVE]) > -INFTY)
//       xmx[i][XMB] = sc;
//     if ((sc = xmx[i][XMJ] + hmm->xsc[XTJ][MOVE]) > xmx[i][XMB])
//       xmx[i][XMB] = sc;
//
// 				/* C state */
//     xmx[i][XMC] = -INFTY;
//     if ((sc = xmx[i-1][XMC] + hmm->xsc[XTC][LOOP]) > -INFTY)
//       xmx[i][XMC] = sc;
//     if ((sc = xmx[i][XME] + hmm->xsc[XTE][MOVE]) > xmx[i][XMC])
//       xmx[i][XMC] = sc;
//   }
// 				/* T state (not stored) */
//   sc = xmx[L][XMC] + hmm->xsc[XTC][MOVE];
//
//   if (ret_tr != NULL) {
//     P7ViterbiTrace(hmm, dsq, L, mx, &tr);
//     *ret_tr = tr;
//   }
//
//   return Scorify(sc);		/* the total Viterbi score. */
// }
// #endif /*default P7Viterbi, used when ALTIVEC and SLOW are not defined*/


//#ifdef ALTIVEC
/*################################################################
 * The Altivec port, for Macintosh PowerPC.
 * Erik Lindahl, Stanford University, 2002.
 *
 * Replaces the following functions:
 *    AllocPlan7Body()      plan7.c                (data alignment on 16-byte boundaries)
 *    CreatePlan7Matrix()   core_algorithms.c      (data alignment on 16-byte boundaries)
 *    ResizePlan7Matrix()   core_algorithms.c      (data alignment on 16-byte boundaries)
 *    P7Viterbi()           core_algorithms.c      (total recode, w/ Altivec instructions)
 ################################################################*/
void
AllocPlan7Body(struct plan7_s *hmm, int M)
{
  int k, x;

  hmm->M = M;

  hmm->rf     = (char *)MallocOrDie ((M+2) * sizeof(char));
  hmm->cs     = (char *)MallocOrDie ((M+2) * sizeof(char));
  hmm->ca     = (char *)MallocOrDie ((M+2) * sizeof(char));
  hmm->map    = (int  *)MallocOrDie ((M+1) * sizeof(int));

  hmm->t      = (float **)MallocOrDie (M     *           sizeof(float *));
  hmm->tsc    = (int **)MallocOrDie (7     *           sizeof(int *));
  hmm->mat    = (float **)MallocOrDie ((M+1) *           sizeof(float *));
  hmm->ins    = (float **)MallocOrDie (M     *           sizeof(float *));
  hmm->msc    = (int **)MallocOrDie (MAXCODE   *       sizeof(int *));
  hmm->isc    = (int **)MallocOrDie (MAXCODE   *       sizeof(int *));

  hmm->t[0]   = (float *)MallocOrDie ((7*M)     *       sizeof(float));

  /* Allocate extra memory so tsc[TMM,TIM,TDM,TMD,TDD] start on the
   * 16-byte cache boundary, and tsc[TMI,TII] start
   * 12 bytes offset from the boundary.
   */
  //hmm->tsc_mem = MallocOrDie (((7*(M+16)))  *   sizeof(int));
  hmm->tsc_mem = (int *)memalign(16, (((7*(M+16)))  *   sizeof(int)));
  memset( hmm->tsc_mem, 0, ((7*(M+16)))  *   sizeof(int) );
  hmm->mat[0] = (float *)MallocOrDie ((MAXABET*(M+1)) * sizeof(float));
  hmm->ins[0] = (float *)MallocOrDie ((MAXABET*M) *     sizeof(float));
  /* Allocate extra mem. to make sure all members of msc,isc start
   * on 12-byte offsets from the cache boundary.
   */
  hmm->msc_mem = (int *)memalign(128,((MAXCODE*(M+1+16)) * sizeof(int)));
  memset( hmm->msc_mem, 0, (MAXCODE*(M+1+16)) * sizeof(int) );
  hmm->isc_mem = (int *)memalign(128,(((MAXCODE*(M+16)) *   sizeof(int))));
  memset( hmm->isc_mem, 0, (MAXCODE*(M+16)) *   sizeof(int) );
  //hmm->msc_mem = (int *)MallocOrDie((MAXCODE*(M+1+16)) * sizeof(int));
  //hmm->isc_mem = (int *)MallocOrDie((MAXCODE*(M+16)) *   sizeof(int));

  /* note allocation strategy for important 2D arrays -- trying
   * to keep locality as much as possible, cache efficiency etc.
   */
  for (k = 1; k <= M; k++) {
    hmm->mat[k] = hmm->mat[0] + k * MAXABET;
    if (k < M) {
      hmm->ins[k] = hmm->ins[0] + k * MAXABET;
      hmm->t[k]   = hmm->t[0]   + k * 7;
    }
  }

  /* align tsc pointers */
  hmm->tsc[TMM] = (int *) (((((unsigned long int) hmm->tsc_mem) + 15) & (~0xf)));
  hmm->tsc[TMI] = (int *) (((((unsigned long int) hmm->tsc_mem) + (M+12)*sizeof(int) + 15) & (~0xf)) + 12);
  hmm->tsc[TMD] = (int *) (((((unsigned long int) hmm->tsc_mem) + 2*(M+12)*sizeof(int) + 15) & (~0xf)));
  hmm->tsc[TIM] = (int *) (((((unsigned long int) hmm->tsc_mem) + 3*(M+12)*sizeof(int) + 15) & (~0xf)));
  hmm->tsc[TII] = (int *) (((((unsigned long int) hmm->tsc_mem) + 4*(M+12)*sizeof(int) + 15) & (~0xf)) + 12);
  hmm->tsc[TDM] = (int *) (((((unsigned long int) hmm->tsc_mem) + 5*(M+12)*sizeof(int) + 15) & (~0xf)));
  hmm->tsc[TDD] = (int *) (((((unsigned long int) hmm->tsc_mem) + 6*(M+12)*sizeof(int) + 15) & (~0xf)));

  for (x = 0; x < MAXCODE; x++) {
    hmm->msc[x] = (int *) (((((unsigned long int)hmm->msc_mem) + x*(M+1+12)*sizeof(int) + 15) & (~0xf)) + 12);
    hmm->isc[x] = (int *) (((((unsigned long int)hmm->isc_mem) + x*(M+12)*sizeof(int) + 15) & (~0xf)) + 12);
  }
  /* tsc[0] is used as a boundary condition sometimes [Viterbi()],
   * so set to -inf always.
   */
  for (x = 0; x < 7; x++)
    hmm->tsc[x][0] = -INFTY;

  hmm->begin  = (float *)MallocOrDie ((M+1) * sizeof(float));
  hmm->bsc_mem= (int *)memalign(128,((M+1+12) * sizeof(int)));
  memset( hmm->bsc_mem, 0, (M+1+12) * sizeof(int) );
  //hmm->bsc_mem= (int *)MallocOrDie ((M+1+12) * sizeof(int));
  hmm->end    = (float *)MallocOrDie ((M+1) * sizeof(float));
  //hmm->esc_mem= (int *)MallocOrDie ((M+1+12) * sizeof(int));
  hmm->esc_mem= (int *)memalign(128, ((M+1+12) * sizeof(int)));
  memset( hmm->esc_mem, 0, (M+1+12) * sizeof(int) );

  hmm->bsc = (int *) (((((unsigned long int) hmm->bsc_mem) + 15) & (~0xf)) + 12);
  hmm->esc = (int *) (((((unsigned long int) hmm->esc_mem) + 15) & (~0xf)) + 12);

  return;
}
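
/* A minimal sketch of the alignment idiom used throughout this file
 * (illustrative only; `mem` is a hypothetical raw allocation):
 *
 *   int *p = (int *) ((((unsigned long int) mem + 15) & ~0xf) + 12);
 *
 * (mem + 15) & ~0xf rounds mem up to the next 16-byte boundary, and the
 * extra +12 parks p four bytes before the boundary after that, so element
 * p[1] is 16-byte aligned. For example, mem = 0x1004 rounds up to 0x1010,
 * p = 0x101c, and &p[1] = 0x1020, a multiple of 16.
 */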


/* Function: CreatePlan7Matrix()
 *
 * Purpose:  Create a dynamic programming matrix for standard Forward,
 *           Backward, or Viterbi, with scores kept as scaled log-odds
 *           integers. Keeps 2D arrays compact in RAM in an attempt
 *           to maximize cache hits.
 *
 *           The mx structure can be dynamically grown, if a new
 *           HMM or seq exceeds the currently allocated size. Dynamic
 *           growing is more efficient than an alloc/free of a whole
 *           matrix for every new target. The ResizePlan7Matrix()
 *           call does this reallocation, if needed. Here, in the
 *           creation step, we set up some pads - to inform the resizing
 *           call how much to overallocate when it realloc's. If a pad
 *           is zero, we will not resize in that dimension.
 *
 * Args:     N     - N+1 rows are allocated, for sequence.
 *           M     - size of model in nodes
 *           padN  - over-realloc in seq/row dimension, or 0
 *           padM  - over-realloc in HMM/column dimension, or 0
 *
 * Return:   mx
 *           mx is allocated here. Caller frees with FreePlan7Matrix(mx).
 */
struct dpmatrix_s *
CreatePlan7Matrix(int N, int M, int padN, int padM)
{
  struct dpmatrix_s *mx;
  int i,n;

  mx         = (struct dpmatrix_s *) MallocOrDie (sizeof(struct dpmatrix_s));
  mx->xmx    = (int **) MallocOrDie (sizeof(int *) * (N+1));
  mx->mmx    = (int **) MallocOrDie (sizeof(int *) * (N+1));
  mx->imx    = (int **) MallocOrDie (sizeof(int *) * (N+1));
  mx->dmx    = (int **) MallocOrDie (sizeof(int *) * (N+1));

  /* For the memory accessed by the altivec routines, we want to have
   * accesses aligned to 16-byte boundaries as far as possible.
   * To accomplish this, we align extra memory and then set the first
   * pointer on each row to point to 4 bytes before a boundary.
   * This means element 1, which is the first one we work on, will be
   * on a 16-byte boundary. We still make sure we 'own' the 12 bytes
   * before, though, so we can load the vector with element 0 cache-aligned too.
   * The real pointers to memory are kept in xmx_mem, mmx_mem, imx_mem, dmx_mem.
   */
  mx->xmx_mem = (void *) MallocOrDie (sizeof(int) * (N+1)*(5 + 16));
  mx->mmx_mem = (void *) MallocOrDie (sizeof(int) * (N+1)*(M+2+16));
  mx->imx_mem = (void *) MallocOrDie (sizeof(int) * (N+1)*(M+2+16));
  mx->dmx_mem = (void *) MallocOrDie (sizeof(int) * (N+1)*(M+2+16));

  mx->xmx[0] = (int *) (((((unsigned long int) mx->xmx_mem) + 15) & (~0xf)) + 12);
  mx->mmx[0] = (int *) (((((unsigned long int) mx->mmx_mem) + 15) & (~0xf)) + 12);
  mx->imx[0] = (int *) (((((unsigned long int) mx->imx_mem) + 15) & (~0xf)) + 12);
  mx->dmx[0] = (int *) (((((unsigned long int) mx->dmx_mem) + 15) & (~0xf)) + 12);

  /* And make sure the beginning of each row is aligned the same way */
  for (i = 1; i <= N; i++)
    {
      mx->xmx[i] = mx->xmx[0] + i*(5+11); /* add 11 ints per row, making the stride (16 ints) divisible by 4 */
      n = 12 - (M+2)%4;
      mx->mmx[i] = mx->mmx[0] + i*(M+2+n);
      mx->imx[i] = mx->imx[0] + i*(M+2+n);
      mx->dmx[i] = mx->dmx[0] + i*(M+2+n);
    }

  mx->maxN = N;
  mx->maxM = M;
  mx->padN = padN;
  mx->padM = padM;

  return mx;
}
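
/* A hedged usage sketch (the pad values are illustrative, not prescribed by
 * this file): allocate once, let P7Viterbi() grow the matrix as needed via
 * ResizePlan7Matrix(), and free it only after the final search.
 *
 *   struct dpmatrix_s *mx = CreatePlan7Matrix(1, hmm->M, 25, 0);
 *   sc = P7Viterbi(dsq, L, hmm, mx, &tr);
 *   FreePlan7Matrix(mx);
 */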


struct dpmatrix_s *
CreatePlan7MatrixHP(int N, int M, int padN, int padM, char *memaddr)
{
  struct dpmatrix_s *mx;
  int i,n;
  mx         = (struct dpmatrix_s *) (memaddr + 0x060000);
  mx->xmx    = (int **) (memaddr + 0x065000);
  mx->mmx    = (int **) (memaddr + 0x070000);
  mx->imx    = (int **) (memaddr + 0x075000);
  mx->dmx    = (int **) (memaddr + 0x080000);
  mx->xmx_mem = (void *) (memaddr + 0x100000);
  mx->mmx_mem = (void *) (memaddr + 0x150000);
  mx->imx_mem = (void *) (memaddr + 0x200000);
  mx->dmx_mem = (void *) (memaddr + 0x250000);

  mx->xmx[0] = (int *) (((((unsigned long int) mx->xmx_mem) + 15) & (~0xf)) + 12);
  mx->mmx[0] = (int *) (((((unsigned long int) mx->mmx_mem) + 15) & (~0xf)) + 12);
  mx->imx[0] = (int *) (((((unsigned long int) mx->imx_mem) + 15) & (~0xf)) + 12);
  mx->dmx[0] = (int *) (((((unsigned long int) mx->dmx_mem) + 15) & (~0xf)) + 12);

  /* And make sure the beginning of each row is aligned the same way */
  for (i = 1; i <= N; i++)
    {
      mx->xmx[i] = mx->xmx[0] + i*(5+11); /* add 11 ints per row, making the stride (16 ints) divisible by 4 */
      n = 12 - (M+2)%4;
      mx->mmx[i] = mx->mmx[0] + i*(M+2+n);
      mx->imx[i] = mx->imx[0] + i*(M+2+n);
      mx->dmx[i] = mx->dmx[0] + i*(M+2+n);
    }

  mx->maxN = N;
  mx->maxM = M;
  mx->padN = padN;
  mx->padM = padM;

  return mx;
}
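
/* CreatePlan7MatrixHP() carves the matrix out of a caller-supplied region at
 * the fixed offsets above (0x060000 .. 0x250000), so memaddr must point to a
 * block covering the highest offset plus the dmx rows. A hedged sketch of a
 * call (the size below is an assumption, not derived from this file):
 *
 *   char *region = (char *) MallocOrDie(0x400000);
 *   struct dpmatrix_s *mx = CreatePlan7MatrixHP(N, M, 0, 0, region);
 */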


/* Function: ResizePlan7Matrix()
 *
 * Purpose:  Reallocate a dynamic programming matrix, if necessary,
 *           for a problem of NxM: sequence length N, model size M.
 *           (N=1 for small memory score-only variants; we allocate
 *           N+1 rows in the DP matrix.)
 *
 *           See additional comments in
 *           core_algorithms.c:ResizePlan7Matrix(), the normal version
 *           of this function. This version is only used in the
 *           Altivec (--enable-altivec) port.
 *
 *           Returns individual ptrs to the four matrix components
 *           as a convenience.
 *
 * Args:     mx    - an already allocated matrix to grow.
 *           N     - seq length to allocate for; N+1 rows
 *           M     - size of model
 *           xmx, mmx, imx, dmx
 *                 - RETURN: ptrs to four mx components as a convenience
 *
 * Return:   (void)
 *           mx is (re)allocated here.
 */
void
ResizePlan7Matrix(struct dpmatrix_s *mx, int N, int M,
		  int ***xmx, int ***mmx, int ***imx, int ***dmx)
{
  int i,n;

  if (N <= mx->maxN && M <= mx->maxM)
    {
      if (xmx != NULL) *xmx = mx->xmx;
      if (mmx != NULL) *mmx = mx->mmx;
      if (imx != NULL) *imx = mx->imx;
      if (dmx != NULL) *dmx = mx->dmx;
      return;
    }

  if (N > mx->maxN) {
    N          += mx->padN;
    mx->maxN    = N;
    mx->xmx     = (int **) ReallocOrDie (mx->xmx, sizeof(int *) * (mx->maxN+1));
    mx->mmx     = (int **) ReallocOrDie (mx->mmx, sizeof(int *) * (mx->maxN+1));
    mx->imx     = (int **) ReallocOrDie (mx->imx, sizeof(int *) * (mx->maxN+1));
    mx->dmx     = (int **) ReallocOrDie (mx->dmx, sizeof(int *) * (mx->maxN+1));
  }

  if (M > mx->maxM) {
    M += mx->padM;
    mx->maxM = M;
  }

  mx->xmx_mem = ReallocOrDie (mx->xmx_mem, sizeof(int) * (mx->maxN+1)*(5 + 16));
  mx->mmx_mem = ReallocOrDie (mx->mmx_mem, sizeof(int) * (mx->maxN+1)*(mx->maxM+2+16));
  mx->imx_mem = ReallocOrDie (mx->imx_mem, sizeof(int) * (mx->maxN+1)*(mx->maxM+2+16));
  mx->dmx_mem = ReallocOrDie (mx->dmx_mem, sizeof(int) * (mx->maxN+1)*(mx->maxM+2+16));

  mx->xmx[0] = (int *) (((((unsigned long int) mx->xmx_mem) + 15) & (~0xf)) + 12);
  mx->mmx[0] = (int *) (((((unsigned long int) mx->mmx_mem) + 15) & (~0xf)) + 12);
  mx->imx[0] = (int *) (((((unsigned long int) mx->imx_mem) + 15) & (~0xf)) + 12);
  mx->dmx[0] = (int *) (((((unsigned long int) mx->dmx_mem) + 15) & (~0xf)) + 12);

  /* And make sure the beginning of each row is aligned the same way */
  for (i = 1; i <= mx->maxN; i++)
    {
      mx->xmx[i] = mx->xmx[0] + i*(5+11); /* add 11 ints per row, making the stride (16 ints) divisible by 4 */
      n = 12 - (mx->maxM+2)%4;
      mx->mmx[i] = mx->mmx[0] + i*(mx->maxM+2+n);
      mx->imx[i] = mx->imx[0] + i*(mx->maxM+2+n);
      mx->dmx[i] = mx->dmx[0] + i*(mx->maxM+2+n);
    }

  if (xmx != NULL) *xmx = mx->xmx;
  if (mmx != NULL) *mmx = mx->mmx;
  if (imx != NULL) *imx = mx->imx;
  if (dmx != NULL) *dmx = mx->dmx;
}
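
/* Growth semantics, as a hedged example with illustrative numbers: if mx was
 * created with padN = 25 and a sequence of length N > maxN arrives, the call
 * below reallocates to N+25 rows, so the next few slightly longer sequences
 * will not trigger another realloc.
 *
 *   ResizePlan7Matrix(mx, L, hmm->M, &xmx, &mmx, &imx, &dmx);
 */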


/* Function: P7Viterbi()
 *
 * Purpose:  The Viterbi dynamic programming algorithm; Altivec implementation
 *           by Erik Lindahl, Stanford University, 2002.
 *
 * Args:     dsq    - sequence in digitized form
 *           L      - length of dsq
 *           hmm    - the model
 *           mx     - DP matrix (may get grown here)
 *           ret_tr - RETURN: traceback; pass NULL if it's not wanted
 *
 * Return:   log P(S|M)/P(S|R), as a bit score
 */
/* This first version of P7Viterbi has been accelerated with Altivec vectorization.
 * On Apple hardware, it is up to a factor of 10 faster than the old non-altivec version.
 */
typedef union {
  vector signed int v;
  int i[4];
} ivector;
void printivec(vector signed int z)
{
    ivector q;
    q.v=z;
    printf("%d  %d  %d  %d\n",q.i[0],q.i[1],q.i[2],q.i[3]);
}
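/* (debug helper; e.g. printivec(v_lowscore) would print the four lanes of a
 * vector - an illustrative call, not used by the code below) */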
float
P7Viterbi(unsigned char *dsq, int L, struct plan7_s *hmm, struct dpmatrix_s *mx, struct p7trace_s **ret_tr)
{
  struct p7trace_s  *tr;
  int **xmx;
  int **mmx;
  int **imx;
  int **dmx;
  int  *mmxi,*xmxi,*imxi,*dmxi;
  int  *lmmxi,*lxmxi,*limxi,*ldmxi;
  int  *p_tmm,*p_tim,*p_tdm,*p_bsc,*p_msc,*p_tmd,*p_tdd,*p_tmi,*p_tii,*p_isc,*p_esc;
  int   i,n,k;
  int   sc;
  /* gcc and motorola use different syntax for initializing vectors,
   * so we use a dummy variable instead to get an address to load from...
   */
  int t_lowscore = -INFTY;

  /* vector variables. We avoid the stupid gcc register spill/fill code by
   * limiting ourselves to 32 generic variables and making them all registers.
   * (This reuse is the reason for the generic variable names.)
   */
  register vector signed int v_lowscore;
  register vector signed int max_mmxesc;
  register vector signed int v_xmb;
  register vector unsigned int mask1;
  register vector unsigned int mask2;
  register vector unsigned int mask3;
  register vector unsigned int mask4;
  register vector signed int v_lmmx1;
  register vector signed int v_lmmx2;
  register vector signed int v_limx1;
  register vector signed int v_limx2;
  register vector signed int v_save_lmmx;
  register vector signed int v_save_ldmx;
  register vector signed int v_save_limx;
  register vector signed int v_save_mmx;
  register vector signed int v_save_dmx;
  register vector signed int v1;
  register vector signed int v2;
  register vector signed int v3;
  register vector signed int v4;
  register vector signed int v5;
  register vector signed int v6;
  register vector signed int v7;
  register vector signed int v8;
  register vector signed int v9;
  register vector signed int v10;
  register vector signed int v11;
  register vector signed int v12;
  register vector signed int v13;
  register vector signed int v14;
  register vector signed int v15;

  /* load (-infinity) to all four elements in v_lowscore */
  v_lowscore      = vec_lde(0, &t_lowscore );
  mask1           = (vector unsigned int)vec_lvsl(0,&t_lowscore);
  v_lowscore      = vec_perm(v_lowscore,v_lowscore,(vector unsigned char)mask1);
  v_lowscore      = vec_splat(v_lowscore,0);

  v1 = vec_splat_s32(-1);
  v2 = vec_splat_s32(0);
  mask1 = (vector unsigned int)vec_sld(v1,v2,12); /* FF in first pos, rest are 00 */
  mask2 = vec_sld(mask1,mask1,12);
  mask3 = vec_sld(mask1,mask1,8);
  mask4 = vec_sld(mask1,mask1,4);
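
  /* the four masks now each select a single 32-bit lane:
   * mask1 = {~0,0,0,0}, mask2 = {0,~0,0,0}, mask3 = {0,0,~0,0}, mask4 = {0,0,0,~0}
   */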

  /* Make sure our DP matrix has 0..L rows, 0..M columns; grow it if needed.
   */
  ResizePlan7Matrix(mx, L, hmm->M, &xmx, &mmx, &imx, &dmx);

  /* Initialization of the zero row.
   */
  xmx[0][XMN] = 0;		                     /* S->N, p=1            */
  xmx[0][XMB] = hmm->xsc[XTN][MOVE];                 /* S->N->B, no N-tail   */
  xmx[0][XME] = xmx[0][XMC] = xmx[0][XMJ] = -INFTY;  /* need seq to get here */

  mmxi=mmx[0];
  imxi=imx[0];
  dmxi=dmx[0];
  xmxi=xmx[0];

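  /* store -INFTY across row 0 of mmx/imx/dmx; vec_st writes four ints at a
   * time and takes a byte offset, hence the n*4 */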
  for (n = 0; n < 5+hmm->M; n+=4) {
    vec_st(v_lowscore, n*4, mmxi);
    vec_st(v_lowscore, n*4, imxi);
    vec_st(v_lowscore, n*4, dmxi);
  }

  /* Fill data beyond M with -INFTY, so we can take the maximum including
   * elements with k>M.
   */
  for(k=1+hmm->M;k<=(3+hmm->M);k++) {
      hmm->esc[k]=-INFTY;
      hmm->bsc[k]=-INFTY;
      for(i=0;i<7;i++)
	hmm->tsc[i][k]=-INFTY;
      for(i=0;i<MAXCODE;i++) {
	hmm->msc[i][k]=-INFTY;
	hmm->isc[i][k]=-INFTY;
      }
  }
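  /* (these k>M entries land in the extra padding AllocPlan7Body() reserved
   * past position M, so the writes stay in bounds) */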

  /* Recursion. Done as a pull.
   * Note some slightly wasteful boundary conditions:
   *    tsc[0] = -INFTY for all seven transitions (no node 0)
   *    D_M and I_M are wastefully calculated (they don't exist)
   */

  for (i = 1; i <= L; i++) {
    /* pointers to last (i-1) row */
    lmmxi=mmxi;
    limxi=imxi;
    ldmxi=dmxi;
    lxmxi=xmxi;

    /* get pointers to this row */
    mmxi=mmx[i];
    imxi=imx[i];
    dmxi=dmx[i];
    xmxi=xmx[i];

    /* Set everything that doesn't depend on k here */

    /* load and splat (spread to all elements) XMX[i-1][XMB] */
    v13   = vec_lde(0,&(xmx[i-1][XMB]));
    v14   = (vector signed int)vec_lvsl(0,&(xmx[i-1][XMB]));
    v13   = vec_perm(v13,v13,(vector unsigned char)v14);
    v_xmb = vec_splat(v13,0);
    p_tmm = hmm->tsc[TMM];
    p_tim = hmm->tsc[TIM];
    p_tdm = hmm->tsc[TDM];
    p_bsc = hmm->bsc;
    k = dsq[i];
    p_msc = hmm->msc[k];
    p_isc = hmm->isc[k];
    p_tmd = hmm->tsc[TMD];
    p_tdd = hmm->tsc[TDD];
    p_tmi = hmm->tsc[TMI];
    p_tii = hmm->tsc[TII];
    p_esc = hmm->esc;
    max_mmxesc = v_lowscore;

    /* element 0 of each row sits 12 bytes past a 16-byte boundary, so a
     * single vec_st at offset 0 (which rounds down to that boundary)
     * initializes element 0 and the padding before it.
     */
    vec_st(v_lowscore,0,mmxi);
    vec_st(v_lowscore,0,imxi);
    vec_st(v_lowscore,0,dmxi);

    /* Load the first (i.e. 'previous') vector on the last row for mmx,imx,dmx,
     * and load the first vector of dmx and mmx on this row.
     */
    v_save_lmmx = vec_ld(-12, lmmxi);
    v_save_limx = vec_ld(-12, limxi);
    v_save_ldmx = vec_ld(-12, ldmxi);
    v_save_mmx = vec_ld(-12, mmxi);
    v_save_dmx = vec_ld(-12, dmxi);

    /* we have allocated extra memory, so it is perfectly OK
     * to do the calculations for a couple of extra cells where
     * k>hmm->M. These cells just won't be used.
     */
    for (n = 4, k=1; k < (hmm->M-3) ; n+=32, k+=8) {
	/* match state */

	/* 1: check which of mmx[i-1][k-1]+TMM and imx[i-1][k-1]+TIM is better,
	 * but do it for 8 elements in parallel.
	 * Since we are comparing with data on the previous row but one position
	 * earlier, we have to shift stuff. Load two new vectors each round,
	 * and use the saved register from last round.
	 */
	/* load mmx data */
	v_lmmx1 = vec_ld(n,    lmmxi);
	v_lmmx2 = vec_ld(n+16, lmmxi);

	/* load imx data */
	v_limx1 = vec_ld(n,    limxi);
	v_limx2 = vec_ld(n+16, limxi);

	v5    = vec_ld(n,    ldmxi);	/* Load dmx data */
	v10   = vec_ld(n+16, ldmxi);

	/* shift mmx, imx & dmx data */
	v1    = vec_sld(v_save_lmmx,v_lmmx1,12);
	v3    = vec_sld(v_save_limx,v_limx1,12);
	v9    = vec_sld(v_save_ldmx,v5,12);

	/* shift mmx, imx & dmx data */
	v2    = vec_sld(v_lmmx1,v_lmmx2,12);
	v4    = vec_sld(v_limx1,v_limx2,12);
	v_save_ldmx = v10;
	v10   = vec_sld(v5,v10,12);

	v_save_lmmx = v_lmmx2;
	v_save_limx = v_limx2;

	/* v1,v2 now contain 8 elements with mmx[i-1][k-1],
	 * v3,v4 contain 8 elements with imx[i-1][k-1],
	 * and v9,v10 contain 8 elements with dmx[i-1][k-1].
	 */
	/* load TMM, TIM & TDM entries from the HMM - these are aligned in memory */
        v5    = vec_ld(n-4, p_tmm);
        v6    = vec_ld(n+12, p_tmm);
        v7    = vec_ld(n-4, p_tim);
        v8    = vec_ld(n+12, p_tim);
        v11   = vec_ld(n-4, p_tdm);
        v12   = vec_ld(n+12, p_tdm);
	/* load bsc[k] */
	v14   = vec_ld(n, p_bsc);
	v15   = vec_ld(n+16, p_bsc);

	/* calc mmx+TMM, imx+TIM, dmx+TDM and XMX+bsc with saturated arithmetic, so
	 * we don't wrap around when we add the large negative numbers used for -infinity.
	 */
	v1    = vec_adds(v1,v5);
	v2    = vec_adds(v2,v6);
	v3    = vec_adds(v3,v7);
	v4    = vec_adds(v4,v8);
	v9    = vec_adds(v9,v11);
	v10   = vec_adds(v10,v12);
	v14   = vec_adds(v14,v_xmb);
	v15   = vec_adds(v15,v_xmb);
	/* Select max of mmx+TMM and imx+TIM in each element */
	v1    = vec_max(v1,v3);
	v2    = vec_max(v2,v4);
	/* select max of dmx+TDM and XMX+bsc */
	v9    = vec_max(v9,v14);
	v10   = vec_max(v10,v15);
        /* select max of the four alternatives */
	v1    = vec_max(v1,v9);
	v2    = vec_max(v2,v10);
	/* v1,v2 now contain the max values for the new mmx;
	 * check if we should add msc.
	 */

	//printf("L=%d M=%d n=%d k=%d i=%d dsq[i]=%d p_msc=hmm->msc[dsq[i]]=%d hmm->msc[dsq[i]]=%d\n",
	//     L,hmm->M,n,k,i,dsq[i],p_msc,hmm->msc[dsq[i]]);
        v3    = vec_ld(n,    p_msc);
        v4    = vec_ld(n+16, p_msc);

        v5    = (vector signed int)vec_cmpgt(v3,v_lowscore);
        v6    = (vector signed int)vec_cmpgt(v4,v_lowscore);

	/* load esc[k] */
	v9    = vec_ld(n, p_esc);
	v10   = vec_ld(n+16, p_esc);

	v1    = vec_adds(v1,v3);
	v2    = vec_adds(v2,v4);
        v1    = vec_sel(v3,v1,(vector unsigned int)v5);
        v2    = vec_sel(v4,v2,(vector unsigned int)v6);

	/* have final values for mmx on this row in v1,v2 - store it */
	vec_st(v1, n,    mmxi);
	vec_st(v2, n+16, mmxi);

	v9    = vec_adds(v9,v1);
	v10   = vec_adds(v10,v2);
	v9    = vec_max(v9,v10);
	max_mmxesc = vec_max(v9,max_mmxesc);

	/* OK. We finished the match state. Normally we could now first
	 * do the delete and then the insertion. The problem is that the
	 * delete is a pain in the ass to vectorize, since each element
	 * depends on the previous one in the vector. This means I have
	 * to write this relatively simple operation as eight independent
	 * ones, just as we would have done in a non-vectorized code.
	 * Since this is independent of the insertion state changes, I
	 * try to hide the latencies by doing the delete and insert
	 * calculations in parallel.
         * To make things easier I add 'del' in a comment on each
	 * line for calculations that are on the delete state, and 'ins'
	 * for the calculations for the insertion. Hang on...
	 */

	/* We already have the match data on this row from the previous
	 * iteration in v_save_mmx. Rotate it so the element that was in
	 * pos 4 last iteration is in position 1, and pos 2-4 contain
	 * the first three elements of mmx this iteration.
         * And do the same type of rotation for v1/v2...
	 */

	v4 = vec_sld(v1,v2,12); /* del */
        v3 = vec_sld(v_save_mmx,v1,12); /* del */
	v_save_mmx = v2; /* Save for next iteration */

	/* Rotate last dmx data so we have the fourth element in pos 1. */
	v_save_dmx = vec_sld(v_save_dmx,v_save_dmx,12); /* del */

        /* load TMD & TDD data */
        v5   = vec_ld(n-4, p_tmd); /* del */
	v6   = vec_ld(n+12, p_tmd); /* del */
        v7   = vec_ld(n-4, p_tdd); /* del */
	v8   = vec_ld(n+12, p_tdd); /* del */

	/* calculate mmx+TMD */
	v3   = vec_adds(v3,v5); /* del */
	v4   = vec_adds(v4,v6); /* del */

	/* Start the ugly stuff. We have rotated last dmx data. Add TDD to
	 * it and compare with v3/v4 (data from the mmx+TMD alternative), but
	 * we only compare & replace the first position, so we use a mask!
	 */

        /* First position: Add TDD to v_save_dmx */
	v_save_dmx = vec_adds(v_save_dmx,v7); /* del */
	/* Select max of this and mmx+TMD and save it temporarily in v_save_dmx */
	v_save_dmx = vec_max(v_save_dmx,v3); /* del */
	/* use a mask to select only the first element from v_save_dmx, rest from v3. */
	v3     = vec_sel(v3,v_save_dmx,mask1); /* del */

	/* Start doing the insertion calculations in parallel.
	 * Load the TMI data.
	 */
        v9    = vec_ld(n   , p_tmi); /* ins */
	v10   = vec_ld(n+16, p_tmi); /* ins */
	/* Deletion:
         * Now we have an accurate pos 1 in v3. Continue with pos 2 the same
         * way. Rotate into a temporary, add TDD, compare, and use a mask
         * to write back to only position 2.
	 */
	v_save_dmx = vec_sld(v3,v3,12); /* del */
	v_save_dmx = vec_adds(v_save_dmx,v7); /* del */
	v_save_dmx = vec_max(v_save_dmx,v3); /* del */
	v3     = vec_sel(v3,v_save_dmx,mask2); /* del */

	/* More insertion stuff - load TII data */
	v11   = vec_ld(n   , p_tii); /* ins */
	v12   = vec_ld(n+16, p_tii); /* ins */

	/* Deletion, position 3... */
	v_save_dmx = vec_sld(v3,v3,12); /* del */
	v_save_dmx = vec_adds(v_save_dmx,v7); /* del */
	v_save_dmx = vec_max(v_save_dmx,v3); /* del */
	v3     = vec_sel(v3,v_save_dmx,mask3); /* del */

	/* insertion stuff: calculate mmx+TMI */
	v9     = vec_adds(v_lmmx1,v9); /* ins */
	v10    = vec_adds(v_lmmx2,v10); /* ins */

	/* Deletion, position 4 */
	v_save_dmx = vec_sld(v3,v3,12); /* del */
	v_save_dmx = vec_adds(v_save_dmx,v7); /* del */
	v_save_dmx = vec_max(v_save_dmx,v3); /* del */
	v3     = vec_sel(v3,v_save_dmx,mask4); /* del */

	/* insertion stuff: calculate imx+TII */
	v11    = vec_adds(v_limx1,v11); /* ins */
	v12    = vec_adds(v_limx2,v12); /* ins */

	/* That was the first deletion vector, but we are unrolling.
	 * The next step is position '5', i.e. the first in vector 2.
	 * This one depends on the last position of vector 1, which we just finished.
	 */
	v_save_dmx = vec_sld(v3,v3,12); /* del */
	v_save_dmx = vec_adds(v_save_dmx,v8); /* del */
	v_save_dmx = vec_max(v_save_dmx,v4); /* del */
	v4     = vec_sel(v4,v_save_dmx,mask1); /* del */

	/* insertion stuff: select max of mmx+TMI and imx+TII */
	v9     = vec_max(v9,v11); /* ins */
	v10    = vec_max(v10,v12); /* ins */
	/* insertion stuff: load data from hmm->isc[dsq[i]] (p_isc) */
	v11    = vec_ld(n, p_isc); /* ins */
	v12    = vec_ld(n+16, p_isc); /* ins */

	/* position 6 (2 in vector 2) */
	v_save_dmx = vec_sld(v4,v4,12); /* del */
	v_save_dmx = vec_adds(v_save_dmx,v8); /* del */
	v_save_dmx = vec_max(v_save_dmx,v4); /* del */
	v4     = vec_sel(v4,v_save_dmx,mask2); /* del */

	/* insertion: mask of positions where isc is above -INFTY (guards the add below) */
	v13    = (vector signed int)vec_cmpgt(v11,v_lowscore);
	v14    = (vector signed int)vec_cmpgt(v12,v_lowscore);

	/* position 7 (3 in vector 2) */
	v_save_dmx = vec_sld(v4,v4,12); /* del */
	v_save_dmx = vec_adds(v_save_dmx,v8); /* del */
	v_save_dmx = vec_max(v_save_dmx,v4); /* del */
	v4     = vec_sel(v4,v_save_dmx,mask3); /* del */

	v9     = vec_adds(v9,v11);
	v10    = vec_adds(v10,v12);
	v9     = vec_sel(v11,v9,(vector unsigned int)v13);
	v10    = vec_sel(v12,v10,(vector unsigned int)v14);

	/* position 8 (4 in vector 2) */
	v_save_dmx = vec_sld(v4,v4,12); /* del */
	v_save_dmx = vec_adds(v_save_dmx,v8); /* del */
	v_save_dmx = vec_max(v_save_dmx,v4); /* del */
	v_save_dmx = vec_sel(v4,v_save_dmx,mask4); /* del */

	/* Puh! that was the deletion... v3/v_save_dmx now contain the updated dmx data;
	 * save it to memory. (v_save_dmx will be used next iteration)
	 */
	vec_st(v3,        n, dmxi); /* del */
	vec_st(v_save_dmx, n+16, dmxi); /* del */

	/* save insertion too */
	vec_st(v9, n, imxi);
	vec_st(v10,n+16,imxi);

     }
    /* odd loop */
    if(k < (1+hmm->M)) {
	/* match state */
	/* 1: check which of mmx[i-1][k-1]+TMM and imx[i-1][k-1]+TIM is better,
	 * but do it for 4 elements in parallel.
	 * Since we are comparing with data on the previous row but one position
	 * earlier, we have to shift stuff. Load one new vector this round,
	 * and use the saved register from the last round.
	 */
	/* load mmx data */
	v_lmmx1 = vec_ld(n,    lmmxi);

	/* load imx data */
	v_limx1 = vec_ld(n,    limxi);

	v5    = vec_ld(n,    ldmxi);	/* Load dmx data */

	/* shift mmx, imx & dmx data */
	v1    = vec_sld(v_save_lmmx,v_lmmx1,12);
	v3    = vec_sld(v_save_limx,v_limx1,12);
	v9    = vec_sld(v_save_ldmx,v5,12);

	/* v1 now contains 4 elements with mmx[i-1][k-1],
	 * v3 contains 4 elements with imx[i-1][k-1],
	 * and v9 contains 4 elements with dmx[i-1][k-1].
	 */
	/* load TMM, TIM & TDM entries from the HMM - these are aligned in memory */
        v5    = vec_ld(n-4, p_tmm);
        v7    = vec_ld(n-4, p_tim);
        v11   = vec_ld(n-4, p_tdm);
	/* load bsc[k] */
	v14   = vec_ld(n, p_bsc);

	/* calc mmx+TMM, imx+TIM, dmx+TDM and XMX+bsc with saturated arithmetic, so
	 * we don't wrap around when we add the large negative numbers used for -infinity.
	 */
	v1    = vec_adds(v1,v5);
	v3    = vec_adds(v3,v7);
	v9    = vec_adds(v9,v11);
	v14   = vec_adds(v14,v_xmb);
	/* Select max of mmx+TMM and imx+TIM in each element */
	v1    = vec_max(v1,v3);
	/* select max of dmx+TDM and XMX+bsc */
	v9    = vec_max(v9,v14);
        /* select max of the four alternatives */
	v1    = vec_max(v1,v9);
	/* v1 now contains the max values for the new mmx;
	 * check if we should add msc.
	 */

        v3    = vec_ld(n,    p_msc);

        v5    = (vector signed int)vec_cmpgt(v3,v_lowscore);

	/* load esc[k] */
	v9    = vec_ld(n, p_esc);

	v1    = vec_adds(v1,v3);
        v1    = vec_sel(v3,v1,(vector unsigned int)v5);

	/* have final values for mmx on this row in v1 - store it */
	vec_st(v1, n,    mmxi);

	v9    = vec_adds(v9,v1);
	max_mmxesc = vec_max(v9,max_mmxesc);

	/* OK. We finished the match state. Normally we could now first
	 * do the delete and then the insertion. The problem is that the
	 * delete is a pain in the ass to vectorize, since each element
	 * depends on the previous one in the vector. This means I have
	 * to write this relatively simple operation as four independent
	 * ones, just as we would have done in a non-vectorized code.
	 * Since this is independent of the insertion state changes, I
	 * try to hide the latencies by doing the delete and insert
	 * calculations in parallel.
	 * To make things easier I add 'del' in a comment on each
	 * line for calculations that are on the delete state, and 'ins'
	 * for the calculations for the insertion. Hang on...
	 */

	/* We already have the match data on this row from the previous
	 * iteration in v_save_mmx. Rotate it so the element that was in
	 * pos 4 last iteration is in position 1, and pos 2-4 contain
	 * the first three elements of mmx this iteration.
	 */

        v3 = vec_sld(v_save_mmx,v1,12); /* del */

	/* Rotate last dmx data so we have the fourth element in pos 1. */
	v_save_dmx = vec_sld(v_save_dmx,v_save_dmx,12); /* del */

        /* load TMD & TDD data */
        v5   = vec_ld(n-4, p_tmd); /* del */
        v7   = vec_ld(n-4, p_tdd); /* del */

	/* calculate mmx+TMD */
	v3   = vec_adds(v3,v5); /* del */

	/* Start the ugly stuff. We have rotated last dmx data. Add TDD to
	 * it and compare with v3 (data from the mmx+TMD alternative), but
	 * we only compare & replace the first position, so we use a mask!
	 */

        /* First position: Add TDD to v_save_dmx */
	v_save_dmx = vec_adds(v_save_dmx,v7); /* del */
	/* Select max of this and mmx+TMD and save it temporarily in v_save_dmx */
	v_save_dmx = vec_max(v_save_dmx,v3); /* del */
	/* use a mask to select only the first element from v_save_dmx, rest from v3. */
	v3     = vec_sel(v3,v_save_dmx,mask1); /* del */

	/* Start doing the insertion calculations in parallel.
	 * Load the TMI data.
	 */
        v9    = vec_ld(n   , p_tmi); /* ins */
	/* Deletion:
	 * Now we have an accurate pos 1 in v3. Continue with pos 2 the same
	 * way. Rotate into a temporary, add TDD, compare, and use a mask
	 * to write back to only position 2.
	 */
	v_save_dmx = vec_sld(v3,v3,12); /* del */
	v_save_dmx = vec_adds(v_save_dmx,v7); /* del */
	v_save_dmx = vec_max(v_save_dmx,v3); /* del */
	v3     = vec_sel(v3,v_save_dmx,mask2); /* del */

	/* More insertion stuff - load TII data */
	v11   = vec_ld(n   , p_tii); /* ins */

	/* Deletion, position 3... */
	v_save_dmx = vec_sld(v3,v3,12); /* del */
	v_save_dmx = vec_adds(v_save_dmx,v7); /* del */
	v_save_dmx = vec_max(v_save_dmx,v3); /* del */
	v3     = vec_sel(v3,v_save_dmx,mask3); /* del */

	/* insertion stuff: calculate mmx+TMI */
	v9     = vec_adds(v_lmmx1,v9); /* ins */

	/* Deletion, position 4 */
	v_save_dmx = vec_sld(v3,v3,12); /* del */
	v_save_dmx = vec_adds(v_save_dmx,v7); /* del */
	v_save_dmx = vec_max(v_save_dmx,v3); /* del */
	v3     = vec_sel(v3,v_save_dmx,mask4); /* del */

	/* insertion stuff: calculate imx+TII */
	v11    = vec_adds(v_limx1,v11); /* ins */

	/* insertion stuff: select max of mmx+TMI and imx+TII */
	v9     = vec_max(v9,v11); /* ins */
	/* insertion stuff: load data from hmm->isc[dsq[i]] (p_isc) */
	v11    = vec_ld(n, p_isc); /* ins */

	/* insertion: mask of positions where isc is above -INFTY (guards the add below) */
	v13    = (vector signed int)vec_cmpgt(v11,v_lowscore);

	v9     = vec_adds(v9,v11);
	v9     = vec_sel(v11,v9,(vector unsigned int)v13);

	/* Puh! that was the deletion... v3 now contains the updated dmx data;
	 * save it to memory.
	 */
	vec_st(v3,        n, dmxi); /* del */

	/* save insertion too */
	vec_st(v9, n, imxi);
    }
    /* end of k loops */

    /* Now the special states. Order is important here;
     * remember, C and J emissions are zero score by definition.
     */
				/* N state */
    xmx[i][XMN] = -INFTY;
    if ((sc = xmx[i-1][XMN] + hmm->xsc[XTN][LOOP]) > -INFTY)
      xmx[i][XMN] = sc;

				/* E state */
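    /* horizontal max over the four lanes of max_mmxesc: rotate by 8 bytes and
     * take the max, then by 4 bytes, leaving the overall max in every lane;
     * vec_ste then stores one lane of the result at xmxi[XME]. */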
    v2 = vec_sld(max_mmxesc,max_mmxesc,8);
    v2 = vec_max(v2,max_mmxesc);
    v1 = vec_sld(v2,v2,4);
    v1 = vec_max(v1,v2);
    vec_ste(v1,XME*4,xmxi);

				/* J state */
    xmxi[XMJ] = -INFTY;
    if ((sc = lxmxi[XMJ] + hmm->xsc[XTJ][LOOP]) > -INFTY)
      xmxi[XMJ] = sc;
    if ((sc = xmxi[XME]   + hmm->xsc[XTE][LOOP]) > xmxi[XMJ])
      xmxi[XMJ] = sc;

				/* B state */
    xmxi[XMB] = -INFTY;
    if ((sc = xmxi[XMN] + hmm->xsc[XTN][MOVE]) > -INFTY)
      xmxi[XMB] = sc;
    if ((sc = xmxi[XMJ] + hmm->xsc[XTJ][MOVE]) > xmxi[XMB])
      xmxi[XMB] = sc;

				/* C state */
    xmxi[XMC] = -INFTY;
    if ((sc = lxmxi[XMC] + hmm->xsc[XTC][LOOP]) > -INFTY)
      xmxi[XMC] = sc;
    if ((sc = xmxi[XME] + hmm->xsc[XTE][MOVE]) > xmxi[XMC])
      xmxi[XMC] = sc;
  }
  /* T state (not stored) */
  sc = xmx[L][XMC] + hmm->xsc[XTC][MOVE];

  if (ret_tr != NULL) {
    P7ViterbiTrace(hmm, dsq, L, mx, &tr);
    *ret_tr = tr;
  }

  /* Note (Lindahl): we do NOT free the dpmatrix here anymore - the code was
   * spending 30% of the runtime allocating/freeing memory.
   * Provide a pointer to a dpmatrix_s structure to this routine,
   * and we try to reuse it. After the final call to P7Viterbi,
   * free it with FreePlan7Matrix.
   */
  return Scorify(sc);		/* the total Viterbi score. */
}
#endif /*the ALTIVEC port*/