/* K=9 r=1/2 Viterbi decoder with Intel SIMD * May 2001, Phil Karn, KA9Q */ #include #include #include #include "viterbi29.h" #include "parity.h" static int V29_init; int cpu_features(void); #if defined(SSE2) char id_viterbi29[] = "k=9 r=1/2 Viterbi decoder, SSE2 version"; #elif defined(SSE) char id_viterbi29[] = "k=9 r=1/2 Viterbi decoder, SSE version"; #elif defined(MMX) char id_viterbi29[] = "k=9 r=1/2 Viterbi decoder, MMX version"; #else char id_viterbi29[] = "k=9 r=1/2 Viterbi decoder, portable C version"; #endif #if defined(MMX) typedef union { long long p; char c[256]; } decision_t; #define EXTRACT_DECISION(d,state) ((d)->c[state] & 1) /* Combined tables used by mmxbfly */ unsigned char Mettab29_1[16][128] __attribute__ ((aligned(32))); unsigned char Mettab29_2[16][128] __attribute__ ((aligned(32))); #else typedef union { long long p; unsigned long w[8]; } decision_t; #define EXTRACT_DECISION(d,state) (((d)->w[state/32] >> (state%32)) & 1) /* Symbol branch table used by ssebfly and sse2bfly */ unsigned char Branchtab29_1[128] __attribute__ ((aligned(32))); unsigned char Branchtab29_2[128] __attribute__ ((aligned(32))); #endif /* State info for instance of Viterbi decoder * Don't change this without also changing references in (mmx|sse|sse2)bfly29.s! */ struct v29 { unsigned char metrics1[256]; /* path metric buffer 1 */ unsigned char metrics2[256]; /* path metric buffer 2 */ decision_t *dp; /* Pointer to decision output for current bit */ unsigned char *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */ decision_t *decisions; /* Beginning of decisions for block */ void *alloc_blk; /* Return value from malloc */ }; /* Create a new instance of a Viterbi decoder */ void *create_viterbi29(int len){ void *blk; struct v29 *vp; int state; if(!V29_init){ #if defined(SSE2) if(!(cpu_features() & (1 << 26))){ fprintf(stderr,"viterbi29: CPU does not support SSE2 instructions\n"); exit(1); } #elif defined(SSE) if(!(cpu_features() & (1 << 25))){ fprintf(stderr,"viterbi29: CPU does not support SSE instructions\n"); exit(1); } #elif defined(MMX) if(!(cpu_features() & (1 << 23))){ fprintf(stderr,"viterbi29: CPU does not support MMX instructions\n"); exit(1); } #endif /* Initialize metric tables */ for(state=0;state < 128;state++){ #if defined(MMX) int symbol; for(symbol = 0;symbol < 16;symbol++){ Mettab29_1[symbol][state] = parity((2*state) & V29POLYA) ? (15-symbol):symbol; Mettab29_2[symbol][state] = parity((2*state) & V29POLYB) ? (15-symbol):symbol; } #else Branchtab29_1[state] = parity((2*state) & V29POLYA) ? 15:0; Branchtab29_2[state] = parity((2*state) & V29POLYB) ? 15:0; #endif } V29_init = 1; } /* Malloc only guarantees 8-byte alignment, but we want to ensure that * the path metric arrays are on 32-byte boundaries. At least 16-byte * alignment is mandatory in the SSE2 version, but the Pentium III * cache line size is 32 bytes */ blk = malloc(sizeof(struct v29)+32); if((int)blk & 31){ /* Not on 32-byte boundary; shift up */ vp = (struct v29 *)(((int)blk + 32) & ~31); } else { vp = (struct v29 *)blk; } vp->alloc_blk = blk; /* Record original pointer from malloc for use by free() */ /* The decisions only need be 32-bit aligned */ #if defined(MMX) vp->dp = vp->decisions = malloc((len+8)*256); #else vp->dp = vp->decisions = malloc((len+8)*32); #endif vp->old_metrics = vp->metrics1; vp->new_metrics = vp->metrics2; return vp; } /* Initialize Viterbi decoder for start of new frame */ int init_viterbi29(void *p,int starting_state){ struct v29 *vp = p; memset(vp->metrics1,60,256); vp->old_metrics = vp->metrics1; vp->new_metrics = vp->metrics2; vp->dp = vp->decisions; vp->old_metrics[starting_state & 255] = 0; /* Bias known start state */ return 0; } /* Do Viterbi chainback */ int chainback_viterbi29( void *p, unsigned char *data, /* Decoded output data */ unsigned int nbits, /* Number of data bits */ unsigned int endstate){ /* Terminal encoder state */ struct v29 *vp = p; int k; decision_t *decisions = vp->decisions; /* Make room beyond the end of the encoder register so we can * accumulate a full byte of decoded data */ endstate %= 256; decisions += 8; /* Look past tail */ while(nbits-- != 0){ k = EXTRACT_DECISION(&decisions[nbits],endstate); /* The store into data[] only needs to be done every 8 bits. * But this avoids a conditional branch, and the writes will * combine in the cache anyway */ data[nbits>>3] = endstate = (endstate >> 1) | (k << 7); } return 0; } /* Delete instance of a Viterbi decoder */ void delete_viterbi29(void *p){ struct v29 *vp = p; if(vp != NULL){ free(vp->decisions); free(vp->alloc_blk); } } #if !defined(MMX) && !defined(SSE) & !defined(SSE2) /* C-language butterfly */ #define BFLY(i) {\ unsigned char metric,m0,m1,decision;\ metric = ((Branchtab29_1[i] ^ sym1) + (Branchtab29_2[i] ^ sym2) + 1)/2;\ m0 = vp->old_metrics[i] + metric;\ m1 = vp->old_metrics[i+128] + (15 - metric);\ decision = (m0-m1) >= 0;\ vp->new_metrics[2*i] = decision ? m1 : m0;\ vp->dp->w[i/16] |= decision << ((2*i)&31);\ m0 -= (metric+metric-15);\ m1 += (metric+metric-15);\ decision = (m0-m1) >= 0;\ vp->new_metrics[2*i+1] = decision ? m1 : m0;\ vp->dp->w[i/16] |= decision << ((2*i+1)&31);\ } int update_viterbi29(void *p,unsigned char sym1,unsigned char sym2){ int i; struct v29 *vp = p; unsigned char *tmp; int normalize = 0; for(i=0;i<8;i++) vp->dp->w[i] = 0; for(i=0;i<128;i++) BFLY(i); /* Renormalize metrics */ if(vp->new_metrics[0] > 150){ int i; unsigned char minmetric = 255; for(i=0;i<64;i++) if(vp->new_metrics[i] < minmetric) minmetric = vp->new_metrics[i]; for(i=0;i<64;i++) vp->new_metrics[i] -= minmetric; normalize = minmetric; } vp->dp++; tmp = vp->old_metrics; vp->old_metrics = vp->new_metrics; vp->new_metrics = tmp; return normalize; } #endif void emms_viterbi29(void){ #if defined(MMX) || defined(SSE) asm("emms"); #endif }