1 /* K=9 r=1/2 Viterbi decoder with Intel SIMD
2  * May 2001, Phil Karn, KA9Q
3  */
4 #include <stdio.h>
5 #include <stdlib.h>
6 #include <memory.h>
7 #include "viterbi29.h"
8 #include "parity.h"
9 
10 static int V29_init;
11 int cpu_features(void);
12 
13 #if defined(SSE2)
14 char id_viterbi29[] = "k=9 r=1/2 Viterbi decoder, SSE2 version";
15 #elif defined(SSE)
16 char id_viterbi29[] = "k=9 r=1/2 Viterbi decoder, SSE version";
17 #elif defined(MMX)
18 char id_viterbi29[] = "k=9 r=1/2 Viterbi decoder, MMX version";
19 #else
20 char id_viterbi29[] = "k=9 r=1/2 Viterbi decoder, portable C version";
21 #endif
22 
23 #if defined(MMX)
24 
25 typedef union { long long p; char c[256]; } decision_t;
26 #define EXTRACT_DECISION(d,state) ((d)->c[state] & 1)
27 
28 /* Combined tables used by mmxbfly */
29 unsigned char Mettab29_1[16][128] __attribute__ ((aligned(32)));
30 unsigned char Mettab29_2[16][128] __attribute__ ((aligned(32)));
31 
32 #else
33 
34 typedef union { long long p; unsigned long w[8]; } decision_t;
35 #define EXTRACT_DECISION(d,state) (((d)->w[state/32] >> (state%32)) & 1)
36 
37 /* Symbol branch table used by ssebfly and sse2bfly */
38 unsigned char Branchtab29_1[128] __attribute__ ((aligned(32)));
39 unsigned char Branchtab29_2[128] __attribute__ ((aligned(32)));
40 
41 #endif
42 
43 /* State info for instance of Viterbi decoder
44  * Don't change this without also changing references in (mmx|sse|sse2)bfly29.s!
45  */
46 struct v29 {
47   unsigned char metrics1[256]; /* path metric buffer 1 */
48   unsigned char metrics2[256]; /* path metric buffer 2 */
49   decision_t *dp;              /* Pointer to decision output for current bit */
50   unsigned char *old_metrics,*new_metrics; /* Pointers to path metrics, swapped on every bit */
51   decision_t *decisions;       /* Beginning of decisions for block */
52   void *alloc_blk;             /* Return value from malloc */
53 };
54 
55 
56 /* Create a new instance of a Viterbi decoder */
create_viterbi29(int len)57 void *create_viterbi29(int len){
58   void *blk;
59   struct v29 *vp;
60   int state;
61 
62   if(!V29_init){
63 #if defined(SSE2)
64     if(!(cpu_features() & (1 << 26))){
65       fprintf(stderr,"viterbi29: CPU does not support SSE2 instructions\n");
66       exit(1);
67     }
68 #elif defined(SSE)
69     if(!(cpu_features() & (1 << 25))){
70       fprintf(stderr,"viterbi29: CPU does not support SSE instructions\n");
71       exit(1);
72     }
73 #elif defined(MMX)
74     if(!(cpu_features() & (1 << 23))){
75       fprintf(stderr,"viterbi29: CPU does not support MMX instructions\n");
76       exit(1);
77     }
78 #endif
79     /* Initialize metric tables */
80     for(state=0;state < 128;state++){
81 #if defined(MMX)
82       int symbol;
83       for(symbol = 0;symbol < 16;symbol++){
84 	Mettab29_1[symbol][state] = parity((2*state) & V29POLYA) ? (15-symbol):symbol;
85 	Mettab29_2[symbol][state] = parity((2*state) & V29POLYB) ? (15-symbol):symbol;
86       }
87 #else
88       Branchtab29_1[state] = parity((2*state) & V29POLYA) ? 15:0;
89       Branchtab29_2[state] = parity((2*state) & V29POLYB) ? 15:0;
90 #endif
91     }
92     V29_init = 1;
93   }
94   /* Malloc only guarantees 8-byte alignment, but we want to ensure that
95    * the path metric arrays are on 32-byte boundaries. At least 16-byte
96    * alignment is mandatory in the SSE2 version, but the Pentium III
97    * cache line size is 32 bytes
98    */
99   blk = malloc(sizeof(struct v29)+32);
100   if((int)blk & 31){
101     /* Not on 32-byte boundary; shift up */
102     vp = (struct v29 *)(((int)blk + 32) & ~31);
103   } else {
104     vp = (struct v29 *)blk;
105   }
106   vp->alloc_blk = blk; /* Record original pointer from malloc for use by free() */
107 
108   /* The decisions only need be 32-bit aligned */
109 #if defined(MMX)
110   vp->dp = vp->decisions = malloc((len+8)*256);
111 #else
112   vp->dp = vp->decisions = malloc((len+8)*32);
113 #endif
114 
115   vp->old_metrics = vp->metrics1;
116   vp->new_metrics = vp->metrics2;
117   return vp;
118 }
119 
120 /* Initialize Viterbi decoder for start of new frame */
init_viterbi29(void * p,int starting_state)121 int init_viterbi29(void *p,int starting_state){
122   struct v29 *vp = p;
123 
124   memset(vp->metrics1,60,256);
125   vp->old_metrics = vp->metrics1;
126   vp->new_metrics = vp->metrics2;
127   vp->dp = vp->decisions;
128   vp->old_metrics[starting_state & 255] = 0; /* Bias known start state */
129   return 0;
130 }
131 
132 /* Do Viterbi chainback */
chainback_viterbi29(void * p,unsigned char * data,unsigned int nbits,unsigned int endstate)133 int chainback_viterbi29(
134       void *p,
135       unsigned char *data, /* Decoded output data */
136       unsigned int nbits, /* Number of data bits */
137       unsigned int endstate){ /* Terminal encoder state */
138 
139   struct v29 *vp = p;
140   int k;
141   decision_t *decisions = vp->decisions;
142 
143   /* Make room beyond the end of the encoder register so we can
144    * accumulate a full byte of decoded data
145    */
146   endstate %= 256;
147   decisions += 8; /* Look past tail */
148 
149   while(nbits-- != 0){
150     k = EXTRACT_DECISION(&decisions[nbits],endstate);
151     /* The store into data[] only needs to be done every 8 bits.
152      * But this avoids a conditional branch, and the writes will
153      * combine in the cache anyway
154      */
155     data[nbits>>3] = endstate = (endstate >> 1) | (k << 7);
156   }
157   return 0;
158 }
159 
160 /* Delete instance of a Viterbi decoder */
delete_viterbi29(void * p)161 void delete_viterbi29(void *p){
162   struct v29 *vp = p;
163 
164   if(vp != NULL){
165     free(vp->decisions);
166     free(vp->alloc_blk);
167   }
168 }
169 
170 #if !defined(MMX) && !defined(SSE) & !defined(SSE2)
171 
172 /* C-language butterfly */
173 #define BFLY(i) {\
174 unsigned char metric,m0,m1,decision;\
175     metric = ((Branchtab29_1[i] ^ sym1) + (Branchtab29_2[i] ^ sym2) + 1)/2;\
176     m0 = vp->old_metrics[i] + metric;\
177     m1 = vp->old_metrics[i+128] + (15 - metric);\
178     decision = (m0-m1) >= 0;\
179     vp->new_metrics[2*i] = decision ? m1 : m0;\
180     vp->dp->w[i/16] |= decision << ((2*i)&31);\
181     m0 -= (metric+metric-15);\
182     m1 += (metric+metric-15);\
183     decision = (m0-m1) >= 0;\
184     vp->new_metrics[2*i+1] = decision ? m1 : m0;\
185     vp->dp->w[i/16] |= decision << ((2*i+1)&31);\
186 }
187 
update_viterbi29(void * p,unsigned char sym1,unsigned char sym2)188 int update_viterbi29(void *p,unsigned char sym1,unsigned char sym2){
189   int i;
190   struct v29 *vp = p;
191   unsigned char *tmp;
192   int normalize = 0;
193 
194   for(i=0;i<8;i++)
195     vp->dp->w[i] = 0;
196 
197   for(i=0;i<128;i++)
198     BFLY(i);
199 
200   /* Renormalize metrics */
201   if(vp->new_metrics[0] > 150){
202     int i;
203     unsigned char minmetric = 255;
204 
205     for(i=0;i<64;i++)
206       if(vp->new_metrics[i] < minmetric)
207 	minmetric = vp->new_metrics[i];
208     for(i=0;i<64;i++)
209       vp->new_metrics[i] -= minmetric;
210     normalize = minmetric;
211   }
212 
213   vp->dp++;
214   tmp = vp->old_metrics;
215   vp->old_metrics = vp->new_metrics;
216   vp->new_metrics = tmp;
217 
218   return normalize;
219 }
220 #endif
221 
emms_viterbi29(void)222 void emms_viterbi29(void){
223 #if defined(MMX) || defined(SSE)
224   asm("emms");
225 #endif
226 }
227 
228