1 /* 16-bit signed integer dot product
2  * SSE2 version
3  * Copyright 2004 Phil Karn
4  * May be used under the terms of the GNU Lesser General Public License (LGPL)
5  */
6 #define _XOPEN_SOURCE 600
7 #include <stdlib.h>
8 #include <memory.h>
9 #include "fec.h"
10 
11 struct dotprod {
12   int len; /* Number of coefficients */
13 
14   /* On a SSE2 machine, these hold 8 copies of the coefficients,
15    * preshifted by 0,1,..7 words to meet all possible input data
16    * alignments (see Intel ap559 on MMX dot products).
17    */
18   signed short *coeffs[8];
19 };
20 
21 long dotprod_sse2_assist(signed short *a,signed short *b,int cnt);
22 
23 /* Create and return a descriptor for use with the dot product function */
initdp_sse2(signed short coeffs[],int len)24 void *initdp_sse2(signed short coeffs[],int len){
25   struct dotprod *dp;
26   int i,j,blksize;
27 
28   if(len == 0)
29     return NULL;
30 
31   dp = (struct dotprod *)calloc(1,sizeof(struct dotprod));
32   dp->len = len;
33 
34   /* Make 8 copies of coefficients, one for each data alignment,
35    * each aligned to 16-byte boundary
36    */
37   for(i=0;i<8;i++){
38     blksize = (1+(len+i-1)/8) * 8*sizeof(signed short);
39     posix_memalign((void **)&dp->coeffs[i],16,blksize);
40     memset(dp->coeffs[i],0,blksize);
41     for(j=0;j<len;j++)
42       dp->coeffs[i][j+i] = coeffs[j];
43   }
44   return (void *)dp;
45 }
46 
47 
48 /* Free a dot product descriptor created earlier */
freedp_sse2(void * p)49 void freedp_sse2(void *p){
50   struct dotprod *dp = (struct dotprod *)p;
51   int i;
52 
53   for(i=0;i<8;i++)
54     if(dp->coeffs[i] != NULL)
55       free(dp->coeffs[i]);
56   free(dp);
57 }
58 
59 /* Compute a dot product given a descriptor and an input array
60  * The length is taken from the descriptor
61  */
dotprod_sse2(void * p,signed short a[])62 long dotprod_sse2(void *p,signed short a[]){
63   struct dotprod *dp = (struct dotprod *)p;
64   int al;
65   signed short *ar;
66 
67   ar = (signed short *)((int)a & ~15);
68   al = a - ar;
69 
70   /* Call assembler routine to do the work, passing number of 8-word blocks */
71   return dotprod_sse2_assist(ar,dp->coeffs[al],(dp->len+al-1)/8+1);
72 }
73