1 // -*- c++ -*-
2 /* pconv.cu
3   experimental cuda opcodes
4   (c) Victor Lazzarini, 2013
5 
6   based on M Puckette's pitch tracking algorithm.
7 
8   This file is part of Csound.
9 
10   The Csound Library is free software; you can redistribute it
11   and/or modify it under the terms of the GNU Lesser General Public
12   License as published by the Free Software Foundation; either
13   version 2.1 of the License, or (at your option) any later version.
14 
15   Csound is distributed in the hope that it will be useful,
16   but WITHOUT ANY WARRANTY; without even the implied warranty of
17   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18   GNU Lesser General Public License for more details.
19 
20   You should have received a copy of the GNU Lesser General Public
21   License along with Csound; if not, write to the Free Software
22   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
23   02110-1301 USA
24 */
25 
26 #include <csdl.h>
27 #include <cufft.h>
28 
29 /*
30   each kernel processes one bin
31 */
pconvol(float * out,float * in,float * coef,int rp,int dftsize,int nparts,int end)32 __global__ void pconvol(float *out,float *in,
33 			float *coef,int rp, int dftsize,
34 			int nparts, int end) {
35   float re,im,re2,im2;
36 
37   /* thread count */
38   int t = (threadIdx.x + blockIdx.x*blockDim.x);
39 
40   int k = t<<1;       /* coef pos      */
41   int n = k%(dftsize+2);  /* inframe pos   */
42 
43   /* if beyond the buffer end, exit */
44   if(k >= end) return;
45   rp += k/(dftsize+2);       /*  rp pos */
46 
47   /* select correct input buffer */
48   in += (rp < nparts ? rp : rp%nparts)*(dftsize+2);
49 
50   re = coef[k]; im = coef[k+1];
51   re2 = in[n];  im2 = in[n+1];
52 
53   /* complex multiplication + sums */
54   out[k] = re*re2 - im*im2;
55   out[k+1] = re*im2 + re2*im;
56 
57   if(t > dftsize+1) return;
58   __syncthreads();
59     for(int i=dftsize+2; i < end; i+=(dftsize+2))
60       out[t] += out[t + i];
61 
62 
63 }
64 
65 /* sample-by-sample overlap-save operation */
olapsave(float * buf,float * in,int parts)66 __global__ void olapsave(float *buf, float *in, int parts){
67    int n = (threadIdx.x + blockIdx.x*blockDim.x);
68    buf[n] = in[n] + buf[parts+n];
69    buf[parts+n] = in[parts+n];
70 }
71 
72 
73 
74 typedef struct _pconv{
75   OPDS h;
76   MYFLT *aout, *asig, *ifn, *parts;
77   float *out, *coef, *in, *buf;
78   AUXCH  bufin, bufout;
79   int wp, nparts, dftsize, cnt;
80   cufftHandle plan, iplan;
81   int threads, blocks, othreads, oblocks;
82 } PCONV;
83 
84 
isPowerOfTwo(unsigned int x)85 int isPowerOfTwo (unsigned int x)
86 {
87   return ((x != 0) && !(x & (x - 1)));
88 }
89 
90 
destroy_pconv(CSOUND * csound,void * pp)91 static int destroy_pconv(CSOUND *csound, void *pp){
92   PCONV *p = (PCONV *) pp;
93   cufftDestroy(p->plan);
94   cufftDestroy(p->iplan);
95   cudaFree(p->coef);
96   cudaFree(p->in);
97   cudaFree(p->out);
98   cudaFree(p->buf);
99   return OK;
100 }
101 
102 
pconv_init(CSOUND * csound,PCONV * p)103 int pconv_init(CSOUND *csound, PCONV *p){
104 
105   FUNC *ftab = csound->FTnp2Find(csound, p->ifn);
106   float *tmp;
107   int tlen = ftab->flen;
108   int end, i, j, k, parts = *p->parts, dftsize, nparts;
109   MYFLT *tab = ftab->ftable;
110 
111   if(!isPowerOfTwo(parts))
112     return csound->InitError(csound, "partition size needs to be power of two\n");
113 
114   if(parts > tlen)
115      return csound->InitError(csound, "partition size too big \n");
116 
117   end = tlen + parts - 1;
118 
119   nparts = end / parts;
120   dftsize = parts << 1;
121   end = nparts*(dftsize+2);
122 
123   cudaMalloc(&p->coef, sizeof(float)*end);
124   cudaMalloc(&p->in, sizeof(float)*end);
125   cudaMalloc(&p->out, sizeof(float)*end);
126   cudaMalloc(&p->buf, sizeof(float)*(dftsize));
127 
128   cudaMemset(p->in,0,sizeof(float)*end);
129   cudaMemset(p->out, 0, sizeof(float)*end);
130   cudaMemset(p->buf, 0, sizeof(float)*(dftsize));
131   cudaMemset(p->coef, 0, sizeof(float)*end);
132 
133   p->wp = 0;
134 
135   if(!p->bufin.auxp || p->bufin.size < sizeof(float)*dftsize)
136      csound->AuxAlloc(csound, sizeof(float)*dftsize, &p->bufin);
137   if(!p->bufout.auxp || p->bufout.size < sizeof(float)*parts)
138      csound->AuxAlloc(csound, sizeof(float)*parts, &p->bufout);
139 
140   memset(p->bufout.auxp, 0, sizeof(float)*parts);
141 
142     cufftResult res;
143 
144   tmp = (float *) p->bufin.auxp;
145   if((res = cufftPlan1d(&p->plan, dftsize, CUFFT_R2C, 1))
146   != CUFFT_SUCCESS) csound->Message(csound, "cuda fft setup error %d\n", res);
147   cufftSetCompatibilityMode(p->plan, CUFFT_COMPATIBILITY_NATIVE);
148   cufftPlan1d(&p->iplan, dftsize, CUFFT_C2R, 1);
149   cufftSetCompatibilityMode(p->iplan, CUFFT_COMPATIBILITY_NATIVE);
150 
151 
152   for(i =0, k=0; i < nparts; i++){
153     for(j=0; j < dftsize; j++)
154       tmp[j] = j < parts && k < tlen ? tab[k++] : 0.f;
155       float *pp = p->coef + (nparts - 1 - i)*(dftsize+2);
156     cudaMemcpy(pp, tmp, sizeof(float)*dftsize,
157                cudaMemcpyHostToDevice);
158     if((res = cufftExecR2C(p->plan,pp,(cufftComplex*)pp))
159        != CUFFT_SUCCESS) csound->Message(csound, "cuda fft error %d\n", res);
160    }
161 
162   cudaDeviceSynchronize();
163   cudaDeviceProp deviceProp;
164   cudaGetDeviceProperties(&deviceProp, 0);
165   int blockspt = deviceProp.maxThreadsPerBlock;
166 
167   end >>= 1;
168 
169   p->blocks = end > blockspt ? ceil(end/blockspt) : 1;
170   p->threads = end > blockspt ? blockspt : end;
171   p->oblocks = parts > blockspt ? ceil(parts/blockspt) : 1;
172   p->othreads = parts > blockspt ? blockspt : parts;
173 
174   csound->RegisterDeinitCallback(csound, p, destroy_pconv);
175 
176   OPARMS parms;
177   csound->GetOParms(csound, &parms);
178   if(parms.odebug)
179    csound->Message(csound,
180      "blocks %d - threads/block %d - threads %d - dftsize %d - parts %d\n",
181 		   p->blocks, p->threads, end, dftsize, nparts);
182 
183   p->nparts = nparts;
184   p->dftsize = dftsize;
185   p->cnt = 0;
186 
187   return OK;
188 }
189 
pconv_perf(CSOUND * csound,PCONV * p)190 int pconv_perf(CSOUND *csound, PCONV *p){
191 
192   int dftsize = p->dftsize, cnt = p->cnt, wp = p->wp, nparts = p->nparts;
193   uint32_t offset = p->h.insdshead->ksmps_offset;
194   uint32_t early  = p->h.insdshead->ksmps_no_end;
195   uint32_t n, nsmps = CS_KSMPS;
196   float *bufin = (float *) p->bufin.auxp, *bufout = (float *) p->bufout.auxp;
197   MYFLT *asig = p->asig, *aout = p->aout;
198   float *in = p->in, *out = p->out, *coef = p->coef, *buf = p->buf;
199   int end = nparts*(dftsize+2);
200   int parts = *p->parts;
201 
202   if (UNLIKELY(offset)) memset(asig, '\0', offset*sizeof(MYFLT));
203   if (UNLIKELY(early)) {
204     nsmps -= early;
205     memset(&asig[nsmps], '\0', early*sizeof(MYFLT));
206   }
207 
208   for(n = offset; n < nsmps; n++){
209     bufin[cnt] = (float) asig[n];
210     aout[n] = (MYFLT) bufout[cnt]/dftsize;
211 
212     if(++cnt == parts){
213        /* in buffer pos */
214       int pos = wp*(dftsize+2);
215 
216        /* increment delay line pos
217           so that it points to the oldest partition
218        */
219        wp += 1;
220        if(wp == nparts) wp = 0;
221 
222        /* copy current buffer into newest partition */
223        cudaMemset(out, 0, sizeof(float)*(dftsize+2));
224        cudaMemcpy(&in[pos],bufin,sizeof(float)*dftsize,cudaMemcpyHostToDevice);
225 
226        /* apply transform */
227        if(cufftExecR2C(p->plan,&in[pos],(cufftComplex*)&in[pos])
228          != CUFFT_SUCCESS) csound->Message(csound, "cuda in fft error\n");
229        if (cudaDeviceSynchronize() != cudaSuccess)
230         csound->Message(csound,"Cuda error: Failed to synchronize\n");
231 
232        /* convolution */
233        pconvol<<<p->blocks,p->threads>>>(out, in, coef, wp, dftsize, nparts, end);
234 
235        if (cudaDeviceSynchronize() != cudaSuccess)
236         csound->Message(csound,"Cuda error: Failed to synchronize\n");
237 
238        /* transform output */
239        if(cufftExecC2R(p->iplan,(cufftComplex*)out,out)
240         != CUFFT_SUCCESS) csound->Message(csound, "cuda out fft error\n");
241 
242        if (cudaDeviceSynchronize() != cudaSuccess)
243         csound->Message(csound,"Cuda error: Failed to synchronize\n");
244 
245        /* overlap-save */
246        olapsave<<<p->oblocks,p->othreads>>>(buf,out,parts);
247 
248        /* copy buffer out */
249        cudaMemcpy(bufout,buf, sizeof(float)*parts,cudaMemcpyDeviceToHost);
250 
251        cnt = 0;
252     }
253   }
254   p->cnt = cnt;
255   p->wp = wp;
256   return OK;
257 }
258 
259 static OENTRY localops[] = {
260   {"cudapconv", sizeof(PCONV),0, 5, "a", "aii", (SUBR) pconv_init, NULL,
261     (SUBR) pconv_perf},
262 };
263 
264 extern "C" {
265   LINKAGE
266 }
267