1 // -*- c++ -*-
2 /* pconv.cu
3 experimental cuda opcodes
4 (c) Victor Lazzarini, 2013
5
6 based on M Puckette's pitch tracking algorithm.
7
8 This file is part of Csound.
9
10 The Csound Library is free software; you can redistribute it
11 and/or modify it under the terms of the GNU Lesser General Public
12 License as published by the Free Software Foundation; either
13 version 2.1 of the License, or (at your option) any later version.
14
15 Csound is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU Lesser General Public License for more details.
19
20 You should have received a copy of the GNU Lesser General Public
21 License along with Csound; if not, write to the Free Software
22 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
23 02110-1301 USA
24 */
25
26 #include <csdl.h>
27 #include <cufft.h>
28
29 /*
30 each kernel processes one bin
31 */
pconvol(float * out,float * in,float * coef,int rp,int dftsize,int nparts,int end)32 __global__ void pconvol(float *out,float *in,
33 float *coef,int rp, int dftsize,
34 int nparts, int end) {
35 float re,im,re2,im2;
36
37 /* thread count */
38 int t = (threadIdx.x + blockIdx.x*blockDim.x);
39
40 int k = t<<1; /* coef pos */
41 int n = k%(dftsize+2); /* inframe pos */
42
43 /* if beyond the buffer end, exit */
44 if(k >= end) return;
45 rp += k/(dftsize+2); /* rp pos */
46
47 /* select correct input buffer */
48 in += (rp < nparts ? rp : rp%nparts)*(dftsize+2);
49
50 re = coef[k]; im = coef[k+1];
51 re2 = in[n]; im2 = in[n+1];
52
53 /* complex multiplication + sums */
54 out[k] = re*re2 - im*im2;
55 out[k+1] = re*im2 + re2*im;
56
57 if(t > dftsize+1) return;
58 __syncthreads();
59 for(int i=dftsize+2; i < end; i+=(dftsize+2))
60 out[t] += out[t + i];
61
62
63 }
64
65 /* sample-by-sample overlap-save operation */
olapsave(float * buf,float * in,int parts)66 __global__ void olapsave(float *buf, float *in, int parts){
67 int n = (threadIdx.x + blockIdx.x*blockDim.x);
68 buf[n] = in[n] + buf[parts+n];
69 buf[parts+n] = in[parts+n];
70 }
71
72
73
74 typedef struct _pconv{
75 OPDS h;
76 MYFLT *aout, *asig, *ifn, *parts;
77 float *out, *coef, *in, *buf;
78 AUXCH bufin, bufout;
79 int wp, nparts, dftsize, cnt;
80 cufftHandle plan, iplan;
81 int threads, blocks, othreads, oblocks;
82 } PCONV;
83
84
isPowerOfTwo(unsigned int x)85 int isPowerOfTwo (unsigned int x)
86 {
87 return ((x != 0) && !(x & (x - 1)));
88 }
89
90
destroy_pconv(CSOUND * csound,void * pp)91 static int destroy_pconv(CSOUND *csound, void *pp){
92 PCONV *p = (PCONV *) pp;
93 cufftDestroy(p->plan);
94 cufftDestroy(p->iplan);
95 cudaFree(p->coef);
96 cudaFree(p->in);
97 cudaFree(p->out);
98 cudaFree(p->buf);
99 return OK;
100 }
101
102
pconv_init(CSOUND * csound,PCONV * p)103 int pconv_init(CSOUND *csound, PCONV *p){
104
105 FUNC *ftab = csound->FTnp2Find(csound, p->ifn);
106 float *tmp;
107 int tlen = ftab->flen;
108 int end, i, j, k, parts = *p->parts, dftsize, nparts;
109 MYFLT *tab = ftab->ftable;
110
111 if(!isPowerOfTwo(parts))
112 return csound->InitError(csound, "partition size needs to be power of two\n");
113
114 if(parts > tlen)
115 return csound->InitError(csound, "partition size too big \n");
116
117 end = tlen + parts - 1;
118
119 nparts = end / parts;
120 dftsize = parts << 1;
121 end = nparts*(dftsize+2);
122
123 cudaMalloc(&p->coef, sizeof(float)*end);
124 cudaMalloc(&p->in, sizeof(float)*end);
125 cudaMalloc(&p->out, sizeof(float)*end);
126 cudaMalloc(&p->buf, sizeof(float)*(dftsize));
127
128 cudaMemset(p->in,0,sizeof(float)*end);
129 cudaMemset(p->out, 0, sizeof(float)*end);
130 cudaMemset(p->buf, 0, sizeof(float)*(dftsize));
131 cudaMemset(p->coef, 0, sizeof(float)*end);
132
133 p->wp = 0;
134
135 if(!p->bufin.auxp || p->bufin.size < sizeof(float)*dftsize)
136 csound->AuxAlloc(csound, sizeof(float)*dftsize, &p->bufin);
137 if(!p->bufout.auxp || p->bufout.size < sizeof(float)*parts)
138 csound->AuxAlloc(csound, sizeof(float)*parts, &p->bufout);
139
140 memset(p->bufout.auxp, 0, sizeof(float)*parts);
141
142 cufftResult res;
143
144 tmp = (float *) p->bufin.auxp;
145 if((res = cufftPlan1d(&p->plan, dftsize, CUFFT_R2C, 1))
146 != CUFFT_SUCCESS) csound->Message(csound, "cuda fft setup error %d\n", res);
147 cufftSetCompatibilityMode(p->plan, CUFFT_COMPATIBILITY_NATIVE);
148 cufftPlan1d(&p->iplan, dftsize, CUFFT_C2R, 1);
149 cufftSetCompatibilityMode(p->iplan, CUFFT_COMPATIBILITY_NATIVE);
150
151
152 for(i =0, k=0; i < nparts; i++){
153 for(j=0; j < dftsize; j++)
154 tmp[j] = j < parts && k < tlen ? tab[k++] : 0.f;
155 float *pp = p->coef + (nparts - 1 - i)*(dftsize+2);
156 cudaMemcpy(pp, tmp, sizeof(float)*dftsize,
157 cudaMemcpyHostToDevice);
158 if((res = cufftExecR2C(p->plan,pp,(cufftComplex*)pp))
159 != CUFFT_SUCCESS) csound->Message(csound, "cuda fft error %d\n", res);
160 }
161
162 cudaDeviceSynchronize();
163 cudaDeviceProp deviceProp;
164 cudaGetDeviceProperties(&deviceProp, 0);
165 int blockspt = deviceProp.maxThreadsPerBlock;
166
167 end >>= 1;
168
169 p->blocks = end > blockspt ? ceil(end/blockspt) : 1;
170 p->threads = end > blockspt ? blockspt : end;
171 p->oblocks = parts > blockspt ? ceil(parts/blockspt) : 1;
172 p->othreads = parts > blockspt ? blockspt : parts;
173
174 csound->RegisterDeinitCallback(csound, p, destroy_pconv);
175
176 OPARMS parms;
177 csound->GetOParms(csound, &parms);
178 if(parms.odebug)
179 csound->Message(csound,
180 "blocks %d - threads/block %d - threads %d - dftsize %d - parts %d\n",
181 p->blocks, p->threads, end, dftsize, nparts);
182
183 p->nparts = nparts;
184 p->dftsize = dftsize;
185 p->cnt = 0;
186
187 return OK;
188 }
189
pconv_perf(CSOUND * csound,PCONV * p)190 int pconv_perf(CSOUND *csound, PCONV *p){
191
192 int dftsize = p->dftsize, cnt = p->cnt, wp = p->wp, nparts = p->nparts;
193 uint32_t offset = p->h.insdshead->ksmps_offset;
194 uint32_t early = p->h.insdshead->ksmps_no_end;
195 uint32_t n, nsmps = CS_KSMPS;
196 float *bufin = (float *) p->bufin.auxp, *bufout = (float *) p->bufout.auxp;
197 MYFLT *asig = p->asig, *aout = p->aout;
198 float *in = p->in, *out = p->out, *coef = p->coef, *buf = p->buf;
199 int end = nparts*(dftsize+2);
200 int parts = *p->parts;
201
202 if (UNLIKELY(offset)) memset(asig, '\0', offset*sizeof(MYFLT));
203 if (UNLIKELY(early)) {
204 nsmps -= early;
205 memset(&asig[nsmps], '\0', early*sizeof(MYFLT));
206 }
207
208 for(n = offset; n < nsmps; n++){
209 bufin[cnt] = (float) asig[n];
210 aout[n] = (MYFLT) bufout[cnt]/dftsize;
211
212 if(++cnt == parts){
213 /* in buffer pos */
214 int pos = wp*(dftsize+2);
215
216 /* increment delay line pos
217 so that it points to the oldest partition
218 */
219 wp += 1;
220 if(wp == nparts) wp = 0;
221
222 /* copy current buffer into newest partition */
223 cudaMemset(out, 0, sizeof(float)*(dftsize+2));
224 cudaMemcpy(&in[pos],bufin,sizeof(float)*dftsize,cudaMemcpyHostToDevice);
225
226 /* apply transform */
227 if(cufftExecR2C(p->plan,&in[pos],(cufftComplex*)&in[pos])
228 != CUFFT_SUCCESS) csound->Message(csound, "cuda in fft error\n");
229 if (cudaDeviceSynchronize() != cudaSuccess)
230 csound->Message(csound,"Cuda error: Failed to synchronize\n");
231
232 /* convolution */
233 pconvol<<<p->blocks,p->threads>>>(out, in, coef, wp, dftsize, nparts, end);
234
235 if (cudaDeviceSynchronize() != cudaSuccess)
236 csound->Message(csound,"Cuda error: Failed to synchronize\n");
237
238 /* transform output */
239 if(cufftExecC2R(p->iplan,(cufftComplex*)out,out)
240 != CUFFT_SUCCESS) csound->Message(csound, "cuda out fft error\n");
241
242 if (cudaDeviceSynchronize() != cudaSuccess)
243 csound->Message(csound,"Cuda error: Failed to synchronize\n");
244
245 /* overlap-save */
246 olapsave<<<p->oblocks,p->othreads>>>(buf,out,parts);
247
248 /* copy buffer out */
249 cudaMemcpy(bufout,buf, sizeof(float)*parts,cudaMemcpyDeviceToHost);
250
251 cnt = 0;
252 }
253 }
254 p->cnt = cnt;
255 p->wp = wp;
256 return OK;
257 }
258
259 static OENTRY localops[] = {
260 {"cudapconv", sizeof(PCONV),0, 5, "a", "aii", (SUBR) pconv_init, NULL,
261 (SUBR) pconv_perf},
262 };
263
264 extern "C" {
265 LINKAGE
266 }
267