1 // -*- c++ -*-
2 /* pvsops.cu
3 experimental cuda opcodes
4
5 (c) Victor Lazzarini, 2013
6
7 based on M Puckette's pitch tracking algorithm.
8
9 This file is part of Csound.
10
11 The Csound Library is free software; you can redistribute it
12 and/or modify it under the terms of the GNU Lesser General Public
13 License as published by the Free Software Foundation; either
14 version 2.1 of the License, or (at your option) any later version.
15
16 Csound is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU Lesser General Public License for more details.
20
21 You should have received a copy of the GNU Lesser General Public
22 License along with Csound; if not, write to the Free Software
23 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
24 02110-1301 USA
25 */
26
27 #include <csdl.h>
28 #include <cufft.h>
29
30 #include <pstream.h>
31
AuxCudaAlloc(int size,AUXCH * p)32 void AuxCudaAlloc(int size, AUXCH *p){
33 float *mem;
34 cudaMalloc(&mem, size);
35 cudaMemset(mem, 0, size);
36 p->auxp = mem;
37 p->size = size;
38 }
39
40
41 /* kernel to convert from pvs to rectangular frame */
frompvs(float * inframe,float * fsig,double * lastph,double scal,double fac)42 __global__ void frompvs(float* inframe, float* fsig, double* lastph,
43 double scal, double fac) {
44
45 int k = threadIdx.x + blockIdx.x*blockDim.x;
46 int i = k << 1;
47 float mag = fsig[i];
48 double delta = (fsig[i+1] - k*scal)*fac;
49 double phi = fmod(lastph[k] + delta, TWOPI);
50 lastph[k] = phi;
51 inframe[i] = (float) (mag*cos(phi));
52 inframe[i+1] = (float) (mag*sin(phi));
53 }
54
winrotate(float * inframe2,float * inframe,float * win,int N,int offset)55 __global__ void winrotate(float* inframe2, float* inframe, float *win,
56 int N, int offset){
57 int k = (threadIdx.x + blockIdx.x*blockDim.x);
58 inframe2[k] = win[k]*inframe[(k+offset)%N];
59 }
60
61 typedef struct _pvsyn{
62 OPDS h;
63 MYFLT *asig;
64 PVSDAT *fsig;
65 float *inframe; /* N */
66 float *inframe2;
67 double *lastph; /* N/2 */
68 float *win; /* N */
69 int framecount;
70 int curframe;
71 AUXCH frames;
72 AUXCH count;
73 cufftHandle plan;
74 double scal, fac;
75 int bblocks, nblocks;
76 int bthreads, nthreads;
77 } PVSYN;
78
79 static int destroy_pvsyn(CSOUND *csound, void *pp);
80
pvsynset(CSOUND * csound,PVSYN * p)81 static int pvsynset(CSOUND *csound, PVSYN *p){
82
83 int N = p->fsig->N;
84
85 if((N != 0) && !(N & (N - 1))) {
86 int hsize = p->fsig->overlap;
87 int size, numframes, i, blockspt;
88 MYFLT sum = 0.0, bins = N/2;
89 float *win;
90 cudaDeviceProp deviceProp;
91 cudaGetDeviceProperties(&deviceProp, 0);
92 blockspt = deviceProp.maxThreadsPerBlock;
93 csound->Message(csound, "CUDAsynth: using device %s (capability %d.%d)\n",
94 deviceProp.name,deviceProp.major, deviceProp.minor);
95
96 if(p->fsig->wintype != 1)
97 return csound->InitError(csound,
98 "window type not implemented yet\n");
99 numframes = N/hsize;
100 size = N*sizeof(float)*numframes;
101 if(p->frames.auxp == NULL ||
102 p->frames.size < size)
103 csound->AuxAlloc(csound, size, &p->frames);
104 memset(p->frames.auxp, 0, size);
105
106 size = sizeof(int)*numframes;
107 if(p->count.auxp == NULL ||
108 p->count.size < size)
109 csound->AuxAlloc(csound, size, &p->count);
110 *((int *)(p->count.auxp)) = 0;
111 for(i=1; i < numframes; i++)
112 ((int *)(p->count.auxp))[i] =
113 (i + (1.f - (float)i/numframes))*N;
114
115 size = (N+2)*sizeof(float);
116 cudaMalloc(&p->inframe, size);
117 cudaMemset(p->inframe, 0, size);
118 size = (N+2)*sizeof(float);
119 cudaMalloc(&p->inframe2, size);
120 size = (N/2)*sizeof(double);
121 cudaMalloc(&p->lastph, size);
122 cudaMemset(p->lastph, 0, size);
123 size = N*sizeof(float);
124 cudaMalloc(&p->win, size);
125
126 win = (float *) malloc(sizeof(float)*(N+1));
127 for(i=0; i <= N; i++)
128 win[i] = (float) (0.5 - 0.5*cos(i*TWOPI/N));
129
130 for(i = 0; i < N; i++) sum += win[i];
131 sum = FL(2.0) / sum;
132 for(i = 0; i < N; i++) win[i] *= sum;
133 sum = FL(0.0);
134 for(i = 0; i <= N; i+=hsize)
135 sum += win[i] * win[i];
136 sum = (1.0/N)/(sum);
137 for(i = 0; i < N; i++) win[i] *= 3*sum/sqrt(numframes);
138 cudaMemcpy(p->win,win,N*sizeof(float),
139 cudaMemcpyHostToDevice);
140 free(win);
141
142 p->framecount = 0;
143 p->curframe = 0;
144
145 p->fac = TWOPI*hsize/csound->GetSr(csound);
146 p->scal =csound->GetSr(csound)/N;
147 cufftPlan1d(&p->plan, N, CUFFT_C2R, 1);
148 cufftSetCompatibilityMode(p->plan, CUFFT_COMPATIBILITY_NATIVE);
149 csound->RegisterDeinitCallback(csound, p, destroy_pvsyn);
150
151 p->bblocks = bins > blockspt? bins/blockspt : 1;
152 p->nblocks = N > blockspt ? N/blockspt : 1;
153 p->bthreads = bins/p->bblocks;
154 p->nthreads = N/p->nblocks;
155 if(csound->GetDebug(csound))
156 csound->Message(csound, "%d (%d each), %d (%d each)\n",
157 p->nblocks, p->nthreads, p->bblocks, p->bthreads);
158 return OK;
159 }
160 return csound->InitError(csound, "fftsize not power-of-two \n");
161
162 }
163
pvsynperf(CSOUND * csound,PVSYN * p)164 static int pvsynperf(CSOUND *csound, PVSYN *p){
165
166 int N = p->fsig->N, i;
167 int hsize = p->fsig->overlap;
168 uint32_t offset = p->h.insdshead->ksmps_offset;
169 uint32_t early = p->h.insdshead->ksmps_no_end;
170 uint32_t n, nsmps = CS_KSMPS;
171 MYFLT *asig = p->asig;
172 float *frames = (float *) p->frames.auxp;
173 int framecount = p->framecount;
174 int numframes = N/hsize;
175 int *count = (int *) p->count.auxp;
176
177 if (UNLIKELY(offset)) memset(asig, '\0', offset*sizeof(MYFLT));
178 if (UNLIKELY(early)) {
179 nsmps -= early;
180 memset(&asig[nsmps], '\0', early*sizeof(MYFLT));
181 }
182
183 for(n=offset; n < nsmps; n++) {
184
185 if(framecount == 0) {
186 int curframe = p->curframe;
187 /* start offset for current frame */
188 int start = N*curframe;
189 float *cur = &(frames[start]);
190 float *win = (float *) p->win;
191 float *inframe = p->inframe;
192 float *inframe2 = p->inframe2;
193 float *fsig = (float *) p->fsig->frame.auxp;
194 /* perf pvs to rect conversion */
195 frompvs<<<p->bblocks,p->bthreads>>>(inframe,fsig,p->lastph,p->scal,p->fac);
196 /* execute inverse real FFT */
197 if(cufftExecC2R(p->plan,(cufftComplex*)inframe,inframe)
198 != CUFFT_SUCCESS) csound->Message(csound, "cuda fft error\n");
199 if (cudaDeviceSynchronize() != cudaSuccess)
200 csound->Message(csound,"Cuda error: Failed to synchronize\n");
201 /* window and rotate data on device */
202 winrotate<<<p->nblocks,p->nthreads>>>(inframe2,inframe,win,N,hsize*curframe);
203 /* copy data to current out frame */
204 cudaMemcpy(cur,inframe2,N*sizeof(float),cudaMemcpyDeviceToHost);
205 /* reset counter for this frame to the start */
206 count[curframe] = start;
207 /* move current to next frame circularly */
208 p->curframe = ++(curframe) == numframes ? 0 : curframe;
209 framecount = hsize;
210 }
211 asig[n] = FL(0.0);
212 for(i=0; i < numframes; i++){
213 /* overlap-add */
214 asig[n] += frames[count[i]];
215 count[i]++;
216 }
217 framecount--;
218 }
219 p->framecount = framecount;
220 return OK;
221 }
222
destroy_pvsyn(CSOUND * csound,void * pp)223 static int destroy_pvsyn(CSOUND *csound, void *pp){
224 PVSYN *p = (PVSYN *) pp;
225 cufftDestroy(p->plan);
226 cudaFree(p->inframe);
227 cudaFree(p->inframe2);
228 cudaFree(p->lastph);
229 cudaFree(p->win);
230 return OK;
231 }
232
modTwoPi(double x)233 __device__ double modTwoPi(double x)
234 {
235 x = fmod(x,TWOPI);
236 return x <= -PI ? x + TWOPI :
237 (x > PI ? x - TWOPI : x);
238 }
239
240 /* kernel to convert from rectangular to pvs frame */
topvs(float * fsig,float * aframe,double * oldph,double scal,double fac)241 __global__ void topvs(float *fsig, float *aframe, double* oldph,
242 double scal, double fac) {
243 int k = threadIdx.x + blockIdx.x*blockDim.x;
244 int i = k << 1;
245 float re = aframe[i], im = aframe[i+1];
246 float mag = sqrtf(re*re + im*im);
247 double phi = atan2f(im,re);
248 double delta = phi - oldph[k];
249 oldph[k] = phi;
250 fsig[i] = mag;
251 fsig[i+1] = (float) ((modTwoPi(delta) + k*scal)*fac);
252 }
253
rotatewin(float * aframe2,float * aframe,float * win,int N,int offset)254 __global__ void rotatewin(float* aframe2, float *aframe, float *win,
255 int N, int offset){
256 int k = threadIdx.x + blockIdx.x*blockDim.x;
257 aframe2[(k+offset)%N] = win[k]*aframe[k];
258 }
259
260 typedef struct _pvan {
261 OPDS h;
262 PVSDAT *fsig;
263 MYFLT *asig,*fftsize,*hsize,*winsize,*wintype;
264
265 float *aframe; /* N */
266 float *aframe2;
267 double *oldph; /* N/2 */
268 float *win; /* N */
269
270 int framecount;
271 int curframe;
272 AUXCH frames;
273 AUXCH count;
274 cufftHandle plan;
275 double scal, fac;
276 int bblocks, nblocks;
277 int bthreads, nthreads;
278 } PVAN;
279
280 static int destroy_pvanal(CSOUND *csound, void *pp);
281
pvanalset(CSOUND * csound,PVAN * p)282 static int pvanalset(CSOUND *csound, PVAN *p){
283
284 int N = *p->fftsize;
285 if((N != 0) && !(N & (N - 1))) {
286 int size, numframes, i, bins = N/2;
287 int hsize = *p->hsize, blockspt;
288 float *win;
289 cudaDeviceProp deviceProp;
290 cudaGetDeviceProperties(&deviceProp, 0);
291 blockspt = deviceProp.maxThreadsPerBlock;
292 csound->Message(csound, "CUDAnal: using device %s (capability %d.%d)\n",
293 deviceProp.name,deviceProp.major, deviceProp.minor);
294
295 p->fsig->N = N;
296 p->fsig->overlap = hsize;
297 p->fsig->format = -1;
298 /* ignore winsize & wintype */
299 p->fsig->winsize = N;
300 p->fsig->wintype = 1;
301 p->fsig->framecount = 0;
302
303 numframes = N/hsize;
304 size = N*sizeof(float)*numframes;
305 if(p->frames.auxp == NULL ||
306 p->frames.size < size)
307 csound->AuxAlloc(csound, size, &p->frames);
308 memset(p->frames.auxp, 0, size);
309
310 size = (N+2)*sizeof(float);
311 if(p->fsig->frame.auxp == NULL ||
312 p->fsig->frame.size < size)
313 AuxCudaAlloc(size, &p->fsig->frame);
314
315 size = sizeof(int)*numframes;
316 if(p->count.auxp == NULL ||
317 p->count.size < size)
318 csound->AuxAlloc(csound, size, &p->count);
319 *((int *)(p->count.auxp)) = 0;
320 for(i=1; i < numframes; i++)
321 ((int *)(p->count.auxp))[i] =
322 (i + (float)i/numframes)*N;
323
324 size = (N+2)*sizeof(float);
325 cudaMalloc(&p->aframe, size);
326 size = (N+2)*sizeof(float);
327 cudaMalloc(&p->aframe2, size);
328 size = (N/2)*sizeof(double);
329 cudaMalloc(&p->oldph, size);
330 cudaMemset(p->oldph, 0, size);
331 size = N*sizeof(float);
332 cudaMalloc(&p->win, size);
333
334
335 win = (float *) malloc(sizeof(float)*N);
336 for(i=0; i < N; i++)
337 win[i] = (float) (0.5 - 0.5*cos(i*TWOPI/N));
338 float sum = 0.0;
339 for(i = 0; i < N; i++) sum += win[i];
340 sum = FL(2.0) / sum;
341 for(i = 0; i < N; i++) win[i] *= sum;
342
343 cudaMemcpy(p->win,win,N*sizeof(float),
344 cudaMemcpyHostToDevice);
345 free(win);
346
347 p->framecount = 1;
348 p->curframe = numframes-1;
349 p->fac = csound->GetSr(csound)/(TWOPI*hsize);
350 p->scal = (TWOPI*hsize)/N;
351 cufftPlan1d(&p->plan, N, CUFFT_R2C, 1);
352 cufftSetCompatibilityMode(p->plan, CUFFT_COMPATIBILITY_NATIVE);
353 csound->RegisterDeinitCallback(csound, p, destroy_pvanal);
354
355 p->bblocks = bins > blockspt? bins/blockspt : 1;
356 p->nblocks = N > blockspt ? N/blockspt : 1;
357 p->bthreads = bins/p->bblocks;
358 p->nthreads = N/p->nblocks;
359 if(csound->GetDebug(csound))
360 csound->Message(csound, "%d (%d each), %d (%d each)\n",
361 p->nblocks, p->nthreads, p->bblocks, p->bthreads);
362 return OK;
363 }
364 return csound->InitError(csound, "fftsize not power-of-two \n");
365 }
366
pvanalperf(CSOUND * csound,PVAN * p)367 static int pvanalperf(CSOUND *csound, PVAN *p){
368
369 int N = p->fsig->N, i;
370 int hsize = p->fsig->overlap;
371 uint32_t offset = p->h.insdshead->ksmps_offset;
372 uint32_t early = p->h.insdshead->ksmps_no_end;
373 uint32_t n, nsmps = CS_KSMPS;
374 MYFLT *asig = p->asig;
375 float *frames = (float *) p->frames.auxp;
376 int framecount = p->framecount;
377 int numframes = N/hsize;
378 int *count = (int *) p->count.auxp;
379
380 if (UNLIKELY(offset)) memset(asig, '\0', offset*sizeof(MYFLT));
381 if (UNLIKELY(early)) {
382 nsmps -= early;
383 memset(&asig[nsmps], '\0', early*sizeof(MYFLT));
384 }
385
386 for(n=offset; n < nsmps; n++) {
387 for(i=0; i < numframes; i++){
388 frames[count[i]] = asig[n];
389 count[i]++;
390 }
391 framecount++;
392
393 if(framecount == hsize) {
394 int curframe = p->curframe;
395 /* start offset for current frame */
396 int start = N*curframe;
397 float *cur = &(frames[start]);
398 float *win = (float *) p->win;
399 float *aframe = p->aframe;
400 float *aframe2 = p->aframe2;
401 float *fsig = (float *) p->fsig->frame.auxp;
402 cudaMemcpy(aframe,cur,N*sizeof(float),
403 cudaMemcpyHostToDevice);
404 /* window and rotate data on device */
405 rotatewin<<<p->nblocks,p->nthreads>>>(aframe2,aframe,win,N,
406 hsize*(numframes-curframe));
407 /* execute inverse real FFT */
408 if(cufftExecR2C(p->plan,aframe2,(cufftComplex*)aframe2)
409 != CUFFT_SUCCESS) csound->Message(csound, "cuda fft error\n");
410 if (cudaDeviceSynchronize() != cudaSuccess)
411 csound->Message(csound,"Cuda error: Failed to synchronize\n");
412 /* perf rect to pvs conversion */
413 topvs<<<p->bblocks,p->bthreads>>>(fsig,aframe2,p->oldph,p->scal,p->fac);
414 /* reset counter for this frame to the start */
415 count[curframe] = start;
416 /* move current to next frame circularly */
417 p->curframe = --(curframe) < 0 ? numframes-1 : curframe;
418 framecount = 0;
419 p->fsig->framecount++;
420 }
421 }
422 p->framecount = framecount;
423 return OK;
424 }
425
destroy_pvanal(CSOUND * csound,void * pp)426 static int destroy_pvanal(CSOUND *csound, void *pp){
427 PVAN *p = (PVAN *) pp;
428 cufftDestroy(p->plan);
429 cudaFree(p->aframe);
430 cudaFree(p->aframe2);
431 cudaFree(p->oldph);
432 cudaFree(p->fsig->frame.auxp);
433 p->fsig->frame.size = 0;
434 cudaFree(p->win);
435 return OK;
436 }
437
438 typedef struct _cudapvsgain2 {
439 OPDS h;
440 PVSDAT *fout;
441 PVSDAT *fa;
442 MYFLT *kgain;
443 int gridSize; // number of blocks in the grid (1D)
444 int blockSize; // number of threads in one block (1D)
445 uint32 lastframe;
446 } CUDAPVSGAIN2;
447
448 // kernel for scaling PV amplitudes
applygain(float * output,float * input,MYFLT g,int length)449 __global__ void applygain(float* output, float* input, MYFLT g, int length) {
450 int i = threadIdx.x + blockDim.x * blockIdx.x;
451 int j = i<<1;
452
453 if(j < length){
454 output[j] = (float) input[j] * g;
455 output[j+1] = input[j+1];
456 }
457 }
458
free_device(CSOUND * csound,void * pp)459 static int free_device(CSOUND* csound, void* pp){
460 CUDAPVSGAIN2* p = (CUDAPVSGAIN2*) pp;
461 cudaFree(p->fout->frame.auxp);
462 return OK;
463 }
464
cudapvsgain2set(CSOUND * csound,CUDAPVSGAIN2 * p)465 static int cudapvsgain2set(CSOUND *csound, CUDAPVSGAIN2 *p){
466
467 int32 N = p->fa->N;
468 int size = (N+2) * sizeof(float);
469 int maxBlockDim;
470 int SMcount;
471 int totNumThreads = (N+2)/2;
472 cudaDeviceProp deviceProp;
473 cudaGetDeviceProperties(&deviceProp,0);
474 maxBlockDim = deviceProp.maxThreadsPerBlock;
475 SMcount = deviceProp.multiProcessorCount;
476 csound->Message(csound, "cudapvsgain2 running on device %s (capability %d.%d)\n", deviceProp.name,
477 deviceProp.major, deviceProp.minor);
478
479 p->fout->sliding = 0;
480
481 if (p->fout->frame.auxp == NULL || p->fout->frame.size < size)
482 AuxCudaAlloc(size, &p->fout->frame);
483
484 p->blockSize = (((totNumThreads/SMcount)/32)+1)*32;
485 if (p->blockSize > maxBlockDim) p->blockSize = maxBlockDim;
486 p->gridSize = totNumThreads / p->blockSize + 1;
487 p->fout->N = N;
488 p->fout->overlap = p->fa->overlap;
489 p->fout->winsize = p->fa->winsize;
490 p->fout->wintype = p->fa->wintype;
491 p->fout->format = p->fa->format;
492 p->fout->framecount = 1;
493 p->lastframe = 0;
494
495 csound->RegisterDeinitCallback(csound, p, free_device);
496
497 return OK;
498 }
499
cudapvsgain2(CSOUND * csound,CUDAPVSGAIN2 * p)500 static int cudapvsgain2(CSOUND *csound, CUDAPVSGAIN2 *p)
501 {
502 int32 framelength = p->fa->N + 2;
503 MYFLT gain = *p->kgain;
504 float* fo = (float*) p->fout->frame.auxp;
505 float* fi = (float*) p->fa->frame.auxp;
506
507 if (p->lastframe < p->fa->framecount) {
508 if (cudaDeviceSynchronize() != cudaSuccess)
509 csound->Message(csound,"Cuda error: Failed to synchronize\n");
510 applygain<<<p->gridSize,p->blockSize>>>(fo, fi, gain, framelength);
511 p->fout->framecount = p->fa->framecount;
512 p->lastframe = p->fout->framecount;
513 }
514
515 return OK;
516 }
517
518
519 static OENTRY localops[] = {
520 {"cudasynth2", sizeof(PVSYN),0, 5, "a", "f", (SUBR) pvsynset, NULL,
521 (SUBR) pvsynperf},
522 {"cudanal2", sizeof(PVAN),0, 5, "f", "aiiii", (SUBR) pvanalset, NULL,
523 (SUBR) pvanalperf},
524 {"cudapvsgain2", sizeof(CUDAPVSGAIN2), 0, 3, "f", "fk",
525 (SUBR) cudapvsgain2set, (SUBR) cudapvsgain2, NULL}
526 };
527
528 extern "C" {
529 LINKAGE
530 }
531