1 // -*- c++ -*-
2 /* pvsops.cu
3   experimental cuda opcodes
4 
5   (c) Victor Lazzarini, 2013
6 
7   based on M Puckette's pitch tracking algorithm.
8 
9   This file is part of Csound.
10 
11   The Csound Library is free software; you can redistribute it
12   and/or modify it under the terms of the GNU Lesser General Public
13   License as published by the Free Software Foundation; either
14   version 2.1 of the License, or (at your option) any later version.
15 
16   Csound is distributed in the hope that it will be useful,
17   but WITHOUT ANY WARRANTY; without even the implied warranty of
18   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   GNU Lesser General Public License for more details.
20 
21   You should have received a copy of the GNU Lesser General Public
22   License along with Csound; if not, write to the Free Software
23   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
24   02110-1301 USA
25 */
26 
27 #include <csdl.h>
28 #include <cufft.h>
29 
30 #include <pstream.h>
31 
AuxCudaAlloc(int size,AUXCH * p)32 void AuxCudaAlloc(int size, AUXCH *p){
33   float *mem;
34   cudaMalloc(&mem, size);
35   cudaMemset(mem, 0, size);
36   p->auxp = mem;
37   p->size = size;
38 }
39 
40 
41 /* kernel to convert from pvs to rectangular frame */
frompvs(float * inframe,float * fsig,double * lastph,double scal,double fac)42 __global__ void frompvs(float* inframe, float* fsig, double* lastph,
43                         double scal, double fac) {
44 
45   int k = threadIdx.x + blockIdx.x*blockDim.x;
46   int i = k << 1;
47   float mag = fsig[i];
48   double delta = (fsig[i+1] - k*scal)*fac;
49   double phi = fmod(lastph[k] + delta, TWOPI);
50   lastph[k] = phi;
51   inframe[i] =  (float) (mag*cos(phi));
52   inframe[i+1] = (float) (mag*sin(phi));
53 }
54 
winrotate(float * inframe2,float * inframe,float * win,int N,int offset)55 __global__ void winrotate(float* inframe2, float* inframe, float *win,
56                           int N, int offset){
57   int k = (threadIdx.x + blockIdx.x*blockDim.x);
58   inframe2[k] = win[k]*inframe[(k+offset)%N];
59 }
60 
61 typedef struct _pvsyn{
62   OPDS h;
63   MYFLT *asig;
64   PVSDAT *fsig;
65   float *inframe; /* N */
66   float *inframe2;
67   double *lastph;  /* N/2 */
68   float *win;    /* N */
69   int framecount;
70   int curframe;
71   AUXCH  frames;
72   AUXCH  count;
73   cufftHandle plan;
74   double scal, fac;
75   int bblocks, nblocks;
76   int bthreads, nthreads;
77 } PVSYN;
78 
79 static int destroy_pvsyn(CSOUND *csound, void *pp);
80 
pvsynset(CSOUND * csound,PVSYN * p)81 static int pvsynset(CSOUND *csound, PVSYN *p){
82 
83   int N = p->fsig->N;
84 
85   if((N != 0) && !(N & (N - 1))) {
86     int hsize = p->fsig->overlap;
87     int size, numframes, i, blockspt;
88     MYFLT sum = 0.0, bins = N/2;
89     float *win;
90     cudaDeviceProp deviceProp;
91     cudaGetDeviceProperties(&deviceProp, 0);
92     blockspt = deviceProp.maxThreadsPerBlock;
93     csound->Message(csound, "CUDAsynth: using device %s (capability %d.%d)\n",
94         deviceProp.name,deviceProp.major, deviceProp.minor);
95 
96     if(p->fsig->wintype != 1)
97       return csound->InitError(csound,
98                                "window type not implemented yet\n");
99     numframes = N/hsize;
100     size = N*sizeof(float)*numframes;
101     if(p->frames.auxp == NULL ||
102        p->frames.size < size)
103       csound->AuxAlloc(csound, size, &p->frames);
104     memset(p->frames.auxp, 0, size);
105 
106     size = sizeof(int)*numframes;
107     if(p->count.auxp == NULL ||
108        p->count.size < size)
109       csound->AuxAlloc(csound, size, &p->count);
110     *((int *)(p->count.auxp)) =  0;
111     for(i=1; i < numframes; i++)
112       ((int *)(p->count.auxp))[i] =
113               (i + (1.f - (float)i/numframes))*N;
114 
115     size = (N+2)*sizeof(float);
116     cudaMalloc(&p->inframe, size);
117     cudaMemset(p->inframe, 0, size);
118     size = (N+2)*sizeof(float);
119     cudaMalloc(&p->inframe2, size);
120     size = (N/2)*sizeof(double);
121     cudaMalloc(&p->lastph, size);
122     cudaMemset(p->lastph, 0, size);
123     size = N*sizeof(float);
124     cudaMalloc(&p->win, size);
125 
126     win = (float *) malloc(sizeof(float)*(N+1));
127     for(i=0; i <= N; i++)
128       win[i] = (float) (0.5 - 0.5*cos(i*TWOPI/N));
129 
130     for(i = 0; i < N; i++) sum += win[i];
131     sum = FL(2.0) / sum;
132     for(i = 0; i < N; i++) win[i] *= sum;
133     sum = FL(0.0);
134     for(i = 0; i <= N; i+=hsize)
135                sum += win[i] * win[i];
136     sum = (1.0/N)/(sum);
137     for(i = 0; i < N; i++) win[i] *= 3*sum/sqrt(numframes);
138     cudaMemcpy(p->win,win,N*sizeof(float),
139                cudaMemcpyHostToDevice);
140     free(win);
141 
142     p->framecount = 0;
143     p->curframe = 0;
144 
145     p->fac = TWOPI*hsize/csound->GetSr(csound);
146     p->scal =csound->GetSr(csound)/N;
147     cufftPlan1d(&p->plan, N, CUFFT_C2R, 1);
148     cufftSetCompatibilityMode(p->plan, CUFFT_COMPATIBILITY_NATIVE);
149     csound->RegisterDeinitCallback(csound, p, destroy_pvsyn);
150 
151     p->bblocks = bins > blockspt? bins/blockspt : 1;
152     p->nblocks = N > blockspt ? N/blockspt : 1;
153     p->bthreads = bins/p->bblocks;
154     p->nthreads = N/p->nblocks;
155     if(csound->GetDebug(csound))
156       csound->Message(csound, "%d (%d each), %d (%d each)\n",
157                     p->nblocks, p->nthreads, p->bblocks, p->bthreads);
158     return OK;
159   }
160   return csound->InitError(csound, "fftsize not power-of-two \n");
161 
162 }
163 
pvsynperf(CSOUND * csound,PVSYN * p)164 static int pvsynperf(CSOUND *csound, PVSYN *p){
165 
166   int N = p->fsig->N, i;
167   int hsize = p->fsig->overlap;
168   uint32_t offset = p->h.insdshead->ksmps_offset;
169   uint32_t early  = p->h.insdshead->ksmps_no_end;
170   uint32_t n, nsmps = CS_KSMPS;
171   MYFLT *asig = p->asig;
172   float *frames = (float *) p->frames.auxp;
173   int framecount = p->framecount;
174   int numframes = N/hsize;
175   int *count = (int *) p->count.auxp;
176 
177   if (UNLIKELY(offset)) memset(asig, '\0', offset*sizeof(MYFLT));
178   if (UNLIKELY(early)) {
179     nsmps -= early;
180     memset(&asig[nsmps], '\0', early*sizeof(MYFLT));
181   }
182 
183   for(n=offset; n < nsmps; n++) {
184 
185     if(framecount == 0) {
186       int curframe = p->curframe;
187       /* start offset for current frame */
188       int start = N*curframe;
189       float *cur = &(frames[start]);
190       float *win = (float *) p->win;
191       float *inframe = p->inframe;
192       float *inframe2 = p->inframe2;
193       float *fsig = (float *) p->fsig->frame.auxp;
194       /* perf pvs to rect conversion */
195       frompvs<<<p->bblocks,p->bthreads>>>(inframe,fsig,p->lastph,p->scal,p->fac);
196       /* execute inverse real FFT */
197       if(cufftExecC2R(p->plan,(cufftComplex*)inframe,inframe)
198          != CUFFT_SUCCESS) csound->Message(csound, "cuda fft error\n");
199       if (cudaDeviceSynchronize() != cudaSuccess)
200         csound->Message(csound,"Cuda error: Failed to synchronize\n");
201       /* window and rotate data on device */
202       winrotate<<<p->nblocks,p->nthreads>>>(inframe2,inframe,win,N,hsize*curframe);
203       /* copy data to current out frame */
204       cudaMemcpy(cur,inframe2,N*sizeof(float),cudaMemcpyDeviceToHost);
205       /* reset counter for this frame to the start */
206       count[curframe] = start;
207       /* move current to next frame circularly */
208       p->curframe = ++(curframe) == numframes ? 0 : curframe;
209       framecount = hsize;
210     }
211     asig[n] = FL(0.0);
212     for(i=0; i < numframes; i++){
213       /* overlap-add */
214       asig[n] += frames[count[i]];
215       count[i]++;
216     }
217     framecount--;
218   }
219   p->framecount = framecount;
220   return OK;
221 }
222 
destroy_pvsyn(CSOUND * csound,void * pp)223 static int destroy_pvsyn(CSOUND *csound, void *pp){
224   PVSYN *p = (PVSYN *) pp;
225   cufftDestroy(p->plan);
226   cudaFree(p->inframe);
227   cudaFree(p->inframe2);
228   cudaFree(p->lastph);
229   cudaFree(p->win);
230   return OK;
231 }
232 
modTwoPi(double x)233 __device__ double modTwoPi(double x)
234 {
235   x = fmod(x,TWOPI);
236   return x <= -PI ? x + TWOPI :
237     (x > PI ? x - TWOPI : x);
238 }
239 
240 /* kernel to convert from rectangular to pvs frame */
topvs(float * fsig,float * aframe,double * oldph,double scal,double fac)241 __global__ void topvs(float *fsig, float *aframe, double* oldph,
242                       double scal, double fac) {
243   int k = threadIdx.x + blockIdx.x*blockDim.x;
244   int i = k << 1;
245   float re = aframe[i], im = aframe[i+1];
246   float mag = sqrtf(re*re + im*im);
247   double phi = atan2f(im,re);
248   double delta = phi - oldph[k];
249   oldph[k] = phi;
250   fsig[i] =  mag;
251   fsig[i+1] = (float) ((modTwoPi(delta) + k*scal)*fac);
252 }
253 
rotatewin(float * aframe2,float * aframe,float * win,int N,int offset)254 __global__ void rotatewin(float* aframe2, float *aframe, float *win,
255                           int N, int offset){
256   int k = threadIdx.x + blockIdx.x*blockDim.x;
257   aframe2[(k+offset)%N] = win[k]*aframe[k];
258 }
259 
260 typedef struct _pvan {
261   OPDS  h;
262   PVSDAT *fsig;
263   MYFLT *asig,*fftsize,*hsize,*winsize,*wintype;
264 
265   float *aframe; /* N */
266   float *aframe2;
267   double *oldph;  /* N/2 */
268   float *win;    /* N */
269 
270   int framecount;
271   int curframe;
272   AUXCH  frames;
273   AUXCH  count;
274   cufftHandle plan;
275   double scal, fac;
276   int bblocks, nblocks;
277   int bthreads, nthreads;
278 } PVAN;
279 
280 static int destroy_pvanal(CSOUND *csound, void *pp);
281 
pvanalset(CSOUND * csound,PVAN * p)282 static int pvanalset(CSOUND *csound, PVAN *p){
283 
284   int N = *p->fftsize;
285   if((N != 0) && !(N & (N - 1))) {
286     int size, numframes, i, bins = N/2;
287     int hsize = *p->hsize, blockspt;
288     float *win;
289     cudaDeviceProp deviceProp;
290     cudaGetDeviceProperties(&deviceProp, 0);
291     blockspt = deviceProp.maxThreadsPerBlock;
292     csound->Message(csound, "CUDAnal: using device %s (capability %d.%d)\n",
293         deviceProp.name,deviceProp.major, deviceProp.minor);
294 
295     p->fsig->N = N;
296     p->fsig->overlap = hsize;
297     p->fsig->format = -1;
298     /* ignore winsize & wintype */
299     p->fsig->winsize = N;
300     p->fsig->wintype = 1;
301     p->fsig->framecount = 0;
302 
303     numframes = N/hsize;
304     size = N*sizeof(float)*numframes;
305     if(p->frames.auxp == NULL ||
306        p->frames.size < size)
307       csound->AuxAlloc(csound, size, &p->frames);
308     memset(p->frames.auxp, 0, size);
309 
310     size = (N+2)*sizeof(float);
311     if(p->fsig->frame.auxp == NULL ||
312        p->fsig->frame.size < size)
313        AuxCudaAlloc(size, &p->fsig->frame);
314 
315     size = sizeof(int)*numframes;
316     if(p->count.auxp == NULL ||
317        p->count.size < size)
318       csound->AuxAlloc(csound, size, &p->count);
319     *((int *)(p->count.auxp)) =  0;
320     for(i=1; i < numframes; i++)
321       ((int *)(p->count.auxp))[i] =
322               (i + (float)i/numframes)*N;
323 
324     size = (N+2)*sizeof(float);
325     cudaMalloc(&p->aframe, size);
326     size = (N+2)*sizeof(float);
327     cudaMalloc(&p->aframe2, size);
328     size = (N/2)*sizeof(double);
329     cudaMalloc(&p->oldph, size);
330     cudaMemset(p->oldph, 0, size);
331     size = N*sizeof(float);
332     cudaMalloc(&p->win, size);
333 
334 
335     win = (float *) malloc(sizeof(float)*N);
336     for(i=0; i < N; i++)
337       win[i] = (float) (0.5 - 0.5*cos(i*TWOPI/N));
338     float sum = 0.0;
339    for(i = 0; i < N; i++) sum += win[i];
340     sum = FL(2.0) / sum;
341    for(i = 0; i < N; i++) win[i] *= sum;
342 
343     cudaMemcpy(p->win,win,N*sizeof(float),
344                cudaMemcpyHostToDevice);
345     free(win);
346 
347     p->framecount = 1;
348     p->curframe = numframes-1;
349     p->fac = csound->GetSr(csound)/(TWOPI*hsize);
350     p->scal = (TWOPI*hsize)/N;
351     cufftPlan1d(&p->plan, N, CUFFT_R2C, 1);
352     cufftSetCompatibilityMode(p->plan, CUFFT_COMPATIBILITY_NATIVE);
353     csound->RegisterDeinitCallback(csound, p, destroy_pvanal);
354 
355     p->bblocks = bins > blockspt? bins/blockspt : 1;
356     p->nblocks = N > blockspt ? N/blockspt : 1;
357     p->bthreads = bins/p->bblocks;
358     p->nthreads = N/p->nblocks;
359    if(csound->GetDebug(csound))
360     csound->Message(csound, "%d (%d each), %d (%d each)\n",
361                     p->nblocks, p->nthreads, p->bblocks, p->bthreads);
362     return OK;
363   }
364   return csound->InitError(csound, "fftsize not power-of-two \n");
365 }
366 
pvanalperf(CSOUND * csound,PVAN * p)367 static int pvanalperf(CSOUND *csound, PVAN *p){
368 
369   int N = p->fsig->N, i;
370   int hsize = p->fsig->overlap;
371   uint32_t offset = p->h.insdshead->ksmps_offset;
372   uint32_t early  = p->h.insdshead->ksmps_no_end;
373   uint32_t n, nsmps = CS_KSMPS;
374   MYFLT *asig = p->asig;
375   float *frames = (float *) p->frames.auxp;
376   int framecount = p->framecount;
377   int numframes = N/hsize;
378   int *count = (int *) p->count.auxp;
379 
380   if (UNLIKELY(offset)) memset(asig, '\0', offset*sizeof(MYFLT));
381   if (UNLIKELY(early)) {
382     nsmps -= early;
383     memset(&asig[nsmps], '\0', early*sizeof(MYFLT));
384   }
385 
386   for(n=offset; n < nsmps; n++) {
387    for(i=0; i < numframes; i++){
388       frames[count[i]] = asig[n];
389       count[i]++;
390     }
391     framecount++;
392 
393     if(framecount == hsize) {
394       int curframe = p->curframe;
395       /* start offset for current frame */
396       int start = N*curframe;
397       float *cur = &(frames[start]);
398       float *win = (float *) p->win;
399       float *aframe = p->aframe;
400       float *aframe2 = p->aframe2;
401       float *fsig = (float *) p->fsig->frame.auxp;
402       cudaMemcpy(aframe,cur,N*sizeof(float),
403              cudaMemcpyHostToDevice);
404       /* window and rotate data on device */
405       rotatewin<<<p->nblocks,p->nthreads>>>(aframe2,aframe,win,N,
406                                             hsize*(numframes-curframe));
407        /* execute inverse real FFT */
408       if(cufftExecR2C(p->plan,aframe2,(cufftComplex*)aframe2)
409       != CUFFT_SUCCESS) csound->Message(csound, "cuda fft error\n");
410        if (cudaDeviceSynchronize() != cudaSuccess)
411          csound->Message(csound,"Cuda error: Failed to synchronize\n");
412        /* perf rect to pvs conversion */
413        topvs<<<p->bblocks,p->bthreads>>>(fsig,aframe2,p->oldph,p->scal,p->fac);
414       /* reset counter for this frame to the start */
415       count[curframe] = start;
416       /* move current to next frame circularly */
417       p->curframe = --(curframe) < 0 ? numframes-1 : curframe;
418       framecount = 0;
419       p->fsig->framecount++;
420     }
421   }
422   p->framecount = framecount;
423   return OK;
424 }
425 
destroy_pvanal(CSOUND * csound,void * pp)426 static int destroy_pvanal(CSOUND *csound, void *pp){
427   PVAN *p = (PVAN *) pp;
428   cufftDestroy(p->plan);
429   cudaFree(p->aframe);
430   cudaFree(p->aframe2);
431   cudaFree(p->oldph);
432   cudaFree(p->fsig->frame.auxp);
433   p->fsig->frame.size = 0;
434   cudaFree(p->win);
435   return OK;
436 }
437 
438 typedef struct _cudapvsgain2 {
439   OPDS    h;
440   PVSDAT  *fout;
441   PVSDAT  *fa;
442   MYFLT   *kgain;
443   int gridSize;   // number of blocks in the grid (1D)
444   int blockSize;   // number of threads in one block (1D)
445   uint32  lastframe;
446 } CUDAPVSGAIN2;
447 
448 // kernel for scaling PV amplitudes
applygain(float * output,float * input,MYFLT g,int length)449 __global__ void applygain(float* output, float* input, MYFLT g, int length) {
450   int i = threadIdx.x + blockDim.x * blockIdx.x;
451   int j = i<<1;
452 
453   if(j < length){
454     output[j] = (float) input[j] * g;
455     output[j+1] = input[j+1];
456   }
457 }
458 
free_device(CSOUND * csound,void * pp)459 static int free_device(CSOUND* csound, void* pp){
460   CUDAPVSGAIN2* p = (CUDAPVSGAIN2*) pp;
461   cudaFree(p->fout->frame.auxp);
462   return OK;
463 }
464 
cudapvsgain2set(CSOUND * csound,CUDAPVSGAIN2 * p)465 static int cudapvsgain2set(CSOUND *csound, CUDAPVSGAIN2 *p){
466 
467   int32 N = p->fa->N;
468   int size = (N+2) * sizeof(float);
469   int maxBlockDim;
470   int SMcount;
471   int totNumThreads = (N+2)/2;
472   cudaDeviceProp deviceProp;
473   cudaGetDeviceProperties(&deviceProp,0);
474   maxBlockDim = deviceProp.maxThreadsPerBlock;
475   SMcount = deviceProp.multiProcessorCount;
476   csound->Message(csound, "cudapvsgain2 running on device %s (capability %d.%d)\n", deviceProp.name,
477      deviceProp.major, deviceProp.minor);
478 
479   p->fout->sliding = 0;
480 
481   if (p->fout->frame.auxp == NULL || p->fout->frame.size < size)
482     AuxCudaAlloc(size, &p->fout->frame);
483 
484   p->blockSize = (((totNumThreads/SMcount)/32)+1)*32;
485   if (p->blockSize > maxBlockDim) p->blockSize = maxBlockDim;
486   p->gridSize = totNumThreads / p->blockSize + 1;
487   p->fout->N = N;
488   p->fout->overlap = p->fa->overlap;
489   p->fout->winsize = p->fa->winsize;
490   p->fout->wintype = p->fa->wintype;
491   p->fout->format = p->fa->format;
492   p->fout->framecount = 1;
493   p->lastframe = 0;
494 
495   csound->RegisterDeinitCallback(csound, p, free_device);
496 
497   return OK;
498 }
499 
cudapvsgain2(CSOUND * csound,CUDAPVSGAIN2 * p)500 static int cudapvsgain2(CSOUND *csound, CUDAPVSGAIN2 *p)
501 {
502   int32   framelength = p->fa->N + 2;
503   MYFLT gain = *p->kgain;
504   float* fo = (float*) p->fout->frame.auxp;
505   float* fi = (float*) p->fa->frame.auxp;
506 
507   if (p->lastframe < p->fa->framecount) {
508      if (cudaDeviceSynchronize() != cudaSuccess)
509          csound->Message(csound,"Cuda error: Failed to synchronize\n");
510     applygain<<<p->gridSize,p->blockSize>>>(fo, fi, gain, framelength);
511     p->fout->framecount = p->fa->framecount;
512     p->lastframe = p->fout->framecount;
513   }
514 
515   return OK;
516 }
517 
518 
519 static OENTRY localops[] = {
520   {"cudasynth2", sizeof(PVSYN),0, 5, "a", "f", (SUBR) pvsynset, NULL,
521    (SUBR) pvsynperf},
522    {"cudanal2", sizeof(PVAN),0, 5, "f", "aiiii", (SUBR) pvanalset, NULL,
523     (SUBR) pvanalperf},
524   {"cudapvsgain2", sizeof(CUDAPVSGAIN2), 0, 3, "f", "fk",
525                                (SUBR) cudapvsgain2set, (SUBR) cudapvsgain2, NULL}
526 };
527 
528 extern "C" {
529   LINKAGE
530 }
531