1 // -*- c++ -*-
2 /* convf.cu
3   (c) Victor Lazzarini, 2013
4 
5   based on M Puckette's pitch tracking algorithm.
6 
7   This file is part of Csound.
8 
9   The Csound Library is free software; you can redistribute it
10   and/or modify it under the terms of the GNU Lesser General Public
11   License as published by the Free Software Foundation; either
12   version 2.1 of the License, or (at your option) any later version.
13 
14   Csound is distributed in the hope that it will be useful,
15   but WITHOUT ANY WARRANTY; without even the implied warranty of
16   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   GNU Lesser General Public License for more details.
18 
19   You should have received a copy of the GNU Lesser General Public
20   License along with Csound; if not, write to the Free Software
21   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
22   02110-1301 USA
23 */
24 
25 #include <csdl.h>
26 
convol(float * out,float * del,float * coefs,int irsize,int rp,int vsize)27 __global__ void convol(float *out, float *del, float *coefs, int irsize, int rp, int vsize) {
28   int t = (threadIdx.x + blockIdx.x*blockDim.x);
29   if(t >= irsize*vsize) return;
30   int n =  t%vsize;  /* sample index */
31   int h =  t/vsize;  /* coeff index */
32   int end = irsize+vsize;
33   rp += n + h; /* read point, oldest -> newest */
34   out[t] = del[rp < end ? rp : rp%end]*coefs[irsize-1-h];  /* single tap */
35   if(t >= vsize) return;
36   __syncthreads();
37   float a = 0.0;
38   for(int i=1, j=vsize; i < irsize; i++, j+=vsize)
39     a +=  out[n + j]; /* mix all taps */
40   out[n] += a;
41 }
42 
43 typedef struct _CONV {
44   OPDS h;
45   MYFLT *aout, *asig, *ifn;
46   float *coeffs, *out, *del;
47   int wp, irsize;
48   AUXCH buf;
49   int blocks, threads;
50 } CONV;
51 
52 
destroy_conv(CSOUND * csound,void * pp)53 static int destroy_conv(CSOUND *csound, void *pp){
54   CONV *p = (CONV *) pp;
55   cudaFree(p->coeffs);
56   cudaFree(p->del);
57   cudaFree(p->out);
58   return OK;
59 }
60 
conv_init(CSOUND * csound,CONV * p)61 static int conv_init(CSOUND *csound, CONV *p){
62 
63   FUNC *ftab = csound->FTnp2Find(csound, p->ifn);
64   int irsize = ftab->flen;
65   int nsmps = CS_KSMPS,i;
66   int threads = irsize*nsmps;
67   float *tmp;
68 
69   cudaMalloc(&p->coeffs, sizeof(float)*irsize);
70 
71   tmp = (float*) malloc(sizeof(float)*irsize);
72   for(i=0; i< irsize; i++)
73     tmp[i] = (float) ftab->ftable[i];
74   cudaMemcpy(p->coeffs,tmp, sizeof(float)*irsize,
75             cudaMemcpyHostToDevice);
76   free(tmp);
77 
78   cudaMalloc(&p->del, sizeof(float)*(irsize+nsmps));
79   cudaMalloc(&p->out, sizeof(float)*threads);
80   cudaMemset(p->del,0,sizeof(float)*(irsize+nsmps));
81   cudaMemset(p->out, 0, sizeof(float)*threads);
82 
83   p->wp = 0;
84   p->irsize = irsize;
85 
86   cudaDeviceProp deviceProp;
87   cudaGetDeviceProperties(&deviceProp, 0);
88   int blockspt = deviceProp.maxThreadsPerBlock;
89   csound->Message(csound, "CUDAconv: using device %s (capability %d.%d)\n",
90         deviceProp.name,deviceProp.major, deviceProp.minor);
91 
92   p->blocks = threads > blockspt ? ceil(threads/blockspt) : 1;
93   p->threads = threads > blockspt ? blockspt : threads;
94 
95   csound->RegisterDeinitCallback(csound, p, destroy_conv);
96   OPARMS parms;
97   csound->GetOParms(csound, &parms);
98   if(parms.odebug)
99    csound->Message(csound, "blocks %d, threads %d - %d\n", p->blocks, p->threads, threads);
100   if(p->buf.auxp == NULL)
101     csound->AuxAlloc(csound, sizeof(float)*CS_KSMPS, &p->buf);
102 
103   return OK;
104 
105 }
106 /* the delay size is irsize + vsize so that
107    we can shift in a whole block of samples */
conv_perf(CSOUND * csound,CONV * p)108 int conv_perf(CSOUND *csound, CONV *p){
109 
110    int nsmps = CS_KSMPS;
111    MYFLT *sig = p->asig, *aout = p->aout;
112    float *del = p->del, *out = p->out, *coefs = p->coeffs, *buf = (float *)p->buf.auxp;
113    int irsize = p->irsize;
114    int wp = p->wp, i;
115 
116   for(i=0; i < nsmps; i++) buf[i] = (float) sig[i];
117   if(wp > irsize) {
118      int front = wp - irsize;
119      cudaMemcpy(&del[wp], buf, sizeof(float)*(nsmps-front), cudaMemcpyHostToDevice);
120      cudaMemcpy(del, &buf[nsmps-front], sizeof(float)*front, cudaMemcpyHostToDevice);
121   }
122   else cudaMemcpy(&del[wp], buf, sizeof(float)*nsmps, cudaMemcpyHostToDevice);
123 
124   wp = (wp+nsmps)%(irsize+nsmps); /* wp is now the oldest sample in the delay */
125   convol<<<p->blocks,p->threads>>>(out, del, coefs, irsize, wp, nsmps);
126 
127   cudaMemcpy(buf, out, sizeof(float)*nsmps, cudaMemcpyDeviceToHost);
128 
129   for(i=0; i < nsmps; i++) aout[i] = (float) buf[i];
130   p->wp = wp;
131   return OK;
132 }
133 
134 static OENTRY localops[] = {
135   {"cudaconv", sizeof(CONV),0, 5, "a", "ai", (SUBR) conv_init, NULL,
136    (SUBR) conv_perf},
137 };
138 
139 extern "C" {
140   LINKAGE
141 }
142