1 // -*- c++ -*-
2 /* conv.cu
3 
4   (c) Victor Lazzarini, 2013
5 
6   This file is part of Csound.
7 
8   The Csound Library is free software; you can redistribute it
9   and/or modify it under the terms of the GNU Lesser General Public
10   License as published by the Free Software Foundation; either
11   version 2.1 of the License, or (at your option) any later version.
12 
13   Csound is distributed in the hope that it will be useful,
14   but WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16   GNU Lesser General Public License for more details.
17 
18   You should have received a copy of the GNU Lesser General Public
19   License along with Csound; if not, write to the Free Software
20   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
21   02110-1301 USA
22 */
23 
24 #include <csdl.h>
25 
convol(MYFLT * out,MYFLT * del,MYFLT * coefs,int irsize,int rp,int vsize)26 __global__ void convol(MYFLT *out, MYFLT *del, MYFLT *coefs, int irsize, int rp, int vsize) {
27   int t = (threadIdx.x + blockIdx.x*blockDim.x);
28   if(t >= irsize*vsize) return;
29   int n =  t%vsize;  /* sample index */
30   int h =  t/vsize;  /* coeff index */
31   int end = irsize+vsize;
32   rp += n + h; /* read point, oldest -> newest */
33   out[t] = del[rp < end ? rp : rp%end]*coefs[irsize-1-h];  /* single tap */
34   if(t >= vsize) return;
35   __syncthreads();
36   MYFLT a = 0.0;
37   for(int i=1, j=vsize; i < irsize; i++, j+=vsize)
38     a +=  out[n + j]; /* mix all taps */
39   out[n] += a;
40 }
41 
42 // __device__ double atomicAdd(double* address, double val)
43 // {
44 //     unsigned long long int* address_as_ull =
45 //                               (unsigned long long int*)address;
46 //     unsigned long long int old = *address_as_ull, assumed;
47 //     do {
48 //         assumed = old;
49 //         old = atomicCAS(address_as_ull, assumed,
50 //                         __double_as_longlong(val +
51 //                                __longlong_as_double(assumed)));
52 //     } while (assumed != old);
53 //     return __longlong_as_double(old);
54 // }
55 
56 // __global__ void convol2(MYFLT *out, MYFLT *del, MYFLT *coefs, int irsize, int rp, int vsize) {
57 //   int t = (threadIdx.x + blockIdx.x*blockDim.x);
58 //   if(t >= irsize*vsize) return;
59 //   int n =  t%vsize;  /* sample index */
60 //   int h =  t/vsize;  /* coeff index */
61 //   int end = irsize+vsize;
62 //   rp += n + h; /* read point, oldest -> newest */
63 //   MYFLT s = del[rp < end ? rp : rp%end]*coefs[irsize-1-h];  /* single tap */
64 //   t == n ? out[n] = s : atomicAdd(&out[n], s);
65 // }
66 
67 typedef struct _CONV {
68   OPDS h;
69   MYFLT *aout, *asig, *ifn;
70   MYFLT *coeffs, *out, *del;
71   int wp, irsize;
72   int blocks, threads;
73 } CONV;
74 
75 
destroy_conv(CSOUND * csound,void * pp)76 static int destroy_conv(CSOUND *csound, void *pp){
77   CONV *p = (CONV *) pp;
78   cudaFree(p->coeffs);
79   cudaFree(p->del);
80   cudaFree(p->out);
81   return OK;
82 }
83 
conv_init(CSOUND * csound,CONV * p)84 static int conv_init(CSOUND *csound, CONV *p){
85 
86   FUNC *ftab = csound->FTnp2Find(csound, p->ifn);
87   int irsize = ftab->flen;
88   int nsmps = CS_KSMPS;
89   int threads = irsize*nsmps;
90 
91   cudaMalloc(&p->coeffs, sizeof(MYFLT)*irsize);
92   cudaMemcpy(p->coeffs, ftab->ftable, sizeof(MYFLT)*irsize,
93             cudaMemcpyHostToDevice);
94 
95   cudaMalloc(&p->del, sizeof(MYFLT)*(irsize+nsmps));
96   cudaMalloc(&p->out, sizeof(MYFLT)*threads);
97   cudaMemset(p->del,0,sizeof(MYFLT)*(irsize+nsmps));
98   cudaMemset(p->out, 0, sizeof(MYFLT)*threads);
99 
100   p->wp = 0;
101   p->irsize = irsize;
102 
103   cudaDeviceProp deviceProp;
104   cudaGetDeviceProperties(&deviceProp, 0);
105   int blockspt = deviceProp.maxThreadsPerBlock;
106   csound->Message(csound, "CUDAconv: using device %s (capability %d.%d)\n",
107         deviceProp.name,deviceProp.major, deviceProp.minor);
108 
109   p->blocks = threads > blockspt ? ceil(threads/blockspt) : 1;
110   p->threads = threads > blockspt ? blockspt : threads;
111 
112   csound->RegisterDeinitCallback(csound, p, destroy_conv);
113   OPARMS parms;
114   csound->GetOParms(csound, &parms);
115   if(parms.odebug)
116    csound->Message(csound, "blocks %d, threads %d - %d\n", p->blocks, p->threads, threads);
117 
118   return OK;
119 
120 }
121 /* the delay size is irsize + vsize so that
122    we can shift in a whole block of samples */
conv_perf(CSOUND * csound,CONV * p)123 int conv_perf(CSOUND *csound, CONV *p){
124 
125    int nsmps = CS_KSMPS;
126    MYFLT *sig = p->asig, *aout = p->aout;
127    MYFLT *del = p->del, *out = p->out, *coefs = p->coeffs;
128    int irsize = p->irsize;
129    int wp = p->wp;
130 
131   if(wp > irsize) {
132      int front = wp - irsize;
133      cudaMemcpy(&del[wp], sig, sizeof(MYFLT)*(nsmps-front), cudaMemcpyHostToDevice);
134      cudaMemcpy(del, &sig[nsmps-front], sizeof(MYFLT)*front, cudaMemcpyHostToDevice);
135   }
136   else cudaMemcpy(&del[wp], sig, sizeof(MYFLT)*nsmps, cudaMemcpyHostToDevice);
137 
138   wp = (wp+nsmps)%(irsize+nsmps); /* wp is now the oldest sample in the delay */
139   convol<<<p->blocks,p->threads>>>(out, del, coefs, irsize, wp, nsmps);
140 
141   cudaMemcpy(aout, out, sizeof(MYFLT)*nsmps, cudaMemcpyDeviceToHost);
142   p->wp = wp;
143   return OK;
144 }
145 
146 static OENTRY localops[] = {
147   {"cudaconv", sizeof(CONV),0, 5, "a", "ai", (SUBR) conv_init, NULL,
148    (SUBR) conv_perf},
149 };
150 
151 extern "C" {
152   LINKAGE
153 }
154