1 // -*- c++ -*-
2 /* conv.cu
3
4 (c) Victor Lazzarini, 2013
5
6 This file is part of Csound.
7
8 The Csound Library is free software; you can redistribute it
9 and/or modify it under the terms of the GNU Lesser General Public
10 License as published by the Free Software Foundation; either
11 version 2.1 of the License, or (at your option) any later version.
12
13 Csound is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public
19 License along with Csound; if not, write to the Free Software
20 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
21 02110-1301 USA
22 */
23
24 #include <csdl.h>
25
convol(MYFLT * out,MYFLT * del,MYFLT * coefs,int irsize,int rp,int vsize)26 __global__ void convol(MYFLT *out, MYFLT *del, MYFLT *coefs, int irsize, int rp, int vsize) {
27 int t = (threadIdx.x + blockIdx.x*blockDim.x);
28 if(t >= irsize*vsize) return;
29 int n = t%vsize; /* sample index */
30 int h = t/vsize; /* coeff index */
31 int end = irsize+vsize;
32 rp += n + h; /* read point, oldest -> newest */
33 out[t] = del[rp < end ? rp : rp%end]*coefs[irsize-1-h]; /* single tap */
34 if(t >= vsize) return;
35 __syncthreads();
36 MYFLT a = 0.0;
37 for(int i=1, j=vsize; i < irsize; i++, j+=vsize)
38 a += out[n + j]; /* mix all taps */
39 out[n] += a;
40 }
41
42 // __device__ double atomicAdd(double* address, double val)
43 // {
44 // unsigned long long int* address_as_ull =
45 // (unsigned long long int*)address;
46 // unsigned long long int old = *address_as_ull, assumed;
47 // do {
48 // assumed = old;
49 // old = atomicCAS(address_as_ull, assumed,
50 // __double_as_longlong(val +
51 // __longlong_as_double(assumed)));
52 // } while (assumed != old);
53 // return __longlong_as_double(old);
54 // }
55
56 // __global__ void convol2(MYFLT *out, MYFLT *del, MYFLT *coefs, int irsize, int rp, int vsize) {
57 // int t = (threadIdx.x + blockIdx.x*blockDim.x);
58 // if(t >= irsize*vsize) return;
59 // int n = t%vsize; /* sample index */
60 // int h = t/vsize; /* coeff index */
61 // int end = irsize+vsize;
62 // rp += n + h; /* read point, oldest -> newest */
63 // MYFLT s = del[rp < end ? rp : rp%end]*coefs[irsize-1-h]; /* single tap */
64 // t == n ? out[n] = s : atomicAdd(&out[n], s);
65 // }
66
67 typedef struct _CONV {
68 OPDS h;
69 MYFLT *aout, *asig, *ifn;
70 MYFLT *coeffs, *out, *del;
71 int wp, irsize;
72 int blocks, threads;
73 } CONV;
74
75
destroy_conv(CSOUND * csound,void * pp)76 static int destroy_conv(CSOUND *csound, void *pp){
77 CONV *p = (CONV *) pp;
78 cudaFree(p->coeffs);
79 cudaFree(p->del);
80 cudaFree(p->out);
81 return OK;
82 }
83
conv_init(CSOUND * csound,CONV * p)84 static int conv_init(CSOUND *csound, CONV *p){
85
86 FUNC *ftab = csound->FTnp2Find(csound, p->ifn);
87 int irsize = ftab->flen;
88 int nsmps = CS_KSMPS;
89 int threads = irsize*nsmps;
90
91 cudaMalloc(&p->coeffs, sizeof(MYFLT)*irsize);
92 cudaMemcpy(p->coeffs, ftab->ftable, sizeof(MYFLT)*irsize,
93 cudaMemcpyHostToDevice);
94
95 cudaMalloc(&p->del, sizeof(MYFLT)*(irsize+nsmps));
96 cudaMalloc(&p->out, sizeof(MYFLT)*threads);
97 cudaMemset(p->del,0,sizeof(MYFLT)*(irsize+nsmps));
98 cudaMemset(p->out, 0, sizeof(MYFLT)*threads);
99
100 p->wp = 0;
101 p->irsize = irsize;
102
103 cudaDeviceProp deviceProp;
104 cudaGetDeviceProperties(&deviceProp, 0);
105 int blockspt = deviceProp.maxThreadsPerBlock;
106 csound->Message(csound, "CUDAconv: using device %s (capability %d.%d)\n",
107 deviceProp.name,deviceProp.major, deviceProp.minor);
108
109 p->blocks = threads > blockspt ? ceil(threads/blockspt) : 1;
110 p->threads = threads > blockspt ? blockspt : threads;
111
112 csound->RegisterDeinitCallback(csound, p, destroy_conv);
113 OPARMS parms;
114 csound->GetOParms(csound, &parms);
115 if(parms.odebug)
116 csound->Message(csound, "blocks %d, threads %d - %d\n", p->blocks, p->threads, threads);
117
118 return OK;
119
120 }
121 /* the delay size is irsize + vsize so that
122 we can shift in a whole block of samples */
conv_perf(CSOUND * csound,CONV * p)123 int conv_perf(CSOUND *csound, CONV *p){
124
125 int nsmps = CS_KSMPS;
126 MYFLT *sig = p->asig, *aout = p->aout;
127 MYFLT *del = p->del, *out = p->out, *coefs = p->coeffs;
128 int irsize = p->irsize;
129 int wp = p->wp;
130
131 if(wp > irsize) {
132 int front = wp - irsize;
133 cudaMemcpy(&del[wp], sig, sizeof(MYFLT)*(nsmps-front), cudaMemcpyHostToDevice);
134 cudaMemcpy(del, &sig[nsmps-front], sizeof(MYFLT)*front, cudaMemcpyHostToDevice);
135 }
136 else cudaMemcpy(&del[wp], sig, sizeof(MYFLT)*nsmps, cudaMemcpyHostToDevice);
137
138 wp = (wp+nsmps)%(irsize+nsmps); /* wp is now the oldest sample in the delay */
139 convol<<<p->blocks,p->threads>>>(out, del, coefs, irsize, wp, nsmps);
140
141 cudaMemcpy(aout, out, sizeof(MYFLT)*nsmps, cudaMemcpyDeviceToHost);
142 p->wp = wp;
143 return OK;
144 }
145
146 static OENTRY localops[] = {
147 {"cudaconv", sizeof(CONV),0, 5, "a", "ai", (SUBR) conv_init, NULL,
148 (SUBR) conv_perf},
149 };
150
151 extern "C" {
152 LINKAGE
153 }
154