1 // -*- c++ -*-
2 /* convf.cu
3 (c) Victor Lazzarini, 2013
4
5 based on M Puckette's pitch tracking algorithm.
6
7 This file is part of Csound.
8
9 The Csound Library is free software; you can redistribute it
10 and/or modify it under the terms of the GNU Lesser General Public
11 License as published by the Free Software Foundation; either
12 version 2.1 of the License, or (at your option) any later version.
13
14 Csound is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU Lesser General Public License for more details.
18
19 You should have received a copy of the GNU Lesser General Public
20 License along with Csound; if not, write to the Free Software
21 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
22 02110-1301 USA
23 */
24
25 #include <csdl.h>
26
convol(float * out,float * del,float * coefs,int irsize,int rp,int vsize)27 __global__ void convol(float *out, float *del, float *coefs, int irsize, int rp, int vsize) {
28 int t = (threadIdx.x + blockIdx.x*blockDim.x);
29 if(t >= irsize*vsize) return;
30 int n = t%vsize; /* sample index */
31 int h = t/vsize; /* coeff index */
32 int end = irsize+vsize;
33 rp += n + h; /* read point, oldest -> newest */
34 out[t] = del[rp < end ? rp : rp%end]*coefs[irsize-1-h]; /* single tap */
35 if(t >= vsize) return;
36 __syncthreads();
37 float a = 0.0;
38 for(int i=1, j=vsize; i < irsize; i++, j+=vsize)
39 a += out[n + j]; /* mix all taps */
40 out[n] += a;
41 }
42
43 typedef struct _CONV {
44 OPDS h;
45 MYFLT *aout, *asig, *ifn;
46 float *coeffs, *out, *del;
47 int wp, irsize;
48 AUXCH buf;
49 int blocks, threads;
50 } CONV;
51
52
destroy_conv(CSOUND * csound,void * pp)53 static int destroy_conv(CSOUND *csound, void *pp){
54 CONV *p = (CONV *) pp;
55 cudaFree(p->coeffs);
56 cudaFree(p->del);
57 cudaFree(p->out);
58 return OK;
59 }
60
conv_init(CSOUND * csound,CONV * p)61 static int conv_init(CSOUND *csound, CONV *p){
62
63 FUNC *ftab = csound->FTnp2Find(csound, p->ifn);
64 int irsize = ftab->flen;
65 int nsmps = CS_KSMPS,i;
66 int threads = irsize*nsmps;
67 float *tmp;
68
69 cudaMalloc(&p->coeffs, sizeof(float)*irsize);
70
71 tmp = (float*) malloc(sizeof(float)*irsize);
72 for(i=0; i< irsize; i++)
73 tmp[i] = (float) ftab->ftable[i];
74 cudaMemcpy(p->coeffs,tmp, sizeof(float)*irsize,
75 cudaMemcpyHostToDevice);
76 free(tmp);
77
78 cudaMalloc(&p->del, sizeof(float)*(irsize+nsmps));
79 cudaMalloc(&p->out, sizeof(float)*threads);
80 cudaMemset(p->del,0,sizeof(float)*(irsize+nsmps));
81 cudaMemset(p->out, 0, sizeof(float)*threads);
82
83 p->wp = 0;
84 p->irsize = irsize;
85
86 cudaDeviceProp deviceProp;
87 cudaGetDeviceProperties(&deviceProp, 0);
88 int blockspt = deviceProp.maxThreadsPerBlock;
89 csound->Message(csound, "CUDAconv: using device %s (capability %d.%d)\n",
90 deviceProp.name,deviceProp.major, deviceProp.minor);
91
92 p->blocks = threads > blockspt ? ceil(threads/blockspt) : 1;
93 p->threads = threads > blockspt ? blockspt : threads;
94
95 csound->RegisterDeinitCallback(csound, p, destroy_conv);
96 OPARMS parms;
97 csound->GetOParms(csound, &parms);
98 if(parms.odebug)
99 csound->Message(csound, "blocks %d, threads %d - %d\n", p->blocks, p->threads, threads);
100 if(p->buf.auxp == NULL)
101 csound->AuxAlloc(csound, sizeof(float)*CS_KSMPS, &p->buf);
102
103 return OK;
104
105 }
106 /* the delay size is irsize + vsize so that
107 we can shift in a whole block of samples */
conv_perf(CSOUND * csound,CONV * p)108 int conv_perf(CSOUND *csound, CONV *p){
109
110 int nsmps = CS_KSMPS;
111 MYFLT *sig = p->asig, *aout = p->aout;
112 float *del = p->del, *out = p->out, *coefs = p->coeffs, *buf = (float *)p->buf.auxp;
113 int irsize = p->irsize;
114 int wp = p->wp, i;
115
116 for(i=0; i < nsmps; i++) buf[i] = (float) sig[i];
117 if(wp > irsize) {
118 int front = wp - irsize;
119 cudaMemcpy(&del[wp], buf, sizeof(float)*(nsmps-front), cudaMemcpyHostToDevice);
120 cudaMemcpy(del, &buf[nsmps-front], sizeof(float)*front, cudaMemcpyHostToDevice);
121 }
122 else cudaMemcpy(&del[wp], buf, sizeof(float)*nsmps, cudaMemcpyHostToDevice);
123
124 wp = (wp+nsmps)%(irsize+nsmps); /* wp is now the oldest sample in the delay */
125 convol<<<p->blocks,p->threads>>>(out, del, coefs, irsize, wp, nsmps);
126
127 cudaMemcpy(buf, out, sizeof(float)*nsmps, cudaMemcpyDeviceToHost);
128
129 for(i=0; i < nsmps; i++) aout[i] = (float) buf[i];
130 p->wp = wp;
131 return OK;
132 }
133
134 static OENTRY localops[] = {
135 {"cudaconv", sizeof(CONV),0, 5, "a", "ai", (SUBR) conv_init, NULL,
136 (SUBR) conv_perf},
137 };
138
139 extern "C" {
140 LINKAGE
141 }
142