1 // -*- c++ -*-
2 /* adsyn.cu
3 experimental cuda opcodes
4
5 (c) Victor Lazzarini, 2013
6
7 based on M Puckette's pitch tracking algorithm.
8
9 This file is part of Csound.
10
11 The Csound Library is free software; you can redistribute it
12 and/or modify it under the terms of the GNU Lesser General Public
13 License as published by the Free Software Foundation; either
14 version 2.1 of the License, or (at your option) any later version.
15
16 Csound is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU Lesser General Public License for more details.
20
21 You should have received a copy of the GNU Lesser General Public
22 License along with Csound; if not, write to the Free Software
23 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
24 02110-1301 USA
25 */
26
27 #include <csdl.h>
28 #include <pstream.h>
29
30 typedef struct cudadsyn_ {
31 OPDS h;
32 MYFLT *asig;
33 PVSDAT *fsig;
34 MYFLT *kamp, *kfreq;
35 MYFLT *inum;
36 float *out;
37 float *frame;
38 int64_t *ndx;
39 float *fp, *previous;
40 AUXCH out_;
41 int bins, blocks, threads;
42 int count;
43 int vsamps, mblocks, mthreads;
44 int framecount;
45 } CUDADSYN;
46
47 static int destroy_cudadsyn(CSOUND *csound, void *pp);
48
init_cudadsyn(CSOUND * csound,CUDADSYN * p)49 static int init_cudadsyn(CSOUND *csound, CUDADSYN *p){
50
51 int asize, ipsize, fpsize, blockspt;
52 if(p->fsig->overlap > 1024)
53 return csound->InitError(csound, "overlap is too large\n");
54 cudaDeviceProp deviceProp;
55 cudaGetDeviceProperties(&deviceProp, 0);
56 blockspt = deviceProp.maxThreadsPerBlock;
57 if(deviceProp.major < 2)
58 return csound->InitError(csound,
59 "this opcode requires device capability 2.0 minimum. Device is %d.%d\n",
60 deviceProp.major, deviceProp.minor );
61
62 p->bins = (p->fsig->N)/2;
63
64 if(*p->inum > 0 && *p->inum < p->bins) p->bins = *p->inum;
65
66 p->vsamps = p->fsig->overlap;
67 p->threads = p->bins*p->vsamps;
68 p->blocks = p->threads > blockspt ? p->threads/blockspt : 1;
69 p->mthreads = p->bins;
70 p->mblocks = p->mthreads > blockspt ? p->mthreads/blockspt : 1;
71
72 p->threads /= p->blocks;
73 p->mthreads /= p->mblocks;
74
75 asize = p->vsamps*sizeof(float);
76 ipsize = p->fsig->N*sizeof(int64_t)/2;
77 fpsize = p->fsig->N*sizeof(float);
78
79 cudaMalloc(&p->out, asize);
80 cudaMalloc(&p->ndx, ipsize);
81 cudaMalloc(&p->frame, fpsize);
82 cudaMalloc(&p->previous, fpsize/2);
83 cudaMemset(p->previous, 0, fpsize/2);
84 cudaMemset(p->ndx, 0, ipsize);
85 cudaMemset(p->out, 0, sizeof(float)*p->vsamps);
86
87 asize = p->vsamps*sizeof(float);
88 if(p->out_.auxp == NULL ||
89 p->out_.size < asize)
90 csound->AuxAlloc(csound, asize , &p->out_);
91
92 csound->RegisterDeinitCallback(csound, p, destroy_cudadsyn);
93 p->count = 0;
94 return OK;
95 }
96
sample(float * out,float * frame,float pitch,int64_t * ph,float * amps,int vsize,float sr)97 __global__ void sample(float *out, float *frame, float pitch, int64_t *ph,
98 float *amps, int vsize, float sr) {
99
100 int t = (threadIdx.x + blockIdx.x*blockDim.x);
101 int n = t%vsize; /* sample index */
102 int h = t/vsize; /* bin index */
103 //if(amps[h] < 0.001) return;
104 int k = h<<1;
105 int64_t lph;
106 float a = amps[h], ascl = ((float)n)/vsize;
107 int64_t d = (int64_t) round(frame[k+1]*pitch*FMAXLEN/sr);
108 lph = ph[h] + n*d;
109 a += ascl*(frame[k] - a);
110 atomicAdd(&out[n], a*sinf((2*PI*lph)/FMAXLEN));
111 // if(n) return;
112 // ph[h] = (ph[h] + vsize*d) & PHMASK;
113 // amps[h] = frame[k];
114 }
115
update(float * frame,float * amps,int64_t * ph,float pitch,int vsize,float sr)116 __global__ void update(float *frame, float *amps,
117 int64_t *ph,float pitch, int vsize, float sr){
118
119 int h = threadIdx.x + blockIdx.x*blockDim.x;
120 int k = h << 1;
121 /* update phases and amps */
122 ph[h] = (ph[h] + (int64_t)(vsize*round(pitch*frame[k+1]*FMAXLEN/sr))) & PHMASK;
123 amps[h] = frame[k];
124 }
125
perf_cudadsyn(CSOUND * csound,CUDADSYN * p)126 static int perf_cudadsyn(CSOUND *csound, CUDADSYN *p){
127
128 uint32_t offset = p->h.insdshead->ksmps_offset;
129 uint32_t early = p->h.insdshead->ksmps_no_end;
130 uint32_t n, nsmps = CS_KSMPS;
131 float *out_ = (float *) p->out_.auxp;
132 MYFLT *asig = p->asig;
133 int count = p->count, vsamps = p->vsamps;
134 p->fp = (float *) (p->fsig->frame.auxp);
135
136 if (UNLIKELY(offset)) memset(asig, '\0', offset*sizeof(MYFLT));
137 if (UNLIKELY(early)) {
138 nsmps -= early;
139 memset(&asig[nsmps], '\0', early*sizeof(MYFLT));
140 }
141
142 for(n=offset; n < nsmps; n++){
143 if(count == 0) {
144 cudaMemcpy(out_,p->out,vsamps*sizeof(float),cudaMemcpyDeviceToHost);
145 update<<<p->mblocks,p->mthreads>>>(p->frame,
146 p->previous,
147 p->ndx,
148 *p->kfreq,
149 vsamps,
150 csound->GetSr(csound));
151 cudaMemset(p->out, 0, sizeof(float)*vsamps);
152 cudaMemcpy(p->frame,p->fp,sizeof(float)*p->bins*2,cudaMemcpyHostToDevice);
153 sample<<<p->blocks,p->threads>>>(p->out,p->frame,
154 *p->kfreq,
155 p->ndx,
156 p->previous,
157 vsamps,
158 csound->GetSr(csound));
159
160 count = vsamps;
161 }
162 asig[n] = (MYFLT) out_[vsamps - count];
163 count--;
164 }
165 p->count = count;
166 return OK;
167 }
168
destroy_cudadsyn(CSOUND * csound,void * pp)169 static int destroy_cudadsyn(CSOUND *csound, void *pp){
170 CUDADSYN *p = (CUDADSYN *) pp;
171 cudaFree(p->out);
172 cudaFree(p->ndx);
173 cudaFree(p->previous);
174 cudaFree(p->frame);
175 return OK;
176 }
177
178
179 static OENTRY localops[] = {
180 {"cudasynth", sizeof(CUDADSYN),0, 5, "a", "fkko", (SUBR) init_cudadsyn, NULL,
181 (SUBR) perf_cudadsyn}
182 };
183
184 extern "C" {
185 LINKAGE
186 }
187
188