1 // -*- c++ -*-
2 /* adsyn11.cu
3 (c) Victor Lazzarini, 2013
4
5 based on M Puckette's pitch tracking algorithm.
6
7 This file is part of Csound.
8
9 The Csound Library is free software; you can redistribute it
10 and/or modify it under the terms of the GNU Lesser General Public
11 License as published by the Free Software Foundation; either
12 version 2.1 of the License, or (at your option) any later version.
13
14 Csound is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU Lesser General Public License for more details.
18
19 You should have received a copy of the GNU Lesser General Public
20 License along with Csound; if not, write to the Free Software
21 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
22 02110-1301 USA
23 */
24
25 #include <csdl.h>
26 #include <pstream.h>
27
28 typedef struct cudadsyn_ {
29 OPDS h;
30 MYFLT *asig;
31 PVSDAT *fsig;
32 MYFLT *kamp, *kfreq;
33 MYFLT *inum;
34 float *out;
35 float *frame;
36 int64_t *ndx;
37 float *fp, *previous;
38 AUXCH out_;
39 int bins, blocks, threads;
40 int count;
41 int vsamps, mblocks, mthreads;
42 int framecount;
43 } CUDADSYN;
44
45 static int destroy_cudadsyn(CSOUND *csound, void *pp);
46
init_cudadsyn(CSOUND * csound,CUDADSYN * p)47 static int init_cudadsyn(CSOUND *csound, CUDADSYN *p){
48
49 int asize, ipsize, fpsize, blockspt;
50 if(p->fsig->overlap > 1024)
51 return csound->InitError(csound, "overlap is too large\n");
52 cudaDeviceProp deviceProp;
53 cudaGetDeviceProperties(&deviceProp, 0);
54 blockspt = deviceProp.maxThreadsPerBlock;
55 if(deviceProp.major < 1)
56 csound->InitError(csound,
57 "this opcode requires device capability 1.0 minimum. Device is %d.%d\n",
58 deviceProp.major, deviceProp.minor );
59
60 p->bins = (p->fsig->N)/2;
61
62 if(*p->inum > 0 && *p->inum < p->bins) p->bins = *p->inum;
63
64 p->vsamps = p->fsig->overlap;
65 p->threads = p->bins*p->vsamps;
66 p->blocks = p->threads > blockspt ? p->threads/blockspt : 1;
67 p->mthreads = p->bins;
68 p->mblocks = p->mthreads > blockspt ? p->mthreads/blockspt : 1;
69
70 p->threads /= p->blocks;
71 p->mthreads /= p->mblocks;
72
73 asize = p->vsamps*p->bins*sizeof(float);
74 ipsize = p->fsig->N*sizeof(int64_t)/2;
75 fpsize = p->fsig->N*sizeof(float);
76
77 cudaMalloc(&p->out, asize);
78 cudaMalloc(&p->ndx, ipsize);
79 cudaMalloc(&p->frame, fpsize);
80 cudaMalloc(&p->previous, fpsize/2);
81 cudaMemset(p->previous, 0, fpsize/2);
82 cudaMemset(p->ndx, 0, ipsize);
83
84 asize = p->vsamps*sizeof(float);
85 if(p->out_.auxp == NULL ||
86 p->out_.size < asize)
87 csound->AuxAlloc(csound, asize , &p->out_);
88
89 csound->RegisterDeinitCallback(csound, p, destroy_cudadsyn);
90 p->count = 0;
91 return OK;
92 }
93
sample(float * out,float * frame,float pitch,int64_t * ph,float * amps,int bins,int vsize,float sr)94 __global__ void sample(float *out, float *frame, float pitch, int64_t *ph,
95 float *amps, int bins, int vsize, float sr) {
96
97 int t = (threadIdx.x + blockIdx.x*blockDim.x);
98 int n = t%vsize; /* sample index */
99 int h = t/vsize; /* bin index */
100 int k = h<<1;
101 int64_t lph;
102 float a = amps[h], ascl = ((float)n)/vsize;
103 float fscal = pitch*FMAXLEN/sr;
104 lph = (ph[h] + (int64_t)(n*round(frame[k+1]*fscal))) & PHMASK;
105 a += ascl*(frame[k] - a);
106 out[t] = a*sinf((2*PI*lph)/FMAXLEN);
107 if(t >= vsize) return;
108 __syncthreads();
109 for(int i=vsize; i < vsize*bins; i+=vsize)
110 out[t] += out[t + i];
111 }
112
update(float * frame,float * amps,int64_t * ph,float pitch,int vsize,float sr)113 __global__ void update(float *frame, float *amps,
114 int64_t *ph,float pitch, int vsize, float sr){
115
116 int h = threadIdx.x + blockIdx.x*blockDim.x;
117 int k = h << 1;
118 /* update phases and amps */
119 ph[h] = (ph[h] + (int64_t)(vsize*round(pitch*frame[k+1]*FMAXLEN/sr))) & PHMASK;
120 amps[h] = frame[k];
121 }
122
perf_cudadsyn(CSOUND * csound,CUDADSYN * p)123 static int perf_cudadsyn(CSOUND *csound, CUDADSYN *p){
124
125 uint32_t offset = p->h.insdshead->ksmps_offset;
126 uint32_t early = p->h.insdshead->ksmps_no_end;
127 uint32_t n, nsmps = CS_KSMPS;
128 float *out_ = (float *) p->out_.auxp;
129 MYFLT *asig = p->asig;
130 int count = p->count, vsamps = p->vsamps;
131 p->fp = (float *) (p->fsig->frame.auxp);
132
133 if (UNLIKELY(offset)) memset(asig, '\0', offset*sizeof(MYFLT));
134 if (UNLIKELY(early)) {
135 nsmps -= early;
136 memset(&asig[nsmps], '\0', early*sizeof(MYFLT));
137 }
138
139 for(n=offset; n < nsmps; n++){
140 if(count == 0) {
141 cudaMemset(p->out, 0, sizeof(float)*vsamps);
142 cudaMemcpy(p->frame,p->fp,sizeof(float)*p->bins*2,cudaMemcpyHostToDevice);
143 sample<<<p->blocks,p->threads>>>(p->out,p->frame,
144 *p->kfreq,
145 p->ndx,
146 p->previous,
147 p->bins,
148 vsamps,
149 csound->GetSr(csound));
150 if (cudaDeviceSynchronize() != cudaSuccess)
151 csound->Message(csound,"Cuda error: Failed to synchronize\n");
152 update<<<p->mblocks,p->mthreads>>>(p->frame,
153 p->previous,
154 p->ndx,
155 *p->kfreq,
156 vsamps,
157 csound->GetSr(csound));
158 cudaMemcpy(out_,p->out,vsamps*sizeof(float),cudaMemcpyDeviceToHost);
159 count = vsamps;
160 }
161 asig[n] = (MYFLT) out_[vsamps - count];
162 count--;
163 }
164 p->count = count;
165 return OK;
166 }
167
destroy_cudadsyn(CSOUND * csound,void * pp)168 static int destroy_cudadsyn(CSOUND *csound, void *pp){
169 CUDADSYN *p = (CUDADSYN *) pp;
170 cudaFree(p->out);
171 cudaFree(p->ndx);
172 cudaFree(p->previous);
173 cudaFree(p->frame);
174 return OK;
175 }
176
177
178 static OENTRY localops[] = {
179 {"cudasynth", sizeof(CUDADSYN),0, 5, "a", "fkko", (SUBR) init_cudadsyn, NULL,
180 (SUBR) perf_cudadsyn}
181 };
182
183 extern "C" {
184 LINKAGE
185 }
186