1 // -*- c++ -*-
2 /* adsyn.cu
3    experimental cuda opcodes
4 
5   (c) Victor Lazzarini, 2013
6 
7   based on M Puckette's pitch tracking algorithm.
8 
9   This file is part of Csound.
10 
11   The Csound Library is free software; you can redistribute it
12   and/or modify it under the terms of the GNU Lesser General Public
13   License as published by the Free Software Foundation; either
14   version 2.1 of the License, or (at your option) any later version.
15 
16   Csound is distributed in the hope that it will be useful,
17   but WITHOUT ANY WARRANTY; without even the implied warranty of
18   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19   GNU Lesser General Public License for more details.
20 
21   You should have received a copy of the GNU Lesser General Public
22   License along with Csound; if not, write to the Free Software
23   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
24   02110-1301 USA
25 */
26 
27 #include <csdl.h>
28 #include <pstream.h>
29 
30 typedef struct cudadsyn_ {
31   OPDS h;
32   MYFLT *asig;
33   PVSDAT *fsig;
34   MYFLT *kamp, *kfreq;
35   MYFLT *inum;
36   float *out;
37   float *frame;
38   int64_t *ndx;
39   float *fp, *previous;
40   AUXCH out_;
41   int bins, blocks, threads;
42   int count;
43   int vsamps, mblocks, mthreads;
44   int framecount;
45 } CUDADSYN;
46 
47 static int destroy_cudadsyn(CSOUND *csound, void *pp);
48 
init_cudadsyn(CSOUND * csound,CUDADSYN * p)49 static int init_cudadsyn(CSOUND *csound, CUDADSYN *p){
50 
51   int asize, ipsize, fpsize, blockspt;
52   if(p->fsig->overlap > 1024)
53      return csound->InitError(csound, "overlap is too large\n");
54   cudaDeviceProp deviceProp;
55   cudaGetDeviceProperties(&deviceProp, 0);
56   blockspt = deviceProp.maxThreadsPerBlock;
57   if(deviceProp.major < 2)
58     return csound->InitError(csound,
59        "this opcode requires device capability 2.0 minimum. Device is %d.%d\n",
60         deviceProp.major, deviceProp.minor );
61 
62   p->bins = (p->fsig->N)/2;
63 
64   if(*p->inum > 0 && *p->inum < p->bins) p->bins = *p->inum;
65 
66   p->vsamps = p->fsig->overlap;
67   p->threads = p->bins*p->vsamps;
68   p->blocks = p->threads > blockspt ? p->threads/blockspt : 1;
69   p->mthreads = p->bins;
70   p->mblocks = p->mthreads >  blockspt ? p->mthreads/blockspt : 1;
71 
72   p->threads /= p->blocks;
73   p->mthreads /= p->mblocks;
74 
75   asize =  p->vsamps*sizeof(float);
76   ipsize = p->fsig->N*sizeof(int64_t)/2;
77   fpsize = p->fsig->N*sizeof(float);
78 
79   cudaMalloc(&p->out, asize);
80   cudaMalloc(&p->ndx, ipsize);
81   cudaMalloc(&p->frame, fpsize);
82   cudaMalloc(&p->previous, fpsize/2);
83   cudaMemset(p->previous, 0, fpsize/2);
84   cudaMemset(p->ndx, 0, ipsize);
85   cudaMemset(p->out, 0, sizeof(float)*p->vsamps);
86 
87   asize = p->vsamps*sizeof(float);
88   if(p->out_.auxp == NULL ||
89      p->out_.size < asize)
90     csound->AuxAlloc(csound, asize , &p->out_);
91 
92   csound->RegisterDeinitCallback(csound, p, destroy_cudadsyn);
93   p->count = 0;
94   return OK;
95 }
96 
sample(float * out,float * frame,float pitch,int64_t * ph,float * amps,int vsize,float sr)97 __global__ void sample(float *out, float *frame, float pitch, int64_t *ph,
98                        float *amps, int vsize, float sr) {
99 
100   int t = (threadIdx.x + blockIdx.x*blockDim.x);
101   int n =  t%vsize;  /* sample index */
102   int h = t/vsize;  /* bin index */
103   //if(amps[h] < 0.001) return;
104   int k = h<<1;
105   int64_t lph;
106   float a = amps[h], ascl = ((float)n)/vsize;
107   int64_t d =  (int64_t) round(frame[k+1]*pitch*FMAXLEN/sr);
108   lph = ph[h] + n*d;
109   a += ascl*(frame[k] - a);
110   atomicAdd(&out[n], a*sinf((2*PI*lph)/FMAXLEN));
111    // if(n) return;
112    // ph[h]  =  (ph[h] + vsize*d) & PHMASK;
113    // amps[h] = frame[k];
114 }
115 
update(float * frame,float * amps,int64_t * ph,float pitch,int vsize,float sr)116 __global__ void update(float *frame, float *amps,
117       int64_t *ph,float pitch, int vsize, float sr){
118 
119  int h = threadIdx.x + blockIdx.x*blockDim.x;
120  int k = h << 1;
121  /* update phases and amps */
122  ph[h]  = (ph[h] + (int64_t)(vsize*round(pitch*frame[k+1]*FMAXLEN/sr))) & PHMASK;
123  amps[h] = frame[k];
124 }
125 
perf_cudadsyn(CSOUND * csound,CUDADSYN * p)126 static int perf_cudadsyn(CSOUND *csound, CUDADSYN *p){
127 
128   uint32_t offset = p->h.insdshead->ksmps_offset;
129   uint32_t early  = p->h.insdshead->ksmps_no_end;
130   uint32_t n, nsmps = CS_KSMPS;
131   float *out_ = (float *) p->out_.auxp;
132   MYFLT      *asig = p->asig;
133   int count = p->count,  vsamps = p->vsamps;
134   p->fp = (float *) (p->fsig->frame.auxp);
135 
136   if (UNLIKELY(offset)) memset(asig, '\0', offset*sizeof(MYFLT));
137   if (UNLIKELY(early)) {
138     nsmps -= early;
139     memset(&asig[nsmps], '\0', early*sizeof(MYFLT));
140    }
141 
142   for(n=offset; n < nsmps; n++){
143     if(count == 0) {
144       cudaMemcpy(out_,p->out,vsamps*sizeof(float),cudaMemcpyDeviceToHost);
145       update<<<p->mblocks,p->mthreads>>>(p->frame,
146                                             p->previous,
147                                              p->ndx,
148                                              *p->kfreq,
149                                              vsamps,
150                                              csound->GetSr(csound));
151       cudaMemset(p->out, 0, sizeof(float)*vsamps);
152       cudaMemcpy(p->frame,p->fp,sizeof(float)*p->bins*2,cudaMemcpyHostToDevice);
153       sample<<<p->blocks,p->threads>>>(p->out,p->frame,
154                                                *p->kfreq,
155                                                 p->ndx,
156                                                 p->previous,
157                                                 vsamps,
158                                                 csound->GetSr(csound));
159 
160       count = vsamps;
161     }
162     asig[n] = (MYFLT) out_[vsamps - count];
163     count--;
164   }
165   p->count = count;
166   return OK;
167 }
168 
destroy_cudadsyn(CSOUND * csound,void * pp)169 static int destroy_cudadsyn(CSOUND *csound, void *pp){
170   CUDADSYN *p = (CUDADSYN *) pp;
171   cudaFree(p->out);
172   cudaFree(p->ndx);
173   cudaFree(p->previous);
174   cudaFree(p->frame);
175   return OK;
176 }
177 
178 
179 static OENTRY localops[] = {
180   {"cudasynth", sizeof(CUDADSYN),0, 5, "a", "fkko", (SUBR) init_cudadsyn, NULL,
181    (SUBR) perf_cudadsyn}
182 };
183 
184 extern "C" {
185   LINKAGE
186 }
187 
188