1 // -*- c++ -*-
2 /* adsyn11.cu
3   (c) Victor Lazzarini, 2013
4 
5   based on M Puckette's pitch tracking algorithm.
6 
7   This file is part of Csound.
8 
9   The Csound Library is free software; you can redistribute it
10   and/or modify it under the terms of the GNU Lesser General Public
11   License as published by the Free Software Foundation; either
12   version 2.1 of the License, or (at your option) any later version.
13 
14   Csound is distributed in the hope that it will be useful,
15   but WITHOUT ANY WARRANTY; without even the implied warranty of
16   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   GNU Lesser General Public License for more details.
18 
19   You should have received a copy of the GNU Lesser General Public
20   License along with Csound; if not, write to the Free Software
21   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
22   02110-1301 USA
23 */
24 
25 #include <csdl.h>
26 #include <pstream.h>
27 
28 typedef struct cudadsyn_ {
29   OPDS h;
30   MYFLT *asig;
31   PVSDAT *fsig;
32   MYFLT *kamp, *kfreq;
33   MYFLT *inum;
34   float *out;
35   float *frame;
36   int64_t *ndx;
37   float *fp, *previous;
38   AUXCH out_;
39   int bins, blocks, threads;
40   int count;
41   int vsamps, mblocks, mthreads;
42   int framecount;
43 } CUDADSYN;
44 
45 static int destroy_cudadsyn(CSOUND *csound, void *pp);
46 
init_cudadsyn(CSOUND * csound,CUDADSYN * p)47 static int init_cudadsyn(CSOUND *csound, CUDADSYN *p){
48 
49   int asize, ipsize, fpsize, blockspt;
50   if(p->fsig->overlap > 1024)
51      return csound->InitError(csound, "overlap is too large\n");
52   cudaDeviceProp deviceProp;
53   cudaGetDeviceProperties(&deviceProp, 0);
54   blockspt = deviceProp.maxThreadsPerBlock;
55   if(deviceProp.major < 1)
56    csound->InitError(csound,
57 		     "this opcode requires device capability 1.0 minimum. Device is %d.%d\n",
58         deviceProp.major, deviceProp.minor );
59 
60   p->bins = (p->fsig->N)/2;
61 
62   if(*p->inum > 0 && *p->inum < p->bins) p->bins = *p->inum;
63 
64   p->vsamps = p->fsig->overlap;
65   p->threads = p->bins*p->vsamps;
66   p->blocks = p->threads > blockspt ? p->threads/blockspt : 1;
67   p->mthreads = p->bins;
68   p->mblocks = p->mthreads >  blockspt ? p->mthreads/blockspt : 1;
69 
70   p->threads /= p->blocks;
71   p->mthreads /= p->mblocks;
72 
73   asize =  p->vsamps*p->bins*sizeof(float);
74   ipsize = p->fsig->N*sizeof(int64_t)/2;
75   fpsize = p->fsig->N*sizeof(float);
76 
77   cudaMalloc(&p->out, asize);
78   cudaMalloc(&p->ndx, ipsize);
79   cudaMalloc(&p->frame, fpsize);
80   cudaMalloc(&p->previous, fpsize/2);
81   cudaMemset(p->previous, 0, fpsize/2);
82   cudaMemset(p->ndx, 0, ipsize);
83 
84   asize = p->vsamps*sizeof(float);
85   if(p->out_.auxp == NULL ||
86      p->out_.size < asize)
87     csound->AuxAlloc(csound, asize , &p->out_);
88 
89   csound->RegisterDeinitCallback(csound, p, destroy_cudadsyn);
90   p->count = 0;
91   return OK;
92 }
93 
sample(float * out,float * frame,float pitch,int64_t * ph,float * amps,int bins,int vsize,float sr)94 __global__ void sample(float *out, float *frame, float pitch, int64_t *ph,
95                        float *amps, int bins, int vsize, float sr) {
96 
97   int t = (threadIdx.x + blockIdx.x*blockDim.x);
98   int n =  t%vsize;  /* sample index */
99   int h = t/vsize;  /* bin index */
100   int k = h<<1;
101   int64_t lph;
102   float a = amps[h], ascl = ((float)n)/vsize;
103   float fscal = pitch*FMAXLEN/sr;
104   lph = (ph[h] + (int64_t)(n*round(frame[k+1]*fscal))) & PHMASK;
105   a += ascl*(frame[k] - a);
106   out[t] = a*sinf((2*PI*lph)/FMAXLEN);
107   if(t >= vsize) return;
108   __syncthreads();
109   for(int i=vsize; i < vsize*bins; i+=vsize)
110     out[t] += out[t + i];
111 }
112 
update(float * frame,float * amps,int64_t * ph,float pitch,int vsize,float sr)113 __global__ void update(float *frame, float *amps,
114       int64_t *ph,float pitch, int vsize, float sr){
115 
116  int h = threadIdx.x + blockIdx.x*blockDim.x;
117  int k = h << 1;
118  /* update phases and amps */
119  ph[h]  = (ph[h] + (int64_t)(vsize*round(pitch*frame[k+1]*FMAXLEN/sr))) & PHMASK;
120  amps[h] = frame[k];
121 }
122 
perf_cudadsyn(CSOUND * csound,CUDADSYN * p)123 static int perf_cudadsyn(CSOUND *csound, CUDADSYN *p){
124 
125   uint32_t offset = p->h.insdshead->ksmps_offset;
126   uint32_t early  = p->h.insdshead->ksmps_no_end;
127   uint32_t n, nsmps = CS_KSMPS;
128   float *out_ = (float *) p->out_.auxp;
129   MYFLT      *asig = p->asig;
130   int count = p->count,  vsamps = p->vsamps;
131   p->fp = (float *) (p->fsig->frame.auxp);
132 
133   if (UNLIKELY(offset)) memset(asig, '\0', offset*sizeof(MYFLT));
134   if (UNLIKELY(early)) {
135     nsmps -= early;
136     memset(&asig[nsmps], '\0', early*sizeof(MYFLT));
137    }
138 
139   for(n=offset; n < nsmps; n++){
140     if(count == 0) {
141       cudaMemset(p->out, 0, sizeof(float)*vsamps);
142       cudaMemcpy(p->frame,p->fp,sizeof(float)*p->bins*2,cudaMemcpyHostToDevice);
143       sample<<<p->blocks,p->threads>>>(p->out,p->frame,
144                                                *p->kfreq,
145                                                 p->ndx,
146                                                 p->previous,
147                                                 p->bins,
148                                                 vsamps,
149                                                 csound->GetSr(csound));
150        if (cudaDeviceSynchronize() != cudaSuccess)
151        csound->Message(csound,"Cuda error: Failed to synchronize\n");
152        update<<<p->mblocks,p->mthreads>>>(p->frame,
153                                            p->previous,
154                                             p->ndx,
155                                             *p->kfreq,
156                                             vsamps,
157                                             csound->GetSr(csound));
158       cudaMemcpy(out_,p->out,vsamps*sizeof(float),cudaMemcpyDeviceToHost);
159       count = vsamps;
160     }
161     asig[n] = (MYFLT) out_[vsamps - count];
162     count--;
163   }
164   p->count = count;
165   return OK;
166 }
167 
destroy_cudadsyn(CSOUND * csound,void * pp)168 static int destroy_cudadsyn(CSOUND *csound, void *pp){
169   CUDADSYN *p = (CUDADSYN *) pp;
170   cudaFree(p->out);
171   cudaFree(p->ndx);
172   cudaFree(p->previous);
173   cudaFree(p->frame);
174   return OK;
175 }
176 
177 
178 static OENTRY localops[] = {
179   {"cudasynth", sizeof(CUDADSYN),0, 5, "a", "fkko", (SUBR) init_cudadsyn, NULL,
180    (SUBR) perf_cudadsyn}
181 };
182 
183 extern "C" {
184   LINKAGE
185 }
186