1 #include <cuda_runtime.h>
2 #include <curand.h>
3 #include <cublas_v2.h>
4 
5 #include "convolutional_layer.h"
6 #include "batchnorm_layer.h"
7 #include "gemm.h"
8 #include "blas.h"
9 #include "im2col.h"
10 #include "col2im.h"
11 #include "utils.h"
12 #include "dark_cuda.h"
13 #include "box.h"
14 
15 
__global__ void binarize_kernel(float *x, int n, float *binary)
{
18     int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
19     if (i >= n) return;
20     binary[i] = (x[i] >= 0) ? 1 : -1;
21 }
22 
void binarize_gpu(float *x, int n, float *binary)
{
25     binarize_kernel<<<cuda_gridsize(n), BLOCK, 0, get_cuda_stream() >>>(x, n, binary);
26     CHECK_CUDA(cudaPeekAtLastError());
27 }
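
/*
 Sign binarization: every element becomes +1 or -1, with no scaling factor.
 A minimal usage sketch (it mirrors the XNOR input-binarization call made later in
 forward_convolutional_layer_gpu(); `l` and `state` are assumed to be a valid layer
 and network_state):

     binarize_gpu(state.input, l.c * l.h * l.w * l.batch, l.binary_input_gpu);

 The binarize_weights_* functions below keep a per-filter scale instead of raw +/-1.
*/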
28 
__global__ void binarize_input_kernel(float *input, int n, int size, float *binary)
{
31     int s = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
32     if (s >= size) return;
33     int i = 0;
34     float mean = 0;
35     for(i = 0; i < n; ++i){
36         mean += fabs(input[i*size + s]);
37     }
38     mean = mean / n;
39     for(i = 0; i < n; ++i){
40         binary[i*size + s] = (input[i*size + s] > 0) ? mean : -mean;
41     }
42 }
43 
void binarize_input_gpu(float *input, int n, int size, float *binary)
{
46     binarize_input_kernel<<<cuda_gridsize(size), BLOCK, 0, get_cuda_stream() >>>(input, n, size, binary);
47     CHECK_CUDA(cudaPeekAtLastError());
48 }
49 
__global__ void binarize_weights_kernel(float *weights, int n, int size, float *binary)
{
52     int f = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
53     if (f >= n) return;
54     int i = 0;
55     float mean = 0;
56     for (i = 0; i < size; ++i) {
57         mean += fabs(weights[f*size + i]);
58     }
59     mean = mean / size;
60     for (i = 0; i < size; ++i) {
61         binary[f*size + i] = (weights[f*size + i] > 0) ? mean : -mean;
62         //binary[f*size + i] = weights[f*size + i];
63     }
64 }
65 
void binarize_weights_gpu(float *weights, int n, int size, float *binary)
{
68     binarize_weights_kernel << <cuda_gridsize(n), BLOCK, 0, get_cuda_stream() >> >(weights, n, size, binary);
69     CHECK_CUDA(cudaPeekAtLastError());
70 }
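
/*
 XNOR-Net style weight binarization: for filter f the scale is the mean absolute
 weight of that filter,

     mean_f = (1 / size) * sum_i |weights[f*size + i]|

 and every binary weight becomes +mean_f or -mean_f according to the sign of the
 original weight, so each filter keeps its average magnitude while the weights
 themselves carry only one bit of information.
*/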
71 
72 
__global__ void set_zero_kernel(float *src, int size)
{
75     int i = blockIdx.x * blockDim.x + threadIdx.x;
76     if (i < size) src[i] = 0;
77 }
78 
__inline__ __device__
float warpAllReduceSum(float val) {
81     for (int mask = WARP_SIZE / 2; mask > 0; mask /= 2)
82 #if CUDART_VERSION >= 9000
83         val += __shfl_xor_sync(0xffffffff, val, mask);
84 #else
85         val += __shfl_xor(val, mask);
86 #endif
87     return val;
88 }
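
/*
 Butterfly (XOR) all-reduce across a warp: with WARP_SIZE == 32 the loop uses masks
 16, 8, 4, 2, 1, and after these five shuffle steps every lane holds the sum of all
 32 lane values (not just lane 0, hence "AllReduce"). For example, at mask 16 lane 3
 exchanges with lane 19, at mask 8 with lane 11, and so on.
*/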
89 
90 // only if (size % 32 == 0)
__global__ void reduce_kernel(float *weights, int n, int size, float *mean_arr_gpu)
{
93     int i = blockIdx.x * blockDim.x + threadIdx.x;
94     int f = i / size;
95     if (f >= n) return;
96     float warp_mean = warpAllReduceSum(fabs(weights[i]));
97     if(i % 32 == 0)
98         atomicAdd(&mean_arr_gpu[f], warp_mean / size);
99 }
100 
__global__ void binarize_weights_mean_kernel(float *weights, int n, int size, float *binary, float *mean_arr_gpu)
{
103     int i = blockIdx.x * blockDim.x + threadIdx.x;
104     int f = i / size;
105     if (f >= n) return;
106     float mean = mean_arr_gpu[f];
107     binary[i] = (weights[i] > 0) ? mean : -mean;
108 }
109 
void fast_binarize_weights_gpu(float *weights, int n, int size, float *binary, float *mean_arr_gpu)
{
112     if (size % 32 == 0) {
113         size_t gridsize = n * size;
114         const int num_blocks = get_number_of_blocks(gridsize, BLOCK);// gridsize / BLOCK + 1;
115 
116         set_zero_kernel << <(n/BLOCK + 1), BLOCK, 0, get_cuda_stream() >> > (mean_arr_gpu, n);
117         reduce_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (weights, n, size, mean_arr_gpu);
118         binarize_weights_mean_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (weights, n, size, binary, mean_arr_gpu);
119         CHECK_CUDA(cudaPeekAtLastError());
120     }
121     else {
122         binarize_weights_gpu(weights, n, size, binary);
123     }
124 }
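
/*
 Fast path (size % 32 == 0): three launches on the same stream -
   1. set_zero_kernel clears the per-filter means,
   2. reduce_kernel accumulates sum(|w|) / size per filter (one atomicAdd per warp),
   3. binarize_weights_mean_kernel rewrites every weight as +/- the mean of its filter.
 A sketch of the call, mirroring the XNOR path in forward_convolutional_layer_gpu():

     fast_binarize_weights_gpu(l.weights_gpu, l.n,
                               (l.c / l.groups) * l.size * l.size,
                               l.binary_weights_gpu, l.mean_arr_gpu);

 Filters whose weight count is not a multiple of 32 fall back to binarize_weights_gpu().
*/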
125 
126 
__global__ void cuda_f32_to_f16(float* input_f32, size_t size, half *output_f16)
{
129     int idx = blockIdx.x * blockDim.x + threadIdx.x;
130     if (idx < size) output_f16[idx] = __float2half(input_f32[idx]);
131     //if (idx < size) output_f16[idx] = __float2half_rn(input_f32[idx]); // can't be compiled on Linux without casting
132     // __float2half_ru, __float2half_rd, __float2half_rz, __float2half_rn
133     //if (idx < size) *((unsigned short *)output_f16 + idx) = __float2half(input_f32[idx]);
134 }
135 
void cuda_convert_f32_to_f16(float* input_f32, size_t size, float *output_f16) {
137     cuda_f32_to_f16 <<< get_number_of_blocks(size, BLOCK), BLOCK, 0, get_cuda_stream() >>> (input_f32, size, (half *)output_f16);
138     CHECK_CUDA(cudaPeekAtLastError());
139 }
140 
__global__ void cuda_f16_to_f32(half* input_f16, size_t size, float *output_f32)
{
143     int idx = blockIdx.x * blockDim.x + threadIdx.x;
144     if (idx < size) output_f32[idx] = __half2float(input_f16[idx]);
145     //if (idx < size) output_f32[idx] = __half2float(*((unsigned short *)input_f16 + idx));
146 }
147 
void cuda_convert_f16_to_f32(float* input_f16, size_t size, float *output_f32) {
149     cuda_f16_to_f32 <<< get_number_of_blocks(size, BLOCK), BLOCK, 0, get_cuda_stream() >>> ((half *)input_f16, size, output_f32);
150     CHECK_CUDA(cudaPeekAtLastError());
151 }
152 
half *cuda_make_f16_from_f32_array(float *src, size_t n)
{
155     half *dst16;
156     size_t size = sizeof(half)*n;
157     CHECK_CUDA(cudaMalloc((void **)&dst16, size));
158     if (src) {
159         assert(n > 0);
160         cuda_convert_f32_to_f16(src, n, (float *)dst16);
161     }
162     if (!dst16) error("Cuda malloc failed\n");
163     return dst16;
164 }
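
/*
 FP16 helpers: half buffers are passed around behind float* pointers and cast to
 half* only inside these wrappers, so callers keep a single pointer type.
 A small round-trip sketch with hypothetical device buffers src32, tmp16 and dst32
 of n elements each:

     cuda_convert_f32_to_f16(src32, n, tmp16);   // float -> half
     cuda_convert_f16_to_f32(tmp16, n, dst32);   // half  -> float
*/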
165 
void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
{
168     //fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1);
169     if(l.binary){
170         binarize_weights_gpu(l.weights_gpu, l.n, (l.c / l.groups)*l.size*l.size, l.binary_weights_gpu);
171         swap_binary(&l);
172     }
173 
174     if(l.xnor){
175         if (!l.align_bit_weights_gpu || state.train) {
176             //binarize_weights_gpu(l.weights_gpu, l.n, (l.c / l.groups)*l.size*l.size, l.binary_weights_gpu);
177 
178             fast_binarize_weights_gpu(l.weights_gpu, l.n, (l.c / l.groups)*l.size*l.size, l.binary_weights_gpu, l.mean_arr_gpu);
179         }
180 
181         if (l.align_bit_weights_gpu && !state.train && l.c >= 32 && l.stride_x == l.stride_y)
182         {
183             //return;
184             //cudaError_t status = cudaSuccess;
185             //int input_size = l.c*l.h*l.w*l.batch;
186 
187             int m = l.n / l.groups;
188             int k = l.size*l.size*l.c / l.groups;
189             int n = l.out_w*l.out_h;
190             //float * a = l.weights_gpu;
191 
192             // int i, j;
193             // for(i = 0; i < l.batch; ++i){
194             // for (j = 0; j < l.groups; ++j) {
195 
196             int ldb_align = l.lda_align;
197             size_t new_ldb = k + (ldb_align - k%ldb_align); // (k / 8 + 1) * 8;
198             //size_t t_intput_size = new_ldb * n;
199             //size_t t_bit_input_size = t_intput_size / 8;// +1;
200 
201             if (l.c % 32 == 0)
202             {
203                 //printf("\n\n l.index = %d, l.w = %d, l.c = %d, l.n = %d, l.stride = %d, l.pad = %d - new XNOR \n", l.index, l.w, l.c, l.n, l.stride, l.pad);
204                 //printf("l.align_workspace_size = %d, (l.c * l.w * l.h)  = %d \n", l.align_workspace_size, (l.c * l.w * l.h));
205 
206                 //float *intput_cpu = (float *)calloc(l.inputs, sizeof(float));
207                 // state.input
208                 //cudaMemcpy(intput_cpu, state.input, l.inputs * sizeof(float), cudaMemcpyDefault);
209 
210                 int ldb_align = l.lda_align;
211                 size_t new_ldb = k + (ldb_align - k%ldb_align); // (k / 8 + 1) * 8;
212                 //size_t t_intput_size = new_ldb * l.bit_align;// n;
213                 //size_t t_bit_input_size = t_intput_size / 8;// +1;
214 
215                 const int new_c = l.c / 32;
216 
217                 //float *re_packed_input = (float *)calloc(l.c * l.w * l.h, sizeof(float));
218                 //uint32_t *bin_re_packed_input = (uint32_t *)calloc(new_c * l.w * l.h + 1, sizeof(uint32_t));
219 
220                 // float32x4 by channel (as in cuDNN)
221                 //repack_input(intput_cpu, re_packed_input, l.w, l.h, l.c);
222 
223 
224                 // 32 x floats -> 1 x uint32_t
225                 //float_to_bit(re_packed_input, (uint8_t *)bin_re_packed_input, l.c * l.w * l.h);
226 
227                 //cudaDeviceSynchronize();
228                 //start_timer();
229 
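                // Repack the NCHW float input into bit-planes: each group of 32 consecutive
                // channels at a given (x, y) collapses into one uint32_t (one bit per channel,
                // taken from the sign of the activation), which is why new_c = l.c / 32 above.
                // The packed tensor then goes through a regular im2col and the bitwise GEMM below.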
230                 repack_input_gpu_bin(state.input, (uint32_t *)l.align_workspace_gpu, l.w, l.h, l.c);
231 
232                 //repack_input_gpu(state.input, state.workspace, l.w, l.h, l.c);
233 
234                 // 32 x floats -> 1 x uint32_t
235                 //float_to_bit_gpu(state.workspace, (unsigned char *)l.align_workspace_gpu, l.c * l.w * l.h);// l.align_workspace_size);
236 
237                 //cudaDeviceSynchronize();
238                 //stop_timer_and_show_name("repack_input_gpu + float_to_bit_gpu");
239 
240                 //free(re_packed_input);
241 
                // slow - convolve the packed inputs and weights: float x 32 by channel (as in cuDNN)
243                 //convolution_repacked((uint32_t *)bin_re_packed_input, (uint32_t *)l.align_bit_weights, l.output,
244                 //    l.w, l.h, l.c, l.n, l.size, l.pad, l.new_lda, l.mean_arr);
245 
246                 // // then exit from if()
247 
248                 //float *b = state.workspace;
249                 //float *b = (float *)calloc(100 * 1024 * 1024, sizeof(float));
250                 //float *c = l.output;
251                 //memset(c, 0, l.outputs * sizeof(float));
252 
253 
254                 //im2col_cpu_custom((float *)bin_re_packed_input, new_c, l.h, l.w, l.size, l.stride, l.pad, b);
255 
256                 //cudaMemcpy(l.align_workspace_gpu, bin_re_packed_input, (new_c * l.w * l.h + 1) * sizeof(uint32_t), cudaMemcpyDefault);
257 
258                 //start_timer();
259                 im2col_ongpu(l.align_workspace_gpu, new_c, l.h, l.w, l.size, l.stride, l.pad, state.workspace);
260                 //cudaDeviceSynchronize();
261                 //stop_timer_and_show_name("im2col_ongpu");
262 
263                 //free(bin_re_packed_input);
264 
265                 int new_k = l.size*l.size*l.c / 32;
266 
267                 // good for (l.c == 64)
268                 //gemm_nn_bin_32bit_packed(m, n, new_k, 1,
269                 //    l.align_bit_weights, l.new_lda/32,
270                 //    b, n,
271                 //    c, n, l.mean_arr);
272 
273                 // // then exit from if()
274 
275 
276                 //size_t new_ldb = k + (ldb_align - k%ldb_align); // (k / 8 + 1) * 8;
277                 //size_t t_intput_size = new_ldb * l.bit_align;// n;
278                 //size_t t_bit_input_size = t_intput_size / 8;// +1;
279 
280                 //char *t_bit_input = (char *)calloc(t_bit_input_size, sizeof(char));
281                 //transpose_uint32((uint32_t *)b, (uint32_t *)t_bit_input, new_k, n, n, new_ldb);
282                 //cudaMemcpy(l.transposed_align_workspace_gpu, t_bit_input, t_bit_input_size * sizeof(char), cudaMemcpyDefault);
283 
284                 //cudaMemcpy(state.workspace, b, t_bit_input_size * sizeof(char), cudaMemcpyDefault);
285                 //printf("\n n = %d, n % 32 = %d, new_ldb = %d, new_ldb % 32 = %d \n", n, n % 32, new_ldb, new_ldb % 32);
286 
287                 //start_timer();
288                 transpose_uint32_gpu((uint32_t *)state.workspace, (uint32_t *)l.transposed_align_workspace_gpu, new_k, n, n, new_ldb);
289                 //cudaDeviceSynchronize();
290                 //stop_timer_and_show_name("transpose_uint32_gpu");
291 
292                 //cudaDeviceSynchronize();
293                 //stop_timer_and_show_name("repack_input_gpu_bin + im2col_ongpu + transpose_uint32_gpu_2");
294 
295                 //start_timer();
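                // Bitwise GEMM: both the weights and the transposed im2col buffer are bit-packed,
                // so each inner-product word becomes XNOR + popcount, rescaled by the per-filter
                // means in mean_arr_gpu. The kernel also fuses the bias addition and, when the
                // activation is LEAKY, the activation itself - which is why LEAKY (and LINEAR)
                // are skipped in the activation dispatch at the end of this branch.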
296                 gemm_nn_custom_bin_mean_transposed_gpu(m, n, k,
297                     (unsigned char *)l.align_bit_weights_gpu, new_ldb, (unsigned char *)l.transposed_align_workspace_gpu,
298                     new_ldb, l.output_gpu, n, l.mean_arr_gpu, l.biases_gpu, l.activation == LEAKY,
299                     l.bin_conv_shortcut_in_gpu, l.bin_conv_shortcut_out_gpu);
300                 //cudaDeviceSynchronize();
301                 //stop_timer_and_show_name("gemm_nn_custom_bin_mean_transposed_gpu");
302 
303 
304                 // the main GEMM function
305                 //gemm_nn_custom_bin_mean_transposed(m, n, k, 1, (uint8_t *)l.align_bit_weights, new_ldb, (uint8_t *)t_bit_input, new_ldb, c, n, l.mean_arr);
306 
307                 //add_bias(l.output, l.biases, l.batch, l.n, l.out_h*l.out_w);
308 
309                 //cudaMemcpy(l.output_gpu, l.output, l.outputs * sizeof(float), cudaMemcpyDefault);
310 
311 
312                 // // alternative GEMM
313                 //gemm_nn_bin_transposed_32bit_packed(m, n, new_k, 1,
314                 //    l.align_bit_weights, l.new_lda/32,
315                 //    t_bit_input, new_ldb / 32,
316                 //    c, n, l.mean_arr);
317 
318                 //free(t_bit_input);
319 
320                 //free(b);
321             }
322             else
323             {
324                 //printf("\n\n l.index = %d, l.w = %d, l.c = %d, l.n = %d, l.stride = %d, l.pad = %d - old XNOR \n", l.index, l.w, l.c, l.n, l.stride, l.pad);
325                 //cudaDeviceSynchronize();
326 
327                 int i = 0;
328                 /*
329                 // if (l.stride == 1 && l.c >= 256 && l.size > 1)
330                 if (l.stride == 1 && l.c >= 1024 && l.size > 1 && 0)// && l.w >= 13) // disabled
331                 {
332                     // stride=1 only
333                     //start_timer();
334                     im2col_align_bin_ongpu(state.input + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, state.workspace, l.bit_align);
335                     //cudaDeviceSynchronize();
336                     //stop_timer_and_show_name("im2col_align_bin_ongpu");
337                 }
338                 else*/
339                 {
340                     //start_timer();
341                     im2col_align_ongpu(state.input + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, l.align_workspace_gpu, l.bit_align);
342                     //cudaDeviceSynchronize();
343                     //stop_timer_and_show_name("im2col_align_ongpu");
344                     //getchar();
345 
346                     // should be optimized
347                     //start_timer();
348                     float_to_bit_gpu(l.align_workspace_gpu, (unsigned char *)state.workspace, l.align_workspace_size);
349                     //cudaDeviceSynchronize();
350                     //stop_timer_and_show_name("float_to_bit_gpu");
351                 }
352                 //start_timer();
353                 transpose_bin_gpu((unsigned char *)state.workspace, (unsigned char *)l.transposed_align_workspace_gpu, k, n, l.bit_align, new_ldb, 8);
354                 //cudaDeviceSynchronize();
355                 //stop_timer_and_show_name("transpose_bin_gpu");
356 
357                 //cudaDeviceSynchronize();
358                 //stop_timer_and_show_name("im2col_align_ongpu + float_to_bit_gpu + transpose_bin_gpu");
359 
360                 // should be optimized
361                 //if(0) {//if (k > 1000) {    // sequentially input-shared - BAD
362                 //    gemm_nn_custom_bin_mean_transposed_sequentially_gpu(m, n, k,
363                 //        (unsigned char *)l.align_bit_weights_gpu, new_ldb, (unsigned char *)l.transposed_align_workspace_gpu, new_ldb, l.output_gpu, n, l.mean_arr_gpu);
364                 //}
365                 //else {  // coalescing & weights-shared-memory - GOOD
366                     //start_timer();
367                     gemm_nn_custom_bin_mean_transposed_gpu(m, n, k,
368                         (unsigned char *)l.align_bit_weights_gpu, new_ldb, (unsigned char *)l.transposed_align_workspace_gpu,
369                         new_ldb, l.output_gpu, n, l.mean_arr_gpu, l.biases_gpu, l.activation == LEAKY,
370                         l.bin_conv_shortcut_in_gpu, l.bin_conv_shortcut_out_gpu);
371                     //cudaDeviceSynchronize();
372                     //stop_timer_and_show_name("gemm_nn_custom_bin_mean_transposed_gpu");
373                 //}
374                 //cudaDeviceSynchronize();
375                 //check_error(status);
376                 //getchar();
377             }
378 
379 
380             /*
381             {
382                 float_to_bit_gpu(state.input, (unsigned char *)l.align_workspace_gpu, input_size);
383                 convolve_bin_gpu(l.align_workspace_gpu, (float *)l.align_bit_weights_gpu, l.output_gpu, l.w, l.h, l.c, l.n, l.size, l.pad, l.new_lda, l.mean_arr_gpu);
384 
385                 //convolve_gpu(state.input, l.weights_gpu, l.output_gpu, l.w, l.h, l.c, l.n, l.size, l.pad);
386 
387                 //cudaDeviceSynchronize();
388                 //check_error(status);
389 
390                 add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
391             }
392             */
393 
394             //add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
395             if (l.activation == SWISH) activate_array_swish_ongpu(l.output_gpu, l.outputs*l.batch, l.activation_input_gpu, l.output_gpu);
396             else if (l.activation == MISH) activate_array_mish_ongpu(l.output_gpu, l.outputs*l.batch, l.activation_input_gpu, l.output_gpu);
397             else if (l.activation == NORM_CHAN) activate_array_normalize_channels_ongpu(l.output_gpu, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output_gpu);
398             else if (l.activation == NORM_CHAN_SOFTMAX) activate_array_normalize_channels_softmax_ongpu(l.output_gpu, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output_gpu, 0);
399             else if (l.activation == NORM_CHAN_SOFTMAX_MAXVAL) activate_array_normalize_channels_softmax_ongpu(l.output_gpu, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output_gpu, 1);
400             else if (l.activation != LINEAR && l.activation != LEAKY) activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
401             //if(l.activation != LINEAR && l.activation != LEAKY) activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
402             //if (l.binary || l.xnor) swap_binary(&l);
403             //cudaDeviceSynchronize();
404             return;
405         }
406     }
407 
408     if (l.xnor) {
409         swap_binary(&l);
410         binarize_gpu(state.input, l.c*l.h*l.w*l.batch, l.binary_input_gpu);
411         state.input = l.binary_input_gpu;
412     }
413 
414     //fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1);
415 
416 #ifdef CUDNN
417     //float one = 1;    // alpha[0], beta[0] is float for HALF and FLOAT
418     float alpha = 1, beta = 0;
419 
420 //#ifdef CUDNN_HALF
421     //if (state.use_mixed_precision) {
422     int iteration_num = get_current_iteration(state.net); // (*state.net.seen) / (state.net.batch*state.net.subdivisions);
423     if (state.index != 0 && state.net.cudnn_half && !l.xnor && (!state.train || (iteration_num > 3 * state.net.burn_in) && state.net.loss_scale != 1) &&
424         (l.c / l.groups) % 8 == 0 && l.n % 8 == 0 && l.groups <= 1 && l.size > 1)
425     {
426         //printf("\n CUDNN_HALF!!! state.index = %d \n", state.index);
427 
428         // Note: For improved performance it is advised to use beta[0] = 0.0.
429         // For Tensor Core: cudnnSetConvolutionMathType() where cudnnMathType_t mathType = CUDNN_TENSOR_OP_MATH;
430         // 1. or CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM and use CUDNN_DATA_HALF
431         // 2. or CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED
432         // More: http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#tensor_ops
433 
434         const size_t input16_size = l.batch*l.c*l.w*l.h;
435         const size_t output16_size = l.batch*l.out_c*l.out_h*l.out_w;
436 
437         if (*state.net.max_input16_size < input16_size) {
438             //printf("\n input16_size: cur = %zu \t max = %zu \n", input16_size, *state.net.max_input16_size);
439             *state.net.max_input16_size = input16_size;
440             if (*state.net.input16_gpu) cuda_free(*state.net.input16_gpu);
441             assert(*state.net.max_input16_size > 0);
442             *state.net.input16_gpu = (float *)cuda_make_f16_from_f32_array(NULL, *state.net.max_input16_size);
443         }
444         float *input16 = *state.net.input16_gpu;
445 
446         if (*state.net.max_output16_size < output16_size) {
447             *state.net.max_output16_size = output16_size;
448             if (*state.net.output16_gpu) cuda_free(*state.net.output16_gpu);
449             assert(*state.net.max_output16_size > 0);
450             *state.net.output16_gpu = (float *)cuda_make_f16_from_f32_array(NULL, *state.net.max_output16_size);
451         }
452         float *output16 = *state.net.output16_gpu;
453 
454         assert(input16_size > 0);
455         cuda_convert_f32_to_f16(state.input, input16_size, input16);
456 
457         //fill_ongpu(output16_size / 2, 0, (float *)output16, 1);
458         CHECK_CUDNN(cudnnConvolutionForward(cudnn_handle(),
459             &alpha,
460             l.srcTensorDesc16,
461             input16,
462             l.weightDesc16,
463             l.weights_gpu16,
464             l.convDesc,
465             l.fw_algo16,
466             state.workspace,
467             l.workspace_size,
468             &beta,
469             l.dstTensorDesc16,
470             output16));
471 
472 
473         if (l.batch_normalize)
474         {
475             if (state.train && !state.net.adversarial) // Training
476             {
477                 simple_copy_ongpu(l.outputs*l.batch / 2, output16, l.x_gpu);
478                 //copy_ongpu(l.outputs*l.batch / 2, output16, 1, l.x_gpu, 1);
479                 //cudaMemcpyAsync(l.x_gpu, output16, l.outputs*l.batch*sizeof(half), cudaMemcpyDefault, get_cuda_stream());
480                 float one = 1.0f;
481                 float zero = 0.0f;
                // Batch-normalization can still take FP16 inputs and outputs, saving half the bandwidth
                // compared to FP32; only the statistics and value adjustment need to be done in FP32.
484                 CHECK_CUDNN(cudnnBatchNormalizationForwardTraining(cudnn_handle(),
485                     CUDNN_BATCHNORM_SPATIAL,
486                     &one,
487                     &zero,
488                     l.normDstTensorDescF16,
489                     l.x_gpu,            // input
490                     l.normDstTensorDescF16,
491                     output16,            // output
492                     l.normTensorDesc,
493                     l.scales_gpu,       // input
494                     l.biases_gpu,       // input
495                     .01,
496                     l.rolling_mean_gpu,        // input/output (should be FP32)
497                     l.rolling_variance_gpu,    // input/output (should be FP32)
498                     .00001,
499                     l.mean_gpu,            // output (should be FP32) - optional cache to speedup cudnnBatchNormalizationBackward()
500                     l.variance_gpu));    // output (should be FP32) - optional cache to speedup cudnnBatchNormalizationBackward()
501 
502                 cuda_convert_f16_to_f32(output16, output16_size, l.output_gpu);
503                 //forward_batchnorm_layer_gpu(l, state);
504             }
505             else // Detection
506             {
507                 cuda_convert_f16_to_f32(output16, output16_size, l.output_gpu);
508                 normalize_gpu(l.output_gpu, l.rolling_mean_gpu, l.rolling_variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
509                 scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
510                 add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
511             }
512         }
513         else // BIAS only
514         {
515             cuda_convert_f16_to_f32(output16, output16_size, l.output_gpu);
516             add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
517         }
518     }
519     else {
520 
521         //#else
522         /*
523         int input_nan_inf = is_nan_or_inf(state.input, l.inputs * l.batch);
524         printf("\n is_nan_or_inf(state.input) = %d \n", input_nan_inf);
525         if (input_nan_inf) getchar();
526 
527         int weights_nan_inf = is_nan_or_inf(l.weights_gpu, l.nweights);
528         printf("\n is_nan_or_inf(l.weights_gpu) = %d \n", weights_nan_inf);
529         if (weights_nan_inf) getchar();
530         */
531 
532         CHECK_CUDNN(cudnnConvolutionForward(cudnn_handle(),
533             &alpha, //&one,
534             l.srcTensorDesc,
535             state.input,
536             l.weightDesc,
537             l.weights_gpu,
538             l.convDesc,
539             l.fw_algo,
540             state.workspace,
541             l.workspace_size,
542             &beta,  //&one,
543             l.dstTensorDesc,
544             l.output_gpu));
545 
546         //cudaDeviceSynchronize();
547         if (l.batch_normalize) {
548             forward_batchnorm_layer_gpu(l, state);
549         }
550         else {
551             add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
552         }
553     //#endif    // CUDNN_HALF
554     }
555 
556 
557 #else
558     fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1);
559 
560     int i, j;
561     int m = l.n / l.groups;
562     int k = l.size*l.size*l.c / l.groups;
563     int n = l.out_w*l.out_h;
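    // Plain im2col + GEMM fallback, per image and per group:
    //   m = filters in the group, k = weights per filter, n = output spatial size,
    // so C(m x n) = A(m x k) * B(k x n) with A the weights, B the im2col'd input and
    // C the layer output. For example, a 3x3 convolution over c = 16 channels with a
    // 13x13 output map gives k = 3*3*16 = 144 and n = 13*13 = 169.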
564     for(i = 0; i < l.batch; ++i){
565         for (j = 0; j < l.groups; ++j) {
566             //float *im = state.input + i*l.c*l.h*l.w;
567             float *im = state.input + (i*l.groups + j)*l.c / l.groups*l.h*l.w;
568             float *a = l.weights_gpu + j*l.nweights / l.groups;
569             float *b = state.workspace;
570             float *c = l.output_gpu + (i*l.groups + j)*n*m;
571             if (l.size == 1) {
572                 b = im;
573             }
574             else {
575                 //im2col_ongpu(im, l.c / l.groups, l.h, l.w, l.size, l.stride, l.pad, state.workspace);
576 
577                 im2col_gpu_ext(im,          // input
578                     l.c / l.groups,         // input channels
579                     l.h, l.w,               // input size (h, w)
580                     l.size, l.size,         // kernel size (h, w)
581                     l.pad * l.dilation, l.pad * l.dilation,   // padding (h, w)
582                     l.stride_y, l.stride_x,     // stride (h, w)
583                     l.dilation, l.dilation, // dilation (h, w)
584                     state.workspace);       // output
585 
586             }
587             //gemm_ongpu(0, 0, m, n, k, 1., a, k, b, n, 1., c + i*m*n, n);
588             gemm_ongpu(0, 0, m, n, k, 1, a, k, b, n, 1, c, n);
589         }
590     }
591 
592     if (l.batch_normalize) {
593         forward_batchnorm_layer_gpu(l, state);
594     }
595     else {
596         add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
597     }
598 #endif
599 
600 //#ifndef CUDNN_HALF
601 //#endif // no CUDNN_HALF
602 
603     if (l.activation == SWISH) activate_array_swish_ongpu(l.output_gpu, l.outputs*l.batch, l.activation_input_gpu, l.output_gpu);
604     else if (l.activation == MISH) activate_array_mish_ongpu(l.output_gpu, l.outputs*l.batch, l.activation_input_gpu, l.output_gpu);
605     else if (l.activation == NORM_CHAN) activate_array_normalize_channels_ongpu(l.output_gpu, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output_gpu);
606     else if (l.activation == NORM_CHAN_SOFTMAX) activate_array_normalize_channels_softmax_ongpu(l.output_gpu, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output_gpu, 0);
607     else if (l.activation == NORM_CHAN_SOFTMAX_MAXVAL) activate_array_normalize_channels_softmax_ongpu(l.output_gpu, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output_gpu, 1);
608     else if (l.activation != LINEAR) activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
609     //if(l.dot > 0) dot_error_gpu(l);
610     if(l.binary || l.xnor) swap_binary(&l);
611     //cudaDeviceSynchronize();    // for correct profiling of performance
612 
613     if (state.net.try_fix_nan) {
614         fix_nan_and_inf(l.output_gpu, l.outputs*l.batch);
615     }
616 
617     if(l.assisted_excitation && state.train) assisted_excitation_forward_gpu(l, state);
618 
619     if (l.antialiasing) {
620         network_state s = { 0 };
621         s.train = state.train;
622         s.workspace = state.workspace;
623         s.net = state.net;
624         if (!state.train) s.index = state.index;  // don't use TC for training (especially without cuda_convert_f32_to_f16() )
625         s.input = l.output_gpu;
626         forward_convolutional_layer_gpu(*(l.input_layer), s);
627         simple_copy_ongpu(l.outputs*l.batch, l.output_gpu, l.input_antialiasing_gpu);
628         simple_copy_ongpu(l.input_layer->outputs*l.input_layer->batch, l.input_layer->output_gpu, l.output_gpu);
629     }
630 }
631 
void backward_convolutional_layer_gpu(convolutional_layer l, network_state state)
{
634     if (l.antialiasing) {
635         network_state s = { 0 };
636         s.train = state.train;
637         s.workspace = state.workspace;
638         s.net = state.net;
639         s.delta = l.delta_gpu;  // s.delta will be returned to l.delta_gpu
640         s.input = l.input_antialiasing_gpu;
641         //if (!state.train) s.index = state.index;  // don't use TC for training (especially without cuda_convert_f32_to_f16() )
642         simple_copy_ongpu(l.input_layer->outputs*l.input_layer->batch, l.delta_gpu, l.input_layer->delta_gpu);
643         backward_convolutional_layer_gpu(*(l.input_layer), s);
644 
645         simple_copy_ongpu(l.outputs*l.batch, l.input_antialiasing_gpu, l.output_gpu);
646     }
647 
648     if(state.net.try_fix_nan) constrain_ongpu(l.outputs*l.batch, 1, l.delta_gpu, 1);
649 
650     if (l.activation == SWISH) gradient_array_swish_ongpu(l.output_gpu, l.outputs*l.batch, l.activation_input_gpu, l.delta_gpu);
651     else if (l.activation == MISH) gradient_array_mish_ongpu(l.outputs*l.batch, l.activation_input_gpu, l.delta_gpu);
652     else if (l.activation == NORM_CHAN_SOFTMAX || l.activation == NORM_CHAN_SOFTMAX_MAXVAL) gradient_array_normalize_channels_softmax_ongpu(l.output_gpu, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.delta_gpu);
653     else if (l.activation == NORM_CHAN) gradient_array_normalize_channels_ongpu(l.output_gpu, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.delta_gpu);
654     else gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
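    // At this point l.delta_gpu holds the loss gradient w.r.t. the layer output scaled by
    // the activation derivative; everything below propagates it into the bias, weight and
    // input gradients.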
655 
656     if (!l.batch_normalize)
657         backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h);
658 
659 //#ifndef CUDNN_HALF
660     //if(l.batch_normalize){
661     //    backward_batchnorm_layer_gpu(l, state);
662     //} else {
663     //    //backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h);
664     //}
665 //#endif // no CUDNN_HALF
666     float *original_input = state.input;
667 
668     if(l.xnor) state.input = l.binary_input_gpu;
669 #ifdef CUDNN
670     float one = 1.f;
671     float alpha = 1, beta = 0;
672 
673 //#ifdef CUDNN_HALF
674     int iteration_num = get_current_iteration(state.net); //(*state.net.seen) / (state.net.batch*state.net.subdivisions);
675     if (state.index != 0 && state.net.cudnn_half && !l.xnor && (!state.train || (iteration_num > 3 * state.net.burn_in) && state.net.loss_scale != 1) &&
676         (l.c / l.groups) % 8 == 0 && l.n % 8 == 0  && l.groups <= 1 && l.size > 1)
677     {
678         const size_t input16_size = l.batch*l.c*l.w*l.h;
679         const size_t delta16_size = l.batch*l.n*l.out_w*l.out_h;
680 
681         if (*state.net.max_input16_size < input16_size) {
682             *state.net.max_input16_size = input16_size;
683             if (*state.net.input16_gpu) cuda_free(*state.net.input16_gpu);
684             assert(*state.net.max_input16_size > 0);
685             *state.net.input16_gpu = (float *)cuda_make_f16_from_f32_array(NULL, *state.net.max_input16_size);
686         }
687         float *input16 = *state.net.input16_gpu;
688 
689         if (*state.net.max_output16_size < delta16_size) {
690             *state.net.max_output16_size = delta16_size;
691             if (*state.net.output16_gpu) cuda_free(*state.net.output16_gpu);
692             assert(*state.net.max_output16_size > 0);
693             *state.net.output16_gpu = (float *)cuda_make_f16_from_f32_array(NULL, *state.net.max_output16_size);
694         }
695         float *delta16 = *state.net.output16_gpu;
696 
697         assert(input16_size > 0);
698         assert(delta16_size > 0);
699         cuda_convert_f32_to_f16(state.input, input16_size, input16);
700         cuda_convert_f32_to_f16(l.delta_gpu, delta16_size, delta16);
701 
702         if (l.batch_normalize) {
703             //if (!state.train) {
704             //    l.mean_gpu = l.rolling_mean_gpu;
705             //    l.variance_gpu = l.rolling_variance_gpu;
706             //}
707             float one = 1.0f;
708             float zero = 0.0f;
709             CHECK_CUDNN(cudnnBatchNormalizationBackward(cudnn_handle(),
710                 CUDNN_BATCHNORM_SPATIAL,
711                 &one,
712                 &zero,
713                 &one,
714                 &one,
715                 l.normDstTensorDescF16,
716                 l.x_gpu,                // input (input in BN-forward-inference)
717                 l.normDstTensorDescF16,
718                 delta16,                // input
719                 l.normDstTensorDescF16,
720                 l.output_gpu, //l.x_norm_gpu,            // output (new delta)
721                 l.normTensorDesc,
722                 l.scales_gpu,            // input (should be FP32)
723                 l.scale_updates_gpu,    // output (should be FP32)
724                 l.bias_updates_gpu,        // output (should be FP32)
725                 .00001,
726                 l.mean_gpu,                // input (should be FP32)
727                 l.variance_gpu));        // input (should be FP32)
728 
729             simple_copy_ongpu(l.outputs*l.batch / 2, l.output_gpu, delta16);
730             //copy_ongpu(l.outputs*l.batch / 2, l.x_norm_gpu, 1, delta16, 1);
731             //cudaMemcpyAsync(delta16, l.x_norm_gpu, l.outputs*l.batch * sizeof(half), cudaMemcpyDefault, get_cuda_stream());
732         }
733         else
734         {
735             //backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h);
736         }
737 
738         // convert input: state.input (x), l.delta_gpu (y) from fp32 to fp16
739         // get output: l.weight_updates_gpu (dw) and convert it to fp32 (ONLY if it is fp16)
740 
741         // calculate conv weight updates
742         // Already: l.weight_updates_gpu = (l.weight_updates_gpu - l.weight*decay*batch*subdivision)*momentum
743         //   so we should copy f32 to f16, or compute: f16=(w_up - w*d*b*s)*m
744         assert((l.nweights) > 0);
745         cuda_convert_f32_to_f16(l.weight_updates_gpu, l.nweights, l.weight_updates_gpu16);
746 
747         if (!state.net.adversarial && !l.train_only_bn) {
748             CHECK_CUDNN(cudnnConvolutionBackwardFilter(cudnn_handle(),
749                 &one,
750                 l.srcTensorDesc16,
751                 input16, //state.input,
752                 l.ddstTensorDesc16,
753                 delta16, //l.delta_gpu,
754                 l.convDesc,
755                 l.bf_algo16,
756                 state.workspace,
757                 l.workspace_size,
758                 &one,
759                 l.dweightDesc16,
760                 l.weight_updates_gpu16));    // l.weight_updates_gpu);
761 
762             cuda_convert_f16_to_f32(l.weight_updates_gpu16, l.nweights, l.weight_updates_gpu);
763         }
764 
765         if (state.delta) {
766             if (l.binary || l.xnor) swap_binary(&l);
767 
768             // http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnConvolutionBackwardData
769             // calculate delta for the next layer
770             // convert input: l.weights_gpu (w), l.delta_gpu (dy) from fp32 to fp16
771             // get output: state.delta (dx) and convert it to fp32 (ONLY if it is fp16)
772             CHECK_CUDNN(cudnnConvolutionBackwardData(cudnn_handle(),
773                 &alpha,
774                 l.weightDesc16,
775                 l.weights_gpu16, //l.weights_gpu,
776                 l.ddstTensorDesc16,
777                 delta16, //l.delta_gpu,
778                 l.convDesc,
779                 l.bd_algo16,
780                 state.workspace,
781                 l.workspace_size,
782                 &beta,
783                 l.dsrcTensorDesc16,
784                 input16));    // state.delta);
785 
786             cuda_convert_f16_to_f32(input16, input16_size, state.delta);
787 
788             if (l.binary || l.xnor) swap_binary(&l);
789             if (l.xnor) gradient_array_ongpu(original_input, l.batch*l.c*l.h*l.w, HARDTAN, state.delta);
790         }
791     }
792     else {
793         //#else    // CUDNN_HALF
794 
795         if(l.batch_normalize){
796             backward_batchnorm_layer_gpu(l, state);
797         }
798 
799         if (!state.net.adversarial && !l.train_only_bn) {
800             // calculate conv weight updates
            // if beta = 1 is used here instead, the loss decreases faster
802             CHECK_CUDNN(cudnnConvolutionBackwardFilter(cudnn_handle(),
803                 &one,
804                 l.srcTensorDesc,
805                 state.input,
806                 l.ddstTensorDesc,
807                 l.delta_gpu,
808                 l.convDesc,
809                 l.bf_algo,
810                 state.workspace,
811                 l.workspace_size,
812                 &one,
813                 l.dweightDesc,
814                 l.weight_updates_gpu));
815         }
816 
817         if (state.delta) {
818             if (l.binary || l.xnor) swap_binary(&l);
819             // http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnConvolutionBackwardData
820             // calculate delta for the next layer
821             CHECK_CUDNN(cudnnConvolutionBackwardData(cudnn_handle(),
822                 &one,
823                 l.weightDesc,
824                 l.weights_gpu,
825                 l.ddstTensorDesc,
826                 l.delta_gpu,
827                 l.convDesc,
828                 l.bd_algo,
829                 state.workspace,
830                 l.workspace_size,
831                 &one,
832                 l.dsrcTensorDesc,
833                 state.delta));
834 
835             if (l.binary || l.xnor) swap_binary(&l);
836             if (l.xnor) gradient_array_ongpu(original_input, l.batch*l.c*l.h*l.w, HARDTAN, state.delta);
837         }
838     }
839 
840 //#endif    // CUDNN_HALF
841 
842 #else    // CUDNN
843     if (l.batch_normalize) {
844         backward_batchnorm_layer_gpu(l, state);
845     }
846 
847     int m = l.n / l.groups;
848     int n = l.size*l.size*l.c / l.groups;
849     int k = l.out_w*l.out_h;
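    // Handmade backward pass (no cuDNN), per image and per group:
    //   m = filters, n = weights per filter, k = output spatial size.
    // Two GEMMs follow:
    //   weight gradient: dW(m x n) += delta(m x k) * im2col(input)^T,
    //   input gradient : col(n x k) = W^T(n x m) * delta(m x k), then col2im scatters
    //                    the columns back into state.delta.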
850 
851     int i, j;
852     for(i = 0; i < l.batch; ++i){
853         for (j = 0; j < l.groups; ++j) {
854             float * a = l.delta_gpu + (i*l.groups + j)*m*k;
855             float * b = state.workspace;
856             float * c = l.weight_updates_gpu + j*l.nweights / l.groups;
857 
858             float *im = state.input + (i*l.groups + j)*l.c / l.groups*l.h*l.w;
859 
860             if (!state.net.adversarial && !l.train_only_bn) {
861                 //im2col_ongpu(im, l.c / l.groups, l.h, l.w, l.size, l.stride, l.pad, state.workspace);
862                 im2col_gpu_ext(im,          // input
863                     l.c / l.groups,         // input channels
864                     l.h, l.w,               // input size (h, w)
865                     l.size, l.size,         // kernel size (h, w)
866                     l.pad * l.dilation, l.pad * l.dilation,   // padding (h, w)
867                     l.stride_y, l.stride_x,     // stride (h, w)
868                     l.dilation, l.dilation, // dilation (h, w)
869                     state.workspace);       // output
870                 //gemm_ongpu(0, 1, m, n, k, 1, a + i*m*k, k, b, k, 1, c, n);
871                 gemm_ongpu(0, 1, m, n, k, 1, a, k, b, k, 1, c, n);
872             }
873 
874             if (state.delta) {
875                 if (l.binary || l.xnor) swap_binary(&l);
876                 float * a = l.weights_gpu + j*l.nweights / l.groups;
877                 float * b = l.delta_gpu + (i*l.groups + j)*m*k;
878                 float * c = state.workspace;
879 
880                 //gemm_ongpu(1, 0, n, k, m, 1, a, n, b + i*k*m, k, 0, c, k);
881                 gemm_ongpu(1, 0, n, k, m, 1, a, n, b, k, 0, c, k);
882 
883 
884                 float *delta = state.delta + (i*l.groups + j)*l.c / l.groups*l.h*l.w;
885 
886                 //col2im_ongpu(state.workspace, l.c / l.groups, l.h, l.w, l.size, l.stride, l.pad, delta);
887                 col2im_gpu_ext(
888                     state.workspace,        // input
889                     l.c / l.groups,         // input channels
890                     l.h, l.w,               // input size (h, w)
891                     l.size, l.size,         // kernel size (h, w)
892                     l.pad * l.dilation, l.pad * l.dilation,   // padding size (h, w)
893                     l.stride_y, l.stride_x,     // stride size (h, w)
894                     l.dilation, l.dilation, // dilation size (h, w)
895                     delta);                 // output (delta)
896 
897                 if (l.binary || l.xnor) {
898                     swap_binary(&l);
899                 }
900                 if (l.xnor) gradient_array_ongpu(original_input + i*l.c*l.h*l.w, l.c*l.h*l.w, HARDTAN, state.delta + i*l.c*l.h*l.w);
901             }
902         }
903     }
904 #endif
905     if (state.net.try_fix_nan) {
906         if (state.delta) {
907             reset_nan_and_inf(state.delta, l.inputs * l.batch);
908         }
909         int size = l.nweights;
910         reset_nan_and_inf(l.weight_updates_gpu, size);
911         fix_nan_and_inf(l.weights_gpu, size);
912     }
913 }
914 
__global__ void calc_avg_activation_kernel(float *src, float *dst, int size, int channels, int batches)
{
917     int i = blockIdx.x * blockDim.x + threadIdx.x;
918     int xy = i % size;
919     int b = i / size;
920 
921     if (i < size*batches) {
922         dst[i] = 0;
923         for (int c = 0; c < channels; ++c) {
924             dst[i] += src[xy + size*(c + channels*b)];
925         }
926         dst[i] = dst[i] / channels;
927     }
928 }
929 
void calc_avg_activation_gpu(float *src, float *dst, int size, int channels, int batches)
{
932     const int num_blocks = get_number_of_blocks(size*batches, BLOCK);
933 
934     calc_avg_activation_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (src, dst, size, channels, batches);
935 }
936 
937 
__global__ void assisted_activation_kernel(float alpha, float *output, float *gt_gpu, float *a_avg_gpu, int size, int channels, int batches)
{
940     int i = blockIdx.x * blockDim.x + threadIdx.x;
941     int xy = i % size;
942     int b = i / size;
943 
944     if (b < batches) {
945         for (int c = 0; c < channels; ++c) {
946             output[xy + size*(c + channels*b)] += alpha * gt_gpu[i] * a_avg_gpu[i];
947             //output[xy + size*(c + channels*b)] += gt_gpu[i] * a_avg_gpu[i];
948             //output[xy + size*(c + channels*b)] += gt_gpu[i] * output[xy + size*(c + channels*b)];
949             //output[xy + size*(c + channels*b)] = a_avg_gpu[i];
950         }
951     }
952 }
953 
void assisted_activation_gpu(float alpha, float *output, float *gt_gpu, float *a_avg_gpu, int size, int channels, int batches)
{
956     const int num_blocks = get_number_of_blocks(size*batches, BLOCK);
957 
958     assisted_activation_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (alpha, output, gt_gpu, a_avg_gpu, size, channels, batches);
959 }
960 
961 
__global__ void assisted_activation2_kernel(float alpha, float *output, float *gt_gpu, float *a_avg_gpu, int size, int channels, int batches)
{
964     int i = blockIdx.x * blockDim.x + threadIdx.x;
965     int xy = i % size;
966     int b = i / size;
967     float beta = 1 - alpha;
968 
969     if (b < batches) {
970         for (int c = 0; c < channels; ++c) {
971             if(gt_gpu[i] == 0)
972                 output[xy + size*(c + channels*b)] *= beta;
973 
974         }
975     }
976 }
977 
void assisted_activation2_gpu(float alpha, float *output, float *gt_gpu, float *a_avg_gpu, int size, int channels, int batches)
{
980     const int num_blocks = get_number_of_blocks(size*batches, BLOCK);
981 
982     assisted_activation2_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (alpha, output, gt_gpu, a_avg_gpu, size, channels, batches);
983 }
984 
void assisted_excitation_forward_gpu(convolutional_layer l, network_state state)
{
987     const int iteration_num = get_current_iteration(state.net); //(*state.net.seen) / (state.net.batch*state.net.subdivisions);
988 
989     // epoch
990     //const float epoch = (float)(*state.net.seen) / state.net.train_images_num;
991 
992     // calculate alpha
993     //const float alpha = (1 + cos(3.141592 * iteration_num)) / (2 * state.net.max_batches);
994     //const float alpha = (1 + cos(3.141592 * epoch)) / (2 * state.net.max_batches);
995     float alpha = (1 + cos(3.141592 * iteration_num / state.net.max_batches)) / 2;
996     //float alpha = (1 + cos(3.141592 * iteration_num / state.net.max_batches));
997 
998     if (l.assisted_excitation == 1) {
999         if (iteration_num > state.net.max_batches / 2) return;
1000     }
1001     else {
        if (iteration_num < state.net.burn_in) return;
        else if (iteration_num > l.assisted_excitation) return;
        else alpha = (1 + cos(3.141592 * iteration_num / (state.net.burn_in + l.assisted_excitation))) / 2; // from 1 to 0
1007     }
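    // Cosine schedule for the excitation strength: alpha = (1 + cos(pi * t / T)) / 2 is 1 at
    // t = 0 and decays toward 0 as t approaches T, so the assisted term fades out during
    // training; the early returns above simply switch it off outside its active window.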
1008 
1009     //printf("\n epoch = %f, alpha = %f, seen = %d, max_batches = %d, train_images_num = %d \n",
1010     //    epoch, alpha, (*state.net.seen), state.net.max_batches, state.net.train_images_num);
1011 
1012     //const int size = l.outputs * l.batch;
1013 
1014     float *a_avg = (float *)calloc(l.out_w * l.out_h * l.batch, sizeof(float));
1015     float *gt = (float *)calloc(l.out_w * l.out_h * l.batch, sizeof(float));
1016 
1017     int b;
1018     int w, h;
1019 
1020     l.max_boxes = state.net.num_boxes;
1021     l.truths = l.max_boxes*(4 + 1);
1022 
1023     int num_truth = l.batch*l.truths;
1024     float *truth_cpu = (float *)calloc(num_truth, sizeof(float));
1025     cuda_pull_array(state.truth, truth_cpu, num_truth);
1026     //cudaStreamSynchronize(get_cuda_stream());
1027     //CHECK_CUDA(cudaPeekAtLastError());
1028 
1029     for (b = 0; b < l.batch; ++b)
1030     {
1031         // calculate G
1032         int t;
1033         for (t = 0; t < state.net.num_boxes; ++t) {
1034             box truth = float_to_box_stride(truth_cpu + t*(4 + 1) + b*l.truths, 1);
1035             if (!truth.x) break;  // continue;
1036             float beta = 0;
1037             //float beta = 1 - alpha; // from 0 to 1
1038             float dw = (1 - truth.w) * beta;
1039             float dh = (1 - truth.h) * beta;
1040             //printf(" alpha = %f, beta = %f, truth.w = %f, dw = %f, tw+dw = %f, l.out_w = %d \n", alpha, beta, truth.w, dw, truth.w+dw, l.out_w);
1041 
1042             int left = floor((truth.x - (dw + truth.w) / 2) * l.out_w);
1043             int right = ceil((truth.x + (dw + truth.w) / 2) * l.out_w);
1044             int top = floor((truth.y - (dh + truth.h) / 2) * l.out_h);
1045             int bottom = ceil((truth.y + (dh + truth.h) / 2) * l.out_h);
1046             if (left < 0) left = 0;
1047             if (top < 0) top = 0;
1048             if (right > l.out_w) right = l.out_w;
1049             if (bottom > l.out_h) bottom = l.out_h;
1050 
            for (w = left; w < right; w++) {  // exclusive bound (like the h loop below) so a box at the right edge cannot index past the row
1052                 for (h = top; h < bottom; h++) {
1053                     gt[w + l.out_w * h + l.out_w*l.out_h*b] = 1;
1054                 }
1055             }
1056         }
1057     }
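    // gt now holds the ground-truth mask G: one l.out_w x l.out_h plane per image, 1 inside
    // every truth box and 0 elsewhere. It is pushed to the GPU and multiplied with the
    // channel-averaged activation map before being added back onto the layer output below.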
1058 
1059     cuda_push_array(l.gt_gpu, gt, l.out_w * l.out_h * l.batch);
1060     //cudaStreamSynchronize(get_cuda_stream());
1061     //CHECK_CUDA(cudaPeekAtLastError());
1062 
1063     // calc avg_output on GPU - for whole batch
1064     calc_avg_activation_gpu(l.output_gpu, l.a_avg_gpu, l.out_w * l.out_h, l.out_c, l.batch);
1065     //cudaStreamSynchronize(get_cuda_stream());
1066     //CHECK_CUDA(cudaPeekAtLastError());
1067 
1068     // calc new output
1069     //assisted_activation2_gpu(1, l.output_gpu, l.gt_gpu, l.a_avg_gpu, l.out_w * l.out_h, l.out_c, l.batch);  // AE3: gt increases (beta = 1 - alpha = 0)
1070     //assisted_activation2_gpu(alpha, l.output_gpu, l.gt_gpu, l.a_avg_gpu, l.out_w * l.out_h, l.out_c, l.batch);
1071     assisted_activation_gpu(alpha, l.output_gpu, l.gt_gpu, l.a_avg_gpu, l.out_w * l.out_h, l.out_c, l.batch);
1072     //cudaStreamSynchronize(get_cuda_stream());
1073     //CHECK_CUDA(cudaPeekAtLastError());
1074 
1075 
1076 
1077     /*
1078     for (b = 0; b < l.batch; ++b)
1079     {
1080         // calculate average A
1081         for (w = 0; w < l.out_w; w++) {
1082             for (h = 0; h < l.out_h; h++) {
1083                 for (c = 0; c < l.out_c; c++) {
1084                     a_avg[w + l.out_w*(h + l.out_h*b)] += l.output[w + l.out_w*(h + l.out_h*(c + l.out_c*b))];
1085                 }
1086                 a_avg[w + l.out_w*(h + l.out_h*b)] /= l.out_c;  // a_avg / d
1087             }
1088         }
1089     }
1090 
1091     // change activation
1092     for (b = 0; b < l.batch; ++b)
1093     {
1094         for (w = 0; w < l.out_w; w++) {
1095             for (h = 0; h < l.out_h; h++) {
1096                 for (c = 0; c < l.out_c; c++)
1097                 {
1098                     // a = a + alpha(t) + e(c,i,j) = a + alpha(t) + g(i,j) * avg_a(i,j) / channels
1099                     l.output[w + l.out_w*(h + l.out_h*(c + l.out_c*b))] +=
1100                         alpha *
1101                         g[w + l.out_w*(h + l.out_h*b)] *
1102                         a_avg[w + l.out_w*(h + l.out_h*b)];
1103 
1104                     //l.output[w + l.out_w*(h + l.out_h*(c + l.out_c*b))] =
1105                     //    alpha * g[w + l.out_w*(h + l.out_h*b)] * a_avg[w + l.out_w*(h + l.out_h*b)];
1106                 }
1107             }
1108         }
1109     }
1110     */
1111 
1112     if (0)   // visualize ground truth
1113     {
1114 #ifdef OPENCV
1115         cuda_pull_array(l.output_gpu, l.output, l.outputs * l.batch);
1116         cudaStreamSynchronize(get_cuda_stream());
1117         CHECK_CUDA(cudaPeekAtLastError());
1118 
1119         for (b = 0; b < l.batch; ++b)
1120         {
1121             printf(" Assisted Excitation alpha = %f \n", alpha);
1122             image img = float_to_image(l.out_w, l.out_h, 1, &gt[l.out_w*l.out_h*b]);
1123             char buff[100];
1124             sprintf(buff, "a_excitation_gt_%d", b);
1125             show_image_cv(img, buff);
1126 
1127             //image img2 = float_to_image(l.out_w, l.out_h, 1, &l.output[l.out_w*l.out_h*l.out_c*b]);
1128             image img2 = float_to_image_scaled(l.out_w, l.out_h, 1, &l.output[l.out_w*l.out_h*l.out_c*b]);
1129             char buff2[100];
1130             sprintf(buff2, "a_excitation_output_%d", b);
1131             show_image_cv(img2, buff2);
1132 
1133             /*
1134             int c = l.out_c;
1135             if (c > 4) c = 4;
1136             image img3 = float_to_image(l.out_w, l.out_h, c, &l.output[l.out_w*l.out_h*l.out_c*b]);
1137             image dc = collapse_image_layers(img3, 1);
1138             char buff3[100];
1139             sprintf(buff3, "a_excitation_act_collapsed_%d", b);
1140             show_image_cv(dc, buff3);
1141             */
1142 
1143             wait_key_cv(5);
1144         }
1145         wait_until_press_key_cv();
1146 #endif // OPENCV
1147     }
1148 
1149     free(truth_cpu);
1150     free(gt);
1151     free(a_avg);
1152 }
1153 
void pull_convolutional_layer(convolutional_layer l)
{
1156     cuda_pull_array_async(l.weights_gpu, l.weights, l.nweights);
1157     cuda_pull_array_async(l.biases_gpu, l.biases, l.n);
1158     cuda_pull_array_async(l.weight_updates_gpu, l.weight_updates, l.nweights);
1159     cuda_pull_array_async(l.bias_updates_gpu, l.bias_updates, l.n);
1160     if (l.batch_normalize){
1161         cuda_pull_array_async(l.scales_gpu, l.scales, l.n);
1162         cuda_pull_array_async(l.rolling_mean_gpu, l.rolling_mean, l.n);
1163         cuda_pull_array_async(l.rolling_variance_gpu, l.rolling_variance, l.n);
1164     }
1165     if (l.adam){
1166         cuda_pull_array_async(l.m_gpu, l.m, l.nweights);
1167         cuda_pull_array_async(l.v_gpu, l.v, l.nweights);
1168     }
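    // all pulls above are asynchronous on the shared stream; the synchronize below makes the
    // host-side arrays (l.weights, l.biases, ...) safe to read once this function returns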
1169     CHECK_CUDA(cudaPeekAtLastError());
1170     cudaStreamSynchronize(get_cuda_stream());
1171 }
1172 
void push_convolutional_layer(convolutional_layer l)
{
1175     cuda_push_array(l.weights_gpu, l.weights, l.nweights);
1176 #ifdef CUDNN_HALF
1177     assert(l.nweights > 0);
1178     cuda_convert_f32_to_f16(l.weights_gpu, l.nweights, l.weights_gpu16);
1179 #endif
1180     cuda_push_array(l.biases_gpu, l.biases, l.n);
1181     if (l.train) {
1182         cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.nweights);
1183         cuda_push_array(l.bias_updates_gpu, l.bias_updates, l.n);
1184     }
1185     if (l.batch_normalize){
1186         cuda_push_array(l.scales_gpu, l.scales, l.n);
1187         cuda_push_array(l.rolling_mean_gpu, l.rolling_mean, l.n);
1188         cuda_push_array(l.rolling_variance_gpu, l.rolling_variance, l.n);
1189     }
1190     if (l.adam){
1191         cuda_push_array(l.m_gpu, l.m, l.nweights);
1192         cuda_push_array(l.v_gpu, l.v, l.nweights);
1193     }
1194     CHECK_CUDA(cudaPeekAtLastError());
1195 }
1196 
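/*
 Applies one optimizer step to the layer's parameters on the GPU. Reading the
 body below, the sequence is:
   1. if l.deform: map the weight gradients back through the active
      deformation (rotate / sway / stretch) and aggregate them;
   2. undo the mixed-precision loss scale on all gradient buffers;
   3. sanitize NaN/Inf values in gradients and weights;
   4. optionally apply gradient centralization to the weight gradients;
   5. run either the Adam update or SGD with momentum and weight decay;
   6. if l.deform: regenerate the deformed weight copies from the base weights;
   7. optionally clip weights to [-l.clip, l.clip].
*/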
void update_convolutional_layer_gpu(layer l, int batch, float learning_rate_init, float momentum, float decay, float loss_scale)
{

        /*
        for (int angle = 0; angle < 360; angle++) {
            printf(" angle = %d \n", angle);
            smooth_rotate_weights_kernel(l.weights_gpu, l.weight_deform_gpu, l.nweights, l.n, l.size, angle, 0);

            cuda_pull_array(l.weight_deform_gpu, l.weights, l.nweights);
            visualize_convolutional_layer(l, "weights", NULL);
            wait_key_cv(10);
        }
        */

    if (l.deform) {

        //for (l.angle = 0; l.angle < 360; l.angle += 1)
        //{
            //stretch_weights_gpu(l.weight_updates_gpu, l.weight_deform_gpu, l.nweights, l.n, l.size, l.angle/180, 1);
            //else simple_copy_ongpu(l.nweights, l.weight_updates_gpu, l.weight_deform_gpu);

            if (l.rotate) rotate_weights_gpu(l.weight_updates_gpu, l.weight_deform_gpu, l.nweights, l.n, l.size, 1);
            else if (l.sway) sway_and_flip_weights_gpu(l.weight_updates_gpu, l.weight_deform_gpu, l.nweights, l.n, l.size, l.angle, 1);
            else if (l.stretch) stretch_weights_gpu(l.weight_updates_gpu, l.weight_deform_gpu, l.nweights, l.n, l.size, 0, 1);
            else if (l.stretch_sway) stretch_sway_flip_weights_gpu(l.weight_updates_gpu, l.weight_deform_gpu, l.nweights, l.n, l.size, l.angle, 1);

            //simple_copy_ongpu(l.nweights, l.weight_updates_gpu, l.weight_deform_gpu);

            reduce_and_expand_array_gpu(l.weight_deform_gpu, l.weight_updates_gpu, l.nweights, 4);

            //printf(" angle = %f \n", l.angle);
            //cuda_pull_array(l.weight_deform_gpu, l.weights, l.nweights);
            //visualize_convolutional_layer(l, "weights", NULL);
            //wait_key_cv(10);
        //}

    }
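    /*
     As far as it can be read from the calls above: when weight deformation is
     enabled, the gradients accumulated for the deformed filter copies are first
     mapped back through the inverse transform (last argument 1 = reverse), and
     reduce_and_expand_array_gpu(..., 4) then reduces the gradients of each
     group of 4 related copies and expands the result back, so all copies of a
     filter receive the same aggregated gradient. This is an interpretation of
     the code here, not documentation of the kernels themselves.
    */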


    float learning_rate = learning_rate_init*l.learning_rate_scale;
    //float momentum = a.momentum;
    //float decay = a.decay;
    //int batch = a.batch;

    // Loss scaling for mixed-precision training on Tensor Cores: undo the scale before applying updates
    if (loss_scale != 1.0) {
        if (l.weight_updates_gpu && l.nweights > 0) scal_ongpu(l.nweights, 1.0 / loss_scale, l.weight_updates_gpu, 1);
        if (l.bias_updates_gpu && l.n > 0) scal_ongpu(l.n, 1.0 / loss_scale, l.bias_updates_gpu, 1);
        if (l.scale_updates_gpu && l.n > 0) scal_ongpu(l.n, 1.0 / loss_scale, l.scale_updates_gpu, 1);
    }
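    /*
     Rationale for the unscaling above: with mixed precision the loss is
     multiplied by loss_scale before backpropagation so that small FP16
     gradients do not underflow; the accumulated updates therefore equal
     loss_scale * dL/dw and must be divided by loss_scale here to recover
     the true gradients, i.e.

         weight_updates = (1 / loss_scale) * (loss_scale * dL/dw) = dL/dw
    */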

    reset_nan_and_inf(l.weight_updates_gpu, l.nweights);
    fix_nan_and_inf(l.weights_gpu, l.nweights);

    // Gradient Centralization: applied to the weight gradients, not the weights
    if (l.grad_centr && l.batch_normalize) {
        // weight_updates[filters][channels][height][width]
        // for each filter f: grad[f] = grad[f] - mean_over(c,h,w)( grad[f] )
        gradient_centralization_gpu(l.size, l.size, l.c / l.groups, l.n, l.weight_updates_gpu);
    }
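    /*
     Minimal CPU sketch of the centralization step, assuming the layout noted
     above (this mirrors the formula, not necessarily the kernel's exact code;
     per_filter and grad are illustrative names):

         int per_filter = l.size * l.size * (l.c / l.groups);
         for (int f = 0; f < l.n; ++f) {
             float mean = 0;
             for (int i = 0; i < per_filter; ++i) mean += grad[f*per_filter + i];
             mean /= per_filter;
             for (int i = 0; i < per_filter; ++i) grad[f*per_filter + i] -= mean;
         }
    */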


    if (l.adam) {
        //adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.nweights, batch, a.t);
        adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, l.nweights, batch, l.t);

        adam_update_gpu(l.biases_gpu, l.bias_updates_gpu, l.bias_m_gpu, l.bias_v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, l.n, batch, l.t);
        if (l.scales_gpu) {
            adam_update_gpu(l.scales_gpu, l.scale_updates_gpu, l.scale_m_gpu, l.scale_v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, l.n, batch, l.t);
        }
    }
    else {
        //axpy_ongpu(l.nweights, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
        //axpy_ongpu(l.nweights, learning_rate / batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
        //scal_ongpu(l.nweights, momentum, l.weight_updates_gpu, 1);
        axpy_ongpu(l.nweights, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
        axpy_ongpu(l.nweights, learning_rate / batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
        scal_ongpu(l.nweights, momentum, l.weight_updates_gpu, 1);

        axpy_ongpu(l.n, learning_rate / batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
        scal_ongpu(l.n, momentum, l.bias_updates_gpu, 1);

        if (l.scales_gpu) {
            axpy_ongpu(l.n, learning_rate / batch, l.scale_updates_gpu, 1, l.scales_gpu, 1);
            scal_ongpu(l.n, momentum, l.scale_updates_gpu, 1);
        }
    }
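    /*
     Optimizer notes (a reading of the two branches above; the kernels live in
     other files, so the Adam details are only sketched):

     Adam branch: adam_update_gpu() is expected to implement the usual Adam
     rule for each accumulated gradient g:

         m = B1*m + (1 - B1)*g
         v = B2*v + (1 - B2)*g*g
         w = w + learning_rate * m / (sqrt(v) + eps)

     with the same sign convention as the SGD branch (updates are accumulated
     in the descent direction, hence the additions); bias correction and decay
     handling are those of adam_update_gpu and are not restated here.

     SGD branch: the three BLAS calls on the weights amount to

         wu = wu - decay*batch*w             // fold L2 decay into the gradient
         w  = w + (learning_rate/batch)*wu   // take the step
         wu = momentum*wu                    // keep momentum for the next step

     which matches the derivation preserved in the commented-out legacy
     function at the end of this file.
    */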

    if (l.deform) {
        //for (l.angle = 0; l.angle < 360; l.angle += 4)
        //{
            expand_array_gpu(l.weights_gpu, l.weight_deform_gpu, l.nweights, 4);

            //simple_copy_ongpu(l.nweights, l.weight_deform_gpu, l.weights_gpu);

            if (l.rotate) rotate_weights_gpu(l.weight_deform_gpu, l.weights_gpu, l.nweights, l.n, l.size, 0);
            else if (l.sway) sway_and_flip_weights_gpu(l.weight_deform_gpu, l.weights_gpu, l.nweights, l.n, l.size, l.angle, 0);
            else if (l.stretch) stretch_weights_gpu(l.weight_deform_gpu, l.weights_gpu, l.nweights, l.n, l.size, 0, 0);
            else if (l.stretch_sway) stretch_sway_flip_weights_gpu(l.weight_deform_gpu, l.weights_gpu, l.nweights, l.n, l.size, l.angle, 0);

            //printf(" angle = %f, reverse = %d \n", l.angle, 0);
            //cuda_pull_array(l.weights_gpu, l.weights, l.nweights);
            //visualize_convolutional_layer(l, "weights", NULL);
            //wait_key_cv(10);
        //}
    }

    if (l.clip) {
        constrain_ongpu(l.nweights, l.clip, l.weights_gpu, 1);
    }
}
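
/*
 Usage sketch (hypothetical caller; the names net.batch, net.subdivisions,
 net.momentum, net.decay, net.loss_scale and get_current_rate are assumptions
 for illustration, not quoted from this file): during training the
 network-level update loop is expected to call this once per layer with the
 effective batch size and the schedule's current learning rate, roughly:

     float rate = get_current_rate(net);   // assumed helper
     update_convolutional_layer_gpu(l, net.batch * net.subdivisions,
                                    rate, net.momentum, net.decay,
                                    net.loss_scale);

 constrain_ongpu() above clamps every weight to the range [-l.clip, l.clip].
*/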


/*
void update_convolutional_layer_gpu(convolutional_layer layer, int batch, float learning_rate, float momentum, float decay)
{
    int size = layer.size*layer.size*layer.c*layer.n;
    axpy_ongpu(layer.n, learning_rate/batch, layer.bias_updates_gpu, 1, layer.biases_gpu, 1);
    scal_ongpu(layer.n, momentum, layer.bias_updates_gpu, 1);

    if(layer.scales_gpu){
        axpy_ongpu(layer.n, learning_rate/batch, layer.scale_updates_gpu, 1, layer.scales_gpu, 1);
        scal_ongpu(layer.n, momentum, layer.scale_updates_gpu, 1);
    }

    if(layer.adam){
        scal_ongpu(size, layer.B1, layer.m_gpu, 1);
        scal_ongpu(size, layer.B2, layer.v_gpu, 1);

        axpy_ongpu(size, -decay*batch, layer.weights_gpu, 1, layer.weight_updates_gpu, 1);

        axpy_ongpu(size, -(1-layer.B1), layer.weight_updates_gpu, 1, layer.m_gpu, 1);
        mul_ongpu(size, layer.weight_updates_gpu, 1, layer.weight_updates_gpu, 1);
        axpy_ongpu(size, (1-layer.B2), layer.weight_updates_gpu, 1, layer.v_gpu, 1);

        adam_gpu(size, layer.weights_gpu, layer.m_gpu, layer.v_gpu, layer.B1, layer.B2, learning_rate/batch, layer.eps, layer.t+1);
        fill_ongpu(size, 0, layer.weight_updates_gpu, 1);
    }else{
        axpy_ongpu(size, -decay*batch, layer.weights_gpu, 1, layer.weight_updates_gpu, 1);  // wu = wu - w*decay*batch
        axpy_ongpu(size, learning_rate/batch, layer.weight_updates_gpu, 1, layer.weights_gpu, 1); // w = w + wu*lr/batch
        scal_ongpu(size, momentum, layer.weight_updates_gpu, 1);    // wu = wu*momentum // wu = (wu - w*decay*batch)*momentum
        // w = w + (wu - w*decay*batch)*lr/batch = w + wu*lr/batch - w*decay*lr = w*(1-decay*lr) + wu*lr/batch
        //wu_prev = (wu_old - w_old*decay*batch)*momentum


        //weights_update = weights_update_new + (weights_update_old - weights_old*decay*batch)*momentum - weights_new*decay*batch =
        // = weights_update_new + weights_update_old*momentum - weights_old*decay*batch*momentum - weights_new*decay*batch
        // = weights_update_new + weights_update_old*momentum - (weights_old*momentum + weights_new)*decay*batch

        //------------- RESULT --------------
        // weights_update = weights_update_new + weights_update_old*momentum - (weights_old*momentum + weights_new)*decay*batch
        //-----------------------------------

        // weights_newest = weights_new + (weights_update_new + weights_update_old*momentum - (weights_old*momentum + weights_new)*decay*batch)*lr/batch
        // = weights_new + weights_update_new*lr/batch + weights_update_old*momentum*lr/batch - weights_old*momentum*decay*batch*lr/batch - weights_new*decay*batch*lr/batch
        // = weights_new + weights_update_new*lr/batch + weights_update_old*momentum*lr/batch - weights_old*momentum*decay*lr - weights_new*decay*lr
        // = weights_new*(1 - decay*lr) - weights_old*momentum*decay*lr + (weights_update_new + weights_update_old*momentum)*lr/batch

        //------------- RESULT --------------
        // weights_newest = weights_new*(1 - decay*lr) - weights_old*momentum*(decay*lr) + (weights_update_new + weights_update_old*momentum)*lr/batch =
        // = weights_new - (weights_new + weights_old*momentum)*decay*lr + (weights_update_new + weights_update_old*momentum)*lr / batch
        //-----------------------------------
    }
}
*/