#include "connected_layer.h"
#include "batchnorm_layer.h"
#include "convolutional_layer.h"
#include "utils.h"
#include "dark_cuda.h"
#include "blas.h"
#include "gemm.h"

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

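// Scratch workspace (in bytes) needed by the layer. With cuDNN the connected
// layer is executed as a 1x1 convolution, so the convolutional workspace
// query is reused; without cuDNN no extra workspace is needed.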
size_t get_connected_workspace_size(layer l)
{
#ifdef CUDNN
    return get_convolutional_workspace_size(l);
    /*
    if (gpu_index >= 0) {
        size_t most = 0;
        size_t s = 0;
        CHECK_CUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnn_handle(),
            l.srcTensorDesc,
            l.weightDesc,
            l.convDesc,
            l.dstTensorDesc,
            l.fw_algo,
            &s));
        if (s > most) most = s;
        CHECK_CUDNN(cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnn_handle(),
            l.srcTensorDesc,
            l.ddstTensorDesc,
            l.convDesc,
            l.dweightDesc,
            l.bf_algo,
            &s));
        if (s > most) most = s;
        CHECK_CUDNN(cudnnGetConvolutionBackwardDataWorkspaceSize(cudnn_handle(),
            l.weightDesc,
            l.ddstTensorDesc,
            l.convDesc,
            l.dsrcTensorDesc,
            l.bd_algo,
            &s));
        if (s > most) most = s;
        return most;
    }
    */
#endif
    return 0;
}

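// Builds a fully connected layer mapping `inputs` -> `outputs` units.
// total_batch = batch*steps so recurrent layers can reuse one allocation for
// all time steps. The geometry (1x1 spatial size, one channel per unit) lets
// the layer also be driven through the convolutional/cuDNN code paths.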
connected_layer make_connected_layer(int batch, int steps, int inputs, int outputs, ACTIVATION activation, int batch_normalize)
{
    int total_batch = batch*steps;
    int i;
    connected_layer l = { (LAYER_TYPE)0 };
    l.type = CONNECTED;

    l.inputs = inputs;
    l.outputs = outputs;
    l.batch = batch;
    l.batch_normalize = batch_normalize;
    l.h = 1;
    l.w = 1;
    l.c = inputs;
    l.out_h = 1;
    l.out_w = 1;
    l.out_c = outputs;
    l.n = l.out_c;
    l.size = 1;
    l.stride = l.stride_x = l.stride_y = 1;
    l.pad = 0;
    l.activation = activation;
    l.learning_rate_scale = 1;
    l.groups = 1;
    l.dilation = 1;

    l.output = (float*)xcalloc(total_batch * outputs, sizeof(float));
    l.delta = (float*)xcalloc(total_batch * outputs, sizeof(float));

    l.weight_updates = (float*)xcalloc(inputs * outputs, sizeof(float));
    l.bias_updates = (float*)xcalloc(outputs, sizeof(float));

    l.weights = (float*)xcalloc(outputs * inputs, sizeof(float));
    l.biases = (float*)xcalloc(outputs, sizeof(float));

    l.forward = forward_connected_layer;
    l.backward = backward_connected_layer;
    l.update = update_connected_layer;

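    // Scaled uniform weight initialization, sqrt(2/fan_in)*U(-1,1) (He-style
    // scaling); biases start at zero.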
    //float scale = 1./sqrt(inputs);
    float scale = sqrt(2.f/inputs);
    for(i = 0; i < outputs*inputs; ++i){
        l.weights[i] = scale*rand_uniform(-1, 1);
    }

    for(i = 0; i < outputs; ++i){
        l.biases[i] = 0;
    }

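    // Batch normalization needs per-output scales (gamma), per-batch
    // statistics, rolling statistics for inference, and saved pre- and
    // post-normalization activations (x, x_norm) for the backward pass.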
    if(batch_normalize){
        l.scales = (float*)xcalloc(outputs, sizeof(float));
        l.scale_updates = (float*)xcalloc(outputs, sizeof(float));
        for(i = 0; i < outputs; ++i){
            l.scales[i] = 1;
        }

        l.mean = (float*)xcalloc(outputs, sizeof(float));
        l.mean_delta = (float*)xcalloc(outputs, sizeof(float));
        l.variance = (float*)xcalloc(outputs, sizeof(float));
        l.variance_delta = (float*)xcalloc(outputs, sizeof(float));

        l.rolling_mean = (float*)xcalloc(outputs, sizeof(float));
        l.rolling_variance = (float*)xcalloc(outputs, sizeof(float));

        l.x = (float*)xcalloc(total_batch * outputs, sizeof(float));
        l.x_norm = (float*)xcalloc(total_batch * outputs, sizeof(float));
    }

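// GPU mirrors of the host buffers, seeded from the just-initialized (mostly
// zeroed) host arrays and kept in sync via pull/push below.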
#ifdef GPU
    l.forward_gpu = forward_connected_layer_gpu;
    l.backward_gpu = backward_connected_layer_gpu;
    l.update_gpu = update_connected_layer_gpu;

    l.weights_gpu = cuda_make_array(l.weights, outputs*inputs);
    l.biases_gpu = cuda_make_array(l.biases, outputs);

    l.weight_updates_gpu = cuda_make_array(l.weight_updates, outputs*inputs);
    l.bias_updates_gpu = cuda_make_array(l.bias_updates, outputs);

    l.output_gpu = cuda_make_array(l.output, outputs*total_batch);
    l.delta_gpu = cuda_make_array(l.delta, outputs*total_batch);
    if (batch_normalize) {
        l.scales_gpu = cuda_make_array(l.scales, outputs);
        l.scale_updates_gpu = cuda_make_array(l.scale_updates, outputs);

        l.mean_gpu = cuda_make_array(l.mean, outputs);
        l.variance_gpu = cuda_make_array(l.variance, outputs);

        l.rolling_mean_gpu = cuda_make_array(l.rolling_mean, outputs);
        l.rolling_variance_gpu = cuda_make_array(l.rolling_variance, outputs);

        l.mean_delta_gpu = cuda_make_array(l.mean_delta, outputs);
        l.variance_delta_gpu = cuda_make_array(l.variance_delta, outputs);

        l.x_gpu = cuda_make_array(l.output, total_batch*outputs);
        l.x_norm_gpu = cuda_make_array(l.output, total_batch*outputs);
    }
#ifdef CUDNN
    create_convolutional_cudnn_tensors(&l);
    cudnn_convolutional_setup(&l, cudnn_fastest, 0);   // cudnn_fastest, cudnn_smallest
    l.workspace_size = get_connected_workspace_size(l);
#endif  // CUDNN
#endif  // GPU
    fprintf(stderr, "connected                            %4d  ->  %4d\n", inputs, outputs);
    return l;
}

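// SGD step with momentum and L2 weight decay, applied once per mini-batch:
//   biases  += (lr/batch) * bias_updates
//   weight_updates -= decay*batch * weights       (L2 penalty gradient)
//   weights += (lr/batch) * weight_updates
// Each *_updates buffer is then damped by `momentum` so it carries over into
// the next step.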
void update_connected_layer(connected_layer l, int batch, float learning_rate, float momentum, float decay)
{
    axpy_cpu(l.outputs, learning_rate/batch, l.bias_updates, 1, l.biases, 1);
    scal_cpu(l.outputs, momentum, l.bias_updates, 1);

    if(l.batch_normalize){
        axpy_cpu(l.outputs, learning_rate/batch, l.scale_updates, 1, l.scales, 1);
        scal_cpu(l.outputs, momentum, l.scale_updates, 1);
    }

    axpy_cpu(l.inputs*l.outputs, -decay*batch, l.weights, 1, l.weight_updates, 1);
    axpy_cpu(l.inputs*l.outputs, learning_rate/batch, l.weight_updates, 1, l.weights, 1);
    scal_cpu(l.inputs*l.outputs, momentum, l.weight_updates, 1);
}

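// Forward pass: output = input * weights^T, i.e. gemm(TA=0, TB=1) with
// M=batch rows, N=outputs columns, K=inputs shared dimension. Batch norm
// (batch statistics when training, rolling statistics at inference) is
// applied before the bias add and activation.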
void forward_connected_layer(connected_layer l, network_state state)
{
    int i;
    fill_cpu(l.outputs*l.batch, 0, l.output, 1);
    int m = l.batch;
    int k = l.inputs;
    int n = l.outputs;
    float *a = state.input;
    float *b = l.weights;
    float *c = l.output;
    gemm(0,1,m,n,k,1,a,k,b,k,1,c,n);
    if(l.batch_normalize){
        if(state.train){
            mean_cpu(l.output, l.batch, l.outputs, 1, l.mean);
            variance_cpu(l.output, l.mean, l.batch, l.outputs, 1, l.variance);

            // exponential moving average of the batch statistics (decay 0.95)
            scal_cpu(l.outputs, .95f, l.rolling_mean, 1);
            axpy_cpu(l.outputs, .05f, l.mean, 1, l.rolling_mean, 1);
            scal_cpu(l.outputs, .95f, l.rolling_variance, 1);
            axpy_cpu(l.outputs, .05f, l.variance, 1, l.rolling_variance, 1);

            // keep pre- and post-normalization copies for the backward pass
            copy_cpu(l.outputs*l.batch, l.output, 1, l.x, 1);
            normalize_cpu(l.output, l.mean, l.variance, l.batch, l.outputs, 1);
            copy_cpu(l.outputs*l.batch, l.output, 1, l.x_norm, 1);
        } else {
            normalize_cpu(l.output, l.rolling_mean, l.rolling_variance, l.batch, l.outputs, 1);
        }
        scale_bias(l.output, l.scales, l.batch, l.outputs, 1);
    }
    for(i = 0; i < l.batch; ++i){
        axpy_cpu(l.outputs, 1, l.biases, 1, l.output + i*l.outputs, 1);
    }
    activate_array(l.output, l.outputs*l.batch, l.activation);
}

209 
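// Backward pass: fold the activation gradient into l.delta, accumulate bias
// (and batch-norm) gradients, then two GEMMs:
//   weight_updates += delta^T * input    (outputs x inputs)
//   state.delta    += delta * weights    (batch x inputs, only when the
//                                         previous layer wants gradients)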
void backward_connected_layer(connected_layer l, network_state state)
{
    int i;
    gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);
    for(i = 0; i < l.batch; ++i){
        axpy_cpu(l.outputs, 1, l.delta + i*l.outputs, 1, l.bias_updates, 1);
    }
    if(l.batch_normalize){
        backward_scale_cpu(l.x_norm, l.delta, l.batch, l.outputs, 1, l.scale_updates);

        scale_bias(l.delta, l.scales, l.batch, l.outputs, 1);

        mean_delta_cpu(l.delta, l.variance, l.batch, l.outputs, 1, l.mean_delta);
        variance_delta_cpu(l.x, l.delta, l.mean, l.variance, l.batch, l.outputs, 1, l.variance_delta);
        normalize_delta_cpu(l.x, l.mean, l.variance, l.mean_delta, l.variance_delta, l.batch, l.outputs, 1, l.delta);
    }

    int m = l.outputs;
    int k = l.batch;
    int n = l.inputs;
    float *a = l.delta;
    float *b = state.input;
    float *c = l.weight_updates;
    gemm(1,0,m,n,k,1,a,m,b,n,1,c,n);

    m = l.batch;
    k = l.outputs;
    n = l.inputs;

    a = l.delta;
    b = l.weights;
    c = state.delta;

    if(c) gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
}


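// Folds the batch-norm parameters into the weights and biases so the layer
// can run without normalization:
//   w' = w * scales/sqrt(rolling_variance + eps)
//   b' = b - rolling_mean * scales/sqrt(rolling_variance + eps)
// then resets scales=1, rolling_mean=0, rolling_variance=1.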
void denormalize_connected_layer(layer l)
{
    int i, j;
    for(i = 0; i < l.outputs; ++i){
        float scale = l.scales[i]/sqrt(l.rolling_variance[i] + .000001f);
        for(j = 0; j < l.inputs; ++j){
            l.weights[i*l.inputs + j] *= scale;
        }
        l.biases[i] -= l.rolling_mean[i] * scale;
        l.scales[i] = 1;
        l.rolling_mean[i] = 0;
        l.rolling_variance[i] = 1;
    }
}


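// Prints distribution statistics of the learned parameters. Note that only
// the first l.outputs weights are sampled, not the full l.inputs*l.outputs
// matrix.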
void statistics_connected_layer(layer l)
{
    if(l.batch_normalize){
        printf("Scales ");
        print_statistics(l.scales, l.outputs);
        /*
        printf("Rolling Mean ");
        print_statistics(l.rolling_mean, l.outputs);
        printf("Rolling Variance ");
        print_statistics(l.rolling_variance, l.outputs);
        */
    }
    printf("Biases ");
    print_statistics(l.biases, l.outputs);
    printf("Weights ");
    print_statistics(l.weights, l.outputs);
}

#ifdef GPU

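// pull/push copy the layer's parameters and accumulated gradients between
// device and host memory (GPU -> CPU and CPU -> GPU respectively).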
void pull_connected_layer(connected_layer l)
{
    cuda_pull_array(l.weights_gpu, l.weights, l.inputs*l.outputs);
    cuda_pull_array(l.biases_gpu, l.biases, l.outputs);
    cuda_pull_array(l.weight_updates_gpu, l.weight_updates, l.inputs*l.outputs);
    cuda_pull_array(l.bias_updates_gpu, l.bias_updates, l.outputs);
    if (l.batch_normalize){
        cuda_pull_array(l.scales_gpu, l.scales, l.outputs);
        cuda_pull_array(l.rolling_mean_gpu, l.rolling_mean, l.outputs);
        cuda_pull_array(l.rolling_variance_gpu, l.rolling_variance, l.outputs);
    }
    CHECK_CUDA(cudaPeekAtLastError());
}

void push_connected_layer(connected_layer l)
{
    cuda_push_array(l.weights_gpu, l.weights, l.inputs*l.outputs);
    cuda_push_array(l.biases_gpu, l.biases, l.outputs);
    cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.inputs*l.outputs);
    cuda_push_array(l.bias_updates_gpu, l.bias_updates, l.outputs);
    if (l.batch_normalize){
        cuda_push_array(l.scales_gpu, l.scales, l.outputs);
        cuda_push_array(l.rolling_mean_gpu, l.rolling_mean, l.outputs);
        cuda_push_array(l.rolling_variance_gpu, l.rolling_variance, l.outputs);
    }
    CHECK_CUDA(cudaPeekAtLastError());
}

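// GPU variant of the SGD step. When mixed-precision training scales the loss
// to protect small gradients, the accumulated updates are divided by
// loss_scale here to undo that scaling before they are applied.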
void update_connected_layer_gpu(connected_layer l, int batch, float learning_rate_init, float momentum, float decay, float loss_scale)
{
    float learning_rate = learning_rate_init * l.learning_rate_scale;

    // Loss scale for Mixed-Precision on Tensor-Cores
    if (loss_scale != 1.0) {
        scal_ongpu(l.inputs*l.outputs, 1.0 / loss_scale, l.weight_updates_gpu, 1);
        scal_ongpu(l.outputs, 1.0 / loss_scale, l.bias_updates_gpu, 1);
        // scale_updates_gpu is only allocated when batch_normalize is set
        if (l.scale_updates_gpu) scal_ongpu(l.outputs, 1.0 / loss_scale, l.scale_updates_gpu, 1);
    }

    axpy_ongpu(l.outputs, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
    scal_ongpu(l.outputs, momentum, l.bias_updates_gpu, 1);

    if(l.batch_normalize){
        axpy_ongpu(l.outputs, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1);
        scal_ongpu(l.outputs, momentum, l.scale_updates_gpu, 1);
    }

    axpy_ongpu(l.inputs*l.outputs, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
    axpy_ongpu(l.inputs*l.outputs, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
    scal_ongpu(l.inputs*l.outputs, momentum, l.weight_updates_gpu, 1);
}

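// GPU forward pass. With cuDNN the layer is evaluated as a 1x1 convolution
// via cudnnConvolutionForward; otherwise the same input * weights^T GEMM as
// the CPU path runs on the device.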
void forward_connected_layer_gpu(connected_layer l, network_state state)
{
    fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1);

    int m = l.batch;
    int k = l.inputs;
    int n = l.outputs;
    float * a = state.input;
    float * b = l.weights_gpu;
    float * c = l.output_gpu;
#ifdef CUDNN
    float one = 1;    // alpha[0], beta[0]
    float alpha = 1, beta = 0;

    CHECK_CUDNN(cudnnConvolutionForward(cudnn_handle(),
        &alpha, //&one,
        l.srcTensorDesc,
        state.input,
        l.weightDesc,
        l.weights_gpu,
        l.convDesc,
        l.fw_algo,
        state.workspace,
        l.workspace_size,
        &beta,  //&one,
        l.dstTensorDesc,
        l.output_gpu));
#else // CUDNN
    gemm_ongpu(0,1,m,n,k,1,a,k,b,k,1,c,n);
#endif // CUDNN

    if (l.batch_normalize) {
        forward_batchnorm_layer_gpu(l, state);
    }
    else {
        add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.outputs, 1);
    }
    //for(i = 0; i < l.batch; ++i) axpy_ongpu(l.outputs, 1, l.biases_gpu, 1, l.output_gpu + i*l.outputs, 1);
    activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
}

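// GPU backward pass. Deltas are first clipped to [-1, 1] (constrain_ongpu)
// before the activation gradient is applied. The cuDNN backward path below
// is intentionally compiled out (CUDNN_DISABLED); the GEMM path is used.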
void backward_connected_layer_gpu(connected_layer l, network_state state)
{
    int i;
    constrain_ongpu(l.outputs*l.batch, 1, l.delta_gpu, 1);
    gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
    for(i = 0; i < l.batch; ++i){
        axpy_ongpu(l.outputs, 1, l.delta_gpu + i*l.outputs, 1, l.bias_updates_gpu, 1);
    }

    if(l.batch_normalize){
        backward_batchnorm_layer_gpu(l, state);
    }

#ifdef CUDNN_DISABLED
    float one = 1;
    // calculate conv weight updates
    // if used: beta=1 then loss decreases faster
    CHECK_CUDNN(cudnnConvolutionBackwardFilter(cudnn_handle(),
        &one,
        l.srcTensorDesc,
        state.input,
        l.ddstTensorDesc,
        l.delta_gpu,
        l.convDesc,
        l.bf_algo,
        state.workspace,
        l.workspace_size,
        &one,
        l.dweightDesc,
        l.weight_updates_gpu));

    if (state.delta) {
        // http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnConvolutionBackwardData
        // calculate delta for the next layer

        CHECK_CUDNN(cudnnConvolutionBackwardData(cudnn_handle(),
            &one,
            l.weightDesc,
            l.weights_gpu,
            l.ddstTensorDesc,
            l.delta_gpu,
            l.convDesc,
            l.bd_algo,
            state.workspace,
            l.workspace_size,
            &one,
            l.dsrcTensorDesc,
            state.delta));
    }
#else // CUDNN_DISABLED

    int m = l.outputs;
    int k = l.batch;
    int n = l.inputs;
    float * a = l.delta_gpu;
    float * b = state.input;
    float * c = l.weight_updates_gpu;

    gemm_ongpu(1,0,m,n,k,1,a,m,b,n,1,c,n);

    m = l.batch;
    k = l.outputs;
    n = l.inputs;

    a = l.delta_gpu;
    b = l.weights_gpu;
    c = state.delta;

    if(c) gemm_ongpu(0,0,m,n,k,1,a,k,b,n,1,c,n);
#endif // CUDNN_DISABLED
}
#endif