#include "connected_layer.h"
#include "batchnorm_layer.h"
#include "convolutional_layer.h"
#include "utils.h"
#include "dark_cuda.h"
#include "blas.h"
#include "gemm.h"

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

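// Returns the scratch workspace size (in bytes) needed by the cuDNN code paths below.
// The connected layer reuses the convolutional layer's descriptors, so the
// convolutional helper is called; without CUDNN no workspace is required.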
size_t get_connected_workspace_size(layer l)
{
#ifdef CUDNN
    return get_convolutional_workspace_size(l);
    /*
    if (gpu_index >= 0) {
        size_t most = 0;
        size_t s = 0;
        CHECK_CUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnn_handle(),
            l.srcTensorDesc,
            l.weightDesc,
            l.convDesc,
            l.dstTensorDesc,
            l.fw_algo,
            &s));
        if (s > most) most = s;
        CHECK_CUDNN(cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnn_handle(),
            l.srcTensorDesc,
            l.ddstTensorDesc,
            l.convDesc,
            l.dweightDesc,
            l.bf_algo,
            &s));
        if (s > most) most = s;
        CHECK_CUDNN(cudnnGetConvolutionBackwardDataWorkspaceSize(cudnn_handle(),
            l.weightDesc,
            l.ddstTensorDesc,
            l.convDesc,
            l.dsrcTensorDesc,
            l.bd_algo,
            &s));
        if (s > most) most = s;
        return most;
    }
    */
#endif
    return 0;
}

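// Builds a fully connected layer with `inputs` inputs and `outputs` outputs.
// The layer is shaped like a 1x1 convolution (h = w = 1, c = inputs, out_c = outputs)
// so the cuDNN convolution paths can be reused on GPU. Weights are initialized with
// uniform noise scaled by sqrt(2/inputs); optional batch-norm buffers and GPU mirrors
// of all parameters are allocated as well.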
connected_layer make_connected_layer(int batch, int steps, int inputs, int outputs, ACTIVATION activation, int batch_normalize)
{
    int total_batch = batch*steps;
    int i;
    connected_layer l = { (LAYER_TYPE)0 };
    l.type = CONNECTED;

    l.inputs = inputs;
    l.outputs = outputs;
    l.batch = batch;
    l.batch_normalize = batch_normalize;
    l.h = 1;
    l.w = 1;
    l.c = inputs;
    l.out_h = 1;
    l.out_w = 1;
    l.out_c = outputs;
    l.n = l.out_c;
    l.size = 1;
    l.stride = l.stride_x = l.stride_y = 1;
    l.pad = 0;
    l.activation = activation;
    l.learning_rate_scale = 1;
    l.groups = 1;
    l.dilation = 1;

    l.output = (float*)xcalloc(total_batch * outputs, sizeof(float));
    l.delta = (float*)xcalloc(total_batch * outputs, sizeof(float));

    l.weight_updates = (float*)xcalloc(inputs * outputs, sizeof(float));
    l.bias_updates = (float*)xcalloc(outputs, sizeof(float));

    l.weights = (float*)xcalloc(outputs * inputs, sizeof(float));
    l.biases = (float*)xcalloc(outputs, sizeof(float));

    l.forward = forward_connected_layer;
    l.backward = backward_connected_layer;
    l.update = update_connected_layer;

    //float scale = 1./sqrt(inputs);
    float scale = sqrt(2.f/inputs);
    for(i = 0; i < outputs*inputs; ++i){
        l.weights[i] = scale*rand_uniform(-1, 1);
    }

    for(i = 0; i < outputs; ++i){
        l.biases[i] = 0;
    }

    if(batch_normalize){
        l.scales = (float*)xcalloc(outputs, sizeof(float));
        l.scale_updates = (float*)xcalloc(outputs, sizeof(float));
        for(i = 0; i < outputs; ++i){
            l.scales[i] = 1;
        }

        l.mean = (float*)xcalloc(outputs, sizeof(float));
        l.mean_delta = (float*)xcalloc(outputs, sizeof(float));
        l.variance = (float*)xcalloc(outputs, sizeof(float));
        l.variance_delta = (float*)xcalloc(outputs, sizeof(float));

        l.rolling_mean = (float*)xcalloc(outputs, sizeof(float));
        l.rolling_variance = (float*)xcalloc(outputs, sizeof(float));

        l.x = (float*)xcalloc(total_batch * outputs, sizeof(float));
        l.x_norm = (float*)xcalloc(total_batch * outputs, sizeof(float));
    }

#ifdef GPU
    l.forward_gpu = forward_connected_layer_gpu;
    l.backward_gpu = backward_connected_layer_gpu;
    l.update_gpu = update_connected_layer_gpu;

    l.weights_gpu = cuda_make_array(l.weights, outputs*inputs);
    l.biases_gpu = cuda_make_array(l.biases, outputs);

    l.weight_updates_gpu = cuda_make_array(l.weight_updates, outputs*inputs);
    l.bias_updates_gpu = cuda_make_array(l.bias_updates, outputs);

    l.output_gpu = cuda_make_array(l.output, outputs*total_batch);
    l.delta_gpu = cuda_make_array(l.delta, outputs*total_batch);
    if (batch_normalize) {
        l.scales_gpu = cuda_make_array(l.scales, outputs);
        l.scale_updates_gpu = cuda_make_array(l.scale_updates, outputs);

        l.mean_gpu = cuda_make_array(l.mean, outputs);
        l.variance_gpu = cuda_make_array(l.variance, outputs);

        l.rolling_mean_gpu = cuda_make_array(l.mean, outputs);
        l.rolling_variance_gpu = cuda_make_array(l.variance, outputs);

        l.mean_delta_gpu = cuda_make_array(l.mean, outputs);
        l.variance_delta_gpu = cuda_make_array(l.variance, outputs);

        l.x_gpu = cuda_make_array(l.output, total_batch*outputs);
        l.x_norm_gpu = cuda_make_array(l.output, total_batch*outputs);
    }
#ifdef CUDNN
    create_convolutional_cudnn_tensors(&l);
    cudnn_convolutional_setup(&l, cudnn_fastest, 0);    // cudnn_fastest, cudnn_smallest
    l.workspace_size = get_connected_workspace_size(l);
#endif  // CUDNN
#endif  // GPU
    fprintf(stderr, "connected %4d -> %4d\n", inputs, outputs);
    return l;
}

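// CPU parameter update: SGD with momentum and L2 weight decay. Biases, batch-norm
// scales (if enabled) and weights are stepped by their accumulated updates, which
// are then scaled by `momentum` for the next iteration.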
void update_connected_layer(connected_layer l, int batch, float learning_rate, float momentum, float decay)
{
    axpy_cpu(l.outputs, learning_rate/batch, l.bias_updates, 1, l.biases, 1);
    scal_cpu(l.outputs, momentum, l.bias_updates, 1);

    if(l.batch_normalize){
        axpy_cpu(l.outputs, learning_rate/batch, l.scale_updates, 1, l.scales, 1);
        scal_cpu(l.outputs, momentum, l.scale_updates, 1);
    }

    axpy_cpu(l.inputs*l.outputs, -decay*batch, l.weights, 1, l.weight_updates, 1);
    axpy_cpu(l.inputs*l.outputs, learning_rate/batch, l.weight_updates, 1, l.weights, 1);
    scal_cpu(l.inputs*l.outputs, momentum, l.weight_updates, 1);
}

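// CPU forward pass: output = input * weights^T, optionally batch-normalized,
// then the bias is added and the activation function applied.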
void forward_connected_layer(connected_layer l, network_state state)
{
    int i;
    fill_cpu(l.outputs*l.batch, 0, l.output, 1);
    int m = l.batch;
    int k = l.inputs;
    int n = l.outputs;
    float *a = state.input;
    float *b = l.weights;
    float *c = l.output;
    gemm(0,1,m,n,k,1,a,k,b,k,1,c,n);
    if(l.batch_normalize){
        if(state.train){
            mean_cpu(l.output, l.batch, l.outputs, 1, l.mean);
            variance_cpu(l.output, l.mean, l.batch, l.outputs, 1, l.variance);

            scal_cpu(l.outputs, .95f, l.rolling_mean, 1);
            axpy_cpu(l.outputs, .05f, l.mean, 1, l.rolling_mean, 1);
            scal_cpu(l.outputs, .95f, l.rolling_variance, 1);
            axpy_cpu(l.outputs, .05f, l.variance, 1, l.rolling_variance, 1);

            copy_cpu(l.outputs*l.batch, l.output, 1, l.x, 1);
            normalize_cpu(l.output, l.mean, l.variance, l.batch, l.outputs, 1);
            copy_cpu(l.outputs*l.batch, l.output, 1, l.x_norm, 1);
        } else {
            normalize_cpu(l.output, l.rolling_mean, l.rolling_variance, l.batch, l.outputs, 1);
        }
        scale_bias(l.output, l.scales, l.batch, l.outputs, 1);
    }
    for(i = 0; i < l.batch; ++i){
        axpy_cpu(l.outputs, 1, l.biases, 1, l.output + i*l.outputs, 1);
    }
    activate_array(l.output, l.outputs*l.batch, l.activation);
}

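// CPU backward pass: applies the activation gradient to l.delta, accumulates bias
// (and batch-norm) updates, accumulates weight updates as delta^T * input, and
// propagates the error to the previous layer through state.delta when it is set.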
void backward_connected_layer(connected_layer l, network_state state)
{
    int i;
    gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);
    for(i = 0; i < l.batch; ++i){
        axpy_cpu(l.outputs, 1, l.delta + i*l.outputs, 1, l.bias_updates, 1);
    }
    if(l.batch_normalize){
        backward_scale_cpu(l.x_norm, l.delta, l.batch, l.outputs, 1, l.scale_updates);

        scale_bias(l.delta, l.scales, l.batch, l.outputs, 1);

        mean_delta_cpu(l.delta, l.variance, l.batch, l.outputs, 1, l.mean_delta);
        variance_delta_cpu(l.x, l.delta, l.mean, l.variance, l.batch, l.outputs, 1, l.variance_delta);
        normalize_delta_cpu(l.x, l.mean, l.variance, l.mean_delta, l.variance_delta, l.batch, l.outputs, 1, l.delta);
    }

    int m = l.outputs;
    int k = l.batch;
    int n = l.inputs;
    float *a = l.delta;
    float *b = state.input;
    float *c = l.weight_updates;
    gemm(1,0,m,n,k,1,a,m,b,n,1,c,n);

    m = l.batch;
    k = l.outputs;
    n = l.inputs;

    a = l.delta;
    b = l.weights;
    c = state.delta;

    if(c) gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
}


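// Folds the batch-norm statistics into the weights and biases so the layer can run
// without normalization: weights are scaled by scale/sqrt(variance + eps), biases
// are shifted by the rolling mean, and the batch-norm buffers are reset.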
void denormalize_connected_layer(layer l)
{
    int i, j;
    for(i = 0; i < l.outputs; ++i){
        float scale = l.scales[i]/sqrt(l.rolling_variance[i] + .000001f);
        for(j = 0; j < l.inputs; ++j){
            l.weights[i*l.inputs + j] *= scale;
        }
        l.biases[i] -= l.rolling_mean[i] * scale;
        l.scales[i] = 1;
        l.rolling_mean[i] = 0;
        l.rolling_variance[i] = 1;
    }
}


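// Prints summary statistics of the layer parameters (scales, biases, weights).
// Note: only the first l.outputs weight values are included in the weight statistics.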
void statistics_connected_layer(layer l)
{
    if(l.batch_normalize){
        printf("Scales ");
        print_statistics(l.scales, l.outputs);
        /*
        printf("Rolling Mean ");
        print_statistics(l.rolling_mean, l.outputs);
        printf("Rolling Variance ");
        print_statistics(l.rolling_variance, l.outputs);
        */
    }
    printf("Biases ");
    print_statistics(l.biases, l.outputs);
    printf("Weights ");
    print_statistics(l.weights, l.outputs);
}

#ifdef GPU

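// Copies layer parameters and their accumulated updates from GPU memory to the host.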
void pull_connected_layer(connected_layer l)
{
    cuda_pull_array(l.weights_gpu, l.weights, l.inputs*l.outputs);
    cuda_pull_array(l.biases_gpu, l.biases, l.outputs);
    cuda_pull_array(l.weight_updates_gpu, l.weight_updates, l.inputs*l.outputs);
    cuda_pull_array(l.bias_updates_gpu, l.bias_updates, l.outputs);
    if (l.batch_normalize){
        cuda_pull_array(l.scales_gpu, l.scales, l.outputs);
        cuda_pull_array(l.rolling_mean_gpu, l.rolling_mean, l.outputs);
        cuda_pull_array(l.rolling_variance_gpu, l.rolling_variance, l.outputs);
    }
    CHECK_CUDA(cudaPeekAtLastError());
}

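// Copies layer parameters and their accumulated updates from the host to GPU memory.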
void push_connected_layer(connected_layer l)
{
    cuda_push_array(l.weights_gpu, l.weights, l.inputs*l.outputs);
    cuda_push_array(l.biases_gpu, l.biases, l.outputs);
    cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.inputs*l.outputs);
    cuda_push_array(l.bias_updates_gpu, l.bias_updates, l.outputs);
    if (l.batch_normalize){
        cuda_push_array(l.scales_gpu, l.scales, l.outputs);
        cuda_push_array(l.rolling_mean_gpu, l.rolling_mean, l.outputs);
        cuda_push_array(l.rolling_variance_gpu, l.rolling_variance, l.outputs);
    }
    CHECK_CUDA(cudaPeekAtLastError());
}

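// GPU parameter update: same SGD-with-momentum rule as the CPU version, with an
// optional loss-scale correction used for mixed-precision training.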
void update_connected_layer_gpu(connected_layer l, int batch, float learning_rate_init, float momentum, float decay, float loss_scale)
{
    float learning_rate = learning_rate_init * l.learning_rate_scale;

    // Loss scale for Mixed-Precision on Tensor-Cores
    if (loss_scale != 1.0) {
        scal_ongpu(l.inputs*l.outputs, 1.0 / loss_scale, l.weight_updates_gpu, 1);
        scal_ongpu(l.outputs, 1.0 / loss_scale, l.bias_updates_gpu, 1);
        // scale_updates_gpu is only allocated when batch_normalize is set
        if (l.scale_updates_gpu) scal_ongpu(l.outputs, 1.0 / loss_scale, l.scale_updates_gpu, 1);
    }

    axpy_ongpu(l.outputs, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
    scal_ongpu(l.outputs, momentum, l.bias_updates_gpu, 1);

    if(l.batch_normalize){
        axpy_ongpu(l.outputs, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1);
        scal_ongpu(l.outputs, momentum, l.scale_updates_gpu, 1);
    }

    axpy_ongpu(l.inputs*l.outputs, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
    axpy_ongpu(l.inputs*l.outputs, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
    scal_ongpu(l.inputs*l.outputs, momentum, l.weight_updates_gpu, 1);
}

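// GPU forward pass: either cuDNN convolution (the layer is set up as a 1x1
// convolution) or a plain GEMM, followed by batch norm / bias and activation.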
void forward_connected_layer_gpu(connected_layer l, network_state state)
{
    fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1);

    int m = l.batch;
    int k = l.inputs;
    int n = l.outputs;
    float * a = state.input;
    float * b = l.weights_gpu;
    float * c = l.output_gpu;
#ifdef CUDNN
    float one = 1;    // alpha[0], beta[0]
    float alpha = 1, beta = 0;

    CHECK_CUDNN(cudnnConvolutionForward(cudnn_handle(),
        &alpha, //&one,
        l.srcTensorDesc,
        state.input,
        l.weightDesc,
        l.weights_gpu,
        l.convDesc,
        l.fw_algo,
        state.workspace,
        l.workspace_size,
        &beta,  //&one,
        l.dstTensorDesc,
        l.output_gpu));
#else   // CUDNN
    gemm_ongpu(0,1,m,n,k,1,a,k,b,k,1,c,n);
#endif  // CUDNN

    if (l.batch_normalize) {
        forward_batchnorm_layer_gpu(l, state);
    }
    else {
        add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.outputs, 1);
    }
    //for(i = 0; i < l.batch; ++i) axpy_ongpu(l.outputs, 1, l.biases_gpu, 1, l.output_gpu + i*l.outputs, 1);
    activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
}

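// GPU backward pass: mirrors the CPU version. The cuDNN branch below is disabled
// (CUDNN_DISABLED), so the GEMM path is used for accumulating weight updates and
// for propagating the error to state.delta.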
void backward_connected_layer_gpu(connected_layer l, network_state state)
{
    int i;
    constrain_ongpu(l.outputs*l.batch, 1, l.delta_gpu, 1);
    gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
    for(i = 0; i < l.batch; ++i){
        axpy_ongpu(l.outputs, 1, l.delta_gpu + i*l.outputs, 1, l.bias_updates_gpu, 1);
    }

    if(l.batch_normalize){
        backward_batchnorm_layer_gpu(l, state);
    }

#ifdef CUDNN_DISABLED
    float one = 1;
    // calculate conv weight updates
    // if used: beta=1 then loss decreases faster
    CHECK_CUDNN(cudnnConvolutionBackwardFilter(cudnn_handle(),
        &one,
        l.srcTensorDesc,
        state.input,
        l.ddstTensorDesc,
        l.delta_gpu,
        l.convDesc,
        l.bf_algo,
        state.workspace,
        l.workspace_size,
        &one,
        l.dweightDesc,
        l.weight_updates_gpu));

    if (state.delta) {
        // http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnConvolutionBackwardData
        // calculate delta for the next layer

        CHECK_CUDNN(cudnnConvolutionBackwardData(cudnn_handle(),
            &one,
            l.weightDesc,
            l.weights_gpu,
            l.ddstTensorDesc,
            l.delta_gpu,
            l.convDesc,
            l.bd_algo,
            state.workspace,
            l.workspace_size,
            &one,
            l.dsrcTensorDesc,
            state.delta));
    }
#else   // CUDNN_DISABLED

    int m = l.outputs;
    int k = l.batch;
    int n = l.inputs;
    float * a = l.delta_gpu;
    float * b = state.input;
    float * c = l.weight_updates_gpu;

    gemm_ongpu(1,0,m,n,k,1,a,m,b,n,1,c,n);

    m = l.batch;
    k = l.outputs;
    n = l.inputs;

    a = l.delta_gpu;
    b = l.weights_gpu;
    c = state.delta;

    if(c) gemm_ongpu(0,0,m,n,k,1,a,k,b,n,1,c,n);
#endif  // CUDNN_DISABLED
}
#endif  // GPU
