#include "batchnorm_layer.h"
#include "blas.h"
#include "utils.h"
#include <stdio.h>
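
// Builds a standalone batch-normalization layer for a batch x w x h x c input.
// Scales (gamma) start at 1 and biases (beta) at 0, so the layer initially passes
// the normalized input through unchanged. Minimal usage sketch, assuming the usual
// darknet calling convention:
//   layer l = make_batchnorm_layer(batch, w, h, c, 1 /* train */);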
layer make_batchnorm_layer(int batch, int w, int h, int c, int train)
{
    fprintf(stderr, "Batch Normalization Layer: %d x %d x %d image\n", w,h,c);
    layer layer = { (LAYER_TYPE)0 };
    layer.type = BATCHNORM;
    layer.batch = batch;
    layer.train = train;
    layer.h = layer.out_h = h;
    layer.w = layer.out_w = w;
    layer.c = layer.out_c = c;

    layer.n = layer.c;
    layer.output = (float*)xcalloc(h * w * c * batch, sizeof(float));
    layer.delta = (float*)xcalloc(h * w * c * batch, sizeof(float));
    layer.inputs = w*h*c;
    layer.outputs = layer.inputs;

    layer.biases = (float*)xcalloc(c, sizeof(float));
    layer.bias_updates = (float*)xcalloc(c, sizeof(float));

    layer.scales = (float*)xcalloc(c, sizeof(float));
    layer.scale_updates = (float*)xcalloc(c, sizeof(float));
    int i;
    for(i = 0; i < c; ++i){
        layer.scales[i] = 1;
    }

    layer.mean = (float*)xcalloc(c, sizeof(float));
    layer.variance = (float*)xcalloc(c, sizeof(float));

    layer.rolling_mean = (float*)xcalloc(c, sizeof(float));
    layer.rolling_variance = (float*)xcalloc(c, sizeof(float));

    layer.forward = forward_batchnorm_layer;
    layer.backward = backward_batchnorm_layer;
    layer.update = update_batchnorm_layer;
#ifdef GPU
    layer.forward_gpu = forward_batchnorm_layer_gpu;
    layer.backward_gpu = backward_batchnorm_layer_gpu;
    layer.update_gpu = update_batchnorm_layer_gpu;

    layer.output_gpu = cuda_make_array(layer.output, h * w * c * batch);

    layer.biases_gpu = cuda_make_array(layer.biases, c);
    layer.scales_gpu = cuda_make_array(layer.scales, c);

    if (train) {
        layer.delta_gpu = cuda_make_array(layer.delta, h * w * c * batch);

        layer.bias_updates_gpu = cuda_make_array(layer.bias_updates, c);
        layer.scale_updates_gpu = cuda_make_array(layer.scale_updates, c);

        layer.mean_delta_gpu = cuda_make_array(layer.mean, c);
        layer.variance_delta_gpu = cuda_make_array(layer.variance, c);
    }

    layer.mean_gpu = cuda_make_array(layer.mean, c);
    layer.variance_gpu = cuda_make_array(layer.variance, c);

    layer.rolling_mean_gpu = cuda_make_array(layer.mean, c);
    layer.rolling_variance_gpu = cuda_make_array(layer.variance, c);

    if (train) {
        layer.x_gpu = cuda_make_array(layer.output, layer.batch*layer.outputs);
#ifndef CUDNN
        layer.x_norm_gpu = cuda_make_array(layer.output, layer.batch*layer.outputs);
#endif // not CUDNN
    }

#ifdef CUDNN
    CHECK_CUDNN(cudnnCreateTensorDescriptor(&layer.normTensorDesc));
    CHECK_CUDNN(cudnnCreateTensorDescriptor(&layer.normDstTensorDesc));
    CHECK_CUDNN(cudnnSetTensor4dDescriptor(layer.normDstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, layer.batch, layer.out_c, layer.out_h, layer.out_w));
    CHECK_CUDNN(cudnnSetTensor4dDescriptor(layer.normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, layer.out_c, 1, 1));
#endif
#endif
    return layer;
}
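
// Gradient of the loss w.r.t. the per-channel scale (gamma):
//   scale_updates[f] += sum over batch and spatial positions of delta * x_norm
// where x_norm holds the normalized, pre-scale activations saved in the forward pass.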
void backward_scale_cpu(float *x_norm, float *delta, int batch, int n, int size, float *scale_updates)
{
    int i,b,f;
    for(f = 0; f < n; ++f){
        float sum = 0;
        for(b = 0; b < batch; ++b){
            for(i = 0; i < size; ++i){
                int index = i + size*(f + n*b);
                sum += delta[index] * x_norm[index];
            }
        }
        scale_updates[f] += sum;
    }
}
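
// Gradient of the loss w.r.t. the per-channel batch mean:
//   mean_delta[i] = (sum over batch and spatial of delta) * -1/sqrt(variance[i] + eps)
// This is the dL/dmean term of the standard batch-norm backward pass (eps = 1e-5 here).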
void mean_delta_cpu(float *delta, float *variance, int batch, int filters, int spatial, float *mean_delta)
{
    int i,j,k;
    for(i = 0; i < filters; ++i){
        mean_delta[i] = 0;
        for (j = 0; j < batch; ++j) {
            for (k = 0; k < spatial; ++k) {
                int index = j*filters*spatial + i*spatial + k;
                mean_delta[i] += delta[index];
            }
        }
        mean_delta[i] *= (-1./sqrt(variance[i] + .00001f));
    }
}
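
// Gradient of the loss w.r.t. the per-channel batch variance:
//   variance_delta[i] = sum(delta * (x - mean)) * -1/2 * (variance[i] + eps)^(-3/2)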
void variance_delta_cpu(float *x, float *delta, float *mean, float *variance, int batch, int filters, int spatial, float *variance_delta)
{
    int i,j,k;
    for(i = 0; i < filters; ++i){
        variance_delta[i] = 0;
        for(j = 0; j < batch; ++j){
            for(k = 0; k < spatial; ++k){
                int index = j*filters*spatial + i*spatial + k;
                variance_delta[i] += delta[index]*(x[index] - mean[i]);
            }
        }
        variance_delta[i] *= -.5 * pow(variance[i] + .00001f, (float)(-3./2.));
    }
}
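
// Combines the three gradient terms into the gradient w.r.t. the layer input x
// (N = spatial * batch, written exactly as the code below computes it):
//   dL/dx = delta / (sqrt(variance) + eps)
//         + variance_delta * 2*(x - mean) / N
//         + mean_delta / N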
void normalize_delta_cpu(float *x, float *mean, float *variance, float *mean_delta, float *variance_delta, int batch, int filters, int spatial, float *delta)
{
    int f, j, k;
    for(j = 0; j < batch; ++j){
        for(f = 0; f < filters; ++f){
            for(k = 0; k < spatial; ++k){
                int index = j*filters*spatial + f*spatial + k;
                delta[index] = delta[index] * 1./(sqrt(variance[f]) + .00001f)
                    + variance_delta[f] * 2. * (x[index] - mean[f]) / (spatial * batch)
                    + mean_delta[f]/(spatial*batch);
            }
        }
    }
}
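
// Resizes the layer to a new spatial resolution: recomputes the input/output sizes
// and reallocates the output/delta buffers (and their GPU mirrors) to match.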
void resize_batchnorm_layer(layer *l, int w, int h)
{
    l->out_h = l->h = h;
    l->out_w = l->w = w;
    l->outputs = l->inputs = h*w*l->c;

    const int output_size = l->outputs * l->batch;

    l->output = (float*)realloc(l->output, output_size * sizeof(float));
    l->delta = (float*)realloc(l->delta, output_size * sizeof(float));

#ifdef GPU
    cuda_free(l->output_gpu);
    l->output_gpu = cuda_make_array(l->output, output_size);

    if (l->train) {
        cuda_free(l->delta_gpu);
        l->delta_gpu = cuda_make_array(l->delta, output_size);

        cuda_free(l->x_gpu);
        l->x_gpu = cuda_make_array(l->output, output_size);
#ifndef CUDNN
        cuda_free(l->x_norm_gpu);
        l->x_norm_gpu = cuda_make_array(l->output, output_size);
#endif // not CUDNN
    }

#ifdef CUDNN
    CHECK_CUDNN(cudnnDestroyTensorDescriptor(l->normDstTensorDesc));
    CHECK_CUDNN(cudnnCreateTensorDescriptor(&l->normDstTensorDesc));
    CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->normDstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w));
#endif // CUDNN
#endif // GPU
}
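
// CPU forward pass. During training the layer normalizes with the statistics of the
// current mini-batch and updates the rolling averages as
//   rolling = 0.9 * rolling + 0.1 * batch_stat
// then applies the learned scale (gamma) and bias (beta):
//   y = gamma * (x - mean) / sqrt(variance) + beta   (with a small epsilon for stability)
// Copies of the pre- and post-normalization activations are kept in l.x and l.x_norm
// for the backward pass. At inference time the rolling mean/variance are used instead.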
void forward_batchnorm_layer(layer l, network_state state)
{
    if(l.type == BATCHNORM) copy_cpu(l.outputs*l.batch, state.input, 1, l.output, 1);
    if(l.type == CONNECTED){
        l.out_c = l.outputs;
        l.out_h = l.out_w = 1;
    }
    if(state.train){
        mean_cpu(l.output, l.batch, l.out_c, l.out_h*l.out_w, l.mean);
        variance_cpu(l.output, l.mean, l.batch, l.out_c, l.out_h*l.out_w, l.variance);

        scal_cpu(l.out_c, .9, l.rolling_mean, 1);
        axpy_cpu(l.out_c, .1, l.mean, 1, l.rolling_mean, 1);
        scal_cpu(l.out_c, .9, l.rolling_variance, 1);
        axpy_cpu(l.out_c, .1, l.variance, 1, l.rolling_variance, 1);

        copy_cpu(l.outputs*l.batch, l.output, 1, l.x, 1);
        normalize_cpu(l.output, l.mean, l.variance, l.batch, l.out_c, l.out_h*l.out_w);
        copy_cpu(l.outputs*l.batch, l.output, 1, l.x_norm, 1);
    } else {
        normalize_cpu(l.output, l.rolling_mean, l.rolling_variance, l.batch, l.out_c, l.out_h*l.out_w);
    }
    scale_bias(l.output, l.scales, l.batch, l.out_c, l.out_h*l.out_w);
    add_bias(l.output, l.biases, l.batch, l.out_c, l.out_w*l.out_h);
}
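
// CPU backward pass: accumulates the gradient for the per-channel scales, then
// back-propagates delta through the scale and the normalization using the
// mean/variance gradient helpers defined above.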
void backward_batchnorm_layer(const layer l, network_state state)
{
    backward_scale_cpu(l.x_norm, l.delta, l.batch, l.out_c, l.out_w*l.out_h, l.scale_updates);

    scale_bias(l.delta, l.scales, l.batch, l.out_c, l.out_h*l.out_w);

    mean_delta_cpu(l.delta, l.variance, l.batch, l.out_c, l.out_w*l.out_h, l.mean_delta);
    variance_delta_cpu(l.x, l.delta, l.mean, l.variance, l.batch, l.out_c, l.out_w*l.out_h, l.variance_delta);
    normalize_delta_cpu(l.x, l.mean, l.variance, l.mean_delta, l.variance_delta, l.batch, l.out_c, l.out_w*l.out_h, l.delta);
    if(l.type == BATCHNORM) copy_cpu(l.outputs*l.batch, l.delta, 1, state.delta, 1);
}
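
// SGD update with momentum for the learnable parameters (biases and scales):
//   param += (learning_rate / batch) * param_update;  param_update *= momentum;
// The decay argument is unused for this layer.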
void update_batchnorm_layer(layer l, int batch, float learning_rate, float momentum, float decay)
{
    //int size = l.nweights;
    axpy_cpu(l.c, learning_rate / batch, l.bias_updates, 1, l.biases, 1);
    scal_cpu(l.c, momentum, l.bias_updates, 1);

    axpy_cpu(l.c, learning_rate / batch, l.scale_updates, 1, l.scales, 1);
    scal_cpu(l.c, momentum, l.scale_updates, 1);
}

#ifdef GPU
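
// Copy the learnable parameters and rolling statistics between host and device
// (pull: GPU -> CPU, push: CPU -> GPU), e.g. around saving or loading weights.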
void pull_batchnorm_layer(layer l)
{
    cuda_pull_array(l.biases_gpu, l.biases, l.out_c);
    cuda_pull_array(l.scales_gpu, l.scales, l.out_c);
    cuda_pull_array(l.rolling_mean_gpu, l.rolling_mean, l.out_c);
    cuda_pull_array(l.rolling_variance_gpu, l.rolling_variance, l.out_c);
}

void push_batchnorm_layer(layer l)
{
    cuda_push_array(l.biases_gpu, l.biases, l.out_c);
    cuda_push_array(l.scales_gpu, l.scales, l.out_c);
    cuda_push_array(l.rolling_mean_gpu, l.rolling_mean, l.out_c);
    cuda_push_array(l.rolling_variance_gpu, l.rolling_variance, l.out_c);
}
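
// GPU forward pass. Takes one of three routes:
//   - adversarial mode: normalize with the rolling statistics even while training;
//   - l.batch_normalize == 2: "cbn" statistics accumulated across sub-batches
//     (apparently Cross-Iteration Batch Normalization), with a fused normalize+scale+bias;
//   - otherwise: cuDNN's BatchNormalizationForwardTraining when available, or the
//     hand-written kernels with a 0.99/0.01 rolling-average update.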
void forward_batchnorm_layer_gpu(layer l, network_state state)
{
    if (l.type == BATCHNORM) simple_copy_ongpu(l.outputs*l.batch, state.input, l.output_gpu);
    //copy_ongpu(l.outputs*l.batch, state.input, 1, l.output_gpu, 1);

    if (state.net.adversarial) {
        normalize_gpu(l.output_gpu, l.rolling_mean_gpu, l.rolling_variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
        scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
        add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
        return;
    }

    if (state.train) {
        simple_copy_ongpu(l.outputs*l.batch, l.output_gpu, l.x_gpu);

        // cbn
        if (l.batch_normalize == 2) {

            fast_mean_gpu(l.output_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.mean_gpu);

            //fast_v_gpu(l.output_gpu, l.mean_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.v_cbn_gpu);
            const int minibatch_index = state.net.current_subdivision + 1;
            const int max_minibatch_index = state.net.subdivisions;
            //printf("\n minibatch_index = %d, max_minibatch_index = %d \n", minibatch_index, max_minibatch_index);
            const float alpha = 0.01;

            int inverse_variance = 0;
#ifdef CUDNN
            inverse_variance = 1;
#endif // CUDNN

            fast_v_cbn_gpu(l.output_gpu, l.mean_gpu, l.batch, l.out_c, l.out_h*l.out_w, minibatch_index, max_minibatch_index, l.m_cbn_avg_gpu, l.v_cbn_avg_gpu, l.variance_gpu,
                alpha, l.rolling_mean_gpu, l.rolling_variance_gpu, inverse_variance, .00001);

            normalize_scale_bias_gpu(l.output_gpu, l.mean_gpu, l.variance_gpu, l.scales_gpu, l.biases_gpu, l.batch, l.out_c, l.out_h*l.out_w, inverse_variance, .00001f);

#ifndef CUDNN
            simple_copy_ongpu(l.outputs*l.batch, l.output_gpu, l.x_norm_gpu);
#endif // CUDNN

            //printf("\n CBN, minibatch_index = %d \n", minibatch_index);
        }
        else {
#ifdef CUDNN
            float one = 1;
            float zero = 0;
            cudnnBatchNormalizationForwardTraining(cudnn_handle(),
                CUDNN_BATCHNORM_SPATIAL,
                &one,
                &zero,
                l.normDstTensorDesc,
                l.x_gpu,                // input
                l.normDstTensorDesc,
                l.output_gpu,           // output
                l.normTensorDesc,
                l.scales_gpu,
                l.biases_gpu,
                .01,
                l.rolling_mean_gpu,     // output (should be FP32)
                l.rolling_variance_gpu, // output (should be FP32)
                .00001,
                l.mean_gpu,             // output (should be FP32)
                l.variance_gpu);        // output (should be FP32)

            if (state.net.try_fix_nan) {
                fix_nan_and_inf(l.scales_gpu, l.n);
                fix_nan_and_inf(l.biases_gpu, l.n);
                fix_nan_and_inf(l.mean_gpu, l.n);
                fix_nan_and_inf(l.variance_gpu, l.n);
                fix_nan_and_inf(l.rolling_mean_gpu, l.n);
                fix_nan_and_inf(l.rolling_variance_gpu, l.n);
            }

            //simple_copy_ongpu(l.outputs*l.batch, l.output_gpu, l.x_norm_gpu);
#else // CUDNN
            fast_mean_gpu(l.output_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.mean_gpu);
            fast_variance_gpu(l.output_gpu, l.mean_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.variance_gpu);

            scal_ongpu(l.out_c, .99, l.rolling_mean_gpu, 1);
            axpy_ongpu(l.out_c, .01, l.mean_gpu, 1, l.rolling_mean_gpu, 1);
            scal_ongpu(l.out_c, .99, l.rolling_variance_gpu, 1);
            axpy_ongpu(l.out_c, .01, l.variance_gpu, 1, l.rolling_variance_gpu, 1);

            copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
            normalize_gpu(l.output_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
            copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_norm_gpu, 1);

            scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
            add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
#endif // CUDNN
        }
    }
    else {
        normalize_gpu(l.output_gpu, l.rolling_mean_gpu, l.rolling_variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
        scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
        add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
    }
}
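
// GPU backward pass. In adversarial mode delta is only rescaled by the rolling-variance
// term and the scales, then the function returns. When not training, the rolling
// statistics are copied into mean_gpu/variance_gpu first (converted with
// inverse_variance_ongpu under CUDNN, to match what cuDNN saved in the forward pass)
// before running the cuDNN or hand-written backward computation.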
void backward_batchnorm_layer_gpu(layer l, network_state state)
{
    if (state.net.adversarial) {
        inverse_variance_ongpu(l.out_c, l.rolling_variance_gpu, l.variance_gpu, 0.00001);

        scale_bias_gpu(l.delta_gpu, l.variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
        scale_bias_gpu(l.delta_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
        return;
    }

    if (!state.train) {
        //l.mean_gpu = l.rolling_mean_gpu;
        //l.variance_gpu = l.rolling_variance_gpu;
        simple_copy_ongpu(l.out_c, l.rolling_mean_gpu, l.mean_gpu);
#ifdef CUDNN
        inverse_variance_ongpu(l.out_c, l.rolling_variance_gpu, l.variance_gpu, 0.00001);
#else
        simple_copy_ongpu(l.out_c, l.rolling_variance_gpu, l.variance_gpu);
#endif
    }

#ifdef CUDNN
    float one = 1;
    float zero = 0;
    cudnnBatchNormalizationBackward(cudnn_handle(),
        CUDNN_BATCHNORM_SPATIAL,
        &one,
        &zero,
        &one,
        &one,
        l.normDstTensorDesc,
        l.x_gpu,                // input
        l.normDstTensorDesc,
        l.delta_gpu,            // input
        l.normDstTensorDesc,
        l.output_gpu,           //l.x_norm_gpu, // output
        l.normTensorDesc,
        l.scales_gpu,           // input (should be FP32)
        l.scale_updates_gpu,    // output (should be FP32)
        l.bias_updates_gpu,     // output (should be FP32)
        .00001,
        l.mean_gpu,             // input (should be FP32)
        l.variance_gpu);        // input (should be FP32)
    simple_copy_ongpu(l.outputs*l.batch, l.output_gpu, l.delta_gpu);
    //simple_copy_ongpu(l.outputs*l.batch, l.x_norm_gpu, l.delta_gpu);
#else // CUDNN
    backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.out_c, l.out_w*l.out_h);
    backward_scale_gpu(l.x_norm_gpu, l.delta_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.scale_updates_gpu);

    scale_bias_gpu(l.delta_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);

    fast_mean_delta_gpu(l.delta_gpu, l.variance_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.mean_delta_gpu);
    fast_variance_delta_gpu(l.x_gpu, l.delta_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.variance_delta_gpu);
    normalize_delta_gpu(l.x_gpu, l.mean_gpu, l.variance_gpu, l.mean_delta_gpu, l.variance_delta_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.delta_gpu);
#endif // CUDNN
    if (l.type == BATCHNORM) simple_copy_ongpu(l.outputs*l.batch, l.delta_gpu, state.delta);
    //copy_ongpu(l.outputs*l.batch, l.delta_gpu, 1, state.delta, 1);

    if (state.net.try_fix_nan) {
        fix_nan_and_inf(l.scale_updates_gpu, l.n);
        fix_nan_and_inf(l.bias_updates_gpu, l.n);
    }
}
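
// GPU parameter update: same momentum-SGD rule as the CPU version, with the effective
// learning rate scaled by l.learning_rate_scale and divided by the mixed-precision
// loss_scale before being applied.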
void update_batchnorm_layer_gpu(layer l, int batch, float learning_rate_init, float momentum, float decay, float loss_scale)
{
    float learning_rate = learning_rate_init * l.learning_rate_scale / loss_scale;
    //float momentum = a.momentum;
    //float decay = a.decay;
    //int batch = a.batch;

    axpy_ongpu(l.c, learning_rate / batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
    scal_ongpu(l.c, momentum, l.bias_updates_gpu, 1);

    axpy_ongpu(l.c, learning_rate / batch, l.scale_updates_gpu, 1, l.scales_gpu, 1);
    scal_ongpu(l.c, momentum, l.scale_updates_gpu, 1);
}

#endif // GPU