#include "batchnorm_layer.h"
#include "blas.h"
#include "utils.h"
#include <stdio.h>
#include <math.h>   // sqrt(), pow() used by the CPU backward helpers

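// Builds a standalone batch-normalization layer of shape w x h x c.
// All buffers are zero-initialized by xcalloc; per-channel scales start at 1
// and biases at 0, so the affine step is initially a no-op.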
layer make_batchnorm_layer(int batch, int w, int h, int c, int train)
{
    fprintf(stderr, "Batch Normalization Layer: %d x %d x %d image\n", w,h,c);
    layer layer = { (LAYER_TYPE)0 };
    layer.type = BATCHNORM;
    layer.batch = batch;
    layer.train = train;
    layer.h = layer.out_h = h;
    layer.w = layer.out_w = w;
    layer.c = layer.out_c = c;

    layer.n = layer.c;
    layer.output = (float*)xcalloc(h * w * c * batch, sizeof(float));
    layer.delta = (float*)xcalloc(h * w * c * batch, sizeof(float));
    layer.inputs = w*h*c;
    layer.outputs = layer.inputs;

    layer.biases = (float*)xcalloc(c, sizeof(float));
    layer.bias_updates = (float*)xcalloc(c, sizeof(float));

    layer.scales = (float*)xcalloc(c, sizeof(float));
    layer.scale_updates = (float*)xcalloc(c, sizeof(float));
    int i;
    for(i = 0; i < c; ++i){
        layer.scales[i] = 1;
    }

    layer.mean = (float*)xcalloc(c, sizeof(float));
    layer.variance = (float*)xcalloc(c, sizeof(float));

    layer.rolling_mean = (float*)xcalloc(c, sizeof(float));
    layer.rolling_variance = (float*)xcalloc(c, sizeof(float));

    layer.forward = forward_batchnorm_layer;
    layer.backward = backward_batchnorm_layer;
    layer.update = update_batchnorm_layer;
#ifdef GPU
    layer.forward_gpu = forward_batchnorm_layer_gpu;
    layer.backward_gpu = backward_batchnorm_layer_gpu;
    layer.update_gpu = update_batchnorm_layer_gpu;

    layer.output_gpu = cuda_make_array(layer.output, h * w * c * batch);

    layer.biases_gpu = cuda_make_array(layer.biases, c);
    layer.scales_gpu = cuda_make_array(layer.scales, c);

    if (train) {
        layer.delta_gpu = cuda_make_array(layer.delta, h * w * c * batch);

        layer.bias_updates_gpu = cuda_make_array(layer.bias_updates, c);
        layer.scale_updates_gpu = cuda_make_array(layer.scale_updates, c);

        layer.mean_delta_gpu = cuda_make_array(layer.mean, c);
        layer.variance_delta_gpu = cuda_make_array(layer.variance, c);
    }

    layer.mean_gpu = cuda_make_array(layer.mean, c);
    layer.variance_gpu = cuda_make_array(layer.variance, c);

    layer.rolling_mean_gpu = cuda_make_array(layer.mean, c);
    layer.rolling_variance_gpu = cuda_make_array(layer.variance, c);

    if (train) {
        layer.x_gpu = cuda_make_array(layer.output, layer.batch*layer.outputs);
#ifndef CUDNN
        layer.x_norm_gpu = cuda_make_array(layer.output, layer.batch*layer.outputs);
#endif  // not CUDNN
    }

#ifdef CUDNN
    CHECK_CUDNN(cudnnCreateTensorDescriptor(&layer.normTensorDesc));
    CHECK_CUDNN(cudnnCreateTensorDescriptor(&layer.normDstTensorDesc));
    CHECK_CUDNN(cudnnSetTensor4dDescriptor(layer.normDstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, layer.batch, layer.out_c, layer.out_h, layer.out_w));
    CHECK_CUDNN(cudnnSetTensor4dDescriptor(layer.normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, layer.out_c, 1, 1));
#endif
#endif
    return layer;
}

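// d(loss)/d(scale): for each filter f, accumulate delta * x_norm over the
// whole batch and spatial extent into scale_updates[f].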
void backward_scale_cpu(float *x_norm, float *delta, int batch, int n, int size, float *scale_updates)
{
    int i,b,f;
    for(f = 0; f < n; ++f){
        float sum = 0;
        for(b = 0; b < batch; ++b){
            for(i = 0; i < size; ++i){
                int index = i + size*(f + n*b);
                sum += delta[index] * x_norm[index];
            }
        }
        scale_updates[f] += sum;
    }
}

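// d(loss)/d(mean): sum delta over the batch and spatial positions for each
// filter, scaled by -1/sqrt(variance + eps) with eps = 1e-5.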
void mean_delta_cpu(float *delta, float *variance, int batch, int filters, int spatial, float *mean_delta)
{
    int i,j,k;
    for(i = 0; i < filters; ++i){
        mean_delta[i] = 0;
        for (j = 0; j < batch; ++j) {
            for (k = 0; k < spatial; ++k) {
                int index = j*filters*spatial + i*spatial + k;
                mean_delta[i] += delta[index];
            }
        }
        mean_delta[i] *= (-1./sqrt(variance[i] + .00001f));
    }
}
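
// d(loss)/d(variance): sum delta * (x - mean) per filter, scaled by
// -0.5 * (variance + eps)^(-3/2).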
void variance_delta_cpu(float *x, float *delta, float *mean, float *variance, int batch, int filters, int spatial, float *variance_delta)
{
    int i,j,k;
    for(i = 0; i < filters; ++i){
        variance_delta[i] = 0;
        for(j = 0; j < batch; ++j){
            for(k = 0; k < spatial; ++k){
                int index = j*filters*spatial + i*spatial + k;
                variance_delta[i] += delta[index]*(x[index] - mean[i]);
            }
        }
        variance_delta[i] *= -.5 * pow(variance[i] + .00001f, (float)(-3./2.));
    }
}
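
// Back-propagates through the normalization itself, combining the direct
// gradient with the mean and variance gradients of x_hat = (x - mean)/sqrt(variance).
// Note: here the epsilon is added outside the sqrt, unlike mean_delta_cpu above.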
void normalize_delta_cpu(float *x, float *mean, float *variance, float *mean_delta, float *variance_delta, int batch, int filters, int spatial, float *delta)
{
    int f, j, k;
    for(j = 0; j < batch; ++j){
        for(f = 0; f < filters; ++f){
            for(k = 0; k < spatial; ++k){
                int index = j*filters*spatial + f*spatial + k;
                delta[index] = delta[index] * 1./(sqrt(variance[f]) + .00001f) + variance_delta[f] * 2. * (x[index] - mean[f]) / (spatial * batch) + mean_delta[f]/(spatial*batch);
            }
        }
    }
}

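// Re-allocates the output and delta buffers for a new input resolution
// (w x h); the per-channel statistics keep their size since c is unchanged.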
void resize_batchnorm_layer(layer *l, int w, int h)
{
    l->out_h = l->h = h;
    l->out_w = l->w = w;
    l->outputs = l->inputs = h*w*l->c;

    const int output_size = l->outputs * l->batch;

    l->output = (float*)realloc(l->output, output_size * sizeof(float));
    l->delta = (float*)realloc(l->delta, output_size * sizeof(float));

#ifdef GPU
    cuda_free(l->output_gpu);
    l->output_gpu = cuda_make_array(l->output, output_size);

    if (l->train) {
        cuda_free(l->delta_gpu);
        l->delta_gpu = cuda_make_array(l->delta, output_size);

        cuda_free(l->x_gpu);
        l->x_gpu = cuda_make_array(l->output, output_size);
#ifndef CUDNN
        cuda_free(l->x_norm_gpu);
        l->x_norm_gpu = cuda_make_array(l->output, output_size);
#endif  // not CUDNN
    }

#ifdef CUDNN
    CHECK_CUDNN(cudnnDestroyTensorDescriptor(l->normDstTensorDesc));
    CHECK_CUDNN(cudnnCreateTensorDescriptor(&l->normDstTensorDesc));
    CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->normDstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w));
#endif // CUDNN
#endif // GPU
}

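// CPU forward pass. During training, computes per-channel mean/variance of
// the mini-batch, updates the rolling statistics with momentum 0.9/0.1,
// normalizes the output, and caches the pre- and post-normalization
// activations (l.x, l.x_norm) for the backward pass. At inference time the
// rolling statistics are used instead. Finally applies scale and bias.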
void forward_batchnorm_layer(layer l, network_state state)
{
    if(l.type == BATCHNORM) copy_cpu(l.outputs*l.batch, state.input, 1, l.output, 1);
    if(l.type == CONNECTED){
        l.out_c = l.outputs;
        l.out_h = l.out_w = 1;
    }
    if(state.train){
        mean_cpu(l.output, l.batch, l.out_c, l.out_h*l.out_w, l.mean);
        variance_cpu(l.output, l.mean, l.batch, l.out_c, l.out_h*l.out_w, l.variance);

        scal_cpu(l.out_c, .9, l.rolling_mean, 1);
        axpy_cpu(l.out_c, .1, l.mean, 1, l.rolling_mean, 1);
        scal_cpu(l.out_c, .9, l.rolling_variance, 1);
        axpy_cpu(l.out_c, .1, l.variance, 1, l.rolling_variance, 1);

        copy_cpu(l.outputs*l.batch, l.output, 1, l.x, 1);
        normalize_cpu(l.output, l.mean, l.variance, l.batch, l.out_c, l.out_h*l.out_w);
        copy_cpu(l.outputs*l.batch, l.output, 1, l.x_norm, 1);
    } else {
        normalize_cpu(l.output, l.rolling_mean, l.rolling_variance, l.batch, l.out_c, l.out_h*l.out_w);
    }
    scale_bias(l.output, l.scales, l.batch, l.out_c, l.out_h*l.out_w);
    add_bias(l.output, l.biases, l.batch, l.out_c, l.out_w*l.out_h);
}

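// CPU backward pass: accumulates the scale gradients, rescales delta by the
// learned scales, then back-propagates through the normalization using the
// cached l.x and l.x_norm. A standalone BATCHNORM layer copies the result
// into state.delta.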
void backward_batchnorm_layer(const layer l, network_state state)
{
    backward_scale_cpu(l.x_norm, l.delta, l.batch, l.out_c, l.out_w*l.out_h, l.scale_updates);

    scale_bias(l.delta, l.scales, l.batch, l.out_c, l.out_h*l.out_w);

    mean_delta_cpu(l.delta, l.variance, l.batch, l.out_c, l.out_w*l.out_h, l.mean_delta);
    variance_delta_cpu(l.x, l.delta, l.mean, l.variance, l.batch, l.out_c, l.out_w*l.out_h, l.variance_delta);
    normalize_delta_cpu(l.x, l.mean, l.variance, l.mean_delta, l.variance_delta, l.batch, l.out_c, l.out_w*l.out_h, l.delta);
    if(l.type == BATCHNORM) copy_cpu(l.outputs*l.batch, l.delta, 1, state.delta, 1);
}

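// SGD update for biases and scales: applies the accumulated gradients
// (averaged over the batch) and then scales the accumulators by momentum
// for the next iteration.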
void update_batchnorm_layer(layer l, int batch, float learning_rate, float momentum, float decay)
{
    //int size = l.nweights;
    axpy_cpu(l.c, learning_rate / batch, l.bias_updates, 1, l.biases, 1);
    scal_cpu(l.c, momentum, l.bias_updates, 1);

    axpy_cpu(l.c, learning_rate / batch, l.scale_updates, 1, l.scales, 1);
    scal_cpu(l.c, momentum, l.scale_updates, 1);
}


#ifdef GPU

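// Copy the batchnorm parameters and rolling statistics between device and
// host memory (pull: GPU -> CPU, push: CPU -> GPU).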
void pull_batchnorm_layer(layer l)
{
    cuda_pull_array(l.biases_gpu, l.biases, l.out_c);
    cuda_pull_array(l.scales_gpu, l.scales, l.out_c);
    cuda_pull_array(l.rolling_mean_gpu, l.rolling_mean, l.out_c);
    cuda_pull_array(l.rolling_variance_gpu, l.rolling_variance, l.out_c);
}

void push_batchnorm_layer(layer l)
{
    cuda_push_array(l.biases_gpu, l.biases, l.out_c);
    cuda_push_array(l.scales_gpu, l.scales, l.out_c);
    cuda_push_array(l.rolling_mean_gpu, l.rolling_mean, l.out_c);
    cuda_push_array(l.rolling_variance_gpu, l.rolling_variance, l.out_c);
}

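// GPU forward pass. When state.net.adversarial is set, the output is
// normalized with the rolling statistics and the function returns early.
// In training, batch_normalize == 2 selects the CBN (cross-iteration)
// path, which averages statistics across subdivisions; otherwise the layer
// uses cuDNN's fused forward when available, or the hand-written CUDA
// kernels as a fallback. At inference time the rolling statistics are used.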
void forward_batchnorm_layer_gpu(layer l, network_state state)
{
    if (l.type == BATCHNORM) simple_copy_ongpu(l.outputs*l.batch, state.input, l.output_gpu);
        //copy_ongpu(l.outputs*l.batch, state.input, 1, l.output_gpu, 1);

    if (state.net.adversarial) {
        normalize_gpu(l.output_gpu, l.rolling_mean_gpu, l.rolling_variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
        scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
        add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
        return;
    }

    if (state.train) {
        simple_copy_ongpu(l.outputs*l.batch, l.output_gpu, l.x_gpu);

        // cbn
        if (l.batch_normalize == 2) {

            fast_mean_gpu(l.output_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.mean_gpu);

            //fast_v_gpu(l.output_gpu, l.mean_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.v_cbn_gpu);
            const int minibatch_index = state.net.current_subdivision + 1;
            const int max_minibatch_index = state.net.subdivisions;
            //printf("\n minibatch_index = %d, max_minibatch_index = %d \n", minibatch_index, max_minibatch_index);
            const float alpha = 0.01;

            int inverse_variance = 0;
#ifdef CUDNN
            inverse_variance = 1;
#endif  // CUDNN

            fast_v_cbn_gpu(l.output_gpu, l.mean_gpu, l.batch, l.out_c, l.out_h*l.out_w, minibatch_index, max_minibatch_index, l.m_cbn_avg_gpu, l.v_cbn_avg_gpu, l.variance_gpu,
                alpha, l.rolling_mean_gpu, l.rolling_variance_gpu, inverse_variance, .00001);

            normalize_scale_bias_gpu(l.output_gpu, l.mean_gpu, l.variance_gpu, l.scales_gpu, l.biases_gpu, l.batch, l.out_c, l.out_h*l.out_w, inverse_variance, .00001f);

#ifndef CUDNN
            simple_copy_ongpu(l.outputs*l.batch, l.output_gpu, l.x_norm_gpu);
#endif  // not CUDNN

            //printf("\n CBN, minibatch_index = %d \n", minibatch_index);
        }
        else {
#ifdef CUDNN
            float one = 1;
            float zero = 0;
            cudnnBatchNormalizationForwardTraining(cudnn_handle(),
                CUDNN_BATCHNORM_SPATIAL,
                &one,
                &zero,
                l.normDstTensorDesc,
                l.x_gpu,                // input
                l.normDstTensorDesc,
                l.output_gpu,           // output
                l.normTensorDesc,
                l.scales_gpu,
                l.biases_gpu,
                .01,
                l.rolling_mean_gpu,     // output (should be FP32)
                l.rolling_variance_gpu, // output (should be FP32)
                .00001,
                l.mean_gpu,             // output (should be FP32)
                l.variance_gpu);        // output (should be FP32)

            if (state.net.try_fix_nan) {
                fix_nan_and_inf(l.scales_gpu, l.n);
                fix_nan_and_inf(l.biases_gpu, l.n);
                fix_nan_and_inf(l.mean_gpu, l.n);
                fix_nan_and_inf(l.variance_gpu, l.n);
                fix_nan_and_inf(l.rolling_mean_gpu, l.n);
                fix_nan_and_inf(l.rolling_variance_gpu, l.n);
            }

            //simple_copy_ongpu(l.outputs*l.batch, l.output_gpu, l.x_norm_gpu);
#else   // CUDNN
            fast_mean_gpu(l.output_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.mean_gpu);
            fast_variance_gpu(l.output_gpu, l.mean_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.variance_gpu);

            scal_ongpu(l.out_c, .99, l.rolling_mean_gpu, 1);
            axpy_ongpu(l.out_c, .01, l.mean_gpu, 1, l.rolling_mean_gpu, 1);
            scal_ongpu(l.out_c, .99, l.rolling_variance_gpu, 1);
            axpy_ongpu(l.out_c, .01, l.variance_gpu, 1, l.rolling_variance_gpu, 1);

            copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
            normalize_gpu(l.output_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
            copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_norm_gpu, 1);

            scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
            add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
#endif  // CUDNN
        }
    }
    else {
        normalize_gpu(l.output_gpu, l.rolling_mean_gpu, l.rolling_variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
        scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
        add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
    }
}

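// GPU backward pass. For adversarial inputs the delta is only rescaled by
// the inverse rolling variance (via inverse_variance_ongpu) and by the
// scales, then the function returns. Otherwise the gradients are computed
// with cuDNN when available (falling back to the rolling statistics if the
// layer was run in inference mode), or with the CUDA kernels that mirror the
// CPU implementation.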
void backward_batchnorm_layer_gpu(layer l, network_state state)
{
    if (state.net.adversarial) {
        inverse_variance_ongpu(l.out_c, l.rolling_variance_gpu, l.variance_gpu, 0.00001);

        scale_bias_gpu(l.delta_gpu, l.variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
        scale_bias_gpu(l.delta_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
        return;
    }

    if (!state.train) {
        //l.mean_gpu = l.rolling_mean_gpu;
        //l.variance_gpu = l.rolling_variance_gpu;
        simple_copy_ongpu(l.out_c, l.rolling_mean_gpu, l.mean_gpu);
#ifdef CUDNN
        inverse_variance_ongpu(l.out_c, l.rolling_variance_gpu, l.variance_gpu, 0.00001);
#else
        simple_copy_ongpu(l.out_c, l.rolling_variance_gpu, l.variance_gpu);
#endif
    }

#ifdef CUDNN
    float one = 1;
    float zero = 0;
    cudnnBatchNormalizationBackward(cudnn_handle(),
        CUDNN_BATCHNORM_SPATIAL,
        &one,
        &zero,
        &one,
        &one,
        l.normDstTensorDesc,
        l.x_gpu,                // input
        l.normDstTensorDesc,
        l.delta_gpu,            // input
        l.normDstTensorDesc,
        l.output_gpu, //l.x_norm_gpu,            // output
        l.normTensorDesc,
        l.scales_gpu,           // input (should be FP32)
        l.scale_updates_gpu,    // output (should be FP32)
        l.bias_updates_gpu,     // output (should be FP32)
        .00001,
        l.mean_gpu,             // input (should be FP32)
        l.variance_gpu);        // input (should be FP32)
    simple_copy_ongpu(l.outputs*l.batch, l.output_gpu, l.delta_gpu);
    //simple_copy_ongpu(l.outputs*l.batch, l.x_norm_gpu, l.delta_gpu);
#else   // CUDNN
    backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.out_c, l.out_w*l.out_h);
    backward_scale_gpu(l.x_norm_gpu, l.delta_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.scale_updates_gpu);

    scale_bias_gpu(l.delta_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);

    fast_mean_delta_gpu(l.delta_gpu, l.variance_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.mean_delta_gpu);
    fast_variance_delta_gpu(l.x_gpu, l.delta_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.variance_delta_gpu);
    normalize_delta_gpu(l.x_gpu, l.mean_gpu, l.variance_gpu, l.mean_delta_gpu, l.variance_delta_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.delta_gpu);
#endif  // CUDNN
    if (l.type == BATCHNORM) simple_copy_ongpu(l.outputs*l.batch, l.delta_gpu, state.delta);
        //copy_ongpu(l.outputs*l.batch, l.delta_gpu, 1, state.delta, 1);

    if (state.net.try_fix_nan) {
        fix_nan_and_inf(l.scale_updates_gpu, l.n);
        fix_nan_and_inf(l.bias_updates_gpu, l.n);
    }
}

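// GPU parameter update for biases and scales, mirroring the CPU version.
// The learning rate is rescaled by the layer's learning_rate_scale and
// divided by loss_scale (used with loss scaling in mixed-precision training).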
void update_batchnorm_layer_gpu(layer l, int batch, float learning_rate_init, float momentum, float decay, float loss_scale)
{
    float learning_rate = learning_rate_init * l.learning_rate_scale / loss_scale;
    //float momentum = a.momentum;
    //float decay = a.decay;
    //int batch = a.batch;

    axpy_ongpu(l.c, learning_rate / batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
    scal_ongpu(l.c, momentum, l.bias_updates_gpu, 1);

    axpy_ongpu(l.c, learning_rate / batch, l.scale_updates_gpu, 1, l.scales_gpu, 1);
    scal_ongpu(l.c, momentum, l.scale_updates_gpu, 1);
}

#endif  // GPU