#include "convolutional_layer.h"
#include "utils.h"
#include "batchnorm_layer.h"
#include "im2col.h"
#include "col2im.h"
#include "blas.h"
#include "gemm.h"
#include "box.h"
#include <stdio.h>
#include <time.h>

#ifdef AI2
#include "xnor_layer.h"
#endif

#ifdef __cplusplus
#define PUT_IN_REGISTER
#else
#define PUT_IN_REGISTER register
#endif
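/* Note: the 'register' storage-class specifier is deprecated in C++ and was
   removed in C++17, so PUT_IN_REGISTER above expands to nothing when this
   file is compiled as C++. */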

#ifndef AI2
#define AI2 0
void forward_xnor_layer(layer l, network_state state);
#endif

void swap_binary(convolutional_layer *l)
{
    float *swap = l->weights;
    l->weights = l->binary_weights;
    l->binary_weights = swap;

#ifdef GPU
    swap = l->weights_gpu;
    l->weights_gpu = l->binary_weights_gpu;
    l->binary_weights_gpu = swap;
#endif
}

void binarize_weights(float *weights, int n, int size, float *binary)
{
    int i, f;
    for(f = 0; f < n; ++f){
        float mean = 0;
        for(i = 0; i < size; ++i){
            mean += fabs(weights[f*size + i]);
        }
        mean = mean / size;
        for(i = 0; i < size; ++i){
            binary[f*size + i] = (weights[f*size + i] > 0) ? mean : -mean;
        }
    }
}
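/* Example (illustrative): binarize_weights() keeps one scaling factor per
   filter, as in XNOR-Net. For a filter with weights {0.5, -1.0, 0.25, -0.25}
   the mean of absolute values is (0.5 + 1.0 + 0.25 + 0.25)/4 = 0.5, so the
   binarized filter becomes {+0.5, -0.5, +0.5, -0.5}. */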

void binarize_cpu(float *input, int n, float *binary)
{
    int i;
    for(i = 0; i < n; ++i){
        binary[i] = (input[i] > 0) ? 1 : -1;
    }
}

void binarize_input(float *input, int n, int size, float *binary)
{
    int i, s;
    for(s = 0; s < size; ++s){
        float mean = 0;
        for(i = 0; i < n; ++i){
            mean += fabs(input[i*size + s]);
        }
        mean = mean / n;
        for(i = 0; i < n; ++i){
            binary[i*size + s] = (input[i*size + s] > 0) ? mean : -mean;
        }
    }
}

int convolutional_out_height(convolutional_layer l)
{
    return (l.h + 2*l.pad - l.size) / l.stride_y + 1;
}

int convolutional_out_width(convolutional_layer l)
{
    return (l.w + 2*l.pad - l.size) / l.stride_x + 1;
}
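/* Example (illustrative): for a 5x5 input with pad = 1, size = 3 and
   stride = 2, the output width is (5 + 2*1 - 3)/2 + 1 = 3. */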

image get_convolutional_image(convolutional_layer l)
{
    int h,w,c;
    h = convolutional_out_height(l);
    w = convolutional_out_width(l);
    c = l.n;
    return float_to_image(w,h,c,l.output);
}

image get_convolutional_delta(convolutional_layer l)
{
    int h,w,c;
    h = convolutional_out_height(l);
    w = convolutional_out_width(l);
    c = l.n;
    return float_to_image(w,h,c,l.delta);
}

size_t get_workspace_size32(layer l){
#ifdef CUDNN
    if(gpu_index >= 0){
        size_t most = 0;
        size_t s = 0;
        CHECK_CUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnn_handle(),
            l.srcTensorDesc,
            l.weightDesc,
            l.convDesc,
            l.dstTensorDesc,
            l.fw_algo,
            &s));
        if (s > most) most = s;
        CHECK_CUDNN(cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnn_handle(),
            l.srcTensorDesc,
            l.ddstTensorDesc,
            l.convDesc,
            l.dweightDesc,
            l.bf_algo,
            &s));
        if (s > most && l.train) most = s;
        CHECK_CUDNN(cudnnGetConvolutionBackwardDataWorkspaceSize(cudnn_handle(),
            l.weightDesc,
            l.ddstTensorDesc,
            l.convDesc,
            l.dsrcTensorDesc,
            l.bd_algo,
            &s));
        if (s > most && l.train) most = s;
        return most;
    }
#endif
    if (l.xnor) {
        size_t re_packed_input_size = l.c * l.w * l.h * sizeof(float);
        size_t workspace_size = (size_t)l.bit_align*l.size*l.size*l.c * sizeof(float);
        if (workspace_size < re_packed_input_size) workspace_size = re_packed_input_size;
        return workspace_size;
    }
    return (size_t)l.out_h*l.out_w*l.size*l.size*(l.c / l.groups)*sizeof(float);
}
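/* Example (illustrative): on the plain CPU path the workspace holds the
   im2col buffer, which needs out_h*out_w*size*size*(c/groups) floats. For a
   3x3 convolution over a 16-channel input producing a 104x104 map with
   groups = 1, that is 104*104*3*3*16*4 bytes, i.e. about 6.2 MB. */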

size_t get_workspace_size16(layer l) {
#if defined(CUDNN) && defined(CUDNN_HALF)
    if (gpu_index >= 0) {
        size_t most = 0;
        size_t s = 0;
        CHECK_CUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnn_handle(),
            l.srcTensorDesc16,
            l.weightDesc16,
            l.convDesc,
            l.dstTensorDesc16,
            l.fw_algo16,
            &s));
        if (s > most) most = s;
        CHECK_CUDNN(cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnn_handle(),
            l.srcTensorDesc16,
            l.ddstTensorDesc16,
            l.convDesc,
            l.dweightDesc16,
            l.bf_algo16,
            &s));
        if (s > most && l.train) most = s;
        CHECK_CUDNN(cudnnGetConvolutionBackwardDataWorkspaceSize(cudnn_handle(),
            l.weightDesc16,
            l.ddstTensorDesc16,
            l.convDesc,
            l.dsrcTensorDesc16,
            l.bd_algo16,
            &s));
        if (s > most && l.train) most = s;
        return most;
    }
#endif
    return 0;
    //if (l.xnor) return (size_t)l.bit_align*l.size*l.size*l.c * sizeof(float);
    //return (size_t)l.out_h*l.out_w*l.size*l.size*l.c * sizeof(float);
}

size_t get_convolutional_workspace_size(layer l) {
    size_t workspace_size = get_workspace_size32(l);
    size_t workspace_size16 = get_workspace_size16(l);
    if (workspace_size16 > workspace_size) workspace_size = workspace_size16;
    return workspace_size;
}
#ifdef GPU
#ifdef CUDNN
void create_convolutional_cudnn_tensors(layer *l)
{
    CHECK_CUDNN(cudnnCreateTensorDescriptor(&l->normTensorDesc));

    CHECK_CUDNN(cudnnCreateTensorDescriptor(&l->normDstTensorDesc));
    CHECK_CUDNN(cudnnCreateTensorDescriptor(&l->srcTensorDesc));
    CHECK_CUDNN(cudnnCreateTensorDescriptor(&l->dstTensorDesc));
    CHECK_CUDNN(cudnnCreateFilterDescriptor(&l->weightDesc));
    CHECK_CUDNN(cudnnCreateTensorDescriptor(&l->dsrcTensorDesc));
    CHECK_CUDNN(cudnnCreateTensorDescriptor(&l->ddstTensorDesc));
    CHECK_CUDNN(cudnnCreateFilterDescriptor(&l->dweightDesc));

    CHECK_CUDNN(cudnnCreateTensorDescriptor(&l->normDstTensorDescF16));
    CHECK_CUDNN(cudnnCreateTensorDescriptor(&l->srcTensorDesc16));
    CHECK_CUDNN(cudnnCreateTensorDescriptor(&l->dstTensorDesc16));
    CHECK_CUDNN(cudnnCreateFilterDescriptor(&l->weightDesc16));
    CHECK_CUDNN(cudnnCreateTensorDescriptor(&l->dsrcTensorDesc16));
    CHECK_CUDNN(cudnnCreateTensorDescriptor(&l->ddstTensorDesc16));
    CHECK_CUDNN(cudnnCreateFilterDescriptor(&l->dweightDesc16));

    CHECK_CUDNN(cudnnCreateConvolutionDescriptor(&l->convDesc));
}

void cudnn_convolutional_setup(layer *l, int cudnn_preference, size_t workspace_size_specify)
{

    // CUDNN_HALF
    // TRUE_HALF_CONFIG is only supported on architectures with true fp16 support (compute capability 5.3 and 6.0):
    // Tegra X1, Jetson TX1, DRIVE CX, DRIVE PX, Quadro GP100, Tesla P100
    // PSEUDO_HALF_CONFIG is required for Tensor Cores - our case!

    cudnnDataType_t data_type = CUDNN_DATA_FLOAT;

#if(CUDNN_MAJOR >= 7)
    // Tensor Cores use CUDNN_TENSOR_OP_MATH instead of CUDNN_DEFAULT_MATH.
    // CUDNN_DATA_FLOAT can be used with the *_ALGO_WINOGRAD_NONFUSED algorithms;
    // otherwise the input, filter and output descriptors (xDesc, yDesc, wDesc, dxDesc, dyDesc and dwDesc as applicable) must have dataType = CUDNN_DATA_HALF.
    // Three techniques for training with mixed precision: https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/
    // 1. Accumulation into FP32
    // 2. Loss scaling - required only for activation gradients. We do not use it.
    // 3. FP32 master copy of weights
    // More: http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#tensor_ops
    if (l->groups < 1) l->groups = 1;
    if (l->stride_x < 1) l->stride_x = 1;
    if (l->stride_y < 1) l->stride_y = 1;
    CHECK_CUDNN(cudnnSetConvolutionGroupCount(l->convDesc, l->groups));
    CHECK_CUDNN(cudnnSetConvolutionMathType(l->convDesc, CUDNN_TENSOR_OP_MATH));
#if((CUDNN_MAJOR*10 + CUDNN_MINOR) >= 72)   // cuDNN >= 7.2
    //CHECK_CUDNN(cudnnSetConvolutionMathType(l->convDesc, CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION)); // reduces the speed of regular and group convolution
#endif
#else   //if(CUDNN_MAJOR >= 7)
    if (l->groups > 1) {
        error("CUDNN < 7 doesn't support groups, please upgrade!");
    }
#endif

    // INT8_CONFIG, INT8_EXT_CONFIG, INT8x4_CONFIG and INT8x4_EXT_CONFIG are only supported
    // on architectures with DP4A support (compute capability 6.1 and later).
    //cudnnDataType_t data_type = CUDNN_DATA_INT8;

    // backward delta
    CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->c, l->h, l->w));
    CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->ddstTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->out_c, l->out_h, l->out_w));
    CHECK_CUDNN(cudnnSetFilter4dDescriptor(l->dweightDesc, data_type, CUDNN_TENSOR_NCHW, l->n, l->c / l->groups, l->size, l->size));

    // forward
    CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->srcTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->c, l->h, l->w));
    CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->out_c, l->out_h, l->out_w));
    CHECK_CUDNN(cudnnSetFilter4dDescriptor(l->weightDesc, data_type, CUDNN_TENSOR_NCHW, l->n, l->c / l->groups, l->size, l->size));

    // backward delta (fp16)
    CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->dsrcTensorDesc16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->c, l->h, l->w));
    CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->ddstTensorDesc16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->out_c, l->out_h, l->out_w));
    CHECK_CUDNN(cudnnSetFilter4dDescriptor(l->dweightDesc16, CUDNN_DATA_HALF, CUDNN_TENSOR_NCHW, l->n, l->c / l->groups, l->size, l->size));

    // forward (fp16)
    CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->srcTensorDesc16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->c, l->h, l->w));
    CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->dstTensorDesc16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->out_c, l->out_h, l->out_w));
    CHECK_CUDNN(cudnnSetFilter4dDescriptor(l->weightDesc16, CUDNN_DATA_HALF, CUDNN_TENSOR_NCHW, l->n, l->c / l->groups, l->size, l->size));

    // batch norm
    CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->normDstTensorDescF16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->out_c, l->out_h, l->out_w));

    // batch norm
    CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l->out_c, 1, 1));
    CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->normDstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w));

    //printf("\n l->dilation = %d, l->pad = %d, l->size = %d, l->stride = %d, l->stride_x = %d, l->stride_y = %d, l->groups = %d, l->w = %d, l->h = %d, l->c = %d, l->n = %d, l->out_w = %d, l->out_h = %d, l->out_c = %d, l->batch = %d, data_type = %d \n",
    //    l->dilation, l->pad, l->size, l->stride, l->stride_x, l->stride_y, l->groups, l->w, l->h, l->c, l->n, l->out_w, l->out_h, l->out_c, l->batch, data_type);
#if(CUDNN_MAJOR >= 6)
    CHECK_CUDNN(cudnnSetConvolution2dDescriptor(l->convDesc, l->pad * l->dilation, l->pad * l->dilation, l->stride_y, l->stride_x, l->dilation, l->dilation, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT));   // cuDNN >= 6.0
#else
    CHECK_CUDNN(cudnnSetConvolution2dDescriptor(l->convDesc, l->pad * l->dilation, l->pad * l->dilation, l->stride_y, l->stride_x, l->dilation, l->dilation, CUDNN_CROSS_CORRELATION));   // cuDNN 5.1
#endif
    int forward_algo = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST;
    int backward_algo = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST;
    int backward_filter = CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST;
    if (cudnn_preference == cudnn_smallest)
    {
        forward_algo = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
        backward_algo = CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE;
        backward_filter = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
        printf(" CUDNN-slow ");
    }
    if (cudnn_preference == cudnn_specify)
    {
        forward_algo = CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT;
        backward_algo = CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT;
        backward_filter = CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT;
        //printf(" CUDNN-specified %zu ", workspace_size_specify);
    }

    CHECK_CUDNN(cudnnGetConvolutionForwardAlgorithm(cudnn_handle(),
        l->srcTensorDesc,
        l->weightDesc,
        l->convDesc,
        l->dstTensorDesc,
        (cudnnConvolutionFwdPreference_t)forward_algo,
        workspace_size_specify,
        &l->fw_algo));
    CHECK_CUDNN(cudnnGetConvolutionBackwardDataAlgorithm(cudnn_handle(),
        l->weightDesc,
        l->ddstTensorDesc,
        l->convDesc,
        l->dsrcTensorDesc,
        (cudnnConvolutionBwdDataPreference_t)backward_algo,
        workspace_size_specify,
        &l->bd_algo));
    CHECK_CUDNN(cudnnGetConvolutionBackwardFilterAlgorithm(cudnn_handle(),
        l->srcTensorDesc,
        l->ddstTensorDesc,
        l->convDesc,
        l->dweightDesc,
        (cudnnConvolutionBwdFilterPreference_t)backward_filter,
        workspace_size_specify,
        &l->bf_algo));

    //if (data_type == CUDNN_DATA_HALF)
    {
        // HALF-16 if(data_type == CUDNN_DATA_HALF)
        l->fw_algo16 = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
        l->bd_algo16 = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
        l->bf_algo16 = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;

        // FLOAT-32 if(data_type == CUDNN_DATA_FLOAT)
        //l->fw_algo16 = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED;
        //l->bd_algo16 = CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED;
        //l->bf_algo16 = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED;
    }
}
#endif
#endif
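/* Note (assumption about cuDNN versions): the cudnnGetConvolution*Algorithm()
   calls above were deprecated in cuDNN 7 and removed in cuDNN 8, where the
   cudnnGetConvolution*Algorithm_v7() / cudnnFindConvolution*Algorithm()
   queries are used instead; this path therefore assumes cuDNN 7.x or older. */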


void free_convolutional_batchnorm(convolutional_layer *l)
{
    if (!l->share_layer) {
        if (l->scales) free(l->scales), l->scales = NULL;
        if (l->scale_updates) free(l->scale_updates), l->scale_updates = NULL;
        if (l->mean) free(l->mean), l->mean = NULL;
        if (l->variance) free(l->variance), l->variance = NULL;
        if (l->mean_delta) free(l->mean_delta), l->mean_delta = NULL;
        if (l->variance_delta) free(l->variance_delta), l->variance_delta = NULL;
        if (l->rolling_mean) free(l->rolling_mean), l->rolling_mean = NULL;
        if (l->rolling_variance) free(l->rolling_variance), l->rolling_variance = NULL;
        if (l->x) free(l->x), l->x = NULL;
        if (l->x_norm) free(l->x_norm), l->x_norm = NULL;

#ifdef GPU
        if (l->scales_gpu) cuda_free(l->scales_gpu), l->scales_gpu = NULL;
        if (l->scale_updates_gpu) cuda_free(l->scale_updates_gpu), l->scale_updates_gpu = NULL;
        if (l->mean_gpu) cuda_free(l->mean_gpu), l->mean_gpu = NULL;
        if (l->variance_gpu) cuda_free(l->variance_gpu), l->variance_gpu = NULL;
        if (l->mean_delta_gpu) cuda_free(l->mean_delta_gpu), l->mean_delta_gpu = NULL;
        if (l->variance_delta_gpu) cuda_free(l->variance_delta_gpu), l->variance_delta_gpu = NULL;
        if (l->rolling_mean_gpu) cuda_free(l->rolling_mean_gpu), l->rolling_mean_gpu = NULL;
        if (l->rolling_variance_gpu) cuda_free(l->rolling_variance_gpu), l->rolling_variance_gpu = NULL;
        if (l->x_gpu) cuda_free(l->x_gpu), l->x_gpu = NULL;
        if (l->x_norm_gpu) cuda_free(l->x_norm_gpu), l->x_norm_gpu = NULL;
#endif
    }
}

convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, int c, int n, int groups, int size, int stride_x, int stride_y, int dilation, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam, int use_bin_output, int index, int antialiasing, convolutional_layer *share_layer, int assisted_excitation, int deform, int train)
{
    int total_batch = batch*steps;
    int i;
    convolutional_layer l = { (LAYER_TYPE)0 };
    l.type = CONVOLUTIONAL;
    l.train = train;

    if (xnor) groups = 1;   // disable groups for XNOR-net
    if (groups < 1) groups = 1;

    const int blur_stride_x = stride_x;
    const int blur_stride_y = stride_y;
    l.antialiasing = antialiasing;
    if (antialiasing) {
        stride_x = stride_y = l.stride = l.stride_x = l.stride_y = 1; // use stride=1 in the host layer
    }

    l.deform = deform;
    l.assisted_excitation = assisted_excitation;
    l.share_layer = share_layer;
    l.index = index;
    l.h = h;
    l.w = w;
    l.c = c;
    l.groups = groups;
    l.n = n;
    l.binary = binary;
    l.xnor = xnor;
    l.use_bin_output = use_bin_output;
    l.batch = batch;
    l.steps = steps;
    l.stride = stride_x;
    l.stride_x = stride_x;
    l.stride_y = stride_y;
    l.dilation = dilation;
    l.size = size;
    l.pad = padding;
    l.batch_normalize = batch_normalize;
    l.learning_rate_scale = 1;
    l.activation = activation; // must be set before the weight initialization below, which checks it
    l.nweights = (c / groups) * n * size * size;

    if (l.share_layer) {
        if (l.size != l.share_layer->size || l.nweights != l.share_layer->nweights || l.c != l.share_layer->c || l.n != l.share_layer->n) {
            printf(" Layer size, nweights, channels or filters don't match for the share_layer\n");
            getchar();
        }

        l.weights = l.share_layer->weights;
        l.weight_updates = l.share_layer->weight_updates;

        l.biases = l.share_layer->biases;
        l.bias_updates = l.share_layer->bias_updates;
    }
    else {
        l.weights = (float*)xcalloc(l.nweights, sizeof(float));
        l.biases = (float*)xcalloc(n, sizeof(float));

        if (train) {
            l.weight_updates = (float*)xcalloc(l.nweights, sizeof(float));
            l.bias_updates = (float*)xcalloc(n, sizeof(float));
        }
    }

    // float scale = 1./sqrt(size*size*c);
    float scale = sqrt(2./(size*size*c/groups));
    if (l.activation == NORM_CHAN || l.activation == NORM_CHAN_SOFTMAX || l.activation == NORM_CHAN_SOFTMAX_MAXVAL) {
        for (i = 0; i < l.nweights; ++i) l.weights[i] = 1;   // rand_normal();
    }
    else {
        for (i = 0; i < l.nweights; ++i) l.weights[i] = scale*rand_uniform(-1, 1);   // rand_normal();
    }
    int out_h = convolutional_out_height(l);
    int out_w = convolutional_out_width(l);
    l.out_h = out_h;
    l.out_w = out_w;
    l.out_c = n;
    l.outputs = l.out_h * l.out_w * l.out_c;
    l.inputs = l.w * l.h * l.c;

    l.output = (float*)xcalloc(total_batch*l.outputs, sizeof(float));
#ifndef GPU
    if (train) l.delta = (float*)xcalloc(total_batch*l.outputs, sizeof(float));
#endif  // not GPU

    l.forward = forward_convolutional_layer;
    l.backward = backward_convolutional_layer;
    l.update = update_convolutional_layer;
    if(binary){
        l.binary_weights = (float*)xcalloc(l.nweights, sizeof(float));
        l.cweights = (char*)xcalloc(l.nweights, sizeof(char));
        l.scales = (float*)xcalloc(n, sizeof(float));
    }
    if(xnor){
        l.binary_weights = (float*)xcalloc(l.nweights, sizeof(float));
        l.binary_input = (float*)xcalloc(l.inputs * l.batch, sizeof(float));

        int align = 32;// 8;
        int src_align = l.out_h*l.out_w;
        l.bit_align = src_align + (align - src_align % align);

        l.mean_arr = (float*)xcalloc(l.n, sizeof(float));

        const size_t new_c = l.c / 32;
        size_t in_re_packed_input_size = new_c * l.w * l.h + 1;
        l.bin_re_packed_input = (uint32_t*)xcalloc(in_re_packed_input_size, sizeof(uint32_t));

        l.lda_align = 256;  // AVX2
        int k = l.size*l.size*l.c;
        size_t k_aligned = k + (l.lda_align - k%l.lda_align);
        size_t t_bit_input_size = k_aligned * l.bit_align / 8;
        l.t_bit_input = (char*)xcalloc(t_bit_input_size, sizeof(char));
    }

    if(batch_normalize){
        if (l.share_layer) {
            l.scales = l.share_layer->scales;
            l.scale_updates = l.share_layer->scale_updates;
            l.mean = l.share_layer->mean;
            l.variance = l.share_layer->variance;
            l.mean_delta = l.share_layer->mean_delta;
            l.variance_delta = l.share_layer->variance_delta;
            l.rolling_mean = l.share_layer->rolling_mean;
            l.rolling_variance = l.share_layer->rolling_variance;
        }
        else {
            l.scales = (float*)xcalloc(n, sizeof(float));
            for (i = 0; i < n; ++i) {
                l.scales[i] = 1;
            }
            if (train) {
                l.scale_updates = (float*)xcalloc(n, sizeof(float));

                l.mean = (float*)xcalloc(n, sizeof(float));
                l.variance = (float*)xcalloc(n, sizeof(float));

                l.mean_delta = (float*)xcalloc(n, sizeof(float));
                l.variance_delta = (float*)xcalloc(n, sizeof(float));
            }
            l.rolling_mean = (float*)xcalloc(n, sizeof(float));
            l.rolling_variance = (float*)xcalloc(n, sizeof(float));
        }

#ifndef GPU
        if (train) {
            l.x = (float*)xcalloc(total_batch * l.outputs, sizeof(float));
            l.x_norm = (float*)xcalloc(total_batch * l.outputs, sizeof(float));
        }
#endif  // not GPU
    }

#ifndef GPU
    if (l.activation == SWISH || l.activation == MISH) l.activation_input = (float*)calloc(total_batch*l.outputs, sizeof(float));
#endif  // not GPU

    if(adam){
        l.adam = 1;
        l.m = (float*)xcalloc(l.nweights, sizeof(float));
        l.v = (float*)xcalloc(l.nweights, sizeof(float));
        l.bias_m = (float*)xcalloc(n, sizeof(float));
        l.scale_m = (float*)xcalloc(n, sizeof(float));
        l.bias_v = (float*)xcalloc(n, sizeof(float));
        l.scale_v = (float*)xcalloc(n, sizeof(float));
    }

#ifdef GPU


    l.forward_gpu = forward_convolutional_layer_gpu;
    l.backward_gpu = backward_convolutional_layer_gpu;
    l.update_gpu = update_convolutional_layer_gpu;

    if(gpu_index >= 0){

        if (train && (l.activation == SWISH || l.activation == MISH)) {
            l.activation_input_gpu = cuda_make_array(l.activation_input, total_batch*l.outputs);
        }

        if (l.deform) l.weight_deform_gpu = cuda_make_array(NULL, l.nweights);

        if (adam) {
            l.m_gpu = cuda_make_array(l.m, l.nweights);
            l.v_gpu = cuda_make_array(l.v, l.nweights);
            l.bias_m_gpu = cuda_make_array(l.bias_m, n);
            l.bias_v_gpu = cuda_make_array(l.bias_v, n);
            l.scale_m_gpu = cuda_make_array(l.scale_m, n);
            l.scale_v_gpu = cuda_make_array(l.scale_v, n);
        }
        if (l.share_layer) {
            l.weights_gpu = l.share_layer->weights_gpu;
            l.weight_updates_gpu = l.share_layer->weight_updates_gpu;
            l.weights_gpu16 = l.share_layer->weights_gpu16;
            l.weight_updates_gpu16 = l.share_layer->weight_updates_gpu16;
            l.biases_gpu = l.share_layer->biases_gpu;
            l.bias_updates_gpu = l.share_layer->bias_updates_gpu;
        }
        else {
            l.weights_gpu = cuda_make_array(l.weights, l.nweights);
            if (train) l.weight_updates_gpu = cuda_make_array(l.weight_updates, l.nweights);
#ifdef CUDNN_HALF
            l.weights_gpu16 = cuda_make_array(NULL, l.nweights / 2 + 1);
            if (train) l.weight_updates_gpu16 = cuda_make_array(NULL, l.nweights / 2 + 1);
#endif  // CUDNN_HALF
            l.biases_gpu = cuda_make_array(l.biases, n);
            if (train) l.bias_updates_gpu = cuda_make_array(l.bias_updates, n);
        }

        l.output_gpu = cuda_make_array(l.output, total_batch*out_h*out_w*n);
        if (train) l.delta_gpu = cuda_make_array(l.delta, total_batch*out_h*out_w*n);

        if(binary){
            l.binary_weights_gpu = cuda_make_array(l.weights, l.nweights);
        }
        if(xnor){
            l.binary_weights_gpu = cuda_make_array(l.weights, l.nweights);
            l.mean_arr_gpu = cuda_make_array(0, l.n);
            l.binary_input_gpu = cuda_make_array(0, l.inputs*l.batch);
        }

        if(batch_normalize){
            if (l.share_layer) {
                l.scales_gpu = l.share_layer->scales_gpu;
                l.scale_updates_gpu = l.share_layer->scale_updates_gpu;
                l.mean_gpu = l.share_layer->mean_gpu;
                l.variance_gpu = l.share_layer->variance_gpu;
                l.rolling_mean_gpu = l.share_layer->rolling_mean_gpu;
                l.rolling_variance_gpu = l.share_layer->rolling_variance_gpu;
                l.mean_delta_gpu = l.share_layer->mean_delta_gpu;
                l.variance_delta_gpu = l.share_layer->variance_delta_gpu;
            }
            else {
                l.scales_gpu = cuda_make_array(l.scales, n);

                if (train) {
                    l.scale_updates_gpu = cuda_make_array(l.scale_updates, n);

                    l.mean_gpu = cuda_make_array(l.mean, n);
                    l.variance_gpu = cuda_make_array(l.variance, n);
                    l.m_cbn_avg_gpu = cuda_make_array(l.mean, n);
                    l.v_cbn_avg_gpu = cuda_make_array(l.variance, n);
#ifndef CUDNN
                    l.mean_delta_gpu = cuda_make_array(l.mean, n);
                    l.variance_delta_gpu = cuda_make_array(l.variance, n);
#endif  // CUDNN
                }

                l.rolling_mean_gpu = cuda_make_array(l.mean, n);
                l.rolling_variance_gpu = cuda_make_array(l.variance, n);
            }

            if (train) {
                l.x_gpu = cuda_make_array(l.output, total_batch*out_h*out_w*n);
#ifndef CUDNN
                l.x_norm_gpu = cuda_make_array(l.output, total_batch*out_h*out_w*n);
#endif  // CUDNN
            }
        }

        if (l.assisted_excitation)
        {
            const int size = l.out_w * l.out_h * l.batch;
            l.gt_gpu = cuda_make_array(NULL, size);
            l.a_avg_gpu = cuda_make_array(NULL, size);
        }
#ifdef CUDNN
        create_convolutional_cudnn_tensors(&l);
        cudnn_convolutional_setup(&l, cudnn_fastest, 0);
#endif  // CUDNN
    }
#endif  // GPU
    l.workspace_size = get_convolutional_workspace_size(l);

    //fprintf(stderr, "conv %5d %2d x%2d /%2d %4d x%4d x%4d -> %4d x%4d x%4d\n", n, size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c);
    l.bflops = (2.0 * l.nweights * l.out_h*l.out_w) / 1000000000.;
    if (l.xnor) l.bflops = l.bflops / 32;
    if (l.xnor && l.use_bin_output) fprintf(stderr, "convXB");
    else if (l.xnor) fprintf(stderr, "convX ");
    else if (l.share_layer) fprintf(stderr, "convS ");
    else if (l.assisted_excitation) fprintf(stderr, "convAE");
    else fprintf(stderr, "conv ");

    if (groups > 1) fprintf(stderr, "%5d/%4d ", n, groups);
    else fprintf(stderr, "%5d ", n);

    if (stride_x != stride_y) fprintf(stderr, "%2dx%2d/%2dx%2d ", size, size, stride_x, stride_y);
    else {
        if (dilation > 1) fprintf(stderr, "%2d x%2d/%2d(%1d)", size, size, stride_x, dilation);
        else fprintf(stderr, "%2d x%2d/%2d ", size, size, stride_x);
    }

    fprintf(stderr, "%4d x%4d x%4d -> %4d x%4d x%4d %5.3f BF\n", w, h, c, l.out_w, l.out_h, l.out_c, l.bflops);

    //fprintf(stderr, "%5d/%2d %2d x%2d /%2d(%d)%4d x%4d x%4d -> %4d x%4d x%4d %5.3f BF\n", n, groups, size, size, stride, dilation, w, h, c, l.out_w, l.out_h, l.out_c, l.bflops);

    if (l.antialiasing) {
        printf("AA: ");
        l.input_layer = (layer*)calloc(1, sizeof(layer));
        int blur_size = 3;
        int blur_pad = blur_size / 2;
        if (l.antialiasing == 2) {
            blur_size = 2;
            blur_pad = 0;
        }
        *(l.input_layer) = make_convolutional_layer(batch, steps, out_h, out_w, n, n, n, blur_size, blur_stride_x, blur_stride_y, 1, blur_pad, LINEAR, 0, 0, 0, 0, 0, index, 0, NULL, 0, 0, train);
        const int blur_nweights = n * blur_size * blur_size;  // (n / n) * n * blur_size * blur_size;
        int i;
        if (blur_size == 2) {
            for (i = 0; i < blur_nweights; i += (blur_size*blur_size)) {
                l.input_layer->weights[i + 0] = 1 / 4.f;
                l.input_layer->weights[i + 1] = 1 / 4.f;
                l.input_layer->weights[i + 2] = 1 / 4.f;
                l.input_layer->weights[i + 3] = 1 / 4.f;
            }
        }
        else {
            for (i = 0; i < blur_nweights; i += (blur_size*blur_size)) {
                l.input_layer->weights[i + 0] = 1 / 16.f;
                l.input_layer->weights[i + 1] = 2 / 16.f;
                l.input_layer->weights[i + 2] = 1 / 16.f;

                l.input_layer->weights[i + 3] = 2 / 16.f;
                l.input_layer->weights[i + 4] = 4 / 16.f;
                l.input_layer->weights[i + 5] = 2 / 16.f;

                l.input_layer->weights[i + 6] = 1 / 16.f;
                l.input_layer->weights[i + 7] = 2 / 16.f;
                l.input_layer->weights[i + 8] = 1 / 16.f;
            }
        }
        for (i = 0; i < n; ++i) l.input_layer->biases[i] = 0;
#ifdef GPU
        if (gpu_index >= 0) {
            l.input_antialiasing_gpu = cuda_make_array(NULL, l.batch*l.outputs);
            push_convolutional_layer(*(l.input_layer));
        }
#endif  // GPU
    }

    return l;
}
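/* Usage sketch (illustrative only; the parameter values are assumptions, not
   taken from this file): create a batch-normalized 3x3/1 convolution with 32
   filters over a 416x416x3 input for a single image during training:

       convolutional_layer l = make_convolutional_layer(
           1, 1, 416, 416, 3, 32, 1, 3, 1, 1, 1, 1, LEAKY,
           1, 0, 0, 0, 0, 0, 0, NULL, 0, 0, 1);
*/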

void denormalize_convolutional_layer(convolutional_layer l)
{
    int i, j;
    const int filter_weights = l.nweights / l.n;   // weights per filter: (c/groups)*size*size
    for(i = 0; i < l.n; ++i){
        float scale = l.scales[i]/sqrt(l.rolling_variance[i] + .00001);
        for(j = 0; j < filter_weights; ++j){
            l.weights[i*filter_weights + j] *= scale;
        }
        l.biases[i] -= l.rolling_mean[i] * scale;
        l.scales[i] = 1;
        l.rolling_mean[i] = 0;
        l.rolling_variance[i] = 1;
    }
}
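/* Folding batch normalization into the convolution, per filter i:
     w'_i = w_i * scales_i / sqrt(rolling_variance_i + eps)
     b'_i = b_i - rolling_mean_i * scales_i / sqrt(rolling_variance_i + eps)
   after which scale = 1, mean = 0, variance = 1 reproduce the same outputs. */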

void test_convolutional_layer()
{
    convolutional_layer l = make_convolutional_layer(1, 1, 5, 5, 3, 2, 1, 5, 2, 2, 1, 1, LEAKY, 1, 0, 0, 0, 0, 0, 0, NULL, 0, 0, 0);
    l.batch_normalize = 1;
    float data[] = {1,1,1,1,1,
        1,1,1,1,1,
        1,1,1,1,1,
        1,1,1,1,1,
        1,1,1,1,1,
        2,2,2,2,2,
        2,2,2,2,2,
        2,2,2,2,2,
        2,2,2,2,2,
        2,2,2,2,2,
        3,3,3,3,3,
        3,3,3,3,3,
        3,3,3,3,3,
        3,3,3,3,3,
        3,3,3,3,3};
    network_state state = {0};
    state.input = data;
    forward_convolutional_layer(l, state);
}

void resize_convolutional_layer(convolutional_layer *l, int w, int h)
{
    int total_batch = l->batch*l->steps;
    int old_w = l->w;
    int old_h = l->h;
    l->w = w;
    l->h = h;
    int out_w = convolutional_out_width(*l);
    int out_h = convolutional_out_height(*l);

    l->out_w = out_w;
    l->out_h = out_h;

    l->outputs = l->out_h * l->out_w * l->out_c;
    l->inputs = l->w * l->h * l->c;


    l->output = (float*)xrealloc(l->output, total_batch * l->outputs * sizeof(float));
    if (l->train) {
        l->delta = (float*)xrealloc(l->delta, total_batch * l->outputs * sizeof(float));

        if (l->batch_normalize) {
            l->x = (float*)xrealloc(l->x, total_batch * l->outputs * sizeof(float));
            l->x_norm = (float*)xrealloc(l->x_norm, total_batch * l->outputs * sizeof(float));
        }
    }

    if (l->xnor) {
        //l->binary_input = realloc(l->inputs*l->batch, sizeof(float));
    }

    if (l->activation == SWISH || l->activation == MISH) l->activation_input = (float*)realloc(l->activation_input, total_batch*l->outputs * sizeof(float));
#ifdef GPU
    if (old_w < w || old_h < h || l->dynamic_minibatch) {
        if (l->train) {
            cuda_free(l->delta_gpu);
            l->delta_gpu = cuda_make_array(l->delta, total_batch*l->outputs);
        }

        cuda_free(l->output_gpu);
        l->output_gpu = cuda_make_array(l->output, total_batch*l->outputs);

        if (l->batch_normalize) {
            cuda_free(l->x_gpu);
            l->x_gpu = cuda_make_array(l->output, total_batch*l->outputs);

#ifndef CUDNN
            cuda_free(l->x_norm_gpu);
            l->x_norm_gpu = cuda_make_array(l->output, total_batch*l->outputs);
#endif  // CUDNN
        }

        if (l->xnor) {
            cuda_free(l->binary_input_gpu);
            l->binary_input_gpu = cuda_make_array(0, l->inputs*l->batch);
        }

        if (l->activation == SWISH || l->activation == MISH) {
            cuda_free(l->activation_input_gpu);
            l->activation_input_gpu = cuda_make_array(l->activation_input, total_batch*l->outputs);
        }

        if (l->assisted_excitation)
        {
            cuda_free(l->gt_gpu);
            cuda_free(l->a_avg_gpu);

            const int size = l->out_w * l->out_h * l->batch;
            l->gt_gpu = cuda_make_array(NULL, size);
            l->a_avg_gpu = cuda_make_array(NULL, size);
        }
    }
#ifdef CUDNN
    cudnn_convolutional_setup(l, cudnn_fastest, 0);
#endif
#endif
    l->workspace_size = get_convolutional_workspace_size(*l);

#ifdef CUDNN
    // check for excessive memory consumption
    size_t free_byte;
    size_t total_byte;
    CHECK_CUDA(cudaMemGetInfo(&free_byte, &total_byte));
    if (l->workspace_size > free_byte || l->workspace_size >= total_byte / 2) {
        printf(" used slow CUDNN algo without Workspace! Need memory: %zu, available: %zu\n", l->workspace_size, (free_byte < total_byte/2) ? free_byte : total_byte/2);
        cudnn_convolutional_setup(l, cudnn_smallest, 0);
        l->workspace_size = get_convolutional_workspace_size(*l);
    }
#endif
}

void set_specified_workspace_limit(convolutional_layer *l, size_t workspace_size_limit)
{
#ifdef CUDNN
    size_t free_byte;
    size_t total_byte;
    CHECK_CUDA(cudaMemGetInfo(&free_byte, &total_byte));
    cudnn_convolutional_setup(l, cudnn_specify, workspace_size_limit);
    l->workspace_size = get_convolutional_workspace_size(*l);
    //printf("Set specified workspace limit for cuDNN: %zu, available: %zu, workspace = %zu \n", workspace_size_limit, free_byte, l->workspace_size);
#endif  // CUDNN
}

void add_bias(float *output, float *biases, int batch, int n, int size)
{
    int i,j,b;
    for(b = 0; b < batch; ++b){
        for(i = 0; i < n; ++i){
            for(j = 0; j < size; ++j){
                output[(b*n + i)*size + j] += biases[i];
            }
        }
    }
}

void scale_bias(float *output, float *scales, int batch, int n, int size)
{
    int i,j,b;
    for(b = 0; b < batch; ++b){
        for(i = 0; i < n; ++i){
            for(j = 0; j < size; ++j){
                output[(b*n + i)*size + j] *= scales[i];
            }
        }
    }
}

void backward_bias(float *bias_updates, float *delta, int batch, int n, int size)
{
    int i,b;
    for(b = 0; b < batch; ++b){
        for(i = 0; i < n; ++i){
            bias_updates[i] += sum_array(delta+size*(i+b*n), size);
        }
    }
}

void gemm_nn_custom(int M, int N, int K, float ALPHA,
    float *A, int lda,
    float *B, int ldb,
    float *C, int ldc)
{
    int i, j, k;
    for (i = 0; i < M; ++i) {
        for (k = 0; k < K; ++k) {
            PUT_IN_REGISTER float A_PART = ALPHA * A[i * lda + k];
            //printf("\n weight = %f \n", A_PART);
            for (j = 0; j < N; ++j) {
                C[i*ldc + j] += A_PART*B[k*ldb + j];
            }
        }
    }
}
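/* The i-k-j loop order above keeps the innermost loop streaming through
   contiguous rows of B and C, which is far more cache-friendly for row-major
   matrices than the textbook i-j-k order. */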


void get_mean_array(float *src, size_t size, size_t filters, float *mean_arr) {
    size_t i, counter;
    counter = 0;
    for (i = 0; i < size; i += size / filters) {
        mean_arr[counter++] = fabs(src[i]);
    }
}
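/* Note: this relies on src being already-binarized weights, where every
   element of a filter has the same magnitude (+/- mean), so sampling the
   first element of each filter block recovers that filter's scaling factor. */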

/*
void float_to_bit(float *src, unsigned char *dst, size_t size) {

    size_t dst_size = size / 8 + 1;
    memset(dst, 0, dst_size);
    size_t i, dst_i, dst_shift;
    for (i = 0; i < size; ++i) {
        if (src[i] > 0) set_bit(dst, i);
    }
}
*/

void bit_to_float(unsigned char *src, float *dst, size_t size, size_t filters, float *mean_arr) {
    memset(dst, 0, size * sizeof(float));
    size_t i;

    for (i = 0; i < size; ++i) {
        float mean_val = 1;
        if (mean_arr != NULL) mean_val = fabs(mean_arr[i / (size / filters)]);
        if (get_bit(src, i)) dst[i] = mean_val;
        else dst[i] = -mean_val;
    }
}

void binary_align_weights(convolutional_layer *l)
{
    int m = l->n;   // (l->n / l->groups)
    int k = l->size*l->size*l->c;   // l->size*l->size*(l->c / l->groups)
    size_t new_lda = k + (l->lda_align - k % l->lda_align); // (k / 8 + 1) * 8;
    l->new_lda = new_lda;

    binarize_weights(l->weights, m, k, l->binary_weights);

    size_t align_weights_size = new_lda * m;
    l->align_bit_weights_size = align_weights_size / 8 + 1;
    float* align_weights = (float*)xcalloc(align_weights_size, sizeof(float));
    l->align_bit_weights = (char*)xcalloc(l->align_bit_weights_size, sizeof(char));

    size_t i, j;
    // align A without transpose
    for (i = 0; i < m; ++i) {
        for (j = 0; j < k; ++j) {
            align_weights[i*new_lda + j] = l->binary_weights[i*k + j];
        }
    }


    if (l->c % 32 == 0)
    //if (gpu_index < 0 && l->stride == 1 && l->pad == 1 && l->c % 32 == 0)
    //if (l->stride == 1 && l->pad == 1 && l->c % 32 == 0)
    {
        int fil, chan;
        const int items_per_filter = l->c * l->size * l->size;
        //const int dst_items_per_filter = new_lda;
        for (fil = 0; fil < l->n; ++fil)
        {
            for (chan = 0; chan < l->c; chan += 32)
            {
                const int items_per_channel = l->size*l->size;
                for (i = 0; i < items_per_channel; ++i)
                {
                    //uint32_t val = 0;
                    int c_pack;
                    for (c_pack = 0; c_pack < 32; ++c_pack) {
                        float src = l->binary_weights[fil*items_per_filter + (chan + c_pack)*items_per_channel + i];

                        //align_weights[fil*items_per_filter + chan*items_per_channel + i * 32 + c_pack] = src;

                        align_weights[fil*new_lda + chan*items_per_channel + i*32 + c_pack] = src;
                        //val |= (src << c);
                    }

                }
            }
        }

        //printf("\n l.index = %d \t aw[0] = %f, aw[1] = %f, aw[2] = %f, aw[3] = %f \n", l->index, align_weights[0], align_weights[1], align_weights[2], align_weights[3]);
        //memcpy(l->binary_weights, align_weights, (l->size * l->size * l->c * l->n) * sizeof(float));

        float_to_bit(align_weights, (unsigned char*)l->align_bit_weights, align_weights_size);

        //if (l->n >= 32)
        if (gpu_index >= 0)
        {
            //int M = l->n;
            //int N = l->out_w*l->out_h;
            //printf("\n M = %d, N = %d, M %% 8 = %d, N %% 8 = %d - weights \n", M, N, M % 8, N % 8);
            //printf("\n l.w = %d, l.c = %d, l.n = %d \n", l->w, l->c, l->n);
            for (i = 0; i < align_weights_size / 8; ++i) l->align_bit_weights[i] = ~(l->align_bit_weights[i]);
        }



        get_mean_array(l->binary_weights, m*k, l->n, l->mean_arr);
        //get_mean_array(l->binary_weights, m*new_lda, l->n, l->mean_arr);
    }
    else {
        float_to_bit(align_weights, (unsigned char*)l->align_bit_weights, align_weights_size);

        get_mean_array(l->binary_weights, m*k, l->n, l->mean_arr);
    }

    //l->mean_arr = calloc(l->n, sizeof(float));

    //get_mean_array(align_weights, align_weights_size, l->n, l->mean_arr);




#ifdef GPU
    cudaError_t status;
    l->align_workspace_size = l->bit_align * l->size * l->size * l->c;
    status = cudaMalloc((void **)&l->align_workspace_gpu, l->align_workspace_size * sizeof(float));
    status = cudaMalloc((void **)&l->transposed_align_workspace_gpu, l->align_workspace_size * sizeof(float));
    CHECK_CUDA(status);

    //l->align_bit_weights_gpu = cuda_make_array(l->align_bit_weights, l->align_bit_weights_size * sizeof(char)/sizeof(float));
    status = cudaMalloc((void **)&l->align_bit_weights_gpu, l->align_bit_weights_size);
    CHECK_CUDA(status);
    status = cudaMemcpy(l->align_bit_weights_gpu, l->align_bit_weights, l->align_bit_weights_size, cudaMemcpyHostToDevice);
    CHECK_CUDA(status);
    status = cudaMemcpy(l->binary_weights_gpu, l->binary_weights, m*k * sizeof(float), cudaMemcpyHostToDevice);
    CHECK_CUDA(status);

    //l->mean_arr_gpu = cuda_make_array(l->mean_arr, l->n);
    cuda_push_array(l->mean_arr_gpu, l->mean_arr, l->n);
    CHECK_CUDA(cudaDeviceSynchronize());
#endif  // GPU

    free(align_weights);
}

// binary transpose
size_t binary_transpose_align_input(int k, int n, float *b, char **t_bit_input, size_t ldb_align, int bit_align)
{
    size_t new_ldb = k + (ldb_align - k%ldb_align);   // (k / 8 + 1) * 8;
    //printf("\n n = %d, bit_align = %d \n", n, bit_align);
    size_t t_input_size = new_ldb * bit_align;   // n;
    size_t t_bit_input_size = t_input_size / 8;   // +1;

    memset(*t_bit_input, 0, t_bit_input_size * sizeof(char));
    //int src_size = k * bit_align;

    // b - [bit_align, k] - [l.bit_align, l.size*l.size*l.c] = src_size
    // t_input - [bit_align, k] - [n', k]
    // t_bit_input - [new_ldb, n] - [k', n]

    //transpose_bin(t_input, *t_bit_input, k, n, bit_align, new_ldb, 8);
    transpose_bin((uint32_t*)b, (uint32_t*)*t_bit_input, k, n, bit_align, new_ldb, 8);

    return t_input_size;
}
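/* Example (illustrative): with k = l.size*l.size*l.c = 288 and
   ldb_align = 256, new_ldb = 288 + (256 - 288 % 256) = 512, so each row of
   the transposed bit matrix is padded out to a 256-bit-aligned boundary for
   the AVX2 bitwise GEMM. */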


void forward_convolutional_layer(convolutional_layer l, network_state state)
{
    int out_h = convolutional_out_height(l);
    int out_w = convolutional_out_width(l);
    int i, j;

    fill_cpu(l.outputs*l.batch, 0, l.output, 1);

    if (l.xnor && (!l.align_bit_weights || state.train)) {
        // binarize the float weights (while training, or when the aligned
        // bit-weights have not been prepared yet)
        binarize_weights(l.weights, l.n, l.nweights, l.binary_weights);
        //printf("\n binarize_weights l.align_bit_weights = %p \n", l.align_bit_weights);
        swap_binary(&l);
        binarize_cpu(state.input, l.c*l.h*l.w*l.batch, l.binary_input);
        state.input = l.binary_input;
    }

    int m = l.n / l.groups;
    int k = l.size*l.size*l.c / l.groups;
    int n = out_h*out_w;

    static int u = 0;   // debug iteration counter (unused; not thread-safe)
    u++;

    for(i = 0; i < l.batch; ++i)
    {
        for (j = 0; j < l.groups; ++j)
        {
            float *a = l.weights +j*l.nweights / l.groups;
            float *b = state.workspace;
            float *c = l.output +(i*l.groups + j)*n*m;

            //gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
            //gemm_nn_custom(m, n, k, 1, a, k, b, n, c, n);
            if (l.xnor && l.align_bit_weights && !state.train && l.stride_x == l.stride_y)
            {
                memset(b, 0, l.bit_align*l.size*l.size*l.c * sizeof(float));

                if (l.c % 32 == 0)
                {
                    //printf(" l.index = %d - new XNOR \n", l.index);

                    int ldb_align = l.lda_align;
                    size_t new_ldb = k + (ldb_align - k%ldb_align); // (k / 8 + 1) * 8;
                    //size_t t_input_size = new_ldb * l.bit_align;// n;
                    //size_t t_bit_input_size = t_input_size / 8;// +1;

                    int re_packed_input_size = l.c * l.w * l.h;
                    memset(state.workspace, 0, re_packed_input_size * sizeof(float));

                    const size_t new_c = l.c / 32;
                    size_t in_re_packed_input_size = new_c * l.w * l.h + 1;
                    memset(l.bin_re_packed_input, 0, in_re_packed_input_size * sizeof(uint32_t));

                    //float *re_packed_input = calloc(l.c * l.w * l.h, sizeof(float));
                    //uint32_t *bin_re_packed_input = calloc(new_c * l.w * l.h + 1, sizeof(uint32_t));

                    // float32x4 by channel (as in cuDNN)
                    repack_input(state.input, state.workspace, l.w, l.h, l.c);

                    // 32 x floats -> 1 x uint32_t
                    float_to_bit(state.workspace, (unsigned char *)l.bin_re_packed_input, l.c * l.w * l.h);

                    //free(re_packed_input);

                    // slow - convolve the packed inputs and weights: float x 32 by channel (as in cuDNN)
                    //convolution_repacked((uint32_t *)bin_re_packed_input, (uint32_t *)l.align_bit_weights, l.output,
                    //    l.w, l.h, l.c, l.n, l.size, l.pad, l.new_lda, l.mean_arr);

                    // // then exit from if()


                    im2col_cpu_custom((float *)l.bin_re_packed_input, new_c, l.h, l.w, l.size, l.stride, l.pad, state.workspace);
                    //im2col_cpu((float *)bin_re_packed_input, new_c, l.h, l.w, l.size, l.stride, l.pad, b);

                    //free(bin_re_packed_input);

                    int new_k = l.size*l.size*l.c / 32;

                    // good for (l.c == 64)
                    //gemm_nn_bin_32bit_packed(m, n, new_k, 1,
                    //    l.align_bit_weights, l.new_lda/32,
                    //    b, n,
                    //    c, n, l.mean_arr);

                    // // then exit from if()

                    transpose_uint32((uint32_t *)state.workspace, (uint32_t*)l.t_bit_input, new_k, n, n, new_ldb);

                    // the main GEMM function
                    gemm_nn_custom_bin_mean_transposed(m, n, k, 1, (unsigned char*)l.align_bit_weights, new_ldb, (unsigned char*)l.t_bit_input, new_ldb, c, n, l.mean_arr);

                    // // alternative GEMM
                    //gemm_nn_bin_transposed_32bit_packed(m, n, new_k, 1,
                    //    l.align_bit_weights, l.new_lda/32,
                    //    t_bit_input, new_ldb / 32,
                    //    c, n, l.mean_arr);

                    //free(t_bit_input);

                }
                else
                {   // else (l.c % 32 != 0)

                    //--------------------------------------------------------
                    //printf(" l.index = %d - old XNOR \n", l.index);

                    //im2col_cpu_custom_align(state.input, l.c, l.h, l.w, l.size, l.stride, l.pad, b, l.bit_align);
                    im2col_cpu_custom_bin(state.input, l.c, l.h, l.w, l.size, l.stride, l.pad, state.workspace, l.bit_align);

                    //size_t output_size = l.outputs;
                    //float *count_output = calloc(output_size, sizeof(float));
                    //size_t bit_output_size = output_size / 8 + 1;
                    //char *bit_output = calloc(bit_output_size, sizeof(char));

                    //size_t input_size = n * k; // (out_h*out_w) X (l.size*l.size*l.c) : after im2col()
                    //size_t bit_input_size = input_size / 8 + 1;
                    //char *bit_input = calloc(bit_input_size, sizeof(char));

                    //size_t weights_size = k * m; //l.size*l.size*l.c*l.n; // l.nweights
                    //size_t bit_weights_size = weights_size / 8 + 1;

                    //char *bit_weights = calloc(bit_weights_size, sizeof(char));
                    //float *mean_arr = calloc(l.n, sizeof(float));

                    // transpose B from NxK to KxN (x-axis (ldb = l.size*l.size*l.c) - should be a multiple of 8 bits)
                    {
                        //size_t ldb_align = 256; // 256 bit for AVX2
                        int ldb_align = l.lda_align;
                        size_t new_ldb = k + (ldb_align - k%ldb_align);
                        size_t t_input_size = binary_transpose_align_input(k, n, state.workspace, &l.t_bit_input, ldb_align, l.bit_align);

                        // 5x times faster than gemm()-float32
                        gemm_nn_custom_bin_mean_transposed(m, n, k, 1, (unsigned char*)l.align_bit_weights, new_ldb, (unsigned char*)l.t_bit_input, new_ldb, c, n, l.mean_arr);

                        //gemm_nn_custom_bin_mean_transposed(m, n, k, 1, bit_weights, k, t_bit_input, new_ldb, c, n, mean_arr);

                        //free(t_input);
                        //free(t_bit_input);
                    }

                }

                add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w);

                //activate_array(l.output, m*n*l.batch, l.activation);
                if (l.activation == SWISH) activate_array_swish(l.output, l.outputs*l.batch, l.activation_input, l.output);
                else if (l.activation == MISH) activate_array_mish(l.output, l.outputs*l.batch, l.activation_input, l.output);
                else if (l.activation == NORM_CHAN) activate_array_normalize_channels(l.output, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output);
                else if (l.activation == NORM_CHAN_SOFTMAX) activate_array_normalize_channels_softmax(l.output, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output, 0);
                else if (l.activation == NORM_CHAN_SOFTMAX_MAXVAL) activate_array_normalize_channels_softmax(l.output, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output, 1);
                else activate_array_cpu_custom(l.output, m*n*l.batch, l.activation);
                return;

            }
            else {
                //printf(" l.index = %d - FP32 \n", l.index);
                float *im = state.input + (i*l.groups + j)*(l.c / l.groups)*l.h*l.w;
                if (l.size == 1) {
                    b = im;
                }
                else {
                    //im2col_cpu(im, l.c / l.groups, l.h, l.w, l.size, l.stride, l.pad, b);

                    im2col_cpu_ext(im,  // input
                        l.c / l.groups,     // input channels
                        l.h, l.w,           // input size (h, w)
                        l.size, l.size,     // kernel size (h, w)
                        l.pad * l.dilation, l.pad * l.dilation,   // padding (h, w)
                        l.stride_y, l.stride_x, // stride (h, w)
                        l.dilation, l.dilation, // dilation (h, w)
                        b);                 // output

                }

                gemm(0, 0, m, n, k, 1, a, k, b, n, 1, c, n);
                // bit-count to float
            }
            //c += n*m;
            //state.input += l.c*l.h*l.w;
        }
    }

    if(l.batch_normalize){
        forward_batchnorm_layer(l, state);
    }
    else {
        add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w);
    }

    //activate_array(l.output, m*n*l.batch, l.activation);
    if (l.activation == SWISH) activate_array_swish(l.output, l.outputs*l.batch, l.activation_input, l.output);
    else if (l.activation == MISH) activate_array_mish(l.output, l.outputs*l.batch, l.activation_input, l.output);
    else if (l.activation == NORM_CHAN) activate_array_normalize_channels(l.output, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output);
    else if (l.activation == NORM_CHAN_SOFTMAX) activate_array_normalize_channels_softmax(l.output, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output, 0);
    else if (l.activation == NORM_CHAN_SOFTMAX_MAXVAL) activate_array_normalize_channels_softmax(l.output, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output, 1);
    else activate_array_cpu_custom(l.output, l.outputs*l.batch, l.activation);

    if(l.binary || l.xnor) swap_binary(&l);

    //visualize_convolutional_layer(l, "conv_visual", NULL);
    //wait_until_press_key_cv();

    if(l.assisted_excitation && state.train) assisted_excitation_forward(l, state);

    if (l.antialiasing) {
        network_state s = { 0 };
        s.train = state.train;
        s.workspace = state.workspace;
        s.net = state.net;
        s.input = l.output;
        forward_convolutional_layer(*(l.input_layer), s);
        //simple_copy_ongpu(l.outputs*l.batch, l.output, l.input_antialiasing);
        memcpy(l.output, l.input_layer->output, l.input_layer->outputs * l.input_layer->batch * sizeof(float));
    }
}

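/* Assisted excitation (see the inline formula in the loop below): during
   training, alpha(t) * g(i,j) * a_avg(i,j) is added to every channel of the
   activation map, where g is a ground-truth object mask and alpha decays
   along a cosine schedule as training progresses. */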
void assisted_excitation_forward(convolutional_layer l, network_state state)
{
    const int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions);

    // epoch
    //const float epoch = (float)(*state.net.seen) / state.net.train_images_num;

    // calculate alpha
    //const float alpha = (1 + cos(3.141592 * iteration_num)) / (2 * state.net.max_batches);
    //const float alpha = (1 + cos(3.141592 * epoch)) / (2 * state.net.max_batches);
    float alpha = (1 + cos(3.141592 * iteration_num / state.net.max_batches));

    if (l.assisted_excitation > 1) {
        if (iteration_num > l.assisted_excitation) alpha = 0;
        else alpha = (1 + cos(3.141592 * iteration_num / l.assisted_excitation));
    }

    //printf("\n epoch = %f, alpha = %f, seen = %d, max_batches = %d, train_images_num = %d \n",
    //    epoch, alpha, (*state.net.seen), state.net.max_batches, state.net.train_images_num);

    float *a_avg = (float *)xcalloc(l.out_w * l.out_h * l.batch, sizeof(float));
    float *g = (float *)xcalloc(l.out_w * l.out_h * l.batch, sizeof(float));

    int b;
    int w, h, c;

    l.max_boxes = state.net.num_boxes;
    l.truths = l.max_boxes*(4 + 1);

    for (b = 0; b < l.batch; ++b)
    {
        // calculate G
        int t;
        for (t = 0; t < state.net.num_boxes; ++t) {
            box truth = float_to_box_stride(state.truth + t*(4 + 1) + b*l.truths, 1);
            if (!truth.x) break;  // continue;

            int left = floor((truth.x - truth.w / 2) * l.out_w);
            int right = ceil((truth.x + truth.w / 2) * l.out_w);
            int top = floor((truth.y - truth.h / 2) * l.out_h);
            int bottom = ceil((truth.y + truth.h / 2) * l.out_h);

            for (w = left; w <= right; w++) {
                for (h = top; h < bottom; h++) {
                    g[w + l.out_w * h + l.out_w*l.out_h*b] = 1;
                }
            }
        }
    }

    for (b = 0; b < l.batch; ++b)
    {
        // calculate average A
        for (w = 0; w < l.out_w; w++) {
            for (h = 0; h < l.out_h; h++) {
                for (c = 0; c < l.out_c; c++) {
                    a_avg[w + l.out_w*(h + l.out_h*b)] += l.output[w + l.out_w*(h + l.out_h*(c + l.out_c*b))];
                }
                a_avg[w + l.out_w*(h + l.out_h*b)] /= l.out_c;  // a_avg / d
            }
        }
    }

    // change activation
    for (b = 0; b < l.batch; ++b)
    {
        for (w = 0; w < l.out_w; w++) {
            for (h = 0; h < l.out_h; h++) {
                for (c = 0; c < l.out_c; c++)
                {
                    // a = a + alpha(t) + e(c,i,j) = a + alpha(t) + g(i,j) * avg_a(i,j) / channels
                    l.output[w + l.out_w*(h + l.out_h*(c + l.out_c*b))] +=
                        alpha *
                        g[w + l.out_w*(h + l.out_h*b)] *
                        a_avg[w + l.out_w*(h + l.out_h*b)];

                    //l.output[w + l.out_w*(h + l.out_h*(c + l.out_c*b))] =
                    //    alpha * g[w + l.out_w*(h + l.out_h*b)] * a_avg[w + l.out_w*(h + l.out_h*b)];
                }
            }
        }
    }

    if(0)   // visualize ground truth
    {
#ifdef OPENCV
        for (b = 0; b < l.batch; ++b)
        {
            image img = float_to_image(l.out_w, l.out_h, 1, &g[l.out_w*l.out_h*b]);
            char buff[100];
            sprintf(buff, "a_excitation_%d", b);
            show_image_cv(img, buff);

            image img2 = float_to_image(l.out_w, l.out_h, 1, &l.output[l.out_w*l.out_h*l.out_c*b]);
            char buff2[100];
            sprintf(buff2, "a_excitation_act_%d", b);
            show_image_cv(img2, buff2);
            wait_key_cv(5);
        }
        wait_until_press_key_cv();
#endif  // OPENCV
    }

    free(g);
    free(a_avg);
}


void backward_convolutional_layer(convolutional_layer l, network_state state)
{
    int i, j;
    int m = l.n / l.groups;
    int n = l.size*l.size*l.c / l.groups;
    int k = l.out_w*l.out_h;

    if (l.activation == SWISH) gradient_array_swish(l.output, l.outputs*l.batch, l.activation_input, l.delta);
    else if (l.activation == MISH) gradient_array_mish(l.outputs*l.batch, l.activation_input, l.delta);
    else if (l.activation == NORM_CHAN_SOFTMAX || l.activation == NORM_CHAN_SOFTMAX_MAXVAL) gradient_array_normalize_channels_softmax(l.output, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.delta);
    else if (l.activation == NORM_CHAN) gradient_array_normalize_channels(l.output, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.delta);
    else gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);

    if (l.batch_normalize) {
        backward_batchnorm_layer(l, state);
    }
    else {
        backward_bias(l.bias_updates, l.delta, l.batch, l.n, k);
    }

    for (i = 0; i < l.batch; ++i) {
        for (j = 0; j < l.groups; ++j) {
            float *a = l.delta + (i*l.groups + j)*m*k;
            float *b = state.workspace;
            float *c = l.weight_updates + j*l.nweights / l.groups;

            float *im = state.input + (i*l.groups + j)* (l.c / l.groups)*l.h*l.w;

            //im2col_cpu(im, l.c / l.groups, l.h, l.w, l.size, l.stride, l.pad, b);
            im2col_cpu_ext(
                im,                 // input
                l.c / l.groups,     // input channels
                l.h, l.w,           // input size (h, w)
                l.size, l.size,     // kernel size (h, w)
                l.pad * l.dilation, l.pad * l.dilation,   // padding (h, w)
                l.stride_y, l.stride_x, // stride (h, w)
                l.dilation, l.dilation, // dilation (h, w)
                b);                 // output

            gemm(0, 1, m, n, k, 1, a, k, b, k, 1, c, n);

            if (state.delta) {
                a = l.weights + j*l.nweights / l.groups;
                b = l.delta + (i*l.groups + j)*m*k;
                c = state.workspace;

                gemm(1, 0, n, k, m, 1, a, n, b, k, 0, c, k);

                //col2im_cpu(state.workspace, l.c / l.groups, l.h, l.w, l.size, l.stride,
                //    l.pad, state.delta + (i*l.groups + j)*l.c / l.groups*l.h*l.w);

                col2im_cpu_ext(
                    state.workspace,    // input
                    l.c / l.groups,     // input channels (h, w)
                    l.h, l.w,           // input size (h, w)
                    l.size, l.size,     // kernel size (h, w)
                    l.pad * l.dilation, l.pad * l.dilation,   // padding (h, w)
                    l.stride_y, l.stride_x, // stride (h, w)
                    l.dilation, l.dilation, // dilation (h, w)
                    state.delta + (i*l.groups + j)* (l.c / l.groups)*l.h*l.w);  // output (delta)
            }
        }
    }
}
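/* Gradient GEMMs above, per image and group (m = filters, n = size*size*c/groups,
   k = out_h*out_w):
     gemm(0,1,...): weight_updates (m x n) += delta (m x k) * im2col(input)^T (k x n)
     gemm(1,0,...): workspace (n x k) = weights^T (n x m) * delta (m x k)
   after which col2im_cpu_ext() scatters the workspace columns back into
   state.delta, the gradient w.r.t. the layer input. */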

void update_convolutional_layer(convolutional_layer l, int batch, float learning_rate_init, float momentum, float decay)
{
    float learning_rate = learning_rate_init*l.learning_rate_scale;
    //float momentum = a.momentum;
    //float decay = a.decay;
    //int batch = a.batch;

    axpy_cpu(l.nweights, -decay*batch, l.weights, 1, l.weight_updates, 1);
    axpy_cpu(l.nweights, learning_rate / batch, l.weight_updates, 1, l.weights, 1);
    scal_cpu(l.nweights, momentum, l.weight_updates, 1);

    axpy_cpu(l.n, learning_rate / batch, l.bias_updates, 1, l.biases, 1);
    scal_cpu(l.n, momentum, l.bias_updates, 1);

    if (l.scales) {
        axpy_cpu(l.n, learning_rate / batch, l.scale_updates, 1, l.scales, 1);
        scal_cpu(l.n, momentum, l.scale_updates, 1);
    }
}
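/* SGD with momentum and weight decay, as implemented above:
     weight_updates = weight_updates - decay*batch*weights
     weights        = weights + (learning_rate/batch) * weight_updates
     weight_updates = momentum * weight_updates
   Biases and batch-norm scales get the same update without the decay term. */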


image get_convolutional_weight(convolutional_layer l, int i)
{
    int h = l.size;
    int w = l.size;
    int c = l.c / l.groups;
    return float_to_image(w, h, c, l.weights + i*h*w*c);
}

void rgbgr_weights(convolutional_layer l)
{
    int i;
    for (i = 0; i < l.n; ++i) {
        image im = get_convolutional_weight(l, i);
        if (im.c == 3) {
            rgbgr_image(im);
        }
    }
}

void rescale_weights(convolutional_layer l, float scale, float trans)
{
    int i;
    for (i = 0; i < l.n; ++i) {
        image im = get_convolutional_weight(l, i);
        if (im.c == 3) {
            scale_image(im, scale);
            float sum = sum_array(im.data, im.w*im.h*im.c);
            l.biases[i] += sum*trans;
        }
    }
}

image *get_weights(convolutional_layer l)
{
    image *weights = (image *)xcalloc(l.n, sizeof(image));
    int i;
    for (i = 0; i < l.n; ++i) {
        weights[i] = copy_image(get_convolutional_weight(l, i));
        normalize_image(weights[i]);
        /*
        char buff[256];
        sprintf(buff, "filter%d", i);
        save_image(weights[i], buff);
        */
    }
    //error("hey");
    return weights;
}

image *visualize_convolutional_layer(convolutional_layer l, char *window, image *prev_weights)
{
    image *single_weights = get_weights(l);
    show_images(single_weights, l.n, window);

    image delta = get_convolutional_image(l);
    image dc = collapse_image_layers(delta, 1);
    char buff[256];
    sprintf(buff, "%s: Output", window);
    show_image(dc, buff);
    //save_image(dc, buff);
    free_image(dc);
    return single_weights;
}