1 #include "local_layer.h"
2 #include "utils.h"
3 #include "im2col.h"
4 #include "col2im.h"
5 #include "blas.h"
6 #include "gemm.h"
7 #include <stdio.h>
8 #include <time.h>
9
/*
 * Returns the number of output rows for the layer.
 *
 * When l.pad is zero the kernel size is subtracted from the input height;
 * otherwise only 1 is subtracted. The remainder is divided by the stride
 * (integer division) and incremented by one.
 */
int local_out_height(local_layer l)
{
    const int span = l.pad ? (l.h - 1) : (l.h - l.size);
    return span / l.stride + 1;
}
17
/*
 * Returns the number of output columns for the layer.
 *
 * When l.pad is zero the kernel size is subtracted from the input width;
 * otherwise only 1 is subtracted. The remainder is divided by the stride
 * (integer division) and incremented by one.
 */
int local_out_width(local_layer l)
{
    const int span = l.pad ? (l.w - 1) : (l.w - l.size);
    return span / l.stride + 1;
}
25
make_local_layer(int batch,int h,int w,int c,int n,int size,int stride,int pad,ACTIVATION activation)26 local_layer make_local_layer(int batch, int h, int w, int c, int n, int size, int stride, int pad, ACTIVATION activation)
27 {
28 int i;
29 local_layer l = { (LAYER_TYPE)0 };
30 l.type = LOCAL;
31
32 l.h = h;
33 l.w = w;
34 l.c = c;
35 l.n = n;
36 l.batch = batch;
37 l.stride = stride;
38 l.size = size;
39 l.pad = pad;
40
41 int out_h = local_out_height(l);
42 int out_w = local_out_width(l);
43 int locations = out_h*out_w;
44 l.out_h = out_h;
45 l.out_w = out_w;
46 l.out_c = n;
47 l.outputs = l.out_h * l.out_w * l.out_c;
48 l.inputs = l.w * l.h * l.c;
49
50 l.weights = (float*)xcalloc(c * n * size * size * locations, sizeof(float));
51 l.weight_updates = (float*)xcalloc(c * n * size * size * locations, sizeof(float));
52
53 l.biases = (float*)xcalloc(l.outputs, sizeof(float));
54 l.bias_updates = (float*)xcalloc(l.outputs, sizeof(float));
55
56 // float scale = 1./sqrt(size*size*c);
57 float scale = sqrt(2./(size*size*c));
58 for(i = 0; i < c*n*size*size; ++i) l.weights[i] = scale*rand_uniform(-1,1);
59
60 l.col_image = (float*)xcalloc(out_h * out_w * size * size * c, sizeof(float));
61 l.output = (float*)xcalloc(l.batch * out_h * out_w * n, sizeof(float));
62 l.delta = (float*)xcalloc(l.batch * out_h * out_w * n, sizeof(float));
63
64 l.forward = forward_local_layer;
65 l.backward = backward_local_layer;
66 l.update = update_local_layer;
67
68 #ifdef GPU
69 l.forward_gpu = forward_local_layer_gpu;
70 l.backward_gpu = backward_local_layer_gpu;
71 l.update_gpu = update_local_layer_gpu;
72
73 l.weights_gpu = cuda_make_array(l.weights, c*n*size*size*locations);
74 l.weight_updates_gpu = cuda_make_array(l.weight_updates, c*n*size*size*locations);
75
76 l.biases_gpu = cuda_make_array(l.biases, l.outputs);
77 l.bias_updates_gpu = cuda_make_array(l.bias_updates, l.outputs);
78
79 l.col_image_gpu = cuda_make_array(l.col_image, out_h*out_w*size*size*c);
80 l.delta_gpu = cuda_make_array(l.delta, l.batch*out_h*out_w*n);
81 l.output_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
82
83 #endif
84 l.activation = activation;
85
86 fprintf(stderr, "Local Layer: %d x %d x %d image, %d filters -> %d x %d x %d image\n", h,w,c,n, out_h, out_w, n);
87
88 return l;
89 }
90
/*
 * CPU forward pass of the locally connected layer.
 *
 * For every sample the output is first seeded with the biases, then for each
 * output location one gemm call accumulates that location's filter bank
 * applied to the matching column of l.col_image. Finally the activation is
 * applied to the whole batch of outputs.
 *
 * NOTE(review): the gemm argument order is assumed to be
 * (TA, TB, M, N, K, ALPHA, A, lda, B, ldb, BETA, C, ldc) - confirm in gemm.h.
 */
void forward_local_layer(const local_layer l, network_state state)
{
    const int rows = local_out_height(l);
    const int cols = local_out_width(l);
    const int locations = rows * cols;
    const int patch = l.size*l.size*l.c;    /* inner (K) dimension of each product */
    int b, loc;

    /* Seed every sample's output with the bias vector. */
    for (b = 0; b < l.batch; ++b) {
        copy_cpu(l.outputs, l.biases, 1, l.output + b*l.outputs, 1);
    }

    for (b = 0; b < l.batch; ++b) {
        float *sample_in = state.input + b*l.w*l.h*l.c;
        float *sample_out = l.output + b*l.outputs;

        im2col_cpu(sample_in, l.c, l.h, l.w,
                l.size, l.stride, l.pad, l.col_image);

        loc = 0;
        while (loc < locations) {
            /* (l.n x patch) weights of this location times one column, strided by `locations`. */
            gemm(0, 0, l.n, 1, patch, 1,
                    l.weights + loc*patch*l.n, patch,
                    l.col_image + loc, locations,
                    1,
                    sample_out + loc, locations);
            ++loc;
        }
    }

    activate_array(l.output, l.outputs*l.batch, l.activation);
}
121
/*
 * CPU backward pass of the locally connected layer.
 *
 * Steps, in order:
 *   1. gradient_array is applied to l.delta using l.output and the activation.
 *   2. Each sample's delta is added into l.bias_updates.
 *   3. Per sample and per location, a gemm accumulates into that location's
 *      slice of l.weight_updates.
 *   4. If state.delta is non-null, a second gemm per location overwrites
 *      l.col_image (BETA = 0) and col2im_cpu writes into the sample's slice
 *      of state.delta.
 *
 * NOTE(review): the gemm argument order is assumed to be
 * (TA, TB, M, N, K, ALPHA, A, lda, B, ldb, BETA, C, ldc) - confirm in gemm.h.
 */
void backward_local_layer(local_layer l, network_state state)
{
    const int locations = l.out_w*l.out_h;
    const int patch = l.size*l.size*l.c;
    int b, loc;

    gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);

    for (b = 0; b < l.batch; ++b) {
        axpy_cpu(l.outputs, 1, l.delta + b*l.outputs, 1, l.bias_updates, 1);
    }

    for (b = 0; b < l.batch; ++b) {
        float *sample_in = state.input + b*l.w*l.h*l.c;
        float *sample_delta = l.delta + b*l.outputs;

        im2col_cpu(sample_in, l.c, l.h, l.w,
                l.size, l.stride, l.pad, l.col_image);

        /* Weight gradient: accumulate into each location's filter bank. */
        for (loc = 0; loc < locations; ++loc) {
            gemm(0, 1, l.n, patch, 1, 1,
                    sample_delta + loc, locations,
                    l.col_image + loc, locations,
                    1,
                    l.weight_updates + loc*patch*l.n, patch);
        }

        /* Nothing to propagate when the caller supplies no delta buffer. */
        if (!state.delta) continue;

        /* Input gradient: l.col_image is reused as scratch and overwritten. */
        for (loc = 0; loc < locations; ++loc) {
            gemm(1, 0, patch, 1, l.n, 1,
                    l.weights + loc*patch*l.n, patch,
                    sample_delta + loc, locations,
                    0,
                    l.col_image + loc, locations);
        }

        col2im_cpu(l.col_image, l.c, l.h, l.w, l.size, l.stride, l.pad, state.delta + b*l.c*l.h*l.w);
    }
}
166
/*
 * Applies the accumulated bias and weight updates on the CPU.
 *
 * Biases:  bias_updates is applied to biases with factor learning_rate/batch,
 *          then bias_updates is scaled by momentum.
 * Weights: weights are first folded into weight_updates with factor
 *          -decay*batch, then weight_updates is applied to weights with
 *          factor learning_rate/batch and scaled by momentum.
 *
 * NOTE(review): assumes axpy_cpu(N, ALPHA, X, INCX, Y, INCY) computes
 * Y += ALPHA*X and scal_cpu scales in place - confirm in blas.h.
 */
void update_local_layer(local_layer l, int batch, float learning_rate, float momentum, float decay)
{
    const int weight_count = l.size*l.size*l.c*l.n*(l.out_w*l.out_h);
    const float step = learning_rate/batch;

    axpy_cpu(l.outputs, step, l.bias_updates, 1, l.biases, 1);
    scal_cpu(l.outputs, momentum, l.bias_updates, 1);

    axpy_cpu(weight_count, -decay*batch, l.weights, 1, l.weight_updates, 1);
    axpy_cpu(weight_count, step, l.weight_updates, 1, l.weights, 1);
    scal_cpu(weight_count, momentum, l.weight_updates, 1);
}
178
179 #ifdef GPU
180
/*
 * GPU forward pass of the locally connected layer.
 *
 * Mirrors forward_local_layer using the *_gpu buffers and *_ongpu helpers:
 * outputs are seeded with the biases, one gemm_ongpu call per output location
 * accumulates into the output, and the activation is applied at the end.
 *
 * NOTE(review): state.input is presumably a device pointer here, and the
 * gemm_ongpu argument order is assumed to match gemm - confirm with callers.
 */
void forward_local_layer_gpu(const local_layer l, network_state state)
{
    const int rows = local_out_height(l);
    const int cols = local_out_width(l);
    const int locations = rows * cols;
    const int patch = l.size*l.size*l.c;    /* inner (K) dimension of each product */
    int b, loc;

    /* Seed every sample's output with the bias vector. */
    for (b = 0; b < l.batch; ++b) {
        copy_ongpu(l.outputs, l.biases_gpu, 1, l.output_gpu + b*l.outputs, 1);
    }

    for (b = 0; b < l.batch; ++b) {
        float *sample_in = state.input + b*l.w*l.h*l.c;
        float *sample_out = l.output_gpu + b*l.outputs;

        im2col_ongpu(sample_in, l.c, l.h, l.w,
                l.size, l.stride, l.pad, l.col_image_gpu);

        loc = 0;
        while (loc < locations) {
            gemm_ongpu(0, 0, l.n, 1, patch, 1,
                    l.weights_gpu + loc*patch*l.n, patch,
                    l.col_image_gpu + loc, locations,
                    1,
                    sample_out + loc, locations);
            ++loc;
        }
    }

    activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
}
211
/*
 * GPU backward pass of the locally connected layer.
 *
 * Mirrors backward_local_layer using the *_gpu buffers and *_ongpu helpers:
 * the activation gradient is applied to l.delta_gpu, each sample's delta is
 * added into l.bias_updates_gpu, per-location gemm_ongpu calls accumulate into
 * l.weight_updates_gpu, and, when state.delta is non-null, a second set of
 * gemm_ongpu calls overwrites l.col_image_gpu before col2im_ongpu writes into
 * the sample's slice of state.delta.
 *
 * NOTE(review): state.input/state.delta are presumably device pointers here,
 * and the gemm_ongpu argument order is assumed to match gemm - confirm.
 */
void backward_local_layer_gpu(local_layer l, network_state state)
{
    const int locations = l.out_w*l.out_h;
    const int patch = l.size*l.size*l.c;
    int b, loc;

    gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);

    for (b = 0; b < l.batch; ++b) {
        axpy_ongpu(l.outputs, 1, l.delta_gpu + b*l.outputs, 1, l.bias_updates_gpu, 1);
    }

    for (b = 0; b < l.batch; ++b) {
        float *sample_in = state.input + b*l.w*l.h*l.c;
        float *sample_delta = l.delta_gpu + b*l.outputs;

        im2col_ongpu(sample_in, l.c, l.h, l.w,
                l.size, l.stride, l.pad, l.col_image_gpu);

        /* Weight gradient: accumulate into each location's filter bank. */
        for (loc = 0; loc < locations; ++loc) {
            gemm_ongpu(0, 1, l.n, patch, 1, 1,
                    sample_delta + loc, locations,
                    l.col_image_gpu + loc, locations,
                    1,
                    l.weight_updates_gpu + loc*patch*l.n, patch);
        }

        /* Nothing to propagate when the caller supplies no delta buffer. */
        if (!state.delta) continue;

        /* Input gradient: l.col_image_gpu is reused as scratch and overwritten. */
        for (loc = 0; loc < locations; ++loc) {
            gemm_ongpu(1, 0, patch, 1, l.n, 1,
                    l.weights_gpu + loc*patch*l.n, patch,
                    sample_delta + loc, locations,
                    0,
                    l.col_image_gpu + loc, locations);
        }

        col2im_ongpu(l.col_image_gpu, l.c, l.h, l.w, l.size, l.stride, l.pad, state.delta + b*l.c*l.h*l.w);
    }
}
255
/*
 * Applies the accumulated bias and weight updates on the GPU buffers.
 *
 * Same sequence as update_local_layer, using axpy_ongpu/scal_ongpu on the
 * *_gpu arrays. The loss_scale parameter is accepted but not used by this
 * function.
 *
 * NOTE(review): assumes axpy_ongpu(N, ALPHA, X, INCX, Y, INCY) computes
 * Y += ALPHA*X and scal_ongpu scales in place - confirm in blas.h.
 */
void update_local_layer_gpu(local_layer l, int batch, float learning_rate, float momentum, float decay, float loss_scale)
{
    const int weight_count = l.size*l.size*l.c*l.n*(l.out_w*l.out_h);
    const float step = learning_rate/batch;

    axpy_ongpu(l.outputs, step, l.bias_updates_gpu, 1, l.biases_gpu, 1);
    scal_ongpu(l.outputs, momentum, l.bias_updates_gpu, 1);

    axpy_ongpu(weight_count, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
    axpy_ongpu(weight_count, step, l.weight_updates_gpu, 1, l.weights_gpu, 1);
    scal_ongpu(weight_count, momentum, l.weight_updates_gpu, 1);
}
267
/*
 * Calls cuda_pull_array for the layer's weights (all locations) and biases,
 * pairing each *_gpu buffer with its host counterpart.
 *
 * NOTE(review): presumably a device-to-host copy - confirm in the CUDA helpers.
 */
void pull_local_layer(local_layer l)
{
    const int weight_count = l.size*l.size*l.c*l.n*(l.out_w*l.out_h);

    cuda_pull_array(l.weights_gpu, l.weights, weight_count);
    cuda_pull_array(l.biases_gpu, l.biases, l.outputs);
}
275
/*
 * Calls cuda_push_array for the layer's weights (all locations) and biases,
 * pairing each *_gpu buffer with its host counterpart.
 *
 * NOTE(review): presumably a host-to-device copy - confirm in the CUDA helpers.
 */
void push_local_layer(local_layer l)
{
    const int weight_count = l.size*l.size*l.c*l.n*(l.out_w*l.out_h);

    cuda_push_array(l.weights_gpu, l.weights, weight_count);
    cuda_push_array(l.biases_gpu, l.biases, l.outputs);
}
283 #endif
284