1 #include "crnn_layer.h"
2 #include "convolutional_layer.h"
3 #include "utils.h"
4 #include "dark_cuda.h"
5 #include "blas.h"
6 #include "gemm.h"
7
8 #include <math.h>
9 #include <stdio.h>
10 #include <stdlib.h>
11 #include <string.h>
12
// Shifts the per-time-step buffer pointers held in *l by `steps` time steps
// (negative values rewind). One time step spans l->outputs * l->batch floats.
// Only the pointers stored in *l change; the buffers themselves are untouched.
static void increment_layer(layer *l, int steps)
{
    const int offset = l->outputs * l->batch * steps;

    // Host-side buffers.
    l->x_norm += offset;
    l->x      += offset;
    l->delta  += offset;
    l->output += offset;

#ifdef GPU
    // Device-side mirrors move by the same amount.
    l->x_norm_gpu += offset;
    l->x_gpu      += offset;
    l->delta_gpu  += offset;
    l->output_gpu += offset;
#endif
}
28
make_crnn_layer(int batch,int h,int w,int c,int hidden_filters,int output_filters,int groups,int steps,int size,int stride,int dilation,int pad,ACTIVATION activation,int batch_normalize,int xnor,int train)29 layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int output_filters, int groups, int steps, int size, int stride, int dilation, int pad, ACTIVATION activation, int batch_normalize, int xnor, int train)
30 {
31 fprintf(stderr, "CRNN Layer: %d x %d x %d image, %d filters\n", h,w,c,output_filters);
32 batch = batch / steps;
33 layer l = { (LAYER_TYPE)0 };
34 l.train = train;
35 l.batch = batch;
36 l.type = CRNN;
37 l.steps = steps;
38 l.size = size;
39 l.stride = stride;
40 l.dilation = dilation;
41 l.pad = pad;
42 l.h = h;
43 l.w = w;
44 l.c = c;
45 l.groups = groups;
46 l.out_c = output_filters;
47 l.inputs = h * w * c;
48 l.hidden = h * w * hidden_filters;
49 l.xnor = xnor;
50
51 l.state = (float*)xcalloc(l.hidden * l.batch * (l.steps + 1), sizeof(float));
52
53 l.input_layer = (layer*)xcalloc(1, sizeof(layer));
54 *(l.input_layer) = make_convolutional_layer(batch, steps, h, w, c, hidden_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0, 0, train);
55 l.input_layer->batch = batch;
56 if (l.workspace_size < l.input_layer->workspace_size) l.workspace_size = l.input_layer->workspace_size;
57
58 l.self_layer = (layer*)xcalloc(1, sizeof(layer));
59 *(l.self_layer) = make_convolutional_layer(batch, steps, h, w, hidden_filters, hidden_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0, 0, train);
60 l.self_layer->batch = batch;
61 if (l.workspace_size < l.self_layer->workspace_size) l.workspace_size = l.self_layer->workspace_size;
62
63 l.output_layer = (layer*)xcalloc(1, sizeof(layer));
64 *(l.output_layer) = make_convolutional_layer(batch, steps, h, w, hidden_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0, 0, train);
65 l.output_layer->batch = batch;
66 if (l.workspace_size < l.output_layer->workspace_size) l.workspace_size = l.output_layer->workspace_size;
67
68 l.out_h = l.output_layer->out_h;
69 l.out_w = l.output_layer->out_w;
70 l.outputs = l.output_layer->outputs;
71
72 assert(l.input_layer->outputs == l.self_layer->outputs);
73 assert(l.input_layer->outputs == l.output_layer->inputs);
74
75 l.output = l.output_layer->output;
76 l.delta = l.output_layer->delta;
77
78 l.forward = forward_crnn_layer;
79 l.backward = backward_crnn_layer;
80 l.update = update_crnn_layer;
81
82 #ifdef GPU
83 l.forward_gpu = forward_crnn_layer_gpu;
84 l.backward_gpu = backward_crnn_layer_gpu;
85 l.update_gpu = update_crnn_layer_gpu;
86 l.state_gpu = cuda_make_array(l.state, l.batch*l.hidden*(l.steps + 1));
87 l.output_gpu = l.output_layer->output_gpu;
88 l.delta_gpu = l.output_layer->delta_gpu;
89 #endif
90
91 l.bflops = l.input_layer->bflops + l.self_layer->bflops + l.output_layer->bflops;
92
93 return l;
94 }
95
// Resizes the CRNN layer to a new w x h input: resizes the three convolutional
// sub-layers, refreshes the cached geometry and the aliased output/delta
// pointers, and reallocates the hidden-state buffer(s) for the new size.
//
// The reallocated state is zero-filled. xrealloc() is assumed to behave like
// realloc(), which leaves any newly grown tail indeterminate, and values kept
// from the previous geometry would not line up with the new layout anyway.
// Without this, indeterminate floats could be handed to cuda_make_array() and
// later read as the initial recurrent state.
void resize_crnn_layer(layer *l, int w, int h)
{
    resize_convolutional_layer(l->input_layer, w, h);
    if (l->workspace_size < l->input_layer->workspace_size) l->workspace_size = l->input_layer->workspace_size;

    resize_convolutional_layer(l->self_layer, w, h);
    if (l->workspace_size < l->self_layer->workspace_size) l->workspace_size = l->self_layer->workspace_size;

    resize_convolutional_layer(l->output_layer, w, h);
    if (l->workspace_size < l->output_layer->workspace_size) l->workspace_size = l->output_layer->workspace_size;

    // The sub-layer resize may have replaced these buffers; re-alias them.
    l->output = l->output_layer->output;
    l->delta = l->output_layer->delta;

    int hidden_filters = l->self_layer->c;
    l->w = w;
    l->h = h;
    l->inputs = h * w * l->c;
    l->hidden = h * w * hidden_filters;

    l->out_h = l->output_layer->out_h;
    l->out_w = l->output_layer->out_w;
    l->outputs = l->output_layer->outputs;

    assert(l->input_layer->inputs == l->inputs);
    assert(l->self_layer->inputs == l->hidden);
    assert(l->input_layer->outputs == l->self_layer->outputs);
    assert(l->input_layer->outputs == l->output_layer->inputs);

    // steps + 1 slots of hidden state; computed in size_t so the product
    // cannot overflow int before being scaled by sizeof(float).
    const size_t state_count = (size_t)l->batch * (size_t)l->hidden * (size_t)(l->steps + 1);
    l->state = (float*)xrealloc(l->state, state_count * sizeof(float));
    memset(l->state, 0, state_count * sizeof(float));

#ifdef GPU
    if (l->state_gpu) cudaFree(l->state_gpu);
    l->state_gpu = cuda_make_array(l->state, state_count);

    l->output_gpu = l->output_layer->output_gpu;
    l->delta_gpu = l->output_layer->delta_gpu;
#endif
}
135
// Overwrites the first time step of the self (recurrent) sub-layer's output
// with uniform random values in [-1, 1] and, on GPU builds, pushes those
// values to the device copy.
//
// The element count comes from the self layer itself: the buffer written
// here is self_layer->output, whose per-step extent is
// self_layer->outputs * batch. Using the CRNN's own `outputs` (the output
// sub-layer's size) could run past the end of that buffer when the output
// layer is larger than the hidden layer.
void free_state_crnn(layer l)
{
    const int n = l.self_layer->outputs * l.batch;
    int i;
    for (i = 0; i < n; ++i) l.self_layer->output[i] = rand_uniform(-1, 1);

#ifdef GPU
    cuda_push_array(l.self_layer->output_gpu, l.self_layer->output, n);
#endif // GPU
}
145
// CPU weight update: forwards the same hyper-parameters to each of the three
// convolutional sub-layers, in input -> self -> output order.
void update_crnn_layer(layer l, int batch, float learning_rate, float momentum, float decay)
{
    layer *const sub_layers[] = { l.input_layer, l.self_layer, l.output_layer };
    const int count = (int)(sizeof(sub_layers) / sizeof(sub_layers[0]));
    int k;

    for (k = 0; k < count; ++k) {
        update_convolutional_layer(*sub_layers[k], batch, learning_rate, momentum, decay);
    }
}
152
// CPU forward pass over l.steps time steps.
//
// Per step:
//   1. the input convolution runs on the current slice of state.input,
//   2. the self convolution runs on the current hidden state,
//   3. the new hidden state = (previous state if l.shortcut, else 0)
//                             + input conv output + self conv output,
//   4. the output convolution runs on the new hidden state.
// When training, each step writes its hidden state into the next slot of
// l.state; otherwise the same slot is updated in place.
void forward_crnn_layer(layer l, network_state state)
{
    const int state_len = l.hidden * l.batch;   // floats in one hidden-state slot
    int t;

    // Local copies: their buffer pointers are advanced step by step below
    // without touching the layers stored in `l`.
    layer in_conv = *(l.input_layer);
    layer rec_conv = *(l.self_layer);
    layer out_conv = *(l.output_layer);

    network_state sub = {0};
    sub.train = state.train;
    sub.workspace = state.workspace;
    sub.net = state.net;
    //sub.index = state.index;

    if (state.train) {
        // Clear all deltas and the initial hidden state before accumulating.
        fill_cpu(l.outputs * l.batch * l.steps, 0, out_conv.delta, 1);
        fill_cpu(state_len * l.steps, 0, rec_conv.delta, 1);
        fill_cpu(state_len * l.steps, 0, in_conv.delta, 1);
        fill_cpu(state_len, 0, l.state, 1);
    }

    for (t = 0; t < l.steps; ++t) {
        sub.input = state.input;
        forward_convolutional_layer(in_conv, sub);

        sub.input = l.state;
        forward_convolutional_layer(rec_conv, sub);

        float *prev_state = l.state;
        if (state.train) l.state += state_len;

        if (!l.shortcut) {
            fill_cpu(state_len, 0, l.state, 1);
        } else {
            copy_cpu(state_len, prev_state, 1, l.state, 1);
        }
        axpy_cpu(state_len, 1, in_conv.output, 1, l.state, 1);
        axpy_cpu(state_len, 1, rec_conv.output, 1, l.state, 1);

        sub.input = l.state;
        forward_convolutional_layer(out_conv, sub);

        // Move on to the next time step's input and sub-layer buffers.
        state.input += l.inputs*l.batch;
        increment_layer(&in_conv, 1);
        increment_layer(&rec_conv, 1);
        increment_layer(&out_conv, 1);
    }
}
198
// CPU backward pass through time, walking the steps from last to first.
//
// Per step i:
//   1. back-propagate the output convolution; its input is the hidden state
//      stored in slot i+1 by the training forward pass, and its input
//      gradient is accumulated into self_layer.delta (the hidden-state delta),
//   2. hand that hidden-state delta to the input convolution,
//   3. back-propagate the self convolution into the previous step's
//      hidden-state delta (nowhere for i == 0),
//   4. back-propagate the input convolution into state.delta (if provided).
//
// This follows the same order as backward_crnn_layer_gpu:
//   * The hidden-state delta is copied to input_layer.delta BEFORE the self
//     layer's backward call. Input and self outputs are summed into the state,
//     so both must receive the same delta; backward_convolutional_layer
//     presumably rewrites its layer's delta in place (NOTE(review): confirm),
//     so copying afterwards would pass on an already-transformed delta.
//   * The stored hidden state is used as-is instead of being rebuilt as
//     input.output + self.output, which would drop the shortcut term.
void backward_crnn_layer(layer l, network_state state)
{
    network_state s = {0};
    s.train = state.train;
    s.workspace = state.workspace;
    s.net = state.net;
    //s.index = state.index;
    int i;
    layer input_layer = *(l.input_layer);
    layer self_layer = *(l.self_layer);
    layer output_layer = *(l.output_layer);
    const int state_len = l.hidden * l.batch;   // floats in one hidden-state slot

    // Start at the last time step of every sub-layer buffer.
    increment_layer(&input_layer, l.steps-1);
    increment_layer(&self_layer, l.steps-1);
    increment_layer(&output_layer, l.steps-1);

    // Slot `steps` holds the hidden state produced by the last step.
    l.state += state_len*l.steps;
    for (i = l.steps-1; i >= 0; --i) {
        s.input = l.state;
        s.delta = self_layer.delta;
        backward_convolutional_layer(output_layer, s);

        l.state -= state_len;

        // Same delta for the input and self layers; take the copy before the
        // self layer's backward call touches it.
        copy_cpu(state_len, self_layer.delta, 1, input_layer.delta, 1);

        s.input = l.state;
        s.delta = self_layer.delta - state_len;
        if (i == 0) s.delta = 0;   // no earlier step to receive a gradient
        backward_convolutional_layer(self_layer, s);

        if (i > 0 && l.shortcut) axpy_cpu(state_len, 1, self_layer.delta, 1, self_layer.delta - state_len, 1);
        s.input = state.input + i*l.inputs*l.batch;
        if(state.delta) s.delta = state.delta + i*l.inputs*l.batch;
        else s.delta = 0;
        backward_convolutional_layer(input_layer, s);

        increment_layer(&input_layer, -1);
        increment_layer(&self_layer, -1);
        increment_layer(&output_layer, -1);
    }
}
251
252 #ifdef GPU
253
// Calls pull_convolutional_layer on each of the three sub-layers,
// in input -> self -> output order.
void pull_crnn_layer(layer l)
{
    const layer *in_conv = l.input_layer;
    const layer *rec_conv = l.self_layer;
    const layer *out_conv = l.output_layer;

    pull_convolutional_layer(*in_conv);
    pull_convolutional_layer(*rec_conv);
    pull_convolutional_layer(*out_conv);
}
260
// Calls push_convolutional_layer on each of the three sub-layers,
// in input -> self -> output order.
void push_crnn_layer(layer l)
{
    const layer *in_conv = l.input_layer;
    const layer *rec_conv = l.self_layer;
    const layer *out_conv = l.output_layer;

    push_convolutional_layer(*in_conv);
    push_convolutional_layer(*rec_conv);
    push_convolutional_layer(*out_conv);
}
267
// GPU weight update: forwards the same hyper-parameters (including
// loss_scale) to each of the three convolutional sub-layers, in
// input -> self -> output order.
void update_crnn_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay, float loss_scale)
{
    layer *const sub_layers[] = { l.input_layer, l.self_layer, l.output_layer };
    const int count = (int)(sizeof(sub_layers) / sizeof(sub_layers[0]));
    int k;

    for (k = 0; k < count; ++k) {
        update_convolutional_layer_gpu(*sub_layers[k], batch, learning_rate, momentum, decay, loss_scale);
    }
}
274
// GPU forward pass over l.steps time steps; same recurrence as the CPU path
// but operating on the *_gpu buffers.
//
// Per step:
//   1. the input convolution runs on the current slice of state.input,
//   2. the self convolution runs on the current hidden state,
//   3. the new hidden state = (previous state if l.shortcut, else 0)
//                             + input conv output + self conv output,
//   4. the output convolution runs on the new hidden state.
// When training, each step writes its hidden state into the next slot of
// l.state_gpu; otherwise the same slot is updated in place.
//
// An earlier CUDNN_HALF path that converted the sub-layer weights to FP16
// before inference was disabled ("slow and bad for training") and is omitted.
void forward_crnn_layer_gpu(layer l, network_state state)
{
    const int state_len = l.hidden * l.batch;   // floats in one hidden-state slot
    int remaining = l.steps;

    // Local copies: their buffer pointers are advanced step by step below
    // without touching the layers stored in `l`.
    layer in_conv = *(l.input_layer);
    layer rec_conv = *(l.self_layer);
    layer out_conv = *(l.output_layer);

    network_state sub = {0};
    sub.train = state.train;
    sub.workspace = state.workspace;
    sub.net = state.net;
    if (!state.train) sub.index = state.index;  // don't use TC for training (especially without cuda_convert_f32_to_f16() )

    if (state.train) {
        // Clear all deltas and the initial hidden state before accumulating.
        fill_ongpu(l.outputs * l.batch * l.steps, 0, out_conv.delta_gpu, 1);
        fill_ongpu(state_len * l.steps, 0, rec_conv.delta_gpu, 1);
        fill_ongpu(state_len * l.steps, 0, in_conv.delta_gpu, 1);
        fill_ongpu(state_len, 0, l.state_gpu, 1);
    }

    while (remaining-- > 0) {
        sub.input = state.input;
        forward_convolutional_layer_gpu(in_conv, sub);

        sub.input = l.state_gpu;
        forward_convolutional_layer_gpu(rec_conv, sub);

        float *prev_state = l.state_gpu;
        if (state.train) l.state_gpu += state_len;

        if (!l.shortcut) {
            fill_ongpu(state_len, 0, l.state_gpu, 1);
        } else {
            copy_ongpu(state_len, prev_state, 1, l.state_gpu, 1);
        }
        axpy_ongpu(state_len, 1, in_conv.output_gpu, 1, l.state_gpu, 1);
        axpy_ongpu(state_len, 1, rec_conv.output_gpu, 1, l.state_gpu, 1);

        sub.input = l.state_gpu;
        forward_convolutional_layer_gpu(out_conv, sub);

        // Move on to the next time step's input and sub-layer buffers.
        state.input += l.inputs*l.batch;
        increment_layer(&in_conv, 1);
        increment_layer(&rec_conv, 1);
        increment_layer(&out_conv, 1);
    }
}
331
// GPU backward pass through time, walking the steps from last to first.
//
// Per step i:
//   1. back-propagate the output convolution; its input is the hidden state
//      stored in slot i+1 of l.state_gpu, and its input gradient goes into
//      self_layer.delta_gpu (the hidden-state delta),
//   2. copy that hidden-state delta to input_layer.delta_gpu (same delta for
//      the input and self layers),
//   3. back-propagate the self convolution into the previous step's
//      hidden-state delta (nowhere for i == 0),
//   4. back-propagate the input convolution into state.delta (if provided),
//   5. optionally scrub NaN/Inf from this step's deltas.
// Finally the first hidden-state slot is zeroed.
//
// The NaN/Inf scrub is sized by each sub-layer's `outputs`: a delta buffer
// holds outputs * batch floats per time step (that is the stride used by
// increment_layer and the size used when the deltas are cleared in the
// forward pass). Sizing it by `inputs` would run past the end of the buffer
// at the last step whenever inputs > outputs, and leave part of the delta
// unchecked whenever inputs < outputs.
void backward_crnn_layer_gpu(layer l, network_state state)
{
    network_state s = {0};
    s.train = state.train;
    s.workspace = state.workspace;
    s.net = state.net;
    //s.index = state.index;
    int i;
    layer input_layer = *(l.input_layer);
    layer self_layer = *(l.self_layer);
    layer output_layer = *(l.output_layer);

    // Start at the last time step of every sub-layer buffer.
    increment_layer(&input_layer, l.steps - 1);
    increment_layer(&self_layer, l.steps - 1);
    increment_layer(&output_layer, l.steps - 1);

    float *init_state_gpu = l.state_gpu;
    // Slot `steps` holds the hidden state produced by the last step.
    l.state_gpu += l.hidden*l.batch*l.steps;
    for (i = l.steps-1; i >= 0; --i) {
        //copy_ongpu(l.hidden * l.batch, input_layer.output_gpu, 1, l.state_gpu, 1);   // commented in RNN
        //axpy_ongpu(l.hidden * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1); // commented in RNN

        s.input = l.state_gpu;
        s.delta = self_layer.delta_gpu;
        backward_convolutional_layer_gpu(output_layer, s);

        l.state_gpu -= l.hidden*l.batch;

        // The same delta for the input and self layers.
        copy_ongpu(l.hidden*l.batch, self_layer.delta_gpu, 1, input_layer.delta_gpu, 1);

        s.input = l.state_gpu;
        s.delta = self_layer.delta_gpu - l.hidden*l.batch;
        if (i == 0) s.delta = 0;   // no earlier step to receive a gradient
        backward_convolutional_layer_gpu(self_layer, s);

        if (i > 0 && l.shortcut) axpy_ongpu(l.hidden*l.batch, 1, self_layer.delta_gpu, 1, self_layer.delta_gpu - l.hidden*l.batch, 1);
        s.input = state.input + i*l.inputs*l.batch;
        if(state.delta) s.delta = state.delta + i*l.inputs*l.batch;
        else s.delta = 0;
        backward_convolutional_layer_gpu(input_layer, s);

        if (state.net.try_fix_nan) {
            fix_nan_and_inf(output_layer.delta_gpu, output_layer.outputs * output_layer.batch);
            fix_nan_and_inf(self_layer.delta_gpu, self_layer.outputs * self_layer.batch);
            fix_nan_and_inf(input_layer.delta_gpu, input_layer.outputs * input_layer.batch);
        }

        increment_layer(&input_layer, -1);
        increment_layer(&self_layer, -1);
        increment_layer(&output_layer, -1);
    }
    fill_ongpu(l.hidden * l.batch, 0, init_state_gpu, 1);  //clean l.state_gpu
}
383 #endif
384