1 #include "yolo_layer.h"
2 #include "activations.h"
3 #include "blas.h"
4 #include "box.h"
5 #include "dark_cuda.h"
6 #include "utils.h"
7 
8 #include <math.h>
9 #include <stdio.h>
10 #include <assert.h>
11 #include <string.h>
12 #include <stdlib.h>
13 
14 extern int check_mistakes;
15 
make_yolo_layer(int batch,int w,int h,int n,int total,int * mask,int classes,int max_boxes)16 layer make_yolo_layer(int batch, int w, int h, int n, int total, int *mask, int classes, int max_boxes)
17 {
18     int i;
19     layer l = { (LAYER_TYPE)0 };
20     l.type = YOLO;
21 
22     l.n = n;
23     l.total = total;
24     l.batch = batch;
25     l.h = h;
26     l.w = w;
27     l.c = n*(classes + 4 + 1);
28     l.out_w = l.w;
29     l.out_h = l.h;
30     l.out_c = l.c;
31     l.classes = classes;
32     l.cost = (float*)xcalloc(1, sizeof(float));
33     l.biases = (float*)xcalloc(total * 2, sizeof(float));
34     if(mask) l.mask = mask;
35     else{
36         l.mask = (int*)xcalloc(n, sizeof(int));
37         for(i = 0; i < n; ++i){
38             l.mask[i] = i;
39         }
40     }
41     l.bias_updates = (float*)xcalloc(n * 2, sizeof(float));
42     l.outputs = h*w*n*(classes + 4 + 1);
43     l.inputs = l.outputs;
44     l.max_boxes = max_boxes;
45     l.truths = l.max_boxes*(4 + 1);    // 90*(4 + 1);
46     l.delta = (float*)xcalloc(batch * l.outputs, sizeof(float));
47     l.output = (float*)xcalloc(batch * l.outputs, sizeof(float));
48     for(i = 0; i < total*2; ++i){
49         l.biases[i] = .5;
50     }
51 
52     l.forward = forward_yolo_layer;
53     l.backward = backward_yolo_layer;
54 #ifdef GPU
55     l.forward_gpu = forward_yolo_layer_gpu;
56     l.backward_gpu = backward_yolo_layer_gpu;
57     l.output_gpu = cuda_make_array(l.output, batch*l.outputs);
58     l.output_avg_gpu = cuda_make_array(l.output, batch*l.outputs);
59     l.delta_gpu = cuda_make_array(l.delta, batch*l.outputs);
60 
61     free(l.output);
62     if (cudaSuccess == cudaHostAlloc(&l.output, batch*l.outputs*sizeof(float), cudaHostRegisterMapped)) l.output_pinned = 1;
63     else {
64         cudaGetLastError(); // reset CUDA-error
65         l.output = (float*)xcalloc(batch * l.outputs, sizeof(float));
66     }
67 
68     free(l.delta);
69     if (cudaSuccess == cudaHostAlloc(&l.delta, batch*l.outputs*sizeof(float), cudaHostRegisterMapped)) l.delta_pinned = 1;
70     else {
71         cudaGetLastError(); // reset CUDA-error
72         l.delta = (float*)xcalloc(batch * l.outputs, sizeof(float));
73     }
74 #endif
75 
76     fprintf(stderr, "yolo\n");
77     srand(time(0));
78 
79     return l;
80 }
81 
/*
 * Resizes an existing YOLO layer to a new grid of w x h cells.
 *
 * Recomputes l->outputs / l->inputs and reallocates the host output and delta
 * buffers. In GPU builds pinned host buffers are released and re-requested
 * (falling back to zeroed heap memory, clearing the *_pinned flag), and the
 * device buffers are freed and recreated at the new size.
 *
 * NOTE(review): l->out_w / l->out_h are set in make_yolo_layer but are not
 * refreshed here - confirm that no caller relies on them after a resize.
 */
void resize_yolo_layer(layer *l, int w, int h)
{
    l->w = w;
    l->h = h;

    l->outputs = h*w*l->n*(l->classes + 4 + 1);
    l->inputs = l->outputs;

    // Pageable buffers can simply be grown/shrunk in place.
    if (!l->output_pinned) l->output = (float*)xrealloc(l->output, l->batch*l->outputs * sizeof(float));
    if (!l->delta_pinned) l->delta = (float*)xrealloc(l->delta, l->batch*l->outputs*sizeof(float));

#ifdef GPU
    // Pinned buffers cannot be realloc'ed: free and allocate again.
    // cudaHostAlloc expects a void** and a cudaHostAlloc* flag.
    const size_t host_bytes = (size_t)l->batch * l->outputs * sizeof(float);

    if (l->output_pinned) {
        CHECK_CUDA(cudaFreeHost(l->output));
        l->output = NULL;
        if (cudaSuccess != cudaHostAlloc((void**)&l->output, host_bytes, cudaHostAllocMapped)) {
            cudaGetLastError(); // reset CUDA-error
            l->output = (float*)xcalloc(l->batch * l->outputs, sizeof(float));
            l->output_pinned = 0;
        }
    }

    if (l->delta_pinned) {
        CHECK_CUDA(cudaFreeHost(l->delta));
        l->delta = NULL;
        if (cudaSuccess != cudaHostAlloc((void**)&l->delta, host_bytes, cudaHostAllocMapped)) {
            cudaGetLastError(); // reset CUDA-error
            l->delta = (float*)xcalloc(l->batch * l->outputs, sizeof(float));
            l->delta_pinned = 0;
        }
    }

    cuda_free(l->delta_gpu);
    cuda_free(l->output_gpu);
    cuda_free(l->output_avg_gpu);

    l->delta_gpu =     cuda_make_array(l->delta, l->batch*l->outputs);
    l->output_gpu =    cuda_make_array(l->output, l->batch*l->outputs);
    l->output_avg_gpu = cuda_make_array(l->output, l->batch*l->outputs);
#endif
}
121 
get_yolo_box(float * x,float * biases,int n,int index,int i,int j,int lw,int lh,int w,int h,int stride)122 box get_yolo_box(float *x, float *biases, int n, int index, int i, int j, int lw, int lh, int w, int h, int stride)
123 {
124     box b;
125     // ln - natural logarithm (base = e)
126     // x` = t.x * lw - i;   // x = ln(x`/(1-x`))   // x - output of previous conv-layer
127     // y` = t.y * lh - i;   // y = ln(y`/(1-y`))   // y - output of previous conv-layer
128                             // w = ln(t.w * net.w / anchors_w); // w - output of previous conv-layer
129                             // h = ln(t.h * net.h / anchors_h); // h - output of previous conv-layer
130     b.x = (i + x[index + 0*stride]) / lw;
131     b.y = (j + x[index + 1*stride]) / lh;
132     b.w = exp(x[index + 2*stride]) * biases[2*n]   / w;
133     b.h = exp(x[index + 3*stride]) * biases[2*n+1] / h;
134     return b;
135 }
136 
/* Returns val unchanged when it is finite; NaN and +/-infinity become 0. */
static inline float fix_nan_inf(float val)
{
    return isfinite(val) ? val : 0;
}
142 
/*
 * Limits val to the range [-max_val, max_val].
 * The upper bound is tested first; values inside the range (and NaN, for
 * which both comparisons are false) are returned unchanged.
 */
static inline float clip_value(float val, const float max_val)
{
    if (val > max_val) return max_val;
    if (val < -max_val) return -max_val;
    return val;
}
155 
delta_yolo_box(box truth,float * x,float * biases,int n,int index,int i,int j,int lw,int lh,int w,int h,float * delta,float scale,int stride,float iou_normalizer,IOU_LOSS iou_loss,int accumulate,float max_delta)156 ious delta_yolo_box(box truth, float *x, float *biases, int n, int index, int i, int j, int lw, int lh, int w, int h, float *delta, float scale, int stride, float iou_normalizer, IOU_LOSS iou_loss, int accumulate, float max_delta)
157 {
158     ious all_ious = { 0 };
159     // i - step in layer width
160     // j - step in layer height
161     //  Returns a box in absolute coordinates
162     box pred = get_yolo_box(x, biases, n, index, i, j, lw, lh, w, h, stride);
163     all_ious.iou = box_iou(pred, truth);
164     all_ious.giou = box_giou(pred, truth);
165     all_ious.diou = box_diou(pred, truth);
166     all_ious.ciou = box_ciou(pred, truth);
167     // avoid nan in dx_box_iou
168     if (pred.w == 0) { pred.w = 1.0; }
169     if (pred.h == 0) { pred.h = 1.0; }
170     if (iou_loss == MSE)    // old loss
171     {
172         float tx = (truth.x*lw - i);
173         float ty = (truth.y*lh - j);
174         float tw = log(truth.w*w / biases[2 * n]);
175         float th = log(truth.h*h / biases[2 * n + 1]);
176 
177         //printf(" tx = %f, ty = %f, tw = %f, th = %f \n", tx, ty, tw, th);
178         //printf(" x = %f, y = %f, w = %f, h = %f \n", x[index + 0 * stride], x[index + 1 * stride], x[index + 2 * stride], x[index + 3 * stride]);
179 
180         // accumulate delta
181         delta[index + 0 * stride] += scale * (tx - x[index + 0 * stride]) * iou_normalizer;
182         delta[index + 1 * stride] += scale * (ty - x[index + 1 * stride]) * iou_normalizer;
183         delta[index + 2 * stride] += scale * (tw - x[index + 2 * stride]) * iou_normalizer;
184         delta[index + 3 * stride] += scale * (th - x[index + 3 * stride]) * iou_normalizer;
185     }
186     else {
187         // https://github.com/generalized-iou/g-darknet
188         // https://arxiv.org/abs/1902.09630v2
189         // https://giou.stanford.edu/
190         all_ious.dx_iou = dx_box_iou(pred, truth, iou_loss);
191 
192         // jacobian^t (transpose)
193         //float dx = (all_ious.dx_iou.dl + all_ious.dx_iou.dr);
194         //float dy = (all_ious.dx_iou.dt + all_ious.dx_iou.db);
195         //float dw = ((-0.5 * all_ious.dx_iou.dl) + (0.5 * all_ious.dx_iou.dr));
196         //float dh = ((-0.5 * all_ious.dx_iou.dt) + (0.5 * all_ious.dx_iou.db));
197 
198         // jacobian^t (transpose)
199         float dx = all_ious.dx_iou.dt;
200         float dy = all_ious.dx_iou.db;
201         float dw = all_ious.dx_iou.dl;
202         float dh = all_ious.dx_iou.dr;
203 
204         // predict exponential, apply gradient of e^delta_t ONLY for w,h
205         dw *= exp(x[index + 2 * stride]);
206         dh *= exp(x[index + 3 * stride]);
207 
208         // normalize iou weight
209         dx *= iou_normalizer;
210         dy *= iou_normalizer;
211         dw *= iou_normalizer;
212         dh *= iou_normalizer;
213 
214 
215         dx = fix_nan_inf(dx);
216         dy = fix_nan_inf(dy);
217         dw = fix_nan_inf(dw);
218         dh = fix_nan_inf(dh);
219 
220         if (max_delta != FLT_MAX) {
221             dx = clip_value(dx, max_delta);
222             dy = clip_value(dy, max_delta);
223             dw = clip_value(dw, max_delta);
224             dh = clip_value(dh, max_delta);
225         }
226 
227 
228         if (!accumulate) {
229             delta[index + 0 * stride] = 0;
230             delta[index + 1 * stride] = 0;
231             delta[index + 2 * stride] = 0;
232             delta[index + 3 * stride] = 0;
233         }
234 
235         // accumulate delta
236         delta[index + 0 * stride] += dx;
237         delta[index + 1 * stride] += dy;
238         delta[index + 2 * stride] += dw;
239         delta[index + 3 * stride] += dh;
240     }
241 
242     return all_ious;
243 }
244 
/*
 * Divides the four box deltas of one prediction by the number of classes
 * whose class delta is strictly positive.
 *
 *   class_index - index of the first class delta; class c is at
 *                 delta[class_index + stride*c]
 *   box_index   - index of the first box delta; entry k is at
 *                 delta[box_index + k*stride], k = 0..3
 *
 * Nothing is changed when no class delta is positive.
 */
void averages_yolo_deltas(int class_index, int box_index, int stride, int classes, float *delta)
{
    int positives = 0;
    int k;

    for (k = 0; k < classes; ++k) {
        positives += (delta[class_index + stride*k] > 0) ? 1 : 0;
    }
    if (positives <= 0) return;

    for (k = 0; k < 4; ++k) {
        delta[box_index + k * stride] /= positives;
    }
}
261 
delta_yolo_class(float * output,float * delta,int index,int class_id,int classes,int stride,float * avg_cat,int focal_loss,float label_smooth_eps,float * classes_multipliers)262 void delta_yolo_class(float *output, float *delta, int index, int class_id, int classes, int stride, float *avg_cat, int focal_loss, float label_smooth_eps, float *classes_multipliers)
263 {
264     int n;
265     if (delta[index + stride*class_id]){
266         float y_true = 1;
267         if(label_smooth_eps) y_true = y_true *  (1 - label_smooth_eps) + 0.5*label_smooth_eps;
268         float result_delta = y_true - output[index + stride*class_id];
269         if(!isnan(result_delta) && !isinf(result_delta)) delta[index + stride*class_id] = result_delta;
270         //delta[index + stride*class_id] = 1 - output[index + stride*class_id];
271 
272         if (classes_multipliers) delta[index + stride*class_id] *= classes_multipliers[class_id];
273         if(avg_cat) *avg_cat += output[index + stride*class_id];
274         return;
275     }
276     // Focal loss
277     if (focal_loss) {
278         // Focal Loss
279         float alpha = 0.5;    // 0.25 or 0.5
280         //float gamma = 2;    // hardcoded in many places of the grad-formula
281 
282         int ti = index + stride*class_id;
283         float pt = output[ti] + 0.000000000000001F;
284         // http://fooplot.com/#W3sidHlwZSI6MCwiZXEiOiItKDEteCkqKDIqeCpsb2coeCkreC0xKSIsImNvbG9yIjoiIzAwMDAwMCJ9LHsidHlwZSI6MTAwMH1d
285         float grad = -(1 - pt) * (2 * pt*logf(pt) + pt - 1);    // http://blog.csdn.net/linmingan/article/details/77885832
286         //float grad = (1 - pt) * (2 * pt*logf(pt) + pt - 1);    // https://github.com/unsky/focal-loss
287 
288         for (n = 0; n < classes; ++n) {
289             delta[index + stride*n] = (((n == class_id) ? 1 : 0) - output[index + stride*n]);
290 
291             delta[index + stride*n] *= alpha*grad;
292 
293             if (n == class_id && avg_cat) *avg_cat += output[index + stride*n];
294         }
295     }
296     else {
297         // default
298         for (n = 0; n < classes; ++n) {
299             float y_true = ((n == class_id) ? 1 : 0);
300             if (label_smooth_eps) y_true = y_true *  (1 - label_smooth_eps) + 0.5*label_smooth_eps;
301             float result_delta = y_true - output[index + stride*n];
302             if (!isnan(result_delta) && !isinf(result_delta)) delta[index + stride*n] = result_delta;
303 
304             if (classes_multipliers && n == class_id) delta[index + stride*class_id] *= classes_multipliers[class_id];
305             if (n == class_id && avg_cat) *avg_cat += output[index + stride*n];
306         }
307     }
308 }
309 
/*
 * Returns 1 when at least one class score of the prediction exceeds
 * conf_thresh, otherwise 0. Class j is read from output[class_index + stride*j].
 * The objectness and class_id arguments are accepted but not used.
 */
int compare_yolo_class(float *output, int classes, int class_index, int stride, float objectness, int class_id, float conf_thresh)
{
    int found = 0;
    int c = 0;

    (void)objectness;
    (void)class_id;

    // Stop scanning at the first score above the threshold.
    while (c < classes && !found) {
        found = (output[class_index + stride*c] > conf_thresh) ? 1 : 0;
        ++c;
    }
    return found;
}
322 
/*
 * Computes the flat index into the layer's output/delta buffers.
 *
 *   batch    - image within the batch
 *   location - anchor_slot * (l.w*l.h) + cell offset inside the grid
 *   entry    - channel within the anchor: 0..3 box values, 4 objectness,
 *              5.. class scores (as used by the callers in this file)
 */
static int entry_index(layer l, int batch, int location, int entry)
{
    const int plane = l.w*l.h;                      // cells per channel
    const int anchor = location / plane;
    const int cell = location % plane;
    const int per_anchor = plane*(4 + l.classes + 1);

    return batch*l.outputs + anchor*per_anchor + entry*plane + cell;
}
329 
/*
 * Forward pass of the YOLO layer (host side).
 *
 * Copies state.input into l.output, applies the logistic activations in
 * CPU-only builds, zeroes l.delta and - when state.train is set - fills
 * l.delta with the training gradients and l.cost with the loss:
 *
 *   1. For every prediction: penalise objectness, unless the prediction
 *      overlaps a ground-truth box by more than l.ignore_thresh; predictions
 *      above l.truth_thresh are trained as positives.
 *   2. For every ground-truth box: pick the best matching bias pair by shape
 *      and, if this layer owns it (l.mask), train the prediction in the
 *      truth's grid cell. Other owned bias pairs whose shape IoU exceeds
 *      l.iou_thresh are trained as well.
 *   3. Average the accumulated box deltas per prediction.
 *   4. Compute the loss terms and print a statistics line to stderr.
 *
 * When state.train is not set the function returns right after zeroing delta.
 */
void forward_yolo_layer(const layer l, network_state state)
{
    int i, j, b, t, n;
    memcpy(l.output, state.input, l.outputs*l.batch * sizeof(float));

#ifndef GPU
    // CPU-only build: the activations are applied here.
    for (b = 0; b < l.batch; ++b) {
        for (n = 0; n < l.n; ++n) {
            int index = entry_index(l, b, n*l.w*l.h, 0);
            activate_array(l.output + index, 2 * l.w*l.h, LOGISTIC);        // x,y,
            scal_add_cpu(2 * l.w*l.h, l.scale_x_y, -0.5*(l.scale_x_y - 1), l.output + index, 1);    // scale x,y
            index = entry_index(l, b, n*l.w*l.h, 4);
            activate_array(l.output + index, (1 + l.classes)*l.w*l.h, LOGISTIC);    // objectness + classes
        }
    }
#endif

    // delta is zeroed
    memset(l.delta, 0, l.outputs * l.batch * sizeof(float));
    if (!state.train) return;

    float tot_iou = 0;
    float tot_giou = 0;
    float tot_diou = 0;
    float tot_ciou = 0;
    float tot_iou_loss = 0;
    float tot_giou_loss = 0;
    float tot_diou_loss = 0;
    float tot_ciou_loss = 0;
    float recall = 0;
    float recall75 = 0;
    float avg_cat = 0;
    float avg_obj = 0;
    float avg_anyobj = 0;
    int count = 0;
    int class_count = 0;
    *(l.cost) = 0;
    for (b = 0; b < l.batch; ++b) {
        // ---- 1. per-prediction pass ----
        for (j = 0; j < l.h; ++j) {
            for (i = 0; i < l.w; ++i) {
                for (n = 0; n < l.n; ++n) {
                    const int class_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4 + 1);
                    const int obj_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4);
                    const int box_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 0);
                    const int stride = l.w*l.h;
                    box pred = get_yolo_box(l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.w*l.h);
                    float best_match_iou = 0;   // best IoU among truths for which a class score passes the threshold
                    int best_match_t = 0;
                    float best_iou = 0;         // best IoU among all truths
                    int best_t = 0;
                    for (t = 0; t < l.max_boxes; ++t) {
                        box truth = float_to_box_stride(state.truth + t*(4 + 1) + b*l.truths, 1);
                        int class_id = state.truth[t*(4 + 1) + b*l.truths + 4];
                        if (class_id >= l.classes || class_id < 0) {
                            printf("\n Warning: in txt-labels class_id=%d >= classes=%d in cfg-file. In txt-labels class_id should be [from 0 to %d] \n", class_id, l.classes, l.classes - 1);
                            printf("\n truth.x = %f, truth.y = %f, truth.w = %f, truth.h = %f, class_id = %d \n", truth.x, truth.y, truth.w, truth.h, class_id);
                            if (check_mistakes) getchar();
                            continue; // if label contains class_id more than number of classes in the cfg-file and class_id check garbage value
                        }
                        if (!truth.x) break;  // a zero x marks the end of the truth list

                        float objectness = l.output[obj_index];
                        if (isnan(objectness) || isinf(objectness)) l.output[obj_index] = 0;
                        int class_id_match = compare_yolo_class(l.output, l.classes, class_index, l.w*l.h, objectness, class_id, 0.25f);

                        float iou = box_iou(pred, truth);
                        if (iou > best_match_iou && class_id_match == 1) {
                            best_match_iou = iou;
                            best_match_t = t;
                        }
                        if (iou > best_iou) {
                            best_iou = iou;
                            best_t = t;
                        }
                    }

                    // Default: treat the prediction as background.
                    avg_anyobj += l.output[obj_index];
                    l.delta[obj_index] = l.cls_normalizer * (0 - l.output[obj_index]);
                    if (best_match_iou > l.ignore_thresh) {
                        const float iou_multiplier = best_match_iou*best_match_iou;// (best_match_iou - l.ignore_thresh) / (1.0 - l.ignore_thresh);
                        if (l.objectness_smooth) {
                            l.delta[obj_index] = l.cls_normalizer * (iou_multiplier - l.output[obj_index]);

                            int class_id = state.truth[best_match_t*(4 + 1) + b*l.truths + 4];
                            if (l.map) class_id = l.map[class_id];
                            const float class_multiplier = (l.classes_multipliers) ? l.classes_multipliers[class_id] : 1.0f;
                            l.delta[class_index + stride*class_id] = class_multiplier * (iou_multiplier - l.output[class_index + stride*class_id]);
                        }
                        else l.delta[obj_index] = 0;    // overlaps a truth well enough: do not penalise
                    }
                    else if (state.net.adversarial) {
                        float scale = pred.w * pred.h;
                        if (scale > 0) scale = sqrt(scale);
                        l.delta[obj_index] = scale * l.cls_normalizer * (0 - l.output[obj_index]);
                        int cl_id;
                        for (cl_id = 0; cl_id < l.classes; ++cl_id) {
                            if(l.output[class_index + stride*cl_id] * l.output[obj_index] > 0.25)
                                l.delta[class_index + stride*cl_id] = scale * (0 - l.output[class_index + stride*cl_id]);
                        }
                    }
                    if (best_iou > l.truth_thresh) {
                        const float iou_multiplier = best_iou*best_iou;// (best_iou - l.truth_thresh) / (1.0 - l.truth_thresh);
                        if (l.objectness_smooth) l.delta[obj_index] = l.cls_normalizer * (iou_multiplier - l.output[obj_index]);
                        else l.delta[obj_index] = l.cls_normalizer * (1 - l.output[obj_index]);

                        int class_id = state.truth[best_t*(4 + 1) + b*l.truths + 4];
                        if (l.map) class_id = l.map[class_id];
                        delta_yolo_class(l.output, l.delta, class_index, class_id, l.classes, l.w*l.h, 0, l.focal_loss, l.label_smooth_eps, l.classes_multipliers);
                        const float class_multiplier = (l.classes_multipliers) ? l.classes_multipliers[class_id] : 1.0f;
                        if (l.objectness_smooth) l.delta[class_index + stride*class_id] = class_multiplier * (iou_multiplier - l.output[class_index + stride*class_id]);
                        box truth = float_to_box_stride(state.truth + best_t*(4 + 1) + b*l.truths, 1);
                        delta_yolo_box(truth, l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2 - truth.w*truth.h), l.w*l.h, l.iou_normalizer * class_multiplier, l.iou_loss, 1, l.max_delta);
                    }
                }
            }
        }

        // ---- 2. per-ground-truth pass ----
        for (t = 0; t < l.max_boxes; ++t) {
            box truth = float_to_box_stride(state.truth + t*(4 + 1) + b*l.truths, 1);
            if (truth.x < 0 || truth.y < 0 || truth.x > 1 || truth.y > 1 || truth.w < 0 || truth.h < 0) {
                char buff[256];
                printf(" Wrong label: truth.x = %f, truth.y = %f, truth.w = %f, truth.h = %f \n", truth.x, truth.y, truth.w, truth.h);
                // snprintf: "%f" of a garbage label can be dozens of characters long,
                // which would overflow the buffer with plain sprintf.
                snprintf(buff, sizeof(buff), "echo \"Wrong label: truth.x = %f, truth.y = %f, truth.w = %f, truth.h = %f\" >> bad_label.list",
                    truth.x, truth.y, truth.w, truth.h);
                system(buff);
            }
            int class_id = state.truth[t*(4 + 1) + b*l.truths + 4];
            if (class_id >= l.classes || class_id < 0) continue; // if label contains class_id more than number of classes in the cfg-file and class_id check garbage value

            if (!truth.x) break;  // a zero x marks the end of the truth list
            float best_iou = 0;
            int best_n = 0;
            // Grid cell that contains the truth centre.
            i = (truth.x * l.w);
            j = (truth.y * l.h);
            // Keep the cell inside the grid: truth.x == 1 (or a bad label reported
            // above) would otherwise index outside this prediction's slice of
            // l.output / l.delta.
            if (i < 0) i = 0; else if (i >= l.w) i = l.w - 1;
            if (j < 0) j = 0; else if (j >= l.h) j = l.h - 1;

            // Compare shapes only: both boxes are centred at the origin.
            box truth_shift = truth;
            truth_shift.x = truth_shift.y = 0;
            for (n = 0; n < l.total; ++n) {
                box pred = { 0 };
                pred.w = l.biases[2 * n] / state.net.w;
                pred.h = l.biases[2 * n + 1] / state.net.h;
                float iou = box_iou(pred, truth_shift);
                if (iou > best_iou) {
                    best_iou = iou;
                    best_n = n;
                }
            }

            // Train the best bias pair only if this layer owns it.
            int mask_n = int_index(l.mask, best_n, l.n);
            if (mask_n >= 0) {
                int class_id = state.truth[t*(4 + 1) + b*l.truths + 4];
                if (l.map) class_id = l.map[class_id];

                int box_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 0);
                const float class_multiplier = (l.classes_multipliers) ? l.classes_multipliers[class_id] : 1.0f;
                ious all_ious = delta_yolo_box(truth, l.output, l.biases, best_n, box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2 - truth.w*truth.h), l.w*l.h, l.iou_normalizer * class_multiplier, l.iou_loss, 1, l.max_delta);

                // range is 0 <= 1
                tot_iou += all_ious.iou;
                tot_iou_loss += 1 - all_ious.iou;
                // range is -1 <= giou <= 1
                tot_giou += all_ious.giou;
                tot_giou_loss += 1 - all_ious.giou;

                tot_diou += all_ious.diou;
                tot_diou_loss += 1 - all_ious.diou;

                tot_ciou += all_ious.ciou;
                tot_ciou_loss += 1 - all_ious.ciou;

                int obj_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 4);
                avg_obj += l.output[obj_index];
                l.delta[obj_index] = class_multiplier * l.cls_normalizer * (1 - l.output[obj_index]);

                int class_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 4 + 1);
                delta_yolo_class(l.output, l.delta, class_index, class_id, l.classes, l.w*l.h, &avg_cat, l.focal_loss, l.label_smooth_eps, l.classes_multipliers);

                ++count;
                ++class_count;
                if (all_ious.iou > .5) recall += 1;
                if (all_ious.iou > .75) recall75 += 1;
            }

            // iou_thresh: also train every other owned bias pair whose shape
            // overlap with the truth exceeds l.iou_thresh
            for (n = 0; n < l.total; ++n) {
                int mask_n = int_index(l.mask, n, l.n);
                if (mask_n >= 0 && n != best_n && l.iou_thresh < 1.0f) {
                    box pred = { 0 };
                    pred.w = l.biases[2 * n] / state.net.w;
                    pred.h = l.biases[2 * n + 1] / state.net.h;
                    float iou = box_iou_kind(pred, truth_shift, l.iou_thresh_kind); // IOU, GIOU, MSE, DIOU, CIOU

                    if (iou > l.iou_thresh) {
                        int class_id = state.truth[t*(4 + 1) + b*l.truths + 4];
                        if (l.map) class_id = l.map[class_id];

                        int box_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 0);
                        const float class_multiplier = (l.classes_multipliers) ? l.classes_multipliers[class_id] : 1.0f;
                        ious all_ious = delta_yolo_box(truth, l.output, l.biases, n, box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2 - truth.w*truth.h), l.w*l.h, l.iou_normalizer * class_multiplier, l.iou_loss, 1, l.max_delta);

                        // range is 0 <= 1
                        tot_iou += all_ious.iou;
                        tot_iou_loss += 1 - all_ious.iou;
                        // range is -1 <= giou <= 1
                        tot_giou += all_ious.giou;
                        tot_giou_loss += 1 - all_ious.giou;

                        tot_diou += all_ious.diou;
                        tot_diou_loss += 1 - all_ious.diou;

                        tot_ciou += all_ious.ciou;
                        tot_ciou_loss += 1 - all_ious.ciou;

                        int obj_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 4);
                        avg_obj += l.output[obj_index];
                        l.delta[obj_index] = class_multiplier * l.cls_normalizer * (1 - l.output[obj_index]);

                        int class_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 4 + 1);
                        delta_yolo_class(l.output, l.delta, class_index, class_id, l.classes, l.w*l.h, &avg_cat, l.focal_loss, l.label_smooth_eps, l.classes_multipliers);

                        ++count;
                        ++class_count;
                        if (all_ious.iou > .5) recall += 1;
                        if (all_ious.iou > .75) recall75 += 1;
                    }
                }
            }
        }

        // ---- 3. average the box deltas accumulated by delta_yolo_box() ----
        for (j = 0; j < l.h; ++j) {
            for (i = 0; i < l.w; ++i) {
                for (n = 0; n < l.n; ++n) {
                    int box_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 0);
                    int class_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4 + 1);
                    const int stride = l.w*l.h;

                    averages_yolo_deltas(class_index, box_index, stride, l.classes, l.delta);
                }
            }
        }
    }

    // Avoid division by zero in the statistics below.
    if (count == 0) count = 1;
    if (class_count == 0) class_count = 1;

    // ---- 4. loss terms ----
    // Copy of delta with the four box entries zeroed, so that its squared
    // magnitude covers objectness and class terms only.
    int stride = l.w*l.h;
    float* no_iou_loss_delta = (float *)xcalloc(l.batch * l.outputs, sizeof(float));
    memcpy(no_iou_loss_delta, l.delta, l.batch * l.outputs * sizeof(float));
    for (b = 0; b < l.batch; ++b) {
        for (j = 0; j < l.h; ++j) {
            for (i = 0; i < l.w; ++i) {
                for (n = 0; n < l.n; ++n) {
                    int index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 0);
                    no_iou_loss_delta[index + 0 * stride] = 0;
                    no_iou_loss_delta[index + 1 * stride] = 0;
                    no_iou_loss_delta[index + 2 * stride] = 0;
                    no_iou_loss_delta[index + 3 * stride] = 0;
                }
            }
        }
    }
    float classification_loss = l.cls_normalizer * pow(mag_array(no_iou_loss_delta, l.outputs * l.batch), 2);
    free(no_iou_loss_delta);
    float loss = pow(mag_array(l.delta, l.outputs * l.batch), 2);
    float iou_loss = loss - classification_loss;

    float avg_iou_loss = 0;
    // gIOU loss + MSE (objectness) loss
    if (l.iou_loss == MSE) {
        *(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch), 2);
    }
    else {
        // Always compute classification loss both for iou + cls loss and for logging with mse loss
        // TODO: remove IOU loss fields before computing MSE on class
        //   probably split into two arrays
        if (l.iou_loss == GIOU) {
            avg_iou_loss = count > 0 ? l.iou_normalizer * (tot_giou_loss / count) : 0;
        }
        else {
            avg_iou_loss = count > 0 ? l.iou_normalizer * (tot_iou_loss / count) : 0;
        }
        *(l.cost) = avg_iou_loss + classification_loss;
    }

    // Per-image values, used for the log line only.
    loss /= l.batch;
    classification_loss /= l.batch;
    iou_loss /= l.batch;

    fprintf(stderr, "v3 (%s loss, Normalizer: (iou: %.2f, cls: %.2f) Region %d Avg (IOU: %f, GIOU: %f), Class: %f, Obj: %f, No Obj: %f, .5R: %f, .75R: %f, count: %d, class_loss = %f, iou_loss = %f, total_loss = %f \n",
        (l.iou_loss == MSE ? "mse" : (l.iou_loss == GIOU ? "giou" : "iou")), l.iou_normalizer, l.cls_normalizer, state.index, tot_iou / count, tot_giou / count, avg_cat / class_count, avg_obj / count, avg_anyobj / (l.w*l.h*l.n*l.batch), recall / count, recall75 / count, count,
        classification_loss, iou_loss, loss);
}
630 
/*
 * Backward pass: adds this layer's delta (all batch*inputs values, weight 1)
 * onto state.delta.
 */
void backward_yolo_layer(const layer l, network_state state)
{
    const int total = l.batch*l.inputs;
    const float weight = 1;
    axpy_cpu(total, weight, l.delta, 1, state.delta, 1);
}
635 
636 // Converts output of the network to detection boxes
637 // w,h: image width,height
638 // netw,neth: network width,height
639 // relative: 1 (all callers seems to pass TRUE)
void correct_yolo_boxes(detection *dets, int n, int w, int h, int netw, int neth, int relative, int letter)
{
    // Rescales the first n boxes in dets, in place.
    //   w, h       : image width / height
    //   netw, neth : network input width / height
    //   relative   : when zero, the corrected boxes are additionally
    //                multiplied by the image size (x,w by w; y,h by h)
    //   letter     : when non-zero, the image is treated as occupying only an
    //                aspect-preserving region of the network input
    int idx;

    // Size of the region of the network input that the image occupies.
    // By default it is the whole input.
    int fit_w = netw;
    int fit_h = neth;
    if (letter) {
        // Keep the w:h proportion: the side with the smaller scale factor
        // spans the full network dimension, the other one is shrunk
        // (integer division, same as the resize arithmetic it mirrors).
        if (((float)netw / w) < ((float)neth / h)) {
            fit_h = (h * netw) / w;
        }
        else {
            fit_w = (w * neth) / h;
        }
    }

    // Unused margin of the network input along each axis (total, both sides).
    float pad_w = netw - fit_w;
    float pad_h = neth - fit_h;
    // Fraction of the network input covered by the image along each axis.
    float scale_w = (float)fit_w / netw;
    float scale_h = (float)fit_h / neth;

    for (idx = 0; idx < n; ++idx) {
        box bb = dets[idx].bbox;

        // Remove half of the margin (expressed as a fraction of the network
        // size), then stretch so the occupied region maps to the full range.
        bb.x = (bb.x - pad_w / 2. / netw) / scale_w;
        bb.y = (bb.y - pad_h / 2. / neth) / scale_h;
        // Sizes only need the stretch, not the offset.
        bb.w *= 1 / scale_w;
        bb.h *= 1 / scale_h;

        if (!relative) {
            // Scale the box by the image dimensions.
            bb.x *= w;
            bb.y *= h;
            bb.w *= w;
            bb.h *= h;
        }

        dets[idx].bbox = bb;
    }
}
694 
695 /*
696 void correct_yolo_boxes(detection *dets, int n, int w, int h, int netw, int neth, int relative, int letter)
697 {
698     int i;
699     int new_w=0;
700     int new_h=0;
701     if (letter) {
702         if (((float)netw / w) < ((float)neth / h)) {
703             new_w = netw;
704             new_h = (h * netw) / w;
705         }
706         else {
707             new_h = neth;
708             new_w = (w * neth) / h;
709         }
710     }
711     else {
712         new_w = netw;
713         new_h = neth;
714     }
715     for (i = 0; i < n; ++i){
716         box b = dets[i].bbox;
717         b.x =  (b.x - (netw - new_w)/2./netw) / ((float)new_w/netw);
718         b.y =  (b.y - (neth - new_h)/2./neth) / ((float)new_h/neth);
719         b.w *= (float)netw/new_w;
720         b.h *= (float)neth/new_h;
721         if(!relative){
722             b.x *= w;
723             b.w *= w;
724             b.y *= h;
725             b.h *= h;
726         }
727         dets[i].bbox = b;
728     }
729 }
730 */
731 
// Counts how many (predictor, cell) positions of batch item 0 have an
// entry-4 (objectness) output strictly greater than thresh.
// A NaN output never satisfies the comparison and is therefore not counted.
int yolo_num_detections(layer l, float thresh)
{
    const int cells = l.w * l.h;   // grid positions per predictor
    int hits = 0;
    int pred, cell;

    for (pred = 0; pred < l.n; ++pred) {
        for (cell = 0; cell < cells; ++cell) {
            const int obj_index = entry_index(l, 0, pred * cells + cell, 4);
            if (l.output[obj_index] > thresh) {
                hits++;
            }
        }
    }
    return hits;
}
746 
// Same count as yolo_num_detections, but for the batch item selected by
// `batch`: number of (cell, predictor) positions whose entry-4 (objectness)
// output is strictly greater than thresh. NaN outputs are not counted.
int yolo_num_detections_batch(layer l, float thresh, int batch)
{
    const int cells = l.w * l.h;   // grid positions per predictor
    int hits = 0;
    int cell, pred;

    for (cell = 0; cell < cells; ++cell) {
        for (pred = 0; pred < l.n; ++pred) {
            const int obj_index = entry_index(l, batch, pred * cells + cell, 4);
            if (l.output[obj_index] > thresh) {
                hits++;
            }
        }
    }
    return hits;
}
761 
// Averages the first batch item's output with a horizontally mirrored copy
// of the second batch item's output (l.output + l.outputs), writing the
// result into the first item. The second item is mirrored in place first.
// Requires the output buffer to hold at least two batch items.
void avg_flipped_yolo(layer l)
{
    float *mirror = l.output + l.outputs;        // second batch item
    const int entries = l.classes + 4 + 1;       // values per predictor
    int row, col, pred, entry, k;

    for (row = 0; row < l.h; ++row) {
        // Only the left half is visited; each step exchanges a column with
        // its mirror image. With an odd width the middle column is untouched.
        for (col = 0; col < l.w/2; ++col) {
            for (pred = 0; pred < l.n; ++pred) {
                for (entry = 0; entry < entries; ++entry) {
                    const int left  = entry*l.w*l.h*l.n + pred*l.w*l.h + row*l.w + col;
                    const int right = entry*l.w*l.h*l.n + pred*l.w*l.h + row*l.w + (l.w - col - 1);
                    const float left_val  = mirror[left];
                    const float right_val = mirror[right];

                    if (entry == 0) {
                        // Entry 0 is exchanged and also sign-flipped.
                        mirror[left]  = -right_val;
                        mirror[right] = -left_val;
                    }
                    else {
                        mirror[left]  = right_val;
                        mirror[right] = left_val;
                    }
                }
            }
        }
    }

    // Element-wise mean of the two batch items (division done in double).
    for (k = 0; k < l.outputs; ++k) {
        l.output[k] = (l.output[k] + mirror[k])/2.;
    }
}
787 
// Extracts detections from batch item 0 of the layer output into dets.
//   w, h       : image width / height (forwarded to correct_yolo_boxes)
//   netw, neth : network input width / height
//   thresh     : objectness must be strictly greater than this; class
//                probabilities not above it are stored as 0
//   map        : not used by this function
//   relative, letter : forwarded to correct_yolo_boxes
//   dets       : output array; the caller must provide room for every
//                position that passes the threshold (no bound is checked here)
// Returns the number of entries written to dets.
int get_yolo_detections(layer l, int w, int h, int netw, int neth, float thresh, int *map, int relative, detection *dets, int letter)
{
    float *preds = l.output;
    const int cells = l.w * l.h;
    int found = 0;
    int cell, pred, cls;

    for (cell = 0; cell < cells; ++cell) {
        const int row = cell / l.w;
        const int col = cell % l.w;

        for (pred = 0; pred < l.n; ++pred) {
            const int loc = pred * cells + cell;
            const int obj_index = entry_index(l, 0, loc, 4);
            const float objectness = preds[obj_index];

            // Deliberately phrased as "not greater" rather than "<=" so that
            // a NaN objectness is skipped as well.
            if (!(objectness > thresh)) continue;

            const int box_index = entry_index(l, 0, loc, 0);
            detection *det = &dets[found];

            det->bbox = get_yolo_box(preds, l.biases, l.mask[pred], box_index, col, row, l.w, l.h, netw, neth, cells);
            det->objectness = objectness;
            det->classes = l.classes;

            for (cls = 0; cls < l.classes; ++cls) {
                const int class_index = entry_index(l, 0, loc, 4 + 1 + cls);
                const float prob = objectness * preds[class_index];
                if (prob > thresh) {
                    det->prob[cls] = prob;
                }
                else {
                    det->prob[cls] = 0;
                }
            }
            ++found;
        }
    }

    // Map the collected boxes from network-input coordinates to the image.
    correct_yolo_boxes(dets, found, w, h, netw, neth, relative, letter);
    return found;
}
822 
// Extracts detections for one batch item of the layer output into dets.
//   w, h       : image width / height (forwarded to correct_yolo_boxes)
//   netw, neth : network input width / height
//   thresh     : objectness must be strictly greater than this; class
//                probabilities not above it are stored as 0
//   map        : not used by this function
//   relative, letter : forwarded to correct_yolo_boxes
//   dets       : output array; the caller must provide room for every
//                position that passes the threshold (no bound is checked here)
//   batch      : index of the batch item to read
// Returns the number of entries written to dets.
int get_yolo_detections_batch(layer l, int w, int h, int netw, int neth, float thresh, int *map, int relative, detection *dets, int letter, int batch)
{
    float *preds = l.output;
    const int cells = l.w * l.h;
    int found = 0;
    int cell, pred, cls;

    for (cell = 0; cell < cells; ++cell) {
        const int row = cell / l.w;
        const int col = cell % l.w;

        for (pred = 0; pred < l.n; ++pred) {
            const int loc = pred * cells + cell;
            const int obj_index = entry_index(l, batch, loc, 4);
            const float objectness = preds[obj_index];

            // Deliberately phrased as "not greater" rather than "<=" so that
            // a NaN objectness is skipped as well.
            if (!(objectness > thresh)) continue;

            const int box_index = entry_index(l, batch, loc, 0);
            detection *det = &dets[found];

            det->bbox = get_yolo_box(preds, l.biases, l.mask[pred], box_index, col, row, l.w, l.h, netw, neth, cells);
            det->objectness = objectness;
            det->classes = l.classes;

            for (cls = 0; cls < l.classes; ++cls) {
                const int class_index = entry_index(l, batch, loc, 4 + 1 + cls);
                const float prob = objectness * preds[class_index];
                if (prob > thresh) {
                    det->prob[cls] = prob;
                }
                else {
                    det->prob[cls] = 0;
                }
            }
            ++found;
        }
    }

    // Map the collected boxes from network-input coordinates to the image.
    correct_yolo_boxes(dets, found, w, h, netw, neth, relative, letter);
    return found;
}
854 
855 #ifdef GPU
856 
// GPU forward pass of the YOLO layer.
//  1. Copies the layer input into l.output_gpu and applies the LOGISTIC
//     activation to the x,y entries and to the objectness + class entries of
//     every predictor (the two entries in between are left as copied).
//  2. Inference (or l.onlyforward): optionally runs mean_array_gpu, then
//     starts an asynchronous pull of the result into l.output and returns.
//  3. Training: pulls the activations to the host, runs the CPU
//     forward_yolo_layer on host copies of input and truth, and pushes the
//     resulting l.delta to l.delta_gpu.
void forward_yolo_layer_gpu(const layer l, network_state state)
{
    const int plane = l.w * l.h;   // grid positions per predictor
    int b, n;

    //copy_ongpu(l.batch*l.inputs, state.input, 1, l.output_gpu, 1);
    simple_copy_ongpu(l.batch*l.inputs, state.input, l.output_gpu);

    for (b = 0; b < l.batch; ++b) {
        for (n = 0; n < l.n; ++n) {
            // Logistic: y = 1/(1 + exp(-x)), so y -> 1 as x -> +inf and
            // y -> 0 as x -> -inf.
            int offset = entry_index(l, b, n*plane, 0);
            float *xy = l.output_gpu + offset;
            activate_array_ongpu(xy, 2*plane, LOGISTIC);                    // x,y
            if (l.scale_x_y != 1) {
                // scale x,y
                scal_add_ongpu(2*plane, l.scale_x_y, -0.5*(l.scale_x_y - 1), xy, 1);
            }

            offset = entry_index(l, b, n*plane, 4);
            activate_array_ongpu(l.output_gpu + offset, (1 + l.classes)*plane, LOGISTIC);   // classes and objectness
        }
    }

    if (!state.train || l.onlyforward) {
        // No loss needed: just hand the activations back to the host.
        if (l.mean_alpha && l.output_avg_gpu) {
            mean_array_gpu(l.output_gpu, l.batch*l.outputs, l.mean_alpha, l.output_avg_gpu);
        }
        cuda_pull_array_async(l.output_gpu, l.output, l.batch*l.outputs);
        CHECK_CUDA(cudaPeekAtLastError());
        return;
    }

    // Training: the loss and delta are computed by the CPU implementation.
    // It is given a separate host copy of the activations as its input
    // (presumably because it writes l.output itself -- TODO confirm).
    float *host_input = (float *)xcalloc(l.batch*l.inputs, sizeof(float));
    cuda_pull_array(l.output_gpu, l.output, l.batch*l.outputs);
    memcpy(host_input, l.output, l.batch*l.outputs*sizeof(float));

    float *host_truth = 0;
    if (state.truth) {
        const int truth_count = l.batch*l.truths;
        host_truth = (float *)xcalloc(truth_count, sizeof(float));
        cuda_pull_array(state.truth, host_truth, truth_count);
    }

    // Same state as the caller's (net, index, train, ...), but with the
    // input and truth pointers redirected to the host copies.
    network_state host_state = state;
    host_state.truth = host_truth;
    host_state.input = host_input;
    forward_yolo_layer(l, host_state);

    cuda_push_array(l.delta_gpu, l.delta, l.batch*l.outputs);

    free(host_input);
    free(host_truth);   // free(NULL) is a no-op when there was no truth
}
904 
// GPU backward pass of the YOLO layer.
// Passes l.delta_gpu to axpy_ongpu with state.net.loss_scale as the scale
// factor so it is combined into state.delta over l.batch * l.inputs floats
// (presumably y += alpha * x on the device -- confirm against blas.h).
void backward_yolo_layer_gpu(const layer l, network_state state)
{
    const int element_count = l.batch * l.inputs;
    axpy_ongpu(element_count, state.net.loss_scale, l.delta_gpu, 1, state.delta, 1);
}
909 #endif
910