1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14 #if __AVX__
15 #include <immintrin.h>
16 #endif
17 
18 #include "yolov3detectionoutput_x86.h"
19 
20 #include <float.h>
21 #include <math.h>
22 
23 namespace ncnn {
24 
Yolov3DetectionOutput_x86()25 Yolov3DetectionOutput_x86::Yolov3DetectionOutput_x86()
26 {
27 }
28 
// Logistic function: returns 1 / (1 + e^-x) as a float.
static inline float sigmoid(float x)
{
    return static_cast<float>(1.f / (exp(-x) + 1.f));
}
33 
forward(const std::vector<Mat> & bottom_blobs,std::vector<Mat> & top_blobs,const Option & opt) const34 int Yolov3DetectionOutput_x86::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
35 {
36     // gather all box
37     std::vector<BBoxRect> all_bbox_rects;
38 
39     for (size_t b = 0; b < bottom_blobs.size(); b++)
40     {
41         std::vector<std::vector<BBoxRect> > all_box_bbox_rects;
42         all_box_bbox_rects.resize(num_box);
43         const Mat& bottom_top_blobs = bottom_blobs[b];
44 
45         int w = bottom_top_blobs.w;
46         int h = bottom_top_blobs.h;
47         int channels = bottom_top_blobs.c;
48         //printf("%d %d %d\n", w, h, channels);
49         const int channels_per_box = channels / num_box;
50 
51         // anchor coord + box score + num_class
52         if (channels_per_box != 4 + 1 + num_class)
53             return -1;
54         size_t mask_offset = b * num_box;
55         int net_w = (int)(anchors_scale[b] * w);
56         int net_h = (int)(anchors_scale[b] * h);
57         //printf("%d %d\n", net_w, net_h);
58 
59         //printf("%d %d %d\n", w, h, channels);
60         #pragma omp parallel for num_threads(opt.num_threads)
61         for (int pp = 0; pp < num_box; pp++)
62         {
63             int p = pp * channels_per_box;
64             int biases_index = static_cast<int>(mask[pp + mask_offset]);
65             //printf("%d\n", biases_index);
66             const float bias_w = biases[biases_index * 2];
67             const float bias_h = biases[biases_index * 2 + 1];
68             //printf("%f %f\n", bias_w, bias_h);
69             const float* xptr = bottom_top_blobs.channel(p);
70             const float* yptr = bottom_top_blobs.channel(p + 1);
71             const float* wptr = bottom_top_blobs.channel(p + 2);
72             const float* hptr = bottom_top_blobs.channel(p + 3);
73 
74             const float* box_score_ptr = bottom_top_blobs.channel(p + 4);
75 
76             // softmax class scores
77             Mat scores = bottom_top_blobs.channel_range(p + 5, num_class);
78             //softmax->forward_inplace(scores, opt);
79 
80             const int cs = scores.cstep;
81 
82 #if __AVX__
83             const __m256i vi = _mm256_setr_epi32(
84                                    0, cs * 1, cs * 2, cs * 3, cs * 4, cs * 5, cs * 6, cs * 7);
85 #endif
86 
87             for (int i = 0; i < h; i++)
88             {
89                 for (int j = 0; j < w; j++)
90                 {
91 #if 0
92                     int class_index = 0;
93                     float class_score = -FLT_MAX;
94                     for (int q = 0; q < num_class; q++)
95                     {
96                         float score = scores.channel(q).row(i)[j];
97                         if (score > class_score)
98                         {
99                             class_index = q;
100                             class_score = score;
101                         }
102                     }
103 #else
104                     // find class index with max class score
105                     int class_index = 0;
106                     float class_score = -FLT_MAX;
107                     float* ptr = ((float*)scores.data) + i * w + j;
108                     float* end = ptr + num_class * cs;
109                     int q = 0;
110 #if __AVX__
111                     float* end8 = ptr + (num_class & -8) * cs;
112                     unsigned long index;
113 
114                     for (; ptr < end8; ptr += 8 * cs, q += 8)
115                     {
116                         __m256 p = _mm256_i32gather_ps(ptr, vi, 4);
117                         __m256 t = _mm256_max_ps(p, _mm256_permute2f128_ps(p, p, 1));
118                         t = _mm256_max_ps(t, _mm256_permute_ps(t, 0x4e));
119                         t = _mm256_max_ps(t, _mm256_permute_ps(t, 0xb1));
120                         float score = _mm_cvtss_f32(_mm256_extractf128_ps(t, 0));
121 
122                         if (score > class_score)
123                         {
124                             __m256 mi = _mm256_cmp_ps(p, t, _CMP_EQ_OQ);
125                             int mask = _mm256_movemask_ps(mi);
126 #ifdef _MSC_VER
127                             BitScanForward(&index, mask);
128 #else
129                             index = __builtin_ctz(mask);
130 #endif
131                             class_index = q + index;
132                             class_score = score;
133                         }
134                     }
135 #endif
136 
137                     for (; ptr < end; ptr += cs, q++)
138                     {
139                         if (*ptr > class_score)
140                         {
141                             class_index = q;
142                             class_score = *ptr;
143                         }
144                     }
145 #endif
146                     //sigmoid(box_score) * sigmoid(class_score)
147                     float confidence = 1.f / ((1.f + exp(-box_score_ptr[0]) * (1.f + exp(-class_score))));
148                     if (confidence >= confidence_threshold)
149                     {
150                         // region box
151                         float bbox_cx = (j + sigmoid(xptr[0])) / w;
152                         float bbox_cy = (i + sigmoid(yptr[0])) / h;
153                         float bbox_w = static_cast<float>(exp(wptr[0]) * bias_w / net_w);
154                         float bbox_h = static_cast<float>(exp(hptr[0]) * bias_h / net_h);
155 
156                         float bbox_xmin = bbox_cx - bbox_w * 0.5f;
157                         float bbox_ymin = bbox_cy - bbox_h * 0.5f;
158                         float bbox_xmax = bbox_cx + bbox_w * 0.5f;
159                         float bbox_ymax = bbox_cy + bbox_h * 0.5f;
160 
161                         float area = bbox_w * bbox_h;
162 
163                         BBoxRect c = {confidence, bbox_xmin, bbox_ymin, bbox_xmax, bbox_ymax, area, class_index};
164                         all_box_bbox_rects[pp].push_back(c);
165                     }
166 
167                     xptr++;
168                     yptr++;
169                     wptr++;
170                     hptr++;
171 
172                     box_score_ptr++;
173                 }
174             }
175         }
176 
177         for (int i = 0; i < num_box; i++)
178         {
179             const std::vector<BBoxRect>& box_bbox_rects = all_box_bbox_rects[i];
180 
181             all_bbox_rects.insert(all_bbox_rects.end(), box_bbox_rects.begin(), box_bbox_rects.end());
182         }
183     }
184 
185     // global sort inplace
186     qsort_descent_inplace(all_bbox_rects);
187 
188     // apply nms
189     std::vector<size_t> picked;
190     nms_sorted_bboxes(all_bbox_rects, picked, nms_threshold);
191 
192     // select
193     std::vector<BBoxRect> bbox_rects;
194 
195     for (size_t i = 0; i < picked.size(); i++)
196     {
197         size_t z = picked[i];
198         bbox_rects.push_back(all_bbox_rects[z]);
199     }
200 
201     // fill result
202     int num_detected = static_cast<int>(bbox_rects.size());
203     if (num_detected == 0)
204         return 0;
205 
206     Mat& top_blob = top_blobs[0];
207     top_blob.create(6, num_detected, 4u, opt.blob_allocator);
208     if (top_blob.empty())
209         return -100;
210 
211     for (int i = 0; i < num_detected; i++)
212     {
213         const BBoxRect& r = bbox_rects[i];
214         float score = r.score;
215         float* outptr = top_blob.row(i);
216 
217         outptr[0] = static_cast<float>(r.label + 1); // +1 for prepend background class
218         outptr[1] = score;
219         outptr[2] = r.xmin;
220         outptr[3] = r.ymin;
221         outptr[4] = r.xmax;
222         outptr[5] = r.ymax;
223     }
224 
225     return 0;
226 }
227 
228 } // namespace ncnn
229