1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14 #if __AVX__
15 #include <immintrin.h>
16 #endif
17
18 #include "yolov3detectionoutput_x86.h"
19
20 #include <float.h>
21 #include <math.h>
22
23 namespace ncnn {
24
Yolov3DetectionOutput_x86()25 Yolov3DetectionOutput_x86::Yolov3DetectionOutput_x86()
26 {
27 }
28
// Logistic function: maps x to 1 / (1 + e^-x), returned as float.
static inline float sigmoid(float x)
{
    const float neg_x = -x;

    // exp() stays unqualified and inside a single expression so the
    // intermediate precision is whatever the <math.h> overload provides.
    return static_cast<float>(1.f / (1.f + exp(neg_x)));
}
33
forward(const std::vector<Mat> & bottom_blobs,std::vector<Mat> & top_blobs,const Option & opt) const34 int Yolov3DetectionOutput_x86::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
35 {
36 // gather all box
37 std::vector<BBoxRect> all_bbox_rects;
38
39 for (size_t b = 0; b < bottom_blobs.size(); b++)
40 {
41 std::vector<std::vector<BBoxRect> > all_box_bbox_rects;
42 all_box_bbox_rects.resize(num_box);
43 const Mat& bottom_top_blobs = bottom_blobs[b];
44
45 int w = bottom_top_blobs.w;
46 int h = bottom_top_blobs.h;
47 int channels = bottom_top_blobs.c;
48 //printf("%d %d %d\n", w, h, channels);
49 const int channels_per_box = channels / num_box;
50
51 // anchor coord + box score + num_class
52 if (channels_per_box != 4 + 1 + num_class)
53 return -1;
54 size_t mask_offset = b * num_box;
55 int net_w = (int)(anchors_scale[b] * w);
56 int net_h = (int)(anchors_scale[b] * h);
57 //printf("%d %d\n", net_w, net_h);
58
59 //printf("%d %d %d\n", w, h, channels);
60 #pragma omp parallel for num_threads(opt.num_threads)
61 for (int pp = 0; pp < num_box; pp++)
62 {
63 int p = pp * channels_per_box;
64 int biases_index = static_cast<int>(mask[pp + mask_offset]);
65 //printf("%d\n", biases_index);
66 const float bias_w = biases[biases_index * 2];
67 const float bias_h = biases[biases_index * 2 + 1];
68 //printf("%f %f\n", bias_w, bias_h);
69 const float* xptr = bottom_top_blobs.channel(p);
70 const float* yptr = bottom_top_blobs.channel(p + 1);
71 const float* wptr = bottom_top_blobs.channel(p + 2);
72 const float* hptr = bottom_top_blobs.channel(p + 3);
73
74 const float* box_score_ptr = bottom_top_blobs.channel(p + 4);
75
76 // softmax class scores
77 Mat scores = bottom_top_blobs.channel_range(p + 5, num_class);
78 //softmax->forward_inplace(scores, opt);
79
80 const int cs = scores.cstep;
81
82 #if __AVX__
83 const __m256i vi = _mm256_setr_epi32(
84 0, cs * 1, cs * 2, cs * 3, cs * 4, cs * 5, cs * 6, cs * 7);
85 #endif
86
87 for (int i = 0; i < h; i++)
88 {
89 for (int j = 0; j < w; j++)
90 {
91 #if 0
92 int class_index = 0;
93 float class_score = -FLT_MAX;
94 for (int q = 0; q < num_class; q++)
95 {
96 float score = scores.channel(q).row(i)[j];
97 if (score > class_score)
98 {
99 class_index = q;
100 class_score = score;
101 }
102 }
103 #else
104 // find class index with max class score
105 int class_index = 0;
106 float class_score = -FLT_MAX;
107 float* ptr = ((float*)scores.data) + i * w + j;
108 float* end = ptr + num_class * cs;
109 int q = 0;
110 #if __AVX__
111 float* end8 = ptr + (num_class & -8) * cs;
112 unsigned long index;
113
114 for (; ptr < end8; ptr += 8 * cs, q += 8)
115 {
116 __m256 p = _mm256_i32gather_ps(ptr, vi, 4);
117 __m256 t = _mm256_max_ps(p, _mm256_permute2f128_ps(p, p, 1));
118 t = _mm256_max_ps(t, _mm256_permute_ps(t, 0x4e));
119 t = _mm256_max_ps(t, _mm256_permute_ps(t, 0xb1));
120 float score = _mm_cvtss_f32(_mm256_extractf128_ps(t, 0));
121
122 if (score > class_score)
123 {
124 __m256 mi = _mm256_cmp_ps(p, t, _CMP_EQ_OQ);
125 int mask = _mm256_movemask_ps(mi);
126 #ifdef _MSC_VER
127 BitScanForward(&index, mask);
128 #else
129 index = __builtin_ctz(mask);
130 #endif
131 class_index = q + index;
132 class_score = score;
133 }
134 }
135 #endif
136
137 for (; ptr < end; ptr += cs, q++)
138 {
139 if (*ptr > class_score)
140 {
141 class_index = q;
142 class_score = *ptr;
143 }
144 }
145 #endif
146 //sigmoid(box_score) * sigmoid(class_score)
147 float confidence = 1.f / ((1.f + exp(-box_score_ptr[0]) * (1.f + exp(-class_score))));
148 if (confidence >= confidence_threshold)
149 {
150 // region box
151 float bbox_cx = (j + sigmoid(xptr[0])) / w;
152 float bbox_cy = (i + sigmoid(yptr[0])) / h;
153 float bbox_w = static_cast<float>(exp(wptr[0]) * bias_w / net_w);
154 float bbox_h = static_cast<float>(exp(hptr[0]) * bias_h / net_h);
155
156 float bbox_xmin = bbox_cx - bbox_w * 0.5f;
157 float bbox_ymin = bbox_cy - bbox_h * 0.5f;
158 float bbox_xmax = bbox_cx + bbox_w * 0.5f;
159 float bbox_ymax = bbox_cy + bbox_h * 0.5f;
160
161 float area = bbox_w * bbox_h;
162
163 BBoxRect c = {confidence, bbox_xmin, bbox_ymin, bbox_xmax, bbox_ymax, area, class_index};
164 all_box_bbox_rects[pp].push_back(c);
165 }
166
167 xptr++;
168 yptr++;
169 wptr++;
170 hptr++;
171
172 box_score_ptr++;
173 }
174 }
175 }
176
177 for (int i = 0; i < num_box; i++)
178 {
179 const std::vector<BBoxRect>& box_bbox_rects = all_box_bbox_rects[i];
180
181 all_bbox_rects.insert(all_bbox_rects.end(), box_bbox_rects.begin(), box_bbox_rects.end());
182 }
183 }
184
185 // global sort inplace
186 qsort_descent_inplace(all_bbox_rects);
187
188 // apply nms
189 std::vector<size_t> picked;
190 nms_sorted_bboxes(all_bbox_rects, picked, nms_threshold);
191
192 // select
193 std::vector<BBoxRect> bbox_rects;
194
195 for (size_t i = 0; i < picked.size(); i++)
196 {
197 size_t z = picked[i];
198 bbox_rects.push_back(all_bbox_rects[z]);
199 }
200
201 // fill result
202 int num_detected = static_cast<int>(bbox_rects.size());
203 if (num_detected == 0)
204 return 0;
205
206 Mat& top_blob = top_blobs[0];
207 top_blob.create(6, num_detected, 4u, opt.blob_allocator);
208 if (top_blob.empty())
209 return -100;
210
211 for (int i = 0; i < num_detected; i++)
212 {
213 const BBoxRect& r = bbox_rects[i];
214 float score = r.score;
215 float* outptr = top_blob.row(i);
216
217 outptr[0] = static_cast<float>(r.label + 1); // +1 for prepend background class
218 outptr[1] = score;
219 outptr[2] = r.xmin;
220 outptr[3] = r.ymin;
221 outptr[4] = r.xmax;
222 outptr[5] = r.ymax;
223 }
224
225 return 0;
226 }
227
228 } // namespace ncnn
229