1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14
15 #include "roialign_x86.h"
16
17 #include <math.h>
18
19 namespace ncnn {
20
21 // adapted from detectron2
22 // https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/ROIAlign/ROIAlign_cpu.cpp
23 template<typename T>
24 struct PreCalc
25 {
26 int pos1;
27 int pos2;
28 int pos3;
29 int pos4;
30 T w1;
31 T w2;
32 T w3;
33 T w4;
34 };
35
36 template<typename T>
detectron2_pre_calc_for_bilinear_interpolate(const int height,const int width,const int pooled_height,const int pooled_width,const int iy_upper,const int ix_upper,T roi_start_h,T roi_start_w,T bin_size_h,T bin_size_w,int roi_bin_grid_h,int roi_bin_grid_w,std::vector<PreCalc<T>> & pre_calc)37 void detectron2_pre_calc_for_bilinear_interpolate(
38 const int height,
39 const int width,
40 const int pooled_height,
41 const int pooled_width,
42 const int iy_upper,
43 const int ix_upper,
44 T roi_start_h,
45 T roi_start_w,
46 T bin_size_h,
47 T bin_size_w,
48 int roi_bin_grid_h,
49 int roi_bin_grid_w,
50 std::vector<PreCalc<T> >& pre_calc)
51 {
52 int pre_calc_index = 0;
53 for (int ph = 0; ph < pooled_height; ph++)
54 {
55 for (int pw = 0; pw < pooled_width; pw++)
56 {
57 for (int iy = 0; iy < iy_upper; iy++)
58 {
59 const T yy = roi_start_h + ph * bin_size_h + static_cast<T>(iy + .5f) * bin_size_h / static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
60 for (int ix = 0; ix < ix_upper; ix++)
61 {
62 const T xx = roi_start_w + pw * bin_size_w + static_cast<T>(ix + .5f) * bin_size_w / static_cast<T>(roi_bin_grid_w);
63
64 T x = xx;
65 T y = yy;
66 // deal with: inverse elements are out of feature map boundary
67 if (y < -1.0 || y > height || x < -1.0 || x > width)
68 {
69 // empty
70 PreCalc<T> pc;
71 pc.pos1 = 0;
72 pc.pos2 = 0;
73 pc.pos3 = 0;
74 pc.pos4 = 0;
75 pc.w1 = 0;
76 pc.w2 = 0;
77 pc.w3 = 0;
78 pc.w4 = 0;
79 pre_calc[pre_calc_index++] = pc;
80 continue;
81 }
82
83 if (y <= 0)
84 {
85 y = 0;
86 }
87 if (x <= 0)
88 {
89 x = 0;
90 }
91
92 int y_low = (int)y;
93 int x_low = (int)x;
94 int y_high;
95 int x_high;
96
97 if (y_low >= height - 1)
98 {
99 y_high = y_low = height - 1;
100 y = (T)y_low;
101 }
102 else
103 {
104 y_high = y_low + 1;
105 }
106
107 if (x_low >= width - 1)
108 {
109 x_high = x_low = width - 1;
110 x = (T)x_low;
111 }
112 else
113 {
114 x_high = x_low + 1;
115 }
116
117 T ly = y - y_low;
118 T lx = x - x_low;
119 T hy = (T)(1. - ly), hx = (T)(1. - lx);
120 T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
121
122 // save weights and indices
123 PreCalc<T> pc;
124 pc.pos1 = y_low * width + x_low;
125 pc.pos2 = y_low * width + x_high;
126 pc.pos3 = y_high * width + x_low;
127 pc.pos4 = y_high * width + x_high;
128 pc.w1 = w1;
129 pc.w2 = w2;
130 pc.w3 = w3;
131 pc.w4 = w4;
132 pre_calc[pre_calc_index++] = pc;
133 }
134 }
135 }
136 }
137 }
138
139 template<typename T>
original_pre_calc_for_bilinear_interpolate(const int height,const int width,const int pooled_height,const int pooled_width,T roi_start_h,T roi_start_w,T bin_size_h,T bin_size_w,int sampling_ratio,std::vector<PreCalc<T>> & pre_calc)140 void original_pre_calc_for_bilinear_interpolate(
141 const int height,
142 const int width,
143 const int pooled_height,
144 const int pooled_width,
145 T roi_start_h,
146 T roi_start_w,
147 T bin_size_h,
148 T bin_size_w,
149 int sampling_ratio,
150 std::vector<PreCalc<T> >& pre_calc)
151 {
152 int pre_calc_index = 0;
153 for (int ph = 0; ph < pooled_height; ph++)
154 {
155 for (int pw = 0; pw < pooled_width; pw++)
156 {
157 float hstart = roi_start_h + ph * bin_size_h;
158 float wstart = roi_start_w + pw * bin_size_w;
159 float hend = roi_start_h + (ph + 1) * bin_size_h;
160 float wend = roi_start_w + (pw + 1) * bin_size_w;
161 hstart = std::min(std::max(hstart, 0.f), (float)height);
162 wstart = std::min(std::max(wstart, 0.f), (float)width);
163 hend = std::min(std::max(hend, 0.f), (float)height);
164 wend = std::min(std::max(wend, 0.f), (float)width);
165
166 int bin_grid_h = (int)(sampling_ratio > 0 ? sampling_ratio : ceil(hend - hstart));
167 int bin_grid_w = (int)(sampling_ratio > 0 ? sampling_ratio : ceil(wend - wstart));
168
169 for (int by = 0; by < bin_grid_h; by++)
170 {
171 float y = hstart + (by + 0.5f) * bin_size_h / (float)bin_grid_h;
172
173 for (int bx = 0; bx < bin_grid_w; bx++)
174 {
175 float x = wstart + (bx + 0.5f) * bin_size_w / (float)bin_grid_w;
176 int x0 = (int)x;
177 int x1 = x0 + 1;
178 int y0 = (int)y;
179 int y1 = y0 + 1;
180
181 float a0 = x1 - x;
182 float a1 = x - x0;
183 float b0 = y1 - y;
184 float b1 = y - y0;
185
186 if (x1 >= width)
187 {
188 x1 = width - 1;
189 a0 = 1.f;
190 a1 = 0.f;
191 }
192 if (y1 >= height)
193 {
194 y1 = height - 1;
195 b0 = 1.f;
196 b1 = 0.f;
197 }
198 // save weights and indices
199 PreCalc<T> pc;
200 pc.pos1 = y0 * width + x0;
201 pc.pos2 = y0 * width + x1;
202 pc.pos3 = y1 * width + x0;
203 pc.pos4 = y1 * width + x1;
204 pc.w1 = a0 * b0;
205 pc.w2 = a1 * b0;
206 pc.w3 = a0 * b1;
207 pc.w4 = a1 * b1;
208 pre_calc[pre_calc_index++] = pc;
209 }
210 }
211 }
212 }
213 }
214
ROIAlign_x86()215 ROIAlign_x86::ROIAlign_x86()
216 {
217 }
218
forward(const std::vector<Mat> & bottom_blobs,std::vector<Mat> & top_blobs,const Option & opt) const219 int ROIAlign_x86::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
220 {
221 const Mat& bottom_blob = bottom_blobs[0];
222 const int width = bottom_blob.w;
223 const int height = bottom_blob.h;
224 const size_t elemsize = bottom_blob.elemsize;
225 const int channels = bottom_blob.c;
226
227 const Mat& roi_blob = bottom_blobs[1];
228
229 Mat& top_blob = top_blobs[0];
230 top_blob.create(pooled_width, pooled_height, channels, elemsize, opt.blob_allocator);
231 if (top_blob.empty())
232 return -100;
233
234 // For each ROI R = [x y w h]: max pool over R
235 const float* roi_ptr = roi_blob;
236
237 float roi_start_w = roi_ptr[0] * spatial_scale;
238 float roi_start_h = roi_ptr[1] * spatial_scale;
239 float roi_end_w = roi_ptr[2] * spatial_scale;
240 float roi_end_h = roi_ptr[3] * spatial_scale;
241 if (aligned)
242 {
243 roi_start_w -= 0.5f;
244 roi_start_h -= 0.5f;
245 roi_end_w -= 0.5f;
246 roi_end_h -= 0.5f;
247 }
248
249 float roi_width = roi_end_w - roi_start_w;
250 float roi_height = roi_end_h - roi_start_h;
251
252 if (!aligned)
253 {
254 roi_width = std::max(roi_width, 1.f);
255 roi_height = std::max(roi_height, 1.f);
256 }
257
258 float bin_size_w = (float)roi_width / (float)pooled_width;
259 float bin_size_h = (float)roi_height / (float)pooled_height;
260
261 if (version == 0)
262 {
263 // original version
264 int roi_bin_grid_h = (int)(sampling_ratio > 0 ? sampling_ratio : ceil(roi_height / pooled_height));
265 int roi_bin_grid_w = (int)(sampling_ratio > 0 ? sampling_ratio : ceil(roi_width / pooled_width));
266 std::vector<PreCalc<float> > pre_calc(
267 (size_t)roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
268 original_pre_calc_for_bilinear_interpolate(
269 height,
270 width,
271 pooled_height,
272 pooled_width,
273 roi_start_h,
274 roi_start_w,
275 bin_size_h,
276 bin_size_w,
277 sampling_ratio,
278 pre_calc);
279
280 #pragma omp parallel for num_threads(opt.num_threads)
281 for (int q = 0; q < channels; q++)
282 {
283 const float* ptr = bottom_blob.channel(q);
284 float* outptr = top_blob.channel(q);
285 int pre_calc_index = 0;
286
287 for (int ph = 0; ph < pooled_height; ph++)
288 {
289 for (int pw = 0; pw < pooled_width; pw++)
290 {
291 // Compute pooling region for this output unit:
292 // start (included) = ph * roi_height / pooled_height
293 // end (excluded) = (ph + 1) * roi_height / pooled_height
294 float hstart = roi_start_h + ph * bin_size_h;
295 float wstart = roi_start_w + pw * bin_size_w;
296 float hend = roi_start_h + (ph + 1) * bin_size_h;
297 float wend = roi_start_w + (pw + 1) * bin_size_w;
298
299 hstart = std::min(std::max(hstart, 0.f), (float)height);
300 wstart = std::min(std::max(wstart, 0.f), (float)width);
301 hend = std::min(std::max(hend, 0.f), (float)height);
302 wend = std::min(std::max(wend, 0.f), (float)width);
303
304 int bin_grid_h = (int)(sampling_ratio > 0 ? sampling_ratio : ceil(hend - hstart));
305 int bin_grid_w = (int)(sampling_ratio > 0 ? sampling_ratio : ceil(wend - wstart));
306
307 bool is_empty = (hend <= hstart) || (wend <= wstart);
308 int area = bin_grid_h * bin_grid_w;
309
310 float sum = 0.f;
311 for (int by = 0; by < bin_grid_h; by++)
312 {
313 for (int bx = 0; bx < bin_grid_w; bx++)
314 {
315 PreCalc<float>& pc = pre_calc[pre_calc_index++];
316 // bilinear interpolate at (x,y)
317 sum += pc.w1 * ptr[pc.pos1] + pc.w2 * ptr[pc.pos2] + pc.w3 * ptr[pc.pos3] + pc.w4 * ptr[pc.pos4];
318 }
319 }
320 outptr[pw] = is_empty ? 0.f : (sum / (float)area);
321 }
322
323 outptr += pooled_width;
324 }
325 }
326 }
327 else if (version == 1)
328 {
329 // the version in detectron 2
330 int roi_bin_grid_h = (int)(sampling_ratio > 0 ? sampling_ratio : ceil(roi_height / pooled_height));
331 int roi_bin_grid_w = (int)(sampling_ratio > 0 ? sampling_ratio : ceil(roi_width / pooled_width));
332
333 const float count = (float)std::max(roi_bin_grid_h * roi_bin_grid_w, 1);
334
335 std::vector<PreCalc<float> > pre_calc(
336 (size_t)roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
337 detectron2_pre_calc_for_bilinear_interpolate(
338 height,
339 width,
340 pooled_height,
341 pooled_width,
342 roi_bin_grid_h,
343 roi_bin_grid_w,
344 roi_start_h,
345 roi_start_w,
346 bin_size_h,
347 bin_size_w,
348 roi_bin_grid_h,
349 roi_bin_grid_w,
350 pre_calc);
351
352 #pragma omp parallel for num_threads(opt.num_threads)
353 for (int q = 0; q < channels; q++)
354 {
355 const float* ptr = bottom_blob.channel(q);
356 float* outptr = top_blob.channel(q);
357 int pre_calc_index = 0;
358
359 for (int ph = 0; ph < pooled_height; ph++)
360 {
361 for (int pw = 0; pw < pooled_width; pw++)
362 {
363 float output_val = 0.f;
364 for (int iy = 0; iy < roi_bin_grid_h; iy++)
365 {
366 for (int ix = 0; ix < roi_bin_grid_w; ix++)
367 {
368 PreCalc<float>& pc = pre_calc[pre_calc_index++];
369
370 output_val += pc.w1 * ptr[pc.pos1] + pc.w2 * ptr[pc.pos2] + pc.w3 * ptr[pc.pos3] + pc.w4 * ptr[pc.pos4];
371 }
372 }
373 output_val /= count;
374 outptr[pw] = output_val;
375 }
376 outptr += pooled_width;
377 }
378 }
379 }
380
381 return 0;
382 }
383
384 } // namespace ncnn
385