1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19
20 /*!
21 * \file rroi_align.cc
22 * \brief rroi align operator
23 * \author Yixin Bao
24 * Forward pass adapted from Caffe2
25 * link: https://github.com/pytorch/pytorch/blob/master/caffe2/operators/roi_align_rotated_op.cc
26 */
27 #include "./rroi_align-inl.h"
28 #include <mshadow/tensor.h>
29 #include "math.h"
30
31 using std::max;
32 using std::min;
33 using std::floor;
34 using std::ceil;
35
36 namespace mxnet {
37 namespace op {
38
/*!
 * \brief Cached bilinear-interpolation stencil for a single sampling point.
 *
 * Holds the four positions used for the interpolation together with the
 * weight that is applied to the value found at each position.
 * \tparam DType type of the interpolation weights
 */
template <typename DType>
struct position_for_bilinear_interpolate {
  // positions of the four points taking part in the interpolation
  int pos1;
  int pos2;
  int pos3;
  int pos4;
  // weights paired with pos1..pos4, in the same order
  DType w1;
  DType w2;
  DType w3;
  DType w4;
};
46
47 template <typename DType>
pre_calc_for_bilinear_interpolate(const int height,const int width,const int pooled_height,const int pooled_width,const int iy_upper,const int ix_upper,DType roi_start_h,DType roi_start_w,DType bin_size_h,DType bin_size_w,int roi_bin_grid_h,int roi_bin_grid_w,DType roi_center_h,DType roi_center_w,DType theta,std::vector<position_for_bilinear_interpolate<DType>> * pre_calc)48 void pre_calc_for_bilinear_interpolate(
49 const int height, const int width, const int pooled_height, const int pooled_width,
50 const int iy_upper, const int ix_upper, DType roi_start_h, DType roi_start_w,
51 DType bin_size_h, DType bin_size_w, int roi_bin_grid_h, int roi_bin_grid_w,
52 DType roi_center_h, DType roi_center_w, DType theta,
53 std::vector<position_for_bilinear_interpolate<DType>> *pre_calc) {
54 int pre_calc_index = 0;
55 DType cosTheta = cos(theta);
56 DType sinTheta = sin(theta);
57 for (int ph = 0; ph < pooled_height; ph++) {
58 for (int pw = 0; pw < pooled_width; pw++) {
59 // calc bin grid position (xx,yy)
60 for (int iy = 0; iy < iy_upper; iy++) {
61 const DType yy = roi_start_h + ph * bin_size_h +
62 static_cast<DType>(iy + .5f) * bin_size_h /
63 static_cast<DType>(roi_bin_grid_h); // e.g., 0.5, 1.5
64 for (int ix = 0; ix < ix_upper; ix++) {
65 const DType xx = roi_start_w + pw * bin_size_w +
66 static_cast<DType>(ix + .5f) * bin_size_w /
67 static_cast<DType>(roi_bin_grid_w);
68
69 // Rotate by theta around the center and translate
70 DType x = xx * cosTheta + yy * sinTheta + roi_center_w;
71 DType y = yy * cosTheta - xx * sinTheta + roi_center_h;
72
73 // deal with: inverse elements are out of feature map boundary
74 if (y < -1.0 || y > height || x < -1.0 || x > width) {
75 // empty
76 position_for_bilinear_interpolate<DType> &pc = (*pre_calc)[pre_calc_index];
77 pc.pos1 = 0;
78 pc.pos2 = 0;
79 pc.pos3 = 0;
80 pc.pos4 = 0;
81 pc.w1 = 0;
82 pc.w2 = 0;
83 pc.w3 = 0;
84 pc.w4 = 0;
85 pre_calc_index += 1;
86 continue;
87 }
88 if (y <= 0) {
89 y = 0;
90 }
91 if (x <= 0) {
92 x = 0;
93 }
94
95 // calc 4 points for interpolation
96 int y_low = static_cast<int>(y);
97 int x_low = static_cast<int>(x);
98 int y_high;
99 int x_high;
100 if (y_low >= height - 1) {
101 y_high = y_low = height - 1;
102 y = (DType)y_low;
103 } else {
104 y_high = y_low + 1;
105 }
106 if (x_low >= width - 1) {
107 x_high = x_low = width - 1;
108 x = (DType)x_low;
109 } else {
110 x_high = x_low + 1;
111 }
112 DType ly = y - y_low;
113 DType lx = x - x_low;
114 DType hy = 1. - ly, hx = 1. - lx;
115 DType w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
116
117 // Save weights and indices
118 position_for_bilinear_interpolate<DType> &pc = (*pre_calc)[pre_calc_index];
119 pc.pos1 = y_low * width + x_low;
120 pc.pos2 = y_low * width + x_high;
121 pc.pos3 = y_high * width + x_low;
122 pc.pos4 = y_high * width + x_high;
123 pc.w1 = w1;
124 pc.w2 = w2;
125 pc.w3 = w3;
126 pc.w4 = w4;
127 pre_calc_index += 1;
128 }
129 }
130 }
131 }
132 }
133
134 template <typename DType>
RROIAlignForward(const OpContext & ctx,const RROIAlignParam & param,const std::vector<TBlob> & in_data,const std::vector<OpReqType> & req,const std::vector<TBlob> & out_data)135 inline void RROIAlignForward(const OpContext &ctx, const RROIAlignParam ¶m,
136 const std::vector<TBlob> &in_data, const std::vector<OpReqType> &req,
137 const std::vector<TBlob> &out_data) {
138 // data: [batch_size, c, h, w]
139 const TBlob &data = in_data[rroialign::kData];
140 const TBlob &bbox = in_data[rroialign::kBox];
141 const DType *bottom_data = data.dptr<DType>();
142 const int channels_ = data.size(1);
143 const int height_ = data.size(2);
144 const int width_ = data.size(3);
145 const index_t data_size_c = height_ * width_;
146 const index_t data_size = channels_ * data_size_c;
147
148 // bbox: [num_rois, 6] (6: [batch_index, x, y, w, h, theta])
149 const DType *bottom_rois = bbox.dptr<DType>();
150 const int num_rois = bbox.size(0);
151 const float spatial_scale_ = param.spatial_scale;
152 const int sampling_ratio_ = param.sampling_ratio;
153
154 // out: [num_rois, c, pooled_h, pooled_w]
155 const TBlob &out = out_data[rroialign::kOut];
156 DType *top_data = out.dptr<DType>();
157 const int pooled_height_ = out.size(2);
158 const int pooled_width_ = out.size(3);
159 const index_t out_size_c = pooled_height_ * pooled_width_;
160 const index_t out_size = channels_ * out_size_c;
161
162 // (n, c, ph, pw) is an element in the pooled output
163 // can be parallelized using omp
164 #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
165 for (int n = 0; n < num_rois; ++n) {
166 // Increment ROI data pointer
167 const DType *bottom_rois_n = bottom_rois + n * bbox.size(1);
168 DType *top_data_n = top_data + n * out_size;
169 int roi_batch_ind = static_cast<int>(bottom_rois_n[0]);
170 DType roi_center_w = bottom_rois_n[1] * spatial_scale_;
171 DType roi_center_h = bottom_rois_n[2] * spatial_scale_;
172 DType roi_width = bottom_rois_n[3] * spatial_scale_;
173 DType roi_height = bottom_rois_n[4] * spatial_scale_;
174 DType roi_theta = bottom_rois_n[5] * M_PI / 180.0;
175
176 // force malformed ROIs to be 1 * 1
177 roi_width = max(roi_width, (DType) 1.);
178 roi_height = max(roi_height, (DType) 1.);
179 // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
180 // Appropriate translation needs to be applied after.
181 DType roi_start_h = -roi_height / 2.0;
182 DType roi_start_w = -roi_width / 2.0;
183
184 const DType bin_size_h = static_cast<DType>(roi_height) / static_cast<DType>(pooled_height_);
185 const DType bin_size_w = static_cast<DType>(roi_width) / static_cast<DType>(pooled_width_);
186 // We use roi_bin_grid to sample the grid and mimic integral,
187 // e.g. roi_bin_grid = 2, means sample 2*2=4 points in each bin
188 int roi_bin_grid_h =
189 (sampling_ratio_ > 0) ? sampling_ratio_ : ceil(roi_height / pooled_height_);
190 int roi_bin_grid_w = (sampling_ratio_ > 0) ? sampling_ratio_ : ceil(roi_width / pooled_width_);
191 const DType bin_points_count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
192
193 // We want to precalculate indices and weights shared by all channels,
194 // this is the key point of optimization.
195 std::vector<position_for_bilinear_interpolate<DType>> pre_calc(roi_bin_grid_h * roi_bin_grid_w *
196 pooled_width_ * pooled_height_);
197
198 pre_calc_for_bilinear_interpolate(height_, width_, pooled_height_, pooled_width_,
199 roi_bin_grid_h, roi_bin_grid_w, roi_start_h, roi_start_w,
200 bin_size_h, bin_size_w, roi_bin_grid_h, roi_bin_grid_w,
201 roi_center_h, roi_center_w, roi_theta, &pre_calc);
202
203 for (int c = 0; c < channels_; ++c) {
204 const DType *offset_bottom_data = bottom_data + roi_batch_ind * data_size + c * data_size_c;
205 int pre_calc_index = 0;
206
207 for (int ph = 0; ph < pooled_height_; ph++) {
208 for (int pw = 0; pw < pooled_width_; pw++) {
209 DType output_val = 0.;
210 for (int iy = 0; iy < roi_bin_grid_h; iy++) {
211 for (int ix = 0; ix < roi_bin_grid_w; ix++) {
212 position_for_bilinear_interpolate<DType> pc = pre_calc[pre_calc_index];
213 output_val +=
214 pc.w1 * offset_bottom_data[pc.pos1] + pc.w2 * offset_bottom_data[pc.pos2] +
215 pc.w3 * offset_bottom_data[pc.pos3] + pc.w4 * offset_bottom_data[pc.pos4];
216
217 pre_calc_index += 1;
218 }
219 }
220 output_val /= bin_points_count; // avg pooling for bin grid
221 int index = c * pooled_height_ * pooled_width_ + ph * pooled_width_ + pw;
222 top_data_n[index] = output_val;
223 } // for pw
224 } // for ph
225 } // for c
226 } // for n
227 }
228
229 template<typename xpu>
RROIAlignForwardCompute(const nnvm::NodeAttrs & attrs,const OpContext & ctx,const std::vector<TBlob> & in_data,const std::vector<OpReqType> & req,const std::vector<TBlob> & out_data)230 void RROIAlignForwardCompute(const nnvm::NodeAttrs& attrs,
231 const OpContext& ctx, const std::vector<TBlob>& in_data,
232 const std::vector<OpReqType>& req,
233 const std::vector<TBlob>& out_data) {
234 const RROIAlignParam& param = nnvm::get<RROIAlignParam>(attrs.parsed);
235 CHECK_EQ(in_data.size(), 2);
236 CHECK_EQ(out_data.size(), 1);
237 CHECK_EQ(out_data[rroialign::kOut].shape_[0], in_data[rroialign::kBox].shape_[0]);
238
239 MSHADOW_REAL_TYPE_SWITCH(in_data[0].type_flag_, DType, {
240 RROIAlignForward<DType>(ctx, param, in_data, req, out_data);
241 })
242 }
243
244 template<typename xpu>
RROIAlignBackwardCompute(const nnvm::NodeAttrs & attrs,const OpContext & ctx,const std::vector<TBlob> & in_data,const std::vector<OpReqType> & req,const std::vector<TBlob> & out_data)245 void RROIAlignBackwardCompute(const nnvm::NodeAttrs& attrs,
246 const OpContext& ctx, const std::vector<TBlob>& in_data,
247 const std::vector<OpReqType>& req,
248 const std::vector<TBlob>& out_data) {
249 LOG(FATAL) << "RROIAlign: Backward is not supported.";
250 }
251
252 DMLC_REGISTER_PARAMETER(RROIAlignParam);
253
254 NNVM_REGISTER_OP(_contrib_RROIAlign)
255 .describe(R"code(Performs Rotated ROI Align on the input array.
256
257 This operator takes a 4D feature map as an input array and region proposals as `rois`,
258 then align the feature map over sub-regions of input and produces a fixed-sized output array.
259
260 Different from ROI Align, RROI Align uses rotated rois, which is suitable for text detection.
261 RRoIAlign computes the value of each sampling point by bilinear interpolation from the nearby
262 grid points on the rotated feature map. No quantization is performed on any coordinates
263 involved in the RoI, its bins, or the sampling points. Bilinear interpolation is used to
264 compute the exact values of the input features at four regularly sampled locations in
265 each RoI bin. Then the feature map can be aggregated by avgpooling.
266
267 References
268 ----------
269
270 Ma, Jianqi, et al. "Arbitrary-Oriented Scene Text Detection via Rotation Proposals."
271 IEEE Transactions on Multimedia, 2018.
272
273 )code" ADD_FILELINE)
274 .set_num_inputs(2)
275 .set_num_outputs(1)
276 .set_attr<nnvm::FListInputNames>("FListInputNames",
__anon3c73233e0102(const NodeAttrs& attrs) 277 [](const NodeAttrs& attrs) {
278 return std::vector<std::string>{"data", "rois"};
279 })
280 .set_attr<nnvm::FListOutputNames>("FListOutputNames",
__anon3c73233e0202(const NodeAttrs& attrs) 281 [](const NodeAttrs& attrs) {
282 return std::vector<std::string>{"output"};
283 })
284 .set_attr_parser(ParamParser<RROIAlignParam>)
285 .set_attr<mxnet::FInferShape>("FInferShape", [](const nnvm::NodeAttrs& attrs,
__anon3c73233e0302(const nnvm::NodeAttrs& attrs, mxnet::ShapeVector *in_shape, mxnet::ShapeVector *out_shape)286 mxnet::ShapeVector *in_shape, mxnet::ShapeVector *out_shape){
287 using namespace mshadow;
288 const RROIAlignParam& param = nnvm::get<RROIAlignParam>(attrs.parsed);
289 CHECK_EQ(in_shape->size(), 2U) << "Input:[data, rois]";
290 // data: [batch_size, c, h, w]
291 mxnet::TShape dshape = in_shape->at(rroialign::kData);
292 CHECK_EQ(dshape.ndim(), 4U) << "data should be a 4D tensor";
293 // bbox: [num_rois, 6]
294 mxnet::TShape bshape = in_shape->at(rroialign::kBox);
295 CHECK_EQ(bshape.ndim(), 2U) << "bbox should be a 2D tensor of shape [batch, 6]";
296 CHECK_EQ(bshape[1], 6U) << "bbox should be a 2D tensor of shape [batch, 6]";
297 // out: [num_rois, c, pooled_h, pooled_w]
298 out_shape->clear();
299 out_shape->push_back(Shape4(bshape[0], dshape[1], param.pooled_size[0], param.pooled_size[1]));
300 return true;
301 })
302 .set_attr<nnvm::FInferType>("FInferType", [](const nnvm::NodeAttrs& attrs,
__anon3c73233e0402(const nnvm::NodeAttrs& attrs, std::vector<int> *in_type, std::vector<int> *out_type) 303 std::vector<int> *in_type, std::vector<int> *out_type) {
304 CHECK_EQ(in_type->size(), 2U);
305 int dtype = (*in_type)[0];
306 CHECK_EQ(dtype, (*in_type)[1]);
307 CHECK_NE(dtype, -1) << "Input must have specified type";
308
309 out_type->clear();
310 out_type->push_back(dtype);
311 return true;
312 })
313 .set_attr<FCompute>("FCompute<cpu>", RROIAlignForwardCompute<cpu>)
314 .add_argument("data", "NDArray-or-Symbol", "Input data to the pooling operator, a 4D Feature maps")
315 .add_argument("rois", "NDArray-or-Symbol", "Bounding box coordinates, a 2D array")
316 .add_arguments(RROIAlignParam::__FIELDS__());
317
318 NNVM_REGISTER_OP(_backward_RROIAlign)
319 .set_num_outputs(2)
320 .set_attr<nnvm::TIsBackward>("TIsBackward", true)
321 .set_attr_parser(ParamParser<RROIAlignParam>)
322 .set_attr<FCompute>("FCompute<cpu>", RROIAlignBackwardCompute<cpu>);
323
324 } // namespace op
325 } // namespace mxnet
326