1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements.  See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership.  The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License.  You may obtain a copy of the License at
9  *
10  *   http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing,
13  * software distributed under the License is distributed on an
14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15  * KIND, either express or implied.  See the License for the
16  * specific language governing permissions and limitations
17  * under the License.
18  */
19 
20 /*!
21  * \file rroi_align.cc
22  * \brief rroi align operator
23  * \author Yixin Bao
24  * Forward pass adapted from Caffe2
25  * link: https://github.com/pytorch/pytorch/blob/master/caffe2/operators/roi_align_rotated_op.cc
26  */
27 #include "./rroi_align-inl.h"
28 #include <mshadow/tensor.h>
29 #include "math.h"
30 
31 using std::max;
32 using std::min;
33 using std::floor;
34 using std::ceil;
35 
36 namespace mxnet {
37 namespace op {
38 
/*!
 * \brief One pre-computed bilinear sample.
 *
 * Holds the four flat offsets (row * width + column) of the feature-map
 * elements surrounding a sampling point together with the interpolation
 * weight applied to each of them.
 *
 * \tparam DType floating point type of the weights
 */
template <typename DType>
struct position_for_bilinear_interpolate {
  // flat offsets of the four neighbouring elements
  int pos1;
  int pos2;
  int pos3;
  int pos4;
  // interpolation weights, wK belongs to posK
  DType w1;
  DType w2;
  DType w3;
  DType w4;
};
46 
47 template <typename DType>
pre_calc_for_bilinear_interpolate(const int height,const int width,const int pooled_height,const int pooled_width,const int iy_upper,const int ix_upper,DType roi_start_h,DType roi_start_w,DType bin_size_h,DType bin_size_w,int roi_bin_grid_h,int roi_bin_grid_w,DType roi_center_h,DType roi_center_w,DType theta,std::vector<position_for_bilinear_interpolate<DType>> * pre_calc)48 void pre_calc_for_bilinear_interpolate(
49     const int height, const int width, const int pooled_height, const int pooled_width,
50     const int iy_upper, const int ix_upper, DType roi_start_h, DType roi_start_w,
51     DType bin_size_h, DType bin_size_w, int roi_bin_grid_h, int roi_bin_grid_w,
52     DType roi_center_h, DType roi_center_w, DType theta,
53     std::vector<position_for_bilinear_interpolate<DType>> *pre_calc) {
54   int pre_calc_index = 0;
55   DType cosTheta = cos(theta);
56   DType sinTheta = sin(theta);
57   for (int ph = 0; ph < pooled_height; ph++) {
58     for (int pw = 0; pw < pooled_width; pw++) {
59       // calc bin grid position (xx,yy)
60       for (int iy = 0; iy < iy_upper; iy++) {
61         const DType yy = roi_start_h + ph * bin_size_h +
62             static_cast<DType>(iy + .5f) * bin_size_h /
63                 static_cast<DType>(roi_bin_grid_h);  // e.g., 0.5, 1.5
64         for (int ix = 0; ix < ix_upper; ix++) {
65           const DType xx = roi_start_w + pw * bin_size_w +
66               static_cast<DType>(ix + .5f) * bin_size_w /
67                   static_cast<DType>(roi_bin_grid_w);
68 
69           // Rotate by theta around the center and translate
70           DType x = xx * cosTheta + yy * sinTheta + roi_center_w;
71           DType y = yy * cosTheta - xx * sinTheta + roi_center_h;
72 
73           // deal with: inverse elements are out of feature map boundary
74           if (y < -1.0 || y > height || x < -1.0 || x > width) {
75             // empty
76             position_for_bilinear_interpolate<DType> &pc = (*pre_calc)[pre_calc_index];
77             pc.pos1 = 0;
78             pc.pos2 = 0;
79             pc.pos3 = 0;
80             pc.pos4 = 0;
81             pc.w1 = 0;
82             pc.w2 = 0;
83             pc.w3 = 0;
84             pc.w4 = 0;
85             pre_calc_index += 1;
86             continue;
87           }
88           if (y <= 0) {
89             y = 0;
90           }
91           if (x <= 0) {
92             x = 0;
93           }
94 
95           // calc 4 points for interpolation
96           int y_low = static_cast<int>(y);
97           int x_low = static_cast<int>(x);
98           int y_high;
99           int x_high;
100           if (y_low >= height - 1) {
101             y_high = y_low = height - 1;
102             y = (DType)y_low;
103           } else {
104             y_high = y_low + 1;
105           }
106           if (x_low >= width - 1) {
107             x_high = x_low = width - 1;
108             x = (DType)x_low;
109           } else {
110             x_high = x_low + 1;
111           }
112           DType ly = y - y_low;
113           DType lx = x - x_low;
114           DType hy = 1. - ly, hx = 1. - lx;
115           DType w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
116 
117           // Save weights and indices
118           position_for_bilinear_interpolate<DType> &pc = (*pre_calc)[pre_calc_index];
119           pc.pos1 = y_low * width + x_low;
120           pc.pos2 = y_low * width + x_high;
121           pc.pos3 = y_high * width + x_low;
122           pc.pos4 = y_high * width + x_high;
123           pc.w1 = w1;
124           pc.w2 = w2;
125           pc.w3 = w3;
126           pc.w4 = w4;
127           pre_calc_index += 1;
128         }
129       }
130     }
131   }
132 }
133 
134 template <typename DType>
RROIAlignForward(const OpContext & ctx,const RROIAlignParam & param,const std::vector<TBlob> & in_data,const std::vector<OpReqType> & req,const std::vector<TBlob> & out_data)135 inline void RROIAlignForward(const OpContext &ctx, const RROIAlignParam &param,
136                              const std::vector<TBlob> &in_data, const std::vector<OpReqType> &req,
137                              const std::vector<TBlob> &out_data) {
138   // data: [batch_size, c, h, w]
139   const TBlob &data = in_data[rroialign::kData];
140   const TBlob &bbox = in_data[rroialign::kBox];
141   const DType *bottom_data = data.dptr<DType>();
142   const int channels_ = data.size(1);
143   const int height_ = data.size(2);
144   const int width_ = data.size(3);
145   const index_t data_size_c = height_ * width_;
146   const index_t data_size = channels_ * data_size_c;
147 
148   // bbox: [num_rois, 6] (6: [batch_index, x, y, w, h, theta])
149   const DType *bottom_rois = bbox.dptr<DType>();
150   const int num_rois = bbox.size(0);
151   const float spatial_scale_ = param.spatial_scale;
152   const int sampling_ratio_ = param.sampling_ratio;
153 
154   // out: [num_rois, c, pooled_h, pooled_w]
155   const TBlob &out = out_data[rroialign::kOut];
156   DType *top_data = out.dptr<DType>();
157   const int pooled_height_ = out.size(2);
158   const int pooled_width_ = out.size(3);
159   const index_t out_size_c = pooled_height_ * pooled_width_;
160   const index_t out_size = channels_ * out_size_c;
161 
162   // (n, c, ph, pw) is an element in the pooled output
163   // can be parallelized using omp
164 #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
165   for (int n = 0; n < num_rois; ++n) {
166     // Increment ROI data pointer
167     const DType *bottom_rois_n = bottom_rois + n * bbox.size(1);
168     DType *top_data_n = top_data + n * out_size;
169     int roi_batch_ind = static_cast<int>(bottom_rois_n[0]);
170     DType roi_center_w = bottom_rois_n[1] * spatial_scale_;
171     DType roi_center_h = bottom_rois_n[2] * spatial_scale_;
172     DType roi_width = bottom_rois_n[3] * spatial_scale_;
173     DType roi_height = bottom_rois_n[4] * spatial_scale_;
174     DType roi_theta = bottom_rois_n[5] * M_PI / 180.0;
175 
176     // force malformed ROIs to be 1 * 1
177     roi_width = max(roi_width, (DType) 1.);
178     roi_height = max(roi_height, (DType) 1.);
179     // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y).
180     // Appropriate translation needs to be applied after.
181     DType roi_start_h = -roi_height / 2.0;
182     DType roi_start_w = -roi_width / 2.0;
183 
184     const DType bin_size_h = static_cast<DType>(roi_height) / static_cast<DType>(pooled_height_);
185     const DType bin_size_w = static_cast<DType>(roi_width) / static_cast<DType>(pooled_width_);
186     // We use roi_bin_grid to sample the grid and mimic integral,
187     // e.g. roi_bin_grid = 2, means sample 2*2=4 points in each bin
188     int roi_bin_grid_h =
189         (sampling_ratio_ > 0) ? sampling_ratio_ : ceil(roi_height / pooled_height_);
190     int roi_bin_grid_w = (sampling_ratio_ > 0) ? sampling_ratio_ : ceil(roi_width / pooled_width_);
191     const DType bin_points_count = roi_bin_grid_h * roi_bin_grid_w;  // e.g. = 4
192 
193     // We want to precalculate indices and weights shared by all channels,
194     // this is the key point of optimization.
195     std::vector<position_for_bilinear_interpolate<DType>> pre_calc(roi_bin_grid_h * roi_bin_grid_w *
196                                                                    pooled_width_ * pooled_height_);
197 
198     pre_calc_for_bilinear_interpolate(height_, width_, pooled_height_, pooled_width_,
199                                       roi_bin_grid_h, roi_bin_grid_w, roi_start_h, roi_start_w,
200                                       bin_size_h, bin_size_w, roi_bin_grid_h, roi_bin_grid_w,
201                                       roi_center_h, roi_center_w, roi_theta, &pre_calc);
202 
203     for (int c = 0; c < channels_; ++c) {
204       const DType *offset_bottom_data = bottom_data + roi_batch_ind * data_size + c * data_size_c;
205       int pre_calc_index = 0;
206 
207       for (int ph = 0; ph < pooled_height_; ph++) {
208         for (int pw = 0; pw < pooled_width_; pw++) {
209           DType output_val = 0.;
210           for (int iy = 0; iy < roi_bin_grid_h; iy++) {
211             for (int ix = 0; ix < roi_bin_grid_w; ix++) {
212               position_for_bilinear_interpolate<DType> pc = pre_calc[pre_calc_index];
213               output_val +=
214                   pc.w1 * offset_bottom_data[pc.pos1] + pc.w2 * offset_bottom_data[pc.pos2] +
215                   pc.w3 * offset_bottom_data[pc.pos3] + pc.w4 * offset_bottom_data[pc.pos4];
216 
217               pre_calc_index += 1;
218             }
219           }
220           output_val /= bin_points_count;  // avg pooling for bin grid
221           int index = c * pooled_height_ * pooled_width_ + ph * pooled_width_ + pw;
222           top_data_n[index] = output_val;
223         }   // for pw
224       }   // for ph
225     }   // for c
226   }   // for n
227 }
228 
229 template<typename xpu>
RROIAlignForwardCompute(const nnvm::NodeAttrs & attrs,const OpContext & ctx,const std::vector<TBlob> & in_data,const std::vector<OpReqType> & req,const std::vector<TBlob> & out_data)230 void RROIAlignForwardCompute(const nnvm::NodeAttrs& attrs,
231                       const OpContext& ctx, const std::vector<TBlob>& in_data,
232                       const std::vector<OpReqType>& req,
233                       const std::vector<TBlob>& out_data) {
234   const RROIAlignParam& param = nnvm::get<RROIAlignParam>(attrs.parsed);
235   CHECK_EQ(in_data.size(), 2);
236   CHECK_EQ(out_data.size(), 1);
237   CHECK_EQ(out_data[rroialign::kOut].shape_[0], in_data[rroialign::kBox].shape_[0]);
238 
239   MSHADOW_REAL_TYPE_SWITCH(in_data[0].type_flag_, DType, {
240     RROIAlignForward<DType>(ctx, param, in_data, req, out_data);
241   })
242 }
243 
244 template<typename xpu>
RROIAlignBackwardCompute(const nnvm::NodeAttrs & attrs,const OpContext & ctx,const std::vector<TBlob> & in_data,const std::vector<OpReqType> & req,const std::vector<TBlob> & out_data)245 void RROIAlignBackwardCompute(const nnvm::NodeAttrs& attrs,
246                       const OpContext& ctx, const std::vector<TBlob>& in_data,
247                       const std::vector<OpReqType>& req,
248                       const std::vector<TBlob>& out_data) {
249   LOG(FATAL) << "RROIAlign: Backward is not supported.";
250 }
251 
// Register the operator's parameter struct; its field list is attached to the
// operator through RROIAlignParam::__FIELDS__() at the end of the chain below.
DMLC_REGISTER_PARAMETER(RROIAlignParam);

// Forward operator registration: description, arity, input/output names,
// shape and type inference, and the CPU compute function.
NNVM_REGISTER_OP(_contrib_RROIAlign)
.describe(R"code(Performs Rotated ROI Align on the input array.

This operator takes a 4D feature map as an input array and region proposals as `rois`,
then align the feature map over sub-regions of input and produces a fixed-sized output array.

Different from ROI Align, RROI Align uses rotated rois, which is suitable for text detection.
RRoIAlign computes the value of each sampling point by bilinear interpolation from the nearby
grid points on the rotated feature map. No quantization is performed on any coordinates
involved in the RoI, its bins, or the sampling points. Bilinear interpolation is used to
compute the exact values of the input features at four regularly sampled locations in
each RoI bin. Then the feature map can be aggregated by avgpooling.

References
----------

Ma, Jianqi, et al. "Arbitrary-Oriented Scene Text Detection via Rotation Proposals."
IEEE Transactions on Multimedia, 2018.

)code" ADD_FILELINE)
.set_num_inputs(2)
.set_num_outputs(1)
// Inputs are exposed as "data" and "rois", in that order.
.set_attr<nnvm::FListInputNames>("FListInputNames",
    [](const NodeAttrs& attrs) {
  return std::vector<std::string>{"data", "rois"};
})
// The single output is exposed as "output".
.set_attr<nnvm::FListOutputNames>("FListOutputNames",
    [](const NodeAttrs& attrs) {
  return std::vector<std::string>{"output"};
})
.set_attr_parser(ParamParser<RROIAlignParam>)
// Shape inference: requires 4D data and 2D rois with 6 columns; the output is
// [rois rows, data channels, pooled_size[0], pooled_size[1]].
.set_attr<mxnet::FInferShape>("FInferShape", [](const nnvm::NodeAttrs& attrs,
      mxnet::ShapeVector *in_shape, mxnet::ShapeVector *out_shape){
  using namespace mshadow;
  const RROIAlignParam& param = nnvm::get<RROIAlignParam>(attrs.parsed);
  CHECK_EQ(in_shape->size(), 2U) << "Input:[data, rois]";
  // data: [batch_size, c, h, w]
  mxnet::TShape dshape = in_shape->at(rroialign::kData);
  CHECK_EQ(dshape.ndim(), 4U) << "data should be a 4D tensor";
  // bbox: [num_rois, 6]
  mxnet::TShape bshape = in_shape->at(rroialign::kBox);
  CHECK_EQ(bshape.ndim(), 2U) << "bbox should be a 2D tensor of shape [batch, 6]";
  CHECK_EQ(bshape[1], 6U) << "bbox should be a 2D tensor of shape [batch, 6]";
  // out: [num_rois, c, pooled_h, pooled_w]
  // any previously stored output shape is discarded and replaced
  out_shape->clear();
  out_shape->push_back(Shape4(bshape[0], dshape[1], param.pooled_size[0], param.pooled_size[1]));
  return true;
})
// Type inference: both inputs must share one known type, which the output inherits.
.set_attr<nnvm::FInferType>("FInferType", [](const nnvm::NodeAttrs& attrs,
      std::vector<int> *in_type, std::vector<int> *out_type) {
  CHECK_EQ(in_type->size(), 2U);
  int dtype = (*in_type)[0];
  // rois must have the same type as data
  CHECK_EQ(dtype, (*in_type)[1]);
  // -1 marks a type that has not been specified
  CHECK_NE(dtype, -1) << "Input must have specified type";

  out_type->clear();
  out_type->push_back(dtype);
  return true;
})
// Only a CPU implementation is registered in this file.
.set_attr<FCompute>("FCompute<cpu>", RROIAlignForwardCompute<cpu>)
.add_argument("data", "NDArray-or-Symbol", "Input data to the pooling operator, a 4D Feature maps")
.add_argument("rois", "NDArray-or-Symbol", "Bounding box coordinates, a 2D array")
.add_arguments(RROIAlignParam::__FIELDS__());
317 
// Backward operator registration: two outputs, flagged as a backward node,
// sharing the RROIAlignParam parser. Its CPU compute function is
// RROIAlignBackwardCompute<cpu>, which logs a fatal error when invoked.
NNVM_REGISTER_OP(_backward_RROIAlign)
.set_num_outputs(2)
.set_attr<nnvm::TIsBackward>("TIsBackward", true)
.set_attr_parser(ParamParser<RROIAlignParam>)
.set_attr<FCompute>("FCompute<cpu>", RROIAlignBackwardCompute<cpu>);
323 
324 }  // namespace op
325 }  // namespace mxnet
326