/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
/*!
 * \file bilinear_resize.cc
 * \brief bilinear resize operator
 * \author Hang Zhang
*/
24 #include "bilinear_resize-inl.h"
25 #include "../elemwise_op_common.h"
26 
27 namespace mxnet {
28 namespace op {
29 
30 using namespace mshadow;
31 
32 template<typename xpu, typename DType, typename AccReal>
SpatialUpSamplingBilinearUpdateOutput(mshadow::Stream<cpu> * s,const std::vector<TBlob> & input,const std::vector<TBlob> & output,bool align_corners)33 void SpatialUpSamplingBilinearUpdateOutput(mshadow::Stream<cpu> *s,
34                                            const std::vector<TBlob> &input,
35                                            const std::vector<TBlob> &output,
36                                            bool align_corners) {
37   Tensor<xpu, 4, DType> itensor = input[0].get<xpu, 4, DType>(s);
38   Tensor<xpu, 4, DType> otensor = output[0].get<xpu, 4, DType>(s);
39   int nbatch = otensor.size(0);
40   int channels = otensor.size(1);
41   int outputHeight = otensor.size(2);
42   int outputWidth = otensor.size(3);
43   int inputHeight = itensor.size(2);
44   int inputWidth = itensor.size(3);
45 
46   const auto nthreads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
47 
48   DType *idata = itensor.dptr_;
49   DType *odata = otensor.dptr_;
50   channels = nbatch * channels;
51   const int input_elems_per_channel = inputWidth * inputHeight;
52   const int output_elems_per_channel = outputWidth * outputHeight;
53 
54   // special case: just copy
55   if (inputHeight == outputHeight && inputWidth == outputWidth) {
56 #pragma omp parallel for num_threads(nthreads)
57     for (int index = 0; index < output_elems_per_channel; index++) {
58       const int h2 = index / outputWidth;
59       const int h1 = h2;
60       const int w2 = index % outputWidth;
61       const int w1 = w2;
62       const DType* pos1 = &idata[h1 * inputWidth + w1];
63       DType* pos2 = &odata[index];
64       for (int c = 0; c < channels; ++c) {
65         *pos2 = *pos1;
66         pos1 += input_elems_per_channel;
67         pos2 += output_elems_per_channel;
68       }
69     }
70     return;
71   }
72   const float rheight = area_pixel_compute_scale<float>(
73     inputHeight, outputHeight, align_corners);
74   const float rwidth = area_pixel_compute_scale<float>(
75     inputWidth, outputWidth, align_corners);
76 
77 #pragma omp parallel for num_threads(nthreads)
78   for (int index = 0; index < output_elems_per_channel; index++) {
79     const int h2 = index / outputWidth;
80     const int w2 = index % outputWidth;
81 
82   const float h1r = area_pixel_compute_source_index<float>(
83     rheight, h2, align_corners, false);
84     const int h1 = h1r;
85     const int h1p = (h1 < inputHeight - 1) ? 1 : 0;
86     const DType h1lambda = h1r - h1;
87     const DType h0lambda = (DType)1. - h1lambda;
88 
89   const float w1r = area_pixel_compute_source_index<float>(
90     rwidth, w2, align_corners, false);
91     const int w1 = w1r;
92     const int w1p = (w1 < inputWidth - 1) ? 1 : 0;
93     const DType w1lambda = w1r - w1;
94     const DType w0lambda = (DType)1. - w1lambda;
95     const DType* pos1 = &idata[h1 * inputWidth + w1];
96     DType* pos2 = &odata[index];
97 
98     for (int c = 0; c < channels; ++c) {
99       *pos2 = h0lambda * (w0lambda * (*pos1) + w1lambda * *(pos1 + w1p))
100             + h1lambda * (w0lambda * *(pos1 + h1p * inputWidth)
101             + w1lambda * *(pos1 + h1p * inputWidth + w1p));
102       pos1 += input_elems_per_channel;
103       pos2 += output_elems_per_channel;
104     }
105   }
106 }
107 
108 template<typename xpu, typename DType, typename AccReal>
SpatialUpSamplingBilinearUpdateGradInput(mshadow::Stream<cpu> * s,const std::vector<TBlob> & input,const std::vector<TBlob> & output,bool modeLike,bool align_corners)109 void SpatialUpSamplingBilinearUpdateGradInput(mshadow::Stream<cpu> *s,
110                                               const std::vector<TBlob> &input,
111                                               const std::vector<TBlob> &output,
112                                               bool modeLike,
113                                               bool align_corners) {
114   Tensor<xpu, 4, DType> gradOutput = input[0].get<xpu, 4, DType>(s);
115   Tensor<xpu, 4, DType> gradInput = output[0].get<xpu, 4, DType>(s);
116 
117   int nbatch = gradInput.size(0);
118   int channels = gradInput.size(1);
119   int outputHeight = gradOutput.size(2);
120   int outputWidth = gradOutput.size(3);
121   int inputHeight = gradInput.size(2);
122   int inputWidth = gradInput.size(3);
123 
124   const auto nthreads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
125 
126   DType *dataInput = gradInput.dptr_;
127   DType *dataOutput = gradOutput.dptr_;
128   channels = nbatch * channels;
129   const int input_elems_per_channel = inputWidth * inputHeight;
130   const int output_elems_per_channel = outputWidth * outputHeight;
131 
132   // special case: same-size matching grids
133   if (inputHeight == outputHeight && inputWidth == outputWidth) {
134 #pragma omp parallel for num_threads(nthreads)
135     for (int index = 0; index < output_elems_per_channel; index++) {
136       const int h2 = index / outputWidth;
137       const int h1 = h2;
138       const int w2 = index % outputWidth;
139       const int w1 = w2;
140       DType* pos1 = &dataInput[h1 * inputWidth + w1];
141       const DType* pos2 = &dataOutput[index];
142       for (int c = 0; c < channels; ++c) {
143         *pos1 += *pos2;
144         pos1 += input_elems_per_channel;
145         pos2 += output_elems_per_channel;
146       }
147     }
148     return;
149   }
150   const float rheight = area_pixel_compute_scale<float>(
151     inputHeight, outputHeight, align_corners);
152   const float rwidth = area_pixel_compute_scale<float>(
153     inputWidth, outputWidth, align_corners);
154 #pragma omp parallel for num_threads(nthreads)
155   for (int index = 0; index < output_elems_per_channel; index++) {
156     const int h2 = index / outputWidth;
157     const int w2 = index % outputWidth;
158 
159     const float h1r = area_pixel_compute_source_index<float>(
160         rheight, h2, align_corners, false);
161     const int h1 = h1r;
162     const int h1p = (h1 < inputHeight - 1) ? 1 : 0;
163     const DType h1lambda = h1r - h1;
164     const DType h0lambda = (DType)1. - h1lambda;
165 
166     const float w1r = area_pixel_compute_source_index<float>(
167         rwidth, w2, align_corners, false);
168     const int w1 = w1r;
169     const int w1p = (w1 < inputWidth - 1) ? 1 : 0;
170     const DType w1lambda = w1r - w1;
171     const DType w0lambda = (DType)1. - w1lambda;
172 
173     DType* posInput = &dataInput[h1 * inputWidth + w1];
174     const DType* posOutput = &dataOutput[index];
175     for (int c = 0; c < channels; ++c) {
176       #pragma omp critical
177       {
178         *posInput += h0lambda * w0lambda * (*posOutput);
179         *(posInput + w1p) += h0lambda * w1lambda * (*posOutput);
180         *(posInput + h1p * inputWidth) += h1lambda * w0lambda * (*posOutput);
181         *(posInput + h1p * inputWidth + w1p) += h1lambda * w1lambda * (*posOutput);
182       }
183       posInput += input_elems_per_channel;
184       posOutput += output_elems_per_channel;
185     }
186   }
187 
188   if (modeLike) {
189     Tensor<xpu, 4, DType> gradInputLike = output[1].get<xpu, 4, DType>(s);
190     int inputHeightLike = gradInputLike.size(2);
191     int inputWidthLike = gradInputLike.size(3);
192     DType *dataInputLike = gradInputLike.dptr_;
193     int channelsLike = nbatch * gradInputLike.size(1);
194 
195     const int inputLike_elems_per_channel = inputHeightLike * inputWidthLike;
196 #pragma omp parallel for num_threads(nthreads)
197     for (int index = 0; index < inputLike_elems_per_channel; index++) {
198       DType *posInput = &dataInputLike[index];
199       for (int c = 0; c < channelsLike; ++c) {
200         *posInput = 0;
201         posInput += inputLike_elems_per_channel;
202       }
203     }
204   }
205 }
206 
207 DMLC_REGISTER_PARAMETER(BilinearSampleParam);
208 
209 NNVM_REGISTER_OP(_contrib_BilinearResize2D)
210 .describe(R"code(
211 Perform 2D resizing (upsampling or downsampling) for 4D input using bilinear interpolation.
212 
213 Expected input is a 4 dimensional NDArray (NCHW) and the output
214 with the shape of (N x C x height x width).
215 The key idea of bilinear interpolation is to perform linear interpolation
216 first in one direction, and then again in the other direction. See the wikipedia of
217 `Bilinear interpolation  <https://en.wikipedia.org/wiki/Bilinear_interpolation>`_
218 for more details.
219 )code" ADD_FILELINE)
220 .set_attr_parser(ParamParser<BilinearSampleParam>)
221 .set_num_inputs(BilinearSampleOpNumInputs)
222 .set_num_outputs(1)
223 .set_attr<nnvm::FListInputNames>("FListInputNames", BilinearSampleOpInputNames)
224 .set_attr<mxnet::FInferShape>("FInferShape", BilinearSampleOpInferShape)
225 .set_attr<FCompute>("FCompute<cpu>", BilinearSampleOpForward<cpu>)
226 .set_attr<nnvm::FGradient>("FGradient",
227   ElemwiseGradUseNone{"_backward_contrib_BilinearResize2D"})
228 .add_argument("data", "NDArray-or-Symbol", "Input data")
229 .add_argument("like", "NDArray-or-Symbol", "Resize data to it's shape")
230 .add_arguments(BilinearSampleParam::__FIELDS__());
231 
232 NNVM_REGISTER_OP(_backward_contrib_BilinearResize2D)
233 .set_attr_parser(ParamParser<BilinearSampleParam>)
234 .set_num_inputs(1)
235 .set_num_outputs(BilinearSampleOpNumBackwardOutputs)
236 .set_attr<nnvm::TIsBackward>("TIsBackward", true)
237 .set_attr<FCompute>("FCompute<cpu>", BilinearSampleOpBackward<cpu>);
238 
239 
240 }  // namespace op
241 }  // namespace mxnet
242