/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*!
 * \file np_unique_op.cc
 */

#include "./np_unique_op.h"

namespace mxnet {
namespace op {

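// Type inference: the values output shares the input's dtype (propagated in both
// directions), while every optional output (indices, inverse, counts) is int64.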
inline bool NumpyUniqueType(const nnvm::NodeAttrs& attrs,
                            std::vector<int> *in_attrs,
                            std::vector<int> *out_attrs) {
  CHECK_EQ(in_attrs->size(), 1U);
  TYPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0));
  TYPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0));
  for (size_t i = 1; i < out_attrs->size(); ++i) {
    TYPE_ASSIGN_CHECK(*out_attrs, i, mshadow::kInt64);
  }
  return out_attrs->at(0) != -1;
}

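// Storage inference: only dense (default) storage is accepted on the input, all
// outputs are dense, and the op dispatches through FComputeEx so the forward pass
// can set the output shapes at runtime once the number of unique elements is known.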
inline bool NumpyUniqueStorageType(const nnvm::NodeAttrs& attrs,
                                   const int dev_mask,
                                   DispatchMode* dispatch_mode,
                                   std::vector<int> *in_attrs,
                                   std::vector<int> *out_attrs) {
  CHECK_EQ(in_attrs->size(), 1U);
  // CHECK_EQ(out_attrs->size(), 1U);
  for (int &attr : *in_attrs) {
    CHECK_EQ(attr, kDefaultStorage) << "Only default storage is supported";
  }
  for (int &attr : *out_attrs) {
    attr = kDefaultStorage;
  }
  *dispatch_mode = DispatchMode::kFComputeEx;
  return true;
}

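// Gather kernel: row i of out_data is copied from row idx[i] of in_data, i.e. it
// materializes the sorted view of the input given an argsort permutation.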
struct UniqueComputeAuxCPUKernel {
  // assume that idx has been flattened to a 1-D tensor (N,)
  // assume that out_data and in_data have been flattened to 2-D tensors, (N, M) and (K, M)
  // M is the number of columns of in_data and out_data
  // i is the index of out_data
  template<typename DType>
  MSHADOW_XINLINE static void Map(dim_t i, DType* out_data, const DType* in_data,
                                  const dim_t* idx, const dim_t M) {
    dim_t j = idx[i];
    std::memcpy(out_data + i * M, in_data + j * M, M * sizeof(DType));
  }
};

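// Mask kernel: out_data[i] is 1 when row i of the (already sorted) input differs
// from row i - 1, so the mask marks the first occurrence of each unique row.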
struct UniqueComputeMaskCPUKernel {
  template<typename DType>
  MSHADOW_XINLINE static void Map(dim_t i,
                                  dim_t* out_data,
                                  const DType* in_data,
                                  const dim_t numel) {
    if (i == 0) {
      out_data[i] = 1;
    } else {
      out_data[i] = (std::memcmp(in_data + i * numel,
                     in_data + (i - 1) * numel, numel * sizeof(DType)) == 0) ? 0 : 1;
    }
  }
};

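// np.unique without an axis: the input is treated as a flattened 1-D array.
// When any optional output is requested, the data is argsorted, gathered into
// sorted order, first occurrences are marked, and a prefix sum over the mask
// compacts the unique values and drives the index/inverse/counts outputs;
// otherwise a std::set is enough to produce the sorted unique values.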
void NumpyUniqueCPUNoneAxisImpl(const NumpyUniqueParam& param,
                                const OpContext &ctx,
                                const std::vector<NDArray> &inputs,
                                const std::vector<OpReqType> &req,
                                const std::vector<NDArray> &outputs) {
  MSHADOW_TYPE_SWITCH(outputs[0].dtype(), DType, {
    mshadow::Stream<cpu> *stream = ctx.get_stream<cpu>();

    DType* input_data = inputs[0].data().dptr<DType>();
    dim_t input_size = inputs[0].shape().Size();
    if (param.return_index || param.return_inverse || param.return_counts) {
      // argsort, result in perm
      std::vector<dim_t> perm(input_size);
      std::iota(perm.begin(), perm.end(), 0);
      std::stable_sort(perm.begin(), perm.end(),
        [&input_data] (dim_t i1, dim_t i2) {return input_data[i1] < input_data[i2];});
      // sorted data in aux
      std::vector<DType> aux(input_size);
      mxnet_op::Kernel<UniqueComputeAuxCPUKernel, cpu>::Launch(
        stream, input_size, aux.data(), input_data, perm.data(), 1);
      // calculate unique mask
      std::vector<dim_t> mask(input_size);
      mxnet_op::Kernel<UniqueComputeMaskCPUKernel, cpu>::Launch(
        stream, input_size, mask.data(), aux.data(), 1);
      // calculate prefix sum
      std::vector<int32_t> prefix_sum(input_size, 0);
      int32_t valid_num = 0;
      for (dim_t i = 0; i < input_size; i++) {
        prefix_sum[i] = (i == 0) ? 0 : prefix_sum[i - 1];
        prefix_sum[i] += (mask[i]) ? 1 : 0;
      }
      valid_num = prefix_sum[input_size - 1];
      // set the output shape forcefully
      mxnet::TShape s(1, valid_num);
      const_cast<NDArray &>(outputs[0]).Init(s);
      // launch kernel to obtain unique array, reuse boolean_mask kernel
      mxnet_op::Kernel<BooleanMaskForwardCPUKernel, cpu>::Launch(
        stream, input_size, outputs[0].data().dptr<DType>(), aux.data(),
        prefix_sum.data(), 1);
      // handle other optional outputs
      int output_flag = 0;
      if (param.return_index) {
        output_flag += 1;
        const_cast<NDArray &>(outputs[output_flag]).Init(s);
        dim_t* unique_indices = outputs[output_flag].data().dptr<dim_t>();
        // reuse boolean_mask kernel
        mxnet_op::Kernel<BooleanMaskForwardCPUKernel, cpu>::Launch(
          stream, input_size, unique_indices, perm.data(),
          prefix_sum.data(), 1);
      }
      if (param.return_inverse) {
        output_flag += 1;
        const_cast<NDArray &>(outputs[output_flag]).Init(mxnet::TShape(1, input_size));
        dim_t* unique_inverse = outputs[output_flag].data().dptr<dim_t>();
        mxnet_op::Kernel<UniqueReturnInverseKernel, cpu>::Launch(
          stream, input_size, unique_inverse, prefix_sum.data(), perm.data());
      }
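      // return_counts: idx records the offset of the first element of every unique
      // run in the sorted data, with input_size appended as a sentinel, so the
      // counts are the differences between adjacent boundaries.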
      if (param.return_counts) {
        output_flag += 1;
        std::vector<dim_t> idx(valid_num + 1);
        auto iter = idx.begin();
        for (dim_t i = 0; i < input_size; ++i) {
          if (mask[i]) {
            *iter = i;
            ++iter;
          }
        }
        *iter = input_size;
        const_cast<NDArray &>(outputs[output_flag]).Init(s);
        dim_t* unique_counts = outputs[output_flag].data().dptr<dim_t>();
        mxnet_op::Kernel<UniqueReturnCountsKernel, cpu>::Launch(
          stream, valid_num, unique_counts, idx.data());
      }
    } else {
      std::set<DType> set(input_data, input_data + input_size);
      mxnet::TShape s(1, set.size());
      const_cast<NDArray &>(outputs[0]).Init(s);
      std::copy(set.begin(), set.end(), outputs[0].data().dptr<DType>());
    }
  });
}

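// np.unique along a given axis: the requested axis is moved to the front
// (swapaxis on a FlatTo3D view), every slice along that axis is flattened to a
// row of `numel` elements, and rows are deduplicated with a lexicographic stable
// argsort followed by the same mask/prefix-sum compaction used in the no-axis
// path. The requested temp workspace holds both the transposed input and its
// sorted copy.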
void NumpyUniqueCPUImpl(const NumpyUniqueParam& param,
                        const OpContext &ctx,
                        const std::vector<NDArray> &inputs,
                        const std::vector<OpReqType> &req,
                        const std::vector<NDArray> &outputs) {
  CHECK(param.axis.value() >= -1 * inputs[0].shape().ndim() &&
      param.axis.value() < inputs[0].shape().ndim())
      << "Axis should be in the range of [-r, r-1] where r is the rank of input tensor";
  MSHADOW_TYPE_SWITCH(outputs[0].dtype(), DType, {
    using namespace mshadow;
    using namespace mshadow::expr;
    Stream<cpu> *stream = ctx.get_stream<cpu>();
    const index_t actual_axis =
        param.axis.value() + ((param.axis.value() < 0) ? inputs[0].shape().ndim() : 0);
    // reshape tensor to [origin_shape[axis], -1]
    const mxnet::TShape origin_shape = inputs[0].shape();
    Tensor<cpu, 3, DType> input_tensor_3d =
        inputs[0].data().FlatTo3D<cpu, DType>(actual_axis, stream);
    Tensor<cpu, 1, DType> workspace = ctx.requested[0].get_space_typed<cpu, 1, DType>(
        Shape1(input_tensor_3d.shape_.Size() * 2), stream);
    Tensor<cpu, 3, DType> input_tensor(workspace.dptr_, Shape3(
        input_tensor_3d.shape_[1], input_tensor_3d.shape_[0], input_tensor_3d.shape_[2]), stream);
    input_tensor = swapaxis<1, 0>(input_tensor_3d);
    const Shape<3> temp_shape = input_tensor.shape_;
    DType* input_data = input_tensor.dptr_;
    dim_t numel = temp_shape[1] * temp_shape[2];
    // argsort, result in perm
    std::vector<dim_t> perm(temp_shape[0]);
    std::iota(perm.begin(), perm.end(), 0);
    std::stable_sort(perm.begin(), perm.end(),
      [&](dim_t a, dim_t b) -> bool {
        for (dim_t i = 0; i < numel; ++i) {
          DType lhs = input_data[i + a * numel];
          DType rhs = input_data[i + b * numel];
          if (lhs < rhs) {
            return true;
          } else if (lhs > rhs) {
            return false;
          }
        }
        return false;
      });
    // sorted data in aux
    Tensor<cpu, 2, DType> aux(workspace.dptr_ + input_tensor_3d.shape_.Size(),
        Shape2(temp_shape[0], temp_shape[1] * temp_shape[2]), stream);
    mxnet_op::Kernel<UniqueComputeAuxCPUKernel, cpu>::Launch(
      stream, temp_shape[0], aux.dptr_, input_data, perm.data(), numel);
    // calculate unique mask
    std::vector<dim_t> mask(temp_shape[0]);
    mxnet_op::Kernel<UniqueComputeMaskCPUKernel, cpu>::Launch(
      stream, temp_shape[0], mask.data(), aux.dptr_, numel);
    // calculate prefix sum
    std::vector<int32_t> prefix_sum(temp_shape[0], 0);
    int32_t valid_num = 0;
    for (dim_t i = 0; i < temp_shape[0]; i++) {
      prefix_sum[i] = (i == 0) ? 0 : prefix_sum[i - 1];
      prefix_sum[i] += (mask[i]) ? 1 : 0;
    }
    valid_num = prefix_sum[temp_shape[0] - 1];
    // store the temp output data, reuse the space of 'input_tensor'
    Tensor<cpu, 3, DType> temp_tensor(workspace.dptr_,
        Shape3(valid_num, temp_shape[1], temp_shape[2]), stream);
    // launch kernel to obtain unique array, reuse boolean_mask kernel
    mxnet_op::Kernel<BooleanMaskForwardCPUKernel, cpu>::Launch(
      stream, temp_shape[0], temp_tensor.dptr_, aux.dptr_,
      prefix_sum.data(), numel);
    // set the output shape forcefully and swap axis back
    mxnet::TShape out_shape(origin_shape);
    out_shape[actual_axis] = valid_num;
    const_cast<NDArray &>(outputs[0]).Init(out_shape);
    Tensor<cpu, 3, DType> output_tensor(outputs[0].data().dptr<DType>(),
        Shape3(temp_shape[1], valid_num, temp_shape[2]), stream);
    output_tensor = swapaxis<1, 0>(temp_tensor);
    // handle other optional outputs
    int output_flag = 0;
    if (param.return_index) {
      output_flag += 1;
      const_cast<NDArray &>(outputs[output_flag]).Init(mxnet::TShape(1, valid_num));
      dim_t* unique_indices = outputs[output_flag].data().dptr<dim_t>();
      // reuse boolean_mask kernel
      mxnet_op::Kernel<BooleanMaskForwardCPUKernel, cpu>::Launch(
        stream, temp_shape[0], unique_indices, perm.data(),
        prefix_sum.data(), 1);
    }
    if (param.return_inverse) {
      output_flag += 1;
      const_cast<NDArray &>(outputs[output_flag]).Init(mxnet::TShape(1, temp_shape[0]));
      dim_t* unique_inverse = outputs[output_flag].data().dptr<dim_t>();
      mxnet_op::Kernel<UniqueReturnInverseKernel, cpu>::Launch(
        stream, temp_shape[0], unique_inverse, prefix_sum.data(), perm.data());
    }
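    // return_counts along the axis path: same boundary trick as in the no-axis
    // implementation, but over whole rows (idx holds the run starts in the
    // row-sorted order plus a sentinel of temp_shape[0]).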
    if (param.return_counts) {
      output_flag += 1;
      std::vector<dim_t> idx(valid_num + 1);
      auto iter = idx.begin();
      for (dim_t i = 0; i < temp_shape[0]; ++i) {
        if (mask[i]) {
          *iter = i;
          ++iter;
        }
      }
      *iter = temp_shape[0];
      const_cast<NDArray &>(outputs[output_flag]).Init(mxnet::TShape(1, valid_num));
      dim_t* unique_counts = outputs[output_flag].data().dptr<dim_t>();
      mxnet_op::Kernel<UniqueReturnCountsKernel, cpu>::Launch(
          stream, valid_num, unique_counts, idx.data());
    }
  });
}

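// Forward entry point. Scalar (0-dim) and zero-size inputs are handled here by
// initializing fixed-shape outputs directly; otherwise dispatch to the
// axis/no-axis implementations above.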
void NumpyUniqueCPUForward(const nnvm::NodeAttrs& attrs,
                           const OpContext &ctx,
                           const std::vector<NDArray> &inputs,
                           const std::vector<OpReqType> &req,
                           const std::vector<NDArray> &outputs) {
  CHECK_EQ(inputs.size(), 1U);
  CHECK(req[0] == kWriteTo || req[0] == kWriteInplace);
  using namespace mshadow;
  const NumpyUniqueParam& param = nnvm::get<NumpyUniqueParam>(attrs.parsed);
  if (inputs[0].shape().ndim() == 0) {
    CHECK(!param.axis.has_value() || param.axis.value() == -1 || param.axis.value() == 0)
      << "Axis can only be -1 or 0 for scalar tensor";
    Stream<cpu> *s = ctx.get_stream<cpu>();
    mxnet::TShape shape_1(1, 1);
    const_cast<NDArray &>(outputs[0]).Init(shape_1);
    MSHADOW_TYPE_SWITCH(outputs[0].dtype(), DType, {
      Tensor<cpu, 1, DType> unique_out =
          outputs[0].data().get_with_shape<cpu, 1, DType>(Shape1(1), s);
      ASSIGN_DISPATCH(unique_out, OpReqType::kWriteTo, inputs[0].data().dptr<DType>()[0]);
    });
    int output_flag = 0;
    if (param.return_index) {
      output_flag += 1;
      const_cast<NDArray &>(outputs[output_flag]).Init(shape_1);
      Tensor<cpu, 1, dim_t> outdata =
          outputs[output_flag].data().get_with_shape<cpu, 1, dim_t>(Shape1(1), s);
      ASSIGN_DISPATCH(outdata, OpReqType::kWriteTo, 0);
    }
    if (param.return_inverse) {
      output_flag += 1;
      const_cast<NDArray &>(outputs[output_flag]).Init(shape_1);
      Tensor<cpu, 1, dim_t> outdata =
          outputs[output_flag].data().get_with_shape<cpu, 1, dim_t>(Shape1(1), s);
      ASSIGN_DISPATCH(outdata, OpReqType::kWriteTo, 0);
    }
    if (param.return_counts) {
      output_flag += 1;
      const_cast<NDArray &>(outputs[output_flag]).Init(shape_1);
      Tensor<cpu, 1, dim_t> outdata =
          outputs[output_flag].data().get_with_shape<cpu, 1, dim_t>(Shape1(1), s);
      ASSIGN_DISPATCH(outdata, OpReqType::kWriteTo, 1);
    }
  } else if (inputs[0].shape().Size() == 0) {
    // If the input tensor is zero size, only a check on axis is needed
    if (param.axis.has_value()) {
      int axis = param.axis.value();
      if (axis < 0) axis += inputs[0].shape().ndim();
      CHECK(axis >= 0 && axis < inputs[0].shape().ndim())
        << "Axis must be within the range of input tensor's dimension";
    }
    // set the shapes of outputs
    mxnet::TShape shape_0(1, 0);
    const_cast<NDArray &>(outputs[0]).Init(shape_0);
    int output_flag = 0;
    if (param.return_index) {
      output_flag += 1;
      const_cast<NDArray &>(outputs[output_flag]).Init(shape_0);
    }
    if (param.return_inverse) {
      output_flag += 1;
      const_cast<NDArray &>(outputs[output_flag]).Init(shape_0);
    }
    if (param.return_counts) {
      output_flag += 1;
      const_cast<NDArray &>(outputs[output_flag]).Init(shape_0);
    }
  } else {
    if (!param.axis.has_value()) {
      NumpyUniqueCPUNoneAxisImpl(param, ctx, inputs, req, outputs);
    } else {
      NumpyUniqueCPUImpl(param, ctx, inputs, req, outputs);
    }
  }
}

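// Registration of _npi_unique. The number of outputs depends on the
// return_index/return_inverse/return_counts flags, temp space is requested for
// the axis path, and the gradient is defined as zero (MakeZeroGradNodes). This
// operator is assumed to back the mxnet.np.unique front-end.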
DMLC_REGISTER_PARAMETER(NumpyUniqueParam);

NNVM_REGISTER_OP(_npi_unique)
.set_attr_parser(ParamParser<NumpyUniqueParam>)
.set_num_inputs(1)
.set_num_outputs([](const NodeAttrs& attrs) {
    const NumpyUniqueParam& param = nnvm::get<NumpyUniqueParam>(attrs.parsed);
    int output_num = 1;
    if (param.return_index) output_num += 1;
    if (param.return_inverse) output_num += 1;
    if (param.return_counts) output_num += 1;
    return output_num;
  })
.set_attr<nnvm::FListInputNames>("FListInputNames",
  [](const NodeAttrs& attrs) {
    return std::vector<std::string>{"data"};
  })
.set_attr<nnvm::FInferType>("FInferType", NumpyUniqueType)
.set_attr<FComputeEx>("FComputeEx<cpu>", NumpyUniqueCPUForward)
.set_attr<FInferStorageType>("FInferStorageType", NumpyUniqueStorageType)
.set_attr<FResourceRequest>("FResourceRequest",
  [](const NodeAttrs& attrs) {
    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
  })
.set_attr<nnvm::FGradient>("FGradient", MakeZeroGradNodes)
.add_argument("data", "NDArray-or-Symbol", "The input array")
.add_arguments(NumpyUniqueParam::__FIELDS__());

}  // namespace op
}  // namespace mxnet