/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*!
 * \file tensor_gpu-inl.h
 * \brief implementation of GPU host code
 * \author Bing Xu, Tianqi Chen
 */
#ifndef MSHADOW_TENSOR_GPU_INL_H_
#define MSHADOW_TENSOR_GPU_INL_H_
#include "./base.h"
#include "./tensor.h"

namespace mshadow {
#if MSHADOW_USE_CUDA
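/*!
 * \brief initialize the GPU tensor engine on the given device;
 *        a negative dev_id falls back to device 0
 * \param dev_id the CUDA device id to bind to
 */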
template<>
inline void InitTensorEngine<gpu>(int dev_id) {
  cudaDeviceProp prop;
  int device_id = 0;
  int device_count = 0;
  cudaGetDeviceCount(&device_count);
  CHECK_GT(device_count, 0) << "Cannot find CUDA device. Please check CUDA configuration.";
  if (dev_id < 0) {
    device_id = 0;
  } else {
    device_id = dev_id;
  }
  CHECK_LT(device_id, device_count) << "Incorrect device id " << device_id;
  MSHADOW_CUDA_CALL(cudaSetDevice(device_id));
  MSHADOW_CUDA_CALL(cudaGetDeviceProperties(&prop, device_id));
}
template<>
inline void ShutdownTensorEngine<gpu>(void) {
}
template<>
inline void SetDevice<gpu>(int devid) {
  MSHADOW_CUDA_CALL(cudaSetDevice(devid));
}
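/*!
 * \brief GPU: allocate device memory for the tensor and set its stride_;
 *        when pad is true and the last dimension is wide enough,
 *        cudaMallocPitch aligns each row, so stride_ may exceed size(dim - 1)
 * \param obj the tensor object, with shape_ already specified
 * \param pad whether to pad the last dimension for aligned row access
 */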
template<int dim, typename DType>
inline void AllocSpace(Tensor<gpu, dim, DType> *obj, bool pad) {
  size_t pitch;
  // 32 is a common choice for the CUDA memory alignment unit
  if (pad && obj->size(dim - 1) >= MSHADOW_MIN_PAD_RATIO * 32) {
    MSHADOW_CUDA_CALL(cudaMallocPitch(reinterpret_cast<void**>(&(obj->dptr_)), &pitch,
                                      obj->size(dim - 1) * sizeof(DType),
                                      obj->shape_.FlatTo2D()[0]));
    obj->stride_ = static_cast<index_t>(pitch / sizeof(DType));
  } else {
    obj->stride_ = obj->size(dim - 1);
    MSHADOW_CUDA_CALL(cudaMallocPitch(reinterpret_cast<void**>(&(obj->dptr_)), &pitch,
                                      obj->shape_.Size() * sizeof(DType), 1));
  }
}
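/*!
 * \brief GPU: free the device memory of the tensor and reset dptr_
 * \param obj the tensor whose space is released
 */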
template<int dim, typename DType>
inline void FreeSpace(Tensor<gpu, dim, DType> *obj) {
  MSHADOW_CUDA_CALL(cudaFree(obj->dptr_));
  obj->dptr_ = NULL;
}
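/*!
 * \brief copy tensor contents with an explicit cudaMemcpyKind;
 *        both sides are flattened to 2D so their (possibly padded)
 *        strides are honored row by row
 * \param _dst destination tensor, same shape as _src
 * \param _src source tensor
 * \param kind direction of the transfer
 * \param stream stream to enqueue the copy on; NULL synchronizes
 *        on the default stream before returning
 */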
template<typename A, typename B, int dim, typename DType>
inline void Copy(Tensor<A, dim, DType> _dst,
                 Tensor<B, dim, DType> _src,
                 cudaMemcpyKind kind,
                 Stream<gpu> *stream) {
  CHECK_EQ(_dst.shape_, _src.shape_) << "Copy: shape mismatch";
  Tensor<A, 2, DType> dst = _dst.FlatTo2D();
  Tensor<B, 2, DType> src = _src.FlatTo2D();
  MSHADOW_CUDA_CALL(cudaMemcpy2DAsync(dst.dptr_, dst.stride_ * sizeof(DType),
                                      src.dptr_, src.stride_ * sizeof(DType),
                                      dst.size(1) * sizeof(DType),
                                      dst.size(0), kind,
                                      Stream<gpu>::GetStream(stream)));
  // use synchronous call behavior on the default (zero) stream
  if (stream == NULL) {
    MSHADOW_CUDA_CALL(cudaStreamSynchronize(0));
  }
}
template<int dim, typename DType>
inline void Copy(Tensor<cpu, dim, DType> dst,
                 const Tensor<gpu, dim, DType> &src,
                 Stream<gpu> *stream) {
  Copy(dst, src, cudaMemcpyDeviceToHost, stream);
}
template<int dim, typename DType>
inline void Copy(Tensor<gpu, dim, DType> dst,
                 const Tensor<gpu, dim, DType> &src,
                 Stream<gpu> *stream) {
  Copy(dst, src, cudaMemcpyDeviceToDevice, stream);
}
template<int dim, typename DType>
inline void Copy(Tensor<gpu, dim, DType> dst,
                 const Tensor<cpu, dim, DType> &src,
                 Stream<gpu> *stream) {
  Copy(dst, src, cudaMemcpyHostToDevice, stream);
}
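// A minimal usage sketch of the copy helpers above (a hypothetical
// example, assuming InitTensorEngine<gpu>(0) has been called and the
// default pad/stream arguments declared in tensor.h are in effect):
//
//   Tensor<cpu, 2, float> host = NewTensor<cpu>(Shape2(4, 8), 1.0f);
//   Tensor<gpu, 2, float> dev(host.shape_);
//   AllocSpace(&dev);
//   Copy(dev, host);   // host -> device; NULL stream, so the call blocks
//   Copy(host, dev);   // device -> host
//   FreeSpace(&dev);
//   FreeSpace(&host);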
#endif  // MSHADOW_USE_CUDA
}  // namespace mshadow

// the following part is included only when compiling with nvcc
#ifdef __CUDACC__
#include "./cuda/tensor_gpu-inl.cuh"

namespace mshadow {
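/*!
 * \brief GPU: evaluate expression exp element-wise and store the result
 *        into dst with the Saver policy (e.g. assign or accumulate);
 *        this is the kernel dispatch behind expression assignments
 *        such as dst = src1 + src2
 * \tparam Saver the save policy applied to each element
 */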
template<typename Saver, typename R, int dim,
         typename DType, typename E, int etype>
inline void MapExp(TRValue<R, gpu, dim, DType> *dst,
                   const expr::Exp<E, DType, etype> &exp) {
  expr::TypeCheckPass<expr::TypeCheck<gpu, dim, DType, E>::kMapPass>
      ::Error_All_Tensor_in_Exp_Must_Have_Same_Type();
  Shape<dim> eshape = expr::ShapeCheck<dim, E>::Check(exp.self());
  Shape<dim> dshape = expr::ShapeCheck<dim, R>::Check(dst->self());
  CHECK(eshape[0] == 0 || eshape == dshape)
    << "Assignment: shapes of tensors in the expression are not consistent with the target, "
    << "eshape: " << eshape << " dshape: " << dshape;
  cuda::MapPlan<Saver>(MakePlan(dst->self()),
                       MakePlan(exp.self()),
                       dshape.FlatTo2D(),
                       Stream<gpu>::GetStream(expr::StreamInfo<gpu, R>::Get(dst->self())));
}

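/*!
 * \brief GPU: map expression exp, then reduce all dimensions except the
 *        lowest (last) one into a 1D tensor, scaled by scale; conceptually
 *        dst[x] = scale * Reduce_y(E[y][x]) after flattening to 2D
 * \tparam Saver the save policy for the result
 * \tparam Reducer the reduction operator
 */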
template<typename Saver, typename Reducer,
         typename R, typename DType, typename E, int etype>
inline void MapReduceKeepLowest(TRValue<R, gpu, 1, DType> *dst,
                                const expr::Exp<E, DType, etype> &exp,
                                DType scale) {
  expr::TypeCheckPass<expr::TypeCheck<gpu, 1, DType, E>::kRedPass>
      ::Error_TypeCheck_Not_Pass_For_Reduce_Exp();
  Shape<2> eshape = expr::ShapeCheck<expr::ExpInfo<E>::kDim, E>
      ::Check(exp.self()).FlatTo2D();
  Shape<1> dshape = expr::ShapeCheck<1, R>::Check(dst->self());
  CHECK_EQ(eshape[1], dshape[0]) << "MapReduceKeepLowest: reduction dimensions do not match";
  CHECK_NE(eshape[0], 0U) << "cannot reduce over an empty tensor";
  cuda::MapReduceKeepLowest<Saver, Reducer>
      (MakePlan(dst->self()), MakePlan(exp.self()), scale, eshape,
       Stream<gpu>::GetStream(expr::StreamInfo<gpu, R>::Get(dst->self())));
}

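/*!
 * \brief GPU: map expression exp, then reduce every dimension except
 *        dimkeep into a 1D tensor of length eshape[dimkeep]
 * \tparam dimkeep the dimension preserved in the output
 */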
template<typename Saver, typename Reducer, int dimkeep,
         typename R, typename DType, typename E, int etype>
inline void MapReduceKeepHighDim(TRValue<R, gpu, 1, DType> *dst,
                                 const expr::Exp<E, DType, etype> &exp,
                                 DType scale) {
  expr::TypeCheckPass<expr::TypeCheck<gpu, dimkeep, DType, E>::kRedPass>
      ::Error_TypeCheck_Not_Pass_For_Reduce_Exp();
  typedef Shape<expr::ExpInfo<E>::kDim> EShape;
  EShape eshape = expr::ShapeCheck<expr::ExpInfo<E>::kDim, E>
      ::Check(exp.self());
  Shape<1> dshape = expr::ShapeCheck<1, R>::Check(dst->self());
  CHECK_EQ(eshape[dimkeep], dshape[0]) << "MapReduceKeepHighDim: reduction dimensions do not match";
  // reinterpret the input as an equivalent 4D shape with dimkeep at axis 1
  Shape<4> pshape = Shape4(eshape.ProdShape(0, dimkeep),
                           eshape[dimkeep],
                           eshape.ProdShape(dimkeep + 1, EShape::kSubdim),
                           eshape[EShape::kSubdim]);
  // call the equivalent reduction that keeps dimension 1
  cuda::MapReduceKeepDim1<Saver, Reducer>
      (MakePlan(dst->self()), MakePlan(exp.self()), scale, pshape,
       Stream<gpu>::GetStream(expr::StreamInfo<gpu, R>::Get(dst->self())));
}
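/*!
 * \brief GPU: row-wise softmax,
 *        dst[i][j] = exp(src[i][j]) / sum_k exp(src[i][k])
 */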
template<typename DType>
inline void Softmax(Tensor<gpu, 2, DType> dst,
                    const Tensor<gpu, 2, DType>& src) {
  cuda::Softmax(dst, src);
}

template<typename DType>
inline void Softmax(Tensor<gpu, 3, DType> dst,
                    const Tensor<gpu, 3, DType>& src) {
  cuda::Softmax(dst, src);
}

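/*!
 * \brief GPU: gradient of softmax combined with cross-entropy loss,
 *        dst[i][j] = src[i][j] - (j == label[i] ? 1 : 0); the overloads
 *        below add an ignore_label value whose rows receive zero gradient,
 *        smoothed variants weighted by alpha, and 3D inputs with 2D labels
 */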
template<typename DType>
inline void SoftmaxGrad(const Tensor<gpu, 2, DType> &dst,
                        const Tensor<gpu, 2, DType> &src,
                        const Tensor<gpu, 1, DType> &label) {
  cuda::SoftmaxGrad(dst, src, label);
}

template<typename DType>
inline void SmoothSoftmaxGrad(const Tensor<gpu, 2, DType> &dst,
                              const Tensor<gpu, 2, DType> &src,
                              const Tensor<gpu, 1, DType> &label,
                              const float alpha) {
  cuda::SmoothSoftmaxGrad(dst, src, label, alpha);
}

template<typename DType>
inline void SoftmaxGrad(const Tensor<gpu, 2, DType> &dst,
                        const Tensor<gpu, 2, DType> &src,
                        const Tensor<gpu, 1, DType> &label,
                        const DType &ignore_label) {
  cuda::SoftmaxGrad(dst, src, label, ignore_label);
}

template<typename DType>
inline void SmoothSoftmaxGrad(const Tensor<gpu, 2, DType> &dst,
                              const Tensor<gpu, 2, DType> &src,
                              const Tensor<gpu, 1, DType> &label,
                              const DType &ignore_label,
                              const float alpha) {
  cuda::SmoothSoftmaxGrad(dst, src, label, ignore_label, alpha);
}

template<typename DType>
inline void SoftmaxGrad(const Tensor<gpu, 3, DType> &dst,
                        const Tensor<gpu, 3, DType> &src,
                        const Tensor<gpu, 2, DType> &label) {
  cuda::SoftmaxGrad(dst, src, label);
}

template<typename DType>
inline void SoftmaxGrad(const Tensor<gpu, 3, DType> &dst,
                        const Tensor<gpu, 3, DType> &src,
                        const Tensor<gpu, 2, DType> &label,
                        const DType &ignore_label) {
  cuda::SoftmaxGrad(dst, src, label, ignore_label);
}

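/*!
 * \brief GPU: accumulate gradients of a take/embedding lookup,
 *        dst[index[i]] += src[i]; when clip is true, out-of-range indices
 *        are clamped to the valid row range, otherwise they are taken
 *        modulo dst.size(0)
 */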
template<bool clip, typename IndexType, typename DType>
inline void AddTakeGrad(Tensor<gpu, 2, DType> dst,
                        const Tensor<gpu, 1, IndexType>& index,
                        const Tensor<gpu, 2, DType> &src) {
  cuda::AddTakeGrad<clip, IndexType, DType>(dst, index, src);
}

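/*!
 * \brief GPU: same accumulation as AddTakeGrad, specialized for large
 *        batches: sorted holds the indices in sorted order and index the
 *        matching original positions, letting rows with equal indices be
 *        accumulated together efficiently
 */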
template<typename IndexType, typename DType>
inline void AddTakeGradLargeBatch(Tensor<gpu, 2, DType> dst,
                                  const Tensor<gpu, 1, IndexType>& sorted,
                                  const Tensor<gpu, 1, IndexType>& index,
                                  const Tensor<gpu, 2, DType> &src) {
  cuda::AddTakeGradLargeBatch(dst, sorted, index, src);
}

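/*!
 * \brief GPU: sort keys and reorder values accordingly, in place
 * \param is_ascend true for ascending, false for descending order
 */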
template<typename KDType, typename VDType>
inline void SortByKey(Tensor<gpu, 1, KDType> keys, Tensor<gpu, 1, VDType> values,
                      bool is_ascend) {
  cuda::SortByKey(keys, values, is_ascend);
}

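/*!
 * \brief GPU: scatter rows of src into dst, dst[index[i]] = src[i]
 */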
template<typename IndexType, typename DType>
inline void IndexFill(Tensor<gpu, 2, DType> dst,
                      const Tensor<gpu, 1, IndexType>& index,
                      const Tensor<gpu, 2, DType> &src) {
  cuda::IndexFill(dst, index, src);
}
}  // namespace mshadow
#endif  // __CUDACC__
#endif  // MSHADOW_TENSOR_GPU_INL_H_