/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef GPU_NVIDIA_CUDNN_SOFTMAX_IMPL_HPP
#define GPU_NVIDIA_CUDNN_SOFTMAX_IMPL_HPP

#include "cudnn.h"

#include "gpu/nvidia/sycl_cuda_utils.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {

struct cudnn_softmax_impl_base_t {
    cudnnDataType_t data_type;
    int ndims;
    cudnnSoftmaxAlgorithm_t alg_kind;
    // This implementation uses cuDNN's channel mode: the dims conversion
    // below maps the oneDNN softmax axis onto the channel dimension.
    cudnnSoftmaxMode_t mode = cudnnSoftmaxMode_t::CUDNN_SOFTMAX_MODE_CHANNEL;
    // The oneDNN softmax primitive doesn't support any post-ops or
    // attributes, so alpha = 1 and beta = 0 can be used in all cases.
    float alpha = 1.0f;
    float beta = 0.0f;

    virtual ~cudnn_softmax_impl_base_t() {}

    virtual status_t init(const softmax_pd_t *pd) = 0;

    virtual void execute(cudnnHandle_t handle, void **x, int size) const = 0;

    // Mapping between dnnl algorithm and cuDNN softmax algorithm
    status_t convert_alg_kind(
            bool is_log_softmax, cudnnSoftmaxAlgorithm_t *cuda_alg_kind) const {
        if (is_log_softmax) {
            *cuda_alg_kind = cudnnSoftmaxAlgorithm_t::CUDNN_SOFTMAX_LOG;
        } else {
            *cuda_alg_kind = cudnnSoftmaxAlgorithm_t::CUDNN_SOFTMAX_ACCURATE;
        }
        return status::success;
    }

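    // Collapse the N-dimensional oneDNN tensor into the 4D shape cuDNN
    // expects, placing the softmax axis on the channel dimension. Worked
    // examples (illustrative shapes): a 5D plain-layout tensor {2, 16, 3, 4,
    // 5} with axis = 1 maps to {2, 16, 3, 20}; {2, 3, 4, 5} with axis = 2
    // maps to {6, 4, 5, 1}.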
    status_t convert_dims_softmax(const dims_t &orig_dims, int *modified_dims,
            int axis, int ndims, format_tag_t tag,
            cudnnTensorFormat_t &format) const {

        // Initialise all dims to 1
        for (int i = 0; i < 4; i++) {
            modified_dims[i] = 1;
        }
        if (axis == 1) {
            // The softmax axis is already the channel dimension: copy up to
            // four dimensions as-is and fold any trailing ones into W.
            format = tag == dnnl_nhwc ? cudnnTensorFormat_t::CUDNN_TENSOR_NHWC
                                      : cudnnTensorFormat_t::CUDNN_TENSOR_NCHW;
            int num_dims = ndims < 4 ? ndims : 4;
            for (int i = 0; i < num_dims; i++) {
                modified_dims[i] = orig_dims[i];
            }
            for (int i = 4; i < ndims; i++) {
                modified_dims[3] *= orig_dims[i];
            }
            return status::success;
        }
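        // For axis != 1, build a 4D NCHW view whose channel dimension is the
        // softmax axis, so channel-mode softmax still reduces over the
        // requested axis.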
        format = cudnnTensorFormat_t::CUDNN_TENSOR_NCHW;
        switch (tag) {
            case dnnl_cn: {
                // 2D transposed layout; axis must be 0 here (axis == 1 is
                // handled above), so swap N and C to put it on the channel.
                modified_dims[0] = orig_dims[1];
                modified_dims[1] = orig_dims[0];
                break;
            }
            case dnnl_nchw: {
                switch (axis) {
                    case 0:
                        modified_dims[1] = orig_dims[axis];
                        modified_dims[2] = orig_dims[1];
                        for (int i = 2; i < ndims; i++) {
                            modified_dims[3] *= orig_dims[i];
                        }
                        break;
                    default: {
                        // Fold everything before the axis into N and
                        // everything after it into H.
                        for (int i = 0; i < axis; i++) {
                            modified_dims[0] *= orig_dims[i];
                        }
                        modified_dims[1] = orig_dims[axis];
                        if (axis == ndims - 1) { return status::success; }
                        for (int i = axis + 1; i < ndims; i++) {
                            modified_dims[2] *= orig_dims[i];
                        }
                        break;
                    }
                }
                break;
            }
            case dnnl_nhwc:
                switch (axis) {
                    case 0:
                        modified_dims[1] = orig_dims[0];
                        for (int i = 1; i < ndims; i++) {
                            modified_dims[2] *= orig_dims[i];
                        }
                        break;
                    case 2:
                        modified_dims[0] = orig_dims[0];
                        modified_dims[1] = orig_dims[2];
                        for (int i = 3; i < ndims; i++) {
                            modified_dims[2] *= orig_dims[i];
                        }
                        modified_dims[3] = orig_dims[1];
                        break;
                    case 3:
                        modified_dims[0] = orig_dims[0] * orig_dims[2];
                        modified_dims[1] = orig_dims[3];
                        modified_dims[2] = ndims == 4 ? 1 : orig_dims[4];
                        modified_dims[3] = orig_dims[1];
                        break;
                }
                break;
            default: return status::unimplemented;
        }
        return status::success;
    }

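    // Classify the layout of the given memory descriptor into one of the
    // three families the dims conversion above understands: 2D transposed
    // (cn), plain row-major (nchw-like) or channels-last (nhwc-like).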
    status_t convert_tag(const memory_desc_t *md, format_tag_t &tag) const {
        const memory_desc_wrapper mem_wrapper(md);
        if (mem_wrapper.matches_one_of_tag(format_tag::ba)) {
            tag = dnnl_cn;
        } else if (mem_wrapper.matches_one_of_tag(format_tag::ab,
                           format_tag::abc, format_tag::abcd, format_tag::abcde,
                           format_tag::abcdef)) {
            tag = dnnl_nchw;
        } else if (mem_wrapper.matches_one_of_tag(format_tag::acb,
                           format_tag::acdb, format_tag::acdeb)) {
            tag = dnnl_nhwc;
        } else {
            return status::unimplemented;
        }
        return status::success;
    }
};

struct cudnn_softmax_fwd_impl_t : public cudnn_softmax_impl_base_t {
    int dims[DNNL_MAX_NDIMS];
    cudnnTensorDescriptor_t tensor_desc;
    cudnnTensorFormat_t format;

    status_t init(const softmax_pd_t *pd) override {
        // If any of the dimensions are 0, there is nothing to compute, so
        // skip creating the cuDNN descriptors.
        if (has_zero_dims(pd->src_md(0)->dims, pd->ndims())) {
            return status::success;
        }

        if (pd->ndims() > CUDNN_DIM_MAX) { return status::invalid_arguments; }
        // The descriptors below are always created with 4 dims, so pad up.
        ndims = pd->ndims() < 4 ? 4 : pd->ndims();

        format_tag_t tag;
        CHECK(convert_tag(pd->src_md(), tag));
        CHECK(convert_dims_softmax(pd->src_md()->padded_dims, dims, pd->axis(),
                pd->ndims(), tag, format));

        convert_alg_kind(pd->is_logsoftmax(), &alg_kind);

        assert(pd->src_md()->data_type == pd->dst_md()->data_type);

        CHECK(convert_data_type(pd->src_md(), &data_type));

        CHECK(create_and_set_tensor_descriptor_ex(
                &tensor_desc, format, data_type, 4, dims));
        return status::success;
    }

    void execute(cudnnHandle_t handle, void **x, int size) const override {
        // Confirm that 2 arguments were passed: src and dst
        assert(size == 2);
        CUDNN_EXECUTE_FUNC(cudnnSoftmaxForward, handle, alg_kind, mode, &alpha,
                tensor_desc, x[0], &beta, tensor_desc, x[1]);
    }

    ~cudnn_softmax_fwd_impl_t() {
        CUDNN_EXECUTE_FUNC_V(cudnnDestroyTensorDescriptor, tensor_desc);
    }
};

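// A minimal usage sketch (names here are illustrative, not part of this
// header): a primitive implementation is expected to call init() once at
// primitive creation time and execute() with the kernel arguments in order.
//
//     cudnn_softmax_fwd_impl_t softmax_impl;
//     CHECK(softmax_impl.init(pd)); // pd: the softmax primitive descriptor
//     void *args[2] = {src_ptr, dst_ptr};
//     softmax_impl.execute(handle, args, 2);
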
struct cudnn_softmax_bwd_impl_t : public cudnn_softmax_impl_base_t {
    int dims[DNNL_MAX_NDIMS];
    int dims_dst[DNNL_MAX_NDIMS];
    cudnnTensorDescriptor_t tensor_dst_desc;
    cudnnTensorDescriptor_t tensor_diff_desc;
    cudnnTensorFormat_t format;

    status_t init(const softmax_pd_t *pd) override {
        // If any of the dimensions are 0, there is nothing to compute, so
        // skip creating the cuDNN descriptors.
        if (memory_desc_wrapper(pd->desc()->diff_desc).has_zero_dim())
            return status::success;

        if (pd->ndims() > CUDNN_DIM_MAX) { return status::invalid_arguments; }
        ndims = pd->ndims() < 4 ? 4 : pd->ndims();

        format_tag_t tag;
        CHECK(convert_tag(pd->dst_md(), tag));
        CHECK(convert_dims_softmax(pd->dst_md()->padded_dims, dims_dst,
                pd->axis(), pd->ndims(), tag, format));
        CHECK(convert_dims_softmax(pd->diff_src_md()->padded_dims, dims,
                pd->axis(), pd->ndims(), tag, format));

        convert_alg_kind(pd->is_logsoftmax(), &alg_kind);

        assert(pd->diff_dst_md()->data_type == pd->dst_md()->data_type);
        assert(pd->diff_dst_md()->data_type == pd->diff_src_md()->data_type);

        CHECK(convert_data_type(pd->dst_md(), &data_type));

        CHECK(create_and_set_tensor_descriptor_ex(
                &tensor_dst_desc, format, data_type, 4, dims_dst));
        CHECK(create_and_set_tensor_descriptor_ex(
                &tensor_diff_desc, format, data_type, 4, dims));
        return status::success;
    }

    void execute(cudnnHandle_t handle, void **x, int size) const override {
        // Assert that 3 arguments were passed: dst, diff_dst and diff_src
        assert(size == 3);
        CUDNN_EXECUTE_FUNC(cudnnSoftmaxBackward, handle, alg_kind, mode, &alpha,
                tensor_dst_desc, x[0], tensor_diff_desc, x[1], &beta,
                tensor_diff_desc, x[2]);
    }

    ~cudnn_softmax_bwd_impl_t() {
        CUDNN_EXECUTE_FUNC_V(cudnnDestroyTensorDescriptor, tensor_dst_desc);
        CUDNN_EXECUTE_FUNC_V(cudnnDestroyTensorDescriptor, tensor_diff_desc);
    }
};
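
// Sketch of the backward call pattern (names illustrative): x[0] must be the
// forward *output*, since cudnnSoftmaxBackward is defined in terms of y, dy
// and dx rather than the forward input.
//
//     void *args[3] = {dst_ptr, diff_dst_ptr, diff_src_ptr};
//     softmax_bwd_impl.execute(handle, args, 3);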

} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl

#endif