/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020 Codeplay Software Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef GPU_NVIDIA_CUDNN_SOFTMAX_IMPL_HPP
#define GPU_NVIDIA_CUDNN_SOFTMAX_IMPL_HPP

#include "cudnn.h"

#include "gpu/nvidia/sycl_cuda_utils.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace nvidia {

struct cudnn_softmax_impl_base_t {
    cudnnDataType_t data_type;
    int ndims;
    cudnnSoftmaxAlgorithm_t alg_kind;
    // cuDNN only supports softmax on the channel dimension
    cudnnSoftmaxMode_t mode = cudnnSoftmaxMode_t::CUDNN_SOFTMAX_MODE_CHANNEL;
    // oneDNN softmax primitive doesn't support any post-ops or attributes,
    // hence we can set alpha = 1 and beta = 0 for all cases
    float alpha = 1.0f;
    float beta = 0.0f;

    virtual ~cudnn_softmax_impl_base_t() {}

    virtual status_t init(const softmax_pd_t *pd) = 0;

    virtual void execute(cudnnHandle_t handle, void **x, int size) const = 0;

    // Mapping between dnnl algorithm and cuDNN softmax algorithm
    status_t convert_alg_kind(bool is_log_softmax,
            cudnnSoftmaxAlgorithm_t *cuda_alg_kind) const {
        if (is_log_softmax) {
            *cuda_alg_kind = cudnnSoftmaxAlgorithm_t::CUDNN_SOFTMAX_LOG;
        } else {
            *cuda_alg_kind = cudnnSoftmaxAlgorithm_t::CUDNN_SOFTMAX_ACCURATE;
        }
        return status::success;
    }

    status_t convert_dims_softmax(const dims_t &orig_dims, int *modified_dims,
            int axis, int ndims, format_tag_t tag,
            cudnnTensorFormat_t &format) const {

        // Initialise all dims to 1
        for (int i = 0; i < 4; i++) {
            modified_dims[i] = 1;
        }
        if (axis == 1) {
            // Copy dimensions into the new array
            format = tag == dnnl_nhwc ? cudnnTensorFormat_t::CUDNN_TENSOR_NHWC
                                      : cudnnTensorFormat_t::CUDNN_TENSOR_NCHW;
            int num_dims = ndims < 4 ? ndims : 4;
            for (int i = 0; i < num_dims; i++) {
                modified_dims[i] = orig_dims[i];
            }
            for (int i = 4; i < ndims; i++) {
                modified_dims[3] *= orig_dims[i];
            }
            return status::success;
        }
        format = cudnnTensorFormat_t::CUDNN_TENSOR_NCHW;
        switch (tag) {
            case dnnl_cn: {
                modified_dims[0] = orig_dims[1];
                modified_dims[1] = orig_dims[0];
                break;
            }
            case dnnl_nchw: {
                switch (axis) {
                    case 0:
                        modified_dims[1] = orig_dims[axis];
                        modified_dims[2] = orig_dims[1];
                        for (int i = 2; i < ndims; i++) {
                            modified_dims[3] *= orig_dims[i];
                        }
                        break;
                    default: {
                        for (int i = 0; i < axis; i++) {
                            modified_dims[0] *= orig_dims[i];
                        }
                        modified_dims[1] = orig_dims[axis];
                        if (axis == ndims - 1) { return status::success; }
                        for (int i = axis + 1; i < ndims; i++) {
                            modified_dims[2] *= orig_dims[i];
                        }
                        break;
                    }
                }
                break;
            }
            case dnnl_nhwc:
                switch (axis) {
                    case 0:
                        modified_dims[1] = orig_dims[0];
                        for (int i = 1; i < ndims; i++) {
                            modified_dims[2] *= orig_dims[i];
                        }
                        break;
                    case 2:
                        modified_dims[0] = orig_dims[0];
                        modified_dims[1] = orig_dims[2];
                        for (int i = 3; i < ndims; i++) {
                            modified_dims[2] *= orig_dims[i];
                        }
                        modified_dims[3] = orig_dims[1];
                        break;
                    case 3:
                        modified_dims[0] = orig_dims[0] * orig_dims[2];
                        modified_dims[1] = orig_dims[3];
                        modified_dims[2] = ndims == 4 ? 1 : orig_dims[4];
                        modified_dims[3] = orig_dims[1];
                        break;
                }
                break;
            default: return status::unimplemented;
        }
        return status::success;
    }

    status_t convert_tag(const memory_desc_t *md, format_tag_t &tag) const {
        const memory_desc_wrapper mem_wrapper(md);
        if (mem_wrapper.matches_one_of_tag(format_tag::ba)) {
            tag = dnnl_cn;
        } else if (mem_wrapper.matches_one_of_tag(format_tag::ab,
                           format_tag::abc, format_tag::abcd,
                           format_tag::abcde, format_tag::abcdef)) {
            tag = dnnl_nchw;
        } else if (mem_wrapper.matches_one_of_tag(format_tag::acb,
                           format_tag::acdb, format_tag::acdeb)) {
            tag = dnnl_nhwc;
        } else {
            return status::unimplemented;
        }
        return status::success;
    }
};
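
// Worked example (illustrative only; the values are hypothetical): for a
// plain 5D "abcde" source with padded_dims = {2, 16, 3, 5, 7} and axis = 1,
// convert_dims_softmax() takes the axis == 1 fast path and folds every
// dimension beyond the fourth into the last one, yielding
// modified_dims = {2, 16, 3, 35} in CUDNN_TENSOR_NCHW format. With
// dims = {2, 3, 4, 5} and axis = 2, the dnnl_nchw branch instead collapses
// everything before the axis into N and everything after it into H, giving
// modified_dims = {6, 4, 5, 1}. Either way, the softmax axis lands on the
// channel dimension, which is the one CUDNN_SOFTMAX_MODE_CHANNEL reduces
// over.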

struct cudnn_softmax_fwd_impl_t : public cudnn_softmax_impl_base_t {
    int dims[DNNL_MAX_NDIMS];
    cudnnTensorDescriptor_t tensor_desc = nullptr;
    cudnnTensorFormat_t format;

    status_t init(const softmax_pd_t *pd) override {
        // If any of the dimensions are 0 we should not continue with
        // creating cudnn descriptors
        if (has_zero_dims(pd->src_md(0)->dims, pd->ndims())) {
            return status::success;
        }

        if (pd->ndims() > CUDNN_DIM_MAX) { return status::invalid_arguments; }
        ndims = pd->ndims() < 4 ? 4 : pd->ndims();

        format_tag_t tag;
        CHECK(convert_tag(pd->src_md(), tag));
        CHECK(convert_dims_softmax(pd->src_md()->padded_dims, dims, pd->axis(),
                pd->ndims(), tag, format));

        convert_alg_kind(pd->is_logsoftmax(), &alg_kind);

        assert(pd->src_md()->data_type == pd->dst_md()->data_type);

        CHECK(convert_data_type(pd->src_md(), &data_type));

        CHECK(create_and_set_tensor_descriptor_ex(
                &tensor_desc, format, data_type, 4, dims));
        return status::success;
    }

    void execute(cudnnHandle_t handle, void **x, int size) const override {
        // Confirm that 2 arguments were passed: src and dst
        assert(size == 2);
        CUDNN_EXECUTE_FUNC(cudnnSoftmaxForward, handle, alg_kind, mode, &alpha,
                tensor_desc, x[0], &beta, tensor_desc, x[1]);
    }

    ~cudnn_softmax_fwd_impl_t() {
        // The descriptor is never created when init() returns early for
        // zero-dim tensors, so destroy it only if it exists
        if (tensor_desc) {
            CUDNN_EXECUTE_FUNC_V(cudnnDestroyTensorDescriptor, tensor_desc);
        }
    }
};

struct cudnn_softmax_bwd_impl_t : public cudnn_softmax_impl_base_t {
    int dims[DNNL_MAX_NDIMS];
    int dims_dst[DNNL_MAX_NDIMS];
    cudnnTensorDescriptor_t tensor_dst_desc = nullptr;
    cudnnTensorDescriptor_t tensor_diff_desc = nullptr;
    cudnnTensorFormat_t format;

    status_t init(const softmax_pd_t *pd) override {
        // If any of the dimensions are 0 we should not continue with
        // creating cudnn descriptors
        if (memory_desc_wrapper(pd->desc()->diff_desc).has_zero_dim())
            return status::success;

        if (pd->ndims() > CUDNN_DIM_MAX) { return status::invalid_arguments; }
        ndims = pd->ndims() < 4 ? 4 : pd->ndims();

        format_tag_t tag;
        CHECK(convert_tag(pd->dst_md(), tag));
        CHECK(convert_dims_softmax(pd->dst_md()->padded_dims, dims_dst,
                pd->axis(), pd->ndims(), tag, format));
        CHECK(convert_dims_softmax(pd->diff_src_md()->padded_dims, dims,
                pd->axis(), pd->ndims(), tag, format));

        convert_alg_kind(pd->is_logsoftmax(), &alg_kind);

        assert(pd->diff_dst_md()->data_type == pd->dst_md()->data_type);
        assert(pd->diff_dst_md()->data_type == pd->diff_src_md()->data_type);

        CHECK(convert_data_type(pd->dst_md(), &data_type));

        CHECK(create_and_set_tensor_descriptor_ex(
                &tensor_dst_desc, format, data_type, 4, dims_dst));
        CHECK(create_and_set_tensor_descriptor_ex(
                &tensor_diff_desc, format, data_type, 4, dims));
        return status::success;
    }

    void execute(cudnnHandle_t handle, void **x, int size) const override {
        // Confirm that 3 arguments were passed: dst, diff_dst and diff_src
        assert(size == 3);
        CUDNN_EXECUTE_FUNC(cudnnSoftmaxBackward, handle, alg_kind, mode,
                &alpha, tensor_dst_desc, x[0], tensor_diff_desc, x[1], &beta,
                tensor_diff_desc, x[2]);
    }

    ~cudnn_softmax_bwd_impl_t() {
        // Descriptors are never created when init() returns early for
        // zero-dim tensors, so destroy them only if they exist
        if (tensor_dst_desc) {
            CUDNN_EXECUTE_FUNC_V(
                    cudnnDestroyTensorDescriptor, tensor_dst_desc);
        }
        if (tensor_diff_desc) {
            CUDNN_EXECUTE_FUNC_V(
                    cudnnDestroyTensorDescriptor, tensor_diff_desc);
        }
    }
};
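
// Usage sketch (hypothetical caller, shown for illustration only; the
// actual pointer packing is done by the softmax primitive that owns these
// impl objects): execute() expects the device pointers in the order the
// comments above document, e.g.
//
//     void *fwd_args[] = {src_ptr, dst_ptr};
//     fwd_impl.execute(handle, fwd_args, 2);
//
//     void *bwd_args[] = {dst_ptr, diff_dst_ptr, diff_src_ptr};
//     bwd_impl.execute(handle, bwd_args, 3);
//
// where handle is the cuDNN handle bound to the executing SYCL stream and
// the pointer names are placeholders.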

} // namespace nvidia
} // namespace gpu
} // namespace impl
} // namespace dnnl

#endif