1 /*******************************************************************************
2 * Copyright 2020 Intel Corporation
3 * Copyright 2020 Codeplay Software Limited
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 *     http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *******************************************************************************/
17 
18 #include "gpu/nvidia/cudnn_convolution.hpp"
19 #include "gpu/nvidia/sycl_cuda_scoped_context.hpp"
20 #include "gpu/nvidia/sycl_cuda_stream.hpp"
21 #include "gpu/nvidia/sycl_cuda_utils.hpp"
22 
23 namespace dnnl {
24 namespace impl {
25 namespace gpu {
26 namespace nvidia {
27 
execute_convolution(const exec_ctx_t & ctx,bool with_bias,bool with_scratchpad) const28 status_t cudnn_convolution_fwd_t::execute_convolution(
29         const exec_ctx_t &ctx, bool with_bias, bool with_scratchpad) const {
30     nvidia::sycl_cuda_stream_t *cuda_stream
31             = utils::downcast<nvidia::sycl_cuda_stream_t *>(ctx.stream());
32 
33     return cuda_stream->interop_task([&](cl::sycl::handler &cgh) {
34         using scratch_acc_t = cl::sycl::accessor<uint8_t, 1,
35                 cl::sycl::access::mode::read_write>;
36         auto x_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC);
37         auto weights_acc = CTX_IN_ACCESSOR(DNNL_ARG_WEIGHTS);
38         auto y_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST);
39         std::shared_ptr<
40                 cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read>>
41                 bias_acc;
42         std::shared_ptr<scratch_acc_t> scratch_acc;
43         std::shared_ptr<scratch_acc_t> filter_scratch_acc;
44         std::shared_ptr<scratch_acc_t> temp_dst_acc;
45         std::shared_ptr<scratch_acc_t> temp_reorder_acc;
46         if (with_scratchpad) {
47             scratch_acc = std::make_shared<scratch_acc_t>(
48                     utils::downcast<sycl::sycl_buffer_memory_storage_t *>(
49                             ctx.get_scratchpad_grantor()
50                                     .get_memory_storage(memory_tracking::names::
51                                                     key_conv_cudnn_algo)
52                                     .get())
53                             ->buffer()
54                             .get_access<cl::sycl::access::mode::read_write>(
55                                     cgh));
56         }
57         if (with_bias) {
58             bias_acc = std::make_shared<cl::sycl::accessor<uint8_t, 1,
59                     cl::sycl::access::mode::read>>(
60                     CTX_IN_ACCESSOR(DNNL_ARG_BIAS));
61         }
62         if (pd()->impl_->using_transformed_filter()) {
63             filter_scratch_acc
64                     = std::make_shared<scratch_acc_t>(CTX_SCRATCH_ACCESSOR(
65                             memory_tracking::names::key_conv_cudnn_filter));
66         }
67 
68         if (pd()->use_temp_dst_) {
69             temp_dst_acc = std::make_shared<scratch_acc_t>(
70                     buffer(scratch_storage.get())
71                             .get_access<cl::sycl::access::mode::read_write>(
72                                     cgh));
73             temp_reorder_acc = std::make_shared<scratch_acc_t>(
74                     buffer(scratch_storage_2.get())
75                             .get_access<cl::sycl::access::mode::read_write>(
76                                     cgh));
77         }
78 
79         cgh.interop_task([=](const cl::sycl::interop_handler &ih) {
80             auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(
81                     cuda_stream->engine());
82             auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine);
83             auto handle = cuda_stream->get_cudnn_handle();
84 
85             std::vector<void *> args;
86             args.push_back(sc.memory<void *>(ih, x_acc));
87             args.push_back(sc.memory<void *>(ih, weights_acc));
88             args.push_back(sc.memory<void *>(ih, y_acc));
89             args.push_back(
90                     with_bias ? sc.memory<void *>(ih, *bias_acc) : nullptr);
91             args.push_back(with_scratchpad ? sc.memory<void *>(ih, *scratch_acc)
92                                            : nullptr);
93             args.push_back(pd()->impl_->using_transformed_filter()
94                             ? sc.memory<void *>(ih, *filter_scratch_acc)
95                             : nullptr);
96             args.push_back(pd()->use_temp_dst_
97                             ? sc.memory<void *>(ih, *temp_dst_acc)
98                             : nullptr);
99             args.push_back(pd()->use_temp_dst_
100                             ? sc.memory<void *>(ih, *temp_reorder_acc)
101                             : nullptr);
102             pd()->impl_->execute(handle, args);
103         });
104     });
105 }
106 
execute_convolution(const exec_ctx_t & ctx,bool with_bias,bool with_scratchpad) const107 status_t cudnn_convolution_bwd_data_t::execute_convolution(
108         const exec_ctx_t &ctx, bool with_bias, bool with_scratchpad) const {
109     nvidia::sycl_cuda_stream_t *cuda_stream
110             = utils::downcast<nvidia::sycl_cuda_stream_t *>(ctx.stream());
111 
112     return cuda_stream->interop_task([&](cl::sycl::handler &cgh) {
113         using scratch_acc_t = cl::sycl::accessor<uint8_t, 1,
114                 cl::sycl::access::mode::read_write>;
115         auto x_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_SRC);
116         auto weights_acc = CTX_IN_ACCESSOR(DNNL_ARG_WEIGHTS);
117         auto y_acc = CTX_IN_ACCESSOR(DNNL_ARG_DIFF_DST);
118         std::shared_ptr<
119                 cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read>>
120                 bias_acc;
121         std::shared_ptr<scratch_acc_t> scratch_acc;
122         std::shared_ptr<scratch_acc_t> filter_scratch_acc;
123         if (with_scratchpad) {
124             scratch_acc = std::make_shared<scratch_acc_t>(
125                     utils::downcast<sycl::sycl_buffer_memory_storage_t *>(
126                             ctx.get_scratchpad_grantor()
127                                     .get_memory_storage(memory_tracking::names::
128                                                     key_conv_cudnn_algo)
129                                     .get())
130                             ->buffer()
131                             .get_access<cl::sycl::access::mode::read_write>(
132                                     cgh));
133         }
134         if (with_bias) {
135             bias_acc = std::make_shared<cl::sycl::accessor<uint8_t, 1,
136                     cl::sycl::access::mode::read>>(
137                     CTX_IN_ACCESSOR(DNNL_ARG_BIAS));
138         }
139         if (pd()->impl_->using_transformed_filter()) {
140             filter_scratch_acc
141                     = std::make_shared<scratch_acc_t>(CTX_SCRATCH_ACCESSOR(
142                             memory_tracking::names::key_conv_cudnn_filter));
143         }
144         cgh.interop_task([=](const cl::sycl::interop_handler &ih) {
145             auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(
146                     cuda_stream->engine());
147             auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine);
148             auto handle = cuda_stream->get_cudnn_handle();
149 
150             std::vector<void *> args;
151             args.push_back(sc.memory<void *>(ih, x_acc));
152             args.push_back(sc.memory<void *>(ih, weights_acc));
153             args.push_back(sc.memory<void *>(ih, y_acc));
154             args.push_back(
155                     with_bias ? sc.memory<void *>(ih, *bias_acc) : nullptr);
156             args.push_back(with_scratchpad ? sc.memory<void *>(ih, *scratch_acc)
157                                            : nullptr);
158             args.push_back(pd()->impl_->using_transformed_filter()
159                             ? sc.memory<void *>(ih, *filter_scratch_acc)
160                             : nullptr);
161             pd()->impl_->execute(handle, args);
162         });
163     });
164 }
execute_zero_dims(const exec_ctx_t & ctx) const165 status_t cudnn_convolution_bwd_weights_t::execute_zero_dims(
166         const exec_ctx_t &ctx) const {
167     nvidia::sycl_cuda_stream_t *cuda_stream
168             = utils::downcast<nvidia::sycl_cuda_stream_t *>(ctx.stream());
169 
170     return cuda_stream->interop_task([&](cl::sycl::handler &cgh) {
171         auto weights_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_WEIGHTS);
172         std::shared_ptr<
173                 cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write>>
174                 bias_acc;
175         if (pd()->with_bias()) {
176             bias_acc = std::make_shared<cl::sycl::accessor<uint8_t, 1,
177                     cl::sycl::access::mode::write>>(
178                     CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_BIAS));
179         }
180         cgh.interop_task([=](const cl::sycl::interop_handler &ih) {
181             auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(
182                     cuda_stream->engine());
183             auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine);
184             auto handle = cuda_stream->get_cudnn_handle();
185 
186             auto weights = sc.memory<void *>(ih, weights_acc);
187             void *bias = nullptr;
188             if (pd()->with_bias()) bias = sc.memory<void *>(ih, *bias_acc);
189             pd()->impl_->execute_set_weights_bias(handle, weights, bias, 0.f);
190         });
191     });
192 }
execute_convolution(const exec_ctx_t & ctx,bool with_bias,bool with_scratchpad) const193 status_t cudnn_convolution_bwd_weights_t::execute_convolution(
194         const exec_ctx_t &ctx, bool with_bias, bool with_scratchpad) const {
195     nvidia::sycl_cuda_stream_t *cuda_stream
196             = utils::downcast<nvidia::sycl_cuda_stream_t *>(ctx.stream());
197 
198     return cuda_stream->interop_task([&](cl::sycl::handler &cgh) {
199         using scratch_acc_t = cl::sycl::accessor<uint8_t, 1,
200                 cl::sycl::access::mode::read_write>;
201         auto x_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC);
202         auto weights_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_WEIGHTS);
203         auto y_acc = CTX_IN_ACCESSOR(DNNL_ARG_DIFF_DST);
204         std::shared_ptr<
205                 cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write>>
206                 bias_acc;
207         std::shared_ptr<scratch_acc_t> scratch_acc;
208         std::shared_ptr<scratch_acc_t> filter_scratch_acc;
209         if (with_scratchpad) {
210             scratch_acc = std::make_shared<scratch_acc_t>(
211                     utils::downcast<sycl::sycl_buffer_memory_storage_t *>(
212                             ctx.get_scratchpad_grantor()
213                                     .get_memory_storage(memory_tracking::names::
214                                                     key_conv_cudnn_algo)
215                                     .get())
216                             ->buffer()
217                             .get_access<cl::sycl::access::mode::read_write>(
218                                     cgh));
219         }
220         if (with_bias) {
221             bias_acc = std::make_shared<cl::sycl::accessor<uint8_t, 1,
222                     cl::sycl::access::mode::write>>(
223                     CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_BIAS));
224         }
225         if (pd()->impl_->using_transformed_filter()) {
226             filter_scratch_acc
227                     = std::make_shared<scratch_acc_t>(CTX_SCRATCH_ACCESSOR(
228                             memory_tracking::names::key_conv_cudnn_filter));
229         }
230 
231         cgh.interop_task([=](const cl::sycl::interop_handler &ih) {
232             auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(
233                     cuda_stream->engine());
234             auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine);
235             auto handle = cuda_stream->get_cudnn_handle();
236 
237             std::vector<void *> args;
238             args.push_back(sc.memory<void *>(ih, x_acc));
239             args.push_back(sc.memory<void *>(ih, weights_acc));
240             args.push_back(sc.memory<void *>(ih, y_acc));
241             args.push_back(
242                     with_bias ? sc.memory<void *>(ih, *bias_acc) : nullptr);
243             args.push_back(with_scratchpad ? sc.memory<void *>(ih, *scratch_acc)
244                                            : nullptr);
245             args.push_back(pd()->impl_->using_transformed_filter()
246                             ? sc.memory<void *>(ih, *filter_scratch_acc)
247                             : nullptr);
248             pd()->impl_->execute(handle, args);
249         });
250     });
251 }
252 
253 } // namespace nvidia
254 } // namespace gpu
255 } // namespace impl
256 } // namespace dnnl
257