1 /*******************************************************************************
2 * Copyright 2020 Intel Corporation
3 * Copyright 2020 Codeplay Software Limited
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *******************************************************************************/
17
18 #include "gpu/nvidia/cudnn_convolution.hpp"
19 #include "gpu/nvidia/sycl_cuda_scoped_context.hpp"
20 #include "gpu/nvidia/sycl_cuda_stream.hpp"
21 #include "gpu/nvidia/sycl_cuda_utils.hpp"
22
23 namespace dnnl {
24 namespace impl {
25 namespace gpu {
26 namespace nvidia {
27
execute_convolution(const exec_ctx_t & ctx,bool with_bias,bool with_scratchpad) const28 status_t cudnn_convolution_fwd_t::execute_convolution(
29 const exec_ctx_t &ctx, bool with_bias, bool with_scratchpad) const {
30 nvidia::sycl_cuda_stream_t *cuda_stream
31 = utils::downcast<nvidia::sycl_cuda_stream_t *>(ctx.stream());
32
33 return cuda_stream->interop_task([&](cl::sycl::handler &cgh) {
34 using scratch_acc_t = cl::sycl::accessor<uint8_t, 1,
35 cl::sycl::access::mode::read_write>;
36 auto x_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC);
37 auto weights_acc = CTX_IN_ACCESSOR(DNNL_ARG_WEIGHTS);
38 auto y_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DST);
39 std::shared_ptr<
40 cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read>>
41 bias_acc;
42 std::shared_ptr<scratch_acc_t> scratch_acc;
43 std::shared_ptr<scratch_acc_t> filter_scratch_acc;
44 std::shared_ptr<scratch_acc_t> temp_dst_acc;
45 std::shared_ptr<scratch_acc_t> temp_reorder_acc;
46 if (with_scratchpad) {
47 scratch_acc = std::make_shared<scratch_acc_t>(
48 utils::downcast<sycl::sycl_buffer_memory_storage_t *>(
49 ctx.get_scratchpad_grantor()
50 .get_memory_storage(memory_tracking::names::
51 key_conv_cudnn_algo)
52 .get())
53 ->buffer()
54 .get_access<cl::sycl::access::mode::read_write>(
55 cgh));
56 }
57 if (with_bias) {
58 bias_acc = std::make_shared<cl::sycl::accessor<uint8_t, 1,
59 cl::sycl::access::mode::read>>(
60 CTX_IN_ACCESSOR(DNNL_ARG_BIAS));
61 }
62 if (pd()->impl_->using_transformed_filter()) {
63 filter_scratch_acc
64 = std::make_shared<scratch_acc_t>(CTX_SCRATCH_ACCESSOR(
65 memory_tracking::names::key_conv_cudnn_filter));
66 }
67
68 if (pd()->use_temp_dst_) {
69 temp_dst_acc = std::make_shared<scratch_acc_t>(
70 buffer(scratch_storage.get())
71 .get_access<cl::sycl::access::mode::read_write>(
72 cgh));
73 temp_reorder_acc = std::make_shared<scratch_acc_t>(
74 buffer(scratch_storage_2.get())
75 .get_access<cl::sycl::access::mode::read_write>(
76 cgh));
77 }
78
79 cgh.interop_task([=](const cl::sycl::interop_handler &ih) {
80 auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(
81 cuda_stream->engine());
82 auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine);
83 auto handle = cuda_stream->get_cudnn_handle();
84
85 std::vector<void *> args;
86 args.push_back(sc.memory<void *>(ih, x_acc));
87 args.push_back(sc.memory<void *>(ih, weights_acc));
88 args.push_back(sc.memory<void *>(ih, y_acc));
89 args.push_back(
90 with_bias ? sc.memory<void *>(ih, *bias_acc) : nullptr);
91 args.push_back(with_scratchpad ? sc.memory<void *>(ih, *scratch_acc)
92 : nullptr);
93 args.push_back(pd()->impl_->using_transformed_filter()
94 ? sc.memory<void *>(ih, *filter_scratch_acc)
95 : nullptr);
96 args.push_back(pd()->use_temp_dst_
97 ? sc.memory<void *>(ih, *temp_dst_acc)
98 : nullptr);
99 args.push_back(pd()->use_temp_dst_
100 ? sc.memory<void *>(ih, *temp_reorder_acc)
101 : nullptr);
102 pd()->impl_->execute(handle, args);
103 });
104 });
105 }
106
execute_convolution(const exec_ctx_t & ctx,bool with_bias,bool with_scratchpad) const107 status_t cudnn_convolution_bwd_data_t::execute_convolution(
108 const exec_ctx_t &ctx, bool with_bias, bool with_scratchpad) const {
109 nvidia::sycl_cuda_stream_t *cuda_stream
110 = utils::downcast<nvidia::sycl_cuda_stream_t *>(ctx.stream());
111
112 return cuda_stream->interop_task([&](cl::sycl::handler &cgh) {
113 using scratch_acc_t = cl::sycl::accessor<uint8_t, 1,
114 cl::sycl::access::mode::read_write>;
115 auto x_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_SRC);
116 auto weights_acc = CTX_IN_ACCESSOR(DNNL_ARG_WEIGHTS);
117 auto y_acc = CTX_IN_ACCESSOR(DNNL_ARG_DIFF_DST);
118 std::shared_ptr<
119 cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::read>>
120 bias_acc;
121 std::shared_ptr<scratch_acc_t> scratch_acc;
122 std::shared_ptr<scratch_acc_t> filter_scratch_acc;
123 if (with_scratchpad) {
124 scratch_acc = std::make_shared<scratch_acc_t>(
125 utils::downcast<sycl::sycl_buffer_memory_storage_t *>(
126 ctx.get_scratchpad_grantor()
127 .get_memory_storage(memory_tracking::names::
128 key_conv_cudnn_algo)
129 .get())
130 ->buffer()
131 .get_access<cl::sycl::access::mode::read_write>(
132 cgh));
133 }
134 if (with_bias) {
135 bias_acc = std::make_shared<cl::sycl::accessor<uint8_t, 1,
136 cl::sycl::access::mode::read>>(
137 CTX_IN_ACCESSOR(DNNL_ARG_BIAS));
138 }
139 if (pd()->impl_->using_transformed_filter()) {
140 filter_scratch_acc
141 = std::make_shared<scratch_acc_t>(CTX_SCRATCH_ACCESSOR(
142 memory_tracking::names::key_conv_cudnn_filter));
143 }
144 cgh.interop_task([=](const cl::sycl::interop_handler &ih) {
145 auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(
146 cuda_stream->engine());
147 auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine);
148 auto handle = cuda_stream->get_cudnn_handle();
149
150 std::vector<void *> args;
151 args.push_back(sc.memory<void *>(ih, x_acc));
152 args.push_back(sc.memory<void *>(ih, weights_acc));
153 args.push_back(sc.memory<void *>(ih, y_acc));
154 args.push_back(
155 with_bias ? sc.memory<void *>(ih, *bias_acc) : nullptr);
156 args.push_back(with_scratchpad ? sc.memory<void *>(ih, *scratch_acc)
157 : nullptr);
158 args.push_back(pd()->impl_->using_transformed_filter()
159 ? sc.memory<void *>(ih, *filter_scratch_acc)
160 : nullptr);
161 pd()->impl_->execute(handle, args);
162 });
163 });
164 }
execute_zero_dims(const exec_ctx_t & ctx) const165 status_t cudnn_convolution_bwd_weights_t::execute_zero_dims(
166 const exec_ctx_t &ctx) const {
167 nvidia::sycl_cuda_stream_t *cuda_stream
168 = utils::downcast<nvidia::sycl_cuda_stream_t *>(ctx.stream());
169
170 return cuda_stream->interop_task([&](cl::sycl::handler &cgh) {
171 auto weights_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_WEIGHTS);
172 std::shared_ptr<
173 cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write>>
174 bias_acc;
175 if (pd()->with_bias()) {
176 bias_acc = std::make_shared<cl::sycl::accessor<uint8_t, 1,
177 cl::sycl::access::mode::write>>(
178 CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_BIAS));
179 }
180 cgh.interop_task([=](const cl::sycl::interop_handler &ih) {
181 auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(
182 cuda_stream->engine());
183 auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine);
184 auto handle = cuda_stream->get_cudnn_handle();
185
186 auto weights = sc.memory<void *>(ih, weights_acc);
187 void *bias = nullptr;
188 if (pd()->with_bias()) bias = sc.memory<void *>(ih, *bias_acc);
189 pd()->impl_->execute_set_weights_bias(handle, weights, bias, 0.f);
190 });
191 });
192 }
execute_convolution(const exec_ctx_t & ctx,bool with_bias,bool with_scratchpad) const193 status_t cudnn_convolution_bwd_weights_t::execute_convolution(
194 const exec_ctx_t &ctx, bool with_bias, bool with_scratchpad) const {
195 nvidia::sycl_cuda_stream_t *cuda_stream
196 = utils::downcast<nvidia::sycl_cuda_stream_t *>(ctx.stream());
197
198 return cuda_stream->interop_task([&](cl::sycl::handler &cgh) {
199 using scratch_acc_t = cl::sycl::accessor<uint8_t, 1,
200 cl::sycl::access::mode::read_write>;
201 auto x_acc = CTX_IN_ACCESSOR(DNNL_ARG_SRC);
202 auto weights_acc = CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_WEIGHTS);
203 auto y_acc = CTX_IN_ACCESSOR(DNNL_ARG_DIFF_DST);
204 std::shared_ptr<
205 cl::sycl::accessor<uint8_t, 1, cl::sycl::access::mode::write>>
206 bias_acc;
207 std::shared_ptr<scratch_acc_t> scratch_acc;
208 std::shared_ptr<scratch_acc_t> filter_scratch_acc;
209 if (with_scratchpad) {
210 scratch_acc = std::make_shared<scratch_acc_t>(
211 utils::downcast<sycl::sycl_buffer_memory_storage_t *>(
212 ctx.get_scratchpad_grantor()
213 .get_memory_storage(memory_tracking::names::
214 key_conv_cudnn_algo)
215 .get())
216 ->buffer()
217 .get_access<cl::sycl::access::mode::read_write>(
218 cgh));
219 }
220 if (with_bias) {
221 bias_acc = std::make_shared<cl::sycl::accessor<uint8_t, 1,
222 cl::sycl::access::mode::write>>(
223 CTX_OUT_ACCESSOR(DNNL_ARG_DIFF_BIAS));
224 }
225 if (pd()->impl_->using_transformed_filter()) {
226 filter_scratch_acc
227 = std::make_shared<scratch_acc_t>(CTX_SCRATCH_ACCESSOR(
228 memory_tracking::names::key_conv_cudnn_filter));
229 }
230
231 cgh.interop_task([=](const cl::sycl::interop_handler &ih) {
232 auto &sycl_engine = *utils::downcast<sycl_cuda_engine_t *>(
233 cuda_stream->engine());
234 auto sc = cuda_sycl_scoped_context_handler_t(sycl_engine);
235 auto handle = cuda_stream->get_cudnn_handle();
236
237 std::vector<void *> args;
238 args.push_back(sc.memory<void *>(ih, x_acc));
239 args.push_back(sc.memory<void *>(ih, weights_acc));
240 args.push_back(sc.memory<void *>(ih, y_acc));
241 args.push_back(
242 with_bias ? sc.memory<void *>(ih, *bias_acc) : nullptr);
243 args.push_back(with_scratchpad ? sc.memory<void *>(ih, *scratch_acc)
244 : nullptr);
245 args.push_back(pd()->impl_->using_transformed_filter()
246 ? sc.memory<void *>(ih, *filter_scratch_acc)
247 : nullptr);
248 pd()->impl_->execute(handle, args);
249 });
250 });
251 }
252
253 } // namespace nvidia
254 } // namespace gpu
255 } // namespace impl
256 } // namespace dnnl
257