1 /*******************************************************************************
2 * Copyright 2019-2021 Intel Corporation
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *     http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 *******************************************************************************/
16 
17 #ifndef GPU_COMPUTE_COMPUTE_ENGINE_HPP
18 #define GPU_COMPUTE_COMPUTE_ENGINE_HPP
19 
#include <cassert>
#include <functional>
#include <initializer_list>
#include <memory>
#include <mutex>
#include <vector>

#include "common/c_types_map.hpp"
#include "common/engine.hpp"
#include "common/primitive.hpp"
#include "common/primitive_iterator.hpp"
#include "common/verbose.hpp"
#include "gpu/compute/device_info.hpp"
#include "gpu/compute/dispatch.hpp"
#include "gpu/compute/kernel.hpp"
#include "gpu/compute/kernel_ctx.hpp"
#include "gpu/jit/jit_generator_base.hpp"
35 
36 namespace dnnl {
37 namespace impl {
38 namespace gpu {
39 namespace compute {
40 
41 class compute_engine_t : public engine_t {
42 public:
compute_engine_t(engine_kind_t kind,runtime_kind_t runtime_kind,size_t index)43     compute_engine_t(
44             engine_kind_t kind, runtime_kind_t runtime_kind, size_t index)
45         : engine_t(kind, runtime_kind, index) {}
46 
47     virtual status_t init();
48 
device_info() const49     const device_info_t *device_info() const { return device_info_.get(); }
50 
create_kernel(kernel_t * kernel,const char * kernel_name,const kernel_ctx_t & kernel_ctx) const51     status_t create_kernel(kernel_t *kernel, const char *kernel_name,
52             const kernel_ctx_t &kernel_ctx) const {
53 
54         std::vector<kernel_t> kernels(1);
55         auto status = create_kernels(&kernels, {kernel_name}, kernel_ctx);
56         if (status == status::success) *kernel = kernels[0];
57         return status;
58     }
59 
60     virtual status_t create_kernel(compute::kernel_t *kernel,
61             jit::jit_generator_base &jitter) const = 0;
62 
63     virtual status_t create_kernels(std::vector<compute::kernel_t> *kernels,
64             const std::vector<const char *> &kernel_names,
65             const compute::kernel_ctx_t &kernel_ctx) const = 0;
66 
create_kernels_from_ocl_source(std::vector<compute::kernel_t> * kernels,const std::vector<const char * > & kernel_names,const char * source_string,const compute::kernel_ctx_t & kernel_ctx) const67     virtual status_t create_kernels_from_ocl_source(
68             std::vector<compute::kernel_t> *kernels,
69             const std::vector<const char *> &kernel_names,
70             const char *source_string,
71             const compute::kernel_ctx_t &kernel_ctx) const {
72         assert(!"unexpected");
73         return status::success;
74     };
75 
get_zero_pad_primitive(primitive_t * & result,const resource_mapper_t * & resources)76     status_t get_zero_pad_primitive(
77             primitive_t *&result, const resource_mapper_t *&resources) {
78         std::call_once(zero_pad_init_, [&]() -> void {
79             zero_pad_desc_t desc;
80             desc.primitive_kind = primitive_kind::zero_pad;
81             dnnl_primitive_desc_iterator it(
82                     this, (op_desc_t *)&desc, nullptr, nullptr);
83             std::shared_ptr<primitive_desc_t> zero_pad_pd(*(++it));
84             if (zero_pad_pd == nullptr) return;
85 
86             status_t status
87                     = zero_pad_pd->create_primitive(zero_pad_primitive_, this);
88 #ifndef DNNL_USE_RT_OBJECTS_IN_PRIMITIVE_CACHE
89             if (status == status::success) {
90                 status = zero_pad_primitive_->create_resource(
91                         this, zero_pad_resources_);
92             }
93 #endif
94             if (status != status::success) { zero_pad_primitive_.reset(); }
95         });
96 
97         result = zero_pad_primitive_.get();
98         resources = &zero_pad_resources_;
99         return result != nullptr ? status::success : status::unimplemented;
100     };
101 
mayiuse(device_ext_t ext) const102     bool mayiuse(device_ext_t ext) const { return device_info_->has(ext); }
103 
is_gen9() const104     bool is_gen9() const {
105         return device_info_->gpu_arch() == gpu_arch_t::gen9;
106     }
is_xe_lp() const107     bool is_xe_lp() const {
108         return device_info_->gpu_arch() == gpu_arch_t::xe_lp;
109     }
is_xe_hp() const110     bool is_xe_hp() const {
111         return device_info_->gpu_arch() == gpu_arch_t::xe_hp;
112     }
is_xe_hpg() const113     bool is_xe_hpg() const {
114         return device_info_->gpu_arch() == gpu_arch_t::xe_hpg;
115     }
is_xe_hpc() const116     bool is_xe_hpc() const {
117         return device_info_->gpu_arch() == gpu_arch_t::xe_hpc;
118     }
mayiuse_ngen_kernels()119     bool mayiuse_ngen_kernels() {
120         return device_info_->mayiuse_ngen_kernels(this);
121     }
mayiuse_non_uniform_work_groups() const122     bool mayiuse_non_uniform_work_groups() const {
123         return device_info_->mayiuse_non_uniform_work_groups();
124     }
mayiuse_sub_group(int size) const125     bool mayiuse_sub_group(int size) const {
126         return device_info_->mayiuse_sub_group(size);
127     }
mayiuse_sub_group(std::initializer_list<int> sizes) const128     bool mayiuse_sub_group(std::initializer_list<int> sizes) const {
129         for (int size : sizes)
130             if (!mayiuse_sub_group(size)) return false;
131         return true;
132     }
mayiuse_large_grf_mode() const133     bool mayiuse_large_grf_mode() const {
134         // XXX: XeHPG 128EU A0 causes hangs with large GRF mode.
135         if (is_xe_hpg() && device_info()->eu_count() == 128
136                 && device_info()->stepping_id() == 0)
137             return false;
138         return device_info_->gpu_arch() >= compute::gpu_arch_t::xe_hp;
139     }
140 
create_dispatch(const memory_desc_t * md=nullptr) const141     dispatch_t create_dispatch(const memory_desc_t *md = nullptr) const {
142         return dispatch_t(this, md);
143     }
144 
get_service_stream(stream_t * & stream)145     status_t get_service_stream(stream_t *&stream) override {
146         status_t status = status::success;
147         if (service_stream_ == nullptr) {
148             const std::lock_guard<std::mutex> lock(service_stream_mutex_);
149             if (service_stream_ == nullptr) {
150                 stream_t *service_stream_ptr;
151                 status = create_stream(
152                         &service_stream_ptr, stream_flags::default_flags);
153                 if (status == status::success)
154                     service_stream_.reset(service_stream_ptr);
155             }
156         }
157         stream = service_stream_.get();
158         return status;
159     }
160 
161     // non-blocking query to check if service stream is already created
is_service_stream_created() const162     bool is_service_stream_created() const { return (bool)service_stream_; }
163 
164     virtual std::function<void(void *)> get_program_list_deleter() const = 0;
165 
166 protected:
167     virtual status_t init_device_info() = 0;
168 
169 #ifdef DNNL_USE_RT_OBJECTS_IN_PRIMITIVE_CACHE
170     ~compute_engine_t() override = default;
171 #endif
172 
173     std::shared_ptr<device_info_t> device_info_;
174 
175 private:
176     // Implement a zero_pad_primitive shared across the engine. The purpose is
177     // to prevent extra overhead associated with creating zero_pad_primitives
178     // for different inputs as ideally the zero_pad operations fast relative to
179     // the time to create the primitive.
180     std::shared_ptr<primitive_t> zero_pad_primitive_;
181     resource_mapper_t zero_pad_resources_;
182     std::once_flag zero_pad_init_;
183     std::unique_ptr<stream_t> service_stream_;
184     std::mutex service_stream_mutex_;
185 };
186 
187 } // namespace compute
188 } // namespace gpu
189 } // namespace impl
190 } // namespace dnnl
191 
192 // Exported for testing purposes only.
193 extern "C" bool DNNL_API dnnl_impl_gpu_mayiuse_ngen_kernels(
194         dnnl::impl::engine_t *engine);
195 
196 #endif
197