/*******************************************************************************
* Copyright 2019-2021 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef GPU_JIT_XE_HP_SYSTOLIC_GEMM_HPP
#define GPU_JIT_XE_HP_SYSTOLIC_GEMM_HPP

#include <assert.h>
#include <memory>
#include <tuple>

#include "common/c_types_map.hpp"
#include "common/gemm_utils.hpp"
#include "common/memory_storage.hpp"
#include "common/utils.hpp"
#include "gpu/compute/compute.hpp"
#include "gpu/gemm/gpu_gemm.hpp"
#include "gpu/gpu_gemm_pd.hpp"
#include "gpu/jit/gemm/gen_gemm_kernel.hpp"
#include "gpu/jit/gemm/xe_hp_systolic_gemm_kernel.hpp"
#include "gpu/primitive_conf.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace jit {

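// JIT GEMM implementation targeting the systolic array (DPAS) hardware of
// Xe-HP-class GPUs.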
struct xe_hp_systolic_gemm_t : public gpu_gemm_t {
    struct pd_t : public gpu_gemm_pd_t {
        using hint_class = void;

        pd_t(const gemm_desc_t *adesc, const primitive_attr_t *attr,
                const hint_class *)
            : gpu_gemm_pd_t(adesc, attr, nullptr) {}

        DECLARE_COMMON_PD_T("jit:xe_hp:gemm:any", xe_hp_systolic_gemm_t);

        status_t init(engine_t *engine);

        bool use_fma();
        bool set_default_formats(data_type_t dt);

        size_t dyn_offset_a = 0;
        size_t dyn_offset_b = 0;
        size_t dyn_offset_c = 0;

        data_type_t impl_acc_type() const {
            using namespace data_type;
            return utils::one_of(desc()->c_type(), f16, bf16, f32) ? f32 : s32;
        }

        float alpha() const { return attr()->output_scales_.scales_[0]; }

        float beta() const {
            using namespace primitive_kind;
            const auto &p = attr()->post_ops_;
            return p.contain(sum, 0) ? p.entry_[0].sum.scale : 0.f;
        }

        bool with_bias() const {
            return desc()->bias_type() != data_type::undef;
        }

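        // Translate the GEMM descriptor's bias broadcast mask into the
        // kernel's cmask encoding; -1 when there is no bias.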
        int bias_cmask() const {
            unsigned char to_cmask[4] = {0, 2, 1, 3};
            return with_bias() ? to_cmask[(desc()->bias_mask() >> 1) & 3] : -1;
        }

        bool packed_a() const { return packed_a_; }
        bool packed_b() const { return packed_b_; }
        bool packed_c() const { return packed_c_; }

        dim_t lda_packed() const {
            return packed_a() ? desc()->b_desc.padded_dims[with_batch() ? 1 : 0]
                              : 0;
        }
        dim_t ldb_packed() const {
            return packed_b() ? desc()->a_desc.padded_dims[with_batch() ? 2 : 1]
                              : 0;
        }
        dim_t ldc_packed() const {
            return packed_c() ? desc()->c_desc.padded_dims[with_batch() ? 2 : 1]
                              : 0;
        }

        bool with_batch() const { return desc()->is_batched(); }
        bool with_ab_zero_points() const { return a_zp_ || b_zp_; }
        bool with_c_zero_points() const { return c_zp_; }

        int unroll_m() const { return unroll_m_; }
        int unroll_n() const { return unroll_n_; }
        bool use_new_kernels() const { return use_new_kernels_; }
        char kernel_tag() const { return kernel_tag_; }

        const compute::device_info_t *dev_info_ = nullptr;

    private:
        bool any_prepacked_ = false;
        bool packed_a_ = false, packed_b_ = false, packed_c_ = false;
        bool a_zp_ = false, b_zp_ = false, c_zp_ = false;
        bool use_new_kernels_ = false;
        int unroll_m_ = 0;
        int unroll_n_ = 0;
        char kernel_tag_ = '\0';
    };

    status_t init(engine_t *engine) override;
    status_t init_res_storage(
            engine_t *engine, gpu_resource_t *r) const override;

public:
    xe_hp_systolic_gemm_t(const pd_t *apd) : gpu_gemm_t(apd) {}

    virtual status_t execute(const gemm_exec_ctx_t &ctx) const override;

private:
    status_t init_compute_old(engine_t *engine);
    status_t init_compute_new(engine_t *engine);

    bool enable_mn_blocking() const;
    std::tuple<int64_t, int64_t, int64_t> get_blocking() const;

    status_t launch_clear_sum(const gemm_exec_ctx_t &ctx, int64_t r, int64_t c,
            const memory_storage_t &dst, int32_t offset_dst, int32_t ld_dst,
            bool copyb) const;
    status_t launch_copy(const gemm_exec_ctx_t &ctx, int64_t r, int64_t c,
            const memory_storage_t &src, int64_t offset_src, int64_t ld_src,
            const memory_storage_t &dst, int32_t offset_dst, int32_t ld_dst,
            bool copyb) const;
    status_t launch_compute(const gemm_exec_ctx_t &ctx, int32_t m, int32_t n,
            int32_t k, const memory_storage_t &ap, int64_t offset_a,
            int32_t lda, const memory_storage_t &bp, int64_t offset_b,
            int32_t ldb, const memory_storage_t &c, int64_t offset_c,
            int32_t ldc, float alpha, float beta, int16_t ao, int16_t bo,
            const memory_storage_t &co, int32_t offset_co, bool first_k_block,
            bool last_k_block, int32_t batch, int32_t stride_a,
            int32_t stride_b, int32_t stride_c) const;

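    // Pad a leading dimension ld (in elements of size sz bytes) so that its
    // byte length is a multiple of 32 but not of 64; with get_max, return an
    // upper bound on the value this padding can produce.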
    static int64_t nice_ld(int64_t ld, int sz, bool get_max = false) {
        const auto align = 32;
        const auto no_align = 64;

        auto new_ld = (ld * sz + align - 1) & ~(align - 1);
        if (get_max || (new_ld & (no_align - 1)) == 0) new_ld += align;

        return new_ld / sz;
    }

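    // Leading dimension (in elements) for the internally packed A/B layouts:
    // round k up to the kernel's k unroll, add extra space when A/B zero
    // points are used, then apply the nice_ld() padding.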
    int64_t get_ld_packed(int64_t k, bool get_max = false) const {
        using compute_kernel_t = xehp_systolic_gemm_kernel_t<gpu_xe_hp>;

        auto a_type = pd()->desc()->a_type();
        auto a_sz = types::data_type_size(a_type);

        auto ld = utils::rnd_up(k, compute_kernel_t::unroll_k(a_type));
        if (pd()->with_ab_zero_points()) ld += 32 / a_sz;

        return nice_ld(ld, int(a_sz), get_max);
    }

    int64_t max_ld_packed(int64_t k) const { return get_ld_packed(k, true); }

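    // Resource storage indices for the internally packed copies of A and B.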
    static const int A_PACKED_ = 0;
    static const int B_PACKED_ = 1;

    compute::kernel_t kernel_[2][2]; // [first_k_block][last_k_block]
    compute::kernel_t copy_kernel_[2][2]; // [trans][clear_sum]

    CommonDriverInfo compute_info_;

    compute::gpu_arch_t arch_ = compute::gpu_arch_t::unknown;
    int eu_count_ = 0;

    char co_kind_ = 'N';
    bool walk_n_first_ = false;

    const pd_t *pd() const { return (const pd_t *)gpu_primitive_t::pd().get(); }
};

} // namespace jit
} // namespace gpu
} // namespace impl
} // namespace dnnl

#endif
// vim: et ts=4 sw=4 cindent cino+=l0,\:4,N-s