1 /******************************************************************************* 2 * Copyright 2019-2021 Intel Corporation 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 *******************************************************************************/ 16 17 #ifndef GPU_JIT_XE_HP_SYSTOLIC_GEMM_HPP 18 #define GPU_JIT_XE_HP_SYSTOLIC_GEMM_HPP 19 20 #include <assert.h> 21 #include <memory> 22 #include <tuple> 23 24 #include "common/c_types_map.hpp" 25 #include "common/gemm_utils.hpp" 26 #include "common/memory_storage.hpp" 27 #include "common/utils.hpp" 28 #include "gpu/compute/compute.hpp" 29 #include "gpu/gemm/gpu_gemm.hpp" 30 #include "gpu/gpu_gemm_pd.hpp" 31 #include "gpu/jit/gemm/gen_gemm_kernel.hpp" 32 #include "gpu/jit/gemm/xe_hp_systolic_gemm_kernel.hpp" 33 #include "gpu/primitive_conf.hpp" 34 35 namespace dnnl { 36 namespace impl { 37 namespace gpu { 38 namespace jit { 39 40 struct xe_hp_systolic_gemm_t : public gpu_gemm_t { 41 struct pd_t : public gpu_gemm_pd_t { 42 using hint_class = void; 43 pd_tdnnl::impl::gpu::jit::xe_hp_systolic_gemm_t::pd_t44 pd_t(const gemm_desc_t *adesc, const primitive_attr_t *attr, 45 const hint_class *) 46 : gpu_gemm_pd_t(adesc, attr, nullptr) {} 47 48 DECLARE_COMMON_PD_T("jit:xe_hp:gemm:any", xe_hp_systolic_gemm_t); 49 50 status_t init(engine_t *engine); 51 52 bool use_fma(); 53 bool set_default_formats(data_type_t dt); 54 55 size_t dyn_offset_a = 0; 56 size_t dyn_offset_b = 0; 57 size_t dyn_offset_c = 0; 58 impl_acc_typednnl::impl::gpu::jit::xe_hp_systolic_gemm_t::pd_t59 data_type_t impl_acc_type() const { 60 using namespace data_type; 61 return utils::one_of(desc()->c_type(), f16, bf16, f32) ? f32 : s32; 62 } 63 alphadnnl::impl::gpu::jit::xe_hp_systolic_gemm_t::pd_t64 float alpha() const { return attr()->output_scales_.scales_[0]; } 65 betadnnl::impl::gpu::jit::xe_hp_systolic_gemm_t::pd_t66 float beta() const { 67 using namespace primitive_kind; 68 const auto &p = attr()->post_ops_; 69 return p.contain(sum, 0) ? p.entry_[0].sum.scale : 0.f; 70 } 71 with_biasdnnl::impl::gpu::jit::xe_hp_systolic_gemm_t::pd_t72 bool with_bias() const { 73 return desc()->bias_type() != data_type::undef; 74 } 75 bias_cmaskdnnl::impl::gpu::jit::xe_hp_systolic_gemm_t::pd_t76 int bias_cmask() const { 77 unsigned char to_cmask[4] = {0, 2, 1, 3}; 78 return with_bias() ? to_cmask[(desc()->bias_mask() >> 1) & 3] : -1; 79 } 80 packed_adnnl::impl::gpu::jit::xe_hp_systolic_gemm_t::pd_t81 bool packed_a() const { return packed_a_; } packed_bdnnl::impl::gpu::jit::xe_hp_systolic_gemm_t::pd_t82 bool packed_b() const { return packed_b_; } packed_cdnnl::impl::gpu::jit::xe_hp_systolic_gemm_t::pd_t83 bool packed_c() const { return packed_c_; } 84 lda_packeddnnl::impl::gpu::jit::xe_hp_systolic_gemm_t::pd_t85 dim_t lda_packed() const { 86 return packed_a() ? desc()->b_desc.padded_dims[with_batch() ? 1 : 0] 87 : 0; 88 } ldb_packeddnnl::impl::gpu::jit::xe_hp_systolic_gemm_t::pd_t89 dim_t ldb_packed() const { 90 return packed_b() ? desc()->a_desc.padded_dims[with_batch() ? 2 : 1] 91 : 0; 92 } ldc_packeddnnl::impl::gpu::jit::xe_hp_systolic_gemm_t::pd_t93 dim_t ldc_packed() const { 94 return packed_c() ? desc()->c_desc.padded_dims[with_batch() ? 2 : 1] 95 : 0; 96 } 97 with_batchdnnl::impl::gpu::jit::xe_hp_systolic_gemm_t::pd_t98 bool with_batch() const { return desc()->is_batched(); } with_ab_zero_pointsdnnl::impl::gpu::jit::xe_hp_systolic_gemm_t::pd_t99 bool with_ab_zero_points() const { return a_zp_ || b_zp_; } with_c_zero_pointsdnnl::impl::gpu::jit::xe_hp_systolic_gemm_t::pd_t100 bool with_c_zero_points() const { return c_zp_; } 101 unroll_mdnnl::impl::gpu::jit::xe_hp_systolic_gemm_t::pd_t102 int unroll_m() const { return unroll_m_; } unroll_ndnnl::impl::gpu::jit::xe_hp_systolic_gemm_t::pd_t103 int unroll_n() const { return unroll_n_; } use_new_kernelsdnnl::impl::gpu::jit::xe_hp_systolic_gemm_t::pd_t104 bool use_new_kernels() const { return use_new_kernels_; } kernel_tagdnnl::impl::gpu::jit::xe_hp_systolic_gemm_t::pd_t105 char kernel_tag() const { return kernel_tag_; } 106 107 const compute::device_info_t *dev_info_ = nullptr; 108 109 private: 110 bool any_prepacked_ = false; 111 bool packed_a_ = false, packed_b_ = false, packed_c_ = false; 112 bool a_zp_ = false, b_zp_ = false, c_zp_ = false; 113 bool use_new_kernels_ = false; 114 int unroll_m_ = 0; 115 int unroll_n_ = 0; 116 char kernel_tag_ = '\0'; 117 }; 118 119 status_t init(engine_t *engine) override; 120 status_t init_res_storage( 121 engine_t *engine, gpu_resource_t *r) const override; 122 123 public: xe_hp_systolic_gemm_tdnnl::impl::gpu::jit::xe_hp_systolic_gemm_t124 xe_hp_systolic_gemm_t(const pd_t *apd) : gpu_gemm_t(apd) {} 125 126 virtual status_t execute(const gemm_exec_ctx_t &ctx) const override; 127 128 private: 129 status_t init_compute_old(engine_t *engine); 130 status_t init_compute_new(engine_t *engine); 131 132 bool enable_mn_blocking() const; 133 std::tuple<int64_t, int64_t, int64_t> get_blocking() const; 134 135 status_t launch_clear_sum(const gemm_exec_ctx_t &ctx, int64_t r, int64_t c, 136 const memory_storage_t &dst, int32_t offset_dst, int32_t ld_dst, 137 bool copyb) const; 138 status_t launch_copy(const gemm_exec_ctx_t &ctx, int64_t r, int64_t c, 139 const memory_storage_t &src, int64_t offset_src, int64_t ld_src, 140 const memory_storage_t &dst, int32_t offset_dst, int32_t ld_dst, 141 bool copyb) const; 142 status_t launch_compute(const gemm_exec_ctx_t &ctx, int32_t m, int32_t n, 143 int32_t k, const memory_storage_t &ap, int64_t offset_a, 144 int32_t lda, const memory_storage_t &bp, int64_t offset_b, 145 int32_t ldb, const memory_storage_t &c, int64_t offset_c, 146 int32_t ldc, float alpha, float beta, int16_t ao, int16_t bo, 147 const memory_storage_t &co, int32_t offset_co, bool first_k_block, 148 bool last_k_block, int32_t batch, int32_t stride_a, 149 int32_t stride_b, int32_t stride_c) const; 150 nice_lddnnl::impl::gpu::jit::xe_hp_systolic_gemm_t151 static int64_t nice_ld(int64_t ld, int sz, bool get_max = false) { 152 const auto align = 32; 153 const auto no_align = 64; 154 155 auto new_ld = (ld * sz + align - 1) & ~(align - 1); 156 if (get_max || (new_ld & (no_align - 1)) == 0) new_ld += align; 157 158 return new_ld / sz; 159 } 160 get_ld_packeddnnl::impl::gpu::jit::xe_hp_systolic_gemm_t161 int64_t get_ld_packed(int64_t k, bool get_max = false) const { 162 using compute_kernel_t = xehp_systolic_gemm_kernel_t<gpu_xe_hp>; 163 164 auto a_type = pd()->desc()->a_type(); 165 auto a_sz = types::data_type_size(a_type); 166 167 auto ld = utils::rnd_up(k, compute_kernel_t::unroll_k(a_type)); 168 if (pd()->with_ab_zero_points()) ld += 32 / a_sz; 169 170 return nice_ld(ld, int(a_sz), get_max); 171 } 172 max_ld_packeddnnl::impl::gpu::jit::xe_hp_systolic_gemm_t173 int64_t max_ld_packed(int64_t k) const { return get_ld_packed(k, true); } 174 175 static const int A_PACKED_ = 0; 176 static const int B_PACKED_ = 1; 177 178 compute::kernel_t kernel_[2][2]; // [first_k_block][last_k_block] 179 compute::kernel_t copy_kernel_[2][2]; // [trans][clear_sum] 180 181 CommonDriverInfo compute_info_; 182 183 compute::gpu_arch_t arch_ = compute::gpu_arch_t::unknown; 184 int eu_count_ = 0; 185 186 char co_kind_ = 'N'; 187 bool walk_n_first_ = false; 188 pddnnl::impl::gpu::jit::xe_hp_systolic_gemm_t189 const pd_t *pd() const { return (const pd_t *)gpu_primitive_t::pd().get(); } 190 }; 191 192 } // namespace jit 193 } // namespace gpu 194 } // namespace impl 195 } // namespace dnnl 196 197 #endif 198 // vim: et ts=4 sw=4 cindent cino+=l0,\:4,N-s 199