/dports/misc/mxnet/incubator-mxnet-1.9.0/3rdparty/mkldnn/src/gpu/ocl/gemm/ |
H A D | gen9_gemm_nocopy_f16.cl | 227 global half *B_ptrs[2] = {B, B + 16 * ldb}; 250 b[j] = VLOAD4_ALIGNED(0, (B_ptrs[j] + h)); 264 b[j] = B_ptrs[j][h]; 294 if (jrem > j * 16) b[j] = B_ptrs[j][h]; 414 (global ushort *)(B_ptrs[0] + h * ldb))); 438 if (jrem > 0) b[0].s0 = B_ptrs[0][h * ldb + lid]; 524 global half *B_ptrs[2] = {B, B + 16 * ldb}; 542 b[z] = VLOAD4_ALIGNED(0, (B_ptrs[z] + h)); 554 b[z] = B_ptrs[z][h]; 577 if (jrem > z * 16) b[z] = B_ptrs[z][h]; [all …]
|
H A D | gen9_gemm_nocopy_f32.cl | 574 global float *B_ptrs[2] = {B, B + 16 * ldb}; 592 if (jrem > hh * 16) b[hh] = vload4(0, B_ptrs[hh]); 593 B_ptrs[hh] += 4; 608 if (jrem > hh * 16) b[hh] = *B_ptrs[hh]; 609 B_ptrs[hh]++;
|
/dports/math/onednn/oneDNN-2.5.1/src/gpu/ocl/gemm/ |
H A D | gen9_gemm_nocopy_f16.cl | 227 global half *B_ptrs[2] = {B, B + 16 * ldb}; 250 b[j] = VLOAD4_ALIGNED(0, (B_ptrs[j] + h)); 264 b[j] = B_ptrs[j][h]; 294 if (jrem > j * 16) b[j] = B_ptrs[j][h]; 414 (global ushort *)(B_ptrs[0] + h * ldb))); 438 if (jrem > 0) b[0].s0 = B_ptrs[0][h * ldb + lid]; 524 global half *B_ptrs[2] = {B, B + 16 * ldb}; 542 b[z] = VLOAD4_ALIGNED(0, (B_ptrs[z] + h)); 554 b[z] = B_ptrs[z][h]; 577 if (jrem > z * 16) b[z] = B_ptrs[z][h]; [all …]
|
H A D | gen9_gemm_nocopy_f32.cl | 574 global float *B_ptrs[2] = {B, B + 16 * ldb}; 592 if (jrem > hh * 16) b[hh] = vload4(0, B_ptrs[hh]); 593 B_ptrs[hh] += 4; 608 if (jrem > hh * 16) b[hh] = *B_ptrs[hh]; 609 B_ptrs[hh]++;
|
/dports/math/libxsmm/libxsmm-1.16.3/src/template/ |
H A D | libxsmm_dnn_convolve_st_fwd_custom_custom_generic_bf16.tpl.c | 25 const element_input_type *B_ptrs[1024]; variable 167 br_gemm_kernel2(A_ptrs, B_ptrs, out_ptr, &n_blocks); 191 br_gemm_kernel2(A_ptrs, B_ptrs, out_ptr, &n_blocks); 213 br_gemm_kernel(A_ptrs, B_ptrs, out_ptr, &n_blocks); 269 …B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki, 0, handle->bloc… 280 br_gemm_kernel(A_ptrs, B_ptrs, out_ptr, &n_blocks); 354 br_gemm_kernel2(A_ptrs, B_ptrs, out_ptr, &n_blocks); 376 br_gemm_kernel2(A_ptrs, B_ptrs, out_ptr, &n_blocks); 398 br_gemm_kernel(A_ptrs, B_ptrs, out_ptr, &n_blocks); 473 br_gemm_kernel(A_ptrs, B_ptrs, out_ptr, &n_blocks); [all …]
|
H A D | libxsmm_dnn_convolve_st_fwd_custom_custom_generic.tpl.c | 24 const element_input_type *B_ptrs[1024]; variable 187 …B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki + 1, 0, handle->… 196 …B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki, 0, handle->bloc… 205 …B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki, 0, handle->bloc… 259 …B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki, 0, handle->bloc… 265 …br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, ofm1, oj_use, oi_use, 0, handle… 322 …B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki + 1, 0, handle->… 331 …B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki, 0, handle->bloc… 340 …B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki, 0, handle->bloc… 397 …B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ifm2, ij_use + kj, ii_use + ki, 0, handle->bloc… [all …]
|
H A D | libxsmm_dnn_convolve_st_fwd_nhwc_custom-rsck_generic.tpl.c | 24 const element_input_type *B_ptrs[1024]; variable 198 …B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ij_use + kj, ii_use + ki + 1, ifm2, 0, IFH, IFW… 212 …B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ij_use + kj, ii_use + ki, ifm2, 0, IFH, IFW, ha… 226 …B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ij_use + kj, ii_use + ki, ifm2, 0, IFH, IFW, ha… 285 …B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ij_use + kj, ii_use + ki, ifm2, 0, IFH, IFW, ha… 291 …br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, output, img, oj_use, oi_use, ofm1, 0, handle… 353 …B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ij_use + kj, ii_use + ki + 1, ifm2, 0, IFH, IFW… 367 …B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ij_use + kj, ii_use + ki, ifm2, 0, IFH, IFW, ha… 381 …B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ij_use + kj, ii_use + ki, ifm2, 0, IFH, IFW, ha… 443 …B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img, ij_use + kj, ii_use + ki, ifm2, 0, IFH, IFW, ha… [all …]
|
H A D | libxsmm_dnn_convolve_st_bwd_custom_custom_generic_bf16.tpl.c | 27 const element_input_type *B_ptrs[1024]; variable 159 …B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, ofm2, oj_use + kj, oi_use + ki + 1, 0, handle-… 167 br_gemm_kernel2(A_ptrs, B_ptrs, del_inp_ptr, &n_blocks); 183 …B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, ofm2, oj_use + kj, oi_use + ki, 0, handle->blo… 191 br_gemm_kernel2(A_ptrs, B_ptrs, del_inp_ptr, &n_blocks); 207 …B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, ofm2, oj_use + kj, oi_use + ki, 0, handle->blo… 215 br_gemm_kernel(A_ptrs, B_ptrs, del_inp_ptr, &n_blocks); 274 …B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, ofm2, oj_use + kj, oi_use + ki, 0, handle->blo… 284 br_gemm_kernel(A_ptrs, B_ptrs, del_inp_ptr, &n_blocks); 338 …B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, ofm2, oj_use + kj, oi_use + ki, 0, handle->blo… [all …]
|
H A D | libxsmm_dnn_convolve_st_upd_nhwc_custom-rsck_generic.tpl.c | 47 const element_input_type *B_ptrs[1024]; variable 281 …B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img + img_br, ij + j_br * handle->desc.u, ii, ifm1, 0,… 287 …br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_global, ofm1, ifm1, kj, ki, 0, 0, han… 297 …B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img + img_br, ij + j_br * handle->desc.u, ii + 1, ifm1… 303 …br_gemm_kernel2(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_global, ofm1, ifm1, kj, ki, 0, 0, ha… 313 …B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img + img_br, ij + j_br * handle->desc.u, ii, ifm1, 0,… 330 …B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img + img_br, ij + j_br * handle->desc.u, ii, ifm1, 0,… 346 …B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img + img_br, ij + j_br * handle->desc.u, ii, ifm1, 0,… 502 …B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input_use, img, ij + j_br * handle->desc.u, ii, ifm1, 0, IFHP… 554 …B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img + img_br, ij + j_br * handle->desc.u, ii, ifm1, 0,… [all …]
|
H A D | libxsmm_dnn_convolve_st_bwd_custom_custom_generic.tpl.c | 23 const element_input_type *B_ptrs[1024]; variable 174 …B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, ofm2, oj_use + kj, oi_use + ki + 1, 0, handle-… 178 …br_gemm_kernel2(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij_use, ii_use + 1, 0… 183 …B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, ofm2, oj_use + kj, oi_use + ki, 0, handle->blo… 187 …br_gemm_kernel2(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij_use, ii_use, 0, ha… 192 …B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, ofm2, oj_use + kj, oi_use + ki, 0, handle->blo… 196 …br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij_use, ii_use, 0, han… 242 …B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, ofm2, oj_use + kj, oi_use + ki, 0, handle->blo… 248 …br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, del_input, img, ifm1, ij_use, ii_use, 0, han… 292 …B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, ofm2, oj_use + kj, oi_use + ki, 0, handle->blo… [all …]
|
H A D | libxsmm_dnn_convolve_st_bwd_nhwc_custom-rsck_generic.tpl.c | 23 const element_input_type *B_ptrs[1024]; variable 189 …B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, oj_use + kj, oi_use + ki + 1, ofm2, 0, handle-… 193 …br_gemm_kernel2(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, del_input, img, ij_use, ii_use + 1, ifm1, 0… 198 …B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, oj_use + kj, oi_use + ki, ofm2, 0, handle->ofh… 202 …br_gemm_kernel2(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, del_input, img, ij_use, ii_use, ifm1, 0, IF… 207 …B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, oj_use + kj, oi_use + ki, ofm2, 0, handle->ofh… 211 …br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, del_input, img, ij_use, ii_use, ifm1, 0, IFH… 257 …B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, oj_use + kj, oi_use + ki, ofm2, 0, handle->ofh… 263 …br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(5, del_input, img, ij_use, ii_use, ifm1, 0, IFH… 307 …B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, output, img, oj_use + kj, oi_use + ki, ofm2, 0, handle->ofh… [all …]
|
H A D | libxsmm_dnn_convolve_st_upd_custom_custom_generic.tpl.c | 32 const element_input_type *B_ptrs[1024]; variable 242 …B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img + img_br, ifm1, ij + j_br * handle->desc.u, ii, 0,… 247 …br_gemm_kernel(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_global, ofm1, ifm1, kj, ki, 0, 0, han… 253 …B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img + img_br, ifm1, ij + j_br * handle->desc.u, ii + 1… 258 …br_gemm_kernel2(A_ptrs, B_ptrs, &LIBXSMM_VLA_ACCESS(6, weight_global, ofm1, ifm1, kj, ki, 0, 0, ha… 264 …B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img + img_br, ifm1, ij + j_br * handle->desc.u, ii, 0,… 276 …B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img + img_br, ifm1, ij + j_br * handle->desc.u, ii, 0,… 287 …B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img + img_br, ifm1, ij + j_br * handle->desc.u, ii, 0,… 424 …B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input_use, img, ifm1, ij + j_br * handle->desc.u, ii, 0, hand… 466 …B_ptrs[ind] = &LIBXSMM_VLA_ACCESS(5, input, img + img_br, ifm1, ij + j_br * handle->desc.u, ii, 0,… [all …]
|
H A D | libxsmm_dnn_convolve_st_upd_custom_custom_generic_bf16.tpl.c | 128 const element_input_type *B_ptrs[1024]; variable 298 …B_ptrs[j_br] = (element_input_type*) &LIBXSMM_VLA_ACCESS(5, tr_input_2, img, 0, 0, j_br, 0, handle… 301 br_gemm_kernel(A_ptrs, B_ptrs, dst_ptr, &n_blocks); 416 …B_ptrs[j_br] = (element_input_type*) &LIBXSMM_VLA_ACCESS(5, tr_input_2, img, 0, 0, j_br, 0, handle… 419 br_gemm_kernel(A_ptrs, B_ptrs, dst_ptr, &n_blocks); 526 …B_ptrs[img_br] = &LIBXSMM_VLA_ACCESS(4, tr_input, img + img_br, ifm1, 0, pix + kj * handle->ifwp +… 529 br_gemm_kernel(A_ptrs, B_ptrs, dst_ptr, &n_blocks);
|