/dports/math/libxsmm/libxsmm-1.16.3/src/template/ |
H A D | libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c | 112 element_stats_type* del_beta_img_ptr; variable 117 del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, img, 0, nImg, 32); 186 _mm512_storeu_ps( del_beta_img_ptr, lcl_vdbeta ); 188 _mm512_storeu_ps( del_beta_img_ptr+16, lcl_vdbeta2 ); 206 lcl_vdbeta = _mm512_add_ps( lcl_vdbeta, _mm512_loadu_ps( del_beta_img_ptr ) ); 208 lcl_vdbeta2 = _mm512_add_ps( lcl_vdbeta2, _mm512_loadu_ps( del_beta_img_ptr+16 ) ); 210 del_beta_img_ptr += 32; 230 lcl_vdbeta = _mm512_add_ps( lcl_vdbeta, _mm512_loadu_ps( del_beta_img_ptr ) ); 234 del_beta_img_ptr += 32; 238 _mm512_storeu_ps( del_beta_img_ptr - (32*nImg), lcl_vdbeta ); [all …]
|
H A D | libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c | 109 element_stats_type* del_beta_img_ptr; variable 114 del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, img, 0, nImg, 16); 158 _mm512_storeu_ps( del_beta_img_ptr, lcl_vdbeta ); 168 … element_stats_type* del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, 0, 0, nImg, 16); variable 174 lcl_vdbeta = _mm512_add_ps( lcl_vdbeta, _mm512_loadu_ps( del_beta_img_ptr ) ); 176 del_beta_img_ptr += 16; 186 … element_stats_type* del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, 0, 0, nImg, 16); variable 192 lcl_vdbeta = _mm512_add_ps( lcl_vdbeta, _mm512_loadu_ps( del_beta_img_ptr ) ); 194 del_beta_img_ptr += 16; 198 _mm512_storeu_ps( del_beta_img_ptr - (nImg*16), lcl_vdbeta );
|
H A D | libxsmm_dnn_fusedbatchnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c | 118 element_stats_type* del_beta_img_ptr; variable 123 del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, img, 0, nImg, 64); 232 _mm512_storeu_ps( del_beta_img_ptr, lcl_vdbeta ); 234 _mm512_storeu_ps( del_beta_img_ptr+16, lcl_vdbeta2 ); 236 _mm512_storeu_ps( del_beta_img_ptr+32, lcl_vdbeta3 ); 238 _mm512_storeu_ps( del_beta_img_ptr+48, lcl_vdbeta4 ); 254 lcl_vdbeta = _mm512_add_ps( lcl_vdbeta, _mm512_loadu_ps( del_beta_img_ptr ) ); 256 del_beta_img_ptr += 64; 272 lcl_vdbeta = _mm512_add_ps( lcl_vdbeta, _mm512_loadu_ps( del_beta_img_ptr ) ); 274 del_beta_img_ptr += 64; [all …]
|
H A D | libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c32_avx512.tpl.c | 109 element_stats_type* del_beta_img_ptr; variable 114 del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, img, 0, nImg, 32); 185 _mm512_storeu_ps( del_beta_img_ptr, lcl_vdbeta ); 187 _mm512_storeu_ps( del_beta_img_ptr+16, lcl_vdbeta2 ); 195 … element_stats_type* del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, 0, 0, nImg, 32); variable 203 lcl_vdbeta = _mm512_add_ps( lcl_vdbeta, _mm512_loadu_ps( del_beta_img_ptr ) ); 205 lcl_vdbeta2 = _mm512_add_ps( lcl_vdbeta2, _mm512_loadu_ps( del_beta_img_ptr+16 ) ); 207 del_beta_img_ptr += 32;
|
H A D | libxsmm_dnn_fusedbatchnorm_st_bwd_custom_generic.tpl.c | 114 element_stats_type* del_beta_img_ptr; variable 119 del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, img, 0, nImg, nFmBlock); 184 del_beta_img_ptr[v] = lcl_beta_ptr[v]; 205 …element_stats_type* del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, img, 0, nImg, nFmBl… variable 210 del_beta_ptr[v] += del_beta_img_ptr[v]; 222 …element_stats_type* del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, img, 0, nImg, nFmBl… variable 227 del_beta_ptr[v] += del_beta_img_ptr[v];
|
H A D | libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c64_avx512.tpl.c | 115 element_stats_type* del_beta_img_ptr; variable 120 del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, img, 0, nImg, 64); 231 _mm512_storeu_ps( del_beta_img_ptr, lcl_vdbeta ); 233 _mm512_storeu_ps( del_beta_img_ptr+16, lcl_vdbeta2 ); 235 _mm512_storeu_ps( del_beta_img_ptr+32, lcl_vdbeta3 ); 237 _mm512_storeu_ps( del_beta_img_ptr+48, lcl_vdbeta4 ); 245 …element_stats_type* del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, (fm/4), 0, ((fm%4)*16),… variable 251 lcl_vdbeta = _mm512_add_ps( lcl_vdbeta, _mm512_loadu_ps( del_beta_img_ptr ) ); 253 del_beta_img_ptr += 64;
|
H A D | libxsmm_dnn_fusedgroupnorm_st_bwd_custom_f32_bf16_c16_avx512.tpl.c | 106 element_stats_type* del_beta_img_ptr; variable 111 del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, img, 0, nImg, 16); 154 _mm512_storeu_ps( del_beta_img_ptr, lcl_vdbeta ); 162 … element_stats_type* del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, 0, 0, nImg, 16); variable 168 lcl_vdbeta = _mm512_add_ps( lcl_vdbeta, _mm512_loadu_ps( del_beta_img_ptr ) ); 170 del_beta_img_ptr += 16;
|
H A D | libxsmm_dnn_fusedgroupnorm_st_bwd_custom_generic.tpl.c | 130 element_stats_type* del_beta_img_ptr; variable 133 del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, img, 0, nImg, nFmBlock); 201 del_beta_img_ptr[v] = lcl_beta_ptr[v]; 254 …element_stats_type* del_beta_img_ptr = &LIBXSMM_VLA_ACCESS(3, dbeta_img, fm, img, 0, nImg, nFmBl… variable 259 del_beta_ptr[v] += del_beta_img_ptr[v];
|