/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved.                      *
* This file is part of the LIBXSMM library.                                   *
*                                                                             *
* For information on the license, see the LICENSE file.                       *
* Further information: https://github.com/hfp/libxsmm/                        *
* SPDX-License-Identifier: BSD-3-Clause                                       *
******************************************************************************/
/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.)
******************************************************************************/

/* NOTE(review): this is a template fragment that is #include'd into an
 * enclosing function body; `handle`, `tid`, `start_thread` and the
 * element_*_type typedefs are provided by the including translation unit.
 * The channel dimension is blocked by 32 (two 16-lane AVX-512 vectors per
 * pixel); compile-time flags select BF16 I/O, fused eltwise-add backward,
 * and fused ReLU backward (recompute-from-output or stored-bitmask). */

/* Activation load/store helpers: for BF16 the data is stored as the upper
 * 16 bits of an FP32 value, so load = widen+shift-left-16, store =
 * round-to-nearest-even-to-BF16 then shift-right-16 and narrow. */
#if defined(LIBXSMM_DNN_FUSEDBN_BWD_BF16)
# define _mm512_load_act(A)     _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(A))),16))
#if 1
# define _mm512_roundbf16rne(A) LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(A)
# define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16)))
# define _mm512_store_act(A,B)  _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16)))
#else
/* alternative: plain truncation to BF16 (no rounding) */
# define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16)))
# define _mm512_store_act(A,B)  _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16)))
#endif
#else
/* FP32 path: plain (streaming) loads/stores */
# define _mm512_load_act(A)     _mm512_loadu_ps(A)
# define _mm512_stream_act(A,B) LIBXSMM_INTRINSICS_MM512_STREAM_PS(A,B)
# define _mm512_store_act(A,B)  _mm512_storeu_ps(A,B)
#endif

/* size variables, all const */
const int nImg = handle->desc.partN;
const int ifh = handle->desc.H;
const int ifw = handle->desc.W;
const int sh = handle->desc.u;           /* vertical stride */
const int sw = handle->desc.v;           /* horizontal stride */
const int ofh = ifh/sh;
const int ofw = ifw/sw;
const int iph = handle->desc.pad_h_in;
const int ipw = handle->desc.pad_w_in;
const int oph = handle->desc.pad_h_out;
const int opw = handle->desc.pad_w_out;
const int ofhp = ofh + 2*oph;            /* padded output height */
const int ofwp = ofw + 2*opw;            /* padded output width */
const int ifhp = ifh + 2*iph;            /* padded input height */
const int ifwp = ifw + 2*ipw;            /* padded input width */
/* here we assume that input and output blocking is similar */
const int nBlocksFm = handle->blocksifm;

/* N*H*W normalization factor and its reciprocal (fullN: global batch,
 * as opposed to partN which is this rank's share) */
const element_stats_type nhw = (element_stats_type)(handle->desc.fullN * ifh * ifw);
const element_stats_type recp_nhw = 1.0f/nhw;

/* computing first logical thread */
const int ltid = tid - start_thread;
/* number of tasks that could be run in parallel */
const int work = nImg * nBlocksFm;
/* compute chunk size */
const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1);
/* compute thr_begin and thr_end */
const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work;
const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work;

/* number of tasks that could be run in parallel, delta gamma and beta reduction */
const int work2 = nBlocksFm;
/* compute chunk size */
const int chunksize2 = (work2 % handle->desc.threads == 0) ? (work2 / handle->desc.threads) : ((work2 / handle->desc.threads) + 1);
/* compute thr_begin and thr_end */
const int thr_begin2 = (ltid * chunksize2 < work2) ? (ltid * chunksize2) : work2;
const int thr_end2 = ((ltid + 1) * chunksize2 < work2) ? ((ltid + 1) * chunksize2) : work2;

/* loop variables */
int img = 0;
int fm = 0;
int imgfm = 0;
int hi = 0;
int wi = 0;
int ho = 0;
int wo = 0;

/* 5D tensors are laid out [img][fmBlock][h][w][32 channels] */
LIBXSMM_VLA_DECL(5, element_input_type,  dinput,     (element_input_type* )handle->grad_input->data,  nBlocksFm, ifhp, ifwp, 32);
LIBXSMM_VLA_DECL(5, element_input_type,  input,      (element_input_type* )handle->reg_input->data,   nBlocksFm, ifhp, ifwp, 32);
#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE)
LIBXSMM_VLA_DECL(5, element_input_type,  dinput_add, (element_input_type* )handle->grad_add->data,    nBlocksFm, ifhp, ifwp, 32);
#endif
#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU)
LIBXSMM_VLA_DECL(5, const element_output_type, output, (element_output_type*)handle->reg_output->data, nBlocksFm, ofhp, ofwp, 32);
#endif
LIBXSMM_VLA_DECL(5, element_output_type, doutput,    (element_output_type*)handle->grad_output->data, nBlocksFm, ofhp, ofwp, 32);

LIBXSMM_VLA_DECL(2, const element_stats_type, gamma,  (element_stats_type*)handle->reg_gamma->data,  32);
LIBXSMM_VLA_DECL(2, element_stats_type,       dgamma, (element_stats_type*)handle->grad_gamma->data, 32);
LIBXSMM_VLA_DECL(2, element_stats_type,       dbeta,  (element_stats_type*)handle->grad_beta->data,  32);
LIBXSMM_VLA_DECL(2, const element_stats_type, bmean,  (element_stats_type*)handle->expvalue->data,   32);
LIBXSMM_VLA_DECL(2, const element_stats_type, brstd,  (element_stats_type*)handle->rcpstddev->data,  32);
/* per-image partial sums in scratch: dgamma_img first, dbeta_img directly
 * after it (offset nImg*nBlocksFm*32 floats) */
LIBXSMM_VLA_DECL(3, element_stats_type, dgamma_img, (element_stats_type*)handle->scratch, nImg, 32);
LIBXSMM_VLA_DECL(3, element_stats_type, dbeta_img, ((element_stats_type*)handle->scratch) + ((size_t)nImg * (size_t)nBlocksFm * (size_t)32), nImg, 32);
#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK)
/* 1 bit per channel: 32 channels -> 4 mask bytes per pixel */
LIBXSMM_VLA_DECL(5, const unsigned char, relumask, (unsigned char*)handle->relumask->data, nBlocksFm, ofhp, ofwp, 4);
#endif

/* lazy barrier init */
libxsmm_barrier_init(handle->barrier, ltid);

if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0)      ||
     ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) > 0) ||
     ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) > 0) ) {
  /* Pass 1: per-(img, fmBlock) partial sums of
   *   dgamma += (x - mean) * rcpstddev * dout   and   dbeta += dout,
   * with the fused ReLU/eltwise backward applied to dout on the fly.
   * Each 32-channel block is processed as two 16-lane halves. */
  for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) {
    __m512 lcl_vdgamma = _mm512_setzero_ps();
    __m512 lcl_vdbeta  = _mm512_setzero_ps();
    __m512 lcl_vdgamma2 = _mm512_setzero_ps();
    __m512 lcl_vdbeta2  = _mm512_setzero_ps();
    __m512 lcl_vbmean, lcl_vbrstd;
    __m512 lcl_vbmean2, lcl_vbrstd2;
    element_stats_type* del_gamma_img_ptr;
    element_stats_type* del_beta_img_ptr;

    img = imgfm / nBlocksFm;
    fm = imgfm % nBlocksFm;
    del_gamma_img_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img, fm, img, 0, nImg, 32);
    del_beta_img_ptr  = &LIBXSMM_VLA_ACCESS(3, dbeta_img,  fm, img, 0, nImg, 32);
    lcl_vbmean  = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm,  0, 32) );
    lcl_vbrstd  = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm,  0, 32) );
    lcl_vbmean2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 16, 32) );
    lcl_vbrstd2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 16, 32) );

    /* hi walks the strided input rows, ho the dense output rows */
    for ( hi=iph, ho=oph; hi < (ifh + iph); hi+=sh, ho++ ) {
#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE)
      element_input_type* del_input_add_ptr = &LIBXSMM_VLA_ACCESS(5, dinput_add, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 32);
#endif
#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU)
      const element_output_type* output_ptr = &LIBXSMM_VLA_ACCESS(5, output, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 32);
#endif
#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK)
      const unsigned char* relumask_ptr = &LIBXSMM_VLA_ACCESS(5, relumask, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 4);
#endif
      const element_input_type* input_ptr = &LIBXSMM_VLA_ACCESS(5, input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 32);
      element_output_type* del_output_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 32);
      for ( wi=ipw, wo=opw; wi < (ifw + ipw); wi+=sw, wo++ ) {
        __m512 lcl_vdeloutput, lcl_vdeloutput2;
#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU) || defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK)
        const __m512 vzero = _mm512_setzero_ps();
        __mmask16 lcl_relumask, lcl_relumask2;
#endif

        /* --- channels 0..15 --- */
        lcl_vdeloutput = _mm512_load_act( del_output_ptr );
#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU)
        /* ReLU backward, recomputing the mask from the forward output:
         * zero dout where the forward output was zero; the filtered dout
         * is written back to doutput */
        lcl_relumask = _mm512_cmp_ps_mask( _mm512_load_act( output_ptr ), vzero, _CMP_NEQ_OQ );
        lcl_vdeloutput = _mm512_mask_blend_ps( lcl_relumask, vzero, lcl_vdeloutput );
        _mm512_store_act( del_output_ptr, lcl_vdeloutput );
#endif
#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK)
        /* ReLU backward using the stored bitmask (16 bits per half-block) */
        lcl_relumask = LIBXSMM_INTRINSICS_MM512_LOAD_MASK16( relumask_ptr );
        lcl_vdeloutput = _mm512_mask_blend_ps( lcl_relumask, vzero, lcl_vdeloutput );
        _mm512_store_act( del_output_ptr, lcl_vdeloutput );
        relumask_ptr += 2;
#endif
#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE)
        /* eltwise-add backward: the (ReLU-filtered) dout also feeds the
         * residual branch */
        _mm512_stream_act( del_input_add_ptr, lcl_vdeloutput );
#endif
        lcl_vdgamma = _mm512_add_ps( lcl_vdgamma, _mm512_mul_ps( _mm512_mul_ps( _mm512_sub_ps( _mm512_load_act( input_ptr ), lcl_vbmean ), lcl_vdeloutput ), lcl_vbrstd ) );
        lcl_vdbeta  = _mm512_add_ps( lcl_vdbeta, lcl_vdeloutput );

        /* --- channels 16..31 --- */
        lcl_vdeloutput2 = _mm512_load_act( del_output_ptr+16 );
#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU)
        lcl_relumask2 = _mm512_cmp_ps_mask( _mm512_load_act( output_ptr+16 ), vzero, _CMP_NEQ_OQ );
        lcl_vdeloutput2 = _mm512_mask_blend_ps( lcl_relumask2, vzero, lcl_vdeloutput2 );
        _mm512_store_act( del_output_ptr+16, lcl_vdeloutput2 );
        output_ptr += 32;
#endif
#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK)
        lcl_relumask2 = LIBXSMM_INTRINSICS_MM512_LOAD_MASK16( relumask_ptr );
        lcl_vdeloutput2 = _mm512_mask_blend_ps( lcl_relumask2, vzero, lcl_vdeloutput2 );
        _mm512_store_act( del_output_ptr+16, lcl_vdeloutput2 );
        relumask_ptr += 2;
#endif
#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE)
        _mm512_stream_act( del_input_add_ptr+16, lcl_vdeloutput2 );
        del_input_add_ptr += sw*32;   /* input-side tensors advance by stride */
#endif
        lcl_vdgamma2 = _mm512_add_ps( lcl_vdgamma2, _mm512_mul_ps( _mm512_mul_ps( _mm512_sub_ps( _mm512_load_act( input_ptr+16 ), lcl_vbmean2 ), lcl_vdeloutput2 ), lcl_vbrstd2 ) );
        lcl_vdbeta2  = _mm512_add_ps( lcl_vdbeta2, lcl_vdeloutput2 );

        input_ptr += sw*32;           /* input advances by stride */
        del_output_ptr += 32;         /* output is dense */
      }
    }

    /* write this image's partial sums to scratch */
    _mm512_storeu_ps( del_gamma_img_ptr,    lcl_vdgamma );
    _mm512_storeu_ps( del_beta_img_ptr,     lcl_vdbeta );
    _mm512_storeu_ps( del_gamma_img_ptr+16, lcl_vdgamma2 );
    _mm512_storeu_ps( del_beta_img_ptr+16,  lcl_vdbeta2 );
  }

  /* all partial sums must be in scratch before the reduction */
  libxsmm_barrier_wait(handle->barrier, ltid);

  if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) ||
       ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) > 0) ) {
    /* now we need to reduce the del_gamm and del_beta */
    for ( fm = thr_begin2; fm < thr_end2; ++fm ) {
      element_stats_type* del_gamma_img_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img, fm, 0, 0, nImg, 32);
      element_stats_type* del_beta_img_ptr  = &LIBXSMM_VLA_ACCESS(3, dbeta_img,  fm, 0, 0, nImg, 32);
      __m512 lcl_vdgamma = _mm512_setzero_ps();
      __m512 lcl_vdbeta  = _mm512_setzero_ps();
      __m512 lcl_vdgamma2 = _mm512_setzero_ps();
      __m512 lcl_vdbeta2  = _mm512_setzero_ps();

      for ( img=0; img < nImg; img++ ) {
        lcl_vdgamma = _mm512_add_ps( lcl_vdgamma, _mm512_loadu_ps( del_gamma_img_ptr ) );
        lcl_vdbeta  = _mm512_add_ps( lcl_vdbeta,  _mm512_loadu_ps( del_beta_img_ptr ) );
        lcl_vdgamma2 = _mm512_add_ps( lcl_vdgamma2, _mm512_loadu_ps( del_gamma_img_ptr+16 ) );
        lcl_vdbeta2  = _mm512_add_ps( lcl_vdbeta2,  _mm512_loadu_ps( del_beta_img_ptr+16 ) );
        del_gamma_img_ptr += 32;
        del_beta_img_ptr += 32;
      }

      /* reduced gradients go to the dgamma/dbeta tensors */
      _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, fm,  0, 32), lcl_vdgamma );
      _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta,  fm,  0, 32), lcl_vdbeta );
      _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 16, 32), lcl_vdgamma2 );
      _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta,  fm, 16, 32), lcl_vdbeta2 );
    }
  } else {
    /* BNSTATS_NORED variant: reduce over images but keep the result in
     * scratch (slot img==0) instead of writing the dgamma/dbeta tensors —
     * presumably a later (cross-rank) step consumes it; confirm in caller.
     * now we need to reduce the del_gamm and del_beta */
    for ( fm = thr_begin2; fm < thr_end2; ++fm ) {
      element_stats_type* del_gamma_img_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img, fm, 0, 0, nImg, 32);
      element_stats_type* del_beta_img_ptr  = &LIBXSMM_VLA_ACCESS(3, dbeta_img,  fm, 0, 0, nImg, 32);
      __m512 lcl_vdgamma = _mm512_setzero_ps();
      __m512 lcl_vdbeta  = _mm512_setzero_ps();
      __m512 lcl_vdgamma2 = _mm512_setzero_ps();
      __m512 lcl_vdbeta2  = _mm512_setzero_ps();

      for ( img=0; img < nImg; img++ ) {
        lcl_vdgamma = _mm512_add_ps( lcl_vdgamma, _mm512_loadu_ps( del_gamma_img_ptr ) );
        lcl_vdbeta  = _mm512_add_ps( lcl_vdbeta,  _mm512_loadu_ps( del_beta_img_ptr ) );
        lcl_vdgamma2 = _mm512_add_ps( lcl_vdgamma2, _mm512_loadu_ps( del_gamma_img_ptr+16 ) );
        lcl_vdbeta2  = _mm512_add_ps( lcl_vdbeta2,  _mm512_loadu_ps( del_beta_img_ptr+16 ) );
        del_gamma_img_ptr += 32;
        del_beta_img_ptr += 32;
      }

      /* ptr - 32*nImg rewinds to the img==0 slot of this fm block */
      _mm512_storeu_ps( del_gamma_img_ptr - (32*nImg),      lcl_vdgamma );
      _mm512_storeu_ps( del_beta_img_ptr  - (32*nImg),      lcl_vdbeta );
      _mm512_storeu_ps( del_gamma_img_ptr - (32*nImg) + 16, lcl_vdgamma2 );
      _mm512_storeu_ps( del_beta_img_ptr  - (32*nImg) + 16, lcl_vdbeta2 );
    }
  }

  libxsmm_barrier_wait(handle->barrier, ltid);
}

if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0) ||
     ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) > 0) ) {
  /* Pass 2: apply the batch-norm input gradient
   *   dx = gamma * rcpstddev * (1/nhw) * (nhw*dout - (dgamma*rcpstddev*(x-mean) + dbeta))
   * again as two 16-lane halves per 32-channel block */
  /* now we apply the actual backward batch norm */
  for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) {
    __m512 lcl_vgamma, lcl_vbmean, lcl_vbrstd, lcl_vdgamma, lcl_vdbeta;
    __m512 lcl_vgamma2, lcl_vbmean2, lcl_vbrstd2, lcl_vdgamma2, lcl_vdbeta2;
    __m512 lcl_vnhw     = _mm512_set1_ps( nhw );
    __m512 lcl_vrec_nhw = _mm512_set1_ps( recp_nhw );

    img = imgfm / nBlocksFm;
    fm = imgfm % nBlocksFm;
    lcl_vgamma  = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma,  fm, 0, 32) );
    lcl_vbmean  = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean,  fm, 0, 32) );
    lcl_vbrstd  = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd,  fm, 0, 32) );
    lcl_vdgamma = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 0, 32) );
    lcl_vdbeta  = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta,  fm, 0, 32) );

    lcl_vgamma2  = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma,  fm, 16, 32) );
    lcl_vbmean2  = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean,  fm, 16, 32) );
    lcl_vbrstd2  = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd,  fm, 16, 32) );
    lcl_vdgamma2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 16, 32) );
    lcl_vdbeta2  = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta,  fm, 16, 32) );

    for ( hi=iph, ho=oph; hi < (ifh + iph); hi+=sh, ho++ ) {
      element_input_type* del_input_ptr       = &LIBXSMM_VLA_ACCESS(5, dinput,  img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 32);
      const element_input_type* input_ptr     = &LIBXSMM_VLA_ACCESS(5, input,   img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 32);
      const element_output_type* del_output_ptr = &LIBXSMM_VLA_ACCESS(5, doutput, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 32);
      for ( wi=ipw, wo=opw; wi < (ifw + ipw); wi+=sw, wo++ ) {
        __m512 lcl_vdelinput;
        __m512 lcl_vdelinput2;

        /* channels 0..15: build (dgamma*rcpstddev*(x-mean) + dbeta),
         * subtract from nhw*dout, then scale by rcpstddev*gamma/nhw */
        lcl_vdelinput = _mm512_sub_ps( _mm512_load_act( input_ptr ), lcl_vbmean );
        lcl_vdelinput = _mm512_mul_ps( lcl_vdelinput, lcl_vdgamma );
        lcl_vdelinput = _mm512_mul_ps( lcl_vdelinput, lcl_vbrstd );
        lcl_vdelinput = _mm512_add_ps( lcl_vdbeta, lcl_vdelinput );
        lcl_vdelinput = _mm512_sub_ps( _mm512_mul_ps( lcl_vnhw, _mm512_load_act( del_output_ptr ) ), lcl_vdelinput );
        lcl_vdelinput = _mm512_mul_ps( lcl_vrec_nhw, lcl_vdelinput );
        lcl_vdelinput = _mm512_mul_ps( lcl_vbrstd, lcl_vdelinput );
        lcl_vdelinput = _mm512_mul_ps( lcl_vgamma, lcl_vdelinput );

        /* channels 16..31: same formula on the second half */
        lcl_vdelinput2 = _mm512_sub_ps( _mm512_load_act( input_ptr+16 ), lcl_vbmean2 );
        lcl_vdelinput2 = _mm512_mul_ps( lcl_vdelinput2, lcl_vdgamma2 );
        lcl_vdelinput2 = _mm512_mul_ps( lcl_vdelinput2, lcl_vbrstd2 );
        lcl_vdelinput2 = _mm512_add_ps( lcl_vdbeta2, lcl_vdelinput2 );
        lcl_vdelinput2 = _mm512_sub_ps( _mm512_mul_ps( lcl_vnhw, _mm512_load_act( del_output_ptr+16 ) ), lcl_vdelinput2 );
        lcl_vdelinput2 = _mm512_mul_ps( lcl_vrec_nhw, lcl_vdelinput2 );
        lcl_vdelinput2 = _mm512_mul_ps( lcl_vbrstd2, lcl_vdelinput2 );
        lcl_vdelinput2 = _mm512_mul_ps( lcl_vgamma2, lcl_vdelinput2 );

        _mm512_stream_act( del_input_ptr,    lcl_vdelinput );
        _mm512_stream_act( del_input_ptr+16, lcl_vdelinput2 );

        del_input_ptr += sw*32;
        input_ptr += sw*32;
        del_output_ptr += 32;
      }
    }
  }

  libxsmm_barrier_wait(handle->barrier, ltid);
}

/* keep the helper macros local to this template */
# undef _mm512_load_act
# undef _mm512_stream_act
# undef _mm512_store_act