/******************************************************************************
* Copyright (c) Intel Corporation - All rights reserved.                      *
* This file is part of the LIBXSMM library.                                   *
*                                                                             *
* For information on the license, see the LICENSE file.                       *
* Further information: https://github.com/hfp/libxsmm/                        *
* SPDX-License-Identifier: BSD-3-Clause                                       *
******************************************************************************/
/* Alexander Heinecke, Sasikanth Avancha (Intel Corp.)
******************************************************************************/

#if defined(LIBXSMM_DNN_FUSEDBN_BWD_BF16)
# define _mm512_load_act(A)   _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)(A))),16))
#if 1
# define _mm512_roundbf16rne(A) LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16(A)
# define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16)))
# define _mm512_store_act(A,B)  _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_roundbf16rne((B)),16)))
#else
# define _mm512_stream_act(A,B) _mm256_stream_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16)))
# define _mm512_store_act(A,B)  _mm256_storeu_si256((__m256i*)(A),_mm512_cvtepi32_epi16(_mm512_srai_epi32(_mm512_castps_si512((B)),16)))
#endif
#else
# define _mm512_load_act(A)   _mm512_loadu_ps(A)
# define _mm512_stream_act(A,B) LIBXSMM_INTRINSICS_MM512_STREAM_PS(A,B)
# define _mm512_store_act(A,B)  _mm512_storeu_ps(A,B)
#endif
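
/* Note on the activation load/store macros above: in the BF16 path, _mm512_load_act
   widens 16 packed bf16 values to fp32 by shifting them into the upper 16 bits of each
   32-bit lane, while _mm512_store_act/_mm512_stream_act convert fp32 back to bf16 by
   rounding to nearest-even (LIBXSMM_INTRINSICS_MM512_ROUNDNE_BF16) and keeping the upper
   halves; the disabled #else branch would simply truncate. In the FP32 path the macros
   degenerate to plain (streaming) 512-bit loads/stores. */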

/* size variables, all const */
const int nImg = handle->desc.partN;
const int ifh = handle->desc.H;
const int ifw = handle->desc.W;
const int sh = handle->desc.u;
const int sw = handle->desc.v;
const int ofh = ifh/sh;
const int ofw = ifw/sw;
const int iph = handle->desc.pad_h_in;
const int ipw = handle->desc.pad_w_in;
const int oph = handle->desc.pad_h_out;
const int opw = handle->desc.pad_w_out;
const int ofhp = ofh + 2*oph;
const int ofwp = ofw + 2*opw;
const int ifhp = ifh + 2*iph;
const int ifwp = ifw + 2*ipw;
/* here we assume that input and output blocking is similar */
const int nBlocksFm = handle->blocksifm;

const element_stats_type nhw = (element_stats_type)(handle->desc.fullN * ifh * ifw);
const element_stats_type recp_nhw = 1.0f/nhw;

/* computing first logical thread */
const int ltid = tid - start_thread;
/* number of tasks that could be run in parallel */
const int work = nImg * nBlocksFm;
/* compute chunk size */
const int chunksize = (work % handle->desc.threads == 0) ? (work / handle->desc.threads) : ((work / handle->desc.threads) + 1);
/* compute thr_begin and thr_end */
const int thr_begin = (ltid * chunksize < work) ? (ltid * chunksize) : work;
const int thr_end = ((ltid + 1) * chunksize < work) ? ((ltid + 1) * chunksize) : work;

/* number of tasks that could be run in parallel, delta gamma and beta reduction */
const int work2 = nBlocksFm;
/* compute chunk size */
const int chunksize2 = (work2 % handle->desc.threads == 0) ? (work2 / handle->desc.threads) : ((work2 / handle->desc.threads) + 1);
/* compute thr_begin and thr_end */
const int thr_begin2 = (ltid * chunksize2 < work2) ? (ltid * chunksize2) : work2;
const int thr_end2 = ((ltid + 1) * chunksize2 < work2) ? ((ltid + 1) * chunksize2) : work2;
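
/* Illustrative example of the static work partitioning (hypothetical sizes, not taken
   from the handle): with nImg = 16, nBlocksFm = 4 and handle->desc.threads = 24 we get
   work = 64 and chunksize = 64/24 + 1 = 3, so ltid 0 covers [0,3), ltid 21 covers only
   [63,64), and ltids 22/23 receive an empty range because thr_begin/thr_end are clamped
   to work. The same scheme is applied to work2 for the dgamma/dbeta reduction. */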

/* loop variables */
int img = 0;
int fm = 0;
int imgfm = 0;
int hi = 0;
int wi = 0;
int ho = 0;
int wo = 0;

LIBXSMM_VLA_DECL(5,       element_input_type,  dinput,     (element_input_type* )handle->grad_input->data,  nBlocksFm, ifhp, ifwp, 32);
LIBXSMM_VLA_DECL(5,       element_input_type,   input,     (element_input_type* )handle->reg_input->data,   nBlocksFm, ifhp, ifwp, 32);
#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE)
LIBXSMM_VLA_DECL(5,       element_input_type,  dinput_add, (element_input_type* )handle->grad_add->data,    nBlocksFm, ifhp, ifwp, 32);
#endif
#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU)
LIBXSMM_VLA_DECL(5, const element_output_type, output,     (element_output_type*)handle->reg_output->data,  nBlocksFm, ofhp, ofwp, 32);
#endif
LIBXSMM_VLA_DECL(5,       element_output_type, doutput,    (element_output_type*)handle->grad_output->data, nBlocksFm, ofhp, ofwp, 32);

LIBXSMM_VLA_DECL(2, const element_stats_type,  gamma,      (element_stats_type*)handle->reg_gamma->data,  32);
LIBXSMM_VLA_DECL(2,       element_stats_type,  dgamma,     (element_stats_type*)handle->grad_gamma->data, 32);
LIBXSMM_VLA_DECL(2,       element_stats_type,  dbeta,      (element_stats_type*)handle->grad_beta->data,  32);
LIBXSMM_VLA_DECL(2, const element_stats_type,  bmean,      (element_stats_type*)handle->expvalue->data,   32);
LIBXSMM_VLA_DECL(2, const element_stats_type,  brstd,      (element_stats_type*)handle->rcpstddev->data,  32);
LIBXSMM_VLA_DECL(3,       element_stats_type,  dgamma_img, (element_stats_type*)handle->scratch,                                                    nImg, 32);
LIBXSMM_VLA_DECL(3,       element_stats_type,  dbeta_img, ((element_stats_type*)handle->scratch) + ((size_t)nImg * (size_t)nBlocksFm * (size_t)32), nImg, 32);
#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK)
LIBXSMM_VLA_DECL(5, const unsigned char,       relumask,   (unsigned char*)handle->relumask->data, nBlocksFm, ofhp, ofwp, 4);
#endif
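
/* handle->scratch holds the per-image partial reductions: dgamma_img occupies the first
   nBlocksFm*nImg*32 floats, dbeta_img the following nBlocksFm*nImg*32 floats, both laid
   out as [fm][img][32 channels] so that the reduction below can walk one fm row with a
   simple pointer increment of 32 per image. */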

/* lazy barrier init */
libxsmm_barrier_init(handle->barrier, ltid);

if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0)            ||
     ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) > 0)       ||
     ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) > 0)    ) {
  for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) {
    __m512 lcl_vdgamma  = _mm512_setzero_ps();
    __m512 lcl_vdbeta   = _mm512_setzero_ps();
    __m512 lcl_vdgamma2 = _mm512_setzero_ps();
    __m512 lcl_vdbeta2  = _mm512_setzero_ps();
    __m512 lcl_vbmean,  lcl_vbrstd;
    __m512 lcl_vbmean2, lcl_vbrstd2;
    element_stats_type* del_gamma_img_ptr;
    element_stats_type* del_beta_img_ptr;

    img = imgfm / nBlocksFm;
    fm = imgfm % nBlocksFm;
    del_gamma_img_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img, fm, img, 0, nImg, 32);
    del_beta_img_ptr  = &LIBXSMM_VLA_ACCESS(3, dbeta_img,  fm, img, 0, nImg, 32);
    lcl_vbmean  = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 0,  32) );
    lcl_vbrstd  = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 0,  32) );
    lcl_vbmean2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean, fm, 16, 32) );
    lcl_vbrstd2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd, fm, 16, 32) );

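    /* The loop nest below accumulates, per image and per channel c of this 32-wide
       feature-map block, the partial statistics gradients
         dgamma_img[c] += (x[c] - bmean[c]) * brstd[c] * dout[c]
         dbeta_img[c]  += dout[c]
       where dout is the (optionally ReLU-masked) output gradient, which is also written
       back to doutput and, when the eltwise fusion is enabled, streamed into grad_add. */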
    for ( hi=iph, ho=oph; hi < (ifh + iph); hi+=sh, ho++ ) {
#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE)
            element_input_type*  del_input_add_ptr = &LIBXSMM_VLA_ACCESS(5, dinput_add, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 32);
#endif
#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU)
      const element_output_type* output_ptr        = &LIBXSMM_VLA_ACCESS(5,     output, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 32);
#endif
#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK)
      const unsigned char*       relumask_ptr      = &LIBXSMM_VLA_ACCESS(5,   relumask, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 4);
#endif
      const element_input_type*  input_ptr         = &LIBXSMM_VLA_ACCESS(5,      input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 32);
            element_output_type* del_output_ptr    = &LIBXSMM_VLA_ACCESS(5,    doutput, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 32);
      for ( wi=ipw, wo=opw; wi < (ifw + ipw); wi+=sw, wo++ ) {
        __m512 lcl_vdeloutput, lcl_vdeloutput2;
#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU) || defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK)
        const __m512 vzero = _mm512_setzero_ps();
        __mmask16 lcl_relumask, lcl_relumask2;
#endif

        lcl_vdeloutput = _mm512_load_act( del_output_ptr );
#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU)
        lcl_relumask = _mm512_cmp_ps_mask( _mm512_load_act( output_ptr ), vzero, _CMP_NEQ_OQ );
        lcl_vdeloutput = _mm512_mask_blend_ps( lcl_relumask, vzero, lcl_vdeloutput );
        _mm512_store_act( del_output_ptr, lcl_vdeloutput );
#endif
#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK)
        lcl_relumask = LIBXSMM_INTRINSICS_MM512_LOAD_MASK16( relumask_ptr );
        lcl_vdeloutput = _mm512_mask_blend_ps( lcl_relumask, vzero, lcl_vdeloutput );
        _mm512_store_act( del_output_ptr, lcl_vdeloutput );
        relumask_ptr += 2;
#endif
#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE)
        _mm512_stream_act( del_input_add_ptr, lcl_vdeloutput );
#endif
        lcl_vdgamma = _mm512_add_ps( lcl_vdgamma, _mm512_mul_ps( _mm512_mul_ps( _mm512_sub_ps( _mm512_load_act( input_ptr ), lcl_vbmean ), lcl_vdeloutput ), lcl_vbrstd ) );
        lcl_vdbeta  = _mm512_add_ps( lcl_vdbeta, lcl_vdeloutput );

        lcl_vdeloutput2 = _mm512_load_act( del_output_ptr+16 );
#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU)
        lcl_relumask2 = _mm512_cmp_ps_mask( _mm512_load_act( output_ptr+16 ), vzero, _CMP_NEQ_OQ );
        lcl_vdeloutput2 = _mm512_mask_blend_ps( lcl_relumask2, vzero, lcl_vdeloutput2 );
        _mm512_store_act( del_output_ptr+16, lcl_vdeloutput2 );
        output_ptr += 32;
#endif
#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_RELU_WITH_MASK)
        lcl_relumask2 = LIBXSMM_INTRINSICS_MM512_LOAD_MASK16( relumask_ptr );
        lcl_vdeloutput2 = _mm512_mask_blend_ps( lcl_relumask2, vzero, lcl_vdeloutput2 );
        _mm512_store_act( del_output_ptr+16, lcl_vdeloutput2 );
        relumask_ptr += 2;
#endif
#if defined(LIBXSMM_DNN_FUSEDBN_BWD_ENABLE_ELTWISE)
        _mm512_stream_act( del_input_add_ptr+16, lcl_vdeloutput2 );
        del_input_add_ptr += sw*32;
#endif
        lcl_vdgamma2 = _mm512_add_ps( lcl_vdgamma2, _mm512_mul_ps( _mm512_mul_ps( _mm512_sub_ps( _mm512_load_act( input_ptr+16 ), lcl_vbmean2 ), lcl_vdeloutput2 ), lcl_vbrstd2 ) );
        lcl_vdbeta2  = _mm512_add_ps( lcl_vdbeta2, lcl_vdeloutput2 );

        input_ptr += sw*32;
        del_output_ptr += 32;
      }
    }

    _mm512_storeu_ps( del_gamma_img_ptr,    lcl_vdgamma );
    _mm512_storeu_ps( del_beta_img_ptr,     lcl_vdbeta );
    _mm512_storeu_ps( del_gamma_img_ptr+16, lcl_vdgamma2 );
    _mm512_storeu_ps( del_beta_img_ptr+16,  lcl_vdbeta2 );
  }

  libxsmm_barrier_wait(handle->barrier, ltid);

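  /* The barrier above ensures every thread has published its per-image partial sums to
     scratch before the cross-image reduction below reads them. */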
  if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0)      ||
       ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS) > 0)    ) {
    /* now we need to reduce the del_gamma and del_beta values over the images */
    for ( fm = thr_begin2; fm < thr_end2; ++fm ) {
      element_stats_type* del_gamma_img_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img, fm, 0, 0, nImg, 32);
      element_stats_type* del_beta_img_ptr  = &LIBXSMM_VLA_ACCESS(3, dbeta_img,  fm, 0, 0, nImg, 32);
      __m512 lcl_vdgamma  = _mm512_setzero_ps();
      __m512 lcl_vdbeta   = _mm512_setzero_ps();
      __m512 lcl_vdgamma2 = _mm512_setzero_ps();
      __m512 lcl_vdbeta2  = _mm512_setzero_ps();

      for ( img=0; img < nImg; img++ ) {
        lcl_vdgamma  = _mm512_add_ps( lcl_vdgamma,  _mm512_loadu_ps( del_gamma_img_ptr ) );
        lcl_vdbeta   = _mm512_add_ps( lcl_vdbeta,   _mm512_loadu_ps( del_beta_img_ptr  ) );
        lcl_vdgamma2 = _mm512_add_ps( lcl_vdgamma2, _mm512_loadu_ps( del_gamma_img_ptr+16 ) );
        lcl_vdbeta2  = _mm512_add_ps( lcl_vdbeta2,  _mm512_loadu_ps( del_beta_img_ptr+16  ) );
        del_gamma_img_ptr += 32;
        del_beta_img_ptr  += 32;
      }

      _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 0,  32), lcl_vdgamma );
      _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta,  fm, 0,  32), lcl_vdbeta  );
      _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma, fm, 16, 32), lcl_vdgamma2 );
      _mm512_storeu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta,  fm, 16, 32), lcl_vdbeta2  );
    }
  } else {
    /* now we need to reduce the del_gamma and del_beta values over the images;
       in this branch (LIBXSMM_DNN_FUSEDBN_OPS_BNSTATS_NORED) the reduced values are
       written back to the start of this feature-map's per-image block in scratch
       instead of into dgamma/dbeta */
    for ( fm = thr_begin2; fm < thr_end2; ++fm ) {
      element_stats_type* del_gamma_img_ptr = &LIBXSMM_VLA_ACCESS(3, dgamma_img, fm, 0, 0, nImg, 32);
      element_stats_type* del_beta_img_ptr  = &LIBXSMM_VLA_ACCESS(3, dbeta_img,  fm, 0, 0, nImg, 32);
      __m512 lcl_vdgamma  = _mm512_setzero_ps();
      __m512 lcl_vdbeta   = _mm512_setzero_ps();
      __m512 lcl_vdgamma2 = _mm512_setzero_ps();
      __m512 lcl_vdbeta2  = _mm512_setzero_ps();

      for ( img=0; img < nImg; img++ ) {
        lcl_vdgamma  = _mm512_add_ps( lcl_vdgamma,  _mm512_loadu_ps( del_gamma_img_ptr ) );
        lcl_vdbeta   = _mm512_add_ps( lcl_vdbeta,   _mm512_loadu_ps( del_beta_img_ptr  ) );
        lcl_vdgamma2 = _mm512_add_ps( lcl_vdgamma2, _mm512_loadu_ps( del_gamma_img_ptr+16 ) );
        lcl_vdbeta2  = _mm512_add_ps( lcl_vdbeta2,  _mm512_loadu_ps( del_beta_img_ptr+16  ) );
        del_gamma_img_ptr += 32;
        del_beta_img_ptr  += 32;
      }

      _mm512_storeu_ps( del_gamma_img_ptr - (32*nImg),      lcl_vdgamma );
      _mm512_storeu_ps( del_beta_img_ptr - (32*nImg),       lcl_vdbeta  );
      _mm512_storeu_ps( del_gamma_img_ptr - (32*nImg) + 16, lcl_vdgamma2 );
      _mm512_storeu_ps( del_beta_img_ptr - (32*nImg) + 16,  lcl_vdbeta2  );
    }
  }

  libxsmm_barrier_wait(handle->barrier, ltid);
}

if ( ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BN) > 0)      ||
     ((handle->desc.fuse_ops & LIBXSMM_DNN_FUSEDBN_OPS_BNSCALE) > 0)    ) {
  /* now we apply the actual backward batch norm */
  for ( imgfm = thr_begin; imgfm < thr_end; ++imgfm ) {
    __m512 lcl_vgamma,  lcl_vbmean,  lcl_vbrstd,  lcl_vdgamma,  lcl_vdbeta;
    __m512 lcl_vgamma2, lcl_vbmean2, lcl_vbrstd2, lcl_vdgamma2, lcl_vdbeta2;
    __m512 lcl_vnhw      = _mm512_set1_ps( nhw );
    __m512 lcl_vrec_nhw  = _mm512_set1_ps( recp_nhw );

    img = imgfm / nBlocksFm;
    fm = imgfm % nBlocksFm;
    lcl_vgamma   = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma,     fm, 0, 32) );
    lcl_vbmean   = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean,     fm, 0, 32) );
    lcl_vbrstd   = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd,     fm, 0, 32) );
    lcl_vdgamma  = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma,    fm, 0, 32) );
    lcl_vdbeta   = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta,     fm, 0, 32) );

    lcl_vgamma2  = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, gamma,     fm, 16, 32) );
    lcl_vbmean2  = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, bmean,     fm, 16, 32) );
    lcl_vbrstd2  = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, brstd,     fm, 16, 32) );
    lcl_vdgamma2 = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dgamma,    fm, 16, 32) );
    lcl_vdbeta2  = _mm512_loadu_ps( &LIBXSMM_VLA_ACCESS(2, dbeta,     fm, 16, 32) );

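    /* Per channel c of the 32-wide block the loop below computes the standard batchnorm
       input gradient
         dinput[c] = gamma[c] * brstd[c] * recp_nhw
                     * ( nhw*dout[c] - ( dbeta[c] + (x[c] - bmean[c]) * brstd[c] * dgamma[c] ) )
       which is exactly the instruction sequence on lcl_vdelinput / lcl_vdelinput2. */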
    for ( hi=iph, ho=oph; hi < (ifh + iph); hi+=sh, ho++ ) {
            element_input_type*  del_input_ptr     = &LIBXSMM_VLA_ACCESS(5,     dinput, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 32);
      const element_input_type*  input_ptr         = &LIBXSMM_VLA_ACCESS(5,      input, img, fm, hi, ipw, 0, nBlocksFm, ifhp, ifwp, 32);
      const element_output_type* del_output_ptr    = &LIBXSMM_VLA_ACCESS(5,    doutput, img, fm, ho, opw, 0, nBlocksFm, ofhp, ofwp, 32);
      for ( wi=ipw, wo=opw; wi < (ifw + ipw); wi+=sw, wo++ ) {
        __m512 lcl_vdelinput;
        __m512 lcl_vdelinput2;

        lcl_vdelinput = _mm512_sub_ps( _mm512_load_act( input_ptr ), lcl_vbmean );
        lcl_vdelinput = _mm512_mul_ps( lcl_vdelinput, lcl_vdgamma );
        lcl_vdelinput = _mm512_mul_ps( lcl_vdelinput, lcl_vbrstd  );
        lcl_vdelinput = _mm512_add_ps( lcl_vdbeta, lcl_vdelinput  );
        lcl_vdelinput = _mm512_sub_ps( _mm512_mul_ps( lcl_vnhw, _mm512_load_act( del_output_ptr ) ), lcl_vdelinput );
        lcl_vdelinput = _mm512_mul_ps( lcl_vrec_nhw, lcl_vdelinput );
        lcl_vdelinput = _mm512_mul_ps( lcl_vbrstd, lcl_vdelinput );
        lcl_vdelinput = _mm512_mul_ps( lcl_vgamma, lcl_vdelinput );

        lcl_vdelinput2 = _mm512_sub_ps( _mm512_load_act( input_ptr+16 ), lcl_vbmean2 );
        lcl_vdelinput2 = _mm512_mul_ps( lcl_vdelinput2, lcl_vdgamma2 );
        lcl_vdelinput2 = _mm512_mul_ps( lcl_vdelinput2, lcl_vbrstd2  );
        lcl_vdelinput2 = _mm512_add_ps( lcl_vdbeta2, lcl_vdelinput2  );
        lcl_vdelinput2 = _mm512_sub_ps( _mm512_mul_ps( lcl_vnhw, _mm512_load_act( del_output_ptr+16 ) ), lcl_vdelinput2 );
        lcl_vdelinput2 = _mm512_mul_ps( lcl_vrec_nhw, lcl_vdelinput2 );
        lcl_vdelinput2 = _mm512_mul_ps( lcl_vbrstd2, lcl_vdelinput2 );
        lcl_vdelinput2 = _mm512_mul_ps( lcl_vgamma2, lcl_vdelinput2 );

        _mm512_stream_act( del_input_ptr,    lcl_vdelinput );
        _mm512_stream_act( del_input_ptr+16, lcl_vdelinput2 );

        del_input_ptr += sw*32;
        input_ptr += sw*32;
        del_output_ptr += 32;
      }
    }
  }

  libxsmm_barrier_wait(handle->barrier, ltid);
}

# undef _mm512_load_act
# undef _mm512_stream_act
# undef _mm512_store_act
