1 /******************************************************************************
2 * Copyright (c) Intel Corporation - All rights reserved.                      *
3 * This file is part of the LIBXSMM library.                                   *
4 *                                                                             *
5 * For information on the license, see the LICENSE file.                       *
6 * Further information: https://github.com/hfp/libxsmm/                        *
7 * SPDX-License-Identifier: BSD-3-Clause                                       *
8 ******************************************************************************/
9 /* Alexander Heinecke (Intel Corp.)
10 ******************************************************************************/
11 #include <libxsmm.h>
12 
13 #include <stdlib.h>
14 #include <string.h>
15 #include <stdio.h>
16 #include <math.h>
17 #if defined(_OPENMP)
18 # include <omp.h>
19 #endif
20 
21 /* include c-based dnn library */
22 #include "../common/dnn_common.h"
23 
/* Evaluates the LIBXSMM DNN status expression A exactly once. When the result
 * differs from LIBXSMM_DNN_SUCCESS, the library's error text is written to
 * stderr and the code is stored in a variable named global_status, which must
 * be in scope at the call site. Wrapped in do/while(0) so the macro behaves as
 * a single statement (e.g. as the body of an if/else without braces). */
#define CHKERR_LIBXSMM_DNN(A) do { \
  const int chkerr_libxsmm_dnn_ = (A); \
  if (LIBXSMM_DNN_SUCCESS != chkerr_libxsmm_dnn_) { \
    fprintf(stderr, "%s\n", libxsmm_dnn_get_error(chkerr_libxsmm_dnn_)); \
    global_status = chkerr_libxsmm_dnn_; \
  } \
} while (0)
27 
main(int argc,char * argv[])28 int main(int argc, char* argv[])
29 {
30   float *naive_input, *naive_output, *naive_filter, *naive_delinput, *naive_deloutput, *naive_delfilter;
31   libxsmm_bfloat16 *naive_input_bf16, *naive_filter_bf16, *naive_delinput_bf16, *naive_delfilter_bf16;
32   float *naive_libxsmm_output, *naive_libxsmm_delinput_f32, *naive_libxsmm_delfilter_f32;
33   libxsmm_bfloat16 *naive_libxsmm_delinput, *naive_libxsmm_delfilter;
34   libxsmm_bfloat16 *input_libxsmm, *filter_libxsmm, *delinput_libxsmm, *delfilter_libxsmm;
35   float *output_libxsmm, *deloutput_libxsmm;
36 
37   naive_fullyconnected_t naive_param;
38   void* scratch;
39   size_t scratch_size = 0;
40 
41   /* some parameters we can overwrite via cli,
42      default is some inner layer of overfeat */
43   int iters = 10;         /* repetitions of benchmark */
44   int nImg = 32;          /* mini-batch size, "N" */
45   int nIFm = 256;          /* number of input feature maps, "C" */
46   int nOFm = 256;          /* number of input feature maps, "C" */
47   int fuse_type = 0;      /* 0: nothing fused, 1: relu fused, 2: elementwise fused, 3: relu and elementwise fused */
48   char type = 'A';        /* 'A': ALL, 'F': FP, 'B': BP, 'U', WU */
49   char format = 'L';
50 
51   const char *const env_check = getenv("CHECK");
52   const double check = LIBXSMM_ABS(0 == env_check ? 1 : atof(env_check));
53 
54 #if defined(_OPENMP)
55   int nThreads = omp_get_max_threads(); /* number of threads */
56 #else
57   int nThreads = 1; /* number of threads */
58 #endif
59 
60   unsigned long long l_start, l_end;
61   double l_total = 0.0;
62   double gflop = 0.0;
63   int i;
64 
65   libxsmm_dnn_fullyconnected_desc fullyconnected_desc;
66   libxsmm_dnn_fullyconnected* libxsmm_handle;
67   libxsmm_dnn_tensor*  libxsmm_input;
68   libxsmm_dnn_tensor*  libxsmm_delinput;
69   libxsmm_dnn_tensor*  libxsmm_output;
70   libxsmm_dnn_tensor*  libxsmm_deloutput;
71   libxsmm_dnn_tensor*  libxsmm_filter;
72   libxsmm_dnn_tensor*  libxsmm_delfilter;
73   libxsmm_dnn_tensor_datalayout* libxsmm_layout;
74   libxsmm_dnn_err_t status;
75   libxsmm_dnn_err_t global_status = LIBXSMM_DNN_SUCCESS;
76 
77   libxsmm_matdiff_info norms_fwd, norms_bwd, norms_upd, diff;
78   libxsmm_matdiff_clear(&norms_fwd);
79   libxsmm_matdiff_clear(&norms_bwd);
80   libxsmm_matdiff_clear(&norms_upd);
81   libxsmm_matdiff_clear(&diff);
82 
83   if (argc > 1 && !strncmp(argv[1], "-h", 3)) {
84     printf("Usage: %s iters nImg nIFm nOFm fuse_type type format\n", argv[0]);
85     return 0;
86   }
87   libxsmm_rng_set_seed(1);
88 
89   /* reading new values from cli */
90   i = 1;
91   if (argc > i) iters      = atoi(argv[i++]);
92   if (argc > i) nImg       = atoi(argv[i++]);
93   if (argc > i) nIFm       = atoi(argv[i++]);
94   if (argc > i) nOFm       = atoi(argv[i++]);
95   if (argc > i) fuse_type  = atoi(argv[i++]);
96   if (argc > i) type       = *(argv[i++]);
97   if (argc > i) format     = *(argv[i++]);
98 
99   if (type != 'A' && type != 'F' && type != 'B' && type != 'U') {
100     printf("type needs to be 'A' (All), 'F' (FP only), 'B' (BP only), 'U' (UP only)\n");
101     return -1;
102   }
103   if ( fuse_type != 0 ) {
104     printf("fuse type needs to be 0\n");
105     return -1;
106   }
107   if (format != 'L') {
108     printf("format needs to be 'L' (libxsmm)\n");
109     return -1;
110   }
111 
112   /* set struct for naive convolution */
113   naive_param.N = nImg;
114   naive_param.C = nIFm;
115   naive_param.K = nOFm;
116   naive_param.fuse_type = fuse_type;
117 
118 #if defined(__SSE3__)
119   _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
120   _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
121   _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
122 #endif
123 
124   /* print some summary */
125   printf("##########################################\n");
126   printf("#          Setting Up (Common)           #\n");
127   printf("##########################################\n");
128   printf("PARAMS: N:%d  C:%d  K:%d\n", nImg, nIFm, nOFm);
129   printf("PARAMS: ITERS:%d", iters); if (LIBXSMM_FEQ(0, check)) printf("  Threads:%d\n", nThreads); else printf("\n");
130   printf("SIZE Input  (MB): %10.2f MiB\n", (double)(nImg*nIFm*sizeof(libxsmm_bfloat16))/(1024.0*1024.0) );
131   printf("SIZE Output (MB): %10.2f MiB\n", (double)(nImg*nOFm*sizeof(float))/(1024.0*1024.0) );
132   printf("SIZE Input   (1): %10.2f MiB\n", (double)(1*nIFm*   sizeof(libxsmm_bfloat16))/(1024.0*1024.0) );
133   printf("SIZE Output  (1): %10.2f MiB\n", (double)(1*nOFm*   sizeof(float))/(1024.0*1024.0) );
134   printf("SIZE Filter     : %10.2f MiB\n", (double)(nIFm*nOFm*sizeof(libxsmm_bfloat16))/(1024.0*1024.0) );
135 
136   /* allocate data */
137   naive_input                 = (float*)libxsmm_aligned_malloc( nImg*nIFm*sizeof(float), 2097152);
138   naive_delinput              = (float*)libxsmm_aligned_malloc( nImg*nIFm*sizeof(float), 2097152);
139   naive_output                = (float*)libxsmm_aligned_malloc( nImg*nOFm*sizeof(float), 2097152);
140   naive_deloutput             = (float*)libxsmm_aligned_malloc( nImg*nOFm*sizeof(float), 2097152);
141   naive_filter                = (float*)libxsmm_aligned_malloc( nIFm*nOFm*sizeof(float), 2097152);
142   naive_delfilter             = (float*)libxsmm_aligned_malloc( nIFm*nOFm*sizeof(float), 2097152);
143 
144   naive_input_bf16            = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nIFm*sizeof(libxsmm_bfloat16), 2097152);
145   naive_delinput_bf16         = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nIFm*sizeof(libxsmm_bfloat16), 2097152);
146   naive_filter_bf16           = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nIFm*nOFm*sizeof(libxsmm_bfloat16), 2097152);
147   naive_delfilter_bf16        = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nIFm*nOFm*sizeof(libxsmm_bfloat16), 2097152);
148 
149   naive_libxsmm_delinput      = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nIFm*sizeof(libxsmm_bfloat16), 2097152);
150   naive_libxsmm_output        = (float*)libxsmm_aligned_malloc( nImg*nOFm*sizeof(float), 2097152);
151   naive_libxsmm_delfilter     = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nIFm*nOFm*sizeof(libxsmm_bfloat16), 2097152);
152   naive_libxsmm_delinput_f32  = (float*)libxsmm_aligned_malloc( nImg*nIFm*sizeof(float), 2097152);
153   naive_libxsmm_delfilter_f32 = (float*)libxsmm_aligned_malloc( nIFm*nOFm*sizeof(float), 2097152);
154 
155   input_libxsmm               = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nIFm*sizeof(libxsmm_bfloat16), 2097152);
156   delinput_libxsmm            = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nIFm*sizeof(libxsmm_bfloat16), 2097152);
157   output_libxsmm              = (float*)libxsmm_aligned_malloc( nImg*nOFm*sizeof(float), 2097152);
158   deloutput_libxsmm           = (float*)libxsmm_aligned_malloc( nImg*nOFm*sizeof(float), 2097152);
159   filter_libxsmm              = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nIFm*nOFm*sizeof(libxsmm_bfloat16), 2097152);
160   delfilter_libxsmm           = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nIFm*nOFm*sizeof(libxsmm_bfloat16), 2097152);
161 
162   /* initialize data */
163   init_buf( naive_input,     nImg*nIFm, 0, 0 );
164   init_buf( naive_delinput,  nImg*nIFm, 0, 0 );
165   init_buf( naive_output,    nImg*nOFm, 0, 0 );
166   init_buf( naive_deloutput, nImg*nOFm, 0, 0 );
167   init_buf( naive_filter,    nIFm*nOFm, 0, 0 );
168   init_buf( naive_delfilter, nIFm*nOFm, 0, 0 );
169 
170   libxsmm_rne_convert_fp32_bf16( naive_input,     naive_input_bf16,     nImg*nIFm );
171   libxsmm_rne_convert_fp32_bf16( naive_delinput,  naive_delinput_bf16,  nImg*nIFm );
172   libxsmm_rne_convert_fp32_bf16( naive_filter,    naive_filter_bf16,    nIFm*nOFm );
173   libxsmm_rne_convert_fp32_bf16( naive_delfilter, naive_delfilter_bf16, nIFm*nOFm );
174 
175   if (LIBXSMM_NEQ(0, check)) {
176     printf("##########################################\n");
177     printf("#         Computing Reference ...        #\n");
178     printf("##########################################\n");
179     if (type == 'A' || type == 'F') {
180       naive_fullyconnected_fp(&naive_param, naive_input, naive_output, naive_filter);
181     }
182     if (type == 'A' || type == 'B') {
183       naive_fullyconnected_bp(&naive_param, naive_delinput, naive_deloutput, naive_filter);
184     }
185     if (type == 'A' || type == 'U') {
186       naive_fullyconnected_wu(&naive_param, naive_input, naive_deloutput, naive_delfilter);
187     }
188     printf("##########################################\n");
189     printf("#      Computing Reference ... done      #\n");
190     printf("##########################################\n");
191   }
192 
193   if (format == 'A' || format == 'L') {
194     printf("\n");
195     printf("##########################################\n");
196     printf("#      Setting Up  (custom-Storage)      #\n");
197     printf("##########################################\n");
198 
199     /* setup LIBXSMM handle */
200     fullyconnected_desc.N = nImg;
201     fullyconnected_desc.C = nIFm;
202     fullyconnected_desc.K = nOFm;
203     fullyconnected_desc.threads = nThreads;
204     fullyconnected_desc.datatype_in = LIBXSMM_DNN_DATATYPE_BF16;
205     fullyconnected_desc.datatype_out = LIBXSMM_DNN_DATATYPE_F32;
206     fullyconnected_desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM;
207     fullyconnected_desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM;
208     fullyconnected_desc.fuse_ops = LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE;
209 
210     libxsmm_handle = libxsmm_dnn_create_fullyconnected( fullyconnected_desc, &status );
211     CHKERR_LIBXSMM_DNN( status );
212 
213     /* setup LIBXSMM buffers */
214     libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_REGULAR_INPUT, &status ); CHKERR_LIBXSMM_DNN( status );
215     libxsmm_input  = libxsmm_dnn_link_tensor( libxsmm_layout, input_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status );
216     printf("inner activation blocking: %i\n", libxsmm_layout->dim_size[0] );
217     libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout );
218 
219     libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_GRADIENT_INPUT, &status ); CHKERR_LIBXSMM_DNN( status );
220     libxsmm_delinput  = libxsmm_dnn_link_tensor( libxsmm_layout, delinput_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status );
221     libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout );
222 
223     libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_REGULAR_OUTPUT, &status ); CHKERR_LIBXSMM_DNN( status );
224     libxsmm_output  = libxsmm_dnn_link_tensor( libxsmm_layout, output_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status );
225     libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout );
226 
227     libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_GRADIENT_OUTPUT, &status ); CHKERR_LIBXSMM_DNN( status );
228     libxsmm_deloutput  = libxsmm_dnn_link_tensor( libxsmm_layout, deloutput_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status );
229     libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout );
230 
231     libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_REGULAR_FILTER, &status ); CHKERR_LIBXSMM_DNN( status );
232     libxsmm_filter  = libxsmm_dnn_link_tensor( libxsmm_layout, filter_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status );
233     libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout );
234 
235     libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_GRADIENT_FILTER, &status ); CHKERR_LIBXSMM_DNN( status );
236     libxsmm_delfilter  = libxsmm_dnn_link_tensor( libxsmm_layout, delfilter_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status );
237     libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout );
238 
239     /* copy in data to LIBXSMM format */
240     /* we can also use the layout functions and set the data on our
241        own external to the library */
242     CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_input,        (void*)naive_input_bf16,     LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) );
243     CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_output,       (void*)naive_output,         LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) );
244     CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_filter,       (void*)naive_filter_bf16,    LIBXSMM_DNN_TENSOR_FORMAT_KCRS ) );
245     CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_delinput,     (void*)naive_delinput_bf16,  LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) );
246     CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_deloutput,    (void*)naive_deloutput,      LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) );
247     CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_delfilter,    (void*)naive_delfilter_bf16, LIBXSMM_DNN_TENSOR_FORMAT_KCRS ) );
248 
249     /* bind buffers and filter to handle */
250     CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_handle, libxsmm_input,        LIBXSMM_DNN_REGULAR_INPUT ) );
251     CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_handle, libxsmm_delinput,     LIBXSMM_DNN_GRADIENT_INPUT ) );
252     CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_handle, libxsmm_output,       LIBXSMM_DNN_REGULAR_OUTPUT ) );
253     CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_handle, libxsmm_deloutput,    LIBXSMM_DNN_GRADIENT_OUTPUT ) );
254     CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_handle, libxsmm_filter,       LIBXSMM_DNN_REGULAR_FILTER ) );
255     CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_handle, libxsmm_delfilter,    LIBXSMM_DNN_GRADIENT_FILTER ) );
256 
257     /* let's allocate and bind scratch */
258     scratch_size = libxsmm_dnn_fullyconnected_get_scratch_size( libxsmm_handle, &status );
259     CHKERR_LIBXSMM_DNN( status );
260     scratch = libxsmm_aligned_scratch( scratch_size, 2097152 );
261     CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_scratch( libxsmm_handle, scratch ) );
262     /* set scratch to bogus to make sure that libxsmm takes care of zeroing internally */
263     init_buf( (float*)scratch, scratch_size/4, 0, 0 );
264 
265     if ((type == 'A' || type == 'F') && LIBXSMM_NEQ(0, check)) {
266       printf("##########################################\n");
267       printf("#   Correctness - FWD (custom-Storage)   #\n");
268       printf("##########################################\n");
269 
270 #if defined(_OPENMP)
271 #     pragma omp parallel
272 #endif
273       {
274 #if defined(_OPENMP)
275         const int tid = omp_get_thread_num();
276 #else
277         const int tid = 0;
278 #endif
279         CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ) );
280       }
281 
282       /* copy out data */
283       CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyout_tensor( libxsmm_output, (void*)naive_libxsmm_output, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) );
284 
285       /* compare */
286       libxsmm_matdiff(&norms_fwd, LIBXSMM_DATATYPE_F32, nImg*nOFm, 1, naive_output, naive_libxsmm_output, 0, 0);
287       printf("L1 reference  : %.25g\n", norms_fwd.l1_ref);
288       printf("L1 test       : %.25g\n", norms_fwd.l1_tst);
289       printf("L2 abs.error  : %.24f\n", norms_fwd.l2_abs);
290       printf("L2 rel.error  : %.24f\n", norms_fwd.l2_rel);
291       printf("Linf abs.error: %.24f\n", norms_fwd.linf_abs);
292       printf("Linf rel.error: %.24f\n", norms_fwd.linf_rel);
293       printf("Check-norm    : %.24f\n", norms_fwd.normf_rel);
294       libxsmm_matdiff_reduce(&diff, &norms_fwd);
295     }
296 
297     if ( (type == 'A' || type == 'B') && LIBXSMM_NEQ(0, check) ) {
298       printf("##########################################\n");
299       printf("#   Correctness - BWD (custom-Storage)   #\n");
300       printf("##########################################\n");
301 
302 #if defined(_OPENMP)
303 #     pragma omp parallel
304 #endif
305       {
306 #if defined(_OPENMP)
307         const int tid = omp_get_thread_num();
308 #else
309         const int tid = 0;
310 #endif
311         CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ) );
312       }
313 
314       /* copy out data */
315       CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyout_tensor( libxsmm_delinput,     (void*)naive_libxsmm_delinput,     LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) );
316       libxsmm_convert_bf16_f32( naive_libxsmm_delinput, naive_libxsmm_delinput_f32, nImg*nIFm );
317 
318       /* compare */
319       libxsmm_matdiff(&norms_bwd, LIBXSMM_DATATYPE_F32, nImg*nIFm, 1, naive_delinput, naive_libxsmm_delinput_f32, 0, 0);
320       printf("L1 reference  : %.25g\n", norms_bwd.l1_ref);
321       printf("L1 test       : %.25g\n", norms_bwd.l1_tst);
322       printf("L2 abs.error  : %.24f\n", norms_bwd.l2_abs);
323       printf("L2 rel.error  : %.24f\n", norms_bwd.l2_rel);
324       printf("Linf abs.error: %.24f\n", norms_bwd.linf_abs);
325       printf("Linf rel.error: %.24f\n", norms_bwd.linf_rel);
326       printf("Check-norm    : %.24f\n", norms_bwd.normf_rel);
327       libxsmm_matdiff_reduce(&diff, &norms_bwd);
328     }
329 
330     if ( (type == 'A' || type == 'U') && LIBXSMM_NEQ(0, check) ) {
331       printf("##########################################\n");
332       printf("#   Correctness - UPD (custom-Storage)   #\n");
333       printf("##########################################\n");
334 
335 #if defined(_OPENMP)
336 #     pragma omp parallel
337 #endif
338       {
339 #if defined(_OPENMP)
340         const int tid = omp_get_thread_num();
341 #else
342         const int tid = 0;
343 #endif
344         CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_UPD, 0, tid ) );
345       }
346 
347       /* copy out data */
348       CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyout_tensor( libxsmm_delfilter,     (void*)naive_libxsmm_delfilter,     LIBXSMM_DNN_TENSOR_FORMAT_KCRS ) );
349       libxsmm_convert_bf16_f32( naive_libxsmm_delfilter, naive_libxsmm_delfilter_f32, nIFm*nOFm );
350 
351       /* compare */
352       libxsmm_matdiff(&norms_upd, LIBXSMM_DATATYPE_F32, nIFm*nOFm, 1, naive_delfilter, naive_libxsmm_delfilter_f32, 0, 0);
353       printf("L1 reference  : %.25g\n", norms_upd.l1_ref);
354       printf("L1 test       : %.25g\n", norms_upd.l1_tst);
355       printf("L2 abs.error  : %.24f\n", norms_upd.l2_abs);
356       printf("L2 rel.error  : %.24f\n", norms_upd.l2_rel);
357       printf("Linf abs.error: %.24f\n", norms_upd.linf_abs);
358       printf("Linf rel.error: %.24f\n", norms_upd.linf_rel);
359       printf("Check-norm    : %.24f\n", norms_upd.normf_rel);
360       libxsmm_matdiff_reduce(&diff, &norms_upd);
361     }
362 
363     if (type == 'A' || type == 'F') {
364       printf("##########################################\n");
365       printf("#   Performance - FWD (custom-Storage)   #\n");
366       printf("##########################################\n");
367       l_start = libxsmm_timer_tick();
368 #if defined(_OPENMP)
369 #     pragma omp parallel private(i)
370 #endif
371       {
372 #if defined(_OPENMP)
373         const int tid = omp_get_thread_num();
374 #else
375         const int tid = 0;
376 #endif
377         for (i = 0; i < iters; ++i) {
378           libxsmm_dnn_fullyconnected_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid );
379         }
380       }
381       l_end = libxsmm_timer_tick();
382       l_total = libxsmm_timer_duration(l_start, l_end);
383 
384       gflop = (2.0*(double)nImg*(double)nIFm*(double)nOFm*(double)iters) / (1000*1000*1000);
385 
386       printf("GFLOP  = %.5g\n", gflop/(double)iters);
387       printf("fp time = %.5g\n", ((double)(l_total/iters)));
388       printf("GFLOPS  = %.5g\n", gflop/l_total);
389 
390       printf("PERFDUMP,FP,%s,%i,%i,%i,%i,%.5g,%.5g,%f,%f,%f,%f,%f,%f,%f\n", LIBXSMM_VERSION, nThreads, nImg, nIFm,
391         nOFm, ((double)(l_total/iters)), gflop/l_total, norms_fwd.l1_ref, norms_fwd.l1_tst,
392         norms_fwd.l2_abs, norms_fwd.l2_rel, norms_fwd.linf_abs, norms_fwd.linf_rel, norms_fwd.normf_rel);
393     }
394 
395     if (type == 'A' || type == 'B') {
396       printf("##########################################\n");
397       printf("#   Performance - BWD (custom-Storage)   #\n");
398       printf("##########################################\n");
399       l_start = libxsmm_timer_tick();
400 #if defined(_OPENMP)
401 #     pragma omp parallel private(i)
402 #endif
403       {
404 #if defined(_OPENMP)
405         const int tid = omp_get_thread_num();
406 #else
407         const int tid = 0;
408 #endif
409         for (i = 0; i < iters; ++i) {
410           libxsmm_dnn_fullyconnected_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid );
411         }
412       }
413       l_end = libxsmm_timer_tick();
414       l_total = libxsmm_timer_duration(l_start, l_end);
415 
416       gflop = (2.0*(double)nImg*(double)nIFm*(double)nOFm*(double)iters) / (1000*1000*1000);
417 
418       printf("GFLOP  = %.5g\n", gflop/(double)iters);
419       printf("fp time = %.5g\n", ((double)(l_total/iters)));
420       printf("GFLOPS  = %.5g\n", gflop/l_total);
421 
422       printf("PERFDUMP,BP,%s,%i,%i,%i,%i,%.5g,%.5g,%f,%f,%f,%f,%f,%f,%f\n", LIBXSMM_VERSION, nThreads, nImg, nIFm,
423         nOFm, ((double)(l_total/iters)), gflop/l_total, norms_bwd.l1_ref, norms_bwd.l1_tst,
424         norms_bwd.l2_abs, norms_bwd.l2_rel, norms_bwd.linf_abs, norms_bwd.linf_rel, norms_bwd.normf_rel);
425     }
426 
427     if (type == 'A' || type == 'U') {
428       printf("##########################################\n");
429       printf("#   Performance - UPD (custom-Storage)   #\n");
430       printf("##########################################\n");
431       l_start = libxsmm_timer_tick();
432 #if defined(_OPENMP)
433 #     pragma omp parallel private(i)
434 #endif
435       {
436 #if defined(_OPENMP)
437         const int tid = omp_get_thread_num();
438 #else
439         const int tid = 0;
440 #endif
441         for (i = 0; i < iters; ++i) {
442           libxsmm_dnn_fullyconnected_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_UPD, 0, tid );
443         }
444       }
445       l_end = libxsmm_timer_tick();
446       l_total = libxsmm_timer_duration(l_start, l_end);
447 
448       gflop = (2.0*(double)nImg*(double)nIFm*(double)nOFm*(double)iters) / (1000*1000*1000);
449 
450       printf("GFLOP  = %.5g\n", gflop/(double)iters);
451       printf("fp time = %.5g\n", ((double)(l_total/iters)));
452       printf("GFLOPS  = %.5g\n", gflop/l_total);
453 
454       printf("PERFDUMP,UP,%s,%i,%i,%i,%i,%.5g,%.5g,%f,%f,%f,%f,%f,%f,%f\n", LIBXSMM_VERSION, nThreads, nImg, nIFm,
455         nOFm, ((double)(l_total/iters)), gflop/l_total, norms_upd.l1_ref, norms_upd.l1_tst,
456         norms_upd.l2_abs, norms_upd.l2_rel, norms_upd.linf_abs, norms_upd.linf_rel, norms_upd.normf_rel);
457     }
458 
459     /* clean-up */
460     CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_scratch( libxsmm_handle ) );
461     libxsmm_free(scratch);
462     CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_INPUT ) );
463     CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_handle, LIBXSMM_DNN_GRADIENT_INPUT ) );
464     CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_OUTPUT ) );
465     CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_handle, LIBXSMM_DNN_GRADIENT_OUTPUT ) );
466     CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_FILTER ) );
467     CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_handle, LIBXSMM_DNN_GRADIENT_FILTER ) );
468 
469     CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_input ) );
470     CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_delinput ) );
471     CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_output ) );
472     CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_deloutput ) );
473     CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_filter ) );
474     CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_delfilter ) );
475 
476     CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_fullyconnected( libxsmm_handle ) );
477   }
478 
479   /* deallocate data */
480   libxsmm_free(naive_input);
481   libxsmm_free(naive_output);
482   libxsmm_free(naive_delinput);
483   libxsmm_free(naive_deloutput);
484   libxsmm_free(naive_filter);
485   libxsmm_free(naive_delfilter);
486   libxsmm_free(naive_input_bf16);
487   libxsmm_free(naive_delinput_bf16);
488   libxsmm_free(naive_filter_bf16);
489   libxsmm_free(naive_delfilter_bf16);
490   libxsmm_free(naive_libxsmm_output);
491   libxsmm_free(naive_libxsmm_delinput);
492   libxsmm_free(naive_libxsmm_delfilter);
493   libxsmm_free(naive_libxsmm_delinput_f32);
494   libxsmm_free(naive_libxsmm_delfilter_f32);
495   libxsmm_free(input_libxsmm);
496   libxsmm_free(output_libxsmm);
497   libxsmm_free(delinput_libxsmm);
498   libxsmm_free(deloutput_libxsmm);
499   libxsmm_free(filter_libxsmm);
500   libxsmm_free(delfilter_libxsmm);
501 
502   { const char *const env_check_scale = getenv("CHECK_SCALE");
503     const double check_scale = LIBXSMM_ABS(0 == env_check_scale ? 1.0 : atof(env_check_scale));
504     if (LIBXSMM_NEQ(0, check) && (check < 100.0 * check_scale * diff.normf_rel) && (global_status == LIBXSMM_DNN_SUCCESS)) {
505       fprintf(stderr, "FAILED with an error of %f%%!\n", 100.0 * diff.normf_rel);
506       exit(EXIT_FAILURE);
507     }
508   }
509 
510   /* some empty lines at the end */
511   printf("\n\n\n");
512 
513   return global_status;
514 }
515 
516