1 /******************************************************************************
2 * Copyright (c) Intel Corporation - All rights reserved. *
3 * This file is part of the LIBXSMM library. *
4 * *
5 * For information on the license, see the LICENSE file. *
6 * Further information: https://github.com/hfp/libxsmm/ *
7 * SPDX-License-Identifier: BSD-3-Clause *
8 ******************************************************************************/
9 /* Alexander Heinecke (Intel Corp.)
10 ******************************************************************************/
11 #include <libxsmm.h>
12
13 #include <stdlib.h>
14 #include <string.h>
15 #include <stdio.h>
16 #include <math.h>
17 #if defined(_OPENMP)
18 # include <omp.h>
19 #endif
20
21 /* include c-based dnn library */
22 #include "../common/dnn_common.h"
23
/*
 * Evaluates the status expression A exactly once. If the result differs from
 * LIBXSMM_DNN_SUCCESS, the matching error text is written to stderr and the
 * code is stored in a variable named `global_status`, which must be visible
 * (and assignable) at the point of use. Execution continues afterwards, i.e.,
 * the macro records failures but never aborts.
 *
 * The do/while(0) wrapper turns the expansion into a single statement, so
 * that `if (c) CHKERR_LIBXSMM_DNN(x); else ...` parses as intended.
 */
#define CHKERR_LIBXSMM_DNN(A) do { \
  const int chkerr_libxsmm_dnn_ = (A); \
  if (LIBXSMM_DNN_SUCCESS != chkerr_libxsmm_dnn_) { \
    fprintf(stderr, "%s\n", libxsmm_dnn_get_error(chkerr_libxsmm_dnn_)); \
    global_status = chkerr_libxsmm_dnn_; \
  } \
} while (0)
27
main(int argc,char * argv[])28 int main(int argc, char* argv[])
29 {
30 float *naive_input, *naive_output, *naive_filter, *naive_delinput, *naive_deloutput, *naive_delfilter;
31 libxsmm_bfloat16 *naive_input_bf16, *naive_filter_bf16, *naive_delinput_bf16, *naive_delfilter_bf16;
32 float *naive_libxsmm_output, *naive_libxsmm_delinput_f32, *naive_libxsmm_delfilter_f32;
33 libxsmm_bfloat16 *naive_libxsmm_delinput, *naive_libxsmm_delfilter;
34 libxsmm_bfloat16 *input_libxsmm, *filter_libxsmm, *delinput_libxsmm, *delfilter_libxsmm;
35 float *output_libxsmm, *deloutput_libxsmm;
36
37 naive_fullyconnected_t naive_param;
38 void* scratch;
39 size_t scratch_size = 0;
40
41 /* some parameters we can overwrite via cli,
42 default is some inner layer of overfeat */
43 int iters = 10; /* repetitions of benchmark */
44 int nImg = 32; /* mini-batch size, "N" */
45 int nIFm = 256; /* number of input feature maps, "C" */
46 int nOFm = 256; /* number of input feature maps, "C" */
47 int fuse_type = 0; /* 0: nothing fused, 1: relu fused, 2: elementwise fused, 3: relu and elementwise fused */
48 char type = 'A'; /* 'A': ALL, 'F': FP, 'B': BP, 'U', WU */
49 char format = 'L';
50
51 const char *const env_check = getenv("CHECK");
52 const double check = LIBXSMM_ABS(0 == env_check ? 1 : atof(env_check));
53
54 #if defined(_OPENMP)
55 int nThreads = omp_get_max_threads(); /* number of threads */
56 #else
57 int nThreads = 1; /* number of threads */
58 #endif
59
60 unsigned long long l_start, l_end;
61 double l_total = 0.0;
62 double gflop = 0.0;
63 int i;
64
65 libxsmm_dnn_fullyconnected_desc fullyconnected_desc;
66 libxsmm_dnn_fullyconnected* libxsmm_handle;
67 libxsmm_dnn_tensor* libxsmm_input;
68 libxsmm_dnn_tensor* libxsmm_delinput;
69 libxsmm_dnn_tensor* libxsmm_output;
70 libxsmm_dnn_tensor* libxsmm_deloutput;
71 libxsmm_dnn_tensor* libxsmm_filter;
72 libxsmm_dnn_tensor* libxsmm_delfilter;
73 libxsmm_dnn_tensor_datalayout* libxsmm_layout;
74 libxsmm_dnn_err_t status;
75 libxsmm_dnn_err_t global_status = LIBXSMM_DNN_SUCCESS;
76
77 libxsmm_matdiff_info norms_fwd, norms_bwd, norms_upd, diff;
78 libxsmm_matdiff_clear(&norms_fwd);
79 libxsmm_matdiff_clear(&norms_bwd);
80 libxsmm_matdiff_clear(&norms_upd);
81 libxsmm_matdiff_clear(&diff);
82
83 if (argc > 1 && !strncmp(argv[1], "-h", 3)) {
84 printf("Usage: %s iters nImg nIFm nOFm fuse_type type format\n", argv[0]);
85 return 0;
86 }
87 libxsmm_rng_set_seed(1);
88
89 /* reading new values from cli */
90 i = 1;
91 if (argc > i) iters = atoi(argv[i++]);
92 if (argc > i) nImg = atoi(argv[i++]);
93 if (argc > i) nIFm = atoi(argv[i++]);
94 if (argc > i) nOFm = atoi(argv[i++]);
95 if (argc > i) fuse_type = atoi(argv[i++]);
96 if (argc > i) type = *(argv[i++]);
97 if (argc > i) format = *(argv[i++]);
98
99 if (type != 'A' && type != 'F' && type != 'B' && type != 'U') {
100 printf("type needs to be 'A' (All), 'F' (FP only), 'B' (BP only), 'U' (UP only)\n");
101 return -1;
102 }
103 if ( fuse_type != 0 ) {
104 printf("fuse type needs to be 0\n");
105 return -1;
106 }
107 if (format != 'L') {
108 printf("format needs to be 'L' (libxsmm)\n");
109 return -1;
110 }
111
112 /* set struct for naive convolution */
113 naive_param.N = nImg;
114 naive_param.C = nIFm;
115 naive_param.K = nOFm;
116 naive_param.fuse_type = fuse_type;
117
118 #if defined(__SSE3__)
119 _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
120 _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
121 _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
122 #endif
123
124 /* print some summary */
125 printf("##########################################\n");
126 printf("# Setting Up (Common) #\n");
127 printf("##########################################\n");
128 printf("PARAMS: N:%d C:%d K:%d\n", nImg, nIFm, nOFm);
129 printf("PARAMS: ITERS:%d", iters); if (LIBXSMM_FEQ(0, check)) printf(" Threads:%d\n", nThreads); else printf("\n");
130 printf("SIZE Input (MB): %10.2f MiB\n", (double)(nImg*nIFm*sizeof(libxsmm_bfloat16))/(1024.0*1024.0) );
131 printf("SIZE Output (MB): %10.2f MiB\n", (double)(nImg*nOFm*sizeof(float))/(1024.0*1024.0) );
132 printf("SIZE Input (1): %10.2f MiB\n", (double)(1*nIFm* sizeof(libxsmm_bfloat16))/(1024.0*1024.0) );
133 printf("SIZE Output (1): %10.2f MiB\n", (double)(1*nOFm* sizeof(float))/(1024.0*1024.0) );
134 printf("SIZE Filter : %10.2f MiB\n", (double)(nIFm*nOFm*sizeof(libxsmm_bfloat16))/(1024.0*1024.0) );
135
136 /* allocate data */
137 naive_input = (float*)libxsmm_aligned_malloc( nImg*nIFm*sizeof(float), 2097152);
138 naive_delinput = (float*)libxsmm_aligned_malloc( nImg*nIFm*sizeof(float), 2097152);
139 naive_output = (float*)libxsmm_aligned_malloc( nImg*nOFm*sizeof(float), 2097152);
140 naive_deloutput = (float*)libxsmm_aligned_malloc( nImg*nOFm*sizeof(float), 2097152);
141 naive_filter = (float*)libxsmm_aligned_malloc( nIFm*nOFm*sizeof(float), 2097152);
142 naive_delfilter = (float*)libxsmm_aligned_malloc( nIFm*nOFm*sizeof(float), 2097152);
143
144 naive_input_bf16 = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nIFm*sizeof(libxsmm_bfloat16), 2097152);
145 naive_delinput_bf16 = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nIFm*sizeof(libxsmm_bfloat16), 2097152);
146 naive_filter_bf16 = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nIFm*nOFm*sizeof(libxsmm_bfloat16), 2097152);
147 naive_delfilter_bf16 = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nIFm*nOFm*sizeof(libxsmm_bfloat16), 2097152);
148
149 naive_libxsmm_delinput = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nIFm*sizeof(libxsmm_bfloat16), 2097152);
150 naive_libxsmm_output = (float*)libxsmm_aligned_malloc( nImg*nOFm*sizeof(float), 2097152);
151 naive_libxsmm_delfilter = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nIFm*nOFm*sizeof(libxsmm_bfloat16), 2097152);
152 naive_libxsmm_delinput_f32 = (float*)libxsmm_aligned_malloc( nImg*nIFm*sizeof(float), 2097152);
153 naive_libxsmm_delfilter_f32 = (float*)libxsmm_aligned_malloc( nIFm*nOFm*sizeof(float), 2097152);
154
155 input_libxsmm = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nIFm*sizeof(libxsmm_bfloat16), 2097152);
156 delinput_libxsmm = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nImg*nIFm*sizeof(libxsmm_bfloat16), 2097152);
157 output_libxsmm = (float*)libxsmm_aligned_malloc( nImg*nOFm*sizeof(float), 2097152);
158 deloutput_libxsmm = (float*)libxsmm_aligned_malloc( nImg*nOFm*sizeof(float), 2097152);
159 filter_libxsmm = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nIFm*nOFm*sizeof(libxsmm_bfloat16), 2097152);
160 delfilter_libxsmm = (libxsmm_bfloat16*)libxsmm_aligned_malloc( nIFm*nOFm*sizeof(libxsmm_bfloat16), 2097152);
161
162 /* initialize data */
163 init_buf( naive_input, nImg*nIFm, 0, 0 );
164 init_buf( naive_delinput, nImg*nIFm, 0, 0 );
165 init_buf( naive_output, nImg*nOFm, 0, 0 );
166 init_buf( naive_deloutput, nImg*nOFm, 0, 0 );
167 init_buf( naive_filter, nIFm*nOFm, 0, 0 );
168 init_buf( naive_delfilter, nIFm*nOFm, 0, 0 );
169
170 libxsmm_rne_convert_fp32_bf16( naive_input, naive_input_bf16, nImg*nIFm );
171 libxsmm_rne_convert_fp32_bf16( naive_delinput, naive_delinput_bf16, nImg*nIFm );
172 libxsmm_rne_convert_fp32_bf16( naive_filter, naive_filter_bf16, nIFm*nOFm );
173 libxsmm_rne_convert_fp32_bf16( naive_delfilter, naive_delfilter_bf16, nIFm*nOFm );
174
175 if (LIBXSMM_NEQ(0, check)) {
176 printf("##########################################\n");
177 printf("# Computing Reference ... #\n");
178 printf("##########################################\n");
179 if (type == 'A' || type == 'F') {
180 naive_fullyconnected_fp(&naive_param, naive_input, naive_output, naive_filter);
181 }
182 if (type == 'A' || type == 'B') {
183 naive_fullyconnected_bp(&naive_param, naive_delinput, naive_deloutput, naive_filter);
184 }
185 if (type == 'A' || type == 'U') {
186 naive_fullyconnected_wu(&naive_param, naive_input, naive_deloutput, naive_delfilter);
187 }
188 printf("##########################################\n");
189 printf("# Computing Reference ... done #\n");
190 printf("##########################################\n");
191 }
192
193 if (format == 'A' || format == 'L') {
194 printf("\n");
195 printf("##########################################\n");
196 printf("# Setting Up (custom-Storage) #\n");
197 printf("##########################################\n");
198
199 /* setup LIBXSMM handle */
200 fullyconnected_desc.N = nImg;
201 fullyconnected_desc.C = nIFm;
202 fullyconnected_desc.K = nOFm;
203 fullyconnected_desc.threads = nThreads;
204 fullyconnected_desc.datatype_in = LIBXSMM_DNN_DATATYPE_BF16;
205 fullyconnected_desc.datatype_out = LIBXSMM_DNN_DATATYPE_F32;
206 fullyconnected_desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM;
207 fullyconnected_desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM;
208 fullyconnected_desc.fuse_ops = LIBXSMM_DNN_FULLYCONNECTED_FUSE_NONE;
209
210 libxsmm_handle = libxsmm_dnn_create_fullyconnected( fullyconnected_desc, &status );
211 CHKERR_LIBXSMM_DNN( status );
212
213 /* setup LIBXSMM buffers */
214 libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_REGULAR_INPUT, &status ); CHKERR_LIBXSMM_DNN( status );
215 libxsmm_input = libxsmm_dnn_link_tensor( libxsmm_layout, input_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status );
216 printf("inner activation blocking: %i\n", libxsmm_layout->dim_size[0] );
217 libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout );
218
219 libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_GRADIENT_INPUT, &status ); CHKERR_LIBXSMM_DNN( status );
220 libxsmm_delinput = libxsmm_dnn_link_tensor( libxsmm_layout, delinput_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status );
221 libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout );
222
223 libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_REGULAR_OUTPUT, &status ); CHKERR_LIBXSMM_DNN( status );
224 libxsmm_output = libxsmm_dnn_link_tensor( libxsmm_layout, output_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status );
225 libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout );
226
227 libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_GRADIENT_OUTPUT, &status ); CHKERR_LIBXSMM_DNN( status );
228 libxsmm_deloutput = libxsmm_dnn_link_tensor( libxsmm_layout, deloutput_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status );
229 libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout );
230
231 libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_REGULAR_FILTER, &status ); CHKERR_LIBXSMM_DNN( status );
232 libxsmm_filter = libxsmm_dnn_link_tensor( libxsmm_layout, filter_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status );
233 libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout );
234
235 libxsmm_layout = libxsmm_dnn_fullyconnected_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_GRADIENT_FILTER, &status ); CHKERR_LIBXSMM_DNN( status );
236 libxsmm_delfilter = libxsmm_dnn_link_tensor( libxsmm_layout, delfilter_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status );
237 libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout );
238
239 /* copy in data to LIBXSMM format */
240 /* we can also use the layout functions and set the data on our
241 own external to the library */
242 CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_input, (void*)naive_input_bf16, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) );
243 CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_output, (void*)naive_output, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) );
244 CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_filter, (void*)naive_filter_bf16, LIBXSMM_DNN_TENSOR_FORMAT_KCRS ) );
245 CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_delinput, (void*)naive_delinput_bf16, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) );
246 CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_deloutput, (void*)naive_deloutput, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) );
247 CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_delfilter, (void*)naive_delfilter_bf16, LIBXSMM_DNN_TENSOR_FORMAT_KCRS ) );
248
249 /* bind buffers and filter to handle */
250 CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_handle, libxsmm_input, LIBXSMM_DNN_REGULAR_INPUT ) );
251 CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_handle, libxsmm_delinput, LIBXSMM_DNN_GRADIENT_INPUT ) );
252 CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_handle, libxsmm_output, LIBXSMM_DNN_REGULAR_OUTPUT ) );
253 CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_handle, libxsmm_deloutput, LIBXSMM_DNN_GRADIENT_OUTPUT ) );
254 CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_handle, libxsmm_filter, LIBXSMM_DNN_REGULAR_FILTER ) );
255 CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_tensor( libxsmm_handle, libxsmm_delfilter, LIBXSMM_DNN_GRADIENT_FILTER ) );
256
257 /* let's allocate and bind scratch */
258 scratch_size = libxsmm_dnn_fullyconnected_get_scratch_size( libxsmm_handle, &status );
259 CHKERR_LIBXSMM_DNN( status );
260 scratch = libxsmm_aligned_scratch( scratch_size, 2097152 );
261 CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_bind_scratch( libxsmm_handle, scratch ) );
262 /* set scratch to bogus to make sure that libxsmm takes care of zeroing internally */
263 init_buf( (float*)scratch, scratch_size/4, 0, 0 );
264
265 if ((type == 'A' || type == 'F') && LIBXSMM_NEQ(0, check)) {
266 printf("##########################################\n");
267 printf("# Correctness - FWD (custom-Storage) #\n");
268 printf("##########################################\n");
269
270 #if defined(_OPENMP)
271 # pragma omp parallel
272 #endif
273 {
274 #if defined(_OPENMP)
275 const int tid = omp_get_thread_num();
276 #else
277 const int tid = 0;
278 #endif
279 CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ) );
280 }
281
282 /* copy out data */
283 CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyout_tensor( libxsmm_output, (void*)naive_libxsmm_output, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) );
284
285 /* compare */
286 libxsmm_matdiff(&norms_fwd, LIBXSMM_DATATYPE_F32, nImg*nOFm, 1, naive_output, naive_libxsmm_output, 0, 0);
287 printf("L1 reference : %.25g\n", norms_fwd.l1_ref);
288 printf("L1 test : %.25g\n", norms_fwd.l1_tst);
289 printf("L2 abs.error : %.24f\n", norms_fwd.l2_abs);
290 printf("L2 rel.error : %.24f\n", norms_fwd.l2_rel);
291 printf("Linf abs.error: %.24f\n", norms_fwd.linf_abs);
292 printf("Linf rel.error: %.24f\n", norms_fwd.linf_rel);
293 printf("Check-norm : %.24f\n", norms_fwd.normf_rel);
294 libxsmm_matdiff_reduce(&diff, &norms_fwd);
295 }
296
297 if ( (type == 'A' || type == 'B') && LIBXSMM_NEQ(0, check) ) {
298 printf("##########################################\n");
299 printf("# Correctness - BWD (custom-Storage) #\n");
300 printf("##########################################\n");
301
302 #if defined(_OPENMP)
303 # pragma omp parallel
304 #endif
305 {
306 #if defined(_OPENMP)
307 const int tid = omp_get_thread_num();
308 #else
309 const int tid = 0;
310 #endif
311 CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid ) );
312 }
313
314 /* copy out data */
315 CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyout_tensor( libxsmm_delinput, (void*)naive_libxsmm_delinput, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) );
316 libxsmm_convert_bf16_f32( naive_libxsmm_delinput, naive_libxsmm_delinput_f32, nImg*nIFm );
317
318 /* compare */
319 libxsmm_matdiff(&norms_bwd, LIBXSMM_DATATYPE_F32, nImg*nIFm, 1, naive_delinput, naive_libxsmm_delinput_f32, 0, 0);
320 printf("L1 reference : %.25g\n", norms_bwd.l1_ref);
321 printf("L1 test : %.25g\n", norms_bwd.l1_tst);
322 printf("L2 abs.error : %.24f\n", norms_bwd.l2_abs);
323 printf("L2 rel.error : %.24f\n", norms_bwd.l2_rel);
324 printf("Linf abs.error: %.24f\n", norms_bwd.linf_abs);
325 printf("Linf rel.error: %.24f\n", norms_bwd.linf_rel);
326 printf("Check-norm : %.24f\n", norms_bwd.normf_rel);
327 libxsmm_matdiff_reduce(&diff, &norms_bwd);
328 }
329
330 if ( (type == 'A' || type == 'U') && LIBXSMM_NEQ(0, check) ) {
331 printf("##########################################\n");
332 printf("# Correctness - UPD (custom-Storage) #\n");
333 printf("##########################################\n");
334
335 #if defined(_OPENMP)
336 # pragma omp parallel
337 #endif
338 {
339 #if defined(_OPENMP)
340 const int tid = omp_get_thread_num();
341 #else
342 const int tid = 0;
343 #endif
344 CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_UPD, 0, tid ) );
345 }
346
347 /* copy out data */
348 CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyout_tensor( libxsmm_delfilter, (void*)naive_libxsmm_delfilter, LIBXSMM_DNN_TENSOR_FORMAT_KCRS ) );
349 libxsmm_convert_bf16_f32( naive_libxsmm_delfilter, naive_libxsmm_delfilter_f32, nIFm*nOFm );
350
351 /* compare */
352 libxsmm_matdiff(&norms_upd, LIBXSMM_DATATYPE_F32, nIFm*nOFm, 1, naive_delfilter, naive_libxsmm_delfilter_f32, 0, 0);
353 printf("L1 reference : %.25g\n", norms_upd.l1_ref);
354 printf("L1 test : %.25g\n", norms_upd.l1_tst);
355 printf("L2 abs.error : %.24f\n", norms_upd.l2_abs);
356 printf("L2 rel.error : %.24f\n", norms_upd.l2_rel);
357 printf("Linf abs.error: %.24f\n", norms_upd.linf_abs);
358 printf("Linf rel.error: %.24f\n", norms_upd.linf_rel);
359 printf("Check-norm : %.24f\n", norms_upd.normf_rel);
360 libxsmm_matdiff_reduce(&diff, &norms_upd);
361 }
362
363 if (type == 'A' || type == 'F') {
364 printf("##########################################\n");
365 printf("# Performance - FWD (custom-Storage) #\n");
366 printf("##########################################\n");
367 l_start = libxsmm_timer_tick();
368 #if defined(_OPENMP)
369 # pragma omp parallel private(i)
370 #endif
371 {
372 #if defined(_OPENMP)
373 const int tid = omp_get_thread_num();
374 #else
375 const int tid = 0;
376 #endif
377 for (i = 0; i < iters; ++i) {
378 libxsmm_dnn_fullyconnected_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid );
379 }
380 }
381 l_end = libxsmm_timer_tick();
382 l_total = libxsmm_timer_duration(l_start, l_end);
383
384 gflop = (2.0*(double)nImg*(double)nIFm*(double)nOFm*(double)iters) / (1000*1000*1000);
385
386 printf("GFLOP = %.5g\n", gflop/(double)iters);
387 printf("fp time = %.5g\n", ((double)(l_total/iters)));
388 printf("GFLOPS = %.5g\n", gflop/l_total);
389
390 printf("PERFDUMP,FP,%s,%i,%i,%i,%i,%.5g,%.5g,%f,%f,%f,%f,%f,%f,%f\n", LIBXSMM_VERSION, nThreads, nImg, nIFm,
391 nOFm, ((double)(l_total/iters)), gflop/l_total, norms_fwd.l1_ref, norms_fwd.l1_tst,
392 norms_fwd.l2_abs, norms_fwd.l2_rel, norms_fwd.linf_abs, norms_fwd.linf_rel, norms_fwd.normf_rel);
393 }
394
395 if (type == 'A' || type == 'B') {
396 printf("##########################################\n");
397 printf("# Performance - BWD (custom-Storage) #\n");
398 printf("##########################################\n");
399 l_start = libxsmm_timer_tick();
400 #if defined(_OPENMP)
401 # pragma omp parallel private(i)
402 #endif
403 {
404 #if defined(_OPENMP)
405 const int tid = omp_get_thread_num();
406 #else
407 const int tid = 0;
408 #endif
409 for (i = 0; i < iters; ++i) {
410 libxsmm_dnn_fullyconnected_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_BWD, 0, tid );
411 }
412 }
413 l_end = libxsmm_timer_tick();
414 l_total = libxsmm_timer_duration(l_start, l_end);
415
416 gflop = (2.0*(double)nImg*(double)nIFm*(double)nOFm*(double)iters) / (1000*1000*1000);
417
418 printf("GFLOP = %.5g\n", gflop/(double)iters);
419 printf("fp time = %.5g\n", ((double)(l_total/iters)));
420 printf("GFLOPS = %.5g\n", gflop/l_total);
421
422 printf("PERFDUMP,BP,%s,%i,%i,%i,%i,%.5g,%.5g,%f,%f,%f,%f,%f,%f,%f\n", LIBXSMM_VERSION, nThreads, nImg, nIFm,
423 nOFm, ((double)(l_total/iters)), gflop/l_total, norms_bwd.l1_ref, norms_bwd.l1_tst,
424 norms_bwd.l2_abs, norms_bwd.l2_rel, norms_bwd.linf_abs, norms_bwd.linf_rel, norms_bwd.normf_rel);
425 }
426
427 if (type == 'A' || type == 'U') {
428 printf("##########################################\n");
429 printf("# Performance - UPD (custom-Storage) #\n");
430 printf("##########################################\n");
431 l_start = libxsmm_timer_tick();
432 #if defined(_OPENMP)
433 # pragma omp parallel private(i)
434 #endif
435 {
436 #if defined(_OPENMP)
437 const int tid = omp_get_thread_num();
438 #else
439 const int tid = 0;
440 #endif
441 for (i = 0; i < iters; ++i) {
442 libxsmm_dnn_fullyconnected_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_UPD, 0, tid );
443 }
444 }
445 l_end = libxsmm_timer_tick();
446 l_total = libxsmm_timer_duration(l_start, l_end);
447
448 gflop = (2.0*(double)nImg*(double)nIFm*(double)nOFm*(double)iters) / (1000*1000*1000);
449
450 printf("GFLOP = %.5g\n", gflop/(double)iters);
451 printf("fp time = %.5g\n", ((double)(l_total/iters)));
452 printf("GFLOPS = %.5g\n", gflop/l_total);
453
454 printf("PERFDUMP,UP,%s,%i,%i,%i,%i,%.5g,%.5g,%f,%f,%f,%f,%f,%f,%f\n", LIBXSMM_VERSION, nThreads, nImg, nIFm,
455 nOFm, ((double)(l_total/iters)), gflop/l_total, norms_upd.l1_ref, norms_upd.l1_tst,
456 norms_upd.l2_abs, norms_upd.l2_rel, norms_upd.linf_abs, norms_upd.linf_rel, norms_upd.normf_rel);
457 }
458
459 /* clean-up */
460 CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_scratch( libxsmm_handle ) );
461 libxsmm_free(scratch);
462 CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_INPUT ) );
463 CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_handle, LIBXSMM_DNN_GRADIENT_INPUT ) );
464 CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_OUTPUT ) );
465 CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_handle, LIBXSMM_DNN_GRADIENT_OUTPUT ) );
466 CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_FILTER ) );
467 CHKERR_LIBXSMM_DNN( libxsmm_dnn_fullyconnected_release_tensor( libxsmm_handle, LIBXSMM_DNN_GRADIENT_FILTER ) );
468
469 CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_input ) );
470 CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_delinput ) );
471 CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_output ) );
472 CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_deloutput ) );
473 CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_filter ) );
474 CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_delfilter ) );
475
476 CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_fullyconnected( libxsmm_handle ) );
477 }
478
479 /* deallocate data */
480 libxsmm_free(naive_input);
481 libxsmm_free(naive_output);
482 libxsmm_free(naive_delinput);
483 libxsmm_free(naive_deloutput);
484 libxsmm_free(naive_filter);
485 libxsmm_free(naive_delfilter);
486 libxsmm_free(naive_input_bf16);
487 libxsmm_free(naive_delinput_bf16);
488 libxsmm_free(naive_filter_bf16);
489 libxsmm_free(naive_delfilter_bf16);
490 libxsmm_free(naive_libxsmm_output);
491 libxsmm_free(naive_libxsmm_delinput);
492 libxsmm_free(naive_libxsmm_delfilter);
493 libxsmm_free(naive_libxsmm_delinput_f32);
494 libxsmm_free(naive_libxsmm_delfilter_f32);
495 libxsmm_free(input_libxsmm);
496 libxsmm_free(output_libxsmm);
497 libxsmm_free(delinput_libxsmm);
498 libxsmm_free(deloutput_libxsmm);
499 libxsmm_free(filter_libxsmm);
500 libxsmm_free(delfilter_libxsmm);
501
502 { const char *const env_check_scale = getenv("CHECK_SCALE");
503 const double check_scale = LIBXSMM_ABS(0 == env_check_scale ? 1.0 : atof(env_check_scale));
504 if (LIBXSMM_NEQ(0, check) && (check < 100.0 * check_scale * diff.normf_rel) && (global_status == LIBXSMM_DNN_SUCCESS)) {
505 fprintf(stderr, "FAILED with an error of %f%%!\n", 100.0 * diff.normf_rel);
506 exit(EXIT_FAILURE);
507 }
508 }
509
510 /* some empty lines at the end */
511 printf("\n\n\n");
512
513 return global_status;
514 }
515
516