1 /******************************************************************************
2 * Copyright (c) Intel Corporation - All rights reserved. *
3 * This file is part of the LIBXSMM library. *
4 * *
5 * For information on the license, see the LICENSE file. *
6 * Further information: https://github.com/hfp/libxsmm/ *
7 * SPDX-License-Identifier: BSD-3-Clause *
8 ******************************************************************************/
9 /* Alexander Heinecke, Evangelos Georganas, Hans Pabst,
10 Dhiraj Kalamkar, Ankush Mandal (Intel Corp.)
11 ******************************************************************************/
12 #include <libxsmm.h>
13 #include <stdlib.h>
14 #include <string.h>
15 #include <stdio.h>
16 #if defined(_OPENMP)
17 # include <omp.h>
18 #endif
19
20 #define USE_OVERWRITE
21
22 /* include c-based dnn library */
23 #include "../common/dnn_common.h"
24
/* Evaluates the LIBXSMM DNN status expression A exactly once. If the result
   differs from LIBXSMM_DNN_SUCCESS, the library's error text is printed to
   stderr and the code is stored in a variable named `global_status`, which
   must be in scope at the point of use. Execution continues afterwards.
   Wrapped in do { } while (0) so that the macro behaves like a single
   statement (safe inside unbraced if/else); the argument is parenthesized
   to keep it intact regardless of the operators it contains. */
#define CHKERR_LIBXSMM_DNN(A) do { \
  const int chkerr_libxsmm_dnn_ = (A); \
  if (LIBXSMM_DNN_SUCCESS != chkerr_libxsmm_dnn_) { \
    fprintf(stderr, "%s\n", libxsmm_dnn_get_error(chkerr_libxsmm_dnn_)); \
    global_status = chkerr_libxsmm_dnn_; \
  } \
} while (0)
28
main(int argc,char * argv[])29 int main(int argc, char* argv[])
30 {
31 unsigned char *naive_input, *naive_input_tmp;
32 char *naive_filter;
33 int *naive_output_fp;
34 int *naive_libxsmm_output;
35 unsigned char *input_libxsmm;
36 char *filter_libxsmm;
37 int *output_libxsmm;
38 int ifhp, ifwp, ofhp, ofwp, ofh, ofw;
39 int stride_h, stride_w, pad_h, pad_w, pad_h_in, pad_w_in, pad_h_out, pad_w_out;
40 naive_conv_t naive_param;
41 void* scratch;
42 size_t scratch_size;
43
44 /* some parameters we can overwrite via cli,
45 default is some inner layer of overfeat */
46 int iters = 10; /* repetitions of benchmark */
47 int ifw = 14; /* input width, "W" */
48 int ifh = 18; /* input height, "H" */
49 int nImg = 32; /* mini-batch size, "N" */
50 int nIfm = 256; /* number of input feature maps, "C" */
51 int nOfm = 512; /* number of output feature maps, "K" */
52 int kh = 3; /* filter height, "R" */
53 int kw = 3; /* filter width, "S" */
54 int padh = 1; /* padding in input, height */
55 int padw = 1; /* padding in input, width */
56 int stride = 1; /* stride when accessing inputs */
57 char type = 'A'; /* 'A': ALL, 'F': FP, 'B': BP, 'U', WU */
58 char format = 'L';
59
60 const char *const env_check = getenv("CHECK");
61 const double check = LIBXSMM_ABS(0 == env_check ? 1 : atof(env_check));
62
63 #if defined(_OPENMP)
64 int nThreads = omp_get_max_threads(); /* number of threads */
65 #else
66 int nThreads = 1; /* number of threads */
67 #endif
68 int padding_mode = 0; /* padding mode */
69
70 unsigned long long l_start, l_end;
71 double l_total = 0.0;
72 double lpOps = 0.0; /* number of low precision operations */
73 int i;
74
75 libxsmm_dnn_conv_desc conv_desc;
76 libxsmm_dnn_layer* libxsmm_handle;
77 libxsmm_dnn_tensor* libxsmm_input;
78 libxsmm_dnn_tensor* libxsmm_output;
79 libxsmm_dnn_tensor* libxsmm_filter;
80 libxsmm_dnn_tensor_datalayout* libxsmm_layout;
81 libxsmm_dnn_err_t status;
82 libxsmm_dnn_err_t global_status = LIBXSMM_DNN_SUCCESS;
83
84 libxsmm_matdiff_info norms_fwd, diff;
85 libxsmm_matdiff_clear(&norms_fwd);
86 libxsmm_matdiff_clear(&diff);
87
88 if (argc > 1 && !strncmp(argv[1], "-h", 3)) {
89 printf("Usage: %s iters inpWidth inpHeight nImg nIfm nOfm kw kh pad stride type padding_mode\n", argv[0]);
90 return 0;
91 }
92 srand(1);
93
94 /* reading new values from cli */
95 i = 1;
96 if (argc > i) iters = atoi(argv[i++]);
97 if (argc > i) ifw = atoi(argv[i++]);
98 if (argc > i) ifh = atoi(argv[i++]);
99 if (argc > i) nImg = atoi(argv[i++]);
100 if (argc > i) nIfm = atoi(argv[i++]);
101 if (argc > i) nOfm = atoi(argv[i++]);
102 if (argc > i) kw = atoi(argv[i++]);
103 if (argc > i) kh = atoi(argv[i++]);
104 if (argc > i) padw = atoi(argv[i++]);
105 if (argc > i) padh = atoi(argv[i++]);
106 if (argc > i) stride = atoi(argv[i++]);
107 if (argc > i) type = *(argv[i++]);
108 if (argc > i) format = *(argv[i++]);
109 if (argc > i) padding_mode = atoi(argv[i++]);
110
111 if (type != 'A' && type != 'F' && type != 'B' && type != 'U') {
112 printf("type needs to be 'A' (All), 'F' (FP only), 'B' (BP only), 'U' (WU only)\n");
113 return 0;
114 }
115
116 if (format != 'L') {
117 printf("format needs to be 'L'\n");
118 return 0;
119 }
120
121 stride_w = stride;
122 stride_h = stride;
123 pad_w = padw;
124 pad_h = padh;
125
126 if (0 == padding_mode) {
127 pad_h_in = 0;
128 pad_w_in = 0;
129 pad_h_out = 0;
130 pad_w_out = 0;
131 }
132 else {
133 /* TODO: change "1" to "0" if "padding_mode = -1" is acknowledged */
134 if (1 < padding_mode) pad_w = padding_mode;
135 pad_h_in = pad_h;
136 pad_w_in = pad_w;
137 pad_h_out = pad_h;
138 pad_w_out = pad_w;
139 }
140
141 /* deriving some values for naive code */
142 ofh = (ifh + 2 * pad_h - kh) / stride_h + 1;
143 ofw = (ifw + 2 * pad_w - kw) / stride_w + 1;
144 ifhp = ifh + 2 * pad_h_in;
145 ifwp = ifw + 2 * pad_w_in;
146 ofhp = ofh + 2 * pad_h_out;
147 ofwp = ofw + 2 * pad_w_out;
148
149 /* set struct for naive convolution */
150 naive_param.nImg = nImg;
151 naive_param.nIfm = nIfm;
152 naive_param.nOfm = nOfm;
153 naive_param.ifhp = ifhp;
154 naive_param.ifwp = ifwp;
155 naive_param.ifh = ifh;
156 naive_param.ifw = ifw;
157 naive_param.ofhp = ofhp;
158 naive_param.ofwp = ofwp;
159 naive_param.ofh = ofh;
160 naive_param.ofw = ofw;
161 naive_param.pad_h = pad_h;
162 naive_param.pad_w = pad_w;
163 naive_param.pad_h_in = pad_h_in;
164 naive_param.pad_w_in = pad_w_in;
165 naive_param.pad_h_out = pad_h_out;
166 naive_param.pad_w_out = pad_w_out;
167 naive_param.kh = kh;
168 naive_param.kw = kw;
169 naive_param.stride_h = stride_h;
170 naive_param.stride_w = stride_w;
171
172 /* print some summary */
173 printf("##########################################\n");
174 printf("# Setting Up Common #\n");
175 printf("##########################################\n");
176 printf("PARAMS: W:%d H:%d N:%d C:%d K:%d R:%d S:%d P:%d Q:%d STRIDE:%d\n", ifw, ifh, nImg, nIfm, nOfm, kw, kh, ofh, ofw, stride);
177 printf("PARAMS: ITERS:%d", iters); if (LIBXSMM_FEQ(0, check)) printf(" Threads:%d\n", nThreads); else printf("\n");
178 printf(" InImg %dx%d Padded (%dx%d)\n", ifh, ifw, ifhp, ifwp);
179 printf("OutImg %dx%d Padded (%dx%d)\n", ofh, ofw, ofhp, ofwp);
180 printf("SIZE Input (MB): %10.2f MiB\n", (double)(nImg*nIfm*ifhp*ifwp*sizeof(unsigned char))/(1024.0*1024.0) );
181 printf("SIZE Output (MB): %10.2f MiB\n", (double)(nImg*nOfm*ofhp*ofwp*sizeof(int))/(1024.0*1024.0) );
182 printf("SIZE Input (1): %10.2f MiB\n", (double)(1*nIfm*ifhp*ifwp* sizeof(unsigned char))/(1024.0*1024.0) );
183 printf("SIZE Output (1): %10.2f MiB\n", (double)(1*nOfm*ofhp*ofwp* sizeof(int))/(1024.0*1024.0) );
184 printf("SIZE Weight : %10.2f MiB\n", (double)(nIfm*nOfm*kw*kh* sizeof(char))/(1024.0*1024.0) );
185
186 /* allocate data */
187 naive_input = (unsigned char*)libxsmm_aligned_malloc( nImg*nIfm*ifhp*ifwp*sizeof(unsigned char), 2097152);
188 naive_output_fp = (int* )libxsmm_aligned_malloc( nImg*nOfm*ofhp*ofwp*sizeof(int), 2097152);
189 naive_libxsmm_output = (int* )libxsmm_aligned_malloc( nImg*nOfm*ofhp*ofwp*sizeof(int), 2097152);
190 naive_filter = (char*)libxsmm_aligned_malloc( nOfm*nIfm*kh*kw* sizeof(char), 2097152);
191 input_libxsmm = (unsigned char*)libxsmm_aligned_malloc( nImg*nIfm*ifhp*ifwp*sizeof(unsigned char), 2097152);
192 filter_libxsmm = (char*)libxsmm_aligned_malloc( nOfm*nIfm*kh*kw* sizeof(char), 2097152);
193 output_libxsmm = (int*) libxsmm_aligned_malloc( nImg*nOfm*ofhp*ofwp*sizeof(int), 2097152);
194
195 /* initialize data */
196 naive_input_tmp = (unsigned char*)libxsmm_aligned_malloc( nImg*nIfm*ifhp*ifwp*sizeof(unsigned char), 2097152);
197 zero_buf_uint8(naive_input, nImg*nIfm*ifhp*ifwp);
198 if (padding_mode == 0 ) {
199 init_buf_uint8(naive_input, nImg*nIfm*ifhp*ifwp, 0, 0);
200 } else {
201 init_buf_uint8(naive_input_tmp, nImg*nIfm*ifh*ifw, 0, 0);
202 copy_internal_nchw_uint8( naive_input , naive_input_tmp, nImg, nIfm, ifh, ifw, pad_h, pad_w);
203 }
204 init_buf_int8(naive_filter, nOfm*nIfm*kh*kw, 0, 0);
205 zero_buf_int32(naive_output_fp, nImg*nOfm*ofhp*ofwp);
206 zero_buf_int32(output_libxsmm, nImg*nOfm*ofhp*ofwp);
207 zero_buf_int32(naive_libxsmm_output, nImg*nOfm*ofhp*ofwp);
208
209 if (LIBXSMM_NEQ(0, check)) {
210 printf("##########################################\n");
211 printf("# Computing Reference ... #\n");
212 printf("##########################################\n");
213 /* run naive convolutions */
214 if (type == 'A' || type == 'F') {
215 naive_conv_fp_int8int32(&naive_param, naive_input, naive_output_fp, naive_filter);
216 }
217
218 printf("##########################################\n");
219 printf("# Computing Reference ... done #\n");
220 printf("##########################################\n");
221 }
222
223 printf("\n");
224 printf("##########################################\n");
225 printf("# Setting Up (custom-Storage) #\n");
226 printf("##########################################\n");
227
228 /* setup LIBXSMM handle */
229 conv_desc.N = nImg;
230 conv_desc.C = nIfm;
231 conv_desc.H = ifh;
232 conv_desc.W = ifw;
233 conv_desc.K = nOfm;
234 conv_desc.R = kh;
235 conv_desc.S = kw;
236 conv_desc.u = stride_h;
237 conv_desc.v = stride_w;
238 conv_desc.pad_h = pad_h;
239 conv_desc.pad_w = pad_w;
240 conv_desc.pad_h_in = pad_h_in;
241 conv_desc.pad_w_in = pad_w_in;
242 conv_desc.pad_h_out = pad_h_out;
243 conv_desc.pad_w_out = pad_w_out;
244 conv_desc.threads = nThreads;
245 conv_desc.algo = LIBXSMM_DNN_CONV_ALGO_DIRECT;
246 conv_desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM;
247 conv_desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM;
248 conv_desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE;
249 conv_desc.options = LIBXSMM_DNN_CONV_OPTION_OVERWRITE;
250 conv_desc.datatype_in = LIBXSMM_DNN_DATATYPE_I8;
251 conv_desc.datatype_out = LIBXSMM_DNN_DATATYPE_I32;
252
253 libxsmm_handle = libxsmm_dnn_create_conv_layer( conv_desc, &status );
254 CHKERR_LIBXSMM_DNN( status );
255
256 /* setup LIBXSMM buffers and filter */
257 libxsmm_layout = libxsmm_dnn_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_INPUT, &status ); CHKERR_LIBXSMM_DNN( status );
258 libxsmm_input = libxsmm_dnn_link_tensor( libxsmm_layout, input_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status );
259 libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout );
260
261 libxsmm_layout = libxsmm_dnn_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_OUTPUT, &status ); CHKERR_LIBXSMM_DNN( status );
262 libxsmm_output = libxsmm_dnn_link_tensor( libxsmm_layout, output_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status );
263 libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout );
264
265 libxsmm_layout = libxsmm_dnn_create_tensor_datalayout( libxsmm_handle, LIBXSMM_DNN_FILTER, &status ); CHKERR_LIBXSMM_DNN( status );
266 libxsmm_filter = libxsmm_dnn_link_tensor( libxsmm_layout, filter_libxsmm, &status ); CHKERR_LIBXSMM_DNN( status );
267 libxsmm_dnn_destroy_tensor_datalayout( libxsmm_layout );
268
269 /* copy in data to LIBXSMM format */
270 /* we can also use the layout functions and set the data on our
271 own external to the library, @TODO, we plan to add an example here */
272 CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_input, (void*)naive_input, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) );
273 CHKERR_LIBXSMM_DNN( libxsmm_dnn_zero_tensor( libxsmm_output ) );
274 CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyin_tensor( libxsmm_filter, (void*)naive_filter, LIBXSMM_DNN_TENSOR_FORMAT_KCRS ) );
275
276 /* bind buffers and filter to handle */
277 CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_tensor( libxsmm_handle, libxsmm_input, LIBXSMM_DNN_REGULAR_INPUT ) );
278 CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_tensor( libxsmm_handle, libxsmm_output, LIBXSMM_DNN_REGULAR_OUTPUT ) );
279 CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_tensor( libxsmm_handle, libxsmm_filter, LIBXSMM_DNN_REGULAR_FILTER ) );
280
281 /* let's allocate and bind scratch */
282 scratch_size = libxsmm_dnn_get_scratch_size( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, &status );
283 CHKERR_LIBXSMM_DNN( status );
284 scratch = libxsmm_aligned_scratch( scratch_size, 2097152 );
285 CHKERR_LIBXSMM_DNN( libxsmm_dnn_bind_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL, scratch ) );
286 /* set scratch to bogus to make sure that libxsmm takes care of zeroing internally */
287
288 if ((type == 'A' || type == 'F') && LIBXSMM_NEQ(0, check)) {
289 printf("##############################################\n");
290 printf("# Check Correctness - FWD (custom-Storage) #\n");
291 printf("##############################################\n");
292 /* run LIBXSMM convolutions */
293 #if defined(_OPENMP)
294 # pragma omp parallel
295 #endif
296 {
297 #if defined(_OPENMP)
298 const int tid = omp_get_thread_num();
299 #else
300 const int tid = 0;
301 #endif
302 CHKERR_LIBXSMM_DNN( libxsmm_dnn_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid ) );
303 }
304 /* copy out data */
305 CHKERR_LIBXSMM_DNN( libxsmm_dnn_copyout_tensor( libxsmm_output, (void*)naive_libxsmm_output, LIBXSMM_DNN_TENSOR_FORMAT_NCHW ) );
306
307 /* compare */
308 libxsmm_matdiff(&norms_fwd, LIBXSMM_DATATYPE_I32, nImg*nOfm*ofhp*ofwp, 1, naive_output_fp, naive_libxsmm_output, 0, 0);
309 printf("L1 reference : %.25g\n", norms_fwd.l1_ref);
310 printf("L1 test : %.25g\n", norms_fwd.l1_tst);
311 printf("L2 abs.error : %.24f\n", norms_fwd.l2_abs);
312 printf("L2 rel.error : %.24f\n", norms_fwd.l2_rel);
313 printf("Linf abs.error: %.24f\n", norms_fwd.linf_abs);
314 printf("Linf rel.error: %.24f\n", norms_fwd.linf_rel);
315 printf("Check-norm : %.24f\n", norms_fwd.normf_rel);
316 libxsmm_matdiff_reduce(&diff, &norms_fwd);
317 }
318
319 if (type == 'A' || type == 'F') {
320 printf("##########################################\n");
321 printf("# Performance - FWD (custom-Storage) #\n");
322 printf("##########################################\n");
323 /* run LIBXSMM convolution for performance */
324 for (i = 0; i < 10; ++i) {
325 #if defined(_OPENMP)
326 # pragma omp parallel
327 #endif
328 {
329 #if defined(_OPENMP)
330 const int tid = omp_get_thread_num();
331 #else
332 const int tid = 0;
333 #endif
334 libxsmm_dnn_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid );
335 }
336 }
337 l_start = libxsmm_timer_tick();
338 for (i = 0; i < iters; ++i) {
339 #if defined(_OPENMP)
340 # pragma omp parallel
341 #endif
342 {
343 #if defined(_OPENMP)
344 const int tid = omp_get_thread_num();
345 #else
346 const int tid = 0;
347 #endif
348 libxsmm_dnn_execute_st( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_FWD, 0, tid );
349 }
350 }
351 l_end = libxsmm_timer_tick();
352 l_total = libxsmm_timer_duration(l_start, l_end);
353 lpOps = (double)nImg * (double)nIfm * (double)nOfm * (double)ofh * (double)ofw * (double)(2 * kh * kw) * (double)iters;
354
355 printf("GOP = %.5g\n", lpOps*1e-9/(double)iters);
356 printf("fp time = %.5g\n", ((double)(l_total/iters)));
357 printf("GOPS = %.5g\n", (lpOps*1e-9)/l_total);
358
359 printf("PERFDUMP,FP,%s,%i,%i,%i,%i,%i,%i,%i,%i,%i,%i,%i,%.5g,%.5g,%f,%f,%f,%f,%f,%f,%f\n", LIBXSMM_VERSION, nThreads, nImg, nIfm, nOfm,
360 ifw, ifh, kw, kh, stride, padw, padh, ((double)(l_total/iters)), (lpOps*1e-9)/l_total, norms_fwd.l1_ref, norms_fwd.l1_tst,
361 norms_fwd.l2_abs, norms_fwd.l2_rel, norms_fwd.linf_abs, norms_fwd.linf_rel, norms_fwd.normf_rel);
362 }
363
364 /* clean-up */
365 CHKERR_LIBXSMM_DNN( libxsmm_dnn_release_scratch( libxsmm_handle, LIBXSMM_DNN_COMPUTE_KIND_ALL ) );
366 libxsmm_free(scratch);
367 CHKERR_LIBXSMM_DNN( libxsmm_dnn_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_INPUT ) );
368 CHKERR_LIBXSMM_DNN( libxsmm_dnn_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_OUTPUT ) );
369 CHKERR_LIBXSMM_DNN( libxsmm_dnn_release_tensor( libxsmm_handle, LIBXSMM_DNN_REGULAR_FILTER ) );
370 CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_input ) );
371 CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_output ) );
372 CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_tensor( libxsmm_filter ) );
373 CHKERR_LIBXSMM_DNN( libxsmm_dnn_destroy_conv_layer( libxsmm_handle ) );
374
375 /* deallocate data */
376 libxsmm_free(naive_input);
377 libxsmm_free(naive_output_fp);
378 libxsmm_free(naive_libxsmm_output);
379 libxsmm_free(naive_filter);
380 libxsmm_free(input_libxsmm);
381 libxsmm_free(output_libxsmm);
382 libxsmm_free(filter_libxsmm);
383
384 { const char *const env_check_scale = getenv("CHECK_SCALE");
385 const double check_scale = LIBXSMM_ABS(0 == env_check_scale ? 1.0 : atof(env_check_scale));
386 if (LIBXSMM_NEQ(0, check) && (check < 100.0 * check_scale * diff.normf_rel) && (global_status == LIBXSMM_DNN_SUCCESS)) {
387 fprintf(stderr, "FAILED with an error of %f%%!\n", 100.0 * diff.normf_rel);
388 exit(EXIT_FAILURE);
389 }
390 }
391
392 /* some empty lines at the end */
393 printf("\n\n\n");
394
395 return global_status;
396 }
397
398