1 /******************************************************************************
2 * Copyright (c) Intel Corporation - All rights reserved. *
3 * This file is part of the LIBXSMM library. *
4 * *
5 * For information on the license, see the LICENSE file. *
6 * Further information: https://github.com/hfp/libxsmm/ *
7 * SPDX-License-Identifier: BSD-3-Clause *
8 ******************************************************************************/
9 /* Greg Henry, Hans Pabst, Alexander Heinecke (Intel Corp.)
10 ******************************************************************************/
11 #if 0
12 #define USE_KERNEL_GENERATION_DIRECTLY
13 #endif
14 #if 0
15 #define USE_PREDEFINED_ASSEMBLY
16 #define USE_XSMM_GENERATED
17 #define TIME_MKL
18 #endif
19 #if 0
20 #define TEST_SINGLE
21 #endif
22
23 #if !defined(USE_PREDEFINED_ASSEMBLY) && !defined(USE_XSMM_GENERATED) && !defined(TIME_MKL) && \
24 (!defined(__linux__) || !defined(USE_KERNEL_GENERATION_DIRECTLY))
25 # define USE_XSMM_GENERATED
26 # include <libxsmm.h>
27 #else
28 # include <libxsmm_source.h>
29 # include <unistd.h>
30 # include <signal.h>
31 # include <malloc.h>
32 # include <sys/mman.h>
33 #endif
34 #include <stdlib.h>
35 #include <string.h>
36 #include <stdio.h>
37 #include <math.h>
38
39 #define BUFSIZE 32*32
40 #define BUFSIZE2 64000
41
42 #if 0
43 #define TRIANGLE_IS_IDENTITY
44 #endif
45
46 LIBXSMM_INLINE
dcopy_to_temp(int layout,double * A,int lda,int m,int n,double * Atemp,unsigned int VLEN)47 void dcopy_to_temp ( int layout, double *A, int lda, int m, int n, double *Atemp,
48 unsigned int VLEN )
49 {
50 int i, j;
51
52 if ( lda*n > BUFSIZE )
53 {
54 printf("Reference routine not set up for matrices so large\n");
55 exit(-1);
56 }
57 if ( layout == 102 )
58 {
59 /* printf("Column major\n"); */
60 for ( j = 0; j < n; j++ )
61 {
62 for ( i = 0; i < m; i++ )
63 {
64 Atemp[i+j*m] = A[i*VLEN+j*lda*VLEN];
65 }
66 }
67 #if EVENTUALLY_USE_THIS_LOOP_IT_SHOULD_BE_FASTER
68 for ( j = 0; j < n; j++ )
69 {
70 for ( i = 0, ia = 0; i < m; i++, ia+=VLEN )
71 {
72 Atemp[i+j*m] = A[ ia+j*lda*VLEN ];
73 }
74 }
75 #endif
76 } else {
77 /* printf("Row major\n"); */
78 for ( j = 0; j < n; j++ )
79 {
80 for ( i = 0; i < m; i++ )
81 {
82 /* Transpose the data */
83 Atemp[i+j*m] = A[j*VLEN+i*lda*VLEN];
84 }
85 }
86 }
87 }
88
89 LIBXSMM_INLINE
scopy_to_temp(int layout,float * A,int lda,int m,int n,float * Atemp,unsigned int VLEN)90 void scopy_to_temp ( int layout, float *A, int lda, int m, int n, float *Atemp,
91 unsigned int VLEN )
92 {
93 int i, j;
94
95 if ( lda*n > BUFSIZE )
96 {
97 printf("Reference routine not set up for matrices so large\n");
98 exit(-1);
99 }
100 if ( layout == 102 )
101 {
102 /* printf("Column major\n"); */
103 for ( j = 0; j < n; j++ )
104 {
105 for ( i = 0; i < m; i++ )
106 {
107 Atemp[i+j*m] = A[i*VLEN+j*lda*VLEN];
108 }
109 }
110 } else {
111 /* printf("Row major\n"); */
112 for ( j = 0; j < n; j++ )
113 {
114 for ( i = 0; i < m; i++ )
115 {
116 /* Transpose the data */
117 Atemp[i+j*m] = A[j*VLEN+i*lda*VLEN];
118 }
119 }
120 }
121 }
122
123 LIBXSMM_INLINE
dcopy_from_temp(int layout,double * A,int lda,int m,int n,double * Atemp,unsigned int VLEN)124 void dcopy_from_temp ( int layout, double *A, int lda, int m, int n, double *Atemp,
125 unsigned int VLEN )
126 {
127 int i, j, ia;
128
129 if ( lda*n > BUFSIZE )
130 {
131 printf("Reference routine not set up for matrices so large\n");
132 }
133 if ( layout == 102 )
134 {
135 for ( j = 0; j < n; j++ )
136 {
137 for ( i = 0, ia = 0; i < m; i++, ia+=VLEN )
138 {
139 A[ia+j*lda*VLEN] = Atemp[i+j*m];
140 }
141 }
142 } else {
143 for ( j = 0; j < n; j++ )
144 {
145 for ( i = 0; i < m; i++ )
146 {
147 /* Transpose the data */
148 A[j*VLEN+i*lda*VLEN] = Atemp[i+j*m];
149 }
150 }
151 }
152 }
153
154 LIBXSMM_INLINE
scopy_from_temp(int layout,float * A,int lda,int m,int n,float * Atemp,unsigned int VLEN)155 void scopy_from_temp ( int layout, float *A, int lda, int m, int n, float *Atemp,
156 unsigned int VLEN )
157 {
158 int i, j, ia;
159
160 if ( lda*n > BUFSIZE )
161 {
162 printf("Reference routine not set up for matrices so large\n");
163 }
164 if ( layout == 102 )
165 {
166 for ( j = 0; j < n; j++ )
167 {
168 for ( i = 0, ia = 0; i < m; i++, ia+=VLEN )
169 {
170 A[ia+j*lda*VLEN] = Atemp[i+j*m];
171 }
172 }
173 } else {
174 for ( j = 0; j < n; j++ )
175 {
176 for ( i = 0; i < m; i++ )
177 {
178 /* Transpose the data */
179 A[j*VLEN+i*lda*VLEN] = Atemp[i+j*m];
180 }
181 }
182 }
183 }
184
185 void
show_real_matrix(unsigned int m,unsigned int n,double * A,unsigned int lda)186 show_real_matrix ( unsigned int m, unsigned int n, double *A, unsigned int lda )
187 {
188 unsigned int i, j;
189
190 for ( i = 1; i <= m; i++ )
191 {
192 for ( j = 1; j <= n; j++ )
193 {
194 printf("%g ",A[(j-1)*lda+(i-1)]);
195 }
196 printf("\n");
197 }
198 }
199
200 #if !defined(USE_MKL_FOR_REFERENCE) && !defined(LIBXSMM_NOFORTRAN) && (!defined(__BLAS) || (0 != __BLAS))
201 extern void dgetrf_();
202 extern void dgetrfnp_();
203
204 /* Reference code for compact dgetrf. Note that this just copies data into
205 a buffer from the compact storage and calls the regular dgetrf code. This
206 is very naive reference code just used for testing purposes */
207 LIBXSMM_INLINE
compact_dgetrf_(unsigned int * layout,unsigned int * m,unsigned int * n,double * A,unsigned int * lda,unsigned int * nmat,unsigned int * VLEN)208 void compact_dgetrf_ ( unsigned int *layout, unsigned int *m,
209 unsigned int *n, double *A, unsigned int *lda,
210 unsigned int *nmat, unsigned int *VLEN )
211 {
212 unsigned int i, j, num, info, col;
213 double *Ap, Atemp[BUFSIZE];
214 static int ntimes = 0;
215
216 if ( ++ntimes < 3 ) printf("Inside reference compact_dgetrf_()\n");
217 if ( ++ntimes < 3 ) printf("layout=%d m=%d n=%d lda=%d nmat=%d VLEN=%d\n",*layout,*m,*n,*lda,*nmat,*VLEN);
218
219 if ( *layout == 102 ) col = *n; else col = *m;
220
221 for ( i = 0, num = 0; i < (*nmat); i+= *VLEN, num++ )
222 {
223 for ( j = 0; j < *VLEN; j++ )
224 {
225 /* Unpack the data, call a reference DGETRF, repack the data */
226 Ap = &A[j+num*(*lda)*col*(*VLEN)];
227 if (++ntimes < 6 ) printf("Doing a dgetrf at place i=%d j=%d num=%d Ap[%d]=%g\n",i,j,num,j+num*(*lda)*col*(*VLEN),Ap[0]);
228 dcopy_to_temp ( *layout, Ap, *lda, *m, *n, Atemp, *VLEN );
229 #if 0
230 if ( *m <= 4 && *n <= 4 ) {
231 printf("Matrix with i=%d j=%d num=%d loc=%ld lda=%d\n",i,j,num,j+num*(*lda)*col*(*VLEN),*lda);
232 show_real_matrix ( *m, *n, Atemp, *m );
233 }
234 #endif
235 info = 0;
236 dgetrfnp_ ( m, n, Atemp, m, &info );
237 #if 0
238 if ( *m <= 4 && *n <= 4 ) {
239 printf("Result with i=%d j=%d num=%d loc=%ld\n",i,j,num,j+num*(*lda)*col*(*VLEN));
240 show_real_matrix ( *m, *n, Atemp, *m );
241 }
242 #endif
243 if ( info != 0 ) printf("*** BAD news reference code got info=%d in case i=%d num=%d j=%d\n",info,i,num,j);
244 dcopy_from_temp ( *layout, Ap, *lda, *m, *n, Atemp, *VLEN );
245 #if 0
246 printf("i=%d num=%d j=%d Ap[20]=%g\n",i,num,j,Ap[20]);
247 #endif
248 }
249 }
250 }
251
252 extern void sgetrf_();
253 extern void sgetrfnp_();
254
255 /* Reference code for compact sgetrf. Note that this just copies data into
256 a buffer from the compact storage and calls the regular sgetrf code. This
257 is very naive reference code just used for testing purposes */
258 /* Note: if layout==101 (row major), then this code is known to only work when
259 * nmat == VLEN. To check for accuracy otherwise, transpose everything */
260 LIBXSMM_INLINE
compact_sgetrf_(unsigned int * layout,unsigned int * m,unsigned int * n,float * A,unsigned int * lda,unsigned int * nmat,unsigned int * VLEN)261 void compact_sgetrf_ ( unsigned int *layout, unsigned int *m,
262 unsigned int *n, float *A, unsigned int *lda,
263 unsigned int *nmat, unsigned int *VLEN )
264 {
265 unsigned int i, j, num, info;
266 float *Ap, Atemp[BUFSIZE];
267 static int ntimes = 0;
268
269 if ( ++ntimes < 3 ) printf("Inside reference compact_sgetrf_()\n");
270 if ( ++ntimes < 3 ) printf("layout=%d VLEN=%d nmat=%d\n",*layout, *VLEN, *nmat );
271 for ( i = 0, num = 0; i < (*nmat); i+= *VLEN, num++ )
272 {
273 for ( j = 0; j < *VLEN; j++ )
274 {
275 /* Unpack the data, call a reference SGETRF, repack the data */
276 Ap = &A[j+num*(*lda)*(*n)*(*VLEN)];
277 if (++ntimes < 3 ) printf("Doing a sgetrf at place i=%d j=%d num=%d Ap[%d]=%g\n",i,j,num,j+num*(*lda)*(*n)*(*VLEN),Ap[0]);
278 scopy_to_temp ( *layout, Ap, *lda, *m, *n, Atemp, *VLEN );
279 sgetrfnp_ ( m, n, Atemp, m, &info );
280 if ( info != 0 ) printf("Bad news! Serial reference got info=%d\n",info);
281 scopy_from_temp ( *layout, Ap, *lda, *m, *n, Atemp, *VLEN );
282 }
283 }
284 }
285
286 #endif
287
288 #define DUPLICATE_ELEMENTS_ACROSS
289
290 LIBXSMM_INLINE
dfill_matrix(int layout,double * matrix,unsigned int nmat,unsigned int ld,unsigned int m,unsigned int n,unsigned int VLEN)291 void dfill_matrix ( int layout, double *matrix, unsigned int nmat, unsigned int ld, unsigned int m, unsigned int n, unsigned int VLEN )
292 {
293 unsigned int i, j, k, k1, row, col;
294 size_t address;
295 double dtmp = 0;
296
297 if ( layout == 102 ) { row = m; col = n; } else { row = n; col = m; }
298 if ( ld < row )
299 {
300 fprintf(stderr,"Error is dfill_matrix: ld=%u row=%u (m=%u n=%u) mismatched!\n",ld,row, m, n);
301 exit(-1);
302 }
303
304 for ( k1 = 1; k1 <= nmat/VLEN; k1++ ) {
305 for ( j = 1; j <= col; j++ ) {
306 for ( i = 1; i <= ld; i++ ) {
307 for ( k = 1; k <= VLEN; k++ ) {
308 address = (k1-1)*col*ld*VLEN + (j-1)*ld*VLEN + (i-1)*VLEN + (k-1);
309 #ifdef DUPLICATE_ELEMENTS_ACROSS
310 if ( k == 1 )
311 #endif
312 if ( i <= row ) dtmp = 1.0 - 2.0*libxsmm_rng_f64();
313 else dtmp = -99.9;
314 matrix [ address ] = dtmp;
315 }
316 }
317 }
318 }
319 }
320
321 LIBXSMM_INLINE
dfill_identity(double * matrix,unsigned int ld,unsigned int m,unsigned int n,int VLEN,int number_of_cases)322 void dfill_identity ( double *matrix, unsigned int ld, unsigned int m, unsigned int n, int VLEN, int number_of_cases )
323 {
324 unsigned int h, i, j, k, ia;
325 double dtmp = 0;
326
327 if ( ld < m ) {
328 fprintf(stderr,"Error in dfill_identity: ld=%u m=%u mismatched!\n",ld,m);
329 exit(-1);
330 }
331 for ( h = 0; h < (unsigned int)number_of_cases; h++ ) {
332 ia = h*ld*n*VLEN;
333 for ( j = 1; j <= n; j++ ) {
334 for ( i = 1; i <= ld; i++ ) {
335 if ( i == j ) dtmp = 1.0; else dtmp = 0.0;
336 for ( k = 0; k < (unsigned int)VLEN; k++ ) matrix[ia++] = dtmp;
337 }
338 }
339 }
340 }
341 LIBXSMM_INLINE
sfill_matrix(int layout,float * matrix,unsigned int nmat,unsigned int ld,unsigned int m,unsigned int n,unsigned int VLEN)342 void sfill_matrix ( int layout, float *matrix, unsigned int nmat, unsigned int ld, unsigned int m, unsigned int n, unsigned int VLEN )
343 {
344 unsigned int i, j, k, k1, row, col;
345 size_t address;
346 double dtmp = 0;
347
348 if ( layout == 102 ) { row = m; col = n; } else { row = n; col = m; }
349 if ( ld < row )
350 {
351 fprintf(stderr,"Error is sfill_matrix: ld=%u row=%u (m=%u n=%u) mismatched!\n",ld,row, m, n);
352 exit(-1);
353 }
354
355 for ( k1 = 1; k1 <= nmat/VLEN; k1++ ) {
356 for ( j = 1; j <= col; j++ ) {
357 for ( i = 1; i <= ld; i++ ) {
358 for ( k = 1; k <= VLEN; k++ ) {
359 address = (k1-1)*col*ld*VLEN + (j-1)*ld*VLEN + (i-1)*VLEN + (k-1);
360 #ifdef DUPLICATE_ELEMENTS_ACROSS
361 if ( k == 1 )
362 #endif
363 if ( i <= row ) dtmp = 1.0 - 2.0*libxsmm_rng_f64();
364 else dtmp = -99.9;
365 matrix [ address ] = (float) dtmp;
366 }
367 }
368 }
369 }
370 }
371
372 LIBXSMM_INLINE
residual_s(unsigned int layout,float * A,unsigned int nmat,unsigned int VLEN,unsigned int lda,unsigned int m,unsigned int n,float * B,unsigned int ldb,unsigned int * nerrs,unsigned int * ncorr)373 double residual_s ( unsigned int layout, float *A,
374 unsigned int nmat, unsigned int VLEN,
375 unsigned int lda, unsigned int m, unsigned int n,
376 float *B, unsigned int ldb, unsigned int *nerrs,
377 unsigned int *ncorr )
378 {
379 unsigned int i, j, k, k1, row, col;
380 double atmp, btmp, dtmp, ref, derror;
381 static int ntimes = 0;
382 size_t address;
383
384 *nerrs = 0;
385 *ncorr = 0;
386 derror = 0.0;
387
388 if ( layout == 102 ) { row = m; col = n; } else { row = n; col = m; }
389
390 for ( k1 = 1; k1 <= nmat/VLEN; k1++ ) {
391 for ( j = 1; j <= col; j++ ) {
392 for ( i = 1; i <= row; i++ ) {
393 for ( k = 1; k <= VLEN; k++ ) {
394 address= (k1-1)*col*lda*VLEN + (j-1)*lda*VLEN + (i-1)*VLEN + (k-1);
395 atmp = (double) A[ address ];
396 address= (k1-1)*col*ldb*VLEN + (j-1)*ldb*VLEN + (i-1)*VLEN + (k-1);
397 btmp = (double) B[ address ];
398 ref = LIBXSMM_MAX(atmp,-atmp);
399 if ( atmp > btmp ) {
400 dtmp = atmp - btmp;
401 } else {
402 dtmp = btmp - atmp;
403 }
404 if ( isnan(dtmp) || isinf(dtmp) ) {
405 if ( ++ntimes < 15 ) {
406 printf("Denormal bug: A[%ld]=A(%u,%u,%u,%u) is %g B(%u,%u,%u,%u) is %g\n",address,k,i,j,k1,atmp,k,i,j,k1,btmp);
407 }
408 }
409 if ( (dtmp / ref > 1.0e-4) && (dtmp > 1.0e-7) ) {
410 *nerrs = *nerrs + 1;
411 if ( ++ntimes < 15 ) {
412 printf("Bug #%i: A[%ld]=A(%u,%u,%u,%u) expected=%g instead=%g err=%g\n",ntimes,address,k,i,j,k1,atmp,btmp,dtmp);
413 }
414 } else {
415 if ( (*nerrs > 0) && (ntimes < 10) && (*ncorr < 40) ) {
416 printf("Cor #%u: A[%ld]=A(%u,%u,%u,%u) expected=%g\n",*ncorr+1,address,k,i,j,k1,atmp);
417 }
418 *ncorr = *ncorr + 1;
419 }
420 derror += dtmp;
421 }
422 }
423 }
424 }
425 return ( derror );
426 }
427
428 LIBXSMM_INLINE
residual_d(unsigned int layout,double * A,unsigned int nmat,unsigned int VLEN,unsigned int lda,unsigned int m,unsigned int n,double * B,unsigned int ldb,unsigned int * nerrs,unsigned int * ncorr)429 double residual_d ( unsigned int layout, double *A,
430 unsigned int nmat, unsigned int VLEN,
431 unsigned int lda, unsigned int m, unsigned int n,
432 double *B, unsigned int ldb, unsigned int *nerrs,
433 unsigned int *ncorr )
434 {
435 unsigned int i, j;
436 double atmp, btmp, dtmp, ref, derror;
437 static int ntimes = 0;
438
439 *nerrs = 0;
440 *ncorr = 0;
441 derror = 0.0;
442 for ( j = 1; j<= n; j++ )
443 {
444 for ( i = 1; i <= m; i++ )
445 {
446 atmp = (double) A[ (j-1)*lda + (i-1)];
447 btmp = (double) B[ (j-1)*ldb + (i-1)];
448 ref = LIBXSMM_MAX(atmp,-atmp);
449 if ( atmp >= btmp ) {
450 dtmp = atmp - btmp;
451 } else {
452 dtmp = btmp - atmp;
453 }
454 if ( isnan(dtmp) || isinf(dtmp) )
455 {
456 if ( ++ntimes < 15 )
457 {
458 printf("Denormal bug: A(%u,%u) is %g B(%u,%u) is %g\n",i,j,atmp,i,j,btmp);
459 }
460 }
461 if ( (dtmp / ref > 1.0e-12) && (dtmp > 1.0e-15) )
462 {
463 *nerrs = *nerrs + 1;
464 if ( ++ntimes < 15 )
465 {
466 printf("Bug #%d: A(%u,%u) expected=%g instead=%g err=%g\n",ntimes,i,j,atmp,btmp,dtmp);
467 }
468 } else {
469 if ( (*nerrs > 0) && (ntimes < 10) && (*ncorr < 40) )
470 {
471 printf("Cor #%u: A(%u,%u) expected=%g\n",*ncorr+1,i,j,atmp);
472 }
473 *ncorr = *ncorr + 1;
474 }
475 derror += dtmp;
476 }
477 }
478 return ( derror );
479 }
480
481 #ifdef USE_PREDEFINED_ASSEMBLY
482 extern void getrf_();
483 #endif
484 #ifdef MKL_TIMER
485 extern double dsecnd_();
486 #endif
487
488 #if 1
489 #ifndef AVX2_TESTING
490 #define AVX2_TESTING
491 #endif
492 #else
493 #ifndef AVX512_TESTING
494 #define AVX512_TESTING
495 #endif
496 #endif
497 #if !defined(AVX2_TESTING) && !defined(AVX512_TESTING)
498 #define AVX2_TESTING
499 #endif
500 #if defined(AVX2_TESTING) && defined(AVX512_TESTING)
501 #error Compile with either AVX2_TESTING or AVX512_TESTING never both
502 #endif
503
504
main(int argc,char * argv[])505 int main(int argc, char* argv[])
506 {
507 unsigned int m=8, n=8, lda=8, ldb=8, nerrs, num, nmat, ntest;
508 unsigned int layout, asize, bsize;
509 #ifdef AVX512_TESTING
510 unsigned int VLEND=8, VLENS=16;
511 int arch=LIBXSMM_X86_AVX512_CORE;
512 #else
513 unsigned int VLEND=4, VLENS=8;
514 int arch=LIBXSMM_X86_AVX2;
515 #endif
516 unsigned int ncorr;
517 unsigned int i, j, large_entry;
518 char side='L', uplo='L', trans='N', diag='N';
519 float *sa, *sb, *sc, *sd;
520 double *da, *db, *dc, *dd, *tmpbuf;
521 double dalpha = 1.0;
522 float salpha;
523 double dtmp;
524 size_t sizea;
525 const unsigned char *cptr;
526 unsigned long op_count;
527 unsigned int typesize8 = 8;
528 const libxsmm_getrf_descriptor* desc8 = NULL;
529 #ifdef TEST_SINGLE
530 unsigned int typesize4 = 4;
531 const libxsmm_getrf_descriptor* desc4 = NULL;
532 #endif
533 #ifdef USE_XSMM_GENERATED
534 libxsmm_descriptor_blob blob;
535 libxsmm_getrf_xfunction mykernel = NULL;
536 #endif
537 #if defined(USE_KERNEL_GENERATION_DIRECTLY) && defined(__linux__)
538 void (*opcode_routine)();
539 unsigned char *routine_output;
540 libxsmm_generated_code io_generated_code;
541 int pagesize = sysconf(_SC_PAGE_SIZE);
542 if (pagesize == -1) fprintf(stderr,"sysconf pagesize\n");
543 routine_output = (unsigned char *) mmap(NULL,
544 BUFSIZE2, PROT_READ|PROT_WRITE,
545 MAP_PRIVATE|MAP_ANONYMOUS, 0,0);
546 if (mprotect(routine_output, BUFSIZE2,
547 PROT_EXEC | PROT_READ | PROT_WRITE ) == -1)
548 fprintf(stderr,"mprotect\n");
549 printf("Routine ready\n");
550 io_generated_code.generated_code = &routine_output[0];
551 io_generated_code.buffer_size = BUFSIZE2;
552 io_generated_code.code_size = 0;
553 io_generated_code.code_type = 2;
554 io_generated_code.last_error = 0;
555 io_generated_code.sf_size = 0;
556 #endif
557
558 printf("\nUSAGE: %s m n lda nmat layout ntest\n",argv[0]);
559 if ( argc <= 3 )
560 {
561 printf("Compact LU (GETRF, no pivots) a mxn matrix of leading dim lda\n");
562 printf("This will test the jit of 1 VLEN work of nmat at a time\n");
563 printf("Defaults: m=n=lda=nmat=8, layout=102 (col major), ntest=1\n");
564 }
565 if ( argc > 1 ) m = atoi(argv[1]); else m = 8;
566 if ( argc > 2 ) n = atoi(argv[2]); else n = 8;
567 if ( argc > 3 ) lda= atoi(argv[3]); else lda = 8;
568 if ( argc > 4 ) nmat = atoi(argv[4]); else nmat = 8;
569 if ( argc > 5 ) layout = atoi(argv[5]); else layout=102;
570 if ( argc > 6 ) ntest = atoi(argv[6]); else ntest = 1;
571 salpha = (float)dalpha;
572
573 m = LIBXSMM_MAX(m,1);
574 n = LIBXSMM_MAX(n,1);
575 ntest = LIBXSMM_MAX(ntest,1);
576 #ifdef TEST_SINGLE
577 nmat = LIBXSMM_MAX(VLENS,nmat - (nmat%VLENS));
578 #else
579 nmat = LIBXSMM_MAX(VLEND,nmat - (nmat%VLEND));
580 #endif
581 layout = LIBXSMM_MAX(LIBXSMM_MIN(layout,102),101);
582
583 if ( layout == 102 ) lda = LIBXSMM_MAX(lda,m);
584 else lda = LIBXSMM_MAX(lda,n);
585
586 if ( m >= n ) {
587 op_count = nmat * (double)n * (double)n * (3.0*(double)m-(double)n) / 3.0;
588 } else {
589 op_count = nmat * (double)m * (double)m * (3.0*(double)n-(double)m) / 3.0;
590 }
591
592 printf("This is a real*%d tester for JIT compact DGETRF kernels! (m=%u n=%u lda=%u layout=%d nmat=%d)\n",typesize8,m,n,lda,layout,nmat);
593 #ifdef USE_XSMM_GENERATED
594 printf("This code tests the LIBXSMM generated kernels\n");
595 #endif
596 #ifdef USE_PREDEFINED_ASSEMBLY
597 printf("This code tests some predefined assembly kernel\n");
598 #endif
599 #if defined(USE_KERNEL_GENERATION_DIRECTLY) && defined(__linux__)
600 printf("This code tests kernel generation directly\n");
601 #endif
602 #ifdef TIME_MKL
603 printf("This code tests MKL compact batch directly\n");
604 #endif
605 #ifdef AVX512_TESTING
606 printf("This binary tests only AVX512 codes\n");
607 #endif
608 #ifdef AVX2_TESTING
609 printf("This binary tests only AVX2 codes\n");
610 #endif
611
612 desc8 = libxsmm_getrf_descriptor_init(&blob, typesize8, m, n, lda, layout);
613 #ifdef TEST_SINGLE
614 desc4 = libxsmm_getrf_descriptor_init(&blob, typesize4, m, n, lda, layout);
615 #endif
616 #ifdef USE_XSMM_GENERATED
617 printf("calling libxsmm_dispatch_getrf: typesize8=%u\n",typesize8);
618 mykernel = libxsmm_dispatch_getrf(desc8);
619 printf("done calling libxsmm_dispatch_getrf: typesize8=%u\n",typesize8);
620 if ( mykernel == NULL ) printf("R8 Kernel after the create call is null\n");
621 #ifdef TEST_SINGLE
622 mykernel = libxsmm_dispatch_getrf(desc4);
623 if ( mykernel == NULL ) printf("R4 kernel after the create call is null\n");
624 #endif
625 #endif
626
627 #if defined(USE_KERNEL_GENERATION_DIRECTLY) && defined(__linux__)
628 libxsmm_generator_getrf_kernel( &io_generated_code, desc8, arch );
629 #endif
630
631 #ifndef NO_ACCURACY_CHECK
632 printf("mallocing matrices\n");
633 #endif
634 if ( layout == 102 ) sizea = lda*n*nmat;
635 else sizea = lda*m*nmat;
636
637 sa = (float *) malloc ( sizea*sizeof(float) );
638 da = (double *) malloc ( sizea*sizeof(double) );
639 sc = (float *) malloc ( sizea*sizeof(float) );
640 dc = (double *) malloc ( sizea*sizeof(double) );
641 sd = (float *) malloc ( sizea*sizeof(float) );
642 dd = (double *) malloc ( sizea*sizeof(double) );
643
644 large_entry = LIBXSMM_MIN(256,sizea);
645 large_entry = large_entry - (large_entry%16);
646 while ( large_entry > m*n*nmat ) {
647 large_entry /= 2;
648 }
649 large_entry = LIBXSMM_MAX(large_entry,4);
650
651 #ifndef NO_ACCURACY_CHECK
652 printf("filling matrices\n");
653 #endif
654 sfill_matrix ( layout, sa, nmat, lda, m, n, VLEND );
655 #ifdef TRIANGLE_IS_IDENTITY
656 printf("Warning: setting triangular matrix to identity. Not good for accuracy testing\n");
657 dfill_identity ( da, lda, m, m, VLEND, nmat/VLEND );
658 #else
659 dfill_matrix ( layout, da, nmat, lda, m, n, VLEND );
660 #endif
661
662 #ifndef NO_ACCURACY_CHECK
663 for ( i = 0; i < sizea; i++ ) sc[i]=sa[i];
664 for ( i = 0; i < sizea; i++ ) dc[i]=da[i];
665 for ( i = 0; i < sizea; i++ ) sd[i]=sa[i];
666 for ( i = 0; i < sizea; i++ ) dd[i]=da[i];
667 printf("Pointing at the kernel now\n");
668 #endif
669
670 #ifdef USE_XSMM_GENERATED
671 cptr = (const unsigned char*) mykernel;
672 #endif
673 #ifdef USE_PREDEFINED_ASSEMBLY
674 cptr = (const unsigned char*) getrf_;
675 #endif
676 #if defined(USE_KERNEL_GENERATION_DIRECTLY) && defined(__linux__)
677 cptr = (const unsigned char*) &routine_output[0];
678 opcode_routine = (void *) &cptr[0];
679 #endif
680
681 #ifndef TIME_MKL
682 # define DUMP_ASSEMBLY_FILE
683 #endif
684
685 #ifdef DUMP_ASSEMBLY_FILE
686 #define ASSEMBLY_DUMP_SIZE 4000
687 printf("Dumping assembly file (first %d bytes)\n",ASSEMBLY_DUMP_SIZE);
688 FILE *fp = fopen("foo.s","w");
689 char buffer[80];
690 fputs("\t.text\n",fp);
691 fputs("\t.align 256\n",fp);
692 fputs("\t.globl getrf_\n",fp);
693 fputs("getrf_:\n",fp);
694 for (i = 0; i < ASSEMBLY_DUMP_SIZE; i+=4 )
695 {
696 sprintf(buffer,".byte 0x%02x, 0x%02x, 0x%02x, 0x%02x\n",cptr[i],cptr[i+1],cptr[i+2],cptr[i+3]);
697 fputs(buffer,fp);
698 }
699 fputs("\tretq\n",fp);
700 fputs("\t.type getrf_,@function\n",fp);
701 fputs("\t.size getrf_,.-getrf_\n",fp);
702 fclose(fp);
703 #endif
704
705 #if defined(USE_MKL_FOR_REFERENCE) || defined(TIME_MKL)
706 # include <mkl.h>
707 int info;
708 MKL_LAYOUT CLAYOUT = (layout == 101) ? MKL_ROW_MAJOR : MKL_COL_MAJOR;
709 MKL_SIDE SIDE = (side == 'R' || side == 'r') ? MKL_RIGHT : MKL_LEFT;
710 MKL_UPLO UPLO = (uplo == 'U' || uplo == 'u') ? MKL_UPPER : MKL_LOWER;
711 MKL_TRANSPOSE TRANSA = (trans == 'N' || trans == 'n') ? MKL_NOTRANS : MKL_TRANS;
712 MKL_DIAG DIAG = (diag == 'N' || diag == 'n') ? MKL_NONUNIT : MKL_UNIT;
713 MKL_COMPACT_PACK CMP_FORMAT = mkl_get_format_compact();
714 #if 0
715 MKL_COMPACT_PACK CMP_FORMAT = MKL_COMPACT_AVX;
716 #endif
717 #endif
718
719 #ifndef NO_ACCURACY_CHECK
720 printf("Before routine, initial A(1,1)=%g A[%d]=%g\n",da[0],large_entry,da[large_entry]);
721 #endif
722 #ifdef USE_PREDEFINED_ASSEMBLY
723 double one = 1.0;
724 #endif
725 double timer, firsttime = 0;
726 #ifdef MKL_TIMER
727 double tmptimer;
728 tmptimer = dsecnd_();
729 #else
730 unsigned long long l_start, l_end;
731 #endif
732
733 timer = 0.0;
734 for ( j = 0; j < (int)ntest; j++ )
735 {
736 #ifndef TRIANGLE_IS_IDENTITY
737 for ( i = 0; i < (int)sizea; i++ ) dc[i]=da[i];
738 #endif
739 for ( i = 0 , num = 0; i < (int)nmat; i+= (int)VLEND, num++ )
740 {
741 double *Ap;
742
743 if ( layout == 102 ) Ap = &dc[num*lda*n*VLEND];
744 else Ap = &dc[num*lda*m*VLEND];
745 #ifdef MKL_TIMER
746 tmptimer = dsecnd_();
747 #else
748 l_start = libxsmm_timer_tick();
749 #endif
750
751 #if !defined(USE_XSMM_GENERATED) && !defined(USE_PREDEFINED_ASSEMBLY) && !defined(USE_KERNEL_GENERATION_DIRECTLY) && !defined(TIME_MKL) && !defined(USE_PREDEFINED_ASSEMBLY_XCT)
752 gen_compact_dgetrf_ ( &layout, &m, &n, Ap, &lda, &VLEND );
753 #endif
754 #ifdef USE_XSMM_GENERATED
755 mykernel ( Ap, Ap, NULL );
756 #endif
757 #ifdef USE_PREDEFINED_ASSEMBLY
758 getrf_ ( Ap, Ap, &one );
759 #endif
760 #ifdef USE_KERNEL_GENERATION_DIRECTLY
761 (*opcode_routine)( Ap );
762 #endif
763 #ifdef TIME_MKL
764 #if 1
765 info = 0;
766 mkl_dgetrfnp_compact ( CLAYOUT, m, n, dc, lda, &info, CMP_FORMAT, nmat );
767 i+=nmat; /* Because MKL will do everything */
768 #else
769 mkl_dgetrfnp_compact ( CLAYOUT, m, n, Ap, lda, &info, CMP_FORMAT, VLEND );
770 #endif
771 #endif
772 #ifdef USE_PREDEFINED_ASSEMBLY_XCT
773 getrf_xct_ ( Ap, &one );
774 #endif
775 #ifdef MKL_TIMER
776 dtmp = dsecnd_() - tmptimer;
777 #else
778 l_end = libxsmm_timer_tick();
779 dtmp = libxsmm_timer_duration(l_start,l_end);
780 #endif
781 if ( j == 0 ) firsttime=dtmp;
782 timer += dtmp;
783 }
784 }
785 if ( ntest >= 100 ) {
786 /* Skip the first timing: super necessary if using MKL */
787 timer = (timer-firsttime)/((double)(ntest-1));
788 } else {
789 timer /= ((double)ntest);
790 }
791
792 #ifndef NO_ACCURACY_CHECK
793 printf("Average time to get through %u matrices: %g\n",nmat,timer);
794 printf("Gflops: %g\n",(double)op_count/(timer*1.0e9));
795 printf("after routine, new C(1,1)=%g C[%d]=%g\n",dc[0],large_entry,dc[large_entry]);
796 #endif
797
798 #ifdef TEST_SINGLE
799 printf("Before r4 routine, initial C(1,1)=%g C[%d]=%g\n",sc[0],large_entry,sc[large_entry]);
800
801 for ( i = 0 , num = 0; i < nmat; i+= VLENS, num++ )
802 {
803 float *Ap;
804 if ( layout == 102 ) Ap = &sc[num*lda*n*VLENS];
805 else Ap = &sc[num*lda*m*VLENS];
806 #ifdef USE_XSMM_GENERATED
807 mykernel ( Ap, Ap, NULL );
808 #endif
809 #ifdef USE_KERNEL_GENERATION_DIRECTLY
810 (*opcode_routine)( Ap );
811 #endif
812 #ifdef TIME_MKL
813 info = 0;
814 mkl_sgetrfnp_compact ( CLAYOUT, m, n, sc, lda, &info, CMP_FORMAT, nmat );
815 i+=nmat; /* Because MKL will do everything */
816 #endif
817 }
818 printf("after r4 routine, new C(1,1)=%g C[%d]=%g\n",dc[0],large_entry,dc[large_entry]);
819 #endif
820
821 #ifndef NO_ACCURACY_CHECK
822 /* Call some reference code now on a copy of the B matrix (C) */
823 double timer2 = 0.0;
824 for ( j = 0; j < (int)ntest; j++ )
825 {
826 #ifndef TRIANGLE_IS_IDENTITY
827 for ( i = 0; i < (int)sizea; i++ ) dd[i]=da[i];
828 #endif
829
830 #ifdef MKL_TIMER
831 tmptimer = dsecnd_();
832 #else
833 l_start = libxsmm_timer_tick();
834 #endif
835
836 #if !defined(USE_MKL_FOR_REFERENCE) && !defined(LIBXSMM_NOFORTRAN) && (!defined(__BLAS) || (0 != __BLAS))
837 compact_dgetrf_ ( &layout, &m, &n, dd, &lda, &nmat, &VLEND );
838 #elif defined(USE_MKL_FOR_REFERENCE)
839 mkl_dgetrfnp_compact ( CLAYOUT, m, n, dd, lda, info, CMP_FORMAT, nmat );
840 #endif
841
842 #ifdef MKL_TIMER
843 timer2 += dsecnd_() - tmptimer;
844 #else
845 l_end = libxsmm_timer_tick();
846 timer2 += libxsmm_timer_duration(l_start,l_end);
847 #endif
848
849 }
850 timer2 /= ((double)ntest);
851 printf("Reference time=%g Reference Gflops=%g\n",timer2,op_count/(timer2*1.0e9));
852
853 #ifndef TEST_SINGLE
854 /* Compute the residual between B and C */
855 dtmp = residual_d ( layout, dc, nmat, VLEND, lda, m, n, dd, lda, &nerrs, &ncorr );
856 printf("R8 m=%u n=%u lda=%u error: %g number of errors: %u corrects: %u",m,n,lda,dtmp,nerrs,ncorr);
857 if ( nerrs > 0 ) printf(" ->FAILED at %ux%u real*8 %u case",m,n,layout);
858 printf("\n");
859 #endif
860
861 #ifdef TEST_SINGLE
862 /* Call some reference code now on a copy of the B matrix (C) */
863 for ( i = 0; i < lda*n*nmat; i++ ) sd[i]=sa[i];
864 compact_sgetrf_ ( &layout, &m, &n, sd, &lda, &nmat, &VLENS );
865 /* Compute the residual between C and D */
866 dtmp = residual_s ( layout, sc, nmat, VLENS, lda, m, n, sd, lda, &nerrs, &ncorr );
867 printf("float m=%u n=%u lda=%u error: %g number of errors: %u corrects: %u\n",m,n,lda,dtmp,nerrs,ncorr);
868 if ( nerrs > 0 ) printf(" ->FAILED at %ux%u real*4 case",m,n);
869 printf("\n");
870 #endif
871
872 #else
873 for ( j = 0, nerrs = 0; j < lda*n*nmat; j++ )
874 {
875 if ( isnan(dc[j]) || isinf(dc[j]) )
876 {
877 if ( ++nerrs < 10 )
878 {
879 printf("WARNING: dc[%d]=%g\n",j,dc[j]);
880 }
881 }
882 }
883 printf("%g,real*8 m=%u n=%u lda=%u Denormals=%u Time=%g Gflops=%g",op_count/(timer*1.0e9),m,n,lda,nerrs,timer,op_count/(timer*1.0e9));
884 if ( nerrs > 0 ) printf(" -> FAILED at %ux%u real*8 case",m,n);
885 printf("\n");
886 #endif
887
888 free(dd);
889 free(sd);
890 free(dc);
891 free(sc);
892 free(da);
893 free(sa);
894
895 return 0;
896 }
897
898