1 /******************************************************************************
2 * Copyright (c) Intel Corporation - All rights reserved.                      *
3 * This file is part of the LIBXSMM library.                                   *
4 *                                                                             *
5 * For information on the license, see the LICENSE file.                       *
6 * Further information: https://github.com/hfp/libxsmm/                        *
7 * SPDX-License-Identifier: BSD-3-Clause                                       *
8 ******************************************************************************/
9 /* Greg Henry, Hans Pabst, Alexander Heinecke (Intel Corp.)
10 ******************************************************************************/
11 #if 0
12 #define USE_KERNEL_GENERATION_DIRECTLY
13 #endif
14 #if 0
15 #define USE_PREDEFINED_ASSEMBLY
16 #define USE_XSMM_GENERATED
17 #define TIME_MKL
18 #endif
19 #if 0
20 #define TEST_SINGLE
21 #endif
22 
23 #if !defined(USE_PREDEFINED_ASSEMBLY) && !defined(USE_XSMM_GENERATED) && !defined(TIME_MKL) && \
24    (!defined(__linux__) || !defined(USE_KERNEL_GENERATION_DIRECTLY))
25 # define USE_XSMM_GENERATED
26 # include <libxsmm.h>
27 #else
28 # include <libxsmm_source.h>
29 # include <unistd.h>
30 # include <signal.h>
31 # include <malloc.h>
32 # include <sys/mman.h>
33 #endif
34 #include <stdlib.h>
35 #include <string.h>
36 #include <stdio.h>
37 #include <math.h>
38 
39 #define BUFSIZE 32*32
40 #define BUFSIZE2 64000
41 
42 #if 0
43 #define TRIANGLE_IS_IDENTITY
44 #endif
45 
46 LIBXSMM_INLINE
dcopy_to_temp(int layout,double * A,int lda,int m,int n,double * Atemp,unsigned int VLEN)47 void dcopy_to_temp ( int layout, double *A, int lda, int m, int n, double *Atemp,
48                      unsigned int VLEN )
49 {
50     int i, j;
51 
52     if ( lda*n > BUFSIZE )
53     {
54        printf("Reference routine not set up for matrices so large\n");
55        exit(-1);
56     }
57     if ( layout == 102 )
58     {
59        /* printf("Column major\n"); */
60        for ( j = 0; j < n; j++ )
61        {
62           for ( i = 0; i < m; i++ )
63           {
64              Atemp[i+j*m] = A[i*VLEN+j*lda*VLEN];
65           }
66        }
67 #if EVENTUALLY_USE_THIS_LOOP_IT_SHOULD_BE_FASTER
68        for ( j = 0; j < n; j++ )
69        {
70           for ( i = 0, ia = 0; i < m; i++, ia+=VLEN )
71           {
72              Atemp[i+j*m] = A[ ia+j*lda*VLEN ];
73           }
74        }
75 #endif
76     } else {
77        /* printf("Row major\n"); */
78        for ( j = 0; j < n; j++ )
79        {
80           for ( i = 0; i < m; i++ )
81           {
82              /* Transpose the data */
83              Atemp[i+j*m] = A[j*VLEN+i*lda*VLEN];
84           }
85        }
86     }
87 }
88 
89 LIBXSMM_INLINE
scopy_to_temp(int layout,float * A,int lda,int m,int n,float * Atemp,unsigned int VLEN)90 void scopy_to_temp ( int layout, float *A, int lda, int m, int n, float *Atemp,
91                      unsigned int VLEN )
92 {
93     int i, j;
94 
95     if ( lda*n > BUFSIZE )
96     {
97        printf("Reference routine not set up for matrices so large\n");
98        exit(-1);
99     }
100     if ( layout == 102 )
101     {
102        /* printf("Column major\n"); */
103        for ( j = 0; j < n; j++ )
104        {
105           for ( i = 0; i < m; i++ )
106           {
107              Atemp[i+j*m] = A[i*VLEN+j*lda*VLEN];
108           }
109        }
110     } else {
111        /* printf("Row major\n"); */
112        for ( j = 0; j < n; j++ )
113        {
114           for ( i = 0; i < m; i++ )
115           {
116              /* Transpose the data */
117              Atemp[i+j*m] = A[j*VLEN+i*lda*VLEN];
118           }
119        }
120     }
121 }
122 
123 LIBXSMM_INLINE
dcopy_from_temp(int layout,double * A,int lda,int m,int n,double * Atemp,unsigned int VLEN)124 void dcopy_from_temp ( int layout, double *A, int lda, int m, int n, double *Atemp,
125                        unsigned int VLEN )
126 {
127     int i, j, ia;
128 
129     if ( lda*n > BUFSIZE )
130     {
131        printf("Reference routine not set up for matrices so large\n");
132     }
133     if ( layout == 102 )
134     {
135        for ( j = 0; j < n; j++ )
136        {
137           for ( i = 0, ia = 0; i < m; i++, ia+=VLEN )
138           {
139              A[ia+j*lda*VLEN] = Atemp[i+j*m];
140           }
141        }
142     } else {
143        for ( j = 0; j < n; j++ )
144        {
145           for ( i = 0; i < m; i++ )
146           {
147              /* Transpose the data */
148              A[j*VLEN+i*lda*VLEN] = Atemp[i+j*m];
149           }
150        }
151     }
152 }
153 
154 LIBXSMM_INLINE
scopy_from_temp(int layout,float * A,int lda,int m,int n,float * Atemp,unsigned int VLEN)155 void scopy_from_temp ( int layout, float *A, int lda, int m, int n, float *Atemp,
156                        unsigned int VLEN )
157 {
158     int i, j, ia;
159 
160     if ( lda*n > BUFSIZE )
161     {
162        printf("Reference routine not set up for matrices so large\n");
163     }
164     if ( layout == 102 )
165     {
166        for ( j = 0; j < n; j++ )
167        {
168           for ( i = 0, ia = 0; i < m; i++, ia+=VLEN )
169           {
170              A[ia+j*lda*VLEN] = Atemp[i+j*m];
171           }
172        }
173     } else {
174        for ( j = 0; j < n; j++ )
175        {
176           for ( i = 0; i < m; i++ )
177           {
178              /* Transpose the data */
179              A[j*VLEN+i*lda*VLEN] = Atemp[i+j*m];
180           }
181        }
182     }
183 }
184 
185 void
show_real_matrix(unsigned int m,unsigned int n,double * A,unsigned int lda)186 show_real_matrix ( unsigned int m, unsigned int n, double *A, unsigned int lda )
187 {
188     unsigned int i, j;
189 
190     for ( i = 1; i <= m; i++ )
191     {
192        for ( j = 1; j <= n; j++ )
193        {
194           printf("%g ",A[(j-1)*lda+(i-1)]);
195        }
196        printf("\n");
197     }
198 }
199 
200 #if !defined(USE_MKL_FOR_REFERENCE) && !defined(LIBXSMM_NOFORTRAN) && (!defined(__BLAS) || (0 != __BLAS))
201 extern void dgetrf_();
202 extern void dgetrfnp_();
203 
204 /* Reference code for compact dgetrf. Note that this just copies data into
205    a buffer from the compact storage and calls the regular dgetrf code. This
206    is very naive reference code just used for testing purposes */
207 LIBXSMM_INLINE
compact_dgetrf_(unsigned int * layout,unsigned int * m,unsigned int * n,double * A,unsigned int * lda,unsigned int * nmat,unsigned int * VLEN)208 void compact_dgetrf_ ( unsigned int *layout, unsigned int *m,
209                       unsigned int *n, double *A, unsigned int *lda,
210                       unsigned int *nmat, unsigned int *VLEN )
211 {
212     unsigned int i, j, num, info, col;
213     double *Ap, Atemp[BUFSIZE];
214     static int ntimes = 0;
215 
216     if ( ++ntimes < 3 ) printf("Inside reference compact_dgetrf_()\n");
217     if ( ++ntimes < 3 ) printf("layout=%d m=%d n=%d lda=%d nmat=%d VLEN=%d\n",*layout,*m,*n,*lda,*nmat,*VLEN);
218 
219     if ( *layout == 102 ) col = *n; else col = *m;
220 
221     for ( i = 0, num = 0; i < (*nmat); i+= *VLEN, num++ )
222     {
223        for ( j = 0; j < *VLEN; j++ )
224        {
225            /* Unpack the data, call a reference DGETRF, repack the data */
226            Ap = &A[j+num*(*lda)*col*(*VLEN)];
227 if (++ntimes < 6 ) printf("Doing a dgetrf at place i=%d j=%d num=%d Ap[%d]=%g\n",i,j,num,j+num*(*lda)*col*(*VLEN),Ap[0]);
228            dcopy_to_temp ( *layout, Ap, *lda, *m, *n, Atemp, *VLEN );
229 #if 0
230            if ( *m <= 4 && *n <= 4 ) {
231               printf("Matrix with i=%d j=%d num=%d loc=%ld lda=%d\n",i,j,num,j+num*(*lda)*col*(*VLEN),*lda);
232               show_real_matrix ( *m, *n, Atemp, *m );
233            }
234 #endif
235            info = 0;
236            dgetrfnp_ ( m, n, Atemp, m, &info );
237 #if 0
238            if ( *m <= 4 && *n <= 4 ) {
239               printf("Result with i=%d j=%d num=%d loc=%ld\n",i,j,num,j+num*(*lda)*col*(*VLEN));
240               show_real_matrix ( *m, *n, Atemp, *m );
241            }
242 #endif
243            if ( info != 0 ) printf("*** BAD news reference code got info=%d in case i=%d num=%d j=%d\n",info,i,num,j);
244            dcopy_from_temp ( *layout, Ap, *lda, *m, *n, Atemp, *VLEN );
245 #if 0
246            printf("i=%d num=%d j=%d Ap[20]=%g\n",i,num,j,Ap[20]);
247 #endif
248        }
249     }
250 }
251 
252 extern void sgetrf_();
253 extern void sgetrfnp_();
254 
255 /* Reference code for compact sgetrf. Note that this just copies data into
256    a buffer from the compact storage and calls the regular sgetrf code. This
257    is very naive reference code just used for testing purposes */
258 /* Note: if layout==101 (row major), then this code is known to only work when
259  *        nmat == VLEN. To check for accuracy otherwise, transpose everything */
260 LIBXSMM_INLINE
compact_sgetrf_(unsigned int * layout,unsigned int * m,unsigned int * n,float * A,unsigned int * lda,unsigned int * nmat,unsigned int * VLEN)261 void compact_sgetrf_ ( unsigned int *layout, unsigned int *m,
262                       unsigned int *n, float *A, unsigned int *lda,
263                       unsigned int *nmat, unsigned int *VLEN )
264 {
265     unsigned int i, j, num, info;
266     float *Ap, Atemp[BUFSIZE];
267     static int ntimes = 0;
268 
269     if ( ++ntimes < 3 ) printf("Inside reference compact_sgetrf_()\n");
270     if ( ++ntimes < 3 ) printf("layout=%d VLEN=%d nmat=%d\n",*layout, *VLEN, *nmat );
271     for ( i = 0, num = 0; i < (*nmat); i+= *VLEN, num++ )
272     {
273        for ( j = 0; j < *VLEN; j++ )
274        {
275            /* Unpack the data, call a reference SGETRF, repack the data */
276            Ap = &A[j+num*(*lda)*(*n)*(*VLEN)];
277 if (++ntimes < 3 ) printf("Doing a sgetrf at place i=%d j=%d num=%d Ap[%d]=%g\n",i,j,num,j+num*(*lda)*(*n)*(*VLEN),Ap[0]);
278            scopy_to_temp ( *layout, Ap, *lda, *m, *n, Atemp, *VLEN );
279            sgetrfnp_ ( m, n, Atemp, m, &info );
280            if ( info != 0 ) printf("Bad news! Serial reference got info=%d\n",info);
281            scopy_from_temp ( *layout, Ap, *lda, *m, *n, Atemp, *VLEN );
282        }
283     }
284 }
285 
286 #endif
287 
288 #define DUPLICATE_ELEMENTS_ACROSS
289 
290 LIBXSMM_INLINE
dfill_matrix(int layout,double * matrix,unsigned int nmat,unsigned int ld,unsigned int m,unsigned int n,unsigned int VLEN)291 void dfill_matrix ( int layout, double *matrix, unsigned int nmat, unsigned int ld, unsigned int m, unsigned int n, unsigned int VLEN )
292 {
293   unsigned int i, j, k, k1, row, col;
294   size_t address;
295   double dtmp = 0;
296 
297   if ( layout == 102 ) { row = m; col = n; } else { row = n; col = m; }
298   if ( ld < row )
299   {
300      fprintf(stderr,"Error is dfill_matrix: ld=%u row=%u (m=%u n=%u) mismatched!\n",ld,row, m, n);
301      exit(-1);
302   }
303 
304   for ( k1 = 1; k1 <= nmat/VLEN; k1++ ) {
305      for ( j = 1; j <= col; j++ ) {
306         for ( i = 1; i <= ld; i++ ) {
307            for ( k = 1; k <= VLEN; k++ ) {
308               address = (k1-1)*col*ld*VLEN + (j-1)*ld*VLEN + (i-1)*VLEN + (k-1);
309 #ifdef DUPLICATE_ELEMENTS_ACROSS
310               if ( k == 1 )
311 #endif
312               if ( i <= row ) dtmp = 1.0 - 2.0*libxsmm_rng_f64();
313               else            dtmp = -99.9;
314               matrix [ address ] = dtmp;
315            }
316         }
317      }
318   }
319 }
320 
321 LIBXSMM_INLINE
dfill_identity(double * matrix,unsigned int ld,unsigned int m,unsigned int n,int VLEN,int number_of_cases)322 void dfill_identity ( double *matrix, unsigned int ld, unsigned int m, unsigned int n, int VLEN, int number_of_cases )
323 {
324   unsigned int h, i, j, k, ia;
325   double dtmp = 0;
326 
327   if ( ld < m ) {
328      fprintf(stderr,"Error in dfill_identity: ld=%u m=%u mismatched!\n",ld,m);
329      exit(-1);
330   }
331   for ( h = 0; h < (unsigned int)number_of_cases; h++ ) {
332      ia = h*ld*n*VLEN;
333      for ( j = 1; j <= n; j++ ) {
334         for ( i = 1; i <= ld; i++ ) {
335            if ( i == j ) dtmp = 1.0; else dtmp = 0.0;
336            for ( k = 0; k < (unsigned int)VLEN; k++ ) matrix[ia++] = dtmp;
337         }
338      }
339   }
340 }
341 LIBXSMM_INLINE
sfill_matrix(int layout,float * matrix,unsigned int nmat,unsigned int ld,unsigned int m,unsigned int n,unsigned int VLEN)342 void sfill_matrix ( int layout, float *matrix, unsigned int nmat, unsigned int ld, unsigned int m, unsigned int n, unsigned int VLEN )
343 {
344   unsigned int i, j, k, k1, row, col;
345   size_t address;
346   double dtmp = 0;
347 
348   if ( layout == 102 ) { row = m; col = n; } else { row = n; col = m; }
349   if ( ld < row )
350   {
351      fprintf(stderr,"Error is sfill_matrix: ld=%u row=%u (m=%u n=%u) mismatched!\n",ld,row, m, n);
352      exit(-1);
353   }
354 
355   for ( k1 = 1; k1 <= nmat/VLEN; k1++ ) {
356      for ( j = 1; j <= col; j++ ) {
357         for ( i = 1; i <= ld; i++ ) {
358            for ( k = 1; k <= VLEN; k++ ) {
359               address = (k1-1)*col*ld*VLEN + (j-1)*ld*VLEN + (i-1)*VLEN + (k-1);
360 #ifdef DUPLICATE_ELEMENTS_ACROSS
361               if ( k == 1 )
362 #endif
363               if ( i <= row ) dtmp = 1.0 - 2.0*libxsmm_rng_f64();
364               else            dtmp = -99.9;
365               matrix [ address ] = (float) dtmp;
366            }
367         }
368      }
369   }
370 }
371 
372 LIBXSMM_INLINE
residual_s(unsigned int layout,float * A,unsigned int nmat,unsigned int VLEN,unsigned int lda,unsigned int m,unsigned int n,float * B,unsigned int ldb,unsigned int * nerrs,unsigned int * ncorr)373 double residual_s ( unsigned int layout, float *A,
374                     unsigned int nmat, unsigned int VLEN,
375                     unsigned int lda, unsigned int m, unsigned int n,
376                     float *B, unsigned int ldb, unsigned int *nerrs,
377                     unsigned int *ncorr )
378 {
379    unsigned int i, j, k, k1, row, col;
380    double atmp, btmp, dtmp, ref, derror;
381    static int ntimes = 0;
382    size_t address;
383 
384    *nerrs = 0;
385    *ncorr = 0;
386    derror = 0.0;
387 
388    if ( layout == 102 ) { row = m; col = n; } else { row = n; col = m; }
389 
390    for ( k1 = 1; k1 <= nmat/VLEN; k1++ ) {
391       for ( j = 1; j <= col; j++ ) {
392          for ( i = 1; i <= row; i++ ) {
393             for ( k = 1; k <= VLEN; k++ ) {
394                address= (k1-1)*col*lda*VLEN + (j-1)*lda*VLEN + (i-1)*VLEN + (k-1);
395                atmp = (double) A[ address ];
396                address= (k1-1)*col*ldb*VLEN + (j-1)*ldb*VLEN + (i-1)*VLEN + (k-1);
397                btmp = (double) B[ address ];
398                ref = LIBXSMM_MAX(atmp,-atmp);
399                if ( atmp > btmp ) {
400                   dtmp = atmp - btmp;
401                } else {
402                   dtmp = btmp - atmp;
403                }
404                if ( isnan(dtmp) || isinf(dtmp) ) {
405                   if ( ++ntimes < 15 ) {
406                      printf("Denormal bug: A[%ld]=A(%u,%u,%u,%u) is %g B(%u,%u,%u,%u) is %g\n",address,k,i,j,k1,atmp,k,i,j,k1,btmp);
407                   }
408                }
409                if ( (dtmp / ref > 1.0e-4) && (dtmp > 1.0e-7) ) {
410                   *nerrs = *nerrs + 1;
411                   if ( ++ntimes < 15 ) {
412                      printf("Bug #%i: A[%ld]=A(%u,%u,%u,%u) expected=%g instead=%g err=%g\n",ntimes,address,k,i,j,k1,atmp,btmp,dtmp);
413                   }
414                } else {
415                   if ( (*nerrs > 0) && (ntimes < 10) && (*ncorr < 40) ) {
416                      printf("Cor #%u: A[%ld]=A(%u,%u,%u,%u) expected=%g\n",*ncorr+1,address,k,i,j,k1,atmp);
417                   }
418                   *ncorr = *ncorr + 1;
419                }
420                derror += dtmp;
421             }
422          }
423       }
424    }
425    return ( derror );
426 }
427 
428 LIBXSMM_INLINE
residual_d(unsigned int layout,double * A,unsigned int nmat,unsigned int VLEN,unsigned int lda,unsigned int m,unsigned int n,double * B,unsigned int ldb,unsigned int * nerrs,unsigned int * ncorr)429 double residual_d ( unsigned int layout, double *A,
430                     unsigned int nmat, unsigned int VLEN,
431                     unsigned int lda, unsigned int m, unsigned int n,
432                     double *B, unsigned int ldb, unsigned int *nerrs,
433                     unsigned int *ncorr )
434 {
435    unsigned int i, j;
436    double atmp, btmp, dtmp, ref, derror;
437    static int ntimes = 0;
438 
439    *nerrs = 0;
440    *ncorr = 0;
441    derror = 0.0;
442    for ( j = 1; j<= n; j++ )
443    {
444       for ( i = 1; i <= m; i++ )
445       {
446          atmp = (double) A[ (j-1)*lda + (i-1)];
447          btmp = (double) B[ (j-1)*ldb + (i-1)];
448          ref  = LIBXSMM_MAX(atmp,-atmp);
449          if ( atmp >= btmp ) {
450              dtmp = atmp - btmp;
451          } else {
452              dtmp = btmp - atmp;
453          }
454          if ( isnan(dtmp) || isinf(dtmp) )
455          {
456              if ( ++ntimes < 15 )
457              {
458                 printf("Denormal bug: A(%u,%u) is %g B(%u,%u) is %g\n",i,j,atmp,i,j,btmp);
459              }
460          }
461          if ( (dtmp / ref > 1.0e-12) && (dtmp > 1.0e-15) )
462          {
463              *nerrs = *nerrs + 1;
464              if ( ++ntimes < 15 )
465              {
466                 printf("Bug #%d: A(%u,%u) expected=%g instead=%g err=%g\n",ntimes,i,j,atmp,btmp,dtmp);
467              }
468          } else {
469              if ( (*nerrs > 0) && (ntimes < 10) && (*ncorr < 40) )
470              {
471                 printf("Cor #%u: A(%u,%u) expected=%g\n",*ncorr+1,i,j,atmp);
472              }
473              *ncorr = *ncorr + 1;
474          }
475          derror += dtmp;
476       }
477    }
478    return ( derror );
479 }
480 
481 #ifdef USE_PREDEFINED_ASSEMBLY
482 extern void getrf_();
483 #endif
484 #ifdef MKL_TIMER
485 extern double dsecnd_();
486 #endif
487 
488 #if 1
489   #ifndef AVX2_TESTING
490   #define AVX2_TESTING
491   #endif
492 #else
493   #ifndef AVX512_TESTING
494   #define AVX512_TESTING
495   #endif
496 #endif
497 #if !defined(AVX2_TESTING) && !defined(AVX512_TESTING)
498   #define AVX2_TESTING
499 #endif
500 #if defined(AVX2_TESTING) && defined(AVX512_TESTING)
501   #error Compile with either AVX2_TESTING or AVX512_TESTING never both
502 #endif
503 
504 
main(int argc,char * argv[])505 int main(int argc, char* argv[])
506 {
507   unsigned int m=8, n=8, lda=8, ldb=8, nerrs, num, nmat, ntest;
508   unsigned int layout, asize, bsize;
509 #ifdef AVX512_TESTING
510   unsigned int VLEND=8, VLENS=16;
511   int arch=LIBXSMM_X86_AVX512_CORE;
512 #else
513   unsigned int VLEND=4, VLENS=8;
514   int arch=LIBXSMM_X86_AVX2;
515 #endif
516   unsigned int ncorr;
517   unsigned int i, j, large_entry;
518   char side='L', uplo='L', trans='N', diag='N';
519   float  *sa, *sb, *sc, *sd;
520   double *da, *db, *dc, *dd, *tmpbuf;
521   double dalpha = 1.0;
522   float  salpha;
523   double dtmp;
524   size_t sizea;
525   const unsigned char *cptr;
526   unsigned long op_count;
527   unsigned int typesize8 = 8;
528   const libxsmm_getrf_descriptor* desc8 = NULL;
529 #ifdef TEST_SINGLE
530   unsigned int typesize4 = 4;
531   const libxsmm_getrf_descriptor* desc4 = NULL;
532 #endif
533 #ifdef USE_XSMM_GENERATED
534   libxsmm_descriptor_blob blob;
535   libxsmm_getrf_xfunction mykernel = NULL;
536 #endif
537 #if defined(USE_KERNEL_GENERATION_DIRECTLY) && defined(__linux__)
538   void (*opcode_routine)();
539   unsigned char *routine_output;
540   libxsmm_generated_code io_generated_code;
541   int pagesize = sysconf(_SC_PAGE_SIZE);
542   if (pagesize == -1) fprintf(stderr,"sysconf pagesize\n");
543   routine_output = (unsigned char *) mmap(NULL,
544                       BUFSIZE2, PROT_READ|PROT_WRITE,
545                       MAP_PRIVATE|MAP_ANONYMOUS, 0,0);
546   if (mprotect(routine_output, BUFSIZE2,
547                 PROT_EXEC | PROT_READ | PROT_WRITE ) == -1)
548       fprintf(stderr,"mprotect\n");
549   printf("Routine ready\n");
550   io_generated_code.generated_code = &routine_output[0];
551   io_generated_code.buffer_size = BUFSIZE2;
552   io_generated_code.code_size = 0;
553   io_generated_code.code_type = 2;
554   io_generated_code.last_error = 0;
555   io_generated_code.sf_size = 0;
556 #endif
557 
558   printf("\nUSAGE: %s m n lda nmat layout ntest\n",argv[0]);
559   if ( argc <= 3 )
560   {
561      printf("Compact LU (GETRF, no pivots) a mxn matrix of leading dim lda\n");
562      printf("This will test the jit of 1 VLEN work of nmat at a time\n");
563      printf("Defaults: m=n=lda=nmat=8, layout=102 (col major), ntest=1\n");
564   }
565   if ( argc > 1 ) m = atoi(argv[1]); else m = 8;
566   if ( argc > 2 ) n = atoi(argv[2]); else n = 8;
567   if ( argc > 3 ) lda= atoi(argv[3]); else lda = 8;
568   if ( argc > 4 ) nmat = atoi(argv[4]); else nmat = 8;
569   if ( argc > 5 ) layout = atoi(argv[5]); else layout=102;
570   if ( argc > 6 ) ntest = atoi(argv[6]); else ntest = 1;
571   salpha = (float)dalpha;
572 
573   m = LIBXSMM_MAX(m,1);
574   n = LIBXSMM_MAX(n,1);
575   ntest = LIBXSMM_MAX(ntest,1);
576 #ifdef TEST_SINGLE
577   nmat = LIBXSMM_MAX(VLENS,nmat - (nmat%VLENS));
578 #else
579   nmat = LIBXSMM_MAX(VLEND,nmat - (nmat%VLEND));
580 #endif
581   layout = LIBXSMM_MAX(LIBXSMM_MIN(layout,102),101);
582 
583   if ( layout == 102 ) lda = LIBXSMM_MAX(lda,m);
584   else                 lda = LIBXSMM_MAX(lda,n);
585 
586   if ( m >= n ) {
587      op_count = nmat * (double)n * (double)n * (3.0*(double)m-(double)n) / 3.0;
588   } else {
589      op_count = nmat * (double)m * (double)m * (3.0*(double)n-(double)m) / 3.0;
590   }
591 
592   printf("This is a real*%d tester for JIT compact DGETRF kernels! (m=%u n=%u lda=%u layout=%d nmat=%d)\n",typesize8,m,n,lda,layout,nmat);
593 #ifdef USE_XSMM_GENERATED
594   printf("This code tests the LIBXSMM generated kernels\n");
595 #endif
596 #ifdef USE_PREDEFINED_ASSEMBLY
597   printf("This code tests some predefined assembly kernel\n");
598 #endif
599 #if defined(USE_KERNEL_GENERATION_DIRECTLY) && defined(__linux__)
600   printf("This code tests kernel generation directly\n");
601 #endif
602 #ifdef TIME_MKL
603   printf("This code tests MKL compact batch directly\n");
604 #endif
605 #ifdef AVX512_TESTING
606   printf("This binary tests only AVX512 codes\n");
607 #endif
608 #ifdef AVX2_TESTING
609   printf("This binary tests only AVX2 codes\n");
610 #endif
611 
612   desc8 = libxsmm_getrf_descriptor_init(&blob, typesize8, m, n, lda, layout);
613 #ifdef TEST_SINGLE
614   desc4 = libxsmm_getrf_descriptor_init(&blob, typesize4, m, n, lda, layout);
615 #endif
616 #ifdef USE_XSMM_GENERATED
617   printf("calling libxsmm_dispatch_getrf: typesize8=%u\n",typesize8);
618   mykernel = libxsmm_dispatch_getrf(desc8);
619   printf("done calling libxsmm_dispatch_getrf: typesize8=%u\n",typesize8);
620   if ( mykernel == NULL ) printf("R8 Kernel after the create call is null\n");
621 #ifdef TEST_SINGLE
622   mykernel = libxsmm_dispatch_getrf(desc4);
623   if ( mykernel == NULL ) printf("R4 kernel after the create call is null\n");
624 #endif
625 #endif
626 
627 #if defined(USE_KERNEL_GENERATION_DIRECTLY) && defined(__linux__)
628   libxsmm_generator_getrf_kernel( &io_generated_code, desc8, arch );
629 #endif
630 
631 #ifndef NO_ACCURACY_CHECK
632   printf("mallocing matrices\n");
633 #endif
634   if ( layout == 102 ) sizea = lda*n*nmat;
635   else                 sizea = lda*m*nmat;
636 
637   sa  = (float  *) malloc ( sizea*sizeof(float) );
638   da  = (double *) malloc ( sizea*sizeof(double) );
639   sc  = (float  *) malloc ( sizea*sizeof(float) );
640   dc  = (double *) malloc ( sizea*sizeof(double) );
641   sd  = (float  *) malloc ( sizea*sizeof(float) );
642   dd  = (double *) malloc ( sizea*sizeof(double) );
643 
644   large_entry = LIBXSMM_MIN(256,sizea);
645   large_entry = large_entry - (large_entry%16);
646   while ( large_entry > m*n*nmat ) {
647      large_entry /= 2;
648   }
649   large_entry = LIBXSMM_MAX(large_entry,4);
650 
651 #ifndef NO_ACCURACY_CHECK
652   printf("filling matrices\n");
653 #endif
654   sfill_matrix ( layout, sa, nmat, lda, m, n, VLEND );
655 #ifdef TRIANGLE_IS_IDENTITY
656   printf("Warning: setting triangular matrix to identity. Not good for accuracy testing\n");
657   dfill_identity ( da, lda, m, m, VLEND, nmat/VLEND );
658 #else
659   dfill_matrix ( layout, da, nmat, lda, m, n, VLEND );
660 #endif
661 
662 #ifndef NO_ACCURACY_CHECK
663   for ( i = 0; i < sizea; i++ ) sc[i]=sa[i];
664   for ( i = 0; i < sizea; i++ ) dc[i]=da[i];
665   for ( i = 0; i < sizea; i++ ) sd[i]=sa[i];
666   for ( i = 0; i < sizea; i++ ) dd[i]=da[i];
667   printf("Pointing at the kernel now\n");
668 #endif
669 
670 #ifdef USE_XSMM_GENERATED
671   cptr = (const unsigned char*) mykernel;
672 #endif
673 #ifdef USE_PREDEFINED_ASSEMBLY
674   cptr = (const unsigned char*) getrf_;
675 #endif
676 #if defined(USE_KERNEL_GENERATION_DIRECTLY) && defined(__linux__)
677   cptr = (const unsigned char*) &routine_output[0];
678   opcode_routine = (void *) &cptr[0];
679 #endif
680 
681 #ifndef TIME_MKL
682 # define DUMP_ASSEMBLY_FILE
683 #endif
684 
685 #ifdef DUMP_ASSEMBLY_FILE
686   #define ASSEMBLY_DUMP_SIZE 4000
687   printf("Dumping assembly file (first %d bytes)\n",ASSEMBLY_DUMP_SIZE);
688   FILE *fp = fopen("foo.s","w");
689   char buffer[80];
690   fputs("\t.text\n",fp);
691   fputs("\t.align 256\n",fp);
692   fputs("\t.globl getrf_\n",fp);
693   fputs("getrf_:\n",fp);
694   for (i = 0; i < ASSEMBLY_DUMP_SIZE; i+=4 )
695   {
696      sprintf(buffer,".byte 0x%02x, 0x%02x, 0x%02x, 0x%02x\n",cptr[i],cptr[i+1],cptr[i+2],cptr[i+3]);
697      fputs(buffer,fp);
698   }
699   fputs("\tretq\n",fp);
700   fputs("\t.type getrf_,@function\n",fp);
701   fputs("\t.size getrf_,.-getrf_\n",fp);
702   fclose(fp);
703 #endif
704 
705 #if defined(USE_MKL_FOR_REFERENCE) || defined(TIME_MKL)
706 # include <mkl.h>
707   int info;
708   MKL_LAYOUT CLAYOUT = (layout == 101) ? MKL_ROW_MAJOR : MKL_COL_MAJOR;
709   MKL_SIDE SIDE = (side == 'R' || side == 'r') ? MKL_RIGHT : MKL_LEFT;
710   MKL_UPLO UPLO = (uplo == 'U' || uplo == 'u') ? MKL_UPPER : MKL_LOWER;
711   MKL_TRANSPOSE TRANSA = (trans == 'N' || trans == 'n') ? MKL_NOTRANS : MKL_TRANS;
712   MKL_DIAG DIAG = (diag == 'N' || diag == 'n') ? MKL_NONUNIT : MKL_UNIT;
713   MKL_COMPACT_PACK CMP_FORMAT = mkl_get_format_compact();
714 #if 0
715   MKL_COMPACT_PACK CMP_FORMAT = MKL_COMPACT_AVX;
716 #endif
717 #endif
718 
719 #ifndef NO_ACCURACY_CHECK
720   printf("Before routine, initial A(1,1)=%g A[%d]=%g\n",da[0],large_entry,da[large_entry]);
721 #endif
722 #ifdef USE_PREDEFINED_ASSEMBLY
723   double one = 1.0;
724 #endif
725   double timer, firsttime = 0;
726 #ifdef MKL_TIMER
727   double tmptimer;
728   tmptimer = dsecnd_();
729 #else
730   unsigned long long l_start, l_end;
731 #endif
732 
733   timer = 0.0;
734   for ( j = 0; j < (int)ntest; j++ )
735   {
736 #ifndef TRIANGLE_IS_IDENTITY
737   for ( i = 0; i < (int)sizea; i++ ) dc[i]=da[i];
738 #endif
739   for ( i = 0 , num = 0; i < (int)nmat; i+= (int)VLEND, num++ )
740   {
741      double *Ap;
742 
743      if ( layout == 102 ) Ap = &dc[num*lda*n*VLEND];
744      else                 Ap = &dc[num*lda*m*VLEND];
745 #ifdef MKL_TIMER
746      tmptimer = dsecnd_();
747 #else
748      l_start = libxsmm_timer_tick();
749 #endif
750 
751 #if !defined(USE_XSMM_GENERATED) && !defined(USE_PREDEFINED_ASSEMBLY) && !defined(USE_KERNEL_GENERATION_DIRECTLY) && !defined(TIME_MKL) && !defined(USE_PREDEFINED_ASSEMBLY_XCT)
752      gen_compact_dgetrf_ ( &layout, &m, &n, Ap, &lda, &VLEND );
753 #endif
754 #ifdef USE_XSMM_GENERATED
755      mykernel ( Ap, Ap, NULL );
756 #endif
757 #ifdef USE_PREDEFINED_ASSEMBLY
758      getrf_ ( Ap, Ap, &one );
759 #endif
760 #ifdef USE_KERNEL_GENERATION_DIRECTLY
761      (*opcode_routine)( Ap );
762 #endif
763 #ifdef TIME_MKL
764   #if 1
765      info = 0;
766      mkl_dgetrfnp_compact ( CLAYOUT, m, n, dc, lda, &info, CMP_FORMAT, nmat );
767      i+=nmat; /* Because MKL will do everything */
768   #else
769      mkl_dgetrfnp_compact ( CLAYOUT, m, n, Ap, lda, &info, CMP_FORMAT, VLEND );
770   #endif
771 #endif
772 #ifdef USE_PREDEFINED_ASSEMBLY_XCT
773      getrf_xct_ ( Ap, &one );
774 #endif
775 #ifdef MKL_TIMER
776      dtmp = dsecnd_() - tmptimer;
777 #else
778      l_end = libxsmm_timer_tick();
779      dtmp = libxsmm_timer_duration(l_start,l_end);
780 #endif
781      if ( j == 0 ) firsttime=dtmp;
782      timer += dtmp;
783   }
784   }
785   if ( ntest >= 100 ) {
786       /* Skip the first timing: super necessary if using MKL */
787       timer = (timer-firsttime)/((double)(ntest-1));
788   } else {
789       timer /= ((double)ntest);
790   }
791 
792 #ifndef NO_ACCURACY_CHECK
793   printf("Average time to get through %u matrices: %g\n",nmat,timer);
794   printf("Gflops: %g\n",(double)op_count/(timer*1.0e9));
795   printf("after routine, new      C(1,1)=%g C[%d]=%g\n",dc[0],large_entry,dc[large_entry]);
796 #endif
797 
798 #ifdef TEST_SINGLE
799   printf("Before r4 routine, initial C(1,1)=%g C[%d]=%g\n",sc[0],large_entry,sc[large_entry]);
800 
801   for ( i = 0 , num = 0; i < nmat; i+= VLENS, num++ )
802   {
803      float *Ap;
804      if ( layout == 102 ) Ap = &sc[num*lda*n*VLENS];
805      else                 Ap = &sc[num*lda*m*VLENS];
806 #ifdef USE_XSMM_GENERATED
807      mykernel ( Ap, Ap, NULL );
808 #endif
809 #ifdef USE_KERNEL_GENERATION_DIRECTLY
810      (*opcode_routine)( Ap );
811 #endif
812 #ifdef TIME_MKL
813      info = 0;
814      mkl_sgetrfnp_compact ( CLAYOUT, m, n, sc, lda, &info, CMP_FORMAT, nmat );
815      i+=nmat; /* Because MKL will do everything */
816 #endif
817   }
818   printf("after r4 routine, new      C(1,1)=%g C[%d]=%g\n",dc[0],large_entry,dc[large_entry]);
819 #endif
820 
821 #ifndef NO_ACCURACY_CHECK
822   /* Call some reference code now on a copy of the B matrix (C) */
823   double timer2 = 0.0;
824   for ( j = 0; j < (int)ntest; j++ )
825   {
826 #ifndef TRIANGLE_IS_IDENTITY
827   for ( i = 0; i < (int)sizea; i++ ) dd[i]=da[i];
828 #endif
829 
830 #ifdef MKL_TIMER
831   tmptimer = dsecnd_();
832 #else
833   l_start = libxsmm_timer_tick();
834 #endif
835 
836 #if !defined(USE_MKL_FOR_REFERENCE) && !defined(LIBXSMM_NOFORTRAN) && (!defined(__BLAS) || (0 != __BLAS))
837   compact_dgetrf_ ( &layout, &m, &n, dd, &lda, &nmat, &VLEND );
838 #elif defined(USE_MKL_FOR_REFERENCE)
839   mkl_dgetrfnp_compact ( CLAYOUT, m, n, dd, lda, info, CMP_FORMAT, nmat );
840 #endif
841 
842 #ifdef MKL_TIMER
843   timer2 += dsecnd_() - tmptimer;
844 #else
845   l_end = libxsmm_timer_tick();
846   timer2 += libxsmm_timer_duration(l_start,l_end);
847 #endif
848 
849   }
850   timer2 /= ((double)ntest);
851   printf("Reference time=%g Reference Gflops=%g\n",timer2,op_count/(timer2*1.0e9));
852 
853 #ifndef TEST_SINGLE
854   /* Compute the residual between B and C */
855   dtmp = residual_d ( layout, dc, nmat, VLEND, lda, m, n, dd, lda, &nerrs, &ncorr );
856   printf("R8 m=%u n=%u lda=%u error: %g number of errors: %u corrects: %u",m,n,lda,dtmp,nerrs,ncorr);
857   if ( nerrs > 0 ) printf(" ->FAILED at %ux%u real*8 %u case",m,n,layout);
858   printf("\n");
859 #endif
860 
861 #ifdef TEST_SINGLE
862   /* Call some reference code now on a copy of the B matrix (C) */
863   for ( i = 0; i < lda*n*nmat; i++ ) sd[i]=sa[i];
864   compact_sgetrf_ ( &layout, &m, &n, sd, &lda, &nmat, &VLENS );
865   /* Compute the residual between C and D */
866   dtmp = residual_s ( layout, sc, nmat, VLENS, lda, m, n, sd, lda, &nerrs, &ncorr );
867   printf("float m=%u n=%u lda=%u error: %g number of errors: %u corrects: %u\n",m,n,lda,dtmp,nerrs,ncorr);
868   if ( nerrs > 0 ) printf(" ->FAILED at %ux%u real*4 case",m,n);
869   printf("\n");
870 #endif
871 
872 #else
873   for ( j = 0, nerrs = 0; j < lda*n*nmat; j++ )
874   {
875      if ( isnan(dc[j]) || isinf(dc[j]) )
876      {
877         if ( ++nerrs < 10 )
878         {
879            printf("WARNING: dc[%d]=%g\n",j,dc[j]);
880         }
881      }
882   }
883   printf("%g,real*8 m=%u n=%u lda=%u Denormals=%u Time=%g Gflops=%g",op_count/(timer*1.0e9),m,n,lda,nerrs,timer,op_count/(timer*1.0e9));
884   if ( nerrs > 0 ) printf(" -> FAILED at %ux%u real*8 case",m,n);
885   printf("\n");
886 #endif
887 
888   free(dd);
889   free(sd);
890   free(dc);
891   free(sc);
892   free(da);
893   free(sa);
894 
895   return 0;
896 }
897 
898