1 /*
2     Copyright (c) 2005-2020 Intel Corporation
3 
4     Licensed under the Apache License, Version 2.0 (the "License");
5     you may not use this file except in compliance with the License.
6     You may obtain a copy of the License at
7 
8         http://www.apache.org/licenses/LICENSE-2.0
9 
10     Unless required by applicable law or agreed to in writing, software
11     distributed under the License is distributed on an "AS IS" BASIS,
12     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13     See the License for the specific language governing permissions and
14     limitations under the License.
15 */
16 
17 #include <string>
18 #include <cstring>
19 #include <cstdio>
20 #include <cmath>
21 #include <vector>
22 #include <map>
23 
24 #include "mkl_lapack.h"
25 #include "mkl.h"
26 
27 #include "tbb/tbb_config.h"
28 #include "tbb/flow_graph.h"
29 #include "tbb/tick_count.h"
30 #include "tbb/global_control.h"
31 
32 // Application command line arguments parsing
33 #include "../../common/utility/utility.h"
34 #include "../../common/utility/get_default_num_threads.h"
35 
36 /************************************************************
37  FORWARD DECLARATIONS
38 ************************************************************/
39 
40 /**********************************************
41  Read or generate a positive-definite matrix
42  -- reads from file if fname != NULL
43      -- sets n to matrix size
44      -- allocates and reads values in to A
45  -- otherwise generates a matrix
46      -- uses n to determine size
47      -- allocates and generates values in to A
48 **********************************************/
49 void matrix_init( double * &A, int &n, const char *fname );
50 
51 /**********************************************
52  Writes a lower triangular matrix to a file
53  -- first line of file is n
54  -- subsequently 1 row per line
55 **********************************************/
56 void matrix_write ( double *A, int n, const char *fname, bool is_triangular = false );
57 
58 /************************************************************
59  GLOBAL VARIABLES
60 ************************************************************/
61 bool g_benchmark_run = false;
62 int g_n = -1, g_b = -1, g_num_trials = 1;
63 char *g_input_file_name = NULL;
64 char *g_output_prefix = NULL;
65 std::string g_alg_name;
66 int g_num_tbb_threads;
67 
68 // Creates tiled array
create_tile_array(double * A,int n,int b)69 static double ***create_tile_array( double *A, int n, int b ) {
70     const int p = n/b;
71     double ***tile = (double ***)calloc( sizeof( double ** ), p );
72 
73     for ( int j = 0; j < p; ++j ) {
74         tile[j] = (double **)calloc( sizeof( double * ), p );
75     }
76 
77     for ( int j = 0; j < p; ++j ) {
78         for ( int i = 0; i < p; ++i ) {
79             double *temp_block = (double *)calloc( sizeof( double ), b*b );
80 
81             for ( int A_j = j*b, T_j = 0; T_j < b; ++A_j, ++T_j ) {
82                 for ( int A_i = i*b, T_i = 0; T_i < b; ++A_i, ++T_i ) {
83                     temp_block[T_j*b+T_i] = A[A_j*n+A_i];
84                 }
85             }
86 
87             tile[j][i] = temp_block;
88         }
89     }
90     return tile;
91 }
92 
collapse_tile_array(double *** tile,double * A,int n,int b)93 static void collapse_tile_array( double ***tile, double *A, int n, int b ) {
94     const int p = n/b;
95 
96     for ( int j = 0; j < p; ++j ) {
97         for ( int i = 0; i < p; ++i ) {
98             double *temp_block = tile[j][i];
99 
100             for ( int A_j = j*b, T_j = 0; T_j < b; ++A_j, ++T_j ) {
101                 for ( int A_i = i*b, T_i = 0; T_i < b; ++A_i, ++T_i ) {
102                     A[A_j*n+A_i] = temp_block[T_j*b+T_i];
103                 }
104             }
105 
106             free( temp_block );
107             tile[j][i] = NULL;
108         }
109 
110         free( tile[j] );
111     }
112 
113     free( tile );
114 }
115 
116 /************************************************************
117  Helper base class: algorithm
118 ************************************************************/
119 class algorithm {
120 
121     std::string name;
122     bool is_tiled;
123 
check_if_valid(double * A0,double * C,double * A,int n)124     bool check_if_valid( double *A0, double *C, double *A, int n ) {
125         char transa = 'n', transb = 't';
126         double alpha = 1;
127         double beta = 0;
128 
129         for ( int i = 0; i < n; ++i ) {
130             for ( int j = i+1; j < n; ++j ) {
131                 A0[j*n+i] = 0.;
132             }
133         }
134 
135         dgemm ( &transa, &transb, &n, &n, &n, &alpha, A0, &n, A0, &n, &beta, C, &n );
136 
137         for ( int j = 0; j < n; ++j ) {
138             for ( int i = 0; i < n; ++i ) {
139                 const double epsilon = std::abs( A[j*n+i]*0.1 );
140 
141                 if ( std::abs( C[j*n+i] - A[j*n+i] ) > epsilon ) {
142                     printf( "ERROR: %s did not validate at C(%d,%d) = %lf != A(%d,%d) = %lf\n",
143                         name.c_str(), i, j, C[j*n+i], i, j, A[j*n+i] );
144                     printf( "ERROR: %g; %g < %g < %g\n", epsilon, A[j*n+i] - epsilon, C[j*n+i], A[j*n+i] + epsilon );
145                     return false;
146                 }
147             }
148         }
149         return true;
150     }
151 
152 public:
algorithm(const std::string & alg_name,bool t)153     algorithm( const std::string& alg_name, bool t ) : name(alg_name), is_tiled(t) {}
154 
operator ()(double * A,int n,int b,int trials)155     double operator() ( double *A, int n, int b, int trials ) {
156         tbb::tick_count t0, t1;
157         double elapsed_time = 0.0;
158         double *A0 = (double *)calloc( sizeof( double ), n*n );
159         double *C = (double *)calloc( sizeof( double ), n*n );
160 
161         for ( int t = 0; t < trials+1; ++t ) {
162             if ( is_tiled ) {
163                 double ***tile = create_tile_array( A, n, b );
164                 t0 = tbb::tick_count::now();
165                 func( tile, n, b );
166                 t1 = tbb::tick_count::now();
167 
168                 collapse_tile_array( tile, A0, n, b );
169             }
170             else {
171                 memcpy( A0, A, sizeof( double )*n*n );
172                 t0 = tbb::tick_count::now();
173                 func( A0, n, b );
174                 t1 = tbb::tick_count::now();
175             }
176 
177             if ( t ) elapsed_time += (t1-t0).seconds();
178 
179             if( !g_benchmark_run && !check_if_valid( A0, C, A, n ) ) {
180                 if ( g_output_prefix ) {
181                     std::string s( g_output_prefix );
182                     s += "_" + name + ".txt";
183                     matrix_write( A0, g_n, s.c_str(), true );
184                     free( A0 );
185                     free( C );
186                     return 0.;
187                 }
188             }
189         }
190 
191         if ( g_output_prefix ) {
192             std::string s( g_output_prefix );
193             s += "_" + name + ".txt";
194             matrix_write( A0, g_n, s.c_str(), true );
195         }
196 
197         printf( "%s %d %d %d %d %lf %lf\n", name.c_str(), g_num_tbb_threads, trials, n, b, elapsed_time, elapsed_time/trials );
198         free( A0 );
199         free( C );
200         return elapsed_time;
201     }
202 
203 protected:
204     // Main algorithm body function must be defined in any direved class
205     virtual void func( void * ptr, int n, int b ) = 0;
206 };
207 
208 /***********************************************************/
209 
call_dpotf2(double *** tile,int b,int k)210 static void call_dpotf2( double ***tile, int b, int k ) {
211     double *A_block = tile[k][k];
212     char uplo = 'l';
213     int info = 0;
214     dpotf2( &uplo, &b, A_block, &b, &info );
215     return;
216 }
217 
call_dtrsm(double *** tile,int b,int k,int j)218 static void call_dtrsm( double ***tile, int b, int k, int j ) {
219     double *A_block = tile[k][j];
220     double *L_block = tile[k][k];
221     char uplo = 'l', side = 'r', transa = 't', diag = 'n';
222     double alpha = 1;
223     dtrsm( &side, &uplo, &transa, &diag, &b, &b, &alpha, L_block, &b, A_block, &b );
224     return;
225 }
226 
call_dsyr2k(double *** tile,int b,int k,int j,int i)227 static void call_dsyr2k( double ***tile, int b, int k, int j, int i ) {
228     double *A_block = tile[i][j];
229     char transa = 'n', transb = 't';
230     char uplo = 'l';
231     double alpha = -1;
232     double beta = 1;
233 
234     if ( i == j ) {   // Diagonal block
235         double *L_block = tile[k][i];
236         dsyrk( &uplo, &transa, &b, &b, &alpha, L_block, &b, &beta, A_block, &b );
237     } else {   // Non-diagonal block
238         double *L2_block = tile[k][i];
239         double *L1_block = tile[k][j];
240         dgemm( &transa, &transb, &b, &b, &b, &alpha, L1_block, &b, L2_block, &b, &beta, A_block, &b );
241     }
242     return;
243 }
244 
245 class algorithm_crout : public algorithm
246 {
247 public:
algorithm_crout()248     algorithm_crout() : algorithm("crout_cholesky", true) {}
249 
250 protected:
func(void * ptr,int n,int b)251     virtual void func( void * ptr, int n, int b ) {
252         double ***tile = (double ***)ptr;
253         const int p = n/b;
254 
255         for ( int k = 0; k < p; ++k ) {
256             call_dpotf2( tile, b, k );
257 
258             for ( int j = k+1; j < p; ++j ) {
259                 call_dtrsm( tile, b, k, j );
260 
261                 for ( int i = k+1; i <= j; ++i ) {
262                     call_dsyr2k( tile, b, k, j, i );
263                 }
264             }
265         }
266     }
267 };
268 
269 class algorithm_dpotrf : public algorithm
270 {
271 public:
algorithm_dpotrf()272     algorithm_dpotrf() : algorithm("dpotrf_cholesky", false) {}
273 
274 protected:
func(void * ptr,int n,int)275     virtual void func( void * ptr, int n, int /* b */ ) {
276         double *A = (double *)ptr;
277         int lda = n;
278         int info = 0;
279         char uplo = 'l';
280         dpotrf( &uplo, &n, A, &lda, &info );
281     }
282 };
283 
284 /************************************************************
285  Begin data join graph based version of cholesky
286 ************************************************************/
287 
288 typedef union {
289     char a[4];
290     size_t tag;
291 } tag_t;
292 
293 typedef double * tile_t;
294 
295 typedef std::pair< tag_t, tile_t > tagged_tile_t;
296 typedef tbb::flow::tuple< tagged_tile_t > t1_t;
297 typedef tbb::flow::tuple< tagged_tile_t, tagged_tile_t > t2_t;
298 typedef tbb::flow::tuple< tagged_tile_t, tagged_tile_t, tagged_tile_t > t3_t;
299 
300 typedef tbb::flow::multifunction_node< tagged_tile_t, t1_t > dpotf2_node_t;
301 typedef tbb::flow::multifunction_node< t2_t, t2_t > dtrsm_node_t;
302 typedef tbb::flow::multifunction_node< t3_t, t3_t > dsyr2k_node_t;
303 
304 typedef tbb::flow::join_node< t2_t, tbb::flow::tag_matching > dtrsm_join_t;
305 typedef tbb::flow::join_node< t3_t, tbb::flow::tag_matching > dsyr2k_join_t;
306 
307 class dpotf2_body {
308     int p;
309     int b;
310 public:
dpotf2_body(int p_,int b_)311     dpotf2_body( int p_, int b_ ) : p(p_), b(b_) {}
312 
operator ()(const tagged_tile_t & in,dpotf2_node_t::output_ports_type & ports)313     void operator()( const tagged_tile_t &in, dpotf2_node_t::output_ports_type &ports ) {
314         int k = in.first.a[0];
315         tile_t A_block = in.second;
316         tag_t t;
317         t.tag = 0;
318         t.a[0] = k;
319         char uplo = 'l';
320         int info = 0;
321         dpotf2( &uplo, &b, A_block, &b, &info );
322 
323         // Send to dtrsms in same column
324         // k == k  j == k
325         t.a[2] = k;
326         for ( int j = k+1; j < p; ++j ) {
327             t.a[1] = j;
328             tbb::flow::get<0>( ports ).try_put( std::make_pair( t, A_block ) );
329         }
330     }
331 };
332 
333 class dtrsm_body {
334     int p;
335     int b;
336 public:
dtrsm_body(int p_,int b_)337     dtrsm_body( int p_, int b_ ) : p(p_), b(b_) {}
338 
operator ()(const t2_t & in,dtrsm_node_t::output_ports_type & ports)339     void operator()( const t2_t &in, dtrsm_node_t::output_ports_type &ports ) {
340         using tbb::flow::get;
341 
342         tagged_tile_t in0 = get<0>( in );
343         tagged_tile_t in1 = get<1>( in );
344         int k = in0.first.a[0];
345         int j = in0.first.a[1];
346         tile_t L_block = in0.second;
347         tile_t A_block = in1.second;
348         tag_t t;
349         t.tag = 0;
350         t.a[0] = k;
351         char uplo = 'l', side = 'r', transa = 't', diag = 'n';
352         double alpha = 1;
353         dtrsm( &side, &uplo, &transa, &diag, &b, &b, &alpha, L_block, &b, A_block, &b);
354 
355         // Send to rest of my row
356         t.a[1] = j;
357         for ( int i = k+1; i <= j; ++i ) {
358             t.a[2] = i;
359             get<0>( ports ).try_put( std::make_pair( t, A_block ) );
360         }
361 
362         // Send to transposed row
363         t.a[2] = j;
364         for ( int i = j; i < p; ++i ) {
365             t.a[1] = i;
366             get<1>( ports ).try_put( std::make_pair( t, A_block ) );
367         }
368     }
369 };
370 
371 class dsyr2k_body {
372     int p;
373     int b;
374 public:
dsyr2k_body(int p_,int b_)375     dsyr2k_body( int p_, int b_ ) : p(p_), b(b_) {}
376 
operator ()(const t3_t & in,dsyr2k_node_t::output_ports_type & ports)377     void operator()( const t3_t &in, dsyr2k_node_t::output_ports_type &ports ) {
378         using tbb::flow::get;
379 
380         tag_t t;
381         t.tag = 0;
382         char transa = 'n', transb = 't';
383         char uplo = 'l';
384         double alpha = -1;
385         double beta = 1;
386 
387         tagged_tile_t in0 = get<0>( in );
388         tagged_tile_t in1 = get<1>( in );
389         tagged_tile_t in2 = get<2>( in );
390         int k = in2.first.a[0];
391         int j = in2.first.a[1];
392         int i = in2.first.a[2];
393 
394         tile_t A_block = in2.second;
395         if ( i == j ) {   // Diagonal block
396             tile_t L_block = in0.second;
397             dsyrk( &uplo, &transa, &b, &b, &alpha, L_block, &b, &beta, A_block, &b );
398         } else {   // Non-diagonal block
399             tile_t L1_block = in0.second;
400             tile_t L2_block = in1.second;
401             dgemm( &transa, &transb, &b, &b, &b, &alpha, L1_block, &b, L2_block, &b, &beta, A_block, &b );
402         }
403 
404         // All outputs flow to next step
405         t.a[0] = k+1;
406         t.a[1] = j;
407         t.a[2] = i;
408         if ( k != p-1 && j == k+1 && i == k+1 ) {
409             get<0>( ports ).try_put( std::make_pair( t, A_block ) );
410         }
411 
412         if ( k < p-2 ) {
413             if ( i == k+1 && j > i ) {
414                 t.a[0] = k+1;
415                 t.a[1] = j;
416                 get<1>( ports ).try_put( std::make_pair( t, A_block ) );
417             }
418 
419             if ( j != k+1 && i != k+1 ) {
420                 t.a[0] = k+1;
421                 t.a[1] = j;
422                 t.a[2] = i;
423                 get<2>( ports ).try_put( std::make_pair( t, A_block ) );
424             }
425         }
426     }
427 };
428 
429 struct tagged_tile_to_size_t {
operator ()tagged_tile_to_size_t430     size_t operator()( const tagged_tile_t &t ) {
431         return t.first.tag;
432     }
433 };
434 
435 class algorithm_join : public algorithm
436 {
437 public:
algorithm_join()438     algorithm_join() : algorithm("data_join_cholesky", true) {}
439 
440 protected:
func(void * ptr,int n,int b)441     virtual void func( void * ptr, int n, int b ) {
442         using tbb::flow::unlimited;
443         using tbb::flow::output_port;
444         using tbb::flow::input_port;
445 
446         double ***tile = (double ***)ptr;
447         const int p = n/b;
448         tbb::flow::graph g;
449 
450         dpotf2_node_t dpotf2_node( g, unlimited, dpotf2_body(p, b) );
451         dtrsm_node_t dtrsm_node( g, unlimited, dtrsm_body(p, b) );
452         dsyr2k_node_t dsyr2k_node( g, unlimited, dsyr2k_body(p, b) );
453         dtrsm_join_t dtrsm_join( g, tagged_tile_to_size_t(), tagged_tile_to_size_t() );
454         dsyr2k_join_t dsyr2k_join( g, tagged_tile_to_size_t(), tagged_tile_to_size_t(), tagged_tile_to_size_t() );
455 
456         make_edge( output_port<0>( dsyr2k_node ), dpotf2_node );
457 
458         make_edge( output_port<0>( dpotf2_node ), input_port<0>( dtrsm_join ) );
459         make_edge( output_port<1>( dsyr2k_node ), input_port<1>( dtrsm_join ) );
460         make_edge( dtrsm_join, dtrsm_node );
461 
462         make_edge( output_port<0>( dtrsm_node ), input_port<0>( dsyr2k_join ) );
463         make_edge( output_port<1>( dtrsm_node ), input_port<1>( dsyr2k_join ) );
464         make_edge( output_port<2>( dsyr2k_node ), input_port<2>( dsyr2k_join ) );
465         make_edge( dsyr2k_join, dsyr2k_node );
466 
467         // Now we need to send out the tiles to their first nodes
468         tag_t t;
469         t.tag = 0;
470         t.a[0] = 0;
471         t.a[1] = 0;
472         t.a[2] = 0;
473 
474         // Send to feedback input of first dpotf2
475         // k == 0, j == 0, i == 0
476         dpotf2_node.try_put( std::make_pair( t, tile[0][0] ) );
477 
478         // Send to feedback input (port 1) of each dtrsm
479         // k == 0, j == 1..p-1
480         for ( int j = 1; j < p; ++j ) {
481             t.a[1] = j;
482             input_port<1>( dtrsm_join ).try_put( std::make_pair( t, tile[0][j] ) );
483         }
484 
485         // Send to feedback input (port 2) of each dsyr2k
486         // k == 0
487         for ( int i = 1; i < p; ++i ) {
488             t.a[2] = i;
489 
490             for ( int j = i; j < p; ++j ) {
491                 t.a[1] = j;
492                 input_port<2>( dsyr2k_join ).try_put( std::make_pair( t, tile[i][j] ) );
493             }
494         }
495 
496         g.wait_for_all();
497     }
498 };
499 
500 /************************************************************
501  End data join graph based version of cholesky
502 ************************************************************/
503 
504 /************************************************************
505  Begin dependence graph based version of cholesky
506 ************************************************************/
507 
508 typedef tbb::flow::continue_node< tbb::flow::continue_msg > continue_type;
509 typedef continue_type * continue_ptr_type;
510 
511 #if !__TBB_CPP11_LAMBDAS_PRESENT
512 // Using helper functor classes (instead of built-in C++ 11 lambda functions)
513 class call_dpotf2_functor
514 {
515     double ***tile;
516     int b, k;
517 public:
call_dpotf2_functor(double *** tile_,int b_,int k_)518     call_dpotf2_functor( double ***tile_, int b_, int k_ )
519         : tile(tile_), b(b_), k(k_) {}
520 
operator ()(const tbb::flow::continue_msg &)521     void operator()( const tbb::flow::continue_msg & ) { call_dpotf2( tile, b, k ); }
522 };
523 
524 class call_dtrsm_functor
525 {
526     double ***tile;
527     int b, k, j;
528 public:
call_dtrsm_functor(double *** tile_,int b_,int k_,int j_)529     call_dtrsm_functor( double ***tile_, int b_, int k_, int j_ )
530         : tile(tile_), b(b_), k(k_), j(j_) {}
531 
operator ()(const tbb::flow::continue_msg &)532     void operator()( const tbb::flow::continue_msg & ) { call_dtrsm( tile, b, k, j ); }
533 };
534 
535 class call_dsyr2k_functor
536 {
537     double ***tile;
538     int b, k, j, i;
539 public:
call_dsyr2k_functor(double *** tile_,int b_,int k_,int j_,int i_)540     call_dsyr2k_functor( double ***tile_, int b_, int k_, int j_, int i_ )
541         : tile(tile_), b(b_), k(k_), j(j_), i(i_) {}
542 
operator ()(const tbb::flow::continue_msg &)543     void operator()( const tbb::flow::continue_msg & ) { call_dsyr2k( tile, b, k, j, i ); }
544 };
545 
546 #endif // !__TBB_CPP11_LAMBDAS_PRESENT
547 
548 class algorithm_depend : public algorithm
549 {
550 public:
algorithm_depend()551     algorithm_depend() : algorithm("depend_cholesky", true) {}
552 
553 protected:
func(void * ptr,int n,int b)554     virtual void func( void * ptr, int n, int b ) {
555         double ***tile = (double ***)ptr;
556 
557         const int p = n/b;
558         continue_ptr_type *c = new continue_ptr_type[p];
559         continue_ptr_type **t = new continue_ptr_type *[p];
560         continue_ptr_type ***u = new continue_ptr_type **[p];
561 
562         tbb::flow::graph g;
563         for ( int k = p-1; k >= 0; --k ) {
564             c[k] = new continue_type( g,
565 #if __TBB_CPP11_LAMBDAS_PRESENT
566                 [=]( const tbb::flow::continue_msg & ) { call_dpotf2( tile, b, k ); } );
567 #else
568                 call_dpotf2_functor( tile, b, k ) );
569 #endif // __TBB_CPP11_LAMBDAS_PRESENT
570             t[k] = new continue_ptr_type[p];
571             u[k] = new continue_ptr_type *[p];
572 
573             for ( int j = k+1; j < p; ++j ) {
574                 t[k][j] = new continue_type( g,
575 #if __TBB_CPP11_LAMBDAS_PRESENT
576                     [=]( const tbb::flow::continue_msg & ) { call_dtrsm( tile, b, k, j ); } );
577 #else
578                     call_dtrsm_functor( tile, b, k, j ) );
579 #endif // __TBB_CPP11_LAMBDAS_PRESENT
580                 make_edge( *c[k], *t[k][j] );
581                 u[k][j] = new continue_ptr_type[p];
582 
583                 for ( int i = k+1; i <= j; ++i ) {
584                     u[k][j][i] = new continue_type( g,
585 #if __TBB_CPP11_LAMBDAS_PRESENT
586                         [=]( const tbb::flow::continue_msg & ) { call_dsyr2k( tile, b, k, j, i ); } );
587 #else
588                         call_dsyr2k_functor( tile, b, k, j, i ) );
589 #endif // __TBB_CPP11_LAMBDAS_PRESENT
590 
591                     if ( k < p-2 && k+1 != j && k+1 != i ) {
592                         make_edge( *u[k][j][i], *u[k+1][j][i] );
593                     }
594 
595                     make_edge( *t[k][j], *u[k][j][i] );
596 
597                     if ( i != j ) {
598                         make_edge( *t[k][i], *u[k][j][i] );
599                     }
600 
601                     if ( k < p-2 && j > i && i == k+1 ) {
602                         make_edge( *u[k][j][i], *t[i][j] );
603                     }
604                 }
605             }
606 
607             if ( k != p-1 ) {
608                 make_edge( *u[k][k+1][k+1], *c[k+1] );
609             }
610         }
611 
612         c[0]->try_put( tbb::flow::continue_msg() );
613         g.wait_for_all();
614     }
615 }; // class algorithm_depend
616 
617 /************************************************************
618  End dependence graph based version of cholesky
619 ************************************************************/
620 
process_args(int argc,char * argv[])621 bool process_args( int argc, char *argv[] ) {
622     utility::parse_cli_arguments( argc, argv,
623         utility::cli_argument_pack()
624         //"-h" option for displaying help is present implicitly
625         .positional_arg( g_n, "size", "the row/column size of NxN matrix (size <= 46000)" )
626         .positional_arg( g_b, "blocksize", "the block size; size must be a multiple of the blocksize" )
627         .positional_arg( g_num_trials, "num_trials", "the number of times to run each algorithm" )
628         .positional_arg( g_output_prefix, "output_prefix",
629             "if provided the prefix will be preappended to output files:\n"
630             "                     output_prefix_posdef.txt\n"
631             "                     output_prefix_X.txt; where X is the algorithm used\n"
632             "                 if output_prefix is not provided, no output will be written" )
633         .positional_arg( g_alg_name, "algorithm", "name of the used algorithm - can be dpotrf, crout, depend or join" )
634         .positional_arg( g_num_tbb_threads, "num_tbb_threads", "number of started TBB threads" )
635 
636         .arg( g_input_file_name, "input_file", "if provided it will be read to get the input matrix" )
637         .arg( g_benchmark_run, "-x", "skips all validation" )
638     );
639 
640     if ( g_n > 46000 ) {
641         printf( "ERROR: invalid 'size' value (must be less or equal 46000): %d\n", g_n );
642         return false;
643     }
644 
645     if ( g_n%g_b != 0 ) {
646         printf( "ERROR: size %d must be a multiple of the blocksize %d\n", g_n, g_b );
647         return false;
648     }
649 
650     if ( g_n/g_b > 256 ) {
651         // Because tile index size is 1 byte only in tag_t type
652         printf( "ERROR: size / blocksize must be less or equal 256, but %d / %d = %d\n", g_n, g_b, g_n/g_b );
653         return false;
654     }
655 
656     if ( g_b == -1 || (g_n == -1 && g_input_file_name == NULL) ) {
657         return false;
658     }
659 
660     return true;
661 }
662 
main(int argc,char * argv[])663 int main(int argc, char *argv[]) {
664     g_num_tbb_threads = utility::get_default_num_threads();
665     typedef std::map< std::string, algorithm * > algmap_t;
666     algmap_t algmap;
667 
668     // Init algorithms
669     algmap.insert(std::pair<std::string, algorithm *>("dpotrf", new algorithm_dpotrf));
670     algmap.insert(std::pair<std::string, algorithm *>("crout", new algorithm_crout));
671     algmap.insert(std::pair<std::string, algorithm *>("depend", new algorithm_depend));
672     algmap.insert(std::pair<std::string, algorithm *>("join", new algorithm_join));
673 
674     if ( !process_args( argc, argv ) ) {
675         printf( "ERROR: Invalid arguments. Run: %s -h\n", argv[0] );
676         exit( 1 );
677     }
678 
679     tbb::global_control c(tbb::global_control::max_allowed_parallelism, g_num_tbb_threads);
680     double *A = NULL;
681 
682     // Read input matrix
683     matrix_init( A, g_n, g_input_file_name );
684 
685     // Write input matrix if output_prefix is set and we didn't read from a file
686     if ( !g_input_file_name && g_output_prefix ) {
687         std::string s( g_output_prefix );
688         s += "_posdef.txt";
689         matrix_write( A, g_n, s.c_str() );
690     }
691 
692     if ( g_alg_name.empty() ) {
693         for ( algmap_t::iterator i = algmap.begin(); i != algmap.end(); ++i ) {
694             algorithm* const alg = i->second;
695             (*alg)( A, g_n, g_b, g_num_trials );
696         }
697     }
698     else {
699         algmap_t::iterator alg_iter = algmap.find(g_alg_name);
700 
701         if ( alg_iter != algmap.end() ) {
702             algorithm* const alg = alg_iter->second;
703             (*alg)( A, g_n, g_b, g_num_trials );
704         }
705         else {
706             printf( "ERROR: Invalid algorithm name: %s\n", g_alg_name.c_str() );
707             exit( 2 );
708         }
709     }
710 
711     free( A );
712     return 0;
713 }
714