1 /*
2 Copyright (c) 2005-2020 Intel Corporation
3
4 Licensed under the Apache License, Version 2.0 (the "License");
5 you may not use this file except in compliance with the License.
6 You may obtain a copy of the License at
7
8 http://www.apache.org/licenses/LICENSE-2.0
9
10 Unless required by applicable law or agreed to in writing, software
11 distributed under the License is distributed on an "AS IS" BASIS,
12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 See the License for the specific language governing permissions and
14 limitations under the License.
15 */
16
17 #include <string>
18 #include <cstring>
19 #include <cstdio>
20 #include <cmath>
21 #include <vector>
22 #include <map>
23
24 #include "mkl_lapack.h"
25 #include "mkl.h"
26
27 #include "tbb/tbb_config.h"
28 #include "tbb/flow_graph.h"
29 #include "tbb/tick_count.h"
30 #include "tbb/global_control.h"
31
32 // Application command line arguments parsing
33 #include "../../common/utility/utility.h"
34 #include "../../common/utility/get_default_num_threads.h"
35
36 /************************************************************
37 FORWARD DECLARATIONS
38 ************************************************************/
39
40 /**********************************************
41 Read or generate a positive-definite matrix
42 -- reads from file if fname != NULL
43 -- sets n to matrix size
44 -- allocates and reads values in to A
45 -- otherwise generates a matrix
46 -- uses n to determine size
47 -- allocates and generates values in to A
48 **********************************************/
49 void matrix_init( double * &A, int &n, const char *fname );
50
51 /**********************************************
52 Writes a lower triangular matrix to a file
53 -- first line of file is n
54 -- subsequently 1 row per line
55 **********************************************/
56 void matrix_write ( double *A, int n, const char *fname, bool is_triangular = false );
57
58 /************************************************************
59 GLOBAL VARIABLES
60 ************************************************************/
61 bool g_benchmark_run = false;
62 int g_n = -1, g_b = -1, g_num_trials = 1;
63 char *g_input_file_name = NULL;
64 char *g_output_prefix = NULL;
65 std::string g_alg_name;
66 int g_num_tbb_threads;
67
68 // Creates tiled array
create_tile_array(double * A,int n,int b)69 static double ***create_tile_array( double *A, int n, int b ) {
70 const int p = n/b;
71 double ***tile = (double ***)calloc( sizeof( double ** ), p );
72
73 for ( int j = 0; j < p; ++j ) {
74 tile[j] = (double **)calloc( sizeof( double * ), p );
75 }
76
77 for ( int j = 0; j < p; ++j ) {
78 for ( int i = 0; i < p; ++i ) {
79 double *temp_block = (double *)calloc( sizeof( double ), b*b );
80
81 for ( int A_j = j*b, T_j = 0; T_j < b; ++A_j, ++T_j ) {
82 for ( int A_i = i*b, T_i = 0; T_i < b; ++A_i, ++T_i ) {
83 temp_block[T_j*b+T_i] = A[A_j*n+A_i];
84 }
85 }
86
87 tile[j][i] = temp_block;
88 }
89 }
90 return tile;
91 }
92
collapse_tile_array(double *** tile,double * A,int n,int b)93 static void collapse_tile_array( double ***tile, double *A, int n, int b ) {
94 const int p = n/b;
95
96 for ( int j = 0; j < p; ++j ) {
97 for ( int i = 0; i < p; ++i ) {
98 double *temp_block = tile[j][i];
99
100 for ( int A_j = j*b, T_j = 0; T_j < b; ++A_j, ++T_j ) {
101 for ( int A_i = i*b, T_i = 0; T_i < b; ++A_i, ++T_i ) {
102 A[A_j*n+A_i] = temp_block[T_j*b+T_i];
103 }
104 }
105
106 free( temp_block );
107 tile[j][i] = NULL;
108 }
109
110 free( tile[j] );
111 }
112
113 free( tile );
114 }
115
116 /************************************************************
117 Helper base class: algorithm
118 ************************************************************/
119 class algorithm {
120
121 std::string name;
122 bool is_tiled;
123
check_if_valid(double * A0,double * C,double * A,int n)124 bool check_if_valid( double *A0, double *C, double *A, int n ) {
125 char transa = 'n', transb = 't';
126 double alpha = 1;
127 double beta = 0;
128
129 for ( int i = 0; i < n; ++i ) {
130 for ( int j = i+1; j < n; ++j ) {
131 A0[j*n+i] = 0.;
132 }
133 }
134
135 dgemm ( &transa, &transb, &n, &n, &n, &alpha, A0, &n, A0, &n, &beta, C, &n );
136
137 for ( int j = 0; j < n; ++j ) {
138 for ( int i = 0; i < n; ++i ) {
139 const double epsilon = std::abs( A[j*n+i]*0.1 );
140
141 if ( std::abs( C[j*n+i] - A[j*n+i] ) > epsilon ) {
142 printf( "ERROR: %s did not validate at C(%d,%d) = %lf != A(%d,%d) = %lf\n",
143 name.c_str(), i, j, C[j*n+i], i, j, A[j*n+i] );
144 printf( "ERROR: %g; %g < %g < %g\n", epsilon, A[j*n+i] - epsilon, C[j*n+i], A[j*n+i] + epsilon );
145 return false;
146 }
147 }
148 }
149 return true;
150 }
151
152 public:
algorithm(const std::string & alg_name,bool t)153 algorithm( const std::string& alg_name, bool t ) : name(alg_name), is_tiled(t) {}
154
operator ()(double * A,int n,int b,int trials)155 double operator() ( double *A, int n, int b, int trials ) {
156 tbb::tick_count t0, t1;
157 double elapsed_time = 0.0;
158 double *A0 = (double *)calloc( sizeof( double ), n*n );
159 double *C = (double *)calloc( sizeof( double ), n*n );
160
161 for ( int t = 0; t < trials+1; ++t ) {
162 if ( is_tiled ) {
163 double ***tile = create_tile_array( A, n, b );
164 t0 = tbb::tick_count::now();
165 func( tile, n, b );
166 t1 = tbb::tick_count::now();
167
168 collapse_tile_array( tile, A0, n, b );
169 }
170 else {
171 memcpy( A0, A, sizeof( double )*n*n );
172 t0 = tbb::tick_count::now();
173 func( A0, n, b );
174 t1 = tbb::tick_count::now();
175 }
176
177 if ( t ) elapsed_time += (t1-t0).seconds();
178
179 if( !g_benchmark_run && !check_if_valid( A0, C, A, n ) ) {
180 if ( g_output_prefix ) {
181 std::string s( g_output_prefix );
182 s += "_" + name + ".txt";
183 matrix_write( A0, g_n, s.c_str(), true );
184 free( A0 );
185 free( C );
186 return 0.;
187 }
188 }
189 }
190
191 if ( g_output_prefix ) {
192 std::string s( g_output_prefix );
193 s += "_" + name + ".txt";
194 matrix_write( A0, g_n, s.c_str(), true );
195 }
196
197 printf( "%s %d %d %d %d %lf %lf\n", name.c_str(), g_num_tbb_threads, trials, n, b, elapsed_time, elapsed_time/trials );
198 free( A0 );
199 free( C );
200 return elapsed_time;
201 }
202
203 protected:
204 // Main algorithm body function must be defined in any direved class
205 virtual void func( void * ptr, int n, int b ) = 0;
206 };
207
208 /***********************************************************/
209
call_dpotf2(double *** tile,int b,int k)210 static void call_dpotf2( double ***tile, int b, int k ) {
211 double *A_block = tile[k][k];
212 char uplo = 'l';
213 int info = 0;
214 dpotf2( &uplo, &b, A_block, &b, &info );
215 return;
216 }
217
call_dtrsm(double *** tile,int b,int k,int j)218 static void call_dtrsm( double ***tile, int b, int k, int j ) {
219 double *A_block = tile[k][j];
220 double *L_block = tile[k][k];
221 char uplo = 'l', side = 'r', transa = 't', diag = 'n';
222 double alpha = 1;
223 dtrsm( &side, &uplo, &transa, &diag, &b, &b, &alpha, L_block, &b, A_block, &b );
224 return;
225 }
226
call_dsyr2k(double *** tile,int b,int k,int j,int i)227 static void call_dsyr2k( double ***tile, int b, int k, int j, int i ) {
228 double *A_block = tile[i][j];
229 char transa = 'n', transb = 't';
230 char uplo = 'l';
231 double alpha = -1;
232 double beta = 1;
233
234 if ( i == j ) { // Diagonal block
235 double *L_block = tile[k][i];
236 dsyrk( &uplo, &transa, &b, &b, &alpha, L_block, &b, &beta, A_block, &b );
237 } else { // Non-diagonal block
238 double *L2_block = tile[k][i];
239 double *L1_block = tile[k][j];
240 dgemm( &transa, &transb, &b, &b, &b, &alpha, L1_block, &b, L2_block, &b, &beta, A_block, &b );
241 }
242 return;
243 }
244
245 class algorithm_crout : public algorithm
246 {
247 public:
algorithm_crout()248 algorithm_crout() : algorithm("crout_cholesky", true) {}
249
250 protected:
func(void * ptr,int n,int b)251 virtual void func( void * ptr, int n, int b ) {
252 double ***tile = (double ***)ptr;
253 const int p = n/b;
254
255 for ( int k = 0; k < p; ++k ) {
256 call_dpotf2( tile, b, k );
257
258 for ( int j = k+1; j < p; ++j ) {
259 call_dtrsm( tile, b, k, j );
260
261 for ( int i = k+1; i <= j; ++i ) {
262 call_dsyr2k( tile, b, k, j, i );
263 }
264 }
265 }
266 }
267 };
268
269 class algorithm_dpotrf : public algorithm
270 {
271 public:
algorithm_dpotrf()272 algorithm_dpotrf() : algorithm("dpotrf_cholesky", false) {}
273
274 protected:
func(void * ptr,int n,int)275 virtual void func( void * ptr, int n, int /* b */ ) {
276 double *A = (double *)ptr;
277 int lda = n;
278 int info = 0;
279 char uplo = 'l';
280 dpotrf( &uplo, &n, A, &lda, &info );
281 }
282 };
283
284 /************************************************************
285 Begin data join graph based version of cholesky
286 ************************************************************/
287
288 typedef union {
289 char a[4];
290 size_t tag;
291 } tag_t;
292
293 typedef double * tile_t;
294
295 typedef std::pair< tag_t, tile_t > tagged_tile_t;
296 typedef tbb::flow::tuple< tagged_tile_t > t1_t;
297 typedef tbb::flow::tuple< tagged_tile_t, tagged_tile_t > t2_t;
298 typedef tbb::flow::tuple< tagged_tile_t, tagged_tile_t, tagged_tile_t > t3_t;
299
300 typedef tbb::flow::multifunction_node< tagged_tile_t, t1_t > dpotf2_node_t;
301 typedef tbb::flow::multifunction_node< t2_t, t2_t > dtrsm_node_t;
302 typedef tbb::flow::multifunction_node< t3_t, t3_t > dsyr2k_node_t;
303
304 typedef tbb::flow::join_node< t2_t, tbb::flow::tag_matching > dtrsm_join_t;
305 typedef tbb::flow::join_node< t3_t, tbb::flow::tag_matching > dsyr2k_join_t;
306
307 class dpotf2_body {
308 int p;
309 int b;
310 public:
dpotf2_body(int p_,int b_)311 dpotf2_body( int p_, int b_ ) : p(p_), b(b_) {}
312
operator ()(const tagged_tile_t & in,dpotf2_node_t::output_ports_type & ports)313 void operator()( const tagged_tile_t &in, dpotf2_node_t::output_ports_type &ports ) {
314 int k = in.first.a[0];
315 tile_t A_block = in.second;
316 tag_t t;
317 t.tag = 0;
318 t.a[0] = k;
319 char uplo = 'l';
320 int info = 0;
321 dpotf2( &uplo, &b, A_block, &b, &info );
322
323 // Send to dtrsms in same column
324 // k == k j == k
325 t.a[2] = k;
326 for ( int j = k+1; j < p; ++j ) {
327 t.a[1] = j;
328 tbb::flow::get<0>( ports ).try_put( std::make_pair( t, A_block ) );
329 }
330 }
331 };
332
333 class dtrsm_body {
334 int p;
335 int b;
336 public:
dtrsm_body(int p_,int b_)337 dtrsm_body( int p_, int b_ ) : p(p_), b(b_) {}
338
operator ()(const t2_t & in,dtrsm_node_t::output_ports_type & ports)339 void operator()( const t2_t &in, dtrsm_node_t::output_ports_type &ports ) {
340 using tbb::flow::get;
341
342 tagged_tile_t in0 = get<0>( in );
343 tagged_tile_t in1 = get<1>( in );
344 int k = in0.first.a[0];
345 int j = in0.first.a[1];
346 tile_t L_block = in0.second;
347 tile_t A_block = in1.second;
348 tag_t t;
349 t.tag = 0;
350 t.a[0] = k;
351 char uplo = 'l', side = 'r', transa = 't', diag = 'n';
352 double alpha = 1;
353 dtrsm( &side, &uplo, &transa, &diag, &b, &b, &alpha, L_block, &b, A_block, &b);
354
355 // Send to rest of my row
356 t.a[1] = j;
357 for ( int i = k+1; i <= j; ++i ) {
358 t.a[2] = i;
359 get<0>( ports ).try_put( std::make_pair( t, A_block ) );
360 }
361
362 // Send to transposed row
363 t.a[2] = j;
364 for ( int i = j; i < p; ++i ) {
365 t.a[1] = i;
366 get<1>( ports ).try_put( std::make_pair( t, A_block ) );
367 }
368 }
369 };
370
371 class dsyr2k_body {
372 int p;
373 int b;
374 public:
dsyr2k_body(int p_,int b_)375 dsyr2k_body( int p_, int b_ ) : p(p_), b(b_) {}
376
operator ()(const t3_t & in,dsyr2k_node_t::output_ports_type & ports)377 void operator()( const t3_t &in, dsyr2k_node_t::output_ports_type &ports ) {
378 using tbb::flow::get;
379
380 tag_t t;
381 t.tag = 0;
382 char transa = 'n', transb = 't';
383 char uplo = 'l';
384 double alpha = -1;
385 double beta = 1;
386
387 tagged_tile_t in0 = get<0>( in );
388 tagged_tile_t in1 = get<1>( in );
389 tagged_tile_t in2 = get<2>( in );
390 int k = in2.first.a[0];
391 int j = in2.first.a[1];
392 int i = in2.first.a[2];
393
394 tile_t A_block = in2.second;
395 if ( i == j ) { // Diagonal block
396 tile_t L_block = in0.second;
397 dsyrk( &uplo, &transa, &b, &b, &alpha, L_block, &b, &beta, A_block, &b );
398 } else { // Non-diagonal block
399 tile_t L1_block = in0.second;
400 tile_t L2_block = in1.second;
401 dgemm( &transa, &transb, &b, &b, &b, &alpha, L1_block, &b, L2_block, &b, &beta, A_block, &b );
402 }
403
404 // All outputs flow to next step
405 t.a[0] = k+1;
406 t.a[1] = j;
407 t.a[2] = i;
408 if ( k != p-1 && j == k+1 && i == k+1 ) {
409 get<0>( ports ).try_put( std::make_pair( t, A_block ) );
410 }
411
412 if ( k < p-2 ) {
413 if ( i == k+1 && j > i ) {
414 t.a[0] = k+1;
415 t.a[1] = j;
416 get<1>( ports ).try_put( std::make_pair( t, A_block ) );
417 }
418
419 if ( j != k+1 && i != k+1 ) {
420 t.a[0] = k+1;
421 t.a[1] = j;
422 t.a[2] = i;
423 get<2>( ports ).try_put( std::make_pair( t, A_block ) );
424 }
425 }
426 }
427 };
428
429 struct tagged_tile_to_size_t {
operator ()tagged_tile_to_size_t430 size_t operator()( const tagged_tile_t &t ) {
431 return t.first.tag;
432 }
433 };
434
435 class algorithm_join : public algorithm
436 {
437 public:
algorithm_join()438 algorithm_join() : algorithm("data_join_cholesky", true) {}
439
440 protected:
func(void * ptr,int n,int b)441 virtual void func( void * ptr, int n, int b ) {
442 using tbb::flow::unlimited;
443 using tbb::flow::output_port;
444 using tbb::flow::input_port;
445
446 double ***tile = (double ***)ptr;
447 const int p = n/b;
448 tbb::flow::graph g;
449
450 dpotf2_node_t dpotf2_node( g, unlimited, dpotf2_body(p, b) );
451 dtrsm_node_t dtrsm_node( g, unlimited, dtrsm_body(p, b) );
452 dsyr2k_node_t dsyr2k_node( g, unlimited, dsyr2k_body(p, b) );
453 dtrsm_join_t dtrsm_join( g, tagged_tile_to_size_t(), tagged_tile_to_size_t() );
454 dsyr2k_join_t dsyr2k_join( g, tagged_tile_to_size_t(), tagged_tile_to_size_t(), tagged_tile_to_size_t() );
455
456 make_edge( output_port<0>( dsyr2k_node ), dpotf2_node );
457
458 make_edge( output_port<0>( dpotf2_node ), input_port<0>( dtrsm_join ) );
459 make_edge( output_port<1>( dsyr2k_node ), input_port<1>( dtrsm_join ) );
460 make_edge( dtrsm_join, dtrsm_node );
461
462 make_edge( output_port<0>( dtrsm_node ), input_port<0>( dsyr2k_join ) );
463 make_edge( output_port<1>( dtrsm_node ), input_port<1>( dsyr2k_join ) );
464 make_edge( output_port<2>( dsyr2k_node ), input_port<2>( dsyr2k_join ) );
465 make_edge( dsyr2k_join, dsyr2k_node );
466
467 // Now we need to send out the tiles to their first nodes
468 tag_t t;
469 t.tag = 0;
470 t.a[0] = 0;
471 t.a[1] = 0;
472 t.a[2] = 0;
473
474 // Send to feedback input of first dpotf2
475 // k == 0, j == 0, i == 0
476 dpotf2_node.try_put( std::make_pair( t, tile[0][0] ) );
477
478 // Send to feedback input (port 1) of each dtrsm
479 // k == 0, j == 1..p-1
480 for ( int j = 1; j < p; ++j ) {
481 t.a[1] = j;
482 input_port<1>( dtrsm_join ).try_put( std::make_pair( t, tile[0][j] ) );
483 }
484
485 // Send to feedback input (port 2) of each dsyr2k
486 // k == 0
487 for ( int i = 1; i < p; ++i ) {
488 t.a[2] = i;
489
490 for ( int j = i; j < p; ++j ) {
491 t.a[1] = j;
492 input_port<2>( dsyr2k_join ).try_put( std::make_pair( t, tile[i][j] ) );
493 }
494 }
495
496 g.wait_for_all();
497 }
498 };
499
500 /************************************************************
501 End data join graph based version of cholesky
502 ************************************************************/
503
504 /************************************************************
505 Begin dependence graph based version of cholesky
506 ************************************************************/
507
508 typedef tbb::flow::continue_node< tbb::flow::continue_msg > continue_type;
509 typedef continue_type * continue_ptr_type;
510
511 #if !__TBB_CPP11_LAMBDAS_PRESENT
512 // Using helper functor classes (instead of built-in C++ 11 lambda functions)
513 class call_dpotf2_functor
514 {
515 double ***tile;
516 int b, k;
517 public:
call_dpotf2_functor(double *** tile_,int b_,int k_)518 call_dpotf2_functor( double ***tile_, int b_, int k_ )
519 : tile(tile_), b(b_), k(k_) {}
520
operator ()(const tbb::flow::continue_msg &)521 void operator()( const tbb::flow::continue_msg & ) { call_dpotf2( tile, b, k ); }
522 };
523
524 class call_dtrsm_functor
525 {
526 double ***tile;
527 int b, k, j;
528 public:
call_dtrsm_functor(double *** tile_,int b_,int k_,int j_)529 call_dtrsm_functor( double ***tile_, int b_, int k_, int j_ )
530 : tile(tile_), b(b_), k(k_), j(j_) {}
531
operator ()(const tbb::flow::continue_msg &)532 void operator()( const tbb::flow::continue_msg & ) { call_dtrsm( tile, b, k, j ); }
533 };
534
535 class call_dsyr2k_functor
536 {
537 double ***tile;
538 int b, k, j, i;
539 public:
call_dsyr2k_functor(double *** tile_,int b_,int k_,int j_,int i_)540 call_dsyr2k_functor( double ***tile_, int b_, int k_, int j_, int i_ )
541 : tile(tile_), b(b_), k(k_), j(j_), i(i_) {}
542
operator ()(const tbb::flow::continue_msg &)543 void operator()( const tbb::flow::continue_msg & ) { call_dsyr2k( tile, b, k, j, i ); }
544 };
545
546 #endif // !__TBB_CPP11_LAMBDAS_PRESENT
547
548 class algorithm_depend : public algorithm
549 {
550 public:
algorithm_depend()551 algorithm_depend() : algorithm("depend_cholesky", true) {}
552
553 protected:
func(void * ptr,int n,int b)554 virtual void func( void * ptr, int n, int b ) {
555 double ***tile = (double ***)ptr;
556
557 const int p = n/b;
558 continue_ptr_type *c = new continue_ptr_type[p];
559 continue_ptr_type **t = new continue_ptr_type *[p];
560 continue_ptr_type ***u = new continue_ptr_type **[p];
561
562 tbb::flow::graph g;
563 for ( int k = p-1; k >= 0; --k ) {
564 c[k] = new continue_type( g,
565 #if __TBB_CPP11_LAMBDAS_PRESENT
566 [=]( const tbb::flow::continue_msg & ) { call_dpotf2( tile, b, k ); } );
567 #else
568 call_dpotf2_functor( tile, b, k ) );
569 #endif // __TBB_CPP11_LAMBDAS_PRESENT
570 t[k] = new continue_ptr_type[p];
571 u[k] = new continue_ptr_type *[p];
572
573 for ( int j = k+1; j < p; ++j ) {
574 t[k][j] = new continue_type( g,
575 #if __TBB_CPP11_LAMBDAS_PRESENT
576 [=]( const tbb::flow::continue_msg & ) { call_dtrsm( tile, b, k, j ); } );
577 #else
578 call_dtrsm_functor( tile, b, k, j ) );
579 #endif // __TBB_CPP11_LAMBDAS_PRESENT
580 make_edge( *c[k], *t[k][j] );
581 u[k][j] = new continue_ptr_type[p];
582
583 for ( int i = k+1; i <= j; ++i ) {
584 u[k][j][i] = new continue_type( g,
585 #if __TBB_CPP11_LAMBDAS_PRESENT
586 [=]( const tbb::flow::continue_msg & ) { call_dsyr2k( tile, b, k, j, i ); } );
587 #else
588 call_dsyr2k_functor( tile, b, k, j, i ) );
589 #endif // __TBB_CPP11_LAMBDAS_PRESENT
590
591 if ( k < p-2 && k+1 != j && k+1 != i ) {
592 make_edge( *u[k][j][i], *u[k+1][j][i] );
593 }
594
595 make_edge( *t[k][j], *u[k][j][i] );
596
597 if ( i != j ) {
598 make_edge( *t[k][i], *u[k][j][i] );
599 }
600
601 if ( k < p-2 && j > i && i == k+1 ) {
602 make_edge( *u[k][j][i], *t[i][j] );
603 }
604 }
605 }
606
607 if ( k != p-1 ) {
608 make_edge( *u[k][k+1][k+1], *c[k+1] );
609 }
610 }
611
612 c[0]->try_put( tbb::flow::continue_msg() );
613 g.wait_for_all();
614 }
615 }; // class algorithm_depend
616
617 /************************************************************
618 End dependence graph based version of cholesky
619 ************************************************************/
620
process_args(int argc,char * argv[])621 bool process_args( int argc, char *argv[] ) {
622 utility::parse_cli_arguments( argc, argv,
623 utility::cli_argument_pack()
624 //"-h" option for displaying help is present implicitly
625 .positional_arg( g_n, "size", "the row/column size of NxN matrix (size <= 46000)" )
626 .positional_arg( g_b, "blocksize", "the block size; size must be a multiple of the blocksize" )
627 .positional_arg( g_num_trials, "num_trials", "the number of times to run each algorithm" )
628 .positional_arg( g_output_prefix, "output_prefix",
629 "if provided the prefix will be preappended to output files:\n"
630 " output_prefix_posdef.txt\n"
631 " output_prefix_X.txt; where X is the algorithm used\n"
632 " if output_prefix is not provided, no output will be written" )
633 .positional_arg( g_alg_name, "algorithm", "name of the used algorithm - can be dpotrf, crout, depend or join" )
634 .positional_arg( g_num_tbb_threads, "num_tbb_threads", "number of started TBB threads" )
635
636 .arg( g_input_file_name, "input_file", "if provided it will be read to get the input matrix" )
637 .arg( g_benchmark_run, "-x", "skips all validation" )
638 );
639
640 if ( g_n > 46000 ) {
641 printf( "ERROR: invalid 'size' value (must be less or equal 46000): %d\n", g_n );
642 return false;
643 }
644
645 if ( g_n%g_b != 0 ) {
646 printf( "ERROR: size %d must be a multiple of the blocksize %d\n", g_n, g_b );
647 return false;
648 }
649
650 if ( g_n/g_b > 256 ) {
651 // Because tile index size is 1 byte only in tag_t type
652 printf( "ERROR: size / blocksize must be less or equal 256, but %d / %d = %d\n", g_n, g_b, g_n/g_b );
653 return false;
654 }
655
656 if ( g_b == -1 || (g_n == -1 && g_input_file_name == NULL) ) {
657 return false;
658 }
659
660 return true;
661 }
662
main(int argc,char * argv[])663 int main(int argc, char *argv[]) {
664 g_num_tbb_threads = utility::get_default_num_threads();
665 typedef std::map< std::string, algorithm * > algmap_t;
666 algmap_t algmap;
667
668 // Init algorithms
669 algmap.insert(std::pair<std::string, algorithm *>("dpotrf", new algorithm_dpotrf));
670 algmap.insert(std::pair<std::string, algorithm *>("crout", new algorithm_crout));
671 algmap.insert(std::pair<std::string, algorithm *>("depend", new algorithm_depend));
672 algmap.insert(std::pair<std::string, algorithm *>("join", new algorithm_join));
673
674 if ( !process_args( argc, argv ) ) {
675 printf( "ERROR: Invalid arguments. Run: %s -h\n", argv[0] );
676 exit( 1 );
677 }
678
679 tbb::global_control c(tbb::global_control::max_allowed_parallelism, g_num_tbb_threads);
680 double *A = NULL;
681
682 // Read input matrix
683 matrix_init( A, g_n, g_input_file_name );
684
685 // Write input matrix if output_prefix is set and we didn't read from a file
686 if ( !g_input_file_name && g_output_prefix ) {
687 std::string s( g_output_prefix );
688 s += "_posdef.txt";
689 matrix_write( A, g_n, s.c_str() );
690 }
691
692 if ( g_alg_name.empty() ) {
693 for ( algmap_t::iterator i = algmap.begin(); i != algmap.end(); ++i ) {
694 algorithm* const alg = i->second;
695 (*alg)( A, g_n, g_b, g_num_trials );
696 }
697 }
698 else {
699 algmap_t::iterator alg_iter = algmap.find(g_alg_name);
700
701 if ( alg_iter != algmap.end() ) {
702 algorithm* const alg = alg_iter->second;
703 (*alg)( A, g_n, g_b, g_num_trials );
704 }
705 else {
706 printf( "ERROR: Invalid algorithm name: %s\n", g_alg_name.c_str() );
707 exit( 2 );
708 }
709 }
710
711 free( A );
712 return 0;
713 }
714