1 /*
2     Copyright (c) 2005-2021 Intel Corporation
3 
4     Licensed under the Apache License, Version 2.0 (the "License");
5     you may not use this file except in compliance with the License.
6     You may obtain a copy of the License at
7 
8         http://www.apache.org/licenses/LICENSE-2.0
9 
10     Unless required by applicable law or agreed to in writing, software
11     distributed under the License is distributed on an "AS IS" BASIS,
12     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13     See the License for the specific language governing permissions and
14     limitations under the License.
15 */
16 
17 #include <string>
18 #include <cstring>
19 #include <cstdio>
20 #include <cmath>
21 
22 #include <vector>
23 #include <map>
24 #include <tuple>
25 
26 #include "mkl_lapack.h"
27 #include "mkl.h"
28 
29 #include "oneapi/tbb/flow_graph.h"
30 #include "oneapi/tbb/tick_count.h"
31 #include "oneapi/tbb/global_control.h"
32 
33 // Application command line arguments parsing
34 #include "common/utility/utility.hpp"
35 #include "common/utility/get_default_num_threads.hpp"
36 
37 /************************************************************
38  FORWARD DECLARATIONS
39 ************************************************************/
40 
41 /**********************************************
42  Read or generate a positive-definite matrix
43  -- reads from file if fname != nullptr
44      -- sets n to matrix size
45      -- allocates and reads values in to A
46  -- otherwise generates a matrix
47      -- uses n to determine size
48      -- allocates and generates values in to A
49 **********************************************/
50 void matrix_init(double *&A, int &n, const char *fname);
51 
52 /**********************************************
53  Writes a lower triangular matrix to a file
54  -- first line of file is n
55  -- subsequently 1 row per line
56 **********************************************/
57 void matrix_write(double *A, int n, const char *fname, bool is_triangular = false);
58 
59 /************************************************************
60  GLOBAL VARIABLES
61 ************************************************************/
62 bool g_benchmark_run = false;
63 int g_n = -1, g_b = -1, g_num_trials = 1;
64 char *g_input_file_name = nullptr;
65 char *g_output_prefix = nullptr;
66 std::string g_alg_name;
67 int g_num_tbb_threads;
68 
69 // Creates tiled array
create_tile_array(double * A,int n,int b)70 static double ***create_tile_array(double *A, int n, int b) {
71     const int p = n / b;
72     double ***tile = (double ***)calloc(sizeof(double **), p);
73 
74     for (int j = 0; j < p; ++j) {
75         tile[j] = (double **)calloc(sizeof(double *), p);
76     }
77 
78     for (int j = 0; j < p; ++j) {
79         for (int i = 0; i < p; ++i) {
80             double *temp_block = (double *)calloc(sizeof(double), b * b);
81 
82             for (int A_j = j * b, T_j = 0; T_j < b; ++A_j, ++T_j) {
83                 for (int A_i = i * b, T_i = 0; T_i < b; ++A_i, ++T_i) {
84                     temp_block[T_j * b + T_i] = A[A_j * n + A_i];
85                 }
86             }
87 
88             tile[j][i] = temp_block;
89         }
90     }
91     return tile;
92 }
93 
collapse_tile_array(double *** tile,double * A,int n,int b)94 static void collapse_tile_array(double ***tile, double *A, int n, int b) {
95     const int p = n / b;
96 
97     for (int j = 0; j < p; ++j) {
98         for (int i = 0; i < p; ++i) {
99             double *temp_block = tile[j][i];
100 
101             for (int A_j = j * b, T_j = 0; T_j < b; ++A_j, ++T_j) {
102                 for (int A_i = i * b, T_i = 0; T_i < b; ++A_i, ++T_i) {
103                     A[A_j * n + A_i] = temp_block[T_j * b + T_i];
104                 }
105             }
106 
107             free(temp_block);
108             tile[j][i] = nullptr;
109         }
110 
111         free(tile[j]);
112     }
113 
114     free(tile);
115 }
116 
117 /************************************************************
118  Helper base class: algorithm
119 ************************************************************/
120 class algorithm {
121     std::string name;
122     bool is_tiled;
123 
check_if_valid(double * A0,double * C,double * A,int n)124     bool check_if_valid(double *A0, double *C, double *A, int n) {
125         char transa = 'n', transb = 't';
126         double alpha = 1;
127         double beta = 0;
128 
129         for (int i = 0; i < n; ++i) {
130             for (int j = i + 1; j < n; ++j) {
131                 A0[j * n + i] = 0.;
132             }
133         }
134 
135         dgemm(&transa, &transb, &n, &n, &n, &alpha, A0, &n, A0, &n, &beta, C, &n);
136 
137         for (int j = 0; j < n; ++j) {
138             for (int i = 0; i < n; ++i) {
139                 const double epsilon = std::abs(A[j * n + i] * 0.1);
140 
141                 if (std::abs(C[j * n + i] - A[j * n + i]) > epsilon) {
142                     printf("ERROR: %s did not validate at C(%d,%d) = %lf != A(%d,%d) = %lf\n",
143                            name.c_str(),
144                            i,
145                            j,
146                            C[j * n + i],
147                            i,
148                            j,
149                            A[j * n + i]);
150                     printf("ERROR: %g; %g < %g < %g\n",
151                            epsilon,
152                            A[j * n + i] - epsilon,
153                            C[j * n + i],
154                            A[j * n + i] + epsilon);
155                     return false;
156                 }
157             }
158         }
159         return true;
160     }
161 
162 public:
algorithm(const std::string & alg_name,bool t)163     algorithm(const std::string &alg_name, bool t) : name(alg_name), is_tiled(t) {}
164 
operator ()(double * A,int n,int b,int trials)165     double operator()(double *A, int n, int b, int trials) {
166         oneapi::tbb::tick_count t0, t1;
167         double elapsed_time = 0.0;
168         double *A0 = (double *)calloc(sizeof(double), n * n);
169         double *C = (double *)calloc(sizeof(double), n * n);
170 
171         for (int i = 0; i < trials + 1; ++i) {
172             if (is_tiled) {
173                 double ***tile = create_tile_array(A, n, b);
174                 t0 = oneapi::tbb::tick_count::now();
175                 func(tile, n, b);
176                 t1 = oneapi::tbb::tick_count::now();
177 
178                 collapse_tile_array(tile, A0, n, b);
179             }
180             else {
181                 memcpy(A0, A, sizeof(double) * n * n);
182                 t0 = oneapi::tbb::tick_count::now();
183                 func(A0, n, b);
184                 t1 = oneapi::tbb::tick_count::now();
185             }
186 
187             if (i)
188                 elapsed_time += (t1 - t0).seconds();
189 
190             if (!g_benchmark_run && !check_if_valid(A0, C, A, n)) {
191                 if (g_output_prefix) {
192                     std::string s(g_output_prefix);
193                     s += "_" + name + ".txt";
194                     matrix_write(A0, g_n, s.c_str(), true);
195                     free(A0);
196                     free(C);
197                     return 0.;
198                 }
199             }
200         }
201 
202         if (g_output_prefix) {
203             std::string s(g_output_prefix);
204             s += "_" + name + ".txt";
205             matrix_write(A0, g_n, s.c_str(), true);
206         }
207 
208         printf("%s %d %d %d %d %lf %lf\n",
209                name.c_str(),
210                g_num_tbb_threads,
211                trials,
212                n,
213                b,
214                elapsed_time,
215                elapsed_time / trials);
216         free(A0);
217         free(C);
218         return elapsed_time;
219     }
220 
221 protected:
222     // Main algorithm body function must be defined in any direved class
223     virtual void func(void *ptr, int n, int b) = 0;
224 };
225 
226 /***********************************************************/
227 
call_dpotf2(double *** tile,int b,int k)228 static void call_dpotf2(double ***tile, int b, int k) {
229     double *A_block = tile[k][k];
230     char uplo = 'l';
231     int info = 0;
232     dpotf2(&uplo, &b, A_block, &b, &info);
233     return;
234 }
235 
call_dtrsm(double *** tile,int b,int k,int j)236 static void call_dtrsm(double ***tile, int b, int k, int j) {
237     double *A_block = tile[k][j];
238     double *L_block = tile[k][k];
239     char uplo = 'l', side = 'r', transa = 't', diag = 'n';
240     double alpha = 1;
241     dtrsm(&side, &uplo, &transa, &diag, &b, &b, &alpha, L_block, &b, A_block, &b);
242     return;
243 }
244 
call_dsyr2k(double *** tile,int b,int k,int j,int i)245 static void call_dsyr2k(double ***tile, int b, int k, int j, int i) {
246     double *A_block = tile[i][j];
247     char transa = 'n', transb = 't';
248     char uplo = 'l';
249     double alpha = -1;
250     double beta = 1;
251 
252     if (i == j) { // Diagonal block
253         double *L_block = tile[k][i];
254         dsyrk(&uplo, &transa, &b, &b, &alpha, L_block, &b, &beta, A_block, &b);
255     }
256     else { // Non-diagonal block
257         double *L2_block = tile[k][i];
258         double *L1_block = tile[k][j];
259         dgemm(&transa, &transb, &b, &b, &b, &alpha, L1_block, &b, L2_block, &b, &beta, A_block, &b);
260     }
261     return;
262 }
263 
264 class algorithm_crout : public algorithm {
265 public:
algorithm_crout()266     algorithm_crout() : algorithm("crout_cholesky", true) {}
267 
268 protected:
func(void * ptr,int n,int b)269     virtual void func(void *ptr, int n, int b) {
270         double ***tile = (double ***)ptr;
271         const int p = n / b;
272 
273         for (int k = 0; k < p; ++k) {
274             call_dpotf2(tile, b, k);
275 
276             for (int j = k + 1; j < p; ++j) {
277                 call_dtrsm(tile, b, k, j);
278 
279                 for (int i = k + 1; i <= j; ++i) {
280                     call_dsyr2k(tile, b, k, j, i);
281                 }
282             }
283         }
284     }
285 };
286 
287 class algorithm_dpotrf : public algorithm {
288 public:
algorithm_dpotrf()289     algorithm_dpotrf() : algorithm("dpotrf_cholesky", false) {}
290 
291 protected:
func(void * ptr,int n,int)292     virtual void func(void *ptr, int n, int /* b */) {
293         double *A = (double *)ptr;
294         int lda = n;
295         int info = 0;
296         char uplo = 'l';
297         dpotrf(&uplo, &n, A, &lda, &info);
298     }
299 };
300 
301 /************************************************************
302  Begin data join graph based version of cholesky
303 ************************************************************/
304 
305 typedef union {
306     char a[4];
307     std::size_t tag;
308 } tag_t;
309 
310 typedef double *tile_t;
311 
312 typedef std::pair<tag_t, tile_t> tagged_tile_t;
313 typedef std::tuple<tagged_tile_t> t1_t;
314 typedef std::tuple<tagged_tile_t, tagged_tile_t> t2_t;
315 typedef std::tuple<tagged_tile_t, tagged_tile_t, tagged_tile_t> t3_t;
316 
317 typedef oneapi::tbb::flow::multifunction_node<tagged_tile_t, t1_t> dpotf2_node_t;
318 typedef oneapi::tbb::flow::multifunction_node<t2_t, t2_t> dtrsm_node_t;
319 typedef oneapi::tbb::flow::multifunction_node<t3_t, t3_t> dsyr2k_node_t;
320 
321 typedef oneapi::tbb::flow::join_node<t2_t, oneapi::tbb::flow::tag_matching> dtrsm_join_t;
322 typedef oneapi::tbb::flow::join_node<t3_t, oneapi::tbb::flow::tag_matching> dsyr2k_join_t;
323 
324 class dpotf2_body {
325     int p;
326     int b;
327 
328 public:
dpotf2_body(int p_,int b_)329     dpotf2_body(int p_, int b_) : p(p_), b(b_) {}
330 
operator ()(const tagged_tile_t & in,dpotf2_node_t::output_ports_type & ports)331     void operator()(const tagged_tile_t &in, dpotf2_node_t::output_ports_type &ports) {
332         int k = in.first.a[0];
333         tile_t A_block = in.second;
334         tag_t t;
335         t.tag = 0;
336         t.a[0] = k;
337         char uplo = 'l';
338         int info = 0;
339         dpotf2(&uplo, &b, A_block, &b, &info);
340 
341         // Send to dtrsms in same column
342         // k == k  j == k
343         t.a[2] = k;
344         for (int j = k + 1; j < p; ++j) {
345             t.a[1] = j;
346             std::get<0>(ports).try_put(std::make_pair(t, A_block));
347         }
348     }
349 };
350 
351 class dtrsm_body {
352     int p;
353     int b;
354 
355 public:
dtrsm_body(int p_,int b_)356     dtrsm_body(int p_, int b_) : p(p_), b(b_) {}
357 
operator ()(const t2_t & in,dtrsm_node_t::output_ports_type & ports)358     void operator()(const t2_t &in, dtrsm_node_t::output_ports_type &ports) {
359         tagged_tile_t in0 = std::get<0>(in);
360         tagged_tile_t in1 = std::get<1>(in);
361         int k = in0.first.a[0];
362         int j = in0.first.a[1];
363         tile_t L_block = in0.second;
364         tile_t A_block = in1.second;
365         tag_t t;
366         t.tag = 0;
367         t.a[0] = k;
368         char uplo = 'l', side = 'r', transa = 't', diag = 'n';
369         double alpha = 1;
370         dtrsm(&side, &uplo, &transa, &diag, &b, &b, &alpha, L_block, &b, A_block, &b);
371 
372         // Send to rest of my row
373         t.a[1] = j;
374         for (int i = k + 1; i <= j; ++i) {
375             t.a[2] = i;
376             std::get<0>(ports).try_put(std::make_pair(t, A_block));
377         }
378 
379         // Send to transposed row
380         t.a[2] = j;
381         for (int i = j; i < p; ++i) {
382             t.a[1] = i;
383             std::get<1>(ports).try_put(std::make_pair(t, A_block));
384         }
385     }
386 };
387 
388 class dsyr2k_body {
389     int p;
390     int b;
391 
392 public:
dsyr2k_body(int p_,int b_)393     dsyr2k_body(int p_, int b_) : p(p_), b(b_) {}
394 
operator ()(const t3_t & in,dsyr2k_node_t::output_ports_type & ports)395     void operator()(const t3_t &in, dsyr2k_node_t::output_ports_type &ports) {
396         tag_t t;
397         t.tag = 0;
398         char transa = 'n', transb = 't';
399         char uplo = 'l';
400         double alpha = -1;
401         double beta = 1;
402 
403         tagged_tile_t in0 = std::get<0>(in);
404         tagged_tile_t in1 = std::get<1>(in);
405         tagged_tile_t in2 = std::get<2>(in);
406         int k = in2.first.a[0];
407         int j = in2.first.a[1];
408         int i = in2.first.a[2];
409 
410         tile_t A_block = in2.second;
411         if (i == j) { // Diagonal block
412             tile_t L_block = in0.second;
413             dsyrk(&uplo, &transa, &b, &b, &alpha, L_block, &b, &beta, A_block, &b);
414         }
415         else { // Non-diagonal block
416             tile_t L1_block = in0.second;
417             tile_t L2_block = in1.second;
418             dgemm(&transa,
419                   &transb,
420                   &b,
421                   &b,
422                   &b,
423                   &alpha,
424                   L1_block,
425                   &b,
426                   L2_block,
427                   &b,
428                   &beta,
429                   A_block,
430                   &b);
431         }
432 
433         // All outputs flow to next step
434         t.a[0] = k + 1;
435         t.a[1] = j;
436         t.a[2] = i;
437         if (k != p - 1 && j == k + 1 && i == k + 1) {
438             std::get<0>(ports).try_put(std::make_pair(t, A_block));
439         }
440 
441         if (k < p - 2) {
442             if (i == k + 1 && j > i) {
443                 t.a[0] = k + 1;
444                 t.a[1] = j;
445                 std::get<1>(ports).try_put(std::make_pair(t, A_block));
446             }
447 
448             if (j != k + 1 && i != k + 1) {
449                 t.a[0] = k + 1;
450                 t.a[1] = j;
451                 t.a[2] = i;
452                 std::get<2>(ports).try_put(std::make_pair(t, A_block));
453             }
454         }
455     }
456 };
457 
458 struct tagged_tile_to_size_t {
operator ()tagged_tile_to_size_t459     std::size_t operator()(const tagged_tile_t &t) {
460         return t.first.tag;
461     }
462 };
463 
464 class algorithm_join : public algorithm {
465 public:
algorithm_join()466     algorithm_join() : algorithm("data_join_cholesky", true) {}
467 
468 protected:
func(void * ptr,int n,int b)469     virtual void func(void *ptr, int n, int b) {
470         using oneapi::tbb::flow::unlimited;
471         using oneapi::tbb::flow::output_port;
472         using oneapi::tbb::flow::input_port;
473 
474         double ***tile = (double ***)ptr;
475         const int p = n / b;
476         oneapi::tbb::flow::graph g;
477 
478         dpotf2_node_t dpotf2_node(g, unlimited, dpotf2_body(p, b));
479         dtrsm_node_t dtrsm_node(g, unlimited, dtrsm_body(p, b));
480         dsyr2k_node_t dsyr2k_node(g, unlimited, dsyr2k_body(p, b));
481         dtrsm_join_t dtrsm_join(g, tagged_tile_to_size_t(), tagged_tile_to_size_t());
482         dsyr2k_join_t dsyr2k_join(
483             g, tagged_tile_to_size_t(), tagged_tile_to_size_t(), tagged_tile_to_size_t());
484 
485         make_edge(output_port<0>(dsyr2k_node), dpotf2_node);
486 
487         make_edge(output_port<0>(dpotf2_node), input_port<0>(dtrsm_join));
488         make_edge(output_port<1>(dsyr2k_node), input_port<1>(dtrsm_join));
489         make_edge(dtrsm_join, dtrsm_node);
490 
491         make_edge(output_port<0>(dtrsm_node), input_port<0>(dsyr2k_join));
492         make_edge(output_port<1>(dtrsm_node), input_port<1>(dsyr2k_join));
493         make_edge(output_port<2>(dsyr2k_node), input_port<2>(dsyr2k_join));
494         make_edge(dsyr2k_join, dsyr2k_node);
495 
496         // Now we need to send out the tiles to their first nodes
497         tag_t t;
498         t.tag = 0;
499         t.a[0] = 0;
500         t.a[1] = 0;
501         t.a[2] = 0;
502 
503         // Send to feedback input of first dpotf2
504         // k == 0, j == 0, i == 0
505         dpotf2_node.try_put(std::make_pair(t, tile[0][0]));
506 
507         // Send to feedback input (port 1) of each dtrsm
508         // k == 0, j == 1..p-1
509         for (int j = 1; j < p; ++j) {
510             t.a[1] = j;
511             input_port<1>(dtrsm_join).try_put(std::make_pair(t, tile[0][j]));
512         }
513 
514         // Send to feedback input (port 2) of each dsyr2k
515         // k == 0
516         for (int i = 1; i < p; ++i) {
517             t.a[2] = i;
518 
519             for (int j = i; j < p; ++j) {
520                 t.a[1] = j;
521                 input_port<2>(dsyr2k_join).try_put(std::make_pair(t, tile[i][j]));
522             }
523         }
524 
525         g.wait_for_all();
526     }
527 };
528 
529 /************************************************************
530  End data join graph based version of cholesky
531 ************************************************************/
532 
533 /************************************************************
534  Begin dependence graph based version of cholesky
535 ************************************************************/
536 
537 typedef oneapi::tbb::flow::continue_node<oneapi::tbb::flow::continue_msg> continue_type;
538 typedef continue_type *continue_ptr_type;
539 
540 class algorithm_depend : public algorithm {
541 public:
algorithm_depend()542     algorithm_depend() : algorithm("depend_cholesky", true) {}
543 
544 protected:
func(void * ptr,int n,int b)545     virtual void func(void *ptr, int n, int b) {
546         double ***tile = (double ***)ptr;
547 
548         const int p = n / b;
549         continue_ptr_type *c = new continue_ptr_type[p];
550         continue_ptr_type **t = new continue_ptr_type *[p];
551         continue_ptr_type ***u = new continue_ptr_type **[p];
552 
553         oneapi::tbb::flow::graph g;
554         for (int k = p - 1; k >= 0; --k) {
555             c[k] = new continue_type(g, [=](const oneapi::tbb::flow::continue_msg &) {
556                 call_dpotf2(tile, b, k);
557             });
558             t[k] = new continue_ptr_type[p];
559             u[k] = new continue_ptr_type *[p];
560 
561             for (int j = k + 1; j < p; ++j) {
562                 t[k][j] = new continue_type(g, [=](const oneapi::tbb::flow::continue_msg &) {
563                     call_dtrsm(tile, b, k, j);
564                 });
565                 make_edge(*c[k], *t[k][j]);
566                 u[k][j] = new continue_ptr_type[p];
567 
568                 for (int i = k + 1; i <= j; ++i) {
569                     u[k][j][i] = new continue_type(g, [=](const oneapi::tbb::flow::continue_msg &) {
570                         call_dsyr2k(tile, b, k, j, i);
571                     });
572 
573                     if (k < p - 2 && k + 1 != j && k + 1 != i) {
574                         make_edge(*u[k][j][i], *u[k + 1][j][i]);
575                     }
576 
577                     make_edge(*t[k][j], *u[k][j][i]);
578 
579                     if (i != j) {
580                         make_edge(*t[k][i], *u[k][j][i]);
581                     }
582 
583                     if (k < p - 2 && j > i && i == k + 1) {
584                         make_edge(*u[k][j][i], *t[i][j]);
585                     }
586                 }
587             }
588 
589             if (k != p - 1) {
590                 make_edge(*u[k][k + 1][k + 1], *c[k + 1]);
591             }
592         }
593 
594         c[0]->try_put(oneapi::tbb::flow::continue_msg());
595         g.wait_for_all();
596 
597         for (int k = p - 1; k >= 0; --k) {
598             for (int j = k + 1; j < p; ++j) {
599                 for (int i = k + 1; i <= j; ++i) {
600                     delete u[k][j][i];
601                 }
602                 delete t[k][j];
603                 delete[] u[k][j];
604             }
605             delete c[k];
606             delete[] t[k];
607             delete[] u[k];
608         }
609         delete[] c;
610         delete[] t;
611         delete[] u;
612     }
613 }; // class algorithm_depend
614 
615 /************************************************************
616  End dependence graph based version of cholesky
617 ************************************************************/
618 
process_args(int argc,char * argv[])619 bool process_args(int argc, char *argv[]) {
620     utility::parse_cli_arguments(
621         argc,
622         argv,
623         utility::cli_argument_pack()
624             //"-h" option for displaying help is present implicitly
625             .positional_arg(g_n, "size", "the row/column size of NxN matrix (size <= 46000)")
626             .positional_arg(
627                 g_b, "blocksize", "the block size; size must be a multiple of the blocksize")
628             .positional_arg(g_num_trials, "num_trials", "the number of times to run each algorithm")
629             .positional_arg(
630                 g_output_prefix,
631                 "output_prefix",
632                 "if provided the prefix will be preappended to output files:\n"
633                 "                     output_prefix_posdef.txt\n"
634                 "                     output_prefix_X.txt; where X is the algorithm used\n"
635                 "                 if output_prefix is not provided, no output will be written")
636             .positional_arg(g_alg_name,
637                             "algorithm",
638                             "name of the used algorithm - can be dpotrf, crout, depend or join")
639             .positional_arg(g_num_tbb_threads, "num_tbb_threads", "number of started TBB threads")
640 
641             .arg(g_input_file_name,
642                  "input_file",
643                  "if provided it will be read to get the input matrix")
644             .arg(g_benchmark_run, "-x", "skips all validation"));
645 
646     if (g_n > 46000) {
647         printf("ERROR: invalid 'size' value (must be less or equal 46000): %d\n", g_n);
648         return false;
649     }
650 
651     if (g_n % g_b != 0) {
652         printf("ERROR: size %d must be a multiple of the blocksize %d\n", g_n, g_b);
653         return false;
654     }
655 
656     if (g_n / g_b > 256) {
657         // Because tile index size is 1 byte only in tag_t type
658         printf("ERROR: size / blocksize must be less or equal 256, but %d / %d = %d\n",
659                g_n,
660                g_b,
661                g_n / g_b);
662         return false;
663     }
664 
665     if (g_b == -1 || (g_n == -1 && g_input_file_name == nullptr)) {
666         return false;
667     }
668 
669     return true;
670 }
671 
main(int argc,char * argv[])672 int main(int argc, char *argv[]) {
673     g_num_tbb_threads = utility::get_default_num_threads();
674     typedef std::map<std::string, algorithm *> algmap_t;
675     algmap_t algmap;
676 
677     // Init algorithms
678     algmap.insert(std::pair<std::string, algorithm *>("dpotrf", new algorithm_dpotrf));
679     algmap.insert(std::pair<std::string, algorithm *>("crout", new algorithm_crout));
680     algmap.insert(std::pair<std::string, algorithm *>("depend", new algorithm_depend));
681     algmap.insert(std::pair<std::string, algorithm *>("join", new algorithm_join));
682 
683     if (!process_args(argc, argv)) {
684         printf("ERROR: Invalid arguments. Run: %s -h\n", argv[0]);
685         return -1;
686     }
687 
688     oneapi::tbb::global_control c(oneapi::tbb::global_control::max_allowed_parallelism,
689                                   g_num_tbb_threads);
690     double *A = nullptr;
691 
692     // Read input matrix
693     matrix_init(A, g_n, g_input_file_name);
694 
695     // Write input matrix if output_prefix is set and we didn't read from a file
696     if (!g_input_file_name && g_output_prefix) {
697         std::string s(g_output_prefix);
698         s += "_posdef.txt";
699         matrix_write(A, g_n, s.c_str());
700     }
701 
702     if (g_alg_name.empty()) {
703         for (algmap_t::iterator i = algmap.begin(); i != algmap.end(); ++i) {
704             algorithm *const alg = i->second;
705             (*alg)(A, g_n, g_b, g_num_trials);
706         }
707     }
708     else {
709         algmap_t::iterator alg_iter = algmap.find(g_alg_name);
710 
711         if (alg_iter != algmap.end()) {
712             algorithm *const alg = alg_iter->second;
713             (*alg)(A, g_n, g_b, g_num_trials);
714         }
715         else {
716             printf("ERROR: Invalid algorithm name: %s\n", g_alg_name.c_str());
717             return -1;
718         }
719     }
720 
721     free(A);
722     return 0;
723 }
724