1 /*! \file   MM-DissectionSolver.cpp
2     \brief  test rouinte of dissection solver reading Matrix Market format
3     \author Atsushi Suzuki, Laboratoire Jacques-Louis Lions
4     \date   Jun. 20th 2014
5     \date   Jul. 12th 2015
6     \date   Feb. 29th 2016
7 */
8 
9 // This file is part of Dissection
10 //
11 // Dissection is free software: you can redistribute it and/or modify
12 // it under the terms of the GNU General Public License as published by
13 // the Free Software Foundation, either version 3 of the License, or
14 // (at your option) any later version.
15 //
16 // Linking Dissection statically or dynamically with other modules is making
17 // a combined work based on Disssection. Thus, the terms and conditions of
18 // the GNU General Public License cover the whole combination.
19 //
20 // As a special exception, the copyright holders of Dissection give you
21 // permission to combine Dissection program with free software programs or
22 // libraries that are released under the GNU LGPL and with independent modules
23 // that communicate with Dissection solely through the Dissection-fortran
24 // interface. You may copy and distribute such a system following the terms of
25 // the GNU GPL for Dissection and the licenses of the other code concerned,
26 // provided that you include the source code of that other code when and as
27 // the GNU GPL requires distribution of source code and provided that you do
28 // not modify the Dissection-fortran interface.
29 //
30 // Note that people who make modified versions of Dissection are not obligated
31 // to grant this special exception for their modified versions; it is their
32 // choice whether to do so. The GNU General Public License gives permission to
33 // release a modified version without this exception; this exception also makes
34 // it possible to release a modified version which carries forward this
35 // exception. If you modify the Dissection-fortran interface, this exception
36 // does not apply to your modified version of Dissection, and you must remove
37 // this exception when you distribute your modified version.
38 //
39 // This exception is an additional permission under section 7 of the GNU
40 // General Public License, version 3 ("GPLv3")
41 //
42 // Dissection is distributed in the hope that it will be useful,
43 // but WITHOUT ANY WARRANTY; without even the implied warranty of
44 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
45 // GNU General Public License for more details.
46 //
47 // You should have received a copy of the GNU General Public License
48 // along with Dissection.  If not, see <http://www.gnu.org/licenses/>.
49 //
50 
51 #include <cstdio>
52 #include <cstdlib>
53 #include <cstring>
54 #include <fcntl.h>
55 #include <unistd.h>
56 #include <math.h>
57 
58 #include "Driver/DissectionSolver.hpp"
59 #ifdef BLAS_MKL
60 #include <mkl_service.h>
61 #endif
62 #include <pthread.h>
63 #include <iostream>
64 #include <sstream>
65 #include <fstream>
66 #include <complex>
67 #include "Compiler/arithmetic.hpp"
68 
69 using namespace std;
70 
71 static int _stat = (-1);
72 
thread_child(void * arg)73 void *thread_child(void *arg)
74 {
75   char buf[256];
76   int *pid = (int *)arg;
77   unsigned int mem_tmp, mem_min, mem_max;
78   double avg_mem;
79   avg_mem = 0.0;
80   mem_min = (1U << 31) - 1;
81   mem_max = 0U;
82   int stat0, stat1;
83   stat0 = _stat;
84   unsigned int count = 0U;
85 //  fprintf(stderr, "thread_child forked\n");
86   while(_stat != 0) {
87     stat1 = _stat;
88      if (stat1 == 1) {
89        sprintf(buf, "/proc/%d/statm", *pid);
90        ifstream fin(buf);
91        fin >> mem_tmp;
92        fin.close();
93        if (mem_tmp > mem_max) {
94 	 mem_max = mem_tmp;
95        }
96        if (mem_tmp < mem_min) {
97 	 mem_min = mem_tmp;
98        }
99        avg_mem += (double)mem_tmp;
100        count++;
101      }
102      if ((stat1 == (-1)) && (stat0 == 1)) {
103        fprintf(stderr,
104 	       "used memory :min: %14.8e  max: %14.8e avg: %14.8e count: %d\n",
105 	       (double)mem_min * 4.0 / (1024.0 * 1024.0),
106 	       (double)mem_max * 4.0 / (1024.0 * 1024.0),
107 	       (avg_mem / (double)count) * 4.0 / (1024.0 * 1024.0),
108 	       count);
109        count = 0U;
110        avg_mem = 0.0;
111        mem_min = (1U << 31) - 1;
112        mem_max = 0U;
113      }
114      stat0 = stat1;
115      usleep(1000);
116   }
117 // fprintf(stderr, "thread_child join\n count = %ld\n", count);
118   pthread_exit(arg);
119 
120   return (void *)NULL;
121 }
122 
123 template<typename T>
generate_CSR(std::list<int> * ind_cols_tmp,std::list<T> * val_tmp,int nrow,int * nnz,int * mask,int * old2new,int * irow,int * jcol,T * val,bool symmetrize)124 void generate_CSR(std::list<int>* ind_cols_tmp, std::list<T>* val_tmp,
125 		  int nrow, int *nnz, int *mask, int *old2new,
126 		  int *irow, int *jcol, T* val, bool symmetrize)
127 {
128   const T zero(0.0);
129   //  ind_cols_tmp = new std::list<int>[nrow];
130   //  val_tmp = new std::list<T>[nrow];
131   int nnz1 = *nnz;
132   for (int i = 0; i < *nnz; i++) {
133     const int i0 = irow[i];
134     const int j0 = jcol[i];
135     const int ii = old2new[i0];
136     const int jj = old2new[j0];
137     if ((mask[i0] != 1) || (mask[j0] != 1)) {
138       //      fprintf(stderr, "%d %d\n", i0, j0);
139       nnz1--;
140       continue;
141     }
142     //    fprintf(stderr, "%d %d -> %d %d \n", i0, j0, ii, jj);
143     if (ind_cols_tmp[ii].empty()) {
144       ind_cols_tmp[ii].push_back(jj);
145       val_tmp[ii].push_back(val[i]);
146     }
147     else {
148       if (ind_cols_tmp[ii].back() < jj) {
149 	ind_cols_tmp[ii].push_back(jj);
150 	val_tmp[ii].push_back(val[i]);
151       }
152       else {
153 	typename std::list<T>::iterator iv = val_tmp[ii].begin();
154 	std::list<int>::iterator it = ind_cols_tmp[ii].begin();
155 	for ( ; it != ind_cols_tmp[ii].end(); ++it, ++iv) {
156 	  if (*it == jj) {
157 	    fprintf(stderr, "already exits? (%d %d)\n", ii, jj);
158 	      break;
159 	  }
160 	  if (*it > jj) {
161 	    ind_cols_tmp[ii].insert(it, jj);
162 	    val_tmp[ii].insert(iv, val[i]);
163 	    break;
164 	  }
165 	}
166       }
167     }
168   }
169   // symmetrize
170   if (symmetrize) {
171     for (int i = 0; i < nrow; i++) {
172       std::list<int>::iterator jt = ind_cols_tmp[i].begin();
173       for ( ; jt != ind_cols_tmp[i].end(); ++jt) {
174 	const int jj = (*jt);
175 	bool flag = false;
176 	std::list<int>::iterator it = ind_cols_tmp[jj].begin();
177 	for ( ; it != ind_cols_tmp[jj].end(); ++it) {
178 	  if ((*it) == i) {
179 	    flag = true;
180 	    //	    fprintf(stderr, "%d %d symmetric position found\n", i, jj);
181 	    break;
182 	  }
183 	}
184 	if (!flag) {
185 	  if (ind_cols_tmp[jj].back() < i) {
186 	    ind_cols_tmp[jj].push_back(i);
187 	    val_tmp[jj].push_back(zero);
188 	    fprintf(stderr, "%d %d added for symmetry\n", i, jj);
189 	    nnz1++;
190 	  }
191 	  else {
192 	    typename std::list<T>::iterator iv = val_tmp[jj].begin();
193 	    std::list<int>::iterator it = ind_cols_tmp[jj].begin();
194 	    for ( ; it != ind_cols_tmp[jj].end(); ++it, ++iv) {
195 	      std::list<int>::iterator it1 = it;
196 	      ++it1;
197 	      if (((*it)) < i && ((*it1) > i)) {
198 		ind_cols_tmp[jj].insert(it, i);
199 		val_tmp[jj].insert(iv, zero);
200 		nnz1++;
201 		//		fprintf(stderr, "%d %d inserted for symmetry\n", i, jj);
202 		break;
203 	      }
204 	    }
205 	  }
206 	} // if (!flag);
207       }
208     }
209   }
210 #if 0
211   {
212     for (int i = 0; i < nrow; i++) {
213       std::list<int>::iterator jt = ind_cols_tmp[i].begin();
214       for ( ; jt != ind_cols_tmp[i].end(); ++jt) {
215 	const int jj = (*jt);
216 	bool flag = false;
217 	std::list<int>::iterator it = ind_cols_tmp[jj].begin();
218 	for ( ; it != ind_cols_tmp[jj].end(); ++it) {
219 	  if ((*it) == i) {
220 	    flag = true;
221 	    break;
222 	  }
223 	}
224 	if (!flag) {
225 	  fprintf(stderr, "%d %d position is not symmetric\n", i, jj);
226 	}
227       }
228     }
229   }
230 #endif
231   *nnz = nnz1;
232 }
233 
234 template<typename T>
copy_CSR(int * indcols,int * ptrows,T * coefs,int nrow,bool upper_flag,bool isSym,std::list<int> * ind_cols_tmp,std::list<T> * val_tmp)235 void copy_CSR(int *indcols, int *ptrows, T* coefs, int nrow,
236 	      bool upper_flag, bool isSym,
237 	      std::list<int>* ind_cols_tmp, std::list<T>* val_tmp)
238 {
239   const T zero(0.0);
240   ptrows[0] = 0;
241   for (int i = 0; i < nrow; i++) {
242     int k;
243     int itmp = ind_cols_tmp[i].size();
244     if (upper_flag) {
245       if (ind_cols_tmp[i].front() == i) {
246 	ptrows[i + 1] = ptrows[i] + itmp;
247 	k = ptrows[i];
248       }
249       else {
250 	fprintf(stderr, "zero is added to diagonal : %d\n", i);
251 	ptrows[i + 1] = ptrows[i] + itmp + 1;
252 	indcols[ptrows[i]] = i;
253 	coefs[ptrows[i]] = zero;
254 	k = ptrows[i] + 1;
255       }
256     }
257     else {
258       k = ptrows[i];
259       if (ind_cols_tmp[i].back() == i || (!isSym)) {
260 	ptrows[i + 1] = ptrows[i] + itmp;
261       }
262       else {
263 	fprintf(stderr, "zero is added to diagonal : %d\n", i);
264 	ptrows[i + 1] = ptrows[i] + itmp + 1;
265 	indcols[ptrows[i + 1] - 1] = i;
266 	coefs[ptrows[i + 1] - 1] = zero;
267       }
268     }
269     std::list<int>::iterator it = ind_cols_tmp[i].begin();
270     typename std::list<T>::iterator iv = val_tmp[i].begin();
271     for ( ; it != ind_cols_tmp[i].end(); ++it, ++iv, k++) {
272       indcols[k] = *it;
273       coefs[k] = *iv;
274     }
275   } // loop : i
276 }
277 
main(int argc,char ** argv)278 int main(int argc, char **argv)
279 {
280   int n, itmp, jtmp;
281   char fname[256], fname1[256];
282   char buf[1024];
283   int nrow, nnz, flag;
284   int *ptrows, *indcols;
285   int *irow, *jcol;
286   double *val, *coefs;
287   complex<double> *valc, *ccoefs;
288   quadruple *qcoefs;
289   complex<quadruple> *qccoefs;
290   int decomposer;
291   int num_threads;
292   int scaling = 1;
293   double eps_pivot;
294   int numlevels = -1;
295   int minNodes = 128;
296   std::list<int>* ind_cols_tmp;
297   std::list<double>* val_tmp;
298   std::list<complex<double> >* val_tmpc;
299   FILE *fp;
300   bool isSym, isComplex;
301   bool upper_flag = true;
302   bool isWhole = false;
303   bool kernel_detection_all = false;
304   int *indx_excl;
305   int nexcl = 0;
306   bool excl_flag = false;
307 
308   if (argc < 6) {
309     fprintf(stderr, "MM-dissection [data file] [decomposer] [num_threads] [eps_pivot] [num_levels] [scaling] [kerner_detection_all] [upper_flag] [minNodes]\n");
310     exit(-1);
311   }
312   strcpy(fname, argv[1]);
313   decomposer = atoi(argv[2]);
314   num_threads = atoi(argv[3]);
315   eps_pivot = atof(argv[4]);
316   numlevels = atof(argv[5]);
317   if (argc >= 7) {
318     scaling = atoi(argv[6]);
319   }
320   if (argc >= 8) {
321    kernel_detection_all = (atoi(argv[7]) == 1);
322   }
323 
324   if (argc >= 9) {
325     upper_flag = (atoi(argv[8]) == 1);
326     isWhole = (atoi(argv[8]) == (-1));
327   }
328   if (argc >= 10) {
329     strcpy(fname1, argv[9]);
330     excl_flag = true;
331   }
332   if (argc >= 11) {
333     minNodes = atoi(argv[10]);
334   }
335 
336   // read from the file
337   if ((fp = fopen(fname, "r")) == NULL) {
338     fprintf(stderr, "fail to open %s\n", fname);
339   }
340   fgets(buf, 256, fp);
341   //
342   if (strstr(buf, "symmetric") != NULL) {
343    isSym = true;
344   }
345   else {
346     isSym = false;
347     upper_flag = false;
348   }
349   if (strstr(buf, "complex") != NULL) {
350    isComplex = true;
351   }
352   else {
353     isComplex = false;
354   }
355 
356   fprintf(stderr, "symmetric = %s\n", isSym ? "true " : "false");
357   fprintf(stderr, "scaling = %d\n", scaling);
358   fprintf(stderr, "upper = %s\n", upper_flag ? "true" : "false");
359   if (kernel_detection_all) {
360     fprintf(stderr, "kernel detection is activated for all submatrices\n");
361   }
362   if (excl_flag) {
363     fprintf(stderr, "list of singular nodes %s\n", fname1);
364   }
365   while (1) {
366     fgets(buf, 256, fp);
367     if (buf[0] != '%') {
368       sscanf(buf, "%d %d %d", &nrow, &itmp, &nnz);
369       break;
370     }
371   }
372   irow = new int[nnz];
373   jcol = new int[nnz];
374 
375   if (isComplex) {
376     double xreal, ximag;
377     valc = new complex<double>[nnz];
378     if (upper_flag) {
379       for (int i = 0; i < nnz; i++) {
380 	fscanf(fp, "%d %d %lf %lf", &jcol[i], &irow[i], &xreal, &ximag);
381 	valc[i] = complex<double>(xreal, ximag);
382 	irow[i]--;
383 	jcol[i]--;
384 	if (isSym && irow[i] > jcol[i]) {
385 	  fprintf(stderr, "exchanged : %d > %d\n", irow[i], jcol[i]);
386 	  itmp = irow[i];
387 	  irow[i] = jcol[i];
388 	  jcol[i] = itmp;
389 	}
390       }
391     }
392     else {
393       for (int i = 0; i < nnz; i++) {
394 	fscanf(fp, "%d %d %lf %lf", &irow[i], &jcol[i], &xreal, &ximag);
395 	valc[i] = complex<double>(xreal, ximag);
396 	irow[i]--;
397 	jcol[i]--;
398 	if (isSym && irow[i] < jcol[i]) {
399 	  fprintf(stderr, "exchanged : %d > %d\n", irow[i], jcol[i]);
400 	  itmp = irow[i];
401 	  irow[i] = jcol[i];
402 	  jcol[i] = itmp;
403 	}
404       }
405     }
406   }
407   else {
408     val = new double[nnz];
409     if (upper_flag) {
410       for (int i = 0; i < nnz; i++) {
411 	fscanf(fp, "%d %d %lf", &jcol[i], &irow[i], &val[i]); // read lower
412 	irow[i]--;
413 	jcol[i]--;
414 	if (isSym && irow[i] > jcol[i]) {
415 	  fprintf(stderr, "exchanged : %d > %d\n", irow[i], jcol[i]);
416 	  itmp = irow[i];
417 	  irow[i] = jcol[i];
418 	  jcol[i] = itmp;
419 	}
420     }
421     }
422     else {
423       for (int i = 0; i < nnz; i++) {
424 	fscanf(fp, "%d %d %lf", &irow[i], &jcol[i], &val[i]); // read lower
425 	irow[i]--;
426 	jcol[i]--;
427 	if (isSym && irow[i] < jcol[i]) {
428 	  fprintf(stderr, "exchanged : %d > %d\n", irow[i], jcol[i]);
429 	  itmp = irow[i];
430 	  irow[i] = jcol[i];
431 	  jcol[i] = itmp;
432 	}
433       }
434     }
435   }
436   fclose (fp);
437 
438   if (excl_flag) {
439     if ((fp = fopen(fname1, "r")) == NULL) {
440       fprintf(stderr, "fail to open %s\n", fname1);
441     }
442     fgets(buf, 256, fp);
443     sscanf(buf, "# %d", &nexcl);
444     indx_excl = new int[nexcl];
445     for (int i = 0; i < nexcl; i++) {
446       fgets(buf, 256, fp);
447       sscanf(buf, "%d", &itmp);
448       indx_excl[i] = itmp;
449     }
450     fclose(fp);
451   }
452 
453   int *mask = new int[nrow];
454   int *old2new = new int[nrow];
455   for (int i = 0; i < nrow; i++) {
456     mask[i] = 1;
457   }
458   for (int i = 0; i < nexcl; i++) {
459     mask[indx_excl[i]] = 0;
460   }
461   itmp = 0;
462   jtmp = nrow - nexcl;
463   for (int i = 0; i < nrow; i++) {
464     if (mask[i] == 1) {
465       old2new[i] = itmp++;
466     }
467     else {
468       old2new[i] = jtmp++;
469     }
470   }
471 #if 0
472   if ((fp = fopen("debug-index.data", "w")) != NULL) {
473     for (int i = 0; i < nrow; i++) {
474       fprintf(fp, "%d %d %d\n", i, old2new[i], mask[i]);
475     }
476     fclose(fp);
477   }
478 #endif
479   nrow -= nexcl;
480 
481   ind_cols_tmp = new std::list<int>[nrow];
482   fprintf(stderr, "%s %d : getnerate_CSR\n", __FILE__, __LINE__);
483   if (isComplex) {
484     val_tmpc = new std::list<complex<double> >[nrow];
485     generate_CSR<complex<double> >(ind_cols_tmp, val_tmpc,
486 				   nrow, &nnz, mask, old2new,
487 				   irow, jcol, valc, (!isSym));
488   }
489   else {
490     val_tmp = new std::list<double>[nrow];
491     generate_CSR<double>(ind_cols_tmp, val_tmp,
492 			 nrow, &nnz, mask, old2new,
493 			 irow, jcol, val, (!isSym));
494   }
495   delete [] irow;
496   delete [] jcol;
497   delete [] mask;
498   delete [] old2new;
499   if (isComplex) {
500     delete [] valc;
501   }
502   else {
503     delete [] val;
504   }
505 
506   if (upper_flag) {
507     for (int i = 0; i < nrow; i++) {
508       if (ind_cols_tmp[i].front() != i) {
509 	nnz++;
510       }
511     }
512   }
513   else {
514     for (int i = 0; i < nrow; i++) {
515       if (ind_cols_tmp[i].back() != i) {
516 	nnz++;
517       }
518     }
519   }
520   fprintf(stderr, "%s %d : copy_CSR\n", __FILE__, __LINE__);
521   ptrows = new int[nrow + 1];
522   indcols = new int[nnz];
523   if (isComplex) {
524     ccoefs = new complex<double>[nnz];
525     copy_CSR<complex<double> >(indcols, ptrows, ccoefs,
526 			       nrow, upper_flag, isSym,
527 			       ind_cols_tmp, val_tmpc);
528     qccoefs = new complex<quadruple>[nnz];
529     for (int i = 0; i < nnz; i++) {
530       qccoefs[i] = complex<quadruple>(quadruple(std::real(ccoefs[i])),
531 				      quadruple(std::imag(ccoefs[i])));
532     }
533     delete [] ccoefs;
534   }
535 
536   else {
537     coefs = new double[nnz];
538     copy_CSR<double>(indcols, ptrows, coefs,
539 		     nrow, upper_flag, isSym,
540 		     ind_cols_tmp, val_tmp);
541     qcoefs = new quadruple[nnz];
542     for (int i = 0; i < nnz; i++) {
543       qcoefs[i] = quadruple(coefs[i]);
544     }
545     delete [] coefs;
546   }
547   delete [] ind_cols_tmp;
548 
549   if (isComplex) {
550     delete [] val_tmpc;
551   }
552   else {
553     delete [] val_tmp;
554   }
555 #if 0
556   if ((fp = fopen("debug.matrix.data", "w")) != NULL) {
557     for (int i = 0; i < nrow; i++) {
558       fprintf(fp, "%d : %d :: ", i, (ptrows[i + 1] - ptrows[i]));
559       for (int k = ptrows[i]; k < ptrows[i + 1]; k++) {
560 	fprintf(fp, "%d ", indcols[k]);
561       }
562       fprintf(fp, "\n");
563     }
564   }
565   fclose(fp);
566 #endif
567   int pid = (int)getpid();
568 #if 1
569   fprintf(stderr, "pid = %d\n", pid);
570   sprintf(fname, "dissection.%04d.log", pid);
571   fp = fopen(fname, "a");
572 #else
573   fp = stderr;
574 #endif
575   fprintf(stderr, "%s %d : before pthread_create\n", __FILE__, __LINE__);
576   void* results;
577   pthread_attr_t th_attr;
578   pthread_t thread;
579   pthread_attr_init(&th_attr);
580   pthread_attr_setdetachstate(&th_attr, PTHREAD_CREATE_JOINABLE);
581   int pthid = pthread_create(&thread, &th_attr,
582 			     &thread_child,
583 			     (void *)&pid);
584   if (pthid != 0) {
585     cout << "bad thread creation ? " << pid << endl;
586     exit(0);
587   }
588   fprintf(stderr, "%s %d : after pthread_create\n", __FILE__, __LINE__);
589 
590   if (isWhole) {
591     isSym = true;
592     upper_flag = false;
593   }
594 
595   if (isComplex) {
596     DissectionSolver<complex<quadruple>, quadruple>*dslv =
597       new DissectionSolver<complex<quadruple>, quadruple>(num_threads, true, 0,
598 							  fp);
599 
600     int called = 0;
601     clock_t t0_cpu, t1_cpu, t2_cpu, t3_cpu, t4_cpu, t5_cpu;
602     elapsed_t t0_elapsed, t1_elapsed, t2_elapsed, t3_elapsed,
603       t4_elapsed, t5_elapsed;
604 #ifdef BLAS_MKL
605     mkl_set_num_threads(1);
606 #endif
607 #ifdef VECLIB
608     setenv("VECLIB_MAXIMUM_THREADS", "1", true);
609 #endif
610     t0_cpu = clock();
611     get_realtime(&t0_elapsed);
612 
613     dslv->SymbolicFact(nrow, (int *)ptrows, (int *)indcols,
614 		       isSym,
615 		       upper_flag,
616 		       isWhole,
617 		       decomposer, numlevels, minNodes);
618 
619     t1_cpu = clock();
620     get_realtime(&t1_elapsed);
621 
622     t2_cpu = clock();
623     get_realtime(&t2_elapsed);
624     _stat = 1;
625     usleep(5000);
626     dslv->NumericFact(0, (complex<quadruple> *)qccoefs, scaling,
627 		      eps_pivot, kernel_detection_all);
628     _stat = (-1);
629     t3_cpu = clock();
630     get_realtime(&t3_elapsed);
631     usleep(5000);
632     fprintf(stderr, "%s %d : NumericFact() done\n", __FILE__, __LINE__);
633 
634     int n0;
635     n0 = dslv->kern_dimension();
636     fprintf(fp, "## kernel dimension = %d\n", n0);
637 
638     complex<quadruple> *x = new complex<quadruple>[nrow];
639     complex<quadruple> *y = new complex<quadruple>[nrow];
640     complex<quadruple> *z = new complex<quadruple>[nrow];
641     for (int i = 0; i < nrow; i++) {
642       y[i] = complex<quadruple>((quadruple)(i % 11), quadruple(0));
643     }
644     dslv->SpMV(y, x);
645     dslv->SpMV(x, y);
646     for (int i = 0; i < nrow; i++) {
647       z[i] = y[i];
648     }
649 
650     t4_cpu = clock();
651     get_realtime(&t4_elapsed);
652     _stat = 1;
653     usleep(5000);
654     dslv->SolveSingle(y, false, false, true); // with projection : true
655     _stat = (-1);
656     usleep(5000);
657     fprintf(stderr, "%s %d : SolveSingle() done\n", __FILE__, __LINE__);
658     t5_cpu = clock();
659     get_realtime(&t5_elapsed);
660     quadruple norm0, norm1;
661     norm0 = quadruple(0);
662     norm1 = quadruple(0);
663     for (int i = 0; i < nrow; i++) {
664       norm0 += x[i].real() * x[i].real() + x[i].imag() * x[i].imag();
665       complex<quadruple> ztmp = y[i] - x[i];
666       norm1 += ztmp.real() * ztmp.real() + ztmp.imag() * ztmp.imag();
667     }
668     fprintf(fp, "%s %d : ## error    = %18.7e\n",
669 	    __FILE__, __LINE__,
670 	    sqrt(quad2double(norm1 / norm0)));
671 
672     dslv->SpMV(y, x);
673 
674     norm0 = 0.0;
675     norm1 = 0.0;
676     for (int i = 0; i < nrow; i++) {
677       norm0 += z[i].real() * z[i].real() + z[i].imag() * z[i].imag();
678       complex<quadruple> ztmp = z[i] - x[i];
679       norm1 += ztmp.real() * ztmp.real() + ztmp.imag() * ztmp.imag();
680     }
681     fprintf(fp, "%s %d : ## residual = %18.7e\n",
682 	    __FILE__, __LINE__,
683 	    sqrt(quad2double(norm1 / norm0)));
684     _stat = 0;
685     pthread_attr_destroy(&th_attr);
686     pthid = pthread_join(thread, &results);
687     if (pthid != 0) {
688       cout << "bad thread join ? " << pthid << endl;
689       exit(0);
690     }
691 
692     fprintf(fp, "%s %d : ## symbolic fact    : cpu time = %.4e elapsed time = %.4e\n",
693 	    __FILE__, __LINE__,
694 	    (double)(t1_cpu - t0_cpu) / (double)CLOCKS_PER_SEC,
695 	    convert_time(t1_elapsed, t0_elapsed));
696 
697     fprintf(fp, "%s %d : ## numeric fact     : cpu time = %.4e elapsed time = %.4e\n",
698 	    __FILE__, __LINE__,
699 	    (double)(t3_cpu - t2_cpu) / (double)CLOCKS_PER_SEC,
700 	    convert_time(t3_elapsed, t2_elapsed));
701 
702     fprintf(fp, "%s %d : ## solve single RHS : cpu time = %.4e elapsed time = %.4e\n",
703 	    __FILE__, __LINE__,
704 	    (double)(t5_cpu - t4_cpu) / (double)CLOCKS_PER_SEC,
705 	    convert_time(t5_elapsed, t4_elapsed));
706 
707     delete dslv;
708     delete [] ptrows;
709     delete [] indcols;
710     delete [] qccoefs;
711     delete [] x;
712     delete [] y;
713     delete [] z;
714   }  // if (isComplex)
715   else {
716     DissectionSolver<quadruple> *dslv =
717       new DissectionSolver<quadruple>(num_threads, true, 0, fp);
718 
719     int called = 0;
720     clock_t t0_cpu, t1_cpu, t2_cpu, t3_cpu, t4_cpu, t5_cpu;
721     elapsed_t t0_elapsed, t1_elapsed, t2_elapsed, t3_elapsed,
722       t4_elapsed, t5_elapsed;
723 #ifdef BLAS_MKL
724     mkl_set_num_threads(1);
725 #endif
726 #ifdef VECLIB
727    setenv("VECLIB_MAXIMUM_THREADS", "1", true);
728 #endif
729     t0_cpu = clock();
730     get_realtime(&t0_elapsed);
731 
732     dslv->SymbolicFact(nrow, (int *)ptrows, (int *)indcols,
733 		       isSym,
734 		       upper_flag,
735 		       isWhole,
736 		       decomposer, numlevels, minNodes);
737     //                  sym, upper
738     //    dslv->SaveMMMatrix(0, coefs);
739     //    exit(-1);
740     t1_cpu = clock();
741     get_realtime(&t1_elapsed);
742 
743     _stat = 1;
744     usleep(5000);
745     t2_cpu = clock();
746     get_realtime(&t2_elapsed);
747     dslv->NumericFact(0, (quadruple *)qcoefs, scaling,
748 		      eps_pivot, kernel_detection_all);
749     t3_cpu = clock();
750     get_realtime(&t3_elapsed);
751     _stat = (-1);
752     usleep(5000);
753     fprintf(stderr, "%s %d : NumericFact() done\n", __FILE__, __LINE__);
754     int n0;
755     n0 = dslv->kern_dimension();
756     fprintf(fp, "%s %d : ## kernel dimension = %d\n", __FILE__, __LINE__, n0);
757 
758     quadruple *x = new quadruple[nrow];
759     quadruple *y = new quadruple[nrow];
760     quadruple *z = new quadruple[nrow];
761     for (int i = 0; i < nrow; i++) {
762       y[i] = (quadruple)(i % 11);
763     }
764 #define NORMAL
765 #ifdef NORMAL
766 #if 0
767     if (!isSym && (n0 > 0)) {
768     dslv->ComputeTransposedKernels(true);
769     }
770 #endif
771     dslv->SpMV(y, x);
772     if (n0 > 0) {
773       dslv->ProjectionImageSingle(x);
774     }
775     dslv->SpMV(x, y);
776     for (int i = 0; i < nrow; i++) {
777       z[i] = y[i];
778     }
779     _stat = 1;
780     usleep(5000);
781     t4_cpu = clock();
782     get_realtime(&t4_elapsed);
783     dslv->SolveSingle(y, false, false, true); // with projection : true
784     _stat = (-1);
785     usleep(5000);
786     fprintf(stderr, "%s %d : SolveSingle() done\n", __FILE__, __LINE__);
787     if (n0 > 0) {
788       dslv->ProjectionImageSingle(y);
789     }
790     t5_cpu = clock();
791     get_realtime(&t5_elapsed);
792 #else
793     dslv->SpMtV(y, x);
794     //  dslv->ProjectionImageSingle(x);
795     dslv->SpMtV(x, y);
796     for (int i = 0; i < nrow; i++) {
797       z[i] = y[i];
798     }
799 
800     _stat = 1;
801     usleep(5000);
802     t4_cpu = clock();
803     get_realtime(&t4_elapsed);
804     dslv->SolveSingle(y, false, true); // with projection : true
805     //  dslv->ProjectionImageSingle(y);
806     t5_cpu = clock();
807     get_realtime(&t5_elapsed);
808     _stat = (-1);
809     usleep(5000);
810     fprintf(stderr, "%s %d : SolveSingle() done\n", __FILE__, __LINE__);
811 #endif
812     quadruple norm0, norm1;
813     norm0 = quadruple(0);
814     norm1 = quadruple(0);
815     for (int i = 0; i < nrow; i++) {
816       norm0 += x[i] * x[i];
817       norm1 += (y[i] - x[i]) * (y[i] - x[i]);
818     }
819     fprintf(fp, "%s %d : ## error    = %18.7e\n",
820 	    __FILE__, __LINE__, sqrt(quad2double(norm1 / norm0)));
821 #ifdef NORMAL
822     dslv->SpMV(y, x);
823 #else
824     dslv->SpMtV(y, x);
825 #endif
826 
827     norm0 = quadruple(0);
828     norm1 = quadruple(0);
829     for (int i = 0; i < nrow; i++) {
830       norm0 += z[i] * z[i];
831       norm1 += (z[i] - x[i]) * (z[i] - x[i]);
832     }
833     fprintf(fp, "%s %d : ## residual = %18.7e\n",
834 	    __FILE__, __LINE__, sqrt(quad2double(norm1 / norm0)));
835     _stat = 0;
836     pthread_attr_destroy(&th_attr);
837     pthid = pthread_join(thread, &results);
838     if (pthid != 0) {
839       cout << "bad thread join ? " << pthid << endl;
840       exit(0);
841     }
842 
843     fprintf(fp, "%s %d : ## symbolic fact    : cpu time = %.4e elapsed time = %.4e\n",
844 	    __FILE__, __LINE__,
845 	    (double)(t1_cpu - t0_cpu) / (double)CLOCKS_PER_SEC,
846 	    convert_time(t1_elapsed, t0_elapsed));
847 
848     fprintf(fp, "%s %d : ## numeric fact     : cpu time = %.4e elapsed time = %.4e\n",
849 	    __FILE__, __LINE__,
850 	    (double)(t3_cpu - t2_cpu) / (double)CLOCKS_PER_SEC,
851 	    convert_time(t3_elapsed, t2_elapsed));
852 
853     fprintf(fp, "%s %d : ## solve single RHS : cpu time = %.4e elapsed time = %.4e\n",
854 	    __FILE__, __LINE__,
855 	    (double)(t5_cpu - t4_cpu) / (double)CLOCKS_PER_SEC,
856 	    convert_time(t5_elapsed, t4_elapsed));
857 
858 #if 0
859     double *scalediag;
860     scalediag = new double[nrow];
861     dslv->GetMatrixScaling(scalediag);
862     for (int i = 0; i < nrow; i++) {
863       for (int k = ptrows[i]; k < ptrows[i + 1]; k++) {
864 	if (indcols[k] == i) {
865 	  fprintf(fp, "%i : %g : %g %g\n", i, scalediag[i],
866 		  1.0 / (scalediag[i] * scalediag[i]), coefs[k]);
867 	  break;
868 	}
869       }
870     }
871     delete [] scalediag;
872 #endif
873 
874     delete dslv;
875     delete [] ptrows;
876     delete [] indcols;
877     delete [] qcoefs;
878     delete [] x;
879     delete [] y;
880     delete [] z;
881   }
882   fclose(fp);
883 
884 }
885