1 /*! \file   MM-DissectionSolver.cpp
2     \brief  test rouinte of dissection solver reading Matrix Market format
3     \author Atsushi Suzuki, Laboratoire Jacques-Louis Lions
4     \date   Jun. 20th 2014
5     \date   Jul. 12th 2015
6     \date   Feb. 29th 2016
7 */
8 
9 // This file is part of Dissection
10 //
11 // Dissection is free software: you can redistribute it and/or modify
12 // it under the terms of the GNU General Public License as published by
13 // the Free Software Foundation, either version 3 of the License, or
14 // (at your option) any later version.
15 //
16 // Linking Dissection statically or dynamically with other modules is making
17 // a combined work based on Disssection. Thus, the terms and conditions of
18 // the GNU General Public License cover the whole combination.
19 //
20 // As a special exception, the copyright holders of Dissection give you
21 // permission to combine Dissection program with free software programs or
22 // libraries that are released under the GNU LGPL and with independent modules
23 // that communicate with Dissection solely through the Dissection-fortran
24 // interface. You may copy and distribute such a system following the terms of
25 // the GNU GPL for Dissection and the licenses of the other code concerned,
26 // provided that you include the source code of that other code when and as
27 // the GNU GPL requires distribution of source code and provided that you do
28 // not modify the Dissection-fortran interface.
29 //
30 // Note that people who make modified versions of Dissection are not obligated
31 // to grant this special exception for their modified versions; it is their
32 // choice whether to do so. The GNU General Public License gives permission to
33 // release a modified version without this exception; this exception also makes
34 // it possible to release a modified version which carries forward this
35 // exception. If you modify the Dissection-fortran interface, this exception
36 // does not apply to your modified version of Dissection, and you must remove
37 // this exception when you distribute your modified version.
38 //
39 // This exception is an additional permission under section 7 of the GNU
40 // General Public License, version 3 ("GPLv3")
41 //
42 // Dissection is distributed in the hope that it will be useful,
43 // but WITHOUT ANY WARRANTY; without even the implied warranty of
44 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
45 // GNU General Public License for more details.
46 //
47 // You should have received a copy of the GNU General Public License
48 // along with Dissection.  If not, see <http://www.gnu.org/licenses/>.
49 //
50 
51 #include <cstdio>
52 #include <cstdlib>
53 #include <cstring>
54 #include <fcntl.h>
55 #include <unistd.h>
56 #include <math.h>
57 
58 #include "Driver/DissectionSolver.hpp"
59 #ifdef BLAS_MKL
60 #include <mkl_service.h>
61 #endif
62 #include <pthread.h>
63 #include <iostream>
64 #include <sstream>
65 #include <fstream>
66 #include <complex>
67 #include "Compiler/arithmetic.hpp"
68 
69 using namespace std;
70 
71 static int _stat = (-1);
72 
thread_child(void * arg)73 void *thread_child(void *arg)
74 {
75   char buf[256];
76   int *pid = (int *)arg;
77   unsigned int mem_tmp, mem_min, mem_max;
78   double avg_mem;
79   avg_mem = 0.0;
80   mem_min = (1U << 31) - 1;
81   mem_max = 0U;
82   int stat0, stat1;
83   stat0 = _stat;
84   unsigned int count = 0U;
85 //  fprintf(stderr, "thread_child forked\n");
86   while(_stat != 0) {
87     stat1 = _stat;
88      if (stat1 == 1) {
89        sprintf(buf, "/proc/%d/statm", *pid);
90        ifstream fin(buf);
91        fin >> mem_tmp;
92        fin.close();
93        if (mem_tmp > mem_max) {
94 	 mem_max = mem_tmp;
95        }
96        if (mem_tmp < mem_min) {
97 	 mem_min = mem_tmp;
98        }
99        avg_mem += (double)mem_tmp;
100        count++;
101      }
102      if ((stat1 == (-1)) && (stat0 == 1)) {
103        fprintf(stderr,
104 	       "used memory :min: %14.8e  max: %14.8e avg: %14.8e count: %d\n",
105 	       (double)mem_min * 4.0 / (1024.0 * 1024.0),
106 	       (double)mem_max * 4.0 / (1024.0 * 1024.0),
107 	       (avg_mem / (double)count) * 4.0 / (1024.0 * 1024.0),
108 	       count);
109        count = 0U;
110        avg_mem = 0.0;
111        mem_min = (1U << 31) - 1;
112        mem_max = 0U;
113      }
114      stat0 = stat1;
115      usleep(1000);
116   }
117 // fprintf(stderr, "thread_child join\n count = %ld\n", count);
118   pthread_exit(arg);
119 
120   return (void *)NULL;
121 }
122 
123 template<typename T>
generate_CSR(std::list<int> * ind_cols_tmp,std::list<T> * val_tmp,int nrow,int * nnz,int * mask,int * old2new,int * irow,int * jcol,T * val,bool symmetrize)124 void generate_CSR(std::list<int>* ind_cols_tmp, std::list<T>* val_tmp,
125 		  int nrow, int *nnz, int *mask, int *old2new,
126 		  int *irow, int *jcol, T* val, bool symmetrize)
127 {
128   const T zero(0.0);
129   //  ind_cols_tmp = new std::list<int>[nrow];
130   //  val_tmp = new std::list<T>[nrow];
131   int nnz1 = *nnz;
132   for (int i = 0; i < *nnz; i++) {
133     const int i0 = irow[i];
134     const int j0 = jcol[i];
135     const int ii = old2new[i0];
136     const int jj = old2new[j0];
137     if ((mask[i0] != 1) || (mask[j0] != 1)) {
138       //      fprintf(stderr, "%d %d\n", i0, j0);
139       nnz1--;
140       continue;
141     }
142     //    fprintf(stderr, "%d %d -> %d %d \n", i0, j0, ii, jj);
143     if (ind_cols_tmp[ii].empty()) {
144       ind_cols_tmp[ii].push_back(jj);
145       val_tmp[ii].push_back(val[i]);
146     }
147     else {
148       if (ind_cols_tmp[ii].back() < jj) {
149 	ind_cols_tmp[ii].push_back(jj);
150 	val_tmp[ii].push_back(val[i]);
151       }
152       else {
153 	typename std::list<T>::iterator iv = val_tmp[ii].begin();
154 	std::list<int>::iterator it = ind_cols_tmp[ii].begin();
155 	for ( ; it != ind_cols_tmp[ii].end(); ++it, ++iv) {
156 	  if (*it == jj) {
157 	    fprintf(stderr, "already exits? (%d %d)\n", ii, jj);
158 	      break;
159 	  }
160 	  if (*it > jj) {
161 	    ind_cols_tmp[ii].insert(it, jj);
162 	    val_tmp[ii].insert(iv, val[i]);
163 	    break;
164 	  }
165 	}
166       }
167     }
168   }
169   // symmetrize
170   if (symmetrize) {
171     for (int i = 0; i < nrow; i++) {
172       std::list<int>::iterator jt = ind_cols_tmp[i].begin();
173       for ( ; jt != ind_cols_tmp[i].end(); ++jt) {
174 	const int jj = (*jt);
175 	bool flag = false;
176 	std::list<int>::iterator it = ind_cols_tmp[jj].begin();
177 	for ( ; it != ind_cols_tmp[jj].end(); ++it) {
178 	  if ((*it) == i) {
179 	    flag = true;
180 	    //	    fprintf(stderr, "%d %d symmetric position found\n", i, jj);
181 	    break;
182 	  }
183 	}
184 	if (!flag) {
185 	  if (ind_cols_tmp[jj].back() < i) {
186 	    ind_cols_tmp[jj].push_back(i);
187 	    val_tmp[jj].push_back(zero);
188 	    fprintf(stderr, "%d %d added for symmetry\n", i, jj);
189 	    nnz1++;
190 	  }
191 	  else {
192 	    typename std::list<T>::iterator iv = val_tmp[jj].begin();
193 	    std::list<int>::iterator it = ind_cols_tmp[jj].begin();
194 	    for ( ; it != ind_cols_tmp[jj].end(); ++it, ++iv) {
195 	      std::list<int>::iterator it1 = it;
196 	      ++it1;
197 	      if (((*it)) < i && ((*it1) > i)) {
198 		ind_cols_tmp[jj].insert(it, i);
199 		val_tmp[jj].insert(iv, zero);
200 		nnz1++;
201 		//		fprintf(stderr, "%d %d inserted for symmetry\n", i, jj);
202 		break;
203 	      }
204 	    }
205 	  }
206 	} // if (!flag);
207       }
208     }
209   }
210 #if 0
211   {
212     for (int i = 0; i < nrow; i++) {
213       std::list<int>::iterator jt = ind_cols_tmp[i].begin();
214       for ( ; jt != ind_cols_tmp[i].end(); ++jt) {
215 	const int jj = (*jt);
216 	bool flag = false;
217 	std::list<int>::iterator it = ind_cols_tmp[jj].begin();
218 	for ( ; it != ind_cols_tmp[jj].end(); ++it) {
219 	  if ((*it) == i) {
220 	    flag = true;
221 	    break;
222 	  }
223 	}
224 	if (!flag) {
225 	  fprintf(stderr, "%d %d position is not symmetric\n", i, jj);
226 	}
227       }
228     }
229   }
230 #endif
231   *nnz = nnz1;
232 }
233 
234 template<typename T>
copy_CSR(int * indcols,int * ptrows,T * coefs,int nrow,bool upper_flag,bool isSym,std::list<int> * ind_cols_tmp,std::list<T> * val_tmp)235 void copy_CSR(int *indcols, int *ptrows, T* coefs, int nrow,
236 	      bool upper_flag, bool isSym,
237 	      std::list<int>* ind_cols_tmp, std::list<T>* val_tmp)
238 {
239   const T zero(0.0);
240   ptrows[0] = 0;
241   for (int i = 0; i < nrow; i++) {
242     int k;
243     int itmp = ind_cols_tmp[i].size();
244     if (upper_flag) {
245       if (ind_cols_tmp[i].front() == i) {
246 	ptrows[i + 1] = ptrows[i] + itmp;
247 	k = ptrows[i];
248       }
249       else {
250 	fprintf(stderr, "zero is added to diagonal : %d\n", i);
251 	ptrows[i + 1] = ptrows[i] + itmp + 1;
252 	indcols[ptrows[i]] = i;
253 	coefs[ptrows[i]] = zero;
254 	k = ptrows[i] + 1;
255       }
256     }
257     else {
258       k = ptrows[i];
259       if (ind_cols_tmp[i].back() == i || (!isSym)) {
260 	ptrows[i + 1] = ptrows[i] + itmp;
261       }
262       else {
263 	fprintf(stderr, "zero is added to diagonal : %d\n", i);
264 	ptrows[i + 1] = ptrows[i] + itmp + 1;
265 	indcols[ptrows[i + 1] - 1] = i;
266 	coefs[ptrows[i + 1] - 1] = zero;
267       }
268     }
269     std::list<int>::iterator it = ind_cols_tmp[i].begin();
270     typename std::list<T>::iterator iv = val_tmp[i].begin();
271     for ( ; it != ind_cols_tmp[i].end(); ++it, ++iv, k++) {
272       indcols[k] = *it;
273       coefs[k] = *iv;
274     }
275   } // loop : i
276 }
277 
main(int argc,char ** argv)278 int main(int argc, char **argv)
279 {
280   int n, itmp, jtmp;
281   char fname[256], fname1[256];
282   char buf[1024];
283   int nrow, nnz, flag;
284   int *ptrows, *indcols;
285   int *irow, *jcol;
286   double *val, *coefs;
287   complex<double> *valc, *ccoefs;
288   //  quadruple *qcoefs;
289   //  complex<quadruple> *qccoefs;
290   int decomposer;
291   int num_threads;
292   int scaling = 1;
293   double eps_pivot;
294   int numlevels = -1;
295   int minNodes = 128;
296   std::list<int>* ind_cols_tmp;
297   std::list<double>* val_tmp;
298   std::list<complex<double> >* val_tmpc;
299   FILE *fp;
300   bool isSym, isComplex;
301   bool upper_flag = true;
302   bool isWhole = false;
303   bool kernel_detection_all = false;
304   int *indx_excl;
305   int nexcl = 0;
306   bool excl_flag = false;
307 
308   if (argc < 6) {
309     fprintf(stderr, "MM-dissection [data file] [decomposer] [num_threads] [eps_pivot] [num_levels] [scaling] [kerner_detection_all] [upper_flag] [minNodes]\n");
310     exit(-1);
311   }
312   strcpy(fname, argv[1]);
313   decomposer = atoi(argv[2]);
314   num_threads = atoi(argv[3]);
315   eps_pivot = atof(argv[4]);
316   numlevels = atof(argv[5]);
317   if (argc >= 7) {
318     scaling = atoi(argv[6]);
319   }
320   if (argc >= 8) {
321    kernel_detection_all = (atoi(argv[7]) == 1);
322   }
323 
324   if (argc >= 9) {
325     upper_flag = (atoi(argv[8]) == 1);
326     isWhole = (atoi(argv[8]) == (-1));
327   }
328   if (argc >= 10) {
329     strcpy(fname1, argv[9]);
330     excl_flag = true;
331   }
332   if (argc >= 11) {
333     minNodes = atoi(argv[10]);
334   }
335 
336   // read from the file
337   if ((fp = fopen(fname, "r")) == NULL) {
338     fprintf(stderr, "fail to open %s\n", fname);
339   }
340   fgets(buf, 256, fp);
341   //
342   if (strstr(buf, "symmetric") != NULL) {
343    isSym = true;
344   }
345   else {
346     isSym = false;
347     upper_flag = false;
348   }
349   if (strstr(buf, "complex") != NULL) {
350    isComplex = true;
351   }
352   else {
353     isComplex = false;
354   }
355 
356   fprintf(stderr, "symmetric = %s\n", isSym ? "true " : "false");
357   fprintf(stderr, "scaling = %d\n", scaling);
358   fprintf(stderr, "upper = %s\n", upper_flag ? "true" : "false");
359   if (kernel_detection_all) {
360     fprintf(stderr, "kernel detection is activated for all submatrices\n");
361   }
362   if (excl_flag) {
363     fprintf(stderr, "list of singular nodes %s\n", fname1);
364   }
365   while (1) {
366     fgets(buf, 256, fp);
367     if (buf[0] != '%') {
368       sscanf(buf, "%d %d %d", &nrow, &itmp, &nnz);
369       break;
370     }
371   }
372   irow = new int[nnz];
373   jcol = new int[nnz];
374 
375   if (isComplex) {
376     double xreal, ximag;
377     valc = new complex<double>[nnz];
378     if (upper_flag) {
379       for (int i = 0; i < nnz; i++) {
380 	fscanf(fp, "%d %d %lf %lf", &jcol[i], &irow[i], &xreal, &ximag);
381 	valc[i] = complex<double>(xreal, ximag);
382 	irow[i]--;
383 	jcol[i]--;
384 	if (isSym && irow[i] > jcol[i]) {
385 	  fprintf(stderr, "exchanged : %d > %d\n", irow[i], jcol[i]);
386 	  itmp = irow[i];
387 	  irow[i] = jcol[i];
388 	  jcol[i] = itmp;
389 	}
390       }
391     }
392     else {
393       for (int i = 0; i < nnz; i++) {
394 	fscanf(fp, "%d %d %lf %lf", &irow[i], &jcol[i], &xreal, &ximag);
395 	valc[i] = complex<double>(xreal, ximag);
396 	irow[i]--;
397 	jcol[i]--;
398 	if (isSym && irow[i] < jcol[i]) {
399 	  fprintf(stderr, "exchanged : %d > %d\n", irow[i], jcol[i]);
400 	  itmp = irow[i];
401 	  irow[i] = jcol[i];
402 	  jcol[i] = itmp;
403 	}
404       }
405     }
406   }
407   else {
408     val = new double[nnz];
409     if (upper_flag) {
410       for (int i = 0; i < nnz; i++) {
411 	fscanf(fp, "%d %d %lf", &jcol[i], &irow[i], &val[i]); // read lower
412 	irow[i]--;
413 	jcol[i]--;
414 	if (isSym && irow[i] > jcol[i]) {
415 	  fprintf(stderr, "exchanged : %d > %d\n", irow[i], jcol[i]);
416 	  itmp = irow[i];
417 	  irow[i] = jcol[i];
418 	  jcol[i] = itmp;
419 	}
420     }
421     }
422     else {
423       for (int i = 0; i < nnz; i++) {
424 	fscanf(fp, "%d %d %lf", &irow[i], &jcol[i], &val[i]); // read lower
425 	irow[i]--;
426 	jcol[i]--;
427 	if (isSym && irow[i] < jcol[i]) {
428 	  fprintf(stderr, "exchanged : %d > %d\n", irow[i], jcol[i]);
429 	  itmp = irow[i];
430 	  irow[i] = jcol[i];
431 	  jcol[i] = itmp;
432 	}
433       }
434     }
435   }
436   fclose (fp);
437 
438   if (excl_flag) {
439     if ((fp = fopen(fname1, "r")) == NULL) {
440       fprintf(stderr, "fail to open %s\n", fname1);
441     }
442     fgets(buf, 256, fp);
443     sscanf(buf, "# %d", &nexcl);
444     indx_excl = new int[nexcl];
445     for (int i = 0; i < nexcl; i++) {
446       fgets(buf, 256, fp);
447       sscanf(buf, "%d", &itmp);
448       indx_excl[i] = itmp;
449     }
450     fclose(fp);
451   }
452 
453   int *mask = new int[nrow];
454   int *old2new = new int[nrow];
455   for (int i = 0; i < nrow; i++) {
456     mask[i] = 1;
457   }
458   for (int i = 0; i < nexcl; i++) {
459     mask[indx_excl[i]] = 0;
460   }
461   itmp = 0;
462   jtmp = nrow - nexcl;
463   for (int i = 0; i < nrow; i++) {
464     if (mask[i] == 1) {
465       old2new[i] = itmp++;
466     }
467     else {
468       old2new[i] = jtmp++;
469     }
470   }
471 #if 0
472   if ((fp = fopen("debug-index.data", "w")) != NULL) {
473     for (int i = 0; i < nrow; i++) {
474       fprintf(fp, "%d %d %d\n", i, old2new[i], mask[i]);
475     }
476     fclose(fp);
477   }
478 #endif
479   nrow -= nexcl;
480 
481   ind_cols_tmp = new std::list<int>[nrow];
482   fprintf(stderr, "%s %d : getnerate_CSR\n", __FILE__, __LINE__);
483   if (isComplex) {
484     val_tmpc = new std::list<complex<double> >[nrow];
485     generate_CSR<complex<double> >(ind_cols_tmp, val_tmpc,
486 				   nrow, &nnz, mask, old2new,
487 				   irow, jcol, valc, (!isSym));
488   }
489   else {
490     val_tmp = new std::list<double>[nrow];
491     generate_CSR<double>(ind_cols_tmp, val_tmp,
492 			 nrow, &nnz, mask, old2new,
493 			 irow, jcol, val, (!isSym));
494   }
495   delete [] irow;
496   delete [] jcol;
497   delete [] mask;
498   delete [] old2new;
499   if (isComplex) {
500     delete [] valc;
501   }
502   else {
503     delete [] val;
504   }
505 
506   if (upper_flag) {
507     for (int i = 0; i < nrow; i++) {
508       if (ind_cols_tmp[i].front() != i) {
509 	nnz++;
510       }
511     }
512   }
513   else {
514     for (int i = 0; i < nrow; i++) {
515       if (ind_cols_tmp[i].back() != i) {
516 	nnz++;
517       }
518     }
519   }
520   fprintf(stderr, "%s %d : copy_CSR\n", __FILE__, __LINE__);
521   ptrows = new int[nrow + 1];
522   indcols = new int[nnz];
523   if (isComplex) {
524     ccoefs = new complex<double>[nnz];
525     copy_CSR<complex<double> >(indcols, ptrows, ccoefs,
526 			       nrow, upper_flag, isSym,
527 			       ind_cols_tmp, val_tmpc);
528 #if 0
529     qccoefs = new complex<quadruple>[nnz];
530     for (int i = 0; i < nnz; i++) {
531       qccoefs[i] = complex<quadruple>(quadruple(std::real(ccoefs[i])),
532 				      quadruple(std::imag(ccoefs[i])));
533     }
534     delete [] ccoefs;
535 #endif
536   }
537 
538   else {
539     coefs = new double[nnz];
540     copy_CSR<double>(indcols, ptrows, coefs,
541 		     nrow, upper_flag, isSym,
542 		     ind_cols_tmp, val_tmp);
543 #if 0
544     qcoefs = new quadruple[nnz];
545     for (int i = 0; i < nnz; i++) {
546       qcoefs[i] = quadruple(coefs[i]);
547     }
548     delete [] coefs;
549 #endif
550   }
551   delete [] ind_cols_tmp;
552 
553   if (isComplex) {
554     delete [] val_tmpc;
555   }
556   else {
557     delete [] val_tmp;
558   }
559 #if 0
560   if ((fp = fopen("debug.matrix.data", "w")) != NULL) {
561     for (int i = 0; i < nrow; i++) {
562       fprintf(fp, "%d : %d :: ", i, (ptrows[i + 1] - ptrows[i]));
563       for (int k = ptrows[i]; k < ptrows[i + 1]; k++) {
564 	fprintf(fp, "%d ", indcols[k]);
565       }
566       fprintf(fp, "\n");
567     }
568   }
569   fclose(fp);
570 #endif
571   int pid = (int)getpid();
572 #if 1
573   fprintf(stderr, "pid = %d\n", pid);
574   sprintf(fname, "dissection.%04d.log", pid);
575   fp = fopen(fname, "a");
576 #else
577   fp = stderr;
578 #endif
579   fprintf(stderr, "%s %d : before pthread_create\n", __FILE__, __LINE__);
580   void* results;
581   pthread_attr_t th_attr;
582   pthread_t thread;
583   pthread_attr_init(&th_attr);
584   pthread_attr_setdetachstate(&th_attr, PTHREAD_CREATE_JOINABLE);
585   int pthid = pthread_create(&thread, &th_attr,
586 			     &thread_child,
587 			     (void *)&pid);
588   if (pthid != 0) {
589     cout << "bad thread creation ? " << pid << endl;
590     exit(0);
591   }
592   fprintf(stderr, "%s %d : after pthread_create\n", __FILE__, __LINE__);
593 
594   if (isWhole) {
595     isSym = true;
596     upper_flag = false;
597   }
598 
599   if (isComplex) {
600     DissectionSolver<complex<quadruple>, quadruple,
601 		     complex<double>, double,
602 		     complex<quadruple>, quadruple>*dslv =
603       new DissectionSolver<complex<quadruple>, quadruple,
604 			   complex<double>, double,
605 			   complex<quadruple>, quadruple>(num_threads, true, 0, fp);
606     DissectionSolver<complex<double>, double,
607 		     complex<double>, double,
608 		     complex<quadruple>, quadruple> *dslv2 =
609       new DissectionSolver<complex<double>, double,
610 			   complex<double>, double,
611 			   complex<quadruple>, quadruple>(num_threads, true, 0, fp);
612 
613     int num_levels = (-1);  // automatic
614     int called = 0;
615     clock_t t0_cpu, t1_cpu, t2_cpu, t3_cpu, t4_cpu, t5_cpu;
616     elapsed_t t0_elapsed, t1_elapsed, t2_elapsed, t3_elapsed,
617       t4_elapsed, t5_elapsed;
618 #ifdef BLAS_MKL
619     mkl_set_num_threads(1);
620 #endif
621 #ifdef VECLIB
622     setenv("VECLIB_MAXIMUM_THREADS", "1", true);
623 #endif
624     t0_cpu = clock();
625     get_realtime(&t0_elapsed);
626 
627     dslv->SymbolicFact(nrow, (int *)ptrows, (int *)indcols,
628 		       isSym,
629 		       upper_flag,
630 		       isWhole,
631 		       decomposer, numlevels, minNodes);
632 
633     t1_cpu = clock();
634     get_realtime(&t1_elapsed);
635 
636     t2_cpu = clock();
637     get_realtime(&t2_elapsed);
638     _stat = 1;
639     usleep(5000);
640     dslv->NumericFact(0, (complex<double> *)coefs, scaling,
641 		      eps_pivot, kernel_detection_all);
642     _stat = (-1);
643     t3_cpu = clock();
644     get_realtime(&t3_elapsed);
645     usleep(5000);
646     fprintf(stderr, "%s %d : NumericFact() done\n", __FILE__, __LINE__);
647     dslv2->CopyQueueFwBw(*dslv);
648     int n0;
649     n0 = dslv2->kern_dimension();
650     fprintf(fp, "## kernel dimension = %d\n", n0);
651 
652     complex<double> *x = new complex<double>[nrow];
653     complex<double> *y = new complex<double>[nrow];
654     complex<double> *z = new complex<double>[nrow];
655     for (int i = 0; i < nrow; i++) {
656       y[i] = complex<double>((double)(i % 11));
657     }
658     dslv2->SpMV(y, x);
659     dslv2->SpMV(x, y);
660     for (int i = 0; i < nrow; i++) {
661       z[i] = y[i];
662     }
663 
664     t4_cpu = clock();
665     get_realtime(&t4_elapsed);
666     _stat = 1;
667     usleep(5000);
668     dslv2->SolveSingle(y, false, false, true); // with projection : true
669     _stat = (-1);
670     usleep(5000);
671     fprintf(stderr, "%s %d : SolveSingle() done\n", __FILE__, __LINE__);
672     t5_cpu = clock();
673     get_realtime(&t5_elapsed);
674     quadruple norm0, norm1;
675     norm0 = quadruple(0);
676     norm1 = quadruple(0);
677     for (int i = 0; i < nrow; i++) {
678       norm0 += x[i].real() * x[i].real() + x[i].imag() * x[i].imag();
679       complex<quadruple> ztmp = y[i] - x[i];
680       norm1 += ztmp.real() * ztmp.real() + ztmp.imag() * ztmp.imag();
681     }
682     fprintf(fp, "%s %d : ## error    = %18.7e\n",
683 	    __FILE__, __LINE__,
684 	    sqrt(quad2double(norm1 / norm0)));
685 
686     dslv2->SpMV(y, x);
687 
688     norm0 = 0.0;
689     norm1 = 0.0;
690     for (int i = 0; i < nrow; i++) {
691       norm0 += z[i].real() * z[i].real() + z[i].imag() * z[i].imag();
692       complex<quadruple> ztmp = z[i] - x[i];
693       norm1 += ztmp.real() * ztmp.real() + ztmp.imag() * ztmp.imag();
694     }
695     fprintf(fp, "%s %d : ## residual = %18.7e\n",
696 	    __FILE__, __LINE__,
697 	    sqrt(quad2double(norm1 / norm0)));
698     _stat = 0;
699     pthread_attr_destroy(&th_attr);
700     pthid = pthread_join(thread, &results);
701     if (pthid != 0) {
702       cout << "bad thread join ? " << pthid << endl;
703       exit(0);
704     }
705 
706     fprintf(fp, "%s %d : ## symbolic fact    : cpu time = %.4e elapsed time = %.4e\n",
707 	    __FILE__, __LINE__,
708 	    (double)(t1_cpu - t0_cpu) / (double)CLOCKS_PER_SEC,
709 	    convert_time(t1_elapsed, t0_elapsed));
710 
711     fprintf(fp, "%s %d : ## numeric fact     : cpu time = %.4e elapsed time = %.4e\n",
712 	    __FILE__, __LINE__,
713 	    (double)(t3_cpu - t2_cpu) / (double)CLOCKS_PER_SEC,
714 	    convert_time(t3_elapsed, t2_elapsed));
715 
716     fprintf(fp, "%s %d : ## solve single RHS : cpu time = %.4e elapsed time = %.4e\n",
717 	    __FILE__, __LINE__,
718 	    (double)(t5_cpu - t4_cpu) / (double)CLOCKS_PER_SEC,
719 	    convert_time(t5_elapsed, t4_elapsed));
720 
721     delete dslv;
722     delete [] ptrows;
723     delete [] indcols;
724     delete [] x;
725     delete [] y;
726     delete [] z;
727   }  // if (isComplex)
728   else {
729     DissectionSolver<quadruple, quadruple, double, double> *dslv =
730       new DissectionSolver<quadruple, quadruple,
731 			   double, double>(num_threads, true, 0, fp);
732 
733     DissectionSolver<double, double, double, double, quadruple, quadruple> *dslv2 =
734       new DissectionSolver<double, double, double, double, quadruple, quadruple>(num_threads, true, 0, fp);
735 
736     int num_levels = (-1); // automatic
737     int called = 0;
738     clock_t t0_cpu, t1_cpu, t2_cpu, t3_cpu, t4_cpu, t5_cpu;
739     elapsed_t t0_elapsed, t1_elapsed, t2_elapsed, t3_elapsed,
740       t4_elapsed, t5_elapsed;
741 #ifdef BLAS_MKL
742     mkl_set_num_threads(1);
743 #endif
744 #ifdef VECLIB
745    setenv("VECLIB_MAXIMUM_THREADS", "1", true);
746 #endif
747     t0_cpu = clock();
748     get_realtime(&t0_elapsed);
749 
750     dslv->SymbolicFact(nrow, (int *)ptrows, (int *)indcols,
751 		       isSym,
752 		       upper_flag,
753 		       isWhole,
754 		       decomposer, numlevels, minNodes);
755     //                  sym, upper
756     //    dslv->SaveMMMatrix(0, coefs);
757     //    exit(-1);
758     t1_cpu = clock();
759     get_realtime(&t1_elapsed);
760 
761     _stat = 1;
762     usleep(5000);
763     t2_cpu = clock();
764     get_realtime(&t2_elapsed);
765     dslv->NumericFact(0, coefs, scaling,
766 		      eps_pivot, kernel_detection_all);
767     t3_cpu = clock();
768     get_realtime(&t3_elapsed);
769     _stat = (-1);
770     usleep(5000);
771     fprintf(stderr, "%s %d : NumericFact() done\n", __FILE__, __LINE__);
772     dslv2->CopyQueueFwBw(*dslv);
773     int n0;
774     //    n0 = dslv->kern_dimension();
775     n0 = dslv2->kern_dimension();
776     fprintf(fp, "%s %d : ## kernel dimension = %d\n", __FILE__, __LINE__, n0);
777 
778     double *x = new double[nrow];
779     double *y = new double[nrow];
780     double *z = new double[nrow];
781     for (int i = 0; i < nrow; i++) {
782       y[i] = (double)(i % 11);
783     }
784     dslv2->SpMV(y, x);
785     if (n0 > 0) {
786       dslv2->ProjectionImageSingle(x);
787     }
788     dslv2->SpMV(x, y);
789     for (int i = 0; i < nrow; i++) {
790       z[i] = y[i];
791     }
792     _stat = 1;
793     usleep(5000);
794     t4_cpu = clock();
795     get_realtime(&t4_elapsed);
796     dslv2->SolveSingle(y, false, false, true); // with projection : true
797     _stat = (-1);
798     usleep(5000);
799     fprintf(stderr, "%s %d : SolveSingle() done\n", __FILE__, __LINE__);
800     if (n0 > 0) {
801       dslv2->ProjectionImageSingle(y);
802     }
803     t5_cpu = clock();
804     get_realtime(&t5_elapsed);
805 
806     double norm0, norm1;
807     norm0 = 0.0;
808     norm1 = 0.0;
809     for (int i = 0; i < nrow; i++) {
810       norm0 += x[i] * x[i];
811       norm1 += (y[i] - x[i]) * (y[i] - x[i]);
812     }
813     fprintf(fp, "%s %d : ## error    = %18.7e\n",
814 	    __FILE__, __LINE__, sqrt(quad2double(norm1 / norm0)));
815     dslv2->SpMV(y, x);
816 
817     norm0 = 0.0;
818     norm1 = 0.0;
819     for (int i = 0; i < nrow; i++) {
820       norm0 += z[i] * z[i];
821       norm1 += (z[i] - x[i]) * (z[i] - x[i]);
822     }
823     fprintf(fp, "%s %d : ## residual = %18.7e\n",
824 	    __FILE__, __LINE__, sqrt(quad2double(norm1 / norm0)));
825     _stat = 0;
826     pthread_attr_destroy(&th_attr);
827     pthid = pthread_join(thread, &results);
828     if (pthid != 0) {
829       cout << "bad thread join ? " << pthid << endl;
830       exit(0);
831     }
832 
833     fprintf(fp, "%s %d : ## symbolic fact    : cpu time = %.4e elapsed time = %.4e\n",
834 	    __FILE__, __LINE__,
835 	    (double)(t1_cpu - t0_cpu) / (double)CLOCKS_PER_SEC,
836 	    convert_time(t1_elapsed, t0_elapsed));
837 
838     fprintf(fp, "%s %d : ## numeric fact     : cpu time = %.4e elapsed time = %.4e\n",
839 	    __FILE__, __LINE__,
840 	    (double)(t3_cpu - t2_cpu) / (double)CLOCKS_PER_SEC,
841 	    convert_time(t3_elapsed, t2_elapsed));
842 
843     fprintf(fp, "## solve single RHS : cpu time = %.4e elapsed time = %.4e\n",
844 	    (double)(t5_cpu - t4_cpu) / (double)CLOCKS_PER_SEC,
845 	    convert_time(t5_elapsed, t4_elapsed));
846 
847     delete dslv;
848     delete dslv2;  // DissectionMatrix()._diag().free() still fails 28 Aug.2015
849 
850     delete [] ptrows;
851     delete [] indcols;
852     delete [] x;
853     delete [] y;
854     delete [] z;
855   }
856   if (isComplex) {
857     delete [] ccoefs;
858   }
859   else {
860     delete [] coefs;
861   }
862   fclose(fp);
863 
864 }
865