1 /*************************************************************/
2 /*                                                           */
3 /*************************************************************/
4 
5 #include <stdio.h>
6 #include <stdlib.h>
7 #include <math.h>
8 
9 #define NDEBUG
10 #include <assert.h>
11 
12 #define TAUCS_CORE_CILK
13 #include "taucs.h"
14 
15 #ifdef TAUCS_CILK
16 #pragma lang -C
17 #endif
18 
19 #ifndef TAUCS_CORE_GENERAL
20 #ifdef TAUCS_CILK
21 
22 /*** GEMM ***/
23 
24 #define TAUCS_THRESHOLD_GEMM_SMALL 20
25 #define TAUCS_THRESHOLD_GEMM_BLAS  80
26 
27 static void
taucs_gemm_NCm1p1_small(int m,int n,int k,taucs_datatype * A,int lda,taucs_datatype * B,int ldb,taucs_datatype * C,int ldc)28 taucs_gemm_NCm1p1_small(int m, int n, int k,
29 			taucs_datatype* A, int lda,
30 			taucs_datatype* B, int ldb,
31 			taucs_datatype* C, int ldc)
32 {
33   int j,i,l;
34   taucs_datatype* Cj;
35   taucs_datatype* Ail;
36   taucs_datatype* Bjl;
37   taucs_datatype  Cij;
38 
39   Cj = C;
40   for (j=0; j<n; j++) {
41     for (i=0; i<m; i++) {
42       Cij = *Cj;
43       Ail = A + i;
44       Bjl = B + j;
45       for (l=0; l<k; l++) {
46 	Cij = taucs_sub( Cij, taucs_mul( *Ail, taucs_conj( *Bjl ) ) );
47 	Ail += lda;
48 	Bjl += ldb;
49       }
50       *Cj = Cij;
51       Cj++;
52     }
53     Cj = Cj + ldc - m;
54     /* now Cj is at the top of column j+1 */
55   }
56 }
57 
58 cilk static void
taucs_cilk_gemm(char * transa,char * transb,int * pm,int * pn,int * pk,taucs_real_datatype * alpha,taucs_datatype * A,int * plda,taucs_datatype * B,int * pldb,taucs_real_datatype * beta,taucs_datatype * C,int * pldc)59 taucs_cilk_gemm(char* transa, char* transb,
60 		int* pm, int*  pn, int* pk,
61 		taucs_real_datatype* alpha,
62 		taucs_datatype *A, int *plda,
63 		taucs_datatype *B, int *pldb,
64 		taucs_real_datatype* beta,
65 		taucs_datatype *C, int *pldc)
66 {
67   int    m  = *pm;
68   int    n  = *pn;
69   int    k  = *pk;
70 
71   assert(*transa == 'N');
72   assert(*transb == 'C');
73   assert(*alpha  ==-1.0);
74   assert(*beta   == 1.0);
75 
76   if (n <= TAUCS_THRESHOLD_GEMM_SMALL && k <= TAUCS_THRESHOLD_GEMM_SMALL) {
77     /*fprintf(stderr,"GEMM SMALL\n");*/
78     taucs_gemm_NCm1p1_small(m,n,k,A,*plda,B,*pldb,C,*pldc);
79     return;
80   }
81 
82   if (n <= TAUCS_THRESHOLD_GEMM_BLAS && k <= TAUCS_THRESHOLD_GEMM_BLAS) {
83     /*fprintf(stderr,"GEMM BLAS\n");*/
84     taucs_gemm(transa, transb,
85 	       pm, pn, pk,
86 	       alpha,
87 	       A, plda,
88 	       B, pldb,
89 	       beta,
90 	       C, pldc);
91     return;
92   }
93 
94   if (k >= n && k >= m) {
95     int khalf1 = k/2;
96     int khalf2 = k-khalf1;
97     int lda = *plda;
98     int ldb = *pldb;
99     /*fprintf(stderr,"GEMM K/2\n");*/
100     spawn taucs_cilk_gemm(transa,transb,
101 			  pm, pn, &khalf1,
102 			  alpha,
103 			  A, plda,
104 			  B, pldb,
105 			  beta,
106 			  C, pldc);
107     sync;
108     spawn taucs_cilk_gemm(transa,transb,
109 			  pm, pn, &khalf2,
110 			  alpha,
111 			  A + khalf1*lda, plda,
112 			  B + khalf1*ldb, pldb,
113 			  beta,
114 			  C, pldc);
115     sync;
116     return;
117   }
118 
119   if (n >= k && n >= m) {
120     int nhalf1 = n/2;
121     int nhalf2 = n-nhalf1;
122     int ldc = *pldc;
123     /*fprintf(stderr,"GEMM N/2\n");*/
124 
125     spawn taucs_cilk_gemm(transa,transb,
126 			  pm, &nhalf1, pk,
127 			  alpha,
128 			  A, plda,
129 			  B, pldb,
130 			  beta,
131 			  C, pldc);
132 
133 
134     spawn taucs_cilk_gemm(transa,transb,
135 			  pm, &nhalf2, pk,
136 			  alpha,
137 			  A, plda,
138 			  B + nhalf1, pldb,
139 			  beta,
140 			  C + nhalf1*ldc, pldc);
141     sync;
142     return;
143   }
144 
145   if (1 /* m >= k && m >= n*/) { /* the condition must be true */
146     int mhalf1 = m/2;
147     int mhalf2 = m-mhalf1;
148     /*fprintf(stderr,"GEMM M/2\n");*/
149 
150     spawn taucs_cilk_gemm(transa,transb,
151 			  &mhalf1, pn, pk,
152 			  alpha,
153 			  A, plda,
154 			  B, pldb,
155 			  beta,
156 			  C, pldc);
157 
158     spawn taucs_cilk_gemm(transa,transb,
159 			  &mhalf2, pn, pk,
160 			  alpha,
161 			  A + mhalf1, plda,
162 			  B, pldb,
163 			  beta,
164 			  C + mhalf1, pldc);
165     sync;
166     return;
167   }
168 
169   assert(0);
170 
171 }
172 
173 /*** HERK ***/
174 
175 #define TAUCS_THRESHOLD_HERK_SMALL 20
176 #define TAUCS_THRESHOLD_HERK_BLAS  80
177 
178 static void
taucs_herk_LNm1p1_small(int n,int k,taucs_datatype * A,int lda,taucs_datatype * C,int ldc)179 taucs_herk_LNm1p1_small(int n, int k,
180 			taucs_datatype* A, int lda,
181 			taucs_datatype* C, int ldc)
182 {
183   int j,i,l;
184   taucs_datatype* Cj;
185   taucs_datatype* Ail;
186   taucs_datatype* Ajl;
187   taucs_datatype  Cij;
188 
189   Cj = C;
190   for (j=0; j<n; j++) {
191     for (i=j; i<n; i++) {
192       Cij = *Cj;
193       Ail = A + i;
194       Ajl = A + j;
195       for (l=0; l<k; l++) {
196 	Cij = taucs_sub( Cij, taucs_mul( *Ail, taucs_conj( *Ajl ) ) );
197 	Ail += lda;
198 	Ajl += lda;
199       }
200       *Cj = Cij;
201       Cj++;
202     }
203     Cj = Cj + ldc - n;
204     /* now Cj is at the top of column j+1, move to the diagonal */
205     Cj = Cj + j+1;
206   }
207 }
208 
209 cilk static void
taucs_cilk_herk(char * uplo,char * trans,int * pn,int * pk,taucs_real_datatype * alpha,taucs_datatype * A,int * plda,taucs_real_datatype * beta,taucs_datatype * C,int * pldc)210 taucs_cilk_herk(char* uplo, char* trans,
211 		int*  pn, int* pk,
212 		taucs_real_datatype* alpha,
213 		taucs_datatype *A, int *plda,
214 		taucs_real_datatype* beta,
215 		taucs_datatype *C, int *pldc)
216 {
217   int    n  = *pn;
218   int    k  = *pk;
219 
220   assert(*uplo  == 'L');
221   assert(*trans == 'N');
222   assert(*alpha ==-1.0);
223   assert(*beta  == 1.0);
224 
225   if (n <= TAUCS_THRESHOLD_HERK_SMALL && k <= TAUCS_THRESHOLD_HERK_SMALL) {
226     /*fprintf(stderr,"HERK SMALL\n");*/
227     taucs_herk_LNm1p1_small(n,k,A,*plda,C,*pldc);
228     return;
229   }
230 
231   if (n <= TAUCS_THRESHOLD_HERK_BLAS && k <= TAUCS_THRESHOLD_HERK_BLAS) {
232     /*fprintf(stderr,"HERK BLAS\n");*/
233     taucs_herk(uplo,trans,
234 	       pn, pk,
235 	       alpha,
236 	       A, plda,
237 	       beta,
238 	       C, pldc);
239     return;
240   }
241 
242   if (k > n) {
243     int khalf1 = k/2;
244     int khalf2 = k-khalf1;
245     int lda = *plda;
246     /*fprintf(stderr,"HERK K/2\n");*/
247     spawn taucs_cilk_herk(uplo,trans,
248 			  pn, &khalf1,
249 			  alpha,
250 			  A, plda,
251 			  beta,
252 			  C, pldc);
253     sync;
254     spawn taucs_cilk_herk(uplo,trans,
255 			  pn, &khalf2,
256 			  alpha,
257 			  A + khalf1*lda, plda,
258 			  beta,
259 			  C, pldc);
260     sync;
261     return;
262   } else {
263     int ldc = *pldc;
264     int nhalf1 = n/2;
265     int nhalf2 = n-nhalf1;
266     /*fprintf(stderr,"HERK N/2\n");*/
267     spawn taucs_cilk_herk(uplo,trans,
268 			  &nhalf1, pk,
269 			  alpha,
270 			  A, plda,
271 			  beta,
272 			  C, pldc);
273 
274     spawn taucs_cilk_gemm("No Transpose", "Conjugate",
275 			  &nhalf2, &nhalf1, pk,
276 			  &taucs_minusone_const,
277 			  A+nhalf1  , plda,
278 			  A         , plda,
279 			  &taucs_one_const,
280 			  C +nhalf1, pldc);
281 
282     spawn taucs_cilk_herk(uplo,trans,
283 			  &nhalf2, pk,
284 			  alpha,
285 			  A + nhalf1, plda,
286 			  beta,
287 			  C + nhalf1*ldc + nhalf1, pldc);
288 
289     sync;
290     return;
291   }
292 
293 }
294 
295 /*** TRSM ***/
296 
297 #define TAUCS_THRESHOLD_TRSM_SMALL 20
298 #define TAUCS_THRESHOLD_TRSM_BLAS  80
299 
300 static void
taucs_trsm_RLCNp1_small(int m,int n,taucs_datatype * A,int lda,taucs_datatype * B,int ldb)301 taucs_trsm_RLCNp1_small(int m, int n,
302 			taucs_datatype* A, int lda,
303 			taucs_datatype* B, int ldb)
304 {
305   int j,i,k;
306   taucs_datatype* Bi;
307   taucs_datatype* Bik;
308   taucs_datatype* Ajk;
309   taucs_datatype  Bij;
310 
311   Bi = B;
312   for (i=0; i<m; i++) {
313     for (j=0; j<n; j++) {
314       Bij = *(Bi + j*ldb);
315       Bik = Bi;
316       Ajk = A + j;
317       for (k=0; k<j; k++) {
318 	Bij = taucs_sub( Bij, taucs_mul( *Bik, taucs_conj( *Ajk ) ) );
319 	Bik += ldb;
320 	Ajk += lda;
321       }
322       *(Bi + j*ldb) = taucs_div( Bij, taucs_conj( *Ajk ) );
323     }
324     Bi = Bi + 1;
325   }
326 }
327 
328 cilk static void
taucs_cilk_trsm(char * side,char * uplo,char * transa,char * diag,int * pm,int * pn,taucs_datatype * alpha,taucs_datatype * A,int * plda,taucs_datatype * B,int * pldb)329 taucs_cilk_trsm(char* side, char* uplo, char* transa, char* diag,
330 		int*  pm, int* pn,
331 		taucs_datatype* alpha,
332 		taucs_datatype *A, int *plda,
333 		taucs_datatype *B, int *pldb)
334 {
335   int    n  = *pn;
336   int    m  = *pm;
337 
338   assert(*side   == 'R');
339   assert(*uplo   == 'L');
340   assert(*transa == 'C');
341   assert(*diag   == 'N');
342   assert(taucs_re(*alpha) == 1.0);
343   assert(taucs_im(*alpha) == 0.0);
344 
345   if (m <= TAUCS_THRESHOLD_TRSM_SMALL && n <= TAUCS_THRESHOLD_TRSM_SMALL) {
346     /*fprintf(stderr,"TRSM SMALL\n");*/
347     taucs_trsm_RLCNp1_small(m,n,A,*plda,B,*pldb);
348     return;
349   }
350 
351   if (m <= TAUCS_THRESHOLD_TRSM_BLAS && n <= TAUCS_THRESHOLD_TRSM_BLAS) {
352     /*fprintf(stderr,"TRSM BLAS\n");*/
353     taucs_trsm(side,uplo,transa,diag,
354 	       pm, pn,
355 	       alpha,
356 	       A, plda,
357 	       B, pldb);
358     return;
359   }
360 
361   if (m >= n) {
362     int mhalf1 = m/2;
363     int mhalf2 = m-mhalf1;
364     /*fprintf(stderr,"TRSM M/2\n");*/
365     spawn taucs_cilk_trsm(side,uplo,transa,diag,
366 			  &mhalf1, pn,
367 			  alpha,
368 			  A, plda,
369 			  B, pldb);
370     spawn taucs_cilk_trsm(side,uplo,transa,diag,
371 			  &mhalf2, pn,
372 			  alpha,
373 			  A, plda,
374 			  B+mhalf1, pldb);
375     sync;
376     return;
377   } else {
378     int lda = *plda;
379     int ldb = *pldb;
380     int nhalf1 = n/2;
381     int nhalf2 = n-nhalf1;
382     /*fprintf(stderr,"TRSM N/2\n");*/
383     spawn taucs_cilk_trsm(side,uplo,transa,diag,
384 			  pm, &nhalf1,
385 			  alpha,
386 			  A, plda,
387 			  B, pldb);
388     sync;
389     spawn taucs_cilk_gemm("No Transpose", "Conjugate",
390 			  pm, &nhalf2, &nhalf1,
391 			  &taucs_minusone_const,
392 			  B                  , pldb,
393 			  A+nhalf1           , plda,
394 			  &taucs_one_const,
395 			  B       +nhalf1*ldb, pldb);
396     sync;
397     spawn taucs_cilk_trsm(side,uplo,transa,diag,
398 			  pm, &nhalf2,
399 			  alpha,
400 			  A+nhalf1+nhalf1*lda, plda,
401 			  B       +nhalf1*ldb, pldb);
402     sync;
403     return;
404   }
405 }
406 
407 /*** POTRF ***/
408 
409 #define TAUCS_THRESHOLD_POTRF_SMALL 20
410 #define TAUCS_THRESHOLD_POTRF_BLAS  80
411 
412 /*
413   this routine is for Lower only, and returns
414   0 or the index of the nonpositive diagonal.
415 */
416 static int
taucs_potrf_lower_small(int n,taucs_datatype * A,int lda)417 taucs_potrf_lower_small(int n, taucs_datatype* A, int lda)
418 {
419   int j,i,k;
420   taucs_datatype* Aj;
421   taucs_datatype* Ajk;
422   taucs_datatype* Aik;
423   taucs_datatype  Aij;
424   taucs_datatype  scale;
425 
426   Aj = A;
427   for (j=0; j<n; j++) {
428     for (i=j; i<n; i++) {
429       Aij = taucs_zero_const;
430       Ajk = A+j; /* k = 0 */
431       Aik = A+i; /* k = 0 */
432       for (k=0; k<j; k++) {
433 	Aij = taucs_add( Aij , taucs_mul( (*Aik) , (*Ajk) ) );
434 	Aik += lda;
435 	Ajk += lda;
436       }
437       Aj[i] = taucs_sub( Aj[i] , Aij );
438     }
439     if ( taucs_re(Aj[j]) < 0 ) return j+1;
440     scale = taucs_div( taucs_one_const, taucs_sqrt(Aj[j]) );
441     for (i=j; i<n; i++)
442       Aj[i] = taucs_mul(Aj[i] , scale);
443     Aj += lda;
444   }
445 
446   return 0;
447 }
448 
449 cilk static void
taucs_cilk_potrf(char * uplo,int * pn,taucs_datatype * A,int * plda,int * pinfo)450 taucs_cilk_potrf(char* uplo,
451 		 int*  pn,
452 		 taucs_datatype* A, int* plda,
453 		 int*  pinfo)
454 {
455   int    n  = *pn;
456   int nhalf1,nhalf2;
457 
458   assert(*uplo == 'L');
459 
460   if (n <= TAUCS_THRESHOLD_POTRF_SMALL) {
461     /*fprintf(stderr,"POTRF SMALL\n");*/
462     *pinfo = taucs_potrf_lower_small(*pn,A,*plda);
463     return;
464   }
465 
466   if (n <= TAUCS_THRESHOLD_POTRF_BLAS) {
467     /*fprintf(stderr,"POTRF BLAS\n");*/
468     taucs_potrf(uplo,pn,A,plda,pinfo);
469     return;
470   }
471 
472   /*fprintf(stderr,"POTRF RECIRSIVE\n");*/
473 
474   nhalf1 = n/2;
475   nhalf2 = n-nhalf1;
476 
477   spawn taucs_cilk_potrf(uplo,&nhalf1,A,plda,pinfo);
478   sync;
479 
480   if (*pinfo) return;
481 
482   spawn taucs_cilk_trsm ("Right",
483 			 "Lower",
484 			 "Conjugate",
485 			 "No unit diagonal",
486 			 &nhalf2,&nhalf1,
487 			 &taucs_one_const,
488 			 A,       plda,
489 			 A+nhalf1,plda);
490   sync;
491   /*
492   taucs_trsm ("Right",
493 	      "Lower",
494 	      "Conjugate",
495 	      "No unit diagonal",
496 	      &nhalf2,&nhalf1,
497 	      &taucs_one_const,
498 	      A,       plda,
499 	      A+nhalf1,plda);
500   */
501 
502   spawn taucs_cilk_herk ("Lower",
503 			 "No Conjugate",
504 			 &nhalf2,&nhalf1,
505 			 &taucs_minusone_real_const,
506 			 A+nhalf1,plda,
507 			 &taucs_one_real_const,
508 			 A+nhalf1+(nhalf1 * *plda), plda);
509   sync;
510 
511   spawn taucs_cilk_potrf(uplo,&nhalf2,A+nhalf1+(nhalf1 * *plda),plda,pinfo);
512   sync;
513 
514   if (*pinfo) *pinfo += nhalf1;
515 }
516 #else
517 #define taucs_cilk_potrf taucs_potrf
518 #define taucs_cilk_gemm  taucs_gemm
519 #define taucs_cilk_trsm  taucs_trsm
520 #define taucs_cilk_herk  taucs_herk
521 #endif
522 #endif
523 
524 /*************************************************************/
525 /* These are really generic routines                         */
526 /*************************************************************/
527 
528 #if 0
529 #ifdef TAUCS_CORE_GENERAL
530 void* taucs_cilk_init() {
531 #ifdef TAUCS_CILK
532   CilkContext* context;
533   int argc;
534 #define CILK_ACTIVE_SIZE 8
535   char* argv[] = {"program_name","--nproc","8",0};
536 
537   for (argc=0; argv[argc]; argc++);
538 
539   taucs_printf("taucs_cilk_init\n");
540   context = Cilk_init(&argc,argv);
541   return context;
542 #else
543   taucs_printf("taucs_cilk_init: This is not a Cilk build\n");
544   return NULL;
545 #endif
546 }
547 
548 void taucs_cilk_terminate(void* context) {
549 #ifdef TAUCS_CILK
550   Cilk_terminate((CilkContext*) context);
551 #endif
552 }
553 
554 #endif /* TAUCS_CORE_GENERAL */
555 #endif
556 /*************************************************************/
557 /* End of Cilk-related generic routines                      */
558 /*************************************************************/
559 
560 #if 0
561 /*omer added this, I don't know why yet*/
562 #ifdef TAUCS_CORE_GENERAL
563 taucs_double taucs_dzero_const     =  0.0;
564 taucs_double taucs_done_const      =  1.0;
565 taucs_double taucs_dminusone_const = -1.0;
566 
567 taucs_single taucs_szero_const     =  0.0f;
568 taucs_single taucs_sone_const      =  1.0f;
569 taucs_single taucs_sminusone_const = -1.0f;
570 #endif
571 #endif
572 
573 /*************************************************************/
574 /* structures                                                */
575 /*************************************************************/
576 
577 #define FALSE 0
578 #define TRUE  1
579 
580 /*#define BLAS_FLOPS_CUTOFF  1000.0*/
581 #define BLAS_FLOPS_CUTOFF  -1.0
582 #define SOLVE_DENSE_CUTOFF 5
583 
584 typedef struct {
585   int     sn_size;
586   int     n;
587   int*    rowind;
588 
589   int     up_size;
590   int*    sn_vertices;
591   int*    up_vertices;
592 
593   taucs_datatype* f1;
594   taucs_datatype* f2;
595   taucs_datatype* u;
596 
597 } supernodal_frontal_matrix;
598 
599 #define SFM_F1 f1
600 #define SFM_F2 f2
601 #define SFM_U   u
602 
603 typedef struct {
604   int     flags;
605 
606   char    uplo;     /* 'u' for upper, 'l' for lower, ' ' don't know; prefer lower. */
607   int     n;        /* size of matrix */
608   int     n_sn;     /* number of supernodes */
609 
610   int* parent;      /* supernodal elimination tree */
611   int* first_child;
612   int* next_child;
613 
614   int* sn_size;     /* size of supernodes (diagonal block) */
615   int* sn_up_size;  /* size of subdiagonal update blocks   */
616   int** sn_struct;  /* row structure of supernodes         */
617 
618   int* sn_blocks_ld;  /* lda of supernode blocks */
619   taucs_datatype** sn_blocks; /* supernode blocks        */
620 
621   int* up_blocks_ld;  /* lda of update blocks    */
622   taucs_datatype** up_blocks; /* update blocks           */
623 } supernodal_factor_matrix;
624 
625 #ifdef TAUCS_CORE_GENERAL
626 /*************************************************************/
627 /* for qsort                                                 */
628 /*************************************************************/
629 
630 /* this is never used */
631 /*
632 static int compare_ints(void* vx, void* vy)
633 {
634   int* ix = (int*)vx;
635   int* iy = (int*)vy;
636   if (*ix < *iy) return -1;
637   if (*ix > *iy) return  1;
638   return 0;
639 }
640 */
641 
642 static int* compare_indirect_map;
compare_indirect_ints(const void * vx,const void * vy)643 static int compare_indirect_ints( const void* vx, const void* vy)
644 {
645   int* ix = (int*)vx;
646   int* iy = (int*)vy;
647   if (compare_indirect_map[*ix] < compare_indirect_map[*iy]) return -1;
648   if (compare_indirect_map[*ix] > compare_indirect_map[*iy]) return  1;
649   return 0;
650 }
651 
652 /*************************************************************/
653 /* radix sort                                                */
654 /*************************************************************/
655 
656 #if 0
657 /* NCOUNTS = 2^LOGRADIX */
658 
659 #define RADIX_SORT_LOGRADIX 4
660 #define RADIX_SORT_NCOUNTS  16
661 
662 static unsigned int counts[RADIX_SORT_NCOUNTS];
663 
664 static int
665 radix_sort(unsigned int* x, int n)
666 {
667   int i;
668   unsigned int mask;
669 
670   unsigned int  ncounts;
671 
672   unsigned int* y;
673   unsigned int* to;
674   unsigned int* from;
675 
676   unsigned int v;
677   unsigned int partialsum;
678   unsigned int next;
679   unsigned int bits_sorted;
680 
681   if (RADIX_SORT_LOGRADIX >= 8*sizeof(unsigned int)) {
682     taucs_printf("radix sort: radix too large.\n");
683     /* the computation of ncounts will fail */
684     return 0;
685   }
686 
687   mask    = 0;
688   ncounts = 1;
689   for (i=0; i<RADIX_SORT_LOGRADIX; i++) {
690     mask = (mask << 1) | 1;
691     ncounts = ncounts << 1;
692   }
693 
694   assert(ncounts==RADIX_SORT_NCOUNTS);
695 
696   y      = (unsigned int*) taucs_malloc(n       * sizeof(unsigned int));
697   if (!y) {
698     taucs_printf("radix sort: out of memory.\n");
699     return -1;
700   }
701 
702   from = x;
703   to   = y;
704 
705   bits_sorted = 0;
706   while(bits_sorted < 8*sizeof(unsigned int)) {
707     for (i=0; i<ncounts; i++) counts[i] = 0;
708 
709     for (i=0; i<n; i++) {
710       v = (from[i] >> bits_sorted) & mask;
711       assert(v < ncounts);
712       counts[v] ++;
713     }
714 
715     partialsum = 0;
716     for (i=0; i<ncounts; i++) {
717       /*printf("<%d ",counts[i]);*/
718       next = counts[i];
719       counts[i] = partialsum;
720       /*printf("%d>\n",counts[i]);*/
721       partialsum = partialsum + next;
722     }
723 
724     for (i=0; i<n; i++) {
725       v = (from[i] >> bits_sorted) & mask;
726       assert(counts[v] < n);
727       to[counts[v]] = from[i];
728       counts[v] ++;
729     }
730     /*
731     printf("===========\n");
732     for (i=0; i<n; i++) printf(">>%d>> %08x\n",bits_sorted,to[i]);
733     printf("===========\n");
734     */
735 
736     bits_sorted += RADIX_SORT_LOGRADIX;
737     if (from == x) {
738       from = y;
739       to   = x;
740     } else {
741       from = x;
742       to   = y;
743     }
744   }
745 
746   if (from == y)
747     for (i=0; i<n; i++) x[i] = y[i];
748 
749   taucs_free(y);
750 
751   return 0;
752 }
753 #endif
754 
755 #endif /* TAUCS_CORE_GENERAL */
756 /*************************************************************/
757 /* create and free the factor object                         */
758 /*************************************************************/
759 
760 #ifndef TAUCS_CORE_GENERAL
761 
762 static supernodal_factor_matrix*
multifrontal_supernodal_create()763 multifrontal_supernodal_create()
764 {
765   supernodal_factor_matrix* L;
766 
767   L = (supernodal_factor_matrix*) taucs_malloc(sizeof(supernodal_factor_matrix));
768   if (!L) return NULL;
769 
770 #ifdef TAUCS_CORE_SINGLE
771   L->flags = TAUCS_SINGLE;
772 #endif
773 
774 #ifdef TAUCS_CORE_DOUBLE
775   L->flags = TAUCS_DOUBLE;
776 #endif
777 
778 #ifdef TAUCS_CORE_SCOMPLEX
779   L->flags = TAUCS_SCOMPLEX;
780 #endif
781 
782 #ifdef TAUCS_CORE_DCOMPLEX
783   L->flags = TAUCS_DCOMPLEX;
784 #endif
785 
786   L->uplo      = 'l';
787   L->n         = -1; /* unused */
788 
789   L->sn_struct   = NULL;
790   L->sn_size     = NULL;
791   L->sn_up_size  = NULL;
792   L->parent      = NULL;
793   L->first_child = NULL;
794   L->next_child  = NULL;
795   L->sn_blocks_ld  = NULL;
796   L->sn_blocks     = NULL;
797   L->up_blocks_ld  = NULL;
798   L->up_blocks     = NULL;
799 
800   return L;
801 }
802 
taucs_dtl(supernodal_factor_free)803 void taucs_dtl(supernodal_factor_free)(void* vL)
804 {
805   supernodal_factor_matrix* L = (supernodal_factor_matrix*) vL;
806   int sn;
807 
808   if (!L) return;
809 
810   taucs_free(L->parent);
811   taucs_free(L->first_child);
812   taucs_free(L->next_child);
813 
814   taucs_free(L->sn_size);
815   taucs_free(L->sn_up_size);
816   taucs_free(L->sn_blocks_ld);
817   taucs_free(L->up_blocks_ld);
818 
819   if (L->sn_struct)
820     for (sn=0; sn<L->n_sn; sn++)
821       taucs_free(L->sn_struct[sn]);
822 
823   if (L->sn_blocks)
824     for (sn=0; sn<L->n_sn; sn++)
825       taucs_free(L->sn_blocks[sn]);
826 
827   if (L->up_blocks)
828     for (sn=0; sn<L->n_sn; sn++)
829       taucs_free(L->up_blocks[sn]);
830 
831   taucs_free(L->sn_struct);
832   taucs_free(L->sn_blocks);
833   taucs_free(L->up_blocks);
834 
835   taucs_free(L);
836 }
837 
taucs_dtl(supernodal_factor_free_numeric)838 void taucs_dtl(supernodal_factor_free_numeric)(void* vL)
839 {
840   supernodal_factor_matrix* L = (supernodal_factor_matrix*) vL;
841   int sn;
842 
843   for (sn=0; sn<L->n_sn; sn++) {
844     taucs_free(L->sn_blocks[sn]);
845     L->sn_blocks[sn] = NULL;
846     taucs_free(L->up_blocks[sn]);
847     L->up_blocks[sn] = NULL;
848   }
849 }
850 
851 taucs_ccs_matrix*
taucs_dtl(supernodal_factor_to_ccs)852 taucs_dtl(supernodal_factor_to_ccs)(void* vL)
853 {
854   supernodal_factor_matrix* L = (supernodal_factor_matrix*) vL;
855   taucs_ccs_matrix* C;
856   int n,nnz;
857   int i,j,ip,jp,sn,next;
858   taucs_datatype v;
859   int* len;
860 
861   n = L->n;
862 
863   len = (int*) taucs_malloc(n*sizeof(int));
864   if (!len) return NULL;
865 
866   nnz = 0;
867   /*
868   for (sn=0; sn<L->n_sn; sn++) {
869     for (jp=0; jp<(L->sn_size)[sn]; jp++) {
870       j = (L->sn_struct)[sn][jp];
871       len[j] = (L->sn_up_size)[sn] - jp;
872       nnz += len[j];
873     }
874   }
875   */
876 
877   for (sn=0; sn<L->n_sn; sn++) {
878     for (jp=0; jp<(L->sn_size)[sn]; jp++) {
879       j = (L->sn_struct)[sn][jp];
880       len[j] = 0;
881 
882       for (ip=jp; ip<(L->sn_size)[sn]; ip++) {
883 	i = (L->sn_struct)[sn][ ip ];
884 	v = (L->sn_blocks)[sn][ jp*(L->sn_blocks_ld)[sn] + ip ];
885 
886 	if (taucs_re(v) || taucs_im(v)) {
887 	  len[j] ++;
888 	  nnz ++;
889 	}
890       }
891       for (ip=(L->sn_size)[sn]; ip<(L->sn_up_size)[sn]; ip++) {
892 	i = (L->sn_struct)[sn][ ip ];
893 	v = (L->up_blocks)[sn][ jp*(L->up_blocks_ld)[sn] + (ip-(L->sn_size)[sn]) ];
894 
895 	if (taucs_re(v) || taucs_im(v)) {
896 	  len[j] ++;
897 	  nnz ++;
898 	}
899       }
900     }
901   }
902 
903 
904   C = taucs_dtl(ccs_create)(n,n,nnz);
905   if (!C) {
906     taucs_free(len);
907     return NULL;
908   }
909 
910 #ifdef TAUCS_CORE_SINGLE
911   C->flags = TAUCS_SINGLE;
912 #endif
913 
914 #ifdef TAUCS_CORE_DOUBLE
915   C->flags = TAUCS_DOUBLE;
916 #endif
917 
918 #ifdef TAUCS_CORE_SCOMPLEX
919   C->flags = TAUCS_SCOMPLEX;
920 #endif
921 
922 #ifdef TAUCS_CORE_DCOMPLEX
923   C->flags = TAUCS_DCOMPLEX;
924 #endif
925 
926   C->flags |= TAUCS_TRIANGULAR | TAUCS_LOWER;
927 
928   (C->colptr)[0] = 0;
929   for (j=1; j<=n; j++) (C->colptr)[j] = (C->colptr)[j-1] + len[j-1];
930 
931   taucs_free(len);
932 
933   for (sn=0; sn<L->n_sn; sn++) {
934     for (jp=0; jp<(L->sn_size)[sn]; jp++) {
935       j = (L->sn_struct)[sn][jp];
936 
937       next = (C->colptr)[j];
938 
939       /*
940       memcpy((C->rowind) + next,
941 	     ((L->sn_struct)[sn]) + jp,
942 	     ((L->sn_up_size)[sn] - jp) * sizeof(int));
943       memcpy((C->taucs_values) + next,
944 	     ((L->sn_blocks)[sn]) + (jp*(L->sn_blocks_ld)[sn] + jp),
945 	     ((L->sn_size)[sn] - jp) * sizeof(taucs_datatype));
946       next += ((L->sn_size)[sn] - jp);
947       memcpy((C->taucs_values) + next,
948 	     ((L->up_blocks)[sn]) + jp*(L->up_blocks_ld)[sn],
949 	     ((L->sn_up_size)[sn] - (L->sn_size)[sn]) * sizeof(taucs_datatype));
950       */
951 
952       for (ip=jp; ip<(L->sn_size)[sn]; ip++) {
953 	i = (L->sn_struct)[sn][ ip ];
954 	v = (L->sn_blocks)[sn][ jp*(L->sn_blocks_ld)[sn] + ip ];
955 
956 	if (!taucs_re(v) && !taucs_im(v)) continue;
957 	/*if (v == 0.0) continue;*/
958 
959 	(C->rowind)[next] = i;
960 	(C->taucs_values)[next] = v;
961 	next++;
962       }
963       for (ip=(L->sn_size)[sn]; ip<(L->sn_up_size)[sn]; ip++) {
964 	i = (L->sn_struct)[sn][ ip ];
965 	v = (L->up_blocks)[sn][ jp*(L->up_blocks_ld)[sn] + (ip-(L->sn_size)[sn]) ];
966 
967 	if (!taucs_re(v) && !taucs_im(v)) continue;
968 	/*if (v == 0.0) continue;*/
969 
970 	(C->rowind)[next] = i;
971 	(C->taucs_values)[next] = v;
972 	next++;
973       }
974     }
975   }
976 
977   return C;
978 }
979 
980 /* just get the diagonal of a supernodal factor, for Penny */
981 
982 taucs_datatype*
taucs_dtl(supernodal_factor_get_diag)983 taucs_dtl(supernodal_factor_get_diag)(void* vL)
984 {
985   supernodal_factor_matrix* L = (supernodal_factor_matrix*) vL;
986   int j,ip,jp,sn;/*i,next omer*/
987   taucs_datatype  v;
988   taucs_datatype* diag;
989 
990   diag = (taucs_datatype*) taucs_malloc((L->n) * sizeof(taucs_datatype));
991   if (!diag) return NULL;
992 
993   for (sn=0; sn<L->n_sn; sn++) {
994     for (jp=0; jp<(L->sn_size)[sn]; jp++) {
995       j = (L->sn_struct)[sn][jp];
996 
997       ip=jp; /* we just want the diagonal */
998 
999       v = (L->sn_blocks)[sn][ jp*(L->sn_blocks_ld)[sn] + ip ];
1000 
1001       diag[ j ] = v;
1002     }
1003   }
1004 
1005   return diag;
1006 }
1007 
1008 
1009 /*************************************************************/
1010 /* create and free frontal matrices                          */
1011 /*************************************************************/
1012 
1013 static supernodal_frontal_matrix*
supernodal_frontal_create(int * firstcol_in_supernode,int sn_size,int n,int * rowind)1014 supernodal_frontal_create(int* firstcol_in_supernode,
1015 			  int sn_size,
1016 			  int n,
1017 			  int* rowind)
1018 {
1019   supernodal_frontal_matrix* tmp;
1020 
1021   tmp = (supernodal_frontal_matrix*)taucs_malloc(sizeof(supernodal_frontal_matrix));
1022   if(tmp==NULL) return NULL;
1023 
1024   tmp->sn_size = sn_size;
1025   tmp->n = n;
1026 
1027   tmp->rowind = rowind;
1028 
1029   tmp->n = n;
1030   tmp->sn_size = sn_size;
1031   tmp->up_size = n-sn_size;
1032 
1033   tmp->sn_vertices = rowind;
1034   tmp->up_vertices = rowind + sn_size;
1035 
1036   /* on some platforms, malloc(0) fails, so we avoid such calls */
1037 
1038   tmp->SFM_F1 = tmp->SFM_F2 = tmp->SFM_U = NULL;
1039 
1040   if (tmp->sn_size)
1041     tmp->SFM_F1 = (taucs_datatype*)taucs_calloc((tmp->sn_size)*(tmp->sn_size),sizeof(taucs_datatype));
1042 
1043   if (tmp->sn_size && tmp->up_size)
1044     tmp->SFM_F2 = (taucs_datatype*)taucs_calloc((tmp->up_size)*(tmp->sn_size),sizeof(taucs_datatype));
1045 
1046   if (tmp->up_size)
1047     tmp->SFM_U  = (taucs_datatype*)taucs_calloc((tmp->up_size)*(tmp->up_size),sizeof(taucs_datatype));
1048 
1049   if((   tmp->SFM_F1==NULL && tmp->sn_size)
1050      || (tmp->SFM_F2==NULL && tmp->sn_size && tmp->up_size)
1051      || (tmp->SFM_U ==NULL && tmp->up_size)) {
1052     taucs_free(tmp->SFM_U);
1053     taucs_free(tmp->SFM_F1);
1054     taucs_free(tmp->SFM_F2);
1055     taucs_free(tmp);
1056     return NULL;
1057   }
1058 
1059   assert(tmp);
1060   return tmp;
1061 }
1062 
supernodal_frontal_free(supernodal_frontal_matrix * to_del)1063 static void supernodal_frontal_free(supernodal_frontal_matrix* to_del)
1064 {
1065   /*
1066      SFM_F1 and SFM_F2 are moved to the factor,
1067      but this function may be called before they are
1068      moved.
1069   */
1070 
1071 
1072   if (to_del) {
1073     taucs_free(to_del->SFM_F1);
1074     taucs_free(to_del->SFM_F2);
1075     taucs_free(to_del->SFM_U);
1076     taucs_free(to_del);
1077   }
1078 }
1079 
1080 /*************************************************************/
1081 /* factor a frontal matrix                                   */
1082 /*************************************************************/
1083 
1084 cilk
1085 static int
multifrontal_supernodal_front_factor(int sn,int * firstcol_in_supernode,int sn_size,taucs_ccs_matrix * A,supernodal_frontal_matrix * mtr,int * bitmap,supernodal_factor_matrix * snL)1086 multifrontal_supernodal_front_factor(int sn,
1087 				     int* firstcol_in_supernode,
1088 				     int sn_size,
1089 				     taucs_ccs_matrix* A,
1090 				     supernodal_frontal_matrix* mtr,
1091 				     int* bitmap,
1092 				     supernodal_factor_matrix* snL)
1093 {
1094   int i,j;
1095   int* ind;
1096   taucs_datatype* re;
1097   int INFO;
1098 
1099   /* creating transform for real indices */
1100   for(i=0;i<mtr->sn_size;i++) bitmap[mtr->sn_vertices[i]] = i;
1101   for(i=0;i<mtr->up_size;i++) bitmap[mtr->up_vertices[i]] = mtr->sn_size + i;
1102 
1103   /* adding sn_size column of A to first sn_size column of frontal matrix */
1104 
1105   for(j=0;j<(mtr->sn_size);j++) {
1106     ind = &(A->rowind[A->colptr[*(firstcol_in_supernode+j)]]);
1107     re  = &(A->taucs_values[A->colptr[*(firstcol_in_supernode+j)]]);
1108     for(i=0;
1109 	i < A->colptr[*(firstcol_in_supernode+j)+1]
1110             - A->colptr[*(firstcol_in_supernode+j)];
1111 	i++) {
1112       if (bitmap[ind[i]] < mtr->sn_size)
1113 	mtr->SFM_F1[ (mtr->sn_size)*j + bitmap[ind[i]]] =
1114 	  taucs_add( mtr->SFM_F1[ (mtr->sn_size)*j + bitmap[ind[i]]] , re[i] );
1115       else
1116 	mtr->SFM_F2[ (mtr->up_size)*j + bitmap[ind[i]] - mtr->sn_size] =
1117 	  taucs_add( mtr->SFM_F2[ (mtr->up_size)*j + bitmap[ind[i]] - mtr->sn_size] , re[i] );
1118     }
1119   }
1120 
1121   /* we use the BLAS through the Fortran interface */
1122 
1123   /* solving of lower triangular system for L */
1124   if (mtr->sn_size) {
1125     /*
1126     taucs_potrf ("LOWER",
1127 		 &(mtr->sn_size),
1128 		 mtr->SFM_F1,&(mtr->sn_size),
1129 		 &INFO);
1130     */
1131     spawn taucs_cilk_potrf ("LOWER",
1132 		 &(mtr->sn_size),
1133 		 mtr->SFM_F1,&(mtr->sn_size),
1134 		 &INFO);
1135     sync;
1136   }
1137 
1138 
1139   if (INFO) {
1140     taucs_printf("sivan %d %d\n",sn,sn_size);
1141     taucs_printf("\t\tLL^T Factorization: Matrix is not positive definite.\n");
1142     taucs_printf("\t\t                    nonpositive pivot in column %d\n",
1143 		 mtr->sn_vertices[INFO-1]);
1144     return -1;
1145   }
1146 
1147   /* getting completion for found columns of L */
1148   if (mtr->up_size && mtr->sn_size) {
1149 
1150     spawn taucs_cilk_trsm ("Right",
1151 			   "Lower",
1152 			   "Conjugate",
1153 			   "No unit diagonal",
1154 			   &(mtr->up_size),&(mtr->sn_size),
1155 			   &taucs_one_const,
1156 			   mtr->SFM_F1,&(mtr->sn_size),
1157 			   mtr->SFM_F2,&(mtr->up_size));
1158     sync;
1159     /*
1160     taucs_trsm ("Right",
1161 		"Lower",
1162 		"Conjugate",
1163 		"No unit diagonal",
1164 		&(mtr->up_size),&(mtr->sn_size),
1165 		&taucs_one_const,
1166 		mtr->SFM_F1,&(mtr->sn_size),
1167 		mtr->SFM_F2,&(mtr->up_size));
1168     */
1169   }
1170 
1171   (snL->sn_blocks   )[sn] = mtr->SFM_F1;
1172   (snL->sn_blocks_ld)[sn] = mtr->sn_size;
1173 
1174   (snL->up_blocks   )[sn] = mtr->SFM_F2;
1175   (snL->up_blocks_ld)[sn] = mtr->up_size;
1176   /* printf("*** sn=%d up_ld=%d (%d)\n",sn,mtr->up_size,(snL->up_vertex_ptr)[sn+1] - (snL->up_vertex_ptr)[sn]);*/
1177 
1178   /* computation of updated part of frontal matrix */
1179   if (mtr->up_size && mtr->sn_size) {
1180     spawn taucs_cilk_herk ("Lower",
1181 			   "No Conjugate",
1182 			   &(mtr->up_size),&(mtr->sn_size),
1183 			   &taucs_minusone_real_const,
1184 			   mtr->SFM_F2,&(mtr->up_size),
1185 			   &taucs_one_real_const,
1186 			   mtr->SFM_U, &(mtr->up_size));
1187     sync;
1188   }
1189 
1190   mtr->SFM_F1 = NULL; /* so we don't free twice */
1191   mtr->SFM_F2 = NULL; /* so we don't free twice */
1192 
1193   return 0;
1194  }
1195 
1196 /*************************************************************/
1197 /* extend-add                                                */
1198 /*************************************************************/
1199 
1200 static void
multifrontal_supernodal_front_extend_add(supernodal_frontal_matrix * parent_mtr,supernodal_frontal_matrix * my_mtr,int * bitmap)1201 multifrontal_supernodal_front_extend_add(
1202 					 supernodal_frontal_matrix* parent_mtr,
1203 					 supernodal_frontal_matrix* my_mtr,
1204 					 int* bitmap)
1205 {
1206   int j,i,parent_i,parent_j;
1207   taucs_datatype v;
1208 
1209   for(i=0;i<parent_mtr->sn_size;i++) bitmap[parent_mtr->sn_vertices[i]] = i;
1210   for(i=0;i<parent_mtr->up_size;i++) bitmap[parent_mtr->up_vertices[i]] = (parent_mtr->sn_size)+i;
1211 
1212   /* extend add operation for update matrix */
1213   for(j=0;j<my_mtr->up_size;j++) {
1214     for(i=j;i<my_mtr->up_size;i++) {
1215       parent_j = bitmap[ my_mtr->up_vertices[j] ];
1216       parent_i = bitmap[ my_mtr->up_vertices[i] ];
1217       /* we could skip this if indices were sorted */
1218       if (parent_j>parent_i) {
1219 	int tmp = parent_j;
1220 	parent_j = parent_i;
1221 	parent_i = tmp;
1222       }
1223 
1224       v = (my_mtr->SFM_U)[(my_mtr->up_size)*j+i];
1225 
1226       if (parent_j < parent_mtr->sn_size) {
1227 	if (parent_i < parent_mtr->sn_size) {
1228 	  (parent_mtr->SFM_F1)[ (parent_mtr->sn_size)*parent_j + parent_i] =
1229 	    taucs_add( (parent_mtr->SFM_F1)[ (parent_mtr->sn_size)*parent_j + parent_i] , v );
1230 	} else {
1231 	  (parent_mtr->SFM_F2)[ (parent_mtr->up_size)*parent_j + (parent_i-parent_mtr->sn_size)] =
1232 	    taucs_add( (parent_mtr->SFM_F2)[ (parent_mtr->up_size)*parent_j + (parent_i-parent_mtr->sn_size)] , v );
1233 	}
1234       } else {
1235 	(parent_mtr->SFM_U)[ (parent_mtr->up_size)*(parent_j-parent_mtr->sn_size) + (parent_i-parent_mtr->sn_size)] =
1236 	  taucs_add( (parent_mtr->SFM_U)[ (parent_mtr->up_size)*(parent_j-parent_mtr->sn_size) + (parent_i-parent_mtr->sn_size)] , v);
1237       }
1238     }
1239   }
1240 }
1241 
1242 #endif /*#ifndef TAUCS_CORE_GENERAL*/
1243 
1244 /*************************************************************/
1245 /* symbolic elimination                                      */
1246 /*************************************************************/
1247 
1248 #ifdef TAUCS_CORE_GENERAL
1249 
1250 /* UNION FIND ROUTINES */
1251 
uf_makeset(int * uf,int i)1252 static int uf_makeset(int* uf, int i)        { uf[i] = i; return i; }
uf_find(int * uf,int i)1253 static int uf_find   (int* uf, int i)
1254 {
1255   if (uf[i] != i)
1256     uf[i] = uf_find(uf,uf[i]);
1257   return uf[i];
1258 }
uf_union(int * uf,int s,int t)1259 static int uf_union  (int* uf, int s, int t) {
1260   if (uf_find(uf,s) < uf_find(uf,t)) {
1261     uf[uf_find(uf,s)] = uf_find(uf,t);
1262     return (uf_find(uf,t));
1263   } else {
1264     uf[uf_find(uf,s)] = uf_find(uf,t);
1265     return (uf_find(uf,t));
1266   }
1267 }
1268 
1269 static
recursive_postorder(int j,int first_child[],int next_child[],int postorder[],int ipostorder[],int * next)1270 void recursive_postorder(int  j,
1271 			 int  first_child[],
1272 			 int  next_child[],
1273 			 int  postorder[],
1274 			 int  ipostorder[],
1275 			 int* next)
1276 {
1277   int c;
1278   for (c=first_child[j]; c != -1; c = next_child[c]) {
1279     /*printf("*** %d is child of %d\n",c,j);*/
1280     recursive_postorder(c,first_child,next_child,
1281 			postorder,ipostorder,next);
1282   }
1283   /*printf(">>> j=%d next=%d\n",j,*next);*/
1284   if (postorder)  postorder [*next] = j;
1285   if (ipostorder) ipostorder[j] = *next;
1286   (*next)++;
1287 }
1288 
1289 #define GILBERT_NG_PEYTON_ANALYSIS_SUP
1290 
1291 /* in a few tests the supernodal version seemed slower */
1292 #undef GILBERT_NG_PEYTON_ANALYSIS_SUP
1293 
ordered_uf_makeset(int * uf,int i)1294 static int ordered_uf_makeset(int* uf, int i)
1295 {
1296   uf[i] = i;
1297   return i;
1298 }
ordered_uf_find(int * uf,int i)1299 static int ordered_uf_find   (int* uf, int i)
1300 {
1301   if (uf[i] != i)
1302     uf[i] = uf_find(uf,uf[i]);
1303   return uf[i];
1304 }
ordered_uf_union(int * uf,int s,int t)1305 static int ordered_uf_union  (int* uf, int s, int t)
1306 {
1307   assert(uf[t] == t);
1308   assert(uf[s] == s);
1309   assert(t > s);
1310   if (t > s) {
1311     uf[s] = t;
1312     return t;
1313   } else
1314     uf[t] = s;
1315     return s;
1316 }
1317 
1318 static void
tree_level(int j,int isroot,int first_child[],int next_child[],int level[],int level_j)1319 tree_level(int j,
1320 	   int isroot,
1321 	   int first_child[],
1322 	   int next_child[],
1323 	   int level[],
1324 	   int level_j)
1325 {
1326   int c;
1327   if (!isroot) level[j] = level_j;
1328   for (c=first_child[j]; c != -1; c = next_child[c]) {
1329     tree_level(c,
1330 	       FALSE,
1331 	       first_child,
1332 	       next_child,
1333 	       level,
1334 	       level_j+1);
1335   }
1336 }
1337 
1338 static void
tree_first_descendant(int j,int isroot,int first_child[],int next_child[],int ipostorder[],int first_descendant[])1339 tree_first_descendant(int j,
1340 		      int isroot,
1341 		      int first_child[],
1342 		      int next_child[],
1343 		      int ipostorder[],
1344 		      int first_descendant[])
1345 {
1346   int c;
1347   int fd = ipostorder[j];
1348   for (c=first_child[j]; c != -1; c = next_child[c]) {
1349     tree_first_descendant(c,
1350 			  FALSE,
1351 			  first_child,
1352 			  next_child,
1353 			  ipostorder,
1354 			  first_descendant);
1355     if (first_descendant[c] < fd) fd = first_descendant[c];
1356   }
1357   if (!isroot) first_descendant[j] = fd;
1358 }
1359 
1360 
1361 int
1362 taucs_ccs_etree(taucs_ccs_matrix* A,
1363 		int* parent,
1364 		int* l_colcount,
1365 		int* l_rowcount,
1366 		int* l_nnz);
1367 
1368 int
1369 taucs_ccs_etree_liu(taucs_ccs_matrix* A,
1370 		    int* parent,
1371 		    int* l_colcount,
1372 		    int* l_rowcount,
1373 		    int* l_nnz);
1374 
1375 
1376 
1377 static int
recursive_symbolic_elimination(int j,taucs_ccs_matrix * A,int first_child[],int next_child[],int * n_sn,int sn_size[],int sn_up_size[],int * sn_rowind[],int sn_first_child[],int sn_next_child[],int rowind[],int column_to_sn_map[],int map[],int do_order,int ipostorder[])1378 recursive_symbolic_elimination(int            j,
1379 			       taucs_ccs_matrix* A,
1380 			       int            first_child[],
1381 			       int            next_child[],
1382 			       int*           n_sn,
1383 			       int            sn_size[],
1384 			       int            sn_up_size[],
1385 			       int*           sn_rowind[],
1386 			       int            sn_first_child[],
1387 			       int            sn_next_child[],
1388 			       int            rowind[],
1389 			       int            column_to_sn_map[],
1390 			       int            map[],
1391 			       int            do_order,
1392 			       int            ipostorder[]
1393 			       )
1394 {
1395   int  i,ip,c,c_sn;
1396   int  in_previous_sn;
1397   int  nnz = 0; /* just to suppress the warning */
1398 
1399   for (c=first_child[j]; c != -1; c = next_child[c]) {
1400     if (recursive_symbolic_elimination(c,A,
1401 				       first_child,next_child,
1402 				       n_sn,
1403 				       sn_size,sn_up_size,sn_rowind,
1404 				       sn_first_child,sn_next_child,
1405 				       rowind, /* temporary */
1406 				       column_to_sn_map,
1407 				       map,
1408 				       do_order,ipostorder
1409 				       )
1410 	== -1) return -1;
1411   }
1412 
1413   in_previous_sn = 1;
1414   if (j == A->n)
1415     in_previous_sn = 0; /* this is not a real column */
1416   else if (first_child[j] == -1)
1417     in_previous_sn = 0; /* this is a leaf */
1418   else if (next_child[first_child[j]] != -1)
1419     in_previous_sn = 0; /* more than 1 child */
1420   else {
1421     /* check that the structure is nested */
1422     /* map contains child markers         */
1423 
1424     c=first_child[j];
1425     for (ip=(A->colptr)[j]; ip<(A->colptr)[j+1]; ip++) {
1426       i = (A->rowind)[ip];
1427       in_previous_sn = in_previous_sn && (map[i] == c);
1428     }
1429   }
1430 
1431   if (in_previous_sn) {
1432     c = first_child[j];
1433     c_sn = column_to_sn_map[c];
1434     column_to_sn_map[j] = c_sn;
1435 
1436     /* swap row indices so j is at the end of the */
1437     /* supernode, not in the update indices       */
1438     for (ip=sn_size[c_sn]; ip<sn_up_size[c_sn]; ip++)
1439       if (sn_rowind[c_sn][ip] == j) break;
1440     assert(ip<sn_up_size[c_sn]);
1441     sn_rowind[c_sn][ip] = sn_rowind[c_sn][sn_size[c_sn]];
1442     sn_rowind[c_sn][sn_size[c_sn]] = j;
1443 
1444     /* mark the nonzeros in the map */
1445     for (ip=sn_size[c_sn]; ip<sn_up_size[c_sn]; ip++)
1446       map[ sn_rowind[c_sn][ip] ] = j;
1447 
1448     sn_size   [c_sn]++;
1449 
1450     return 0;
1451   }
1452 
1453   /* we are in a new supernode */
1454 
1455   if (j < A->n) {
1456     nnz = 1;
1457     rowind[0] = j;
1458     map[j]    = j;
1459 
1460     for (c=first_child[j]; c != -1; c = next_child[c]) {
1461       c_sn = column_to_sn_map[c];
1462       for (ip=sn_size[c_sn]; ip<sn_up_size[c_sn]; ip++) {
1463 	i = sn_rowind[c_sn][ip];
1464 	if (i > j && map[i] != j) { /* new row index */
1465 	  map[i] = j;
1466 	  rowind[nnz] = i;
1467 	  nnz++;
1468 	}
1469       }
1470     }
1471 
1472     for (ip=(A->colptr)[j]; ip<(A->colptr)[j+1]; ip++) {
1473       i = (A->rowind)[ip];
1474       if (map[i] != j) { /* new row index */
1475 	map[i] = j;
1476 	rowind[nnz] = i;
1477 	nnz++;
1478       }
1479     }
1480   }
1481 
1482   /*printf("children of sn %d: ",*n_sn);*/
1483   for (c=first_child[j]; c != -1; c = next_child[c]) {
1484     c_sn = column_to_sn_map[c];
1485     /*printf("%d ",c_sn);*/
1486     if (c==first_child[j])
1487       sn_first_child[*n_sn] = c_sn;
1488     else {
1489       sn_next_child[ c_sn ] = sn_first_child[*n_sn];
1490       sn_first_child[*n_sn] = c_sn;
1491     }
1492   }
1493   /*printf("\n");*/
1494 
1495   if (j < A->n) {
1496     column_to_sn_map[j] = *n_sn;
1497     sn_size   [*n_sn] = 1;
1498     sn_up_size[*n_sn] = nnz;
1499     sn_rowind [*n_sn] = (int*) taucs_malloc(nnz * sizeof(int));
1500     if (!( sn_rowind [*n_sn] )) return -1;
1501     for (ip=0; ip<nnz; ip++) sn_rowind[*n_sn][ip] = rowind[ip];
1502     if (do_order) {
1503       /* Sivan and Vladimir: we think that we can sort in */
1504       /* column order, not only in etree postorder.       */
1505       /*
1506 	radix_sort(sn_rowind [*n_sn],nnz);
1507 	qsort(sn_rowind [*n_sn],nnz,sizeof(int),compare_ints);
1508       */
1509       compare_indirect_map = ipostorder;
1510       qsort(sn_rowind [*n_sn],nnz,sizeof(int),compare_indirect_ints);
1511     }
1512     assert(sn_rowind [*n_sn][0] == j);
1513     (*n_sn)++;
1514   }
1515 
1516   return 0;
1517 }
1518 
1519 /* count zeros and nonzeros in a supernode to compute the */
1520 /* utility of merging fundamental supernodes.             */
1521 
1522 typedef struct {
1523   double zeros;
1524   double nonzeros;
1525 } znz;
1526 
1527 static znz
recursive_amalgamate_supernodes(int sn,int * n_sn,int sn_size[],int sn_up_size[],int * sn_rowind[],int sn_first_child[],int sn_next_child[],int rowind[],int column_to_sn_map[],int map[],int do_order,int ipostorder[])1528 recursive_amalgamate_supernodes(int           sn,
1529 				int*           n_sn,
1530 				int            sn_size[],
1531 				int            sn_up_size[],
1532 				int*           sn_rowind[],
1533 				int            sn_first_child[],
1534 				int            sn_next_child[],
1535 				int            rowind[],
1536 				int            column_to_sn_map[],
1537 				int            map[],
1538 				int            do_order,
1539 				int            ipostorder[]
1540 				)
1541 {
1542   int  i,ip,c_sn,gc_sn;
1543   /*int  i,ip,c,c_sn,gc_sn;*/
1544   int  nnz;
1545   int  nchildren /*, ichild*/; /* number of children, child index */
1546   znz* c_znz = NULL;
1547   znz  sn_znz, merged_znz;
1548   /*int zero_count = 0;*/
1549   int new_sn_size, new_sn_up_size;
1550 
1551   sn_znz.zeros    = 0.0;
1552   sn_znz.nonzeros = (double) (((sn_up_size[sn] - sn_size[sn]) * sn_size[sn])
1553                               + (sn_size[sn] * (sn_size[sn] + 1))/2);
1554 
1555   if (sn_first_child[sn] == -1) { /* leaf */
1556     return sn_znz;
1557   }
1558 
1559   nchildren = 0;
1560   for (c_sn=sn_first_child[sn]; c_sn != -1; c_sn = sn_next_child[c_sn])
1561     nchildren++;
1562 
1563   /*  c_znz = (znz*) alloca(nchildren * sizeof(znz));*/
1564   c_znz = (znz*) taucs_malloc(nchildren * sizeof(znz));
1565   assert(c_znz);
1566 
1567   /*printf("supernode %d out of %d\n",sn,*n_sn);*/
1568 
1569   /* merge the supernode with its children! */
1570 
1571   i = 0;
1572   for (c_sn=sn_first_child[sn]; c_sn != -1; c_sn = sn_next_child[c_sn]) {
1573     c_znz[i] =
1574       recursive_amalgamate_supernodes(c_sn,
1575 				      n_sn,
1576 				      sn_size,sn_up_size,sn_rowind,
1577 				      sn_first_child,sn_next_child,
1578 				      rowind, /* temporary */
1579 				      column_to_sn_map,
1580 				      map,
1581 				      do_order,ipostorder
1582 				      );
1583     assert(c_znz[i].zeros + c_znz[i].nonzeros ==
1584 	   (double) (((sn_up_size[c_sn] - sn_size[c_sn]) * sn_size[c_sn])
1585 		     + (sn_size[c_sn] * (sn_size[c_sn] + 1))/2 ));
1586     i++;
1587   }
1588 
1589   merged_znz.nonzeros = sn_znz.nonzeros;
1590   merged_znz.zeros    = sn_znz.zeros;
1591 
1592   for (i=0; i<nchildren; i++) {
1593     merged_znz.nonzeros += (c_znz[i]).nonzeros;
1594     merged_znz.zeros    += (c_znz[i]).zeros;
1595   }
1596 
1597   taucs_free(c_znz);
1598 
1599   /*  printf("supernode %d out of %d (continuing)\n",sn,*n_sn);*/
1600 
1601   /* should we merge the supernode with its children? */
1602 
1603   nnz = 0;
1604   for (c_sn=sn_first_child[sn]; c_sn != -1; c_sn = sn_next_child[c_sn]) {
1605     for (ip=0; ip<sn_size[c_sn]; ip++) {
1606       i = sn_rowind[c_sn][ip];
1607       assert( map[i] != sn );
1608       map[i] = sn;
1609       rowind[nnz] = i;
1610       nnz++;
1611     }
1612   }
1613 
1614   for (ip=0; ip<sn_size[sn]; ip++) {
1615     i = sn_rowind[sn][ip];
1616     assert( map[i] != sn );
1617     map[i] = sn;
1618     rowind[nnz] = i;
1619     nnz++;
1620   }
1621 
1622   new_sn_size = nnz;
1623 
1624   for (c_sn=sn_first_child[sn]; c_sn != -1; c_sn = sn_next_child[c_sn]) {
1625     for (ip=sn_size[c_sn]; ip<sn_up_size[c_sn]; ip++) {
1626       i = sn_rowind[c_sn][ip];
1627       if (map[i] != sn) { /* new row index */
1628 	map[i] = sn;
1629 	rowind[nnz] = i;
1630 	nnz++;
1631       }
1632     }
1633   }
1634 
1635   for (ip=sn_size[sn]; ip<sn_up_size[sn]; ip++) {
1636     i = sn_rowind[sn][ip];
1637     if (map[i] != sn) { /* new row index */
1638       map[i] = sn;
1639       rowind[nnz] = i;
1640       nnz++;
1641     }
1642   }
1643 
1644   new_sn_up_size = nnz;
1645 
1646   if (do_order) {
1647     compare_indirect_map = ipostorder;
1648     qsort(rowind,nnz,sizeof(int),compare_indirect_ints);
1649   }
1650 
1651   /* determine whether we should merge the supernode and its children */
1652 
1653   {
1654     int n;
1655     double* zcount = NULL;
1656 
1657     n = 0;
1658     for (ip=0; ip<nnz; ip++) {
1659       i = rowind[ip];
1660       if (i >= n) n = i+1;
1661     }
1662 
1663     /*zcount = (double*) alloca(n * sizeof(double));*/
1664     zcount = (double*) taucs_malloc(n * sizeof(double));
1665     assert(zcount);
1666 
1667     for (ip=0; ip<new_sn_size; ip++) {
1668       i = rowind[ip]; assert(i<n);
1669       zcount[i] = (double) (ip+1);
1670     }
1671     for (ip=new_sn_size; ip<new_sn_up_size; ip++) {
1672       i = rowind[ip]; assert(i<n);
1673       zcount[i] = (double) new_sn_size;
1674     }
1675 
1676     /*
1677     for (ip=0; ip<new_sn_up_size; ip++)
1678       printf("row %d zcount = %.0f\n",rowind[ip],zcount[rowind[ip]]);
1679     */
1680 
1681     for (c_sn=sn_first_child[sn]; c_sn != -1; c_sn = sn_next_child[c_sn]) {
1682       for (ip=0; ip<sn_size[c_sn]; ip++) {
1683 	i = sn_rowind[c_sn][ip]; assert(i<n);
1684 	zcount[i] -= (double) (ip+1);
1685       }
1686       for (ip=sn_size[c_sn]; ip<sn_up_size[c_sn]; ip++) {
1687 	i = sn_rowind[c_sn][ip]; assert(i<n);
1688 	zcount[i] -= (double) sn_size[c_sn];
1689       }
1690     }
1691 
1692     for (ip=0; ip<sn_size[sn]; ip++) {
1693       i = sn_rowind[sn][ip]; assert(i<n);
1694       zcount[i] -= (double) (ip+1);
1695     }
1696     for (ip=sn_size[sn]; ip<sn_up_size[sn]; ip++) {
1697       i = sn_rowind[sn][ip]; assert(i<n);
1698       zcount[i] -= (double) sn_size[sn];
1699     }
1700 
1701     /*
1702     for (ip=0; ip<new_sn_up_size; ip++)
1703       printf("ROW %d zcount = %.0f\n",rowind[ip],zcount[rowind[ip]]);
1704     printf("zeros before merging %.0f\n",merged_znz.zeros);
1705     */
1706 
1707     for (ip=0; ip<new_sn_up_size; ip++) {
1708       i = rowind[ip]; assert(i<n);
1709       assert(zcount[i] >= 0.0);
1710       merged_znz.zeros += zcount[i];
1711     }
1712 
1713     /*printf("zeros after merging %.0f\n",merged_znz.zeros);*/
1714 
1715     /* voodoo constants (need some kind of a utility function */
1716     if ((new_sn_size < 16)
1717 	||
1718 	((sn_size[sn] < 50) && (merged_znz.zeros < 0.5 * merged_znz.nonzeros))
1719 	||
1720 	((sn_size[sn] < 250) && (merged_znz.zeros < 0.25 * merged_znz.nonzeros))
1721 	||
1722 	((sn_size[sn] < 500) && (merged_znz.zeros < 0.10 * merged_znz.nonzeros))
1723 	||
1724 	(merged_znz.zeros < 0.05 * merged_znz.nonzeros)
1725 	) {
1726       /*
1727       taucs_printf("merging sn %d, zeros (%f) vs nonzeros (%f)\n",
1728 		   sn,merged_znz.zeros,merged_znz.nonzeros);
1729       */
1730     } else {
1731       /*
1732       taucs_printf("sn %d, too many zeros (%f) vs nonzeros (%f)\n",
1733 		   sn,merged_znz.zeros,merged_znz.nonzeros);
1734       printf("returning without merging\n");
1735       */
1736       taucs_free(zcount);
1737       return sn_znz;
1738     }
1739 
1740     taucs_free(zcount);
1741   }
1742 
1743   /* now merge the children lists */
1744 
1745   sn_size[sn]    = new_sn_size;
1746   sn_up_size[sn] = new_sn_up_size;
1747   sn_rowind[sn]  = (int*) taucs_realloc(sn_rowind[sn],
1748 				  new_sn_up_size * sizeof(int));
1749   for (ip=0; ip<new_sn_up_size; ip++) sn_rowind[sn][ip] = rowind[ip];
1750 
1751   /*  printf("supernode %d out of %d (merging)\n",sn,*n_sn);*/
1752 
1753   nchildren = 0;
1754   for (c_sn=sn_first_child[sn]; c_sn != -1; c_sn = sn_next_child[c_sn]) {
1755     for (ip=0; ip<sn_size[c_sn]; ip++) {
1756       i = (sn_rowind[c_sn])[ip];
1757       assert(column_to_sn_map[i] == c_sn);
1758       column_to_sn_map[i] = sn;
1759     }
1760 
1761     for (gc_sn=sn_first_child[c_sn]; gc_sn != -1; gc_sn = sn_next_child[gc_sn]) {
1762       rowind[nchildren] = gc_sn;
1763       nchildren++;
1764     }
1765   }
1766 
1767   /* free the children's rowind vectors */
1768   for (c_sn=sn_first_child[sn]; c_sn != -1; c_sn = sn_next_child[c_sn]) {
1769     taucs_free( sn_rowind[c_sn] );
1770     sn_rowind[c_sn]  = NULL;
1771     sn_size[c_sn]    = 0;
1772     sn_up_size[c_sn] = 0;
1773   }
1774 
1775   sn_first_child[sn] = -1;
1776   for (i=0; i<nchildren; i++) {
1777     sn_next_child[ rowind[i] ] = sn_first_child[sn];
1778     sn_first_child[sn] = rowind[i];
1779   }
1780 
1781   /*
1782   printf("supernode %d out of %d (done)\n",sn,*n_sn);
1783   printf("returning, merging\n");
1784   */
1785   return merged_znz;
1786 }
1787 #endif /* #ifdef TAUCS_CORE_GENERAL */
1788 
1789 
1790 #ifndef TAUCS_CORE_GENERAL
1791 
1792 
1793 /*************************************************************/
1794 /* factor routines                                           */
1795 /*************************************************************/
1796 
extend_add_wrapper(supernodal_frontal_matrix * child_matrix,supernodal_frontal_matrix ** my_matrix_ptr,int is_root,int * v,int sn_size,int sn_up_size,int * rowind,int * bitmap,int * fail)1797 static void extend_add_wrapper(supernodal_frontal_matrix * child_matrix,
1798 			       supernodal_frontal_matrix ** my_matrix_ptr,
1799 			       int is_root,
1800 			       int *v,
1801 			       int sn_size,
1802 			       int sn_up_size,
1803 			       int * rowind,
1804 			       int * bitmap,
1805 			       int * fail) {
1806 
1807   if (*fail) {
1808     if (*my_matrix_ptr)
1809       supernodal_frontal_free(*my_matrix_ptr);
1810     return;
1811   }
1812 
1813   if (!is_root) {
1814     if (!(*my_matrix_ptr)) {
1815       *my_matrix_ptr = supernodal_frontal_create(v,sn_size,sn_up_size,rowind);
1816       if (!(*my_matrix_ptr)) {
1817 	*fail = TRUE;
1818 	supernodal_frontal_free(child_matrix);
1819 	return;
1820       }
1821     }
1822     multifrontal_supernodal_front_extend_add(*my_matrix_ptr,child_matrix,bitmap);
1823   }
1824 
1825   /* moved outside "if !is_root"; Sivan 27 Feb 2002 */
1826   supernodal_frontal_free(child_matrix);
1827 }
1828 
1829 cilk
1830 static supernodal_frontal_matrix*
recursive_multifrontal_supernodal_factor_llt(int sn,int is_root,int ** bitmaps,taucs_ccs_matrix * A,supernodal_factor_matrix * snL,int * fail)1831 recursive_multifrontal_supernodal_factor_llt(int sn,       /* this supernode */
1832 					     int is_root,  /* is v the root? */
1833 #if 1
1834 					     int** bitmaps,
1835 #else
1836 					     int* bitmap,
1837 #endif
1838 					     taucs_ccs_matrix* A,
1839 					     supernodal_factor_matrix* snL,
1840 					     int* fail)
1841 {
1842   supernodal_frontal_matrix* my_matrix=NULL;
1843   /*supernodal_frontal_matrix* child_matrix=NULL;*/
1844   int child;
1845   int* v;
1846   int  sn_size;
1847   int* first_child   = snL->first_child;
1848   int* next_child    = snL->next_child;
1849 
1850 #ifdef TAUCS_CILK
1851   /* Inlet for syncronization */
1852   inlet void extend_add_inlet(supernodal_frontal_matrix * child_matrix) {
1853 
1854     if (!is_root) {
1855       if (!(my_matrix)) {
1856 	my_matrix = supernodal_frontal_create(v,sn_size,
1857 					      snL->sn_up_size[sn],snL->sn_struct[sn]);
1858 
1859 	if (!(my_matrix)) {
1860 	  *fail = TRUE;
1861 	  supernodal_frontal_free(child_matrix);
1862 	  return;
1863 	}
1864       }
1865       multifrontal_supernodal_front_extend_add(my_matrix,child_matrix,bitmaps[Self]);
1866     }
1867 
1868     /* moved outside "if !is_root"; Sivan 27 Feb 2002 */
1869     supernodal_frontal_free(child_matrix);
1870     /*
1871       The following approach is not working correctly because nothing is guaranteed about different procedure instances atomcity:
1872 
1873       extend_add_wrapper(child_matrix,&my_matrix,is_root,v,sn_size,snL->sn_up_size[sn],snL->sn_struct[sn],bitmaps[Self],fail);
1874     */
1875   }
1876 #endif
1877 
1878   /* Sivan fixed a bug 25/2/2003: v was set even at the root, */
1879   /* but this element of sn_struct was not allocated.         */
1880 
1881   if (!is_root) {
1882     sn_size = snL->sn_size[sn];
1883     v = &( snL->sn_struct[sn][0] );
1884   } else {
1885     sn_size = -1;
1886     v = NULL; /* not used */
1887   }
1888 
1889   for (child = first_child[sn]; child != -1; child = next_child[child]) {
1890     /* original non-cilk code: */
1891     /*
1892     child_matrix =
1893       recursive_multifrontal_supernodal_factor_llt(child,
1894 						   FALSE,
1895 						   bitmap,
1896 						   A,snL,fail);
1897     */
1898 
1899 #ifdef TAUCS_CILK
1900     extend_add_inlet(spawn recursive_multifrontal_supernodal_factor_llt(child,
1901 									FALSE,
1902 									bitmaps,
1903 									A,snL,fail));
1904 #else
1905     extend_add_wrapper(recursive_multifrontal_supernodal_factor_llt(child,
1906 								    FALSE,
1907 								    bitmaps,
1908 								    A,snL,fail),
1909 		       &my_matrix,
1910 		       is_root,
1911 		       v,
1912 		       sn_size,
1913 		       snL->sn_up_size[sn],
1914 		       snL->sn_struct[sn],
1915 		       bitmaps[Self],
1916 		       fail);
1917 #endif
1918 
1919 
1920     if (*fail) {
1921       if (my_matrix) supernodal_frontal_free(my_matrix);
1922       return NULL;
1923     }
1924 
1925 #if 0
1926     if (!is_root) {
1927       if (!my_matrix) {
1928 	my_matrix =  supernodal_frontal_create(v,sn_size,
1929 					       snL->sn_up_size[sn],
1930 					       snL->sn_struct[sn]);
1931 	if (!my_matrix) {
1932 	  *fail = TRUE;
1933 	  supernodal_frontal_free(child_matrix);
1934 	  return NULL;
1935 	}
1936       }
1937       multifrontal_supernodal_front_extend_add(my_matrix,child_matrix,bitmap);
1938     }
1939     /* moved outside "if !is_root"; Sivan 27 Feb 2002 */
1940     supernodal_frontal_free(child_matrix);
1941 #endif /* 0, old pre-cilk code */
1942   }
1943   sync;
1944 
1945   /* in case we have no children, we allocate now */
1946   if (!is_root && !my_matrix) {
1947     my_matrix =  supernodal_frontal_create(v,sn_size,
1948 					   snL->sn_up_size[sn],
1949 					   snL->sn_struct[sn]);
1950     if (!my_matrix) {
1951       *fail = TRUE;
1952       return NULL;
1953     }
1954   }
1955 
1956   if(!is_root) {
1957     int rc;
1958     rc = spawn multifrontal_supernodal_front_factor(sn,
1959 						    v,sn_size,
1960 						    A,
1961 						    my_matrix,
1962 #if 1
1963 						    bitmaps[Self],
1964 #else
1965 						    bitmap,
1966 #endif
1967 						    snL);
1968     sync;
1969     if (rc) {
1970       /* nonpositive pivot */
1971       *fail = TRUE;
1972       supernodal_frontal_free(my_matrix);
1973       return NULL;
1974     }
1975   }
1976   return my_matrix;
1977 }
1978 
1979 cilk
1980 void*
taucs_dtl(ccs_factor_llt_mf)1981 taucs_dtl(ccs_factor_llt_mf)(taucs_ccs_matrix* A)
1982 {
1983   void* p;
1984 
1985   p = spawn taucs_dtl(ccs_factor_llt_mf_maxdepth)(A,0);
1986   sync;
1987 
1988   return p;
1989 }
1990 
1991 cilk
1992 static void
recursive_multifrontal_supernodal_factor_llt_caller(int n_sn,int is_root,taucs_ccs_matrix * A,supernodal_factor_matrix * snL,int * fail)1993 recursive_multifrontal_supernodal_factor_llt_caller(int n_sn,     /* this supernode */
1994 						    int is_root,  /* is v the root? */
1995 						    taucs_ccs_matrix* A,
1996 						    supernodal_factor_matrix* snL,
1997 						    int* fail)
1998 {
1999   int** maps;
2000   int   i,j;
2001   supernodal_frontal_matrix* always_null;
2002 
2003   maps = (int**)taucs_malloc(Cilk_active_size*sizeof(int*));
2004   if (!maps) {
2005     taucs_supernodal_factor_free(snL);
2006     assert(0); return;
2007     /*return NULL;*/
2008   }
2009 
2010   for (i=0; i < Cilk_active_size; i++) {
2011     maps[i] = (int*)taucs_malloc((A->n+1)*sizeof(int));
2012     if (!maps[i]) {
2013       for (j=0; j < i ; j++)
2014 	taucs_free(maps[j]);
2015       taucs_free(maps);
2016       taucs_supernodal_factor_free(snL);
2017       assert(0); return;
2018       /*return NULL;*/
2019     }
2020   }
2021 
2022   /*#ifdef TAUCS_CILK  */
2023 #if 0
2024   context = Cilk_init(&argc,argv);
2025   always_null = EXPORT(recursive_multifrontal_supernodal_factor_llt)(context,
2026 								     n_sn,
2027 								     TRUE,
2028 								     maps,
2029 								     A,snL,fail);
2030   Cilk_terminate(context);
2031 #else
2032   always_null = spawn recursive_multifrontal_supernodal_factor_llt(n_sn,
2033 								   TRUE,
2034 								   maps,
2035 								   A,snL,fail);
2036   sync;
2037 #endif
2038 
2039   for(i=0;i<Cilk_active_size;i++)
2040     taucs_free(maps[i]);
2041   taucs_free(maps);
2042 
2043   /*
2044     always_null = spawn recursive_multifrontal_supernodal_factor_llt((L->n_sn),
2045     TRUE,
2046     map,
2047     A,L,&fail);
2048   */
2049 }
2050 
2051 cilk
2052 void*
taucs_dtl(ccs_factor_llt_mf_maxdepth)2053 taucs_dtl(ccs_factor_llt_mf_maxdepth)(taucs_ccs_matrix* A,int max_depth)
2054 {
2055   supernodal_factor_matrix* L;
2056 #if 1
2057 #else
2058   int* map;
2059 #endif
2060   int fail;
2061   double wtime, ctime;
2062 
2063   wtime = taucs_wtime();
2064   ctime = taucs_ctime();
2065 
2066   L = multifrontal_supernodal_create();
2067   if (!L) return NULL;
2068 
2069 #ifdef TAUCS_CORE_COMPLEX
2070   fail = taucs_ccs_symbolic_elimination(A,L,
2071 					TRUE /* sort, to avoid complex conjuation */,
2072 					max_depth);
2073 #else
2074   fail = taucs_ccs_symbolic_elimination(A,L,
2075 					FALSE /* don't sort row indices */          ,
2076 					max_depth);
2077 #endif
2078   if (fail == -1) {
2079     taucs_supernodal_factor_free(L);
2080     return NULL;
2081   }
2082 
2083   wtime = taucs_wtime()-wtime;
2084   ctime = taucs_ctime()-ctime;
2085   taucs_printf("\t\tSymbolic Analysis            = % 10.3f seconds (%.3f cpu)\n",
2086 	       wtime,ctime);
2087 
2088 #if 1
2089 #else
2090   map = (int*)taucs_malloc((A->n+1)*sizeof(int));
2091   if (!map) {
2092     taucs_supernodal_factor_free(L);
2093     return NULL;
2094   }
2095 #endif
2096 
2097   wtime = taucs_wtime();
2098   ctime = taucs_ctime();
2099 
2100   fail = FALSE;
2101   spawn recursive_multifrontal_supernodal_factor_llt_caller((L->n_sn),
2102 							    TRUE,
2103 							    A,L,&fail);
2104   sync;
2105 
2106   wtime = taucs_wtime()-wtime;
2107   ctime = taucs_ctime()-ctime;
2108   taucs_printf("\t\tSupernodal Multifrontal LL^T = % 10.3f seconds (%.3f cpu)\n",
2109 	       wtime,ctime);
2110 
2111 #if 1
2112 #else
2113   taucs_free(map);
2114 #endif
2115 
2116   if (fail) {
2117     taucs_supernodal_factor_free(L);
2118     return NULL;
2119   }
2120 
2121   return (void*) L;
2122 }
2123 
2124 /*************************************************************/
2125 /* symbolic-numeric routines                                 */
2126 /*************************************************************/
2127 
2128 void*
taucs_dtl(ccs_factor_llt_symbolic)2129 taucs_dtl(ccs_factor_llt_symbolic)(taucs_ccs_matrix* A)
2130 {
2131   return taucs_dtl(ccs_factor_llt_symbolic_maxdepth)(A,0);
2132 }
2133 
2134 void*
taucs_dtl(ccs_factor_llt_symbolic_maxdepth)2135 taucs_dtl(ccs_factor_llt_symbolic_maxdepth)(taucs_ccs_matrix* A, int max_depth)
2136 {
2137   supernodal_factor_matrix* L;
2138   int fail;
2139   double wtime, ctime;
2140 
2141   wtime = taucs_wtime();
2142   ctime = taucs_ctime();
2143 
2144   L = multifrontal_supernodal_create();
2145   if (!L) return NULL;
2146 
2147 #ifdef TAUCS_CORE_COMPLEX
2148   fail = taucs_ccs_symbolic_elimination(A,L,
2149 					TRUE /* sort, to avoid complex conjuation */,
2150 					max_depth);
2151 #else
2152   fail = taucs_ccs_symbolic_elimination(A,L,
2153 					FALSE /* don't sort row indices */          ,
2154 					max_depth);
2155 #endif
2156 
2157   if (fail == -1) {
2158     taucs_supernodal_factor_free(L);
2159     return NULL;
2160   }
2161 
2162   wtime = taucs_wtime()-wtime;
2163   ctime = taucs_ctime()-ctime;
2164   taucs_printf("\t\tSymbolic Analysis            = % 10.3f seconds (%.3f cpu)\n",
2165 	       wtime,ctime);
2166   return L;
2167 }
2168 
2169 cilk
2170 int
taucs_dtl(ccs_factor_llt_numeric)2171 taucs_dtl(ccs_factor_llt_numeric)(taucs_ccs_matrix* A,void* vL)
2172 {
2173   supernodal_factor_matrix* L = (supernodal_factor_matrix*) vL;
2174   int* map;
2175   int fail;
2176   double wtime, ctime;
2177 
2178   map = (int*)taucs_malloc((A->n+1)*sizeof(int));
2179 
2180   wtime = taucs_wtime();
2181   ctime = taucs_ctime();
2182 
2183   /* XXX: sivan, we don't need map */
2184   fail = FALSE;
2185   spawn recursive_multifrontal_supernodal_factor_llt_caller((L->n_sn),
2186 							    TRUE,
2187 							    A,L,&fail);
2188   sync;
2189   /*
2190     recursive_multifrontal_supernodal_factor_llt((L->n_sn),
2191 					       TRUE,
2192 					       map,
2193 					       A,L,&fail);
2194   */
2195 
2196   wtime = taucs_wtime()-wtime;
2197   ctime = taucs_ctime()-ctime;
2198   taucs_printf("\t\tSupernodal Multifrontal LL^T = % 10.3f seconds (%.3f cpu)\n",
2199 	       wtime,ctime);
2200 
2201   taucs_free(map);
2202 
2203   if (fail) {
2204     taucs_supernodal_factor_free_numeric(L);
2205     return -1;
2206   }
2207 
2208   return 0;
2209 }
2210 
2211 /*************************************************************/
2212 /* left-looking factor routines                              */
2213 /*************************************************************/
2214 
2215 static void
recursive_leftlooking_supernodal_update(int J,int K,int bitmap[],taucs_datatype * dense_update_matrix,taucs_ccs_matrix * A,supernodal_factor_matrix * L)2216 recursive_leftlooking_supernodal_update(int J,int K,
2217 					int bitmap[],
2218 					taucs_datatype* dense_update_matrix,
2219 					taucs_ccs_matrix* A,
2220 					supernodal_factor_matrix* L)
2221 {
2222   int i,j,ir;
2223   int  child;
2224   int* first_child   = L->first_child;
2225   int* next_child    = L->next_child;
2226   int sn_size_father = (L->sn_size)[J];
2227   int sn_up_size_father = (L->sn_up_size)[J];
2228   int sn_size_child = (L->sn_size)[K];
2229   int sn_up_size_child = (L->sn_up_size)[K];
2230   int exist_upd=0;
2231   int first_row = 0;
2232   int row_count=0;
2233   int PK,M,N,LDA,LDB,LDC;
2234 
2235   /*
2236   for(i=0;i<sn_size_father;i++) {
2237     bitmap[L->sn_struct[J][i]]=i+1;
2238   }
2239 
2240   for(i=sn_size_father;i<sn_up_size_father;i++)
2241     bitmap[L->sn_struct[J][i]] = i - sn_size_father + 1;
2242   */
2243 
2244   for(i=sn_size_child;i<sn_up_size_child;i++)
2245     /* is this row index included in the columns of sn J? */
2246     if(bitmap[L->sn_struct[K][i]]
2247        && L->sn_struct[K][i] <= L->sn_struct[J][sn_size_father-1]) {
2248       if(!exist_upd) first_row = i;
2249       row_count++;
2250       exist_upd = 1;
2251       /*taucs_printf("update from K = %d to J = %d \n",K,J);*/
2252       /* loop over columns of sn K */
2253 
2254       /* for(j=0;j<sn_size_child;j++)
2255 	for(ir=i;ir<sn_up_size_child;ir++)
2256 	  if( L->sn_struct[K][ir] <= L->sn_struct[J][sn_size_father-1]){
2257 	    L->sn_blocks[J][ (bitmap[L->sn_struct[K][i]]-1)*(L->sn_blocks_ld[J])+(bitmap[L->sn_struct[K][ir]]-1)] -= L->up_blocks[K][j*(L->up_blocks_ld[K])+ir-sn_size_child]* L->up_blocks[K][j*L->up_blocks_ld[K]+i-sn_size_child];
2258 	    taucs_printf("sn_block: L[%d,%d] = %lf\n",(bitmap[L->sn_struct[K][ir]]-1),(bitmap[L->sn_struct[K][i]]-1),L->sn_blocks[J][ (bitmap[L->sn_struct[K][i]]-1)*(L->sn_blocks_ld[J])+(bitmap[L->sn_struct[K][ir]]-1)]);}
2259 	  else{
2260 	    L->up_blocks[J][ (bitmap[L->sn_struct[K][i]]-1)*(L->up_blocks_ld[J])+(bitmap[L->sn_struct[K][ir]]-1)] -=  L->up_blocks[K][j*L->up_blocks_ld[K]+ir-sn_size_child]* L->up_blocks[K][j*L->up_blocks_ld[K]+i-sn_size_child];
2261 	   taucs_printf("up_block: L[%d,%d] = %lf\n",(bitmap[L->sn_struct[K][ir]]-1),(bitmap[L->sn_struct[K][i]]-1),L->up_blocks[J][ (bitmap[L->sn_struct[K][i]]-1)*(L->up_blocks_ld[J])+(bitmap[L->sn_struct[K][ir]]-1)]);
2262 	   }*/
2263         }
2264 
2265   if(exist_upd){
2266     LDA = LDB = (L->up_blocks_ld)[K];
2267     M  = sn_up_size_child - first_row ; /* +-1 ? */
2268     LDC =  sn_up_size_father;
2269     N  = row_count;
2270     PK = L->sn_size[K];
2271 
2272     /* The GEMM code computes on the upper triangle of the trapezoidal
2273        matrix, which is junk. */
2274     /*
2275     taucs_gemm ("No Conjugate",
2276 		"Conjugate",
2277 		&M,&N,&PK,
2278 		&taucs_one_const,
2279 		&(L->up_blocks[K][first_row-sn_size_child]),&LDA,
2280 		&(L->up_blocks[K][first_row-sn_size_child]),&LDB,
2281 		&taucs_zero_const,
2282 		dense_update_matrix,&LDC);
2283     */
2284 
2285     /* This is the HERK+GEMM fix by Elad */
2286     taucs_herk ("Lower",
2287 		"No Conjugate",
2288 		&N,&PK,
2289 		&taucs_one_real_const,
2290 		&(L->up_blocks[K][first_row-sn_size_child]),&LDA,
2291 		&taucs_zero_real_const,
2292 		dense_update_matrix,&LDC);
2293 
2294     if(M-N > 0)
2295     {
2296         int newM = M - N;
2297 
2298         taucs_gemm ("No Conjugate",
2299 		"Conjugate",
2300 		&newM,&N,&PK,
2301 		&taucs_one_const,
2302 		&(L->up_blocks[K][first_row-sn_size_child+N]),&LDA,
2303 		&(L->up_blocks[K][first_row-sn_size_child]),&LDB,
2304 		&taucs_zero_const,
2305 		dense_update_matrix+N,&LDC);
2306     }
2307     /* end of GEMM/HERK+GEMM fix */
2308 
2309     /*for(j=0;j<row_count;j++)
2310        for(ir=0;ir<sn_up_size_father;ir++)
2311 	 taucs_printf("dense[%d,%d] = %lf\n",ir,j,dense_update_matrix[j*LDC+ir]);
2312     */
2313 
2314     for(j=0;j<row_count;j++)
2315       for(ir=j;ir<row_count;ir++){
2316 
2317 #if 0
2318 	L->sn_blocks[J][(bitmap[L->sn_struct[K][first_row+j]]-1)*sn_size_father+(bitmap[L->sn_struct[K][first_row+ir]]-1)] -= dense_update_matrix[j*LDC+ir];
2319 #endif
2320 
2321 	L->sn_blocks[J][(bitmap[L->sn_struct[K][first_row+j]]-1)*sn_size_father+(bitmap[L->sn_struct[K][first_row+ir]]-1)] =
2322 	  taucs_sub( L->sn_blocks[J][(bitmap[L->sn_struct[K][first_row+j]]-1)*sn_size_father+(bitmap[L->sn_struct[K][first_row+ir]]-1)] , dense_update_matrix[j*LDC+ir]);
2323 
2324 	/*	taucs_printf("sn_block: L[%d,%d] = %lf\n",(bitmap[L->sn_struct[K][first_row+ir]]-1),(bitmap[L->sn_struct[K][first_row+j]]-1),L->sn_blocks[J][(bitmap[L->sn_struct[K][first_row+j]]-1)*sn_size_father+(bitmap[L->sn_struct[K][first_row+ir]]-1)]);*/
2325 
2326       }
2327 
2328     for(j=0;j<row_count;j++)
2329       for(ir=row_count;ir<M;ir++){
2330 #if 0
2331 	L->up_blocks[J][(bitmap[L->sn_struct[K][first_row+j]]-1)*(L->up_blocks_ld)[J]+(bitmap[L->sn_struct[K][ir+first_row]]-1)] -= dense_update_matrix[j*LDC+ir];
2332 #endif
2333 
2334 	L->up_blocks[J][(bitmap[L->sn_struct[K][first_row+j]]-1)*(L->up_blocks_ld)[J]+(bitmap[L->sn_struct[K][ir+first_row]]-1)] =
2335 	  taucs_sub( L->up_blocks[J][(bitmap[L->sn_struct[K][first_row+j]]-1)*(L->up_blocks_ld)[J]+(bitmap[L->sn_struct[K][ir+first_row]]-1)] , dense_update_matrix[j*LDC+ir]);
2336 
2337 	/*	taucs_printf("up_block: L[%d,%d] = %lf\n",(bitmap[L->sn_struct[K][ir+first_row]]-1),(bitmap[L->sn_struct[K][first_row+j]]-1),L->up_blocks[J][(bitmap[L->sn_struct[K][first_row+j]]-1)*(L->up_blocks_ld)[J]+(bitmap[L->sn_struct[K][ir+first_row]]-1)]);*/
2338 
2339 	}
2340     /*
2341     for(i=0;i<sn_up_size_father;i++)
2342       bitmap[L->sn_struct[J][i]]=0;
2343     */
2344 
2345     for (child = first_child[K]; child != -1; child = next_child[child]) {
2346       recursive_leftlooking_supernodal_update(J,child,
2347 					      bitmap,dense_update_matrix,
2348 					      A,L);
2349     }
2350   }
2351 
2352   /*
2353   else
2354     for(i=0;i<sn_up_size_father;i++)
2355       bitmap[L->sn_struct[J][i]]=0;
2356   */
2357 
2358 }
2359 
2360 static int
leftlooking_supernodal_front_factor(int sn,int * bitmap,taucs_ccs_matrix * A,supernodal_factor_matrix * L)2361 leftlooking_supernodal_front_factor(int sn,
2362 				    int* bitmap,
2363 				    taucs_ccs_matrix* A,
2364 				    supernodal_factor_matrix* L)
2365 {
2366   int ip,jp;
2367   int*    ind;
2368   taucs_datatype* re;
2369   int INFO;
2370 
2371   int sn_size = (L->sn_size)[sn];
2372   int up_size = (L->sn_up_size)[sn] - (L->sn_size)[sn];
2373 
2374   /* creating transform for real indices */
2375   for(ip=0;ip<(L->sn_up_size)[sn];ip++) bitmap[(L->sn_struct)[sn][ip]] = ip;
2376 
2377   for(jp=0;jp<sn_size;jp++) {
2378     ind = &(A->rowind[A->colptr[ (L->sn_struct)[sn][jp] ]]);
2379     re  = &(A->taucs_values[A->colptr[ (L->sn_struct)[sn][jp] ]]);
2380     for(ip=0;
2381 	ip < A->colptr[ (L->sn_struct)[sn][jp] + 1 ]
2382            - A->colptr[ (L->sn_struct)[sn][jp] ];
2383 	ip++) {
2384       if (bitmap[ind[ip]] < sn_size)
2385 	(L->sn_blocks)[sn][ (L->sn_blocks_ld)[sn]*jp + bitmap[ind[ip]]] =
2386 	  taucs_add( (L->sn_blocks)[sn][ (L->sn_blocks_ld)[sn]*jp + bitmap[ind[ip]]] , re[ip] );
2387       else
2388 	(L->up_blocks)[sn][ (L->up_blocks_ld)[sn]*jp + bitmap[ind[ip]] - sn_size] =
2389 	taucs_add( (L->up_blocks)[sn][ (L->up_blocks_ld)[sn]*jp + bitmap[ind[ip]] - sn_size] , re[ip] );
2390     }
2391   }
2392 
2393   /* we use the BLAS through the Fortran interface */
2394 
2395   /* solving of lower triangular system for L */
2396   if (sn_size)
2397     taucs_potrf ("LOWER",
2398 		 &sn_size,
2399 		 (L->sn_blocks)[sn],&((L->sn_blocks_ld)[sn]),
2400 		 &INFO);
2401 
2402   if (INFO) {
2403     taucs_printf("\t\tLL^T Factorization: Matrix is not positive definite.\n");
2404     taucs_printf("\t\t                    nonpositive pivot in column %d\n",
2405 		 (L->sn_struct)[INFO-1]);
2406     return -1;
2407   }
2408 
2409   /* getting completion for found columns of L */
2410   if (up_size && sn_size)
2411     taucs_trsm ("Right",
2412 		"Lower",
2413 		"Conjugate",
2414 		"No unit diagonal",
2415 		&up_size,&sn_size,
2416 		&taucs_one_const,
2417 		(L->sn_blocks)[sn],&((L->sn_blocks_ld)[sn]),
2418 		(L->up_blocks)[sn],&((L->up_blocks_ld)[sn]));
2419 
2420   return 0;
2421 }
2422 
2423 static int
recursive_leftlooking_supernodal_factor_llt(int sn,int is_root,int * bitmap,int * indmap,taucs_ccs_matrix * A,supernodal_factor_matrix * L)2424 recursive_leftlooking_supernodal_factor_llt(int sn,       /* this supernode */
2425 					    int is_root,  /* is v the root? */
2426 					    int* bitmap,
2427 					    int* indmap,
2428 					    taucs_ccs_matrix* A,
2429 					    supernodal_factor_matrix* L)
2430 {
2431   int  child;
2432   int  sn_size;
2433   int* first_child   = L->first_child;
2434   int* next_child    = L->next_child;
2435   taucs_datatype* dense_update_matrix = NULL;
2436 
2437   if (!is_root)
2438     sn_size = L->sn_size[sn];
2439   else
2440     sn_size = -1;
2441 
2442   if (!is_root) {
2443     (L->sn_blocks   )[sn] = (L->up_blocks   )[sn] = NULL;
2444     if (L->sn_size[sn]) {
2445       (L->sn_blocks   )[sn] = (taucs_datatype*)taucs_calloc(((L->sn_size)[sn])*((L->sn_size)[sn]),
2446 							    sizeof(taucs_datatype));
2447       if (!((L->sn_blocks)[sn])) return -1; /* the caller will free L */
2448     }
2449     (L->sn_blocks_ld)[sn] = (L->sn_size   )[sn];
2450 
2451     if (((L->sn_up_size)[sn] - (L->sn_size)[sn]) && (L->sn_size)[sn]) {
2452       (L->up_blocks   )[sn] = (taucs_datatype*)taucs_calloc(((L->sn_up_size)[sn]-(L->sn_size)[sn])
2453 							    *((L->sn_size)[sn]),sizeof(taucs_datatype));
2454       if (!((L->up_blocks)[sn])) return -1; /* the caller will free L */
2455     }
2456     (L->up_blocks_ld)[sn] = (L->sn_up_size)[sn]-(L->sn_size)[sn];
2457   }
2458 
2459   for (child = first_child[sn]; child != -1; child = next_child[child]) {
2460     if (recursive_leftlooking_supernodal_factor_llt(child,
2461 						    FALSE,
2462 						    bitmap,
2463 						    indmap,
2464 						    A,L)
2465 	== -1 ) {
2466       taucs_free(dense_update_matrix);
2467       return -1;
2468     }
2469 
2470     if (!is_root) {
2471       if (!dense_update_matrix) {
2472 	dense_update_matrix =
2473 	  (taucs_datatype*) taucs_calloc((L->sn_up_size)[sn]*(L->sn_size)[sn],sizeof(taucs_datatype));
2474 	if (!dense_update_matrix) return -1; /* caller will free L */
2475       }
2476 
2477       /* prepare the bitmap. Moved out of the recusive
2478 	 update procedure 20/1/2003. Sivan and Elad */
2479 
2480       {
2481 	int i;
2482 	int J = sn;
2483 	int sn_size_father = (L->sn_size)[J];
2484 	int sn_up_size_father = (L->sn_up_size)[J];
2485 
2486 	for(i=0;i<sn_size_father;i++)
2487 	  bitmap[L->sn_struct[J][i]]=i+1;
2488 	for(i=sn_size_father;i<sn_up_size_father;i++)
2489 	  bitmap[L->sn_struct[J][i]] = i - sn_size_father + 1;
2490       }
2491 
2492       recursive_leftlooking_supernodal_update(sn,child,
2493 					      bitmap,dense_update_matrix,
2494 					      A,L);
2495 
2496       {
2497 	int i;
2498 	int J = sn;
2499 	int sn_size_father = (L->sn_size)[J];
2500 	int sn_up_size_father = (L->sn_up_size)[J];
2501 
2502 	for(i=0;i<sn_size_father;i++)
2503 	  bitmap[L->sn_struct[J][i]]=0;
2504 	for(i=0;i<sn_up_size_father;i++)
2505 	  bitmap[L->sn_struct[J][i]]=0;
2506       }
2507 
2508     }
2509   }
2510   taucs_free(dense_update_matrix);
2511 
2512   if(!is_root) {
2513     if (leftlooking_supernodal_front_factor(sn,
2514 					    indmap,
2515 					    A,
2516 					    L)) {
2517       return -1; /* nonpositive pivot */
2518     }
2519   }
2520 
2521   return 0;
2522 }
2523 
2524 
2525 void*
taucs_dtl(ccs_factor_llt_ll)2526 taucs_dtl(ccs_factor_llt_ll)(taucs_ccs_matrix* A)
2527 {
2528   return taucs_dtl(ccs_factor_llt_ll_maxdepth)(A,0);
2529 }
2530 
2531 void*
taucs_dtl(ccs_factor_llt_ll_maxdepth)2532 taucs_dtl(ccs_factor_llt_ll_maxdepth)(taucs_ccs_matrix* A,int max_depth)
2533 {
2534   supernodal_factor_matrix* L;
2535   int* map;
2536   int *map2;
2537   double wtime, ctime;
2538   int fail;
2539 
2540   wtime = taucs_wtime();
2541   ctime = taucs_ctime();
2542 
2543   L = multifrontal_supernodal_create();
2544   if (!L) return NULL;
2545 
2546   fail = taucs_ccs_symbolic_elimination(A,L,
2547 					TRUE /* sort row indices */,
2548 					max_depth);
2549 
2550   wtime = taucs_wtime()-wtime;
2551   ctime = taucs_ctime()-ctime;
2552   taucs_printf("\t\tSymbolic Analysis            = % 10.3f seconds (%.3f cpu)\n",
2553 	       wtime,ctime);
2554 
2555   map  = (int*)taucs_malloc((A->n+1)*sizeof(int));
2556   map2 = (int*)taucs_calloc((A->n+1),sizeof(int));
2557 
2558   if (fail == -1 || !map || !map2) {
2559     taucs_supernodal_factor_free(L);
2560     taucs_free(map2);
2561     taucs_free(map);
2562     return NULL;
2563   }
2564 
2565   wtime = taucs_wtime();
2566   ctime = taucs_ctime();
2567 
2568   if (recursive_leftlooking_supernodal_factor_llt((L->n_sn),
2569 						  TRUE,
2570 						  map2,
2571 						  map,
2572 						  A,L)
2573       == -1) {
2574     taucs_supernodal_factor_free(L);
2575     taucs_free(map);
2576     taucs_free(map2);
2577     return NULL;
2578   }
2579 
2580   wtime = taucs_wtime()-wtime;
2581   ctime = taucs_ctime()-ctime;
2582   taucs_printf("\t\tSupernodal Left-Looking LL^T = % 10.3f seconds (%.3f cpu)\n",
2583 	       wtime,ctime);
2584 
2585   taucs_free(map);
2586   taucs_free(map2);
2587 
2588   return (void*) L;
2589 }
2590 
2591 
2592 /*************************************************************/
2593 /* supernodal solve routines                                 */
2594 /*************************************************************/
2595 
2596 static void
recursive_supernodal_solve_l(int sn,int is_root,int * first_child,int * next_child,int ** sn_struct,int * sn_sizes,int * sn_up_sizes,int * sn_blocks_ld,taucs_datatype * sn_blocks[],int * up_blocks_ld,taucs_datatype * up_blocks[],taucs_datatype x[],taucs_datatype b[],taucs_datatype t[])2597 recursive_supernodal_solve_l(int sn,       /* this supernode */
2598 			     int is_root,  /* is v the root? */
2599 			     int* first_child, int* next_child,
2600 			     int** sn_struct, int* sn_sizes, int* sn_up_sizes,
2601 			     int* sn_blocks_ld,taucs_datatype* sn_blocks[],
2602 			     int* up_blocks_ld,taucs_datatype* up_blocks[],
2603 			     taucs_datatype x[], taucs_datatype b[],
2604 			     taucs_datatype t[])
2605 {
2606   int child;
2607   int  sn_size; /* number of rows/columns in the supernode    */
2608   int  up_size; /* number of rows that this supernode updates */
2609   int    ione = 1;
2610 
2611   taucs_datatype* xdense;
2612   taucs_datatype* bdense;
2613   double  flops;
2614   int i;/*ip,j,jp omer*/
2615 
2616   for (child = first_child[sn]; child != -1; child = next_child[child]) {
2617     recursive_supernodal_solve_l(child,
2618 				 FALSE,
2619 				 first_child,next_child,
2620  				 sn_struct,sn_sizes,sn_up_sizes,
2621 				 sn_blocks_ld,sn_blocks,
2622 				 up_blocks_ld,up_blocks,
2623 				 x,b,t);
2624   }
2625 
2626   if(!is_root) {
2627 
2628     sn_size = sn_sizes[sn];
2629     up_size = sn_up_sizes[sn] - sn_sizes[sn];
2630 
2631     flops = ((double)sn_size)*((double)sn_size)
2632       + 2.0*((double)sn_size)*((double)up_size);
2633 
2634     if (flops > BLAS_FLOPS_CUTOFF) {
2635       xdense = t;
2636       bdense = t + sn_size;
2637 
2638       for (i=0; i<sn_size; i++)
2639 	xdense[i] = b[ sn_struct[ sn ][ i ] ];
2640       for (i=0; i<up_size; i++)
2641 	bdense[i] = taucs_zero;
2642 
2643       taucs_trsm ("Left",
2644 		  "Lower",
2645 		  "No Conjugate",
2646 		  "No unit diagonal",
2647 		  &sn_size,&ione,
2648 		  &taucs_one_const,
2649 		  sn_blocks[sn],&(sn_blocks_ld[sn]),
2650 		  xdense       ,&sn_size);
2651 
2652       if (up_size > 0 && sn_size > 0) {
2653 	taucs_gemm ("No Conjugate","No Conjugate",
2654 		    &up_size, &ione, &sn_size,
2655 		    &taucs_one_const,
2656 		    up_blocks[sn],&(up_blocks_ld[sn]),
2657 		    xdense       ,&sn_size,
2658 		    &taucs_zero_const,
2659 		    bdense       ,&up_size);
2660       }
2661 
2662       for (i=0; i<sn_size; i++)
2663 	x[ sn_struct[ sn][ i ] ]  = xdense[i];
2664       for (i=0; i<up_size; i++)
2665 	/*b[ sn_struct[ sn ][ sn_size + i ] ] -= bdense[i];*/
2666 	b[ sn_struct[ sn ][ sn_size + i ] ] =
2667 	  taucs_sub( b[ sn_struct[ sn ][ sn_size + i ] ] , bdense[i] );
2668 
2669 #if 1
2670     }
2671 #else
2672     } else if (sn_size > SOLVE_DENSE_CUTOFF) {
2673 
2674       xdense = t;
2675       bdense = t + sn_size;
2676 
2677       for (i=0; i<sn_size; i++)
2678 	xdense[i] = b[ sn_struct[ sn ][ i ] ];
2679       for (i=0; i<up_size; i++)
2680 	bdense[i] = 0.0;
2681 
2682       for (jp=0; jp<sn_size; jp++) {
2683 	xdense[jp] = xdense[jp] / sn_blocks[sn][ sn_blocks_ld[sn]*jp + jp];
2684 
2685 	for (ip=jp+1; ip<sn_size; ip++) {
2686 	  xdense[ip] -= xdense[jp] * sn_blocks[sn][ sn_blocks_ld[sn]*jp + ip];
2687 	}
2688       }
2689 
2690       for (jp=0; jp<sn_size; jp++) {
2691 	for (ip=0; ip<up_size; ip++) {
2692 	  bdense[ip] += xdense[jp] * up_blocks[sn][ up_blocks_ld[sn]*jp + ip];
2693 	}
2694       }
2695 
2696       for (i=0; i<sn_size; i++)
2697 	x[ sn_struct[ sn][ i ] ]  = xdense[i];
2698       for (i=0; i<up_size; i++)
2699 	b[ sn_struct[ sn ][ sn_size + i ] ] -= bdense[i];
2700 
2701     } else {
2702 
2703       for (jp=0; jp<sn_size; jp++) {
2704 	j = sn_struct[sn][jp];
2705 	x[j] = b[j] / sn_blocks[sn][ sn_blocks_ld[sn]*jp + jp];
2706 	for (ip=jp+1; ip<sn_size; ip++) {
2707 	  i = sn_struct[sn][ip];
2708 	  b[i] -= x[j] * sn_blocks[sn][ sn_blocks_ld[sn]*jp + ip];
2709 	}
2710 
2711 	for (ip=0; ip<up_size; ip++) {
2712 	  i = sn_struct[sn][sn_size + ip];
2713 	  b[i] -= x[j] * up_blocks[sn][ up_blocks_ld[sn]*jp + ip];
2714 	}
2715       }
2716 
2717     }
2718 #endif
2719   }
2720 }
2721 
2722 static void
recursive_supernodal_solve_lt(int sn,int is_root,int * first_child,int * next_child,int ** sn_struct,int * sn_sizes,int * sn_up_sizes,int * sn_blocks_ld,taucs_datatype * sn_blocks[],int * up_blocks_ld,taucs_datatype * up_blocks[],taucs_datatype x[],taucs_datatype b[],taucs_datatype t[])2723 recursive_supernodal_solve_lt(int sn,       /* this supernode */
2724  			      int is_root,  /* is v the root? */
2725 			      int* first_child, int* next_child,
2726 			      int** sn_struct, int* sn_sizes, int* sn_up_sizes,
2727 			      int* sn_blocks_ld,taucs_datatype* sn_blocks[],
2728 			      int* up_blocks_ld,taucs_datatype* up_blocks[],
2729 			      taucs_datatype x[], taucs_datatype b[],
2730 			      taucs_datatype t[])
2731 {
2732   int child;
2733   int  sn_size; /* number of rows/columns in the supernode    */
2734   int  up_size; /* number of rows that this supernode updates */
2735   int    ione = 1;
2736 
2737   taucs_datatype* xdense;
2738   taucs_datatype* bdense;
2739   double  flops;
2740   int i;/*ip,j,jp omer*/
2741 
2742   if(!is_root) {
2743 
2744     sn_size = sn_sizes[sn];
2745     up_size = sn_up_sizes[sn]-sn_sizes[sn];
2746 
2747     flops = ((double)sn_size)*((double)sn_size)
2748       + 2.0*((double)sn_size)*((double)up_size);
2749 
2750     if (flops > BLAS_FLOPS_CUTOFF) {
2751 
2752       bdense = t;
2753       xdense = t + sn_size;
2754 
2755       for (i=0; i<sn_size; i++)
2756 	bdense[i] = b[ sn_struct[ sn][ i ] ];
2757       for (i=0; i<up_size; i++)
2758 	xdense[i] = x[ sn_struct[sn][sn_size+i] ];
2759 
2760       if (up_size > 0 && sn_size > 0)
2761 	taucs_gemm ("Conjugate","No Conjugate",
2762 		     &sn_size, &ione, &up_size,
2763 		     &taucs_minusone_const,
2764 		     up_blocks[sn],&(up_blocks_ld[sn]),
2765 		     xdense       ,&up_size,
2766 		     &taucs_one_const,
2767 		     bdense       ,&sn_size);
2768 
2769       taucs_trsm ("Left",
2770 		  "Lower",
2771 		  "Conjugate",
2772 		  "No unit diagonal",
2773 		  &sn_size,&ione,
2774 		  &taucs_one_const,
2775 		  sn_blocks[sn],&(sn_blocks_ld[sn]),
2776 		  bdense       ,&sn_size);
2777 
2778       for (i=0; i<sn_size; i++)
2779 	x[ sn_struct[ sn][ i ] ]  = bdense[i];
2780 #if 1
2781     }
2782 #else
2783     } else if (sn_size > SOLVE_DENSE_CUTOFF) {
2784 
2785       bdense = t;
2786       xdense = t + sn_size;
2787 
2788       for (i=0; i<sn_size; i++)
2789 	bdense[i] = b[ sn_struct[ sn][ i ] ];
2790       for (i=0; i<up_size; i++)
2791 	xdense[i] = x[ sn_struct[sn][sn_size+i] ];
2792 
2793       for (ip=sn_size-1; ip>=0; ip--) {
2794 	for (jp=0; jp<up_size; jp++) {
2795 	  bdense[ip] -= xdense[jp] * up_blocks[sn][ up_blocks_ld[sn]*ip + jp];
2796 	}
2797       }
2798 
2799       for (ip=sn_size-1; ip>=0; ip--) {
2800 	for (jp=sn_size-1; jp>ip; jp--) {
2801 	  bdense[ip] -= bdense[jp] * sn_blocks[sn][ sn_blocks_ld[sn]*ip + jp];
2802 	}
2803 	bdense[ip] = bdense[ip] / sn_blocks[sn][ sn_blocks_ld[sn]*ip + ip];
2804       }
2805 
2806       for (i=0; i<sn_size; i++)
2807 	x[ sn_struct[ sn][ i ] ]  = bdense[i];
2808 
2809     } else {
2810 
2811       for (ip=sn_size-1; ip>=0; ip--) {
2812 	i = sn_struct[sn][ip];
2813 
2814 	for (jp=0; jp<up_size; jp++) {
2815 	  j = sn_struct[sn][sn_size + jp];
2816 	  b[i] -= x[j] * up_blocks[sn][ up_blocks_ld[sn]*ip + jp];
2817 	}
2818 
2819 	for (jp=sn_size-1; jp>ip; jp--) {
2820 	  j = sn_struct[sn][jp];
2821 	  b[i] -= x[j] * sn_blocks[sn][ sn_blocks_ld[sn]*ip + jp];
2822 	}
2823 	x[i] = b[i] / sn_blocks[sn][ sn_blocks_ld[sn]*ip + ip];
2824 
2825       }
2826 
2827     }
2828 #endif
2829 
2830   }
2831 
2832   for (child = first_child[sn]; child != -1; child = next_child[child]) {
2833     recursive_supernodal_solve_lt(child,
2834 				  FALSE,
2835 				  first_child,next_child,
2836 				  sn_struct,sn_sizes,sn_up_sizes,
2837 				  sn_blocks_ld,sn_blocks,
2838 				  up_blocks_ld,up_blocks,
2839 				  x,b,t);
2840   }
2841 }
2842 
2843 
2844 int
taucs_dtl(supernodal_solve_llt)2845 taucs_dtl(supernodal_solve_llt)(void* vL, void* vx, void* vb)
2846 {
2847   supernodal_factor_matrix* L = (supernodal_factor_matrix*) vL;
2848   taucs_datatype* x = (taucs_datatype*) vx;
2849   taucs_datatype* b = (taucs_datatype*) vb;
2850   taucs_datatype* y;
2851   taucs_datatype* t; /* temporary vector */
2852   int     i;
2853 
2854   y = taucs_malloc((L->n) * sizeof(taucs_datatype));
2855   t = taucs_malloc((L->n) * sizeof(taucs_datatype));
2856   if (!y || !t) {
2857     taucs_free(y);
2858     taucs_free(t);
2859     taucs_printf("multifrontal_supernodal_solve_llt: out of memory\n");
2860     return -1;
2861   }
2862 
2863   for (i=0; i<L->n; i++) x[i] = b[i];
2864 
2865   recursive_supernodal_solve_l (L->n_sn,
2866 				TRUE,  /* this is the root */
2867 				L->first_child, L->next_child,
2868 				L->sn_struct,L->sn_size,L->sn_up_size,
2869 				L->sn_blocks_ld, L->sn_blocks,
2870 				L->up_blocks_ld, L->up_blocks,
2871 				y, x, t);
2872 
2873   recursive_supernodal_solve_lt(L->n_sn,
2874 				TRUE,  /* this is the root */
2875 				L->first_child, L->next_child,
2876 				L->sn_struct,L->sn_size,L->sn_up_size,
2877 				L->sn_blocks_ld, L->sn_blocks,
2878 				L->up_blocks_ld, L->up_blocks,
2879 				x, y, t);
2880 
2881   taucs_free(y);
2882   taucs_free(t);
2883 
2884   return 0;
2885 }
2886 #endif /*#ifndef TAUCS_CORE_GENERAL*/
2887 
2888 /*************************************************************/
2889 /* generic interfaces to user-callable routines              */
2890 /*************************************************************/
2891 
2892 #ifdef TAUCS_CORE_GENERAL
2893 
2894 cilk
2895 void*
taucs_ccs_factor_llt_mf_maxdepth(taucs_ccs_matrix * A,int max_depth)2896 taucs_ccs_factor_llt_mf_maxdepth(taucs_ccs_matrix* A,int max_depth)
2897 {
2898   void* p= NULL;
2899 
2900 #ifdef TAUCS_DOUBLE_IN_BUILD
2901   if (A->flags & TAUCS_DOUBLE)
2902     p = spawn taucs_dccs_factor_llt_mf_maxdepth(A,max_depth);
2903 #endif
2904 
2905 #ifdef TAUCS_SINGLE_IN_BUILD
2906   if (A->flags & TAUCS_SINGLE)
2907     p = spawn taucs_sccs_factor_llt_mf_maxdepth(A,max_depth);
2908 #endif
2909 
2910 #ifdef TAUCS_DCOMPLEX_IN_BUILD
2911   if (A->flags & TAUCS_DCOMPLEX)
2912     p = spawn taucs_zccs_factor_llt_mf_maxdepth(A,max_depth);
2913 #endif
2914 
2915 #ifdef TAUCS_SCOMPLEX_IN_BUILD
2916   if (A->flags & TAUCS_SCOMPLEX)
2917     p = spawn taucs_cccs_factor_llt_mf_maxdepth(A,max_depth);
2918 #endif
2919 
2920   sync;
2921   return p;
2922 }
2923 
2924 void*
taucs_ccs_factor_llt_ll_maxdepth(taucs_ccs_matrix * A,int max_depth)2925 taucs_ccs_factor_llt_ll_maxdepth(taucs_ccs_matrix* A,int max_depth)
2926 {
2927 
2928 #ifdef TAUCS_DOUBLE_IN_BUILD
2929   if (A->flags & TAUCS_DOUBLE)
2930     return taucs_dccs_factor_llt_ll_maxdepth(A,max_depth);
2931 #endif
2932 
2933 #ifdef TAUCS_SINGLE_IN_BUILD
2934   if (A->flags & TAUCS_SINGLE)
2935     return taucs_sccs_factor_llt_ll_maxdepth(A,max_depth);
2936 #endif
2937 
2938 #ifdef TAUCS_DCOMPLEX_IN_BUILD
2939   if (A->flags & TAUCS_DCOMPLEX)
2940     return taucs_zccs_factor_llt_ll_maxdepth(A,max_depth);
2941 #endif
2942 
2943 #ifdef TAUCS_SCOMPLEX_IN_BUILD
2944   if (A->flags & TAUCS_SCOMPLEX)
2945     return taucs_cccs_factor_llt_ll_maxdepth(A,max_depth);
2946 #endif
2947 
2948 	assert(0);
2949   return NULL;
2950 }
2951 
taucs_ccs_factor_llt_symbolic_maxdepth(taucs_ccs_matrix * A,int max_depth)2952 void* taucs_ccs_factor_llt_symbolic_maxdepth(taucs_ccs_matrix* A,int max_depth)
2953 {
2954 
2955 #ifdef TAUCS_DOUBLE_IN_BUILD
2956   if (A->flags & TAUCS_DOUBLE)
2957     return taucs_dccs_factor_llt_symbolic_maxdepth(A,max_depth);
2958 #endif
2959 
2960 #ifdef TAUCS_SINGLE_IN_BUILD
2961   if (A->flags & TAUCS_SINGLE)
2962     return taucs_sccs_factor_llt_symbolic_maxdepth(A,max_depth);
2963 #endif
2964 
2965 #ifdef TAUCS_DCOMPLEX_IN_BUILD
2966   if (A->flags & TAUCS_DCOMPLEX)
2967     return taucs_zccs_factor_llt_symbolic_maxdepth(A,max_depth);
2968 #endif
2969 
2970 #ifdef TAUCS_SCOMPLEX_IN_BUILD
2971   if (A->flags & TAUCS_SCOMPLEX)
2972     return taucs_cccs_factor_llt_symbolic_maxdepth(A,max_depth);
2973 #endif
2974 
2975   assert(0);
2976   return NULL;
2977 }
2978 
2979 cilk
2980 void*
taucs_ccs_factor_llt_mf(taucs_ccs_matrix * A)2981 taucs_ccs_factor_llt_mf(taucs_ccs_matrix* A)
2982 {
2983   void* p = NULL;
2984 #ifdef TAUCS_DOUBLE_IN_BUILD
2985   if (A->flags & TAUCS_DOUBLE)
2986     p = spawn taucs_dccs_factor_llt_mf(A);
2987 #endif
2988 
2989 #ifdef TAUCS_SINGLE_IN_BUILD
2990   if (A->flags & TAUCS_SINGLE)
2991     p = spawn taucs_sccs_factor_llt_mf(A);
2992 #endif
2993 
2994 #ifdef TAUCS_DCOMPLEX_IN_BUILD
2995   if (A->flags & TAUCS_DCOMPLEX)
2996     p = spawn taucs_zccs_factor_llt_mf(A);
2997 #endif
2998 
2999 #ifdef TAUCS_SCOMPLEX_IN_BUILD
3000   if (A->flags & TAUCS_SCOMPLEX)
3001     p = spawn taucs_cccs_factor_llt_mf(A);
3002 #endif
3003 
3004   sync;
3005   return p;
3006 }
3007 
taucs_ccs_factor_llt_ll(taucs_ccs_matrix * A)3008 void* taucs_ccs_factor_llt_ll(taucs_ccs_matrix* A)
3009 {
3010 
3011 #ifdef TAUCS_DOUBLE_IN_BUILD
3012   if (A->flags & TAUCS_DOUBLE)
3013     return taucs_dccs_factor_llt_ll(A);
3014 #endif
3015 
3016 #ifdef TAUCS_SINGLE_IN_BUILD
3017   if (A->flags & TAUCS_SINGLE)
3018     return taucs_sccs_factor_llt_ll(A);
3019 #endif
3020 
3021 #ifdef TAUCS_DCOMPLEX_IN_BUILD
3022   if (A->flags & TAUCS_DCOMPLEX)
3023     return taucs_zccs_factor_llt_ll(A);
3024 #endif
3025 
3026 #ifdef TAUCS_SCOMPLEX_IN_BUILD
3027   if (A->flags & TAUCS_SCOMPLEX)
3028     return taucs_cccs_factor_llt_ll(A);
3029 #endif
3030 
3031   assert(0);
3032   return NULL;
3033 }
3034 
taucs_ccs_factor_llt_symbolic(taucs_ccs_matrix * A)3035 void* taucs_ccs_factor_llt_symbolic(taucs_ccs_matrix* A)
3036 {
3037 
3038 #ifdef TAUCS_DOUBLE_IN_BUILD
3039   if (A->flags & TAUCS_DOUBLE)
3040     return taucs_dccs_factor_llt_symbolic(A);
3041 #endif
3042 
3043 #ifdef TAUCS_SINGLE_IN_BUILD
3044   if (A->flags & TAUCS_SINGLE)
3045     return taucs_sccs_factor_llt_symbolic(A);
3046 #endif
3047 
3048 #ifdef TAUCS_DCOMPLEX_IN_BUILD
3049   if (A->flags & TAUCS_DCOMPLEX)
3050     return taucs_zccs_factor_llt_symbolic(A);
3051 #endif
3052 
3053 #ifdef TAUCS_SCOMPLEX_IN_BUILD
3054   if (A->flags & TAUCS_SCOMPLEX)
3055     return taucs_cccs_factor_llt_symbolic(A);
3056 #endif
3057 
3058   assert(0);
3059   return NULL;
3060 }
3061 
3062 cilk
taucs_ccs_factor_llt_numeric(taucs_ccs_matrix * A,void * L)3063 int taucs_ccs_factor_llt_numeric(taucs_ccs_matrix* A, void* L)
3064 {
3065   int rc = TAUCS_ERROR;
3066 #ifdef TAUCS_DOUBLE_IN_BUILD
3067   if (A->flags & TAUCS_DOUBLE)
3068     rc = spawn taucs_dccs_factor_llt_numeric(A,L);
3069 #endif
3070 
3071 #ifdef TAUCS_SINGLE_IN_BUILD
3072   if (A->flags & TAUCS_SINGLE)
3073     rc = spawn taucs_sccs_factor_llt_numeric(A,L);
3074 #endif
3075 
3076 #ifdef TAUCS_DCOMPLEX_IN_BUILD
3077   if (A->flags & TAUCS_DCOMPLEX)
3078     rc = spawn taucs_zccs_factor_llt_numeric(A,L);
3079 #endif
3080 
3081 #ifdef TAUCS_SCOMPLEX_IN_BUILD
3082   if (A->flags & TAUCS_SCOMPLEX)
3083     rc = spawn taucs_cccs_factor_llt_numeric(A,L);
3084 #endif
3085 
3086   sync;
3087   return rc;
3088 }
3089 
3090 
taucs_supernodal_solve_llt(void * L,void * x,void * b)3091 int taucs_supernodal_solve_llt(void* L, void* x, void* b)
3092 {
3093 
3094 #ifdef TAUCS_DOUBLE_IN_BUILD
3095   if (((supernodal_factor_matrix*) L)->flags & TAUCS_DOUBLE)
3096     return taucs_dsupernodal_solve_llt(L,x,b);
3097 #endif
3098 
3099 #ifdef TAUCS_SINGLE_IN_BUILD
3100   if (((supernodal_factor_matrix*) L)->flags & TAUCS_SINGLE)
3101     return taucs_ssupernodal_solve_llt(L,x,b);
3102 #endif
3103 
3104 #ifdef TAUCS_DCOMPLEX_IN_BUILD
3105   if (((supernodal_factor_matrix*) L)->flags & TAUCS_DCOMPLEX)
3106     return taucs_zsupernodal_solve_llt(L,x,b);
3107 #endif
3108 
3109 #ifdef TAUCS_SCOMPLEX_IN_BUILD
3110   if (((supernodal_factor_matrix*) L)->flags & TAUCS_SCOMPLEX)
3111     return taucs_csupernodal_solve_llt(L,x,b);
3112 #endif
3113 
3114   assert(0);
3115   return -1;
3116 }
3117 
taucs_supernodal_factor_free(void * L)3118 void taucs_supernodal_factor_free(void* L)
3119 {
3120 
3121 #ifdef TAUCS_DOUBLE_IN_BUILD
3122   if (((supernodal_factor_matrix*) L)->flags & TAUCS_DOUBLE) {
3123     taucs_dsupernodal_factor_free(L);
3124     return;
3125   }
3126 #endif
3127 
3128 #ifdef TAUCS_SINGLE_IN_BUILD
3129   if (((supernodal_factor_matrix*) L)->flags & TAUCS_SINGLE) {
3130     taucs_ssupernodal_factor_free(L);
3131     return;
3132   }
3133 #endif
3134 
3135 #ifdef TAUCS_DCOMPLEX_IN_BUILD
3136   if (((supernodal_factor_matrix*) L)->flags & TAUCS_DCOMPLEX) {
3137     taucs_zsupernodal_factor_free(L);
3138     return;
3139   }
3140 #endif
3141 
3142 #ifdef TAUCS_SCOMPLEX_IN_BUILD
3143   if (((supernodal_factor_matrix*) L)->flags & TAUCS_SCOMPLEX) {
3144     taucs_csupernodal_factor_free(L);
3145     return;
3146   }
3147 #endif
3148 
3149   assert(0);
3150 }
3151 
taucs_supernodal_factor_free_numeric(void * L)3152 void taucs_supernodal_factor_free_numeric(void* L)
3153 {
3154 
3155 
3156 #ifdef TAUCS_DOUBLE_IN_BUILD
3157   if (((supernodal_factor_matrix*) L)->flags & TAUCS_DOUBLE) {
3158     taucs_dsupernodal_factor_free_numeric(L);
3159     return;
3160   }
3161 #endif
3162 
3163 #ifdef TAUCS_SINGLE_IN_BUILD
3164   if (((supernodal_factor_matrix*) L)->flags & TAUCS_SINGLE) {
3165     taucs_ssupernodal_factor_free_numeric(L);
3166     return;
3167   }
3168 #endif
3169 
3170 #ifdef TAUCS_DCOMPLEX_IN_BUILD
3171   if (((supernodal_factor_matrix*) L)->flags & TAUCS_DCOMPLEX) {
3172     taucs_zsupernodal_factor_free_numeric(L);
3173     return;
3174   }
3175 #endif
3176 
3177 #ifdef TAUCS_SCOMPLEX_IN_BUILD
3178   if (((supernodal_factor_matrix*) L)->flags & TAUCS_SCOMPLEX) {
3179     taucs_csupernodal_factor_free_numeric(L);
3180     return;
3181   }
3182 #endif
3183 
3184   assert(0);
3185 }
3186 
3187 taucs_ccs_matrix*
taucs_supernodal_factor_to_ccs(void * L)3188 taucs_supernodal_factor_to_ccs(void* L)
3189 {
3190 
3191 #ifdef TAUCS_DOUBLE_IN_BUILD
3192   if (((supernodal_factor_matrix*) L)->flags & TAUCS_DOUBLE)
3193     return taucs_dsupernodal_factor_to_ccs(L);
3194 #endif
3195 
3196 #ifdef TAUCS_SINGLE_IN_BUILD
3197   if (((supernodal_factor_matrix*) L)->flags & TAUCS_SINGLE)
3198     return taucs_ssupernodal_factor_to_ccs(L);
3199 #endif
3200 
3201 #ifdef TAUCS_DCOMPLEX_IN_BUILD
3202   if (((supernodal_factor_matrix*) L)->flags & TAUCS_DCOMPLEX)
3203     return taucs_zsupernodal_factor_to_ccs(L);
3204 #endif
3205 
3206 #ifdef TAUCS_SCOMPLEX_IN_BUILD
3207   if (((supernodal_factor_matrix*) L)->flags & TAUCS_SCOMPLEX)
3208     return taucs_csupernodal_factor_to_ccs(L);
3209 #endif
3210 
3211   assert(0);
3212   return NULL;
3213 }
3214 
3215 void*
taucs_supernodal_factor_get_diag(void * L)3216 taucs_supernodal_factor_get_diag(void* L)
3217 {
3218 
3219 #ifdef TAUCS_DOUBLE_IN_BUILD
3220   if (((supernodal_factor_matrix*) L)->flags & TAUCS_DOUBLE)
3221     return taucs_dsupernodal_factor_get_diag(L);
3222 #endif
3223 
3224 #ifdef TAUCS_SINGLE_IN_BUILD
3225   if (((supernodal_factor_matrix*) L)->flags & TAUCS_SINGLE)
3226     return taucs_ssupernodal_factor_get_diag(L);
3227 #endif
3228 
3229 #ifdef TAUCS_DCOMPLEX_IN_BUILD
3230   if (((supernodal_factor_matrix*) L)->flags & TAUCS_DCOMPLEX)
3231     return taucs_zsupernodal_factor_get_diag(L);
3232 #endif
3233 
3234 #ifdef TAUCS_SCOMPLEX_IN_BUILD
3235   if (((supernodal_factor_matrix*) L)->flags & TAUCS_SCOMPLEX)
3236     return taucs_csupernodal_factor_get_diag(L);
3237 #endif
3238 
3239   assert(0);
3240   return NULL;
3241 }
3242 
3243 int
taucs_ccs_etree(taucs_ccs_matrix * A,int * parent,int * l_colcount,int * l_rowcount,int * l_nnz)3244 taucs_ccs_etree(taucs_ccs_matrix* A,
3245 		int* parent,
3246 		int* l_colcount,
3247 		int* l_rowcount,
3248 		int* l_nnz)
3249 {
3250   int* prev_p;
3251   /*int* prev_nbr;omer*/
3252   int* level;
3253   /*int* first_descendant;omer*/
3254   int* l_cc;
3255   int* l_rc;
3256   int* wt;
3257 
3258   int  n = A->n;
3259   int  pprime;/*p,q,u omer*/
3260   int  ju;
3261   int* postorder;
3262   int* ipostorder;
3263   int  *first_child,*next_child;
3264 
3265   int i,j,k,ip,jp,kp;
3266   int nnz,jnnz;
3267   int* uf;
3268   int* rowptr;
3269   int* colind;
3270   int* rowcount;
3271   int* realroot;
3272 
3273   /* we need the row structures for the lower triangle */
3274 
3275   nnz = (A->colptr)[n];
3276 
3277   uf       = taucs_malloc(n     * sizeof(int));
3278   rowcount = taucs_malloc((n+1) * sizeof(int));
3279   rowptr   = taucs_malloc((n+1) * sizeof(int));
3280   colind   = taucs_malloc(nnz   * sizeof(int));
3281 
3282   if (!uf || !rowcount || !rowptr || !colind) {
3283     taucs_free(uf);
3284     taucs_free(rowcount);
3285     taucs_free(rowptr);
3286     taucs_free(colind);
3287     return -1;
3288   }
3289 
3290   for (i=0; i <=n; i++) rowcount[i] = 0;
3291   for (j=0; j < n; j++) {
3292     jnnz = (A->colptr)[j+1] - (A->colptr)[j];
3293     for (ip=0; ip<jnnz; ip++) {
3294       i = (A->rowind)[(A->colptr)[j] + ip];
3295       if (j < i) rowcount[i]++;
3296     }
3297   }
3298 
3299   ip = 0;
3300   for (i=0; i <= n; i++) {
3301     int next_ip = ip + rowcount[i];
3302     rowcount[i] = ip;
3303     rowptr  [i] = ip;
3304     ip = next_ip;
3305   }
3306 
3307   for (j=0; j < n; j++) {
3308     jnnz = (A->colptr)[j+1] - (A->colptr)[j];
3309     for (ip=0; ip<jnnz; ip++) {
3310       i = (A->rowind)[(A->colptr)[j] + ip];
3311       if (i==j) continue;
3312       assert( rowcount[i] < rowptr[i+1] );
3313       colind[ rowcount[i] ] = j;
3314       rowcount[i]++;
3315     }
3316   }
3317 
3318   /* now compute the etree */
3319 
3320   {
3321     int u,t,vroot;
3322     realroot = rowcount; /* reuse space */
3323 
3324     for (i=0; i<n; i++) {
3325       uf_makeset(uf,i);
3326       realroot[i] = i;
3327       parent[i] = n;
3328       vroot = i;
3329       for (kp=rowptr[i]; kp<rowptr[i+1]; kp++) {
3330 	k=colind[kp];
3331 	u = uf_find(uf,k);
3332 	t = realroot[u];
3333 	if (parent[t] == n && t != i) {
3334 	  parent[t] = i;
3335 	  vroot = uf_union(uf,vroot,u);
3336 	  realroot[vroot] = i;
3337 	}
3338       }
3339     }
3340   }
3341 
3342   taucs_free(colind);
3343   taucs_free(rowptr);
3344   taucs_free(rowcount);
3345 
3346   /* now only uf remains allocated */
3347 
3348   /* compute column counts */
3349 
3350   if (l_colcount || l_rowcount || l_nnz) {
3351     int* l_nz;
3352     int  tmp;
3353     int  u,p,q;
3354 
3355     first_child = taucs_malloc((n+1) * sizeof(int));
3356     next_child  = taucs_malloc((n+1) * sizeof(int));
3357     postorder   = taucs_malloc(n     * sizeof(int));
3358     ipostorder  = taucs_malloc(n     * sizeof(int));
3359     wt          = taucs_malloc(n     * sizeof(int));
3360     level       = taucs_malloc(n     * sizeof(int));
3361     prev_p      = taucs_malloc(n     * sizeof(int));
3362 
3363 #ifdef GILBERT_NG_PEYTON_ANALYSIS_SUP
3364     prev_nbr         = taucs_malloc(n     * sizeof(int));
3365     first_descendant = taucs_malloc(n     * sizeof(int));
3366 #endif
3367 
3368     /* we allocate scratch vectors to avoid conditionals */
3369     /* in the inner loop.                                */
3370 
3371     if (l_colcount) l_cc = l_colcount;
3372     else            l_cc = (int*) taucs_malloc(n*sizeof(int));
3373     if (l_rowcount) l_rc = l_rowcount;
3374     else            l_rc = (int*) taucs_malloc(n*sizeof(int));
3375     if (l_nnz)      l_nz = l_nnz;
3376     else            l_nz = &tmp;
3377 
3378 
3379     if (!first_child || !next_child || !postorder
3380 	|| !ipostorder || !wt || !level || !prev_p
3381 	|| (!l_colcount && !l_cc)
3382 	|| (!l_rowcount && !l_rc)
3383 #ifdef GILBERT_NG_PEYTON_ANALYSIS_SUP
3384 	|| !prev_nbr || !first_descendant
3385 #endif
3386 	) {
3387       taucs_free(uf);
3388 
3389       if (!l_colcount) taucs_free(l_cc);
3390       if (!l_rowcount) taucs_free(l_rc);
3391 
3392       taucs_free(postorder);
3393       taucs_free(ipostorder);
3394       taucs_free(wt);
3395       taucs_free(level);
3396       taucs_free(prev_p);
3397 
3398 #ifdef GILBERT_NG_PEYTON_ANALYSIS_SUP
3399       taucs_free(prev_nbr);
3400       taucs_free(first_descendant);
3401 #endif
3402       return -1;
3403     }
3404 
3405     /*for (j=0; j<n; j++) printf("parent[%d] = %d\n",j,parent[j]);*/
3406 
3407     /* compute the postorder */
3408 
3409     for (j=0; j<=n; j++) first_child[j] = -1;
3410     for (j=n-1; j>=0; j--) {
3411       next_child[j] = first_child[parent[j]];
3412       first_child[parent[j]] = j;
3413     }
3414 
3415     {
3416       int next = 0;
3417       recursive_postorder(n,first_child,next_child,
3418 			  postorder,
3419 			  ipostorder,&next);
3420     }
3421 
3422     /* sort by postorder of etree */
3423     /* compute level, fst_desc */
3424 
3425     tree_level(n,TRUE,first_child,next_child,
3426 	       level,-1);
3427 
3428     for (u=0; u < n; u++) prev_p  [u] = -1;
3429     for (u=0; u < n; u++) l_rc    [u] =  1;
3430     for (u=0; u < n; u++) ordered_uf_makeset(uf,u);
3431     for (u=0; u < n; u++) {
3432       if (first_child[u] == -1)
3433 	wt[u] = 1; /* leaves     */
3434       else
3435 	wt[u] =  0; /* nonleaves */
3436     }
3437 
3438 #ifdef GILBERT_NG_PEYTON_ANALYSIS_SUP
3439     for (u=0; u < n; u++) prev_nbr[u] = -1;
3440 
3441     tree_first_descendant(n,TRUE,
3442 			  first_child,next_child,ipostorder,
3443 			  first_descendant);
3444 #endif
3445 
3446     taucs_free(first_child);
3447     taucs_free(next_child);
3448 
3449     for (p=0; p<n; p++) {
3450       jp = postorder[p];
3451       if (parent[jp] != n) wt[parent[jp]] --;
3452       for (ip = (A->colptr)[jp]; ip < (A->colptr)[jp+1]; ip++) {
3453 	ju = (A->rowind)[ip];
3454 	u  = ipostorder[ju];
3455 	if (ju==jp) continue; /* we only want proper neighbors */
3456 #ifdef GILBERT_NG_PEYTON_ANALYSIS_SUP
3457 	if (first_descendant[jp] > prev_nbr[u]) {
3458 #else
3459 	if (1) {
3460 #endif
3461 	  wt[jp] ++;
3462 	  pprime = prev_p[ju];
3463 	  if (pprime == -1)
3464 	    l_rc[ju] += level[jp] - level[ju];
3465 	  else {
3466 	    q = ordered_uf_find(uf,pprime);
3467 	    l_rc[ju] += level[jp] - level[q];
3468 	    wt[q] --;
3469 	  }
3470 	  prev_p[ju] = jp;
3471 	}
3472 #ifdef GILBERT_NG_PEYTON_ANALYSIS_SUP
3473 	prev_nbr[u] = p;
3474 #endif
3475       }
3476       if (parent[jp] != n) {
3477 	if (!(ipostorder[parent[jp]] > ipostorder[jp])) {
3478 	  printf("jp %d parent %d (ipo_j %d ipo_parent %d\n",
3479 		 jp,parent[jp],ipostorder[jp],ipostorder[parent[jp]]);
3480 	}
3481 	assert(ipostorder[parent[jp]] > ipostorder[jp]);
3482 	ordered_uf_union(uf,jp,parent[jp]);
3483       }
3484     }
3485 
3486     *l_nz = 0;
3487     for (u=0; u<n; u++) {
3488       l_cc[u] = wt[u];
3489       *l_nz += wt[u];
3490     }
3491     for (u=0; u<n; u++) {
3492       if (parent[u] != n) {
3493 	l_cc[parent[u]] += l_cc[u];
3494 	*l_nz += l_cc[u];
3495       }
3496     }
3497 
3498     /* free scrtach vectors                              */
3499 
3500     if (!l_colcount) taucs_free(l_cc);
3501     if (!l_rowcount) taucs_free(l_rc);
3502 
3503     /* free other data structures */
3504 
3505     taucs_free(postorder);
3506     taucs_free(ipostorder);
3507     taucs_free(wt);
3508     taucs_free(level);
3509     taucs_free(prev_p);
3510 
3511 #ifdef GILBERT_NG_PEYTON_ANALYSIS_SUP
3512     taucs_free(prev_nbr);
3513     taucs_free(first_descendant);
3514 #endif
3515   }
3516 
3517   taucs_free(uf);
3518 
3519   return 0;
3520 }
3521 
3522 int
3523 taucs_ccs_etree_liu(taucs_ccs_matrix* A,
3524 		    int* parent,
3525 		    int* l_colcount,
3526 		    int* l_rowcount,
3527 		    int* l_nnz)
3528 {
3529   int n = A->n;
3530   int i,j,k,ip,kp;/*jp omer*/
3531   int nnz,jnnz;
3532 
3533   int* uf;
3534   int* rowptr;
3535   int* colind;
3536 
3537   int* rowcount;
3538   int* marker;
3539   int* realroot;
3540 
3541   int* l_cc;
3542   int* l_rc;
3543 
3544   /* we need the row structures for the lower triangle */
3545 
3546   nnz = (A->colptr)[n];
3547 
3548   uf       = taucs_malloc(n     * sizeof(int));
3549   rowcount = taucs_malloc((n+1) * sizeof(int));
3550   rowptr   = taucs_malloc((n+1) * sizeof(int));
3551   colind   = taucs_malloc(nnz   * sizeof(int));
3552 
3553   for (i=0; i <=n; i++) rowcount[i] = 0;
3554 
3555   for (j=0; j < n; j++) {
3556 
3557     jnnz = (A->colptr)[j+1] - (A->colptr)[j];
3558 
3559     for (ip=0; ip<jnnz; ip++) {
3560       i = (A->rowind)[(A->colptr)[j] + ip];
3561       if (j < i) rowcount[i]++;
3562     }
3563   }
3564 
3565   ip = 0;
3566   for (i=0; i <= n; i++) {
3567     int next_ip = ip + rowcount[i];
3568     rowcount[i] = ip;
3569     rowptr  [i] = ip;
3570     ip = next_ip;
3571   }
3572 
3573   for (j=0; j < n; j++) {
3574     jnnz = (A->colptr)[j+1] - (A->colptr)[j];
3575 
3576     for (ip=0; ip<jnnz; ip++) {
3577       i = (A->rowind)[(A->colptr)[j] + ip];
3578       if (i==j) continue;
3579       assert( rowcount[i] < rowptr[i+1] );
3580       colind[ rowcount[i] ] = j;
3581       rowcount[i]++;
3582     }
3583   }
3584 
3585   /* now compute the etree */
3586 
3587   {
3588     int u,t,vroot;
3589     realroot = rowcount; /* reuse space */
3590 
3591     for (i=0; i<n; i++) {
3592       uf_makeset(uf,i);
3593       realroot[i] = i;
3594       parent[i] = n;
3595       vroot = i;
3596       for (kp=rowptr[i]; kp<rowptr[i+1]; kp++) {
3597 	k=colind[kp];
3598 	u = uf_find(uf,k);
3599 	t = realroot[u];
3600 	if (parent[t] == n && t != i) {
3601 	  parent[t] = i;
3602 	  vroot = uf_union(uf,vroot,u);
3603 	  realroot[vroot] = i;
3604 	}
3605       }
3606     }
3607   }
3608 
3609   /* compute column counts */
3610 
3611   if (l_colcount || l_rowcount || l_nnz) {
3612     int* l_nz;
3613     int  tmp;
3614 
3615     /* we allocate scratch vectors to avoid conditionals */
3616     /* in the inner loop.                                */
3617 
3618     if (l_colcount) l_cc = l_colcount;
3619     else            l_cc = (int*) taucs_malloc(n*sizeof(int));
3620     if (l_rowcount) l_rc = l_rowcount;
3621     else            l_rc = (int*) taucs_malloc(n*sizeof(int));
3622     if (l_nnz)      l_nz = l_nnz;
3623     else            l_nz = &tmp;
3624 
3625     marker = rowcount; /* we reuse the space */
3626 
3627     for (j=0; j < n; j++) l_cc[j] = 1;
3628     *l_nz = n;
3629 
3630     for (i=0; i<n; i++) marker[i] = n; /* clear the array */
3631 
3632     for (i=0; i<n; i++) {
3633       l_rc[i] = 1;
3634       marker[ i ] = i;
3635       for (kp=rowptr[i]; kp<rowptr[i+1]; kp++) {
3636 	k=colind[kp];
3637 	j=k;
3638 	while (marker[j] != i) {
3639 	  l_cc[j]++;
3640 	  l_rc[i]++;
3641 	  (*l_nz)++;
3642 	  marker[j] = i;
3643 	  j = parent[j];
3644 	}
3645       }
3646     }
3647 
3648     /* free scrtach vectors                              */
3649 
3650     if (!l_colcount) taucs_free(l_cc);
3651     if (!l_rowcount) taucs_free(l_rc);
3652   }
3653 
3654   taucs_free(colind);
3655   taucs_free(rowptr);
3656   taucs_free(rowcount);
3657   taucs_free(uf);
3658 
3659   return 0;
3660 }
3661 
3662 
3663 int
3664 taucs_ccs_symbolic_elimination(taucs_ccs_matrix* A,
3665 			       void* vL,
3666 			       int do_order,
3667 			       int max_depth
3668 			       )
3669 {
3670   supernodal_factor_matrix* L = (supernodal_factor_matrix*) vL;
3671   int* first_child;
3672   int* next_child;
3673   int j;
3674   int* column_to_sn_map;
3675   int* map;
3676   int* rowind;
3677   int* parent;
3678   int* ipostorder;
3679 
3680   int depth;
3681 
3682   L->n           = A->n;
3683   /* use calloc so we can deallocate unallocated entries */
3684   L->sn_struct   = (int**)taucs_calloc((A->n  ),sizeof(int*));
3685   L->sn_size     = (int*) taucs_malloc((A->n+1)*sizeof(int));
3686   L->sn_up_size  = (int*) taucs_malloc((A->n+1)*sizeof(int));
3687   L->first_child = (int*) taucs_malloc((A->n+1)*sizeof(int));
3688   L->next_child  = (int*) taucs_malloc((A->n+1)*sizeof(int));
3689 
3690   column_to_sn_map = (int*) taucs_malloc((A->n+1)*sizeof(int));
3691   map              = (int*) taucs_malloc((A->n+1)*sizeof(int));
3692   first_child      = (int*) taucs_malloc((A->n+1)*sizeof(int));
3693   next_child       = (int*) taucs_malloc((A->n+1)*sizeof(int));
3694   parent           = (int*) taucs_malloc((A->n+1)*sizeof(int));
3695   rowind           = (int*) taucs_malloc((A->n  )*sizeof(int));
3696 
3697   if (!(L->sn_struct) || !(L->sn_size) || !(L->sn_up_size) ||
3698       !(L->first_child) || !(L->next_child) || !column_to_sn_map
3699       || !map || !first_child || !next_child || !rowind || !parent) {
3700     taucs_free(parent);
3701     taucs_free(rowind);
3702     taucs_free(next_child);
3703     taucs_free(first_child);
3704     taucs_free(map);
3705     taucs_free(column_to_sn_map);
3706     taucs_free(L->next_child);
3707     taucs_free(L->first_child);
3708     taucs_free(L->sn_up_size);
3709     taucs_free(L->sn_size);
3710     taucs_free(L->sn_struct);
3711     L->sn_struct = NULL;
3712     L->sn_size = L->sn_up_size = L->first_child = L->next_child = NULL;
3713     return -1;
3714   }
3715 
3716   if (taucs_ccs_etree(A,parent,NULL,NULL,NULL) == -1) {
3717     taucs_free(parent);
3718     taucs_free(rowind);
3719     taucs_free(next_child);
3720     taucs_free(first_child);
3721     taucs_free(map);
3722     taucs_free(column_to_sn_map);
3723     taucs_free(L->next_child);
3724     taucs_free(L->first_child);
3725     taucs_free(L->sn_up_size);
3726     taucs_free(L->sn_size);
3727     taucs_free(L->sn_struct);
3728     L->sn_struct = NULL;
3729     L->sn_size = L->sn_up_size = L->first_child = L->next_child = NULL;
3730     return -1;
3731   }
3732 
3733   if (0) {
3734     double wtime;
3735     int *cc1,*cc2,*rc1,*rc2;
3736     int *p1;
3737     int nnz1,nnz2;
3738 
3739     cc1=(int*)taucs_malloc((A->n)*sizeof(int));
3740     cc2=(int*)taucs_malloc((A->n)*sizeof(int));
3741     rc1=(int*)taucs_malloc((A->n)*sizeof(int));
3742     rc2=(int*)taucs_malloc((A->n)*sizeof(int));
3743     p1 =(int*)taucs_malloc((A->n)*sizeof(int));
3744 
3745     wtime = taucs_wtime();
3746     taucs_ccs_etree_liu(A,parent,cc1,rc1,&nnz1);
3747     wtime = taucs_wtime() - wtime;
3748     printf("\t\t\tLiu Analysis = %.3f seconds\n",wtime);
3749 
3750     wtime = taucs_wtime();
3751     taucs_ccs_etree(A,p1,cc2,rc2,&nnz2);
3752     wtime = taucs_wtime() - wtime;
3753     printf("\t\t\tGNP Analysis = %.3f seconds\n",wtime);
3754 
3755     for (j=0; j<(A->n); j++) assert(parent[j]==p1[j]);
3756     for (j=0; j<(A->n); j++) {
3757       if (cc1[j]!=cc2[j]) printf("j=%d cc1=%d cc2=%d\n",j,cc1[j],cc2[j]);
3758       assert(cc1[j]==cc2[j]);
3759     }
3760 
3761     for (j=0; j<(A->n); j++) {
3762       if (rc1[j]!=rc2[j]) printf("j=%d rc1=%d rc2=%d\n",j,rc1[j],rc2[j]);
3763       assert(rc1[j]==rc2[j]);
3764     }
3765 
3766     if (nnz1!=nnz2) printf("nnz1=%d nnz2=%d\n",nnz1,nnz2);
3767 
3768     taucs_free(cc1); taucs_free(cc2); taucs_free(rc1); taucs_free(rc2);
3769   }
3770 
3771   for (j=0; j <= (A->n); j++) first_child[j] = -1;
3772   for (j = (A->n)-1; j >= 0; j--) {
3773     int p = parent[j];
3774     next_child[j] = first_child[p];
3775     first_child[p] = j;
3776   }
3777 
3778   /* let's compute the depth of the etree, to bail out if it is too deep */
3779   /* the whole thing will work better if we compute supernodal etrees    */
3780 
3781   {
3782     int next_depth_count;
3783     int this_depth_count;
3784     int child,i;
3785 
3786     int* this_depth = rowind; /* we alias rowind */
3787     int* next_depth = map;    /* and map         */
3788     int* tmp;
3789 
3790     this_depth[0] = A->n;
3791     this_depth_count = 1;
3792     next_depth_count = 0;
3793     depth = -1;
3794 
3795     while (this_depth_count) {
3796       for (i=0; i<this_depth_count; i++) {
3797 	child = first_child[ this_depth[i] ];
3798 	while (child != -1) {
3799 	  next_depth[ next_depth_count ] = child;
3800 	  next_depth_count++;
3801 	  child = next_child[ child ];
3802 	}
3803       }
3804 
3805       tmp = this_depth;
3806       this_depth = next_depth;
3807       next_depth = tmp;
3808 
3809       this_depth_count = next_depth_count;
3810       next_depth_count = 0;
3811       depth++;
3812     }
3813   }
3814 
3815   taucs_printf("\t\tElimination tree depth is %d\n",depth);
3816 
3817   if (max_depth && depth > max_depth) {
3818     taucs_printf("taucs_ccs_symbolic_elimination: etree depth %d, maximum allowed is %d\n",
3819 		 depth, max_depth);
3820     taucs_free(parent);
3821     taucs_free(rowind);
3822     taucs_free(next_child);
3823     taucs_free(first_child);
3824     taucs_free(map);
3825     taucs_free(column_to_sn_map);
3826     taucs_free(L->next_child);
3827     taucs_free(L->first_child);
3828     taucs_free(L->sn_up_size);
3829     taucs_free(L->sn_size);
3830     taucs_free(L->sn_struct);
3831     L->sn_struct = NULL;
3832     L->sn_size = L->sn_up_size = L->first_child = L->next_child = NULL;
3833     return -1;
3834   }
3835 
3836   /*
3837   taucs_free(parent);
3838   ipostorder = (int*)taucs_malloc((A->n+1)*sizeof(int));
3839   */
3840 
3841   ipostorder = parent;
3842   {
3843     int next = 0;
3844     /*int* postorder = (int*)taucs_malloc((A->n+1)*sizeof(int));*/
3845     recursive_postorder(A->n,first_child,next_child,
3846 			NULL,
3847 			ipostorder,&next);
3848     /*
3849     printf("ipostorder ");
3850     for (j=0; j <= (A->n); j++) printf("%d ",ipostorder[j]);
3851     printf("\n");
3852     printf(" postorder ");
3853     for (j=0; j <= (A->n); j++) printf("%d ",postorder[j]);
3854     printf("\n");
3855     */
3856   }
3857 
3858   L->n_sn = 0;
3859   for (j=0; j < (A->n); j++) map[j] = -1;
3860   for (j=0; j <= (A->n); j++) (L->first_child)[j] = (L->next_child)[j] = -1;
3861 
3862   if (recursive_symbolic_elimination(A->n,
3863 				     A,
3864 				     first_child,next_child,
3865 				     &(L->n_sn),
3866 				     L->sn_size,L->sn_up_size,L->sn_struct,
3867 				     L->first_child,L->next_child,
3868 				     rowind,
3869 				     column_to_sn_map,
3870 				     map,
3871 				     do_order,ipostorder
3872 				     )
3873       == -1) {
3874     for (j=0; j < (A->n); j++) taucs_free((L->sn_struct)[j]);
3875 
3876     taucs_free(parent);
3877     taucs_free(rowind);
3878     taucs_free(next_child);
3879     taucs_free(first_child);
3880     taucs_free(map);
3881     taucs_free(column_to_sn_map);
3882     taucs_free(L->next_child);
3883     taucs_free(L->first_child);
3884     taucs_free(L->sn_up_size);
3885     taucs_free(L->sn_size);
3886     taucs_free(L->sn_struct);
3887     L->sn_struct = NULL;
3888     L->sn_size = L->sn_up_size = L->first_child = L->next_child = NULL;
3889     return -1;
3890   }
3891 
3892   {
3893     double nnz   = 0.0;
3894     double flops = 0.0;
3895     int sn,i,colnnz;
3896     int bytes;
3897 
3898     bytes =
3899       1*sizeof(char)                /* uplo             */
3900       + 2*sizeof(int)               /* n, n_sn          */
3901       + 3*(L->n_sn)*sizeof(int)     /* etree            */
3902       + 4*(L->n_sn)*sizeof(int)     /* block sizes, lda */
3903       + 1*(L->n_sn)*sizeof(int*)    /* row/col indices  */
3904       + 3*(L->n_sn)*sizeof(taucs_datatype*) /* actual blocks    */
3905       ;
3906 
3907     for (sn=0; sn<(L->n_sn); sn++) {
3908       bytes += (L->sn_up_size)[sn] * sizeof(int);
3909       bytes += ((L->sn_size)[sn]*(L->sn_up_size)[sn]) * sizeof(taucs_datatype);
3910 
3911       for (i=0, colnnz = (L->sn_up_size)[sn];
3912 	   i<(L->sn_size)[sn];
3913 	   i++, colnnz--) {
3914 	/* There was a bug here. I did not count muliply-adds in the
3915 	   update part of the computation as 2 flops but one. */
3916 	/*flops += ((double)(colnnz) - 1.0) * ((double)(colnnz) + 2.0) / 2.0;*/
3917 	flops += 1.0 + ((double)(colnnz)) * ((double)(colnnz));
3918 	nnz   += (double) (colnnz);
3919       }
3920     }
3921     taucs_printf("\t\tSymbolic Analysis of LL^T: %.2e nonzeros, %.2e flops, %.2e bytes in L\n",
3922 		 nnz, flops, (float) bytes);
3923   }
3924 
3925   for (j=0; j < (A->n); j++) map[j] = -1;
3926   if (1)
3927   (void) recursive_amalgamate_supernodes((L->n_sn) - 1,
3928 					 &(L->n_sn),
3929 					 L->sn_size,L->sn_up_size,L->sn_struct,
3930 					 L->first_child,L->next_child,
3931 					 rowind,
3932 					 column_to_sn_map,
3933 					 map,
3934 					 do_order,ipostorder
3935 					 );
3936 
3937 
3938   {
3939     double nnz   = 0.0;
3940     double flops = 0.0;
3941     int sn,i,colnnz;
3942     int bytes;
3943 
3944     bytes =
3945       1*sizeof(char)                /* uplo             */
3946       + 2*sizeof(int)               /* n, n_sn          */
3947       + 3*(L->n_sn)*sizeof(int)     /* etree            */
3948       + 4*(L->n_sn)*sizeof(int)     /* block sizes, lda */
3949       + 1*(L->n_sn)*sizeof(int*)    /* row/col indices  */
3950       + 3*(L->n_sn)*sizeof(taucs_datatype*) /* actual blocks    */
3951       ;
3952 
3953     for (sn=0; sn<(L->n_sn); sn++) {
3954       bytes += (L->sn_up_size)[sn] * sizeof(int);
3955       bytes += ((L->sn_size)[sn]*(L->sn_up_size)[sn]) * sizeof(taucs_datatype);
3956 
3957       for (i=0, colnnz = (L->sn_up_size)[sn];
3958 	   i<(L->sn_size)[sn];
3959 	   i++, colnnz--) {
3960 	/* There was a bug here. I did not count muliply-adds in the
3961 	   update part of the computation as 2 flops but one. */
3962 	/*flops += ((double)(colnnz) - 1.0) * ((double)(colnnz) + 2.0) / 2.0;*/
3963 	flops += 1.0 + ((double)(colnnz)) * ((double)(colnnz));
3964 	nnz   += (double) (colnnz);
3965       }
3966     }
3967     taucs_printf("\t\tRelaxed  Analysis of LL^T: %.2e nonzeros, %.2e flops, %.2e bytes in L\n",
3968 		 nnz, flops, (float) bytes);
3969   }
3970 
3971   /*
3972   {
3973     int i;
3974     printf("c2sn: ");
3975     for (i=0; i<A->n; i++) printf("%d ",column_to_sn_map[i]);
3976     printf("\n");
3977   }
3978   */
3979 
3980   taucs_free(parent);
3981   taucs_free(rowind);
3982   taucs_free(map);
3983   taucs_free(column_to_sn_map);
3984   taucs_free(next_child);
3985   taucs_free(first_child);
3986 
3987   L->sn_blocks_ld  = taucs_malloc((L->n_sn) * sizeof(int));
3988   L->sn_blocks     = taucs_calloc((L->n_sn), sizeof(taucs_datatype*)); /* so we can free before allocation */
3989 
3990   L->up_blocks_ld  = taucs_malloc((L->n_sn) * sizeof(int));
3991   L->up_blocks     = taucs_calloc((L->n_sn), sizeof(taucs_datatype*));
3992 
3993   if (!(L->sn_blocks_ld)
3994       || !(L->sn_blocks_ld)
3995       || !(L->sn_blocks)
3996       || !(L->up_blocks_ld)
3997       || !(L->up_blocks))
3998     return -1; /* the caller will free L */
3999 
4000   return 0;
4001 }
4002 #endif
4003 
4004 
4005 /*************************************************************/
4006 /* end of file                                               */
4007 /*************************************************************/
4008 
4009 
4010 
4011 
4012 
4013 
4014 
4015 
4016 
4017 
4018 
4019 
4020 
4021