1 /*************************************************************/
2 /* */
3 /*************************************************************/
4
5 #include <stdio.h>
6 #include <stdlib.h>
7 #include <math.h>
8
9 #define NDEBUG
10 #include <assert.h>
11
12 #define TAUCS_CORE_CILK
13 #include "taucs.h"
14
15 #ifdef TAUCS_CILK
16 #pragma lang -C
17 #endif
18
19 #ifndef TAUCS_CORE_GENERAL
20 #ifdef TAUCS_CILK
21
22 /*** GEMM ***/
23
24 #define TAUCS_THRESHOLD_GEMM_SMALL 20
25 #define TAUCS_THRESHOLD_GEMM_BLAS 80
26
27 static void
taucs_gemm_NCm1p1_small(int m,int n,int k,taucs_datatype * A,int lda,taucs_datatype * B,int ldb,taucs_datatype * C,int ldc)28 taucs_gemm_NCm1p1_small(int m, int n, int k,
29 taucs_datatype* A, int lda,
30 taucs_datatype* B, int ldb,
31 taucs_datatype* C, int ldc)
32 {
33 int j,i,l;
34 taucs_datatype* Cj;
35 taucs_datatype* Ail;
36 taucs_datatype* Bjl;
37 taucs_datatype Cij;
38
39 Cj = C;
40 for (j=0; j<n; j++) {
41 for (i=0; i<m; i++) {
42 Cij = *Cj;
43 Ail = A + i;
44 Bjl = B + j;
45 for (l=0; l<k; l++) {
46 Cij = taucs_sub( Cij, taucs_mul( *Ail, taucs_conj( *Bjl ) ) );
47 Ail += lda;
48 Bjl += ldb;
49 }
50 *Cj = Cij;
51 Cj++;
52 }
53 Cj = Cj + ldc - m;
54 /* now Cj is at the top of column j+1 */
55 }
56 }
57
58 cilk static void
taucs_cilk_gemm(char * transa,char * transb,int * pm,int * pn,int * pk,taucs_real_datatype * alpha,taucs_datatype * A,int * plda,taucs_datatype * B,int * pldb,taucs_real_datatype * beta,taucs_datatype * C,int * pldc)59 taucs_cilk_gemm(char* transa, char* transb,
60 int* pm, int* pn, int* pk,
61 taucs_real_datatype* alpha,
62 taucs_datatype *A, int *plda,
63 taucs_datatype *B, int *pldb,
64 taucs_real_datatype* beta,
65 taucs_datatype *C, int *pldc)
66 {
67 int m = *pm;
68 int n = *pn;
69 int k = *pk;
70
71 assert(*transa == 'N');
72 assert(*transb == 'C');
73 assert(*alpha ==-1.0);
74 assert(*beta == 1.0);
75
76 if (n <= TAUCS_THRESHOLD_GEMM_SMALL && k <= TAUCS_THRESHOLD_GEMM_SMALL) {
77 /*fprintf(stderr,"GEMM SMALL\n");*/
78 taucs_gemm_NCm1p1_small(m,n,k,A,*plda,B,*pldb,C,*pldc);
79 return;
80 }
81
82 if (n <= TAUCS_THRESHOLD_GEMM_BLAS && k <= TAUCS_THRESHOLD_GEMM_BLAS) {
83 /*fprintf(stderr,"GEMM BLAS\n");*/
84 taucs_gemm(transa, transb,
85 pm, pn, pk,
86 alpha,
87 A, plda,
88 B, pldb,
89 beta,
90 C, pldc);
91 return;
92 }
93
94 if (k >= n && k >= m) {
95 int khalf1 = k/2;
96 int khalf2 = k-khalf1;
97 int lda = *plda;
98 int ldb = *pldb;
99 /*fprintf(stderr,"GEMM K/2\n");*/
100 spawn taucs_cilk_gemm(transa,transb,
101 pm, pn, &khalf1,
102 alpha,
103 A, plda,
104 B, pldb,
105 beta,
106 C, pldc);
107 sync;
108 spawn taucs_cilk_gemm(transa,transb,
109 pm, pn, &khalf2,
110 alpha,
111 A + khalf1*lda, plda,
112 B + khalf1*ldb, pldb,
113 beta,
114 C, pldc);
115 sync;
116 return;
117 }
118
119 if (n >= k && n >= m) {
120 int nhalf1 = n/2;
121 int nhalf2 = n-nhalf1;
122 int ldc = *pldc;
123 /*fprintf(stderr,"GEMM N/2\n");*/
124
125 spawn taucs_cilk_gemm(transa,transb,
126 pm, &nhalf1, pk,
127 alpha,
128 A, plda,
129 B, pldb,
130 beta,
131 C, pldc);
132
133
134 spawn taucs_cilk_gemm(transa,transb,
135 pm, &nhalf2, pk,
136 alpha,
137 A, plda,
138 B + nhalf1, pldb,
139 beta,
140 C + nhalf1*ldc, pldc);
141 sync;
142 return;
143 }
144
145 if (1 /* m >= k && m >= n*/) { /* the condition must be true */
146 int mhalf1 = m/2;
147 int mhalf2 = m-mhalf1;
148 /*fprintf(stderr,"GEMM M/2\n");*/
149
150 spawn taucs_cilk_gemm(transa,transb,
151 &mhalf1, pn, pk,
152 alpha,
153 A, plda,
154 B, pldb,
155 beta,
156 C, pldc);
157
158 spawn taucs_cilk_gemm(transa,transb,
159 &mhalf2, pn, pk,
160 alpha,
161 A + mhalf1, plda,
162 B, pldb,
163 beta,
164 C + mhalf1, pldc);
165 sync;
166 return;
167 }
168
169 assert(0);
170
171 }
172
173 /*** HERK ***/
174
175 #define TAUCS_THRESHOLD_HERK_SMALL 20
176 #define TAUCS_THRESHOLD_HERK_BLAS 80
177
178 static void
taucs_herk_LNm1p1_small(int n,int k,taucs_datatype * A,int lda,taucs_datatype * C,int ldc)179 taucs_herk_LNm1p1_small(int n, int k,
180 taucs_datatype* A, int lda,
181 taucs_datatype* C, int ldc)
182 {
183 int j,i,l;
184 taucs_datatype* Cj;
185 taucs_datatype* Ail;
186 taucs_datatype* Ajl;
187 taucs_datatype Cij;
188
189 Cj = C;
190 for (j=0; j<n; j++) {
191 for (i=j; i<n; i++) {
192 Cij = *Cj;
193 Ail = A + i;
194 Ajl = A + j;
195 for (l=0; l<k; l++) {
196 Cij = taucs_sub( Cij, taucs_mul( *Ail, taucs_conj( *Ajl ) ) );
197 Ail += lda;
198 Ajl += lda;
199 }
200 *Cj = Cij;
201 Cj++;
202 }
203 Cj = Cj + ldc - n;
204 /* now Cj is at the top of column j+1, move to the diagonal */
205 Cj = Cj + j+1;
206 }
207 }
208
209 cilk static void
taucs_cilk_herk(char * uplo,char * trans,int * pn,int * pk,taucs_real_datatype * alpha,taucs_datatype * A,int * plda,taucs_real_datatype * beta,taucs_datatype * C,int * pldc)210 taucs_cilk_herk(char* uplo, char* trans,
211 int* pn, int* pk,
212 taucs_real_datatype* alpha,
213 taucs_datatype *A, int *plda,
214 taucs_real_datatype* beta,
215 taucs_datatype *C, int *pldc)
216 {
217 int n = *pn;
218 int k = *pk;
219
220 assert(*uplo == 'L');
221 assert(*trans == 'N');
222 assert(*alpha ==-1.0);
223 assert(*beta == 1.0);
224
225 if (n <= TAUCS_THRESHOLD_HERK_SMALL && k <= TAUCS_THRESHOLD_HERK_SMALL) {
226 /*fprintf(stderr,"HERK SMALL\n");*/
227 taucs_herk_LNm1p1_small(n,k,A,*plda,C,*pldc);
228 return;
229 }
230
231 if (n <= TAUCS_THRESHOLD_HERK_BLAS && k <= TAUCS_THRESHOLD_HERK_BLAS) {
232 /*fprintf(stderr,"HERK BLAS\n");*/
233 taucs_herk(uplo,trans,
234 pn, pk,
235 alpha,
236 A, plda,
237 beta,
238 C, pldc);
239 return;
240 }
241
242 if (k > n) {
243 int khalf1 = k/2;
244 int khalf2 = k-khalf1;
245 int lda = *plda;
246 /*fprintf(stderr,"HERK K/2\n");*/
247 spawn taucs_cilk_herk(uplo,trans,
248 pn, &khalf1,
249 alpha,
250 A, plda,
251 beta,
252 C, pldc);
253 sync;
254 spawn taucs_cilk_herk(uplo,trans,
255 pn, &khalf2,
256 alpha,
257 A + khalf1*lda, plda,
258 beta,
259 C, pldc);
260 sync;
261 return;
262 } else {
263 int ldc = *pldc;
264 int nhalf1 = n/2;
265 int nhalf2 = n-nhalf1;
266 /*fprintf(stderr,"HERK N/2\n");*/
267 spawn taucs_cilk_herk(uplo,trans,
268 &nhalf1, pk,
269 alpha,
270 A, plda,
271 beta,
272 C, pldc);
273
274 spawn taucs_cilk_gemm("No Transpose", "Conjugate",
275 &nhalf2, &nhalf1, pk,
276 &taucs_minusone_const,
277 A+nhalf1 , plda,
278 A , plda,
279 &taucs_one_const,
280 C +nhalf1, pldc);
281
282 spawn taucs_cilk_herk(uplo,trans,
283 &nhalf2, pk,
284 alpha,
285 A + nhalf1, plda,
286 beta,
287 C + nhalf1*ldc + nhalf1, pldc);
288
289 sync;
290 return;
291 }
292
293 }
294
295 /*** TRSM ***/
296
297 #define TAUCS_THRESHOLD_TRSM_SMALL 20
298 #define TAUCS_THRESHOLD_TRSM_BLAS 80
299
300 static void
taucs_trsm_RLCNp1_small(int m,int n,taucs_datatype * A,int lda,taucs_datatype * B,int ldb)301 taucs_trsm_RLCNp1_small(int m, int n,
302 taucs_datatype* A, int lda,
303 taucs_datatype* B, int ldb)
304 {
305 int j,i,k;
306 taucs_datatype* Bi;
307 taucs_datatype* Bik;
308 taucs_datatype* Ajk;
309 taucs_datatype Bij;
310
311 Bi = B;
312 for (i=0; i<m; i++) {
313 for (j=0; j<n; j++) {
314 Bij = *(Bi + j*ldb);
315 Bik = Bi;
316 Ajk = A + j;
317 for (k=0; k<j; k++) {
318 Bij = taucs_sub( Bij, taucs_mul( *Bik, taucs_conj( *Ajk ) ) );
319 Bik += ldb;
320 Ajk += lda;
321 }
322 *(Bi + j*ldb) = taucs_div( Bij, taucs_conj( *Ajk ) );
323 }
324 Bi = Bi + 1;
325 }
326 }
327
328 cilk static void
taucs_cilk_trsm(char * side,char * uplo,char * transa,char * diag,int * pm,int * pn,taucs_datatype * alpha,taucs_datatype * A,int * plda,taucs_datatype * B,int * pldb)329 taucs_cilk_trsm(char* side, char* uplo, char* transa, char* diag,
330 int* pm, int* pn,
331 taucs_datatype* alpha,
332 taucs_datatype *A, int *plda,
333 taucs_datatype *B, int *pldb)
334 {
335 int n = *pn;
336 int m = *pm;
337
338 assert(*side == 'R');
339 assert(*uplo == 'L');
340 assert(*transa == 'C');
341 assert(*diag == 'N');
342 assert(taucs_re(*alpha) == 1.0);
343 assert(taucs_im(*alpha) == 0.0);
344
345 if (m <= TAUCS_THRESHOLD_TRSM_SMALL && n <= TAUCS_THRESHOLD_TRSM_SMALL) {
346 /*fprintf(stderr,"TRSM SMALL\n");*/
347 taucs_trsm_RLCNp1_small(m,n,A,*plda,B,*pldb);
348 return;
349 }
350
351 if (m <= TAUCS_THRESHOLD_TRSM_BLAS && n <= TAUCS_THRESHOLD_TRSM_BLAS) {
352 /*fprintf(stderr,"TRSM BLAS\n");*/
353 taucs_trsm(side,uplo,transa,diag,
354 pm, pn,
355 alpha,
356 A, plda,
357 B, pldb);
358 return;
359 }
360
361 if (m >= n) {
362 int mhalf1 = m/2;
363 int mhalf2 = m-mhalf1;
364 /*fprintf(stderr,"TRSM M/2\n");*/
365 spawn taucs_cilk_trsm(side,uplo,transa,diag,
366 &mhalf1, pn,
367 alpha,
368 A, plda,
369 B, pldb);
370 spawn taucs_cilk_trsm(side,uplo,transa,diag,
371 &mhalf2, pn,
372 alpha,
373 A, plda,
374 B+mhalf1, pldb);
375 sync;
376 return;
377 } else {
378 int lda = *plda;
379 int ldb = *pldb;
380 int nhalf1 = n/2;
381 int nhalf2 = n-nhalf1;
382 /*fprintf(stderr,"TRSM N/2\n");*/
383 spawn taucs_cilk_trsm(side,uplo,transa,diag,
384 pm, &nhalf1,
385 alpha,
386 A, plda,
387 B, pldb);
388 sync;
389 spawn taucs_cilk_gemm("No Transpose", "Conjugate",
390 pm, &nhalf2, &nhalf1,
391 &taucs_minusone_const,
392 B , pldb,
393 A+nhalf1 , plda,
394 &taucs_one_const,
395 B +nhalf1*ldb, pldb);
396 sync;
397 spawn taucs_cilk_trsm(side,uplo,transa,diag,
398 pm, &nhalf2,
399 alpha,
400 A+nhalf1+nhalf1*lda, plda,
401 B +nhalf1*ldb, pldb);
402 sync;
403 return;
404 }
405 }
406
407 /*** POTRF ***/
408
409 #define TAUCS_THRESHOLD_POTRF_SMALL 20
410 #define TAUCS_THRESHOLD_POTRF_BLAS 80
411
412 /*
413 this routine is for Lower only, and returns
414 0 or the index of the nonpositive diagonal.
415 */
416 static int
taucs_potrf_lower_small(int n,taucs_datatype * A,int lda)417 taucs_potrf_lower_small(int n, taucs_datatype* A, int lda)
418 {
419 int j,i,k;
420 taucs_datatype* Aj;
421 taucs_datatype* Ajk;
422 taucs_datatype* Aik;
423 taucs_datatype Aij;
424 taucs_datatype scale;
425
426 Aj = A;
427 for (j=0; j<n; j++) {
428 for (i=j; i<n; i++) {
429 Aij = taucs_zero_const;
430 Ajk = A+j; /* k = 0 */
431 Aik = A+i; /* k = 0 */
432 for (k=0; k<j; k++) {
433 Aij = taucs_add( Aij , taucs_mul( (*Aik) , (*Ajk) ) );
434 Aik += lda;
435 Ajk += lda;
436 }
437 Aj[i] = taucs_sub( Aj[i] , Aij );
438 }
439 if ( taucs_re(Aj[j]) < 0 ) return j+1;
440 scale = taucs_div( taucs_one_const, taucs_sqrt(Aj[j]) );
441 for (i=j; i<n; i++)
442 Aj[i] = taucs_mul(Aj[i] , scale);
443 Aj += lda;
444 }
445
446 return 0;
447 }
448
449 cilk static void
taucs_cilk_potrf(char * uplo,int * pn,taucs_datatype * A,int * plda,int * pinfo)450 taucs_cilk_potrf(char* uplo,
451 int* pn,
452 taucs_datatype* A, int* plda,
453 int* pinfo)
454 {
455 int n = *pn;
456 int nhalf1,nhalf2;
457
458 assert(*uplo == 'L');
459
460 if (n <= TAUCS_THRESHOLD_POTRF_SMALL) {
461 /*fprintf(stderr,"POTRF SMALL\n");*/
462 *pinfo = taucs_potrf_lower_small(*pn,A,*plda);
463 return;
464 }
465
466 if (n <= TAUCS_THRESHOLD_POTRF_BLAS) {
467 /*fprintf(stderr,"POTRF BLAS\n");*/
468 taucs_potrf(uplo,pn,A,plda,pinfo);
469 return;
470 }
471
472 /*fprintf(stderr,"POTRF RECIRSIVE\n");*/
473
474 nhalf1 = n/2;
475 nhalf2 = n-nhalf1;
476
477 spawn taucs_cilk_potrf(uplo,&nhalf1,A,plda,pinfo);
478 sync;
479
480 if (*pinfo) return;
481
482 spawn taucs_cilk_trsm ("Right",
483 "Lower",
484 "Conjugate",
485 "No unit diagonal",
486 &nhalf2,&nhalf1,
487 &taucs_one_const,
488 A, plda,
489 A+nhalf1,plda);
490 sync;
491 /*
492 taucs_trsm ("Right",
493 "Lower",
494 "Conjugate",
495 "No unit diagonal",
496 &nhalf2,&nhalf1,
497 &taucs_one_const,
498 A, plda,
499 A+nhalf1,plda);
500 */
501
502 spawn taucs_cilk_herk ("Lower",
503 "No Conjugate",
504 &nhalf2,&nhalf1,
505 &taucs_minusone_real_const,
506 A+nhalf1,plda,
507 &taucs_one_real_const,
508 A+nhalf1+(nhalf1 * *plda), plda);
509 sync;
510
511 spawn taucs_cilk_potrf(uplo,&nhalf2,A+nhalf1+(nhalf1 * *plda),plda,pinfo);
512 sync;
513
514 if (*pinfo) *pinfo += nhalf1;
515 }
516 #else
517 #define taucs_cilk_potrf taucs_potrf
518 #define taucs_cilk_gemm taucs_gemm
519 #define taucs_cilk_trsm taucs_trsm
520 #define taucs_cilk_herk taucs_herk
521 #endif
522 #endif
523
524 /*************************************************************/
525 /* These are really generic routines */
526 /*************************************************************/
527
528 #if 0
529 #ifdef TAUCS_CORE_GENERAL
530 void* taucs_cilk_init() {
531 #ifdef TAUCS_CILK
532 CilkContext* context;
533 int argc;
534 #define CILK_ACTIVE_SIZE 8
535 char* argv[] = {"program_name","--nproc","8",0};
536
537 for (argc=0; argv[argc]; argc++);
538
539 taucs_printf("taucs_cilk_init\n");
540 context = Cilk_init(&argc,argv);
541 return context;
542 #else
543 taucs_printf("taucs_cilk_init: This is not a Cilk build\n");
544 return NULL;
545 #endif
546 }
547
548 void taucs_cilk_terminate(void* context) {
549 #ifdef TAUCS_CILK
550 Cilk_terminate((CilkContext*) context);
551 #endif
552 }
553
554 #endif /* TAUCS_CORE_GENERAL */
555 #endif
556 /*************************************************************/
557 /* End of Cilk-related generic routines */
558 /*************************************************************/
559
560 #if 0
561 /*omer added this, I don't know why yet*/
562 #ifdef TAUCS_CORE_GENERAL
563 taucs_double taucs_dzero_const = 0.0;
564 taucs_double taucs_done_const = 1.0;
565 taucs_double taucs_dminusone_const = -1.0;
566
567 taucs_single taucs_szero_const = 0.0f;
568 taucs_single taucs_sone_const = 1.0f;
569 taucs_single taucs_sminusone_const = -1.0f;
570 #endif
571 #endif
572
573 /*************************************************************/
574 /* structures */
575 /*************************************************************/
576
577 #define FALSE 0
578 #define TRUE 1
579
580 /*#define BLAS_FLOPS_CUTOFF 1000.0*/
581 #define BLAS_FLOPS_CUTOFF -1.0
582 #define SOLVE_DENSE_CUTOFF 5
583
584 typedef struct {
585 int sn_size;
586 int n;
587 int* rowind;
588
589 int up_size;
590 int* sn_vertices;
591 int* up_vertices;
592
593 taucs_datatype* f1;
594 taucs_datatype* f2;
595 taucs_datatype* u;
596
597 } supernodal_frontal_matrix;
598
599 #define SFM_F1 f1
600 #define SFM_F2 f2
601 #define SFM_U u
602
603 typedef struct {
604 int flags;
605
606 char uplo; /* 'u' for upper, 'l' for lower, ' ' don't know; prefer lower. */
607 int n; /* size of matrix */
608 int n_sn; /* number of supernodes */
609
610 int* parent; /* supernodal elimination tree */
611 int* first_child;
612 int* next_child;
613
614 int* sn_size; /* size of supernodes (diagonal block) */
615 int* sn_up_size; /* size of subdiagonal update blocks */
616 int** sn_struct; /* row structure of supernodes */
617
618 int* sn_blocks_ld; /* lda of supernode blocks */
619 taucs_datatype** sn_blocks; /* supernode blocks */
620
621 int* up_blocks_ld; /* lda of update blocks */
622 taucs_datatype** up_blocks; /* update blocks */
623 } supernodal_factor_matrix;
624
625 #ifdef TAUCS_CORE_GENERAL
626 /*************************************************************/
627 /* for qsort */
628 /*************************************************************/
629
630 /* this is never used */
631 /*
632 static int compare_ints(void* vx, void* vy)
633 {
634 int* ix = (int*)vx;
635 int* iy = (int*)vy;
636 if (*ix < *iy) return -1;
637 if (*ix > *iy) return 1;
638 return 0;
639 }
640 */
641
642 static int* compare_indirect_map;
compare_indirect_ints(const void * vx,const void * vy)643 static int compare_indirect_ints( const void* vx, const void* vy)
644 {
645 int* ix = (int*)vx;
646 int* iy = (int*)vy;
647 if (compare_indirect_map[*ix] < compare_indirect_map[*iy]) return -1;
648 if (compare_indirect_map[*ix] > compare_indirect_map[*iy]) return 1;
649 return 0;
650 }
651
652 /*************************************************************/
653 /* radix sort */
654 /*************************************************************/
655
656 #if 0
657 /* NCOUNTS = 2^LOGRADIX */
658
659 #define RADIX_SORT_LOGRADIX 4
660 #define RADIX_SORT_NCOUNTS 16
661
662 static unsigned int counts[RADIX_SORT_NCOUNTS];
663
664 static int
665 radix_sort(unsigned int* x, int n)
666 {
667 int i;
668 unsigned int mask;
669
670 unsigned int ncounts;
671
672 unsigned int* y;
673 unsigned int* to;
674 unsigned int* from;
675
676 unsigned int v;
677 unsigned int partialsum;
678 unsigned int next;
679 unsigned int bits_sorted;
680
681 if (RADIX_SORT_LOGRADIX >= 8*sizeof(unsigned int)) {
682 taucs_printf("radix sort: radix too large.\n");
683 /* the computation of ncounts will fail */
684 return 0;
685 }
686
687 mask = 0;
688 ncounts = 1;
689 for (i=0; i<RADIX_SORT_LOGRADIX; i++) {
690 mask = (mask << 1) | 1;
691 ncounts = ncounts << 1;
692 }
693
694 assert(ncounts==RADIX_SORT_NCOUNTS);
695
696 y = (unsigned int*) taucs_malloc(n * sizeof(unsigned int));
697 if (!y) {
698 taucs_printf("radix sort: out of memory.\n");
699 return -1;
700 }
701
702 from = x;
703 to = y;
704
705 bits_sorted = 0;
706 while(bits_sorted < 8*sizeof(unsigned int)) {
707 for (i=0; i<ncounts; i++) counts[i] = 0;
708
709 for (i=0; i<n; i++) {
710 v = (from[i] >> bits_sorted) & mask;
711 assert(v < ncounts);
712 counts[v] ++;
713 }
714
715 partialsum = 0;
716 for (i=0; i<ncounts; i++) {
717 /*printf("<%d ",counts[i]);*/
718 next = counts[i];
719 counts[i] = partialsum;
720 /*printf("%d>\n",counts[i]);*/
721 partialsum = partialsum + next;
722 }
723
724 for (i=0; i<n; i++) {
725 v = (from[i] >> bits_sorted) & mask;
726 assert(counts[v] < n);
727 to[counts[v]] = from[i];
728 counts[v] ++;
729 }
730 /*
731 printf("===========\n");
732 for (i=0; i<n; i++) printf(">>%d>> %08x\n",bits_sorted,to[i]);
733 printf("===========\n");
734 */
735
736 bits_sorted += RADIX_SORT_LOGRADIX;
737 if (from == x) {
738 from = y;
739 to = x;
740 } else {
741 from = x;
742 to = y;
743 }
744 }
745
746 if (from == y)
747 for (i=0; i<n; i++) x[i] = y[i];
748
749 taucs_free(y);
750
751 return 0;
752 }
753 #endif
754
755 #endif /* TAUCS_CORE_GENERAL */
756 /*************************************************************/
757 /* create and free the factor object */
758 /*************************************************************/
759
760 #ifndef TAUCS_CORE_GENERAL
761
762 static supernodal_factor_matrix*
multifrontal_supernodal_create()763 multifrontal_supernodal_create()
764 {
765 supernodal_factor_matrix* L;
766
767 L = (supernodal_factor_matrix*) taucs_malloc(sizeof(supernodal_factor_matrix));
768 if (!L) return NULL;
769
770 #ifdef TAUCS_CORE_SINGLE
771 L->flags = TAUCS_SINGLE;
772 #endif
773
774 #ifdef TAUCS_CORE_DOUBLE
775 L->flags = TAUCS_DOUBLE;
776 #endif
777
778 #ifdef TAUCS_CORE_SCOMPLEX
779 L->flags = TAUCS_SCOMPLEX;
780 #endif
781
782 #ifdef TAUCS_CORE_DCOMPLEX
783 L->flags = TAUCS_DCOMPLEX;
784 #endif
785
786 L->uplo = 'l';
787 L->n = -1; /* unused */
788
789 L->sn_struct = NULL;
790 L->sn_size = NULL;
791 L->sn_up_size = NULL;
792 L->parent = NULL;
793 L->first_child = NULL;
794 L->next_child = NULL;
795 L->sn_blocks_ld = NULL;
796 L->sn_blocks = NULL;
797 L->up_blocks_ld = NULL;
798 L->up_blocks = NULL;
799
800 return L;
801 }
802
taucs_dtl(supernodal_factor_free)803 void taucs_dtl(supernodal_factor_free)(void* vL)
804 {
805 supernodal_factor_matrix* L = (supernodal_factor_matrix*) vL;
806 int sn;
807
808 if (!L) return;
809
810 taucs_free(L->parent);
811 taucs_free(L->first_child);
812 taucs_free(L->next_child);
813
814 taucs_free(L->sn_size);
815 taucs_free(L->sn_up_size);
816 taucs_free(L->sn_blocks_ld);
817 taucs_free(L->up_blocks_ld);
818
819 if (L->sn_struct)
820 for (sn=0; sn<L->n_sn; sn++)
821 taucs_free(L->sn_struct[sn]);
822
823 if (L->sn_blocks)
824 for (sn=0; sn<L->n_sn; sn++)
825 taucs_free(L->sn_blocks[sn]);
826
827 if (L->up_blocks)
828 for (sn=0; sn<L->n_sn; sn++)
829 taucs_free(L->up_blocks[sn]);
830
831 taucs_free(L->sn_struct);
832 taucs_free(L->sn_blocks);
833 taucs_free(L->up_blocks);
834
835 taucs_free(L);
836 }
837
taucs_dtl(supernodal_factor_free_numeric)838 void taucs_dtl(supernodal_factor_free_numeric)(void* vL)
839 {
840 supernodal_factor_matrix* L = (supernodal_factor_matrix*) vL;
841 int sn;
842
843 for (sn=0; sn<L->n_sn; sn++) {
844 taucs_free(L->sn_blocks[sn]);
845 L->sn_blocks[sn] = NULL;
846 taucs_free(L->up_blocks[sn]);
847 L->up_blocks[sn] = NULL;
848 }
849 }
850
851 taucs_ccs_matrix*
taucs_dtl(supernodal_factor_to_ccs)852 taucs_dtl(supernodal_factor_to_ccs)(void* vL)
853 {
854 supernodal_factor_matrix* L = (supernodal_factor_matrix*) vL;
855 taucs_ccs_matrix* C;
856 int n,nnz;
857 int i,j,ip,jp,sn,next;
858 taucs_datatype v;
859 int* len;
860
861 n = L->n;
862
863 len = (int*) taucs_malloc(n*sizeof(int));
864 if (!len) return NULL;
865
866 nnz = 0;
867 /*
868 for (sn=0; sn<L->n_sn; sn++) {
869 for (jp=0; jp<(L->sn_size)[sn]; jp++) {
870 j = (L->sn_struct)[sn][jp];
871 len[j] = (L->sn_up_size)[sn] - jp;
872 nnz += len[j];
873 }
874 }
875 */
876
877 for (sn=0; sn<L->n_sn; sn++) {
878 for (jp=0; jp<(L->sn_size)[sn]; jp++) {
879 j = (L->sn_struct)[sn][jp];
880 len[j] = 0;
881
882 for (ip=jp; ip<(L->sn_size)[sn]; ip++) {
883 i = (L->sn_struct)[sn][ ip ];
884 v = (L->sn_blocks)[sn][ jp*(L->sn_blocks_ld)[sn] + ip ];
885
886 if (taucs_re(v) || taucs_im(v)) {
887 len[j] ++;
888 nnz ++;
889 }
890 }
891 for (ip=(L->sn_size)[sn]; ip<(L->sn_up_size)[sn]; ip++) {
892 i = (L->sn_struct)[sn][ ip ];
893 v = (L->up_blocks)[sn][ jp*(L->up_blocks_ld)[sn] + (ip-(L->sn_size)[sn]) ];
894
895 if (taucs_re(v) || taucs_im(v)) {
896 len[j] ++;
897 nnz ++;
898 }
899 }
900 }
901 }
902
903
904 C = taucs_dtl(ccs_create)(n,n,nnz);
905 if (!C) {
906 taucs_free(len);
907 return NULL;
908 }
909
910 #ifdef TAUCS_CORE_SINGLE
911 C->flags = TAUCS_SINGLE;
912 #endif
913
914 #ifdef TAUCS_CORE_DOUBLE
915 C->flags = TAUCS_DOUBLE;
916 #endif
917
918 #ifdef TAUCS_CORE_SCOMPLEX
919 C->flags = TAUCS_SCOMPLEX;
920 #endif
921
922 #ifdef TAUCS_CORE_DCOMPLEX
923 C->flags = TAUCS_DCOMPLEX;
924 #endif
925
926 C->flags |= TAUCS_TRIANGULAR | TAUCS_LOWER;
927
928 (C->colptr)[0] = 0;
929 for (j=1; j<=n; j++) (C->colptr)[j] = (C->colptr)[j-1] + len[j-1];
930
931 taucs_free(len);
932
933 for (sn=0; sn<L->n_sn; sn++) {
934 for (jp=0; jp<(L->sn_size)[sn]; jp++) {
935 j = (L->sn_struct)[sn][jp];
936
937 next = (C->colptr)[j];
938
939 /*
940 memcpy((C->rowind) + next,
941 ((L->sn_struct)[sn]) + jp,
942 ((L->sn_up_size)[sn] - jp) * sizeof(int));
943 memcpy((C->taucs_values) + next,
944 ((L->sn_blocks)[sn]) + (jp*(L->sn_blocks_ld)[sn] + jp),
945 ((L->sn_size)[sn] - jp) * sizeof(taucs_datatype));
946 next += ((L->sn_size)[sn] - jp);
947 memcpy((C->taucs_values) + next,
948 ((L->up_blocks)[sn]) + jp*(L->up_blocks_ld)[sn],
949 ((L->sn_up_size)[sn] - (L->sn_size)[sn]) * sizeof(taucs_datatype));
950 */
951
952 for (ip=jp; ip<(L->sn_size)[sn]; ip++) {
953 i = (L->sn_struct)[sn][ ip ];
954 v = (L->sn_blocks)[sn][ jp*(L->sn_blocks_ld)[sn] + ip ];
955
956 if (!taucs_re(v) && !taucs_im(v)) continue;
957 /*if (v == 0.0) continue;*/
958
959 (C->rowind)[next] = i;
960 (C->taucs_values)[next] = v;
961 next++;
962 }
963 for (ip=(L->sn_size)[sn]; ip<(L->sn_up_size)[sn]; ip++) {
964 i = (L->sn_struct)[sn][ ip ];
965 v = (L->up_blocks)[sn][ jp*(L->up_blocks_ld)[sn] + (ip-(L->sn_size)[sn]) ];
966
967 if (!taucs_re(v) && !taucs_im(v)) continue;
968 /*if (v == 0.0) continue;*/
969
970 (C->rowind)[next] = i;
971 (C->taucs_values)[next] = v;
972 next++;
973 }
974 }
975 }
976
977 return C;
978 }
979
980 /* just get the diagonal of a supernodal factor, for Penny */
981
982 taucs_datatype*
taucs_dtl(supernodal_factor_get_diag)983 taucs_dtl(supernodal_factor_get_diag)(void* vL)
984 {
985 supernodal_factor_matrix* L = (supernodal_factor_matrix*) vL;
986 int j,ip,jp,sn;/*i,next omer*/
987 taucs_datatype v;
988 taucs_datatype* diag;
989
990 diag = (taucs_datatype*) taucs_malloc((L->n) * sizeof(taucs_datatype));
991 if (!diag) return NULL;
992
993 for (sn=0; sn<L->n_sn; sn++) {
994 for (jp=0; jp<(L->sn_size)[sn]; jp++) {
995 j = (L->sn_struct)[sn][jp];
996
997 ip=jp; /* we just want the diagonal */
998
999 v = (L->sn_blocks)[sn][ jp*(L->sn_blocks_ld)[sn] + ip ];
1000
1001 diag[ j ] = v;
1002 }
1003 }
1004
1005 return diag;
1006 }
1007
1008
1009 /*************************************************************/
1010 /* create and free frontal matrices */
1011 /*************************************************************/
1012
1013 static supernodal_frontal_matrix*
supernodal_frontal_create(int * firstcol_in_supernode,int sn_size,int n,int * rowind)1014 supernodal_frontal_create(int* firstcol_in_supernode,
1015 int sn_size,
1016 int n,
1017 int* rowind)
1018 {
1019 supernodal_frontal_matrix* tmp;
1020
1021 tmp = (supernodal_frontal_matrix*)taucs_malloc(sizeof(supernodal_frontal_matrix));
1022 if(tmp==NULL) return NULL;
1023
1024 tmp->sn_size = sn_size;
1025 tmp->n = n;
1026
1027 tmp->rowind = rowind;
1028
1029 tmp->n = n;
1030 tmp->sn_size = sn_size;
1031 tmp->up_size = n-sn_size;
1032
1033 tmp->sn_vertices = rowind;
1034 tmp->up_vertices = rowind + sn_size;
1035
1036 /* on some platforms, malloc(0) fails, so we avoid such calls */
1037
1038 tmp->SFM_F1 = tmp->SFM_F2 = tmp->SFM_U = NULL;
1039
1040 if (tmp->sn_size)
1041 tmp->SFM_F1 = (taucs_datatype*)taucs_calloc((tmp->sn_size)*(tmp->sn_size),sizeof(taucs_datatype));
1042
1043 if (tmp->sn_size && tmp->up_size)
1044 tmp->SFM_F2 = (taucs_datatype*)taucs_calloc((tmp->up_size)*(tmp->sn_size),sizeof(taucs_datatype));
1045
1046 if (tmp->up_size)
1047 tmp->SFM_U = (taucs_datatype*)taucs_calloc((tmp->up_size)*(tmp->up_size),sizeof(taucs_datatype));
1048
1049 if(( tmp->SFM_F1==NULL && tmp->sn_size)
1050 || (tmp->SFM_F2==NULL && tmp->sn_size && tmp->up_size)
1051 || (tmp->SFM_U ==NULL && tmp->up_size)) {
1052 taucs_free(tmp->SFM_U);
1053 taucs_free(tmp->SFM_F1);
1054 taucs_free(tmp->SFM_F2);
1055 taucs_free(tmp);
1056 return NULL;
1057 }
1058
1059 assert(tmp);
1060 return tmp;
1061 }
1062
supernodal_frontal_free(supernodal_frontal_matrix * to_del)1063 static void supernodal_frontal_free(supernodal_frontal_matrix* to_del)
1064 {
1065 /*
1066 SFM_F1 and SFM_F2 are moved to the factor,
1067 but this function may be called before they are
1068 moved.
1069 */
1070
1071
1072 if (to_del) {
1073 taucs_free(to_del->SFM_F1);
1074 taucs_free(to_del->SFM_F2);
1075 taucs_free(to_del->SFM_U);
1076 taucs_free(to_del);
1077 }
1078 }
1079
1080 /*************************************************************/
1081 /* factor a frontal matrix */
1082 /*************************************************************/
1083
1084 cilk
1085 static int
multifrontal_supernodal_front_factor(int sn,int * firstcol_in_supernode,int sn_size,taucs_ccs_matrix * A,supernodal_frontal_matrix * mtr,int * bitmap,supernodal_factor_matrix * snL)1086 multifrontal_supernodal_front_factor(int sn,
1087 int* firstcol_in_supernode,
1088 int sn_size,
1089 taucs_ccs_matrix* A,
1090 supernodal_frontal_matrix* mtr,
1091 int* bitmap,
1092 supernodal_factor_matrix* snL)
1093 {
1094 int i,j;
1095 int* ind;
1096 taucs_datatype* re;
1097 int INFO;
1098
1099 /* creating transform for real indices */
1100 for(i=0;i<mtr->sn_size;i++) bitmap[mtr->sn_vertices[i]] = i;
1101 for(i=0;i<mtr->up_size;i++) bitmap[mtr->up_vertices[i]] = mtr->sn_size + i;
1102
1103 /* adding sn_size column of A to first sn_size column of frontal matrix */
1104
1105 for(j=0;j<(mtr->sn_size);j++) {
1106 ind = &(A->rowind[A->colptr[*(firstcol_in_supernode+j)]]);
1107 re = &(A->taucs_values[A->colptr[*(firstcol_in_supernode+j)]]);
1108 for(i=0;
1109 i < A->colptr[*(firstcol_in_supernode+j)+1]
1110 - A->colptr[*(firstcol_in_supernode+j)];
1111 i++) {
1112 if (bitmap[ind[i]] < mtr->sn_size)
1113 mtr->SFM_F1[ (mtr->sn_size)*j + bitmap[ind[i]]] =
1114 taucs_add( mtr->SFM_F1[ (mtr->sn_size)*j + bitmap[ind[i]]] , re[i] );
1115 else
1116 mtr->SFM_F2[ (mtr->up_size)*j + bitmap[ind[i]] - mtr->sn_size] =
1117 taucs_add( mtr->SFM_F2[ (mtr->up_size)*j + bitmap[ind[i]] - mtr->sn_size] , re[i] );
1118 }
1119 }
1120
1121 /* we use the BLAS through the Fortran interface */
1122
1123 /* solving of lower triangular system for L */
1124 if (mtr->sn_size) {
1125 /*
1126 taucs_potrf ("LOWER",
1127 &(mtr->sn_size),
1128 mtr->SFM_F1,&(mtr->sn_size),
1129 &INFO);
1130 */
1131 spawn taucs_cilk_potrf ("LOWER",
1132 &(mtr->sn_size),
1133 mtr->SFM_F1,&(mtr->sn_size),
1134 &INFO);
1135 sync;
1136 }
1137
1138
1139 if (INFO) {
1140 taucs_printf("sivan %d %d\n",sn,sn_size);
1141 taucs_printf("\t\tLL^T Factorization: Matrix is not positive definite.\n");
1142 taucs_printf("\t\t nonpositive pivot in column %d\n",
1143 mtr->sn_vertices[INFO-1]);
1144 return -1;
1145 }
1146
1147 /* getting completion for found columns of L */
1148 if (mtr->up_size && mtr->sn_size) {
1149
1150 spawn taucs_cilk_trsm ("Right",
1151 "Lower",
1152 "Conjugate",
1153 "No unit diagonal",
1154 &(mtr->up_size),&(mtr->sn_size),
1155 &taucs_one_const,
1156 mtr->SFM_F1,&(mtr->sn_size),
1157 mtr->SFM_F2,&(mtr->up_size));
1158 sync;
1159 /*
1160 taucs_trsm ("Right",
1161 "Lower",
1162 "Conjugate",
1163 "No unit diagonal",
1164 &(mtr->up_size),&(mtr->sn_size),
1165 &taucs_one_const,
1166 mtr->SFM_F1,&(mtr->sn_size),
1167 mtr->SFM_F2,&(mtr->up_size));
1168 */
1169 }
1170
1171 (snL->sn_blocks )[sn] = mtr->SFM_F1;
1172 (snL->sn_blocks_ld)[sn] = mtr->sn_size;
1173
1174 (snL->up_blocks )[sn] = mtr->SFM_F2;
1175 (snL->up_blocks_ld)[sn] = mtr->up_size;
1176 /* printf("*** sn=%d up_ld=%d (%d)\n",sn,mtr->up_size,(snL->up_vertex_ptr)[sn+1] - (snL->up_vertex_ptr)[sn]);*/
1177
1178 /* computation of updated part of frontal matrix */
1179 if (mtr->up_size && mtr->sn_size) {
1180 spawn taucs_cilk_herk ("Lower",
1181 "No Conjugate",
1182 &(mtr->up_size),&(mtr->sn_size),
1183 &taucs_minusone_real_const,
1184 mtr->SFM_F2,&(mtr->up_size),
1185 &taucs_one_real_const,
1186 mtr->SFM_U, &(mtr->up_size));
1187 sync;
1188 }
1189
1190 mtr->SFM_F1 = NULL; /* so we don't free twice */
1191 mtr->SFM_F2 = NULL; /* so we don't free twice */
1192
1193 return 0;
1194 }
1195
1196 /*************************************************************/
1197 /* extend-add */
1198 /*************************************************************/
1199
1200 static void
multifrontal_supernodal_front_extend_add(supernodal_frontal_matrix * parent_mtr,supernodal_frontal_matrix * my_mtr,int * bitmap)1201 multifrontal_supernodal_front_extend_add(
1202 supernodal_frontal_matrix* parent_mtr,
1203 supernodal_frontal_matrix* my_mtr,
1204 int* bitmap)
1205 {
1206 int j,i,parent_i,parent_j;
1207 taucs_datatype v;
1208
1209 for(i=0;i<parent_mtr->sn_size;i++) bitmap[parent_mtr->sn_vertices[i]] = i;
1210 for(i=0;i<parent_mtr->up_size;i++) bitmap[parent_mtr->up_vertices[i]] = (parent_mtr->sn_size)+i;
1211
1212 /* extend add operation for update matrix */
1213 for(j=0;j<my_mtr->up_size;j++) {
1214 for(i=j;i<my_mtr->up_size;i++) {
1215 parent_j = bitmap[ my_mtr->up_vertices[j] ];
1216 parent_i = bitmap[ my_mtr->up_vertices[i] ];
1217 /* we could skip this if indices were sorted */
1218 if (parent_j>parent_i) {
1219 int tmp = parent_j;
1220 parent_j = parent_i;
1221 parent_i = tmp;
1222 }
1223
1224 v = (my_mtr->SFM_U)[(my_mtr->up_size)*j+i];
1225
1226 if (parent_j < parent_mtr->sn_size) {
1227 if (parent_i < parent_mtr->sn_size) {
1228 (parent_mtr->SFM_F1)[ (parent_mtr->sn_size)*parent_j + parent_i] =
1229 taucs_add( (parent_mtr->SFM_F1)[ (parent_mtr->sn_size)*parent_j + parent_i] , v );
1230 } else {
1231 (parent_mtr->SFM_F2)[ (parent_mtr->up_size)*parent_j + (parent_i-parent_mtr->sn_size)] =
1232 taucs_add( (parent_mtr->SFM_F2)[ (parent_mtr->up_size)*parent_j + (parent_i-parent_mtr->sn_size)] , v );
1233 }
1234 } else {
1235 (parent_mtr->SFM_U)[ (parent_mtr->up_size)*(parent_j-parent_mtr->sn_size) + (parent_i-parent_mtr->sn_size)] =
1236 taucs_add( (parent_mtr->SFM_U)[ (parent_mtr->up_size)*(parent_j-parent_mtr->sn_size) + (parent_i-parent_mtr->sn_size)] , v);
1237 }
1238 }
1239 }
1240 }
1241
1242 #endif /*#ifndef TAUCS_CORE_GENERAL*/
1243
1244 /*************************************************************/
1245 /* symbolic elimination */
1246 /*************************************************************/
1247
1248 #ifdef TAUCS_CORE_GENERAL
1249
1250 /* UNION FIND ROUTINES */
1251
uf_makeset(int * uf,int i)1252 static int uf_makeset(int* uf, int i) { uf[i] = i; return i; }
uf_find(int * uf,int i)1253 static int uf_find (int* uf, int i)
1254 {
1255 if (uf[i] != i)
1256 uf[i] = uf_find(uf,uf[i]);
1257 return uf[i];
1258 }
uf_union(int * uf,int s,int t)1259 static int uf_union (int* uf, int s, int t) {
1260 if (uf_find(uf,s) < uf_find(uf,t)) {
1261 uf[uf_find(uf,s)] = uf_find(uf,t);
1262 return (uf_find(uf,t));
1263 } else {
1264 uf[uf_find(uf,s)] = uf_find(uf,t);
1265 return (uf_find(uf,t));
1266 }
1267 }
1268
1269 static
recursive_postorder(int j,int first_child[],int next_child[],int postorder[],int ipostorder[],int * next)1270 void recursive_postorder(int j,
1271 int first_child[],
1272 int next_child[],
1273 int postorder[],
1274 int ipostorder[],
1275 int* next)
1276 {
1277 int c;
1278 for (c=first_child[j]; c != -1; c = next_child[c]) {
1279 /*printf("*** %d is child of %d\n",c,j);*/
1280 recursive_postorder(c,first_child,next_child,
1281 postorder,ipostorder,next);
1282 }
1283 /*printf(">>> j=%d next=%d\n",j,*next);*/
1284 if (postorder) postorder [*next] = j;
1285 if (ipostorder) ipostorder[j] = *next;
1286 (*next)++;
1287 }
1288
1289 #define GILBERT_NG_PEYTON_ANALYSIS_SUP
1290
1291 /* in a few tests the supernodal version seemed slower */
1292 #undef GILBERT_NG_PEYTON_ANALYSIS_SUP
1293
ordered_uf_makeset(int * uf,int i)1294 static int ordered_uf_makeset(int* uf, int i)
1295 {
1296 uf[i] = i;
1297 return i;
1298 }
ordered_uf_find(int * uf,int i)1299 static int ordered_uf_find (int* uf, int i)
1300 {
1301 if (uf[i] != i)
1302 uf[i] = uf_find(uf,uf[i]);
1303 return uf[i];
1304 }
ordered_uf_union(int * uf,int s,int t)1305 static int ordered_uf_union (int* uf, int s, int t)
1306 {
1307 assert(uf[t] == t);
1308 assert(uf[s] == s);
1309 assert(t > s);
1310 if (t > s) {
1311 uf[s] = t;
1312 return t;
1313 } else
1314 uf[t] = s;
1315 return s;
1316 }
1317
1318 static void
tree_level(int j,int isroot,int first_child[],int next_child[],int level[],int level_j)1319 tree_level(int j,
1320 int isroot,
1321 int first_child[],
1322 int next_child[],
1323 int level[],
1324 int level_j)
1325 {
1326 int c;
1327 if (!isroot) level[j] = level_j;
1328 for (c=first_child[j]; c != -1; c = next_child[c]) {
1329 tree_level(c,
1330 FALSE,
1331 first_child,
1332 next_child,
1333 level,
1334 level_j+1);
1335 }
1336 }
1337
1338 static void
tree_first_descendant(int j,int isroot,int first_child[],int next_child[],int ipostorder[],int first_descendant[])1339 tree_first_descendant(int j,
1340 int isroot,
1341 int first_child[],
1342 int next_child[],
1343 int ipostorder[],
1344 int first_descendant[])
1345 {
1346 int c;
1347 int fd = ipostorder[j];
1348 for (c=first_child[j]; c != -1; c = next_child[c]) {
1349 tree_first_descendant(c,
1350 FALSE,
1351 first_child,
1352 next_child,
1353 ipostorder,
1354 first_descendant);
1355 if (first_descendant[c] < fd) fd = first_descendant[c];
1356 }
1357 if (!isroot) first_descendant[j] = fd;
1358 }
1359
1360
1361 int
1362 taucs_ccs_etree(taucs_ccs_matrix* A,
1363 int* parent,
1364 int* l_colcount,
1365 int* l_rowcount,
1366 int* l_nnz);
1367
1368 int
1369 taucs_ccs_etree_liu(taucs_ccs_matrix* A,
1370 int* parent,
1371 int* l_colcount,
1372 int* l_rowcount,
1373 int* l_nnz);
1374
1375
1376
1377 static int
recursive_symbolic_elimination(int j,taucs_ccs_matrix * A,int first_child[],int next_child[],int * n_sn,int sn_size[],int sn_up_size[],int * sn_rowind[],int sn_first_child[],int sn_next_child[],int rowind[],int column_to_sn_map[],int map[],int do_order,int ipostorder[])1378 recursive_symbolic_elimination(int j,
1379 taucs_ccs_matrix* A,
1380 int first_child[],
1381 int next_child[],
1382 int* n_sn,
1383 int sn_size[],
1384 int sn_up_size[],
1385 int* sn_rowind[],
1386 int sn_first_child[],
1387 int sn_next_child[],
1388 int rowind[],
1389 int column_to_sn_map[],
1390 int map[],
1391 int do_order,
1392 int ipostorder[]
1393 )
1394 {
1395 int i,ip,c,c_sn;
1396 int in_previous_sn;
1397 int nnz = 0; /* just to suppress the warning */
1398
1399 for (c=first_child[j]; c != -1; c = next_child[c]) {
1400 if (recursive_symbolic_elimination(c,A,
1401 first_child,next_child,
1402 n_sn,
1403 sn_size,sn_up_size,sn_rowind,
1404 sn_first_child,sn_next_child,
1405 rowind, /* temporary */
1406 column_to_sn_map,
1407 map,
1408 do_order,ipostorder
1409 )
1410 == -1) return -1;
1411 }
1412
1413 in_previous_sn = 1;
1414 if (j == A->n)
1415 in_previous_sn = 0; /* this is not a real column */
1416 else if (first_child[j] == -1)
1417 in_previous_sn = 0; /* this is a leaf */
1418 else if (next_child[first_child[j]] != -1)
1419 in_previous_sn = 0; /* more than 1 child */
1420 else {
1421 /* check that the structure is nested */
1422 /* map contains child markers */
1423
1424 c=first_child[j];
1425 for (ip=(A->colptr)[j]; ip<(A->colptr)[j+1]; ip++) {
1426 i = (A->rowind)[ip];
1427 in_previous_sn = in_previous_sn && (map[i] == c);
1428 }
1429 }
1430
1431 if (in_previous_sn) {
1432 c = first_child[j];
1433 c_sn = column_to_sn_map[c];
1434 column_to_sn_map[j] = c_sn;
1435
1436 /* swap row indices so j is at the end of the */
1437 /* supernode, not in the update indices */
1438 for (ip=sn_size[c_sn]; ip<sn_up_size[c_sn]; ip++)
1439 if (sn_rowind[c_sn][ip] == j) break;
1440 assert(ip<sn_up_size[c_sn]);
1441 sn_rowind[c_sn][ip] = sn_rowind[c_sn][sn_size[c_sn]];
1442 sn_rowind[c_sn][sn_size[c_sn]] = j;
1443
1444 /* mark the nonzeros in the map */
1445 for (ip=sn_size[c_sn]; ip<sn_up_size[c_sn]; ip++)
1446 map[ sn_rowind[c_sn][ip] ] = j;
1447
1448 sn_size [c_sn]++;
1449
1450 return 0;
1451 }
1452
1453 /* we are in a new supernode */
1454
1455 if (j < A->n) {
1456 nnz = 1;
1457 rowind[0] = j;
1458 map[j] = j;
1459
1460 for (c=first_child[j]; c != -1; c = next_child[c]) {
1461 c_sn = column_to_sn_map[c];
1462 for (ip=sn_size[c_sn]; ip<sn_up_size[c_sn]; ip++) {
1463 i = sn_rowind[c_sn][ip];
1464 if (i > j && map[i] != j) { /* new row index */
1465 map[i] = j;
1466 rowind[nnz] = i;
1467 nnz++;
1468 }
1469 }
1470 }
1471
1472 for (ip=(A->colptr)[j]; ip<(A->colptr)[j+1]; ip++) {
1473 i = (A->rowind)[ip];
1474 if (map[i] != j) { /* new row index */
1475 map[i] = j;
1476 rowind[nnz] = i;
1477 nnz++;
1478 }
1479 }
1480 }
1481
1482 /*printf("children of sn %d: ",*n_sn);*/
1483 for (c=first_child[j]; c != -1; c = next_child[c]) {
1484 c_sn = column_to_sn_map[c];
1485 /*printf("%d ",c_sn);*/
1486 if (c==first_child[j])
1487 sn_first_child[*n_sn] = c_sn;
1488 else {
1489 sn_next_child[ c_sn ] = sn_first_child[*n_sn];
1490 sn_first_child[*n_sn] = c_sn;
1491 }
1492 }
1493 /*printf("\n");*/
1494
1495 if (j < A->n) {
1496 column_to_sn_map[j] = *n_sn;
1497 sn_size [*n_sn] = 1;
1498 sn_up_size[*n_sn] = nnz;
1499 sn_rowind [*n_sn] = (int*) taucs_malloc(nnz * sizeof(int));
1500 if (!( sn_rowind [*n_sn] )) return -1;
1501 for (ip=0; ip<nnz; ip++) sn_rowind[*n_sn][ip] = rowind[ip];
1502 if (do_order) {
1503 /* Sivan and Vladimir: we think that we can sort in */
1504 /* column order, not only in etree postorder. */
1505 /*
1506 radix_sort(sn_rowind [*n_sn],nnz);
1507 qsort(sn_rowind [*n_sn],nnz,sizeof(int),compare_ints);
1508 */
1509 compare_indirect_map = ipostorder;
1510 qsort(sn_rowind [*n_sn],nnz,sizeof(int),compare_indirect_ints);
1511 }
1512 assert(sn_rowind [*n_sn][0] == j);
1513 (*n_sn)++;
1514 }
1515
1516 return 0;
1517 }
1518
1519 /* count zeros and nonzeros in a supernode to compute the */
1520 /* utility of merging fundamental supernodes. */
1521
1522 typedef struct {
1523 double zeros;
1524 double nonzeros;
1525 } znz;
1526
1527 static znz
recursive_amalgamate_supernodes(int sn,int * n_sn,int sn_size[],int sn_up_size[],int * sn_rowind[],int sn_first_child[],int sn_next_child[],int rowind[],int column_to_sn_map[],int map[],int do_order,int ipostorder[])1528 recursive_amalgamate_supernodes(int sn,
1529 int* n_sn,
1530 int sn_size[],
1531 int sn_up_size[],
1532 int* sn_rowind[],
1533 int sn_first_child[],
1534 int sn_next_child[],
1535 int rowind[],
1536 int column_to_sn_map[],
1537 int map[],
1538 int do_order,
1539 int ipostorder[]
1540 )
1541 {
1542 int i,ip,c_sn,gc_sn;
1543 /*int i,ip,c,c_sn,gc_sn;*/
1544 int nnz;
1545 int nchildren /*, ichild*/; /* number of children, child index */
1546 znz* c_znz = NULL;
1547 znz sn_znz, merged_znz;
1548 /*int zero_count = 0;*/
1549 int new_sn_size, new_sn_up_size;
1550
1551 sn_znz.zeros = 0.0;
1552 sn_znz.nonzeros = (double) (((sn_up_size[sn] - sn_size[sn]) * sn_size[sn])
1553 + (sn_size[sn] * (sn_size[sn] + 1))/2);
1554
1555 if (sn_first_child[sn] == -1) { /* leaf */
1556 return sn_znz;
1557 }
1558
1559 nchildren = 0;
1560 for (c_sn=sn_first_child[sn]; c_sn != -1; c_sn = sn_next_child[c_sn])
1561 nchildren++;
1562
1563 /* c_znz = (znz*) alloca(nchildren * sizeof(znz));*/
1564 c_znz = (znz*) taucs_malloc(nchildren * sizeof(znz));
1565 assert(c_znz);
1566
1567 /*printf("supernode %d out of %d\n",sn,*n_sn);*/
1568
1569 /* merge the supernode with its children! */
1570
1571 i = 0;
1572 for (c_sn=sn_first_child[sn]; c_sn != -1; c_sn = sn_next_child[c_sn]) {
1573 c_znz[i] =
1574 recursive_amalgamate_supernodes(c_sn,
1575 n_sn,
1576 sn_size,sn_up_size,sn_rowind,
1577 sn_first_child,sn_next_child,
1578 rowind, /* temporary */
1579 column_to_sn_map,
1580 map,
1581 do_order,ipostorder
1582 );
1583 assert(c_znz[i].zeros + c_znz[i].nonzeros ==
1584 (double) (((sn_up_size[c_sn] - sn_size[c_sn]) * sn_size[c_sn])
1585 + (sn_size[c_sn] * (sn_size[c_sn] + 1))/2 ));
1586 i++;
1587 }
1588
1589 merged_znz.nonzeros = sn_znz.nonzeros;
1590 merged_znz.zeros = sn_znz.zeros;
1591
1592 for (i=0; i<nchildren; i++) {
1593 merged_znz.nonzeros += (c_znz[i]).nonzeros;
1594 merged_znz.zeros += (c_znz[i]).zeros;
1595 }
1596
1597 taucs_free(c_znz);
1598
1599 /* printf("supernode %d out of %d (continuing)\n",sn,*n_sn);*/
1600
1601 /* should we merge the supernode with its children? */
1602
1603 nnz = 0;
1604 for (c_sn=sn_first_child[sn]; c_sn != -1; c_sn = sn_next_child[c_sn]) {
1605 for (ip=0; ip<sn_size[c_sn]; ip++) {
1606 i = sn_rowind[c_sn][ip];
1607 assert( map[i] != sn );
1608 map[i] = sn;
1609 rowind[nnz] = i;
1610 nnz++;
1611 }
1612 }
1613
1614 for (ip=0; ip<sn_size[sn]; ip++) {
1615 i = sn_rowind[sn][ip];
1616 assert( map[i] != sn );
1617 map[i] = sn;
1618 rowind[nnz] = i;
1619 nnz++;
1620 }
1621
1622 new_sn_size = nnz;
1623
1624 for (c_sn=sn_first_child[sn]; c_sn != -1; c_sn = sn_next_child[c_sn]) {
1625 for (ip=sn_size[c_sn]; ip<sn_up_size[c_sn]; ip++) {
1626 i = sn_rowind[c_sn][ip];
1627 if (map[i] != sn) { /* new row index */
1628 map[i] = sn;
1629 rowind[nnz] = i;
1630 nnz++;
1631 }
1632 }
1633 }
1634
1635 for (ip=sn_size[sn]; ip<sn_up_size[sn]; ip++) {
1636 i = sn_rowind[sn][ip];
1637 if (map[i] != sn) { /* new row index */
1638 map[i] = sn;
1639 rowind[nnz] = i;
1640 nnz++;
1641 }
1642 }
1643
1644 new_sn_up_size = nnz;
1645
1646 if (do_order) {
1647 compare_indirect_map = ipostorder;
1648 qsort(rowind,nnz,sizeof(int),compare_indirect_ints);
1649 }
1650
1651 /* determine whether we should merge the supernode and its children */
1652
1653 {
1654 int n;
1655 double* zcount = NULL;
1656
1657 n = 0;
1658 for (ip=0; ip<nnz; ip++) {
1659 i = rowind[ip];
1660 if (i >= n) n = i+1;
1661 }
1662
1663 /*zcount = (double*) alloca(n * sizeof(double));*/
1664 zcount = (double*) taucs_malloc(n * sizeof(double));
1665 assert(zcount);
1666
1667 for (ip=0; ip<new_sn_size; ip++) {
1668 i = rowind[ip]; assert(i<n);
1669 zcount[i] = (double) (ip+1);
1670 }
1671 for (ip=new_sn_size; ip<new_sn_up_size; ip++) {
1672 i = rowind[ip]; assert(i<n);
1673 zcount[i] = (double) new_sn_size;
1674 }
1675
1676 /*
1677 for (ip=0; ip<new_sn_up_size; ip++)
1678 printf("row %d zcount = %.0f\n",rowind[ip],zcount[rowind[ip]]);
1679 */
1680
1681 for (c_sn=sn_first_child[sn]; c_sn != -1; c_sn = sn_next_child[c_sn]) {
1682 for (ip=0; ip<sn_size[c_sn]; ip++) {
1683 i = sn_rowind[c_sn][ip]; assert(i<n);
1684 zcount[i] -= (double) (ip+1);
1685 }
1686 for (ip=sn_size[c_sn]; ip<sn_up_size[c_sn]; ip++) {
1687 i = sn_rowind[c_sn][ip]; assert(i<n);
1688 zcount[i] -= (double) sn_size[c_sn];
1689 }
1690 }
1691
1692 for (ip=0; ip<sn_size[sn]; ip++) {
1693 i = sn_rowind[sn][ip]; assert(i<n);
1694 zcount[i] -= (double) (ip+1);
1695 }
1696 for (ip=sn_size[sn]; ip<sn_up_size[sn]; ip++) {
1697 i = sn_rowind[sn][ip]; assert(i<n);
1698 zcount[i] -= (double) sn_size[sn];
1699 }
1700
1701 /*
1702 for (ip=0; ip<new_sn_up_size; ip++)
1703 printf("ROW %d zcount = %.0f\n",rowind[ip],zcount[rowind[ip]]);
1704 printf("zeros before merging %.0f\n",merged_znz.zeros);
1705 */
1706
1707 for (ip=0; ip<new_sn_up_size; ip++) {
1708 i = rowind[ip]; assert(i<n);
1709 assert(zcount[i] >= 0.0);
1710 merged_znz.zeros += zcount[i];
1711 }
1712
1713 /*printf("zeros after merging %.0f\n",merged_znz.zeros);*/
1714
1715 /* voodoo constants (need some kind of a utility function */
1716 if ((new_sn_size < 16)
1717 ||
1718 ((sn_size[sn] < 50) && (merged_znz.zeros < 0.5 * merged_znz.nonzeros))
1719 ||
1720 ((sn_size[sn] < 250) && (merged_znz.zeros < 0.25 * merged_znz.nonzeros))
1721 ||
1722 ((sn_size[sn] < 500) && (merged_znz.zeros < 0.10 * merged_znz.nonzeros))
1723 ||
1724 (merged_znz.zeros < 0.05 * merged_znz.nonzeros)
1725 ) {
1726 /*
1727 taucs_printf("merging sn %d, zeros (%f) vs nonzeros (%f)\n",
1728 sn,merged_znz.zeros,merged_znz.nonzeros);
1729 */
1730 } else {
1731 /*
1732 taucs_printf("sn %d, too many zeros (%f) vs nonzeros (%f)\n",
1733 sn,merged_znz.zeros,merged_znz.nonzeros);
1734 printf("returning without merging\n");
1735 */
1736 taucs_free(zcount);
1737 return sn_znz;
1738 }
1739
1740 taucs_free(zcount);
1741 }
1742
1743 /* now merge the children lists */
1744
1745 sn_size[sn] = new_sn_size;
1746 sn_up_size[sn] = new_sn_up_size;
1747 sn_rowind[sn] = (int*) taucs_realloc(sn_rowind[sn],
1748 new_sn_up_size * sizeof(int));
1749 for (ip=0; ip<new_sn_up_size; ip++) sn_rowind[sn][ip] = rowind[ip];
1750
1751 /* printf("supernode %d out of %d (merging)\n",sn,*n_sn);*/
1752
1753 nchildren = 0;
1754 for (c_sn=sn_first_child[sn]; c_sn != -1; c_sn = sn_next_child[c_sn]) {
1755 for (ip=0; ip<sn_size[c_sn]; ip++) {
1756 i = (sn_rowind[c_sn])[ip];
1757 assert(column_to_sn_map[i] == c_sn);
1758 column_to_sn_map[i] = sn;
1759 }
1760
1761 for (gc_sn=sn_first_child[c_sn]; gc_sn != -1; gc_sn = sn_next_child[gc_sn]) {
1762 rowind[nchildren] = gc_sn;
1763 nchildren++;
1764 }
1765 }
1766
1767 /* free the children's rowind vectors */
1768 for (c_sn=sn_first_child[sn]; c_sn != -1; c_sn = sn_next_child[c_sn]) {
1769 taucs_free( sn_rowind[c_sn] );
1770 sn_rowind[c_sn] = NULL;
1771 sn_size[c_sn] = 0;
1772 sn_up_size[c_sn] = 0;
1773 }
1774
1775 sn_first_child[sn] = -1;
1776 for (i=0; i<nchildren; i++) {
1777 sn_next_child[ rowind[i] ] = sn_first_child[sn];
1778 sn_first_child[sn] = rowind[i];
1779 }
1780
1781 /*
1782 printf("supernode %d out of %d (done)\n",sn,*n_sn);
1783 printf("returning, merging\n");
1784 */
1785 return merged_znz;
1786 }
1787 #endif /* #ifdef TAUCS_CORE_GENERAL */
1788
1789
1790 #ifndef TAUCS_CORE_GENERAL
1791
1792
1793 /*************************************************************/
1794 /* factor routines */
1795 /*************************************************************/
1796
extend_add_wrapper(supernodal_frontal_matrix * child_matrix,supernodal_frontal_matrix ** my_matrix_ptr,int is_root,int * v,int sn_size,int sn_up_size,int * rowind,int * bitmap,int * fail)1797 static void extend_add_wrapper(supernodal_frontal_matrix * child_matrix,
1798 supernodal_frontal_matrix ** my_matrix_ptr,
1799 int is_root,
1800 int *v,
1801 int sn_size,
1802 int sn_up_size,
1803 int * rowind,
1804 int * bitmap,
1805 int * fail) {
1806
1807 if (*fail) {
1808 if (*my_matrix_ptr)
1809 supernodal_frontal_free(*my_matrix_ptr);
1810 return;
1811 }
1812
1813 if (!is_root) {
1814 if (!(*my_matrix_ptr)) {
1815 *my_matrix_ptr = supernodal_frontal_create(v,sn_size,sn_up_size,rowind);
1816 if (!(*my_matrix_ptr)) {
1817 *fail = TRUE;
1818 supernodal_frontal_free(child_matrix);
1819 return;
1820 }
1821 }
1822 multifrontal_supernodal_front_extend_add(*my_matrix_ptr,child_matrix,bitmap);
1823 }
1824
1825 /* moved outside "if !is_root"; Sivan 27 Feb 2002 */
1826 supernodal_frontal_free(child_matrix);
1827 }
1828
1829 cilk
1830 static supernodal_frontal_matrix*
recursive_multifrontal_supernodal_factor_llt(int sn,int is_root,int ** bitmaps,taucs_ccs_matrix * A,supernodal_factor_matrix * snL,int * fail)1831 recursive_multifrontal_supernodal_factor_llt(int sn, /* this supernode */
1832 int is_root, /* is v the root? */
1833 #if 1
1834 int** bitmaps,
1835 #else
1836 int* bitmap,
1837 #endif
1838 taucs_ccs_matrix* A,
1839 supernodal_factor_matrix* snL,
1840 int* fail)
1841 {
1842 supernodal_frontal_matrix* my_matrix=NULL;
1843 /*supernodal_frontal_matrix* child_matrix=NULL;*/
1844 int child;
1845 int* v;
1846 int sn_size;
1847 int* first_child = snL->first_child;
1848 int* next_child = snL->next_child;
1849
1850 #ifdef TAUCS_CILK
1851 /* Inlet for syncronization */
1852 inlet void extend_add_inlet(supernodal_frontal_matrix * child_matrix) {
1853
1854 if (!is_root) {
1855 if (!(my_matrix)) {
1856 my_matrix = supernodal_frontal_create(v,sn_size,
1857 snL->sn_up_size[sn],snL->sn_struct[sn]);
1858
1859 if (!(my_matrix)) {
1860 *fail = TRUE;
1861 supernodal_frontal_free(child_matrix);
1862 return;
1863 }
1864 }
1865 multifrontal_supernodal_front_extend_add(my_matrix,child_matrix,bitmaps[Self]);
1866 }
1867
1868 /* moved outside "if !is_root"; Sivan 27 Feb 2002 */
1869 supernodal_frontal_free(child_matrix);
1870 /*
1871 The following approach is not working correctly because nothing is guaranteed about different procedure instances atomcity:
1872
1873 extend_add_wrapper(child_matrix,&my_matrix,is_root,v,sn_size,snL->sn_up_size[sn],snL->sn_struct[sn],bitmaps[Self],fail);
1874 */
1875 }
1876 #endif
1877
1878 /* Sivan fixed a bug 25/2/2003: v was set even at the root, */
1879 /* but this element of sn_struct was not allocated. */
1880
1881 if (!is_root) {
1882 sn_size = snL->sn_size[sn];
1883 v = &( snL->sn_struct[sn][0] );
1884 } else {
1885 sn_size = -1;
1886 v = NULL; /* not used */
1887 }
1888
1889 for (child = first_child[sn]; child != -1; child = next_child[child]) {
1890 /* original non-cilk code: */
1891 /*
1892 child_matrix =
1893 recursive_multifrontal_supernodal_factor_llt(child,
1894 FALSE,
1895 bitmap,
1896 A,snL,fail);
1897 */
1898
1899 #ifdef TAUCS_CILK
1900 extend_add_inlet(spawn recursive_multifrontal_supernodal_factor_llt(child,
1901 FALSE,
1902 bitmaps,
1903 A,snL,fail));
1904 #else
1905 extend_add_wrapper(recursive_multifrontal_supernodal_factor_llt(child,
1906 FALSE,
1907 bitmaps,
1908 A,snL,fail),
1909 &my_matrix,
1910 is_root,
1911 v,
1912 sn_size,
1913 snL->sn_up_size[sn],
1914 snL->sn_struct[sn],
1915 bitmaps[Self],
1916 fail);
1917 #endif
1918
1919
1920 if (*fail) {
1921 if (my_matrix) supernodal_frontal_free(my_matrix);
1922 return NULL;
1923 }
1924
1925 #if 0
1926 if (!is_root) {
1927 if (!my_matrix) {
1928 my_matrix = supernodal_frontal_create(v,sn_size,
1929 snL->sn_up_size[sn],
1930 snL->sn_struct[sn]);
1931 if (!my_matrix) {
1932 *fail = TRUE;
1933 supernodal_frontal_free(child_matrix);
1934 return NULL;
1935 }
1936 }
1937 multifrontal_supernodal_front_extend_add(my_matrix,child_matrix,bitmap);
1938 }
1939 /* moved outside "if !is_root"; Sivan 27 Feb 2002 */
1940 supernodal_frontal_free(child_matrix);
1941 #endif /* 0, old pre-cilk code */
1942 }
1943 sync;
1944
1945 /* in case we have no children, we allocate now */
1946 if (!is_root && !my_matrix) {
1947 my_matrix = supernodal_frontal_create(v,sn_size,
1948 snL->sn_up_size[sn],
1949 snL->sn_struct[sn]);
1950 if (!my_matrix) {
1951 *fail = TRUE;
1952 return NULL;
1953 }
1954 }
1955
1956 if(!is_root) {
1957 int rc;
1958 rc = spawn multifrontal_supernodal_front_factor(sn,
1959 v,sn_size,
1960 A,
1961 my_matrix,
1962 #if 1
1963 bitmaps[Self],
1964 #else
1965 bitmap,
1966 #endif
1967 snL);
1968 sync;
1969 if (rc) {
1970 /* nonpositive pivot */
1971 *fail = TRUE;
1972 supernodal_frontal_free(my_matrix);
1973 return NULL;
1974 }
1975 }
1976 return my_matrix;
1977 }
1978
1979 cilk
1980 void*
taucs_dtl(ccs_factor_llt_mf)1981 taucs_dtl(ccs_factor_llt_mf)(taucs_ccs_matrix* A)
1982 {
1983 void* p;
1984
1985 p = spawn taucs_dtl(ccs_factor_llt_mf_maxdepth)(A,0);
1986 sync;
1987
1988 return p;
1989 }
1990
1991 cilk
1992 static void
recursive_multifrontal_supernodal_factor_llt_caller(int n_sn,int is_root,taucs_ccs_matrix * A,supernodal_factor_matrix * snL,int * fail)1993 recursive_multifrontal_supernodal_factor_llt_caller(int n_sn, /* this supernode */
1994 int is_root, /* is v the root? */
1995 taucs_ccs_matrix* A,
1996 supernodal_factor_matrix* snL,
1997 int* fail)
1998 {
1999 int** maps;
2000 int i,j;
2001 supernodal_frontal_matrix* always_null;
2002
2003 maps = (int**)taucs_malloc(Cilk_active_size*sizeof(int*));
2004 if (!maps) {
2005 taucs_supernodal_factor_free(snL);
2006 assert(0); return;
2007 /*return NULL;*/
2008 }
2009
2010 for (i=0; i < Cilk_active_size; i++) {
2011 maps[i] = (int*)taucs_malloc((A->n+1)*sizeof(int));
2012 if (!maps[i]) {
2013 for (j=0; j < i ; j++)
2014 taucs_free(maps[j]);
2015 taucs_free(maps);
2016 taucs_supernodal_factor_free(snL);
2017 assert(0); return;
2018 /*return NULL;*/
2019 }
2020 }
2021
2022 /*#ifdef TAUCS_CILK */
2023 #if 0
2024 context = Cilk_init(&argc,argv);
2025 always_null = EXPORT(recursive_multifrontal_supernodal_factor_llt)(context,
2026 n_sn,
2027 TRUE,
2028 maps,
2029 A,snL,fail);
2030 Cilk_terminate(context);
2031 #else
2032 always_null = spawn recursive_multifrontal_supernodal_factor_llt(n_sn,
2033 TRUE,
2034 maps,
2035 A,snL,fail);
2036 sync;
2037 #endif
2038
2039 for(i=0;i<Cilk_active_size;i++)
2040 taucs_free(maps[i]);
2041 taucs_free(maps);
2042
2043 /*
2044 always_null = spawn recursive_multifrontal_supernodal_factor_llt((L->n_sn),
2045 TRUE,
2046 map,
2047 A,L,&fail);
2048 */
2049 }
2050
2051 cilk
2052 void*
taucs_dtl(ccs_factor_llt_mf_maxdepth)2053 taucs_dtl(ccs_factor_llt_mf_maxdepth)(taucs_ccs_matrix* A,int max_depth)
2054 {
2055 supernodal_factor_matrix* L;
2056 #if 1
2057 #else
2058 int* map;
2059 #endif
2060 int fail;
2061 double wtime, ctime;
2062
2063 wtime = taucs_wtime();
2064 ctime = taucs_ctime();
2065
2066 L = multifrontal_supernodal_create();
2067 if (!L) return NULL;
2068
2069 #ifdef TAUCS_CORE_COMPLEX
2070 fail = taucs_ccs_symbolic_elimination(A,L,
2071 TRUE /* sort, to avoid complex conjuation */,
2072 max_depth);
2073 #else
2074 fail = taucs_ccs_symbolic_elimination(A,L,
2075 FALSE /* don't sort row indices */ ,
2076 max_depth);
2077 #endif
2078 if (fail == -1) {
2079 taucs_supernodal_factor_free(L);
2080 return NULL;
2081 }
2082
2083 wtime = taucs_wtime()-wtime;
2084 ctime = taucs_ctime()-ctime;
2085 taucs_printf("\t\tSymbolic Analysis = % 10.3f seconds (%.3f cpu)\n",
2086 wtime,ctime);
2087
2088 #if 1
2089 #else
2090 map = (int*)taucs_malloc((A->n+1)*sizeof(int));
2091 if (!map) {
2092 taucs_supernodal_factor_free(L);
2093 return NULL;
2094 }
2095 #endif
2096
2097 wtime = taucs_wtime();
2098 ctime = taucs_ctime();
2099
2100 fail = FALSE;
2101 spawn recursive_multifrontal_supernodal_factor_llt_caller((L->n_sn),
2102 TRUE,
2103 A,L,&fail);
2104 sync;
2105
2106 wtime = taucs_wtime()-wtime;
2107 ctime = taucs_ctime()-ctime;
2108 taucs_printf("\t\tSupernodal Multifrontal LL^T = % 10.3f seconds (%.3f cpu)\n",
2109 wtime,ctime);
2110
2111 #if 1
2112 #else
2113 taucs_free(map);
2114 #endif
2115
2116 if (fail) {
2117 taucs_supernodal_factor_free(L);
2118 return NULL;
2119 }
2120
2121 return (void*) L;
2122 }
2123
2124 /*************************************************************/
2125 /* symbolic-numeric routines */
2126 /*************************************************************/
2127
2128 void*
taucs_dtl(ccs_factor_llt_symbolic)2129 taucs_dtl(ccs_factor_llt_symbolic)(taucs_ccs_matrix* A)
2130 {
2131 return taucs_dtl(ccs_factor_llt_symbolic_maxdepth)(A,0);
2132 }
2133
2134 void*
taucs_dtl(ccs_factor_llt_symbolic_maxdepth)2135 taucs_dtl(ccs_factor_llt_symbolic_maxdepth)(taucs_ccs_matrix* A, int max_depth)
2136 {
2137 supernodal_factor_matrix* L;
2138 int fail;
2139 double wtime, ctime;
2140
2141 wtime = taucs_wtime();
2142 ctime = taucs_ctime();
2143
2144 L = multifrontal_supernodal_create();
2145 if (!L) return NULL;
2146
2147 #ifdef TAUCS_CORE_COMPLEX
2148 fail = taucs_ccs_symbolic_elimination(A,L,
2149 TRUE /* sort, to avoid complex conjuation */,
2150 max_depth);
2151 #else
2152 fail = taucs_ccs_symbolic_elimination(A,L,
2153 FALSE /* don't sort row indices */ ,
2154 max_depth);
2155 #endif
2156
2157 if (fail == -1) {
2158 taucs_supernodal_factor_free(L);
2159 return NULL;
2160 }
2161
2162 wtime = taucs_wtime()-wtime;
2163 ctime = taucs_ctime()-ctime;
2164 taucs_printf("\t\tSymbolic Analysis = % 10.3f seconds (%.3f cpu)\n",
2165 wtime,ctime);
2166 return L;
2167 }
2168
2169 cilk
2170 int
taucs_dtl(ccs_factor_llt_numeric)2171 taucs_dtl(ccs_factor_llt_numeric)(taucs_ccs_matrix* A,void* vL)
2172 {
2173 supernodal_factor_matrix* L = (supernodal_factor_matrix*) vL;
2174 int* map;
2175 int fail;
2176 double wtime, ctime;
2177
2178 map = (int*)taucs_malloc((A->n+1)*sizeof(int));
2179
2180 wtime = taucs_wtime();
2181 ctime = taucs_ctime();
2182
2183 /* XXX: sivan, we don't need map */
2184 fail = FALSE;
2185 spawn recursive_multifrontal_supernodal_factor_llt_caller((L->n_sn),
2186 TRUE,
2187 A,L,&fail);
2188 sync;
2189 /*
2190 recursive_multifrontal_supernodal_factor_llt((L->n_sn),
2191 TRUE,
2192 map,
2193 A,L,&fail);
2194 */
2195
2196 wtime = taucs_wtime()-wtime;
2197 ctime = taucs_ctime()-ctime;
2198 taucs_printf("\t\tSupernodal Multifrontal LL^T = % 10.3f seconds (%.3f cpu)\n",
2199 wtime,ctime);
2200
2201 taucs_free(map);
2202
2203 if (fail) {
2204 taucs_supernodal_factor_free_numeric(L);
2205 return -1;
2206 }
2207
2208 return 0;
2209 }
2210
2211 /*************************************************************/
2212 /* left-looking factor routines */
2213 /*************************************************************/
2214
2215 static void
recursive_leftlooking_supernodal_update(int J,int K,int bitmap[],taucs_datatype * dense_update_matrix,taucs_ccs_matrix * A,supernodal_factor_matrix * L)2216 recursive_leftlooking_supernodal_update(int J,int K,
2217 int bitmap[],
2218 taucs_datatype* dense_update_matrix,
2219 taucs_ccs_matrix* A,
2220 supernodal_factor_matrix* L)
2221 {
2222 int i,j,ir;
2223 int child;
2224 int* first_child = L->first_child;
2225 int* next_child = L->next_child;
2226 int sn_size_father = (L->sn_size)[J];
2227 int sn_up_size_father = (L->sn_up_size)[J];
2228 int sn_size_child = (L->sn_size)[K];
2229 int sn_up_size_child = (L->sn_up_size)[K];
2230 int exist_upd=0;
2231 int first_row = 0;
2232 int row_count=0;
2233 int PK,M,N,LDA,LDB,LDC;
2234
2235 /*
2236 for(i=0;i<sn_size_father;i++) {
2237 bitmap[L->sn_struct[J][i]]=i+1;
2238 }
2239
2240 for(i=sn_size_father;i<sn_up_size_father;i++)
2241 bitmap[L->sn_struct[J][i]] = i - sn_size_father + 1;
2242 */
2243
2244 for(i=sn_size_child;i<sn_up_size_child;i++)
2245 /* is this row index included in the columns of sn J? */
2246 if(bitmap[L->sn_struct[K][i]]
2247 && L->sn_struct[K][i] <= L->sn_struct[J][sn_size_father-1]) {
2248 if(!exist_upd) first_row = i;
2249 row_count++;
2250 exist_upd = 1;
2251 /*taucs_printf("update from K = %d to J = %d \n",K,J);*/
2252 /* loop over columns of sn K */
2253
2254 /* for(j=0;j<sn_size_child;j++)
2255 for(ir=i;ir<sn_up_size_child;ir++)
2256 if( L->sn_struct[K][ir] <= L->sn_struct[J][sn_size_father-1]){
2257 L->sn_blocks[J][ (bitmap[L->sn_struct[K][i]]-1)*(L->sn_blocks_ld[J])+(bitmap[L->sn_struct[K][ir]]-1)] -= L->up_blocks[K][j*(L->up_blocks_ld[K])+ir-sn_size_child]* L->up_blocks[K][j*L->up_blocks_ld[K]+i-sn_size_child];
2258 taucs_printf("sn_block: L[%d,%d] = %lf\n",(bitmap[L->sn_struct[K][ir]]-1),(bitmap[L->sn_struct[K][i]]-1),L->sn_blocks[J][ (bitmap[L->sn_struct[K][i]]-1)*(L->sn_blocks_ld[J])+(bitmap[L->sn_struct[K][ir]]-1)]);}
2259 else{
2260 L->up_blocks[J][ (bitmap[L->sn_struct[K][i]]-1)*(L->up_blocks_ld[J])+(bitmap[L->sn_struct[K][ir]]-1)] -= L->up_blocks[K][j*L->up_blocks_ld[K]+ir-sn_size_child]* L->up_blocks[K][j*L->up_blocks_ld[K]+i-sn_size_child];
2261 taucs_printf("up_block: L[%d,%d] = %lf\n",(bitmap[L->sn_struct[K][ir]]-1),(bitmap[L->sn_struct[K][i]]-1),L->up_blocks[J][ (bitmap[L->sn_struct[K][i]]-1)*(L->up_blocks_ld[J])+(bitmap[L->sn_struct[K][ir]]-1)]);
2262 }*/
2263 }
2264
2265 if(exist_upd){
2266 LDA = LDB = (L->up_blocks_ld)[K];
2267 M = sn_up_size_child - first_row ; /* +-1 ? */
2268 LDC = sn_up_size_father;
2269 N = row_count;
2270 PK = L->sn_size[K];
2271
2272 /* The GEMM code computes on the upper triangle of the trapezoidal
2273 matrix, which is junk. */
2274 /*
2275 taucs_gemm ("No Conjugate",
2276 "Conjugate",
2277 &M,&N,&PK,
2278 &taucs_one_const,
2279 &(L->up_blocks[K][first_row-sn_size_child]),&LDA,
2280 &(L->up_blocks[K][first_row-sn_size_child]),&LDB,
2281 &taucs_zero_const,
2282 dense_update_matrix,&LDC);
2283 */
2284
2285 /* This is the HERK+GEMM fix by Elad */
2286 taucs_herk ("Lower",
2287 "No Conjugate",
2288 &N,&PK,
2289 &taucs_one_real_const,
2290 &(L->up_blocks[K][first_row-sn_size_child]),&LDA,
2291 &taucs_zero_real_const,
2292 dense_update_matrix,&LDC);
2293
2294 if(M-N > 0)
2295 {
2296 int newM = M - N;
2297
2298 taucs_gemm ("No Conjugate",
2299 "Conjugate",
2300 &newM,&N,&PK,
2301 &taucs_one_const,
2302 &(L->up_blocks[K][first_row-sn_size_child+N]),&LDA,
2303 &(L->up_blocks[K][first_row-sn_size_child]),&LDB,
2304 &taucs_zero_const,
2305 dense_update_matrix+N,&LDC);
2306 }
2307 /* end of GEMM/HERK+GEMM fix */
2308
2309 /*for(j=0;j<row_count;j++)
2310 for(ir=0;ir<sn_up_size_father;ir++)
2311 taucs_printf("dense[%d,%d] = %lf\n",ir,j,dense_update_matrix[j*LDC+ir]);
2312 */
2313
2314 for(j=0;j<row_count;j++)
2315 for(ir=j;ir<row_count;ir++){
2316
2317 #if 0
2318 L->sn_blocks[J][(bitmap[L->sn_struct[K][first_row+j]]-1)*sn_size_father+(bitmap[L->sn_struct[K][first_row+ir]]-1)] -= dense_update_matrix[j*LDC+ir];
2319 #endif
2320
2321 L->sn_blocks[J][(bitmap[L->sn_struct[K][first_row+j]]-1)*sn_size_father+(bitmap[L->sn_struct[K][first_row+ir]]-1)] =
2322 taucs_sub( L->sn_blocks[J][(bitmap[L->sn_struct[K][first_row+j]]-1)*sn_size_father+(bitmap[L->sn_struct[K][first_row+ir]]-1)] , dense_update_matrix[j*LDC+ir]);
2323
2324 /* taucs_printf("sn_block: L[%d,%d] = %lf\n",(bitmap[L->sn_struct[K][first_row+ir]]-1),(bitmap[L->sn_struct[K][first_row+j]]-1),L->sn_blocks[J][(bitmap[L->sn_struct[K][first_row+j]]-1)*sn_size_father+(bitmap[L->sn_struct[K][first_row+ir]]-1)]);*/
2325
2326 }
2327
2328 for(j=0;j<row_count;j++)
2329 for(ir=row_count;ir<M;ir++){
2330 #if 0
2331 L->up_blocks[J][(bitmap[L->sn_struct[K][first_row+j]]-1)*(L->up_blocks_ld)[J]+(bitmap[L->sn_struct[K][ir+first_row]]-1)] -= dense_update_matrix[j*LDC+ir];
2332 #endif
2333
2334 L->up_blocks[J][(bitmap[L->sn_struct[K][first_row+j]]-1)*(L->up_blocks_ld)[J]+(bitmap[L->sn_struct[K][ir+first_row]]-1)] =
2335 taucs_sub( L->up_blocks[J][(bitmap[L->sn_struct[K][first_row+j]]-1)*(L->up_blocks_ld)[J]+(bitmap[L->sn_struct[K][ir+first_row]]-1)] , dense_update_matrix[j*LDC+ir]);
2336
2337 /* taucs_printf("up_block: L[%d,%d] = %lf\n",(bitmap[L->sn_struct[K][ir+first_row]]-1),(bitmap[L->sn_struct[K][first_row+j]]-1),L->up_blocks[J][(bitmap[L->sn_struct[K][first_row+j]]-1)*(L->up_blocks_ld)[J]+(bitmap[L->sn_struct[K][ir+first_row]]-1)]);*/
2338
2339 }
2340 /*
2341 for(i=0;i<sn_up_size_father;i++)
2342 bitmap[L->sn_struct[J][i]]=0;
2343 */
2344
2345 for (child = first_child[K]; child != -1; child = next_child[child]) {
2346 recursive_leftlooking_supernodal_update(J,child,
2347 bitmap,dense_update_matrix,
2348 A,L);
2349 }
2350 }
2351
2352 /*
2353 else
2354 for(i=0;i<sn_up_size_father;i++)
2355 bitmap[L->sn_struct[J][i]]=0;
2356 */
2357
2358 }
2359
2360 static int
leftlooking_supernodal_front_factor(int sn,int * bitmap,taucs_ccs_matrix * A,supernodal_factor_matrix * L)2361 leftlooking_supernodal_front_factor(int sn,
2362 int* bitmap,
2363 taucs_ccs_matrix* A,
2364 supernodal_factor_matrix* L)
2365 {
2366 int ip,jp;
2367 int* ind;
2368 taucs_datatype* re;
2369 int INFO;
2370
2371 int sn_size = (L->sn_size)[sn];
2372 int up_size = (L->sn_up_size)[sn] - (L->sn_size)[sn];
2373
2374 /* creating transform for real indices */
2375 for(ip=0;ip<(L->sn_up_size)[sn];ip++) bitmap[(L->sn_struct)[sn][ip]] = ip;
2376
2377 for(jp=0;jp<sn_size;jp++) {
2378 ind = &(A->rowind[A->colptr[ (L->sn_struct)[sn][jp] ]]);
2379 re = &(A->taucs_values[A->colptr[ (L->sn_struct)[sn][jp] ]]);
2380 for(ip=0;
2381 ip < A->colptr[ (L->sn_struct)[sn][jp] + 1 ]
2382 - A->colptr[ (L->sn_struct)[sn][jp] ];
2383 ip++) {
2384 if (bitmap[ind[ip]] < sn_size)
2385 (L->sn_blocks)[sn][ (L->sn_blocks_ld)[sn]*jp + bitmap[ind[ip]]] =
2386 taucs_add( (L->sn_blocks)[sn][ (L->sn_blocks_ld)[sn]*jp + bitmap[ind[ip]]] , re[ip] );
2387 else
2388 (L->up_blocks)[sn][ (L->up_blocks_ld)[sn]*jp + bitmap[ind[ip]] - sn_size] =
2389 taucs_add( (L->up_blocks)[sn][ (L->up_blocks_ld)[sn]*jp + bitmap[ind[ip]] - sn_size] , re[ip] );
2390 }
2391 }
2392
2393 /* we use the BLAS through the Fortran interface */
2394
2395 /* solving of lower triangular system for L */
2396 if (sn_size)
2397 taucs_potrf ("LOWER",
2398 &sn_size,
2399 (L->sn_blocks)[sn],&((L->sn_blocks_ld)[sn]),
2400 &INFO);
2401
2402 if (INFO) {
2403 taucs_printf("\t\tLL^T Factorization: Matrix is not positive definite.\n");
2404 taucs_printf("\t\t nonpositive pivot in column %d\n",
2405 (L->sn_struct)[INFO-1]);
2406 return -1;
2407 }
2408
2409 /* getting completion for found columns of L */
2410 if (up_size && sn_size)
2411 taucs_trsm ("Right",
2412 "Lower",
2413 "Conjugate",
2414 "No unit diagonal",
2415 &up_size,&sn_size,
2416 &taucs_one_const,
2417 (L->sn_blocks)[sn],&((L->sn_blocks_ld)[sn]),
2418 (L->up_blocks)[sn],&((L->up_blocks_ld)[sn]));
2419
2420 return 0;
2421 }
2422
2423 static int
recursive_leftlooking_supernodal_factor_llt(int sn,int is_root,int * bitmap,int * indmap,taucs_ccs_matrix * A,supernodal_factor_matrix * L)2424 recursive_leftlooking_supernodal_factor_llt(int sn, /* this supernode */
2425 int is_root, /* is v the root? */
2426 int* bitmap,
2427 int* indmap,
2428 taucs_ccs_matrix* A,
2429 supernodal_factor_matrix* L)
2430 {
2431 int child;
2432 int sn_size;
2433 int* first_child = L->first_child;
2434 int* next_child = L->next_child;
2435 taucs_datatype* dense_update_matrix = NULL;
2436
2437 if (!is_root)
2438 sn_size = L->sn_size[sn];
2439 else
2440 sn_size = -1;
2441
2442 if (!is_root) {
2443 (L->sn_blocks )[sn] = (L->up_blocks )[sn] = NULL;
2444 if (L->sn_size[sn]) {
2445 (L->sn_blocks )[sn] = (taucs_datatype*)taucs_calloc(((L->sn_size)[sn])*((L->sn_size)[sn]),
2446 sizeof(taucs_datatype));
2447 if (!((L->sn_blocks)[sn])) return -1; /* the caller will free L */
2448 }
2449 (L->sn_blocks_ld)[sn] = (L->sn_size )[sn];
2450
2451 if (((L->sn_up_size)[sn] - (L->sn_size)[sn]) && (L->sn_size)[sn]) {
2452 (L->up_blocks )[sn] = (taucs_datatype*)taucs_calloc(((L->sn_up_size)[sn]-(L->sn_size)[sn])
2453 *((L->sn_size)[sn]),sizeof(taucs_datatype));
2454 if (!((L->up_blocks)[sn])) return -1; /* the caller will free L */
2455 }
2456 (L->up_blocks_ld)[sn] = (L->sn_up_size)[sn]-(L->sn_size)[sn];
2457 }
2458
2459 for (child = first_child[sn]; child != -1; child = next_child[child]) {
2460 if (recursive_leftlooking_supernodal_factor_llt(child,
2461 FALSE,
2462 bitmap,
2463 indmap,
2464 A,L)
2465 == -1 ) {
2466 taucs_free(dense_update_matrix);
2467 return -1;
2468 }
2469
2470 if (!is_root) {
2471 if (!dense_update_matrix) {
2472 dense_update_matrix =
2473 (taucs_datatype*) taucs_calloc((L->sn_up_size)[sn]*(L->sn_size)[sn],sizeof(taucs_datatype));
2474 if (!dense_update_matrix) return -1; /* caller will free L */
2475 }
2476
2477 /* prepare the bitmap. Moved out of the recusive
2478 update procedure 20/1/2003. Sivan and Elad */
2479
2480 {
2481 int i;
2482 int J = sn;
2483 int sn_size_father = (L->sn_size)[J];
2484 int sn_up_size_father = (L->sn_up_size)[J];
2485
2486 for(i=0;i<sn_size_father;i++)
2487 bitmap[L->sn_struct[J][i]]=i+1;
2488 for(i=sn_size_father;i<sn_up_size_father;i++)
2489 bitmap[L->sn_struct[J][i]] = i - sn_size_father + 1;
2490 }
2491
2492 recursive_leftlooking_supernodal_update(sn,child,
2493 bitmap,dense_update_matrix,
2494 A,L);
2495
2496 {
2497 int i;
2498 int J = sn;
2499 int sn_size_father = (L->sn_size)[J];
2500 int sn_up_size_father = (L->sn_up_size)[J];
2501
2502 for(i=0;i<sn_size_father;i++)
2503 bitmap[L->sn_struct[J][i]]=0;
2504 for(i=0;i<sn_up_size_father;i++)
2505 bitmap[L->sn_struct[J][i]]=0;
2506 }
2507
2508 }
2509 }
2510 taucs_free(dense_update_matrix);
2511
2512 if(!is_root) {
2513 if (leftlooking_supernodal_front_factor(sn,
2514 indmap,
2515 A,
2516 L)) {
2517 return -1; /* nonpositive pivot */
2518 }
2519 }
2520
2521 return 0;
2522 }
2523
2524
2525 void*
taucs_dtl(ccs_factor_llt_ll)2526 taucs_dtl(ccs_factor_llt_ll)(taucs_ccs_matrix* A)
2527 {
2528 return taucs_dtl(ccs_factor_llt_ll_maxdepth)(A,0);
2529 }
2530
2531 void*
taucs_dtl(ccs_factor_llt_ll_maxdepth)2532 taucs_dtl(ccs_factor_llt_ll_maxdepth)(taucs_ccs_matrix* A,int max_depth)
2533 {
2534 supernodal_factor_matrix* L;
2535 int* map;
2536 int *map2;
2537 double wtime, ctime;
2538 int fail;
2539
2540 wtime = taucs_wtime();
2541 ctime = taucs_ctime();
2542
2543 L = multifrontal_supernodal_create();
2544 if (!L) return NULL;
2545
2546 fail = taucs_ccs_symbolic_elimination(A,L,
2547 TRUE /* sort row indices */,
2548 max_depth);
2549
2550 wtime = taucs_wtime()-wtime;
2551 ctime = taucs_ctime()-ctime;
2552 taucs_printf("\t\tSymbolic Analysis = % 10.3f seconds (%.3f cpu)\n",
2553 wtime,ctime);
2554
2555 map = (int*)taucs_malloc((A->n+1)*sizeof(int));
2556 map2 = (int*)taucs_calloc((A->n+1),sizeof(int));
2557
2558 if (fail == -1 || !map || !map2) {
2559 taucs_supernodal_factor_free(L);
2560 taucs_free(map2);
2561 taucs_free(map);
2562 return NULL;
2563 }
2564
2565 wtime = taucs_wtime();
2566 ctime = taucs_ctime();
2567
2568 if (recursive_leftlooking_supernodal_factor_llt((L->n_sn),
2569 TRUE,
2570 map2,
2571 map,
2572 A,L)
2573 == -1) {
2574 taucs_supernodal_factor_free(L);
2575 taucs_free(map);
2576 taucs_free(map2);
2577 return NULL;
2578 }
2579
2580 wtime = taucs_wtime()-wtime;
2581 ctime = taucs_ctime()-ctime;
2582 taucs_printf("\t\tSupernodal Left-Looking LL^T = % 10.3f seconds (%.3f cpu)\n",
2583 wtime,ctime);
2584
2585 taucs_free(map);
2586 taucs_free(map2);
2587
2588 return (void*) L;
2589 }
2590
2591
2592 /*************************************************************/
2593 /* supernodal solve routines */
2594 /*************************************************************/
2595
2596 static void
recursive_supernodal_solve_l(int sn,int is_root,int * first_child,int * next_child,int ** sn_struct,int * sn_sizes,int * sn_up_sizes,int * sn_blocks_ld,taucs_datatype * sn_blocks[],int * up_blocks_ld,taucs_datatype * up_blocks[],taucs_datatype x[],taucs_datatype b[],taucs_datatype t[])2597 recursive_supernodal_solve_l(int sn, /* this supernode */
2598 int is_root, /* is v the root? */
2599 int* first_child, int* next_child,
2600 int** sn_struct, int* sn_sizes, int* sn_up_sizes,
2601 int* sn_blocks_ld,taucs_datatype* sn_blocks[],
2602 int* up_blocks_ld,taucs_datatype* up_blocks[],
2603 taucs_datatype x[], taucs_datatype b[],
2604 taucs_datatype t[])
2605 {
2606 int child;
2607 int sn_size; /* number of rows/columns in the supernode */
2608 int up_size; /* number of rows that this supernode updates */
2609 int ione = 1;
2610
2611 taucs_datatype* xdense;
2612 taucs_datatype* bdense;
2613 double flops;
2614 int i;/*ip,j,jp omer*/
2615
2616 for (child = first_child[sn]; child != -1; child = next_child[child]) {
2617 recursive_supernodal_solve_l(child,
2618 FALSE,
2619 first_child,next_child,
2620 sn_struct,sn_sizes,sn_up_sizes,
2621 sn_blocks_ld,sn_blocks,
2622 up_blocks_ld,up_blocks,
2623 x,b,t);
2624 }
2625
2626 if(!is_root) {
2627
2628 sn_size = sn_sizes[sn];
2629 up_size = sn_up_sizes[sn] - sn_sizes[sn];
2630
2631 flops = ((double)sn_size)*((double)sn_size)
2632 + 2.0*((double)sn_size)*((double)up_size);
2633
2634 if (flops > BLAS_FLOPS_CUTOFF) {
2635 xdense = t;
2636 bdense = t + sn_size;
2637
2638 for (i=0; i<sn_size; i++)
2639 xdense[i] = b[ sn_struct[ sn ][ i ] ];
2640 for (i=0; i<up_size; i++)
2641 bdense[i] = taucs_zero;
2642
2643 taucs_trsm ("Left",
2644 "Lower",
2645 "No Conjugate",
2646 "No unit diagonal",
2647 &sn_size,&ione,
2648 &taucs_one_const,
2649 sn_blocks[sn],&(sn_blocks_ld[sn]),
2650 xdense ,&sn_size);
2651
2652 if (up_size > 0 && sn_size > 0) {
2653 taucs_gemm ("No Conjugate","No Conjugate",
2654 &up_size, &ione, &sn_size,
2655 &taucs_one_const,
2656 up_blocks[sn],&(up_blocks_ld[sn]),
2657 xdense ,&sn_size,
2658 &taucs_zero_const,
2659 bdense ,&up_size);
2660 }
2661
2662 for (i=0; i<sn_size; i++)
2663 x[ sn_struct[ sn][ i ] ] = xdense[i];
2664 for (i=0; i<up_size; i++)
2665 /*b[ sn_struct[ sn ][ sn_size + i ] ] -= bdense[i];*/
2666 b[ sn_struct[ sn ][ sn_size + i ] ] =
2667 taucs_sub( b[ sn_struct[ sn ][ sn_size + i ] ] , bdense[i] );
2668
2669 #if 1
2670 }
2671 #else
2672 } else if (sn_size > SOLVE_DENSE_CUTOFF) {
2673
2674 xdense = t;
2675 bdense = t + sn_size;
2676
2677 for (i=0; i<sn_size; i++)
2678 xdense[i] = b[ sn_struct[ sn ][ i ] ];
2679 for (i=0; i<up_size; i++)
2680 bdense[i] = 0.0;
2681
2682 for (jp=0; jp<sn_size; jp++) {
2683 xdense[jp] = xdense[jp] / sn_blocks[sn][ sn_blocks_ld[sn]*jp + jp];
2684
2685 for (ip=jp+1; ip<sn_size; ip++) {
2686 xdense[ip] -= xdense[jp] * sn_blocks[sn][ sn_blocks_ld[sn]*jp + ip];
2687 }
2688 }
2689
2690 for (jp=0; jp<sn_size; jp++) {
2691 for (ip=0; ip<up_size; ip++) {
2692 bdense[ip] += xdense[jp] * up_blocks[sn][ up_blocks_ld[sn]*jp + ip];
2693 }
2694 }
2695
2696 for (i=0; i<sn_size; i++)
2697 x[ sn_struct[ sn][ i ] ] = xdense[i];
2698 for (i=0; i<up_size; i++)
2699 b[ sn_struct[ sn ][ sn_size + i ] ] -= bdense[i];
2700
2701 } else {
2702
2703 for (jp=0; jp<sn_size; jp++) {
2704 j = sn_struct[sn][jp];
2705 x[j] = b[j] / sn_blocks[sn][ sn_blocks_ld[sn]*jp + jp];
2706 for (ip=jp+1; ip<sn_size; ip++) {
2707 i = sn_struct[sn][ip];
2708 b[i] -= x[j] * sn_blocks[sn][ sn_blocks_ld[sn]*jp + ip];
2709 }
2710
2711 for (ip=0; ip<up_size; ip++) {
2712 i = sn_struct[sn][sn_size + ip];
2713 b[i] -= x[j] * up_blocks[sn][ up_blocks_ld[sn]*jp + ip];
2714 }
2715 }
2716
2717 }
2718 #endif
2719 }
2720 }
2721
2722 static void
recursive_supernodal_solve_lt(int sn,int is_root,int * first_child,int * next_child,int ** sn_struct,int * sn_sizes,int * sn_up_sizes,int * sn_blocks_ld,taucs_datatype * sn_blocks[],int * up_blocks_ld,taucs_datatype * up_blocks[],taucs_datatype x[],taucs_datatype b[],taucs_datatype t[])2723 recursive_supernodal_solve_lt(int sn, /* this supernode */
2724 int is_root, /* is v the root? */
2725 int* first_child, int* next_child,
2726 int** sn_struct, int* sn_sizes, int* sn_up_sizes,
2727 int* sn_blocks_ld,taucs_datatype* sn_blocks[],
2728 int* up_blocks_ld,taucs_datatype* up_blocks[],
2729 taucs_datatype x[], taucs_datatype b[],
2730 taucs_datatype t[])
2731 {
2732 int child;
2733 int sn_size; /* number of rows/columns in the supernode */
2734 int up_size; /* number of rows that this supernode updates */
2735 int ione = 1;
2736
2737 taucs_datatype* xdense;
2738 taucs_datatype* bdense;
2739 double flops;
2740 int i;/*ip,j,jp omer*/
2741
2742 if(!is_root) {
2743
2744 sn_size = sn_sizes[sn];
2745 up_size = sn_up_sizes[sn]-sn_sizes[sn];
2746
2747 flops = ((double)sn_size)*((double)sn_size)
2748 + 2.0*((double)sn_size)*((double)up_size);
2749
2750 if (flops > BLAS_FLOPS_CUTOFF) {
2751
2752 bdense = t;
2753 xdense = t + sn_size;
2754
2755 for (i=0; i<sn_size; i++)
2756 bdense[i] = b[ sn_struct[ sn][ i ] ];
2757 for (i=0; i<up_size; i++)
2758 xdense[i] = x[ sn_struct[sn][sn_size+i] ];
2759
2760 if (up_size > 0 && sn_size > 0)
2761 taucs_gemm ("Conjugate","No Conjugate",
2762 &sn_size, &ione, &up_size,
2763 &taucs_minusone_const,
2764 up_blocks[sn],&(up_blocks_ld[sn]),
2765 xdense ,&up_size,
2766 &taucs_one_const,
2767 bdense ,&sn_size);
2768
2769 taucs_trsm ("Left",
2770 "Lower",
2771 "Conjugate",
2772 "No unit diagonal",
2773 &sn_size,&ione,
2774 &taucs_one_const,
2775 sn_blocks[sn],&(sn_blocks_ld[sn]),
2776 bdense ,&sn_size);
2777
2778 for (i=0; i<sn_size; i++)
2779 x[ sn_struct[ sn][ i ] ] = bdense[i];
2780 #if 1
2781 }
2782 #else
2783 } else if (sn_size > SOLVE_DENSE_CUTOFF) {
2784
2785 bdense = t;
2786 xdense = t + sn_size;
2787
2788 for (i=0; i<sn_size; i++)
2789 bdense[i] = b[ sn_struct[ sn][ i ] ];
2790 for (i=0; i<up_size; i++)
2791 xdense[i] = x[ sn_struct[sn][sn_size+i] ];
2792
2793 for (ip=sn_size-1; ip>=0; ip--) {
2794 for (jp=0; jp<up_size; jp++) {
2795 bdense[ip] -= xdense[jp] * up_blocks[sn][ up_blocks_ld[sn]*ip + jp];
2796 }
2797 }
2798
2799 for (ip=sn_size-1; ip>=0; ip--) {
2800 for (jp=sn_size-1; jp>ip; jp--) {
2801 bdense[ip] -= bdense[jp] * sn_blocks[sn][ sn_blocks_ld[sn]*ip + jp];
2802 }
2803 bdense[ip] = bdense[ip] / sn_blocks[sn][ sn_blocks_ld[sn]*ip + ip];
2804 }
2805
2806 for (i=0; i<sn_size; i++)
2807 x[ sn_struct[ sn][ i ] ] = bdense[i];
2808
2809 } else {
2810
2811 for (ip=sn_size-1; ip>=0; ip--) {
2812 i = sn_struct[sn][ip];
2813
2814 for (jp=0; jp<up_size; jp++) {
2815 j = sn_struct[sn][sn_size + jp];
2816 b[i] -= x[j] * up_blocks[sn][ up_blocks_ld[sn]*ip + jp];
2817 }
2818
2819 for (jp=sn_size-1; jp>ip; jp--) {
2820 j = sn_struct[sn][jp];
2821 b[i] -= x[j] * sn_blocks[sn][ sn_blocks_ld[sn]*ip + jp];
2822 }
2823 x[i] = b[i] / sn_blocks[sn][ sn_blocks_ld[sn]*ip + ip];
2824
2825 }
2826
2827 }
2828 #endif
2829
2830 }
2831
2832 for (child = first_child[sn]; child != -1; child = next_child[child]) {
2833 recursive_supernodal_solve_lt(child,
2834 FALSE,
2835 first_child,next_child,
2836 sn_struct,sn_sizes,sn_up_sizes,
2837 sn_blocks_ld,sn_blocks,
2838 up_blocks_ld,up_blocks,
2839 x,b,t);
2840 }
2841 }
2842
2843
2844 int
taucs_dtl(supernodal_solve_llt)2845 taucs_dtl(supernodal_solve_llt)(void* vL, void* vx, void* vb)
2846 {
2847 supernodal_factor_matrix* L = (supernodal_factor_matrix*) vL;
2848 taucs_datatype* x = (taucs_datatype*) vx;
2849 taucs_datatype* b = (taucs_datatype*) vb;
2850 taucs_datatype* y;
2851 taucs_datatype* t; /* temporary vector */
2852 int i;
2853
2854 y = taucs_malloc((L->n) * sizeof(taucs_datatype));
2855 t = taucs_malloc((L->n) * sizeof(taucs_datatype));
2856 if (!y || !t) {
2857 taucs_free(y);
2858 taucs_free(t);
2859 taucs_printf("multifrontal_supernodal_solve_llt: out of memory\n");
2860 return -1;
2861 }
2862
2863 for (i=0; i<L->n; i++) x[i] = b[i];
2864
2865 recursive_supernodal_solve_l (L->n_sn,
2866 TRUE, /* this is the root */
2867 L->first_child, L->next_child,
2868 L->sn_struct,L->sn_size,L->sn_up_size,
2869 L->sn_blocks_ld, L->sn_blocks,
2870 L->up_blocks_ld, L->up_blocks,
2871 y, x, t);
2872
2873 recursive_supernodal_solve_lt(L->n_sn,
2874 TRUE, /* this is the root */
2875 L->first_child, L->next_child,
2876 L->sn_struct,L->sn_size,L->sn_up_size,
2877 L->sn_blocks_ld, L->sn_blocks,
2878 L->up_blocks_ld, L->up_blocks,
2879 x, y, t);
2880
2881 taucs_free(y);
2882 taucs_free(t);
2883
2884 return 0;
2885 }
2886 #endif /*#ifndef TAUCS_CORE_GENERAL*/
2887
2888 /*************************************************************/
2889 /* generic interfaces to user-callable routines */
2890 /*************************************************************/
2891
2892 #ifdef TAUCS_CORE_GENERAL
2893
2894 cilk
2895 void*
taucs_ccs_factor_llt_mf_maxdepth(taucs_ccs_matrix * A,int max_depth)2896 taucs_ccs_factor_llt_mf_maxdepth(taucs_ccs_matrix* A,int max_depth)
2897 {
2898 void* p= NULL;
2899
2900 #ifdef TAUCS_DOUBLE_IN_BUILD
2901 if (A->flags & TAUCS_DOUBLE)
2902 p = spawn taucs_dccs_factor_llt_mf_maxdepth(A,max_depth);
2903 #endif
2904
2905 #ifdef TAUCS_SINGLE_IN_BUILD
2906 if (A->flags & TAUCS_SINGLE)
2907 p = spawn taucs_sccs_factor_llt_mf_maxdepth(A,max_depth);
2908 #endif
2909
2910 #ifdef TAUCS_DCOMPLEX_IN_BUILD
2911 if (A->flags & TAUCS_DCOMPLEX)
2912 p = spawn taucs_zccs_factor_llt_mf_maxdepth(A,max_depth);
2913 #endif
2914
2915 #ifdef TAUCS_SCOMPLEX_IN_BUILD
2916 if (A->flags & TAUCS_SCOMPLEX)
2917 p = spawn taucs_cccs_factor_llt_mf_maxdepth(A,max_depth);
2918 #endif
2919
2920 sync;
2921 return p;
2922 }
2923
2924 void*
taucs_ccs_factor_llt_ll_maxdepth(taucs_ccs_matrix * A,int max_depth)2925 taucs_ccs_factor_llt_ll_maxdepth(taucs_ccs_matrix* A,int max_depth)
2926 {
2927
2928 #ifdef TAUCS_DOUBLE_IN_BUILD
2929 if (A->flags & TAUCS_DOUBLE)
2930 return taucs_dccs_factor_llt_ll_maxdepth(A,max_depth);
2931 #endif
2932
2933 #ifdef TAUCS_SINGLE_IN_BUILD
2934 if (A->flags & TAUCS_SINGLE)
2935 return taucs_sccs_factor_llt_ll_maxdepth(A,max_depth);
2936 #endif
2937
2938 #ifdef TAUCS_DCOMPLEX_IN_BUILD
2939 if (A->flags & TAUCS_DCOMPLEX)
2940 return taucs_zccs_factor_llt_ll_maxdepth(A,max_depth);
2941 #endif
2942
2943 #ifdef TAUCS_SCOMPLEX_IN_BUILD
2944 if (A->flags & TAUCS_SCOMPLEX)
2945 return taucs_cccs_factor_llt_ll_maxdepth(A,max_depth);
2946 #endif
2947
2948 assert(0);
2949 return NULL;
2950 }
2951
taucs_ccs_factor_llt_symbolic_maxdepth(taucs_ccs_matrix * A,int max_depth)2952 void* taucs_ccs_factor_llt_symbolic_maxdepth(taucs_ccs_matrix* A,int max_depth)
2953 {
2954
2955 #ifdef TAUCS_DOUBLE_IN_BUILD
2956 if (A->flags & TAUCS_DOUBLE)
2957 return taucs_dccs_factor_llt_symbolic_maxdepth(A,max_depth);
2958 #endif
2959
2960 #ifdef TAUCS_SINGLE_IN_BUILD
2961 if (A->flags & TAUCS_SINGLE)
2962 return taucs_sccs_factor_llt_symbolic_maxdepth(A,max_depth);
2963 #endif
2964
2965 #ifdef TAUCS_DCOMPLEX_IN_BUILD
2966 if (A->flags & TAUCS_DCOMPLEX)
2967 return taucs_zccs_factor_llt_symbolic_maxdepth(A,max_depth);
2968 #endif
2969
2970 #ifdef TAUCS_SCOMPLEX_IN_BUILD
2971 if (A->flags & TAUCS_SCOMPLEX)
2972 return taucs_cccs_factor_llt_symbolic_maxdepth(A,max_depth);
2973 #endif
2974
2975 assert(0);
2976 return NULL;
2977 }
2978
2979 cilk
2980 void*
taucs_ccs_factor_llt_mf(taucs_ccs_matrix * A)2981 taucs_ccs_factor_llt_mf(taucs_ccs_matrix* A)
2982 {
2983 void* p = NULL;
2984 #ifdef TAUCS_DOUBLE_IN_BUILD
2985 if (A->flags & TAUCS_DOUBLE)
2986 p = spawn taucs_dccs_factor_llt_mf(A);
2987 #endif
2988
2989 #ifdef TAUCS_SINGLE_IN_BUILD
2990 if (A->flags & TAUCS_SINGLE)
2991 p = spawn taucs_sccs_factor_llt_mf(A);
2992 #endif
2993
2994 #ifdef TAUCS_DCOMPLEX_IN_BUILD
2995 if (A->flags & TAUCS_DCOMPLEX)
2996 p = spawn taucs_zccs_factor_llt_mf(A);
2997 #endif
2998
2999 #ifdef TAUCS_SCOMPLEX_IN_BUILD
3000 if (A->flags & TAUCS_SCOMPLEX)
3001 p = spawn taucs_cccs_factor_llt_mf(A);
3002 #endif
3003
3004 sync;
3005 return p;
3006 }
3007
taucs_ccs_factor_llt_ll(taucs_ccs_matrix * A)3008 void* taucs_ccs_factor_llt_ll(taucs_ccs_matrix* A)
3009 {
3010
3011 #ifdef TAUCS_DOUBLE_IN_BUILD
3012 if (A->flags & TAUCS_DOUBLE)
3013 return taucs_dccs_factor_llt_ll(A);
3014 #endif
3015
3016 #ifdef TAUCS_SINGLE_IN_BUILD
3017 if (A->flags & TAUCS_SINGLE)
3018 return taucs_sccs_factor_llt_ll(A);
3019 #endif
3020
3021 #ifdef TAUCS_DCOMPLEX_IN_BUILD
3022 if (A->flags & TAUCS_DCOMPLEX)
3023 return taucs_zccs_factor_llt_ll(A);
3024 #endif
3025
3026 #ifdef TAUCS_SCOMPLEX_IN_BUILD
3027 if (A->flags & TAUCS_SCOMPLEX)
3028 return taucs_cccs_factor_llt_ll(A);
3029 #endif
3030
3031 assert(0);
3032 return NULL;
3033 }
3034
taucs_ccs_factor_llt_symbolic(taucs_ccs_matrix * A)3035 void* taucs_ccs_factor_llt_symbolic(taucs_ccs_matrix* A)
3036 {
3037
3038 #ifdef TAUCS_DOUBLE_IN_BUILD
3039 if (A->flags & TAUCS_DOUBLE)
3040 return taucs_dccs_factor_llt_symbolic(A);
3041 #endif
3042
3043 #ifdef TAUCS_SINGLE_IN_BUILD
3044 if (A->flags & TAUCS_SINGLE)
3045 return taucs_sccs_factor_llt_symbolic(A);
3046 #endif
3047
3048 #ifdef TAUCS_DCOMPLEX_IN_BUILD
3049 if (A->flags & TAUCS_DCOMPLEX)
3050 return taucs_zccs_factor_llt_symbolic(A);
3051 #endif
3052
3053 #ifdef TAUCS_SCOMPLEX_IN_BUILD
3054 if (A->flags & TAUCS_SCOMPLEX)
3055 return taucs_cccs_factor_llt_symbolic(A);
3056 #endif
3057
3058 assert(0);
3059 return NULL;
3060 }
3061
3062 cilk
taucs_ccs_factor_llt_numeric(taucs_ccs_matrix * A,void * L)3063 int taucs_ccs_factor_llt_numeric(taucs_ccs_matrix* A, void* L)
3064 {
3065 int rc = TAUCS_ERROR;
3066 #ifdef TAUCS_DOUBLE_IN_BUILD
3067 if (A->flags & TAUCS_DOUBLE)
3068 rc = spawn taucs_dccs_factor_llt_numeric(A,L);
3069 #endif
3070
3071 #ifdef TAUCS_SINGLE_IN_BUILD
3072 if (A->flags & TAUCS_SINGLE)
3073 rc = spawn taucs_sccs_factor_llt_numeric(A,L);
3074 #endif
3075
3076 #ifdef TAUCS_DCOMPLEX_IN_BUILD
3077 if (A->flags & TAUCS_DCOMPLEX)
3078 rc = spawn taucs_zccs_factor_llt_numeric(A,L);
3079 #endif
3080
3081 #ifdef TAUCS_SCOMPLEX_IN_BUILD
3082 if (A->flags & TAUCS_SCOMPLEX)
3083 rc = spawn taucs_cccs_factor_llt_numeric(A,L);
3084 #endif
3085
3086 sync;
3087 return rc;
3088 }
3089
3090
taucs_supernodal_solve_llt(void * L,void * x,void * b)3091 int taucs_supernodal_solve_llt(void* L, void* x, void* b)
3092 {
3093
3094 #ifdef TAUCS_DOUBLE_IN_BUILD
3095 if (((supernodal_factor_matrix*) L)->flags & TAUCS_DOUBLE)
3096 return taucs_dsupernodal_solve_llt(L,x,b);
3097 #endif
3098
3099 #ifdef TAUCS_SINGLE_IN_BUILD
3100 if (((supernodal_factor_matrix*) L)->flags & TAUCS_SINGLE)
3101 return taucs_ssupernodal_solve_llt(L,x,b);
3102 #endif
3103
3104 #ifdef TAUCS_DCOMPLEX_IN_BUILD
3105 if (((supernodal_factor_matrix*) L)->flags & TAUCS_DCOMPLEX)
3106 return taucs_zsupernodal_solve_llt(L,x,b);
3107 #endif
3108
3109 #ifdef TAUCS_SCOMPLEX_IN_BUILD
3110 if (((supernodal_factor_matrix*) L)->flags & TAUCS_SCOMPLEX)
3111 return taucs_csupernodal_solve_llt(L,x,b);
3112 #endif
3113
3114 assert(0);
3115 return -1;
3116 }
3117
taucs_supernodal_factor_free(void * L)3118 void taucs_supernodal_factor_free(void* L)
3119 {
3120
3121 #ifdef TAUCS_DOUBLE_IN_BUILD
3122 if (((supernodal_factor_matrix*) L)->flags & TAUCS_DOUBLE) {
3123 taucs_dsupernodal_factor_free(L);
3124 return;
3125 }
3126 #endif
3127
3128 #ifdef TAUCS_SINGLE_IN_BUILD
3129 if (((supernodal_factor_matrix*) L)->flags & TAUCS_SINGLE) {
3130 taucs_ssupernodal_factor_free(L);
3131 return;
3132 }
3133 #endif
3134
3135 #ifdef TAUCS_DCOMPLEX_IN_BUILD
3136 if (((supernodal_factor_matrix*) L)->flags & TAUCS_DCOMPLEX) {
3137 taucs_zsupernodal_factor_free(L);
3138 return;
3139 }
3140 #endif
3141
3142 #ifdef TAUCS_SCOMPLEX_IN_BUILD
3143 if (((supernodal_factor_matrix*) L)->flags & TAUCS_SCOMPLEX) {
3144 taucs_csupernodal_factor_free(L);
3145 return;
3146 }
3147 #endif
3148
3149 assert(0);
3150 }
3151
taucs_supernodal_factor_free_numeric(void * L)3152 void taucs_supernodal_factor_free_numeric(void* L)
3153 {
3154
3155
3156 #ifdef TAUCS_DOUBLE_IN_BUILD
3157 if (((supernodal_factor_matrix*) L)->flags & TAUCS_DOUBLE) {
3158 taucs_dsupernodal_factor_free_numeric(L);
3159 return;
3160 }
3161 #endif
3162
3163 #ifdef TAUCS_SINGLE_IN_BUILD
3164 if (((supernodal_factor_matrix*) L)->flags & TAUCS_SINGLE) {
3165 taucs_ssupernodal_factor_free_numeric(L);
3166 return;
3167 }
3168 #endif
3169
3170 #ifdef TAUCS_DCOMPLEX_IN_BUILD
3171 if (((supernodal_factor_matrix*) L)->flags & TAUCS_DCOMPLEX) {
3172 taucs_zsupernodal_factor_free_numeric(L);
3173 return;
3174 }
3175 #endif
3176
3177 #ifdef TAUCS_SCOMPLEX_IN_BUILD
3178 if (((supernodal_factor_matrix*) L)->flags & TAUCS_SCOMPLEX) {
3179 taucs_csupernodal_factor_free_numeric(L);
3180 return;
3181 }
3182 #endif
3183
3184 assert(0);
3185 }
3186
3187 taucs_ccs_matrix*
taucs_supernodal_factor_to_ccs(void * L)3188 taucs_supernodal_factor_to_ccs(void* L)
3189 {
3190
3191 #ifdef TAUCS_DOUBLE_IN_BUILD
3192 if (((supernodal_factor_matrix*) L)->flags & TAUCS_DOUBLE)
3193 return taucs_dsupernodal_factor_to_ccs(L);
3194 #endif
3195
3196 #ifdef TAUCS_SINGLE_IN_BUILD
3197 if (((supernodal_factor_matrix*) L)->flags & TAUCS_SINGLE)
3198 return taucs_ssupernodal_factor_to_ccs(L);
3199 #endif
3200
3201 #ifdef TAUCS_DCOMPLEX_IN_BUILD
3202 if (((supernodal_factor_matrix*) L)->flags & TAUCS_DCOMPLEX)
3203 return taucs_zsupernodal_factor_to_ccs(L);
3204 #endif
3205
3206 #ifdef TAUCS_SCOMPLEX_IN_BUILD
3207 if (((supernodal_factor_matrix*) L)->flags & TAUCS_SCOMPLEX)
3208 return taucs_csupernodal_factor_to_ccs(L);
3209 #endif
3210
3211 assert(0);
3212 return NULL;
3213 }
3214
3215 void*
taucs_supernodal_factor_get_diag(void * L)3216 taucs_supernodal_factor_get_diag(void* L)
3217 {
3218
3219 #ifdef TAUCS_DOUBLE_IN_BUILD
3220 if (((supernodal_factor_matrix*) L)->flags & TAUCS_DOUBLE)
3221 return taucs_dsupernodal_factor_get_diag(L);
3222 #endif
3223
3224 #ifdef TAUCS_SINGLE_IN_BUILD
3225 if (((supernodal_factor_matrix*) L)->flags & TAUCS_SINGLE)
3226 return taucs_ssupernodal_factor_get_diag(L);
3227 #endif
3228
3229 #ifdef TAUCS_DCOMPLEX_IN_BUILD
3230 if (((supernodal_factor_matrix*) L)->flags & TAUCS_DCOMPLEX)
3231 return taucs_zsupernodal_factor_get_diag(L);
3232 #endif
3233
3234 #ifdef TAUCS_SCOMPLEX_IN_BUILD
3235 if (((supernodal_factor_matrix*) L)->flags & TAUCS_SCOMPLEX)
3236 return taucs_csupernodal_factor_get_diag(L);
3237 #endif
3238
3239 assert(0);
3240 return NULL;
3241 }
3242
3243 int
taucs_ccs_etree(taucs_ccs_matrix * A,int * parent,int * l_colcount,int * l_rowcount,int * l_nnz)3244 taucs_ccs_etree(taucs_ccs_matrix* A,
3245 int* parent,
3246 int* l_colcount,
3247 int* l_rowcount,
3248 int* l_nnz)
3249 {
3250 int* prev_p;
3251 /*int* prev_nbr;omer*/
3252 int* level;
3253 /*int* first_descendant;omer*/
3254 int* l_cc;
3255 int* l_rc;
3256 int* wt;
3257
3258 int n = A->n;
3259 int pprime;/*p,q,u omer*/
3260 int ju;
3261 int* postorder;
3262 int* ipostorder;
3263 int *first_child,*next_child;
3264
3265 int i,j,k,ip,jp,kp;
3266 int nnz,jnnz;
3267 int* uf;
3268 int* rowptr;
3269 int* colind;
3270 int* rowcount;
3271 int* realroot;
3272
3273 /* we need the row structures for the lower triangle */
3274
3275 nnz = (A->colptr)[n];
3276
3277 uf = taucs_malloc(n * sizeof(int));
3278 rowcount = taucs_malloc((n+1) * sizeof(int));
3279 rowptr = taucs_malloc((n+1) * sizeof(int));
3280 colind = taucs_malloc(nnz * sizeof(int));
3281
3282 if (!uf || !rowcount || !rowptr || !colind) {
3283 taucs_free(uf);
3284 taucs_free(rowcount);
3285 taucs_free(rowptr);
3286 taucs_free(colind);
3287 return -1;
3288 }
3289
3290 for (i=0; i <=n; i++) rowcount[i] = 0;
3291 for (j=0; j < n; j++) {
3292 jnnz = (A->colptr)[j+1] - (A->colptr)[j];
3293 for (ip=0; ip<jnnz; ip++) {
3294 i = (A->rowind)[(A->colptr)[j] + ip];
3295 if (j < i) rowcount[i]++;
3296 }
3297 }
3298
3299 ip = 0;
3300 for (i=0; i <= n; i++) {
3301 int next_ip = ip + rowcount[i];
3302 rowcount[i] = ip;
3303 rowptr [i] = ip;
3304 ip = next_ip;
3305 }
3306
3307 for (j=0; j < n; j++) {
3308 jnnz = (A->colptr)[j+1] - (A->colptr)[j];
3309 for (ip=0; ip<jnnz; ip++) {
3310 i = (A->rowind)[(A->colptr)[j] + ip];
3311 if (i==j) continue;
3312 assert( rowcount[i] < rowptr[i+1] );
3313 colind[ rowcount[i] ] = j;
3314 rowcount[i]++;
3315 }
3316 }
3317
3318 /* now compute the etree */
3319
3320 {
3321 int u,t,vroot;
3322 realroot = rowcount; /* reuse space */
3323
3324 for (i=0; i<n; i++) {
3325 uf_makeset(uf,i);
3326 realroot[i] = i;
3327 parent[i] = n;
3328 vroot = i;
3329 for (kp=rowptr[i]; kp<rowptr[i+1]; kp++) {
3330 k=colind[kp];
3331 u = uf_find(uf,k);
3332 t = realroot[u];
3333 if (parent[t] == n && t != i) {
3334 parent[t] = i;
3335 vroot = uf_union(uf,vroot,u);
3336 realroot[vroot] = i;
3337 }
3338 }
3339 }
3340 }
3341
3342 taucs_free(colind);
3343 taucs_free(rowptr);
3344 taucs_free(rowcount);
3345
3346 /* now only uf remains allocated */
3347
3348 /* compute column counts */
3349
3350 if (l_colcount || l_rowcount || l_nnz) {
3351 int* l_nz;
3352 int tmp;
3353 int u,p,q;
3354
3355 first_child = taucs_malloc((n+1) * sizeof(int));
3356 next_child = taucs_malloc((n+1) * sizeof(int));
3357 postorder = taucs_malloc(n * sizeof(int));
3358 ipostorder = taucs_malloc(n * sizeof(int));
3359 wt = taucs_malloc(n * sizeof(int));
3360 level = taucs_malloc(n * sizeof(int));
3361 prev_p = taucs_malloc(n * sizeof(int));
3362
3363 #ifdef GILBERT_NG_PEYTON_ANALYSIS_SUP
3364 prev_nbr = taucs_malloc(n * sizeof(int));
3365 first_descendant = taucs_malloc(n * sizeof(int));
3366 #endif
3367
3368 /* we allocate scratch vectors to avoid conditionals */
3369 /* in the inner loop. */
3370
3371 if (l_colcount) l_cc = l_colcount;
3372 else l_cc = (int*) taucs_malloc(n*sizeof(int));
3373 if (l_rowcount) l_rc = l_rowcount;
3374 else l_rc = (int*) taucs_malloc(n*sizeof(int));
3375 if (l_nnz) l_nz = l_nnz;
3376 else l_nz = &tmp;
3377
3378
3379 if (!first_child || !next_child || !postorder
3380 || !ipostorder || !wt || !level || !prev_p
3381 || (!l_colcount && !l_cc)
3382 || (!l_rowcount && !l_rc)
3383 #ifdef GILBERT_NG_PEYTON_ANALYSIS_SUP
3384 || !prev_nbr || !first_descendant
3385 #endif
3386 ) {
3387 taucs_free(uf);
3388
3389 if (!l_colcount) taucs_free(l_cc);
3390 if (!l_rowcount) taucs_free(l_rc);
3391
3392 taucs_free(postorder);
3393 taucs_free(ipostorder);
3394 taucs_free(wt);
3395 taucs_free(level);
3396 taucs_free(prev_p);
3397
3398 #ifdef GILBERT_NG_PEYTON_ANALYSIS_SUP
3399 taucs_free(prev_nbr);
3400 taucs_free(first_descendant);
3401 #endif
3402 return -1;
3403 }
3404
3405 /*for (j=0; j<n; j++) printf("parent[%d] = %d\n",j,parent[j]);*/
3406
3407 /* compute the postorder */
3408
3409 for (j=0; j<=n; j++) first_child[j] = -1;
3410 for (j=n-1; j>=0; j--) {
3411 next_child[j] = first_child[parent[j]];
3412 first_child[parent[j]] = j;
3413 }
3414
3415 {
3416 int next = 0;
3417 recursive_postorder(n,first_child,next_child,
3418 postorder,
3419 ipostorder,&next);
3420 }
3421
3422 /* sort by postorder of etree */
3423 /* compute level, fst_desc */
3424
3425 tree_level(n,TRUE,first_child,next_child,
3426 level,-1);
3427
3428 for (u=0; u < n; u++) prev_p [u] = -1;
3429 for (u=0; u < n; u++) l_rc [u] = 1;
3430 for (u=0; u < n; u++) ordered_uf_makeset(uf,u);
3431 for (u=0; u < n; u++) {
3432 if (first_child[u] == -1)
3433 wt[u] = 1; /* leaves */
3434 else
3435 wt[u] = 0; /* nonleaves */
3436 }
3437
3438 #ifdef GILBERT_NG_PEYTON_ANALYSIS_SUP
3439 for (u=0; u < n; u++) prev_nbr[u] = -1;
3440
3441 tree_first_descendant(n,TRUE,
3442 first_child,next_child,ipostorder,
3443 first_descendant);
3444 #endif
3445
3446 taucs_free(first_child);
3447 taucs_free(next_child);
3448
3449 for (p=0; p<n; p++) {
3450 jp = postorder[p];
3451 if (parent[jp] != n) wt[parent[jp]] --;
3452 for (ip = (A->colptr)[jp]; ip < (A->colptr)[jp+1]; ip++) {
3453 ju = (A->rowind)[ip];
3454 u = ipostorder[ju];
3455 if (ju==jp) continue; /* we only want proper neighbors */
3456 #ifdef GILBERT_NG_PEYTON_ANALYSIS_SUP
3457 if (first_descendant[jp] > prev_nbr[u]) {
3458 #else
3459 if (1) {
3460 #endif
3461 wt[jp] ++;
3462 pprime = prev_p[ju];
3463 if (pprime == -1)
3464 l_rc[ju] += level[jp] - level[ju];
3465 else {
3466 q = ordered_uf_find(uf,pprime);
3467 l_rc[ju] += level[jp] - level[q];
3468 wt[q] --;
3469 }
3470 prev_p[ju] = jp;
3471 }
3472 #ifdef GILBERT_NG_PEYTON_ANALYSIS_SUP
3473 prev_nbr[u] = p;
3474 #endif
3475 }
3476 if (parent[jp] != n) {
3477 if (!(ipostorder[parent[jp]] > ipostorder[jp])) {
3478 printf("jp %d parent %d (ipo_j %d ipo_parent %d\n",
3479 jp,parent[jp],ipostorder[jp],ipostorder[parent[jp]]);
3480 }
3481 assert(ipostorder[parent[jp]] > ipostorder[jp]);
3482 ordered_uf_union(uf,jp,parent[jp]);
3483 }
3484 }
3485
3486 *l_nz = 0;
3487 for (u=0; u<n; u++) {
3488 l_cc[u] = wt[u];
3489 *l_nz += wt[u];
3490 }
3491 for (u=0; u<n; u++) {
3492 if (parent[u] != n) {
3493 l_cc[parent[u]] += l_cc[u];
3494 *l_nz += l_cc[u];
3495 }
3496 }
3497
3498 /* free scrtach vectors */
3499
3500 if (!l_colcount) taucs_free(l_cc);
3501 if (!l_rowcount) taucs_free(l_rc);
3502
3503 /* free other data structures */
3504
3505 taucs_free(postorder);
3506 taucs_free(ipostorder);
3507 taucs_free(wt);
3508 taucs_free(level);
3509 taucs_free(prev_p);
3510
3511 #ifdef GILBERT_NG_PEYTON_ANALYSIS_SUP
3512 taucs_free(prev_nbr);
3513 taucs_free(first_descendant);
3514 #endif
3515 }
3516
3517 taucs_free(uf);
3518
3519 return 0;
3520 }
3521
3522 int
3523 taucs_ccs_etree_liu(taucs_ccs_matrix* A,
3524 int* parent,
3525 int* l_colcount,
3526 int* l_rowcount,
3527 int* l_nnz)
3528 {
3529 int n = A->n;
3530 int i,j,k,ip,kp;/*jp omer*/
3531 int nnz,jnnz;
3532
3533 int* uf;
3534 int* rowptr;
3535 int* colind;
3536
3537 int* rowcount;
3538 int* marker;
3539 int* realroot;
3540
3541 int* l_cc;
3542 int* l_rc;
3543
3544 /* we need the row structures for the lower triangle */
3545
3546 nnz = (A->colptr)[n];
3547
3548 uf = taucs_malloc(n * sizeof(int));
3549 rowcount = taucs_malloc((n+1) * sizeof(int));
3550 rowptr = taucs_malloc((n+1) * sizeof(int));
3551 colind = taucs_malloc(nnz * sizeof(int));
3552
3553 for (i=0; i <=n; i++) rowcount[i] = 0;
3554
3555 for (j=0; j < n; j++) {
3556
3557 jnnz = (A->colptr)[j+1] - (A->colptr)[j];
3558
3559 for (ip=0; ip<jnnz; ip++) {
3560 i = (A->rowind)[(A->colptr)[j] + ip];
3561 if (j < i) rowcount[i]++;
3562 }
3563 }
3564
3565 ip = 0;
3566 for (i=0; i <= n; i++) {
3567 int next_ip = ip + rowcount[i];
3568 rowcount[i] = ip;
3569 rowptr [i] = ip;
3570 ip = next_ip;
3571 }
3572
3573 for (j=0; j < n; j++) {
3574 jnnz = (A->colptr)[j+1] - (A->colptr)[j];
3575
3576 for (ip=0; ip<jnnz; ip++) {
3577 i = (A->rowind)[(A->colptr)[j] + ip];
3578 if (i==j) continue;
3579 assert( rowcount[i] < rowptr[i+1] );
3580 colind[ rowcount[i] ] = j;
3581 rowcount[i]++;
3582 }
3583 }
3584
3585 /* now compute the etree */
3586
3587 {
3588 int u,t,vroot;
3589 realroot = rowcount; /* reuse space */
3590
3591 for (i=0; i<n; i++) {
3592 uf_makeset(uf,i);
3593 realroot[i] = i;
3594 parent[i] = n;
3595 vroot = i;
3596 for (kp=rowptr[i]; kp<rowptr[i+1]; kp++) {
3597 k=colind[kp];
3598 u = uf_find(uf,k);
3599 t = realroot[u];
3600 if (parent[t] == n && t != i) {
3601 parent[t] = i;
3602 vroot = uf_union(uf,vroot,u);
3603 realroot[vroot] = i;
3604 }
3605 }
3606 }
3607 }
3608
3609 /* compute column counts */
3610
3611 if (l_colcount || l_rowcount || l_nnz) {
3612 int* l_nz;
3613 int tmp;
3614
3615 /* we allocate scratch vectors to avoid conditionals */
3616 /* in the inner loop. */
3617
3618 if (l_colcount) l_cc = l_colcount;
3619 else l_cc = (int*) taucs_malloc(n*sizeof(int));
3620 if (l_rowcount) l_rc = l_rowcount;
3621 else l_rc = (int*) taucs_malloc(n*sizeof(int));
3622 if (l_nnz) l_nz = l_nnz;
3623 else l_nz = &tmp;
3624
3625 marker = rowcount; /* we reuse the space */
3626
3627 for (j=0; j < n; j++) l_cc[j] = 1;
3628 *l_nz = n;
3629
3630 for (i=0; i<n; i++) marker[i] = n; /* clear the array */
3631
3632 for (i=0; i<n; i++) {
3633 l_rc[i] = 1;
3634 marker[ i ] = i;
3635 for (kp=rowptr[i]; kp<rowptr[i+1]; kp++) {
3636 k=colind[kp];
3637 j=k;
3638 while (marker[j] != i) {
3639 l_cc[j]++;
3640 l_rc[i]++;
3641 (*l_nz)++;
3642 marker[j] = i;
3643 j = parent[j];
3644 }
3645 }
3646 }
3647
3648 /* free scrtach vectors */
3649
3650 if (!l_colcount) taucs_free(l_cc);
3651 if (!l_rowcount) taucs_free(l_rc);
3652 }
3653
3654 taucs_free(colind);
3655 taucs_free(rowptr);
3656 taucs_free(rowcount);
3657 taucs_free(uf);
3658
3659 return 0;
3660 }
3661
3662
3663 int
3664 taucs_ccs_symbolic_elimination(taucs_ccs_matrix* A,
3665 void* vL,
3666 int do_order,
3667 int max_depth
3668 )
3669 {
3670 supernodal_factor_matrix* L = (supernodal_factor_matrix*) vL;
3671 int* first_child;
3672 int* next_child;
3673 int j;
3674 int* column_to_sn_map;
3675 int* map;
3676 int* rowind;
3677 int* parent;
3678 int* ipostorder;
3679
3680 int depth;
3681
3682 L->n = A->n;
3683 /* use calloc so we can deallocate unallocated entries */
3684 L->sn_struct = (int**)taucs_calloc((A->n ),sizeof(int*));
3685 L->sn_size = (int*) taucs_malloc((A->n+1)*sizeof(int));
3686 L->sn_up_size = (int*) taucs_malloc((A->n+1)*sizeof(int));
3687 L->first_child = (int*) taucs_malloc((A->n+1)*sizeof(int));
3688 L->next_child = (int*) taucs_malloc((A->n+1)*sizeof(int));
3689
3690 column_to_sn_map = (int*) taucs_malloc((A->n+1)*sizeof(int));
3691 map = (int*) taucs_malloc((A->n+1)*sizeof(int));
3692 first_child = (int*) taucs_malloc((A->n+1)*sizeof(int));
3693 next_child = (int*) taucs_malloc((A->n+1)*sizeof(int));
3694 parent = (int*) taucs_malloc((A->n+1)*sizeof(int));
3695 rowind = (int*) taucs_malloc((A->n )*sizeof(int));
3696
3697 if (!(L->sn_struct) || !(L->sn_size) || !(L->sn_up_size) ||
3698 !(L->first_child) || !(L->next_child) || !column_to_sn_map
3699 || !map || !first_child || !next_child || !rowind || !parent) {
3700 taucs_free(parent);
3701 taucs_free(rowind);
3702 taucs_free(next_child);
3703 taucs_free(first_child);
3704 taucs_free(map);
3705 taucs_free(column_to_sn_map);
3706 taucs_free(L->next_child);
3707 taucs_free(L->first_child);
3708 taucs_free(L->sn_up_size);
3709 taucs_free(L->sn_size);
3710 taucs_free(L->sn_struct);
3711 L->sn_struct = NULL;
3712 L->sn_size = L->sn_up_size = L->first_child = L->next_child = NULL;
3713 return -1;
3714 }
3715
3716 if (taucs_ccs_etree(A,parent,NULL,NULL,NULL) == -1) {
3717 taucs_free(parent);
3718 taucs_free(rowind);
3719 taucs_free(next_child);
3720 taucs_free(first_child);
3721 taucs_free(map);
3722 taucs_free(column_to_sn_map);
3723 taucs_free(L->next_child);
3724 taucs_free(L->first_child);
3725 taucs_free(L->sn_up_size);
3726 taucs_free(L->sn_size);
3727 taucs_free(L->sn_struct);
3728 L->sn_struct = NULL;
3729 L->sn_size = L->sn_up_size = L->first_child = L->next_child = NULL;
3730 return -1;
3731 }
3732
3733 if (0) {
3734 double wtime;
3735 int *cc1,*cc2,*rc1,*rc2;
3736 int *p1;
3737 int nnz1,nnz2;
3738
3739 cc1=(int*)taucs_malloc((A->n)*sizeof(int));
3740 cc2=(int*)taucs_malloc((A->n)*sizeof(int));
3741 rc1=(int*)taucs_malloc((A->n)*sizeof(int));
3742 rc2=(int*)taucs_malloc((A->n)*sizeof(int));
3743 p1 =(int*)taucs_malloc((A->n)*sizeof(int));
3744
3745 wtime = taucs_wtime();
3746 taucs_ccs_etree_liu(A,parent,cc1,rc1,&nnz1);
3747 wtime = taucs_wtime() - wtime;
3748 printf("\t\t\tLiu Analysis = %.3f seconds\n",wtime);
3749
3750 wtime = taucs_wtime();
3751 taucs_ccs_etree(A,p1,cc2,rc2,&nnz2);
3752 wtime = taucs_wtime() - wtime;
3753 printf("\t\t\tGNP Analysis = %.3f seconds\n",wtime);
3754
3755 for (j=0; j<(A->n); j++) assert(parent[j]==p1[j]);
3756 for (j=0; j<(A->n); j++) {
3757 if (cc1[j]!=cc2[j]) printf("j=%d cc1=%d cc2=%d\n",j,cc1[j],cc2[j]);
3758 assert(cc1[j]==cc2[j]);
3759 }
3760
3761 for (j=0; j<(A->n); j++) {
3762 if (rc1[j]!=rc2[j]) printf("j=%d rc1=%d rc2=%d\n",j,rc1[j],rc2[j]);
3763 assert(rc1[j]==rc2[j]);
3764 }
3765
3766 if (nnz1!=nnz2) printf("nnz1=%d nnz2=%d\n",nnz1,nnz2);
3767
3768 taucs_free(cc1); taucs_free(cc2); taucs_free(rc1); taucs_free(rc2);
3769 }
3770
3771 for (j=0; j <= (A->n); j++) first_child[j] = -1;
3772 for (j = (A->n)-1; j >= 0; j--) {
3773 int p = parent[j];
3774 next_child[j] = first_child[p];
3775 first_child[p] = j;
3776 }
3777
3778 /* let's compute the depth of the etree, to bail out if it is too deep */
3779 /* the whole thing will work better if we compute supernodal etrees */
3780
3781 {
3782 int next_depth_count;
3783 int this_depth_count;
3784 int child,i;
3785
3786 int* this_depth = rowind; /* we alias rowind */
3787 int* next_depth = map; /* and map */
3788 int* tmp;
3789
3790 this_depth[0] = A->n;
3791 this_depth_count = 1;
3792 next_depth_count = 0;
3793 depth = -1;
3794
3795 while (this_depth_count) {
3796 for (i=0; i<this_depth_count; i++) {
3797 child = first_child[ this_depth[i] ];
3798 while (child != -1) {
3799 next_depth[ next_depth_count ] = child;
3800 next_depth_count++;
3801 child = next_child[ child ];
3802 }
3803 }
3804
3805 tmp = this_depth;
3806 this_depth = next_depth;
3807 next_depth = tmp;
3808
3809 this_depth_count = next_depth_count;
3810 next_depth_count = 0;
3811 depth++;
3812 }
3813 }
3814
3815 taucs_printf("\t\tElimination tree depth is %d\n",depth);
3816
3817 if (max_depth && depth > max_depth) {
3818 taucs_printf("taucs_ccs_symbolic_elimination: etree depth %d, maximum allowed is %d\n",
3819 depth, max_depth);
3820 taucs_free(parent);
3821 taucs_free(rowind);
3822 taucs_free(next_child);
3823 taucs_free(first_child);
3824 taucs_free(map);
3825 taucs_free(column_to_sn_map);
3826 taucs_free(L->next_child);
3827 taucs_free(L->first_child);
3828 taucs_free(L->sn_up_size);
3829 taucs_free(L->sn_size);
3830 taucs_free(L->sn_struct);
3831 L->sn_struct = NULL;
3832 L->sn_size = L->sn_up_size = L->first_child = L->next_child = NULL;
3833 return -1;
3834 }
3835
3836 /*
3837 taucs_free(parent);
3838 ipostorder = (int*)taucs_malloc((A->n+1)*sizeof(int));
3839 */
3840
3841 ipostorder = parent;
3842 {
3843 int next = 0;
3844 /*int* postorder = (int*)taucs_malloc((A->n+1)*sizeof(int));*/
3845 recursive_postorder(A->n,first_child,next_child,
3846 NULL,
3847 ipostorder,&next);
3848 /*
3849 printf("ipostorder ");
3850 for (j=0; j <= (A->n); j++) printf("%d ",ipostorder[j]);
3851 printf("\n");
3852 printf(" postorder ");
3853 for (j=0; j <= (A->n); j++) printf("%d ",postorder[j]);
3854 printf("\n");
3855 */
3856 }
3857
3858 L->n_sn = 0;
3859 for (j=0; j < (A->n); j++) map[j] = -1;
3860 for (j=0; j <= (A->n); j++) (L->first_child)[j] = (L->next_child)[j] = -1;
3861
3862 if (recursive_symbolic_elimination(A->n,
3863 A,
3864 first_child,next_child,
3865 &(L->n_sn),
3866 L->sn_size,L->sn_up_size,L->sn_struct,
3867 L->first_child,L->next_child,
3868 rowind,
3869 column_to_sn_map,
3870 map,
3871 do_order,ipostorder
3872 )
3873 == -1) {
3874 for (j=0; j < (A->n); j++) taucs_free((L->sn_struct)[j]);
3875
3876 taucs_free(parent);
3877 taucs_free(rowind);
3878 taucs_free(next_child);
3879 taucs_free(first_child);
3880 taucs_free(map);
3881 taucs_free(column_to_sn_map);
3882 taucs_free(L->next_child);
3883 taucs_free(L->first_child);
3884 taucs_free(L->sn_up_size);
3885 taucs_free(L->sn_size);
3886 taucs_free(L->sn_struct);
3887 L->sn_struct = NULL;
3888 L->sn_size = L->sn_up_size = L->first_child = L->next_child = NULL;
3889 return -1;
3890 }
3891
3892 {
3893 double nnz = 0.0;
3894 double flops = 0.0;
3895 int sn,i,colnnz;
3896 int bytes;
3897
3898 bytes =
3899 1*sizeof(char) /* uplo */
3900 + 2*sizeof(int) /* n, n_sn */
3901 + 3*(L->n_sn)*sizeof(int) /* etree */
3902 + 4*(L->n_sn)*sizeof(int) /* block sizes, lda */
3903 + 1*(L->n_sn)*sizeof(int*) /* row/col indices */
3904 + 3*(L->n_sn)*sizeof(taucs_datatype*) /* actual blocks */
3905 ;
3906
3907 for (sn=0; sn<(L->n_sn); sn++) {
3908 bytes += (L->sn_up_size)[sn] * sizeof(int);
3909 bytes += ((L->sn_size)[sn]*(L->sn_up_size)[sn]) * sizeof(taucs_datatype);
3910
3911 for (i=0, colnnz = (L->sn_up_size)[sn];
3912 i<(L->sn_size)[sn];
3913 i++, colnnz--) {
3914 /* There was a bug here. I did not count muliply-adds in the
3915 update part of the computation as 2 flops but one. */
3916 /*flops += ((double)(colnnz) - 1.0) * ((double)(colnnz) + 2.0) / 2.0;*/
3917 flops += 1.0 + ((double)(colnnz)) * ((double)(colnnz));
3918 nnz += (double) (colnnz);
3919 }
3920 }
3921 taucs_printf("\t\tSymbolic Analysis of LL^T: %.2e nonzeros, %.2e flops, %.2e bytes in L\n",
3922 nnz, flops, (float) bytes);
3923 }
3924
3925 for (j=0; j < (A->n); j++) map[j] = -1;
3926 if (1)
3927 (void) recursive_amalgamate_supernodes((L->n_sn) - 1,
3928 &(L->n_sn),
3929 L->sn_size,L->sn_up_size,L->sn_struct,
3930 L->first_child,L->next_child,
3931 rowind,
3932 column_to_sn_map,
3933 map,
3934 do_order,ipostorder
3935 );
3936
3937
3938 {
3939 double nnz = 0.0;
3940 double flops = 0.0;
3941 int sn,i,colnnz;
3942 int bytes;
3943
3944 bytes =
3945 1*sizeof(char) /* uplo */
3946 + 2*sizeof(int) /* n, n_sn */
3947 + 3*(L->n_sn)*sizeof(int) /* etree */
3948 + 4*(L->n_sn)*sizeof(int) /* block sizes, lda */
3949 + 1*(L->n_sn)*sizeof(int*) /* row/col indices */
3950 + 3*(L->n_sn)*sizeof(taucs_datatype*) /* actual blocks */
3951 ;
3952
3953 for (sn=0; sn<(L->n_sn); sn++) {
3954 bytes += (L->sn_up_size)[sn] * sizeof(int);
3955 bytes += ((L->sn_size)[sn]*(L->sn_up_size)[sn]) * sizeof(taucs_datatype);
3956
3957 for (i=0, colnnz = (L->sn_up_size)[sn];
3958 i<(L->sn_size)[sn];
3959 i++, colnnz--) {
3960 /* There was a bug here. I did not count muliply-adds in the
3961 update part of the computation as 2 flops but one. */
3962 /*flops += ((double)(colnnz) - 1.0) * ((double)(colnnz) + 2.0) / 2.0;*/
3963 flops += 1.0 + ((double)(colnnz)) * ((double)(colnnz));
3964 nnz += (double) (colnnz);
3965 }
3966 }
3967 taucs_printf("\t\tRelaxed Analysis of LL^T: %.2e nonzeros, %.2e flops, %.2e bytes in L\n",
3968 nnz, flops, (float) bytes);
3969 }
3970
3971 /*
3972 {
3973 int i;
3974 printf("c2sn: ");
3975 for (i=0; i<A->n; i++) printf("%d ",column_to_sn_map[i]);
3976 printf("\n");
3977 }
3978 */
3979
3980 taucs_free(parent);
3981 taucs_free(rowind);
3982 taucs_free(map);
3983 taucs_free(column_to_sn_map);
3984 taucs_free(next_child);
3985 taucs_free(first_child);
3986
3987 L->sn_blocks_ld = taucs_malloc((L->n_sn) * sizeof(int));
3988 L->sn_blocks = taucs_calloc((L->n_sn), sizeof(taucs_datatype*)); /* so we can free before allocation */
3989
3990 L->up_blocks_ld = taucs_malloc((L->n_sn) * sizeof(int));
3991 L->up_blocks = taucs_calloc((L->n_sn), sizeof(taucs_datatype*));
3992
3993 if (!(L->sn_blocks_ld)
3994 || !(L->sn_blocks_ld)
3995 || !(L->sn_blocks)
3996 || !(L->up_blocks_ld)
3997 || !(L->up_blocks))
3998 return -1; /* the caller will free L */
3999
4000 return 0;
4001 }
4002 #endif
4003
4004
4005 /*************************************************************/
4006 /* end of file */
4007 /*************************************************************/
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021