1 #include "BSprivate.h"
2 
3 /*@ BSb_for_solve - Forward triangular matrix solution on a
4                     block of vectors
5 
6     Input Parameters:
7 .   A - The sparse matrix
8 .   x - The contiguous block of rhs's
9 .   comm - The communication structure for A
10 .   block_size - the number of rhs's
11 .   procinfo - the usual processor information
12 
13     Output Parameters:
14 .   b - on exit these vectors contain the solution vector
15 
16     Returns:
17     void
18 
19  @*/
BSb_for_solve(BSpar_mat * A,FLOAT * x,BScomm * comm,int block_size,BSprocinfo * procinfo)20 void BSb_for_solve(BSpar_mat *A, FLOAT *x, BScomm *comm,
21 		int block_size, BSprocinfo *procinfo)
22 {
23 	BMphase *to_phase, *from_phase;
24 	BMmsg *msg;
25 	int	i, j, k, n;
26 	int	cl_ind, in_ind;
27 	int	count, size, ind, num_cols;
28 	int *row;
29 	FLOAT *nz;
30 	BScl_2_inode *clique2inode;
31 	BSnumbering *color2clique;
32 	BSinode *inodes;
33 	int	*data_ptr, msg_len;
34 	FLOAT *msg_buf, *matrix;
35 	FLOAT *work;
36 	FLOAT	*xptr, *wptr;
37 	FLOAT	**xoff;
38 	char UP = 'U';
39 	char TR = 'T';
40 	char NTR = 'N';
41 	char SIDE = 'L';
42 	char ND = 'N';
43 	FLOAT one = 1.0;
44 	FLOAT zero = 0.0;
45 
46 	if((!A->icc_storage)||(procinfo->single)) {
47 		/* No ILU version or single version so call BSfor_solve block_size times */
48 		n = A->num_rows;
49 		for (i=0;i<block_size;i++) {
50 			if(procinfo->single) {
51 				BSfor_solve1(A,&(x[n*i]),comm,procinfo); CHKERR(0);
52 			} else {
53 				BSfor_solve(A,&(x[n*i]),comm,procinfo); CHKERR(0);
54 			}
55 		}
56 		return;
57 	}
58 
59 	color2clique = A->color2clique;
60 	clique2inode = A->clique2inode;
61 	inodes = A->inodes->list;
62 
63 	/* get some work space */
64 	MY_MALLOC(work,(FLOAT *),sizeof(FLOAT)*A->num_rows*block_size,1);
65 
66 	/* post for all messages */
67 	BMinit_comp_msg(comm->from_msg,procinfo); CHKERR(0);
68 
69 	/* calculate x offsets */
70 	MY_MALLOC(xoff,(FLOAT **),sizeof(FLOAT *)*block_size,1);
71 	for (i=0;i<block_size;i++) {
72 		xoff[i] = &(x[i*A->num_rows]);
73 	}
74 
75 	/* now do this phase by phase */
76 	for (i=0;i<color2clique->length-1;i++) {
77 		/* find my portion of the solution using the cliques on the diagonal */
78 		for (cl_ind=color2clique->numbers[i];
79 			cl_ind<color2clique->numbers[i+1];cl_ind++) {
80 			if (procinfo->my_id == clique2inode->proc[cl_ind]) {
81 				/* first, multiply the clique */
82 				/* the clique is stored, inverted, in the upper triangle */
83 				size = clique2inode->d_mats[cl_ind].size;
84 				ind = clique2inode->d_mats[cl_ind].local_ind;
85 				matrix = clique2inode->d_mats[cl_ind].matrix;
86 				DTRMM(&SIDE,&UP,&TR,&ND,&size,&block_size,&one,matrix,
87 					&size,&(x[ind]),&(A->num_rows));
88 			}
89 		}
90 
91 		/* now send my messages */
92 		to_phase = BMget_phase(comm->to_msg,i); CHKERR(0);
93 		msg = NULL;
94 		while ((msg = BMnext_msg(to_phase,msg)) != NULL) {
95 			CHKERR(0);
96 			msg_buf = (FLOAT *) BMget_msg_ptr(msg); CHKERR(0);
97 			data_ptr = BMget_user(msg,&msg_len); CHKERR(0);
98 			for (j=0;j<block_size;j++) {
99 				wptr = &(msg_buf[j*msg_len]);
100 				xptr = xoff[j];
101 				for (k=0;k<msg_len;k++) {
102 					wptr[k] = xptr[data_ptr[k]];
103 				}
104 			}
105 			BMsendf_msg(msg,procinfo); CHKERR(0);
106 		}
107 		CHKERR(0);
108 
109 		/* do some local work */
110 		for (cl_ind=color2clique->numbers[i];
111 			cl_ind<color2clique->numbers[i+1];cl_ind++) {
112 			if (procinfo->my_id == clique2inode->proc[cl_ind]) {
113 				ind = clique2inode->d_mats[cl_ind].local_ind;
114 				/* multiply the inodes */
115 				for (in_ind=clique2inode->inode_index[cl_ind];
116 					in_ind<clique2inode->inode_index[cl_ind+1];in_ind++) {
117 					row = inodes[in_ind].row_num;
118 					nz = inodes[in_ind].nz;
119 					size = inodes[in_ind].length;
120 					num_cols = inodes[in_ind].num_cols;
121 					if (size > 0) {
122 						DGEMM(&NTR,&NTR,&size,&block_size,&num_cols,&one,nz,
123 							&size,&(x[ind]),&(A->num_rows),&zero,work,&size);
124 						for (j=0;j<block_size;j++) {
125 							xptr = xoff[j];
126 							wptr = &(work[j*size]);
127 							for (k=0;k<size;k++) xptr[row[k]] -= wptr[k];
128 						}
129 					}
130 					ind += num_cols;
131 				}
132 			}
133 		}
134 
135 		/* receive my messages and do non-local work */
136 		from_phase = BMget_phase(comm->from_msg,i); CHKERR(0);
137 		while ((msg = BMrecv_msg(from_phase)) != NULL) {
138 			CHKERR(0);
139 			msg_buf = (FLOAT *) BMget_msg_ptr(msg); CHKERR(0);
140 			data_ptr = BMget_user(msg,&msg_len); CHKERR(0);
141 			msg_len = BMget_msg_size(msg); CHKERR(0);
142 			msg_len /= block_size;
143 			count = 0;
144 			for (cl_ind=data_ptr[0];cl_ind<=data_ptr[1];cl_ind++) {
145 				for (in_ind=clique2inode->inode_index[cl_ind];
146 					in_ind<clique2inode->inode_index[cl_ind+1];in_ind++) {
147 					row = inodes[in_ind].row_num;
148 					nz = inodes[in_ind].nz;
149 					size = inodes[in_ind].length;
150 					num_cols = inodes[in_ind].num_cols;
151 					if (size > 0) {
152 						DGEMM(&NTR,&NTR,&size,&block_size,&num_cols,&one,nz,
153 							&size,&(msg_buf[count]),&msg_len,&zero,work,
154 							&size);
155 						for (j=0;j<block_size;j++) {
156 							xptr = xoff[j];
157 							wptr = &(work[j*size]);
158 							for (k=0;k<size;k++) xptr[row[k]] -= wptr[k];
159 						}
160 					}
161 					count += num_cols;
162 				}
163 			}
164 			BMfree_msg(msg); CHKERR(0);
165 		}
166 		CHKERR(0);
167 	}
168 
169 	MY_FREE(work);
170 	MY_FREE(xoff);
171 
172 	/* wait for all of the sent messages to finish */
173 	BMfinish_comp_msg(comm->to_msg,procinfo); CHKERR(0);
174 	MLOG_flop((2*A->local_nnz));
175 }
176