1
2 /*
3 This file provides high performance routines for the Inode format (compressed sparse row)
4 by taking advantage of rows with identical nonzero structure (I-nodes).
5 */
6 #include <../src/mat/impls/aij/seq/aij.h>
7
MatCreateColInode_Private(Mat A,PetscInt * size,PetscInt ** ns)8 static PetscErrorCode MatCreateColInode_Private(Mat A,PetscInt *size,PetscInt **ns)
9 {
10 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
11 PetscErrorCode ierr;
12 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col;
13
14 PetscFunctionBegin;
15 n = A->cmap->n;
16 m = A->rmap->n;
17 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
18 ns_row = a->inode.size;
19
20 min_mn = (m < n) ? m : n;
21 if (!ns) {
22 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ;
23 for (; count+1 < n; count++,i++) ;
24 if (count < n) {
25 i++;
26 }
27 *size = i;
28 PetscFunctionReturn(0);
29 }
30 ierr = PetscMalloc1(n+1,&ns_col);CHKERRQ(ierr);
31
32 /* Use the same row structure wherever feasible. */
33 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
34 ns_col[i] = ns_row[i];
35 }
36
37 /* if m < n; pad up the remainder with inode_limit */
38 for (; count+1 < n; count++,i++) {
39 ns_col[i] = 1;
40 }
41 /* The last node is the odd ball. padd it up with the remaining rows; */
42 if (count < n) {
43 ns_col[i] = n - count;
44 i++;
45 } else if (count > n) {
46 /* Adjust for the over estimation */
47 ns_col[i-1] += n - count;
48 }
49 *size = i;
50 *ns = ns_col;
51 PetscFunctionReturn(0);
52 }
53
54
55 /*
56 This builds symmetric version of nonzero structure,
57 */
MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt * iia[],const PetscInt * jja[],PetscInt ishift,PetscInt oshift)58 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
59 {
60 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
61 PetscErrorCode ierr;
62 PetscInt *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n;
63 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2;
64 const PetscInt *j,*jmax,*ai= a->i,*aj = a->j;
65
66 PetscFunctionBegin;
67 nslim_row = a->inode.node_count;
68 m = A->rmap->n;
69 n = A->cmap->n;
70 if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
71 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
72
73 /* Use the row_inode as column_inode */
74 nslim_col = nslim_row;
75 ns_col = ns_row;
76
77 /* allocate space for reformated inode structure */
78 ierr = PetscMalloc2(nslim_col+1,&tns,n+1,&tvc);CHKERRQ(ierr);
79 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
80
81 for (i1=0,col=0; i1<nslim_col; ++i1) {
82 nsz = ns_col[i1];
83 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
84 }
85 /* allocate space for row pointers */
86 ierr = PetscCalloc1(nslim_row+1,&ia);CHKERRQ(ierr);
87 *iia = ia;
88 ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr);
89
90 /* determine the number of columns in each row */
91 ia[0] = oshift;
92 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
93
94 j = aj + ai[row] + ishift;
95 jmax = aj + ai[row+1] + ishift;
96 if (j==jmax) continue; /* empty row */
97 col = *j++ + ishift;
98 i2 = tvc[col];
99 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
100 ia[i1+1]++;
101 ia[i2+1]++;
102 i2++; /* Start col of next node */
103 while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
104 i2 = tvc[col];
105 }
106 if (i2 == i1) ia[i2+1]++; /* now the diagonal element */
107 }
108
109 /* shift ia[i] to point to next row */
110 for (i1=1; i1<nslim_row+1; i1++) {
111 row = ia[i1-1];
112 ia[i1] += row;
113 work[i1-1] = row - oshift;
114 }
115
116 /* allocate space for column pointers */
117 nz = ia[nslim_row] + (!ishift);
118 ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr);
119 *jja = ja;
120
121 /* loop over lower triangular part putting into ja */
122 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
123 j = aj + ai[row] + ishift;
124 jmax = aj + ai[row+1] + ishift;
125 if (j==jmax) continue; /* empty row */
126 col = *j++ + ishift;
127 i2 = tvc[col];
128 while (i2<i1 && j<jmax) {
129 ja[work[i2]++] = i1 + oshift;
130 ja[work[i1]++] = i2 + oshift;
131 ++i2;
132 while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
133 i2 = tvc[col];
134 }
135 if (i2 == i1) ja[work[i1]++] = i2 + oshift;
136
137 }
138 ierr = PetscFree(work);CHKERRQ(ierr);
139 ierr = PetscFree2(tns,tvc);CHKERRQ(ierr);
140 PetscFunctionReturn(0);
141 }
142
143 /*
144 This builds nonsymmetric version of nonzero structure,
145 */
MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt * iia[],const PetscInt * jja[],PetscInt ishift,PetscInt oshift)146 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
147 {
148 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
149 PetscErrorCode ierr;
150 PetscInt *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col;
151 PetscInt *tns,*tvc,nsz,i1,i2;
152 const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size;
153
154 PetscFunctionBegin;
155 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
156 nslim_row = a->inode.node_count;
157 n = A->cmap->n;
158
159 /* Create The column_inode for this matrix */
160 ierr = MatCreateColInode_Private(A,&nslim_col,&ns_col);CHKERRQ(ierr);
161
162 /* allocate space for reformated column_inode structure */
163 ierr = PetscMalloc2(nslim_col +1,&tns,n + 1,&tvc);CHKERRQ(ierr);
164 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
165
166 for (i1=0,col=0; i1<nslim_col; ++i1) {
167 nsz = ns_col[i1];
168 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
169 }
170 /* allocate space for row pointers */
171 ierr = PetscCalloc1(nslim_row+1,&ia);CHKERRQ(ierr);
172 *iia = ia;
173 ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr);
174
175 /* determine the number of columns in each row */
176 ia[0] = oshift;
177 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
178 j = aj + ai[row] + ishift;
179 nz = ai[row+1] - ai[row];
180 if (!nz) continue; /* empty row */
181 col = *j++ + ishift;
182 i2 = tvc[col];
183 while (nz-- > 0) { /* off-diagonal elemets */
184 ia[i1+1]++;
185 i2++; /* Start col of next node */
186 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
187 if (nz > 0) i2 = tvc[col];
188 }
189 }
190
191 /* shift ia[i] to point to next row */
192 for (i1=1; i1<nslim_row+1; i1++) {
193 row = ia[i1-1];
194 ia[i1] += row;
195 work[i1-1] = row - oshift;
196 }
197
198 /* allocate space for column pointers */
199 nz = ia[nslim_row] + (!ishift);
200 ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr);
201 *jja = ja;
202
203 /* loop over matrix putting into ja */
204 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
205 j = aj + ai[row] + ishift;
206 nz = ai[row+1] - ai[row];
207 if (!nz) continue; /* empty row */
208 col = *j++ + ishift;
209 i2 = tvc[col];
210 while (nz-- > 0) {
211 ja[work[i1]++] = i2 + oshift;
212 ++i2;
213 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
214 if (nz > 0) i2 = tvc[col];
215 }
216 }
217 ierr = PetscFree(ns_col);CHKERRQ(ierr);
218 ierr = PetscFree(work);CHKERRQ(ierr);
219 ierr = PetscFree2(tns,tvc);CHKERRQ(ierr);
220 PetscFunctionReturn(0);
221 }
222
MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt * n,const PetscInt * ia[],const PetscInt * ja[],PetscBool * done)223 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
224 {
225 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
226 PetscErrorCode ierr;
227
228 PetscFunctionBegin;
229 *n = a->inode.node_count;
230 if (!ia) PetscFunctionReturn(0);
231 if (!blockcompressed) {
232 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);
233 } else if (symmetric) {
234 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
235 } else {
236 ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
237 }
238 PetscFunctionReturn(0);
239 }
240
MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt * n,const PetscInt * ia[],const PetscInt * ja[],PetscBool * done)241 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
242 {
243 PetscErrorCode ierr;
244
245 PetscFunctionBegin;
246 if (!ia) PetscFunctionReturn(0);
247
248 if (!blockcompressed) {
249 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);
250 } else {
251 ierr = PetscFree(*ia);CHKERRQ(ierr);
252 ierr = PetscFree(*ja);CHKERRQ(ierr);
253 }
254 PetscFunctionReturn(0);
255 }
256
257 /* ----------------------------------------------------------- */
258
MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt * iia[],const PetscInt * jja[],PetscInt ishift,PetscInt oshift)259 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
260 {
261 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
262 PetscErrorCode ierr;
263 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
264 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
265
266 PetscFunctionBegin;
267 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
268 nslim_row = a->inode.node_count;
269 n = A->cmap->n;
270
271 /* Create The column_inode for this matrix */
272 ierr = MatCreateColInode_Private(A,&nslim_col,&ns_col);CHKERRQ(ierr);
273
274 /* allocate space for reformated column_inode structure */
275 ierr = PetscMalloc2(nslim_col + 1,&tns,n + 1,&tvc);CHKERRQ(ierr);
276 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
277
278 for (i1=0,col=0; i1<nslim_col; ++i1) {
279 nsz = ns_col[i1];
280 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
281 }
282 /* allocate space for column pointers */
283 ierr = PetscCalloc1(nslim_col+1,&ia);CHKERRQ(ierr);
284 *iia = ia;
285 ierr = PetscMalloc1(nslim_col+1,&work);CHKERRQ(ierr);
286
287 /* determine the number of columns in each row */
288 ia[0] = oshift;
289 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
290 j = aj + ai[row] + ishift;
291 col = *j++ + ishift;
292 i2 = tvc[col];
293 nz = ai[row+1] - ai[row];
294 while (nz-- > 0) { /* off-diagonal elemets */
295 /* ia[i1+1]++; */
296 ia[i2+1]++;
297 i2++;
298 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
299 if (nz > 0) i2 = tvc[col];
300 }
301 }
302
303 /* shift ia[i] to point to next col */
304 for (i1=1; i1<nslim_col+1; i1++) {
305 col = ia[i1-1];
306 ia[i1] += col;
307 work[i1-1] = col - oshift;
308 }
309
310 /* allocate space for column pointers */
311 nz = ia[nslim_col] + (!ishift);
312 ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr);
313 *jja = ja;
314
315 /* loop over matrix putting into ja */
316 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
317 j = aj + ai[row] + ishift;
318 col = *j++ + ishift;
319 i2 = tvc[col];
320 nz = ai[row+1] - ai[row];
321 while (nz-- > 0) {
322 /* ja[work[i1]++] = i2 + oshift; */
323 ja[work[i2]++] = i1 + oshift;
324 i2++;
325 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
326 if (nz > 0) i2 = tvc[col];
327 }
328 }
329 ierr = PetscFree(ns_col);CHKERRQ(ierr);
330 ierr = PetscFree(work);CHKERRQ(ierr);
331 ierr = PetscFree2(tns,tvc);CHKERRQ(ierr);
332 PetscFunctionReturn(0);
333 }
334
MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt * n,const PetscInt * ia[],const PetscInt * ja[],PetscBool * done)335 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
336 {
337 PetscErrorCode ierr;
338
339 PetscFunctionBegin;
340 ierr = MatCreateColInode_Private(A,n,NULL);CHKERRQ(ierr);
341 if (!ia) PetscFunctionReturn(0);
342
343 if (!blockcompressed) {
344 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);
345 } else if (symmetric) {
346 /* Since the indices are symmetric it does'nt matter */
347 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
348 } else {
349 ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
350 }
351 PetscFunctionReturn(0);
352 }
353
MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt * n,const PetscInt * ia[],const PetscInt * ja[],PetscBool * done)354 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done)
355 {
356 PetscErrorCode ierr;
357
358 PetscFunctionBegin;
359 if (!ia) PetscFunctionReturn(0);
360 if (!blockcompressed) {
361 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);
362 } else {
363 ierr = PetscFree(*ia);CHKERRQ(ierr);
364 ierr = PetscFree(*ja);CHKERRQ(ierr);
365 }
366 PetscFunctionReturn(0);
367 }
368
369 /* ----------------------------------------------------------- */
370
MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)371 PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
372 {
373 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
374 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
375 PetscScalar *y;
376 const PetscScalar *x;
377 const MatScalar *v1,*v2,*v3,*v4,*v5;
378 PetscErrorCode ierr;
379 PetscInt i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0;
380 const PetscInt *idx,*ns,*ii;
381
382 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
383 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
384 #endif
385
386 PetscFunctionBegin;
387 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
388 node_max = a->inode.node_count;
389 ns = a->inode.size; /* Node Size array */
390 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
391 ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
392 idx = a->j;
393 v1 = a->a;
394 ii = a->i;
395
396 for (i = 0,row = 0; i< node_max; ++i) {
397 nsz = ns[i];
398 n = ii[1] - ii[0];
399 nonzerorow += (n>0)*nsz;
400 ii += nsz;
401 PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */
402 PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */
403 sz = n; /* No of non zeros in this row */
404 /* Switch on the size of Node */
405 switch (nsz) { /* Each loop in 'case' is unrolled */
406 case 1:
407 sum1 = 0.;
408
409 for (n = 0; n< sz-1; n+=2) {
410 i1 = idx[0]; /* The instructions are ordered to */
411 i2 = idx[1]; /* make the compiler's job easy */
412 idx += 2;
413 tmp0 = x[i1];
414 tmp1 = x[i2];
415 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
416 }
417
418 if (n == sz-1) { /* Take care of the last nonzero */
419 tmp0 = x[*idx++];
420 sum1 += *v1++ *tmp0;
421 }
422 y[row++]=sum1;
423 break;
424 case 2:
425 sum1 = 0.;
426 sum2 = 0.;
427 v2 = v1 + n;
428
429 for (n = 0; n< sz-1; n+=2) {
430 i1 = idx[0];
431 i2 = idx[1];
432 idx += 2;
433 tmp0 = x[i1];
434 tmp1 = x[i2];
435 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
436 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
437 }
438 if (n == sz-1) {
439 tmp0 = x[*idx++];
440 sum1 += *v1++ * tmp0;
441 sum2 += *v2++ * tmp0;
442 }
443 y[row++]=sum1;
444 y[row++]=sum2;
445 v1 =v2; /* Since the next block to be processed starts there*/
446 idx +=sz;
447 break;
448 case 3:
449 sum1 = 0.;
450 sum2 = 0.;
451 sum3 = 0.;
452 v2 = v1 + n;
453 v3 = v2 + n;
454
455 for (n = 0; n< sz-1; n+=2) {
456 i1 = idx[0];
457 i2 = idx[1];
458 idx += 2;
459 tmp0 = x[i1];
460 tmp1 = x[i2];
461 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
462 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
463 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
464 }
465 if (n == sz-1) {
466 tmp0 = x[*idx++];
467 sum1 += *v1++ * tmp0;
468 sum2 += *v2++ * tmp0;
469 sum3 += *v3++ * tmp0;
470 }
471 y[row++]=sum1;
472 y[row++]=sum2;
473 y[row++]=sum3;
474 v1 =v3; /* Since the next block to be processed starts there*/
475 idx +=2*sz;
476 break;
477 case 4:
478 sum1 = 0.;
479 sum2 = 0.;
480 sum3 = 0.;
481 sum4 = 0.;
482 v2 = v1 + n;
483 v3 = v2 + n;
484 v4 = v3 + n;
485
486 for (n = 0; n< sz-1; n+=2) {
487 i1 = idx[0];
488 i2 = idx[1];
489 idx += 2;
490 tmp0 = x[i1];
491 tmp1 = x[i2];
492 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
493 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
494 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
495 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
496 }
497 if (n == sz-1) {
498 tmp0 = x[*idx++];
499 sum1 += *v1++ * tmp0;
500 sum2 += *v2++ * tmp0;
501 sum3 += *v3++ * tmp0;
502 sum4 += *v4++ * tmp0;
503 }
504 y[row++]=sum1;
505 y[row++]=sum2;
506 y[row++]=sum3;
507 y[row++]=sum4;
508 v1 =v4; /* Since the next block to be processed starts there*/
509 idx +=3*sz;
510 break;
511 case 5:
512 sum1 = 0.;
513 sum2 = 0.;
514 sum3 = 0.;
515 sum4 = 0.;
516 sum5 = 0.;
517 v2 = v1 + n;
518 v3 = v2 + n;
519 v4 = v3 + n;
520 v5 = v4 + n;
521
522 for (n = 0; n<sz-1; n+=2) {
523 i1 = idx[0];
524 i2 = idx[1];
525 idx += 2;
526 tmp0 = x[i1];
527 tmp1 = x[i2];
528 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
529 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
530 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
531 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
532 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
533 }
534 if (n == sz-1) {
535 tmp0 = x[*idx++];
536 sum1 += *v1++ * tmp0;
537 sum2 += *v2++ * tmp0;
538 sum3 += *v3++ * tmp0;
539 sum4 += *v4++ * tmp0;
540 sum5 += *v5++ * tmp0;
541 }
542 y[row++]=sum1;
543 y[row++]=sum2;
544 y[row++]=sum3;
545 y[row++]=sum4;
546 y[row++]=sum5;
547 v1 =v5; /* Since the next block to be processed starts there */
548 idx +=4*sz;
549 break;
550 default:
551 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
552 }
553 }
554 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
555 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
556 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
557 PetscFunctionReturn(0);
558 }
559 /* ----------------------------------------------------------- */
560 /* Almost same code as the MatMult_SeqAIJ_Inode() */
MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)561 PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
562 {
563 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
564 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
565 const MatScalar *v1,*v2,*v3,*v4,*v5;
566 const PetscScalar *x;
567 PetscScalar *y,*z,*zt;
568 PetscErrorCode ierr;
569 PetscInt i1,i2,n,i,row,node_max,nsz,sz;
570 const PetscInt *idx,*ns,*ii;
571
572 PetscFunctionBegin;
573 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
574 node_max = a->inode.node_count;
575 ns = a->inode.size; /* Node Size array */
576
577 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
578 ierr = VecGetArrayPair(zz,yy,&z,&y);CHKERRQ(ierr);
579 zt = z;
580
581 idx = a->j;
582 v1 = a->a;
583 ii = a->i;
584
585 for (i = 0,row = 0; i< node_max; ++i) {
586 nsz = ns[i];
587 n = ii[1] - ii[0];
588 ii += nsz;
589 sz = n; /* No of non zeros in this row */
590 /* Switch on the size of Node */
591 switch (nsz) { /* Each loop in 'case' is unrolled */
592 case 1:
593 sum1 = *zt++;
594
595 for (n = 0; n< sz-1; n+=2) {
596 i1 = idx[0]; /* The instructions are ordered to */
597 i2 = idx[1]; /* make the compiler's job easy */
598 idx += 2;
599 tmp0 = x[i1];
600 tmp1 = x[i2];
601 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
602 }
603
604 if (n == sz-1) { /* Take care of the last nonzero */
605 tmp0 = x[*idx++];
606 sum1 += *v1++ * tmp0;
607 }
608 y[row++]=sum1;
609 break;
610 case 2:
611 sum1 = *zt++;
612 sum2 = *zt++;
613 v2 = v1 + n;
614
615 for (n = 0; n< sz-1; n+=2) {
616 i1 = idx[0];
617 i2 = idx[1];
618 idx += 2;
619 tmp0 = x[i1];
620 tmp1 = x[i2];
621 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
622 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
623 }
624 if (n == sz-1) {
625 tmp0 = x[*idx++];
626 sum1 += *v1++ * tmp0;
627 sum2 += *v2++ * tmp0;
628 }
629 y[row++]=sum1;
630 y[row++]=sum2;
631 v1 =v2; /* Since the next block to be processed starts there*/
632 idx +=sz;
633 break;
634 case 3:
635 sum1 = *zt++;
636 sum2 = *zt++;
637 sum3 = *zt++;
638 v2 = v1 + n;
639 v3 = v2 + n;
640
641 for (n = 0; n< sz-1; n+=2) {
642 i1 = idx[0];
643 i2 = idx[1];
644 idx += 2;
645 tmp0 = x[i1];
646 tmp1 = x[i2];
647 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
648 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
649 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
650 }
651 if (n == sz-1) {
652 tmp0 = x[*idx++];
653 sum1 += *v1++ * tmp0;
654 sum2 += *v2++ * tmp0;
655 sum3 += *v3++ * tmp0;
656 }
657 y[row++]=sum1;
658 y[row++]=sum2;
659 y[row++]=sum3;
660 v1 =v3; /* Since the next block to be processed starts there*/
661 idx +=2*sz;
662 break;
663 case 4:
664 sum1 = *zt++;
665 sum2 = *zt++;
666 sum3 = *zt++;
667 sum4 = *zt++;
668 v2 = v1 + n;
669 v3 = v2 + n;
670 v4 = v3 + n;
671
672 for (n = 0; n< sz-1; n+=2) {
673 i1 = idx[0];
674 i2 = idx[1];
675 idx += 2;
676 tmp0 = x[i1];
677 tmp1 = x[i2];
678 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
679 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
680 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
681 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
682 }
683 if (n == sz-1) {
684 tmp0 = x[*idx++];
685 sum1 += *v1++ * tmp0;
686 sum2 += *v2++ * tmp0;
687 sum3 += *v3++ * tmp0;
688 sum4 += *v4++ * tmp0;
689 }
690 y[row++]=sum1;
691 y[row++]=sum2;
692 y[row++]=sum3;
693 y[row++]=sum4;
694 v1 =v4; /* Since the next block to be processed starts there*/
695 idx +=3*sz;
696 break;
697 case 5:
698 sum1 = *zt++;
699 sum2 = *zt++;
700 sum3 = *zt++;
701 sum4 = *zt++;
702 sum5 = *zt++;
703 v2 = v1 + n;
704 v3 = v2 + n;
705 v4 = v3 + n;
706 v5 = v4 + n;
707
708 for (n = 0; n<sz-1; n+=2) {
709 i1 = idx[0];
710 i2 = idx[1];
711 idx += 2;
712 tmp0 = x[i1];
713 tmp1 = x[i2];
714 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
715 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
716 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
717 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
718 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
719 }
720 if (n == sz-1) {
721 tmp0 = x[*idx++];
722 sum1 += *v1++ * tmp0;
723 sum2 += *v2++ * tmp0;
724 sum3 += *v3++ * tmp0;
725 sum4 += *v4++ * tmp0;
726 sum5 += *v5++ * tmp0;
727 }
728 y[row++]=sum1;
729 y[row++]=sum2;
730 y[row++]=sum3;
731 y[row++]=sum4;
732 y[row++]=sum5;
733 v1 =v5; /* Since the next block to be processed starts there */
734 idx +=4*sz;
735 break;
736 default:
737 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
738 }
739 }
740 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
741 ierr = VecRestoreArrayPair(zz,yy,&z,&y);CHKERRQ(ierr);
742 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
743 PetscFunctionReturn(0);
744 }
745
746 /* ----------------------------------------------------------- */
MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)747 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
748 {
749 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
750 IS iscol = a->col,isrow = a->row;
751 PetscErrorCode ierr;
752 const PetscInt *r,*c,*rout,*cout;
753 PetscInt i,j,n = A->rmap->n,nz;
754 PetscInt node_max,*ns,row,nsz,aii,i0,i1;
755 const PetscInt *ai = a->i,*a_j = a->j,*vi,*ad,*aj;
756 PetscScalar *x,*tmp,*tmps,tmp0,tmp1;
757 PetscScalar sum1,sum2,sum3,sum4,sum5;
758 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
759 const PetscScalar *b;
760
761 PetscFunctionBegin;
762 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
763 node_max = a->inode.node_count;
764 ns = a->inode.size; /* Node Size array */
765
766 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
767 ierr = VecGetArrayWrite(xx,&x);CHKERRQ(ierr);
768 tmp = a->solve_work;
769
770 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
771 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
772
773 /* forward solve the lower triangular */
774 tmps = tmp;
775 aa = a_a;
776 aj = a_j;
777 ad = a->diag;
778
779 for (i = 0,row = 0; i< node_max; ++i) {
780 nsz = ns[i];
781 aii = ai[row];
782 v1 = aa + aii;
783 vi = aj + aii;
784 nz = ad[row]- aii;
785 if (i < node_max-1) {
786 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
787 * but our indexing to determine it's size could. */
788 PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
789 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
790 PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
791 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
792 }
793
794 switch (nsz) { /* Each loop in 'case' is unrolled */
795 case 1:
796 sum1 = b[*r++];
797 for (j=0; j<nz-1; j+=2) {
798 i0 = vi[0];
799 i1 = vi[1];
800 vi +=2;
801 tmp0 = tmps[i0];
802 tmp1 = tmps[i1];
803 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
804 }
805 if (j == nz-1) {
806 tmp0 = tmps[*vi++];
807 sum1 -= *v1++ *tmp0;
808 }
809 tmp[row++]=sum1;
810 break;
811 case 2:
812 sum1 = b[*r++];
813 sum2 = b[*r++];
814 v2 = aa + ai[row+1];
815
816 for (j=0; j<nz-1; j+=2) {
817 i0 = vi[0];
818 i1 = vi[1];
819 vi +=2;
820 tmp0 = tmps[i0];
821 tmp1 = tmps[i1];
822 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
823 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
824 }
825 if (j == nz-1) {
826 tmp0 = tmps[*vi++];
827 sum1 -= *v1++ *tmp0;
828 sum2 -= *v2++ *tmp0;
829 }
830 sum2 -= *v2++ *sum1;
831 tmp[row++]=sum1;
832 tmp[row++]=sum2;
833 break;
834 case 3:
835 sum1 = b[*r++];
836 sum2 = b[*r++];
837 sum3 = b[*r++];
838 v2 = aa + ai[row+1];
839 v3 = aa + ai[row+2];
840
841 for (j=0; j<nz-1; j+=2) {
842 i0 = vi[0];
843 i1 = vi[1];
844 vi +=2;
845 tmp0 = tmps[i0];
846 tmp1 = tmps[i1];
847 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
848 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
849 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
850 }
851 if (j == nz-1) {
852 tmp0 = tmps[*vi++];
853 sum1 -= *v1++ *tmp0;
854 sum2 -= *v2++ *tmp0;
855 sum3 -= *v3++ *tmp0;
856 }
857 sum2 -= *v2++ * sum1;
858 sum3 -= *v3++ * sum1;
859 sum3 -= *v3++ * sum2;
860
861 tmp[row++]=sum1;
862 tmp[row++]=sum2;
863 tmp[row++]=sum3;
864 break;
865
866 case 4:
867 sum1 = b[*r++];
868 sum2 = b[*r++];
869 sum3 = b[*r++];
870 sum4 = b[*r++];
871 v2 = aa + ai[row+1];
872 v3 = aa + ai[row+2];
873 v4 = aa + ai[row+3];
874
875 for (j=0; j<nz-1; j+=2) {
876 i0 = vi[0];
877 i1 = vi[1];
878 vi +=2;
879 tmp0 = tmps[i0];
880 tmp1 = tmps[i1];
881 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
882 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
883 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
884 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
885 }
886 if (j == nz-1) {
887 tmp0 = tmps[*vi++];
888 sum1 -= *v1++ *tmp0;
889 sum2 -= *v2++ *tmp0;
890 sum3 -= *v3++ *tmp0;
891 sum4 -= *v4++ *tmp0;
892 }
893 sum2 -= *v2++ * sum1;
894 sum3 -= *v3++ * sum1;
895 sum4 -= *v4++ * sum1;
896 sum3 -= *v3++ * sum2;
897 sum4 -= *v4++ * sum2;
898 sum4 -= *v4++ * sum3;
899
900 tmp[row++]=sum1;
901 tmp[row++]=sum2;
902 tmp[row++]=sum3;
903 tmp[row++]=sum4;
904 break;
905 case 5:
906 sum1 = b[*r++];
907 sum2 = b[*r++];
908 sum3 = b[*r++];
909 sum4 = b[*r++];
910 sum5 = b[*r++];
911 v2 = aa + ai[row+1];
912 v3 = aa + ai[row+2];
913 v4 = aa + ai[row+3];
914 v5 = aa + ai[row+4];
915
916 for (j=0; j<nz-1; j+=2) {
917 i0 = vi[0];
918 i1 = vi[1];
919 vi +=2;
920 tmp0 = tmps[i0];
921 tmp1 = tmps[i1];
922 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
923 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
924 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
925 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
926 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
927 }
928 if (j == nz-1) {
929 tmp0 = tmps[*vi++];
930 sum1 -= *v1++ *tmp0;
931 sum2 -= *v2++ *tmp0;
932 sum3 -= *v3++ *tmp0;
933 sum4 -= *v4++ *tmp0;
934 sum5 -= *v5++ *tmp0;
935 }
936
937 sum2 -= *v2++ * sum1;
938 sum3 -= *v3++ * sum1;
939 sum4 -= *v4++ * sum1;
940 sum5 -= *v5++ * sum1;
941 sum3 -= *v3++ * sum2;
942 sum4 -= *v4++ * sum2;
943 sum5 -= *v5++ * sum2;
944 sum4 -= *v4++ * sum3;
945 sum5 -= *v5++ * sum3;
946 sum5 -= *v5++ * sum4;
947
948 tmp[row++]=sum1;
949 tmp[row++]=sum2;
950 tmp[row++]=sum3;
951 tmp[row++]=sum4;
952 tmp[row++]=sum5;
953 break;
954 default:
955 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
956 }
957 }
958 /* backward solve the upper triangular */
959 for (i=node_max -1,row = n-1; i>=0; i--) {
960 nsz = ns[i];
961 aii = ai[row+1] -1;
962 v1 = aa + aii;
963 vi = aj + aii;
964 nz = aii- ad[row];
965 switch (nsz) { /* Each loop in 'case' is unrolled */
966 case 1:
967 sum1 = tmp[row];
968
969 for (j=nz; j>1; j-=2) {
970 vi -=2;
971 i0 = vi[2];
972 i1 = vi[1];
973 tmp0 = tmps[i0];
974 tmp1 = tmps[i1];
975 v1 -= 2;
976 sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
977 }
978 if (j==1) {
979 tmp0 = tmps[*vi--];
980 sum1 -= *v1-- * tmp0;
981 }
982 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
983 break;
984 case 2:
985 sum1 = tmp[row];
986 sum2 = tmp[row -1];
987 v2 = aa + ai[row]-1;
988 for (j=nz; j>1; j-=2) {
989 vi -=2;
990 i0 = vi[2];
991 i1 = vi[1];
992 tmp0 = tmps[i0];
993 tmp1 = tmps[i1];
994 v1 -= 2;
995 v2 -= 2;
996 sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
997 sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
998 }
999 if (j==1) {
1000 tmp0 = tmps[*vi--];
1001 sum1 -= *v1-- * tmp0;
1002 sum2 -= *v2-- * tmp0;
1003 }
1004
1005 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1006 sum2 -= *v2-- * tmp0;
1007 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1008 break;
1009 case 3:
1010 sum1 = tmp[row];
1011 sum2 = tmp[row -1];
1012 sum3 = tmp[row -2];
1013 v2 = aa + ai[row]-1;
1014 v3 = aa + ai[row -1]-1;
1015 for (j=nz; j>1; j-=2) {
1016 vi -=2;
1017 i0 = vi[2];
1018 i1 = vi[1];
1019 tmp0 = tmps[i0];
1020 tmp1 = tmps[i1];
1021 v1 -= 2;
1022 v2 -= 2;
1023 v3 -= 2;
1024 sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1025 sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1026 sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1027 }
1028 if (j==1) {
1029 tmp0 = tmps[*vi--];
1030 sum1 -= *v1-- * tmp0;
1031 sum2 -= *v2-- * tmp0;
1032 sum3 -= *v3-- * tmp0;
1033 }
1034 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1035 sum2 -= *v2-- * tmp0;
1036 sum3 -= *v3-- * tmp0;
1037 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1038 sum3 -= *v3-- * tmp0;
1039 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1040
1041 break;
1042 case 4:
1043 sum1 = tmp[row];
1044 sum2 = tmp[row -1];
1045 sum3 = tmp[row -2];
1046 sum4 = tmp[row -3];
1047 v2 = aa + ai[row]-1;
1048 v3 = aa + ai[row -1]-1;
1049 v4 = aa + ai[row -2]-1;
1050
1051 for (j=nz; j>1; j-=2) {
1052 vi -=2;
1053 i0 = vi[2];
1054 i1 = vi[1];
1055 tmp0 = tmps[i0];
1056 tmp1 = tmps[i1];
1057 v1 -= 2;
1058 v2 -= 2;
1059 v3 -= 2;
1060 v4 -= 2;
1061 sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1062 sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1063 sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1064 sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1065 }
1066 if (j==1) {
1067 tmp0 = tmps[*vi--];
1068 sum1 -= *v1-- * tmp0;
1069 sum2 -= *v2-- * tmp0;
1070 sum3 -= *v3-- * tmp0;
1071 sum4 -= *v4-- * tmp0;
1072 }
1073
1074 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1075 sum2 -= *v2-- * tmp0;
1076 sum3 -= *v3-- * tmp0;
1077 sum4 -= *v4-- * tmp0;
1078 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1079 sum3 -= *v3-- * tmp0;
1080 sum4 -= *v4-- * tmp0;
1081 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1082 sum4 -= *v4-- * tmp0;
1083 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1084 break;
1085 case 5:
1086 sum1 = tmp[row];
1087 sum2 = tmp[row -1];
1088 sum3 = tmp[row -2];
1089 sum4 = tmp[row -3];
1090 sum5 = tmp[row -4];
1091 v2 = aa + ai[row]-1;
1092 v3 = aa + ai[row -1]-1;
1093 v4 = aa + ai[row -2]-1;
1094 v5 = aa + ai[row -3]-1;
1095 for (j=nz; j>1; j-=2) {
1096 vi -= 2;
1097 i0 = vi[2];
1098 i1 = vi[1];
1099 tmp0 = tmps[i0];
1100 tmp1 = tmps[i1];
1101 v1 -= 2;
1102 v2 -= 2;
1103 v3 -= 2;
1104 v4 -= 2;
1105 v5 -= 2;
1106 sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1107 sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1108 sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1109 sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1110 sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1111 }
1112 if (j==1) {
1113 tmp0 = tmps[*vi--];
1114 sum1 -= *v1-- * tmp0;
1115 sum2 -= *v2-- * tmp0;
1116 sum3 -= *v3-- * tmp0;
1117 sum4 -= *v4-- * tmp0;
1118 sum5 -= *v5-- * tmp0;
1119 }
1120
1121 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1122 sum2 -= *v2-- * tmp0;
1123 sum3 -= *v3-- * tmp0;
1124 sum4 -= *v4-- * tmp0;
1125 sum5 -= *v5-- * tmp0;
1126 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1127 sum3 -= *v3-- * tmp0;
1128 sum4 -= *v4-- * tmp0;
1129 sum5 -= *v5-- * tmp0;
1130 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1131 sum4 -= *v4-- * tmp0;
1132 sum5 -= *v5-- * tmp0;
1133 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1134 sum5 -= *v5-- * tmp0;
1135 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1136 break;
1137 default:
1138 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1139 }
1140 }
1141 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1142 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1143 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1144 ierr = VecRestoreArrayWrite(xx,&x);CHKERRQ(ierr);
1145 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1146 PetscFunctionReturn(0);
1147 }
1148
MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo * info)1149 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1150 {
1151 Mat C =B;
1152 Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data;
1153 IS isrow = b->row,isicol = b->icol;
1154 PetscErrorCode ierr;
1155 const PetscInt *r,*ic,*ics;
1156 const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1157 PetscInt i,j,k,nz,nzL,row,*pj;
1158 const PetscInt *ajtmp,*bjtmp;
1159 MatScalar *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1160 const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1161 FactorShiftCtx sctx;
1162 const PetscInt *ddiag;
1163 PetscReal rs;
1164 MatScalar d;
1165 PetscInt inod,nodesz,node_max,col;
1166 const PetscInt *ns;
1167 PetscInt *tmp_vec1,*tmp_vec2,*nsmap;
1168
1169 PetscFunctionBegin;
1170 /* MatPivotSetUp(): initialize shift context sctx */
1171 ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1172
1173 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1174 ddiag = a->diag;
1175 sctx.shift_top = info->zeropivot;
1176 for (i=0; i<n; i++) {
1177 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1178 d = (aa)[ddiag[i]];
1179 rs = -PetscAbsScalar(d) - PetscRealPart(d);
1180 v = aa+ai[i];
1181 nz = ai[i+1] - ai[i];
1182 for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]);
1183 if (rs>sctx.shift_top) sctx.shift_top = rs;
1184 }
1185 sctx.shift_top *= 1.1;
1186 sctx.nshift_max = 5;
1187 sctx.shift_lo = 0.;
1188 sctx.shift_hi = 1.;
1189 }
1190
1191 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1192 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1193
1194 ierr = PetscCalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4);CHKERRQ(ierr);
1195 ics = ic;
1196
1197 node_max = a->inode.node_count;
1198 ns = a->inode.size;
1199 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1200
1201 /* If max inode size > 4, split it into two inodes.*/
1202 /* also map the inode sizes according to the ordering */
1203 ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr);
1204 for (i=0,j=0; i<node_max; ++i,++j) {
1205 if (ns[i] > 4) {
1206 tmp_vec1[j] = 4;
1207 ++j;
1208 tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1209 } else {
1210 tmp_vec1[j] = ns[i];
1211 }
1212 }
1213 /* Use the correct node_max */
1214 node_max = j;
1215
1216 /* Now reorder the inode info based on mat re-ordering info */
1217 /* First create a row -> inode_size_array_index map */
1218 ierr = PetscMalloc1(n+1,&nsmap);CHKERRQ(ierr);
1219 ierr = PetscMalloc1(node_max+1,&tmp_vec2);CHKERRQ(ierr);
1220 for (i=0,row=0; i<node_max; i++) {
1221 nodesz = tmp_vec1[i];
1222 for (j=0; j<nodesz; j++,row++) {
1223 nsmap[row] = i;
1224 }
1225 }
1226 /* Using nsmap, create a reordered ns structure */
1227 for (i=0,j=0; i< node_max; i++) {
1228 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1229 tmp_vec2[i] = nodesz;
1230 j += nodesz;
1231 }
1232 ierr = PetscFree(nsmap);CHKERRQ(ierr);
1233 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1234
1235 /* Now use the correct ns */
1236 ns = tmp_vec2;
1237
1238 do {
1239 sctx.newshift = PETSC_FALSE;
1240 /* Now loop over each block-row, and do the factorization */
1241 for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */
1242 nodesz = ns[inod];
1243
1244 switch (nodesz) {
1245 case 1:
1246 /*----------*/
1247 /* zero rtmp1 */
1248 /* L part */
1249 nz = bi[i+1] - bi[i];
1250 bjtmp = bj + bi[i];
1251 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1252
1253 /* U part */
1254 nz = bdiag[i]-bdiag[i+1];
1255 bjtmp = bj + bdiag[i+1]+1;
1256 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1257
1258 /* load in initial (unfactored row) */
1259 nz = ai[r[i]+1] - ai[r[i]];
1260 ajtmp = aj + ai[r[i]];
1261 v = aa + ai[r[i]];
1262 for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1263
1264 /* ZeropivotApply() */
1265 rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
1266
1267 /* elimination */
1268 bjtmp = bj + bi[i];
1269 row = *bjtmp++;
1270 nzL = bi[i+1] - bi[i];
1271 for (k=0; k < nzL; k++) {
1272 pc = rtmp1 + row;
1273 if (*pc != 0.0) {
1274 pv = b->a + bdiag[row];
1275 mul1 = *pc * (*pv);
1276 *pc = mul1;
1277 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1278 pv = b->a + bdiag[row+1]+1;
1279 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1280 for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1281 ierr = PetscLogFlops(1+2.0*nz);CHKERRQ(ierr);
1282 }
1283 row = *bjtmp++;
1284 }
1285
1286 /* finished row so stick it into b->a */
1287 rs = 0.0;
1288 /* L part */
1289 pv = b->a + bi[i];
1290 pj = b->j + bi[i];
1291 nz = bi[i+1] - bi[i];
1292 for (j=0; j<nz; j++) {
1293 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1294 }
1295
1296 /* U part */
1297 pv = b->a + bdiag[i+1]+1;
1298 pj = b->j + bdiag[i+1]+1;
1299 nz = bdiag[i] - bdiag[i+1]-1;
1300 for (j=0; j<nz; j++) {
1301 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1302 }
1303
1304 /* Check zero pivot */
1305 sctx.rs = rs;
1306 sctx.pv = rtmp1[i];
1307 ierr = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1308 if (sctx.newshift) break;
1309
1310 /* Mark diagonal and invert diagonal for simplier triangular solves */
1311 pv = b->a + bdiag[i];
1312 *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1313 break;
1314
1315 case 2:
1316 /*----------*/
1317 /* zero rtmp1 and rtmp2 */
1318 /* L part */
1319 nz = bi[i+1] - bi[i];
1320 bjtmp = bj + bi[i];
1321 for (j=0; j<nz; j++) {
1322 col = bjtmp[j];
1323 rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1324 }
1325
1326 /* U part */
1327 nz = bdiag[i]-bdiag[i+1];
1328 bjtmp = bj + bdiag[i+1]+1;
1329 for (j=0; j<nz; j++) {
1330 col = bjtmp[j];
1331 rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1332 }
1333
1334 /* load in initial (unfactored row) */
1335 nz = ai[r[i]+1] - ai[r[i]];
1336 ajtmp = aj + ai[r[i]];
1337 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1338 for (j=0; j<nz; j++) {
1339 col = ics[ajtmp[j]];
1340 rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1341 }
1342 /* ZeropivotApply(): shift the diagonal of the matrix */
1343 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1344
1345 /* elimination */
1346 bjtmp = bj + bi[i];
1347 row = *bjtmp++; /* pivot row */
1348 nzL = bi[i+1] - bi[i];
1349 for (k=0; k < nzL; k++) {
1350 pc1 = rtmp1 + row;
1351 pc2 = rtmp2 + row;
1352 if (*pc1 != 0.0 || *pc2 != 0.0) {
1353 pv = b->a + bdiag[row];
1354 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1355 *pc1 = mul1; *pc2 = mul2;
1356
1357 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1358 pv = b->a + bdiag[row+1]+1;
1359 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1360 for (j=0; j<nz; j++) {
1361 col = pj[j];
1362 rtmp1[col] -= mul1 * pv[j];
1363 rtmp2[col] -= mul2 * pv[j];
1364 }
1365 ierr = PetscLogFlops(2+4.0*nz);CHKERRQ(ierr);
1366 }
1367 row = *bjtmp++;
1368 }
1369
1370 /* finished row i; check zero pivot, then stick row i into b->a */
1371 rs = 0.0;
1372 /* L part */
1373 pc1 = b->a + bi[i];
1374 pj = b->j + bi[i];
1375 nz = bi[i+1] - bi[i];
1376 for (j=0; j<nz; j++) {
1377 col = pj[j];
1378 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1379 }
1380 /* U part */
1381 pc1 = b->a + bdiag[i+1]+1;
1382 pj = b->j + bdiag[i+1]+1;
1383 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1384 for (j=0; j<nz; j++) {
1385 col = pj[j];
1386 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1387 }
1388
1389 sctx.rs = rs;
1390 sctx.pv = rtmp1[i];
1391 ierr = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1392 if (sctx.newshift) break;
1393 pc1 = b->a + bdiag[i]; /* Mark diagonal */
1394 *pc1 = 1.0/sctx.pv;
1395
1396 /* Now take care of diagonal 2x2 block. */
1397 pc2 = rtmp2 + i;
1398 if (*pc2 != 0.0) {
1399 mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1400 *pc2 = mul1; /* insert L entry */
1401 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */
1402 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1403 for (j=0; j<nz; j++) {
1404 col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1405 }
1406 ierr = PetscLogFlops(1+2.0*nz);CHKERRQ(ierr);
1407 }
1408
1409 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1410 rs = 0.0;
1411 /* L part */
1412 pc2 = b->a + bi[i+1];
1413 pj = b->j + bi[i+1];
1414 nz = bi[i+2] - bi[i+1];
1415 for (j=0; j<nz; j++) {
1416 col = pj[j];
1417 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1418 }
1419 /* U part */
1420 pc2 = b->a + bdiag[i+2]+1;
1421 pj = b->j + bdiag[i+2]+1;
1422 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1423 for (j=0; j<nz; j++) {
1424 col = pj[j];
1425 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1426 }
1427
1428 sctx.rs = rs;
1429 sctx.pv = rtmp2[i+1];
1430 ierr = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr);
1431 if (sctx.newshift) break;
1432 pc2 = b->a + bdiag[i+1];
1433 *pc2 = 1.0/sctx.pv;
1434 break;
1435
1436 case 3:
1437 /*----------*/
1438 /* zero rtmp */
1439 /* L part */
1440 nz = bi[i+1] - bi[i];
1441 bjtmp = bj + bi[i];
1442 for (j=0; j<nz; j++) {
1443 col = bjtmp[j];
1444 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1445 }
1446
1447 /* U part */
1448 nz = bdiag[i]-bdiag[i+1];
1449 bjtmp = bj + bdiag[i+1]+1;
1450 for (j=0; j<nz; j++) {
1451 col = bjtmp[j];
1452 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1453 }
1454
1455 /* load in initial (unfactored row) */
1456 nz = ai[r[i]+1] - ai[r[i]];
1457 ajtmp = aj + ai[r[i]];
1458 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1459 for (j=0; j<nz; j++) {
1460 col = ics[ajtmp[j]];
1461 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1462 }
1463 /* ZeropivotApply(): shift the diagonal of the matrix */
1464 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1465
1466 /* elimination */
1467 bjtmp = bj + bi[i];
1468 row = *bjtmp++; /* pivot row */
1469 nzL = bi[i+1] - bi[i];
1470 for (k=0; k < nzL; k++) {
1471 pc1 = rtmp1 + row;
1472 pc2 = rtmp2 + row;
1473 pc3 = rtmp3 + row;
1474 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1475 pv = b->a + bdiag[row];
1476 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1477 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1478
1479 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1480 pv = b->a + bdiag[row+1]+1;
1481 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1482 for (j=0; j<nz; j++) {
1483 col = pj[j];
1484 rtmp1[col] -= mul1 * pv[j];
1485 rtmp2[col] -= mul2 * pv[j];
1486 rtmp3[col] -= mul3 * pv[j];
1487 }
1488 ierr = PetscLogFlops(3+6.0*nz);CHKERRQ(ierr);
1489 }
1490 row = *bjtmp++;
1491 }
1492
1493 /* finished row i; check zero pivot, then stick row i into b->a */
1494 rs = 0.0;
1495 /* L part */
1496 pc1 = b->a + bi[i];
1497 pj = b->j + bi[i];
1498 nz = bi[i+1] - bi[i];
1499 for (j=0; j<nz; j++) {
1500 col = pj[j];
1501 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1502 }
1503 /* U part */
1504 pc1 = b->a + bdiag[i+1]+1;
1505 pj = b->j + bdiag[i+1]+1;
1506 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1507 for (j=0; j<nz; j++) {
1508 col = pj[j];
1509 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1510 }
1511
1512 sctx.rs = rs;
1513 sctx.pv = rtmp1[i];
1514 ierr = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1515 if (sctx.newshift) break;
1516 pc1 = b->a + bdiag[i]; /* Mark diag[i] */
1517 *pc1 = 1.0/sctx.pv;
1518
1519 /* Now take care of 1st column of diagonal 3x3 block. */
1520 pc2 = rtmp2 + i;
1521 pc3 = rtmp3 + i;
1522 if (*pc2 != 0.0 || *pc3 != 0.0) {
1523 mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1524 mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1525 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */
1526 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1527 for (j=0; j<nz; j++) {
1528 col = pj[j];
1529 rtmp2[col] -= mul2 * rtmp1[col];
1530 rtmp3[col] -= mul3 * rtmp1[col];
1531 }
1532 ierr = PetscLogFlops(2+4.0*nz);CHKERRQ(ierr);
1533 }
1534
1535 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1536 rs = 0.0;
1537 /* L part */
1538 pc2 = b->a + bi[i+1];
1539 pj = b->j + bi[i+1];
1540 nz = bi[i+2] - bi[i+1];
1541 for (j=0; j<nz; j++) {
1542 col = pj[j];
1543 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1544 }
1545 /* U part */
1546 pc2 = b->a + bdiag[i+2]+1;
1547 pj = b->j + bdiag[i+2]+1;
1548 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1549 for (j=0; j<nz; j++) {
1550 col = pj[j];
1551 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1552 }
1553
1554 sctx.rs = rs;
1555 sctx.pv = rtmp2[i+1];
1556 ierr = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr);
1557 if (sctx.newshift) break;
1558 pc2 = b->a + bdiag[i+1];
1559 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1560
1561 /* Now take care of 2nd column of diagonal 3x3 block. */
1562 pc3 = rtmp3 + i+1;
1563 if (*pc3 != 0.0) {
1564 mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1565 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */
1566 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1567 for (j=0; j<nz; j++) {
1568 col = pj[j];
1569 rtmp3[col] -= mul3 * rtmp2[col];
1570 }
1571 ierr = PetscLogFlops(1+2.0*nz);CHKERRQ(ierr);
1572 }
1573
1574 /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1575 rs = 0.0;
1576 /* L part */
1577 pc3 = b->a + bi[i+2];
1578 pj = b->j + bi[i+2];
1579 nz = bi[i+3] - bi[i+2];
1580 for (j=0; j<nz; j++) {
1581 col = pj[j];
1582 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1583 }
1584 /* U part */
1585 pc3 = b->a + bdiag[i+3]+1;
1586 pj = b->j + bdiag[i+3]+1;
1587 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1588 for (j=0; j<nz; j++) {
1589 col = pj[j];
1590 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1591 }
1592
1593 sctx.rs = rs;
1594 sctx.pv = rtmp3[i+2];
1595 ierr = MatPivotCheck(B,A,info,&sctx,i+2);CHKERRQ(ierr);
1596 if (sctx.newshift) break;
1597 pc3 = b->a + bdiag[i+2];
1598 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1599 break;
1600 case 4:
1601 /*----------*/
1602 /* zero rtmp */
1603 /* L part */
1604 nz = bi[i+1] - bi[i];
1605 bjtmp = bj + bi[i];
1606 for (j=0; j<nz; j++) {
1607 col = bjtmp[j];
1608 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1609 }
1610
1611 /* U part */
1612 nz = bdiag[i]-bdiag[i+1];
1613 bjtmp = bj + bdiag[i+1]+1;
1614 for (j=0; j<nz; j++) {
1615 col = bjtmp[j];
1616 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1617 }
1618
1619 /* load in initial (unfactored row) */
1620 nz = ai[r[i]+1] - ai[r[i]];
1621 ajtmp = aj + ai[r[i]];
1622 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1623 for (j=0; j<nz; j++) {
1624 col = ics[ajtmp[j]];
1625 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1626 }
1627 /* ZeropivotApply(): shift the diagonal of the matrix */
1628 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;
1629
1630 /* elimination */
1631 bjtmp = bj + bi[i];
1632 row = *bjtmp++; /* pivot row */
1633 nzL = bi[i+1] - bi[i];
1634 for (k=0; k < nzL; k++) {
1635 pc1 = rtmp1 + row;
1636 pc2 = rtmp2 + row;
1637 pc3 = rtmp3 + row;
1638 pc4 = rtmp4 + row;
1639 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1640 pv = b->a + bdiag[row];
1641 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1642 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;
1643
1644 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1645 pv = b->a + bdiag[row+1]+1;
1646 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1647 for (j=0; j<nz; j++) {
1648 col = pj[j];
1649 rtmp1[col] -= mul1 * pv[j];
1650 rtmp2[col] -= mul2 * pv[j];
1651 rtmp3[col] -= mul3 * pv[j];
1652 rtmp4[col] -= mul4 * pv[j];
1653 }
1654 ierr = PetscLogFlops(4+8.0*nz);CHKERRQ(ierr);
1655 }
1656 row = *bjtmp++;
1657 }
1658
1659 /* finished row i; check zero pivot, then stick row i into b->a */
1660 rs = 0.0;
1661 /* L part */
1662 pc1 = b->a + bi[i];
1663 pj = b->j + bi[i];
1664 nz = bi[i+1] - bi[i];
1665 for (j=0; j<nz; j++) {
1666 col = pj[j];
1667 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1668 }
1669 /* U part */
1670 pc1 = b->a + bdiag[i+1]+1;
1671 pj = b->j + bdiag[i+1]+1;
1672 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1673 for (j=0; j<nz; j++) {
1674 col = pj[j];
1675 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1676 }
1677
1678 sctx.rs = rs;
1679 sctx.pv = rtmp1[i];
1680 ierr = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1681 if (sctx.newshift) break;
1682 pc1 = b->a + bdiag[i]; /* Mark diag[i] */
1683 *pc1 = 1.0/sctx.pv;
1684
1685 /* Now take care of 1st column of diagonal 4x4 block. */
1686 pc2 = rtmp2 + i;
1687 pc3 = rtmp3 + i;
1688 pc4 = rtmp4 + i;
1689 if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1690 mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1691 mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1692 mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1693 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */
1694 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1695 for (j=0; j<nz; j++) {
1696 col = pj[j];
1697 rtmp2[col] -= mul2 * rtmp1[col];
1698 rtmp3[col] -= mul3 * rtmp1[col];
1699 rtmp4[col] -= mul4 * rtmp1[col];
1700 }
1701 ierr = PetscLogFlops(3+6.0*nz);CHKERRQ(ierr);
1702 }
1703
1704 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1705 rs = 0.0;
1706 /* L part */
1707 pc2 = b->a + bi[i+1];
1708 pj = b->j + bi[i+1];
1709 nz = bi[i+2] - bi[i+1];
1710 for (j=0; j<nz; j++) {
1711 col = pj[j];
1712 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1713 }
1714 /* U part */
1715 pc2 = b->a + bdiag[i+2]+1;
1716 pj = b->j + bdiag[i+2]+1;
1717 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1718 for (j=0; j<nz; j++) {
1719 col = pj[j];
1720 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1721 }
1722
1723 sctx.rs = rs;
1724 sctx.pv = rtmp2[i+1];
1725 ierr = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr);
1726 if (sctx.newshift) break;
1727 pc2 = b->a + bdiag[i+1];
1728 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1729
1730 /* Now take care of 2nd column of diagonal 4x4 block. */
1731 pc3 = rtmp3 + i+1;
1732 pc4 = rtmp4 + i+1;
1733 if (*pc3 != 0.0 || *pc4 != 0.0) {
1734 mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1735 mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1736 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */
1737 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1738 for (j=0; j<nz; j++) {
1739 col = pj[j];
1740 rtmp3[col] -= mul3 * rtmp2[col];
1741 rtmp4[col] -= mul4 * rtmp2[col];
1742 }
1743 ierr = PetscLogFlops(4.0*nz);CHKERRQ(ierr);
1744 }
1745
1746 /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1747 rs = 0.0;
1748 /* L part */
1749 pc3 = b->a + bi[i+2];
1750 pj = b->j + bi[i+2];
1751 nz = bi[i+3] - bi[i+2];
1752 for (j=0; j<nz; j++) {
1753 col = pj[j];
1754 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1755 }
1756 /* U part */
1757 pc3 = b->a + bdiag[i+3]+1;
1758 pj = b->j + bdiag[i+3]+1;
1759 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1760 for (j=0; j<nz; j++) {
1761 col = pj[j];
1762 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1763 }
1764
1765 sctx.rs = rs;
1766 sctx.pv = rtmp3[i+2];
1767 ierr = MatPivotCheck(B,A,info,&sctx,i+2);CHKERRQ(ierr);
1768 if (sctx.newshift) break;
1769 pc3 = b->a + bdiag[i+2];
1770 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1771
1772 /* Now take care of 3rd column of diagonal 4x4 block. */
1773 pc4 = rtmp4 + i+2;
1774 if (*pc4 != 0.0) {
1775 mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1776 pj = b->j + bdiag[i+3]+1; /* beginning of U(i+2,:) */
1777 nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1778 for (j=0; j<nz; j++) {
1779 col = pj[j];
1780 rtmp4[col] -= mul4 * rtmp3[col];
1781 }
1782 ierr = PetscLogFlops(1+2.0*nz);CHKERRQ(ierr);
1783 }
1784
1785 /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1786 rs = 0.0;
1787 /* L part */
1788 pc4 = b->a + bi[i+3];
1789 pj = b->j + bi[i+3];
1790 nz = bi[i+4] - bi[i+3];
1791 for (j=0; j<nz; j++) {
1792 col = pj[j];
1793 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1794 }
1795 /* U part */
1796 pc4 = b->a + bdiag[i+4]+1;
1797 pj = b->j + bdiag[i+4]+1;
1798 nz = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1799 for (j=0; j<nz; j++) {
1800 col = pj[j];
1801 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1802 }
1803
1804 sctx.rs = rs;
1805 sctx.pv = rtmp4[i+3];
1806 ierr = MatPivotCheck(B,A,info,&sctx,i+3);CHKERRQ(ierr);
1807 if (sctx.newshift) break;
1808 pc4 = b->a + bdiag[i+3];
1809 *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1810 break;
1811
1812 default:
1813 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1814 }
1815 if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1816 i += nodesz; /* Update the row */
1817 }
1818
1819 /* MatPivotRefine() */
1820 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) {
1821 /*
1822 * if no shift in this attempt & shifting & started shifting & can refine,
1823 * then try lower shift
1824 */
1825 sctx.shift_hi = sctx.shift_fraction;
1826 sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1827 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top;
1828 sctx.newshift = PETSC_TRUE;
1829 sctx.nshift++;
1830 }
1831 } while (sctx.newshift);
1832
1833 ierr = PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);CHKERRQ(ierr);
1834 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1835 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1836 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1837
1838 if (b->inode.size) {
1839 C->ops->solve = MatSolve_SeqAIJ_Inode;
1840 } else {
1841 C->ops->solve = MatSolve_SeqAIJ;
1842 }
1843 C->ops->solveadd = MatSolveAdd_SeqAIJ;
1844 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ;
1845 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1846 C->ops->matsolve = MatMatSolve_SeqAIJ;
1847 C->assembled = PETSC_TRUE;
1848 C->preallocated = PETSC_TRUE;
1849
1850 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
1851
1852 /* MatShiftView(A,info,&sctx) */
1853 if (sctx.nshift) {
1854 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1855 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr);
1856 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1857 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr);
1858 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1859 ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);CHKERRQ(ierr);
1860 }
1861 }
1862 PetscFunctionReturn(0);
1863 }
1864
MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo * info)1865 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1866 {
1867 Mat C = B;
1868 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1869 IS iscol = b->col,isrow = b->row,isicol = b->icol;
1870 PetscErrorCode ierr;
1871 const PetscInt *r,*ic,*c,*ics;
1872 PetscInt n = A->rmap->n,*bi = b->i;
1873 PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1874 PetscInt i,j,idx,*bd = b->diag,node_max,nodesz;
1875 PetscInt *ai = a->i,*aj = a->j;
1876 PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1877 PetscScalar mul1,mul2,mul3,tmp;
1878 MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1879 const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1880 PetscReal rs=0.0;
1881 FactorShiftCtx sctx;
1882
1883 PetscFunctionBegin;
1884 sctx.shift_top = 0;
1885 sctx.nshift_max = 0;
1886 sctx.shift_lo = 0;
1887 sctx.shift_hi = 0;
1888 sctx.shift_fraction = 0;
1889
1890 /* if both shift schemes are chosen by user, only use info->shiftpd */
1891 if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1892 sctx.shift_top = 0;
1893 for (i=0; i<n; i++) {
1894 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1895 rs = 0.0;
1896 ajtmp = aj + ai[i];
1897 rtmp1 = aa + ai[i];
1898 nz = ai[i+1] - ai[i];
1899 for (j=0; j<nz; j++) {
1900 if (*ajtmp != i) {
1901 rs += PetscAbsScalar(*rtmp1++);
1902 } else {
1903 rs -= PetscRealPart(*rtmp1++);
1904 }
1905 ajtmp++;
1906 }
1907 if (rs>sctx.shift_top) sctx.shift_top = rs;
1908 }
1909 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1910 sctx.shift_top *= 1.1;
1911 sctx.nshift_max = 5;
1912 sctx.shift_lo = 0.;
1913 sctx.shift_hi = 1.;
1914 }
1915 sctx.shift_amount = 0;
1916 sctx.nshift = 0;
1917
1918 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1919 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1920 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1921 ierr = PetscCalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33);CHKERRQ(ierr);
1922 ics = ic;
1923
1924 node_max = a->inode.node_count;
1925 ns = a->inode.size;
1926 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1927
1928 /* If max inode size > 3, split it into two inodes.*/
1929 /* also map the inode sizes according to the ordering */
1930 ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr);
1931 for (i=0,j=0; i<node_max; ++i,++j) {
1932 if (ns[i]>3) {
1933 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */
1934 ++j;
1935 tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1936 } else {
1937 tmp_vec1[j] = ns[i];
1938 }
1939 }
1940 /* Use the correct node_max */
1941 node_max = j;
1942
1943 /* Now reorder the inode info based on mat re-ordering info */
1944 /* First create a row -> inode_size_array_index map */
1945 ierr = PetscMalloc2(n+1,&nsmap,node_max+1,&tmp_vec2);CHKERRQ(ierr);
1946 for (i=0,row=0; i<node_max; i++) {
1947 nodesz = tmp_vec1[i];
1948 for (j=0; j<nodesz; j++,row++) {
1949 nsmap[row] = i;
1950 }
1951 }
1952 /* Using nsmap, create a reordered ns structure */
1953 for (i=0,j=0; i< node_max; i++) {
1954 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1955 tmp_vec2[i] = nodesz;
1956 j += nodesz;
1957 }
1958 ierr = PetscFree2(nsmap,tmp_vec1);CHKERRQ(ierr);
1959 /* Now use the correct ns */
1960 ns = tmp_vec2;
1961
1962 do {
1963 sctx.newshift = PETSC_FALSE;
1964 /* Now loop over each block-row, and do the factorization */
1965 for (i=0,row=0; i<node_max; i++) {
1966 nodesz = ns[i];
1967 nz = bi[row+1] - bi[row];
1968 bjtmp = bj + bi[row];
1969
1970 switch (nodesz) {
1971 case 1:
1972 for (j=0; j<nz; j++) {
1973 idx = bjtmp[j];
1974 rtmp11[idx] = 0.0;
1975 }
1976
1977 /* load in initial (unfactored row) */
1978 idx = r[row];
1979 nz_tmp = ai[idx+1] - ai[idx];
1980 ajtmp = aj + ai[idx];
1981 v1 = aa + ai[idx];
1982
1983 for (j=0; j<nz_tmp; j++) {
1984 idx = ics[ajtmp[j]];
1985 rtmp11[idx] = v1[j];
1986 }
1987 rtmp11[ics[r[row]]] += sctx.shift_amount;
1988
1989 prow = *bjtmp++;
1990 while (prow < row) {
1991 pc1 = rtmp11 + prow;
1992 if (*pc1 != 0.0) {
1993 pv = ba + bd[prow];
1994 pj = nbj + bd[prow];
1995 mul1 = *pc1 * *pv++;
1996 *pc1 = mul1;
1997 nz_tmp = bi[prow+1] - bd[prow] - 1;
1998 ierr = PetscLogFlops(1+2.0*nz_tmp);CHKERRQ(ierr);
1999 for (j=0; j<nz_tmp; j++) {
2000 tmp = pv[j];
2001 idx = pj[j];
2002 rtmp11[idx] -= mul1 * tmp;
2003 }
2004 }
2005 prow = *bjtmp++;
2006 }
2007 pj = bj + bi[row];
2008 pc1 = ba + bi[row];
2009
2010 sctx.pv = rtmp11[row];
2011 rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2012 rs = 0.0;
2013 for (j=0; j<nz; j++) {
2014 idx = pj[j];
2015 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2016 if (idx != row) rs += PetscAbsScalar(pc1[j]);
2017 }
2018 sctx.rs = rs;
2019 ierr = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr);
2020 if (sctx.newshift) goto endofwhile;
2021 break;
2022
2023 case 2:
2024 for (j=0; j<nz; j++) {
2025 idx = bjtmp[j];
2026 rtmp11[idx] = 0.0;
2027 rtmp22[idx] = 0.0;
2028 }
2029
2030 /* load in initial (unfactored row) */
2031 idx = r[row];
2032 nz_tmp = ai[idx+1] - ai[idx];
2033 ajtmp = aj + ai[idx];
2034 v1 = aa + ai[idx];
2035 v2 = aa + ai[idx+1];
2036 for (j=0; j<nz_tmp; j++) {
2037 idx = ics[ajtmp[j]];
2038 rtmp11[idx] = v1[j];
2039 rtmp22[idx] = v2[j];
2040 }
2041 rtmp11[ics[r[row]]] += sctx.shift_amount;
2042 rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2043
2044 prow = *bjtmp++;
2045 while (prow < row) {
2046 pc1 = rtmp11 + prow;
2047 pc2 = rtmp22 + prow;
2048 if (*pc1 != 0.0 || *pc2 != 0.0) {
2049 pv = ba + bd[prow];
2050 pj = nbj + bd[prow];
2051 mul1 = *pc1 * *pv;
2052 mul2 = *pc2 * *pv;
2053 ++pv;
2054 *pc1 = mul1;
2055 *pc2 = mul2;
2056
2057 nz_tmp = bi[prow+1] - bd[prow] - 1;
2058 for (j=0; j<nz_tmp; j++) {
2059 tmp = pv[j];
2060 idx = pj[j];
2061 rtmp11[idx] -= mul1 * tmp;
2062 rtmp22[idx] -= mul2 * tmp;
2063 }
2064 ierr = PetscLogFlops(2+4.0*nz_tmp);CHKERRQ(ierr);
2065 }
2066 prow = *bjtmp++;
2067 }
2068
2069 /* Now take care of diagonal 2x2 block. Note: prow = row here */
2070 pc1 = rtmp11 + prow;
2071 pc2 = rtmp22 + prow;
2072
2073 sctx.pv = *pc1;
2074 pj = bj + bi[prow];
2075 rs = 0.0;
2076 for (j=0; j<nz; j++) {
2077 idx = pj[j];
2078 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2079 }
2080 sctx.rs = rs;
2081 ierr = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr);
2082 if (sctx.newshift) goto endofwhile;
2083
2084 if (*pc2 != 0.0) {
2085 pj = nbj + bd[prow];
2086 mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2087 *pc2 = mul2;
2088 nz_tmp = bi[prow+1] - bd[prow] - 1;
2089 for (j=0; j<nz_tmp; j++) {
2090 idx = pj[j];
2091 tmp = rtmp11[idx];
2092 rtmp22[idx] -= mul2 * tmp;
2093 }
2094 ierr = PetscLogFlops(1+2.0*nz_tmp);CHKERRQ(ierr);
2095 }
2096
2097 pj = bj + bi[row];
2098 pc1 = ba + bi[row];
2099 pc2 = ba + bi[row+1];
2100
2101 sctx.pv = rtmp22[row+1];
2102 rs = 0.0;
2103 rtmp11[row] = 1.0/rtmp11[row];
2104 rtmp22[row+1] = 1.0/rtmp22[row+1];
2105 /* copy row entries from dense representation to sparse */
2106 for (j=0; j<nz; j++) {
2107 idx = pj[j];
2108 pc1[j] = rtmp11[idx];
2109 pc2[j] = rtmp22[idx];
2110 if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2111 }
2112 sctx.rs = rs;
2113 ierr = MatPivotCheck(B,A,info,&sctx,row+1);CHKERRQ(ierr);
2114 if (sctx.newshift) goto endofwhile;
2115 break;
2116
2117 case 3:
2118 for (j=0; j<nz; j++) {
2119 idx = bjtmp[j];
2120 rtmp11[idx] = 0.0;
2121 rtmp22[idx] = 0.0;
2122 rtmp33[idx] = 0.0;
2123 }
2124 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2125 idx = r[row];
2126 nz_tmp = ai[idx+1] - ai[idx];
2127 ajtmp = aj + ai[idx];
2128 v1 = aa + ai[idx];
2129 v2 = aa + ai[idx+1];
2130 v3 = aa + ai[idx+2];
2131 for (j=0; j<nz_tmp; j++) {
2132 idx = ics[ajtmp[j]];
2133 rtmp11[idx] = v1[j];
2134 rtmp22[idx] = v2[j];
2135 rtmp33[idx] = v3[j];
2136 }
2137 rtmp11[ics[r[row]]] += sctx.shift_amount;
2138 rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2139 rtmp33[ics[r[row+2]]] += sctx.shift_amount;
2140
2141 /* loop over all pivot row blocks above this row block */
2142 prow = *bjtmp++;
2143 while (prow < row) {
2144 pc1 = rtmp11 + prow;
2145 pc2 = rtmp22 + prow;
2146 pc3 = rtmp33 + prow;
2147 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) {
2148 pv = ba + bd[prow];
2149 pj = nbj + bd[prow];
2150 mul1 = *pc1 * *pv;
2151 mul2 = *pc2 * *pv;
2152 mul3 = *pc3 * *pv;
2153 ++pv;
2154 *pc1 = mul1;
2155 *pc2 = mul2;
2156 *pc3 = mul3;
2157
2158 nz_tmp = bi[prow+1] - bd[prow] - 1;
2159 /* update this row based on pivot row */
2160 for (j=0; j<nz_tmp; j++) {
2161 tmp = pv[j];
2162 idx = pj[j];
2163 rtmp11[idx] -= mul1 * tmp;
2164 rtmp22[idx] -= mul2 * tmp;
2165 rtmp33[idx] -= mul3 * tmp;
2166 }
2167 ierr = PetscLogFlops(3+6.0*nz_tmp);CHKERRQ(ierr);
2168 }
2169 prow = *bjtmp++;
2170 }
2171
2172 /* Now take care of diagonal 3x3 block in this set of rows */
2173 /* note: prow = row here */
2174 pc1 = rtmp11 + prow;
2175 pc2 = rtmp22 + prow;
2176 pc3 = rtmp33 + prow;
2177
2178 sctx.pv = *pc1;
2179 pj = bj + bi[prow];
2180 rs = 0.0;
2181 for (j=0; j<nz; j++) {
2182 idx = pj[j];
2183 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2184 }
2185 sctx.rs = rs;
2186 ierr = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr);
2187 if (sctx.newshift) goto endofwhile;
2188
2189 if (*pc2 != 0.0 || *pc3 != 0.0) {
2190 mul2 = (*pc2)/(*pc1);
2191 mul3 = (*pc3)/(*pc1);
2192 *pc2 = mul2;
2193 *pc3 = mul3;
2194 nz_tmp = bi[prow+1] - bd[prow] - 1;
2195 pj = nbj + bd[prow];
2196 for (j=0; j<nz_tmp; j++) {
2197 idx = pj[j];
2198 tmp = rtmp11[idx];
2199 rtmp22[idx] -= mul2 * tmp;
2200 rtmp33[idx] -= mul3 * tmp;
2201 }
2202 ierr = PetscLogFlops(2+4.0*nz_tmp);CHKERRQ(ierr);
2203 }
2204 ++prow;
2205
2206 pc2 = rtmp22 + prow;
2207 pc3 = rtmp33 + prow;
2208 sctx.pv = *pc2;
2209 pj = bj + bi[prow];
2210 rs = 0.0;
2211 for (j=0; j<nz; j++) {
2212 idx = pj[j];
2213 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2214 }
2215 sctx.rs = rs;
2216 ierr = MatPivotCheck(B,A,info,&sctx,row+1);CHKERRQ(ierr);
2217 if (sctx.newshift) goto endofwhile;
2218
2219 if (*pc3 != 0.0) {
2220 mul3 = (*pc3)/(*pc2);
2221 *pc3 = mul3;
2222 pj = nbj + bd[prow];
2223 nz_tmp = bi[prow+1] - bd[prow] - 1;
2224 for (j=0; j<nz_tmp; j++) {
2225 idx = pj[j];
2226 tmp = rtmp22[idx];
2227 rtmp33[idx] -= mul3 * tmp;
2228 }
2229 ierr = PetscLogFlops(1+2.0*nz_tmp);CHKERRQ(ierr);
2230 }
2231
2232 pj = bj + bi[row];
2233 pc1 = ba + bi[row];
2234 pc2 = ba + bi[row+1];
2235 pc3 = ba + bi[row+2];
2236
2237 sctx.pv = rtmp33[row+2];
2238 rs = 0.0;
2239 rtmp11[row] = 1.0/rtmp11[row];
2240 rtmp22[row+1] = 1.0/rtmp22[row+1];
2241 rtmp33[row+2] = 1.0/rtmp33[row+2];
2242 /* copy row entries from dense representation to sparse */
2243 for (j=0; j<nz; j++) {
2244 idx = pj[j];
2245 pc1[j] = rtmp11[idx];
2246 pc2[j] = rtmp22[idx];
2247 pc3[j] = rtmp33[idx];
2248 if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2249 }
2250
2251 sctx.rs = rs;
2252 ierr = MatPivotCheck(B,A,info,&sctx,row+2);CHKERRQ(ierr);
2253 if (sctx.newshift) goto endofwhile;
2254 break;
2255
2256 default:
2257 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2258 }
2259 row += nodesz; /* Update the row */
2260 }
2261 endofwhile:;
2262 } while (sctx.newshift);
2263 ierr = PetscFree3(rtmp11,rtmp22,rtmp33);CHKERRQ(ierr);
2264 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
2265 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2266 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
2267 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
2268
2269 (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2270 /* do not set solve add, since MatSolve_Inode + Add is faster */
2271 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace;
2272 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2273 C->assembled = PETSC_TRUE;
2274 C->preallocated = PETSC_TRUE;
2275 if (sctx.nshift) {
2276 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2277 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr);
2278 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2279 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr);
2280 }
2281 }
2282 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
2283 ierr = MatSeqAIJCheckInode(C);CHKERRQ(ierr);
2284 PetscFunctionReturn(0);
2285 }
2286
2287
2288 /* ----------------------------------------------------------- */
MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)2289 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2290 {
2291 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2292 IS iscol = a->col,isrow = a->row;
2293 PetscErrorCode ierr;
2294 const PetscInt *r,*c,*rout,*cout;
2295 PetscInt i,j,n = A->rmap->n;
2296 PetscInt node_max,row,nsz,aii,i0,i1,nz;
2297 const PetscInt *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj;
2298 PetscScalar *x,*tmp,*tmps,tmp0,tmp1;
2299 PetscScalar sum1,sum2,sum3,sum4,sum5;
2300 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2301 const PetscScalar *b;
2302
2303 PetscFunctionBegin;
2304 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2305 node_max = a->inode.node_count;
2306 ns = a->inode.size; /* Node Size array */
2307
2308 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2309 ierr = VecGetArrayWrite(xx,&x);CHKERRQ(ierr);
2310 tmp = a->solve_work;
2311
2312 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2313 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2314
2315 /* forward solve the lower triangular */
2316 tmps = tmp;
2317 aa = a_a;
2318 aj = a_j;
2319 ad = a->diag;
2320
2321 for (i = 0,row = 0; i< node_max; ++i) {
2322 nsz = ns[i];
2323 aii = ai[row];
2324 v1 = aa + aii;
2325 vi = aj + aii;
2326 nz = ai[row+1]- ai[row];
2327
2328 if (i < node_max-1) {
2329 /* Prefetch the indices for the next block */
2330 PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2331 /* Prefetch the data for the next block */
2332 PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2333 }
2334
2335 switch (nsz) { /* Each loop in 'case' is unrolled */
2336 case 1:
2337 sum1 = b[r[row]];
2338 for (j=0; j<nz-1; j+=2) {
2339 i0 = vi[j];
2340 i1 = vi[j+1];
2341 tmp0 = tmps[i0];
2342 tmp1 = tmps[i1];
2343 sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2344 }
2345 if (j == nz-1) {
2346 tmp0 = tmps[vi[j]];
2347 sum1 -= v1[j]*tmp0;
2348 }
2349 tmp[row++]=sum1;
2350 break;
2351 case 2:
2352 sum1 = b[r[row]];
2353 sum2 = b[r[row+1]];
2354 v2 = aa + ai[row+1];
2355
2356 for (j=0; j<nz-1; j+=2) {
2357 i0 = vi[j];
2358 i1 = vi[j+1];
2359 tmp0 = tmps[i0];
2360 tmp1 = tmps[i1];
2361 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2362 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2363 }
2364 if (j == nz-1) {
2365 tmp0 = tmps[vi[j]];
2366 sum1 -= v1[j] *tmp0;
2367 sum2 -= v2[j] *tmp0;
2368 }
2369 sum2 -= v2[nz] * sum1;
2370 tmp[row++]=sum1;
2371 tmp[row++]=sum2;
2372 break;
2373 case 3:
2374 sum1 = b[r[row]];
2375 sum2 = b[r[row+1]];
2376 sum3 = b[r[row+2]];
2377 v2 = aa + ai[row+1];
2378 v3 = aa + ai[row+2];
2379
2380 for (j=0; j<nz-1; j+=2) {
2381 i0 = vi[j];
2382 i1 = vi[j+1];
2383 tmp0 = tmps[i0];
2384 tmp1 = tmps[i1];
2385 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2386 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2387 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2388 }
2389 if (j == nz-1) {
2390 tmp0 = tmps[vi[j]];
2391 sum1 -= v1[j] *tmp0;
2392 sum2 -= v2[j] *tmp0;
2393 sum3 -= v3[j] *tmp0;
2394 }
2395 sum2 -= v2[nz] * sum1;
2396 sum3 -= v3[nz] * sum1;
2397 sum3 -= v3[nz+1] * sum2;
2398 tmp[row++]=sum1;
2399 tmp[row++]=sum2;
2400 tmp[row++]=sum3;
2401 break;
2402
2403 case 4:
2404 sum1 = b[r[row]];
2405 sum2 = b[r[row+1]];
2406 sum3 = b[r[row+2]];
2407 sum4 = b[r[row+3]];
2408 v2 = aa + ai[row+1];
2409 v3 = aa + ai[row+2];
2410 v4 = aa + ai[row+3];
2411
2412 for (j=0; j<nz-1; j+=2) {
2413 i0 = vi[j];
2414 i1 = vi[j+1];
2415 tmp0 = tmps[i0];
2416 tmp1 = tmps[i1];
2417 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2418 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2419 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2420 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2421 }
2422 if (j == nz-1) {
2423 tmp0 = tmps[vi[j]];
2424 sum1 -= v1[j] *tmp0;
2425 sum2 -= v2[j] *tmp0;
2426 sum3 -= v3[j] *tmp0;
2427 sum4 -= v4[j] *tmp0;
2428 }
2429 sum2 -= v2[nz] * sum1;
2430 sum3 -= v3[nz] * sum1;
2431 sum4 -= v4[nz] * sum1;
2432 sum3 -= v3[nz+1] * sum2;
2433 sum4 -= v4[nz+1] * sum2;
2434 sum4 -= v4[nz+2] * sum3;
2435
2436 tmp[row++]=sum1;
2437 tmp[row++]=sum2;
2438 tmp[row++]=sum3;
2439 tmp[row++]=sum4;
2440 break;
2441 case 5:
2442 sum1 = b[r[row]];
2443 sum2 = b[r[row+1]];
2444 sum3 = b[r[row+2]];
2445 sum4 = b[r[row+3]];
2446 sum5 = b[r[row+4]];
2447 v2 = aa + ai[row+1];
2448 v3 = aa + ai[row+2];
2449 v4 = aa + ai[row+3];
2450 v5 = aa + ai[row+4];
2451
2452 for (j=0; j<nz-1; j+=2) {
2453 i0 = vi[j];
2454 i1 = vi[j+1];
2455 tmp0 = tmps[i0];
2456 tmp1 = tmps[i1];
2457 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2458 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2459 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2460 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2461 sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2462 }
2463 if (j == nz-1) {
2464 tmp0 = tmps[vi[j]];
2465 sum1 -= v1[j] *tmp0;
2466 sum2 -= v2[j] *tmp0;
2467 sum3 -= v3[j] *tmp0;
2468 sum4 -= v4[j] *tmp0;
2469 sum5 -= v5[j] *tmp0;
2470 }
2471
2472 sum2 -= v2[nz] * sum1;
2473 sum3 -= v3[nz] * sum1;
2474 sum4 -= v4[nz] * sum1;
2475 sum5 -= v5[nz] * sum1;
2476 sum3 -= v3[nz+1] * sum2;
2477 sum4 -= v4[nz+1] * sum2;
2478 sum5 -= v5[nz+1] * sum2;
2479 sum4 -= v4[nz+2] * sum3;
2480 sum5 -= v5[nz+2] * sum3;
2481 sum5 -= v5[nz+3] * sum4;
2482
2483 tmp[row++]=sum1;
2484 tmp[row++]=sum2;
2485 tmp[row++]=sum3;
2486 tmp[row++]=sum4;
2487 tmp[row++]=sum5;
2488 break;
2489 default:
2490 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2491 }
2492 }
2493 /* backward solve the upper triangular */
2494 for (i=node_max -1,row = n-1; i>=0; i--) {
2495 nsz = ns[i];
2496 aii = ad[row+1] + 1;
2497 v1 = aa + aii;
2498 vi = aj + aii;
2499 nz = ad[row]- ad[row+1] - 1;
2500
2501 if (i > 0) {
2502 /* Prefetch the indices for the next block */
2503 PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2504 /* Prefetch the data for the next block */
2505 PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2506 }
2507
2508 switch (nsz) { /* Each loop in 'case' is unrolled */
2509 case 1:
2510 sum1 = tmp[row];
2511
2512 for (j=0; j<nz-1; j+=2) {
2513 i0 = vi[j];
2514 i1 = vi[j+1];
2515 tmp0 = tmps[i0];
2516 tmp1 = tmps[i1];
2517 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2518 }
2519 if (j == nz-1) {
2520 tmp0 = tmps[vi[j]];
2521 sum1 -= v1[j]*tmp0;
2522 }
2523 x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2524 break;
2525 case 2:
2526 sum1 = tmp[row];
2527 sum2 = tmp[row-1];
2528 v2 = aa + ad[row] + 1;
2529 for (j=0; j<nz-1; j+=2) {
2530 i0 = vi[j];
2531 i1 = vi[j+1];
2532 tmp0 = tmps[i0];
2533 tmp1 = tmps[i1];
2534 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2535 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2536 }
2537 if (j == nz-1) {
2538 tmp0 = tmps[vi[j]];
2539 sum1 -= v1[j]* tmp0;
2540 sum2 -= v2[j+1]* tmp0;
2541 }
2542
2543 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2544 sum2 -= v2[0] * tmp0;
2545 x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2546 break;
2547 case 3:
2548 sum1 = tmp[row];
2549 sum2 = tmp[row -1];
2550 sum3 = tmp[row -2];
2551 v2 = aa + ad[row] + 1;
2552 v3 = aa + ad[row -1] + 1;
2553 for (j=0; j<nz-1; j+=2) {
2554 i0 = vi[j];
2555 i1 = vi[j+1];
2556 tmp0 = tmps[i0];
2557 tmp1 = tmps[i1];
2558 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2559 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2560 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2561 }
2562 if (j== nz-1) {
2563 tmp0 = tmps[vi[j]];
2564 sum1 -= v1[j] * tmp0;
2565 sum2 -= v2[j+1] * tmp0;
2566 sum3 -= v3[j+2] * tmp0;
2567 }
2568 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2569 sum2 -= v2[0]* tmp0;
2570 sum3 -= v3[1] * tmp0;
2571 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2572 sum3 -= v3[0]* tmp0;
2573 x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2574
2575 break;
2576 case 4:
2577 sum1 = tmp[row];
2578 sum2 = tmp[row -1];
2579 sum3 = tmp[row -2];
2580 sum4 = tmp[row -3];
2581 v2 = aa + ad[row]+1;
2582 v3 = aa + ad[row -1]+1;
2583 v4 = aa + ad[row -2]+1;
2584
2585 for (j=0; j<nz-1; j+=2) {
2586 i0 = vi[j];
2587 i1 = vi[j+1];
2588 tmp0 = tmps[i0];
2589 tmp1 = tmps[i1];
2590 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2591 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2592 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2593 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2594 }
2595 if (j== nz-1) {
2596 tmp0 = tmps[vi[j]];
2597 sum1 -= v1[j] * tmp0;
2598 sum2 -= v2[j+1] * tmp0;
2599 sum3 -= v3[j+2] * tmp0;
2600 sum4 -= v4[j+3] * tmp0;
2601 }
2602
2603 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2604 sum2 -= v2[0] * tmp0;
2605 sum3 -= v3[1] * tmp0;
2606 sum4 -= v4[2] * tmp0;
2607 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2608 sum3 -= v3[0] * tmp0;
2609 sum4 -= v4[1] * tmp0;
2610 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2611 sum4 -= v4[0] * tmp0;
2612 x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2613 break;
2614 case 5:
2615 sum1 = tmp[row];
2616 sum2 = tmp[row -1];
2617 sum3 = tmp[row -2];
2618 sum4 = tmp[row -3];
2619 sum5 = tmp[row -4];
2620 v2 = aa + ad[row]+1;
2621 v3 = aa + ad[row -1]+1;
2622 v4 = aa + ad[row -2]+1;
2623 v5 = aa + ad[row -3]+1;
2624 for (j=0; j<nz-1; j+=2) {
2625 i0 = vi[j];
2626 i1 = vi[j+1];
2627 tmp0 = tmps[i0];
2628 tmp1 = tmps[i1];
2629 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2630 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2631 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2632 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2633 sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2634 }
2635 if (j==nz-1) {
2636 tmp0 = tmps[vi[j]];
2637 sum1 -= v1[j] * tmp0;
2638 sum2 -= v2[j+1] * tmp0;
2639 sum3 -= v3[j+2] * tmp0;
2640 sum4 -= v4[j+3] * tmp0;
2641 sum5 -= v5[j+4] * tmp0;
2642 }
2643
2644 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2645 sum2 -= v2[0] * tmp0;
2646 sum3 -= v3[1] * tmp0;
2647 sum4 -= v4[2] * tmp0;
2648 sum5 -= v5[3] * tmp0;
2649 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2650 sum3 -= v3[0] * tmp0;
2651 sum4 -= v4[1] * tmp0;
2652 sum5 -= v5[2] * tmp0;
2653 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2654 sum4 -= v4[0] * tmp0;
2655 sum5 -= v5[1] * tmp0;
2656 tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2657 sum5 -= v5[0] * tmp0;
2658 x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2659 break;
2660 default:
2661 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2662 }
2663 }
2664 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2665 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2666 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2667 ierr = VecRestoreArrayWrite(xx,&x);CHKERRQ(ierr);
2668 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
2669 PetscFunctionReturn(0);
2670 }
2671
2672
2673 /*
2674 Makes a longer coloring[] array and calls the usual code with that
2675 */
MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring * iscoloring)2676 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2677 {
2678 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data;
2679 PetscErrorCode ierr;
2680 PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2681 PetscInt *colorused,i;
2682 ISColoringValue *newcolor;
2683
2684 PetscFunctionBegin;
2685 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2686 ierr = PetscMalloc1(n+1,&newcolor);CHKERRQ(ierr);
2687 /* loop over inodes, marking a color for each column*/
2688 row = 0;
2689 for (i=0; i<m; i++) {
2690 for (j=0; j<ns[i]; j++) {
2691 newcolor[row++] = coloring[i] + j*ncolors;
2692 }
2693 }
2694
2695 /* eliminate unneeded colors */
2696 ierr = PetscCalloc1(5*ncolors,&colorused);CHKERRQ(ierr);
2697 for (i=0; i<n; i++) {
2698 colorused[newcolor[i]] = 1;
2699 }
2700
2701 for (i=1; i<5*ncolors; i++) {
2702 colorused[i] += colorused[i-1];
2703 }
2704 ncolors = colorused[5*ncolors-1];
2705 for (i=0; i<n; i++) {
2706 newcolor[i] = colorused[newcolor[i]]-1;
2707 }
2708 ierr = PetscFree(colorused);CHKERRQ(ierr);
2709 ierr = ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,PETSC_OWN_POINTER,iscoloring);CHKERRQ(ierr);
2710 ierr = PetscFree(coloring);CHKERRQ(ierr);
2711 PetscFunctionReturn(0);
2712 }
2713
2714 #include <petsc/private/kernels/blockinvert.h>
2715
MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)2716 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2717 {
2718 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
2719 PetscScalar sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3;
2720 MatScalar *ibdiag,*bdiag,work[25],*t;
2721 PetscScalar *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2722 const MatScalar *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL;
2723 const PetscScalar *xb, *b;
2724 PetscReal zeropivot = 100.*PETSC_MACHINE_EPSILON, shift = 0.0;
2725 PetscErrorCode ierr;
2726 PetscInt n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2;
2727 PetscInt sz,k,ipvt[5];
2728 PetscBool allowzeropivot,zeropivotdetected;
2729 const PetscInt *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i;
2730
2731 PetscFunctionBegin;
2732 allowzeropivot = PetscNot(A->erroriffailure);
2733 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2734 if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2735 if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2736
2737 if (!a->inode.ibdiagvalid) {
2738 if (!a->inode.ibdiag) {
2739 /* calculate space needed for diagonal blocks */
2740 for (i=0; i<m; i++) {
2741 cnt += sizes[i]*sizes[i];
2742 }
2743 a->inode.bdiagsize = cnt;
2744
2745 ierr = PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work);CHKERRQ(ierr);
2746 }
2747
2748 /* copy over the diagonal blocks and invert them */
2749 ibdiag = a->inode.ibdiag;
2750 bdiag = a->inode.bdiag;
2751 cnt = 0;
2752 for (i=0, row = 0; i<m; i++) {
2753 for (j=0; j<sizes[i]; j++) {
2754 for (k=0; k<sizes[i]; k++) {
2755 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2756 }
2757 }
2758 ierr = PetscArraycpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]);CHKERRQ(ierr);
2759
2760 switch (sizes[i]) {
2761 case 1:
2762 /* Create matrix data structure */
2763 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2764 if (allowzeropivot) {
2765 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2766 A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
2767 A->factorerror_zeropivot_row = row;
2768 ierr = PetscInfo1(A,"Zero pivot, row %D\n",row);CHKERRQ(ierr);
2769 } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2770 }
2771 ibdiag[cnt] = 1.0/ibdiag[cnt];
2772 break;
2773 case 2:
2774 ierr = PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
2775 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2776 break;
2777 case 3:
2778 ierr = PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
2779 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2780 break;
2781 case 4:
2782 ierr = PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
2783 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2784 break;
2785 case 5:
2786 ierr = PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
2787 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2788 break;
2789 default:
2790 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2791 }
2792 cnt += sizes[i]*sizes[i];
2793 row += sizes[i];
2794 }
2795 a->inode.ibdiagvalid = PETSC_TRUE;
2796 }
2797 ibdiag = a->inode.ibdiag;
2798 bdiag = a->inode.bdiag;
2799 t = a->inode.ssor_work;
2800
2801 ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2802 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2803 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2804 if (flag & SOR_ZERO_INITIAL_GUESS) {
2805 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2806
2807 for (i=0, row=0; i<m; i++) {
2808 sz = diag[row] - ii[row];
2809 v1 = a->a + ii[row];
2810 idx = a->j + ii[row];
2811
2812 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2813 switch (sizes[i]) {
2814 case 1:
2815
2816 sum1 = b[row];
2817 for (n = 0; n<sz-1; n+=2) {
2818 i1 = idx[0];
2819 i2 = idx[1];
2820 idx += 2;
2821 tmp0 = x[i1];
2822 tmp1 = x[i2];
2823 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2824 }
2825
2826 if (n == sz-1) {
2827 tmp0 = x[*idx];
2828 sum1 -= *v1 * tmp0;
2829 }
2830 t[row] = sum1;
2831 x[row++] = sum1*(*ibdiag++);
2832 break;
2833 case 2:
2834 v2 = a->a + ii[row+1];
2835 sum1 = b[row];
2836 sum2 = b[row+1];
2837 for (n = 0; n<sz-1; n+=2) {
2838 i1 = idx[0];
2839 i2 = idx[1];
2840 idx += 2;
2841 tmp0 = x[i1];
2842 tmp1 = x[i2];
2843 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2844 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2845 }
2846
2847 if (n == sz-1) {
2848 tmp0 = x[*idx];
2849 sum1 -= v1[0] * tmp0;
2850 sum2 -= v2[0] * tmp0;
2851 }
2852 t[row] = sum1;
2853 t[row+1] = sum2;
2854 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2855 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2856 ibdiag += 4;
2857 break;
2858 case 3:
2859 v2 = a->a + ii[row+1];
2860 v3 = a->a + ii[row+2];
2861 sum1 = b[row];
2862 sum2 = b[row+1];
2863 sum3 = b[row+2];
2864 for (n = 0; n<sz-1; n+=2) {
2865 i1 = idx[0];
2866 i2 = idx[1];
2867 idx += 2;
2868 tmp0 = x[i1];
2869 tmp1 = x[i2];
2870 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2871 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2872 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2873 }
2874
2875 if (n == sz-1) {
2876 tmp0 = x[*idx];
2877 sum1 -= v1[0] * tmp0;
2878 sum2 -= v2[0] * tmp0;
2879 sum3 -= v3[0] * tmp0;
2880 }
2881 t[row] = sum1;
2882 t[row+1] = sum2;
2883 t[row+2] = sum3;
2884 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2885 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2886 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2887 ibdiag += 9;
2888 break;
2889 case 4:
2890 v2 = a->a + ii[row+1];
2891 v3 = a->a + ii[row+2];
2892 v4 = a->a + ii[row+3];
2893 sum1 = b[row];
2894 sum2 = b[row+1];
2895 sum3 = b[row+2];
2896 sum4 = b[row+3];
2897 for (n = 0; n<sz-1; n+=2) {
2898 i1 = idx[0];
2899 i2 = idx[1];
2900 idx += 2;
2901 tmp0 = x[i1];
2902 tmp1 = x[i2];
2903 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2904 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2905 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2906 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2907 }
2908
2909 if (n == sz-1) {
2910 tmp0 = x[*idx];
2911 sum1 -= v1[0] * tmp0;
2912 sum2 -= v2[0] * tmp0;
2913 sum3 -= v3[0] * tmp0;
2914 sum4 -= v4[0] * tmp0;
2915 }
2916 t[row] = sum1;
2917 t[row+1] = sum2;
2918 t[row+2] = sum3;
2919 t[row+3] = sum4;
2920 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2921 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2922 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2923 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2924 ibdiag += 16;
2925 break;
2926 case 5:
2927 v2 = a->a + ii[row+1];
2928 v3 = a->a + ii[row+2];
2929 v4 = a->a + ii[row+3];
2930 v5 = a->a + ii[row+4];
2931 sum1 = b[row];
2932 sum2 = b[row+1];
2933 sum3 = b[row+2];
2934 sum4 = b[row+3];
2935 sum5 = b[row+4];
2936 for (n = 0; n<sz-1; n+=2) {
2937 i1 = idx[0];
2938 i2 = idx[1];
2939 idx += 2;
2940 tmp0 = x[i1];
2941 tmp1 = x[i2];
2942 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2943 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2944 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2945 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2946 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2947 }
2948
2949 if (n == sz-1) {
2950 tmp0 = x[*idx];
2951 sum1 -= v1[0] * tmp0;
2952 sum2 -= v2[0] * tmp0;
2953 sum3 -= v3[0] * tmp0;
2954 sum4 -= v4[0] * tmp0;
2955 sum5 -= v5[0] * tmp0;
2956 }
2957 t[row] = sum1;
2958 t[row+1] = sum2;
2959 t[row+2] = sum3;
2960 t[row+3] = sum4;
2961 t[row+4] = sum5;
2962 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2963 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2964 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2965 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2966 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2967 ibdiag += 25;
2968 break;
2969 default:
2970 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2971 }
2972 }
2973
2974 xb = t;
2975 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2976 } else xb = b;
2977 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
2978
2979 ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
2980 for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
2981 ibdiag -= sizes[i]*sizes[i];
2982 sz = ii[row+1] - diag[row] - 1;
2983 v1 = a->a + diag[row] + 1;
2984 idx = a->j + diag[row] + 1;
2985
2986 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2987 switch (sizes[i]) {
2988 case 1:
2989
2990 sum1 = xb[row];
2991 for (n = 0; n<sz-1; n+=2) {
2992 i1 = idx[0];
2993 i2 = idx[1];
2994 idx += 2;
2995 tmp0 = x[i1];
2996 tmp1 = x[i2];
2997 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2998 }
2999
3000 if (n == sz-1) {
3001 tmp0 = x[*idx];
3002 sum1 -= *v1*tmp0;
3003 }
3004 x[row--] = sum1*(*ibdiag);
3005 break;
3006
3007 case 2:
3008
3009 sum1 = xb[row];
3010 sum2 = xb[row-1];
3011 /* note that sum1 is associated with the second of the two rows */
3012 v2 = a->a + diag[row-1] + 2;
3013 for (n = 0; n<sz-1; n+=2) {
3014 i1 = idx[0];
3015 i2 = idx[1];
3016 idx += 2;
3017 tmp0 = x[i1];
3018 tmp1 = x[i2];
3019 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3020 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3021 }
3022
3023 if (n == sz-1) {
3024 tmp0 = x[*idx];
3025 sum1 -= *v1*tmp0;
3026 sum2 -= *v2*tmp0;
3027 }
3028 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3029 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3030 break;
3031 case 3:
3032
3033 sum1 = xb[row];
3034 sum2 = xb[row-1];
3035 sum3 = xb[row-2];
3036 v2 = a->a + diag[row-1] + 2;
3037 v3 = a->a + diag[row-2] + 3;
3038 for (n = 0; n<sz-1; n+=2) {
3039 i1 = idx[0];
3040 i2 = idx[1];
3041 idx += 2;
3042 tmp0 = x[i1];
3043 tmp1 = x[i2];
3044 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3045 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3046 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3047 }
3048
3049 if (n == sz-1) {
3050 tmp0 = x[*idx];
3051 sum1 -= *v1*tmp0;
3052 sum2 -= *v2*tmp0;
3053 sum3 -= *v3*tmp0;
3054 }
3055 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3056 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3057 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3058 break;
3059 case 4:
3060
3061 sum1 = xb[row];
3062 sum2 = xb[row-1];
3063 sum3 = xb[row-2];
3064 sum4 = xb[row-3];
3065 v2 = a->a + diag[row-1] + 2;
3066 v3 = a->a + diag[row-2] + 3;
3067 v4 = a->a + diag[row-3] + 4;
3068 for (n = 0; n<sz-1; n+=2) {
3069 i1 = idx[0];
3070 i2 = idx[1];
3071 idx += 2;
3072 tmp0 = x[i1];
3073 tmp1 = x[i2];
3074 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3075 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3076 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3077 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3078 }
3079
3080 if (n == sz-1) {
3081 tmp0 = x[*idx];
3082 sum1 -= *v1*tmp0;
3083 sum2 -= *v2*tmp0;
3084 sum3 -= *v3*tmp0;
3085 sum4 -= *v4*tmp0;
3086 }
3087 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3088 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3089 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3090 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3091 break;
3092 case 5:
3093
3094 sum1 = xb[row];
3095 sum2 = xb[row-1];
3096 sum3 = xb[row-2];
3097 sum4 = xb[row-3];
3098 sum5 = xb[row-4];
3099 v2 = a->a + diag[row-1] + 2;
3100 v3 = a->a + diag[row-2] + 3;
3101 v4 = a->a + diag[row-3] + 4;
3102 v5 = a->a + diag[row-4] + 5;
3103 for (n = 0; n<sz-1; n+=2) {
3104 i1 = idx[0];
3105 i2 = idx[1];
3106 idx += 2;
3107 tmp0 = x[i1];
3108 tmp1 = x[i2];
3109 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3110 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3111 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3112 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3113 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3114 }
3115
3116 if (n == sz-1) {
3117 tmp0 = x[*idx];
3118 sum1 -= *v1*tmp0;
3119 sum2 -= *v2*tmp0;
3120 sum3 -= *v3*tmp0;
3121 sum4 -= *v4*tmp0;
3122 sum5 -= *v5*tmp0;
3123 }
3124 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3125 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3126 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3127 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3128 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3129 break;
3130 default:
3131 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3132 }
3133 }
3134
3135 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3136 }
3137 its--;
3138 }
3139 while (its--) {
3140
3141 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3142 for (i=0, row=0, ibdiag = a->inode.ibdiag;
3143 i<m;
3144 row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) {
3145
3146 sz = diag[row] - ii[row];
3147 v1 = a->a + ii[row];
3148 idx = a->j + ii[row];
3149 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3150 switch (sizes[i]) {
3151 case 1:
3152 sum1 = b[row];
3153 for (n = 0; n<sz-1; n+=2) {
3154 i1 = idx[0];
3155 i2 = idx[1];
3156 idx += 2;
3157 tmp0 = x[i1];
3158 tmp1 = x[i2];
3159 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3160 }
3161 if (n == sz-1) {
3162 tmp0 = x[*idx++];
3163 sum1 -= *v1 * tmp0;
3164 v1++;
3165 }
3166 t[row] = sum1;
3167 sz = ii[row+1] - diag[row] - 1;
3168 idx = a->j + diag[row] + 1;
3169 v1 += 1;
3170 for (n = 0; n<sz-1; n+=2) {
3171 i1 = idx[0];
3172 i2 = idx[1];
3173 idx += 2;
3174 tmp0 = x[i1];
3175 tmp1 = x[i2];
3176 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3177 }
3178 if (n == sz-1) {
3179 tmp0 = x[*idx++];
3180 sum1 -= *v1 * tmp0;
3181 }
3182 /* in MatSOR_SeqAIJ this line would be
3183 *
3184 * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3185 *
3186 * but omega == 1, so this becomes
3187 *
3188 * x[row] = sum1*(*ibdiag++);
3189 *
3190 */
3191 x[row] = sum1*(*ibdiag);
3192 break;
3193 case 2:
3194 v2 = a->a + ii[row+1];
3195 sum1 = b[row];
3196 sum2 = b[row+1];
3197 for (n = 0; n<sz-1; n+=2) {
3198 i1 = idx[0];
3199 i2 = idx[1];
3200 idx += 2;
3201 tmp0 = x[i1];
3202 tmp1 = x[i2];
3203 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3204 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3205 }
3206 if (n == sz-1) {
3207 tmp0 = x[*idx++];
3208 sum1 -= v1[0] * tmp0;
3209 sum2 -= v2[0] * tmp0;
3210 v1++; v2++;
3211 }
3212 t[row] = sum1;
3213 t[row+1] = sum2;
3214 sz = ii[row+1] - diag[row] - 2;
3215 idx = a->j + diag[row] + 2;
3216 v1 += 2;
3217 v2 += 2;
3218 for (n = 0; n<sz-1; n+=2) {
3219 i1 = idx[0];
3220 i2 = idx[1];
3221 idx += 2;
3222 tmp0 = x[i1];
3223 tmp1 = x[i2];
3224 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3225 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3226 }
3227 if (n == sz-1) {
3228 tmp0 = x[*idx];
3229 sum1 -= v1[0] * tmp0;
3230 sum2 -= v2[0] * tmp0;
3231 }
3232 x[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3233 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3234 break;
3235 case 3:
3236 v2 = a->a + ii[row+1];
3237 v3 = a->a + ii[row+2];
3238 sum1 = b[row];
3239 sum2 = b[row+1];
3240 sum3 = b[row+2];
3241 for (n = 0; n<sz-1; n+=2) {
3242 i1 = idx[0];
3243 i2 = idx[1];
3244 idx += 2;
3245 tmp0 = x[i1];
3246 tmp1 = x[i2];
3247 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3248 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3249 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3250 }
3251 if (n == sz-1) {
3252 tmp0 = x[*idx++];
3253 sum1 -= v1[0] * tmp0;
3254 sum2 -= v2[0] * tmp0;
3255 sum3 -= v3[0] * tmp0;
3256 v1++; v2++; v3++;
3257 }
3258 t[row] = sum1;
3259 t[row+1] = sum2;
3260 t[row+2] = sum3;
3261 sz = ii[row+1] - diag[row] - 3;
3262 idx = a->j + diag[row] + 3;
3263 v1 += 3;
3264 v2 += 3;
3265 v3 += 3;
3266 for (n = 0; n<sz-1; n+=2) {
3267 i1 = idx[0];
3268 i2 = idx[1];
3269 idx += 2;
3270 tmp0 = x[i1];
3271 tmp1 = x[i2];
3272 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3273 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3274 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3275 }
3276 if (n == sz-1) {
3277 tmp0 = x[*idx];
3278 sum1 -= v1[0] * tmp0;
3279 sum2 -= v2[0] * tmp0;
3280 sum3 -= v3[0] * tmp0;
3281 }
3282 x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3283 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3284 x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3285 break;
3286 case 4:
3287 v2 = a->a + ii[row+1];
3288 v3 = a->a + ii[row+2];
3289 v4 = a->a + ii[row+3];
3290 sum1 = b[row];
3291 sum2 = b[row+1];
3292 sum3 = b[row+2];
3293 sum4 = b[row+3];
3294 for (n = 0; n<sz-1; n+=2) {
3295 i1 = idx[0];
3296 i2 = idx[1];
3297 idx += 2;
3298 tmp0 = x[i1];
3299 tmp1 = x[i2];
3300 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3301 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3302 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3303 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3304 }
3305 if (n == sz-1) {
3306 tmp0 = x[*idx++];
3307 sum1 -= v1[0] * tmp0;
3308 sum2 -= v2[0] * tmp0;
3309 sum3 -= v3[0] * tmp0;
3310 sum4 -= v4[0] * tmp0;
3311 v1++; v2++; v3++; v4++;
3312 }
3313 t[row] = sum1;
3314 t[row+1] = sum2;
3315 t[row+2] = sum3;
3316 t[row+3] = sum4;
3317 sz = ii[row+1] - diag[row] - 4;
3318 idx = a->j + diag[row] + 4;
3319 v1 += 4;
3320 v2 += 4;
3321 v3 += 4;
3322 v4 += 4;
3323 for (n = 0; n<sz-1; n+=2) {
3324 i1 = idx[0];
3325 i2 = idx[1];
3326 idx += 2;
3327 tmp0 = x[i1];
3328 tmp1 = x[i2];
3329 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3330 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3331 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3332 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3333 }
3334 if (n == sz-1) {
3335 tmp0 = x[*idx];
3336 sum1 -= v1[0] * tmp0;
3337 sum2 -= v2[0] * tmp0;
3338 sum3 -= v3[0] * tmp0;
3339 sum4 -= v4[0] * tmp0;
3340 }
3341 x[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3342 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3343 x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3344 x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3345 break;
3346 case 5:
3347 v2 = a->a + ii[row+1];
3348 v3 = a->a + ii[row+2];
3349 v4 = a->a + ii[row+3];
3350 v5 = a->a + ii[row+4];
3351 sum1 = b[row];
3352 sum2 = b[row+1];
3353 sum3 = b[row+2];
3354 sum4 = b[row+3];
3355 sum5 = b[row+4];
3356 for (n = 0; n<sz-1; n+=2) {
3357 i1 = idx[0];
3358 i2 = idx[1];
3359 idx += 2;
3360 tmp0 = x[i1];
3361 tmp1 = x[i2];
3362 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3363 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3364 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3365 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3366 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3367 }
3368 if (n == sz-1) {
3369 tmp0 = x[*idx++];
3370 sum1 -= v1[0] * tmp0;
3371 sum2 -= v2[0] * tmp0;
3372 sum3 -= v3[0] * tmp0;
3373 sum4 -= v4[0] * tmp0;
3374 sum5 -= v5[0] * tmp0;
3375 v1++; v2++; v3++; v4++; v5++;
3376 }
3377 t[row] = sum1;
3378 t[row+1] = sum2;
3379 t[row+2] = sum3;
3380 t[row+3] = sum4;
3381 t[row+4] = sum5;
3382 sz = ii[row+1] - diag[row] - 5;
3383 idx = a->j + diag[row] + 5;
3384 v1 += 5;
3385 v2 += 5;
3386 v3 += 5;
3387 v4 += 5;
3388 v5 += 5;
3389 for (n = 0; n<sz-1; n+=2) {
3390 i1 = idx[0];
3391 i2 = idx[1];
3392 idx += 2;
3393 tmp0 = x[i1];
3394 tmp1 = x[i2];
3395 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3396 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3397 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3398 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3399 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3400 }
3401 if (n == sz-1) {
3402 tmp0 = x[*idx];
3403 sum1 -= v1[0] * tmp0;
3404 sum2 -= v2[0] * tmp0;
3405 sum3 -= v3[0] * tmp0;
3406 sum4 -= v4[0] * tmp0;
3407 sum5 -= v5[0] * tmp0;
3408 }
3409 x[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3410 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3411 x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3412 x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3413 x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3414 break;
3415 default:
3416 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3417 }
3418 }
3419 xb = t;
3420 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); /* undercounts diag inverse */
3421 } else xb = b;
3422
3423 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3424
3425 ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3426 for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3427 ibdiag -= sizes[i]*sizes[i];
3428
3429 /* set RHS */
3430 if (xb == b) {
3431 /* whole (old way) */
3432 sz = ii[row+1] - ii[row];
3433 idx = a->j + ii[row];
3434 switch (sizes[i]) {
3435 case 5:
3436 v5 = a->a + ii[row-4];
3437 case 4: /* fall through */
3438 v4 = a->a + ii[row-3];
3439 case 3:
3440 v3 = a->a + ii[row-2];
3441 case 2:
3442 v2 = a->a + ii[row-1];
3443 case 1:
3444 v1 = a->a + ii[row];
3445 break;
3446 default:
3447 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3448 }
3449 } else {
3450 /* upper, no diag */
3451 sz = ii[row+1] - diag[row] - 1;
3452 idx = a->j + diag[row] + 1;
3453 switch (sizes[i]) {
3454 case 5:
3455 v5 = a->a + diag[row-4] + 5;
3456 case 4: /* fall through */
3457 v4 = a->a + diag[row-3] + 4;
3458 case 3:
3459 v3 = a->a + diag[row-2] + 3;
3460 case 2:
3461 v2 = a->a + diag[row-1] + 2;
3462 case 1:
3463 v1 = a->a + diag[row] + 1;
3464 }
3465 }
3466 /* set sum */
3467 switch (sizes[i]) {
3468 case 5:
3469 sum5 = xb[row-4];
3470 case 4: /* fall through */
3471 sum4 = xb[row-3];
3472 case 3:
3473 sum3 = xb[row-2];
3474 case 2:
3475 sum2 = xb[row-1];
3476 case 1:
3477 /* note that sum1 is associated with the last row */
3478 sum1 = xb[row];
3479 }
3480 /* do sums */
3481 for (n = 0; n<sz-1; n+=2) {
3482 i1 = idx[0];
3483 i2 = idx[1];
3484 idx += 2;
3485 tmp0 = x[i1];
3486 tmp1 = x[i2];
3487 switch (sizes[i]) {
3488 case 5:
3489 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3490 case 4: /* fall through */
3491 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3492 case 3:
3493 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3494 case 2:
3495 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3496 case 1:
3497 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3498 }
3499 }
3500 /* ragged edge */
3501 if (n == sz-1) {
3502 tmp0 = x[*idx];
3503 switch (sizes[i]) {
3504 case 5:
3505 sum5 -= *v5*tmp0;
3506 case 4: /* fall through */
3507 sum4 -= *v4*tmp0;
3508 case 3:
3509 sum3 -= *v3*tmp0;
3510 case 2:
3511 sum2 -= *v2*tmp0;
3512 case 1:
3513 sum1 -= *v1*tmp0;
3514 }
3515 }
3516 /* update */
3517 if (xb == b) {
3518 /* whole (old way) w/ diag */
3519 switch (sizes[i]) {
3520 case 5:
3521 x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3522 x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3523 x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3524 x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3525 x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3526 break;
3527 case 4:
3528 x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3529 x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3530 x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3531 x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3532 break;
3533 case 3:
3534 x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3535 x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3536 x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3537 break;
3538 case 2:
3539 x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3];
3540 x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2];
3541 break;
3542 case 1:
3543 x[row--] += sum1*(*ibdiag);
3544 break;
3545 }
3546 } else {
3547 /* no diag so set = */
3548 switch (sizes[i]) {
3549 case 5:
3550 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3551 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3552 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3553 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3554 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3555 break;
3556 case 4:
3557 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3558 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3559 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3560 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3561 break;
3562 case 3:
3563 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3564 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3565 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3566 break;
3567 case 2:
3568 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3569 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3570 break;
3571 case 1:
3572 x[row--] = sum1*(*ibdiag);
3573 break;
3574 }
3575 }
3576 }
3577 if (xb == b) {
3578 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
3579 } else {
3580 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper, undercounts diag inverse */
3581 }
3582 }
3583 }
3584 if (flag & SOR_EISENSTAT) {
3585 /*
3586 Apply (U + D)^-1 where D is now the block diagonal
3587 */
3588 ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3589 for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3590 ibdiag -= sizes[i]*sizes[i];
3591 sz = ii[row+1] - diag[row] - 1;
3592 v1 = a->a + diag[row] + 1;
3593 idx = a->j + diag[row] + 1;
3594 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3595 switch (sizes[i]) {
3596 case 1:
3597
3598 sum1 = b[row];
3599 for (n = 0; n<sz-1; n+=2) {
3600 i1 = idx[0];
3601 i2 = idx[1];
3602 idx += 2;
3603 tmp0 = x[i1];
3604 tmp1 = x[i2];
3605 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3606 }
3607
3608 if (n == sz-1) {
3609 tmp0 = x[*idx];
3610 sum1 -= *v1*tmp0;
3611 }
3612 x[row] = sum1*(*ibdiag);row--;
3613 break;
3614
3615 case 2:
3616
3617 sum1 = b[row];
3618 sum2 = b[row-1];
3619 /* note that sum1 is associated with the second of the two rows */
3620 v2 = a->a + diag[row-1] + 2;
3621 for (n = 0; n<sz-1; n+=2) {
3622 i1 = idx[0];
3623 i2 = idx[1];
3624 idx += 2;
3625 tmp0 = x[i1];
3626 tmp1 = x[i2];
3627 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3628 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3629 }
3630
3631 if (n == sz-1) {
3632 tmp0 = x[*idx];
3633 sum1 -= *v1*tmp0;
3634 sum2 -= *v2*tmp0;
3635 }
3636 x[row] = sum2*ibdiag[1] + sum1*ibdiag[3];
3637 x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3638 row -= 2;
3639 break;
3640 case 3:
3641
3642 sum1 = b[row];
3643 sum2 = b[row-1];
3644 sum3 = b[row-2];
3645 v2 = a->a + diag[row-1] + 2;
3646 v3 = a->a + diag[row-2] + 3;
3647 for (n = 0; n<sz-1; n+=2) {
3648 i1 = idx[0];
3649 i2 = idx[1];
3650 idx += 2;
3651 tmp0 = x[i1];
3652 tmp1 = x[i2];
3653 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3654 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3655 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3656 }
3657
3658 if (n == sz-1) {
3659 tmp0 = x[*idx];
3660 sum1 -= *v1*tmp0;
3661 sum2 -= *v2*tmp0;
3662 sum3 -= *v3*tmp0;
3663 }
3664 x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3665 x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3666 x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3667 row -= 3;
3668 break;
3669 case 4:
3670
3671 sum1 = b[row];
3672 sum2 = b[row-1];
3673 sum3 = b[row-2];
3674 sum4 = b[row-3];
3675 v2 = a->a + diag[row-1] + 2;
3676 v3 = a->a + diag[row-2] + 3;
3677 v4 = a->a + diag[row-3] + 4;
3678 for (n = 0; n<sz-1; n+=2) {
3679 i1 = idx[0];
3680 i2 = idx[1];
3681 idx += 2;
3682 tmp0 = x[i1];
3683 tmp1 = x[i2];
3684 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3685 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3686 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3687 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3688 }
3689
3690 if (n == sz-1) {
3691 tmp0 = x[*idx];
3692 sum1 -= *v1*tmp0;
3693 sum2 -= *v2*tmp0;
3694 sum3 -= *v3*tmp0;
3695 sum4 -= *v4*tmp0;
3696 }
3697 x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3698 x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3699 x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3700 x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3701 row -= 4;
3702 break;
3703 case 5:
3704
3705 sum1 = b[row];
3706 sum2 = b[row-1];
3707 sum3 = b[row-2];
3708 sum4 = b[row-3];
3709 sum5 = b[row-4];
3710 v2 = a->a + diag[row-1] + 2;
3711 v3 = a->a + diag[row-2] + 3;
3712 v4 = a->a + diag[row-3] + 4;
3713 v5 = a->a + diag[row-4] + 5;
3714 for (n = 0; n<sz-1; n+=2) {
3715 i1 = idx[0];
3716 i2 = idx[1];
3717 idx += 2;
3718 tmp0 = x[i1];
3719 tmp1 = x[i2];
3720 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3721 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3722 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3723 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3724 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3725 }
3726
3727 if (n == sz-1) {
3728 tmp0 = x[*idx];
3729 sum1 -= *v1*tmp0;
3730 sum2 -= *v2*tmp0;
3731 sum3 -= *v3*tmp0;
3732 sum4 -= *v4*tmp0;
3733 sum5 -= *v5*tmp0;
3734 }
3735 x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3736 x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3737 x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3738 x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3739 x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3740 row -= 5;
3741 break;
3742 default:
3743 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3744 }
3745 }
3746 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3747
3748 /*
3749 t = b - D x where D is the block diagonal
3750 */
3751 cnt = 0;
3752 for (i=0, row=0; i<m; i++) {
3753 switch (sizes[i]) {
3754 case 1:
3755 t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3756 break;
3757 case 2:
3758 x1 = x[row]; x2 = x[row+1];
3759 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3760 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3761 t[row] = b[row] - tmp1;
3762 t[row+1] = b[row+1] - tmp2; row += 2;
3763 cnt += 4;
3764 break;
3765 case 3:
3766 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2];
3767 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3768 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3769 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3770 t[row] = b[row] - tmp1;
3771 t[row+1] = b[row+1] - tmp2;
3772 t[row+2] = b[row+2] - tmp3; row += 3;
3773 cnt += 9;
3774 break;
3775 case 4:
3776 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3777 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3778 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3779 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3780 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3781 t[row] = b[row] - tmp1;
3782 t[row+1] = b[row+1] - tmp2;
3783 t[row+2] = b[row+2] - tmp3;
3784 t[row+3] = b[row+3] - tmp4; row += 4;
3785 cnt += 16;
3786 break;
3787 case 5:
3788 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3789 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3790 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3791 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3792 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3793 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3794 t[row] = b[row] - tmp1;
3795 t[row+1] = b[row+1] - tmp2;
3796 t[row+2] = b[row+2] - tmp3;
3797 t[row+3] = b[row+3] - tmp4;
3798 t[row+4] = b[row+4] - tmp5;row += 5;
3799 cnt += 25;
3800 break;
3801 default:
3802 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3803 }
3804 }
3805 ierr = PetscLogFlops(m);CHKERRQ(ierr);
3806
3807
3808
3809 /*
3810 Apply (L + D)^-1 where D is the block diagonal
3811 */
3812 for (i=0, row=0; i<m; i++) {
3813 sz = diag[row] - ii[row];
3814 v1 = a->a + ii[row];
3815 idx = a->j + ii[row];
3816 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3817 switch (sizes[i]) {
3818 case 1:
3819
3820 sum1 = t[row];
3821 for (n = 0; n<sz-1; n+=2) {
3822 i1 = idx[0];
3823 i2 = idx[1];
3824 idx += 2;
3825 tmp0 = t[i1];
3826 tmp1 = t[i2];
3827 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3828 }
3829
3830 if (n == sz-1) {
3831 tmp0 = t[*idx];
3832 sum1 -= *v1 * tmp0;
3833 }
3834 x[row] += t[row] = sum1*(*ibdiag++); row++;
3835 break;
3836 case 2:
3837 v2 = a->a + ii[row+1];
3838 sum1 = t[row];
3839 sum2 = t[row+1];
3840 for (n = 0; n<sz-1; n+=2) {
3841 i1 = idx[0];
3842 i2 = idx[1];
3843 idx += 2;
3844 tmp0 = t[i1];
3845 tmp1 = t[i2];
3846 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3847 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3848 }
3849
3850 if (n == sz-1) {
3851 tmp0 = t[*idx];
3852 sum1 -= v1[0] * tmp0;
3853 sum2 -= v2[0] * tmp0;
3854 }
3855 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3856 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3857 ibdiag += 4; row += 2;
3858 break;
3859 case 3:
3860 v2 = a->a + ii[row+1];
3861 v3 = a->a + ii[row+2];
3862 sum1 = t[row];
3863 sum2 = t[row+1];
3864 sum3 = t[row+2];
3865 for (n = 0; n<sz-1; n+=2) {
3866 i1 = idx[0];
3867 i2 = idx[1];
3868 idx += 2;
3869 tmp0 = t[i1];
3870 tmp1 = t[i2];
3871 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3872 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3873 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3874 }
3875
3876 if (n == sz-1) {
3877 tmp0 = t[*idx];
3878 sum1 -= v1[0] * tmp0;
3879 sum2 -= v2[0] * tmp0;
3880 sum3 -= v3[0] * tmp0;
3881 }
3882 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3883 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3884 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3885 ibdiag += 9; row += 3;
3886 break;
3887 case 4:
3888 v2 = a->a + ii[row+1];
3889 v3 = a->a + ii[row+2];
3890 v4 = a->a + ii[row+3];
3891 sum1 = t[row];
3892 sum2 = t[row+1];
3893 sum3 = t[row+2];
3894 sum4 = t[row+3];
3895 for (n = 0; n<sz-1; n+=2) {
3896 i1 = idx[0];
3897 i2 = idx[1];
3898 idx += 2;
3899 tmp0 = t[i1];
3900 tmp1 = t[i2];
3901 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3902 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3903 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3904 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3905 }
3906
3907 if (n == sz-1) {
3908 tmp0 = t[*idx];
3909 sum1 -= v1[0] * tmp0;
3910 sum2 -= v2[0] * tmp0;
3911 sum3 -= v3[0] * tmp0;
3912 sum4 -= v4[0] * tmp0;
3913 }
3914 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3915 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3916 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3917 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3918 ibdiag += 16; row += 4;
3919 break;
3920 case 5:
3921 v2 = a->a + ii[row+1];
3922 v3 = a->a + ii[row+2];
3923 v4 = a->a + ii[row+3];
3924 v5 = a->a + ii[row+4];
3925 sum1 = t[row];
3926 sum2 = t[row+1];
3927 sum3 = t[row+2];
3928 sum4 = t[row+3];
3929 sum5 = t[row+4];
3930 for (n = 0; n<sz-1; n+=2) {
3931 i1 = idx[0];
3932 i2 = idx[1];
3933 idx += 2;
3934 tmp0 = t[i1];
3935 tmp1 = t[i2];
3936 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3937 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3938 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3939 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3940 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3941 }
3942
3943 if (n == sz-1) {
3944 tmp0 = t[*idx];
3945 sum1 -= v1[0] * tmp0;
3946 sum2 -= v2[0] * tmp0;
3947 sum3 -= v3[0] * tmp0;
3948 sum4 -= v4[0] * tmp0;
3949 sum5 -= v5[0] * tmp0;
3950 }
3951 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3952 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3953 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3954 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3955 x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3956 ibdiag += 25; row += 5;
3957 break;
3958 default:
3959 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3960 }
3961 }
3962 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3963 }
3964 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3965 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3966 PetscFunctionReturn(0);
3967 }
3968
MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)3969 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3970 {
3971 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
3972 PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3973 const MatScalar *bdiag = a->inode.bdiag;
3974 const PetscScalar *b;
3975 PetscErrorCode ierr;
3976 PetscInt m = a->inode.node_count,cnt = 0,i,row;
3977 const PetscInt *sizes = a->inode.size;
3978
3979 PetscFunctionBegin;
3980 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
3981 ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3982 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3983 cnt = 0;
3984 for (i=0, row=0; i<m; i++) {
3985 switch (sizes[i]) {
3986 case 1:
3987 x[row] = b[row]*bdiag[cnt++];row++;
3988 break;
3989 case 2:
3990 x1 = b[row]; x2 = b[row+1];
3991 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3992 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3993 x[row++] = tmp1;
3994 x[row++] = tmp2;
3995 cnt += 4;
3996 break;
3997 case 3:
3998 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2];
3999 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
4000 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
4001 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
4002 x[row++] = tmp1;
4003 x[row++] = tmp2;
4004 x[row++] = tmp3;
4005 cnt += 9;
4006 break;
4007 case 4:
4008 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
4009 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
4010 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
4011 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
4012 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
4013 x[row++] = tmp1;
4014 x[row++] = tmp2;
4015 x[row++] = tmp3;
4016 x[row++] = tmp4;
4017 cnt += 16;
4018 break;
4019 case 5:
4020 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
4021 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
4022 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
4023 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
4024 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
4025 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
4026 x[row++] = tmp1;
4027 x[row++] = tmp2;
4028 x[row++] = tmp3;
4029 x[row++] = tmp4;
4030 x[row++] = tmp5;
4031 cnt += 25;
4032 break;
4033 default:
4034 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
4035 }
4036 }
4037 ierr = PetscLogFlops(2.0*cnt);CHKERRQ(ierr);
4038 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4039 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4040 PetscFunctionReturn(0);
4041 }
4042
MatSeqAIJ_Inode_ResetOps(Mat A)4043 static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A)
4044 {
4045 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4046
4047 PetscFunctionBegin;
4048 a->inode.node_count = 0;
4049 a->inode.use = PETSC_FALSE;
4050 a->inode.checked = PETSC_FALSE;
4051 a->inode.mat_nonzerostate = -1;
4052 A->ops->getrowij = MatGetRowIJ_SeqAIJ;
4053 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ;
4054 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ;
4055 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ;
4056 A->ops->coloringpatch = NULL;
4057 A->ops->multdiagonalblock = NULL;
4058 if (A->factortype) {
4059 A->ops->solve = MatSolve_SeqAIJ_inplace;
4060 }
4061 PetscFunctionReturn(0);
4062 }
4063
4064 /*
4065 samestructure indicates that the matrix has not changed its nonzero structure so we
4066 do not need to recompute the inodes
4067 */
MatSeqAIJCheckInode(Mat A)4068 PetscErrorCode MatSeqAIJCheckInode(Mat A)
4069 {
4070 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4071 PetscErrorCode ierr;
4072 PetscInt i,j,m,nzx,nzy,*ns,node_count,blk_size;
4073 PetscBool flag;
4074 const PetscInt *idx,*idy,*ii;
4075
4076 PetscFunctionBegin;
4077 if (!a->inode.use) {
4078 ierr = MatSeqAIJ_Inode_ResetOps(A);CHKERRQ(ierr);
4079 ierr = PetscFree(a->inode.size);CHKERRQ(ierr);
4080 PetscFunctionReturn(0);
4081 }
4082 if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(0);
4083
4084 m = A->rmap->n;
4085 if (!a->inode.size) { ierr = PetscMalloc1(m+1,&a->inode.size);CHKERRQ(ierr); }
4086 ns = a->inode.size;
4087
4088 i = 0;
4089 node_count = 0;
4090 idx = a->j;
4091 ii = a->i;
4092 while (i < m) { /* For each row */
4093 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */
4094 /* Limits the number of elements in a node to 'a->inode.limit' */
4095 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4096 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */
4097 if (nzy != nzx) break;
4098 idy += nzx; /* Same nonzero pattern */
4099 ierr = PetscArraycmp(idx,idy,nzx,&flag);CHKERRQ(ierr);
4100 if (!flag) break;
4101 }
4102 ns[node_count++] = blk_size;
4103 idx += blk_size*nzx;
4104 i = j;
4105 }
4106
4107 /* If not enough inodes found,, do not use inode version of the routines */
4108 if (!m || node_count > .8*m) {
4109 ierr = MatSeqAIJ_Inode_ResetOps(A);CHKERRQ(ierr);
4110 ierr = PetscFree(a->inode.size);CHKERRQ(ierr);
4111 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4112 } else {
4113 if (!A->factortype) {
4114 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4115 if (A->rmap->n == A->cmap->n) {
4116 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
4117 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
4118 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
4119 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4120 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
4121 }
4122 } else {
4123 A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4124 }
4125 a->inode.node_count = node_count;
4126 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4127 }
4128 a->inode.checked = PETSC_TRUE;
4129 a->inode.mat_nonzerostate = A->nonzerostate;
4130 PetscFunctionReturn(0);
4131 }
4132
MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat * C)4133 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C)
4134 {
4135 Mat B =*C;
4136 Mat_SeqAIJ *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data;
4137 PetscErrorCode ierr;
4138 PetscInt m=A->rmap->n;
4139
4140 PetscFunctionBegin;
4141 c->inode.use = a->inode.use;
4142 c->inode.limit = a->inode.limit;
4143 c->inode.max_limit = a->inode.max_limit;
4144 c->inode.checked = PETSC_FALSE;
4145 c->inode.size = NULL;
4146 c->inode.node_count = 0;
4147 c->inode.ibdiagvalid = PETSC_FALSE;
4148 c->inode.ibdiag = NULL;
4149 c->inode.bdiag = NULL;
4150 c->inode.mat_nonzerostate = -1;
4151 if (a->inode.use) {
4152 if (a->inode.checked && a->inode.size) {
4153 ierr = PetscMalloc1(m+1,&c->inode.size);CHKERRQ(ierr);
4154 ierr = PetscArraycpy(c->inode.size,a->inode.size,m+1);CHKERRQ(ierr);
4155
4156 c->inode.checked = PETSC_TRUE;
4157 c->inode.node_count = a->inode.node_count;
4158 c->inode.mat_nonzerostate = (*C)->nonzerostate;
4159 }
4160 /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4161 if (!B->factortype) {
4162 B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
4163 B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
4164 B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
4165 B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4166 B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
4167 B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4168 } else {
4169 B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4170 }
4171 }
4172 PetscFunctionReturn(0);
4173 }
4174
MatGetRow_FactoredLU(PetscInt * cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt * ai,const PetscInt * aj,const PetscInt * adiag,PetscInt row)4175 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row)
4176 {
4177 PetscInt k;
4178 const PetscInt *vi;
4179
4180 PetscFunctionBegin;
4181 vi = aj + ai[row];
4182 for (k=0; k<nzl; k++) cols[k] = vi[k];
4183 vi = aj + adiag[row];
4184 cols[nzl] = vi[0];
4185 vi = aj + adiag[row+1]+1;
4186 for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k];
4187 PetscFunctionReturn(0);
4188 }
4189 /*
4190 MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4191 Modified from MatSeqAIJCheckInode().
4192
4193 Input Parameters:
4194 . Mat A - ILU or LU matrix factor
4195
4196 */
MatSeqAIJCheckInode_FactorLU(Mat A)4197 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4198 {
4199 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4200 PetscErrorCode ierr;
4201 PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
4202 PetscInt *cols1,*cols2,*ns;
4203 const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag;
4204 PetscBool flag;
4205
4206 PetscFunctionBegin;
4207 if (!a->inode.use) PetscFunctionReturn(0);
4208 if (a->inode.checked) PetscFunctionReturn(0);
4209
4210 m = A->rmap->n;
4211 if (a->inode.size) ns = a->inode.size;
4212 else {
4213 ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr);
4214 }
4215
4216 i = 0;
4217 node_count = 0;
4218 ierr = PetscMalloc2(m,&cols1,m,&cols2);CHKERRQ(ierr);
4219 while (i < m) { /* For each row */
4220 nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */
4221 nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
4222 nzx = nzl1 + nzu1 + 1;
4223 MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
4224
4225 /* Limits the number of elements in a node to 'a->inode.limit' */
4226 for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4227 nzl2 = ai[j+1] - ai[j];
4228 nzu2 = adiag[j] - adiag[j+1] - 1;
4229 nzy = nzl2 + nzu2 + 1;
4230 if (nzy != nzx) break;
4231 ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr);
4232 ierr = PetscArraycmp(cols1,cols2,nzx,&flag);CHKERRQ(ierr);
4233 if (!flag) break;
4234 }
4235 ns[node_count++] = blk_size;
4236 i = j;
4237 }
4238 ierr = PetscFree2(cols1,cols2);CHKERRQ(ierr);
4239 /* If not enough inodes found,, do not use inode version of the routines */
4240 if (!m || node_count > .8*m) {
4241 ierr = PetscFree(ns);CHKERRQ(ierr);
4242
4243 a->inode.node_count = 0;
4244 a->inode.size = NULL;
4245 a->inode.use = PETSC_FALSE;
4246
4247 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4248 } else {
4249 A->ops->mult = NULL;
4250 A->ops->sor = NULL;
4251 A->ops->multadd = NULL;
4252 A->ops->getrowij = NULL;
4253 A->ops->restorerowij = NULL;
4254 A->ops->getcolumnij = NULL;
4255 A->ops->restorecolumnij = NULL;
4256 A->ops->coloringpatch = NULL;
4257 A->ops->multdiagonalblock = NULL;
4258 a->inode.node_count = node_count;
4259 a->inode.size = ns;
4260
4261 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4262 }
4263 a->inode.checked = PETSC_TRUE;
4264 PetscFunctionReturn(0);
4265 }
4266
MatSeqAIJInvalidateDiagonal_Inode(Mat A)4267 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4268 {
4269 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
4270
4271 PetscFunctionBegin;
4272 a->inode.ibdiagvalid = PETSC_FALSE;
4273 PetscFunctionReturn(0);
4274 }
4275
4276 /*
4277 This is really ugly. if inodes are used this replaces the
4278 permutations with ones that correspond to rows/cols of the matrix
4279 rather then inode blocks
4280 */
MatInodeAdjustForInodes(Mat A,IS * rperm,IS * cperm)4281 PetscErrorCode MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
4282 {
4283 PetscErrorCode ierr;
4284
4285 PetscFunctionBegin;
4286 ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr);
4287 PetscFunctionReturn(0);
4288 }
4289
MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS * rperm,IS * cperm)4290 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
4291 {
4292 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
4293 PetscErrorCode ierr;
4294 PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
4295 const PetscInt *ridx,*cidx;
4296 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx;
4297 PetscInt nslim_col,*ns_col;
4298 IS ris = *rperm,cis = *cperm;
4299
4300 PetscFunctionBegin;
4301 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
4302 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
4303
4304 ierr = MatCreateColInode_Private(A,&nslim_col,&ns_col);CHKERRQ(ierr);
4305 ierr = PetscMalloc1(((nslim_row>nslim_col) ? nslim_row : nslim_col)+1,&tns);CHKERRQ(ierr);
4306 ierr = PetscMalloc2(m,&permr,n,&permc);CHKERRQ(ierr);
4307
4308 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
4309 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
4310
4311 /* Form the inode structure for the rows of permuted matric using inv perm*/
4312 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
4313
4314 /* Construct the permutations for rows*/
4315 for (i=0,row = 0; i<nslim_row; ++i) {
4316 indx = ridx[i];
4317 start_val = tns[indx];
4318 end_val = tns[indx + 1];
4319 for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
4320 }
4321
4322 /* Form the inode structure for the columns of permuted matrix using inv perm*/
4323 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
4324
4325 /* Construct permutations for columns */
4326 for (i=0,col=0; i<nslim_col; ++i) {
4327 indx = cidx[i];
4328 start_val = tns[indx];
4329 end_val = tns[indx + 1];
4330 for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
4331 }
4332
4333 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);CHKERRQ(ierr);
4334 ierr = ISSetPermutation(*rperm);CHKERRQ(ierr);
4335 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);CHKERRQ(ierr);
4336 ierr = ISSetPermutation(*cperm);CHKERRQ(ierr);
4337
4338 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
4339 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
4340
4341 ierr = PetscFree(ns_col);CHKERRQ(ierr);
4342 ierr = PetscFree2(permr,permc);CHKERRQ(ierr);
4343 ierr = ISDestroy(&cis);CHKERRQ(ierr);
4344 ierr = ISDestroy(&ris);CHKERRQ(ierr);
4345 ierr = PetscFree(tns);CHKERRQ(ierr);
4346 PetscFunctionReturn(0);
4347 }
4348
4349 /*@C
4350 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
4351
4352 Not Collective
4353
4354 Input Parameter:
4355 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
4356
4357 Output Parameter:
4358 + node_count - no of inodes present in the matrix.
4359 . sizes - an array of size node_count,with sizes of each inode.
4360 - limit - the max size used to generate the inodes.
4361
4362 Level: advanced
4363
4364 Notes:
4365 This routine returns some internal storage information
4366 of the matrix, it is intended to be used by advanced users.
4367 It should be called after the matrix is assembled.
4368 The contents of the sizes[] array should not be changed.
4369 NULL may be passed for information not requested.
4370
4371 .seealso: MatGetInfo()
4372 @*/
MatInodeGetInodeSizes(Mat A,PetscInt * node_count,PetscInt * sizes[],PetscInt * limit)4373 PetscErrorCode MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4374 {
4375 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
4376
4377 PetscFunctionBegin;
4378 if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
4379 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);CHKERRQ(ierr);
4380 if (f) {
4381 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
4382 }
4383 PetscFunctionReturn(0);
4384 }
4385
MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt * node_count,PetscInt * sizes[],PetscInt * limit)4386 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4387 {
4388 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4389
4390 PetscFunctionBegin;
4391 if (node_count) *node_count = a->inode.node_count;
4392 if (sizes) *sizes = a->inode.size;
4393 if (limit) *limit = a->inode.limit;
4394 PetscFunctionReturn(0);
4395 }
4396