1 
2 /*
3   This file provides high performance routines for the Inode format (compressed sparse row)
4   by taking advantage of rows with identical nonzero structure (I-nodes).
5 */
6 #include <../src/mat/impls/aij/seq/aij.h>
7 
MatCreateColInode_Private(Mat A,PetscInt * size,PetscInt ** ns)8 static PetscErrorCode MatCreateColInode_Private(Mat A,PetscInt *size,PetscInt **ns)
9 {
10   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
11   PetscErrorCode ierr;
12   PetscInt       i,count,m,n,min_mn,*ns_row,*ns_col;
13 
14   PetscFunctionBegin;
15   n = A->cmap->n;
16   m = A->rmap->n;
17   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
18   ns_row = a->inode.size;
19 
20   min_mn = (m < n) ? m : n;
21   if (!ns) {
22     for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ;
23     for (; count+1 < n; count++,i++) ;
24     if (count < n)  {
25       i++;
26     }
27     *size = i;
28     PetscFunctionReturn(0);
29   }
30   ierr = PetscMalloc1(n+1,&ns_col);CHKERRQ(ierr);
31 
32   /* Use the same row structure wherever feasible. */
33   for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
34     ns_col[i] = ns_row[i];
35   }
36 
37   /* if m < n; pad up the remainder with inode_limit */
38   for (; count+1 < n; count++,i++) {
39     ns_col[i] = 1;
40   }
41   /* The last node is the odd ball. padd it up with the remaining rows; */
42   if (count < n) {
43     ns_col[i] = n - count;
44     i++;
45   } else if (count > n) {
46     /* Adjust for the over estimation */
47     ns_col[i-1] += n - count;
48   }
49   *size = i;
50   *ns   = ns_col;
51   PetscFunctionReturn(0);
52 }
53 
54 
55 /*
56       This builds symmetric version of nonzero structure,
57 */
MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt * iia[],const PetscInt * jja[],PetscInt ishift,PetscInt oshift)58 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
59 {
60   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
61   PetscErrorCode ierr;
62   PetscInt       *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n;
63   PetscInt       *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2;
64   const PetscInt *j,*jmax,*ai= a->i,*aj = a->j;
65 
66   PetscFunctionBegin;
67   nslim_row = a->inode.node_count;
68   m         = A->rmap->n;
69   n         = A->cmap->n;
70   if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
71   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
72 
73   /* Use the row_inode as column_inode */
74   nslim_col = nslim_row;
75   ns_col    = ns_row;
76 
77   /* allocate space for reformated inode structure */
78   ierr = PetscMalloc2(nslim_col+1,&tns,n+1,&tvc);CHKERRQ(ierr);
79   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
80 
81   for (i1=0,col=0; i1<nslim_col; ++i1) {
82     nsz = ns_col[i1];
83     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
84   }
85   /* allocate space for row pointers */
86   ierr = PetscCalloc1(nslim_row+1,&ia);CHKERRQ(ierr);
87   *iia = ia;
88   ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr);
89 
90   /* determine the number of columns in each row */
91   ia[0] = oshift;
92   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
93 
94     j    = aj + ai[row] + ishift;
95     jmax = aj + ai[row+1] + ishift;
96     if (j==jmax) continue; /* empty row */
97     col  = *j++ + ishift;
98     i2   = tvc[col];
99     while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
100       ia[i1+1]++;
101       ia[i2+1]++;
102       i2++;                     /* Start col of next node */
103       while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j;
104       i2 = tvc[col];
105     }
106     if (i2 == i1) ia[i2+1]++;    /* now the diagonal element */
107   }
108 
109   /* shift ia[i] to point to next row */
110   for (i1=1; i1<nslim_row+1; i1++) {
111     row        = ia[i1-1];
112     ia[i1]    += row;
113     work[i1-1] = row - oshift;
114   }
115 
116   /* allocate space for column pointers */
117   nz   = ia[nslim_row] + (!ishift);
118   ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr);
119   *jja = ja;
120 
121   /* loop over lower triangular part putting into ja */
122   for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
123     j    = aj + ai[row] + ishift;
124     jmax = aj + ai[row+1] + ishift;
125     if (j==jmax) continue; /* empty row */
126     col  = *j++ + ishift;
127     i2   = tvc[col];
128     while (i2<i1 && j<jmax) {
129       ja[work[i2]++] = i1 + oshift;
130       ja[work[i1]++] = i2 + oshift;
131       ++i2;
132       while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */
133       i2 = tvc[col];
134     }
135     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
136 
137   }
138   ierr = PetscFree(work);CHKERRQ(ierr);
139   ierr = PetscFree2(tns,tvc);CHKERRQ(ierr);
140   PetscFunctionReturn(0);
141 }
142 
143 /*
144       This builds nonsymmetric version of nonzero structure,
145 */
MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt * iia[],const PetscInt * jja[],PetscInt ishift,PetscInt oshift)146 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
147 {
148   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
149   PetscErrorCode ierr;
150   PetscInt       *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col;
151   PetscInt       *tns,*tvc,nsz,i1,i2;
152   const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size;
153 
154   PetscFunctionBegin;
155   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
156   nslim_row = a->inode.node_count;
157   n         = A->cmap->n;
158 
159   /* Create The column_inode for this matrix */
160   ierr = MatCreateColInode_Private(A,&nslim_col,&ns_col);CHKERRQ(ierr);
161 
162   /* allocate space for reformated column_inode structure */
163   ierr = PetscMalloc2(nslim_col +1,&tns,n + 1,&tvc);CHKERRQ(ierr);
164   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
165 
166   for (i1=0,col=0; i1<nslim_col; ++i1) {
167     nsz = ns_col[i1];
168     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
169   }
170   /* allocate space for row pointers */
171   ierr = PetscCalloc1(nslim_row+1,&ia);CHKERRQ(ierr);
172   *iia = ia;
173   ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr);
174 
175   /* determine the number of columns in each row */
176   ia[0] = oshift;
177   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
178     j   = aj + ai[row] + ishift;
179     nz  = ai[row+1] - ai[row];
180     if (!nz) continue; /* empty row */
181     col = *j++ + ishift;
182     i2  = tvc[col];
183     while (nz-- > 0) {           /* off-diagonal elemets */
184       ia[i1+1]++;
185       i2++;                     /* Start col of next node */
186       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
187       if (nz > 0) i2 = tvc[col];
188     }
189   }
190 
191   /* shift ia[i] to point to next row */
192   for (i1=1; i1<nslim_row+1; i1++) {
193     row        = ia[i1-1];
194     ia[i1]    += row;
195     work[i1-1] = row - oshift;
196   }
197 
198   /* allocate space for column pointers */
199   nz   = ia[nslim_row] + (!ishift);
200   ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr);
201   *jja = ja;
202 
203   /* loop over matrix putting into ja */
204   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
205     j   = aj + ai[row] + ishift;
206     nz  = ai[row+1] - ai[row];
207     if (!nz) continue; /* empty row */
208     col = *j++ + ishift;
209     i2  = tvc[col];
210     while (nz-- > 0) {
211       ja[work[i1]++] = i2 + oshift;
212       ++i2;
213       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
214       if (nz > 0) i2 = tvc[col];
215     }
216   }
217   ierr = PetscFree(ns_col);CHKERRQ(ierr);
218   ierr = PetscFree(work);CHKERRQ(ierr);
219   ierr = PetscFree2(tns,tvc);CHKERRQ(ierr);
220   PetscFunctionReturn(0);
221 }
222 
MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt * n,const PetscInt * ia[],const PetscInt * ja[],PetscBool * done)223 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
224 {
225   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
226   PetscErrorCode ierr;
227 
228   PetscFunctionBegin;
229   *n = a->inode.node_count;
230   if (!ia) PetscFunctionReturn(0);
231   if (!blockcompressed) {
232     ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);
233   } else if (symmetric) {
234     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
235   } else {
236     ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
237   }
238   PetscFunctionReturn(0);
239 }
240 
MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt * n,const PetscInt * ia[],const PetscInt * ja[],PetscBool * done)241 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
242 {
243   PetscErrorCode ierr;
244 
245   PetscFunctionBegin;
246   if (!ia) PetscFunctionReturn(0);
247 
248   if (!blockcompressed) {
249     ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);
250   } else {
251     ierr = PetscFree(*ia);CHKERRQ(ierr);
252     ierr = PetscFree(*ja);CHKERRQ(ierr);
253   }
254   PetscFunctionReturn(0);
255 }
256 
257 /* ----------------------------------------------------------- */
258 
MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt * iia[],const PetscInt * jja[],PetscInt ishift,PetscInt oshift)259 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift)
260 {
261   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
262   PetscErrorCode ierr;
263   PetscInt       *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
264   PetscInt       *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
265 
266   PetscFunctionBegin;
267   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
268   nslim_row = a->inode.node_count;
269   n         = A->cmap->n;
270 
271   /* Create The column_inode for this matrix */
272   ierr = MatCreateColInode_Private(A,&nslim_col,&ns_col);CHKERRQ(ierr);
273 
274   /* allocate space for reformated column_inode structure */
275   ierr = PetscMalloc2(nslim_col + 1,&tns,n + 1,&tvc);CHKERRQ(ierr);
276   for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
277 
278   for (i1=0,col=0; i1<nslim_col; ++i1) {
279     nsz = ns_col[i1];
280     for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1;
281   }
282   /* allocate space for column pointers */
283   ierr = PetscCalloc1(nslim_col+1,&ia);CHKERRQ(ierr);
284   *iia = ia;
285   ierr = PetscMalloc1(nslim_col+1,&work);CHKERRQ(ierr);
286 
287   /* determine the number of columns in each row */
288   ia[0] = oshift;
289   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
290     j   = aj + ai[row] + ishift;
291     col = *j++ + ishift;
292     i2  = tvc[col];
293     nz  = ai[row+1] - ai[row];
294     while (nz-- > 0) {           /* off-diagonal elemets */
295       /* ia[i1+1]++; */
296       ia[i2+1]++;
297       i2++;
298       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
299       if (nz > 0) i2 = tvc[col];
300     }
301   }
302 
303   /* shift ia[i] to point to next col */
304   for (i1=1; i1<nslim_col+1; i1++) {
305     col        = ia[i1-1];
306     ia[i1]    += col;
307     work[i1-1] = col - oshift;
308   }
309 
310   /* allocate space for column pointers */
311   nz   = ia[nslim_col] + (!ishift);
312   ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr);
313   *jja = ja;
314 
315   /* loop over matrix putting into ja */
316   for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
317     j   = aj + ai[row] + ishift;
318     col = *j++ + ishift;
319     i2  = tvc[col];
320     nz  = ai[row+1] - ai[row];
321     while (nz-- > 0) {
322       /* ja[work[i1]++] = i2 + oshift; */
323       ja[work[i2]++] = i1 + oshift;
324       i2++;
325       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
326       if (nz > 0) i2 = tvc[col];
327     }
328   }
329   ierr = PetscFree(ns_col);CHKERRQ(ierr);
330   ierr = PetscFree(work);CHKERRQ(ierr);
331   ierr = PetscFree2(tns,tvc);CHKERRQ(ierr);
332   PetscFunctionReturn(0);
333 }
334 
MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt * n,const PetscInt * ia[],const PetscInt * ja[],PetscBool * done)335 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
336 {
337   PetscErrorCode ierr;
338 
339   PetscFunctionBegin;
340   ierr = MatCreateColInode_Private(A,n,NULL);CHKERRQ(ierr);
341   if (!ia) PetscFunctionReturn(0);
342 
343   if (!blockcompressed) {
344     ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);
345   } else if (symmetric) {
346     /* Since the indices are symmetric it does'nt matter */
347     ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
348   } else {
349     ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr);
350   }
351   PetscFunctionReturn(0);
352 }
353 
MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt * n,const PetscInt * ia[],const PetscInt * ja[],PetscBool * done)354 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool  *done)
355 {
356   PetscErrorCode ierr;
357 
358   PetscFunctionBegin;
359   if (!ia) PetscFunctionReturn(0);
360   if (!blockcompressed) {
361     ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);
362   } else {
363     ierr = PetscFree(*ia);CHKERRQ(ierr);
364     ierr = PetscFree(*ja);CHKERRQ(ierr);
365   }
366   PetscFunctionReturn(0);
367 }
368 
369 /* ----------------------------------------------------------- */
370 
MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)371 PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)
372 {
373   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
374   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
375   PetscScalar       *y;
376   const PetscScalar *x;
377   const MatScalar   *v1,*v2,*v3,*v4,*v5;
378   PetscErrorCode    ierr;
379   PetscInt          i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0;
380   const PetscInt    *idx,*ns,*ii;
381 
382 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
383 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
384 #endif
385 
386   PetscFunctionBegin;
387   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
388   node_max = a->inode.node_count;
389   ns       = a->inode.size;     /* Node Size array */
390   ierr     = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
391   ierr     = VecGetArray(yy,&y);CHKERRQ(ierr);
392   idx      = a->j;
393   v1       = a->a;
394   ii       = a->i;
395 
396   for (i = 0,row = 0; i< node_max; ++i) {
397     nsz         = ns[i];
398     n           = ii[1] - ii[0];
399     nonzerorow += (n>0)*nsz;
400     ii         += nsz;
401     PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA);    /* Prefetch the indices for the block row after the current one */
402     PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
403     sz = n;                     /* No of non zeros in this row */
404                                 /* Switch on the size of Node */
405     switch (nsz) {               /* Each loop in 'case' is unrolled */
406     case 1:
407       sum1 = 0.;
408 
409       for (n = 0; n< sz-1; n+=2) {
410         i1    = idx[0];         /* The instructions are ordered to */
411         i2    = idx[1];         /* make the compiler's job easy */
412         idx  += 2;
413         tmp0  = x[i1];
414         tmp1  = x[i2];
415         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
416       }
417 
418       if (n == sz-1) {          /* Take care of the last nonzero  */
419         tmp0  = x[*idx++];
420         sum1 += *v1++ *tmp0;
421       }
422       y[row++]=sum1;
423       break;
424     case 2:
425       sum1 = 0.;
426       sum2 = 0.;
427       v2   = v1 + n;
428 
429       for (n = 0; n< sz-1; n+=2) {
430         i1    = idx[0];
431         i2    = idx[1];
432         idx  += 2;
433         tmp0  = x[i1];
434         tmp1  = x[i2];
435         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
436         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
437       }
438       if (n == sz-1) {
439         tmp0  = x[*idx++];
440         sum1 += *v1++ * tmp0;
441         sum2 += *v2++ * tmp0;
442       }
443       y[row++]=sum1;
444       y[row++]=sum2;
445       v1      =v2;              /* Since the next block to be processed starts there*/
446       idx    +=sz;
447       break;
448     case 3:
449       sum1 = 0.;
450       sum2 = 0.;
451       sum3 = 0.;
452       v2   = v1 + n;
453       v3   = v2 + n;
454 
455       for (n = 0; n< sz-1; n+=2) {
456         i1    = idx[0];
457         i2    = idx[1];
458         idx  += 2;
459         tmp0  = x[i1];
460         tmp1  = x[i2];
461         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
462         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
463         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
464       }
465       if (n == sz-1) {
466         tmp0  = x[*idx++];
467         sum1 += *v1++ * tmp0;
468         sum2 += *v2++ * tmp0;
469         sum3 += *v3++ * tmp0;
470       }
471       y[row++]=sum1;
472       y[row++]=sum2;
473       y[row++]=sum3;
474       v1      =v3;              /* Since the next block to be processed starts there*/
475       idx    +=2*sz;
476       break;
477     case 4:
478       sum1 = 0.;
479       sum2 = 0.;
480       sum3 = 0.;
481       sum4 = 0.;
482       v2   = v1 + n;
483       v3   = v2 + n;
484       v4   = v3 + n;
485 
486       for (n = 0; n< sz-1; n+=2) {
487         i1    = idx[0];
488         i2    = idx[1];
489         idx  += 2;
490         tmp0  = x[i1];
491         tmp1  = x[i2];
492         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
493         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
494         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
495         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
496       }
497       if (n == sz-1) {
498         tmp0  = x[*idx++];
499         sum1 += *v1++ * tmp0;
500         sum2 += *v2++ * tmp0;
501         sum3 += *v3++ * tmp0;
502         sum4 += *v4++ * tmp0;
503       }
504       y[row++]=sum1;
505       y[row++]=sum2;
506       y[row++]=sum3;
507       y[row++]=sum4;
508       v1      =v4;              /* Since the next block to be processed starts there*/
509       idx    +=3*sz;
510       break;
511     case 5:
512       sum1 = 0.;
513       sum2 = 0.;
514       sum3 = 0.;
515       sum4 = 0.;
516       sum5 = 0.;
517       v2   = v1 + n;
518       v3   = v2 + n;
519       v4   = v3 + n;
520       v5   = v4 + n;
521 
522       for (n = 0; n<sz-1; n+=2) {
523         i1    = idx[0];
524         i2    = idx[1];
525         idx  += 2;
526         tmp0  = x[i1];
527         tmp1  = x[i2];
528         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
529         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
530         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
531         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
532         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
533       }
534       if (n == sz-1) {
535         tmp0  = x[*idx++];
536         sum1 += *v1++ * tmp0;
537         sum2 += *v2++ * tmp0;
538         sum3 += *v3++ * tmp0;
539         sum4 += *v4++ * tmp0;
540         sum5 += *v5++ * tmp0;
541       }
542       y[row++]=sum1;
543       y[row++]=sum2;
544       y[row++]=sum3;
545       y[row++]=sum4;
546       y[row++]=sum5;
547       v1      =v5;       /* Since the next block to be processed starts there */
548       idx    +=4*sz;
549       break;
550     default:
551       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
552     }
553   }
554   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
555   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
556   ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr);
557   PetscFunctionReturn(0);
558 }
559 /* ----------------------------------------------------------- */
560 /* Almost same code as the MatMult_SeqAIJ_Inode() */
MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)561 PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)
562 {
563   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
564   PetscScalar       sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
565   const MatScalar   *v1,*v2,*v3,*v4,*v5;
566   const PetscScalar *x;
567   PetscScalar       *y,*z,*zt;
568   PetscErrorCode    ierr;
569   PetscInt          i1,i2,n,i,row,node_max,nsz,sz;
570   const PetscInt    *idx,*ns,*ii;
571 
572   PetscFunctionBegin;
573   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
574   node_max = a->inode.node_count;
575   ns       = a->inode.size;     /* Node Size array */
576 
577   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
578   ierr = VecGetArrayPair(zz,yy,&z,&y);CHKERRQ(ierr);
579   zt = z;
580 
581   idx = a->j;
582   v1  = a->a;
583   ii  = a->i;
584 
585   for (i = 0,row = 0; i< node_max; ++i) {
586     nsz = ns[i];
587     n   = ii[1] - ii[0];
588     ii += nsz;
589     sz  = n;                    /* No of non zeros in this row */
590                                 /* Switch on the size of Node */
591     switch (nsz) {               /* Each loop in 'case' is unrolled */
592     case 1:
593       sum1 = *zt++;
594 
595       for (n = 0; n< sz-1; n+=2) {
596         i1    = idx[0];         /* The instructions are ordered to */
597         i2    = idx[1];         /* make the compiler's job easy */
598         idx  += 2;
599         tmp0  = x[i1];
600         tmp1  = x[i2];
601         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
602       }
603 
604       if (n   == sz-1) {          /* Take care of the last nonzero  */
605         tmp0  = x[*idx++];
606         sum1 += *v1++ * tmp0;
607       }
608       y[row++]=sum1;
609       break;
610     case 2:
611       sum1 = *zt++;
612       sum2 = *zt++;
613       v2   = v1 + n;
614 
615       for (n = 0; n< sz-1; n+=2) {
616         i1    = idx[0];
617         i2    = idx[1];
618         idx  += 2;
619         tmp0  = x[i1];
620         tmp1  = x[i2];
621         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
622         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
623       }
624       if (n   == sz-1) {
625         tmp0  = x[*idx++];
626         sum1 += *v1++ * tmp0;
627         sum2 += *v2++ * tmp0;
628       }
629       y[row++]=sum1;
630       y[row++]=sum2;
631       v1      =v2;              /* Since the next block to be processed starts there*/
632       idx    +=sz;
633       break;
634     case 3:
635       sum1 = *zt++;
636       sum2 = *zt++;
637       sum3 = *zt++;
638       v2   = v1 + n;
639       v3   = v2 + n;
640 
641       for (n = 0; n< sz-1; n+=2) {
642         i1    = idx[0];
643         i2    = idx[1];
644         idx  += 2;
645         tmp0  = x[i1];
646         tmp1  = x[i2];
647         sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
648         sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
649         sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
650       }
651       if (n == sz-1) {
652         tmp0  = x[*idx++];
653         sum1 += *v1++ * tmp0;
654         sum2 += *v2++ * tmp0;
655         sum3 += *v3++ * tmp0;
656       }
657       y[row++]=sum1;
658       y[row++]=sum2;
659       y[row++]=sum3;
660       v1      =v3;              /* Since the next block to be processed starts there*/
661       idx    +=2*sz;
662       break;
663     case 4:
664       sum1 = *zt++;
665       sum2 = *zt++;
666       sum3 = *zt++;
667       sum4 = *zt++;
668       v2   = v1 + n;
669       v3   = v2 + n;
670       v4   = v3 + n;
671 
672       for (n = 0; n< sz-1; n+=2) {
673         i1    = idx[0];
674         i2    = idx[1];
675         idx  += 2;
676         tmp0  = x[i1];
677         tmp1  = x[i2];
678         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
679         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
680         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
681         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
682       }
683       if (n == sz-1) {
684         tmp0  = x[*idx++];
685         sum1 += *v1++ * tmp0;
686         sum2 += *v2++ * tmp0;
687         sum3 += *v3++ * tmp0;
688         sum4 += *v4++ * tmp0;
689       }
690       y[row++]=sum1;
691       y[row++]=sum2;
692       y[row++]=sum3;
693       y[row++]=sum4;
694       v1      =v4;              /* Since the next block to be processed starts there*/
695       idx    +=3*sz;
696       break;
697     case 5:
698       sum1 = *zt++;
699       sum2 = *zt++;
700       sum3 = *zt++;
701       sum4 = *zt++;
702       sum5 = *zt++;
703       v2   = v1 + n;
704       v3   = v2 + n;
705       v4   = v3 + n;
706       v5   = v4 + n;
707 
708       for (n = 0; n<sz-1; n+=2) {
709         i1    = idx[0];
710         i2    = idx[1];
711         idx  += 2;
712         tmp0  = x[i1];
713         tmp1  = x[i2];
714         sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
715         sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
716         sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
717         sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
718         sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
719       }
720       if (n   == sz-1) {
721         tmp0  = x[*idx++];
722         sum1 += *v1++ * tmp0;
723         sum2 += *v2++ * tmp0;
724         sum3 += *v3++ * tmp0;
725         sum4 += *v4++ * tmp0;
726         sum5 += *v5++ * tmp0;
727       }
728       y[row++]=sum1;
729       y[row++]=sum2;
730       y[row++]=sum3;
731       y[row++]=sum4;
732       y[row++]=sum5;
733       v1      =v5;       /* Since the next block to be processed starts there */
734       idx    +=4*sz;
735       break;
736     default:
737       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported");
738     }
739   }
740   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
741   ierr = VecRestoreArrayPair(zz,yy,&z,&y);CHKERRQ(ierr);
742   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
743   PetscFunctionReturn(0);
744 }
745 
746 /* ----------------------------------------------------------- */
MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)747 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)
748 {
749   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
750   IS                iscol = a->col,isrow = a->row;
751   PetscErrorCode    ierr;
752   const PetscInt    *r,*c,*rout,*cout;
753   PetscInt          i,j,n = A->rmap->n,nz;
754   PetscInt          node_max,*ns,row,nsz,aii,i0,i1;
755   const PetscInt    *ai = a->i,*a_j = a->j,*vi,*ad,*aj;
756   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
757   PetscScalar       sum1,sum2,sum3,sum4,sum5;
758   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
759   const PetscScalar *b;
760 
761   PetscFunctionBegin;
762   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
763   node_max = a->inode.node_count;
764   ns       = a->inode.size;     /* Node Size array */
765 
766   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
767   ierr = VecGetArrayWrite(xx,&x);CHKERRQ(ierr);
768   tmp  = a->solve_work;
769 
770   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
771   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
772 
773   /* forward solve the lower triangular */
774   tmps = tmp;
775   aa   = a_a;
776   aj   = a_j;
777   ad   = a->diag;
778 
779   for (i = 0,row = 0; i< node_max; ++i) {
780     nsz = ns[i];
781     aii = ai[row];
782     v1  = aa + aii;
783     vi  = aj + aii;
784     nz  = ad[row]- aii;
785     if (i < node_max-1) {
786       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
787       * but our indexing to determine it's size could. */
788       PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
789       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
790       PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
791       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
792     }
793 
794     switch (nsz) {               /* Each loop in 'case' is unrolled */
795     case 1:
796       sum1 = b[*r++];
797       for (j=0; j<nz-1; j+=2) {
798         i0    = vi[0];
799         i1    = vi[1];
800         vi   +=2;
801         tmp0  = tmps[i0];
802         tmp1  = tmps[i1];
803         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
804       }
805       if (j == nz-1) {
806         tmp0  = tmps[*vi++];
807         sum1 -= *v1++ *tmp0;
808       }
809       tmp[row++]=sum1;
810       break;
811     case 2:
812       sum1 = b[*r++];
813       sum2 = b[*r++];
814       v2   = aa + ai[row+1];
815 
816       for (j=0; j<nz-1; j+=2) {
817         i0    = vi[0];
818         i1    = vi[1];
819         vi   +=2;
820         tmp0  = tmps[i0];
821         tmp1  = tmps[i1];
822         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
823         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
824       }
825       if (j == nz-1) {
826         tmp0  = tmps[*vi++];
827         sum1 -= *v1++ *tmp0;
828         sum2 -= *v2++ *tmp0;
829       }
830       sum2     -= *v2++ *sum1;
831       tmp[row++]=sum1;
832       tmp[row++]=sum2;
833       break;
834     case 3:
835       sum1 = b[*r++];
836       sum2 = b[*r++];
837       sum3 = b[*r++];
838       v2   = aa + ai[row+1];
839       v3   = aa + ai[row+2];
840 
841       for (j=0; j<nz-1; j+=2) {
842         i0    = vi[0];
843         i1    = vi[1];
844         vi   +=2;
845         tmp0  = tmps[i0];
846         tmp1  = tmps[i1];
847         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
848         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
849         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
850       }
851       if (j == nz-1) {
852         tmp0  = tmps[*vi++];
853         sum1 -= *v1++ *tmp0;
854         sum2 -= *v2++ *tmp0;
855         sum3 -= *v3++ *tmp0;
856       }
857       sum2 -= *v2++ * sum1;
858       sum3 -= *v3++ * sum1;
859       sum3 -= *v3++ * sum2;
860 
861       tmp[row++]=sum1;
862       tmp[row++]=sum2;
863       tmp[row++]=sum3;
864       break;
865 
866     case 4:
867       sum1 = b[*r++];
868       sum2 = b[*r++];
869       sum3 = b[*r++];
870       sum4 = b[*r++];
871       v2   = aa + ai[row+1];
872       v3   = aa + ai[row+2];
873       v4   = aa + ai[row+3];
874 
875       for (j=0; j<nz-1; j+=2) {
876         i0    = vi[0];
877         i1    = vi[1];
878         vi   +=2;
879         tmp0  = tmps[i0];
880         tmp1  = tmps[i1];
881         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
882         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
883         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
884         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
885       }
886       if (j == nz-1) {
887         tmp0  = tmps[*vi++];
888         sum1 -= *v1++ *tmp0;
889         sum2 -= *v2++ *tmp0;
890         sum3 -= *v3++ *tmp0;
891         sum4 -= *v4++ *tmp0;
892       }
893       sum2 -= *v2++ * sum1;
894       sum3 -= *v3++ * sum1;
895       sum4 -= *v4++ * sum1;
896       sum3 -= *v3++ * sum2;
897       sum4 -= *v4++ * sum2;
898       sum4 -= *v4++ * sum3;
899 
900       tmp[row++]=sum1;
901       tmp[row++]=sum2;
902       tmp[row++]=sum3;
903       tmp[row++]=sum4;
904       break;
905     case 5:
906       sum1 = b[*r++];
907       sum2 = b[*r++];
908       sum3 = b[*r++];
909       sum4 = b[*r++];
910       sum5 = b[*r++];
911       v2   = aa + ai[row+1];
912       v3   = aa + ai[row+2];
913       v4   = aa + ai[row+3];
914       v5   = aa + ai[row+4];
915 
916       for (j=0; j<nz-1; j+=2) {
917         i0    = vi[0];
918         i1    = vi[1];
919         vi   +=2;
920         tmp0  = tmps[i0];
921         tmp1  = tmps[i1];
922         sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
923         sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
924         sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
925         sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
926         sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
927       }
928       if (j == nz-1) {
929         tmp0  = tmps[*vi++];
930         sum1 -= *v1++ *tmp0;
931         sum2 -= *v2++ *tmp0;
932         sum3 -= *v3++ *tmp0;
933         sum4 -= *v4++ *tmp0;
934         sum5 -= *v5++ *tmp0;
935       }
936 
937       sum2 -= *v2++ * sum1;
938       sum3 -= *v3++ * sum1;
939       sum4 -= *v4++ * sum1;
940       sum5 -= *v5++ * sum1;
941       sum3 -= *v3++ * sum2;
942       sum4 -= *v4++ * sum2;
943       sum5 -= *v5++ * sum2;
944       sum4 -= *v4++ * sum3;
945       sum5 -= *v5++ * sum3;
946       sum5 -= *v5++ * sum4;
947 
948       tmp[row++]=sum1;
949       tmp[row++]=sum2;
950       tmp[row++]=sum3;
951       tmp[row++]=sum4;
952       tmp[row++]=sum5;
953       break;
954     default:
955       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
956     }
957   }
958   /* backward solve the upper triangular */
959   for (i=node_max -1,row = n-1; i>=0; i--) {
960     nsz = ns[i];
961     aii = ai[row+1] -1;
962     v1  = aa + aii;
963     vi  = aj + aii;
964     nz  = aii- ad[row];
965     switch (nsz) {               /* Each loop in 'case' is unrolled */
966     case 1:
967       sum1 = tmp[row];
968 
969       for (j=nz; j>1; j-=2) {
970         vi   -=2;
971         i0    = vi[2];
972         i1    = vi[1];
973         tmp0  = tmps[i0];
974         tmp1  = tmps[i1];
975         v1   -= 2;
976         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
977       }
978       if (j==1) {
979         tmp0  = tmps[*vi--];
980         sum1 -= *v1-- * tmp0;
981       }
982       x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
983       break;
984     case 2:
985       sum1 = tmp[row];
986       sum2 = tmp[row -1];
987       v2   = aa + ai[row]-1;
988       for (j=nz; j>1; j-=2) {
989         vi   -=2;
990         i0    = vi[2];
991         i1    = vi[1];
992         tmp0  = tmps[i0];
993         tmp1  = tmps[i1];
994         v1   -= 2;
995         v2   -= 2;
996         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
997         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
998       }
999       if (j==1) {
1000         tmp0  = tmps[*vi--];
1001         sum1 -= *v1-- * tmp0;
1002         sum2 -= *v2-- * tmp0;
1003       }
1004 
1005       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1006       sum2   -= *v2-- * tmp0;
1007       x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1008       break;
1009     case 3:
1010       sum1 = tmp[row];
1011       sum2 = tmp[row -1];
1012       sum3 = tmp[row -2];
1013       v2   = aa + ai[row]-1;
1014       v3   = aa + ai[row -1]-1;
1015       for (j=nz; j>1; j-=2) {
1016         vi   -=2;
1017         i0    = vi[2];
1018         i1    = vi[1];
1019         tmp0  = tmps[i0];
1020         tmp1  = tmps[i1];
1021         v1   -= 2;
1022         v2   -= 2;
1023         v3   -= 2;
1024         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1025         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1026         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1027       }
1028       if (j==1) {
1029         tmp0  = tmps[*vi--];
1030         sum1 -= *v1-- * tmp0;
1031         sum2 -= *v2-- * tmp0;
1032         sum3 -= *v3-- * tmp0;
1033       }
1034       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1035       sum2   -= *v2-- * tmp0;
1036       sum3   -= *v3-- * tmp0;
1037       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1038       sum3   -= *v3-- * tmp0;
1039       x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1040 
1041       break;
1042     case 4:
1043       sum1 = tmp[row];
1044       sum2 = tmp[row -1];
1045       sum3 = tmp[row -2];
1046       sum4 = tmp[row -3];
1047       v2   = aa + ai[row]-1;
1048       v3   = aa + ai[row -1]-1;
1049       v4   = aa + ai[row -2]-1;
1050 
1051       for (j=nz; j>1; j-=2) {
1052         vi   -=2;
1053         i0    = vi[2];
1054         i1    = vi[1];
1055         tmp0  = tmps[i0];
1056         tmp1  = tmps[i1];
1057         v1   -= 2;
1058         v2   -= 2;
1059         v3   -= 2;
1060         v4   -= 2;
1061         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1062         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1063         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1064         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1065       }
1066       if (j==1) {
1067         tmp0  = tmps[*vi--];
1068         sum1 -= *v1-- * tmp0;
1069         sum2 -= *v2-- * tmp0;
1070         sum3 -= *v3-- * tmp0;
1071         sum4 -= *v4-- * tmp0;
1072       }
1073 
1074       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1075       sum2   -= *v2-- * tmp0;
1076       sum3   -= *v3-- * tmp0;
1077       sum4   -= *v4-- * tmp0;
1078       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1079       sum3   -= *v3-- * tmp0;
1080       sum4   -= *v4-- * tmp0;
1081       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1082       sum4   -= *v4-- * tmp0;
1083       x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1084       break;
1085     case 5:
1086       sum1 = tmp[row];
1087       sum2 = tmp[row -1];
1088       sum3 = tmp[row -2];
1089       sum4 = tmp[row -3];
1090       sum5 = tmp[row -4];
1091       v2   = aa + ai[row]-1;
1092       v3   = aa + ai[row -1]-1;
1093       v4   = aa + ai[row -2]-1;
1094       v5   = aa + ai[row -3]-1;
1095       for (j=nz; j>1; j-=2) {
1096         vi   -= 2;
1097         i0    = vi[2];
1098         i1    = vi[1];
1099         tmp0  = tmps[i0];
1100         tmp1  = tmps[i1];
1101         v1   -= 2;
1102         v2   -= 2;
1103         v3   -= 2;
1104         v4   -= 2;
1105         v5   -= 2;
1106         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1107         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1108         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1109         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1110         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1111       }
1112       if (j==1) {
1113         tmp0  = tmps[*vi--];
1114         sum1 -= *v1-- * tmp0;
1115         sum2 -= *v2-- * tmp0;
1116         sum3 -= *v3-- * tmp0;
1117         sum4 -= *v4-- * tmp0;
1118         sum5 -= *v5-- * tmp0;
1119       }
1120 
1121       tmp0    = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1122       sum2   -= *v2-- * tmp0;
1123       sum3   -= *v3-- * tmp0;
1124       sum4   -= *v4-- * tmp0;
1125       sum5   -= *v5-- * tmp0;
1126       tmp0    = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1127       sum3   -= *v3-- * tmp0;
1128       sum4   -= *v4-- * tmp0;
1129       sum5   -= *v5-- * tmp0;
1130       tmp0    = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1131       sum4   -= *v4-- * tmp0;
1132       sum5   -= *v5-- * tmp0;
1133       tmp0    = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1134       sum5   -= *v5-- * tmp0;
1135       x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1136       break;
1137     default:
1138       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
1139     }
1140   }
1141   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1142   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1143   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1144   ierr = VecRestoreArrayWrite(xx,&x);CHKERRQ(ierr);
1145   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
1146   PetscFunctionReturn(0);
1147 }
1148 
MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo * info)1149 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info)
1150 {
1151   Mat             C     =B;
1152   Mat_SeqAIJ      *a    =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data;
1153   IS              isrow = b->row,isicol = b->icol;
1154   PetscErrorCode  ierr;
1155   const PetscInt  *r,*ic,*ics;
1156   const PetscInt  n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag;
1157   PetscInt        i,j,k,nz,nzL,row,*pj;
1158   const PetscInt  *ajtmp,*bjtmp;
1159   MatScalar       *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4;
1160   const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4;
1161   FactorShiftCtx  sctx;
1162   const PetscInt  *ddiag;
1163   PetscReal       rs;
1164   MatScalar       d;
1165   PetscInt        inod,nodesz,node_max,col;
1166   const PetscInt  *ns;
1167   PetscInt        *tmp_vec1,*tmp_vec2,*nsmap;
1168 
1169   PetscFunctionBegin;
1170   /* MatPivotSetUp(): initialize shift context sctx */
1171   ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr);
1172 
1173   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1174     ddiag          = a->diag;
1175     sctx.shift_top = info->zeropivot;
1176     for (i=0; i<n; i++) {
1177       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1178       d  = (aa)[ddiag[i]];
1179       rs = -PetscAbsScalar(d) - PetscRealPart(d);
1180       v  = aa+ai[i];
1181       nz = ai[i+1] - ai[i];
1182       for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]);
1183       if (rs>sctx.shift_top) sctx.shift_top = rs;
1184     }
1185     sctx.shift_top *= 1.1;
1186     sctx.nshift_max = 5;
1187     sctx.shift_lo   = 0.;
1188     sctx.shift_hi   = 1.;
1189   }
1190 
1191   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1192   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1193 
1194   ierr  = PetscCalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4);CHKERRQ(ierr);
1195   ics   = ic;
1196 
1197   node_max = a->inode.node_count;
1198   ns       = a->inode.size;
1199   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1200 
1201   /* If max inode size > 4, split it into two inodes.*/
1202   /* also map the inode sizes according to the ordering */
1203   ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr);
1204   for (i=0,j=0; i<node_max; ++i,++j) {
1205     if (ns[i] > 4) {
1206       tmp_vec1[j] = 4;
1207       ++j;
1208       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1209     } else {
1210       tmp_vec1[j] = ns[i];
1211     }
1212   }
1213   /* Use the correct node_max */
1214   node_max = j;
1215 
1216   /* Now reorder the inode info based on mat re-ordering info */
1217   /* First create a row -> inode_size_array_index map */
1218   ierr = PetscMalloc1(n+1,&nsmap);CHKERRQ(ierr);
1219   ierr = PetscMalloc1(node_max+1,&tmp_vec2);CHKERRQ(ierr);
1220   for (i=0,row=0; i<node_max; i++) {
1221     nodesz = tmp_vec1[i];
1222     for (j=0; j<nodesz; j++,row++) {
1223       nsmap[row] = i;
1224     }
1225   }
1226   /* Using nsmap, create a reordered ns structure */
1227   for (i=0,j=0; i< node_max; i++) {
1228     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1229     tmp_vec2[i] = nodesz;
1230     j          += nodesz;
1231   }
1232   ierr = PetscFree(nsmap);CHKERRQ(ierr);
1233   ierr = PetscFree(tmp_vec1);CHKERRQ(ierr);
1234 
1235   /* Now use the correct ns */
1236   ns = tmp_vec2;
1237 
1238   do {
1239     sctx.newshift = PETSC_FALSE;
1240     /* Now loop over each block-row, and do the factorization */
1241     for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */
1242       nodesz = ns[inod];
1243 
1244       switch (nodesz) {
1245       case 1:
1246         /*----------*/
1247         /* zero rtmp1 */
1248         /* L part */
1249         nz    = bi[i+1] - bi[i];
1250         bjtmp = bj + bi[i];
1251         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1252 
1253         /* U part */
1254         nz    = bdiag[i]-bdiag[i+1];
1255         bjtmp = bj + bdiag[i+1]+1;
1256         for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0;
1257 
1258         /* load in initial (unfactored row) */
1259         nz    = ai[r[i]+1] - ai[r[i]];
1260         ajtmp = aj + ai[r[i]];
1261         v     = aa + ai[r[i]];
1262         for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
1263 
1264         /* ZeropivotApply() */
1265         rtmp1[i] += sctx.shift_amount;  /* shift the diagonal of the matrix */
1266 
1267         /* elimination */
1268         bjtmp = bj + bi[i];
1269         row   = *bjtmp++;
1270         nzL   = bi[i+1] - bi[i];
1271         for (k=0; k < nzL; k++) {
1272           pc = rtmp1 + row;
1273           if (*pc != 0.0) {
1274             pv   = b->a + bdiag[row];
1275             mul1 = *pc * (*pv);
1276             *pc  = mul1;
1277             pj   = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1278             pv   = b->a + bdiag[row+1]+1;
1279             nz   = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1280             for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
1281             ierr = PetscLogFlops(1+2.0*nz);CHKERRQ(ierr);
1282           }
1283           row = *bjtmp++;
1284         }
1285 
1286         /* finished row so stick it into b->a */
1287         rs = 0.0;
1288         /* L part */
1289         pv = b->a + bi[i];
1290         pj = b->j + bi[i];
1291         nz = bi[i+1] - bi[i];
1292         for (j=0; j<nz; j++) {
1293           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1294         }
1295 
1296         /* U part */
1297         pv = b->a + bdiag[i+1]+1;
1298         pj = b->j + bdiag[i+1]+1;
1299         nz = bdiag[i] - bdiag[i+1]-1;
1300         for (j=0; j<nz; j++) {
1301           pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]);
1302         }
1303 
1304         /* Check zero pivot */
1305         sctx.rs = rs;
1306         sctx.pv = rtmp1[i];
1307         ierr    = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1308         if (sctx.newshift) break;
1309 
1310         /* Mark diagonal and invert diagonal for simplier triangular solves */
1311         pv  = b->a + bdiag[i];
1312         *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
1313         break;
1314 
1315       case 2:
1316         /*----------*/
1317         /* zero rtmp1 and rtmp2 */
1318         /* L part */
1319         nz    = bi[i+1] - bi[i];
1320         bjtmp = bj + bi[i];
1321         for  (j=0; j<nz; j++) {
1322           col        = bjtmp[j];
1323           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1324         }
1325 
1326         /* U part */
1327         nz    = bdiag[i]-bdiag[i+1];
1328         bjtmp = bj + bdiag[i+1]+1;
1329         for  (j=0; j<nz; j++) {
1330           col        = bjtmp[j];
1331           rtmp1[col] = 0.0; rtmp2[col] = 0.0;
1332         }
1333 
1334         /* load in initial (unfactored row) */
1335         nz    = ai[r[i]+1] - ai[r[i]];
1336         ajtmp = aj + ai[r[i]];
1337         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1];
1338         for (j=0; j<nz; j++) {
1339           col        = ics[ajtmp[j]];
1340           rtmp1[col] = v1[j]; rtmp2[col] = v2[j];
1341         }
1342         /* ZeropivotApply(): shift the diagonal of the matrix  */
1343         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount;
1344 
1345         /* elimination */
1346         bjtmp = bj + bi[i];
1347         row   = *bjtmp++; /* pivot row */
1348         nzL   = bi[i+1] - bi[i];
1349         for (k=0; k < nzL; k++) {
1350           pc1 = rtmp1 + row;
1351           pc2 = rtmp2 + row;
1352           if (*pc1 != 0.0 || *pc2 != 0.0) {
1353             pv   = b->a + bdiag[row];
1354             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv);
1355             *pc1 = mul1;       *pc2 = mul2;
1356 
1357             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1358             pv = b->a + bdiag[row+1]+1;
1359             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1360             for (j=0; j<nz; j++) {
1361               col         = pj[j];
1362               rtmp1[col] -= mul1 * pv[j];
1363               rtmp2[col] -= mul2 * pv[j];
1364             }
1365             ierr = PetscLogFlops(2+4.0*nz);CHKERRQ(ierr);
1366           }
1367           row = *bjtmp++;
1368         }
1369 
1370         /* finished row i; check zero pivot, then stick row i into b->a */
1371         rs = 0.0;
1372         /* L part */
1373         pc1 = b->a + bi[i];
1374         pj  = b->j + bi[i];
1375         nz  = bi[i+1] - bi[i];
1376         for (j=0; j<nz; j++) {
1377           col    = pj[j];
1378           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1379         }
1380         /* U part */
1381         pc1 = b->a + bdiag[i+1]+1;
1382         pj  = b->j + bdiag[i+1]+1;
1383         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1384         for (j=0; j<nz; j++) {
1385           col    = pj[j];
1386           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1387         }
1388 
1389         sctx.rs = rs;
1390         sctx.pv = rtmp1[i];
1391         ierr    = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1392         if (sctx.newshift) break;
1393         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1394         *pc1 = 1.0/sctx.pv;
1395 
1396         /* Now take care of diagonal 2x2 block. */
1397         pc2 = rtmp2 + i;
1398         if (*pc2 != 0.0) {
1399           mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */
1400           *pc2 = mul1;          /* insert L entry */
1401           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1402           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1403           for (j=0; j<nz; j++) {
1404             col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col];
1405           }
1406           ierr = PetscLogFlops(1+2.0*nz);CHKERRQ(ierr);
1407         }
1408 
1409         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1410         rs = 0.0;
1411         /* L part */
1412         pc2 = b->a + bi[i+1];
1413         pj  = b->j + bi[i+1];
1414         nz  = bi[i+2] - bi[i+1];
1415         for (j=0; j<nz; j++) {
1416           col    = pj[j];
1417           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1418         }
1419         /* U part */
1420         pc2 = b->a + bdiag[i+2]+1;
1421         pj  = b->j + bdiag[i+2]+1;
1422         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1423         for (j=0; j<nz; j++) {
1424           col    = pj[j];
1425           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1426         }
1427 
1428         sctx.rs = rs;
1429         sctx.pv = rtmp2[i+1];
1430         ierr    = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr);
1431         if (sctx.newshift) break;
1432         pc2  = b->a + bdiag[i+1];
1433         *pc2 = 1.0/sctx.pv;
1434         break;
1435 
1436       case 3:
1437         /*----------*/
1438         /* zero rtmp */
1439         /* L part */
1440         nz    = bi[i+1] - bi[i];
1441         bjtmp = bj + bi[i];
1442         for  (j=0; j<nz; j++) {
1443           col        = bjtmp[j];
1444           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1445         }
1446 
1447         /* U part */
1448         nz    = bdiag[i]-bdiag[i+1];
1449         bjtmp = bj + bdiag[i+1]+1;
1450         for  (j=0; j<nz; j++) {
1451           col        = bjtmp[j];
1452           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;
1453         }
1454 
1455         /* load in initial (unfactored row) */
1456         nz    = ai[r[i]+1] - ai[r[i]];
1457         ajtmp = aj + ai[r[i]];
1458         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2];
1459         for (j=0; j<nz; j++) {
1460           col        = ics[ajtmp[j]];
1461           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j];
1462         }
1463         /* ZeropivotApply(): shift the diagonal of the matrix  */
1464         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount;
1465 
1466         /* elimination */
1467         bjtmp = bj + bi[i];
1468         row   = *bjtmp++; /* pivot row */
1469         nzL   = bi[i+1] - bi[i];
1470         for (k=0; k < nzL; k++) {
1471           pc1 = rtmp1 + row;
1472           pc2 = rtmp2 + row;
1473           pc3 = rtmp3 + row;
1474           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
1475             pv   = b->a + bdiag[row];
1476             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv);
1477             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3;
1478 
1479             pj = b->j + bdiag[row+1]+1;     /* beginning of U(row,:) */
1480             pv = b->a + bdiag[row+1]+1;
1481             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1482             for (j=0; j<nz; j++) {
1483               col         = pj[j];
1484               rtmp1[col] -= mul1 * pv[j];
1485               rtmp2[col] -= mul2 * pv[j];
1486               rtmp3[col] -= mul3 * pv[j];
1487             }
1488             ierr = PetscLogFlops(3+6.0*nz);CHKERRQ(ierr);
1489           }
1490           row = *bjtmp++;
1491         }
1492 
1493         /* finished row i; check zero pivot, then stick row i into b->a */
1494         rs = 0.0;
1495         /* L part */
1496         pc1 = b->a + bi[i];
1497         pj  = b->j + bi[i];
1498         nz  = bi[i+1] - bi[i];
1499         for (j=0; j<nz; j++) {
1500           col    = pj[j];
1501           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1502         }
1503         /* U part */
1504         pc1 = b->a + bdiag[i+1]+1;
1505         pj  = b->j + bdiag[i+1]+1;
1506         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1507         for (j=0; j<nz; j++) {
1508           col    = pj[j];
1509           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1510         }
1511 
1512         sctx.rs = rs;
1513         sctx.pv = rtmp1[i];
1514         ierr    = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1515         if (sctx.newshift) break;
1516         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1517         *pc1 = 1.0/sctx.pv;
1518 
1519         /* Now take care of 1st column of diagonal 3x3 block. */
1520         pc2 = rtmp2 + i;
1521         pc3 = rtmp3 + i;
1522         if (*pc2 != 0.0 || *pc3 != 0.0) {
1523           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1524           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1525           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1526           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1527           for (j=0; j<nz; j++) {
1528             col         = pj[j];
1529             rtmp2[col] -= mul2 * rtmp1[col];
1530             rtmp3[col] -= mul3 * rtmp1[col];
1531           }
1532           ierr = PetscLogFlops(2+4.0*nz);CHKERRQ(ierr);
1533         }
1534 
1535         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1536         rs = 0.0;
1537         /* L part */
1538         pc2 = b->a + bi[i+1];
1539         pj  = b->j + bi[i+1];
1540         nz  = bi[i+2] - bi[i+1];
1541         for (j=0; j<nz; j++) {
1542           col    = pj[j];
1543           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1544         }
1545         /* U part */
1546         pc2 = b->a + bdiag[i+2]+1;
1547         pj  = b->j + bdiag[i+2]+1;
1548         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1549         for (j=0; j<nz; j++) {
1550           col    = pj[j];
1551           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1552         }
1553 
1554         sctx.rs = rs;
1555         sctx.pv = rtmp2[i+1];
1556         ierr    = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr);
1557         if (sctx.newshift) break;
1558         pc2  = b->a + bdiag[i+1];
1559         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1560 
1561         /* Now take care of 2nd column of diagonal 3x3 block. */
1562         pc3 = rtmp3 + i+1;
1563         if (*pc3 != 0.0) {
1564           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1565           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1566           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1567           for (j=0; j<nz; j++) {
1568             col         = pj[j];
1569             rtmp3[col] -= mul3 * rtmp2[col];
1570           }
1571           ierr = PetscLogFlops(1+2.0*nz);CHKERRQ(ierr);
1572         }
1573 
1574         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1575         rs = 0.0;
1576         /* L part */
1577         pc3 = b->a + bi[i+2];
1578         pj  = b->j + bi[i+2];
1579         nz  = bi[i+3] - bi[i+2];
1580         for (j=0; j<nz; j++) {
1581           col    = pj[j];
1582           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1583         }
1584         /* U part */
1585         pc3 = b->a + bdiag[i+3]+1;
1586         pj  = b->j + bdiag[i+3]+1;
1587         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1588         for (j=0; j<nz; j++) {
1589           col    = pj[j];
1590           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1591         }
1592 
1593         sctx.rs = rs;
1594         sctx.pv = rtmp3[i+2];
1595         ierr    = MatPivotCheck(B,A,info,&sctx,i+2);CHKERRQ(ierr);
1596         if (sctx.newshift) break;
1597         pc3  = b->a + bdiag[i+2];
1598         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1599         break;
1600       case 4:
1601         /*----------*/
1602         /* zero rtmp */
1603         /* L part */
1604         nz    = bi[i+1] - bi[i];
1605         bjtmp = bj + bi[i];
1606         for  (j=0; j<nz; j++) {
1607           col        = bjtmp[j];
1608           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0;
1609         }
1610 
1611         /* U part */
1612         nz    = bdiag[i]-bdiag[i+1];
1613         bjtmp = bj + bdiag[i+1]+1;
1614         for  (j=0; j<nz; j++) {
1615           col        = bjtmp[j];
1616           rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0;
1617         }
1618 
1619         /* load in initial (unfactored row) */
1620         nz    = ai[r[i]+1] - ai[r[i]];
1621         ajtmp = aj + ai[r[i]];
1622         v1    = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3];
1623         for (j=0; j<nz; j++) {
1624           col        = ics[ajtmp[j]];
1625           rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j];
1626         }
1627         /* ZeropivotApply(): shift the diagonal of the matrix  */
1628         rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount;
1629 
1630         /* elimination */
1631         bjtmp = bj + bi[i];
1632         row   = *bjtmp++; /* pivot row */
1633         nzL   = bi[i+1] - bi[i];
1634         for (k=0; k < nzL; k++) {
1635           pc1 = rtmp1 + row;
1636           pc2 = rtmp2 + row;
1637           pc3 = rtmp3 + row;
1638           pc4 = rtmp4 + row;
1639           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1640             pv   = b->a + bdiag[row];
1641             mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv);
1642             *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4;
1643 
1644             pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
1645             pv = b->a + bdiag[row+1]+1;
1646             nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */
1647             for (j=0; j<nz; j++) {
1648               col         = pj[j];
1649               rtmp1[col] -= mul1 * pv[j];
1650               rtmp2[col] -= mul2 * pv[j];
1651               rtmp3[col] -= mul3 * pv[j];
1652               rtmp4[col] -= mul4 * pv[j];
1653             }
1654             ierr = PetscLogFlops(4+8.0*nz);CHKERRQ(ierr);
1655           }
1656           row = *bjtmp++;
1657         }
1658 
1659         /* finished row i; check zero pivot, then stick row i into b->a */
1660         rs = 0.0;
1661         /* L part */
1662         pc1 = b->a + bi[i];
1663         pj  = b->j + bi[i];
1664         nz  = bi[i+1] - bi[i];
1665         for (j=0; j<nz; j++) {
1666           col    = pj[j];
1667           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1668         }
1669         /* U part */
1670         pc1 = b->a + bdiag[i+1]+1;
1671         pj  = b->j + bdiag[i+1]+1;
1672         nz  = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */
1673         for (j=0; j<nz; j++) {
1674           col    = pj[j];
1675           pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]);
1676         }
1677 
1678         sctx.rs = rs;
1679         sctx.pv = rtmp1[i];
1680         ierr    = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr);
1681         if (sctx.newshift) break;
1682         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1683         *pc1 = 1.0/sctx.pv;
1684 
1685         /* Now take care of 1st column of diagonal 4x4 block. */
1686         pc2 = rtmp2 + i;
1687         pc3 = rtmp3 + i;
1688         pc4 = rtmp4 + i;
1689         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
1690           mul2 = (*pc2)*(*pc1); *pc2 = mul2;
1691           mul3 = (*pc3)*(*pc1); *pc3 = mul3;
1692           mul4 = (*pc4)*(*pc1); *pc4 = mul4;
1693           pj   = b->j + bdiag[i+1]+1;   /* beginning of U(i,:) */
1694           nz   = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */
1695           for (j=0; j<nz; j++) {
1696             col         = pj[j];
1697             rtmp2[col] -= mul2 * rtmp1[col];
1698             rtmp3[col] -= mul3 * rtmp1[col];
1699             rtmp4[col] -= mul4 * rtmp1[col];
1700           }
1701           ierr = PetscLogFlops(3+6.0*nz);CHKERRQ(ierr);
1702         }
1703 
1704         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1705         rs = 0.0;
1706         /* L part */
1707         pc2 = b->a + bi[i+1];
1708         pj  = b->j + bi[i+1];
1709         nz  = bi[i+2] - bi[i+1];
1710         for (j=0; j<nz; j++) {
1711           col    = pj[j];
1712           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1713         }
1714         /* U part */
1715         pc2 = b->a + bdiag[i+2]+1;
1716         pj  = b->j + bdiag[i+2]+1;
1717         nz  = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */
1718         for (j=0; j<nz; j++) {
1719           col    = pj[j];
1720           pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]);
1721         }
1722 
1723         sctx.rs = rs;
1724         sctx.pv = rtmp2[i+1];
1725         ierr    = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr);
1726         if (sctx.newshift) break;
1727         pc2  = b->a + bdiag[i+1];
1728         *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */
1729 
1730         /* Now take care of 2nd column of diagonal 4x4 block. */
1731         pc3 = rtmp3 + i+1;
1732         pc4 = rtmp4 + i+1;
1733         if (*pc3 != 0.0 || *pc4 != 0.0) {
1734           mul3 = (*pc3)*(*pc2); *pc3 = mul3;
1735           mul4 = (*pc4)*(*pc2); *pc4 = mul4;
1736           pj   = b->j + bdiag[i+2]+1;     /* beginning of U(i+1,:) */
1737           nz   = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */
1738           for (j=0; j<nz; j++) {
1739             col         = pj[j];
1740             rtmp3[col] -= mul3 * rtmp2[col];
1741             rtmp4[col] -= mul4 * rtmp2[col];
1742           }
1743           ierr = PetscLogFlops(4.0*nz);CHKERRQ(ierr);
1744         }
1745 
1746         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
1747         rs = 0.0;
1748         /* L part */
1749         pc3 = b->a + bi[i+2];
1750         pj  = b->j + bi[i+2];
1751         nz  = bi[i+3] - bi[i+2];
1752         for (j=0; j<nz; j++) {
1753           col    = pj[j];
1754           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1755         }
1756         /* U part */
1757         pc3 = b->a + bdiag[i+3]+1;
1758         pj  = b->j + bdiag[i+3]+1;
1759         nz  = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */
1760         for (j=0; j<nz; j++) {
1761           col    = pj[j];
1762           pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]);
1763         }
1764 
1765         sctx.rs = rs;
1766         sctx.pv = rtmp3[i+2];
1767         ierr    = MatPivotCheck(B,A,info,&sctx,i+2);CHKERRQ(ierr);
1768         if (sctx.newshift) break;
1769         pc3  = b->a + bdiag[i+2];
1770         *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */
1771 
1772         /* Now take care of 3rd column of diagonal 4x4 block. */
1773         pc4 = rtmp4 + i+2;
1774         if (*pc4 != 0.0) {
1775           mul4 = (*pc4)*(*pc3); *pc4 = mul4;
1776           pj   = b->j + bdiag[i+3]+1;     /* beginning of U(i+2,:) */
1777           nz   = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */
1778           for (j=0; j<nz; j++) {
1779             col         = pj[j];
1780             rtmp4[col] -= mul4 * rtmp3[col];
1781           }
1782           ierr = PetscLogFlops(1+2.0*nz);CHKERRQ(ierr);
1783         }
1784 
1785         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
1786         rs = 0.0;
1787         /* L part */
1788         pc4 = b->a + bi[i+3];
1789         pj  = b->j + bi[i+3];
1790         nz  = bi[i+4] - bi[i+3];
1791         for (j=0; j<nz; j++) {
1792           col    = pj[j];
1793           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1794         }
1795         /* U part */
1796         pc4 = b->a + bdiag[i+4]+1;
1797         pj  = b->j + bdiag[i+4]+1;
1798         nz  = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */
1799         for (j=0; j<nz; j++) {
1800           col    = pj[j];
1801           pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]);
1802         }
1803 
1804         sctx.rs = rs;
1805         sctx.pv = rtmp4[i+3];
1806         ierr    = MatPivotCheck(B,A,info,&sctx,i+3);CHKERRQ(ierr);
1807         if (sctx.newshift) break;
1808         pc4  = b->a + bdiag[i+3];
1809         *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */
1810         break;
1811 
1812       default:
1813         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
1814       }
1815       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
1816       i += nodesz;                 /* Update the row */
1817     }
1818 
1819     /* MatPivotRefine() */
1820     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) {
1821       /*
1822        * if no shift in this attempt & shifting & started shifting & can refine,
1823        * then try lower shift
1824        */
1825       sctx.shift_hi       = sctx.shift_fraction;
1826       sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.;
1827       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
1828       sctx.newshift       = PETSC_TRUE;
1829       sctx.nshift++;
1830     }
1831   } while (sctx.newshift);
1832 
1833   ierr = PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);CHKERRQ(ierr);
1834   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
1835   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
1836   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1837 
1838   if (b->inode.size) {
1839     C->ops->solve           = MatSolve_SeqAIJ_Inode;
1840   } else {
1841     C->ops->solve           = MatSolve_SeqAIJ;
1842   }
1843   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
1844   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
1845   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
1846   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1847   C->assembled              = PETSC_TRUE;
1848   C->preallocated           = PETSC_TRUE;
1849 
1850   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
1851 
1852   /* MatShiftView(A,info,&sctx) */
1853   if (sctx.nshift) {
1854     if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) {
1855       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr);
1856     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
1857       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr);
1858     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
1859       ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);CHKERRQ(ierr);
1860     }
1861   }
1862   PetscFunctionReturn(0);
1863 }
1864 
MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo * info)1865 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info)
1866 {
1867   Mat             C     = B;
1868   Mat_SeqAIJ      *a    = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1869   IS              iscol = b->col,isrow = b->row,isicol = b->icol;
1870   PetscErrorCode  ierr;
1871   const PetscInt  *r,*ic,*c,*ics;
1872   PetscInt        n   = A->rmap->n,*bi = b->i;
1873   PetscInt        *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1874   PetscInt        i,j,idx,*bd = b->diag,node_max,nodesz;
1875   PetscInt        *ai = a->i,*aj = a->j;
1876   PetscInt        *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1877   PetscScalar     mul1,mul2,mul3,tmp;
1878   MatScalar       *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33;
1879   const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1;
1880   PetscReal       rs=0.0;
1881   FactorShiftCtx  sctx;
1882 
1883   PetscFunctionBegin;
1884   sctx.shift_top      = 0;
1885   sctx.nshift_max     = 0;
1886   sctx.shift_lo       = 0;
1887   sctx.shift_hi       = 0;
1888   sctx.shift_fraction = 0;
1889 
1890   /* if both shift schemes are chosen by user, only use info->shiftpd */
1891   if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1892     sctx.shift_top = 0;
1893     for (i=0; i<n; i++) {
1894       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1895       rs    = 0.0;
1896       ajtmp = aj + ai[i];
1897       rtmp1 = aa + ai[i];
1898       nz    = ai[i+1] - ai[i];
1899       for (j=0; j<nz; j++) {
1900         if (*ajtmp != i) {
1901           rs += PetscAbsScalar(*rtmp1++);
1902         } else {
1903           rs -= PetscRealPart(*rtmp1++);
1904         }
1905         ajtmp++;
1906       }
1907       if (rs>sctx.shift_top) sctx.shift_top = rs;
1908     }
1909     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1910     sctx.shift_top *= 1.1;
1911     sctx.nshift_max = 5;
1912     sctx.shift_lo   = 0.;
1913     sctx.shift_hi   = 1.;
1914   }
1915   sctx.shift_amount = 0;
1916   sctx.nshift       = 0;
1917 
1918   ierr   = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1919   ierr   = ISGetIndices(iscol,&c);CHKERRQ(ierr);
1920   ierr   = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
1921   ierr   = PetscCalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33);CHKERRQ(ierr);
1922   ics    = ic;
1923 
1924   node_max = a->inode.node_count;
1925   ns       = a->inode.size;
1926   if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information");
1927 
1928   /* If max inode size > 3, split it into two inodes.*/
1929   /* also map the inode sizes according to the ordering */
1930   ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr);
1931   for (i=0,j=0; i<node_max; ++i,++j) {
1932     if (ns[i]>3) {
1933       tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5  */
1934       ++j;
1935       tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1936     } else {
1937       tmp_vec1[j] = ns[i];
1938     }
1939   }
1940   /* Use the correct node_max */
1941   node_max = j;
1942 
1943   /* Now reorder the inode info based on mat re-ordering info */
1944   /* First create a row -> inode_size_array_index map */
1945   ierr = PetscMalloc2(n+1,&nsmap,node_max+1,&tmp_vec2);CHKERRQ(ierr);
1946   for (i=0,row=0; i<node_max; i++) {
1947     nodesz = tmp_vec1[i];
1948     for (j=0; j<nodesz; j++,row++) {
1949       nsmap[row] = i;
1950     }
1951   }
1952   /* Using nsmap, create a reordered ns structure */
1953   for (i=0,j=0; i< node_max; i++) {
1954     nodesz      = tmp_vec1[nsmap[r[j]]];     /* here the reordered row_no is in r[] */
1955     tmp_vec2[i] = nodesz;
1956     j          += nodesz;
1957   }
1958   ierr = PetscFree2(nsmap,tmp_vec1);CHKERRQ(ierr);
1959   /* Now use the correct ns */
1960   ns = tmp_vec2;
1961 
1962   do {
1963     sctx.newshift = PETSC_FALSE;
1964     /* Now loop over each block-row, and do the factorization */
1965     for (i=0,row=0; i<node_max; i++) {
1966       nodesz = ns[i];
1967       nz     = bi[row+1] - bi[row];
1968       bjtmp  = bj + bi[row];
1969 
1970       switch (nodesz) {
1971       case 1:
1972         for  (j=0; j<nz; j++) {
1973           idx         = bjtmp[j];
1974           rtmp11[idx] = 0.0;
1975         }
1976 
1977         /* load in initial (unfactored row) */
1978         idx    = r[row];
1979         nz_tmp = ai[idx+1] - ai[idx];
1980         ajtmp  = aj + ai[idx];
1981         v1     = aa + ai[idx];
1982 
1983         for (j=0; j<nz_tmp; j++) {
1984           idx         = ics[ajtmp[j]];
1985           rtmp11[idx] = v1[j];
1986         }
1987         rtmp11[ics[r[row]]] += sctx.shift_amount;
1988 
1989         prow = *bjtmp++;
1990         while (prow < row) {
1991           pc1 = rtmp11 + prow;
1992           if (*pc1 != 0.0) {
1993             pv     = ba + bd[prow];
1994             pj     = nbj + bd[prow];
1995             mul1   = *pc1 * *pv++;
1996             *pc1   = mul1;
1997             nz_tmp = bi[prow+1] - bd[prow] - 1;
1998             ierr   = PetscLogFlops(1+2.0*nz_tmp);CHKERRQ(ierr);
1999             for (j=0; j<nz_tmp; j++) {
2000               tmp          = pv[j];
2001               idx          = pj[j];
2002               rtmp11[idx] -= mul1 * tmp;
2003             }
2004           }
2005           prow = *bjtmp++;
2006         }
2007         pj  = bj + bi[row];
2008         pc1 = ba + bi[row];
2009 
2010         sctx.pv     = rtmp11[row];
2011         rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */
2012         rs          = 0.0;
2013         for (j=0; j<nz; j++) {
2014           idx    = pj[j];
2015           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2016           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2017         }
2018         sctx.rs = rs;
2019         ierr    = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr);
2020         if (sctx.newshift) goto endofwhile;
2021         break;
2022 
2023       case 2:
2024         for (j=0; j<nz; j++) {
2025           idx         = bjtmp[j];
2026           rtmp11[idx] = 0.0;
2027           rtmp22[idx] = 0.0;
2028         }
2029 
2030         /* load in initial (unfactored row) */
2031         idx    = r[row];
2032         nz_tmp = ai[idx+1] - ai[idx];
2033         ajtmp  = aj + ai[idx];
2034         v1     = aa + ai[idx];
2035         v2     = aa + ai[idx+1];
2036         for (j=0; j<nz_tmp; j++) {
2037           idx         = ics[ajtmp[j]];
2038           rtmp11[idx] = v1[j];
2039           rtmp22[idx] = v2[j];
2040         }
2041         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2042         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2043 
2044         prow = *bjtmp++;
2045         while (prow < row) {
2046           pc1 = rtmp11 + prow;
2047           pc2 = rtmp22 + prow;
2048           if (*pc1 != 0.0 || *pc2 != 0.0) {
2049             pv   = ba + bd[prow];
2050             pj   = nbj + bd[prow];
2051             mul1 = *pc1 * *pv;
2052             mul2 = *pc2 * *pv;
2053             ++pv;
2054             *pc1 = mul1;
2055             *pc2 = mul2;
2056 
2057             nz_tmp = bi[prow+1] - bd[prow] - 1;
2058             for (j=0; j<nz_tmp; j++) {
2059               tmp          = pv[j];
2060               idx          = pj[j];
2061               rtmp11[idx] -= mul1 * tmp;
2062               rtmp22[idx] -= mul2 * tmp;
2063             }
2064             ierr = PetscLogFlops(2+4.0*nz_tmp);CHKERRQ(ierr);
2065           }
2066           prow = *bjtmp++;
2067         }
2068 
2069         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2070         pc1 = rtmp11 + prow;
2071         pc2 = rtmp22 + prow;
2072 
2073         sctx.pv = *pc1;
2074         pj      = bj + bi[prow];
2075         rs      = 0.0;
2076         for (j=0; j<nz; j++) {
2077           idx = pj[j];
2078           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2079         }
2080         sctx.rs = rs;
2081         ierr    = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr);
2082         if (sctx.newshift) goto endofwhile;
2083 
2084         if (*pc2 != 0.0) {
2085           pj     = nbj + bd[prow];
2086           mul2   = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
2087           *pc2   = mul2;
2088           nz_tmp = bi[prow+1] - bd[prow] - 1;
2089           for (j=0; j<nz_tmp; j++) {
2090             idx          = pj[j];
2091             tmp          = rtmp11[idx];
2092             rtmp22[idx] -= mul2 * tmp;
2093           }
2094           ierr = PetscLogFlops(1+2.0*nz_tmp);CHKERRQ(ierr);
2095         }
2096 
2097         pj  = bj + bi[row];
2098         pc1 = ba + bi[row];
2099         pc2 = ba + bi[row+1];
2100 
2101         sctx.pv       = rtmp22[row+1];
2102         rs            = 0.0;
2103         rtmp11[row]   = 1.0/rtmp11[row];
2104         rtmp22[row+1] = 1.0/rtmp22[row+1];
2105         /* copy row entries from dense representation to sparse */
2106         for (j=0; j<nz; j++) {
2107           idx    = pj[j];
2108           pc1[j] = rtmp11[idx];
2109           pc2[j] = rtmp22[idx];
2110           if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
2111         }
2112         sctx.rs = rs;
2113         ierr    = MatPivotCheck(B,A,info,&sctx,row+1);CHKERRQ(ierr);
2114         if (sctx.newshift) goto endofwhile;
2115         break;
2116 
2117       case 3:
2118         for  (j=0; j<nz; j++) {
2119           idx         = bjtmp[j];
2120           rtmp11[idx] = 0.0;
2121           rtmp22[idx] = 0.0;
2122           rtmp33[idx] = 0.0;
2123         }
2124         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2125         idx    = r[row];
2126         nz_tmp = ai[idx+1] - ai[idx];
2127         ajtmp  = aj + ai[idx];
2128         v1     = aa + ai[idx];
2129         v2     = aa + ai[idx+1];
2130         v3     = aa + ai[idx+2];
2131         for (j=0; j<nz_tmp; j++) {
2132           idx         = ics[ajtmp[j]];
2133           rtmp11[idx] = v1[j];
2134           rtmp22[idx] = v2[j];
2135           rtmp33[idx] = v3[j];
2136         }
2137         rtmp11[ics[r[row]]]   += sctx.shift_amount;
2138         rtmp22[ics[r[row+1]]] += sctx.shift_amount;
2139         rtmp33[ics[r[row+2]]] += sctx.shift_amount;
2140 
2141         /* loop over all pivot row blocks above this row block */
2142         prow = *bjtmp++;
2143         while (prow < row) {
2144           pc1 = rtmp11 + prow;
2145           pc2 = rtmp22 + prow;
2146           pc3 = rtmp33 + prow;
2147           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) {
2148             pv   = ba  + bd[prow];
2149             pj   = nbj + bd[prow];
2150             mul1 = *pc1 * *pv;
2151             mul2 = *pc2 * *pv;
2152             mul3 = *pc3 * *pv;
2153             ++pv;
2154             *pc1 = mul1;
2155             *pc2 = mul2;
2156             *pc3 = mul3;
2157 
2158             nz_tmp = bi[prow+1] - bd[prow] - 1;
2159             /* update this row based on pivot row */
2160             for (j=0; j<nz_tmp; j++) {
2161               tmp          = pv[j];
2162               idx          = pj[j];
2163               rtmp11[idx] -= mul1 * tmp;
2164               rtmp22[idx] -= mul2 * tmp;
2165               rtmp33[idx] -= mul3 * tmp;
2166             }
2167             ierr = PetscLogFlops(3+6.0*nz_tmp);CHKERRQ(ierr);
2168           }
2169           prow = *bjtmp++;
2170         }
2171 
2172         /* Now take care of diagonal 3x3 block in this set of rows */
2173         /* note: prow = row here */
2174         pc1 = rtmp11 + prow;
2175         pc2 = rtmp22 + prow;
2176         pc3 = rtmp33 + prow;
2177 
2178         sctx.pv = *pc1;
2179         pj      = bj + bi[prow];
2180         rs      = 0.0;
2181         for (j=0; j<nz; j++) {
2182           idx = pj[j];
2183           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2184         }
2185         sctx.rs = rs;
2186         ierr    = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr);
2187         if (sctx.newshift) goto endofwhile;
2188 
2189         if (*pc2 != 0.0 || *pc3 != 0.0) {
2190           mul2   = (*pc2)/(*pc1);
2191           mul3   = (*pc3)/(*pc1);
2192           *pc2   = mul2;
2193           *pc3   = mul3;
2194           nz_tmp = bi[prow+1] - bd[prow] - 1;
2195           pj     = nbj + bd[prow];
2196           for (j=0; j<nz_tmp; j++) {
2197             idx          = pj[j];
2198             tmp          = rtmp11[idx];
2199             rtmp22[idx] -= mul2 * tmp;
2200             rtmp33[idx] -= mul3 * tmp;
2201           }
2202           ierr = PetscLogFlops(2+4.0*nz_tmp);CHKERRQ(ierr);
2203         }
2204         ++prow;
2205 
2206         pc2     = rtmp22 + prow;
2207         pc3     = rtmp33 + prow;
2208         sctx.pv = *pc2;
2209         pj      = bj + bi[prow];
2210         rs      = 0.0;
2211         for (j=0; j<nz; j++) {
2212           idx = pj[j];
2213           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2214         }
2215         sctx.rs = rs;
2216         ierr    = MatPivotCheck(B,A,info,&sctx,row+1);CHKERRQ(ierr);
2217         if (sctx.newshift) goto endofwhile;
2218 
2219         if (*pc3 != 0.0) {
2220           mul3   = (*pc3)/(*pc2);
2221           *pc3   = mul3;
2222           pj     = nbj + bd[prow];
2223           nz_tmp = bi[prow+1] - bd[prow] - 1;
2224           for (j=0; j<nz_tmp; j++) {
2225             idx          = pj[j];
2226             tmp          = rtmp22[idx];
2227             rtmp33[idx] -= mul3 * tmp;
2228           }
2229           ierr = PetscLogFlops(1+2.0*nz_tmp);CHKERRQ(ierr);
2230         }
2231 
2232         pj  = bj + bi[row];
2233         pc1 = ba + bi[row];
2234         pc2 = ba + bi[row+1];
2235         pc3 = ba + bi[row+2];
2236 
2237         sctx.pv       = rtmp33[row+2];
2238         rs            = 0.0;
2239         rtmp11[row]   = 1.0/rtmp11[row];
2240         rtmp22[row+1] = 1.0/rtmp22[row+1];
2241         rtmp33[row+2] = 1.0/rtmp33[row+2];
2242         /* copy row entries from dense representation to sparse */
2243         for (j=0; j<nz; j++) {
2244           idx    = pj[j];
2245           pc1[j] = rtmp11[idx];
2246           pc2[j] = rtmp22[idx];
2247           pc3[j] = rtmp33[idx];
2248           if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
2249         }
2250 
2251         sctx.rs = rs;
2252         ierr    = MatPivotCheck(B,A,info,&sctx,row+2);CHKERRQ(ierr);
2253         if (sctx.newshift) goto endofwhile;
2254         break;
2255 
2256       default:
2257         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n");
2258       }
2259       row += nodesz;                 /* Update the row */
2260     }
2261 endofwhile:;
2262   } while (sctx.newshift);
2263   ierr = PetscFree3(rtmp11,rtmp22,rtmp33);CHKERRQ(ierr);
2264   ierr = PetscFree(tmp_vec2);CHKERRQ(ierr);
2265   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2266   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
2267   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
2268 
2269   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2270   /* do not set solve add, since MatSolve_Inode + Add is faster */
2271   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2272   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2273   C->assembled              = PETSC_TRUE;
2274   C->preallocated           = PETSC_TRUE;
2275   if (sctx.nshift) {
2276     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
2277       ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr);
2278     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
2279       ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr);
2280     }
2281   }
2282   ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr);
2283   ierr = MatSeqAIJCheckInode(C);CHKERRQ(ierr);
2284   PetscFunctionReturn(0);
2285 }
2286 
2287 
2288 /* ----------------------------------------------------------- */
MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)2289 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
2290 {
2291   Mat_SeqAIJ        *a    = (Mat_SeqAIJ*)A->data;
2292   IS                iscol = a->col,isrow = a->row;
2293   PetscErrorCode    ierr;
2294   const PetscInt    *r,*c,*rout,*cout;
2295   PetscInt          i,j,n = A->rmap->n;
2296   PetscInt          node_max,row,nsz,aii,i0,i1,nz;
2297   const PetscInt    *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj;
2298   PetscScalar       *x,*tmp,*tmps,tmp0,tmp1;
2299   PetscScalar       sum1,sum2,sum3,sum4,sum5;
2300   const MatScalar   *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa;
2301   const PetscScalar *b;
2302 
2303   PetscFunctionBegin;
2304   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2305   node_max = a->inode.node_count;
2306   ns       = a->inode.size;     /* Node Size array */
2307 
2308   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2309   ierr = VecGetArrayWrite(xx,&x);CHKERRQ(ierr);
2310   tmp  = a->solve_work;
2311 
2312   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2313   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2314 
2315   /* forward solve the lower triangular */
2316   tmps = tmp;
2317   aa   = a_a;
2318   aj   = a_j;
2319   ad   = a->diag;
2320 
2321   for (i = 0,row = 0; i< node_max; ++i) {
2322     nsz = ns[i];
2323     aii = ai[row];
2324     v1  = aa + aii;
2325     vi  = aj + aii;
2326     nz  = ai[row+1]- ai[row];
2327 
2328     if (i < node_max-1) {
2329       /* Prefetch the indices for the next block */
2330       PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */
2331       /* Prefetch the data for the next block */
2332       PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA);
2333     }
2334 
2335     switch (nsz) {               /* Each loop in 'case' is unrolled */
2336     case 1:
2337       sum1 = b[r[row]];
2338       for (j=0; j<nz-1; j+=2) {
2339         i0    = vi[j];
2340         i1    = vi[j+1];
2341         tmp0  = tmps[i0];
2342         tmp1  = tmps[i1];
2343         sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1;
2344       }
2345       if (j == nz-1) {
2346         tmp0  = tmps[vi[j]];
2347         sum1 -= v1[j]*tmp0;
2348       }
2349       tmp[row++]=sum1;
2350       break;
2351     case 2:
2352       sum1 = b[r[row]];
2353       sum2 = b[r[row+1]];
2354       v2   = aa + ai[row+1];
2355 
2356       for (j=0; j<nz-1; j+=2) {
2357         i0    = vi[j];
2358         i1    = vi[j+1];
2359         tmp0  = tmps[i0];
2360         tmp1  = tmps[i1];
2361         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2362         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2363       }
2364       if (j == nz-1) {
2365         tmp0  = tmps[vi[j]];
2366         sum1 -= v1[j] *tmp0;
2367         sum2 -= v2[j] *tmp0;
2368       }
2369       sum2     -= v2[nz] * sum1;
2370       tmp[row++]=sum1;
2371       tmp[row++]=sum2;
2372       break;
2373     case 3:
2374       sum1 = b[r[row]];
2375       sum2 = b[r[row+1]];
2376       sum3 = b[r[row+2]];
2377       v2   = aa + ai[row+1];
2378       v3   = aa + ai[row+2];
2379 
2380       for (j=0; j<nz-1; j+=2) {
2381         i0    = vi[j];
2382         i1    = vi[j+1];
2383         tmp0  = tmps[i0];
2384         tmp1  = tmps[i1];
2385         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2386         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2387         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2388       }
2389       if (j == nz-1) {
2390         tmp0  = tmps[vi[j]];
2391         sum1 -= v1[j] *tmp0;
2392         sum2 -= v2[j] *tmp0;
2393         sum3 -= v3[j] *tmp0;
2394       }
2395       sum2     -= v2[nz] * sum1;
2396       sum3     -= v3[nz] * sum1;
2397       sum3     -= v3[nz+1] * sum2;
2398       tmp[row++]=sum1;
2399       tmp[row++]=sum2;
2400       tmp[row++]=sum3;
2401       break;
2402 
2403     case 4:
2404       sum1 = b[r[row]];
2405       sum2 = b[r[row+1]];
2406       sum3 = b[r[row+2]];
2407       sum4 = b[r[row+3]];
2408       v2   = aa + ai[row+1];
2409       v3   = aa + ai[row+2];
2410       v4   = aa + ai[row+3];
2411 
2412       for (j=0; j<nz-1; j+=2) {
2413         i0    = vi[j];
2414         i1    = vi[j+1];
2415         tmp0  = tmps[i0];
2416         tmp1  = tmps[i1];
2417         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2418         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2419         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2420         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2421       }
2422       if (j == nz-1) {
2423         tmp0  = tmps[vi[j]];
2424         sum1 -= v1[j] *tmp0;
2425         sum2 -= v2[j] *tmp0;
2426         sum3 -= v3[j] *tmp0;
2427         sum4 -= v4[j] *tmp0;
2428       }
2429       sum2 -= v2[nz] * sum1;
2430       sum3 -= v3[nz] * sum1;
2431       sum4 -= v4[nz] * sum1;
2432       sum3 -= v3[nz+1] * sum2;
2433       sum4 -= v4[nz+1] * sum2;
2434       sum4 -= v4[nz+2] * sum3;
2435 
2436       tmp[row++]=sum1;
2437       tmp[row++]=sum2;
2438       tmp[row++]=sum3;
2439       tmp[row++]=sum4;
2440       break;
2441     case 5:
2442       sum1 = b[r[row]];
2443       sum2 = b[r[row+1]];
2444       sum3 = b[r[row+2]];
2445       sum4 = b[r[row+3]];
2446       sum5 = b[r[row+4]];
2447       v2   = aa + ai[row+1];
2448       v3   = aa + ai[row+2];
2449       v4   = aa + ai[row+3];
2450       v5   = aa + ai[row+4];
2451 
2452       for (j=0; j<nz-1; j+=2) {
2453         i0    = vi[j];
2454         i1    = vi[j+1];
2455         tmp0  = tmps[i0];
2456         tmp1  = tmps[i1];
2457         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2458         sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1;
2459         sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1;
2460         sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1;
2461         sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1;
2462       }
2463       if (j == nz-1) {
2464         tmp0  = tmps[vi[j]];
2465         sum1 -= v1[j] *tmp0;
2466         sum2 -= v2[j] *tmp0;
2467         sum3 -= v3[j] *tmp0;
2468         sum4 -= v4[j] *tmp0;
2469         sum5 -= v5[j] *tmp0;
2470       }
2471 
2472       sum2 -= v2[nz] * sum1;
2473       sum3 -= v3[nz] * sum1;
2474       sum4 -= v4[nz] * sum1;
2475       sum5 -= v5[nz] * sum1;
2476       sum3 -= v3[nz+1] * sum2;
2477       sum4 -= v4[nz+1] * sum2;
2478       sum5 -= v5[nz+1] * sum2;
2479       sum4 -= v4[nz+2] * sum3;
2480       sum5 -= v5[nz+2] * sum3;
2481       sum5 -= v5[nz+3] * sum4;
2482 
2483       tmp[row++]=sum1;
2484       tmp[row++]=sum2;
2485       tmp[row++]=sum3;
2486       tmp[row++]=sum4;
2487       tmp[row++]=sum5;
2488       break;
2489     default:
2490       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2491     }
2492   }
2493   /* backward solve the upper triangular */
2494   for (i=node_max -1,row = n-1; i>=0; i--) {
2495     nsz = ns[i];
2496     aii = ad[row+1] + 1;
2497     v1  = aa + aii;
2498     vi  = aj + aii;
2499     nz  = ad[row]- ad[row+1] - 1;
2500 
2501     if (i > 0) {
2502       /* Prefetch the indices for the next block */
2503       PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2504       /* Prefetch the data for the next block */
2505       PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA);
2506     }
2507 
2508     switch (nsz) {               /* Each loop in 'case' is unrolled */
2509     case 1:
2510       sum1 = tmp[row];
2511 
2512       for (j=0; j<nz-1; j+=2) {
2513         i0    = vi[j];
2514         i1    = vi[j+1];
2515         tmp0  = tmps[i0];
2516         tmp1  = tmps[i1];
2517         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2518       }
2519       if (j == nz-1) {
2520         tmp0  = tmps[vi[j]];
2521         sum1 -= v1[j]*tmp0;
2522       }
2523       x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2524       break;
2525     case 2:
2526       sum1 = tmp[row];
2527       sum2 = tmp[row-1];
2528       v2   = aa + ad[row] + 1;
2529       for (j=0; j<nz-1; j+=2) {
2530         i0    = vi[j];
2531         i1    = vi[j+1];
2532         tmp0  = tmps[i0];
2533         tmp1  = tmps[i1];
2534         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2535         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2536       }
2537       if (j == nz-1) {
2538         tmp0  = tmps[vi[j]];
2539         sum1 -= v1[j]* tmp0;
2540         sum2 -= v2[j+1]* tmp0;
2541       }
2542 
2543       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2544       sum2     -= v2[0] * tmp0;
2545       x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2546       break;
2547     case 3:
2548       sum1 = tmp[row];
2549       sum2 = tmp[row -1];
2550       sum3 = tmp[row -2];
2551       v2   = aa + ad[row] + 1;
2552       v3   = aa + ad[row -1] + 1;
2553       for (j=0; j<nz-1; j+=2) {
2554         i0    = vi[j];
2555         i1    = vi[j+1];
2556         tmp0  = tmps[i0];
2557         tmp1  = tmps[i1];
2558         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2559         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2560         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2561       }
2562       if (j== nz-1) {
2563         tmp0  = tmps[vi[j]];
2564         sum1 -= v1[j] * tmp0;
2565         sum2 -= v2[j+1] * tmp0;
2566         sum3 -= v3[j+2] * tmp0;
2567       }
2568       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2569       sum2     -= v2[0]* tmp0;
2570       sum3     -= v3[1] * tmp0;
2571       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2572       sum3     -= v3[0]* tmp0;
2573       x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2574 
2575       break;
2576     case 4:
2577       sum1 = tmp[row];
2578       sum2 = tmp[row -1];
2579       sum3 = tmp[row -2];
2580       sum4 = tmp[row -3];
2581       v2   = aa + ad[row]+1;
2582       v3   = aa + ad[row -1]+1;
2583       v4   = aa + ad[row -2]+1;
2584 
2585       for (j=0; j<nz-1; j+=2) {
2586         i0    = vi[j];
2587         i1    = vi[j+1];
2588         tmp0  = tmps[i0];
2589         tmp1  = tmps[i1];
2590         sum1 -= v1[j] * tmp0   + v1[j+1] * tmp1;
2591         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2592         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2593         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2594       }
2595       if (j== nz-1) {
2596         tmp0  = tmps[vi[j]];
2597         sum1 -= v1[j] * tmp0;
2598         sum2 -= v2[j+1] * tmp0;
2599         sum3 -= v3[j+2] * tmp0;
2600         sum4 -= v4[j+3] * tmp0;
2601       }
2602 
2603       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2604       sum2     -= v2[0] * tmp0;
2605       sum3     -= v3[1] * tmp0;
2606       sum4     -= v4[2] * tmp0;
2607       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2608       sum3     -= v3[0] * tmp0;
2609       sum4     -= v4[1] * tmp0;
2610       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2611       sum4     -= v4[0] * tmp0;
2612       x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2613       break;
2614     case 5:
2615       sum1 = tmp[row];
2616       sum2 = tmp[row -1];
2617       sum3 = tmp[row -2];
2618       sum4 = tmp[row -3];
2619       sum5 = tmp[row -4];
2620       v2   = aa + ad[row]+1;
2621       v3   = aa + ad[row -1]+1;
2622       v4   = aa + ad[row -2]+1;
2623       v5   = aa + ad[row -3]+1;
2624       for (j=0; j<nz-1; j+=2) {
2625         i0    = vi[j];
2626         i1    = vi[j+1];
2627         tmp0  = tmps[i0];
2628         tmp1  = tmps[i1];
2629         sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1;
2630         sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1;
2631         sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1;
2632         sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1;
2633         sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1;
2634       }
2635       if (j==nz-1) {
2636         tmp0  = tmps[vi[j]];
2637         sum1 -= v1[j] * tmp0;
2638         sum2 -= v2[j+1] * tmp0;
2639         sum3 -= v3[j+2] * tmp0;
2640         sum4 -= v4[j+3] * tmp0;
2641         sum5 -= v5[j+4] * tmp0;
2642       }
2643 
2644       tmp0      = x[c[row]] = tmp[row] = sum1*v1[nz]; row--;
2645       sum2     -= v2[0] * tmp0;
2646       sum3     -= v3[1] * tmp0;
2647       sum4     -= v4[2] * tmp0;
2648       sum5     -= v5[3] * tmp0;
2649       tmp0      = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--;
2650       sum3     -= v3[0] * tmp0;
2651       sum4     -= v4[1] * tmp0;
2652       sum5     -= v5[2] * tmp0;
2653       tmp0      = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--;
2654       sum4     -= v4[0] * tmp0;
2655       sum5     -= v5[1] * tmp0;
2656       tmp0      = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--;
2657       sum5     -= v5[0] * tmp0;
2658       x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--;
2659       break;
2660     default:
2661       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n");
2662     }
2663   }
2664   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2665   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2666   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2667   ierr = VecRestoreArrayWrite(xx,&x);CHKERRQ(ierr);
2668   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
2669   PetscFunctionReturn(0);
2670 }
2671 
2672 
2673 /*
2674      Makes a longer coloring[] array and calls the usual code with that
2675 */
MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring * iscoloring)2676 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
2677 {
2678   Mat_SeqAIJ      *a = (Mat_SeqAIJ*)mat->data;
2679   PetscErrorCode  ierr;
2680   PetscInt        n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row;
2681   PetscInt        *colorused,i;
2682   ISColoringValue *newcolor;
2683 
2684   PetscFunctionBegin;
2685   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2686   ierr = PetscMalloc1(n+1,&newcolor);CHKERRQ(ierr);
2687   /* loop over inodes, marking a color for each column*/
2688   row = 0;
2689   for (i=0; i<m; i++) {
2690     for (j=0; j<ns[i]; j++) {
2691       newcolor[row++] = coloring[i] + j*ncolors;
2692     }
2693   }
2694 
2695   /* eliminate unneeded colors */
2696   ierr = PetscCalloc1(5*ncolors,&colorused);CHKERRQ(ierr);
2697   for (i=0; i<n; i++) {
2698     colorused[newcolor[i]] = 1;
2699   }
2700 
2701   for (i=1; i<5*ncolors; i++) {
2702     colorused[i] += colorused[i-1];
2703   }
2704   ncolors = colorused[5*ncolors-1];
2705   for (i=0; i<n; i++) {
2706     newcolor[i] = colorused[newcolor[i]]-1;
2707   }
2708   ierr = PetscFree(colorused);CHKERRQ(ierr);
2709   ierr = ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,PETSC_OWN_POINTER,iscoloring);CHKERRQ(ierr);
2710   ierr = PetscFree(coloring);CHKERRQ(ierr);
2711   PetscFunctionReturn(0);
2712 }
2713 
2714 #include <petsc/private/kernels/blockinvert.h>
2715 
MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)2716 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
2717 {
2718   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
2719   PetscScalar       sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3;
2720   MatScalar         *ibdiag,*bdiag,work[25],*t;
2721   PetscScalar       *x,tmp4,tmp5,x1,x2,x3,x4,x5;
2722   const MatScalar   *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL;
2723   const PetscScalar *xb, *b;
2724   PetscReal         zeropivot = 100.*PETSC_MACHINE_EPSILON, shift = 0.0;
2725   PetscErrorCode    ierr;
2726   PetscInt          n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2;
2727   PetscInt          sz,k,ipvt[5];
2728   PetscBool         allowzeropivot,zeropivotdetected;
2729   const PetscInt    *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i;
2730 
2731   PetscFunctionBegin;
2732   allowzeropivot = PetscNot(A->erroriffailure);
2733   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
2734   if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode");
2735   if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode");
2736 
2737   if (!a->inode.ibdiagvalid) {
2738     if (!a->inode.ibdiag) {
2739       /* calculate space needed for diagonal blocks */
2740       for (i=0; i<m; i++) {
2741         cnt += sizes[i]*sizes[i];
2742       }
2743       a->inode.bdiagsize = cnt;
2744 
2745       ierr = PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work);CHKERRQ(ierr);
2746     }
2747 
2748     /* copy over the diagonal blocks and invert them */
2749     ibdiag = a->inode.ibdiag;
2750     bdiag  = a->inode.bdiag;
2751     cnt    = 0;
2752     for (i=0, row = 0; i<m; i++) {
2753       for (j=0; j<sizes[i]; j++) {
2754         for (k=0; k<sizes[i]; k++) {
2755           bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k];
2756         }
2757       }
2758       ierr = PetscArraycpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]);CHKERRQ(ierr);
2759 
2760       switch (sizes[i]) {
2761       case 1:
2762         /* Create matrix data structure */
2763         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2764           if (allowzeropivot) {
2765             A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2766             A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
2767             A->factorerror_zeropivot_row   = row;
2768             ierr = PetscInfo1(A,"Zero pivot, row %D\n",row);CHKERRQ(ierr);
2769           } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row);
2770         }
2771         ibdiag[cnt] = 1.0/ibdiag[cnt];
2772         break;
2773       case 2:
2774         ierr = PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
2775         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2776         break;
2777       case 3:
2778         ierr = PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
2779         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2780         break;
2781       case 4:
2782         ierr = PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
2783         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2784         break;
2785       case 5:
2786         ierr = PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
2787         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2788         break;
2789       default:
2790         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2791       }
2792       cnt += sizes[i]*sizes[i];
2793       row += sizes[i];
2794     }
2795     a->inode.ibdiagvalid = PETSC_TRUE;
2796   }
2797   ibdiag = a->inode.ibdiag;
2798   bdiag  = a->inode.bdiag;
2799   t      = a->inode.ssor_work;
2800 
2801   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2802   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2803   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
2804   if (flag & SOR_ZERO_INITIAL_GUESS) {
2805     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2806 
2807       for (i=0, row=0; i<m; i++) {
2808         sz  = diag[row] - ii[row];
2809         v1  = a->a + ii[row];
2810         idx = a->j + ii[row];
2811 
2812         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2813         switch (sizes[i]) {
2814         case 1:
2815 
2816           sum1 = b[row];
2817           for (n = 0; n<sz-1; n+=2) {
2818             i1    = idx[0];
2819             i2    = idx[1];
2820             idx  += 2;
2821             tmp0  = x[i1];
2822             tmp1  = x[i2];
2823             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2824           }
2825 
2826           if (n == sz-1) {
2827             tmp0  = x[*idx];
2828             sum1 -= *v1 * tmp0;
2829           }
2830           t[row]   = sum1;
2831           x[row++] = sum1*(*ibdiag++);
2832           break;
2833         case 2:
2834           v2   = a->a + ii[row+1];
2835           sum1 = b[row];
2836           sum2 = b[row+1];
2837           for (n = 0; n<sz-1; n+=2) {
2838             i1    = idx[0];
2839             i2    = idx[1];
2840             idx  += 2;
2841             tmp0  = x[i1];
2842             tmp1  = x[i2];
2843             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2844             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2845           }
2846 
2847           if (n == sz-1) {
2848             tmp0  = x[*idx];
2849             sum1 -= v1[0] * tmp0;
2850             sum2 -= v2[0] * tmp0;
2851           }
2852           t[row]   = sum1;
2853           t[row+1] = sum2;
2854           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2];
2855           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3];
2856           ibdiag  += 4;
2857           break;
2858         case 3:
2859           v2   = a->a + ii[row+1];
2860           v3   = a->a + ii[row+2];
2861           sum1 = b[row];
2862           sum2 = b[row+1];
2863           sum3 = b[row+2];
2864           for (n = 0; n<sz-1; n+=2) {
2865             i1    = idx[0];
2866             i2    = idx[1];
2867             idx  += 2;
2868             tmp0  = x[i1];
2869             tmp1  = x[i2];
2870             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2871             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2872             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2873           }
2874 
2875           if (n == sz-1) {
2876             tmp0  = x[*idx];
2877             sum1 -= v1[0] * tmp0;
2878             sum2 -= v2[0] * tmp0;
2879             sum3 -= v3[0] * tmp0;
2880           }
2881           t[row]   = sum1;
2882           t[row+1] = sum2;
2883           t[row+2] = sum3;
2884           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
2885           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
2886           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
2887           ibdiag  += 9;
2888           break;
2889         case 4:
2890           v2   = a->a + ii[row+1];
2891           v3   = a->a + ii[row+2];
2892           v4   = a->a + ii[row+3];
2893           sum1 = b[row];
2894           sum2 = b[row+1];
2895           sum3 = b[row+2];
2896           sum4 = b[row+3];
2897           for (n = 0; n<sz-1; n+=2) {
2898             i1    = idx[0];
2899             i2    = idx[1];
2900             idx  += 2;
2901             tmp0  = x[i1];
2902             tmp1  = x[i2];
2903             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2904             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2905             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2906             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2907           }
2908 
2909           if (n == sz-1) {
2910             tmp0  = x[*idx];
2911             sum1 -= v1[0] * tmp0;
2912             sum2 -= v2[0] * tmp0;
2913             sum3 -= v3[0] * tmp0;
2914             sum4 -= v4[0] * tmp0;
2915           }
2916           t[row]   = sum1;
2917           t[row+1] = sum2;
2918           t[row+2] = sum3;
2919           t[row+3] = sum4;
2920           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
2921           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
2922           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
2923           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
2924           ibdiag  += 16;
2925           break;
2926         case 5:
2927           v2   = a->a + ii[row+1];
2928           v3   = a->a + ii[row+2];
2929           v4   = a->a + ii[row+3];
2930           v5   = a->a + ii[row+4];
2931           sum1 = b[row];
2932           sum2 = b[row+1];
2933           sum3 = b[row+2];
2934           sum4 = b[row+3];
2935           sum5 = b[row+4];
2936           for (n = 0; n<sz-1; n+=2) {
2937             i1    = idx[0];
2938             i2    = idx[1];
2939             idx  += 2;
2940             tmp0  = x[i1];
2941             tmp1  = x[i2];
2942             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2943             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
2944             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
2945             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
2946             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
2947           }
2948 
2949           if (n == sz-1) {
2950             tmp0  = x[*idx];
2951             sum1 -= v1[0] * tmp0;
2952             sum2 -= v2[0] * tmp0;
2953             sum3 -= v3[0] * tmp0;
2954             sum4 -= v4[0] * tmp0;
2955             sum5 -= v5[0] * tmp0;
2956           }
2957           t[row]   = sum1;
2958           t[row+1] = sum2;
2959           t[row+2] = sum3;
2960           t[row+3] = sum4;
2961           t[row+4] = sum5;
2962           x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
2963           x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
2964           x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
2965           x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
2966           x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
2967           ibdiag  += 25;
2968           break;
2969         default:
2970           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
2971         }
2972       }
2973 
2974       xb   = t;
2975       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2976     } else xb = b;
2977     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
2978 
2979       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
2980       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
2981         ibdiag -= sizes[i]*sizes[i];
2982         sz      = ii[row+1] - diag[row] - 1;
2983         v1      = a->a + diag[row] + 1;
2984         idx     = a->j + diag[row] + 1;
2985 
2986         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2987         switch (sizes[i]) {
2988         case 1:
2989 
2990           sum1 = xb[row];
2991           for (n = 0; n<sz-1; n+=2) {
2992             i1    = idx[0];
2993             i2    = idx[1];
2994             idx  += 2;
2995             tmp0  = x[i1];
2996             tmp1  = x[i2];
2997             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
2998           }
2999 
3000           if (n == sz-1) {
3001             tmp0  = x[*idx];
3002             sum1 -= *v1*tmp0;
3003           }
3004           x[row--] = sum1*(*ibdiag);
3005           break;
3006 
3007         case 2:
3008 
3009           sum1 = xb[row];
3010           sum2 = xb[row-1];
3011           /* note that sum1 is associated with the second of the two rows */
3012           v2 = a->a + diag[row-1] + 2;
3013           for (n = 0; n<sz-1; n+=2) {
3014             i1    = idx[0];
3015             i2    = idx[1];
3016             idx  += 2;
3017             tmp0  = x[i1];
3018             tmp1  = x[i2];
3019             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3020             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3021           }
3022 
3023           if (n == sz-1) {
3024             tmp0  = x[*idx];
3025             sum1 -= *v1*tmp0;
3026             sum2 -= *v2*tmp0;
3027           }
3028           x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3029           x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3030           break;
3031         case 3:
3032 
3033           sum1 = xb[row];
3034           sum2 = xb[row-1];
3035           sum3 = xb[row-2];
3036           v2   = a->a + diag[row-1] + 2;
3037           v3   = a->a + diag[row-2] + 3;
3038           for (n = 0; n<sz-1; n+=2) {
3039             i1    = idx[0];
3040             i2    = idx[1];
3041             idx  += 2;
3042             tmp0  = x[i1];
3043             tmp1  = x[i2];
3044             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3045             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3046             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3047           }
3048 
3049           if (n == sz-1) {
3050             tmp0  = x[*idx];
3051             sum1 -= *v1*tmp0;
3052             sum2 -= *v2*tmp0;
3053             sum3 -= *v3*tmp0;
3054           }
3055           x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3056           x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3057           x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3058           break;
3059         case 4:
3060 
3061           sum1 = xb[row];
3062           sum2 = xb[row-1];
3063           sum3 = xb[row-2];
3064           sum4 = xb[row-3];
3065           v2   = a->a + diag[row-1] + 2;
3066           v3   = a->a + diag[row-2] + 3;
3067           v4   = a->a + diag[row-3] + 4;
3068           for (n = 0; n<sz-1; n+=2) {
3069             i1    = idx[0];
3070             i2    = idx[1];
3071             idx  += 2;
3072             tmp0  = x[i1];
3073             tmp1  = x[i2];
3074             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3075             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3076             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3077             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3078           }
3079 
3080           if (n == sz-1) {
3081             tmp0  = x[*idx];
3082             sum1 -= *v1*tmp0;
3083             sum2 -= *v2*tmp0;
3084             sum3 -= *v3*tmp0;
3085             sum4 -= *v4*tmp0;
3086           }
3087           x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3088           x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3089           x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3090           x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3091           break;
3092         case 5:
3093 
3094           sum1 = xb[row];
3095           sum2 = xb[row-1];
3096           sum3 = xb[row-2];
3097           sum4 = xb[row-3];
3098           sum5 = xb[row-4];
3099           v2   = a->a + diag[row-1] + 2;
3100           v3   = a->a + diag[row-2] + 3;
3101           v4   = a->a + diag[row-3] + 4;
3102           v5   = a->a + diag[row-4] + 5;
3103           for (n = 0; n<sz-1; n+=2) {
3104             i1    = idx[0];
3105             i2    = idx[1];
3106             idx  += 2;
3107             tmp0  = x[i1];
3108             tmp1  = x[i2];
3109             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3110             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3111             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3112             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3113             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3114           }
3115 
3116           if (n == sz-1) {
3117             tmp0  = x[*idx];
3118             sum1 -= *v1*tmp0;
3119             sum2 -= *v2*tmp0;
3120             sum3 -= *v3*tmp0;
3121             sum4 -= *v4*tmp0;
3122             sum5 -= *v5*tmp0;
3123           }
3124           x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3125           x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3126           x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3127           x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3128           x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3129           break;
3130         default:
3131           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3132         }
3133       }
3134 
3135       ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3136     }
3137     its--;
3138   }
3139   while (its--) {
3140 
3141     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3142       for (i=0, row=0, ibdiag = a->inode.ibdiag;
3143            i<m;
3144            row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) {
3145 
3146         sz  = diag[row] - ii[row];
3147         v1  = a->a + ii[row];
3148         idx = a->j + ii[row];
3149         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3150         switch (sizes[i]) {
3151         case 1:
3152           sum1 = b[row];
3153           for (n = 0; n<sz-1; n+=2) {
3154             i1    = idx[0];
3155             i2    = idx[1];
3156             idx  += 2;
3157             tmp0  = x[i1];
3158             tmp1  = x[i2];
3159             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3160           }
3161           if (n == sz-1) {
3162             tmp0  = x[*idx++];
3163             sum1 -= *v1 * tmp0;
3164             v1++;
3165           }
3166           t[row]   = sum1;
3167           sz      = ii[row+1] - diag[row] - 1;
3168           idx     = a->j + diag[row] + 1;
3169           v1 += 1;
3170           for (n = 0; n<sz-1; n+=2) {
3171             i1    = idx[0];
3172             i2    = idx[1];
3173             idx  += 2;
3174             tmp0  = x[i1];
3175             tmp1  = x[i2];
3176             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3177           }
3178           if (n == sz-1) {
3179             tmp0  = x[*idx++];
3180             sum1 -= *v1 * tmp0;
3181           }
3182           /* in MatSOR_SeqAIJ this line would be
3183            *
3184            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
3185            *
3186            * but omega == 1, so this becomes
3187            *
3188            * x[row] = sum1*(*ibdiag++);
3189            *
3190            */
3191           x[row] = sum1*(*ibdiag);
3192           break;
3193         case 2:
3194           v2   = a->a + ii[row+1];
3195           sum1 = b[row];
3196           sum2 = b[row+1];
3197           for (n = 0; n<sz-1; n+=2) {
3198             i1    = idx[0];
3199             i2    = idx[1];
3200             idx  += 2;
3201             tmp0  = x[i1];
3202             tmp1  = x[i2];
3203             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3204             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3205           }
3206           if (n == sz-1) {
3207             tmp0  = x[*idx++];
3208             sum1 -= v1[0] * tmp0;
3209             sum2 -= v2[0] * tmp0;
3210             v1++; v2++;
3211           }
3212           t[row]   = sum1;
3213           t[row+1] = sum2;
3214           sz      = ii[row+1] - diag[row] - 2;
3215           idx     = a->j + diag[row] + 2;
3216           v1 += 2;
3217           v2 += 2;
3218           for (n = 0; n<sz-1; n+=2) {
3219             i1    = idx[0];
3220             i2    = idx[1];
3221             idx  += 2;
3222             tmp0  = x[i1];
3223             tmp1  = x[i2];
3224             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3225             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3226           }
3227           if (n == sz-1) {
3228             tmp0  = x[*idx];
3229             sum1 -= v1[0] * tmp0;
3230             sum2 -= v2[0] * tmp0;
3231           }
3232           x[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3233           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3234           break;
3235         case 3:
3236           v2   = a->a + ii[row+1];
3237           v3   = a->a + ii[row+2];
3238           sum1 = b[row];
3239           sum2 = b[row+1];
3240           sum3 = b[row+2];
3241           for (n = 0; n<sz-1; n+=2) {
3242             i1    = idx[0];
3243             i2    = idx[1];
3244             idx  += 2;
3245             tmp0  = x[i1];
3246             tmp1  = x[i2];
3247             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3248             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3249             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3250           }
3251           if (n == sz-1) {
3252             tmp0  = x[*idx++];
3253             sum1 -= v1[0] * tmp0;
3254             sum2 -= v2[0] * tmp0;
3255             sum3 -= v3[0] * tmp0;
3256             v1++; v2++; v3++;
3257           }
3258           t[row]   = sum1;
3259           t[row+1] = sum2;
3260           t[row+2] = sum3;
3261           sz      = ii[row+1] - diag[row] - 3;
3262           idx     = a->j + diag[row] + 3;
3263           v1 += 3;
3264           v2 += 3;
3265           v3 += 3;
3266           for (n = 0; n<sz-1; n+=2) {
3267             i1    = idx[0];
3268             i2    = idx[1];
3269             idx  += 2;
3270             tmp0  = x[i1];
3271             tmp1  = x[i2];
3272             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3273             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3274             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3275           }
3276           if (n == sz-1) {
3277             tmp0  = x[*idx];
3278             sum1 -= v1[0] * tmp0;
3279             sum2 -= v2[0] * tmp0;
3280             sum3 -= v3[0] * tmp0;
3281           }
3282           x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3283           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3284           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3285           break;
3286         case 4:
3287           v2   = a->a + ii[row+1];
3288           v3   = a->a + ii[row+2];
3289           v4   = a->a + ii[row+3];
3290           sum1 = b[row];
3291           sum2 = b[row+1];
3292           sum3 = b[row+2];
3293           sum4 = b[row+3];
3294           for (n = 0; n<sz-1; n+=2) {
3295             i1    = idx[0];
3296             i2    = idx[1];
3297             idx  += 2;
3298             tmp0  = x[i1];
3299             tmp1  = x[i2];
3300             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3301             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3302             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3303             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3304           }
3305           if (n == sz-1) {
3306             tmp0  = x[*idx++];
3307             sum1 -= v1[0] * tmp0;
3308             sum2 -= v2[0] * tmp0;
3309             sum3 -= v3[0] * tmp0;
3310             sum4 -= v4[0] * tmp0;
3311             v1++; v2++; v3++; v4++;
3312           }
3313           t[row]   = sum1;
3314           t[row+1] = sum2;
3315           t[row+2] = sum3;
3316           t[row+3] = sum4;
3317           sz      = ii[row+1] - diag[row] - 4;
3318           idx     = a->j + diag[row] + 4;
3319           v1 += 4;
3320           v2 += 4;
3321           v3 += 4;
3322           v4 += 4;
3323           for (n = 0; n<sz-1; n+=2) {
3324             i1    = idx[0];
3325             i2    = idx[1];
3326             idx  += 2;
3327             tmp0  = x[i1];
3328             tmp1  = x[i2];
3329             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3330             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3331             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3332             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3333           }
3334           if (n == sz-1) {
3335             tmp0  = x[*idx];
3336             sum1 -= v1[0] * tmp0;
3337             sum2 -= v2[0] * tmp0;
3338             sum3 -= v3[0] * tmp0;
3339             sum4 -= v4[0] * tmp0;
3340           }
3341           x[row] =   sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3342           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3343           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3344           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3345           break;
3346         case 5:
3347           v2   = a->a + ii[row+1];
3348           v3   = a->a + ii[row+2];
3349           v4   = a->a + ii[row+3];
3350           v5   = a->a + ii[row+4];
3351           sum1 = b[row];
3352           sum2 = b[row+1];
3353           sum3 = b[row+2];
3354           sum4 = b[row+3];
3355           sum5 = b[row+4];
3356           for (n = 0; n<sz-1; n+=2) {
3357             i1    = idx[0];
3358             i2    = idx[1];
3359             idx  += 2;
3360             tmp0  = x[i1];
3361             tmp1  = x[i2];
3362             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3363             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3364             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3365             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3366             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3367           }
3368           if (n == sz-1) {
3369             tmp0  = x[*idx++];
3370             sum1 -= v1[0] * tmp0;
3371             sum2 -= v2[0] * tmp0;
3372             sum3 -= v3[0] * tmp0;
3373             sum4 -= v4[0] * tmp0;
3374             sum5 -= v5[0] * tmp0;
3375             v1++; v2++; v3++; v4++; v5++;
3376           }
3377           t[row]   = sum1;
3378           t[row+1] = sum2;
3379           t[row+2] = sum3;
3380           t[row+3] = sum4;
3381           t[row+4] = sum5;
3382           sz      = ii[row+1] - diag[row] - 5;
3383           idx     = a->j + diag[row] + 5;
3384           v1 += 5;
3385           v2 += 5;
3386           v3 += 5;
3387           v4 += 5;
3388           v5 += 5;
3389           for (n = 0; n<sz-1; n+=2) {
3390             i1    = idx[0];
3391             i2    = idx[1];
3392             idx  += 2;
3393             tmp0  = x[i1];
3394             tmp1  = x[i2];
3395             sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3396             sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3397             sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3398             sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3399             sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3400           }
3401           if (n == sz-1) {
3402             tmp0  = x[*idx];
3403             sum1 -= v1[0] * tmp0;
3404             sum2 -= v2[0] * tmp0;
3405             sum3 -= v3[0] * tmp0;
3406             sum4 -= v4[0] * tmp0;
3407             sum5 -= v5[0] * tmp0;
3408           }
3409           x[row]   = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3410           x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3411           x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3412           x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3413           x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3414           break;
3415         default:
3416           SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3417         }
3418       }
3419       xb   = t;
3420       ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);  /* undercounts diag inverse */
3421     } else xb = b;
3422 
3423     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3424 
3425       ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3426       for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3427         ibdiag -= sizes[i]*sizes[i];
3428 
3429         /* set RHS */
3430         if (xb == b) {
3431           /* whole (old way) */
3432           sz      = ii[row+1] - ii[row];
3433           idx     = a->j + ii[row];
3434           switch (sizes[i]) {
3435           case 5:
3436             v5      = a->a + ii[row-4];
3437           case 4: /* fall through */
3438             v4      = a->a + ii[row-3];
3439           case 3:
3440             v3      = a->a + ii[row-2];
3441           case 2:
3442             v2      = a->a + ii[row-1];
3443           case 1:
3444             v1      = a->a + ii[row];
3445             break;
3446           default:
3447             SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3448           }
3449         } else {
3450           /* upper, no diag */
3451           sz      = ii[row+1] - diag[row] - 1;
3452           idx     = a->j + diag[row] + 1;
3453           switch (sizes[i]) {
3454           case 5:
3455             v5      = a->a + diag[row-4] + 5;
3456           case 4: /* fall through */
3457             v4      = a->a + diag[row-3] + 4;
3458           case 3:
3459             v3      = a->a + diag[row-2] + 3;
3460           case 2:
3461             v2      = a->a + diag[row-1] + 2;
3462           case 1:
3463             v1      = a->a + diag[row] + 1;
3464           }
3465         }
3466         /* set sum */
3467         switch (sizes[i]) {
3468         case 5:
3469           sum5 = xb[row-4];
3470         case 4: /* fall through */
3471           sum4 = xb[row-3];
3472         case 3:
3473           sum3 = xb[row-2];
3474         case 2:
3475           sum2 = xb[row-1];
3476         case 1:
3477           /* note that sum1 is associated with the last row */
3478           sum1 = xb[row];
3479         }
3480         /* do sums */
3481         for (n = 0; n<sz-1; n+=2) {
3482             i1    = idx[0];
3483             i2    = idx[1];
3484             idx  += 2;
3485             tmp0  = x[i1];
3486             tmp1  = x[i2];
3487             switch (sizes[i]) {
3488             case 5:
3489               sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3490             case 4: /* fall through */
3491               sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3492             case 3:
3493               sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3494             case 2:
3495               sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3496             case 1:
3497               sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3498             }
3499         }
3500         /* ragged edge */
3501         if (n == sz-1) {
3502           tmp0  = x[*idx];
3503           switch (sizes[i]) {
3504           case 5:
3505             sum5 -= *v5*tmp0;
3506           case 4: /* fall through */
3507             sum4 -= *v4*tmp0;
3508           case 3:
3509             sum3 -= *v3*tmp0;
3510           case 2:
3511             sum2 -= *v2*tmp0;
3512           case 1:
3513             sum1 -= *v1*tmp0;
3514           }
3515         }
3516         /* update */
3517         if (xb == b) {
3518           /* whole (old way) w/ diag */
3519           switch (sizes[i]) {
3520           case 5:
3521             x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3522             x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3523             x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3524             x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3525             x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3526             break;
3527           case 4:
3528             x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3529             x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3530             x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3531             x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3532             break;
3533           case 3:
3534             x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3535             x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3536             x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3537             break;
3538           case 2:
3539             x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3];
3540             x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2];
3541             break;
3542           case 1:
3543             x[row--] += sum1*(*ibdiag);
3544             break;
3545           }
3546         } else {
3547           /* no diag so set =  */
3548           switch (sizes[i]) {
3549           case 5:
3550             x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3551             x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3552             x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3553             x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3554             x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3555             break;
3556           case 4:
3557             x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3558             x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3559             x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3560             x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3561             break;
3562           case 3:
3563             x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3564             x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3565             x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3566             break;
3567           case 2:
3568             x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3];
3569             x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2];
3570             break;
3571           case 1:
3572             x[row--] = sum1*(*ibdiag);
3573             break;
3574           }
3575         }
3576       }
3577       if (xb == b) {
3578         ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
3579       } else {
3580         ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper, undercounts diag inverse */
3581       }
3582     }
3583   }
3584   if (flag & SOR_EISENSTAT) {
3585     /*
3586           Apply  (U + D)^-1  where D is now the block diagonal
3587     */
3588     ibdiag = a->inode.ibdiag+a->inode.bdiagsize;
3589     for (i=m-1, row=A->rmap->n-1; i>=0; i--) {
3590       ibdiag -= sizes[i]*sizes[i];
3591       sz      = ii[row+1] - diag[row] - 1;
3592       v1      = a->a + diag[row] + 1;
3593       idx     = a->j + diag[row] + 1;
3594       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3595       switch (sizes[i]) {
3596       case 1:
3597 
3598         sum1 = b[row];
3599         for (n = 0; n<sz-1; n+=2) {
3600           i1    = idx[0];
3601           i2    = idx[1];
3602           idx  += 2;
3603           tmp0  = x[i1];
3604           tmp1  = x[i2];
3605           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3606         }
3607 
3608         if (n == sz-1) {
3609           tmp0  = x[*idx];
3610           sum1 -= *v1*tmp0;
3611         }
3612         x[row] = sum1*(*ibdiag);row--;
3613         break;
3614 
3615       case 2:
3616 
3617         sum1 = b[row];
3618         sum2 = b[row-1];
3619         /* note that sum1 is associated with the second of the two rows */
3620         v2 = a->a + diag[row-1] + 2;
3621         for (n = 0; n<sz-1; n+=2) {
3622           i1    = idx[0];
3623           i2    = idx[1];
3624           idx  += 2;
3625           tmp0  = x[i1];
3626           tmp1  = x[i2];
3627           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3628           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3629         }
3630 
3631         if (n == sz-1) {
3632           tmp0  = x[*idx];
3633           sum1 -= *v1*tmp0;
3634           sum2 -= *v2*tmp0;
3635         }
3636         x[row]   = sum2*ibdiag[1] + sum1*ibdiag[3];
3637         x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2];
3638         row     -= 2;
3639         break;
3640       case 3:
3641 
3642         sum1 = b[row];
3643         sum2 = b[row-1];
3644         sum3 = b[row-2];
3645         v2   = a->a + diag[row-1] + 2;
3646         v3   = a->a + diag[row-2] + 3;
3647         for (n = 0; n<sz-1; n+=2) {
3648           i1    = idx[0];
3649           i2    = idx[1];
3650           idx  += 2;
3651           tmp0  = x[i1];
3652           tmp1  = x[i2];
3653           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3654           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3655           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3656         }
3657 
3658         if (n == sz-1) {
3659           tmp0  = x[*idx];
3660           sum1 -= *v1*tmp0;
3661           sum2 -= *v2*tmp0;
3662           sum3 -= *v3*tmp0;
3663         }
3664         x[row]   = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8];
3665         x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7];
3666         x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6];
3667         row     -= 3;
3668         break;
3669       case 4:
3670 
3671         sum1 = b[row];
3672         sum2 = b[row-1];
3673         sum3 = b[row-2];
3674         sum4 = b[row-3];
3675         v2   = a->a + diag[row-1] + 2;
3676         v3   = a->a + diag[row-2] + 3;
3677         v4   = a->a + diag[row-3] + 4;
3678         for (n = 0; n<sz-1; n+=2) {
3679           i1    = idx[0];
3680           i2    = idx[1];
3681           idx  += 2;
3682           tmp0  = x[i1];
3683           tmp1  = x[i2];
3684           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3685           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3686           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3687           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3688         }
3689 
3690         if (n == sz-1) {
3691           tmp0  = x[*idx];
3692           sum1 -= *v1*tmp0;
3693           sum2 -= *v2*tmp0;
3694           sum3 -= *v3*tmp0;
3695           sum4 -= *v4*tmp0;
3696         }
3697         x[row]   = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15];
3698         x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14];
3699         x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13];
3700         x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12];
3701         row     -= 4;
3702         break;
3703       case 5:
3704 
3705         sum1 = b[row];
3706         sum2 = b[row-1];
3707         sum3 = b[row-2];
3708         sum4 = b[row-3];
3709         sum5 = b[row-4];
3710         v2   = a->a + diag[row-1] + 2;
3711         v3   = a->a + diag[row-2] + 3;
3712         v4   = a->a + diag[row-3] + 4;
3713         v5   = a->a + diag[row-4] + 5;
3714         for (n = 0; n<sz-1; n+=2) {
3715           i1    = idx[0];
3716           i2    = idx[1];
3717           idx  += 2;
3718           tmp0  = x[i1];
3719           tmp1  = x[i2];
3720           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3721           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3722           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3723           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3724           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3725         }
3726 
3727         if (n == sz-1) {
3728           tmp0  = x[*idx];
3729           sum1 -= *v1*tmp0;
3730           sum2 -= *v2*tmp0;
3731           sum3 -= *v3*tmp0;
3732           sum4 -= *v4*tmp0;
3733           sum5 -= *v5*tmp0;
3734         }
3735         x[row]   = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24];
3736         x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23];
3737         x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22];
3738         x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21];
3739         x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20];
3740         row     -= 5;
3741         break;
3742       default:
3743         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3744       }
3745     }
3746     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3747 
3748     /*
3749            t = b - D x    where D is the block diagonal
3750     */
3751     cnt = 0;
3752     for (i=0, row=0; i<m; i++) {
3753       switch (sizes[i]) {
3754       case 1:
3755         t[row] = b[row] - bdiag[cnt++]*x[row]; row++;
3756         break;
3757       case 2:
3758         x1       = x[row]; x2 = x[row+1];
3759         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3760         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3761         t[row]   = b[row] - tmp1;
3762         t[row+1] = b[row+1] - tmp2; row += 2;
3763         cnt     += 4;
3764         break;
3765       case 3:
3766         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2];
3767         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
3768         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
3769         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
3770         t[row]   = b[row] - tmp1;
3771         t[row+1] = b[row+1] - tmp2;
3772         t[row+2] = b[row+2] - tmp3; row += 3;
3773         cnt     += 9;
3774         break;
3775       case 4:
3776         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3];
3777         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
3778         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
3779         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
3780         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
3781         t[row]   = b[row] - tmp1;
3782         t[row+1] = b[row+1] - tmp2;
3783         t[row+2] = b[row+2] - tmp3;
3784         t[row+3] = b[row+3] - tmp4; row += 4;
3785         cnt     += 16;
3786         break;
3787       case 5:
3788         x1       = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4];
3789         tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
3790         tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
3791         tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
3792         tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
3793         tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
3794         t[row]   = b[row] - tmp1;
3795         t[row+1] = b[row+1] - tmp2;
3796         t[row+2] = b[row+2] - tmp3;
3797         t[row+3] = b[row+3] - tmp4;
3798         t[row+4] = b[row+4] - tmp5;row += 5;
3799         cnt     += 25;
3800         break;
3801       default:
3802         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3803       }
3804     }
3805     ierr = PetscLogFlops(m);CHKERRQ(ierr);
3806 
3807 
3808 
3809     /*
3810           Apply (L + D)^-1 where D is the block diagonal
3811     */
3812     for (i=0, row=0; i<m; i++) {
3813       sz  = diag[row] - ii[row];
3814       v1  = a->a + ii[row];
3815       idx = a->j + ii[row];
3816       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3817       switch (sizes[i]) {
3818       case 1:
3819 
3820         sum1 = t[row];
3821         for (n = 0; n<sz-1; n+=2) {
3822           i1    = idx[0];
3823           i2    = idx[1];
3824           idx  += 2;
3825           tmp0  = t[i1];
3826           tmp1  = t[i2];
3827           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3828         }
3829 
3830         if (n == sz-1) {
3831           tmp0  = t[*idx];
3832           sum1 -= *v1 * tmp0;
3833         }
3834         x[row] += t[row] = sum1*(*ibdiag++); row++;
3835         break;
3836       case 2:
3837         v2   = a->a + ii[row+1];
3838         sum1 = t[row];
3839         sum2 = t[row+1];
3840         for (n = 0; n<sz-1; n+=2) {
3841           i1    = idx[0];
3842           i2    = idx[1];
3843           idx  += 2;
3844           tmp0  = t[i1];
3845           tmp1  = t[i2];
3846           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3847           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3848         }
3849 
3850         if (n == sz-1) {
3851           tmp0  = t[*idx];
3852           sum1 -= v1[0] * tmp0;
3853           sum2 -= v2[0] * tmp0;
3854         }
3855         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2];
3856         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3];
3857         ibdiag   += 4; row += 2;
3858         break;
3859       case 3:
3860         v2   = a->a + ii[row+1];
3861         v3   = a->a + ii[row+2];
3862         sum1 = t[row];
3863         sum2 = t[row+1];
3864         sum3 = t[row+2];
3865         for (n = 0; n<sz-1; n+=2) {
3866           i1    = idx[0];
3867           i2    = idx[1];
3868           idx  += 2;
3869           tmp0  = t[i1];
3870           tmp1  = t[i2];
3871           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3872           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3873           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3874         }
3875 
3876         if (n == sz-1) {
3877           tmp0  = t[*idx];
3878           sum1 -= v1[0] * tmp0;
3879           sum2 -= v2[0] * tmp0;
3880           sum3 -= v3[0] * tmp0;
3881         }
3882         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6];
3883         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7];
3884         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8];
3885         ibdiag   += 9; row += 3;
3886         break;
3887       case 4:
3888         v2   = a->a + ii[row+1];
3889         v3   = a->a + ii[row+2];
3890         v4   = a->a + ii[row+3];
3891         sum1 = t[row];
3892         sum2 = t[row+1];
3893         sum3 = t[row+2];
3894         sum4 = t[row+3];
3895         for (n = 0; n<sz-1; n+=2) {
3896           i1    = idx[0];
3897           i2    = idx[1];
3898           idx  += 2;
3899           tmp0  = t[i1];
3900           tmp1  = t[i2];
3901           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3902           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3903           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3904           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3905         }
3906 
3907         if (n == sz-1) {
3908           tmp0  = t[*idx];
3909           sum1 -= v1[0] * tmp0;
3910           sum2 -= v2[0] * tmp0;
3911           sum3 -= v3[0] * tmp0;
3912           sum4 -= v4[0] * tmp0;
3913         }
3914         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12];
3915         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13];
3916         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14];
3917         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15];
3918         ibdiag   += 16; row += 4;
3919         break;
3920       case 5:
3921         v2   = a->a + ii[row+1];
3922         v3   = a->a + ii[row+2];
3923         v4   = a->a + ii[row+3];
3924         v5   = a->a + ii[row+4];
3925         sum1 = t[row];
3926         sum2 = t[row+1];
3927         sum3 = t[row+2];
3928         sum4 = t[row+3];
3929         sum5 = t[row+4];
3930         for (n = 0; n<sz-1; n+=2) {
3931           i1    = idx[0];
3932           i2    = idx[1];
3933           idx  += 2;
3934           tmp0  = t[i1];
3935           tmp1  = t[i2];
3936           sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
3937           sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
3938           sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
3939           sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
3940           sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
3941         }
3942 
3943         if (n == sz-1) {
3944           tmp0  = t[*idx];
3945           sum1 -= v1[0] * tmp0;
3946           sum2 -= v2[0] * tmp0;
3947           sum3 -= v3[0] * tmp0;
3948           sum4 -= v4[0] * tmp0;
3949           sum5 -= v5[0] * tmp0;
3950         }
3951         x[row]   += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20];
3952         x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21];
3953         x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22];
3954         x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23];
3955         x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24];
3956         ibdiag   += 25; row += 5;
3957         break;
3958       default:
3959         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
3960       }
3961     }
3962     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
3963   }
3964   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3965   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
3966   PetscFunctionReturn(0);
3967 }
3968 
MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)3969 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)
3970 {
3971   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
3972   PetscScalar       *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5;
3973   const MatScalar   *bdiag = a->inode.bdiag;
3974   const PetscScalar *b;
3975   PetscErrorCode    ierr;
3976   PetscInt          m      = a->inode.node_count,cnt = 0,i,row;
3977   const PetscInt    *sizes = a->inode.size;
3978 
3979   PetscFunctionBegin;
3980   if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure");
3981   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3982   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
3983   cnt  = 0;
3984   for (i=0, row=0; i<m; i++) {
3985     switch (sizes[i]) {
3986     case 1:
3987       x[row] = b[row]*bdiag[cnt++];row++;
3988       break;
3989     case 2:
3990       x1       = b[row]; x2 = b[row+1];
3991       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+2];
3992       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+3];
3993       x[row++] = tmp1;
3994       x[row++] = tmp2;
3995       cnt     += 4;
3996       break;
3997     case 3:
3998       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2];
3999       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6];
4000       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7];
4001       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8];
4002       x[row++] = tmp1;
4003       x[row++] = tmp2;
4004       x[row++] = tmp3;
4005       cnt     += 9;
4006       break;
4007     case 4:
4008       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3];
4009       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12];
4010       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13];
4011       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14];
4012       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15];
4013       x[row++] = tmp1;
4014       x[row++] = tmp2;
4015       x[row++] = tmp3;
4016       x[row++] = tmp4;
4017       cnt     += 16;
4018       break;
4019     case 5:
4020       x1       = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4];
4021       tmp1     = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20];
4022       tmp2     = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21];
4023       tmp3     = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22];
4024       tmp4     = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23];
4025       tmp5     = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24];
4026       x[row++] = tmp1;
4027       x[row++] = tmp2;
4028       x[row++] = tmp3;
4029       x[row++] = tmp4;
4030       x[row++] = tmp5;
4031       cnt     += 25;
4032       break;
4033     default:
4034       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]);
4035     }
4036   }
4037   ierr = PetscLogFlops(2.0*cnt);CHKERRQ(ierr);
4038   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4039   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
4040   PetscFunctionReturn(0);
4041 }
4042 
MatSeqAIJ_Inode_ResetOps(Mat A)4043 static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A)
4044 {
4045   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4046 
4047   PetscFunctionBegin;
4048   a->inode.node_count       = 0;
4049   a->inode.use              = PETSC_FALSE;
4050   a->inode.checked          = PETSC_FALSE;
4051   a->inode.mat_nonzerostate = -1;
4052   A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4053   A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4054   A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4055   A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4056   A->ops->coloringpatch     = NULL;
4057   A->ops->multdiagonalblock = NULL;
4058   if (A->factortype) {
4059     A->ops->solve = MatSolve_SeqAIJ_inplace;
4060   }
4061   PetscFunctionReturn(0);
4062 }
4063 
4064 /*
4065     samestructure indicates that the matrix has not changed its nonzero structure so we
4066     do not need to recompute the inodes
4067 */
MatSeqAIJCheckInode(Mat A)4068 PetscErrorCode MatSeqAIJCheckInode(Mat A)
4069 {
4070   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4071   PetscErrorCode ierr;
4072   PetscInt       i,j,m,nzx,nzy,*ns,node_count,blk_size;
4073   PetscBool      flag;
4074   const PetscInt *idx,*idy,*ii;
4075 
4076   PetscFunctionBegin;
4077   if (!a->inode.use) {
4078     ierr = MatSeqAIJ_Inode_ResetOps(A);CHKERRQ(ierr);
4079     ierr = PetscFree(a->inode.size);CHKERRQ(ierr);
4080     PetscFunctionReturn(0);
4081   }
4082   if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(0);
4083 
4084   m = A->rmap->n;
4085   if (!a->inode.size) { ierr = PetscMalloc1(m+1,&a->inode.size);CHKERRQ(ierr); }
4086   ns = a->inode.size;
4087 
4088   i          = 0;
4089   node_count = 0;
4090   idx        = a->j;
4091   ii         = a->i;
4092   while (i < m) {                /* For each row */
4093     nzx = ii[i+1] - ii[i];       /* Number of nonzeros */
4094     /* Limits the number of elements in a node to 'a->inode.limit' */
4095     for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4096       nzy = ii[j+1] - ii[j];     /* Same number of nonzeros */
4097       if (nzy != nzx) break;
4098       idy += nzx;              /* Same nonzero pattern */
4099       ierr = PetscArraycmp(idx,idy,nzx,&flag);CHKERRQ(ierr);
4100       if (!flag) break;
4101     }
4102     ns[node_count++] = blk_size;
4103     idx             += blk_size*nzx;
4104     i                = j;
4105   }
4106 
4107   /* If not enough inodes found,, do not use inode version of the routines */
4108   if (!m || node_count > .8*m) {
4109     ierr = MatSeqAIJ_Inode_ResetOps(A);CHKERRQ(ierr);
4110     ierr = PetscFree(a->inode.size);CHKERRQ(ierr);
4111     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4112   } else {
4113     if (!A->factortype) {
4114       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4115       if (A->rmap->n == A->cmap->n) {
4116         A->ops->getrowij        = MatGetRowIJ_SeqAIJ_Inode;
4117         A->ops->restorerowij    = MatRestoreRowIJ_SeqAIJ_Inode;
4118         A->ops->getcolumnij     = MatGetColumnIJ_SeqAIJ_Inode;
4119         A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
4120         A->ops->coloringpatch   = MatColoringPatch_SeqAIJ_Inode;
4121       }
4122     } else {
4123       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4124     }
4125     a->inode.node_count = node_count;
4126     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4127   }
4128   a->inode.checked          = PETSC_TRUE;
4129   a->inode.mat_nonzerostate = A->nonzerostate;
4130   PetscFunctionReturn(0);
4131 }
4132 
MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat * C)4133 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C)
4134 {
4135   Mat            B =*C;
4136   Mat_SeqAIJ     *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data;
4137   PetscErrorCode ierr;
4138   PetscInt       m=A->rmap->n;
4139 
4140   PetscFunctionBegin;
4141   c->inode.use              = a->inode.use;
4142   c->inode.limit            = a->inode.limit;
4143   c->inode.max_limit        = a->inode.max_limit;
4144   c->inode.checked          = PETSC_FALSE;
4145   c->inode.size             = NULL;
4146   c->inode.node_count       = 0;
4147   c->inode.ibdiagvalid      = PETSC_FALSE;
4148   c->inode.ibdiag           = NULL;
4149   c->inode.bdiag            = NULL;
4150   c->inode.mat_nonzerostate = -1;
4151   if (a->inode.use) {
4152     if (a->inode.checked && a->inode.size) {
4153       ierr = PetscMalloc1(m+1,&c->inode.size);CHKERRQ(ierr);
4154       ierr = PetscArraycpy(c->inode.size,a->inode.size,m+1);CHKERRQ(ierr);
4155 
4156       c->inode.checked          = PETSC_TRUE;
4157       c->inode.node_count       = a->inode.node_count;
4158       c->inode.mat_nonzerostate = (*C)->nonzerostate;
4159     }
4160     /* note the table of functions below should match that in MatSeqAIJCheckInode() */
4161     if (!B->factortype) {
4162       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
4163       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
4164       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
4165       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
4166       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
4167       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4168     } else {
4169       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4170     }
4171   }
4172   PetscFunctionReturn(0);
4173 }
4174 
MatGetRow_FactoredLU(PetscInt * cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt * ai,const PetscInt * aj,const PetscInt * adiag,PetscInt row)4175 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row)
4176 {
4177   PetscInt       k;
4178   const PetscInt *vi;
4179 
4180   PetscFunctionBegin;
4181   vi = aj + ai[row];
4182   for (k=0; k<nzl; k++) cols[k] = vi[k];
4183   vi        = aj + adiag[row];
4184   cols[nzl] = vi[0];
4185   vi        = aj + adiag[row+1]+1;
4186   for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k];
4187   PetscFunctionReturn(0);
4188 }
4189 /*
4190    MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4191    Modified from MatSeqAIJCheckInode().
4192 
4193    Input Parameters:
4194 .  Mat A - ILU or LU matrix factor
4195 
4196 */
MatSeqAIJCheckInode_FactorLU(Mat A)4197 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4198 {
4199   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
4200   PetscErrorCode ierr;
4201   PetscInt       i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size;
4202   PetscInt       *cols1,*cols2,*ns;
4203   const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag;
4204   PetscBool      flag;
4205 
4206   PetscFunctionBegin;
4207   if (!a->inode.use)    PetscFunctionReturn(0);
4208   if (a->inode.checked) PetscFunctionReturn(0);
4209 
4210   m = A->rmap->n;
4211   if (a->inode.size) ns = a->inode.size;
4212   else {
4213     ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr);
4214   }
4215 
4216   i          = 0;
4217   node_count = 0;
4218   ierr = PetscMalloc2(m,&cols1,m,&cols2);CHKERRQ(ierr);
4219   while (i < m) {                /* For each row */
4220     nzl1 = ai[i+1] - ai[i];       /* Number of nonzeros in L */
4221     nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/
4222     nzx  = nzl1 + nzu1 + 1;
4223     MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i);
4224 
4225     /* Limits the number of elements in a node to 'a->inode.limit' */
4226     for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
4227       nzl2 = ai[j+1] - ai[j];
4228       nzu2 = adiag[j] - adiag[j+1] - 1;
4229       nzy  = nzl2 + nzu2 + 1;
4230       if (nzy != nzx) break;
4231       ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr);
4232       ierr = PetscArraycmp(cols1,cols2,nzx,&flag);CHKERRQ(ierr);
4233       if (!flag) break;
4234     }
4235     ns[node_count++] = blk_size;
4236     i                = j;
4237   }
4238   ierr             = PetscFree2(cols1,cols2);CHKERRQ(ierr);
4239   /* If not enough inodes found,, do not use inode version of the routines */
4240   if (!m || node_count > .8*m) {
4241     ierr = PetscFree(ns);CHKERRQ(ierr);
4242 
4243     a->inode.node_count = 0;
4244     a->inode.size       = NULL;
4245     a->inode.use        = PETSC_FALSE;
4246 
4247     ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr);
4248   } else {
4249     A->ops->mult              = NULL;
4250     A->ops->sor               = NULL;
4251     A->ops->multadd           = NULL;
4252     A->ops->getrowij          = NULL;
4253     A->ops->restorerowij      = NULL;
4254     A->ops->getcolumnij       = NULL;
4255     A->ops->restorecolumnij   = NULL;
4256     A->ops->coloringpatch     = NULL;
4257     A->ops->multdiagonalblock = NULL;
4258     a->inode.node_count       = node_count;
4259     a->inode.size             = ns;
4260 
4261     ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr);
4262   }
4263   a->inode.checked = PETSC_TRUE;
4264   PetscFunctionReturn(0);
4265 }
4266 
MatSeqAIJInvalidateDiagonal_Inode(Mat A)4267 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4268 {
4269   Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
4270 
4271   PetscFunctionBegin;
4272   a->inode.ibdiagvalid = PETSC_FALSE;
4273   PetscFunctionReturn(0);
4274 }
4275 
4276 /*
4277      This is really ugly. if inodes are used this replaces the
4278   permutations with ones that correspond to rows/cols of the matrix
4279   rather then inode blocks
4280 */
MatInodeAdjustForInodes(Mat A,IS * rperm,IS * cperm)4281 PetscErrorCode  MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
4282 {
4283   PetscErrorCode ierr;
4284 
4285   PetscFunctionBegin;
4286   ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr);
4287   PetscFunctionReturn(0);
4288 }
4289 
MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS * rperm,IS * cperm)4290 PetscErrorCode  MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm)
4291 {
4292   Mat_SeqAIJ     *a=(Mat_SeqAIJ*)A->data;
4293   PetscErrorCode ierr;
4294   PetscInt       m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count;
4295   const PetscInt *ridx,*cidx;
4296   PetscInt       row,col,*permr,*permc,*ns_row =  a->inode.size,*tns,start_val,end_val,indx;
4297   PetscInt       nslim_col,*ns_col;
4298   IS             ris = *rperm,cis = *cperm;
4299 
4300   PetscFunctionBegin;
4301   if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */
4302   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
4303 
4304   ierr = MatCreateColInode_Private(A,&nslim_col,&ns_col);CHKERRQ(ierr);
4305   ierr = PetscMalloc1(((nslim_row>nslim_col) ? nslim_row : nslim_col)+1,&tns);CHKERRQ(ierr);
4306   ierr = PetscMalloc2(m,&permr,n,&permc);CHKERRQ(ierr);
4307 
4308   ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr);
4309   ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr);
4310 
4311   /* Form the inode structure for the rows of permuted matric using inv perm*/
4312   for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
4313 
4314   /* Construct the permutations for rows*/
4315   for (i=0,row = 0; i<nslim_row; ++i) {
4316     indx      = ridx[i];
4317     start_val = tns[indx];
4318     end_val   = tns[indx + 1];
4319     for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
4320   }
4321 
4322   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4323   for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
4324 
4325   /* Construct permutations for columns */
4326   for (i=0,col=0; i<nslim_col; ++i) {
4327     indx      = cidx[i];
4328     start_val = tns[indx];
4329     end_val   = tns[indx + 1];
4330     for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
4331   }
4332 
4333   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);CHKERRQ(ierr);
4334   ierr = ISSetPermutation(*rperm);CHKERRQ(ierr);
4335   ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);CHKERRQ(ierr);
4336   ierr = ISSetPermutation(*cperm);CHKERRQ(ierr);
4337 
4338   ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr);
4339   ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr);
4340 
4341   ierr = PetscFree(ns_col);CHKERRQ(ierr);
4342   ierr = PetscFree2(permr,permc);CHKERRQ(ierr);
4343   ierr = ISDestroy(&cis);CHKERRQ(ierr);
4344   ierr = ISDestroy(&ris);CHKERRQ(ierr);
4345   ierr = PetscFree(tns);CHKERRQ(ierr);
4346   PetscFunctionReturn(0);
4347 }
4348 
4349 /*@C
4350    MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
4351 
4352    Not Collective
4353 
4354    Input Parameter:
4355 .  A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
4356 
4357    Output Parameter:
4358 +  node_count - no of inodes present in the matrix.
4359 .  sizes      - an array of size node_count,with sizes of each inode.
4360 -  limit      - the max size used to generate the inodes.
4361 
4362    Level: advanced
4363 
4364    Notes:
4365     This routine returns some internal storage information
4366    of the matrix, it is intended to be used by advanced users.
4367    It should be called after the matrix is assembled.
4368    The contents of the sizes[] array should not be changed.
4369    NULL may be passed for information not requested.
4370 
4371 .seealso: MatGetInfo()
4372 @*/
MatInodeGetInodeSizes(Mat A,PetscInt * node_count,PetscInt * sizes[],PetscInt * limit)4373 PetscErrorCode  MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4374 {
4375   PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
4376 
4377   PetscFunctionBegin;
4378   if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
4379   ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);CHKERRQ(ierr);
4380   if (f) {
4381     ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr);
4382   }
4383   PetscFunctionReturn(0);
4384 }
4385 
MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt * node_count,PetscInt * sizes[],PetscInt * limit)4386 PetscErrorCode  MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
4387 {
4388   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
4389 
4390   PetscFunctionBegin;
4391   if (node_count) *node_count = a->inode.node_count;
4392   if (sizes)      *sizes      = a->inode.size;
4393   if (limit)      *limit      = a->inode.limit;
4394   PetscFunctionReturn(0);
4395 }
4396