1 
2 /*
3     Factorization code for SBAIJ format.
4 */
5 
6 #include <../src/mat/impls/sbaij/seq/sbaij.h>
7 #include <../src/mat/impls/baij/seq/baij.h>
8 #include <petsc/private/kernels/blockinvert.h>
9 
MatSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)10 PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
11 {
12   Mat_SeqSBAIJ      *a   =(Mat_SeqSBAIJ*)A->data;
13   IS                isrow=a->row;
14   PetscInt          mbs  =a->mbs,*ai=a->i,*aj=a->j;
15   PetscErrorCode    ierr;
16   const PetscInt    *r;
17   PetscInt          nz,*vj,k,idx,k1;
18   PetscInt          bs =A->rmap->bs,bs2 = a->bs2;
19   const MatScalar   *aa=a->a,*v,*diag;
20   PetscScalar       *x,*xk,*xj,*xk_tmp,*t;
21   const PetscScalar *b;
22 
23   PetscFunctionBegin;
24   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
25   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
26   t    = a->solve_work;
27   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
28   ierr = PetscMalloc1(bs,&xk_tmp);CHKERRQ(ierr);
29 
30   /* solve U^T * D * y = b by forward substitution */
31   xk = t;
32   for (k=0; k<mbs; k++) { /* t <- perm(b) */
33     idx = bs*r[k];
34     for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1];
35   }
36   for (k=0; k<mbs; k++) {
37     v    = aa + bs2*ai[k];
38     xk   = t + k*bs;    /* Dk*xk = k-th block of x */
39     ierr = PetscArraycpy(xk_tmp,xk,bs);CHKERRQ(ierr); /* xk_tmp <- xk */
40     nz   = ai[k+1] - ai[k];
41     vj   = aj + ai[k];
42     xj   = t + (*vj)*bs; /* *vj-th block of x, *vj>k */
43     while (nz--) {
44       /* x(:) += U(k,:)^T*(Dk*xk) */
45       PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
46       vj++; xj = t + (*vj)*bs;
47       v       += bs2;
48     }
49     /* xk = inv(Dk)*(Dk*xk) */
50     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
51     PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
52   }
53 
54   /* solve U*x = y by back substitution */
55   for (k=mbs-1; k>=0; k--) {
56     v  = aa + bs2*ai[k];
57     xk = t + k*bs;        /* xk */
58     nz = ai[k+1] - ai[k];
59     vj = aj + ai[k];
60     xj = t + (*vj)*bs;
61     while (nz--) {
62       /* xk += U(k,:)*x(:) */
63       PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
64       vj++;
65       v += bs2; xj = t + (*vj)*bs;
66     }
67     idx = bs*r[k];
68     for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++;
69   }
70 
71   ierr = PetscFree(xk_tmp);CHKERRQ(ierr);
72   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
73   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
74   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
75   ierr = PetscLogFlops(4.0*bs2*a->nz -(bs+2.0*bs2)*mbs);CHKERRQ(ierr);
76   PetscFunctionReturn(0);
77 }
78 
MatForwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)79 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
80 {
81   PetscFunctionBegin;
82   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"not implemented yet");
83   PetscFunctionReturn(0);
84 }
85 
MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)86 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
87 {
88   PetscFunctionBegin;
89   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not yet implemented");
90   PetscFunctionReturn(0);
91 }
92 
MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt * ai,const PetscInt * aj,const MatScalar * aa,PetscInt mbs,PetscInt bs,PetscScalar * x)93 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
94 {
95   PetscErrorCode  ierr;
96   PetscInt        nz,k;
97   const PetscInt  *vj,bs2 = bs*bs;
98   const MatScalar *v,*diag;
99   PetscScalar     *xk,*xj,*xk_tmp;
100 
101   PetscFunctionBegin;
102   ierr = PetscMalloc1(bs,&xk_tmp);CHKERRQ(ierr);
103   for (k=0; k<mbs; k++) {
104     v    = aa + bs2*ai[k];
105     xk   = x + k*bs;    /* Dk*xk = k-th block of x */
106     ierr = PetscArraycpy(xk_tmp,xk,bs);CHKERRQ(ierr); /* xk_tmp <- xk */
107     nz   = ai[k+1] - ai[k];
108     vj   = aj + ai[k];
109     xj   = x + (*vj)*bs; /* *vj-th block of x, *vj>k */
110     while (nz--) {
111       /* x(:) += U(k,:)^T*(Dk*xk) */
112       PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
113       vj++; xj = x + (*vj)*bs;
114       v       += bs2;
115     }
116     /* xk = inv(Dk)*(Dk*xk) */
117     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
118     PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
119   }
120   ierr = PetscFree(xk_tmp);CHKERRQ(ierr);
121   PetscFunctionReturn(0);
122 }
123 
MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt * ai,const PetscInt * aj,const MatScalar * aa,PetscInt mbs,PetscInt bs,PetscScalar * x)124 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
125 {
126   PetscInt        nz,k;
127   const PetscInt  *vj,bs2 = bs*bs;
128   const MatScalar *v;
129   PetscScalar     *xk,*xj;
130 
131   PetscFunctionBegin;
132   for (k=mbs-1; k>=0; k--) {
133     v  = aa + bs2*ai[k];
134     xk = x + k*bs;        /* xk */
135     nz = ai[k+1] - ai[k];
136     vj = aj + ai[k];
137     xj = x + (*vj)*bs;
138     while (nz--) {
139       /* xk += U(k,:)*x(:) */
140       PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
141       vj++;
142       v += bs2; xj = x + (*vj)*bs;
143     }
144   }
145   PetscFunctionReturn(0);
146 }
147 
MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)148 PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
149 {
150   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
151   PetscErrorCode    ierr;
152   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
153   PetscInt          bs =A->rmap->bs;
154   const MatScalar   *aa=a->a;
155   PetscScalar       *x;
156   const PetscScalar *b;
157 
158   PetscFunctionBegin;
159   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
160   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
161 
162   /* solve U^T * D * y = b by forward substitution */
163   ierr = PetscArraycpy(x,b,bs*mbs);CHKERRQ(ierr); /* x <- b */
164   ierr = MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr);
165 
166   /* solve U*x = y by back substitution */
167   ierr = MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr);
168 
169   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
170   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
171   ierr = PetscLogFlops(4.0*a->bs2*a->nz - (bs+2.0*a->bs2)*mbs);CHKERRQ(ierr);
172   PetscFunctionReturn(0);
173 }
174 
MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)175 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
176 {
177   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
178   PetscErrorCode    ierr;
179   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
180   PetscInt          bs =A->rmap->bs;
181   const MatScalar   *aa=a->a;
182   const PetscScalar *b;
183   PetscScalar       *x;
184 
185   PetscFunctionBegin;
186   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
187   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
188   ierr = PetscArraycpy(x,b,bs*mbs);CHKERRQ(ierr); /* x <- b */
189   ierr = MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr);
190   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
191   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
192   ierr = PetscLogFlops(2.0*a->bs2*a->nz - bs*mbs);CHKERRQ(ierr);
193   PetscFunctionReturn(0);
194 }
195 
MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)196 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
197 {
198   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
199   PetscErrorCode    ierr;
200   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
201   PetscInt          bs =A->rmap->bs;
202   const MatScalar   *aa=a->a;
203   const PetscScalar *b;
204   PetscScalar       *x;
205 
206   PetscFunctionBegin;
207   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
208   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
209   ierr = PetscArraycpy(x,b,bs*mbs);CHKERRQ(ierr);
210   ierr = MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr);
211   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
212   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
213   ierr = PetscLogFlops(2.0*a->bs2*(a->nz-mbs));CHKERRQ(ierr);
214   PetscFunctionReturn(0);
215 }
216 
MatSolve_SeqSBAIJ_7_inplace(Mat A,Vec bb,Vec xx)217 PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
218 {
219   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
220   IS                isrow=a->row;
221   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j,*r,*vj;
222   PetscErrorCode    ierr;
223   PetscInt          nz,k,idx;
224   const MatScalar   *aa=a->a,*v,*d;
225   PetscScalar       *x,x0,x1,x2,x3,x4,x5,x6,*t,*tp;
226   const PetscScalar *b;
227 
228   PetscFunctionBegin;
229   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
230   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
231   t    = a->solve_work;
232   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
233 
234   /* solve U^T * D * y = b by forward substitution */
235   tp = t;
236   for (k=0; k<mbs; k++) { /* t <- perm(b) */
237     idx   = 7*r[k];
238     tp[0] = b[idx];
239     tp[1] = b[idx+1];
240     tp[2] = b[idx+2];
241     tp[3] = b[idx+3];
242     tp[4] = b[idx+4];
243     tp[5] = b[idx+5];
244     tp[6] = b[idx+6];
245     tp   += 7;
246   }
247 
248   for (k=0; k<mbs; k++) {
249     v  = aa + 49*ai[k];
250     vj = aj + ai[k];
251     tp = t + k*7;
252     x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6];
253     nz = ai[k+1] - ai[k];
254     tp = t + (*vj)*7;
255     while (nz--) {
256       tp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
257       tp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
258       tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
259       tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
260       tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
261       tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
262       tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
263       vj++;
264       tp = t + (*vj)*7;
265       v += 49;
266     }
267 
268     /* xk = inv(Dk)*(Dk*xk) */
269     d     = aa+k*49;       /* ptr to inv(Dk) */
270     tp    = t + k*7;
271     tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
272     tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
273     tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
274     tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
275     tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
276     tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
277     tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
278   }
279 
280   /* solve U*x = y by back substitution */
281   for (k=mbs-1; k>=0; k--) {
282     v  = aa + 49*ai[k];
283     vj = aj + ai[k];
284     tp = t + k*7;
285     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];  x6=tp[6]; /* xk */
286     nz = ai[k+1] - ai[k];
287 
288     tp = t + (*vj)*7;
289     while (nz--) {
290       /* xk += U(k,:)*x(:) */
291       x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6];
292       x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6];
293       x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6];
294       x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6];
295       x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6];
296       x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6];
297       x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6];
298       vj++;
299       tp = t + (*vj)*7;
300       v += 49;
301     }
302     tp       = t + k*7;
303     tp[0]    = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6;
304     idx      = 7*r[k];
305     x[idx]   = x0;
306     x[idx+1] = x1;
307     x[idx+2] = x2;
308     x[idx+3] = x3;
309     x[idx+4] = x4;
310     x[idx+5] = x5;
311     x[idx+6] = x6;
312   }
313 
314   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
315   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
316   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
317   ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr);
318   PetscFunctionReturn(0);
319 }
320 
MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt * ai,const PetscInt * aj,const MatScalar * aa,PetscInt mbs,PetscScalar * x)321 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
322 {
323   const MatScalar *v,*d;
324   PetscScalar     *xp,x0,x1,x2,x3,x4,x5,x6;
325   PetscInt        nz,k;
326   const PetscInt  *vj;
327 
328   PetscFunctionBegin;
329   for (k=0; k<mbs; k++) {
330     v  = aa + 49*ai[k];
331     xp = x + k*7;
332     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */
333     nz = ai[k+1] - ai[k];
334     vj = aj + ai[k];
335     xp = x + (*vj)*7;
336     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
337     PetscPrefetchBlock(v+49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
338     while (nz--) {
339       /* x(:) += U(k,:)^T*(Dk*xk) */
340       xp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
341       xp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
342       xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
343       xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
344       xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
345       xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
346       xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
347       vj++;
348       xp = x + (*vj)*7;
349       v += 49;
350     }
351     /* xk = inv(Dk)*(Dk*xk) */
352     d     = aa+k*49;       /* ptr to inv(Dk) */
353     xp    = x + k*7;
354     xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
355     xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
356     xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
357     xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
358     xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
359     xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
360     xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
361   }
362   PetscFunctionReturn(0);
363 }
364 
MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt * ai,const PetscInt * aj,const MatScalar * aa,PetscInt mbs,PetscScalar * x)365 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
366 {
367   const MatScalar *v;
368   PetscScalar     *xp,x0,x1,x2,x3,x4,x5,x6;
369   PetscInt        nz,k;
370   const PetscInt  *vj;
371 
372   PetscFunctionBegin;
373   for (k=mbs-1; k>=0; k--) {
374     v  = aa + 49*ai[k];
375     xp = x + k*7;
376     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */
377     nz = ai[k+1] - ai[k];
378     vj = aj + ai[k];
379     xp = x + (*vj)*7;
380     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
381     PetscPrefetchBlock(v-49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
382     while (nz--) {
383       /* xk += U(k,:)*x(:) */
384       x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6];
385       x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6];
386       x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6];
387       x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6];
388       x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6];
389       x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6];
390       x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6];
391       vj++;
392       v += 49; xp = x + (*vj)*7;
393     }
394     xp   = x + k*7;
395     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6;
396   }
397   PetscFunctionReturn(0);
398 }
399 
MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)400 PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
401 {
402   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
403   PetscErrorCode    ierr;
404   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
405   const MatScalar   *aa=a->a;
406   PetscScalar       *x;
407   const PetscScalar *b;
408 
409   PetscFunctionBegin;
410   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
411   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
412 
413   /* solve U^T * D * y = b by forward substitution */
414   ierr = PetscArraycpy(x,b,7*mbs);CHKERRQ(ierr); /* x <- b */
415   ierr = MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
416 
417   /* solve U*x = y by back substitution */
418   ierr = MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
419 
420   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
421   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
422   ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr);
423   PetscFunctionReturn(0);
424 }
425 
MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)426 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
427 {
428   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
429   PetscErrorCode    ierr;
430   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
431   const MatScalar   *aa=a->a;
432   PetscScalar       *x;
433   const PetscScalar *b;
434 
435   PetscFunctionBegin;
436   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
437   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
438   ierr = PetscArraycpy(x,b,7*mbs);CHKERRQ(ierr);
439   ierr = MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
440   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
441   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
442   ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr);
443   PetscFunctionReturn(0);
444 }
445 
MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)446 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
447 {
448   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
449   PetscErrorCode    ierr;
450   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
451   const MatScalar   *aa=a->a;
452   PetscScalar       *x;
453   const PetscScalar *b;
454 
455   PetscFunctionBegin;
456   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
457   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
458   ierr = PetscArraycpy(x,b,7*mbs);CHKERRQ(ierr);
459   ierr = MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
460   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
461   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
462   ierr = PetscLogFlops(2.0*a->bs2*(a->nz-mbs));CHKERRQ(ierr);
463   PetscFunctionReturn(0);
464 }
465 
MatSolve_SeqSBAIJ_6_inplace(Mat A,Vec bb,Vec xx)466 PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
467 {
468   Mat_SeqSBAIJ      *a   =(Mat_SeqSBAIJ*)A->data;
469   IS                isrow=a->row;
470   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j,*r,*vj;
471   PetscErrorCode    ierr;
472   PetscInt          nz,k,idx;
473   const MatScalar   *aa=a->a,*v,*d;
474   PetscScalar       *x,x0,x1,x2,x3,x4,x5,*t,*tp;
475   const PetscScalar *b;
476 
477   PetscFunctionBegin;
478   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
479   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
480   t    = a->solve_work;
481   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
482 
483   /* solve U^T * D * y = b by forward substitution */
484   tp = t;
485   for (k=0; k<mbs; k++) { /* t <- perm(b) */
486     idx   = 6*r[k];
487     tp[0] = b[idx];
488     tp[1] = b[idx+1];
489     tp[2] = b[idx+2];
490     tp[3] = b[idx+3];
491     tp[4] = b[idx+4];
492     tp[5] = b[idx+5];
493     tp   += 6;
494   }
495 
496   for (k=0; k<mbs; k++) {
497     v  = aa + 36*ai[k];
498     vj = aj + ai[k];
499     tp = t + k*6;
500     x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];
501     nz = ai[k+1] - ai[k];
502     tp = t + (*vj)*6;
503     while (nz--) {
504       tp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
505       tp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
506       tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
507       tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
508       tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
509       tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
510       vj++;
511       tp = t + (*vj)*6;
512       v += 36;
513     }
514 
515     /* xk = inv(Dk)*(Dk*xk) */
516     d     = aa+k*36;       /* ptr to inv(Dk) */
517     tp    = t + k*6;
518     tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
519     tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
520     tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
521     tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
522     tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
523     tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
524   }
525 
526   /* solve U*x = y by back substitution */
527   for (k=mbs-1; k>=0; k--) {
528     v  = aa + 36*ai[k];
529     vj = aj + ai[k];
530     tp = t + k*6;
531     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; /* xk */
532     nz = ai[k+1] - ai[k];
533 
534     tp = t + (*vj)*6;
535     while (nz--) {
536       /* xk += U(k,:)*x(:) */
537       x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5];
538       x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5];
539       x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5];
540       x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5];
541       x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5];
542       x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5];
543       vj++;
544       tp = t + (*vj)*6;
545       v += 36;
546     }
547     tp       = t + k*6;
548     tp[0]    = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5;
549     idx      = 6*r[k];
550     x[idx]   = x0;
551     x[idx+1] = x1;
552     x[idx+2] = x2;
553     x[idx+3] = x3;
554     x[idx+4] = x4;
555     x[idx+5] = x5;
556   }
557 
558   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
559   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
560   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
561   ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr);
562   PetscFunctionReturn(0);
563 }
564 
MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt * ai,const PetscInt * aj,const MatScalar * aa,PetscInt mbs,PetscScalar * x)565 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
566 {
567   const MatScalar *v,*d;
568   PetscScalar     *xp,x0,x1,x2,x3,x4,x5;
569   PetscInt        nz,k;
570   const PetscInt  *vj;
571 
572   PetscFunctionBegin;
573   for (k=0; k<mbs; k++) {
574     v  = aa + 36*ai[k];
575     xp = x + k*6;
576     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */
577     nz = ai[k+1] - ai[k];
578     vj = aj + ai[k];
579     xp = x + (*vj)*6;
580     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
581     PetscPrefetchBlock(v+36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
582     while (nz--) {
583       /* x(:) += U(k,:)^T*(Dk*xk) */
584       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
585       xp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
586       xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
587       xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
588       xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
589       xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
590       vj++;
591       xp = x + (*vj)*6;
592       v += 36;
593     }
594     /* xk = inv(Dk)*(Dk*xk) */
595     d     = aa+k*36;       /* ptr to inv(Dk) */
596     xp    = x + k*6;
597     xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
598     xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
599     xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
600     xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
601     xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
602     xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
603   }
604   PetscFunctionReturn(0);
605 }
MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt * ai,const PetscInt * aj,const MatScalar * aa,PetscInt mbs,PetscScalar * x)606 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
607 {
608   const MatScalar   *v;
609   PetscScalar       *xp,x0,x1,x2,x3,x4,x5;
610   PetscInt          nz,k;
611   const PetscInt    *vj;
612 
613   PetscFunctionBegin;
614   for (k=mbs-1; k>=0; k--) {
615     v  = aa + 36*ai[k];
616     xp = x + k*6;
617     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */
618     nz = ai[k+1] - ai[k];
619     vj = aj + ai[k];
620     xp = x + (*vj)*6;
621     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
622     PetscPrefetchBlock(v-36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
623     while (nz--) {
624       /* xk += U(k,:)*x(:) */
625       x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5];
626       x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5];
627       x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5];
628       x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5];
629       x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5];
630       x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5];
631       vj++;
632       v += 36; xp = x + (*vj)*6;
633     }
634     xp   = x + k*6;
635     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5;
636   }
637   PetscFunctionReturn(0);
638 }
639 
640 
MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)641 PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
642 {
643   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
644   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
645   const MatScalar   *aa=a->a;
646   PetscScalar       *x;
647   const PetscScalar *b;
648   PetscErrorCode    ierr;
649 
650   PetscFunctionBegin;
651   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
652   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
653 
654   /* solve U^T * D * y = b by forward substitution */
655   ierr = PetscArraycpy(x,b,6*mbs);CHKERRQ(ierr); /* x <- b */
656   ierr = MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
657 
658   /* solve U*x = y by back substitution */
659   ierr = MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
660 
661   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
662   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
663   ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr);
664   PetscFunctionReturn(0);
665 }
666 
MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)667 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
668 {
669   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
670   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
671   const MatScalar   *aa=a->a;
672   PetscScalar       *x;
673   const PetscScalar *b;
674   PetscErrorCode    ierr;
675 
676   PetscFunctionBegin;
677   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
678   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
679   ierr = PetscArraycpy(x,b,6*mbs);CHKERRQ(ierr); /* x <- b */
680   ierr = MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
681   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
682   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
683   ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr);
684   PetscFunctionReturn(0);
685 }
686 
MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)687 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
688 {
689   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
690   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
691   const MatScalar   *aa=a->a;
692   PetscScalar       *x;
693   const PetscScalar *b;
694   PetscErrorCode    ierr;
695 
696   PetscFunctionBegin;
697   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
698   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
699   ierr = PetscArraycpy(x,b,6*mbs);CHKERRQ(ierr); /* x <- b */
700   ierr = MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
701   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
702   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
703   ierr = PetscLogFlops(2.0*a->bs2*(a->nz - mbs));CHKERRQ(ierr);
704   PetscFunctionReturn(0);
705 }
706 
MatSolve_SeqSBAIJ_5_inplace(Mat A,Vec bb,Vec xx)707 PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
708 {
709   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
710   IS                isrow=a->row;
711   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j;
712   PetscErrorCode    ierr;
713   const PetscInt    *r,*vj;
714   PetscInt          nz,k,idx;
715   const MatScalar   *aa=a->a,*v,*diag;
716   PetscScalar       *x,x0,x1,x2,x3,x4,*t,*tp;
717   const PetscScalar *b;
718 
719   PetscFunctionBegin;
720   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
721   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
722   t    = a->solve_work;
723   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
724 
725   /* solve U^T * D * y = b by forward substitution */
726   tp = t;
727   for (k=0; k<mbs; k++) { /* t <- perm(b) */
728     idx   = 5*r[k];
729     tp[0] = b[idx];
730     tp[1] = b[idx+1];
731     tp[2] = b[idx+2];
732     tp[3] = b[idx+3];
733     tp[4] = b[idx+4];
734     tp   += 5;
735   }
736 
737   for (k=0; k<mbs; k++) {
738     v  = aa + 25*ai[k];
739     vj = aj + ai[k];
740     tp = t + k*5;
741     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];
742     nz = ai[k+1] - ai[k];
743 
744     tp = t + (*vj)*5;
745     while (nz--) {
746       tp[0] +=  v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
747       tp[1] +=  v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
748       tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4;
749       tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4;
750       tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4;
751       vj++;
752       tp = t + (*vj)*5;
753       v += 25;
754     }
755 
756     /* xk = inv(Dk)*(Dk*xk) */
757     diag  = aa+k*25;          /* ptr to inv(Dk) */
758     tp    = t + k*5;
759     tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
760     tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
761     tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
762     tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
763     tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
764   }
765 
766   /* solve U*x = y by back substitution */
767   for (k=mbs-1; k>=0; k--) {
768     v  = aa + 25*ai[k];
769     vj = aj + ai[k];
770     tp = t + k*5;
771     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; /* xk */
772     nz = ai[k+1] - ai[k];
773 
774     tp = t + (*vj)*5;
775     while (nz--) {
776       /* xk += U(k,:)*x(:) */
777       x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4];
778       x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4];
779       x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4];
780       x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4];
781       x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4];
782       vj++;
783       tp = t + (*vj)*5;
784       v += 25;
785     }
786     tp       = t + k*5;
787     tp[0]    = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4;
788     idx      = 5*r[k];
789     x[idx]   = x0;
790     x[idx+1] = x1;
791     x[idx+2] = x2;
792     x[idx+3] = x3;
793     x[idx+4] = x4;
794   }
795 
796   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
797   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
798   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
799   ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr);
800   PetscFunctionReturn(0);
801 }
802 
MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt * ai,const PetscInt * aj,const MatScalar * aa,PetscInt mbs,PetscScalar * x)803 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
804 {
805   const MatScalar *v,*diag;
806   PetscScalar     *xp,x0,x1,x2,x3,x4;
807   PetscInt        nz,k;
808   const PetscInt  *vj;
809 
810   PetscFunctionBegin;
811   for (k=0; k<mbs; k++) {
812     v  = aa + 25*ai[k];
813     xp = x + k*5;
814     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */
815     nz = ai[k+1] - ai[k];
816     vj = aj + ai[k];
817     xp = x + (*vj)*5;
818     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
819     PetscPrefetchBlock(v+25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
820     while (nz--) {
821       /* x(:) += U(k,:)^T*(Dk*xk) */
822       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4;
823       xp[1] +=  v[5]*x0 +  v[6]*x1 +  v[7]*x2 + v[8]*x3 + v[9]*x4;
824       xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4;
825       xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4;
826       xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4;
827       vj++;
828       xp = x + (*vj)*5;
829       v += 25;
830     }
831     /* xk = inv(Dk)*(Dk*xk) */
832     diag  = aa+k*25;         /* ptr to inv(Dk) */
833     xp    = x + k*5;
834     xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
835     xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
836     xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
837     xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
838     xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
839   }
840   PetscFunctionReturn(0);
841 }
842 
MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt * ai,const PetscInt * aj,const MatScalar * aa,PetscInt mbs,PetscScalar * x)843 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
844 {
845   const MatScalar *v;
846   PetscScalar     *xp,x0,x1,x2,x3,x4;
847   PetscInt        nz,k;
848   const PetscInt  *vj;
849 
850   PetscFunctionBegin;
851   for (k=mbs-1; k>=0; k--) {
852     v  = aa + 25*ai[k];
853     xp = x + k*5;
854     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; /* xk */
855     nz = ai[k+1] - ai[k];
856     vj = aj + ai[k];
857     xp = x + (*vj)*5;
858     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
859     PetscPrefetchBlock(v-25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
860     while (nz--) {
861       /* xk += U(k,:)*x(:) */
862       x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4];
863       x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4];
864       x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4];
865       x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4];
866       x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4];
867       vj++;
868       v += 25; xp = x + (*vj)*5;
869     }
870     xp   = x + k*5;
871     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4;
872   }
873   PetscFunctionReturn(0);
874 }
875 
MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)876 PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
877 {
878   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
879   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
880   const MatScalar   *aa=a->a;
881   PetscScalar       *x;
882   const PetscScalar *b;
883   PetscErrorCode    ierr;
884 
885   PetscFunctionBegin;
886   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
887   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
888 
889   /* solve U^T * D * y = b by forward substitution */
890   ierr = PetscArraycpy(x,b,5*mbs);CHKERRQ(ierr); /* x <- b */
891   ierr = MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
892 
893   /* solve U*x = y by back substitution */
894   ierr = MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
895 
896   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
897   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
898   ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr);
899   PetscFunctionReturn(0);
900 }
901 
MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)902 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
903 {
904   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
905   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
906   const MatScalar   *aa=a->a;
907   PetscScalar       *x;
908   const PetscScalar *b;
909   PetscErrorCode    ierr;
910 
911   PetscFunctionBegin;
912   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
913   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
914   ierr = PetscArraycpy(x,b,5*mbs);CHKERRQ(ierr); /* x <- b */
915   ierr = MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
916   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
917   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
918   ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr);
919   PetscFunctionReturn(0);
920 }
921 
MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)922 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
923 {
924   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
925   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
926   const MatScalar   *aa=a->a;
927   PetscScalar       *x;
928   const PetscScalar *b;
929   PetscErrorCode    ierr;
930 
931   PetscFunctionBegin;
932   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
933   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
934   ierr = PetscArraycpy(x,b,5*mbs);CHKERRQ(ierr);
935   ierr = MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
936   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
937   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
938   ierr = PetscLogFlops(2.0*a->bs2*(a->nz-mbs));CHKERRQ(ierr);
939   PetscFunctionReturn(0);
940 }
941 
MatSolve_SeqSBAIJ_4_inplace(Mat A,Vec bb,Vec xx)942 PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
943 {
944   Mat_SeqSBAIJ      *a   =(Mat_SeqSBAIJ*)A->data;
945   IS                isrow=a->row;
946   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j;
947   PetscErrorCode    ierr;
948   const PetscInt    *r,*vj;
949   PetscInt          nz,k,idx;
950   const MatScalar   *aa=a->a,*v,*diag;
951   PetscScalar       *x,x0,x1,x2,x3,*t,*tp;
952   const PetscScalar *b;
953 
954   PetscFunctionBegin;
955   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
956   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
957   t    = a->solve_work;
958   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
959 
960   /* solve U^T * D * y = b by forward substitution */
961   tp = t;
962   for (k=0; k<mbs; k++) { /* t <- perm(b) */
963     idx   = 4*r[k];
964     tp[0] = b[idx];
965     tp[1] = b[idx+1];
966     tp[2] = b[idx+2];
967     tp[3] = b[idx+3];
968     tp   += 4;
969   }
970 
971   for (k=0; k<mbs; k++) {
972     v  = aa + 16*ai[k];
973     vj = aj + ai[k];
974     tp = t + k*4;
975     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3];
976     nz = ai[k+1] - ai[k];
977 
978     tp = t + (*vj)*4;
979     while (nz--) {
980       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
981       tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
982       tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
983       tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
984       vj++;
985       tp = t + (*vj)*4;
986       v += 16;
987     }
988 
989     /* xk = inv(Dk)*(Dk*xk) */
990     diag  = aa+k*16;          /* ptr to inv(Dk) */
991     tp    = t + k*4;
992     tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
993     tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
994     tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
995     tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
996   }
997 
998   /* solve U*x = y by back substitution */
999   for (k=mbs-1; k>=0; k--) {
1000     v  = aa + 16*ai[k];
1001     vj = aj + ai[k];
1002     tp = t + k*4;
1003     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */
1004     nz = ai[k+1] - ai[k];
1005 
1006     tp = t + (*vj)*4;
1007     while (nz--) {
1008       /* xk += U(k,:)*x(:) */
1009       x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3];
1010       x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3];
1011       x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3];
1012       x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3];
1013       vj++;
1014       tp = t + (*vj)*4;
1015       v += 16;
1016     }
1017     tp       = t + k*4;
1018     tp[0]    =x0; tp[1]=x1; tp[2]=x2; tp[3]=x3;
1019     idx      = 4*r[k];
1020     x[idx]   = x0;
1021     x[idx+1] = x1;
1022     x[idx+2] = x2;
1023     x[idx+3] = x3;
1024   }
1025 
1026   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1027   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1028   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1029   ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr);
1030   PetscFunctionReturn(0);
1031 }
1032 
MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt * ai,const PetscInt * aj,const MatScalar * aa,PetscInt mbs,PetscScalar * x)1033 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1034 {
1035   const MatScalar *v,*diag;
1036   PetscScalar     *xp,x0,x1,x2,x3;
1037   PetscInt        nz,k;
1038   const PetscInt  *vj;
1039 
1040   PetscFunctionBegin;
1041   for (k=0; k<mbs; k++) {
1042     v  = aa + 16*ai[k];
1043     xp = x + k*4;
1044     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */
1045     nz = ai[k+1] - ai[k];
1046     vj = aj + ai[k];
1047     xp = x + (*vj)*4;
1048     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
1049     PetscPrefetchBlock(v+16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1050     while (nz--) {
1051       /* x(:) += U(k,:)^T*(Dk*xk) */
1052       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
1053       xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
1054       xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
1055       xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
1056       vj++;
1057       xp = x + (*vj)*4;
1058       v += 16;
1059     }
1060     /* xk = inv(Dk)*(Dk*xk) */
1061     diag  = aa+k*16;         /* ptr to inv(Dk) */
1062     xp    = x + k*4;
1063     xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
1064     xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
1065     xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
1066     xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
1067   }
1068   PetscFunctionReturn(0);
1069 }
1070 
MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt * ai,const PetscInt * aj,const MatScalar * aa,PetscInt mbs,PetscScalar * x)1071 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1072 {
1073   const MatScalar *v;
1074   PetscScalar     *xp,x0,x1,x2,x3;
1075   PetscInt        nz,k;
1076   const PetscInt  *vj;
1077 
1078   PetscFunctionBegin;
1079   for (k=mbs-1; k>=0; k--) {
1080     v  = aa + 16*ai[k];
1081     xp = x + k*4;
1082     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */
1083     nz = ai[k+1] - ai[k];
1084     vj = aj + ai[k];
1085     xp = x + (*vj)*4;
1086     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
1087     PetscPrefetchBlock(v-16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1088     while (nz--) {
1089       /* xk += U(k,:)*x(:) */
1090       x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3];
1091       x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3];
1092       x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3];
1093       x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3];
1094       vj++;
1095       v += 16; xp = x + (*vj)*4;
1096     }
1097     xp    = x + k*4;
1098     xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3;
1099   }
1100   PetscFunctionReturn(0);
1101 }
1102 
MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)1103 PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1104 {
1105   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1106   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1107   const MatScalar   *aa=a->a;
1108   PetscScalar       *x;
1109   const PetscScalar *b;
1110   PetscErrorCode    ierr;
1111 
1112   PetscFunctionBegin;
1113   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1114   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1115 
1116   /* solve U^T * D * y = b by forward substitution */
1117   ierr = PetscArraycpy(x,b,4*mbs);CHKERRQ(ierr); /* x <- b */
1118   ierr = MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1119 
1120   /* solve U*x = y by back substitution */
1121   ierr = MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1122   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1123   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1124   ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr);
1125   PetscFunctionReturn(0);
1126 }
1127 
MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)1128 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1129 {
1130   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1131   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1132   const MatScalar   *aa=a->a;
1133   PetscScalar       *x;
1134   const PetscScalar *b;
1135   PetscErrorCode    ierr;
1136 
1137   PetscFunctionBegin;
1138   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1139   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1140   ierr = PetscArraycpy(x,b,4*mbs);CHKERRQ(ierr); /* x <- b */
1141   ierr = MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1142   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1143   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1144   ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr);
1145   PetscFunctionReturn(0);
1146 }
1147 
MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)1148 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1149 {
1150   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1151   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1152   const MatScalar   *aa=a->a;
1153   PetscScalar       *x;
1154   const PetscScalar *b;
1155   PetscErrorCode    ierr;
1156 
1157   PetscFunctionBegin;
1158   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1159   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1160   ierr = PetscArraycpy(x,b,4*mbs);CHKERRQ(ierr);
1161   ierr = MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1162   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1163   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1164   ierr = PetscLogFlops(2.0*a->bs2*(a->nz-mbs));CHKERRQ(ierr);
1165   PetscFunctionReturn(0);
1166 }
1167 
MatSolve_SeqSBAIJ_3_inplace(Mat A,Vec bb,Vec xx)1168 PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1169 {
1170   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1171   IS                isrow=a->row;
1172   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j;
1173   PetscErrorCode    ierr;
1174   const PetscInt    *r;
1175   PetscInt          nz,k,idx;
1176   const PetscInt    *vj;
1177   const MatScalar   *aa=a->a,*v,*diag;
1178   PetscScalar       *x,x0,x1,x2,*t,*tp;
1179   const PetscScalar *b;
1180 
1181   PetscFunctionBegin;
1182   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1183   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1184   t    = a->solve_work;
1185   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1186 
1187   /* solve U^T * D * y = b by forward substitution */
1188   tp = t;
1189   for (k=0; k<mbs; k++) { /* t <- perm(b) */
1190     idx   = 3*r[k];
1191     tp[0] = b[idx];
1192     tp[1] = b[idx+1];
1193     tp[2] = b[idx+2];
1194     tp   += 3;
1195   }
1196 
1197   for (k=0; k<mbs; k++) {
1198     v  = aa + 9*ai[k];
1199     vj = aj + ai[k];
1200     tp = t + k*3;
1201     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];
1202     nz = ai[k+1] - ai[k];
1203 
1204     tp = t + (*vj)*3;
1205     while (nz--) {
1206       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1207       tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1208       tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1209       vj++;
1210       tp = t + (*vj)*3;
1211       v += 9;
1212     }
1213 
1214     /* xk = inv(Dk)*(Dk*xk) */
1215     diag  = aa+k*9;          /* ptr to inv(Dk) */
1216     tp    = t + k*3;
1217     tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1218     tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1219     tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1220   }
1221 
1222   /* solve U*x = y by back substitution */
1223   for (k=mbs-1; k>=0; k--) {
1224     v  = aa + 9*ai[k];
1225     vj = aj + ai[k];
1226     tp = t + k*3;
1227     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];  /* xk */
1228     nz = ai[k+1] - ai[k];
1229 
1230     tp = t + (*vj)*3;
1231     while (nz--) {
1232       /* xk += U(k,:)*x(:) */
1233       x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2];
1234       x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2];
1235       x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2];
1236       vj++;
1237       tp = t + (*vj)*3;
1238       v += 9;
1239     }
1240     tp       = t + k*3;
1241     tp[0]    = x0; tp[1] = x1; tp[2] = x2;
1242     idx      = 3*r[k];
1243     x[idx]   = x0;
1244     x[idx+1] = x1;
1245     x[idx+2] = x2;
1246   }
1247 
1248   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1249   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1250   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1251   ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr);
1252   PetscFunctionReturn(0);
1253 }
1254 
MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt * ai,const PetscInt * aj,const MatScalar * aa,PetscInt mbs,PetscScalar * x)1255 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1256 {
1257   const MatScalar *v,*diag;
1258   PetscScalar     *xp,x0,x1,x2;
1259   PetscInt        nz,k;
1260   const PetscInt  *vj;
1261 
1262   PetscFunctionBegin;
1263   for (k=0; k<mbs; k++) {
1264     v  = aa + 9*ai[k];
1265     xp = x + k*3;
1266     x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */
1267     nz = ai[k+1] - ai[k];
1268     vj = aj + ai[k];
1269     xp = x + (*vj)*3;
1270     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1271     PetscPrefetchBlock(v+9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1272     while (nz--) {
1273       /* x(:) += U(k,:)^T*(Dk*xk) */
1274       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1275       xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1276       xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1277       vj++;
1278       xp = x + (*vj)*3;
1279       v += 9;
1280     }
1281     /* xk = inv(Dk)*(Dk*xk) */
1282     diag  = aa+k*9;         /* ptr to inv(Dk) */
1283     xp    = x + k*3;
1284     xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1285     xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1286     xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1287   }
1288   PetscFunctionReturn(0);
1289 }
1290 
MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt * ai,const PetscInt * aj,const MatScalar * aa,PetscInt mbs,PetscScalar * x)1291 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1292 {
1293   const MatScalar *v;
1294   PetscScalar     *xp,x0,x1,x2;
1295   PetscInt        nz,k;
1296   const PetscInt  *vj;
1297 
1298   PetscFunctionBegin;
1299   for (k=mbs-1; k>=0; k--) {
1300     v  = aa + 9*ai[k];
1301     xp = x + k*3;
1302     x0 = xp[0]; x1 = xp[1]; x2 = xp[2];  /* xk */
1303     nz = ai[k+1] - ai[k];
1304     vj = aj + ai[k];
1305     xp = x + (*vj)*3;
1306     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1307     PetscPrefetchBlock(v-9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1308     while (nz--) {
1309       /* xk += U(k,:)*x(:) */
1310       x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2];
1311       x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2];
1312       x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2];
1313       vj++;
1314       v += 9; xp = x + (*vj)*3;
1315     }
1316     xp    = x + k*3;
1317     xp[0] = x0; xp[1] = x1; xp[2] = x2;
1318   }
1319   PetscFunctionReturn(0);
1320 }
1321 
MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)1322 PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1323 {
1324   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1325   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1326   const MatScalar   *aa=a->a;
1327   PetscScalar       *x;
1328   const PetscScalar *b;
1329   PetscErrorCode    ierr;
1330 
1331   PetscFunctionBegin;
1332   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1333   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1334 
1335   /* solve U^T * D * y = b by forward substitution */
1336   ierr = PetscArraycpy(x,b,3*mbs);CHKERRQ(ierr);
1337   ierr = MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1338 
1339   /* solve U*x = y by back substitution */
1340   ierr = MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1341 
1342   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1343   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1344   ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr);
1345   PetscFunctionReturn(0);
1346 }
1347 
MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)1348 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1349 {
1350   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1351   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1352   const MatScalar   *aa=a->a;
1353   PetscScalar       *x;
1354   const PetscScalar *b;
1355   PetscErrorCode    ierr;
1356 
1357   PetscFunctionBegin;
1358   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1359   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1360   ierr = PetscArraycpy(x,b,3*mbs);CHKERRQ(ierr);
1361   ierr = MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1362   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1363   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1364   ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr);
1365   PetscFunctionReturn(0);
1366 }
1367 
MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)1368 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1369 {
1370   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1371   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1372   const MatScalar   *aa=a->a;
1373   PetscScalar       *x;
1374   const PetscScalar *b;
1375   PetscErrorCode    ierr;
1376 
1377   PetscFunctionBegin;
1378   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1379   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1380   ierr = PetscArraycpy(x,b,3*mbs);CHKERRQ(ierr);
1381   ierr = MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1382   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1383   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1384   ierr = PetscLogFlops(2.0*a->bs2*(a->nz-mbs));CHKERRQ(ierr);
1385   PetscFunctionReturn(0);
1386 }
1387 
MatSolve_SeqSBAIJ_2_inplace(Mat A,Vec bb,Vec xx)1388 PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1389 {
1390   Mat_SeqSBAIJ      *a   =(Mat_SeqSBAIJ*)A->data;
1391   IS                isrow=a->row;
1392   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j;
1393   PetscErrorCode    ierr;
1394   const PetscInt    *r,*vj;
1395   PetscInt          nz,k,k2,idx;
1396   const MatScalar   *aa=a->a,*v,*diag;
1397   PetscScalar       *x,x0,x1,*t;
1398   const PetscScalar *b;
1399 
1400   PetscFunctionBegin;
1401   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1402   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1403   t    = a->solve_work;
1404   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1405 
1406   /* solve U^T * D * y = perm(b) by forward substitution */
1407   for (k=0; k<mbs; k++) {  /* t <- perm(b) */
1408     idx      = 2*r[k];
1409     t[k*2]   = b[idx];
1410     t[k*2+1] = b[idx+1];
1411   }
1412   for (k=0; k<mbs; k++) {
1413     v  = aa + 4*ai[k];
1414     vj = aj + ai[k];
1415     k2 = k*2;
1416     x0 = t[k2]; x1 = t[k2+1];
1417     nz = ai[k+1] - ai[k];
1418     while (nz--) {
1419       t[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1420       t[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1421       vj++; v      += 4;
1422     }
1423     diag    = aa+k*4; /* ptr to inv(Dk) */
1424     t[k2]   = diag[0]*x0 + diag[2]*x1;
1425     t[k2+1] = diag[1]*x0 + diag[3]*x1;
1426   }
1427 
1428   /* solve U*x = y by back substitution */
1429   for (k=mbs-1; k>=0; k--) {
1430     v  = aa + 4*ai[k];
1431     vj = aj + ai[k];
1432     k2 = k*2;
1433     x0 = t[k2]; x1 = t[k2+1];
1434     nz = ai[k+1] - ai[k];
1435     while (nz--) {
1436       x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1];
1437       x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1];
1438       vj++;
1439       v += 4;
1440     }
1441     t[k2]    = x0;
1442     t[k2+1]  = x1;
1443     idx      = 2*r[k];
1444     x[idx]   = x0;
1445     x[idx+1] = x1;
1446   }
1447 
1448   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1449   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1450   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1451   ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr);
1452   PetscFunctionReturn(0);
1453 }
1454 
MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt * ai,const PetscInt * aj,const MatScalar * aa,PetscInt mbs,PetscScalar * x)1455 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1456 {
1457   const MatScalar *v,*diag;
1458   PetscScalar     x0,x1;
1459   PetscInt        nz,k,k2;
1460   const PetscInt  *vj;
1461 
1462   PetscFunctionBegin;
1463   for (k=0; k<mbs; k++) {
1464     v  = aa + 4*ai[k];
1465     vj = aj + ai[k];
1466     k2 = k*2;
1467     x0 = x[k2]; x1 = x[k2+1];  /* Dk*xk = k-th block of x */
1468     nz = ai[k+1] - ai[k];
1469     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1470     PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1471     while (nz--) {
1472       /* x(:) += U(k,:)^T*(Dk*xk) */
1473       x[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1474       x[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1475       vj++; v      += 4;
1476     }
1477     /* xk = inv(Dk)*(Dk*xk) */
1478     diag    = aa+k*4;       /* ptr to inv(Dk) */
1479     x[k2]   = diag[0]*x0 + diag[2]*x1;
1480     x[k2+1] = diag[1]*x0 + diag[3]*x1;
1481   }
1482   PetscFunctionReturn(0);
1483 }
1484 
MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt * ai,const PetscInt * aj,const MatScalar * aa,PetscInt mbs,PetscScalar * x)1485 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1486 {
1487   const MatScalar *v;
1488   PetscScalar     x0,x1;
1489   PetscInt        nz,k,k2;
1490   const PetscInt  *vj;
1491 
1492   PetscFunctionBegin;
1493   for (k=mbs-1; k>=0; k--) {
1494     v  = aa + 4*ai[k];
1495     vj = aj + ai[k];
1496     k2 = k*2;
1497     x0 = x[k2]; x1 = x[k2+1];  /* xk */
1498     nz = ai[k+1] - ai[k];
1499     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1500     PetscPrefetchBlock(v-4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1501     while (nz--) {
1502       /* xk += U(k,:)*x(:) */
1503       x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1];
1504       x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1];
1505       vj++;
1506       v += 4;
1507     }
1508     x[k2]   = x0;
1509     x[k2+1] = x1;
1510   }
1511   PetscFunctionReturn(0);
1512 }
1513 
MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)1514 PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1515 {
1516   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1517   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1518   const MatScalar   *aa=a->a;
1519   PetscScalar       *x;
1520   const PetscScalar *b;
1521   PetscErrorCode    ierr;
1522 
1523   PetscFunctionBegin;
1524   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1525   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1526 
1527   /* solve U^T * D * y = b by forward substitution */
1528   ierr = PetscArraycpy(x,b,2*mbs);CHKERRQ(ierr);
1529   ierr = MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1530 
1531   /* solve U*x = y by back substitution */
1532   ierr = MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1533 
1534   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1535   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1536   ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr);
1537   PetscFunctionReturn(0);
1538 }
1539 
MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)1540 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1541 {
1542   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1543   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1544   const MatScalar   *aa=a->a;
1545   PetscScalar       *x;
1546   const PetscScalar *b;
1547   PetscErrorCode    ierr;
1548 
1549   PetscFunctionBegin;
1550   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1551   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1552   ierr = PetscArraycpy(x,b,2*mbs);CHKERRQ(ierr);
1553   ierr = MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1554   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1555   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1556   ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr);
1557   PetscFunctionReturn(0);
1558 }
1559 
MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)1560 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1561 {
1562   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1563   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1564   const MatScalar   *aa=a->a;
1565   PetscScalar       *x;
1566   const PetscScalar *b;
1567   PetscErrorCode    ierr;
1568 
1569   PetscFunctionBegin;
1570   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1571   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1572   ierr = PetscArraycpy(x,b,2*mbs);CHKERRQ(ierr);
1573   ierr = MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1574   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1575   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1576   ierr = PetscLogFlops(2.0*a->bs2*(a->nz - mbs));CHKERRQ(ierr);
1577   PetscFunctionReturn(0);
1578 }
1579 
MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)1580 PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1581 {
1582   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1583   IS                isrow=a->row;
1584   PetscErrorCode    ierr;
1585   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag;
1586   const MatScalar   *aa=a->a,*v;
1587   const PetscScalar *b;
1588   PetscScalar       *x,xk,*t;
1589   PetscInt          nz,k,j;
1590 
1591   PetscFunctionBegin;
1592   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1593   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1594   t    = a->solve_work;
1595   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1596 
1597   /* solve U^T*D*y = perm(b) by forward substitution */
1598   for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1599   for (k=0; k<mbs; k++) {
1600     v  = aa + ai[k];
1601     vj = aj + ai[k];
1602     xk = t[k];
1603     nz = ai[k+1] - ai[k] - 1;
1604     for (j=0; j<nz; j++) t[vj[j]] += v[j]*xk;
1605     t[k] = xk*v[nz];   /* v[nz] = 1/D(k) */
1606   }
1607 
1608   /* solve U*perm(x) = y by back substitution */
1609   for (k=mbs-1; k>=0; k--) {
1610     v  = aa + adiag[k] - 1;
1611     vj = aj + adiag[k] - 1;
1612     nz = ai[k+1] - ai[k] - 1;
1613     for (j=0; j<nz; j++) t[k] += v[-j]*t[vj[-j]];
1614     x[rp[k]] = t[k];
1615   }
1616 
1617   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1618   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1619   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1620   ierr = PetscLogFlops(4.0*a->nz - 3.0*mbs);CHKERRQ(ierr);
1621   PetscFunctionReturn(0);
1622 }
1623 
MatSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)1624 PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1625 {
1626   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1627   IS                isrow=a->row;
1628   PetscErrorCode    ierr;
1629   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1630   const MatScalar   *aa=a->a,*v;
1631   PetscScalar       *x,xk,*t;
1632   const PetscScalar *b;
1633   PetscInt          nz,k;
1634 
1635   PetscFunctionBegin;
1636   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1637   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1638   t    = a->solve_work;
1639   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1640 
1641   /* solve U^T*D*y = perm(b) by forward substitution */
1642   for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1643   for (k=0; k<mbs; k++) {
1644     v  = aa + ai[k] + 1;
1645     vj = aj + ai[k] + 1;
1646     xk = t[k];
1647     nz = ai[k+1] - ai[k] - 1;
1648     while (nz--) t[*vj++] += (*v++) * xk;
1649     t[k] = xk*aa[ai[k]];  /* aa[k] = 1/D(k) */
1650   }
1651 
1652   /* solve U*perm(x) = y by back substitution */
1653   for (k=mbs-1; k>=0; k--) {
1654     v  = aa + ai[k] + 1;
1655     vj = aj + ai[k] + 1;
1656     nz = ai[k+1] - ai[k] - 1;
1657     while (nz--) t[k] += (*v++) * t[*vj++];
1658     x[rp[k]] = t[k];
1659   }
1660 
1661   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1662   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1663   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1664   ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr);
1665   PetscFunctionReturn(0);
1666 }
1667 
MatForwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)1668 PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1669 {
1670   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1671   IS                isrow=a->row;
1672   PetscErrorCode    ierr;
1673   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag;
1674   const MatScalar   *aa=a->a,*v;
1675   PetscReal         diagk;
1676   PetscScalar       *x,xk;
1677   const PetscScalar *b;
1678   PetscInt          nz,k;
1679 
1680   PetscFunctionBegin;
1681   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1682   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1683   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1684   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1685 
1686   for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1687   for (k=0; k<mbs; k++) {
1688     v  = aa + ai[k];
1689     vj = aj + ai[k];
1690     xk = x[k];
1691     nz = ai[k+1] - ai[k] - 1;
1692     while (nz--) x[*vj++] += (*v++) * xk;
1693 
1694     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
1695     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1696     x[k] = xk*PetscSqrtReal(diagk);
1697   }
1698   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1699   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1700   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1701   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
1702   PetscFunctionReturn(0);
1703 }
1704 
MatForwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)1705 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1706 {
1707   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1708   IS                isrow=a->row;
1709   PetscErrorCode    ierr;
1710   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1711   const MatScalar   *aa=a->a,*v;
1712   PetscReal         diagk;
1713   PetscScalar       *x,xk;
1714   const PetscScalar *b;
1715   PetscInt          nz,k;
1716 
1717   PetscFunctionBegin;
1718   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1719   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1720   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1721   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1722 
1723   for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1724   for (k=0; k<mbs; k++) {
1725     v  = aa + ai[k] + 1;
1726     vj = aj + ai[k] + 1;
1727     xk = x[k];
1728     nz = ai[k+1] - ai[k] - 1;
1729     while (nz--) x[*vj++] += (*v++) * xk;
1730 
1731     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
1732     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1733     x[k] = xk*PetscSqrtReal(diagk);
1734   }
1735   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1736   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1737   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1738   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
1739   PetscFunctionReturn(0);
1740 }
1741 
MatBackwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)1742 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1743 {
1744   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1745   IS                isrow=a->row;
1746   PetscErrorCode    ierr;
1747   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag;
1748   const MatScalar   *aa=a->a,*v;
1749   PetscReal         diagk;
1750   PetscScalar       *x,*t;
1751   const PetscScalar *b;
1752   PetscInt          nz,k;
1753 
1754   PetscFunctionBegin;
1755   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1756   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1757   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1758   t    = a->solve_work;
1759   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1760 
1761   for (k=mbs-1; k>=0; k--) {
1762     v     = aa + ai[k];
1763     vj    = aj + ai[k];
1764     diagk = PetscRealPart(aa[adiag[k]]);
1765     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1766     t[k] = b[k] * PetscSqrtReal(diagk);
1767     nz   = ai[k+1] - ai[k] - 1;
1768     while (nz--) t[k] += (*v++) * t[*vj++];
1769     x[rp[k]] = t[k];
1770   }
1771   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1772   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1773   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1774   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
1775   PetscFunctionReturn(0);
1776 }
1777 
MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)1778 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1779 {
1780   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1781   IS                isrow=a->row;
1782   PetscErrorCode    ierr;
1783   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1784   const MatScalar   *aa=a->a,*v;
1785   PetscReal         diagk;
1786   PetscScalar       *x,*t;
1787   const PetscScalar *b;
1788   PetscInt          nz,k;
1789 
1790   PetscFunctionBegin;
1791   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1792   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1793   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1794   t    = a->solve_work;
1795   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1796 
1797   for (k=mbs-1; k>=0; k--) {
1798     v     = aa + ai[k] + 1;
1799     vj    = aj + ai[k] + 1;
1800     diagk = PetscRealPart(aa[ai[k]]);
1801     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1802     t[k] = b[k] * PetscSqrtReal(diagk);
1803     nz   = ai[k+1] - ai[k] - 1;
1804     while (nz--) t[k] += (*v++) * t[*vj++];
1805     x[rp[k]] = t[k];
1806   }
1807   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1808   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1809   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1810   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
1811   PetscFunctionReturn(0);
1812 }
1813 
MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx)1814 PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx)
1815 {
1816   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1817   PetscErrorCode ierr;
1818 
1819   PetscFunctionBegin;
1820   if (A->rmap->bs == 1) {
1821     ierr = MatSolve_SeqSBAIJ_1(A,bb->v,xx->v);CHKERRQ(ierr);
1822   } else {
1823     IS                isrow=a->row;
1824     const PetscInt    *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1825     const MatScalar   *aa=a->a,*v;
1826     PetscScalar       *x,*t;
1827     const PetscScalar *b;
1828     PetscInt          nz,k,n,i,j;
1829 
1830     if (bb->n > a->solves_work_n) {
1831       ierr = PetscFree(a->solves_work);CHKERRQ(ierr);
1832       ierr = PetscMalloc1(bb->n*A->rmap->N,&a->solves_work);CHKERRQ(ierr);
1833       a->solves_work_n = bb->n;
1834     }
1835     n    = bb->n;
1836     ierr = VecGetArrayRead(bb->v,&b);CHKERRQ(ierr);
1837     ierr = VecGetArray(xx->v,&x);CHKERRQ(ierr);
1838     t    = a->solves_work;
1839 
1840     ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1841 
1842     /* solve U^T*D*y = perm(b) by forward substitution */
1843     for (k=0; k<mbs; k++) {
1844       for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs]; /* values are stored interlaced in t */
1845     }
1846     for (k=0; k<mbs; k++) {
1847       v  = aa + ai[k];
1848       vj = aj + ai[k];
1849       nz = ai[k+1] - ai[k] - 1;
1850       for (j=0; j<nz; j++) {
1851         for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1852         v++;vj++;
1853       }
1854       for (i=0; i<n; i++) t[n*k+i] *= aa[nz];  /* note: aa[nz] = 1/D(k) */
1855     }
1856 
1857     /* solve U*perm(x) = y by back substitution */
1858     for (k=mbs-1; k>=0; k--) {
1859       v  = aa + ai[k] - 1;
1860       vj = aj + ai[k] - 1;
1861       nz = ai[k+1] - ai[k] - 1;
1862       for (j=0; j<nz; j++) {
1863         for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1864         v++;vj++;
1865       }
1866       for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1867     }
1868 
1869     ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1870     ierr = VecRestoreArrayRead(bb->v,&b);CHKERRQ(ierr);
1871     ierr = VecRestoreArray(xx->v,&x);CHKERRQ(ierr);
1872     ierr = PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));CHKERRQ(ierr);
1873   }
1874   PetscFunctionReturn(0);
1875 }
1876 
MatSolves_SeqSBAIJ_1_inplace(Mat A,Vecs bb,Vecs xx)1877 PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A,Vecs bb,Vecs xx)
1878 {
1879   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1880   PetscErrorCode ierr;
1881 
1882   PetscFunctionBegin;
1883   if (A->rmap->bs == 1) {
1884     ierr = MatSolve_SeqSBAIJ_1_inplace(A,bb->v,xx->v);CHKERRQ(ierr);
1885   } else {
1886     IS                isrow=a->row;
1887     const PetscInt    *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1888     const MatScalar   *aa=a->a,*v;
1889     PetscScalar       *x,*t;
1890     const PetscScalar *b;
1891     PetscInt          nz,k,n,i;
1892 
1893     if (bb->n > a->solves_work_n) {
1894       ierr = PetscFree(a->solves_work);CHKERRQ(ierr);
1895       ierr = PetscMalloc1(bb->n*A->rmap->N,&a->solves_work);CHKERRQ(ierr);
1896       a->solves_work_n = bb->n;
1897     }
1898     n    = bb->n;
1899     ierr = VecGetArrayRead(bb->v,&b);CHKERRQ(ierr);
1900     ierr = VecGetArray(xx->v,&x);CHKERRQ(ierr);
1901     t    = a->solves_work;
1902 
1903     ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1904 
1905     /* solve U^T*D*y = perm(b) by forward substitution */
1906     for (k=0; k<mbs; k++) {
1907       for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs];  /* values are stored interlaced in t */
1908     }
1909     for (k=0; k<mbs; k++) {
1910       v  = aa + ai[k];
1911       vj = aj + ai[k];
1912       nz = ai[k+1] - ai[k];
1913       while (nz--) {
1914         for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1915         v++;vj++;
1916       }
1917       for (i=0; i<n; i++) t[n*k+i] *= aa[k];  /* note: aa[k] = 1/D(k) */
1918     }
1919 
1920     /* solve U*perm(x) = y by back substitution */
1921     for (k=mbs-1; k>=0; k--) {
1922       v  = aa + ai[k];
1923       vj = aj + ai[k];
1924       nz = ai[k+1] - ai[k];
1925       while (nz--) {
1926         for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1927         v++;vj++;
1928       }
1929       for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1930     }
1931 
1932     ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1933     ierr = VecRestoreArrayRead(bb->v,&b);CHKERRQ(ierr);
1934     ierr = VecRestoreArray(xx->v,&x);CHKERRQ(ierr);
1935     ierr = PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));CHKERRQ(ierr);
1936   }
1937   PetscFunctionReturn(0);
1938 }
1939 
MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)1940 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1941 {
1942   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1943   PetscErrorCode    ierr;
1944   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag=a->diag;
1945   const MatScalar   *aa=a->a,*v;
1946   const PetscScalar *b;
1947   PetscScalar       *x,xi;
1948   PetscInt          nz,i,j;
1949 
1950   PetscFunctionBegin;
1951   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1952   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1953   /* solve U^T*D*y = b by forward substitution */
1954   ierr = PetscArraycpy(x,b,mbs);CHKERRQ(ierr);
1955   for (i=0; i<mbs; i++) {
1956     v  = aa + ai[i];
1957     vj = aj + ai[i];
1958     xi = x[i];
1959     nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
1960     for (j=0; j<nz; j++) x[vj[j]] += v[j]*xi;
1961     x[i] = xi*v[nz];  /* v[nz] = aa[diag[i]] = 1/D(i) */
1962   }
1963   /* solve U*x = y by backward substitution */
1964   for (i=mbs-2; i>=0; i--) {
1965     xi = x[i];
1966     v  = aa + adiag[i] - 1; /* end of row i, excluding diag */
1967     vj = aj + adiag[i] - 1;
1968     nz = ai[i+1] - ai[i] - 1;
1969     for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]];
1970     x[i] = xi;
1971   }
1972   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1973   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1974   ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr);
1975   PetscFunctionReturn(0);
1976 }
1977 
MatMatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Mat B,Mat X)1978 PetscErrorCode MatMatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Mat B,Mat X)
1979 {
1980   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1981   PetscErrorCode    ierr;
1982   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag=a->diag;
1983   const MatScalar   *aa=a->a,*v;
1984   const PetscScalar *b;
1985   PetscScalar       *x,xi;
1986   PetscInt          nz,i,j,neq,ldb,ldx;
1987   PetscBool         isdense;
1988 
1989   PetscFunctionBegin;
1990   if (!mbs) PetscFunctionReturn(0);
1991   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
1992   if (!isdense) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"B matrix must be a SeqDense matrix");
1993   if (X != B) {
1994     ierr = PetscObjectTypeCompare((PetscObject)X,MATSEQDENSE,&isdense);CHKERRQ(ierr);
1995     if (!isdense) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"X matrix must be a SeqDense matrix");
1996   }
1997   ierr = MatDenseGetArrayRead(B,&b);CHKERRQ(ierr);
1998   ierr = MatDenseGetLDA(B,&ldb);CHKERRQ(ierr);
1999   ierr = MatDenseGetArray(X,&x);CHKERRQ(ierr);
2000   ierr = MatDenseGetLDA(X,&ldx);CHKERRQ(ierr);
2001   for (neq=0; neq<B->cmap->n; neq++) {
2002     /* solve U^T*D*y = b by forward substitution */
2003     ierr = PetscArraycpy(x,b,mbs);CHKERRQ(ierr);
2004     for (i=0; i<mbs; i++) {
2005       v  = aa + ai[i];
2006       vj = aj + ai[i];
2007       xi = x[i];
2008       nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
2009       for (j=0; j<nz; j++) x[vj[j]] += v[j]*xi;
2010       x[i] = xi*v[nz];  /* v[nz] = aa[diag[i]] = 1/D(i) */
2011     }
2012     /* solve U*x = y by backward substitution */
2013     for (i=mbs-2; i>=0; i--) {
2014       xi = x[i];
2015       v  = aa + adiag[i] - 1; /* end of row i, excluding diag */
2016       vj = aj + adiag[i] - 1;
2017       nz = ai[i+1] - ai[i] - 1;
2018       for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]];
2019       x[i] = xi;
2020     }
2021     b += ldb;
2022     x += ldx;
2023   }
2024   ierr = MatDenseRestoreArrayRead(B,&b);CHKERRQ(ierr);
2025   ierr = MatDenseRestoreArray(X,&x);CHKERRQ(ierr);
2026   ierr = PetscLogFlops(B->cmap->n*(4.0*a->nz - 3*mbs));CHKERRQ(ierr);
2027   PetscFunctionReturn(0);
2028 }
2029 
MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)2030 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2031 {
2032   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2033   PetscErrorCode    ierr;
2034   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
2035   const MatScalar   *aa=a->a,*v;
2036   PetscScalar       *x,xk;
2037   const PetscScalar *b;
2038   PetscInt          nz,k;
2039 
2040   PetscFunctionBegin;
2041   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2042   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2043 
2044   /* solve U^T*D*y = b by forward substitution */
2045   ierr = PetscArraycpy(x,b,mbs);CHKERRQ(ierr);
2046   for (k=0; k<mbs; k++) {
2047     v  = aa + ai[k] + 1;
2048     vj = aj + ai[k] + 1;
2049     xk = x[k];
2050     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2051     while (nz--) x[*vj++] += (*v++) * xk;
2052     x[k] = xk*aa[ai[k]];  /* note: aa[diag[k]] = 1/D(k) */
2053   }
2054 
2055   /* solve U*x = y by back substitution */
2056   for (k=mbs-2; k>=0; k--) {
2057     v  = aa + ai[k] + 1;
2058     vj = aj + ai[k] + 1;
2059     xk = x[k];
2060     nz = ai[k+1] - ai[k] - 1;
2061     while (nz--) {
2062       xk += (*v++) * x[*vj++];
2063     }
2064     x[k] = xk;
2065   }
2066 
2067   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2068   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2069   ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr);
2070   PetscFunctionReturn(0);
2071 }
2072 
MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)2073 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2074 {
2075   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2076   PetscErrorCode    ierr;
2077   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vj;
2078   const MatScalar   *aa=a->a,*v;
2079   PetscReal         diagk;
2080   PetscScalar       *x;
2081   const PetscScalar *b;
2082   PetscInt          nz,k;
2083 
2084   PetscFunctionBegin;
2085   /* solve U^T*D^(1/2)*x = b by forward substitution */
2086   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2087   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2088   ierr = PetscArraycpy(x,b,mbs);CHKERRQ(ierr);
2089   for (k=0; k<mbs; k++) {
2090     v  = aa + ai[k];
2091     vj = aj + ai[k];
2092     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2093     while (nz--) x[*vj++] += (*v++) * x[k];
2094     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */
2095     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[adiag[k]]),PetscImaginaryPart(aa[adiag[k]]));
2096     x[k] *= PetscSqrtReal(diagk);
2097   }
2098   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2099   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2100   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
2101   PetscFunctionReturn(0);
2102 }
2103 
MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)2104 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2105 {
2106   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2107   PetscErrorCode    ierr;
2108   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
2109   const MatScalar   *aa=a->a,*v;
2110   PetscReal         diagk;
2111   PetscScalar       *x;
2112   const PetscScalar *b;
2113   PetscInt          nz,k;
2114 
2115   PetscFunctionBegin;
2116   /* solve U^T*D^(1/2)*x = b by forward substitution */
2117   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2118   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2119   ierr = PetscArraycpy(x,b,mbs);CHKERRQ(ierr);
2120   for (k=0; k<mbs; k++) {
2121     v  = aa + ai[k] + 1;
2122     vj = aj + ai[k] + 1;
2123     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2124     while (nz--) x[*vj++] += (*v++) * x[k];
2125     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2126     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[ai[k]]),PetscImaginaryPart(aa[ai[k]]));
2127     x[k] *= PetscSqrtReal(diagk);
2128   }
2129   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2130   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2131   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
2132   PetscFunctionReturn(0);
2133 }
2134 
MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)2135 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2136 {
2137   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2138   PetscErrorCode    ierr;
2139   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vj;
2140   const MatScalar   *aa=a->a,*v;
2141   PetscReal         diagk;
2142   PetscScalar       *x;
2143   const PetscScalar *b;
2144   PetscInt          nz,k;
2145 
2146   PetscFunctionBegin;
2147   /* solve D^(1/2)*U*x = b by back substitution */
2148   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2149   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2150 
2151   for (k=mbs-1; k>=0; k--) {
2152     v     = aa + ai[k];
2153     vj    = aj + ai[k];
2154     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
2155     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2156     x[k] = PetscSqrtReal(diagk)*b[k];
2157     nz   = ai[k+1] - ai[k] - 1;
2158     while (nz--) x[k] += (*v++) * x[*vj++];
2159   }
2160   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2161   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2162   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
2163   PetscFunctionReturn(0);
2164 }
2165 
MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)2166 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2167 {
2168   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2169   PetscErrorCode    ierr;
2170   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
2171   const MatScalar   *aa=a->a,*v;
2172   PetscReal         diagk;
2173   PetscScalar       *x;
2174   const PetscScalar *b;
2175   PetscInt          nz,k;
2176 
2177   PetscFunctionBegin;
2178   /* solve D^(1/2)*U*x = b by back substitution */
2179   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2180   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2181 
2182   for (k=mbs-1; k>=0; k--) {
2183     v     = aa + ai[k] + 1;
2184     vj    = aj + ai[k] + 1;
2185     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2186     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2187     x[k] = PetscSqrtReal(diagk)*b[k];
2188     nz   = ai[k+1] - ai[k] - 1;
2189     while (nz--) x[k] += (*v++) * x[*vj++];
2190   }
2191   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2192   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2193   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
2194   PetscFunctionReturn(0);
2195 }
2196 
2197 /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B,Mat A,IS perm,const MatFactorInfo * info)2198 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B,Mat A,IS perm,const MatFactorInfo *info)
2199 {
2200   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b;
2201   PetscErrorCode ierr;
2202   const PetscInt *rip,mbs = a->mbs,*ai,*aj;
2203   PetscInt       *jutmp,bs = A->rmap->bs,i;
2204   PetscInt       m,reallocs = 0,*levtmp;
2205   PetscInt       *prowl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
2206   PetscInt       incrlev,*lev,shift,prow,nz;
2207   PetscReal      f = info->fill,levels = info->levels;
2208   PetscBool      perm_identity;
2209 
2210   PetscFunctionBegin;
2211   /* check whether perm is the identity mapping */
2212   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
2213 
2214   if (perm_identity) {
2215     a->permute = PETSC_FALSE;
2216     ai         = a->i; aj = a->j;
2217   } else { /*  non-trivial permutation */
2218     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2219     a->permute = PETSC_TRUE;
2220     ierr       = MatReorderingSeqSBAIJ(A, perm);CHKERRQ(ierr);
2221     ai         = a->inew; aj = a->jnew;
2222   }
2223 
2224   /* initialization */
2225   ierr  = ISGetIndices(perm,&rip);CHKERRQ(ierr);
2226   umax  = (PetscInt)(f*ai[mbs] + 1);
2227   ierr  = PetscMalloc1(umax,&lev);CHKERRQ(ierr);
2228   umax += mbs + 1;
2229   shift = mbs + 1;
2230   ierr  = PetscMalloc1(mbs+1,&iu);CHKERRQ(ierr);
2231   ierr  = PetscMalloc1(umax,&ju);CHKERRQ(ierr);
2232   iu[0] = mbs + 1;
2233   juidx = mbs + 1;
2234   /* prowl: linked list for pivot row */
2235   ierr = PetscMalloc3(mbs,&prowl,mbs,&q,mbs,&levtmp);CHKERRQ(ierr);
2236   /* q: linked list for col index */
2237 
2238   for (i=0; i<mbs; i++) {
2239     prowl[i] = mbs;
2240     q[i]     = 0;
2241   }
2242 
2243   /* for each row k */
2244   for (k=0; k<mbs; k++) {
2245     nzk  = 0;
2246     q[k] = mbs;
2247     /* copy current row into linked list */
2248     nz = ai[rip[k]+1] - ai[rip[k]];
2249     j  = ai[rip[k]];
2250     while (nz--) {
2251       vj = rip[aj[j++]];
2252       if (vj > k) {
2253         qm = k;
2254         do {
2255           m = qm; qm = q[m];
2256         } while (qm < vj);
2257         if (qm == vj) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Duplicate entry in A\n");
2258         nzk++;
2259         q[m]       = vj;
2260         q[vj]      = qm;
2261         levtmp[vj] = 0;   /* initialize lev for nonzero element */
2262       }
2263     }
2264 
2265     /* modify nonzero structure of k-th row by computing fill-in
2266        for each row prow to be merged in */
2267     prow = k;
2268     prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */
2269 
2270     while (prow < k) {
2271       /* merge row prow into k-th row */
2272       jmin = iu[prow] + 1;
2273       jmax = iu[prow+1];
2274       qm   = k;
2275       for (j=jmin; j<jmax; j++) {
2276         incrlev = lev[j-shift] + 1;
2277         if (incrlev > levels) continue;
2278         vj = ju[j];
2279         do {
2280           m = qm; qm = q[m];
2281         } while (qm < vj);
2282         if (qm != vj) {      /* a fill */
2283           nzk++; q[m] = vj; q[vj] = qm; qm = vj;
2284           levtmp[vj] = incrlev;
2285         } else if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
2286       }
2287       prow = prowl[prow]; /* next pivot row */
2288     }
2289 
2290     /* add k to row list for first nonzero element in k-th row */
2291     if (nzk > 1) {
2292       i        = q[k]; /* col value of first nonzero element in k_th row of U */
2293       prowl[k] = prowl[i]; prowl[i] = k;
2294     }
2295     iu[k+1] = iu[k] + nzk;
2296 
2297     /* allocate more space to ju and lev if needed */
2298     if (iu[k+1] > umax) {
2299       /* estimate how much additional space we will need */
2300       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2301       /* just double the memory each time */
2302       maxadd = umax;
2303       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
2304       umax += maxadd;
2305 
2306       /* allocate a longer ju */
2307       ierr = PetscMalloc1(umax,&jutmp);CHKERRQ(ierr);
2308       ierr = PetscArraycpy(jutmp,ju,iu[k]);CHKERRQ(ierr);
2309       ierr = PetscFree(ju);CHKERRQ(ierr);
2310       ju   = jutmp;
2311 
2312       ierr      = PetscMalloc1(umax,&jutmp);CHKERRQ(ierr);
2313       ierr      = PetscArraycpy(jutmp,lev,iu[k]-shift);CHKERRQ(ierr);
2314       ierr      = PetscFree(lev);CHKERRQ(ierr);
2315       lev       = jutmp;
2316       reallocs += 2; /* count how many times we realloc */
2317     }
2318 
2319     /* save nonzero structure of k-th row in ju */
2320     i=k;
2321     while (nzk--) {
2322       i                = q[i];
2323       ju[juidx]        = i;
2324       lev[juidx-shift] = levtmp[i];
2325       juidx++;
2326     }
2327   }
2328 
2329 #if defined(PETSC_USE_INFO)
2330   if (ai[mbs] != 0) {
2331     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2332     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)f,(double)af);CHKERRQ(ierr);
2333     ierr = PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);CHKERRQ(ierr);
2334     ierr = PetscInfo1(A,"PCFactorSetFill(pc,%g);\n",(double)af);CHKERRQ(ierr);
2335     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
2336   } else {
2337     ierr = PetscInfo(A,"Empty matrix\n");CHKERRQ(ierr);
2338   }
2339 #endif
2340 
2341   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
2342   ierr = PetscFree3(prowl,q,levtmp);CHKERRQ(ierr);
2343   ierr = PetscFree(lev);CHKERRQ(ierr);
2344 
2345   /* put together the new matrix */
2346   ierr = MatSeqSBAIJSetPreallocation(B,bs,0,NULL);CHKERRQ(ierr);
2347 
2348   /* ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)iperm);CHKERRQ(ierr); */
2349   b    = (Mat_SeqSBAIJ*)(B)->data;
2350   ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr);
2351 
2352   b->singlemalloc = PETSC_FALSE;
2353   b->free_a       = PETSC_TRUE;
2354   b->free_ij      = PETSC_TRUE;
2355   /* the next line frees the default space generated by the Create() */
2356   ierr    = PetscFree3(b->a,b->j,b->i);CHKERRQ(ierr);
2357   ierr    = PetscMalloc1((iu[mbs]+1)*a->bs2,&b->a);CHKERRQ(ierr);
2358   b->j    = ju;
2359   b->i    = iu;
2360   b->diag = NULL;
2361   b->ilen = NULL;
2362   b->imax = NULL;
2363 
2364   ierr    = ISDestroy(&b->row);CHKERRQ(ierr);
2365   ierr    = ISDestroy(&b->icol);CHKERRQ(ierr);
2366   b->row  = perm;
2367   b->icol = perm;
2368   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2369   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2370   ierr    = PetscMalloc1(bs*mbs+bs,&b->solve_work);CHKERRQ(ierr);
2371   /* In b structure:  Free imax, ilen, old a, old j.
2372      Allocate idnew, solve_work, new a, new j */
2373   ierr     = PetscLogObjectMemory((PetscObject)B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr);
2374   b->maxnz = b->nz = iu[mbs];
2375 
2376   (B)->info.factor_mallocs   = reallocs;
2377   (B)->info.fill_ratio_given = f;
2378   if (ai[mbs] != 0) {
2379     (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2380   } else {
2381     (B)->info.fill_ratio_needed = 0.0;
2382   }
2383   ierr = MatSeqSBAIJSetNumericFactorization_inplace(B,perm_identity);CHKERRQ(ierr);
2384   PetscFunctionReturn(0);
2385 }
2386 
2387 /*
2388   See MatICCFactorSymbolic_SeqAIJ() for description of its data structure
2389 */
2390 #include <petscbt.h>
2391 #include <../src/mat/utils/freespace.h>
MatICCFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo * info)2392 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2393 {
2394   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b;
2395   PetscErrorCode     ierr;
2396   PetscBool          perm_identity,free_ij = PETSC_TRUE,missing;
2397   PetscInt           bs=A->rmap->bs,am=a->mbs,d,*ai=a->i,*aj= a->j;
2398   const PetscInt     *rip;
2399   PetscInt           reallocs=0,i,*ui,*udiag,*cols;
2400   PetscInt           jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2401   PetscInt           nlnk,*lnk,*lnk_lvl=NULL,ncols,*uj,**uj_ptr,**uj_lvl_ptr;
2402   PetscReal          fill          =info->fill,levels=info->levels;
2403   PetscFreeSpaceList free_space    =NULL,current_space=NULL;
2404   PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL;
2405   PetscBT            lnkbt;
2406 
2407   PetscFunctionBegin;
2408   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
2409   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
2410   if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
2411   if (bs > 1) {
2412     ierr = MatICCFactorSymbolic_SeqSBAIJ_inplace(fact,A,perm,info);CHKERRQ(ierr);
2413     PetscFunctionReturn(0);
2414   }
2415 
2416   /* check whether perm is the identity mapping */
2417   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
2418   if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2419   a->permute = PETSC_FALSE;
2420 
2421   ierr  = PetscMalloc1(am+1,&ui);CHKERRQ(ierr);
2422   ierr  = PetscMalloc1(am+1,&udiag);CHKERRQ(ierr);
2423   ui[0] = 0;
2424 
2425   /* ICC(0) without matrix ordering: simply rearrange column indices */
2426   if (!levels) {
2427     /* reuse the column pointers and row offsets for memory savings */
2428     for (i=0; i<am; i++) {
2429       ncols    = ai[i+1] - ai[i];
2430       ui[i+1]  = ui[i] + ncols;
2431       udiag[i] = ui[i+1] - 1; /* points to the last entry of U(i,:) */
2432     }
2433     ierr = PetscMalloc1(ui[am]+1,&uj);CHKERRQ(ierr);
2434     cols = uj;
2435     for (i=0; i<am; i++) {
2436       aj    = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */
2437       ncols = ai[i+1] - ai[i] -1;
2438       for (j=0; j<ncols; j++) *cols++ = aj[j];
2439       *cols++ = i; /* diagonal is located as the last entry of U(i,:) */
2440     }
2441   } else { /* case: levels>0 */
2442     ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
2443 
2444     /* initialization */
2445     /* jl: linked list for storing indices of the pivot rows
2446        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2447     ierr = PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl);CHKERRQ(ierr);
2448     for (i=0; i<am; i++) {
2449       jl[i] = am; il[i] = 0;
2450     }
2451 
2452     /* create and initialize a linked list for storing column indices of the active row k */
2453     nlnk = am + 1;
2454     ierr = PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
2455 
2456     /* initial FreeSpace size is fill*(ai[am]+1) */
2457     ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space);CHKERRQ(ierr);
2458 
2459     current_space = free_space;
2460 
2461     ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space_lvl);CHKERRQ(ierr);
2462 
2463     current_space_lvl = free_space_lvl;
2464 
2465     for (k=0; k<am; k++) {  /* for each active row k */
2466       /* initialize lnk by the column indices of row k */
2467       nzk   = 0;
2468       ncols = ai[k+1] - ai[k];
2469       if (!ncols) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Empty row %D in matrix ",k);
2470       cols = aj+ai[k];
2471       ierr = PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
2472       nzk += nlnk;
2473 
2474       /* update lnk by computing fill-in for each pivot row to be merged in */
2475       prow = jl[k]; /* 1st pivot row */
2476 
2477       while (prow < k) {
2478         nextprow = jl[prow];
2479 
2480         /* merge prow into k-th row */
2481         jmin  = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2482         jmax  = ui[prow+1];
2483         ncols = jmax-jmin;
2484         i     = jmin - ui[prow];
2485 
2486         cols = uj_ptr[prow] + i;  /* points to the 2nd nzero entry in U(prow,k:am-1) */
2487         uj   = uj_lvl_ptr[prow] + i;  /* levels of cols */
2488         j    = *(uj - 1);
2489         ierr = PetscICCLLAddSorted(ncols,cols,levels,uj,am,nlnk,lnk,lnk_lvl,lnkbt,j);CHKERRQ(ierr);
2490         nzk += nlnk;
2491 
2492         /* update il and jl for prow */
2493         if (jmin < jmax) {
2494           il[prow] = jmin;
2495 
2496           j = *cols; jl[prow] = jl[j]; jl[j] = prow;
2497         }
2498         prow = nextprow;
2499       }
2500 
2501       /* if free space is not available, make more free space */
2502       if (current_space->local_remaining<nzk) {
2503         i    = am - k + 1; /* num of unfactored rows */
2504         i    = PetscIntMultTruncate(i,PetscMin(nzk, i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2505         ierr = PetscFreeSpaceGet(i,&current_space);CHKERRQ(ierr);
2506         ierr = PetscFreeSpaceGet(i,&current_space_lvl);CHKERRQ(ierr);
2507         reallocs++;
2508       }
2509 
2510       /* copy data into free_space and free_space_lvl, then initialize lnk */
2511       if (nzk == 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Empty row %D in ICC matrix factor",k);
2512       ierr = PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
2513 
2514       /* add the k-th row into il and jl */
2515       if (nzk > 1) {
2516         i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2517         jl[k] = jl[i]; jl[i] = k;
2518         il[k] = ui[k] + 1;
2519       }
2520       uj_ptr[k]     = current_space->array;
2521       uj_lvl_ptr[k] = current_space_lvl->array;
2522 
2523       current_space->array               += nzk;
2524       current_space->local_used          += nzk;
2525       current_space->local_remaining     -= nzk;
2526       current_space_lvl->array           += nzk;
2527       current_space_lvl->local_used      += nzk;
2528       current_space_lvl->local_remaining -= nzk;
2529 
2530       ui[k+1] = ui[k] + nzk;
2531     }
2532 
2533     ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
2534     ierr = PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);CHKERRQ(ierr);
2535 
2536     /* destroy list of free space and other temporary array(s) */
2537     ierr = PetscMalloc1(ui[am]+1,&uj);CHKERRQ(ierr);
2538     ierr = PetscFreeSpaceContiguous_Cholesky(&free_space,uj,am,ui,udiag);CHKERRQ(ierr); /* store matrix factor  */
2539     ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
2540     ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
2541 
2542   } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2543 
2544   /* put together the new matrix in MATSEQSBAIJ format */
2545   ierr = MatSeqSBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
2546 
2547   b    = (Mat_SeqSBAIJ*)(fact)->data;
2548   ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr);
2549 
2550   b->singlemalloc = PETSC_FALSE;
2551   b->free_a       = PETSC_TRUE;
2552   b->free_ij      = free_ij;
2553 
2554   ierr = PetscMalloc1(ui[am]+1,&b->a);CHKERRQ(ierr);
2555 
2556   b->j         = uj;
2557   b->i         = ui;
2558   b->diag      = udiag;
2559   b->free_diag = PETSC_TRUE;
2560   b->ilen      = NULL;
2561   b->imax      = NULL;
2562   b->row       = perm;
2563   b->col       = perm;
2564 
2565   ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2566   ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2567 
2568   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2569 
2570   ierr = PetscMalloc1(am+1,&b->solve_work);CHKERRQ(ierr);
2571   ierr = PetscLogObjectMemory((PetscObject)fact,ui[am]*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr);
2572 
2573   b->maxnz = b->nz = ui[am];
2574 
2575   fact->info.factor_mallocs   = reallocs;
2576   fact->info.fill_ratio_given = fill;
2577   if (ai[am] != 0) {
2578     fact->info.fill_ratio_needed = ((PetscReal)ui[am])/ai[am];
2579   } else {
2580     fact->info.fill_ratio_needed = 0.0;
2581   }
2582 #if defined(PETSC_USE_INFO)
2583   if (ai[am] != 0) {
2584     PetscReal af = fact->info.fill_ratio_needed;
2585     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);CHKERRQ(ierr);
2586     ierr = PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);CHKERRQ(ierr);
2587     ierr = PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);CHKERRQ(ierr);
2588   } else {
2589     ierr = PetscInfo(A,"Empty matrix\n");CHKERRQ(ierr);
2590   }
2591 #endif
2592   fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
2593   PetscFunctionReturn(0);
2594 }
2595 
MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact,Mat A,IS perm,const MatFactorInfo * info)2596 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2597 {
2598   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data;
2599   Mat_SeqSBAIJ       *b;
2600   PetscErrorCode     ierr;
2601   PetscBool          perm_identity,free_ij = PETSC_TRUE;
2602   PetscInt           bs=A->rmap->bs,am=a->mbs;
2603   const PetscInt     *cols,*rip,*ai=a->i,*aj=a->j;
2604   PetscInt           reallocs=0,i,*ui;
2605   PetscInt           jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2606   PetscInt           nlnk,*lnk,*lnk_lvl=NULL,ncols,*cols_lvl,*uj,**uj_ptr,**uj_lvl_ptr;
2607   PetscReal          fill          =info->fill,levels=info->levels,ratio_needed;
2608   PetscFreeSpaceList free_space    =NULL,current_space=NULL;
2609   PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL;
2610   PetscBT            lnkbt;
2611 
2612   PetscFunctionBegin;
2613   /*
2614    This code originally uses Modified Sparse Row (MSR) storage
2615    (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
2616    Then it is rewritten so the factor B takes seqsbaij format. However the associated
2617    MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1,
2618    thus the original code in MSR format is still used for these cases.
2619    The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever
2620    MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
2621   */
2622   if (bs > 1) {
2623     ierr = MatICCFactorSymbolic_SeqSBAIJ_MSR(fact,A,perm,info);CHKERRQ(ierr);
2624     PetscFunctionReturn(0);
2625   }
2626 
2627   /* check whether perm is the identity mapping */
2628   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
2629   if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2630   a->permute = PETSC_FALSE;
2631 
2632   /* special case that simply copies fill pattern */
2633   if (!levels) {
2634     /* reuse the column pointers and row offsets for memory savings */
2635     ui           = a->i;
2636     uj           = a->j;
2637     free_ij      = PETSC_FALSE;
2638     ratio_needed = 1.0;
2639   } else { /* case: levels>0 */
2640     ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
2641 
2642     /* initialization */
2643     ierr  = PetscMalloc1(am+1,&ui);CHKERRQ(ierr);
2644     ui[0] = 0;
2645 
2646     /* jl: linked list for storing indices of the pivot rows
2647        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2648     ierr = PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl);CHKERRQ(ierr);
2649     for (i=0; i<am; i++) {
2650       jl[i] = am; il[i] = 0;
2651     }
2652 
2653     /* create and initialize a linked list for storing column indices of the active row k */
2654     nlnk = am + 1;
2655     ierr = PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
2656 
2657     /* initial FreeSpace size is fill*(ai[am]+1) */
2658     ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space);CHKERRQ(ierr);
2659 
2660     current_space = free_space;
2661 
2662     ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space_lvl);CHKERRQ(ierr);
2663 
2664     current_space_lvl = free_space_lvl;
2665 
2666     for (k=0; k<am; k++) {  /* for each active row k */
2667       /* initialize lnk by the column indices of row rip[k] */
2668       nzk   = 0;
2669       ncols = ai[rip[k]+1] - ai[rip[k]];
2670       cols  = aj+ai[rip[k]];
2671       ierr  = PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
2672       nzk  += nlnk;
2673 
2674       /* update lnk by computing fill-in for each pivot row to be merged in */
2675       prow = jl[k]; /* 1st pivot row */
2676 
2677       while (prow < k) {
2678         nextprow = jl[prow];
2679 
2680         /* merge prow into k-th row */
2681         jmin     = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2682         jmax     = ui[prow+1];
2683         ncols    = jmax-jmin;
2684         i        = jmin - ui[prow];
2685         cols     = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2686         j        = *(uj_lvl_ptr[prow] + i - 1);
2687         cols_lvl = uj_lvl_ptr[prow]+i;
2688         ierr     = PetscICCLLAddSorted(ncols,cols,levels,cols_lvl,am,nlnk,lnk,lnk_lvl,lnkbt,j);CHKERRQ(ierr);
2689         nzk     += nlnk;
2690 
2691         /* update il and jl for prow */
2692         if (jmin < jmax) {
2693           il[prow] = jmin;
2694           j        = *cols;
2695           jl[prow] = jl[j];
2696           jl[j]    = prow;
2697         }
2698         prow = nextprow;
2699       }
2700 
2701       /* if free space is not available, make more free space */
2702       if (current_space->local_remaining<nzk) {
2703         i    = am - k + 1; /* num of unfactored rows */
2704         i    = PetscMin(PetscIntMultTruncate(i,nzk), PetscIntMultTruncate(i,i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2705         ierr = PetscFreeSpaceGet(i,&current_space);CHKERRQ(ierr);
2706         ierr = PetscFreeSpaceGet(i,&current_space_lvl);CHKERRQ(ierr);
2707         reallocs++;
2708       }
2709 
2710       /* copy data into free_space and free_space_lvl, then initialize lnk */
2711       ierr = PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
2712 
2713       /* add the k-th row into il and jl */
2714       if (nzk-1 > 0) {
2715         i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2716         jl[k] = jl[i]; jl[i] = k;
2717         il[k] = ui[k] + 1;
2718       }
2719       uj_ptr[k]     = current_space->array;
2720       uj_lvl_ptr[k] = current_space_lvl->array;
2721 
2722       current_space->array               += nzk;
2723       current_space->local_used          += nzk;
2724       current_space->local_remaining     -= nzk;
2725       current_space_lvl->array           += nzk;
2726       current_space_lvl->local_used      += nzk;
2727       current_space_lvl->local_remaining -= nzk;
2728 
2729       ui[k+1] = ui[k] + nzk;
2730     }
2731 
2732     ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
2733     ierr = PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);CHKERRQ(ierr);
2734 
2735     /* destroy list of free space and other temporary array(s) */
2736     ierr = PetscMalloc1(ui[am]+1,&uj);CHKERRQ(ierr);
2737     ierr = PetscFreeSpaceContiguous(&free_space,uj);CHKERRQ(ierr);
2738     ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
2739     ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
2740     if (ai[am] != 0) {
2741       ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]);
2742     } else {
2743       ratio_needed = 0.0;
2744     }
2745   } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2746 
2747   /* put together the new matrix in MATSEQSBAIJ format */
2748   ierr = MatSeqSBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
2749 
2750   b = (Mat_SeqSBAIJ*)(fact)->data;
2751 
2752   ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr);
2753 
2754   b->singlemalloc = PETSC_FALSE;
2755   b->free_a       = PETSC_TRUE;
2756   b->free_ij      = free_ij;
2757 
2758   ierr = PetscMalloc1(ui[am]+1,&b->a);CHKERRQ(ierr);
2759 
2760   b->j             = uj;
2761   b->i             = ui;
2762   b->diag          = NULL;
2763   b->ilen          = NULL;
2764   b->imax          = NULL;
2765   b->row           = perm;
2766   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2767 
2768   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2769   b->icol = perm;
2770   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2771   ierr    = PetscMalloc1(am+1,&b->solve_work);CHKERRQ(ierr);
2772 
2773   b->maxnz = b->nz = ui[am];
2774 
2775   fact->info.factor_mallocs    = reallocs;
2776   fact->info.fill_ratio_given  = fill;
2777   fact->info.fill_ratio_needed = ratio_needed;
2778 #if defined(PETSC_USE_INFO)
2779   if (ai[am] != 0) {
2780     PetscReal af = fact->info.fill_ratio_needed;
2781     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);CHKERRQ(ierr);
2782     ierr = PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);CHKERRQ(ierr);
2783     ierr = PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);CHKERRQ(ierr);
2784   } else {
2785     ierr = PetscInfo(A,"Empty matrix\n");CHKERRQ(ierr);
2786   }
2787 #endif
2788   if (perm_identity) {
2789     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace;
2790   } else {
2791     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace;
2792   }
2793   PetscFunctionReturn(0);
2794 }
2795