1 
2 #include <../src/mat/impls/baij/seq/baij.h>
3 #include <../src/mat/impls/dense/seq/dense.h>
4 #include <../src/mat/impls/sbaij/seq/sbaij.h>
5 #include <petsc/private/kernels/blockinvert.h>
6 #include <petscbt.h>
7 #include <petscblaslapack.h>
8 
MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)9 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
10 {
11   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
12   PetscErrorCode ierr;
13   PetscInt       brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
14   const PetscInt *idx;
15   PetscBT        table_out,table_in;
16 
17   PetscFunctionBegin;
18   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
19   mbs  = a->mbs;
20   ai   = a->i;
21   aj   = a->j;
22   bs   = A->rmap->bs;
23   ierr = PetscBTCreate(mbs,&table_out);CHKERRQ(ierr);
24   ierr = PetscMalloc1(mbs+1,&nidx);CHKERRQ(ierr);
25   ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr);
26   ierr = PetscBTCreate(mbs,&table_in);CHKERRQ(ierr);
27 
28   for (i=0; i<is_max; i++) { /* for each is */
29     isz  = 0;
30     ierr = PetscBTMemzero(mbs,table_out);CHKERRQ(ierr);
31 
32     /* Extract the indices, assume there can be duplicate entries */
33     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
34     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
35 
36     /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
37     bcol_max = 0;
38     for (j=0; j<n; ++j) {
39       brow = idx[j]/bs; /* convert the indices into block indices */
40       if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
41       if (!PetscBTLookupSet(table_out,brow)) {
42         nidx[isz++] = brow;
43         if (bcol_max < brow) bcol_max = brow;
44       }
45     }
46     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
47     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
48 
49     k = 0;
50     for (j=0; j<ov; j++) { /* for each overlap */
51       /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
52       ierr = PetscBTMemzero(mbs,table_in);CHKERRQ(ierr);
53       for (l=k; l<isz; l++) { ierr = PetscBTSet(table_in,nidx[l]);CHKERRQ(ierr); }
54 
55       n = isz;  /* length of the updated is[i] */
56       for (brow=0; brow<mbs; brow++) {
57         start = ai[brow]; end   = ai[brow+1];
58         if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
59           for (l = start; l<end; l++) {
60             bcol = aj[l];
61             if (!PetscBTLookupSet(table_out,bcol)) {
62               nidx[isz++] = bcol;
63               if (bcol_max < bcol) bcol_max = bcol;
64             }
65           }
66           k++;
67           if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
68         } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
69           for (l = start; l<end; l++) {
70             bcol = aj[l];
71             if (bcol > bcol_max) break;
72             if (PetscBTLookup(table_in,bcol)) {
73               if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow;
74               break; /* for l = start; l<end ; l++) */
75             }
76           }
77         }
78       }
79     } /* for each overlap */
80 
81     /* expand the Index Set */
82     for (j=0; j<isz; j++) {
83       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
84     }
85     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
86   }
87   ierr = PetscBTDestroy(&table_out);CHKERRQ(ierr);
88   ierr = PetscFree(nidx);CHKERRQ(ierr);
89   ierr = PetscFree(nidx2);CHKERRQ(ierr);
90   ierr = PetscBTDestroy(&table_in);CHKERRQ(ierr);
91   PetscFunctionReturn(0);
92 }
93 
94 /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
95         Zero some ops' to avoid invalid usse */
MatSeqSBAIJZeroOps_Private(Mat Bseq)96 PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
97 {
98   PetscErrorCode ierr;
99 
100   PetscFunctionBegin;
101   ierr = MatSetOption(Bseq,MAT_SYMMETRIC,PETSC_FALSE);CHKERRQ(ierr);
102   Bseq->ops->mult                   = NULL;
103   Bseq->ops->multadd                = NULL;
104   Bseq->ops->multtranspose          = NULL;
105   Bseq->ops->multtransposeadd       = NULL;
106   Bseq->ops->lufactor               = NULL;
107   Bseq->ops->choleskyfactor         = NULL;
108   Bseq->ops->lufactorsymbolic       = NULL;
109   Bseq->ops->choleskyfactorsymbolic = NULL;
110   Bseq->ops->getinertia             = NULL;
111   PetscFunctionReturn(0);
112 }
113 
114 /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
MatCreateSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat * B)115 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
116 {
117   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*c;
118   PetscErrorCode ierr;
119   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
120   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
121   const PetscInt *irow,*icol;
122   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
123   PetscInt       *aj = a->j,*ai = a->i;
124   MatScalar      *mat_a;
125   Mat            C;
126   PetscBool      flag;
127 
128   PetscFunctionBegin;
129 
130   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
131   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
132   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
133   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
134 
135   ierr  = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr);
136   ssmap = smap;
137   ierr  = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr);
138   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
139   /* determine lens of each row */
140   for (i=0; i<nrows; i++) {
141     kstart  = ai[irow[i]];
142     kend    = kstart + a->ilen[irow[i]];
143     lens[i] = 0;
144     for (k=kstart; k<kend; k++) {
145       if (ssmap[aj[k]]) lens[i]++;
146     }
147   }
148   /* Create and fill new matrix */
149   if (scall == MAT_REUSE_MATRIX) {
150     c = (Mat_SeqSBAIJ*)((*B)->data);
151 
152     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
153     ierr = PetscArraycmp(c->ilen,lens,c->mbs,&flag);CHKERRQ(ierr);
154     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
155     ierr = PetscArrayzero(c->ilen,c->mbs);CHKERRQ(ierr);
156     C    = *B;
157   } else {
158     ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr);
159     ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
160     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
161     ierr = MatSeqSBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr);
162   }
163   c = (Mat_SeqSBAIJ*)(C->data);
164   for (i=0; i<nrows; i++) {
165     row      = irow[i];
166     kstart   = ai[row];
167     kend     = kstart + a->ilen[row];
168     mat_i    = c->i[i];
169     mat_j    = c->j + mat_i;
170     mat_a    = c->a + mat_i*bs2;
171     mat_ilen = c->ilen + i;
172     for (k=kstart; k<kend; k++) {
173       if ((tcol=ssmap[a->j[k]])) {
174         *mat_j++ = tcol - 1;
175         ierr     = PetscArraycpy(mat_a,a->a+k*bs2,bs2);CHKERRQ(ierr);
176         mat_a   += bs2;
177         (*mat_ilen)++;
178       }
179     }
180   }
181   /* sort */
182   {
183     MatScalar *work;
184 
185     ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr);
186     for (i=0; i<nrows; i++) {
187       PetscInt ilen;
188       mat_i = c->i[i];
189       mat_j = c->j + mat_i;
190       mat_a = c->a + mat_i*bs2;
191       ilen  = c->ilen[i];
192       ierr  = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr);
193     }
194     ierr = PetscFree(work);CHKERRQ(ierr);
195   }
196 
197   /* Free work space */
198   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
199   ierr = PetscFree(smap);CHKERRQ(ierr);
200   ierr = PetscFree(lens);CHKERRQ(ierr);
201   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
202   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
203 
204   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
205   *B   = C;
206   PetscFunctionReturn(0);
207 }
208 
MatCreateSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat * B)209 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
210 {
211   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
212   IS             is1,is2;
213   PetscErrorCode ierr;
214   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs;
215   const PetscInt *irow,*icol;
216 
217   PetscFunctionBegin;
218   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
219   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
220   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
221   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
222 
223   /* Verify if the indices corespond to each element in a block
224    and form the IS with compressed IS */
225   maxmnbs = PetscMax(a->mbs,a->nbs);
226   ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr);
227   ierr = PetscArrayzero(vary,a->mbs);CHKERRQ(ierr);
228   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
229   for (i=0; i<a->mbs; i++) {
230     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
231   }
232   count = 0;
233   for (i=0; i<nrows; i++) {
234     PetscInt j = irow[i] / bs;
235     if ((vary[j]--)==bs) iary[count++] = j;
236   }
237   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
238 
239   ierr = PetscArrayzero(vary,a->nbs);CHKERRQ(ierr);
240   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
241   for (i=0; i<a->nbs; i++) {
242     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
243   }
244   count = 0;
245   for (i=0; i<ncols; i++) {
246     PetscInt j = icol[i] / bs;
247     if ((vary[j]--)==bs) iary[count++] = j;
248   }
249   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr);
250   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
251   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
252   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
253 
254   ierr = MatCreateSubMatrix_SeqSBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr);
255   ierr = ISDestroy(&is1);CHKERRQ(ierr);
256   ierr = ISDestroy(&is2);CHKERRQ(ierr);
257 
258   if (isrow != iscol) {
259     PetscBool isequal;
260     ierr = ISEqual(isrow,iscol,&isequal);CHKERRQ(ierr);
261     if (!isequal) {
262       ierr = MatSeqSBAIJZeroOps_Private(*B);CHKERRQ(ierr);
263     }
264   }
265   PetscFunctionReturn(0);
266 }
267 
MatCreateSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat * B[])268 PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
269 {
270   PetscErrorCode ierr;
271   PetscInt       i;
272 
273   PetscFunctionBegin;
274   if (scall == MAT_INITIAL_MATRIX) {
275     ierr = PetscCalloc1(n+1,B);CHKERRQ(ierr);
276   }
277 
278   for (i=0; i<n; i++) {
279     ierr = MatCreateSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
280   }
281   PetscFunctionReturn(0);
282 }
283 
284 /* -------------------------------------------------------*/
285 /* Should check that shapes of vectors and matrices match */
286 /* -------------------------------------------------------*/
287 
MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)288 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
289 {
290   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
291   PetscScalar       *z,x1,x2,zero=0.0;
292   const PetscScalar *x,*xb;
293   const MatScalar   *v;
294   PetscErrorCode    ierr;
295   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
296   const PetscInt    *aj=a->j,*ai=a->i,*ib;
297   PetscInt          nonzerorow=0;
298 
299   PetscFunctionBegin;
300   ierr = VecSet(zz,zero);CHKERRQ(ierr);
301   if (!a->nz) PetscFunctionReturn(0);
302   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
303   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
304 
305   v  = a->a;
306   xb = x;
307 
308   for (i=0; i<mbs; i++) {
309     n           = ai[1] - ai[0]; /* length of i_th block row of A */
310     x1          = xb[0]; x2 = xb[1];
311     ib          = aj + *ai;
312     jmin        = 0;
313     nonzerorow += (n>0);
314     if (*ib == i) {     /* (diag of A)*x */
315       z[2*i]   += v[0]*x1 + v[2]*x2;
316       z[2*i+1] += v[2]*x1 + v[3]*x2;
317       v        += 4; jmin++;
318     }
319     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
320     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
321     for (j=jmin; j<n; j++) {
322       /* (strict lower triangular part of A)*x  */
323       cval       = ib[j]*2;
324       z[cval]   += v[0]*x1 + v[1]*x2;
325       z[cval+1] += v[2]*x1 + v[3]*x2;
326       /* (strict upper triangular part of A)*x  */
327       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
328       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
329       v        += 4;
330     }
331     xb +=2; ai++;
332   }
333 
334   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
335   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
336   ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
337   PetscFunctionReturn(0);
338 }
339 
MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)340 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
341 {
342   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
343   PetscScalar       *z,x1,x2,x3,zero=0.0;
344   const PetscScalar *x,*xb;
345   const MatScalar   *v;
346   PetscErrorCode    ierr;
347   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
348   const PetscInt    *aj = a->j,*ai = a->i,*ib;
349   PetscInt          nonzerorow=0;
350 
351   PetscFunctionBegin;
352   ierr = VecSet(zz,zero);CHKERRQ(ierr);
353   if (!a->nz) PetscFunctionReturn(0);
354   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
355   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
356 
357   v  = a->a;
358   xb = x;
359 
360   for (i=0; i<mbs; i++) {
361     n           = ai[1] - ai[0]; /* length of i_th block row of A */
362     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
363     ib          = aj + *ai;
364     jmin        = 0;
365     nonzerorow += (n>0);
366     if (*ib == i) {     /* (diag of A)*x */
367       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
368       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
369       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
370       v        += 9; jmin++;
371     }
372     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
373     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
374     for (j=jmin; j<n; j++) {
375       /* (strict lower triangular part of A)*x  */
376       cval       = ib[j]*3;
377       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
378       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
379       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
380       /* (strict upper triangular part of A)*x  */
381       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
382       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
383       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
384       v        += 9;
385     }
386     xb +=3; ai++;
387   }
388 
389   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
390   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
391   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
392   PetscFunctionReturn(0);
393 }
394 
MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)395 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
396 {
397   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
398   PetscScalar       *z,x1,x2,x3,x4,zero=0.0;
399   const PetscScalar *x,*xb;
400   const MatScalar   *v;
401   PetscErrorCode    ierr;
402   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
403   const PetscInt    *aj = a->j,*ai = a->i,*ib;
404   PetscInt          nonzerorow = 0;
405 
406   PetscFunctionBegin;
407   ierr = VecSet(zz,zero);CHKERRQ(ierr);
408   if (!a->nz) PetscFunctionReturn(0);
409   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
410   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
411 
412   v  = a->a;
413   xb = x;
414 
415   for (i=0; i<mbs; i++) {
416     n           = ai[1] - ai[0]; /* length of i_th block row of A */
417     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
418     ib          = aj + *ai;
419     jmin        = 0;
420     nonzerorow += (n>0);
421     if (*ib == i) {     /* (diag of A)*x */
422       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
423       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
424       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
425       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
426       v        += 16; jmin++;
427     }
428     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
429     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
430     for (j=jmin; j<n; j++) {
431       /* (strict lower triangular part of A)*x  */
432       cval       = ib[j]*4;
433       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
434       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
435       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
436       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
437       /* (strict upper triangular part of A)*x  */
438       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
439       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
440       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
441       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
442       v        += 16;
443     }
444     xb +=4; ai++;
445   }
446 
447   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
448   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
449   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
450   PetscFunctionReturn(0);
451 }
452 
MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)453 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
454 {
455   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
456   PetscScalar       *z,x1,x2,x3,x4,x5,zero=0.0;
457   const PetscScalar *x,*xb;
458   const MatScalar   *v;
459   PetscErrorCode    ierr;
460   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
461   const PetscInt    *aj = a->j,*ai = a->i,*ib;
462   PetscInt          nonzerorow=0;
463 
464   PetscFunctionBegin;
465   ierr = VecSet(zz,zero);CHKERRQ(ierr);
466   if (!a->nz) PetscFunctionReturn(0);
467   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
468   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
469 
470   v  = a->a;
471   xb = x;
472 
473   for (i=0; i<mbs; i++) {
474     n           = ai[1] - ai[0]; /* length of i_th block row of A */
475     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
476     ib          = aj + *ai;
477     jmin        = 0;
478     nonzerorow += (n>0);
479     if (*ib == i) {      /* (diag of A)*x */
480       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
481       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
482       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
483       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
484       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
485       v        += 25; jmin++;
486     }
487     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
488     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
489     for (j=jmin; j<n; j++) {
490       /* (strict lower triangular part of A)*x  */
491       cval       = ib[j]*5;
492       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
493       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
494       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
495       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
496       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
497       /* (strict upper triangular part of A)*x  */
498       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
499       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
500       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
501       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
502       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
503       v        += 25;
504     }
505     xb +=5; ai++;
506   }
507 
508   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
509   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
510   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
511   PetscFunctionReturn(0);
512 }
513 
MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)514 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
515 {
516   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
517   PetscScalar       *z,x1,x2,x3,x4,x5,x6,zero=0.0;
518   const PetscScalar *x,*xb;
519   const MatScalar   *v;
520   PetscErrorCode    ierr;
521   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
522   const PetscInt    *aj=a->j,*ai=a->i,*ib;
523   PetscInt          nonzerorow=0;
524 
525   PetscFunctionBegin;
526   ierr = VecSet(zz,zero);CHKERRQ(ierr);
527   if (!a->nz) PetscFunctionReturn(0);
528   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
529   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
530 
531   v  = a->a;
532   xb = x;
533 
534   for (i=0; i<mbs; i++) {
535     n           = ai[1] - ai[0]; /* length of i_th block row of A */
536     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
537     ib          = aj + *ai;
538     jmin        = 0;
539     nonzerorow += (n>0);
540     if (*ib == i) {      /* (diag of A)*x */
541       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
542       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
543       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
544       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
545       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
546       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
547       v        += 36; jmin++;
548     }
549     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
550     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
551     for (j=jmin; j<n; j++) {
552       /* (strict lower triangular part of A)*x  */
553       cval       = ib[j]*6;
554       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
555       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
556       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
557       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
558       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
559       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
560       /* (strict upper triangular part of A)*x  */
561       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
562       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
563       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
564       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
565       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
566       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
567       v        += 36;
568     }
569     xb +=6; ai++;
570   }
571 
572   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
573   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
574   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
575   PetscFunctionReturn(0);
576 }
577 
MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)578 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
579 {
580   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
581   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
582   const PetscScalar *x,*xb;
583   const MatScalar   *v;
584   PetscErrorCode    ierr;
585   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
586   const PetscInt    *aj=a->j,*ai=a->i,*ib;
587   PetscInt          nonzerorow=0;
588 
589   PetscFunctionBegin;
590   ierr = VecSet(zz,zero);CHKERRQ(ierr);
591   if (!a->nz) PetscFunctionReturn(0);
592   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
593   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
594 
595   v  = a->a;
596   xb = x;
597 
598   for (i=0; i<mbs; i++) {
599     n           = ai[1] - ai[0]; /* length of i_th block row of A */
600     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
601     ib          = aj + *ai;
602     jmin        = 0;
603     nonzerorow += (n>0);
604     if (*ib == i) {      /* (diag of A)*x */
605       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
606       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
607       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
608       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
609       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
610       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
611       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
612       v        += 49; jmin++;
613     }
614     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
615     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
616     for (j=jmin; j<n; j++) {
617       /* (strict lower triangular part of A)*x  */
618       cval       = ib[j]*7;
619       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
620       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
621       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
622       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
623       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
624       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
625       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
626       /* (strict upper triangular part of A)*x  */
627       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
628       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
629       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
630       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
631       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
632       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
633       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
634       v       += 49;
635     }
636     xb +=7; ai++;
637   }
638   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
639   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
640   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
641   PetscFunctionReturn(0);
642 }
643 
644 /*
645     This will not work with MatScalar == float because it calls the BLAS
646 */
MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)647 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
648 {
649   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
650   PetscScalar       *z,*z_ptr,*zb,*work,*workt,zero=0.0;
651   const PetscScalar *x,*x_ptr,*xb;
652   const MatScalar   *v;
653   PetscErrorCode    ierr;
654   PetscInt          mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
655   const PetscInt    *idx,*aj,*ii;
656   PetscInt          nonzerorow=0;
657 
658   PetscFunctionBegin;
659   ierr = VecSet(zz,zero);CHKERRQ(ierr);
660   if (!a->nz) PetscFunctionReturn(0);
661   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
662   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
663 
664   x_ptr = x;
665   z_ptr = z;
666 
667   aj = a->j;
668   v  = a->a;
669   ii = a->i;
670 
671   if (!a->mult_work) {
672     ierr = PetscMalloc1(A->rmap->N+1,&a->mult_work);CHKERRQ(ierr);
673   }
674   work = a->mult_work;
675 
676   for (i=0; i<mbs; i++) {
677     n           = ii[1] - ii[0]; ncols = n*bs;
678     workt       = work; idx=aj+ii[0];
679     nonzerorow += (n>0);
680 
681     /* upper triangular part */
682     for (j=0; j<n; j++) {
683       xb = x_ptr + bs*(*idx++);
684       for (k=0; k<bs; k++) workt[k] = xb[k];
685       workt += bs;
686     }
687     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
688     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
689 
690     /* strict lower triangular part */
691     idx = aj+ii[0];
692     if (n && *idx == i) {
693       ncols -= bs; v += bs2; idx++; n--;
694     }
695 
696     if (ncols > 0) {
697       workt = work;
698       ierr  = PetscArrayzero(workt,ncols);CHKERRQ(ierr);
699       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
700       for (j=0; j<n; j++) {
701         zb = z_ptr + bs*(*idx++);
702         for (k=0; k<bs; k++) zb[k] += workt[k];
703         workt += bs;
704       }
705     }
706     x += bs; v += n*bs2; z += bs; ii++;
707   }
708 
709   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
710   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
711   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr);
712   PetscFunctionReturn(0);
713 }
714 
MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)715 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
716 {
717   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
718   PetscScalar       *z,x1;
719   const PetscScalar *x,*xb;
720   const MatScalar   *v;
721   PetscErrorCode    ierr;
722   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
723   const PetscInt    *aj=a->j,*ai=a->i,*ib;
724   PetscInt          nonzerorow=0;
725 #if defined(PETSC_USE_COMPLEX)
726   const int         aconj = A->hermitian;
727 #else
728   const int         aconj = 0;
729 #endif
730 
731   PetscFunctionBegin;
732   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
733   if (!a->nz) PetscFunctionReturn(0);
734   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
735   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
736   v    = a->a;
737   xb   = x;
738 
739   for (i=0; i<mbs; i++) {
740     n           = ai[1] - ai[0]; /* length of i_th row of A */
741     x1          = xb[0];
742     ib          = aj + *ai;
743     jmin        = 0;
744     nonzerorow += (n>0);
745     if (n && *ib == i) {            /* (diag of A)*x */
746       z[i] += *v++ * x[*ib++]; jmin++;
747     }
748     if (aconj) {
749       for (j=jmin; j<n; j++) {
750         cval    = *ib;
751         z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x  */
752         z[i]    += *v++ * x[*ib++];    /* (strict upper triangular part of A)*x  */
753       }
754     } else {
755       for (j=jmin; j<n; j++) {
756         cval    = *ib;
757         z[cval] += *v * x1;         /* (strict lower triangular part of A)*x  */
758         z[i]    += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x  */
759       }
760     }
761     xb++; ai++;
762   }
763 
764   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
765   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
766 
767   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
768   PetscFunctionReturn(0);
769 }
770 
MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)771 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
772 {
773   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
774   PetscScalar       *z,x1,x2;
775   const PetscScalar *x,*xb;
776   const MatScalar   *v;
777   PetscErrorCode    ierr;
778   PetscInt          mbs =a->mbs,i,n,cval,j,jmin;
779   const PetscInt    *aj=a->j,*ai=a->i,*ib;
780   PetscInt          nonzerorow=0;
781 
782   PetscFunctionBegin;
783   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
784   if (!a->nz) PetscFunctionReturn(0);
785   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
786   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
787 
788   v  = a->a;
789   xb = x;
790 
791   for (i=0; i<mbs; i++) {
792     n           = ai[1] - ai[0]; /* length of i_th block row of A */
793     x1          = xb[0]; x2 = xb[1];
794     ib          = aj + *ai;
795     jmin        = 0;
796     nonzerorow += (n>0);
797     if (n && *ib == i) {      /* (diag of A)*x */
798       z[2*i]   += v[0]*x1 + v[2]*x2;
799       z[2*i+1] += v[2]*x1 + v[3]*x2;
800       v        += 4; jmin++;
801     }
802     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
803     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
804     for (j=jmin; j<n; j++) {
805       /* (strict lower triangular part of A)*x  */
806       cval       = ib[j]*2;
807       z[cval]   += v[0]*x1 + v[1]*x2;
808       z[cval+1] += v[2]*x1 + v[3]*x2;
809       /* (strict upper triangular part of A)*x  */
810       z[2*i]   += v[0]*x[cval] + v[2]*x[cval+1];
811       z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
812       v        += 4;
813     }
814     xb +=2; ai++;
815   }
816   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
817   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
818 
819   ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
820   PetscFunctionReturn(0);
821 }
822 
MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)823 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
824 {
825   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
826   PetscScalar       *z,x1,x2,x3;
827   const PetscScalar *x,*xb;
828   const MatScalar   *v;
829   PetscErrorCode    ierr;
830   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
831   const PetscInt    *aj=a->j,*ai=a->i,*ib;
832   PetscInt          nonzerorow=0;
833 
834   PetscFunctionBegin;
835   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
836   if (!a->nz) PetscFunctionReturn(0);
837   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
838   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
839 
840   v  = a->a;
841   xb = x;
842 
843   for (i=0; i<mbs; i++) {
844     n           = ai[1] - ai[0]; /* length of i_th block row of A */
845     x1          = xb[0]; x2 = xb[1]; x3 = xb[2];
846     ib          = aj + *ai;
847     jmin        = 0;
848     nonzerorow += (n>0);
849     if (n && *ib == i) {     /* (diag of A)*x */
850       z[3*i]   += v[0]*x1 + v[3]*x2 + v[6]*x3;
851       z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
852       z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
853       v        += 9; jmin++;
854     }
855     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
856     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
857     for (j=jmin; j<n; j++) {
858       /* (strict lower triangular part of A)*x  */
859       cval       = ib[j]*3;
860       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3;
861       z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
862       z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
863       /* (strict upper triangular part of A)*x  */
864       z[3*i]   += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
865       z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
866       z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
867       v        += 9;
868     }
869     xb +=3; ai++;
870   }
871 
872   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
873   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
874 
875   ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
876   PetscFunctionReturn(0);
877 }
878 
MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)879 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
880 {
881   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
882   PetscScalar       *z,x1,x2,x3,x4;
883   const PetscScalar *x,*xb;
884   const MatScalar   *v;
885   PetscErrorCode    ierr;
886   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
887   const PetscInt    *aj=a->j,*ai=a->i,*ib;
888   PetscInt          nonzerorow=0;
889 
890   PetscFunctionBegin;
891   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
892   if (!a->nz) PetscFunctionReturn(0);
893   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
894   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
895 
896   v  = a->a;
897   xb = x;
898 
899   for (i=0; i<mbs; i++) {
900     n           = ai[1] - ai[0]; /* length of i_th block row of A */
901     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
902     ib          = aj + *ai;
903     jmin        = 0;
904     nonzerorow += (n>0);
905     if (n && *ib == i) {      /* (diag of A)*x */
906       z[4*i]   += v[0]*x1 + v[4]*x2 +  v[8]*x3 + v[12]*x4;
907       z[4*i+1] += v[4]*x1 + v[5]*x2 +  v[9]*x3 + v[13]*x4;
908       z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
909       z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
910       v        += 16; jmin++;
911     }
912     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
913     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
914     for (j=jmin; j<n; j++) {
915       /* (strict lower triangular part of A)*x  */
916       cval       = ib[j]*4;
917       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
918       z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
919       z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
920       z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
921       /* (strict upper triangular part of A)*x  */
922       z[4*i]   += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
923       z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
924       z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
925       z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
926       v        += 16;
927     }
928     xb +=4; ai++;
929   }
930 
931   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
932   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
933 
934   ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
935   PetscFunctionReturn(0);
936 }
937 
MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)938 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
939 {
940   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
941   PetscScalar       *z,x1,x2,x3,x4,x5;
942   const PetscScalar *x,*xb;
943   const MatScalar   *v;
944   PetscErrorCode    ierr;
945   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
946   const PetscInt    *aj=a->j,*ai=a->i,*ib;
947   PetscInt          nonzerorow=0;
948 
949   PetscFunctionBegin;
950   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
951   if (!a->nz) PetscFunctionReturn(0);
952   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
953   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
954 
955   v  = a->a;
956   xb = x;
957 
958   for (i=0; i<mbs; i++) {
959     n           = ai[1] - ai[0]; /* length of i_th block row of A */
960     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
961     ib          = aj + *ai;
962     jmin        = 0;
963     nonzerorow += (n>0);
964     if (n && *ib == i) {      /* (diag of A)*x */
965       z[5*i]   += v[0]*x1  + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
966       z[5*i+1] += v[5]*x1  + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
967       z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
968       z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
969       z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
970       v        += 25; jmin++;
971     }
972     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
973     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
974     for (j=jmin; j<n; j++) {
975       /* (strict lower triangular part of A)*x  */
976       cval       = ib[j]*5;
977       z[cval]   += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
978       z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
979       z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
980       z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
981       z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
982       /* (strict upper triangular part of A)*x  */
983       z[5*i]   +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
984       z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
985       z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
986       z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
987       z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
988       v        += 25;
989     }
990     xb +=5; ai++;
991   }
992 
993   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
994   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
995 
996   ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
997   PetscFunctionReturn(0);
998 }
999 
MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)1000 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1001 {
1002   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1003   PetscScalar       *z,x1,x2,x3,x4,x5,x6;
1004   const PetscScalar *x,*xb;
1005   const MatScalar   *v;
1006   PetscErrorCode    ierr;
1007   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
1008   const PetscInt    *aj=a->j,*ai=a->i,*ib;
1009   PetscInt          nonzerorow=0;
1010 
1011   PetscFunctionBegin;
1012   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1013   if (!a->nz) PetscFunctionReturn(0);
1014   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1015   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1016 
1017   v  = a->a;
1018   xb = x;
1019 
1020   for (i=0; i<mbs; i++) {
1021     n           = ai[1] - ai[0]; /* length of i_th block row of A */
1022     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
1023     ib          = aj + *ai;
1024     jmin        = 0;
1025     nonzerorow += (n>0);
1026     if (n && *ib == i) {     /* (diag of A)*x */
1027       z[6*i]   += v[0]*x1  + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
1028       z[6*i+1] += v[6]*x1  + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
1029       z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
1030       z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
1031       z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
1032       z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1033       v        += 36; jmin++;
1034     }
1035     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1036     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1037     for (j=jmin; j<n; j++) {
1038       /* (strict lower triangular part of A)*x  */
1039       cval       = ib[j]*6;
1040       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
1041       z[cval+1] += v[6]*x1  + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
1042       z[cval+2] += v[12]*x1  + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
1043       z[cval+3] += v[18]*x1  + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
1044       z[cval+4] += v[24]*x1  + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
1045       z[cval+5] += v[30]*x1  + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1046       /* (strict upper triangular part of A)*x  */
1047       z[6*i]   +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
1048       z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
1049       z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
1050       z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
1051       z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
1052       z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
1053       v        += 36;
1054     }
1055     xb +=6; ai++;
1056   }
1057 
1058   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1059   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1060 
1061   ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1062   PetscFunctionReturn(0);
1063 }
1064 
MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)1065 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1066 {
1067   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1068   PetscScalar       *z,x1,x2,x3,x4,x5,x6,x7;
1069   const PetscScalar *x,*xb;
1070   const MatScalar   *v;
1071   PetscErrorCode    ierr;
1072   PetscInt          mbs = a->mbs,i,n,cval,j,jmin;
1073   const PetscInt    *aj=a->j,*ai=a->i,*ib;
1074   PetscInt          nonzerorow=0;
1075 
1076   PetscFunctionBegin;
1077   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1078   if (!a->nz) PetscFunctionReturn(0);
1079   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1080   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1081 
1082   v  = a->a;
1083   xb = x;
1084 
1085   for (i=0; i<mbs; i++) {
1086     n           = ai[1] - ai[0]; /* length of i_th block row of A */
1087     x1          = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1088     ib          = aj + *ai;
1089     jmin        = 0;
1090     nonzerorow += (n>0);
1091     if (n && *ib == i) {     /* (diag of A)*x */
1092       z[7*i]   += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1093       z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1094       z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1095       z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1096       z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1097       z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1098       z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1099       v        += 49; jmin++;
1100     }
1101     PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1102     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1103     for (j=jmin; j<n; j++) {
1104       /* (strict lower triangular part of A)*x  */
1105       cval       = ib[j]*7;
1106       z[cval]   += v[0]*x1  + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1107       z[cval+1] += v[7]*x1  + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1108       z[cval+2] += v[14]*x1  + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1109       z[cval+3] += v[21]*x1  + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1110       z[cval+4] += v[28]*x1  + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1111       z[cval+5] += v[35]*x1  + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1112       z[cval+6] += v[42]*x1  + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1113       /* (strict upper triangular part of A)*x  */
1114       z[7*i]  +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1115       z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1116       z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1117       z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1118       z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1119       z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1120       z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1121       v       += 49;
1122     }
1123     xb +=7; ai++;
1124   }
1125 
1126   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1127   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1128 
1129   ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1130   PetscFunctionReturn(0);
1131 }
1132 
MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)1133 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1134 {
1135   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1136   PetscScalar       *z,*z_ptr=NULL,*zb,*work,*workt;
1137   const PetscScalar *x,*x_ptr,*xb;
1138   const MatScalar   *v;
1139   PetscErrorCode    ierr;
1140   PetscInt          mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1141   const PetscInt    *idx,*aj,*ii;
1142   PetscInt          nonzerorow=0;
1143 
1144   PetscFunctionBegin;
1145   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1146   if (!a->nz) PetscFunctionReturn(0);
1147   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); x_ptr=x;
1148   ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
1149 
1150   aj = a->j;
1151   v  = a->a;
1152   ii = a->i;
1153 
1154   if (!a->mult_work) {
1155     ierr = PetscMalloc1(A->rmap->n+1,&a->mult_work);CHKERRQ(ierr);
1156   }
1157   work = a->mult_work;
1158 
1159 
1160   for (i=0; i<mbs; i++) {
1161     n           = ii[1] - ii[0]; ncols = n*bs;
1162     workt       = work; idx=aj+ii[0];
1163     nonzerorow += (n>0);
1164 
1165     /* upper triangular part */
1166     for (j=0; j<n; j++) {
1167       xb = x_ptr + bs*(*idx++);
1168       for (k=0; k<bs; k++) workt[k] = xb[k];
1169       workt += bs;
1170     }
1171     /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1172     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1173 
1174     /* strict lower triangular part */
1175     idx = aj+ii[0];
1176     if (n && *idx == i) {
1177       ncols -= bs; v += bs2; idx++; n--;
1178     }
1179     if (ncols > 0) {
1180       workt = work;
1181       ierr  = PetscArrayzero(workt,ncols);CHKERRQ(ierr);
1182       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1183       for (j=0; j<n; j++) {
1184         zb = z_ptr + bs*(*idx++);
1185         for (k=0; k<bs; k++) zb[k] += workt[k];
1186         workt += bs;
1187       }
1188     }
1189 
1190     x += bs; v += n*bs2; z += bs; ii++;
1191   }
1192 
1193   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1194   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1195 
1196   ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1197   PetscFunctionReturn(0);
1198 }
1199 
MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)1200 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
1201 {
1202   Mat_SeqSBAIJ   *a     = (Mat_SeqSBAIJ*)inA->data;
1203   PetscScalar    oalpha = alpha;
1204   PetscErrorCode ierr;
1205   PetscBLASInt   one = 1,totalnz;
1206 
1207   PetscFunctionBegin;
1208   ierr = PetscBLASIntCast(a->bs2*a->nz,&totalnz);CHKERRQ(ierr);
1209   PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one));
1210   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
1211   PetscFunctionReturn(0);
1212 }
1213 
MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal * norm)1214 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1215 {
1216   Mat_SeqSBAIJ    *a       = (Mat_SeqSBAIJ*)A->data;
1217   const MatScalar *v       = a->a;
1218   PetscReal       sum_diag = 0.0, sum_off = 0.0, *sum;
1219   PetscInt        i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il;
1220   PetscErrorCode  ierr;
1221   const PetscInt  *aj=a->j,*col;
1222 
1223   PetscFunctionBegin;
1224   if (!a->nz) {
1225     *norm = 0.0;
1226     PetscFunctionReturn(0);
1227   }
1228   if (type == NORM_FROBENIUS) {
1229     for (k=0; k<mbs; k++) {
1230       jmin = a->i[k];
1231       jmax = a->i[k+1];
1232       col  = aj + jmin;
1233       if (jmax-jmin > 0 && *col == k) {         /* diagonal block */
1234         for (i=0; i<bs2; i++) {
1235           sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1236         }
1237         jmin++;
1238       }
1239       for (j=jmin; j<jmax; j++) {  /* off-diagonal blocks */
1240         for (i=0; i<bs2; i++) {
1241           sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1242         }
1243       }
1244     }
1245     *norm = PetscSqrtReal(sum_diag + 2*sum_off);
1246     ierr = PetscLogFlops(2.0*bs2*a->nz);CHKERRQ(ierr);
1247   } else if (type == NORM_INFINITY || type == NORM_1) {  /* maximum row/column sum */
1248     ierr = PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);CHKERRQ(ierr);
1249     for (i=0; i<mbs; i++) jl[i] = mbs;
1250     il[0] = 0;
1251 
1252     *norm = 0.0;
1253     for (k=0; k<mbs; k++) { /* k_th block row */
1254       for (j=0; j<bs; j++) sum[j]=0.0;
1255       /*-- col sum --*/
1256       i = jl[k]; /* first |A(i,k)| to be added */
1257       /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1258                   at step k */
1259       while (i<mbs) {
1260         nexti = jl[i];  /* next block row to be added */
1261         ik    = il[i];  /* block index of A(i,k) in the array a */
1262         for (j=0; j<bs; j++) {
1263           v = a->a + ik*bs2 + j*bs;
1264           for (k1=0; k1<bs; k1++) {
1265             sum[j] += PetscAbsScalar(*v); v++;
1266           }
1267         }
1268         /* update il, jl */
1269         jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1270         jmax = a->i[i+1];
1271         if (jmin < jmax) {
1272           il[i] = jmin;
1273           j     = a->j[jmin];
1274           jl[i] = jl[j]; jl[j]=i;
1275         }
1276         i = nexti;
1277       }
1278       /*-- row sum --*/
1279       jmin = a->i[k];
1280       jmax = a->i[k+1];
1281       for (i=jmin; i<jmax; i++) {
1282         for (j=0; j<bs; j++) {
1283           v = a->a + i*bs2 + j;
1284           for (k1=0; k1<bs; k1++) {
1285             sum[j] += PetscAbsScalar(*v); v += bs;
1286           }
1287         }
1288       }
1289       /* add k_th block row to il, jl */
1290       col = aj+jmin;
1291       if (jmax - jmin > 0 && *col == k) jmin++;
1292       if (jmin < jmax) {
1293         il[k] = jmin;
1294         j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
1295       }
1296       for (j=0; j<bs; j++) {
1297         if (sum[j] > *norm) *norm = sum[j];
1298       }
1299     }
1300     ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr);
1301     ierr = PetscLogFlops(PetscMax(mbs*a->nz-1,0));CHKERRQ(ierr);
1302   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
1303   PetscFunctionReturn(0);
1304 }
1305 
MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg)1306 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg)
1307 {
1308   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data;
1309   PetscErrorCode ierr;
1310 
1311   PetscFunctionBegin;
1312   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
1313   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1314     *flg = PETSC_FALSE;
1315     PetscFunctionReturn(0);
1316   }
1317 
1318   /* if the a->i are the same */
1319   ierr = PetscArraycmp(a->i,b->i,a->mbs+1,flg);CHKERRQ(ierr);
1320   if (!*flg) PetscFunctionReturn(0);
1321 
1322   /* if a->j are the same */
1323   ierr = PetscArraycmp(a->j,b->j,a->nz,flg);CHKERRQ(ierr);
1324   if (!*flg) PetscFunctionReturn(0);
1325 
1326   /* if a->a are the same */
1327   ierr = PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs),flg);CHKERRQ(ierr);
1328   PetscFunctionReturn(0);
1329 }
1330 
MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)1331 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1332 {
1333   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1334   PetscErrorCode  ierr;
1335   PetscInt        i,j,k,row,bs,ambs,bs2;
1336   const PetscInt  *ai,*aj;
1337   PetscScalar     *x,zero = 0.0;
1338   const MatScalar *aa,*aa_j;
1339 
1340   PetscFunctionBegin;
1341   bs = A->rmap->bs;
1342   if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1343 
1344   aa   = a->a;
1345   ambs = a->mbs;
1346 
1347   if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1348     PetscInt *diag=a->diag;
1349     aa   = a->a;
1350     ambs = a->mbs;
1351     ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1352     for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]];
1353     ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1354     PetscFunctionReturn(0);
1355   }
1356 
1357   ai   = a->i;
1358   aj   = a->j;
1359   bs2  = a->bs2;
1360   ierr = VecSet(v,zero);CHKERRQ(ierr);
1361   if (!a->nz) PetscFunctionReturn(0);
1362   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1363   for (i=0; i<ambs; i++) {
1364     j = ai[i];
1365     if (aj[j] == i) {    /* if this is a diagonal element */
1366       row  = i*bs;
1367       aa_j = aa + j*bs2;
1368       for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1369     }
1370   }
1371   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1372   PetscFunctionReturn(0);
1373 }
1374 
MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)1375 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1376 {
1377   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1378   PetscScalar       x;
1379   const PetscScalar *l,*li,*ri;
1380   MatScalar         *aa,*v;
1381   PetscErrorCode    ierr;
1382   PetscInt          i,j,k,lm,M,m,mbs,tmp,bs,bs2;
1383   const PetscInt    *ai,*aj;
1384   PetscBool         flg;
1385 
1386   PetscFunctionBegin;
1387   if (ll != rr) {
1388     ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr);
1389     if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1390   }
1391   if (!ll) PetscFunctionReturn(0);
1392   ai  = a->i;
1393   aj  = a->j;
1394   aa  = a->a;
1395   m   = A->rmap->N;
1396   bs  = A->rmap->bs;
1397   mbs = a->mbs;
1398   bs2 = a->bs2;
1399 
1400   ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
1401   ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1402   if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1403   for (i=0; i<mbs; i++) { /* for each block row */
1404     M  = ai[i+1] - ai[i];
1405     li = l + i*bs;
1406     v  = aa + bs2*ai[i];
1407     for (j=0; j<M; j++) { /* for each block */
1408       ri = l + bs*aj[ai[i]+j];
1409       for (k=0; k<bs; k++) {
1410         x = ri[k];
1411         for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1412       }
1413     }
1414   }
1415   ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
1416   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1417   PetscFunctionReturn(0);
1418 }
1419 
MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo * info)1420 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1421 {
1422   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1423 
1424   PetscFunctionBegin;
1425   info->block_size   = a->bs2;
1426   info->nz_allocated = a->bs2*a->maxnz;   /*num. of nonzeros in upper triangular part */
1427   info->nz_used      = a->bs2*a->nz;   /*num. of nonzeros in upper triangular part */
1428   info->nz_unneeded  = info->nz_allocated - info->nz_used;
1429   info->assemblies   = A->num_ass;
1430   info->mallocs      = A->info.mallocs;
1431   info->memory       = ((PetscObject)A)->mem;
1432   if (A->factortype) {
1433     info->fill_ratio_given  = A->info.fill_ratio_given;
1434     info->fill_ratio_needed = A->info.fill_ratio_needed;
1435     info->factor_mallocs    = A->info.factor_mallocs;
1436   } else {
1437     info->fill_ratio_given  = 0;
1438     info->fill_ratio_needed = 0;
1439     info->factor_mallocs    = 0;
1440   }
1441   PetscFunctionReturn(0);
1442 }
1443 
MatZeroEntries_SeqSBAIJ(Mat A)1444 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1445 {
1446   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1447   PetscErrorCode ierr;
1448 
1449   PetscFunctionBegin;
1450   ierr = PetscArrayzero(a->a,a->bs2*a->i[a->mbs]);CHKERRQ(ierr);
1451   PetscFunctionReturn(0);
1452 }
1453 
MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])1454 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1455 {
1456   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ*)A->data;
1457   PetscErrorCode  ierr;
1458   PetscInt        i,j,n,row,col,bs,mbs;
1459   const PetscInt  *ai,*aj;
1460   PetscReal       atmp;
1461   const MatScalar *aa;
1462   PetscScalar     *x;
1463   PetscInt        ncols,brow,bcol,krow,kcol;
1464 
1465   PetscFunctionBegin;
1466   if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1467   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1468   bs  = A->rmap->bs;
1469   aa  = a->a;
1470   ai  = a->i;
1471   aj  = a->j;
1472   mbs = a->mbs;
1473 
1474   ierr = VecSet(v,0.0);CHKERRQ(ierr);
1475   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1476   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1477   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1478   for (i=0; i<mbs; i++) {
1479     ncols = ai[1] - ai[0]; ai++;
1480     brow  = bs*i;
1481     for (j=0; j<ncols; j++) {
1482       bcol = bs*(*aj);
1483       for (kcol=0; kcol<bs; kcol++) {
1484         col = bcol + kcol;      /* col index */
1485         for (krow=0; krow<bs; krow++) {
1486           atmp = PetscAbsScalar(*aa); aa++;
1487           row  = brow + krow;   /* row index */
1488           if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1489           if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1490         }
1491       }
1492       aj++;
1493     }
1494   }
1495   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1496   PetscFunctionReturn(0);
1497 }
1498 
MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)1499 PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)
1500 {
1501   PetscErrorCode ierr;
1502 
1503   PetscFunctionBegin;
1504   ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr);
1505   C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense;
1506   PetscFunctionReturn(0);
1507 }
1508 
MatMatMult_SeqSBAIJ_1_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)1509 PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1510 {
1511   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1512   PetscScalar       *z = c;
1513   const PetscScalar *xb;
1514   PetscScalar       x1;
1515   const MatScalar   *v = a->a,*vv;
1516   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1517 #if defined(PETSC_USE_COMPLEX)
1518   const int         aconj = A->hermitian;
1519 #else
1520   const int         aconj = 0;
1521 #endif
1522 
1523   PetscFunctionBegin;
1524   for (i=0; i<mbs; i++) {
1525     n = ii[1] - ii[0]; ii++;
1526     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1527     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1528     jj = idx;
1529     vv = v;
1530     for (k=0; k<cn; k++) {
1531       idx = jj;
1532       v = vv;
1533       for (j=0; j<n; j++) {
1534         xb = b + (*idx); x1 = xb[0+k*bm];
1535         z[0+k*cm] += v[0]*x1;
1536         if (*idx != i) c[(*idx)+k*cm] += (aconj ? PetscConj(v[0]) : v[0])*b[i+k*bm];
1537         v += 1;
1538         ++idx;
1539       }
1540     }
1541     z += 1;
1542   }
1543   PetscFunctionReturn(0);
1544 }
1545 
MatMatMult_SeqSBAIJ_2_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)1546 PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1547 {
1548   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1549   PetscScalar       *z = c;
1550   const PetscScalar *xb;
1551   PetscScalar       x1,x2;
1552   const MatScalar   *v = a->a,*vv;
1553   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1554 
1555   PetscFunctionBegin;
1556   for (i=0; i<mbs; i++) {
1557     n = ii[1] - ii[0]; ii++;
1558     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1559     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1560     jj = idx;
1561     vv = v;
1562     for (k=0; k<cn; k++) {
1563       idx = jj;
1564       v = vv;
1565       for (j=0; j<n; j++) {
1566         xb    = b + 2*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm];
1567         z[0+k*cm] += v[0]*x1 + v[2]*x2;
1568         z[1+k*cm] += v[1]*x1 + v[3]*x2;
1569         if (*idx != i) {
1570           c[2*(*idx)+0+k*cm] += v[0]*b[2*i+k*bm] + v[1]*b[2*i+1+k*bm];
1571           c[2*(*idx)+1+k*cm] += v[2]*b[2*i+k*bm] + v[3]*b[2*i+1+k*bm];
1572         }
1573         v += 4;
1574         ++idx;
1575       }
1576     }
1577     z += 2;
1578   }
1579   PetscFunctionReturn(0);
1580 }
1581 
MatMatMult_SeqSBAIJ_3_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)1582 PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1583 {
1584   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1585   PetscScalar       *z = c;
1586   const PetscScalar *xb;
1587   PetscScalar       x1,x2,x3;
1588   const MatScalar   *v = a->a,*vv;
1589   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1590 
1591   PetscFunctionBegin;
1592   for (i=0; i<mbs; i++) {
1593     n = ii[1] - ii[0]; ii++;
1594     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1595     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1596     jj = idx;
1597     vv = v;
1598     for (k=0; k<cn; k++) {
1599       idx = jj;
1600       v = vv;
1601       for (j=0; j<n; j++) {
1602         xb = b + 3*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm];
1603         z[0+k*cm] += v[0]*x1 + v[3]*x2 + v[6]*x3;
1604         z[1+k*cm] += v[1]*x1 + v[4]*x2 + v[7]*x3;
1605         z[2+k*cm] += v[2]*x1 + v[5]*x2 + v[8]*x3;
1606         if (*idx != i) {
1607           c[3*(*idx)+0+k*cm] += v[0]*b[3*i+k*bm] + v[3]*b[3*i+1+k*bm] + v[6]*b[3*i+2+k*bm];
1608           c[3*(*idx)+1+k*cm] += v[1]*b[3*i+k*bm] + v[4]*b[3*i+1+k*bm] + v[7]*b[3*i+2+k*bm];
1609           c[3*(*idx)+2+k*cm] += v[2]*b[3*i+k*bm] + v[5]*b[3*i+1+k*bm] + v[8]*b[3*i+2+k*bm];
1610         }
1611         v += 9;
1612         ++idx;
1613       }
1614     }
1615     z += 3;
1616   }
1617   PetscFunctionReturn(0);
1618 }
1619 
MatMatMult_SeqSBAIJ_4_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)1620 PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1621 {
1622   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1623   PetscScalar       *z = c;
1624   const PetscScalar *xb;
1625   PetscScalar       x1,x2,x3,x4;
1626   const MatScalar   *v = a->a,*vv;
1627   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1628 
1629   PetscFunctionBegin;
1630   for (i=0; i<mbs; i++) {
1631     n = ii[1] - ii[0]; ii++;
1632     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1633     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1634     jj = idx;
1635     vv = v;
1636     for (k=0; k<cn; k++) {
1637       idx = jj;
1638       v = vv;
1639       for (j=0; j<n; j++) {
1640         xb = b + 4*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm];
1641         z[0+k*cm] += v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1642         z[1+k*cm] += v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1643         z[2+k*cm] += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1644         z[3+k*cm] += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1645         if (*idx != i) {
1646           c[4*(*idx)+0+k*cm] += v[0]*b[4*i+k*bm] + v[4]*b[4*i+1+k*bm] + v[8]*b[4*i+2+k*bm]  + v[12]*b[4*i+3+k*bm];
1647           c[4*(*idx)+1+k*cm] += v[1]*b[4*i+k*bm] + v[5]*b[4*i+1+k*bm] + v[9]*b[4*i+2+k*bm]  + v[13]*b[4*i+3+k*bm];
1648           c[4*(*idx)+2+k*cm] += v[2]*b[4*i+k*bm] + v[6]*b[4*i+1+k*bm] + v[10]*b[4*i+2+k*bm] + v[14]*b[4*i+3+k*bm];
1649           c[4*(*idx)+3+k*cm] += v[3]*b[4*i+k*bm] + v[7]*b[4*i+1+k*bm] + v[11]*b[4*i+2+k*bm] + v[15]*b[4*i+3+k*bm];
1650         }
1651         v += 16;
1652         ++idx;
1653       }
1654     }
1655     z += 4;
1656   }
1657   PetscFunctionReturn(0);
1658 }
1659 
MatMatMult_SeqSBAIJ_5_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)1660 PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1661 {
1662   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1663   PetscScalar       *z = c;
1664   const PetscScalar *xb;
1665   PetscScalar       x1,x2,x3,x4,x5;
1666   const MatScalar   *v = a->a,*vv;
1667   PetscInt          mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1668 
1669   PetscFunctionBegin;
1670   for (i=0; i<mbs; i++) {
1671     n = ii[1] - ii[0]; ii++;
1672     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1673     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1674     jj = idx;
1675     vv = v;
1676     for (k=0; k<cn; k++) {
1677       idx = jj;
1678       v = vv;
1679       for (j=0; j<n; j++) {
1680         xb = b + 5*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; x5 = xb[4+k*cm];
1681         z[0+k*cm] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1682         z[1+k*cm] += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1683         z[2+k*cm] += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1684         z[3+k*cm] += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1685         z[4+k*cm] += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1686         if (*idx != i) {
1687           c[5*(*idx)+0+k*cm] += v[0]*b[5*i+k*bm] + v[5]*b[5*i+1+k*bm] + v[10]*b[5*i+2+k*bm] + v[15]*b[5*i+3+k*bm] + v[20]*b[5*i+4+k*bm];
1688           c[5*(*idx)+1+k*cm] += v[1]*b[5*i+k*bm] + v[6]*b[5*i+1+k*bm] + v[11]*b[5*i+2+k*bm] + v[16]*b[5*i+3+k*bm] + v[21]*b[5*i+4+k*bm];
1689           c[5*(*idx)+2+k*cm] += v[2]*b[5*i+k*bm] + v[7]*b[5*i+1+k*bm] + v[12]*b[5*i+2+k*bm] + v[17]*b[5*i+3+k*bm] + v[22]*b[5*i+4+k*bm];
1690           c[5*(*idx)+3+k*cm] += v[3]*b[5*i+k*bm] + v[8]*b[5*i+1+k*bm] + v[13]*b[5*i+2+k*bm] + v[18]*b[5*i+3+k*bm] + v[23]*b[5*i+4+k*bm];
1691           c[5*(*idx)+4+k*cm] += v[4]*b[5*i+k*bm] + v[9]*b[5*i+1+k*bm] + v[14]*b[5*i+2+k*bm] + v[19]*b[5*i+3+k*bm] + v[24]*b[5*i+4+k*bm];
1692         }
1693         v += 25;
1694         ++idx;
1695       }
1696     }
1697     z += 5;
1698   }
1699   PetscFunctionReturn(0);
1700 }
1701 
MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A,Mat B,Mat C)1702 PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A,Mat B,Mat C)
1703 {
1704   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1705   Mat_SeqDense      *bd = (Mat_SeqDense*)B->data;
1706   Mat_SeqDense      *cd = (Mat_SeqDense*)C->data;
1707   PetscInt          cm=cd->lda,cn=B->cmap->n,bm=bd->lda;
1708   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1709   PetscBLASInt      bbs,bcn,bbm,bcm;
1710   PetscScalar       *z = NULL;
1711   PetscScalar       *c,*b;
1712   const MatScalar   *v;
1713   const PetscInt    *idx,*ii;
1714   PetscScalar       _DOne=1.0;
1715   PetscErrorCode    ierr;
1716 
1717   PetscFunctionBegin;
1718   if (!cm || !cn) PetscFunctionReturn(0);
1719   if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n);
1720   if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
1721   if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
1722   b = bd->v;
1723   ierr = MatZeroEntries(C);CHKERRQ(ierr);
1724   ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr);
1725   switch (bs) {
1726   case 1:
1727     ierr = MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn);
1728     break;
1729   case 2:
1730     ierr = MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn);
1731     break;
1732   case 3:
1733     ierr = MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn);
1734     break;
1735   case 4:
1736     ierr = MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn);
1737     break;
1738   case 5:
1739     ierr = MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn);
1740     break;
1741   default: /* block sizes larger than 5 by 5 are handled by BLAS */
1742     ierr = PetscBLASIntCast(bs,&bbs);CHKERRQ(ierr);
1743     ierr = PetscBLASIntCast(cn,&bcn);CHKERRQ(ierr);
1744     ierr = PetscBLASIntCast(bm,&bbm);CHKERRQ(ierr);
1745     ierr = PetscBLASIntCast(cm,&bcm);CHKERRQ(ierr);
1746     idx = a->j;
1747     v   = a->a;
1748     mbs = a->mbs;
1749     ii  = a->i;
1750     z   = c;
1751     for (i=0; i<mbs; i++) {
1752       n = ii[1] - ii[0]; ii++;
1753       for (j=0; j<n; j++) {
1754         if (*idx != i) PetscStackCallBLAS("BLASgemm",BLASgemm_("T","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*i,&bbm,&_DOne,c+bs*(*idx),&bcm));
1755         PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm));
1756         v += bs2;
1757       }
1758       z += bs;
1759     }
1760   }
1761   ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr);
1762   ierr = PetscLogFlops((2.0*(a->nz*2.0 - a->nonzerorowcnt)*bs2 - a->nonzerorowcnt)*cn);CHKERRQ(ierr);
1763   PetscFunctionReturn(0);
1764 }
1765