1 
2 #include <../src/mat/impls/baij/seq/baij.h>
3 #include <../src/mat/impls/dense/seq/dense.h>
4 #include <petsc/private/kernels/blockinvert.h>
5 #include <petscbt.h>
6 #include <petscblaslapack.h>
7 
8 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
9 #include <immintrin.h>
10 #endif
11 
MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)12 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
13 {
14   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
15   PetscErrorCode ierr;
16   PetscInt       row,i,j,k,l,m,n,*nidx,isz,val,ival;
17   const PetscInt *idx;
18   PetscInt       start,end,*ai,*aj,bs,*nidx2;
19   PetscBT        table;
20 
21   PetscFunctionBegin;
22   m  = a->mbs;
23   ai = a->i;
24   aj = a->j;
25   bs = A->rmap->bs;
26 
27   if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
28 
29   ierr = PetscBTCreate(m,&table);CHKERRQ(ierr);
30   ierr = PetscMalloc1(m+1,&nidx);CHKERRQ(ierr);
31   ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr);
32 
33   for (i=0; i<is_max; i++) {
34     /* Initialise the two local arrays */
35     isz  = 0;
36     ierr = PetscBTMemzero(m,table);CHKERRQ(ierr);
37 
38     /* Extract the indices, assume there can be duplicate entries */
39     ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
40     ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
41 
42     /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
43     for (j=0; j<n; ++j) {
44       ival = idx[j]/bs; /* convert the indices into block indices */
45       if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
46       if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
47     }
48     ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
49     ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
50 
51     k = 0;
52     for (j=0; j<ov; j++) { /* for each overlap*/
53       n = isz;
54       for (; k<n; k++) {  /* do only those rows in nidx[k], which are not done yet */
55         row   = nidx[k];
56         start = ai[row];
57         end   = ai[row+1];
58         for (l = start; l<end; l++) {
59           val = aj[l];
60           if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
61         }
62       }
63     }
64     /* expand the Index Set */
65     for (j=0; j<isz; j++) {
66       for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
67     }
68     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
69   }
70   ierr = PetscBTDestroy(&table);CHKERRQ(ierr);
71   ierr = PetscFree(nidx);CHKERRQ(ierr);
72   ierr = PetscFree(nidx2);CHKERRQ(ierr);
73   PetscFunctionReturn(0);
74 }
75 
MatCreateSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat * B)76 PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
77 {
78   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*c;
79   PetscErrorCode ierr;
80   PetscInt       *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
81   PetscInt       row,mat_i,*mat_j,tcol,*mat_ilen;
82   const PetscInt *irow,*icol;
83   PetscInt       nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
84   PetscInt       *aj = a->j,*ai = a->i;
85   MatScalar      *mat_a;
86   Mat            C;
87   PetscBool      flag;
88 
89   PetscFunctionBegin;
90   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
91   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
92   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
93   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
94 
95   ierr  = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr);
96   ssmap = smap;
97   ierr  = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr);
98   for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
99   /* determine lens of each row */
100   for (i=0; i<nrows; i++) {
101     kstart  = ai[irow[i]];
102     kend    = kstart + a->ilen[irow[i]];
103     lens[i] = 0;
104     for (k=kstart; k<kend; k++) {
105       if (ssmap[aj[k]]) lens[i]++;
106     }
107   }
108   /* Create and fill new matrix */
109   if (scall == MAT_REUSE_MATRIX) {
110     c = (Mat_SeqBAIJ*)((*B)->data);
111 
112     if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
113     ierr = PetscArraycmp(c->ilen,lens,c->mbs,&flag);CHKERRQ(ierr);
114     if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
115     ierr = PetscArrayzero(c->ilen,c->mbs);CHKERRQ(ierr);
116     C    = *B;
117   } else {
118     ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr);
119     ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
120     ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
121     ierr = MatSeqBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr);
122   }
123   c = (Mat_SeqBAIJ*)(C->data);
124   for (i=0; i<nrows; i++) {
125     row      = irow[i];
126     kstart   = ai[row];
127     kend     = kstart + a->ilen[row];
128     mat_i    = c->i[i];
129     mat_j    = c->j + mat_i;
130     mat_a    = c->a + mat_i*bs2;
131     mat_ilen = c->ilen + i;
132     for (k=kstart; k<kend; k++) {
133       if ((tcol=ssmap[a->j[k]])) {
134         *mat_j++ = tcol - 1;
135         ierr     = PetscArraycpy(mat_a,a->a+k*bs2,bs2);CHKERRQ(ierr);
136         mat_a   += bs2;
137         (*mat_ilen)++;
138       }
139     }
140   }
141   /* sort */
142   {
143     MatScalar *work;
144     ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr);
145     for (i=0; i<nrows; i++) {
146       PetscInt ilen;
147       mat_i = c->i[i];
148       mat_j = c->j + mat_i;
149       mat_a = c->a + mat_i*bs2;
150       ilen  = c->ilen[i];
151       ierr  = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr);
152     }
153     ierr = PetscFree(work);CHKERRQ(ierr);
154   }
155 
156   /* Free work space */
157   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
158   ierr = PetscFree(smap);CHKERRQ(ierr);
159   ierr = PetscFree(lens);CHKERRQ(ierr);
160   ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
161   ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
162 
163   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
164   *B   = C;
165   PetscFunctionReturn(0);
166 }
167 
MatCreateSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat * B)168 PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
169 {
170   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
171   IS             is1,is2;
172   PetscErrorCode ierr;
173   PetscInt       *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j;
174   const PetscInt *irow,*icol;
175 
176   PetscFunctionBegin;
177   ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
178   ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
179   ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
180   ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
181 
182   /* Verify if the indices corespond to each element in a block
183    and form the IS with compressed IS */
184   maxmnbs = PetscMax(a->mbs,a->nbs);
185   ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr);
186   ierr = PetscArrayzero(vary,a->mbs);CHKERRQ(ierr);
187   for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
188   for (i=0; i<a->mbs; i++) {
189     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
190   }
191   count = 0;
192   for (i=0; i<nrows; i++) {
193     j = irow[i] / bs;
194     if ((vary[j]--)==bs) iary[count++] = j;
195   }
196   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
197 
198   ierr = PetscArrayzero(vary,a->nbs);CHKERRQ(ierr);
199   for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
200   for (i=0; i<a->nbs; i++) {
201     if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
202   }
203   count = 0;
204   for (i=0; i<ncols; i++) {
205     j = icol[i] / bs;
206     if ((vary[j]--)==bs) iary[count++] = j;
207   }
208   ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr);
209   ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
210   ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
211   ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
212 
213   ierr = MatCreateSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr);
214   ierr = ISDestroy(&is1);CHKERRQ(ierr);
215   ierr = ISDestroy(&is2);CHKERRQ(ierr);
216   PetscFunctionReturn(0);
217 }
218 
MatDestroySubMatrix_SeqBAIJ(Mat C)219 PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C)
220 {
221   PetscErrorCode ierr;
222   Mat_SeqBAIJ    *c = (Mat_SeqBAIJ*)C->data;
223   Mat_SubSppt    *submatj = c->submatis1;
224 
225   PetscFunctionBegin;
226   ierr = (*submatj->destroy)(C);CHKERRQ(ierr);
227   ierr = MatDestroySubMatrix_Private(submatj);CHKERRQ(ierr);
228   PetscFunctionReturn(0);
229 }
230 
MatDestroySubMatrices_SeqBAIJ(PetscInt n,Mat * mat[])231 PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n,Mat *mat[])
232 {
233   PetscErrorCode ierr;
234   PetscInt       i;
235   Mat            C;
236   Mat_SeqBAIJ    *c;
237   Mat_SubSppt    *submatj;
238 
239   PetscFunctionBegin;
240   for (i=0; i<n; i++) {
241     C       = (*mat)[i];
242     c       = (Mat_SeqBAIJ*)C->data;
243     submatj = c->submatis1;
244     if (submatj) {
245       ierr = (*submatj->destroy)(C);CHKERRQ(ierr);
246       ierr = MatDestroySubMatrix_Private(submatj);CHKERRQ(ierr);
247       ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
248       ierr = PetscLayoutDestroy(&C->rmap);CHKERRQ(ierr);
249       ierr = PetscLayoutDestroy(&C->cmap);CHKERRQ(ierr);
250       ierr = PetscHeaderDestroy(&C);CHKERRQ(ierr);
251     } else {
252       ierr = MatDestroy(&C);CHKERRQ(ierr);
253     }
254   }
255   ierr = PetscFree(*mat);CHKERRQ(ierr);
256   PetscFunctionReturn(0);
257 }
258 
MatCreateSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat * B[])259 PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
260 {
261   PetscErrorCode ierr;
262   PetscInt       i;
263 
264   PetscFunctionBegin;
265   if (scall == MAT_INITIAL_MATRIX) {
266     ierr = PetscCalloc1(n+1,B);CHKERRQ(ierr);
267   }
268 
269   for (i=0; i<n; i++) {
270     ierr = MatCreateSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
271   }
272   PetscFunctionReturn(0);
273 }
274 
275 /* -------------------------------------------------------*/
276 /* Should check that shapes of vectors and matrices match */
277 /* -------------------------------------------------------*/
278 
MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)279 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
280 {
281   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
282   PetscScalar       *z,sum;
283   const PetscScalar *x;
284   const MatScalar   *v;
285   PetscErrorCode    ierr;
286   PetscInt          mbs,i,n;
287   const PetscInt    *idx,*ii,*ridx=NULL;
288   PetscBool         usecprow=a->compressedrow.use;
289 
290   PetscFunctionBegin;
291   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
292   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
293 
294   if (usecprow) {
295     mbs  = a->compressedrow.nrows;
296     ii   = a->compressedrow.i;
297     ridx = a->compressedrow.rindex;
298     ierr = PetscArrayzero(z,a->mbs);CHKERRQ(ierr);
299   } else {
300     mbs = a->mbs;
301     ii  = a->i;
302   }
303 
304   for (i=0; i<mbs; i++) {
305     n   = ii[1] - ii[0];
306     v   = a->a + ii[0];
307     idx = a->j + ii[0];
308     ii++;
309     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
310     PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
311     sum = 0.0;
312     PetscSparseDensePlusDot(sum,x,v,idx,n);
313     if (usecprow) {
314       z[ridx[i]] = sum;
315     } else {
316       z[i]       = sum;
317     }
318   }
319   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
320   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
321   ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
322   PetscFunctionReturn(0);
323 }
324 
MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)325 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
326 {
327   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
328   PetscScalar       *z = NULL,sum1,sum2,*zarray;
329   const PetscScalar *x,*xb;
330   PetscScalar       x1,x2;
331   const MatScalar   *v;
332   PetscErrorCode    ierr;
333   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
334   PetscBool         usecprow=a->compressedrow.use;
335 
336   PetscFunctionBegin;
337   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
338   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
339 
340   idx = a->j;
341   v   = a->a;
342   if (usecprow) {
343     mbs  = a->compressedrow.nrows;
344     ii   = a->compressedrow.i;
345     ridx = a->compressedrow.rindex;
346     ierr = PetscArrayzero(zarray,2*a->mbs);CHKERRQ(ierr);
347   } else {
348     mbs = a->mbs;
349     ii  = a->i;
350     z   = zarray;
351   }
352 
353   for (i=0; i<mbs; i++) {
354     n           = ii[1] - ii[0]; ii++;
355     sum1        = 0.0; sum2 = 0.0;
356     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
357     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
358     for (j=0; j<n; j++) {
359       xb    = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
360       sum1 += v[0]*x1 + v[2]*x2;
361       sum2 += v[1]*x1 + v[3]*x2;
362       v    += 4;
363     }
364     if (usecprow) z = zarray + 2*ridx[i];
365     z[0] = sum1; z[1] = sum2;
366     if (!usecprow) z += 2;
367   }
368   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
369   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
370   ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr);
371   PetscFunctionReturn(0);
372 }
373 
MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)374 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
375 {
376   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
377   PetscScalar       *z = NULL,sum1,sum2,sum3,x1,x2,x3,*zarray;
378   const PetscScalar *x,*xb;
379   const MatScalar   *v;
380   PetscErrorCode    ierr;
381   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
382   PetscBool         usecprow=a->compressedrow.use;
383 
384 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
385 #pragma disjoint(*v,*z,*xb)
386 #endif
387 
388   PetscFunctionBegin;
389   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
390   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
391 
392   idx = a->j;
393   v   = a->a;
394   if (usecprow) {
395     mbs  = a->compressedrow.nrows;
396     ii   = a->compressedrow.i;
397     ridx = a->compressedrow.rindex;
398     ierr = PetscArrayzero(zarray,3*a->mbs);CHKERRQ(ierr);
399   } else {
400     mbs = a->mbs;
401     ii  = a->i;
402     z   = zarray;
403   }
404 
405   for (i=0; i<mbs; i++) {
406     n           = ii[1] - ii[0]; ii++;
407     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0;
408     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
409     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
410     for (j=0; j<n; j++) {
411       xb = x + 3*(*idx++);
412       x1 = xb[0];
413       x2 = xb[1];
414       x3 = xb[2];
415 
416       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
417       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
418       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
419       v    += 9;
420     }
421     if (usecprow) z = zarray + 3*ridx[i];
422     z[0] = sum1; z[1] = sum2; z[2] = sum3;
423     if (!usecprow) z += 3;
424   }
425   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
426   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
427   ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr);
428   PetscFunctionReturn(0);
429 }
430 
MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)431 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
432 {
433   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
434   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
435   const PetscScalar *x,*xb;
436   const MatScalar   *v;
437   PetscErrorCode    ierr;
438   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
439   PetscBool         usecprow=a->compressedrow.use;
440 
441   PetscFunctionBegin;
442   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
443   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
444 
445   idx = a->j;
446   v   = a->a;
447   if (usecprow) {
448     mbs  = a->compressedrow.nrows;
449     ii   = a->compressedrow.i;
450     ridx = a->compressedrow.rindex;
451     ierr = PetscArrayzero(zarray,4*a->mbs);CHKERRQ(ierr);
452   } else {
453     mbs = a->mbs;
454     ii  = a->i;
455     z   = zarray;
456   }
457 
458   for (i=0; i<mbs; i++) {
459     n = ii[1] - ii[0];
460     ii++;
461     sum1 = 0.0;
462     sum2 = 0.0;
463     sum3 = 0.0;
464     sum4 = 0.0;
465 
466     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
467     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
468     for (j=0; j<n; j++) {
469       xb    = x + 4*(*idx++);
470       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
471       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
472       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
473       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
474       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
475       v    += 16;
476     }
477     if (usecprow) z = zarray + 4*ridx[i];
478     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
479     if (!usecprow) z += 4;
480   }
481   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
482   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
483   ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr);
484   PetscFunctionReturn(0);
485 }
486 
MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)487 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
488 {
489   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
490   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*zarray;
491   const PetscScalar *xb,*x;
492   const MatScalar   *v;
493   PetscErrorCode    ierr;
494   const PetscInt    *idx,*ii,*ridx=NULL;
495   PetscInt          mbs,i,j,n;
496   PetscBool         usecprow=a->compressedrow.use;
497 
498   PetscFunctionBegin;
499   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
500   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
501 
502   idx = a->j;
503   v   = a->a;
504   if (usecprow) {
505     mbs  = a->compressedrow.nrows;
506     ii   = a->compressedrow.i;
507     ridx = a->compressedrow.rindex;
508     ierr = PetscArrayzero(zarray,5*a->mbs);CHKERRQ(ierr);
509   } else {
510     mbs = a->mbs;
511     ii  = a->i;
512     z   = zarray;
513   }
514 
515   for (i=0; i<mbs; i++) {
516     n           = ii[1] - ii[0]; ii++;
517     sum1        = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
518     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
519     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
520     for (j=0; j<n; j++) {
521       xb    = x + 5*(*idx++);
522       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
523       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
524       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
525       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
526       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
527       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
528       v    += 25;
529     }
530     if (usecprow) z = zarray + 5*ridx[i];
531     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
532     if (!usecprow) z += 5;
533   }
534   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
535   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
536   ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr);
537   PetscFunctionReturn(0);
538 }
539 
MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)540 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
541 {
542   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
543   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6;
544   const PetscScalar *x,*xb;
545   PetscScalar       x1,x2,x3,x4,x5,x6,*zarray;
546   const MatScalar   *v;
547   PetscErrorCode    ierr;
548   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
549   PetscBool         usecprow=a->compressedrow.use;
550 
551   PetscFunctionBegin;
552   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
553   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
554 
555   idx = a->j;
556   v   = a->a;
557   if (usecprow) {
558     mbs  = a->compressedrow.nrows;
559     ii   = a->compressedrow.i;
560     ridx = a->compressedrow.rindex;
561     ierr = PetscArrayzero(zarray,6*a->mbs);CHKERRQ(ierr);
562   } else {
563     mbs = a->mbs;
564     ii  = a->i;
565     z   = zarray;
566   }
567 
568   for (i=0; i<mbs; i++) {
569     n  = ii[1] - ii[0];
570     ii++;
571     sum1 = 0.0;
572     sum2 = 0.0;
573     sum3 = 0.0;
574     sum4 = 0.0;
575     sum5 = 0.0;
576     sum6 = 0.0;
577 
578     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
579     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
580     for (j=0; j<n; j++) {
581       xb    = x + 6*(*idx++);
582       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
583       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
584       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
585       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
586       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
587       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
588       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
589       v    += 36;
590     }
591     if (usecprow) z = zarray + 6*ridx[i];
592     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
593     if (!usecprow) z += 6;
594   }
595 
596   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
597   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
598   ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr);
599   PetscFunctionReturn(0);
600 }
601 
MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)602 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
603 {
604   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
605   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
606   const PetscScalar *x,*xb;
607   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*zarray;
608   const MatScalar   *v;
609   PetscErrorCode    ierr;
610   PetscInt          mbs,i,*idx,*ii,j,n,*ridx=NULL;
611   PetscBool         usecprow=a->compressedrow.use;
612 
613   PetscFunctionBegin;
614   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
615   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
616 
617   idx = a->j;
618   v   = a->a;
619   if (usecprow) {
620     mbs  = a->compressedrow.nrows;
621     ii   = a->compressedrow.i;
622     ridx = a->compressedrow.rindex;
623     ierr = PetscArrayzero(zarray,7*a->mbs);CHKERRQ(ierr);
624   } else {
625     mbs = a->mbs;
626     ii  = a->i;
627     z   = zarray;
628   }
629 
630   for (i=0; i<mbs; i++) {
631     n  = ii[1] - ii[0];
632     ii++;
633     sum1 = 0.0;
634     sum2 = 0.0;
635     sum3 = 0.0;
636     sum4 = 0.0;
637     sum5 = 0.0;
638     sum6 = 0.0;
639     sum7 = 0.0;
640 
641     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
642     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
643     for (j=0; j<n; j++) {
644       xb    = x + 7*(*idx++);
645       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
646       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
647       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
648       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
649       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
650       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
651       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
652       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
653       v    += 49;
654     }
655     if (usecprow) z = zarray + 7*ridx[i];
656     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
657     if (!usecprow) z += 7;
658   }
659 
660   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
661   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
662   ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr);
663   PetscFunctionReturn(0);
664 }
665 
666 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
MatMult_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec zz)667 PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec zz)
668 {
669   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
670   PetscScalar    *z = NULL,*work,*workt,*zarray;
671   const PetscScalar *x,*xb;
672   const MatScalar   *v;
673   PetscErrorCode ierr;
674   PetscInt       mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
675   const PetscInt *idx,*ii,*ridx=NULL;
676   PetscInt       k;
677   PetscBool      usecprow=a->compressedrow.use;
678 
679   __m256d a0,a1,a2,a3,a4,a5;
680   __m256d w0,w1,w2,w3;
681   __m256d z0,z1,z2;
682   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);
683 
684   PetscFunctionBegin;
685   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
686   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
687 
688   idx = a->j;
689   v   = a->a;
690   if (usecprow) {
691     mbs  = a->compressedrow.nrows;
692     ii   = a->compressedrow.i;
693     ridx = a->compressedrow.rindex;
694     ierr = PetscArrayzero(zarray,bs*a->mbs);CHKERRQ(ierr);
695   } else {
696     mbs = a->mbs;
697     ii  = a->i;
698     z   = zarray;
699   }
700 
701   if (!a->mult_work) {
702     k    = PetscMax(A->rmap->n,A->cmap->n);
703     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
704   }
705 
706   work = a->mult_work;
707   for (i=0; i<mbs; i++) {
708     n           = ii[1] - ii[0]; ii++;
709     workt       = work;
710     for (j=0; j<n; j++) {
711       xb = x + bs*(*idx++);
712       for (k=0; k<bs; k++) workt[k] = xb[k];
713       workt += bs;
714     }
715     if (usecprow) z = zarray + bs*ridx[i];
716 
717     z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd();
718 
719     for (j=0; j<n; j++) {
720       /* first column of a */
721       w0 = _mm256_set1_pd(work[j*9  ]);
722       a0 = _mm256_loadu_pd(&v[j*81  ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
723       a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
724       a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);
725 
726       /* second column of a */
727       w1 = _mm256_set1_pd(work[j*9+ 1]);
728       a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
729       a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
730       a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);
731 
732       /* third column of a */
733       w2 = _mm256_set1_pd(work[j*9 +2]);
734       a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
735       a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
736       a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);
737 
738       /* fourth column of a */
739       w3 = _mm256_set1_pd(work[j*9+ 3]);
740       a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
741       a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
742       a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);
743 
744       /* fifth column of a */
745       w0 = _mm256_set1_pd(work[j*9+ 4]);
746       a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
747       a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
748       a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);
749 
750       /* sixth column of a */
751       w1 = _mm256_set1_pd(work[j*9+ 5]);
752       a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
753       a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
754       a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);
755 
756       /* seventh column of a */
757       w2 = _mm256_set1_pd(work[j*9+ 6]);
758       a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
759       a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
760       a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);
761 
762       /* eigth column of a */
763       w3 = _mm256_set1_pd(work[j*9+ 7]);
764       a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
765       a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
766       a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);
767 
768       /* ninth column of a */
769       w0 = _mm256_set1_pd(work[j*9+ 8]);
770       a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
771       a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
772       a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
773     }
774 
775     _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);
776 
777     v += n*bs2;
778     if (!usecprow) z += bs;
779   }
780   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
781   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
782   ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr);
783   PetscFunctionReturn(0);
784 }
785 #endif
786 
MatMult_SeqBAIJ_11(Mat A,Vec xx,Vec zz)787 PetscErrorCode MatMult_SeqBAIJ_11(Mat A,Vec xx,Vec zz)
788 {
789   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
790   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
791   const PetscScalar *x,*xb;
792   PetscScalar       *zarray,xv;
793   const MatScalar   *v;
794   PetscErrorCode    ierr;
795   const PetscInt    *ii,*ij=a->j,*idx;
796   PetscInt          mbs,i,j,k,n,*ridx=NULL;
797   PetscBool         usecprow=a->compressedrow.use;
798 
799   PetscFunctionBegin;
800   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
801   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
802 
803   v = a->a;
804   if (usecprow) {
805     mbs  = a->compressedrow.nrows;
806     ii   = a->compressedrow.i;
807     ridx = a->compressedrow.rindex;
808     ierr = PetscArrayzero(zarray,11*a->mbs);CHKERRQ(ierr);
809   } else {
810     mbs = a->mbs;
811     ii  = a->i;
812     z   = zarray;
813   }
814 
815   for (i=0; i<mbs; i++) {
816     n    = ii[i+1] - ii[i];
817     idx  = ij + ii[i];
818     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
819     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0;
820 
821     for (j=0; j<n; j++) {
822       xb = x + 11*(idx[j]);
823 
824       for (k=0; k<11; k++) {
825         xv     =  xb[k];
826         sum1  += v[0]*xv;
827         sum2  += v[1]*xv;
828         sum3  += v[2]*xv;
829         sum4  += v[3]*xv;
830         sum5  += v[4]*xv;
831         sum6  += v[5]*xv;
832         sum7  += v[6]*xv;
833         sum8  += v[7]*xv;
834         sum9  += v[8]*xv;
835         sum10 += v[9]*xv;
836         sum11 += v[10]*xv;
837         v     += 11;
838       }
839     }
840     if (usecprow) z = zarray + 11*ridx[i];
841     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
842     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11;
843 
844     if (!usecprow) z += 11;
845   }
846 
847   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
848   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
849   ierr = PetscLogFlops(242.0*a->nz - 11.0*a->nonzerorowcnt);CHKERRQ(ierr);
850   PetscFunctionReturn(0);
851 }
852 
853 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
854 /* Default MatMult for block size 15 */
MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)855 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
856 {
857   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
858   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
859   const PetscScalar *x,*xb;
860   PetscScalar       *zarray,xv;
861   const MatScalar   *v;
862   PetscErrorCode    ierr;
863   const PetscInt    *ii,*ij=a->j,*idx;
864   PetscInt          mbs,i,j,k,n,*ridx=NULL;
865   PetscBool         usecprow=a->compressedrow.use;
866 
867   PetscFunctionBegin;
868   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
869   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
870 
871   v = a->a;
872   if (usecprow) {
873     mbs  = a->compressedrow.nrows;
874     ii   = a->compressedrow.i;
875     ridx = a->compressedrow.rindex;
876     ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr);
877   } else {
878     mbs = a->mbs;
879     ii  = a->i;
880     z   = zarray;
881   }
882 
883   for (i=0; i<mbs; i++) {
884     n    = ii[i+1] - ii[i];
885     idx  = ij + ii[i];
886     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
887     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
888 
889     for (j=0; j<n; j++) {
890       xb = x + 15*(idx[j]);
891 
892       for (k=0; k<15; k++) {
893         xv     =  xb[k];
894         sum1  += v[0]*xv;
895         sum2  += v[1]*xv;
896         sum3  += v[2]*xv;
897         sum4  += v[3]*xv;
898         sum5  += v[4]*xv;
899         sum6  += v[5]*xv;
900         sum7  += v[6]*xv;
901         sum8  += v[7]*xv;
902         sum9  += v[8]*xv;
903         sum10 += v[9]*xv;
904         sum11 += v[10]*xv;
905         sum12 += v[11]*xv;
906         sum13 += v[12]*xv;
907         sum14 += v[13]*xv;
908         sum15 += v[14]*xv;
909         v     += 15;
910       }
911     }
912     if (usecprow) z = zarray + 15*ridx[i];
913     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
914     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
915 
916     if (!usecprow) z += 15;
917   }
918 
919   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
920   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
921   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
922   PetscFunctionReturn(0);
923 }
924 
925 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)926 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
927 {
928   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
929   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
930   const PetscScalar *x,*xb;
931   PetscScalar       x1,x2,x3,x4,*zarray;
932   const MatScalar   *v;
933   PetscErrorCode    ierr;
934   const PetscInt    *ii,*ij=a->j,*idx;
935   PetscInt          mbs,i,j,n,*ridx=NULL;
936   PetscBool         usecprow=a->compressedrow.use;
937 
938   PetscFunctionBegin;
939   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
940   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
941 
942   v = a->a;
943   if (usecprow) {
944     mbs  = a->compressedrow.nrows;
945     ii   = a->compressedrow.i;
946     ridx = a->compressedrow.rindex;
947     ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr);
948   } else {
949     mbs = a->mbs;
950     ii  = a->i;
951     z   = zarray;
952   }
953 
954   for (i=0; i<mbs; i++) {
955     n    = ii[i+1] - ii[i];
956     idx  = ij + ii[i];
957     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
958     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
959 
960     for (j=0; j<n; j++) {
961       xb = x + 15*(idx[j]);
962       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
963 
964       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
965       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
966       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
967       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
968       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
969       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
970       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
971       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
972       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
973       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
974       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
975       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
976       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
977       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
978       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
979 
980       v += 60;
981 
982       x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
983 
984       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
985       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
986       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
987       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
988       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
989       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
990       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
991       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
992       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
993       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
994       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
995       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
996       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
997       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
998       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
999       v     += 60;
1000 
1001       x1     = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1002       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3   + v[45]*x4;
1003       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3   + v[46]*x4;
1004       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3  + v[47]*x4;
1005       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4;
1006       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3   + v[49]*x4;
1007       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3   + v[50]*x4;
1008       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4;
1009       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3  + v[52]*x4;
1010       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3   + v[53]*x4;
1011       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3   + v[54]*x4;
1012       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4;
1013       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4;
1014       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3   + v[57]*x4;
1015       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3   + v[58]*x4;
1016       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4;
1017       v     += 60;
1018 
1019       x1     = xb[12]; x2 = xb[13]; x3 = xb[14];
1020       sum1  += v[0]*x1 + v[15]*x2 + v[30]*x3;
1021       sum2  += v[1]*x1 + v[16]*x2 + v[31]*x3;
1022       sum3  += v[2]*x1 + v[17]*x2 + v[32]*x3;
1023       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3;
1024       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3;
1025       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3;
1026       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3;
1027       sum8  += v[7]*x1 + v[22]*x2 + v[37]*x3;
1028       sum9  += v[8]*x1 + v[23]*x2 + v[38]*x3;
1029       sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
1030       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
1031       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
1032       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
1033       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
1034       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
1035       v     += 45;
1036     }
1037     if (usecprow) z = zarray + 15*ridx[i];
1038     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1039     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1040 
1041     if (!usecprow) z += 15;
1042   }
1043 
1044   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1045   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1046   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1047   PetscFunctionReturn(0);
1048 }
1049 
1050 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)1051 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
1052 {
1053   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1054   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1055   const PetscScalar *x,*xb;
1056   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
1057   const MatScalar   *v;
1058   PetscErrorCode    ierr;
1059   const PetscInt    *ii,*ij=a->j,*idx;
1060   PetscInt          mbs,i,j,n,*ridx=NULL;
1061   PetscBool         usecprow=a->compressedrow.use;
1062 
1063   PetscFunctionBegin;
1064   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1065   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1066 
1067   v = a->a;
1068   if (usecprow) {
1069     mbs  = a->compressedrow.nrows;
1070     ii   = a->compressedrow.i;
1071     ridx = a->compressedrow.rindex;
1072     ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr);
1073   } else {
1074     mbs = a->mbs;
1075     ii  = a->i;
1076     z   = zarray;
1077   }
1078 
1079   for (i=0; i<mbs; i++) {
1080     n    = ii[i+1] - ii[i];
1081     idx  = ij + ii[i];
1082     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1083     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1084 
1085     for (j=0; j<n; j++) {
1086       xb = x + 15*(idx[j]);
1087       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1088       x8 = xb[7];
1089 
1090       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
1091       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
1092       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
1093       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
1094       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
1095       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
1096       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
1097       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
1098       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
1099       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
1100       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
1101       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
1102       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
1103       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
1104       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
1105       v     += 120;
1106 
1107       x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
1108 
1109       sum1  += v[0]*x1 + v[15]*x2  + v[30]*x3  + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
1110       sum2  += v[1]*x1 + v[16]*x2  + v[31]*x3  + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
1111       sum3  += v[2]*x1 + v[17]*x2  + v[32]*x3  + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
1112       sum4  += v[3]*x1 + v[18]*x2 + v[33]*x3  + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
1113       sum5  += v[4]*x1 + v[19]*x2 + v[34]*x3  + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
1114       sum6  += v[5]*x1 + v[20]*x2 + v[35]*x3  + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
1115       sum7  += v[6]*x1 + v[21]*x2 + v[36]*x3  + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
1116       sum8  += v[7]*x1 + v[22]*x2  + v[37]*x3  + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
1117       sum9  += v[8]*x1 + v[23]*x2  + v[38]*x3  + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
1118       sum10 += v[9]*x1 + v[24]*x2  + v[39]*x3  + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
1119       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3  + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
1120       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3  + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
1121       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3  + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
1122       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3  + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
1123       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3  + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
1124       v     += 105;
1125     }
1126     if (usecprow) z = zarray + 15*ridx[i];
1127     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1128     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1129 
1130     if (!usecprow) z += 15;
1131   }
1132 
1133   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1134   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1135   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1136   PetscFunctionReturn(0);
1137 }
1138 
1139 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)1140 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
1141 {
1142   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1143   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1144   const PetscScalar *x,*xb;
1145   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
1146   const MatScalar   *v;
1147   PetscErrorCode    ierr;
1148   const PetscInt    *ii,*ij=a->j,*idx;
1149   PetscInt          mbs,i,j,n,*ridx=NULL;
1150   PetscBool         usecprow=a->compressedrow.use;
1151 
1152   PetscFunctionBegin;
1153   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1154   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1155 
1156   v = a->a;
1157   if (usecprow) {
1158     mbs  = a->compressedrow.nrows;
1159     ii   = a->compressedrow.i;
1160     ridx = a->compressedrow.rindex;
1161     ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr);
1162   } else {
1163     mbs = a->mbs;
1164     ii  = a->i;
1165     z   = zarray;
1166   }
1167 
1168   for (i=0; i<mbs; i++) {
1169     n    = ii[i+1] - ii[i];
1170     idx  = ij + ii[i];
1171     sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1172     sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1173 
1174     for (j=0; j<n; j++) {
1175       xb = x + 15*(idx[j]);
1176       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1177       x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
1178 
1179       sum1  +=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
1180       sum2  +=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
1181       sum3  +=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
1182       sum4  +=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
1183       sum5  += v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
1184       sum6  += v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
1185       sum7  += v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
1186       sum8  += v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
1187       sum9  += v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
1188       sum10 += v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
1189       sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
1190       sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
1191       sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
1192       sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
1193       sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
1194       v     += 225;
1195     }
1196     if (usecprow) z = zarray + 15*ridx[i];
1197     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1198     z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1199 
1200     if (!usecprow) z += 15;
1201   }
1202 
1203   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1204   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1205   ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1206   PetscFunctionReturn(0);
1207 }
1208 
1209 /*
1210     This will not work with MatScalar == float because it calls the BLAS
1211 */
MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)1212 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1213 {
1214   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1215   PetscScalar       *z = NULL,*work,*workt,*zarray;
1216   const PetscScalar *x,*xb;
1217   const MatScalar   *v;
1218   PetscErrorCode    ierr;
1219   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1220   const PetscInt    *idx,*ii,*ridx=NULL;
1221   PetscInt          ncols,k;
1222   PetscBool         usecprow=a->compressedrow.use;
1223 
1224   PetscFunctionBegin;
1225   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1226   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1227 
1228   idx = a->j;
1229   v   = a->a;
1230   if (usecprow) {
1231     mbs  = a->compressedrow.nrows;
1232     ii   = a->compressedrow.i;
1233     ridx = a->compressedrow.rindex;
1234     ierr = PetscArrayzero(zarray,bs*a->mbs);CHKERRQ(ierr);
1235   } else {
1236     mbs = a->mbs;
1237     ii  = a->i;
1238     z   = zarray;
1239   }
1240 
1241   if (!a->mult_work) {
1242     k    = PetscMax(A->rmap->n,A->cmap->n);
1243     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1244   }
1245   work = a->mult_work;
1246   for (i=0; i<mbs; i++) {
1247     n           = ii[1] - ii[0]; ii++;
1248     ncols       = n*bs;
1249     workt       = work;
1250     for (j=0; j<n; j++) {
1251       xb = x + bs*(*idx++);
1252       for (k=0; k<bs; k++) workt[k] = xb[k];
1253       workt += bs;
1254     }
1255     if (usecprow) z = zarray + bs*ridx[i];
1256     PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1257     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
1258     v += n*bs2;
1259     if (!usecprow) z += bs;
1260   }
1261   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1262   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1263   ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr);
1264   PetscFunctionReturn(0);
1265 }
1266 
MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)1267 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1268 {
1269   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1270   const PetscScalar *x;
1271   PetscScalar       *y,*z,sum;
1272   const MatScalar   *v;
1273   PetscErrorCode    ierr;
1274   PetscInt          mbs=a->mbs,i,n,*ridx=NULL;
1275   const PetscInt    *idx,*ii;
1276   PetscBool         usecprow=a->compressedrow.use;
1277 
1278   PetscFunctionBegin;
1279   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1280   ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
1281 
1282   idx = a->j;
1283   v   = a->a;
1284   if (usecprow) {
1285     if (zz != yy) {
1286       ierr = PetscArraycpy(z,y,mbs);CHKERRQ(ierr);
1287     }
1288     mbs  = a->compressedrow.nrows;
1289     ii   = a->compressedrow.i;
1290     ridx = a->compressedrow.rindex;
1291   } else {
1292     ii = a->i;
1293   }
1294 
1295   for (i=0; i<mbs; i++) {
1296     n = ii[1] - ii[0];
1297     ii++;
1298     if (!usecprow) {
1299       sum         = y[i];
1300     } else {
1301       sum = y[ridx[i]];
1302     }
1303     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1304     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Entries for the next row */
1305     PetscSparseDensePlusDot(sum,x,v,idx,n);
1306     v   += n;
1307     idx += n;
1308     if (usecprow) {
1309       z[ridx[i]] = sum;
1310     } else {
1311       z[i] = sum;
1312     }
1313   }
1314   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1315   ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
1316   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1317   PetscFunctionReturn(0);
1318 }
1319 
MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)1320 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1321 {
1322   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1323   PetscScalar       *y = NULL,*z = NULL,sum1,sum2;
1324   const PetscScalar *x,*xb;
1325   PetscScalar       x1,x2,*yarray,*zarray;
1326   const MatScalar   *v;
1327   PetscErrorCode    ierr;
1328   PetscInt          mbs = a->mbs,i,n,j;
1329   const PetscInt    *idx,*ii,*ridx = NULL;
1330   PetscBool         usecprow = a->compressedrow.use;
1331 
1332   PetscFunctionBegin;
1333   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1334   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1335 
1336   idx = a->j;
1337   v   = a->a;
1338   if (usecprow) {
1339     if (zz != yy) {
1340       ierr = PetscArraycpy(zarray,yarray,2*mbs);CHKERRQ(ierr);
1341     }
1342     mbs  = a->compressedrow.nrows;
1343     ii   = a->compressedrow.i;
1344     ridx = a->compressedrow.rindex;
1345   } else {
1346     ii = a->i;
1347     y  = yarray;
1348     z  = zarray;
1349   }
1350 
1351   for (i=0; i<mbs; i++) {
1352     n = ii[1] - ii[0]; ii++;
1353     if (usecprow) {
1354       z = zarray + 2*ridx[i];
1355       y = yarray + 2*ridx[i];
1356     }
1357     sum1 = y[0]; sum2 = y[1];
1358     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1359     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1360     for (j=0; j<n; j++) {
1361       xb = x + 2*(*idx++);
1362       x1 = xb[0];
1363       x2 = xb[1];
1364 
1365       sum1 += v[0]*x1 + v[2]*x2;
1366       sum2 += v[1]*x1 + v[3]*x2;
1367       v    += 4;
1368     }
1369     z[0] = sum1; z[1] = sum2;
1370     if (!usecprow) {
1371       z += 2; y += 2;
1372     }
1373   }
1374   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1375   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1376   ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr);
1377   PetscFunctionReturn(0);
1378 }
1379 
MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)1380 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1381 {
1382   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1383   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1384   const PetscScalar *x,*xb;
1385   const MatScalar   *v;
1386   PetscErrorCode    ierr;
1387   PetscInt          mbs = a->mbs,i,j,n;
1388   const PetscInt    *idx,*ii,*ridx = NULL;
1389   PetscBool         usecprow = a->compressedrow.use;
1390 
1391   PetscFunctionBegin;
1392   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1393   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1394 
1395   idx = a->j;
1396   v   = a->a;
1397   if (usecprow) {
1398     if (zz != yy) {
1399       ierr = PetscArraycpy(zarray,yarray,3*mbs);CHKERRQ(ierr);
1400     }
1401     mbs  = a->compressedrow.nrows;
1402     ii   = a->compressedrow.i;
1403     ridx = a->compressedrow.rindex;
1404   } else {
1405     ii = a->i;
1406     y  = yarray;
1407     z  = zarray;
1408   }
1409 
1410   for (i=0; i<mbs; i++) {
1411     n = ii[1] - ii[0]; ii++;
1412     if (usecprow) {
1413       z = zarray + 3*ridx[i];
1414       y = yarray + 3*ridx[i];
1415     }
1416     sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1417     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
1418     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1419     for (j=0; j<n; j++) {
1420       xb    = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1421       sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1422       sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1423       sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1424       v    += 9;
1425     }
1426     z[0] = sum1; z[1] = sum2; z[2] = sum3;
1427     if (!usecprow) {
1428       z += 3; y += 3;
1429     }
1430   }
1431   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1432   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1433   ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr);
1434   PetscFunctionReturn(0);
1435 }
1436 
MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)1437 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1438 {
1439   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1440   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1441   const PetscScalar *x,*xb;
1442   const MatScalar   *v;
1443   PetscErrorCode    ierr;
1444   PetscInt          mbs = a->mbs,i,j,n;
1445   const PetscInt    *idx,*ii,*ridx=NULL;
1446   PetscBool         usecprow=a->compressedrow.use;
1447 
1448   PetscFunctionBegin;
1449   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1450   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1451 
1452   idx = a->j;
1453   v   = a->a;
1454   if (usecprow) {
1455     if (zz != yy) {
1456       ierr = PetscArraycpy(zarray,yarray,4*mbs);CHKERRQ(ierr);
1457     }
1458     mbs  = a->compressedrow.nrows;
1459     ii   = a->compressedrow.i;
1460     ridx = a->compressedrow.rindex;
1461   } else {
1462     ii = a->i;
1463     y  = yarray;
1464     z  = zarray;
1465   }
1466 
1467   for (i=0; i<mbs; i++) {
1468     n = ii[1] - ii[0]; ii++;
1469     if (usecprow) {
1470       z = zarray + 4*ridx[i];
1471       y = yarray + 4*ridx[i];
1472     }
1473     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1474     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1475     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1476     for (j=0; j<n; j++) {
1477       xb    = x + 4*(*idx++);
1478       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1479       sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1480       sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1481       sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1482       sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1483       v    += 16;
1484     }
1485     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1486     if (!usecprow) {
1487       z += 4; y += 4;
1488     }
1489   }
1490   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1491   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1492   ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr);
1493   PetscFunctionReturn(0);
1494 }
1495 
MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)1496 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1497 {
1498   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1499   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1500   const PetscScalar *x,*xb;
1501   PetscScalar       *yarray,*zarray;
1502   const MatScalar   *v;
1503   PetscErrorCode    ierr;
1504   PetscInt          mbs = a->mbs,i,j,n;
1505   const PetscInt    *idx,*ii,*ridx = NULL;
1506   PetscBool         usecprow=a->compressedrow.use;
1507 
1508   PetscFunctionBegin;
1509   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1510   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1511 
1512   idx = a->j;
1513   v   = a->a;
1514   if (usecprow) {
1515     if (zz != yy) {
1516       ierr = PetscArraycpy(zarray,yarray,5*mbs);CHKERRQ(ierr);
1517     }
1518     mbs  = a->compressedrow.nrows;
1519     ii   = a->compressedrow.i;
1520     ridx = a->compressedrow.rindex;
1521   } else {
1522     ii = a->i;
1523     y  = yarray;
1524     z  = zarray;
1525   }
1526 
1527   for (i=0; i<mbs; i++) {
1528     n = ii[1] - ii[0]; ii++;
1529     if (usecprow) {
1530       z = zarray + 5*ridx[i];
1531       y = yarray + 5*ridx[i];
1532     }
1533     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1534     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1535     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1536     for (j=0; j<n; j++) {
1537       xb    = x + 5*(*idx++);
1538       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1539       sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1540       sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1541       sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1542       sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1543       sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1544       v    += 25;
1545     }
1546     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1547     if (!usecprow) {
1548       z += 5; y += 5;
1549     }
1550   }
1551   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1552   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1553   ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr);
1554   PetscFunctionReturn(0);
1555 }
1556 
MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)1557 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1558 {
1559   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1560   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6;
1561   const PetscScalar *x,*xb;
1562   PetscScalar       x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1563   const MatScalar   *v;
1564   PetscErrorCode    ierr;
1565   PetscInt          mbs = a->mbs,i,j,n;
1566   const PetscInt    *idx,*ii,*ridx=NULL;
1567   PetscBool         usecprow=a->compressedrow.use;
1568 
1569   PetscFunctionBegin;
1570   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1571   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1572 
1573   idx = a->j;
1574   v   = a->a;
1575   if (usecprow) {
1576     if (zz != yy) {
1577       ierr = PetscArraycpy(zarray,yarray,6*mbs);CHKERRQ(ierr);
1578     }
1579     mbs  = a->compressedrow.nrows;
1580     ii   = a->compressedrow.i;
1581     ridx = a->compressedrow.rindex;
1582   } else {
1583     ii = a->i;
1584     y  = yarray;
1585     z  = zarray;
1586   }
1587 
1588   for (i=0; i<mbs; i++) {
1589     n = ii[1] - ii[0]; ii++;
1590     if (usecprow) {
1591       z = zarray + 6*ridx[i];
1592       y = yarray + 6*ridx[i];
1593     }
1594     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1595     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1596     PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1597     for (j=0; j<n; j++) {
1598       xb    = x + 6*(*idx++);
1599       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1600       sum1 += v[0]*x1 + v[6]*x2  + v[12]*x3  + v[18]*x4 + v[24]*x5 + v[30]*x6;
1601       sum2 += v[1]*x1 + v[7]*x2  + v[13]*x3  + v[19]*x4 + v[25]*x5 + v[31]*x6;
1602       sum3 += v[2]*x1 + v[8]*x2  + v[14]*x3  + v[20]*x4 + v[26]*x5 + v[32]*x6;
1603       sum4 += v[3]*x1 + v[9]*x2  + v[15]*x3  + v[21]*x4 + v[27]*x5 + v[33]*x6;
1604       sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3  + v[22]*x4 + v[28]*x5 + v[34]*x6;
1605       sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3  + v[23]*x4 + v[29]*x5 + v[35]*x6;
1606       v    += 36;
1607     }
1608     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1609     if (!usecprow) {
1610       z += 6; y += 6;
1611     }
1612   }
1613   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1614   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1615   ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr);
1616   PetscFunctionReturn(0);
1617 }
1618 
MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)1619 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1620 {
1621   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1622   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1623   const PetscScalar *x,*xb;
1624   PetscScalar       x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1625   const MatScalar   *v;
1626   PetscErrorCode    ierr;
1627   PetscInt          mbs = a->mbs,i,j,n;
1628   const PetscInt    *idx,*ii,*ridx = NULL;
1629   PetscBool         usecprow=a->compressedrow.use;
1630 
1631   PetscFunctionBegin;
1632   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1633   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1634 
1635   idx = a->j;
1636   v   = a->a;
1637   if (usecprow) {
1638     if (zz != yy) {
1639       ierr = PetscArraycpy(zarray,yarray,7*mbs);CHKERRQ(ierr);
1640     }
1641     mbs  = a->compressedrow.nrows;
1642     ii   = a->compressedrow.i;
1643     ridx = a->compressedrow.rindex;
1644   } else {
1645     ii = a->i;
1646     y  = yarray;
1647     z  = zarray;
1648   }
1649 
1650   for (i=0; i<mbs; i++) {
1651     n = ii[1] - ii[0]; ii++;
1652     if (usecprow) {
1653       z = zarray + 7*ridx[i];
1654       y = yarray + 7*ridx[i];
1655     }
1656     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1657     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1658     PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1659     for (j=0; j<n; j++) {
1660       xb    = x + 7*(*idx++);
1661       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1662       sum1 += v[0]*x1 + v[7]*x2  + v[14]*x3  + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1663       sum2 += v[1]*x1 + v[8]*x2  + v[15]*x3  + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1664       sum3 += v[2]*x1 + v[9]*x2  + v[16]*x3  + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1665       sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3  + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1666       sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3  + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1667       sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3  + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1668       sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3  + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1669       v    += 49;
1670     }
1671     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1672     if (!usecprow) {
1673       z += 7; y += 7;
1674     }
1675   }
1676   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1677   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1678   ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr);
1679   PetscFunctionReturn(0);
1680 }
1681 
1682 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
MatMultAdd_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec yy,Vec zz)1683 PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec yy,Vec zz)
1684 {
1685   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
1686   PetscScalar    *z = NULL,*work,*workt,*zarray;
1687   const PetscScalar *x,*xb;
1688   const MatScalar   *v;
1689   PetscErrorCode ierr;
1690   PetscInt       mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1691   PetscInt       k;
1692   PetscBool      usecprow=a->compressedrow.use;
1693   const PetscInt *idx,*ii,*ridx=NULL;
1694 
1695   __m256d a0,a1,a2,a3,a4,a5;
1696   __m256d w0,w1,w2,w3;
1697   __m256d z0,z1,z2;
1698   __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);
1699 
1700   PetscFunctionBegin;
1701   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1702   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1703   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1704 
1705   idx = a->j;
1706   v   = a->a;
1707   if (usecprow) {
1708     mbs  = a->compressedrow.nrows;
1709     ii   = a->compressedrow.i;
1710     ridx = a->compressedrow.rindex;
1711   } else {
1712     mbs = a->mbs;
1713     ii  = a->i;
1714     z   = zarray;
1715   }
1716 
1717   if (!a->mult_work) {
1718     k    = PetscMax(A->rmap->n,A->cmap->n);
1719     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1720   }
1721 
1722   work = a->mult_work;
1723   for (i=0; i<mbs; i++) {
1724     n           = ii[1] - ii[0]; ii++;
1725     workt       = work;
1726     for (j=0; j<n; j++) {
1727       xb = x + bs*(*idx++);
1728       for (k=0; k<bs; k++) workt[k] = xb[k];
1729       workt += bs;
1730     }
1731     if (usecprow) z = zarray + bs*ridx[i];
1732 
1733     z0 = _mm256_loadu_pd(&z[ 0]); z1 = _mm256_loadu_pd(&z[ 4]); z2 = _mm256_set1_pd(z[ 8]);
1734 
1735     for (j=0; j<n; j++) {
1736       /* first column of a */
1737       w0 = _mm256_set1_pd(work[j*9  ]);
1738       a0 = _mm256_loadu_pd(&v[j*81  ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
1739       a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
1740       a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);
1741 
1742       /* second column of a */
1743       w1 = _mm256_set1_pd(work[j*9+ 1]);
1744       a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
1745       a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
1746       a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);
1747 
1748       /* third column of a */
1749       w2 = _mm256_set1_pd(work[j*9+ 2]);
1750       a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
1751       a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
1752       a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);
1753 
1754       /* fourth column of a */
1755       w3 = _mm256_set1_pd(work[j*9+ 3]);
1756       a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
1757       a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
1758       a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);
1759 
1760       /* fifth column of a */
1761       w0 = _mm256_set1_pd(work[j*9+ 4]);
1762       a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
1763       a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
1764       a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);
1765 
1766       /* sixth column of a */
1767       w1 = _mm256_set1_pd(work[j*9+ 5]);
1768       a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
1769       a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
1770       a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);
1771 
1772       /* seventh column of a */
1773       w2 = _mm256_set1_pd(work[j*9+ 6]);
1774       a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
1775       a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
1776       a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);
1777 
1778       /* eigth column of a */
1779       w3 = _mm256_set1_pd(work[j*9+ 7]);
1780       a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
1781       a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
1782       a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);
1783 
1784       /* ninth column of a */
1785       w0 = _mm256_set1_pd(work[j*9+ 8]);
1786       a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
1787       a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
1788       a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
1789     }
1790 
1791     _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);
1792 
1793     v += n*bs2;
1794     if (!usecprow) z += bs;
1795   }
1796   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1797   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1798   ierr = PetscLogFlops(162.0*a->nz);CHKERRQ(ierr);
1799   PetscFunctionReturn(0);
1800 }
1801 #endif
1802 
MatMultAdd_SeqBAIJ_11(Mat A,Vec xx,Vec yy,Vec zz)1803 PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A,Vec xx,Vec yy,Vec zz)
1804 {
1805   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1806   PetscScalar       *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
1807   const PetscScalar *x,*xb;
1808   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,*yarray,*zarray;
1809   const MatScalar   *v;
1810   PetscErrorCode    ierr;
1811   PetscInt          mbs = a->mbs,i,j,n;
1812   const PetscInt    *idx,*ii,*ridx = NULL;
1813   PetscBool         usecprow=a->compressedrow.use;
1814 
1815   PetscFunctionBegin;
1816   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1817   ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1818 
1819   idx = a->j;
1820   v   = a->a;
1821   if (usecprow) {
1822     if (zz != yy) {
1823       ierr = PetscArraycpy(zarray,yarray,7*mbs);CHKERRQ(ierr);
1824     }
1825     mbs  = a->compressedrow.nrows;
1826     ii   = a->compressedrow.i;
1827     ridx = a->compressedrow.rindex;
1828   } else {
1829     ii = a->i;
1830     y  = yarray;
1831     z  = zarray;
1832   }
1833 
1834   for (i=0; i<mbs; i++) {
1835     n = ii[1] - ii[0]; ii++;
1836     if (usecprow) {
1837       z = zarray + 11*ridx[i];
1838       y = yarray + 11*ridx[i];
1839     }
1840     sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1841     sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10];
1842     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);     /* Indices for the next row (assumes same size as this one) */
1843     PetscPrefetchBlock(v+121*n,121*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1844     for (j=0; j<n; j++) {
1845       xb    = x + 11*(*idx++);
1846       x1    = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10];
1847       sum1 += v[  0]*x1 + v[  11]*x2  + v[2*11]*x3  + v[3*11]*x4 + v[4*11]*x5 + v[5*11]*x6 + v[6*11]*x7+ v[7*11]*x8 + v[8*11]*x9  + v[9*11]*x10  + v[10*11]*x11;
1848       sum2 += v[1+0]*x1 + v[1+11]*x2  + v[1+2*11]*x3  + v[1+3*11]*x4 + v[1+4*11]*x5 + v[1+5*11]*x6 + v[1+6*11]*x7+ v[1+7*11]*x8 + v[1+8*11]*x9  + v[1+9*11]*x10  + v[1+10*11]*x11;
1849       sum3 += v[2+0]*x1 + v[2+11]*x2  + v[2+2*11]*x3  + v[2+3*11]*x4 + v[2+4*11]*x5 + v[2+5*11]*x6 + v[2+6*11]*x7+ v[2+7*11]*x8 + v[2+8*11]*x9  + v[2+9*11]*x10  + v[2+10*11]*x11;
1850       sum4 += v[3+0]*x1 + v[3+11]*x2  + v[3+2*11]*x3  + v[3+3*11]*x4 + v[3+4*11]*x5 + v[3+5*11]*x6 + v[3+6*11]*x7+ v[3+7*11]*x8 + v[3+8*11]*x9  + v[3+9*11]*x10  + v[3+10*11]*x11;
1851       sum5 += v[4+0]*x1 + v[4+11]*x2  + v[4+2*11]*x3  + v[4+3*11]*x4 + v[4+4*11]*x5 + v[4+5*11]*x6 + v[4+6*11]*x7+ v[4+7*11]*x8 + v[4+8*11]*x9  + v[4+9*11]*x10  + v[4+10*11]*x11;
1852       sum6 += v[5+0]*x1 + v[5+11]*x2  + v[5+2*11]*x3  + v[5+3*11]*x4 + v[5+4*11]*x5 + v[5+5*11]*x6 + v[5+6*11]*x7+ v[5+7*11]*x8 + v[5+8*11]*x9  + v[5+9*11]*x10  + v[5+10*11]*x11;
1853       sum7 += v[6+0]*x1 + v[6+11]*x2  + v[6+2*11]*x3  + v[6+3*11]*x4 + v[6+4*11]*x5 + v[6+5*11]*x6 + v[6+6*11]*x7+ v[6+7*11]*x8 + v[6+8*11]*x9  + v[6+9*11]*x10  + v[6+10*11]*x11;
1854       sum8 += v[7+0]*x1 + v[7+11]*x2  + v[7+2*11]*x3  + v[7+3*11]*x4 + v[7+4*11]*x5 + v[7+5*11]*x6 + v[7+6*11]*x7+ v[7+7*11]*x8 + v[7+8*11]*x9  + v[7+9*11]*x10  + v[7+10*11]*x11;
1855       sum9 += v[8+0]*x1 + v[8+11]*x2  + v[8+2*11]*x3  + v[8+3*11]*x4 + v[8+4*11]*x5 + v[8+5*11]*x6 + v[8+6*11]*x7+ v[8+7*11]*x8 + v[8+8*11]*x9  + v[8+9*11]*x10  + v[8+10*11]*x11;
1856       sum10 += v[9+0]*x1 + v[9+11]*x2  + v[9+2*11]*x3  + v[9+3*11]*x4 + v[9+4*11]*x5 + v[9+5*11]*x6 + v[9+6*11]*x7+ v[9+7*11]*x8 + v[9+8*11]*x9  + v[9+9*11]*x10  + v[9+10*11]*x11;
1857       sum11 += v[10+0]*x1 + v[10+11]*x2  + v[10+2*11]*x3  + v[10+3*11]*x4 + v[10+4*11]*x5 + v[10+5*11]*x6 + v[10+6*11]*x7+ v[10+7*11]*x8 + v[10+8*11]*x9  + v[10+9*11]*x10  + v[10+10*11]*x11;
1858       v    += 121;
1859     }
1860     z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1861     z[7] = sum6; z[8] = sum7; z[9] = sum8; z[10] = sum9; z[11] = sum10;
1862     if (!usecprow) {
1863       z += 11; y += 11;
1864     }
1865   }
1866   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1867   ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1868   ierr = PetscLogFlops(242.0*a->nz);CHKERRQ(ierr);
1869   PetscFunctionReturn(0);
1870 }
1871 
MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)1872 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1873 {
1874   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1875   PetscScalar       *z = NULL,*work,*workt,*zarray;
1876   const PetscScalar *x,*xb;
1877   const MatScalar   *v;
1878   PetscErrorCode    ierr;
1879   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1880   PetscInt          ncols,k;
1881   const PetscInt    *ridx = NULL,*idx,*ii;
1882   PetscBool         usecprow = a->compressedrow.use;
1883 
1884   PetscFunctionBegin;
1885   ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1886   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1887   ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1888 
1889   idx = a->j;
1890   v   = a->a;
1891   if (usecprow) {
1892     mbs  = a->compressedrow.nrows;
1893     ii   = a->compressedrow.i;
1894     ridx = a->compressedrow.rindex;
1895   } else {
1896     mbs = a->mbs;
1897     ii  = a->i;
1898     z   = zarray;
1899   }
1900 
1901   if (!a->mult_work) {
1902     k    = PetscMax(A->rmap->n,A->cmap->n);
1903     ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1904   }
1905   work = a->mult_work;
1906   for (i=0; i<mbs; i++) {
1907     n     = ii[1] - ii[0]; ii++;
1908     ncols = n*bs;
1909     workt = work;
1910     for (j=0; j<n; j++) {
1911       xb = x + bs*(*idx++);
1912       for (k=0; k<bs; k++) workt[k] = xb[k];
1913       workt += bs;
1914     }
1915     if (usecprow) z = zarray + bs*ridx[i];
1916     PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1917     /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1918     v += n*bs2;
1919     if (!usecprow) z += bs;
1920   }
1921   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1922   ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1923   ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr);
1924   PetscFunctionReturn(0);
1925 }
1926 
MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)1927 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1928 {
1929   PetscScalar    zero = 0.0;
1930   PetscErrorCode ierr;
1931 
1932   PetscFunctionBegin;
1933   ierr = VecSet(zz,zero);CHKERRQ(ierr);
1934   ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
1935   PetscFunctionReturn(0);
1936 }
1937 
MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)1938 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1939 {
1940   PetscScalar    zero = 0.0;
1941   PetscErrorCode ierr;
1942 
1943   PetscFunctionBegin;
1944   ierr = VecSet(zz,zero);CHKERRQ(ierr);
1945   ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
1946   PetscFunctionReturn(0);
1947 }
1948 
MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)1949 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1950 {
1951   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
1952   PetscScalar       *z,x1,x2,x3,x4,x5;
1953   const PetscScalar *x,*xb = NULL;
1954   const MatScalar   *v;
1955   PetscErrorCode    ierr;
1956   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n;
1957   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
1958   Mat_CompressedRow cprow = a->compressedrow;
1959   PetscBool         usecprow = cprow.use;
1960 
1961   PetscFunctionBegin;
1962   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
1963   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1964   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1965 
1966   idx = a->j;
1967   v   = a->a;
1968   if (usecprow) {
1969     mbs  = cprow.nrows;
1970     ii   = cprow.i;
1971     ridx = cprow.rindex;
1972   } else {
1973     mbs=a->mbs;
1974     ii = a->i;
1975     xb = x;
1976   }
1977 
1978   switch (bs) {
1979   case 1:
1980     for (i=0; i<mbs; i++) {
1981       if (usecprow) xb = x + ridx[i];
1982       x1 = xb[0];
1983       ib = idx + ii[0];
1984       n  = ii[1] - ii[0]; ii++;
1985       for (j=0; j<n; j++) {
1986         rval     = ib[j];
1987         z[rval] += PetscConj(*v) * x1;
1988         v++;
1989       }
1990       if (!usecprow) xb++;
1991     }
1992     break;
1993   case 2:
1994     for (i=0; i<mbs; i++) {
1995       if (usecprow) xb = x + 2*ridx[i];
1996       x1 = xb[0]; x2 = xb[1];
1997       ib = idx + ii[0];
1998       n  = ii[1] - ii[0]; ii++;
1999       for (j=0; j<n; j++) {
2000         rval       = ib[j]*2;
2001         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
2002         z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
2003         v         += 4;
2004       }
2005       if (!usecprow) xb += 2;
2006     }
2007     break;
2008   case 3:
2009     for (i=0; i<mbs; i++) {
2010       if (usecprow) xb = x + 3*ridx[i];
2011       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2012       ib = idx + ii[0];
2013       n  = ii[1] - ii[0]; ii++;
2014       for (j=0; j<n; j++) {
2015         rval       = ib[j]*3;
2016         z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
2017         z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
2018         z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
2019         v         += 9;
2020       }
2021       if (!usecprow) xb += 3;
2022     }
2023     break;
2024   case 4:
2025     for (i=0; i<mbs; i++) {
2026       if (usecprow) xb = x + 4*ridx[i];
2027       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2028       ib = idx + ii[0];
2029       n  = ii[1] - ii[0]; ii++;
2030       for (j=0; j<n; j++) {
2031         rval       = ib[j]*4;
2032         z[rval++] +=  PetscConj(v[0])*x1 + PetscConj(v[1])*x2  + PetscConj(v[2])*x3  + PetscConj(v[3])*x4;
2033         z[rval++] +=  PetscConj(v[4])*x1 + PetscConj(v[5])*x2  + PetscConj(v[6])*x3  + PetscConj(v[7])*x4;
2034         z[rval++] +=  PetscConj(v[8])*x1 + PetscConj(v[9])*x2  + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
2035         z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
2036         v         += 16;
2037       }
2038       if (!usecprow) xb += 4;
2039     }
2040     break;
2041   case 5:
2042     for (i=0; i<mbs; i++) {
2043       if (usecprow) xb = x + 5*ridx[i];
2044       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2045       x4 = xb[3]; x5 = xb[4];
2046       ib = idx + ii[0];
2047       n  = ii[1] - ii[0]; ii++;
2048       for (j=0; j<n; j++) {
2049         rval       = ib[j]*5;
2050         z[rval++] +=  PetscConj(v[0])*x1 +  PetscConj(v[1])*x2 +  PetscConj(v[2])*x3 +  PetscConj(v[3])*x4 +  PetscConj(v[4])*x5;
2051         z[rval++] +=  PetscConj(v[5])*x1 +  PetscConj(v[6])*x2 +  PetscConj(v[7])*x3 +  PetscConj(v[8])*x4 +  PetscConj(v[9])*x5;
2052         z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
2053         z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
2054         z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
2055         v         += 25;
2056       }
2057       if (!usecprow) xb += 5;
2058     }
2059     break;
2060   default: /* block sizes larger than 5 by 5 are handled by BLAS */
2061     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
2062 #if 0
2063     {
2064       PetscInt          ncols,k,bs2=a->bs2;
2065       PetscScalar       *work,*workt,zb;
2066       const PetscScalar *xtmp;
2067       if (!a->mult_work) {
2068         k    = PetscMax(A->rmap->n,A->cmap->n);
2069         ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
2070       }
2071       work = a->mult_work;
2072       xtmp = x;
2073       for (i=0; i<mbs; i++) {
2074         n     = ii[1] - ii[0]; ii++;
2075         ncols = n*bs;
2076         ierr  = PetscArrayzero(work,ncols);CHKERRQ(ierr);
2077         if (usecprow) xtmp = x + bs*ridx[i];
2078         PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2079         /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
2080         v += n*bs2;
2081         if (!usecprow) xtmp += bs;
2082         workt = work;
2083         for (j=0; j<n; j++) {
2084           zb = z + bs*(*idx++);
2085           for (k=0; k<bs; k++) zb[k] += workt[k] ;
2086           workt += bs;
2087         }
2088       }
2089     }
2090 #endif
2091   }
2092   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2093   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
2094   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
2095   PetscFunctionReturn(0);
2096 }
2097 
MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)2098 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2099 {
2100   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2101   PetscScalar       *zb,*z,x1,x2,x3,x4,x5;
2102   const PetscScalar *x,*xb = NULL;
2103   const MatScalar   *v;
2104   PetscErrorCode    ierr;
2105   PetscInt          mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2;
2106   const PetscInt    *idx,*ii,*ib,*ridx = NULL;
2107   Mat_CompressedRow cprow   = a->compressedrow;
2108   PetscBool         usecprow=cprow.use;
2109 
2110   PetscFunctionBegin;
2111   if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
2112   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2113   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
2114 
2115   idx = a->j;
2116   v   = a->a;
2117   if (usecprow) {
2118     mbs  = cprow.nrows;
2119     ii   = cprow.i;
2120     ridx = cprow.rindex;
2121   } else {
2122     mbs=a->mbs;
2123     ii = a->i;
2124     xb = x;
2125   }
2126 
2127   switch (bs) {
2128   case 1:
2129     for (i=0; i<mbs; i++) {
2130       if (usecprow) xb = x + ridx[i];
2131       x1 = xb[0];
2132       ib = idx + ii[0];
2133       n  = ii[1] - ii[0]; ii++;
2134       for (j=0; j<n; j++) {
2135         rval     = ib[j];
2136         z[rval] += *v * x1;
2137         v++;
2138       }
2139       if (!usecprow) xb++;
2140     }
2141     break;
2142   case 2:
2143     for (i=0; i<mbs; i++) {
2144       if (usecprow) xb = x + 2*ridx[i];
2145       x1 = xb[0]; x2 = xb[1];
2146       ib = idx + ii[0];
2147       n  = ii[1] - ii[0]; ii++;
2148       for (j=0; j<n; j++) {
2149         rval       = ib[j]*2;
2150         z[rval++] += v[0]*x1 + v[1]*x2;
2151         z[rval++] += v[2]*x1 + v[3]*x2;
2152         v         += 4;
2153       }
2154       if (!usecprow) xb += 2;
2155     }
2156     break;
2157   case 3:
2158     for (i=0; i<mbs; i++) {
2159       if (usecprow) xb = x + 3*ridx[i];
2160       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2161       ib = idx + ii[0];
2162       n  = ii[1] - ii[0]; ii++;
2163       for (j=0; j<n; j++) {
2164         rval       = ib[j]*3;
2165         z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
2166         z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
2167         z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
2168         v         += 9;
2169       }
2170       if (!usecprow) xb += 3;
2171     }
2172     break;
2173   case 4:
2174     for (i=0; i<mbs; i++) {
2175       if (usecprow) xb = x + 4*ridx[i];
2176       x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2177       ib = idx + ii[0];
2178       n  = ii[1] - ii[0]; ii++;
2179       for (j=0; j<n; j++) {
2180         rval       = ib[j]*4;
2181         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
2182         z[rval++] +=  v[4]*x1 +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
2183         z[rval++] +=  v[8]*x1 +  v[9]*x2 + v[10]*x3 + v[11]*x4;
2184         z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
2185         v         += 16;
2186       }
2187       if (!usecprow) xb += 4;
2188     }
2189     break;
2190   case 5:
2191     for (i=0; i<mbs; i++) {
2192       if (usecprow) xb = x + 5*ridx[i];
2193       x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2194       x4 = xb[3]; x5 = xb[4];
2195       ib = idx + ii[0];
2196       n  = ii[1] - ii[0]; ii++;
2197       for (j=0; j<n; j++) {
2198         rval       = ib[j]*5;
2199         z[rval++] +=  v[0]*x1 +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
2200         z[rval++] +=  v[5]*x1 +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
2201         z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
2202         z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
2203         z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
2204         v         += 25;
2205       }
2206       if (!usecprow) xb += 5;
2207     }
2208     break;
2209   default: {      /* block sizes larger then 5 by 5 are handled by BLAS */
2210     PetscInt          ncols,k;
2211     PetscScalar       *work,*workt;
2212     const PetscScalar *xtmp;
2213     if (!a->mult_work) {
2214       k    = PetscMax(A->rmap->n,A->cmap->n);
2215       ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
2216     }
2217     work = a->mult_work;
2218     xtmp = x;
2219     for (i=0; i<mbs; i++) {
2220       n     = ii[1] - ii[0]; ii++;
2221       ncols = n*bs;
2222       ierr  = PetscArrayzero(work,ncols);CHKERRQ(ierr);
2223       if (usecprow) xtmp = x + bs*ridx[i];
2224       PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2225       /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
2226       v += n*bs2;
2227       if (!usecprow) xtmp += bs;
2228       workt = work;
2229       for (j=0; j<n; j++) {
2230         zb = z + bs*(*idx++);
2231         for (k=0; k<bs; k++) zb[k] += workt[k];
2232         workt += bs;
2233       }
2234     }
2235     }
2236   }
2237   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2238   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
2239   ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
2240   PetscFunctionReturn(0);
2241 }
2242 
MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)2243 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
2244 {
2245   Mat_SeqBAIJ    *a      = (Mat_SeqBAIJ*)inA->data;
2246   PetscInt       totalnz = a->bs2*a->nz;
2247   PetscScalar    oalpha  = alpha;
2248   PetscErrorCode ierr;
2249   PetscBLASInt   one = 1,tnz;
2250 
2251   PetscFunctionBegin;
2252   ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr);
2253   PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
2254   ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
2255   PetscFunctionReturn(0);
2256 }
2257 
MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal * norm)2258 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
2259 {
2260   PetscErrorCode ierr;
2261   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ*)A->data;
2262   MatScalar      *v  = a->a;
2263   PetscReal      sum = 0.0;
2264   PetscInt       i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
2265 
2266   PetscFunctionBegin;
2267   if (type == NORM_FROBENIUS) {
2268 #if defined(PETSC_USE_REAL___FP16)
2269     PetscBLASInt one = 1,cnt = bs2*nz;
2270     *norm = BLASnrm2_(&cnt,v,&one);
2271 #else
2272     for (i=0; i<bs2*nz; i++) {
2273       sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
2274     }
2275 #endif
2276     *norm = PetscSqrtReal(sum);
2277     ierr = PetscLogFlops(2.0*bs2*nz);CHKERRQ(ierr);
2278   } else if (type == NORM_1) { /* maximum column sum */
2279     PetscReal *tmp;
2280     PetscInt  *bcol = a->j;
2281     ierr = PetscCalloc1(A->cmap->n+1,&tmp);CHKERRQ(ierr);
2282     for (i=0; i<nz; i++) {
2283       for (j=0; j<bs; j++) {
2284         k1 = bs*(*bcol) + j; /* column index */
2285         for (k=0; k<bs; k++) {
2286           tmp[k1] += PetscAbsScalar(*v); v++;
2287         }
2288       }
2289       bcol++;
2290     }
2291     *norm = 0.0;
2292     for (j=0; j<A->cmap->n; j++) {
2293       if (tmp[j] > *norm) *norm = tmp[j];
2294     }
2295     ierr = PetscFree(tmp);CHKERRQ(ierr);
2296     ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr);
2297   } else if (type == NORM_INFINITY) { /* maximum row sum */
2298     *norm = 0.0;
2299     for (k=0; k<bs; k++) {
2300       for (j=0; j<a->mbs; j++) {
2301         v   = a->a + bs2*a->i[j] + k;
2302         sum = 0.0;
2303         for (i=0; i<a->i[j+1]-a->i[j]; i++) {
2304           for (k1=0; k1<bs; k1++) {
2305             sum += PetscAbsScalar(*v);
2306             v   += bs;
2307           }
2308         }
2309         if (sum > *norm) *norm = sum;
2310       }
2311     }
2312     ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr);
2313   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
2314   PetscFunctionReturn(0);
2315 }
2316 
2317 
MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)2318 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
2319 {
2320   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
2321   PetscErrorCode ierr;
2322 
2323   PetscFunctionBegin;
2324   /* If the  matrix/block dimensions are not equal, or no of nonzeros or shift */
2325   if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
2326     *flg = PETSC_FALSE;
2327     PetscFunctionReturn(0);
2328   }
2329 
2330   /* if the a->i are the same */
2331   ierr = PetscArraycmp(a->i,b->i,a->mbs+1,flg);CHKERRQ(ierr);
2332   if (!*flg) PetscFunctionReturn(0);
2333 
2334   /* if a->j are the same */
2335   ierr = PetscArraycmp(a->j,b->j,a->nz,flg);CHKERRQ(ierr);
2336   if (!*flg) PetscFunctionReturn(0);
2337 
2338   /* if a->a are the same */
2339   ierr = PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs),flg);CHKERRQ(ierr);
2340   PetscFunctionReturn(0);
2341 
2342 }
2343 
MatGetDiagonal_SeqBAIJ(Mat A,Vec v)2344 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
2345 {
2346   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2347   PetscErrorCode ierr;
2348   PetscInt       i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
2349   PetscScalar    *x,zero = 0.0;
2350   MatScalar      *aa,*aa_j;
2351 
2352   PetscFunctionBegin;
2353   if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
2354   bs   = A->rmap->bs;
2355   aa   = a->a;
2356   ai   = a->i;
2357   aj   = a->j;
2358   ambs = a->mbs;
2359   bs2  = a->bs2;
2360 
2361   ierr = VecSet(v,zero);CHKERRQ(ierr);
2362   ierr = VecGetArray(v,&x);CHKERRQ(ierr);
2363   ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
2364   if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
2365   for (i=0; i<ambs; i++) {
2366     for (j=ai[i]; j<ai[i+1]; j++) {
2367       if (aj[j] == i) {
2368         row  = i*bs;
2369         aa_j = aa+j*bs2;
2370         for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2371         break;
2372       }
2373     }
2374   }
2375   ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
2376   PetscFunctionReturn(0);
2377 }
2378 
MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)2379 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2380 {
2381   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2382   const PetscScalar *l,*r,*li,*ri;
2383   PetscScalar       x;
2384   MatScalar         *aa, *v;
2385   PetscErrorCode    ierr;
2386   PetscInt          i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2387   const PetscInt    *ai,*aj;
2388 
2389   PetscFunctionBegin;
2390   ai  = a->i;
2391   aj  = a->j;
2392   aa  = a->a;
2393   m   = A->rmap->n;
2394   n   = A->cmap->n;
2395   bs  = A->rmap->bs;
2396   mbs = a->mbs;
2397   bs2 = a->bs2;
2398   if (ll) {
2399     ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
2400     ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
2401     if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2402     for (i=0; i<mbs; i++) { /* for each block row */
2403       M  = ai[i+1] - ai[i];
2404       li = l + i*bs;
2405       v  = aa + bs2*ai[i];
2406       for (j=0; j<M; j++) { /* for each block */
2407         for (k=0; k<bs2; k++) {
2408           (*v++) *= li[k%bs];
2409         }
2410       }
2411     }
2412     ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
2413     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2414   }
2415 
2416   if (rr) {
2417     ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr);
2418     ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
2419     if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2420     for (i=0; i<mbs; i++) { /* for each block row */
2421       iai = ai[i];
2422       M   = ai[i+1] - iai;
2423       v   = aa + bs2*iai;
2424       for (j=0; j<M; j++) { /* for each block */
2425         ri = r + bs*aj[iai+j];
2426         for (k=0; k<bs; k++) {
2427           x = ri[k];
2428           for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2429           v += bs;
2430         }
2431       }
2432     }
2433     ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr);
2434     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2435   }
2436   PetscFunctionReturn(0);
2437 }
2438 
2439 
MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo * info)2440 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2441 {
2442   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2443 
2444   PetscFunctionBegin;
2445   info->block_size   = a->bs2;
2446   info->nz_allocated = a->bs2*a->maxnz;
2447   info->nz_used      = a->bs2*a->nz;
2448   info->nz_unneeded  = info->nz_allocated - info->nz_used;
2449   info->assemblies   = A->num_ass;
2450   info->mallocs      = A->info.mallocs;
2451   info->memory       = ((PetscObject)A)->mem;
2452   if (A->factortype) {
2453     info->fill_ratio_given  = A->info.fill_ratio_given;
2454     info->fill_ratio_needed = A->info.fill_ratio_needed;
2455     info->factor_mallocs    = A->info.factor_mallocs;
2456   } else {
2457     info->fill_ratio_given  = 0;
2458     info->fill_ratio_needed = 0;
2459     info->factor_mallocs    = 0;
2460   }
2461   PetscFunctionReturn(0);
2462 }
2463 
MatZeroEntries_SeqBAIJ(Mat A)2464 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2465 {
2466   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data;
2467   PetscErrorCode ierr;
2468 
2469   PetscFunctionBegin;
2470   ierr = PetscArrayzero(a->a,a->bs2*a->i[a->mbs]);CHKERRQ(ierr);
2471   PetscFunctionReturn(0);
2472 }
2473 
MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)2474 PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)
2475 {
2476   PetscErrorCode ierr;
2477 
2478   PetscFunctionBegin;
2479   ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr);
2480   C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense;
2481   PetscFunctionReturn(0);
2482 }
2483 
MatMatMult_SeqBAIJ_1_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)2484 PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2485 {
2486   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2487   PetscScalar       *z = NULL,sum1;
2488   const PetscScalar *xb;
2489   PetscScalar       x1;
2490   const MatScalar   *v,*vv;
2491   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2492   PetscBool         usecprow=a->compressedrow.use;
2493 
2494   PetscFunctionBegin;
2495   idx = a->j;
2496   v   = a->a;
2497   if (usecprow) {
2498     mbs  = a->compressedrow.nrows;
2499     ii   = a->compressedrow.i;
2500     ridx = a->compressedrow.rindex;
2501   } else {
2502     mbs = a->mbs;
2503     ii  = a->i;
2504     z   = c;
2505   }
2506 
2507   for (i=0; i<mbs; i++) {
2508     n           = ii[1] - ii[0]; ii++;
2509     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
2510     PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2511     if (usecprow) z = c + ridx[i];
2512     jj = idx;
2513     vv = v;
2514     for (k=0; k<cn; k++) {
2515       idx = jj;
2516       v = vv;
2517       sum1    = 0.0;
2518       for (j=0; j<n; j++) {
2519         xb    = b + (*idx++); x1 = xb[0+k*bm];
2520         sum1 += v[0]*x1;
2521         v    += 1;
2522       }
2523       z[0+k*cm] = sum1;
2524     }
2525     if (!usecprow) z += 1;
2526   }
2527   PetscFunctionReturn(0);
2528 }
2529 
MatMatMult_SeqBAIJ_2_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)2530 PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2531 {
2532   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2533   PetscScalar       *z = NULL,sum1,sum2;
2534   const PetscScalar *xb;
2535   PetscScalar       x1,x2;
2536   const MatScalar   *v,*vv;
2537   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2538   PetscBool         usecprow=a->compressedrow.use;
2539 
2540   PetscFunctionBegin;
2541   idx = a->j;
2542   v   = a->a;
2543   if (usecprow) {
2544     mbs  = a->compressedrow.nrows;
2545     ii   = a->compressedrow.i;
2546     ridx = a->compressedrow.rindex;
2547   } else {
2548     mbs = a->mbs;
2549     ii  = a->i;
2550     z   = c;
2551   }
2552 
2553   for (i=0; i<mbs; i++) {
2554     n           = ii[1] - ii[0]; ii++;
2555     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
2556     PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2557     if (usecprow) z = c + 2*ridx[i];
2558     jj = idx;
2559     vv = v;
2560     for (k=0; k<cn; k++) {
2561       idx = jj;
2562       v = vv;
2563       sum1    = 0.0; sum2 = 0.0;
2564       for (j=0; j<n; j++) {
2565         xb    = b + 2*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm];
2566         sum1 += v[0]*x1 + v[2]*x2;
2567         sum2 += v[1]*x1 + v[3]*x2;
2568         v    += 4;
2569       }
2570       z[0+k*cm] = sum1; z[1+k*cm] = sum2;
2571     }
2572     if (!usecprow) z += 2;
2573   }
2574   PetscFunctionReturn(0);
2575 }
2576 
MatMatMult_SeqBAIJ_3_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)2577 PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2578 {
2579   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2580   PetscScalar       *z = NULL,sum1,sum2,sum3;
2581   const PetscScalar *xb;
2582   PetscScalar       x1,x2,x3;
2583   const MatScalar   *v,*vv;
2584   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2585   PetscBool         usecprow=a->compressedrow.use;
2586 
2587   PetscFunctionBegin;
2588   idx = a->j;
2589   v   = a->a;
2590   if (usecprow) {
2591     mbs  = a->compressedrow.nrows;
2592     ii   = a->compressedrow.i;
2593     ridx = a->compressedrow.rindex;
2594   } else {
2595     mbs = a->mbs;
2596     ii  = a->i;
2597     z   = c;
2598   }
2599 
2600   for (i=0; i<mbs; i++) {
2601     n           = ii[1] - ii[0]; ii++;
2602     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
2603     PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2604     if (usecprow) z = c + 3*ridx[i];
2605     jj = idx;
2606     vv = v;
2607     for (k=0; k<cn; k++) {
2608       idx = jj;
2609       v = vv;
2610       sum1    = 0.0; sum2 = 0.0; sum3 = 0.0;
2611       for (j=0; j<n; j++) {
2612         xb    = b + 3*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm];
2613         sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
2614         sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
2615         sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
2616         v    += 9;
2617       }
2618       z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3;
2619     }
2620     if (!usecprow) z += 3;
2621   }
2622   PetscFunctionReturn(0);
2623 }
2624 
MatMatMult_SeqBAIJ_4_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)2625 PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2626 {
2627   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2628   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4;
2629   const PetscScalar *xb;
2630   PetscScalar       x1,x2,x3,x4;
2631   const MatScalar   *v,*vv;
2632   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2633   PetscBool         usecprow=a->compressedrow.use;
2634 
2635   PetscFunctionBegin;
2636   idx = a->j;
2637   v   = a->a;
2638   if (usecprow) {
2639     mbs  = a->compressedrow.nrows;
2640     ii   = a->compressedrow.i;
2641     ridx = a->compressedrow.rindex;
2642   } else {
2643     mbs = a->mbs;
2644     ii  = a->i;
2645     z   = c;
2646   }
2647 
2648   for (i=0; i<mbs; i++) {
2649     n           = ii[1] - ii[0]; ii++;
2650     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
2651     PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2652     if (usecprow) z = c + 4*ridx[i];
2653     jj = idx;
2654     vv = v;
2655     for (k=0; k<cn; k++) {
2656       idx = jj;
2657       v = vv;
2658       sum1    = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
2659       for (j=0; j<n; j++) {
2660         xb    = b + 4*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm];
2661         sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2662         sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2663         sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2664         sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2665         v    += 16;
2666       }
2667       z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4;
2668     }
2669     if (!usecprow) z += 4;
2670   }
2671   PetscFunctionReturn(0);
2672 }
2673 
MatMatMult_SeqBAIJ_5_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)2674 PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2675 {
2676   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2677   PetscScalar       *z = NULL,sum1,sum2,sum3,sum4,sum5;
2678   const PetscScalar *xb;
2679   PetscScalar       x1,x2,x3,x4,x5;
2680   const MatScalar   *v,*vv;
2681   PetscInt          mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2682   PetscBool         usecprow=a->compressedrow.use;
2683 
2684   PetscFunctionBegin;
2685   idx = a->j;
2686   v   = a->a;
2687   if (usecprow) {
2688     mbs  = a->compressedrow.nrows;
2689     ii   = a->compressedrow.i;
2690     ridx = a->compressedrow.rindex;
2691   } else {
2692     mbs = a->mbs;
2693     ii  = a->i;
2694     z   = c;
2695   }
2696 
2697   for (i=0; i<mbs; i++) {
2698     n           = ii[1] - ii[0]; ii++;
2699     PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA);   /* Indices for the next row (assumes same size as this one) */
2700     PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2701     if (usecprow) z = c + 5*ridx[i];
2702     jj = idx;
2703     vv = v;
2704     for (k=0; k<cn; k++) {
2705       idx = jj;
2706       v = vv;
2707       sum1    = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
2708       for (j=0; j<n; j++) {
2709         xb    = b + 5*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; x5 = xb[4+k*bm];
2710         sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2711         sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2712         sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2713         sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2714         sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2715         v    += 25;
2716       }
2717       z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4; z[4+k*cm] = sum5;
2718     }
2719     if (!usecprow) z += 5;
2720   }
2721   PetscFunctionReturn(0);
2722 }
2723 
MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A,Mat B,Mat C)2724 PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A,Mat B,Mat C)
2725 {
2726   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
2727   Mat_SeqDense      *bd = (Mat_SeqDense*)B->data;
2728   Mat_SeqDense      *cd = (Mat_SeqDense*)C->data;
2729   PetscInt          cm=cd->lda,cn=B->cmap->n,bm=bd->lda;
2730   PetscInt          mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
2731   PetscBLASInt      bbs,bcn,bbm,bcm;
2732   PetscScalar       *z = NULL;
2733   PetscScalar       *c,*b;
2734   const MatScalar   *v;
2735   const PetscInt    *idx,*ii,*ridx=NULL;
2736   PetscScalar       _DZero=0.0,_DOne=1.0;
2737   PetscBool         usecprow=a->compressedrow.use;
2738   PetscErrorCode    ierr;
2739 
2740   PetscFunctionBegin;
2741   if (!cm || !cn) PetscFunctionReturn(0);
2742   if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n);
2743   if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
2744   if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
2745   b = bd->v;
2746   if (a->nonzerorowcnt != A->rmap->n) {
2747     ierr = MatZeroEntries(C);CHKERRQ(ierr);
2748   }
2749   ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr);
2750   switch (bs) {
2751   case 1:
2752     ierr = MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn);
2753     break;
2754   case 2:
2755     ierr = MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn);
2756     break;
2757   case 3:
2758     ierr = MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn);
2759     break;
2760   case 4:
2761     ierr = MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn);
2762     break;
2763   case 5:
2764     ierr = MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn);
2765     break;
2766   default: /* block sizes larger than 5 by 5 are handled by BLAS */
2767     ierr = PetscBLASIntCast(bs,&bbs);CHKERRQ(ierr);
2768     ierr = PetscBLASIntCast(cn,&bcn);CHKERRQ(ierr);
2769     ierr = PetscBLASIntCast(bm,&bbm);CHKERRQ(ierr);
2770     ierr = PetscBLASIntCast(cm,&bcm);CHKERRQ(ierr);
2771     idx = a->j;
2772     v   = a->a;
2773     if (usecprow) {
2774       mbs  = a->compressedrow.nrows;
2775       ii   = a->compressedrow.i;
2776       ridx = a->compressedrow.rindex;
2777     } else {
2778       mbs = a->mbs;
2779       ii  = a->i;
2780       z   = c;
2781     }
2782     for (i=0; i<mbs; i++) {
2783       n = ii[1] - ii[0]; ii++;
2784       if (usecprow) z = c + bs*ridx[i];
2785       if (n) {
2786         PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DZero,z,&bcm));
2787         v += bs2;
2788       }
2789       for (j=1; j<n; j++) {
2790         PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm));
2791         v += bs2;
2792       }
2793       if (!usecprow) z += bs;
2794     }
2795   }
2796   ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr);
2797   ierr = PetscLogFlops((2.0*a->nz*bs2 - bs*a->nonzerorowcnt)*cn);CHKERRQ(ierr);
2798   PetscFunctionReturn(0);
2799 }
2800