1
2 #include <../src/mat/impls/baij/seq/baij.h>
3 #include <../src/mat/impls/dense/seq/dense.h>
4 #include <petsc/private/kernels/blockinvert.h>
5 #include <petscbt.h>
6 #include <petscblaslapack.h>
7
8 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
9 #include <immintrin.h>
10 #endif
11
MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)12 PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
13 {
14 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
15 PetscErrorCode ierr;
16 PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival;
17 const PetscInt *idx;
18 PetscInt start,end,*ai,*aj,bs,*nidx2;
19 PetscBT table;
20
21 PetscFunctionBegin;
22 m = a->mbs;
23 ai = a->i;
24 aj = a->j;
25 bs = A->rmap->bs;
26
27 if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
28
29 ierr = PetscBTCreate(m,&table);CHKERRQ(ierr);
30 ierr = PetscMalloc1(m+1,&nidx);CHKERRQ(ierr);
31 ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr);
32
33 for (i=0; i<is_max; i++) {
34 /* Initialise the two local arrays */
35 isz = 0;
36 ierr = PetscBTMemzero(m,table);CHKERRQ(ierr);
37
38 /* Extract the indices, assume there can be duplicate entries */
39 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
40 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
41
42 /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
43 for (j=0; j<n; ++j) {
44 ival = idx[j]/bs; /* convert the indices into block indices */
45 if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
46 if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
47 }
48 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
49 ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
50
51 k = 0;
52 for (j=0; j<ov; j++) { /* for each overlap*/
53 n = isz;
54 for (; k<n; k++) { /* do only those rows in nidx[k], which are not done yet */
55 row = nidx[k];
56 start = ai[row];
57 end = ai[row+1];
58 for (l = start; l<end; l++) {
59 val = aj[l];
60 if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
61 }
62 }
63 }
64 /* expand the Index Set */
65 for (j=0; j<isz; j++) {
66 for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
67 }
68 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
69 }
70 ierr = PetscBTDestroy(&table);CHKERRQ(ierr);
71 ierr = PetscFree(nidx);CHKERRQ(ierr);
72 ierr = PetscFree(nidx2);CHKERRQ(ierr);
73 PetscFunctionReturn(0);
74 }
75
MatCreateSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat * B)76 PetscErrorCode MatCreateSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
77 {
78 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c;
79 PetscErrorCode ierr;
80 PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
81 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen;
82 const PetscInt *irow,*icol;
83 PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
84 PetscInt *aj = a->j,*ai = a->i;
85 MatScalar *mat_a;
86 Mat C;
87 PetscBool flag;
88
89 PetscFunctionBegin;
90 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
91 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
92 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
93 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
94
95 ierr = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr);
96 ssmap = smap;
97 ierr = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr);
98 for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
99 /* determine lens of each row */
100 for (i=0; i<nrows; i++) {
101 kstart = ai[irow[i]];
102 kend = kstart + a->ilen[irow[i]];
103 lens[i] = 0;
104 for (k=kstart; k<kend; k++) {
105 if (ssmap[aj[k]]) lens[i]++;
106 }
107 }
108 /* Create and fill new matrix */
109 if (scall == MAT_REUSE_MATRIX) {
110 c = (Mat_SeqBAIJ*)((*B)->data);
111
112 if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
113 ierr = PetscArraycmp(c->ilen,lens,c->mbs,&flag);CHKERRQ(ierr);
114 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
115 ierr = PetscArrayzero(c->ilen,c->mbs);CHKERRQ(ierr);
116 C = *B;
117 } else {
118 ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr);
119 ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
120 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
121 ierr = MatSeqBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr);
122 }
123 c = (Mat_SeqBAIJ*)(C->data);
124 for (i=0; i<nrows; i++) {
125 row = irow[i];
126 kstart = ai[row];
127 kend = kstart + a->ilen[row];
128 mat_i = c->i[i];
129 mat_j = c->j + mat_i;
130 mat_a = c->a + mat_i*bs2;
131 mat_ilen = c->ilen + i;
132 for (k=kstart; k<kend; k++) {
133 if ((tcol=ssmap[a->j[k]])) {
134 *mat_j++ = tcol - 1;
135 ierr = PetscArraycpy(mat_a,a->a+k*bs2,bs2);CHKERRQ(ierr);
136 mat_a += bs2;
137 (*mat_ilen)++;
138 }
139 }
140 }
141 /* sort */
142 {
143 MatScalar *work;
144 ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr);
145 for (i=0; i<nrows; i++) {
146 PetscInt ilen;
147 mat_i = c->i[i];
148 mat_j = c->j + mat_i;
149 mat_a = c->a + mat_i*bs2;
150 ilen = c->ilen[i];
151 ierr = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr);
152 }
153 ierr = PetscFree(work);CHKERRQ(ierr);
154 }
155
156 /* Free work space */
157 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
158 ierr = PetscFree(smap);CHKERRQ(ierr);
159 ierr = PetscFree(lens);CHKERRQ(ierr);
160 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
161 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
162
163 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
164 *B = C;
165 PetscFunctionReturn(0);
166 }
167
MatCreateSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat * B)168 PetscErrorCode MatCreateSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
169 {
170 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
171 IS is1,is2;
172 PetscErrorCode ierr;
173 PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j;
174 const PetscInt *irow,*icol;
175
176 PetscFunctionBegin;
177 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
178 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
179 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
180 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
181
182 /* Verify if the indices corespond to each element in a block
183 and form the IS with compressed IS */
184 maxmnbs = PetscMax(a->mbs,a->nbs);
185 ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr);
186 ierr = PetscArrayzero(vary,a->mbs);CHKERRQ(ierr);
187 for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
188 for (i=0; i<a->mbs; i++) {
189 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
190 }
191 count = 0;
192 for (i=0; i<nrows; i++) {
193 j = irow[i] / bs;
194 if ((vary[j]--)==bs) iary[count++] = j;
195 }
196 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
197
198 ierr = PetscArrayzero(vary,a->nbs);CHKERRQ(ierr);
199 for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
200 for (i=0; i<a->nbs; i++) {
201 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
202 }
203 count = 0;
204 for (i=0; i<ncols; i++) {
205 j = icol[i] / bs;
206 if ((vary[j]--)==bs) iary[count++] = j;
207 }
208 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr);
209 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
210 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
211 ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
212
213 ierr = MatCreateSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr);
214 ierr = ISDestroy(&is1);CHKERRQ(ierr);
215 ierr = ISDestroy(&is2);CHKERRQ(ierr);
216 PetscFunctionReturn(0);
217 }
218
MatDestroySubMatrix_SeqBAIJ(Mat C)219 PetscErrorCode MatDestroySubMatrix_SeqBAIJ(Mat C)
220 {
221 PetscErrorCode ierr;
222 Mat_SeqBAIJ *c = (Mat_SeqBAIJ*)C->data;
223 Mat_SubSppt *submatj = c->submatis1;
224
225 PetscFunctionBegin;
226 ierr = (*submatj->destroy)(C);CHKERRQ(ierr);
227 ierr = MatDestroySubMatrix_Private(submatj);CHKERRQ(ierr);
228 PetscFunctionReturn(0);
229 }
230
MatDestroySubMatrices_SeqBAIJ(PetscInt n,Mat * mat[])231 PetscErrorCode MatDestroySubMatrices_SeqBAIJ(PetscInt n,Mat *mat[])
232 {
233 PetscErrorCode ierr;
234 PetscInt i;
235 Mat C;
236 Mat_SeqBAIJ *c;
237 Mat_SubSppt *submatj;
238
239 PetscFunctionBegin;
240 for (i=0; i<n; i++) {
241 C = (*mat)[i];
242 c = (Mat_SeqBAIJ*)C->data;
243 submatj = c->submatis1;
244 if (submatj) {
245 ierr = (*submatj->destroy)(C);CHKERRQ(ierr);
246 ierr = MatDestroySubMatrix_Private(submatj);CHKERRQ(ierr);
247 ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
248 ierr = PetscLayoutDestroy(&C->rmap);CHKERRQ(ierr);
249 ierr = PetscLayoutDestroy(&C->cmap);CHKERRQ(ierr);
250 ierr = PetscHeaderDestroy(&C);CHKERRQ(ierr);
251 } else {
252 ierr = MatDestroy(&C);CHKERRQ(ierr);
253 }
254 }
255 ierr = PetscFree(*mat);CHKERRQ(ierr);
256 PetscFunctionReturn(0);
257 }
258
MatCreateSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat * B[])259 PetscErrorCode MatCreateSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
260 {
261 PetscErrorCode ierr;
262 PetscInt i;
263
264 PetscFunctionBegin;
265 if (scall == MAT_INITIAL_MATRIX) {
266 ierr = PetscCalloc1(n+1,B);CHKERRQ(ierr);
267 }
268
269 for (i=0; i<n; i++) {
270 ierr = MatCreateSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
271 }
272 PetscFunctionReturn(0);
273 }
274
275 /* -------------------------------------------------------*/
276 /* Should check that shapes of vectors and matrices match */
277 /* -------------------------------------------------------*/
278
MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)279 PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
280 {
281 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
282 PetscScalar *z,sum;
283 const PetscScalar *x;
284 const MatScalar *v;
285 PetscErrorCode ierr;
286 PetscInt mbs,i,n;
287 const PetscInt *idx,*ii,*ridx=NULL;
288 PetscBool usecprow=a->compressedrow.use;
289
290 PetscFunctionBegin;
291 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
292 ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
293
294 if (usecprow) {
295 mbs = a->compressedrow.nrows;
296 ii = a->compressedrow.i;
297 ridx = a->compressedrow.rindex;
298 ierr = PetscArrayzero(z,a->mbs);CHKERRQ(ierr);
299 } else {
300 mbs = a->mbs;
301 ii = a->i;
302 }
303
304 for (i=0; i<mbs; i++) {
305 n = ii[1] - ii[0];
306 v = a->a + ii[0];
307 idx = a->j + ii[0];
308 ii++;
309 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
310 PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
311 sum = 0.0;
312 PetscSparseDensePlusDot(sum,x,v,idx,n);
313 if (usecprow) {
314 z[ridx[i]] = sum;
315 } else {
316 z[i] = sum;
317 }
318 }
319 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
320 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
321 ierr = PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);CHKERRQ(ierr);
322 PetscFunctionReturn(0);
323 }
324
MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)325 PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
326 {
327 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
328 PetscScalar *z = NULL,sum1,sum2,*zarray;
329 const PetscScalar *x,*xb;
330 PetscScalar x1,x2;
331 const MatScalar *v;
332 PetscErrorCode ierr;
333 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
334 PetscBool usecprow=a->compressedrow.use;
335
336 PetscFunctionBegin;
337 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
338 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
339
340 idx = a->j;
341 v = a->a;
342 if (usecprow) {
343 mbs = a->compressedrow.nrows;
344 ii = a->compressedrow.i;
345 ridx = a->compressedrow.rindex;
346 ierr = PetscArrayzero(zarray,2*a->mbs);CHKERRQ(ierr);
347 } else {
348 mbs = a->mbs;
349 ii = a->i;
350 z = zarray;
351 }
352
353 for (i=0; i<mbs; i++) {
354 n = ii[1] - ii[0]; ii++;
355 sum1 = 0.0; sum2 = 0.0;
356 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
357 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
358 for (j=0; j<n; j++) {
359 xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
360 sum1 += v[0]*x1 + v[2]*x2;
361 sum2 += v[1]*x1 + v[3]*x2;
362 v += 4;
363 }
364 if (usecprow) z = zarray + 2*ridx[i];
365 z[0] = sum1; z[1] = sum2;
366 if (!usecprow) z += 2;
367 }
368 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
369 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
370 ierr = PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);CHKERRQ(ierr);
371 PetscFunctionReturn(0);
372 }
373
MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)374 PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
375 {
376 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
377 PetscScalar *z = NULL,sum1,sum2,sum3,x1,x2,x3,*zarray;
378 const PetscScalar *x,*xb;
379 const MatScalar *v;
380 PetscErrorCode ierr;
381 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
382 PetscBool usecprow=a->compressedrow.use;
383
384 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
385 #pragma disjoint(*v,*z,*xb)
386 #endif
387
388 PetscFunctionBegin;
389 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
390 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
391
392 idx = a->j;
393 v = a->a;
394 if (usecprow) {
395 mbs = a->compressedrow.nrows;
396 ii = a->compressedrow.i;
397 ridx = a->compressedrow.rindex;
398 ierr = PetscArrayzero(zarray,3*a->mbs);CHKERRQ(ierr);
399 } else {
400 mbs = a->mbs;
401 ii = a->i;
402 z = zarray;
403 }
404
405 for (i=0; i<mbs; i++) {
406 n = ii[1] - ii[0]; ii++;
407 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
408 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
409 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
410 for (j=0; j<n; j++) {
411 xb = x + 3*(*idx++);
412 x1 = xb[0];
413 x2 = xb[1];
414 x3 = xb[2];
415
416 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
417 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
418 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
419 v += 9;
420 }
421 if (usecprow) z = zarray + 3*ridx[i];
422 z[0] = sum1; z[1] = sum2; z[2] = sum3;
423 if (!usecprow) z += 3;
424 }
425 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
426 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
427 ierr = PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);CHKERRQ(ierr);
428 PetscFunctionReturn(0);
429 }
430
MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)431 PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
432 {
433 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
434 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
435 const PetscScalar *x,*xb;
436 const MatScalar *v;
437 PetscErrorCode ierr;
438 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
439 PetscBool usecprow=a->compressedrow.use;
440
441 PetscFunctionBegin;
442 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
443 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
444
445 idx = a->j;
446 v = a->a;
447 if (usecprow) {
448 mbs = a->compressedrow.nrows;
449 ii = a->compressedrow.i;
450 ridx = a->compressedrow.rindex;
451 ierr = PetscArrayzero(zarray,4*a->mbs);CHKERRQ(ierr);
452 } else {
453 mbs = a->mbs;
454 ii = a->i;
455 z = zarray;
456 }
457
458 for (i=0; i<mbs; i++) {
459 n = ii[1] - ii[0];
460 ii++;
461 sum1 = 0.0;
462 sum2 = 0.0;
463 sum3 = 0.0;
464 sum4 = 0.0;
465
466 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
467 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
468 for (j=0; j<n; j++) {
469 xb = x + 4*(*idx++);
470 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
471 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
472 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
473 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
474 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
475 v += 16;
476 }
477 if (usecprow) z = zarray + 4*ridx[i];
478 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
479 if (!usecprow) z += 4;
480 }
481 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
482 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
483 ierr = PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);CHKERRQ(ierr);
484 PetscFunctionReturn(0);
485 }
486
MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)487 PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
488 {
489 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
490 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*zarray;
491 const PetscScalar *xb,*x;
492 const MatScalar *v;
493 PetscErrorCode ierr;
494 const PetscInt *idx,*ii,*ridx=NULL;
495 PetscInt mbs,i,j,n;
496 PetscBool usecprow=a->compressedrow.use;
497
498 PetscFunctionBegin;
499 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
500 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
501
502 idx = a->j;
503 v = a->a;
504 if (usecprow) {
505 mbs = a->compressedrow.nrows;
506 ii = a->compressedrow.i;
507 ridx = a->compressedrow.rindex;
508 ierr = PetscArrayzero(zarray,5*a->mbs);CHKERRQ(ierr);
509 } else {
510 mbs = a->mbs;
511 ii = a->i;
512 z = zarray;
513 }
514
515 for (i=0; i<mbs; i++) {
516 n = ii[1] - ii[0]; ii++;
517 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
518 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
519 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
520 for (j=0; j<n; j++) {
521 xb = x + 5*(*idx++);
522 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
523 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
524 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
525 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
526 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
527 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
528 v += 25;
529 }
530 if (usecprow) z = zarray + 5*ridx[i];
531 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
532 if (!usecprow) z += 5;
533 }
534 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
535 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
536 ierr = PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);CHKERRQ(ierr);
537 PetscFunctionReturn(0);
538 }
539
MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)540 PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
541 {
542 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
543 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6;
544 const PetscScalar *x,*xb;
545 PetscScalar x1,x2,x3,x4,x5,x6,*zarray;
546 const MatScalar *v;
547 PetscErrorCode ierr;
548 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
549 PetscBool usecprow=a->compressedrow.use;
550
551 PetscFunctionBegin;
552 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
553 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
554
555 idx = a->j;
556 v = a->a;
557 if (usecprow) {
558 mbs = a->compressedrow.nrows;
559 ii = a->compressedrow.i;
560 ridx = a->compressedrow.rindex;
561 ierr = PetscArrayzero(zarray,6*a->mbs);CHKERRQ(ierr);
562 } else {
563 mbs = a->mbs;
564 ii = a->i;
565 z = zarray;
566 }
567
568 for (i=0; i<mbs; i++) {
569 n = ii[1] - ii[0];
570 ii++;
571 sum1 = 0.0;
572 sum2 = 0.0;
573 sum3 = 0.0;
574 sum4 = 0.0;
575 sum5 = 0.0;
576 sum6 = 0.0;
577
578 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
579 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
580 for (j=0; j<n; j++) {
581 xb = x + 6*(*idx++);
582 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
583 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
584 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
585 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
586 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
587 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
588 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
589 v += 36;
590 }
591 if (usecprow) z = zarray + 6*ridx[i];
592 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
593 if (!usecprow) z += 6;
594 }
595
596 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
597 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
598 ierr = PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);CHKERRQ(ierr);
599 PetscFunctionReturn(0);
600 }
601
MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)602 PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
603 {
604 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
605 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
606 const PetscScalar *x,*xb;
607 PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray;
608 const MatScalar *v;
609 PetscErrorCode ierr;
610 PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
611 PetscBool usecprow=a->compressedrow.use;
612
613 PetscFunctionBegin;
614 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
615 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
616
617 idx = a->j;
618 v = a->a;
619 if (usecprow) {
620 mbs = a->compressedrow.nrows;
621 ii = a->compressedrow.i;
622 ridx = a->compressedrow.rindex;
623 ierr = PetscArrayzero(zarray,7*a->mbs);CHKERRQ(ierr);
624 } else {
625 mbs = a->mbs;
626 ii = a->i;
627 z = zarray;
628 }
629
630 for (i=0; i<mbs; i++) {
631 n = ii[1] - ii[0];
632 ii++;
633 sum1 = 0.0;
634 sum2 = 0.0;
635 sum3 = 0.0;
636 sum4 = 0.0;
637 sum5 = 0.0;
638 sum6 = 0.0;
639 sum7 = 0.0;
640
641 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
642 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
643 for (j=0; j<n; j++) {
644 xb = x + 7*(*idx++);
645 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
646 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
647 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
648 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
649 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
650 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
651 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
652 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
653 v += 49;
654 }
655 if (usecprow) z = zarray + 7*ridx[i];
656 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
657 if (!usecprow) z += 7;
658 }
659
660 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
661 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
662 ierr = PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);CHKERRQ(ierr);
663 PetscFunctionReturn(0);
664 }
665
666 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
MatMult_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec zz)667 PetscErrorCode MatMult_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec zz)
668 {
669 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
670 PetscScalar *z = NULL,*work,*workt,*zarray;
671 const PetscScalar *x,*xb;
672 const MatScalar *v;
673 PetscErrorCode ierr;
674 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
675 const PetscInt *idx,*ii,*ridx=NULL;
676 PetscInt k;
677 PetscBool usecprow=a->compressedrow.use;
678
679 __m256d a0,a1,a2,a3,a4,a5;
680 __m256d w0,w1,w2,w3;
681 __m256d z0,z1,z2;
682 __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);
683
684 PetscFunctionBegin;
685 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
686 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
687
688 idx = a->j;
689 v = a->a;
690 if (usecprow) {
691 mbs = a->compressedrow.nrows;
692 ii = a->compressedrow.i;
693 ridx = a->compressedrow.rindex;
694 ierr = PetscArrayzero(zarray,bs*a->mbs);CHKERRQ(ierr);
695 } else {
696 mbs = a->mbs;
697 ii = a->i;
698 z = zarray;
699 }
700
701 if (!a->mult_work) {
702 k = PetscMax(A->rmap->n,A->cmap->n);
703 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
704 }
705
706 work = a->mult_work;
707 for (i=0; i<mbs; i++) {
708 n = ii[1] - ii[0]; ii++;
709 workt = work;
710 for (j=0; j<n; j++) {
711 xb = x + bs*(*idx++);
712 for (k=0; k<bs; k++) workt[k] = xb[k];
713 workt += bs;
714 }
715 if (usecprow) z = zarray + bs*ridx[i];
716
717 z0 = _mm256_setzero_pd(); z1 = _mm256_setzero_pd(); z2 = _mm256_setzero_pd();
718
719 for (j=0; j<n; j++) {
720 /* first column of a */
721 w0 = _mm256_set1_pd(work[j*9 ]);
722 a0 = _mm256_loadu_pd(&v[j*81 ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
723 a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
724 a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);
725
726 /* second column of a */
727 w1 = _mm256_set1_pd(work[j*9+ 1]);
728 a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
729 a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
730 a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);
731
732 /* third column of a */
733 w2 = _mm256_set1_pd(work[j*9 +2]);
734 a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
735 a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
736 a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);
737
738 /* fourth column of a */
739 w3 = _mm256_set1_pd(work[j*9+ 3]);
740 a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
741 a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
742 a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);
743
744 /* fifth column of a */
745 w0 = _mm256_set1_pd(work[j*9+ 4]);
746 a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
747 a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
748 a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);
749
750 /* sixth column of a */
751 w1 = _mm256_set1_pd(work[j*9+ 5]);
752 a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
753 a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
754 a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);
755
756 /* seventh column of a */
757 w2 = _mm256_set1_pd(work[j*9+ 6]);
758 a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
759 a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
760 a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);
761
762 /* eigth column of a */
763 w3 = _mm256_set1_pd(work[j*9+ 7]);
764 a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
765 a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
766 a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);
767
768 /* ninth column of a */
769 w0 = _mm256_set1_pd(work[j*9+ 8]);
770 a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
771 a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
772 a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
773 }
774
775 _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);
776
777 v += n*bs2;
778 if (!usecprow) z += bs;
779 }
780 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
781 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
782 ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr);
783 PetscFunctionReturn(0);
784 }
785 #endif
786
MatMult_SeqBAIJ_11(Mat A,Vec xx,Vec zz)787 PetscErrorCode MatMult_SeqBAIJ_11(Mat A,Vec xx,Vec zz)
788 {
789 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
790 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
791 const PetscScalar *x,*xb;
792 PetscScalar *zarray,xv;
793 const MatScalar *v;
794 PetscErrorCode ierr;
795 const PetscInt *ii,*ij=a->j,*idx;
796 PetscInt mbs,i,j,k,n,*ridx=NULL;
797 PetscBool usecprow=a->compressedrow.use;
798
799 PetscFunctionBegin;
800 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
801 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
802
803 v = a->a;
804 if (usecprow) {
805 mbs = a->compressedrow.nrows;
806 ii = a->compressedrow.i;
807 ridx = a->compressedrow.rindex;
808 ierr = PetscArrayzero(zarray,11*a->mbs);CHKERRQ(ierr);
809 } else {
810 mbs = a->mbs;
811 ii = a->i;
812 z = zarray;
813 }
814
815 for (i=0; i<mbs; i++) {
816 n = ii[i+1] - ii[i];
817 idx = ij + ii[i];
818 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
819 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0;
820
821 for (j=0; j<n; j++) {
822 xb = x + 11*(idx[j]);
823
824 for (k=0; k<11; k++) {
825 xv = xb[k];
826 sum1 += v[0]*xv;
827 sum2 += v[1]*xv;
828 sum3 += v[2]*xv;
829 sum4 += v[3]*xv;
830 sum5 += v[4]*xv;
831 sum6 += v[5]*xv;
832 sum7 += v[6]*xv;
833 sum8 += v[7]*xv;
834 sum9 += v[8]*xv;
835 sum10 += v[9]*xv;
836 sum11 += v[10]*xv;
837 v += 11;
838 }
839 }
840 if (usecprow) z = zarray + 11*ridx[i];
841 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
842 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11;
843
844 if (!usecprow) z += 11;
845 }
846
847 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
848 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
849 ierr = PetscLogFlops(242.0*a->nz - 11.0*a->nonzerorowcnt);CHKERRQ(ierr);
850 PetscFunctionReturn(0);
851 }
852
853 /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
854 /* Default MatMult for block size 15 */
MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)855 PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
856 {
857 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
858 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
859 const PetscScalar *x,*xb;
860 PetscScalar *zarray,xv;
861 const MatScalar *v;
862 PetscErrorCode ierr;
863 const PetscInt *ii,*ij=a->j,*idx;
864 PetscInt mbs,i,j,k,n,*ridx=NULL;
865 PetscBool usecprow=a->compressedrow.use;
866
867 PetscFunctionBegin;
868 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
869 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
870
871 v = a->a;
872 if (usecprow) {
873 mbs = a->compressedrow.nrows;
874 ii = a->compressedrow.i;
875 ridx = a->compressedrow.rindex;
876 ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr);
877 } else {
878 mbs = a->mbs;
879 ii = a->i;
880 z = zarray;
881 }
882
883 for (i=0; i<mbs; i++) {
884 n = ii[i+1] - ii[i];
885 idx = ij + ii[i];
886 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
887 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
888
889 for (j=0; j<n; j++) {
890 xb = x + 15*(idx[j]);
891
892 for (k=0; k<15; k++) {
893 xv = xb[k];
894 sum1 += v[0]*xv;
895 sum2 += v[1]*xv;
896 sum3 += v[2]*xv;
897 sum4 += v[3]*xv;
898 sum5 += v[4]*xv;
899 sum6 += v[5]*xv;
900 sum7 += v[6]*xv;
901 sum8 += v[7]*xv;
902 sum9 += v[8]*xv;
903 sum10 += v[9]*xv;
904 sum11 += v[10]*xv;
905 sum12 += v[11]*xv;
906 sum13 += v[12]*xv;
907 sum14 += v[13]*xv;
908 sum15 += v[14]*xv;
909 v += 15;
910 }
911 }
912 if (usecprow) z = zarray + 15*ridx[i];
913 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
914 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
915
916 if (!usecprow) z += 15;
917 }
918
919 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
920 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
921 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
922 PetscFunctionReturn(0);
923 }
924
925 /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)926 PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
927 {
928 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
929 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
930 const PetscScalar *x,*xb;
931 PetscScalar x1,x2,x3,x4,*zarray;
932 const MatScalar *v;
933 PetscErrorCode ierr;
934 const PetscInt *ii,*ij=a->j,*idx;
935 PetscInt mbs,i,j,n,*ridx=NULL;
936 PetscBool usecprow=a->compressedrow.use;
937
938 PetscFunctionBegin;
939 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
940 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
941
942 v = a->a;
943 if (usecprow) {
944 mbs = a->compressedrow.nrows;
945 ii = a->compressedrow.i;
946 ridx = a->compressedrow.rindex;
947 ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr);
948 } else {
949 mbs = a->mbs;
950 ii = a->i;
951 z = zarray;
952 }
953
954 for (i=0; i<mbs; i++) {
955 n = ii[i+1] - ii[i];
956 idx = ij + ii[i];
957 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
958 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
959
960 for (j=0; j<n; j++) {
961 xb = x + 15*(idx[j]);
962 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
963
964 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
965 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
966 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
967 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
968 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
969 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
970 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
971 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
972 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
973 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
974 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
975 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
976 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
977 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
978 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
979
980 v += 60;
981
982 x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
983
984 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
985 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
986 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
987 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
988 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
989 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
990 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
991 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
992 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
993 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
994 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
995 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
996 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
997 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
998 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
999 v += 60;
1000
1001 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
1002 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
1003 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
1004 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
1005 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
1006 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
1007 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
1008 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
1009 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
1010 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
1011 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
1012 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
1013 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
1014 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
1015 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
1016 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
1017 v += 60;
1018
1019 x1 = xb[12]; x2 = xb[13]; x3 = xb[14];
1020 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3;
1021 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3;
1022 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3;
1023 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3;
1024 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3;
1025 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3;
1026 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3;
1027 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3;
1028 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3;
1029 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
1030 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
1031 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
1032 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
1033 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
1034 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
1035 v += 45;
1036 }
1037 if (usecprow) z = zarray + 15*ridx[i];
1038 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1039 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1040
1041 if (!usecprow) z += 15;
1042 }
1043
1044 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1045 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1046 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1047 PetscFunctionReturn(0);
1048 }
1049
1050 /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)1051 PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
1052 {
1053 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1054 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1055 const PetscScalar *x,*xb;
1056 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
1057 const MatScalar *v;
1058 PetscErrorCode ierr;
1059 const PetscInt *ii,*ij=a->j,*idx;
1060 PetscInt mbs,i,j,n,*ridx=NULL;
1061 PetscBool usecprow=a->compressedrow.use;
1062
1063 PetscFunctionBegin;
1064 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1065 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1066
1067 v = a->a;
1068 if (usecprow) {
1069 mbs = a->compressedrow.nrows;
1070 ii = a->compressedrow.i;
1071 ridx = a->compressedrow.rindex;
1072 ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr);
1073 } else {
1074 mbs = a->mbs;
1075 ii = a->i;
1076 z = zarray;
1077 }
1078
1079 for (i=0; i<mbs; i++) {
1080 n = ii[i+1] - ii[i];
1081 idx = ij + ii[i];
1082 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1083 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1084
1085 for (j=0; j<n; j++) {
1086 xb = x + 15*(idx[j]);
1087 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1088 x8 = xb[7];
1089
1090 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
1091 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
1092 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
1093 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
1094 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
1095 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
1096 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
1097 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
1098 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
1099 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
1100 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
1101 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
1102 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
1103 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
1104 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
1105 v += 120;
1106
1107 x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
1108
1109 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
1110 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
1111 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
1112 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
1113 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
1114 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
1115 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
1116 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
1117 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
1118 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
1119 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
1120 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
1121 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
1122 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
1123 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
1124 v += 105;
1125 }
1126 if (usecprow) z = zarray + 15*ridx[i];
1127 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1128 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1129
1130 if (!usecprow) z += 15;
1131 }
1132
1133 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1134 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1135 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1136 PetscFunctionReturn(0);
1137 }
1138
1139 /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)1140 PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
1141 {
1142 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1143 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
1144 const PetscScalar *x,*xb;
1145 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
1146 const MatScalar *v;
1147 PetscErrorCode ierr;
1148 const PetscInt *ii,*ij=a->j,*idx;
1149 PetscInt mbs,i,j,n,*ridx=NULL;
1150 PetscBool usecprow=a->compressedrow.use;
1151
1152 PetscFunctionBegin;
1153 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1154 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1155
1156 v = a->a;
1157 if (usecprow) {
1158 mbs = a->compressedrow.nrows;
1159 ii = a->compressedrow.i;
1160 ridx = a->compressedrow.rindex;
1161 ierr = PetscArrayzero(zarray,15*a->mbs);CHKERRQ(ierr);
1162 } else {
1163 mbs = a->mbs;
1164 ii = a->i;
1165 z = zarray;
1166 }
1167
1168 for (i=0; i<mbs; i++) {
1169 n = ii[i+1] - ii[i];
1170 idx = ij + ii[i];
1171 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
1172 sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
1173
1174 for (j=0; j<n; j++) {
1175 xb = x + 15*(idx[j]);
1176 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1177 x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
1178
1179 sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
1180 sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
1181 sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
1182 sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
1183 sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
1184 sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
1185 sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
1186 sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
1187 sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
1188 sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
1189 sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
1190 sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
1191 sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
1192 sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
1193 sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
1194 v += 225;
1195 }
1196 if (usecprow) z = zarray + 15*ridx[i];
1197 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1198 z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1199
1200 if (!usecprow) z += 15;
1201 }
1202
1203 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1204 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1205 ierr = PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);CHKERRQ(ierr);
1206 PetscFunctionReturn(0);
1207 }
1208
1209 /*
1210 This will not work with MatScalar == float because it calls the BLAS
1211 */
MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)1212 PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1213 {
1214 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1215 PetscScalar *z = NULL,*work,*workt,*zarray;
1216 const PetscScalar *x,*xb;
1217 const MatScalar *v;
1218 PetscErrorCode ierr;
1219 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1220 const PetscInt *idx,*ii,*ridx=NULL;
1221 PetscInt ncols,k;
1222 PetscBool usecprow=a->compressedrow.use;
1223
1224 PetscFunctionBegin;
1225 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1226 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1227
1228 idx = a->j;
1229 v = a->a;
1230 if (usecprow) {
1231 mbs = a->compressedrow.nrows;
1232 ii = a->compressedrow.i;
1233 ridx = a->compressedrow.rindex;
1234 ierr = PetscArrayzero(zarray,bs*a->mbs);CHKERRQ(ierr);
1235 } else {
1236 mbs = a->mbs;
1237 ii = a->i;
1238 z = zarray;
1239 }
1240
1241 if (!a->mult_work) {
1242 k = PetscMax(A->rmap->n,A->cmap->n);
1243 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1244 }
1245 work = a->mult_work;
1246 for (i=0; i<mbs; i++) {
1247 n = ii[1] - ii[0]; ii++;
1248 ncols = n*bs;
1249 workt = work;
1250 for (j=0; j<n; j++) {
1251 xb = x + bs*(*idx++);
1252 for (k=0; k<bs; k++) workt[k] = xb[k];
1253 workt += bs;
1254 }
1255 if (usecprow) z = zarray + bs*ridx[i];
1256 PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1257 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
1258 v += n*bs2;
1259 if (!usecprow) z += bs;
1260 }
1261 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1262 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1263 ierr = PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);CHKERRQ(ierr);
1264 PetscFunctionReturn(0);
1265 }
1266
MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)1267 PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1268 {
1269 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1270 const PetscScalar *x;
1271 PetscScalar *y,*z,sum;
1272 const MatScalar *v;
1273 PetscErrorCode ierr;
1274 PetscInt mbs=a->mbs,i,n,*ridx=NULL;
1275 const PetscInt *idx,*ii;
1276 PetscBool usecprow=a->compressedrow.use;
1277
1278 PetscFunctionBegin;
1279 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1280 ierr = VecGetArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
1281
1282 idx = a->j;
1283 v = a->a;
1284 if (usecprow) {
1285 if (zz != yy) {
1286 ierr = PetscArraycpy(z,y,mbs);CHKERRQ(ierr);
1287 }
1288 mbs = a->compressedrow.nrows;
1289 ii = a->compressedrow.i;
1290 ridx = a->compressedrow.rindex;
1291 } else {
1292 ii = a->i;
1293 }
1294
1295 for (i=0; i<mbs; i++) {
1296 n = ii[1] - ii[0];
1297 ii++;
1298 if (!usecprow) {
1299 sum = y[i];
1300 } else {
1301 sum = y[ridx[i]];
1302 }
1303 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1304 PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1305 PetscSparseDensePlusDot(sum,x,v,idx,n);
1306 v += n;
1307 idx += n;
1308 if (usecprow) {
1309 z[ridx[i]] = sum;
1310 } else {
1311 z[i] = sum;
1312 }
1313 }
1314 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1315 ierr = VecRestoreArrayPair(yy,zz,&y,&z);CHKERRQ(ierr);
1316 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1317 PetscFunctionReturn(0);
1318 }
1319
MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)1320 PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1321 {
1322 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1323 PetscScalar *y = NULL,*z = NULL,sum1,sum2;
1324 const PetscScalar *x,*xb;
1325 PetscScalar x1,x2,*yarray,*zarray;
1326 const MatScalar *v;
1327 PetscErrorCode ierr;
1328 PetscInt mbs = a->mbs,i,n,j;
1329 const PetscInt *idx,*ii,*ridx = NULL;
1330 PetscBool usecprow = a->compressedrow.use;
1331
1332 PetscFunctionBegin;
1333 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1334 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1335
1336 idx = a->j;
1337 v = a->a;
1338 if (usecprow) {
1339 if (zz != yy) {
1340 ierr = PetscArraycpy(zarray,yarray,2*mbs);CHKERRQ(ierr);
1341 }
1342 mbs = a->compressedrow.nrows;
1343 ii = a->compressedrow.i;
1344 ridx = a->compressedrow.rindex;
1345 } else {
1346 ii = a->i;
1347 y = yarray;
1348 z = zarray;
1349 }
1350
1351 for (i=0; i<mbs; i++) {
1352 n = ii[1] - ii[0]; ii++;
1353 if (usecprow) {
1354 z = zarray + 2*ridx[i];
1355 y = yarray + 2*ridx[i];
1356 }
1357 sum1 = y[0]; sum2 = y[1];
1358 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1359 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1360 for (j=0; j<n; j++) {
1361 xb = x + 2*(*idx++);
1362 x1 = xb[0];
1363 x2 = xb[1];
1364
1365 sum1 += v[0]*x1 + v[2]*x2;
1366 sum2 += v[1]*x1 + v[3]*x2;
1367 v += 4;
1368 }
1369 z[0] = sum1; z[1] = sum2;
1370 if (!usecprow) {
1371 z += 2; y += 2;
1372 }
1373 }
1374 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1375 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1376 ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr);
1377 PetscFunctionReturn(0);
1378 }
1379
MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)1380 PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1381 {
1382 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1383 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1384 const PetscScalar *x,*xb;
1385 const MatScalar *v;
1386 PetscErrorCode ierr;
1387 PetscInt mbs = a->mbs,i,j,n;
1388 const PetscInt *idx,*ii,*ridx = NULL;
1389 PetscBool usecprow = a->compressedrow.use;
1390
1391 PetscFunctionBegin;
1392 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1393 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1394
1395 idx = a->j;
1396 v = a->a;
1397 if (usecprow) {
1398 if (zz != yy) {
1399 ierr = PetscArraycpy(zarray,yarray,3*mbs);CHKERRQ(ierr);
1400 }
1401 mbs = a->compressedrow.nrows;
1402 ii = a->compressedrow.i;
1403 ridx = a->compressedrow.rindex;
1404 } else {
1405 ii = a->i;
1406 y = yarray;
1407 z = zarray;
1408 }
1409
1410 for (i=0; i<mbs; i++) {
1411 n = ii[1] - ii[0]; ii++;
1412 if (usecprow) {
1413 z = zarray + 3*ridx[i];
1414 y = yarray + 3*ridx[i];
1415 }
1416 sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1417 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1418 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1419 for (j=0; j<n; j++) {
1420 xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1421 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1422 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1423 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1424 v += 9;
1425 }
1426 z[0] = sum1; z[1] = sum2; z[2] = sum3;
1427 if (!usecprow) {
1428 z += 3; y += 3;
1429 }
1430 }
1431 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1432 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1433 ierr = PetscLogFlops(18.0*a->nz);CHKERRQ(ierr);
1434 PetscFunctionReturn(0);
1435 }
1436
MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)1437 PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1438 {
1439 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1440 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1441 const PetscScalar *x,*xb;
1442 const MatScalar *v;
1443 PetscErrorCode ierr;
1444 PetscInt mbs = a->mbs,i,j,n;
1445 const PetscInt *idx,*ii,*ridx=NULL;
1446 PetscBool usecprow=a->compressedrow.use;
1447
1448 PetscFunctionBegin;
1449 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1450 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1451
1452 idx = a->j;
1453 v = a->a;
1454 if (usecprow) {
1455 if (zz != yy) {
1456 ierr = PetscArraycpy(zarray,yarray,4*mbs);CHKERRQ(ierr);
1457 }
1458 mbs = a->compressedrow.nrows;
1459 ii = a->compressedrow.i;
1460 ridx = a->compressedrow.rindex;
1461 } else {
1462 ii = a->i;
1463 y = yarray;
1464 z = zarray;
1465 }
1466
1467 for (i=0; i<mbs; i++) {
1468 n = ii[1] - ii[0]; ii++;
1469 if (usecprow) {
1470 z = zarray + 4*ridx[i];
1471 y = yarray + 4*ridx[i];
1472 }
1473 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1474 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1475 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1476 for (j=0; j<n; j++) {
1477 xb = x + 4*(*idx++);
1478 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1479 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1480 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1481 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1482 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1483 v += 16;
1484 }
1485 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1486 if (!usecprow) {
1487 z += 4; y += 4;
1488 }
1489 }
1490 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1491 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1492 ierr = PetscLogFlops(32.0*a->nz);CHKERRQ(ierr);
1493 PetscFunctionReturn(0);
1494 }
1495
MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)1496 PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1497 {
1498 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1499 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1500 const PetscScalar *x,*xb;
1501 PetscScalar *yarray,*zarray;
1502 const MatScalar *v;
1503 PetscErrorCode ierr;
1504 PetscInt mbs = a->mbs,i,j,n;
1505 const PetscInt *idx,*ii,*ridx = NULL;
1506 PetscBool usecprow=a->compressedrow.use;
1507
1508 PetscFunctionBegin;
1509 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1510 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1511
1512 idx = a->j;
1513 v = a->a;
1514 if (usecprow) {
1515 if (zz != yy) {
1516 ierr = PetscArraycpy(zarray,yarray,5*mbs);CHKERRQ(ierr);
1517 }
1518 mbs = a->compressedrow.nrows;
1519 ii = a->compressedrow.i;
1520 ridx = a->compressedrow.rindex;
1521 } else {
1522 ii = a->i;
1523 y = yarray;
1524 z = zarray;
1525 }
1526
1527 for (i=0; i<mbs; i++) {
1528 n = ii[1] - ii[0]; ii++;
1529 if (usecprow) {
1530 z = zarray + 5*ridx[i];
1531 y = yarray + 5*ridx[i];
1532 }
1533 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1534 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1535 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1536 for (j=0; j<n; j++) {
1537 xb = x + 5*(*idx++);
1538 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1539 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1540 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1541 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1542 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1543 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1544 v += 25;
1545 }
1546 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1547 if (!usecprow) {
1548 z += 5; y += 5;
1549 }
1550 }
1551 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1552 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1553 ierr = PetscLogFlops(50.0*a->nz);CHKERRQ(ierr);
1554 PetscFunctionReturn(0);
1555 }
1556
MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)1557 PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1558 {
1559 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1560 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6;
1561 const PetscScalar *x,*xb;
1562 PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1563 const MatScalar *v;
1564 PetscErrorCode ierr;
1565 PetscInt mbs = a->mbs,i,j,n;
1566 const PetscInt *idx,*ii,*ridx=NULL;
1567 PetscBool usecprow=a->compressedrow.use;
1568
1569 PetscFunctionBegin;
1570 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1571 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1572
1573 idx = a->j;
1574 v = a->a;
1575 if (usecprow) {
1576 if (zz != yy) {
1577 ierr = PetscArraycpy(zarray,yarray,6*mbs);CHKERRQ(ierr);
1578 }
1579 mbs = a->compressedrow.nrows;
1580 ii = a->compressedrow.i;
1581 ridx = a->compressedrow.rindex;
1582 } else {
1583 ii = a->i;
1584 y = yarray;
1585 z = zarray;
1586 }
1587
1588 for (i=0; i<mbs; i++) {
1589 n = ii[1] - ii[0]; ii++;
1590 if (usecprow) {
1591 z = zarray + 6*ridx[i];
1592 y = yarray + 6*ridx[i];
1593 }
1594 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1595 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1596 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1597 for (j=0; j<n; j++) {
1598 xb = x + 6*(*idx++);
1599 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1600 sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1601 sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1602 sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1603 sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1604 sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1605 sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1606 v += 36;
1607 }
1608 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1609 if (!usecprow) {
1610 z += 6; y += 6;
1611 }
1612 }
1613 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1614 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1615 ierr = PetscLogFlops(72.0*a->nz);CHKERRQ(ierr);
1616 PetscFunctionReturn(0);
1617 }
1618
MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)1619 PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1620 {
1621 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1622 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1623 const PetscScalar *x,*xb;
1624 PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1625 const MatScalar *v;
1626 PetscErrorCode ierr;
1627 PetscInt mbs = a->mbs,i,j,n;
1628 const PetscInt *idx,*ii,*ridx = NULL;
1629 PetscBool usecprow=a->compressedrow.use;
1630
1631 PetscFunctionBegin;
1632 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1633 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1634
1635 idx = a->j;
1636 v = a->a;
1637 if (usecprow) {
1638 if (zz != yy) {
1639 ierr = PetscArraycpy(zarray,yarray,7*mbs);CHKERRQ(ierr);
1640 }
1641 mbs = a->compressedrow.nrows;
1642 ii = a->compressedrow.i;
1643 ridx = a->compressedrow.rindex;
1644 } else {
1645 ii = a->i;
1646 y = yarray;
1647 z = zarray;
1648 }
1649
1650 for (i=0; i<mbs; i++) {
1651 n = ii[1] - ii[0]; ii++;
1652 if (usecprow) {
1653 z = zarray + 7*ridx[i];
1654 y = yarray + 7*ridx[i];
1655 }
1656 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1657 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1658 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1659 for (j=0; j<n; j++) {
1660 xb = x + 7*(*idx++);
1661 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1662 sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1663 sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1664 sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1665 sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1666 sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1667 sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1668 sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1669 v += 49;
1670 }
1671 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1672 if (!usecprow) {
1673 z += 7; y += 7;
1674 }
1675 }
1676 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1677 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1678 ierr = PetscLogFlops(98.0*a->nz);CHKERRQ(ierr);
1679 PetscFunctionReturn(0);
1680 }
1681
1682 #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
MatMultAdd_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec yy,Vec zz)1683 PetscErrorCode MatMultAdd_SeqBAIJ_9_AVX2(Mat A,Vec xx,Vec yy,Vec zz)
1684 {
1685 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1686 PetscScalar *z = NULL,*work,*workt,*zarray;
1687 const PetscScalar *x,*xb;
1688 const MatScalar *v;
1689 PetscErrorCode ierr;
1690 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1691 PetscInt k;
1692 PetscBool usecprow=a->compressedrow.use;
1693 const PetscInt *idx,*ii,*ridx=NULL;
1694
1695 __m256d a0,a1,a2,a3,a4,a5;
1696 __m256d w0,w1,w2,w3;
1697 __m256d z0,z1,z2;
1698 __m256i mask1 = _mm256_set_epi64x(0LL, 0LL, 0LL, 1LL<<63);
1699
1700 PetscFunctionBegin;
1701 ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1702 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1703 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1704
1705 idx = a->j;
1706 v = a->a;
1707 if (usecprow) {
1708 mbs = a->compressedrow.nrows;
1709 ii = a->compressedrow.i;
1710 ridx = a->compressedrow.rindex;
1711 } else {
1712 mbs = a->mbs;
1713 ii = a->i;
1714 z = zarray;
1715 }
1716
1717 if (!a->mult_work) {
1718 k = PetscMax(A->rmap->n,A->cmap->n);
1719 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1720 }
1721
1722 work = a->mult_work;
1723 for (i=0; i<mbs; i++) {
1724 n = ii[1] - ii[0]; ii++;
1725 workt = work;
1726 for (j=0; j<n; j++) {
1727 xb = x + bs*(*idx++);
1728 for (k=0; k<bs; k++) workt[k] = xb[k];
1729 workt += bs;
1730 }
1731 if (usecprow) z = zarray + bs*ridx[i];
1732
1733 z0 = _mm256_loadu_pd(&z[ 0]); z1 = _mm256_loadu_pd(&z[ 4]); z2 = _mm256_set1_pd(z[ 8]);
1734
1735 for (j=0; j<n; j++) {
1736 /* first column of a */
1737 w0 = _mm256_set1_pd(work[j*9 ]);
1738 a0 = _mm256_loadu_pd(&v[j*81 ]); z0 = _mm256_fmadd_pd(a0,w0,z0);
1739 a1 = _mm256_loadu_pd(&v[j*81+4]); z1 = _mm256_fmadd_pd(a1,w0,z1);
1740 a2 = _mm256_loadu_pd(&v[j*81+8]); z2 = _mm256_fmadd_pd(a2,w0,z2);
1741
1742 /* second column of a */
1743 w1 = _mm256_set1_pd(work[j*9+ 1]);
1744 a0 = _mm256_loadu_pd(&v[j*81+ 9]); z0 = _mm256_fmadd_pd(a0,w1,z0);
1745 a1 = _mm256_loadu_pd(&v[j*81+13]); z1 = _mm256_fmadd_pd(a1,w1,z1);
1746 a2 = _mm256_loadu_pd(&v[j*81+17]); z2 = _mm256_fmadd_pd(a2,w1,z2);
1747
1748 /* third column of a */
1749 w2 = _mm256_set1_pd(work[j*9+ 2]);
1750 a3 = _mm256_loadu_pd(&v[j*81+18]); z0 = _mm256_fmadd_pd(a3,w2,z0);
1751 a4 = _mm256_loadu_pd(&v[j*81+22]); z1 = _mm256_fmadd_pd(a4,w2,z1);
1752 a5 = _mm256_loadu_pd(&v[j*81+26]); z2 = _mm256_fmadd_pd(a5,w2,z2);
1753
1754 /* fourth column of a */
1755 w3 = _mm256_set1_pd(work[j*9+ 3]);
1756 a0 = _mm256_loadu_pd(&v[j*81+27]); z0 = _mm256_fmadd_pd(a0,w3,z0);
1757 a1 = _mm256_loadu_pd(&v[j*81+31]); z1 = _mm256_fmadd_pd(a1,w3,z1);
1758 a2 = _mm256_loadu_pd(&v[j*81+35]); z2 = _mm256_fmadd_pd(a2,w3,z2);
1759
1760 /* fifth column of a */
1761 w0 = _mm256_set1_pd(work[j*9+ 4]);
1762 a3 = _mm256_loadu_pd(&v[j*81+36]); z0 = _mm256_fmadd_pd(a3,w0,z0);
1763 a4 = _mm256_loadu_pd(&v[j*81+40]); z1 = _mm256_fmadd_pd(a4,w0,z1);
1764 a5 = _mm256_loadu_pd(&v[j*81+44]); z2 = _mm256_fmadd_pd(a5,w0,z2);
1765
1766 /* sixth column of a */
1767 w1 = _mm256_set1_pd(work[j*9+ 5]);
1768 a0 = _mm256_loadu_pd(&v[j*81+45]); z0 = _mm256_fmadd_pd(a0,w1,z0);
1769 a1 = _mm256_loadu_pd(&v[j*81+49]); z1 = _mm256_fmadd_pd(a1,w1,z1);
1770 a2 = _mm256_loadu_pd(&v[j*81+53]); z2 = _mm256_fmadd_pd(a2,w1,z2);
1771
1772 /* seventh column of a */
1773 w2 = _mm256_set1_pd(work[j*9+ 6]);
1774 a0 = _mm256_loadu_pd(&v[j*81+54]); z0 = _mm256_fmadd_pd(a0,w2,z0);
1775 a1 = _mm256_loadu_pd(&v[j*81+58]); z1 = _mm256_fmadd_pd(a1,w2,z1);
1776 a2 = _mm256_loadu_pd(&v[j*81+62]); z2 = _mm256_fmadd_pd(a2,w2,z2);
1777
1778 /* eigth column of a */
1779 w3 = _mm256_set1_pd(work[j*9+ 7]);
1780 a3 = _mm256_loadu_pd(&v[j*81+63]); z0 = _mm256_fmadd_pd(a3,w3,z0);
1781 a4 = _mm256_loadu_pd(&v[j*81+67]); z1 = _mm256_fmadd_pd(a4,w3,z1);
1782 a5 = _mm256_loadu_pd(&v[j*81+71]); z2 = _mm256_fmadd_pd(a5,w3,z2);
1783
1784 /* ninth column of a */
1785 w0 = _mm256_set1_pd(work[j*9+ 8]);
1786 a0 = _mm256_loadu_pd(&v[j*81+72]); z0 = _mm256_fmadd_pd(a0,w0,z0);
1787 a1 = _mm256_loadu_pd(&v[j*81+76]); z1 = _mm256_fmadd_pd(a1,w0,z1);
1788 a2 = _mm256_maskload_pd(&v[j*81+80],mask1); z2 = _mm256_fmadd_pd(a2,w0,z2);
1789 }
1790
1791 _mm256_storeu_pd(&z[ 0], z0); _mm256_storeu_pd(&z[ 4], z1); _mm256_maskstore_pd(&z[8], mask1, z2);
1792
1793 v += n*bs2;
1794 if (!usecprow) z += bs;
1795 }
1796 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1797 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1798 ierr = PetscLogFlops(162.0*a->nz);CHKERRQ(ierr);
1799 PetscFunctionReturn(0);
1800 }
1801 #endif
1802
MatMultAdd_SeqBAIJ_11(Mat A,Vec xx,Vec yy,Vec zz)1803 PetscErrorCode MatMultAdd_SeqBAIJ_11(Mat A,Vec xx,Vec yy,Vec zz)
1804 {
1805 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1806 PetscScalar *y = NULL,*z = NULL,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11;
1807 const PetscScalar *x,*xb;
1808 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,*yarray,*zarray;
1809 const MatScalar *v;
1810 PetscErrorCode ierr;
1811 PetscInt mbs = a->mbs,i,j,n;
1812 const PetscInt *idx,*ii,*ridx = NULL;
1813 PetscBool usecprow=a->compressedrow.use;
1814
1815 PetscFunctionBegin;
1816 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1817 ierr = VecGetArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1818
1819 idx = a->j;
1820 v = a->a;
1821 if (usecprow) {
1822 if (zz != yy) {
1823 ierr = PetscArraycpy(zarray,yarray,7*mbs);CHKERRQ(ierr);
1824 }
1825 mbs = a->compressedrow.nrows;
1826 ii = a->compressedrow.i;
1827 ridx = a->compressedrow.rindex;
1828 } else {
1829 ii = a->i;
1830 y = yarray;
1831 z = zarray;
1832 }
1833
1834 for (i=0; i<mbs; i++) {
1835 n = ii[1] - ii[0]; ii++;
1836 if (usecprow) {
1837 z = zarray + 11*ridx[i];
1838 y = yarray + 11*ridx[i];
1839 }
1840 sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1841 sum8 = y[7]; sum9 = y[8]; sum10 = y[9]; sum11 = y[10];
1842 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1843 PetscPrefetchBlock(v+121*n,121*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1844 for (j=0; j<n; j++) {
1845 xb = x + 11*(*idx++);
1846 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10];
1847 sum1 += v[ 0]*x1 + v[ 11]*x2 + v[2*11]*x3 + v[3*11]*x4 + v[4*11]*x5 + v[5*11]*x6 + v[6*11]*x7+ v[7*11]*x8 + v[8*11]*x9 + v[9*11]*x10 + v[10*11]*x11;
1848 sum2 += v[1+0]*x1 + v[1+11]*x2 + v[1+2*11]*x3 + v[1+3*11]*x4 + v[1+4*11]*x5 + v[1+5*11]*x6 + v[1+6*11]*x7+ v[1+7*11]*x8 + v[1+8*11]*x9 + v[1+9*11]*x10 + v[1+10*11]*x11;
1849 sum3 += v[2+0]*x1 + v[2+11]*x2 + v[2+2*11]*x3 + v[2+3*11]*x4 + v[2+4*11]*x5 + v[2+5*11]*x6 + v[2+6*11]*x7+ v[2+7*11]*x8 + v[2+8*11]*x9 + v[2+9*11]*x10 + v[2+10*11]*x11;
1850 sum4 += v[3+0]*x1 + v[3+11]*x2 + v[3+2*11]*x3 + v[3+3*11]*x4 + v[3+4*11]*x5 + v[3+5*11]*x6 + v[3+6*11]*x7+ v[3+7*11]*x8 + v[3+8*11]*x9 + v[3+9*11]*x10 + v[3+10*11]*x11;
1851 sum5 += v[4+0]*x1 + v[4+11]*x2 + v[4+2*11]*x3 + v[4+3*11]*x4 + v[4+4*11]*x5 + v[4+5*11]*x6 + v[4+6*11]*x7+ v[4+7*11]*x8 + v[4+8*11]*x9 + v[4+9*11]*x10 + v[4+10*11]*x11;
1852 sum6 += v[5+0]*x1 + v[5+11]*x2 + v[5+2*11]*x3 + v[5+3*11]*x4 + v[5+4*11]*x5 + v[5+5*11]*x6 + v[5+6*11]*x7+ v[5+7*11]*x8 + v[5+8*11]*x9 + v[5+9*11]*x10 + v[5+10*11]*x11;
1853 sum7 += v[6+0]*x1 + v[6+11]*x2 + v[6+2*11]*x3 + v[6+3*11]*x4 + v[6+4*11]*x5 + v[6+5*11]*x6 + v[6+6*11]*x7+ v[6+7*11]*x8 + v[6+8*11]*x9 + v[6+9*11]*x10 + v[6+10*11]*x11;
1854 sum8 += v[7+0]*x1 + v[7+11]*x2 + v[7+2*11]*x3 + v[7+3*11]*x4 + v[7+4*11]*x5 + v[7+5*11]*x6 + v[7+6*11]*x7+ v[7+7*11]*x8 + v[7+8*11]*x9 + v[7+9*11]*x10 + v[7+10*11]*x11;
1855 sum9 += v[8+0]*x1 + v[8+11]*x2 + v[8+2*11]*x3 + v[8+3*11]*x4 + v[8+4*11]*x5 + v[8+5*11]*x6 + v[8+6*11]*x7+ v[8+7*11]*x8 + v[8+8*11]*x9 + v[8+9*11]*x10 + v[8+10*11]*x11;
1856 sum10 += v[9+0]*x1 + v[9+11]*x2 + v[9+2*11]*x3 + v[9+3*11]*x4 + v[9+4*11]*x5 + v[9+5*11]*x6 + v[9+6*11]*x7+ v[9+7*11]*x8 + v[9+8*11]*x9 + v[9+9*11]*x10 + v[9+10*11]*x11;
1857 sum11 += v[10+0]*x1 + v[10+11]*x2 + v[10+2*11]*x3 + v[10+3*11]*x4 + v[10+4*11]*x5 + v[10+5*11]*x6 + v[10+6*11]*x7+ v[10+7*11]*x8 + v[10+8*11]*x9 + v[10+9*11]*x10 + v[10+10*11]*x11;
1858 v += 121;
1859 }
1860 z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1861 z[7] = sum6; z[8] = sum7; z[9] = sum8; z[10] = sum9; z[11] = sum10;
1862 if (!usecprow) {
1863 z += 11; y += 11;
1864 }
1865 }
1866 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1867 ierr = VecRestoreArrayPair(yy,zz,&yarray,&zarray);CHKERRQ(ierr);
1868 ierr = PetscLogFlops(242.0*a->nz);CHKERRQ(ierr);
1869 PetscFunctionReturn(0);
1870 }
1871
MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)1872 PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1873 {
1874 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1875 PetscScalar *z = NULL,*work,*workt,*zarray;
1876 const PetscScalar *x,*xb;
1877 const MatScalar *v;
1878 PetscErrorCode ierr;
1879 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1880 PetscInt ncols,k;
1881 const PetscInt *ridx = NULL,*idx,*ii;
1882 PetscBool usecprow = a->compressedrow.use;
1883
1884 PetscFunctionBegin;
1885 ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1886 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1887 ierr = VecGetArray(zz,&zarray);CHKERRQ(ierr);
1888
1889 idx = a->j;
1890 v = a->a;
1891 if (usecprow) {
1892 mbs = a->compressedrow.nrows;
1893 ii = a->compressedrow.i;
1894 ridx = a->compressedrow.rindex;
1895 } else {
1896 mbs = a->mbs;
1897 ii = a->i;
1898 z = zarray;
1899 }
1900
1901 if (!a->mult_work) {
1902 k = PetscMax(A->rmap->n,A->cmap->n);
1903 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
1904 }
1905 work = a->mult_work;
1906 for (i=0; i<mbs; i++) {
1907 n = ii[1] - ii[0]; ii++;
1908 ncols = n*bs;
1909 workt = work;
1910 for (j=0; j<n; j++) {
1911 xb = x + bs*(*idx++);
1912 for (k=0; k<bs; k++) workt[k] = xb[k];
1913 workt += bs;
1914 }
1915 if (usecprow) z = zarray + bs*ridx[i];
1916 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1917 /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1918 v += n*bs2;
1919 if (!usecprow) z += bs;
1920 }
1921 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1922 ierr = VecRestoreArray(zz,&zarray);CHKERRQ(ierr);
1923 ierr = PetscLogFlops(2.0*a->nz*bs2);CHKERRQ(ierr);
1924 PetscFunctionReturn(0);
1925 }
1926
MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)1927 PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1928 {
1929 PetscScalar zero = 0.0;
1930 PetscErrorCode ierr;
1931
1932 PetscFunctionBegin;
1933 ierr = VecSet(zz,zero);CHKERRQ(ierr);
1934 ierr = MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
1935 PetscFunctionReturn(0);
1936 }
1937
MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)1938 PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1939 {
1940 PetscScalar zero = 0.0;
1941 PetscErrorCode ierr;
1942
1943 PetscFunctionBegin;
1944 ierr = VecSet(zz,zero);CHKERRQ(ierr);
1945 ierr = MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);CHKERRQ(ierr);
1946 PetscFunctionReturn(0);
1947 }
1948
MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)1949 PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1950 {
1951 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1952 PetscScalar *z,x1,x2,x3,x4,x5;
1953 const PetscScalar *x,*xb = NULL;
1954 const MatScalar *v;
1955 PetscErrorCode ierr;
1956 PetscInt mbs,i,rval,bs=A->rmap->bs,j,n;
1957 const PetscInt *idx,*ii,*ib,*ridx = NULL;
1958 Mat_CompressedRow cprow = a->compressedrow;
1959 PetscBool usecprow = cprow.use;
1960
1961 PetscFunctionBegin;
1962 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
1963 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1964 ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1965
1966 idx = a->j;
1967 v = a->a;
1968 if (usecprow) {
1969 mbs = cprow.nrows;
1970 ii = cprow.i;
1971 ridx = cprow.rindex;
1972 } else {
1973 mbs=a->mbs;
1974 ii = a->i;
1975 xb = x;
1976 }
1977
1978 switch (bs) {
1979 case 1:
1980 for (i=0; i<mbs; i++) {
1981 if (usecprow) xb = x + ridx[i];
1982 x1 = xb[0];
1983 ib = idx + ii[0];
1984 n = ii[1] - ii[0]; ii++;
1985 for (j=0; j<n; j++) {
1986 rval = ib[j];
1987 z[rval] += PetscConj(*v) * x1;
1988 v++;
1989 }
1990 if (!usecprow) xb++;
1991 }
1992 break;
1993 case 2:
1994 for (i=0; i<mbs; i++) {
1995 if (usecprow) xb = x + 2*ridx[i];
1996 x1 = xb[0]; x2 = xb[1];
1997 ib = idx + ii[0];
1998 n = ii[1] - ii[0]; ii++;
1999 for (j=0; j<n; j++) {
2000 rval = ib[j]*2;
2001 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
2002 z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
2003 v += 4;
2004 }
2005 if (!usecprow) xb += 2;
2006 }
2007 break;
2008 case 3:
2009 for (i=0; i<mbs; i++) {
2010 if (usecprow) xb = x + 3*ridx[i];
2011 x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2012 ib = idx + ii[0];
2013 n = ii[1] - ii[0]; ii++;
2014 for (j=0; j<n; j++) {
2015 rval = ib[j]*3;
2016 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
2017 z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
2018 z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
2019 v += 9;
2020 }
2021 if (!usecprow) xb += 3;
2022 }
2023 break;
2024 case 4:
2025 for (i=0; i<mbs; i++) {
2026 if (usecprow) xb = x + 4*ridx[i];
2027 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2028 ib = idx + ii[0];
2029 n = ii[1] - ii[0]; ii++;
2030 for (j=0; j<n; j++) {
2031 rval = ib[j]*4;
2032 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4;
2033 z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4;
2034 z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
2035 z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
2036 v += 16;
2037 }
2038 if (!usecprow) xb += 4;
2039 }
2040 break;
2041 case 5:
2042 for (i=0; i<mbs; i++) {
2043 if (usecprow) xb = x + 5*ridx[i];
2044 x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2045 x4 = xb[3]; x5 = xb[4];
2046 ib = idx + ii[0];
2047 n = ii[1] - ii[0]; ii++;
2048 for (j=0; j<n; j++) {
2049 rval = ib[j]*5;
2050 z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5;
2051 z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5;
2052 z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
2053 z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
2054 z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
2055 v += 25;
2056 }
2057 if (!usecprow) xb += 5;
2058 }
2059 break;
2060 default: /* block sizes larger than 5 by 5 are handled by BLAS */
2061 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
2062 #if 0
2063 {
2064 PetscInt ncols,k,bs2=a->bs2;
2065 PetscScalar *work,*workt,zb;
2066 const PetscScalar *xtmp;
2067 if (!a->mult_work) {
2068 k = PetscMax(A->rmap->n,A->cmap->n);
2069 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
2070 }
2071 work = a->mult_work;
2072 xtmp = x;
2073 for (i=0; i<mbs; i++) {
2074 n = ii[1] - ii[0]; ii++;
2075 ncols = n*bs;
2076 ierr = PetscArrayzero(work,ncols);CHKERRQ(ierr);
2077 if (usecprow) xtmp = x + bs*ridx[i];
2078 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2079 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
2080 v += n*bs2;
2081 if (!usecprow) xtmp += bs;
2082 workt = work;
2083 for (j=0; j<n; j++) {
2084 zb = z + bs*(*idx++);
2085 for (k=0; k<bs; k++) zb[k] += workt[k] ;
2086 workt += bs;
2087 }
2088 }
2089 }
2090 #endif
2091 }
2092 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2093 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
2094 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
2095 PetscFunctionReturn(0);
2096 }
2097
MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)2098 PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
2099 {
2100 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2101 PetscScalar *zb,*z,x1,x2,x3,x4,x5;
2102 const PetscScalar *x,*xb = NULL;
2103 const MatScalar *v;
2104 PetscErrorCode ierr;
2105 PetscInt mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2;
2106 const PetscInt *idx,*ii,*ib,*ridx = NULL;
2107 Mat_CompressedRow cprow = a->compressedrow;
2108 PetscBool usecprow=cprow.use;
2109
2110 PetscFunctionBegin;
2111 if (yy != zz) { ierr = VecCopy(yy,zz);CHKERRQ(ierr); }
2112 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
2113 ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
2114
2115 idx = a->j;
2116 v = a->a;
2117 if (usecprow) {
2118 mbs = cprow.nrows;
2119 ii = cprow.i;
2120 ridx = cprow.rindex;
2121 } else {
2122 mbs=a->mbs;
2123 ii = a->i;
2124 xb = x;
2125 }
2126
2127 switch (bs) {
2128 case 1:
2129 for (i=0; i<mbs; i++) {
2130 if (usecprow) xb = x + ridx[i];
2131 x1 = xb[0];
2132 ib = idx + ii[0];
2133 n = ii[1] - ii[0]; ii++;
2134 for (j=0; j<n; j++) {
2135 rval = ib[j];
2136 z[rval] += *v * x1;
2137 v++;
2138 }
2139 if (!usecprow) xb++;
2140 }
2141 break;
2142 case 2:
2143 for (i=0; i<mbs; i++) {
2144 if (usecprow) xb = x + 2*ridx[i];
2145 x1 = xb[0]; x2 = xb[1];
2146 ib = idx + ii[0];
2147 n = ii[1] - ii[0]; ii++;
2148 for (j=0; j<n; j++) {
2149 rval = ib[j]*2;
2150 z[rval++] += v[0]*x1 + v[1]*x2;
2151 z[rval++] += v[2]*x1 + v[3]*x2;
2152 v += 4;
2153 }
2154 if (!usecprow) xb += 2;
2155 }
2156 break;
2157 case 3:
2158 for (i=0; i<mbs; i++) {
2159 if (usecprow) xb = x + 3*ridx[i];
2160 x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2161 ib = idx + ii[0];
2162 n = ii[1] - ii[0]; ii++;
2163 for (j=0; j<n; j++) {
2164 rval = ib[j]*3;
2165 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
2166 z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
2167 z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
2168 v += 9;
2169 }
2170 if (!usecprow) xb += 3;
2171 }
2172 break;
2173 case 4:
2174 for (i=0; i<mbs; i++) {
2175 if (usecprow) xb = x + 4*ridx[i];
2176 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
2177 ib = idx + ii[0];
2178 n = ii[1] - ii[0]; ii++;
2179 for (j=0; j<n; j++) {
2180 rval = ib[j]*4;
2181 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
2182 z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
2183 z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
2184 z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
2185 v += 16;
2186 }
2187 if (!usecprow) xb += 4;
2188 }
2189 break;
2190 case 5:
2191 for (i=0; i<mbs; i++) {
2192 if (usecprow) xb = x + 5*ridx[i];
2193 x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
2194 x4 = xb[3]; x5 = xb[4];
2195 ib = idx + ii[0];
2196 n = ii[1] - ii[0]; ii++;
2197 for (j=0; j<n; j++) {
2198 rval = ib[j]*5;
2199 z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
2200 z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
2201 z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
2202 z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
2203 z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
2204 v += 25;
2205 }
2206 if (!usecprow) xb += 5;
2207 }
2208 break;
2209 default: { /* block sizes larger then 5 by 5 are handled by BLAS */
2210 PetscInt ncols,k;
2211 PetscScalar *work,*workt;
2212 const PetscScalar *xtmp;
2213 if (!a->mult_work) {
2214 k = PetscMax(A->rmap->n,A->cmap->n);
2215 ierr = PetscMalloc1(k+1,&a->mult_work);CHKERRQ(ierr);
2216 }
2217 work = a->mult_work;
2218 xtmp = x;
2219 for (i=0; i<mbs; i++) {
2220 n = ii[1] - ii[0]; ii++;
2221 ncols = n*bs;
2222 ierr = PetscArrayzero(work,ncols);CHKERRQ(ierr);
2223 if (usecprow) xtmp = x + bs*ridx[i];
2224 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
2225 /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
2226 v += n*bs2;
2227 if (!usecprow) xtmp += bs;
2228 workt = work;
2229 for (j=0; j<n; j++) {
2230 zb = z + bs*(*idx++);
2231 for (k=0; k<bs; k++) zb[k] += workt[k];
2232 workt += bs;
2233 }
2234 }
2235 }
2236 }
2237 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
2238 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
2239 ierr = PetscLogFlops(2.0*a->nz*a->bs2);CHKERRQ(ierr);
2240 PetscFunctionReturn(0);
2241 }
2242
MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)2243 PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
2244 {
2245 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data;
2246 PetscInt totalnz = a->bs2*a->nz;
2247 PetscScalar oalpha = alpha;
2248 PetscErrorCode ierr;
2249 PetscBLASInt one = 1,tnz;
2250
2251 PetscFunctionBegin;
2252 ierr = PetscBLASIntCast(totalnz,&tnz);CHKERRQ(ierr);
2253 PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
2254 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
2255 PetscFunctionReturn(0);
2256 }
2257
MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal * norm)2258 PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
2259 {
2260 PetscErrorCode ierr;
2261 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2262 MatScalar *v = a->a;
2263 PetscReal sum = 0.0;
2264 PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
2265
2266 PetscFunctionBegin;
2267 if (type == NORM_FROBENIUS) {
2268 #if defined(PETSC_USE_REAL___FP16)
2269 PetscBLASInt one = 1,cnt = bs2*nz;
2270 *norm = BLASnrm2_(&cnt,v,&one);
2271 #else
2272 for (i=0; i<bs2*nz; i++) {
2273 sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
2274 }
2275 #endif
2276 *norm = PetscSqrtReal(sum);
2277 ierr = PetscLogFlops(2.0*bs2*nz);CHKERRQ(ierr);
2278 } else if (type == NORM_1) { /* maximum column sum */
2279 PetscReal *tmp;
2280 PetscInt *bcol = a->j;
2281 ierr = PetscCalloc1(A->cmap->n+1,&tmp);CHKERRQ(ierr);
2282 for (i=0; i<nz; i++) {
2283 for (j=0; j<bs; j++) {
2284 k1 = bs*(*bcol) + j; /* column index */
2285 for (k=0; k<bs; k++) {
2286 tmp[k1] += PetscAbsScalar(*v); v++;
2287 }
2288 }
2289 bcol++;
2290 }
2291 *norm = 0.0;
2292 for (j=0; j<A->cmap->n; j++) {
2293 if (tmp[j] > *norm) *norm = tmp[j];
2294 }
2295 ierr = PetscFree(tmp);CHKERRQ(ierr);
2296 ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr);
2297 } else if (type == NORM_INFINITY) { /* maximum row sum */
2298 *norm = 0.0;
2299 for (k=0; k<bs; k++) {
2300 for (j=0; j<a->mbs; j++) {
2301 v = a->a + bs2*a->i[j] + k;
2302 sum = 0.0;
2303 for (i=0; i<a->i[j+1]-a->i[j]; i++) {
2304 for (k1=0; k1<bs; k1++) {
2305 sum += PetscAbsScalar(*v);
2306 v += bs;
2307 }
2308 }
2309 if (sum > *norm) *norm = sum;
2310 }
2311 }
2312 ierr = PetscLogFlops(PetscMax(bs2*nz-1,0));CHKERRQ(ierr);
2313 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
2314 PetscFunctionReturn(0);
2315 }
2316
2317
MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)2318 PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
2319 {
2320 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
2321 PetscErrorCode ierr;
2322
2323 PetscFunctionBegin;
2324 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
2325 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
2326 *flg = PETSC_FALSE;
2327 PetscFunctionReturn(0);
2328 }
2329
2330 /* if the a->i are the same */
2331 ierr = PetscArraycmp(a->i,b->i,a->mbs+1,flg);CHKERRQ(ierr);
2332 if (!*flg) PetscFunctionReturn(0);
2333
2334 /* if a->j are the same */
2335 ierr = PetscArraycmp(a->j,b->j,a->nz,flg);CHKERRQ(ierr);
2336 if (!*flg) PetscFunctionReturn(0);
2337
2338 /* if a->a are the same */
2339 ierr = PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs),flg);CHKERRQ(ierr);
2340 PetscFunctionReturn(0);
2341
2342 }
2343
MatGetDiagonal_SeqBAIJ(Mat A,Vec v)2344 PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
2345 {
2346 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2347 PetscErrorCode ierr;
2348 PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
2349 PetscScalar *x,zero = 0.0;
2350 MatScalar *aa,*aa_j;
2351
2352 PetscFunctionBegin;
2353 if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
2354 bs = A->rmap->bs;
2355 aa = a->a;
2356 ai = a->i;
2357 aj = a->j;
2358 ambs = a->mbs;
2359 bs2 = a->bs2;
2360
2361 ierr = VecSet(v,zero);CHKERRQ(ierr);
2362 ierr = VecGetArray(v,&x);CHKERRQ(ierr);
2363 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
2364 if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
2365 for (i=0; i<ambs; i++) {
2366 for (j=ai[i]; j<ai[i+1]; j++) {
2367 if (aj[j] == i) {
2368 row = i*bs;
2369 aa_j = aa+j*bs2;
2370 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2371 break;
2372 }
2373 }
2374 }
2375 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
2376 PetscFunctionReturn(0);
2377 }
2378
MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)2379 PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2380 {
2381 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2382 const PetscScalar *l,*r,*li,*ri;
2383 PetscScalar x;
2384 MatScalar *aa, *v;
2385 PetscErrorCode ierr;
2386 PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2387 const PetscInt *ai,*aj;
2388
2389 PetscFunctionBegin;
2390 ai = a->i;
2391 aj = a->j;
2392 aa = a->a;
2393 m = A->rmap->n;
2394 n = A->cmap->n;
2395 bs = A->rmap->bs;
2396 mbs = a->mbs;
2397 bs2 = a->bs2;
2398 if (ll) {
2399 ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
2400 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
2401 if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2402 for (i=0; i<mbs; i++) { /* for each block row */
2403 M = ai[i+1] - ai[i];
2404 li = l + i*bs;
2405 v = aa + bs2*ai[i];
2406 for (j=0; j<M; j++) { /* for each block */
2407 for (k=0; k<bs2; k++) {
2408 (*v++) *= li[k%bs];
2409 }
2410 }
2411 }
2412 ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
2413 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2414 }
2415
2416 if (rr) {
2417 ierr = VecGetArrayRead(rr,&r);CHKERRQ(ierr);
2418 ierr = VecGetLocalSize(rr,&rn);CHKERRQ(ierr);
2419 if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2420 for (i=0; i<mbs; i++) { /* for each block row */
2421 iai = ai[i];
2422 M = ai[i+1] - iai;
2423 v = aa + bs2*iai;
2424 for (j=0; j<M; j++) { /* for each block */
2425 ri = r + bs*aj[iai+j];
2426 for (k=0; k<bs; k++) {
2427 x = ri[k];
2428 for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2429 v += bs;
2430 }
2431 }
2432 }
2433 ierr = VecRestoreArrayRead(rr,&r);CHKERRQ(ierr);
2434 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
2435 }
2436 PetscFunctionReturn(0);
2437 }
2438
2439
MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo * info)2440 PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2441 {
2442 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2443
2444 PetscFunctionBegin;
2445 info->block_size = a->bs2;
2446 info->nz_allocated = a->bs2*a->maxnz;
2447 info->nz_used = a->bs2*a->nz;
2448 info->nz_unneeded = info->nz_allocated - info->nz_used;
2449 info->assemblies = A->num_ass;
2450 info->mallocs = A->info.mallocs;
2451 info->memory = ((PetscObject)A)->mem;
2452 if (A->factortype) {
2453 info->fill_ratio_given = A->info.fill_ratio_given;
2454 info->fill_ratio_needed = A->info.fill_ratio_needed;
2455 info->factor_mallocs = A->info.factor_mallocs;
2456 } else {
2457 info->fill_ratio_given = 0;
2458 info->fill_ratio_needed = 0;
2459 info->factor_mallocs = 0;
2460 }
2461 PetscFunctionReturn(0);
2462 }
2463
MatZeroEntries_SeqBAIJ(Mat A)2464 PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2465 {
2466 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2467 PetscErrorCode ierr;
2468
2469 PetscFunctionBegin;
2470 ierr = PetscArrayzero(a->a,a->bs2*a->i[a->mbs]);CHKERRQ(ierr);
2471 PetscFunctionReturn(0);
2472 }
2473
MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)2474 PetscErrorCode MatMatMultSymbolic_SeqBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)
2475 {
2476 PetscErrorCode ierr;
2477
2478 PetscFunctionBegin;
2479 ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr);
2480 C->ops->matmultnumeric = MatMatMultNumeric_SeqBAIJ_SeqDense;
2481 PetscFunctionReturn(0);
2482 }
2483
MatMatMult_SeqBAIJ_1_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)2484 PetscErrorCode MatMatMult_SeqBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2485 {
2486 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2487 PetscScalar *z = NULL,sum1;
2488 const PetscScalar *xb;
2489 PetscScalar x1;
2490 const MatScalar *v,*vv;
2491 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2492 PetscBool usecprow=a->compressedrow.use;
2493
2494 PetscFunctionBegin;
2495 idx = a->j;
2496 v = a->a;
2497 if (usecprow) {
2498 mbs = a->compressedrow.nrows;
2499 ii = a->compressedrow.i;
2500 ridx = a->compressedrow.rindex;
2501 } else {
2502 mbs = a->mbs;
2503 ii = a->i;
2504 z = c;
2505 }
2506
2507 for (i=0; i<mbs; i++) {
2508 n = ii[1] - ii[0]; ii++;
2509 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2510 PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2511 if (usecprow) z = c + ridx[i];
2512 jj = idx;
2513 vv = v;
2514 for (k=0; k<cn; k++) {
2515 idx = jj;
2516 v = vv;
2517 sum1 = 0.0;
2518 for (j=0; j<n; j++) {
2519 xb = b + (*idx++); x1 = xb[0+k*bm];
2520 sum1 += v[0]*x1;
2521 v += 1;
2522 }
2523 z[0+k*cm] = sum1;
2524 }
2525 if (!usecprow) z += 1;
2526 }
2527 PetscFunctionReturn(0);
2528 }
2529
MatMatMult_SeqBAIJ_2_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)2530 PetscErrorCode MatMatMult_SeqBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2531 {
2532 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2533 PetscScalar *z = NULL,sum1,sum2;
2534 const PetscScalar *xb;
2535 PetscScalar x1,x2;
2536 const MatScalar *v,*vv;
2537 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2538 PetscBool usecprow=a->compressedrow.use;
2539
2540 PetscFunctionBegin;
2541 idx = a->j;
2542 v = a->a;
2543 if (usecprow) {
2544 mbs = a->compressedrow.nrows;
2545 ii = a->compressedrow.i;
2546 ridx = a->compressedrow.rindex;
2547 } else {
2548 mbs = a->mbs;
2549 ii = a->i;
2550 z = c;
2551 }
2552
2553 for (i=0; i<mbs; i++) {
2554 n = ii[1] - ii[0]; ii++;
2555 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2556 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2557 if (usecprow) z = c + 2*ridx[i];
2558 jj = idx;
2559 vv = v;
2560 for (k=0; k<cn; k++) {
2561 idx = jj;
2562 v = vv;
2563 sum1 = 0.0; sum2 = 0.0;
2564 for (j=0; j<n; j++) {
2565 xb = b + 2*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm];
2566 sum1 += v[0]*x1 + v[2]*x2;
2567 sum2 += v[1]*x1 + v[3]*x2;
2568 v += 4;
2569 }
2570 z[0+k*cm] = sum1; z[1+k*cm] = sum2;
2571 }
2572 if (!usecprow) z += 2;
2573 }
2574 PetscFunctionReturn(0);
2575 }
2576
MatMatMult_SeqBAIJ_3_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)2577 PetscErrorCode MatMatMult_SeqBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2578 {
2579 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2580 PetscScalar *z = NULL,sum1,sum2,sum3;
2581 const PetscScalar *xb;
2582 PetscScalar x1,x2,x3;
2583 const MatScalar *v,*vv;
2584 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2585 PetscBool usecprow=a->compressedrow.use;
2586
2587 PetscFunctionBegin;
2588 idx = a->j;
2589 v = a->a;
2590 if (usecprow) {
2591 mbs = a->compressedrow.nrows;
2592 ii = a->compressedrow.i;
2593 ridx = a->compressedrow.rindex;
2594 } else {
2595 mbs = a->mbs;
2596 ii = a->i;
2597 z = c;
2598 }
2599
2600 for (i=0; i<mbs; i++) {
2601 n = ii[1] - ii[0]; ii++;
2602 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2603 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2604 if (usecprow) z = c + 3*ridx[i];
2605 jj = idx;
2606 vv = v;
2607 for (k=0; k<cn; k++) {
2608 idx = jj;
2609 v = vv;
2610 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
2611 for (j=0; j<n; j++) {
2612 xb = b + 3*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm];
2613 sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
2614 sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
2615 sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
2616 v += 9;
2617 }
2618 z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3;
2619 }
2620 if (!usecprow) z += 3;
2621 }
2622 PetscFunctionReturn(0);
2623 }
2624
MatMatMult_SeqBAIJ_4_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)2625 PetscErrorCode MatMatMult_SeqBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2626 {
2627 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2628 PetscScalar *z = NULL,sum1,sum2,sum3,sum4;
2629 const PetscScalar *xb;
2630 PetscScalar x1,x2,x3,x4;
2631 const MatScalar *v,*vv;
2632 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2633 PetscBool usecprow=a->compressedrow.use;
2634
2635 PetscFunctionBegin;
2636 idx = a->j;
2637 v = a->a;
2638 if (usecprow) {
2639 mbs = a->compressedrow.nrows;
2640 ii = a->compressedrow.i;
2641 ridx = a->compressedrow.rindex;
2642 } else {
2643 mbs = a->mbs;
2644 ii = a->i;
2645 z = c;
2646 }
2647
2648 for (i=0; i<mbs; i++) {
2649 n = ii[1] - ii[0]; ii++;
2650 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2651 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2652 if (usecprow) z = c + 4*ridx[i];
2653 jj = idx;
2654 vv = v;
2655 for (k=0; k<cn; k++) {
2656 idx = jj;
2657 v = vv;
2658 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0;
2659 for (j=0; j<n; j++) {
2660 xb = b + 4*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm];
2661 sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
2662 sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
2663 sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2664 sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2665 v += 16;
2666 }
2667 z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4;
2668 }
2669 if (!usecprow) z += 4;
2670 }
2671 PetscFunctionReturn(0);
2672 }
2673
MatMatMult_SeqBAIJ_5_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)2674 PetscErrorCode MatMatMult_SeqBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
2675 {
2676 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2677 PetscScalar *z = NULL,sum1,sum2,sum3,sum4,sum5;
2678 const PetscScalar *xb;
2679 PetscScalar x1,x2,x3,x4,x5;
2680 const MatScalar *v,*vv;
2681 PetscInt mbs,i,*idx,*ii,j,*jj,n,k,*ridx=NULL;
2682 PetscBool usecprow=a->compressedrow.use;
2683
2684 PetscFunctionBegin;
2685 idx = a->j;
2686 v = a->a;
2687 if (usecprow) {
2688 mbs = a->compressedrow.nrows;
2689 ii = a->compressedrow.i;
2690 ridx = a->compressedrow.rindex;
2691 } else {
2692 mbs = a->mbs;
2693 ii = a->i;
2694 z = c;
2695 }
2696
2697 for (i=0; i<mbs; i++) {
2698 n = ii[1] - ii[0]; ii++;
2699 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
2700 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
2701 if (usecprow) z = c + 5*ridx[i];
2702 jj = idx;
2703 vv = v;
2704 for (k=0; k<cn; k++) {
2705 idx = jj;
2706 v = vv;
2707 sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
2708 for (j=0; j<n; j++) {
2709 xb = b + 5*(*idx++); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; x5 = xb[4+k*bm];
2710 sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2711 sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2712 sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2713 sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2714 sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2715 v += 25;
2716 }
2717 z[0+k*cm] = sum1; z[1+k*cm] = sum2; z[2+k*cm] = sum3; z[3+k*cm] = sum4; z[4+k*cm] = sum5;
2718 }
2719 if (!usecprow) z += 5;
2720 }
2721 PetscFunctionReturn(0);
2722 }
2723
MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A,Mat B,Mat C)2724 PetscErrorCode MatMatMultNumeric_SeqBAIJ_SeqDense(Mat A,Mat B,Mat C)
2725 {
2726 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2727 Mat_SeqDense *bd = (Mat_SeqDense*)B->data;
2728 Mat_SeqDense *cd = (Mat_SeqDense*)C->data;
2729 PetscInt cm=cd->lda,cn=B->cmap->n,bm=bd->lda;
2730 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
2731 PetscBLASInt bbs,bcn,bbm,bcm;
2732 PetscScalar *z = NULL;
2733 PetscScalar *c,*b;
2734 const MatScalar *v;
2735 const PetscInt *idx,*ii,*ridx=NULL;
2736 PetscScalar _DZero=0.0,_DOne=1.0;
2737 PetscBool usecprow=a->compressedrow.use;
2738 PetscErrorCode ierr;
2739
2740 PetscFunctionBegin;
2741 if (!cm || !cn) PetscFunctionReturn(0);
2742 if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n);
2743 if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
2744 if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
2745 b = bd->v;
2746 if (a->nonzerorowcnt != A->rmap->n) {
2747 ierr = MatZeroEntries(C);CHKERRQ(ierr);
2748 }
2749 ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr);
2750 switch (bs) {
2751 case 1:
2752 ierr = MatMatMult_SeqBAIJ_1_Private(A, b, bm, c, cm, cn);
2753 break;
2754 case 2:
2755 ierr = MatMatMult_SeqBAIJ_2_Private(A, b, bm, c, cm, cn);
2756 break;
2757 case 3:
2758 ierr = MatMatMult_SeqBAIJ_3_Private(A, b, bm, c, cm, cn);
2759 break;
2760 case 4:
2761 ierr = MatMatMult_SeqBAIJ_4_Private(A, b, bm, c, cm, cn);
2762 break;
2763 case 5:
2764 ierr = MatMatMult_SeqBAIJ_5_Private(A, b, bm, c, cm, cn);
2765 break;
2766 default: /* block sizes larger than 5 by 5 are handled by BLAS */
2767 ierr = PetscBLASIntCast(bs,&bbs);CHKERRQ(ierr);
2768 ierr = PetscBLASIntCast(cn,&bcn);CHKERRQ(ierr);
2769 ierr = PetscBLASIntCast(bm,&bbm);CHKERRQ(ierr);
2770 ierr = PetscBLASIntCast(cm,&bcm);CHKERRQ(ierr);
2771 idx = a->j;
2772 v = a->a;
2773 if (usecprow) {
2774 mbs = a->compressedrow.nrows;
2775 ii = a->compressedrow.i;
2776 ridx = a->compressedrow.rindex;
2777 } else {
2778 mbs = a->mbs;
2779 ii = a->i;
2780 z = c;
2781 }
2782 for (i=0; i<mbs; i++) {
2783 n = ii[1] - ii[0]; ii++;
2784 if (usecprow) z = c + bs*ridx[i];
2785 if (n) {
2786 PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DZero,z,&bcm));
2787 v += bs2;
2788 }
2789 for (j=1; j<n; j++) {
2790 PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm));
2791 v += bs2;
2792 }
2793 if (!usecprow) z += bs;
2794 }
2795 }
2796 ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr);
2797 ierr = PetscLogFlops((2.0*a->nz*bs2 - bs*a->nonzerorowcnt)*cn);CHKERRQ(ierr);
2798 PetscFunctionReturn(0);
2799 }
2800