1
2 #include <../src/mat/impls/baij/seq/baij.h>
3 #include <../src/mat/impls/dense/seq/dense.h>
4 #include <../src/mat/impls/sbaij/seq/sbaij.h>
5 #include <petsc/private/kernels/blockinvert.h>
6 #include <petscbt.h>
7 #include <petscblaslapack.h>
8
MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)9 PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
10 {
11 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
12 PetscErrorCode ierr;
13 PetscInt brow,i,j,k,l,mbs,n,*nidx,isz,bcol,bcol_max,start,end,*ai,*aj,bs,*nidx2;
14 const PetscInt *idx;
15 PetscBT table_out,table_in;
16
17 PetscFunctionBegin;
18 if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
19 mbs = a->mbs;
20 ai = a->i;
21 aj = a->j;
22 bs = A->rmap->bs;
23 ierr = PetscBTCreate(mbs,&table_out);CHKERRQ(ierr);
24 ierr = PetscMalloc1(mbs+1,&nidx);CHKERRQ(ierr);
25 ierr = PetscMalloc1(A->rmap->N+1,&nidx2);CHKERRQ(ierr);
26 ierr = PetscBTCreate(mbs,&table_in);CHKERRQ(ierr);
27
28 for (i=0; i<is_max; i++) { /* for each is */
29 isz = 0;
30 ierr = PetscBTMemzero(mbs,table_out);CHKERRQ(ierr);
31
32 /* Extract the indices, assume there can be duplicate entries */
33 ierr = ISGetIndices(is[i],&idx);CHKERRQ(ierr);
34 ierr = ISGetLocalSize(is[i],&n);CHKERRQ(ierr);
35
36 /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
37 bcol_max = 0;
38 for (j=0; j<n; ++j) {
39 brow = idx[j]/bs; /* convert the indices into block indices */
40 if (brow >= mbs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
41 if (!PetscBTLookupSet(table_out,brow)) {
42 nidx[isz++] = brow;
43 if (bcol_max < brow) bcol_max = brow;
44 }
45 }
46 ierr = ISRestoreIndices(is[i],&idx);CHKERRQ(ierr);
47 ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
48
49 k = 0;
50 for (j=0; j<ov; j++) { /* for each overlap */
51 /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
52 ierr = PetscBTMemzero(mbs,table_in);CHKERRQ(ierr);
53 for (l=k; l<isz; l++) { ierr = PetscBTSet(table_in,nidx[l]);CHKERRQ(ierr); }
54
55 n = isz; /* length of the updated is[i] */
56 for (brow=0; brow<mbs; brow++) {
57 start = ai[brow]; end = ai[brow+1];
58 if (PetscBTLookup(table_in,brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
59 for (l = start; l<end; l++) {
60 bcol = aj[l];
61 if (!PetscBTLookupSet(table_out,bcol)) {
62 nidx[isz++] = bcol;
63 if (bcol_max < bcol) bcol_max = bcol;
64 }
65 }
66 k++;
67 if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
68 } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
69 for (l = start; l<end; l++) {
70 bcol = aj[l];
71 if (bcol > bcol_max) break;
72 if (PetscBTLookup(table_in,bcol)) {
73 if (!PetscBTLookupSet(table_out,brow)) nidx[isz++] = brow;
74 break; /* for l = start; l<end ; l++) */
75 }
76 }
77 }
78 }
79 } /* for each overlap */
80
81 /* expand the Index Set */
82 for (j=0; j<isz; j++) {
83 for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
84 }
85 ierr = ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
86 }
87 ierr = PetscBTDestroy(&table_out);CHKERRQ(ierr);
88 ierr = PetscFree(nidx);CHKERRQ(ierr);
89 ierr = PetscFree(nidx2);CHKERRQ(ierr);
90 ierr = PetscBTDestroy(&table_in);CHKERRQ(ierr);
91 PetscFunctionReturn(0);
92 }
93
94 /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
95 Zero some ops' to avoid invalid usse */
MatSeqSBAIJZeroOps_Private(Mat Bseq)96 PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
97 {
98 PetscErrorCode ierr;
99
100 PetscFunctionBegin;
101 ierr = MatSetOption(Bseq,MAT_SYMMETRIC,PETSC_FALSE);CHKERRQ(ierr);
102 Bseq->ops->mult = NULL;
103 Bseq->ops->multadd = NULL;
104 Bseq->ops->multtranspose = NULL;
105 Bseq->ops->multtransposeadd = NULL;
106 Bseq->ops->lufactor = NULL;
107 Bseq->ops->choleskyfactor = NULL;
108 Bseq->ops->lufactorsymbolic = NULL;
109 Bseq->ops->choleskyfactorsymbolic = NULL;
110 Bseq->ops->getinertia = NULL;
111 PetscFunctionReturn(0);
112 }
113
114 /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
MatCreateSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat * B)115 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
116 {
117 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*c;
118 PetscErrorCode ierr;
119 PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
120 PetscInt row,mat_i,*mat_j,tcol,*mat_ilen;
121 const PetscInt *irow,*icol;
122 PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
123 PetscInt *aj = a->j,*ai = a->i;
124 MatScalar *mat_a;
125 Mat C;
126 PetscBool flag;
127
128 PetscFunctionBegin;
129
130 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
131 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
132 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
133 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
134
135 ierr = PetscCalloc1(1+oldcols,&smap);CHKERRQ(ierr);
136 ssmap = smap;
137 ierr = PetscMalloc1(1+nrows,&lens);CHKERRQ(ierr);
138 for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
139 /* determine lens of each row */
140 for (i=0; i<nrows; i++) {
141 kstart = ai[irow[i]];
142 kend = kstart + a->ilen[irow[i]];
143 lens[i] = 0;
144 for (k=kstart; k<kend; k++) {
145 if (ssmap[aj[k]]) lens[i]++;
146 }
147 }
148 /* Create and fill new matrix */
149 if (scall == MAT_REUSE_MATRIX) {
150 c = (Mat_SeqSBAIJ*)((*B)->data);
151
152 if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
153 ierr = PetscArraycmp(c->ilen,lens,c->mbs,&flag);CHKERRQ(ierr);
154 if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
155 ierr = PetscArrayzero(c->ilen,c->mbs);CHKERRQ(ierr);
156 C = *B;
157 } else {
158 ierr = MatCreate(PetscObjectComm((PetscObject)A),&C);CHKERRQ(ierr);
159 ierr = MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
160 ierr = MatSetType(C,((PetscObject)A)->type_name);CHKERRQ(ierr);
161 ierr = MatSeqSBAIJSetPreallocation(C,bs,0,lens);CHKERRQ(ierr);
162 }
163 c = (Mat_SeqSBAIJ*)(C->data);
164 for (i=0; i<nrows; i++) {
165 row = irow[i];
166 kstart = ai[row];
167 kend = kstart + a->ilen[row];
168 mat_i = c->i[i];
169 mat_j = c->j + mat_i;
170 mat_a = c->a + mat_i*bs2;
171 mat_ilen = c->ilen + i;
172 for (k=kstart; k<kend; k++) {
173 if ((tcol=ssmap[a->j[k]])) {
174 *mat_j++ = tcol - 1;
175 ierr = PetscArraycpy(mat_a,a->a+k*bs2,bs2);CHKERRQ(ierr);
176 mat_a += bs2;
177 (*mat_ilen)++;
178 }
179 }
180 }
181 /* sort */
182 {
183 MatScalar *work;
184
185 ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr);
186 for (i=0; i<nrows; i++) {
187 PetscInt ilen;
188 mat_i = c->i[i];
189 mat_j = c->j + mat_i;
190 mat_a = c->a + mat_i*bs2;
191 ilen = c->ilen[i];
192 ierr = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr);
193 }
194 ierr = PetscFree(work);CHKERRQ(ierr);
195 }
196
197 /* Free work space */
198 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
199 ierr = PetscFree(smap);CHKERRQ(ierr);
200 ierr = PetscFree(lens);CHKERRQ(ierr);
201 ierr = MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
202 ierr = MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
203
204 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
205 *B = C;
206 PetscFunctionReturn(0);
207 }
208
MatCreateSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat * B)209 PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
210 {
211 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
212 IS is1,is2;
213 PetscErrorCode ierr;
214 PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs;
215 const PetscInt *irow,*icol;
216
217 PetscFunctionBegin;
218 ierr = ISGetIndices(isrow,&irow);CHKERRQ(ierr);
219 ierr = ISGetIndices(iscol,&icol);CHKERRQ(ierr);
220 ierr = ISGetLocalSize(isrow,&nrows);CHKERRQ(ierr);
221 ierr = ISGetLocalSize(iscol,&ncols);CHKERRQ(ierr);
222
223 /* Verify if the indices corespond to each element in a block
224 and form the IS with compressed IS */
225 maxmnbs = PetscMax(a->mbs,a->nbs);
226 ierr = PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);CHKERRQ(ierr);
227 ierr = PetscArrayzero(vary,a->mbs);CHKERRQ(ierr);
228 for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
229 for (i=0; i<a->mbs; i++) {
230 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
231 }
232 count = 0;
233 for (i=0; i<nrows; i++) {
234 PetscInt j = irow[i] / bs;
235 if ((vary[j]--)==bs) iary[count++] = j;
236 }
237 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);CHKERRQ(ierr);
238
239 ierr = PetscArrayzero(vary,a->nbs);CHKERRQ(ierr);
240 for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
241 for (i=0; i<a->nbs; i++) {
242 if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
243 }
244 count = 0;
245 for (i=0; i<ncols; i++) {
246 PetscInt j = icol[i] / bs;
247 if ((vary[j]--)==bs) iary[count++] = j;
248 }
249 ierr = ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);CHKERRQ(ierr);
250 ierr = ISRestoreIndices(isrow,&irow);CHKERRQ(ierr);
251 ierr = ISRestoreIndices(iscol,&icol);CHKERRQ(ierr);
252 ierr = PetscFree2(vary,iary);CHKERRQ(ierr);
253
254 ierr = MatCreateSubMatrix_SeqSBAIJ_Private(A,is1,is2,scall,B);CHKERRQ(ierr);
255 ierr = ISDestroy(&is1);CHKERRQ(ierr);
256 ierr = ISDestroy(&is2);CHKERRQ(ierr);
257
258 if (isrow != iscol) {
259 PetscBool isequal;
260 ierr = ISEqual(isrow,iscol,&isequal);CHKERRQ(ierr);
261 if (!isequal) {
262 ierr = MatSeqSBAIJZeroOps_Private(*B);CHKERRQ(ierr);
263 }
264 }
265 PetscFunctionReturn(0);
266 }
267
MatCreateSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat * B[])268 PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
269 {
270 PetscErrorCode ierr;
271 PetscInt i;
272
273 PetscFunctionBegin;
274 if (scall == MAT_INITIAL_MATRIX) {
275 ierr = PetscCalloc1(n+1,B);CHKERRQ(ierr);
276 }
277
278 for (i=0; i<n; i++) {
279 ierr = MatCreateSubMatrix_SeqSBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);CHKERRQ(ierr);
280 }
281 PetscFunctionReturn(0);
282 }
283
284 /* -------------------------------------------------------*/
285 /* Should check that shapes of vectors and matrices match */
286 /* -------------------------------------------------------*/
287
MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)288 PetscErrorCode MatMult_SeqSBAIJ_2(Mat A,Vec xx,Vec zz)
289 {
290 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
291 PetscScalar *z,x1,x2,zero=0.0;
292 const PetscScalar *x,*xb;
293 const MatScalar *v;
294 PetscErrorCode ierr;
295 PetscInt mbs = a->mbs,i,n,cval,j,jmin;
296 const PetscInt *aj=a->j,*ai=a->i,*ib;
297 PetscInt nonzerorow=0;
298
299 PetscFunctionBegin;
300 ierr = VecSet(zz,zero);CHKERRQ(ierr);
301 if (!a->nz) PetscFunctionReturn(0);
302 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
303 ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
304
305 v = a->a;
306 xb = x;
307
308 for (i=0; i<mbs; i++) {
309 n = ai[1] - ai[0]; /* length of i_th block row of A */
310 x1 = xb[0]; x2 = xb[1];
311 ib = aj + *ai;
312 jmin = 0;
313 nonzerorow += (n>0);
314 if (*ib == i) { /* (diag of A)*x */
315 z[2*i] += v[0]*x1 + v[2]*x2;
316 z[2*i+1] += v[2]*x1 + v[3]*x2;
317 v += 4; jmin++;
318 }
319 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
320 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
321 for (j=jmin; j<n; j++) {
322 /* (strict lower triangular part of A)*x */
323 cval = ib[j]*2;
324 z[cval] += v[0]*x1 + v[1]*x2;
325 z[cval+1] += v[2]*x1 + v[3]*x2;
326 /* (strict upper triangular part of A)*x */
327 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1];
328 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
329 v += 4;
330 }
331 xb +=2; ai++;
332 }
333
334 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
335 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
336 ierr = PetscLogFlops(8.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
337 PetscFunctionReturn(0);
338 }
339
MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)340 PetscErrorCode MatMult_SeqSBAIJ_3(Mat A,Vec xx,Vec zz)
341 {
342 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
343 PetscScalar *z,x1,x2,x3,zero=0.0;
344 const PetscScalar *x,*xb;
345 const MatScalar *v;
346 PetscErrorCode ierr;
347 PetscInt mbs = a->mbs,i,n,cval,j,jmin;
348 const PetscInt *aj = a->j,*ai = a->i,*ib;
349 PetscInt nonzerorow=0;
350
351 PetscFunctionBegin;
352 ierr = VecSet(zz,zero);CHKERRQ(ierr);
353 if (!a->nz) PetscFunctionReturn(0);
354 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
355 ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
356
357 v = a->a;
358 xb = x;
359
360 for (i=0; i<mbs; i++) {
361 n = ai[1] - ai[0]; /* length of i_th block row of A */
362 x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
363 ib = aj + *ai;
364 jmin = 0;
365 nonzerorow += (n>0);
366 if (*ib == i) { /* (diag of A)*x */
367 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3;
368 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
369 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
370 v += 9; jmin++;
371 }
372 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
373 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
374 for (j=jmin; j<n; j++) {
375 /* (strict lower triangular part of A)*x */
376 cval = ib[j]*3;
377 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3;
378 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
379 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
380 /* (strict upper triangular part of A)*x */
381 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
382 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
383 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
384 v += 9;
385 }
386 xb +=3; ai++;
387 }
388
389 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
390 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
391 ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
392 PetscFunctionReturn(0);
393 }
394
MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)395 PetscErrorCode MatMult_SeqSBAIJ_4(Mat A,Vec xx,Vec zz)
396 {
397 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
398 PetscScalar *z,x1,x2,x3,x4,zero=0.0;
399 const PetscScalar *x,*xb;
400 const MatScalar *v;
401 PetscErrorCode ierr;
402 PetscInt mbs = a->mbs,i,n,cval,j,jmin;
403 const PetscInt *aj = a->j,*ai = a->i,*ib;
404 PetscInt nonzerorow = 0;
405
406 PetscFunctionBegin;
407 ierr = VecSet(zz,zero);CHKERRQ(ierr);
408 if (!a->nz) PetscFunctionReturn(0);
409 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
410 ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
411
412 v = a->a;
413 xb = x;
414
415 for (i=0; i<mbs; i++) {
416 n = ai[1] - ai[0]; /* length of i_th block row of A */
417 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
418 ib = aj + *ai;
419 jmin = 0;
420 nonzerorow += (n>0);
421 if (*ib == i) { /* (diag of A)*x */
422 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
423 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
424 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
425 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
426 v += 16; jmin++;
427 }
428 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
429 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
430 for (j=jmin; j<n; j++) {
431 /* (strict lower triangular part of A)*x */
432 cval = ib[j]*4;
433 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
434 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
435 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
436 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
437 /* (strict upper triangular part of A)*x */
438 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
439 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
440 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
441 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
442 v += 16;
443 }
444 xb +=4; ai++;
445 }
446
447 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
448 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
449 ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
450 PetscFunctionReturn(0);
451 }
452
MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)453 PetscErrorCode MatMult_SeqSBAIJ_5(Mat A,Vec xx,Vec zz)
454 {
455 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
456 PetscScalar *z,x1,x2,x3,x4,x5,zero=0.0;
457 const PetscScalar *x,*xb;
458 const MatScalar *v;
459 PetscErrorCode ierr;
460 PetscInt mbs = a->mbs,i,n,cval,j,jmin;
461 const PetscInt *aj = a->j,*ai = a->i,*ib;
462 PetscInt nonzerorow=0;
463
464 PetscFunctionBegin;
465 ierr = VecSet(zz,zero);CHKERRQ(ierr);
466 if (!a->nz) PetscFunctionReturn(0);
467 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
468 ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
469
470 v = a->a;
471 xb = x;
472
473 for (i=0; i<mbs; i++) {
474 n = ai[1] - ai[0]; /* length of i_th block row of A */
475 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
476 ib = aj + *ai;
477 jmin = 0;
478 nonzerorow += (n>0);
479 if (*ib == i) { /* (diag of A)*x */
480 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
481 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
482 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
483 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
484 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
485 v += 25; jmin++;
486 }
487 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
488 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
489 for (j=jmin; j<n; j++) {
490 /* (strict lower triangular part of A)*x */
491 cval = ib[j]*5;
492 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
493 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
494 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
495 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
496 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
497 /* (strict upper triangular part of A)*x */
498 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
499 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
500 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
501 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
502 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
503 v += 25;
504 }
505 xb +=5; ai++;
506 }
507
508 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
509 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
510 ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
511 PetscFunctionReturn(0);
512 }
513
MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)514 PetscErrorCode MatMult_SeqSBAIJ_6(Mat A,Vec xx,Vec zz)
515 {
516 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
517 PetscScalar *z,x1,x2,x3,x4,x5,x6,zero=0.0;
518 const PetscScalar *x,*xb;
519 const MatScalar *v;
520 PetscErrorCode ierr;
521 PetscInt mbs = a->mbs,i,n,cval,j,jmin;
522 const PetscInt *aj=a->j,*ai=a->i,*ib;
523 PetscInt nonzerorow=0;
524
525 PetscFunctionBegin;
526 ierr = VecSet(zz,zero);CHKERRQ(ierr);
527 if (!a->nz) PetscFunctionReturn(0);
528 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
529 ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
530
531 v = a->a;
532 xb = x;
533
534 for (i=0; i<mbs; i++) {
535 n = ai[1] - ai[0]; /* length of i_th block row of A */
536 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
537 ib = aj + *ai;
538 jmin = 0;
539 nonzerorow += (n>0);
540 if (*ib == i) { /* (diag of A)*x */
541 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
542 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
543 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
544 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
545 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
546 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
547 v += 36; jmin++;
548 }
549 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
550 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
551 for (j=jmin; j<n; j++) {
552 /* (strict lower triangular part of A)*x */
553 cval = ib[j]*6;
554 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
555 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
556 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
557 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
558 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
559 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
560 /* (strict upper triangular part of A)*x */
561 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
562 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
563 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
564 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
565 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
566 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
567 v += 36;
568 }
569 xb +=6; ai++;
570 }
571
572 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
573 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
574 ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
575 PetscFunctionReturn(0);
576 }
577
MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)578 PetscErrorCode MatMult_SeqSBAIJ_7(Mat A,Vec xx,Vec zz)
579 {
580 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
581 PetscScalar *z,x1,x2,x3,x4,x5,x6,x7,zero=0.0;
582 const PetscScalar *x,*xb;
583 const MatScalar *v;
584 PetscErrorCode ierr;
585 PetscInt mbs = a->mbs,i,n,cval,j,jmin;
586 const PetscInt *aj=a->j,*ai=a->i,*ib;
587 PetscInt nonzerorow=0;
588
589 PetscFunctionBegin;
590 ierr = VecSet(zz,zero);CHKERRQ(ierr);
591 if (!a->nz) PetscFunctionReturn(0);
592 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
593 ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
594
595 v = a->a;
596 xb = x;
597
598 for (i=0; i<mbs; i++) {
599 n = ai[1] - ai[0]; /* length of i_th block row of A */
600 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
601 ib = aj + *ai;
602 jmin = 0;
603 nonzerorow += (n>0);
604 if (*ib == i) { /* (diag of A)*x */
605 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
606 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
607 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
608 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
609 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
610 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
611 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
612 v += 49; jmin++;
613 }
614 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
615 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
616 for (j=jmin; j<n; j++) {
617 /* (strict lower triangular part of A)*x */
618 cval = ib[j]*7;
619 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
620 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
621 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
622 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
623 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
624 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
625 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
626 /* (strict upper triangular part of A)*x */
627 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
628 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
629 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
630 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
631 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
632 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
633 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
634 v += 49;
635 }
636 xb +=7; ai++;
637 }
638 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
639 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
640 ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow) - nonzerorow);CHKERRQ(ierr);
641 PetscFunctionReturn(0);
642 }
643
644 /*
645 This will not work with MatScalar == float because it calls the BLAS
646 */
MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)647 PetscErrorCode MatMult_SeqSBAIJ_N(Mat A,Vec xx,Vec zz)
648 {
649 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
650 PetscScalar *z,*z_ptr,*zb,*work,*workt,zero=0.0;
651 const PetscScalar *x,*x_ptr,*xb;
652 const MatScalar *v;
653 PetscErrorCode ierr;
654 PetscInt mbs =a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
655 const PetscInt *idx,*aj,*ii;
656 PetscInt nonzerorow=0;
657
658 PetscFunctionBegin;
659 ierr = VecSet(zz,zero);CHKERRQ(ierr);
660 if (!a->nz) PetscFunctionReturn(0);
661 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
662 ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
663
664 x_ptr = x;
665 z_ptr = z;
666
667 aj = a->j;
668 v = a->a;
669 ii = a->i;
670
671 if (!a->mult_work) {
672 ierr = PetscMalloc1(A->rmap->N+1,&a->mult_work);CHKERRQ(ierr);
673 }
674 work = a->mult_work;
675
676 for (i=0; i<mbs; i++) {
677 n = ii[1] - ii[0]; ncols = n*bs;
678 workt = work; idx=aj+ii[0];
679 nonzerorow += (n>0);
680
681 /* upper triangular part */
682 for (j=0; j<n; j++) {
683 xb = x_ptr + bs*(*idx++);
684 for (k=0; k<bs; k++) workt[k] = xb[k];
685 workt += bs;
686 }
687 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
688 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
689
690 /* strict lower triangular part */
691 idx = aj+ii[0];
692 if (n && *idx == i) {
693 ncols -= bs; v += bs2; idx++; n--;
694 }
695
696 if (ncols > 0) {
697 workt = work;
698 ierr = PetscArrayzero(workt,ncols);CHKERRQ(ierr);
699 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
700 for (j=0; j<n; j++) {
701 zb = z_ptr + bs*(*idx++);
702 for (k=0; k<bs; k++) zb[k] += workt[k];
703 workt += bs;
704 }
705 }
706 x += bs; v += n*bs2; z += bs; ii++;
707 }
708
709 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
710 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
711 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow)*bs2 - nonzerorow);CHKERRQ(ierr);
712 PetscFunctionReturn(0);
713 }
714
MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)715 PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
716 {
717 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
718 PetscScalar *z,x1;
719 const PetscScalar *x,*xb;
720 const MatScalar *v;
721 PetscErrorCode ierr;
722 PetscInt mbs =a->mbs,i,n,cval,j,jmin;
723 const PetscInt *aj=a->j,*ai=a->i,*ib;
724 PetscInt nonzerorow=0;
725 #if defined(PETSC_USE_COMPLEX)
726 const int aconj = A->hermitian;
727 #else
728 const int aconj = 0;
729 #endif
730
731 PetscFunctionBegin;
732 ierr = VecCopy(yy,zz);CHKERRQ(ierr);
733 if (!a->nz) PetscFunctionReturn(0);
734 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
735 ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
736 v = a->a;
737 xb = x;
738
739 for (i=0; i<mbs; i++) {
740 n = ai[1] - ai[0]; /* length of i_th row of A */
741 x1 = xb[0];
742 ib = aj + *ai;
743 jmin = 0;
744 nonzerorow += (n>0);
745 if (n && *ib == i) { /* (diag of A)*x */
746 z[i] += *v++ * x[*ib++]; jmin++;
747 }
748 if (aconj) {
749 for (j=jmin; j<n; j++) {
750 cval = *ib;
751 z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x */
752 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */
753 }
754 } else {
755 for (j=jmin; j<n; j++) {
756 cval = *ib;
757 z[cval] += *v * x1; /* (strict lower triangular part of A)*x */
758 z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */
759 }
760 }
761 xb++; ai++;
762 }
763
764 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
765 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
766
767 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
768 PetscFunctionReturn(0);
769 }
770
MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)771 PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
772 {
773 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
774 PetscScalar *z,x1,x2;
775 const PetscScalar *x,*xb;
776 const MatScalar *v;
777 PetscErrorCode ierr;
778 PetscInt mbs =a->mbs,i,n,cval,j,jmin;
779 const PetscInt *aj=a->j,*ai=a->i,*ib;
780 PetscInt nonzerorow=0;
781
782 PetscFunctionBegin;
783 ierr = VecCopy(yy,zz);CHKERRQ(ierr);
784 if (!a->nz) PetscFunctionReturn(0);
785 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
786 ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
787
788 v = a->a;
789 xb = x;
790
791 for (i=0; i<mbs; i++) {
792 n = ai[1] - ai[0]; /* length of i_th block row of A */
793 x1 = xb[0]; x2 = xb[1];
794 ib = aj + *ai;
795 jmin = 0;
796 nonzerorow += (n>0);
797 if (n && *ib == i) { /* (diag of A)*x */
798 z[2*i] += v[0]*x1 + v[2]*x2;
799 z[2*i+1] += v[2]*x1 + v[3]*x2;
800 v += 4; jmin++;
801 }
802 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
803 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
804 for (j=jmin; j<n; j++) {
805 /* (strict lower triangular part of A)*x */
806 cval = ib[j]*2;
807 z[cval] += v[0]*x1 + v[1]*x2;
808 z[cval+1] += v[2]*x1 + v[3]*x2;
809 /* (strict upper triangular part of A)*x */
810 z[2*i] += v[0]*x[cval] + v[2]*x[cval+1];
811 z[2*i+1] += v[1]*x[cval] + v[3]*x[cval+1];
812 v += 4;
813 }
814 xb +=2; ai++;
815 }
816 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
817 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
818
819 ierr = PetscLogFlops(4.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
820 PetscFunctionReturn(0);
821 }
822
MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)823 PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
824 {
825 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
826 PetscScalar *z,x1,x2,x3;
827 const PetscScalar *x,*xb;
828 const MatScalar *v;
829 PetscErrorCode ierr;
830 PetscInt mbs = a->mbs,i,n,cval,j,jmin;
831 const PetscInt *aj=a->j,*ai=a->i,*ib;
832 PetscInt nonzerorow=0;
833
834 PetscFunctionBegin;
835 ierr = VecCopy(yy,zz);CHKERRQ(ierr);
836 if (!a->nz) PetscFunctionReturn(0);
837 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
838 ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
839
840 v = a->a;
841 xb = x;
842
843 for (i=0; i<mbs; i++) {
844 n = ai[1] - ai[0]; /* length of i_th block row of A */
845 x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
846 ib = aj + *ai;
847 jmin = 0;
848 nonzerorow += (n>0);
849 if (n && *ib == i) { /* (diag of A)*x */
850 z[3*i] += v[0]*x1 + v[3]*x2 + v[6]*x3;
851 z[3*i+1] += v[3]*x1 + v[4]*x2 + v[7]*x3;
852 z[3*i+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
853 v += 9; jmin++;
854 }
855 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
856 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
857 for (j=jmin; j<n; j++) {
858 /* (strict lower triangular part of A)*x */
859 cval = ib[j]*3;
860 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3;
861 z[cval+1] += v[3]*x1 + v[4]*x2 + v[5]*x3;
862 z[cval+2] += v[6]*x1 + v[7]*x2 + v[8]*x3;
863 /* (strict upper triangular part of A)*x */
864 z[3*i] += v[0]*x[cval] + v[3]*x[cval+1]+ v[6]*x[cval+2];
865 z[3*i+1] += v[1]*x[cval] + v[4]*x[cval+1]+ v[7]*x[cval+2];
866 z[3*i+2] += v[2]*x[cval] + v[5]*x[cval+1]+ v[8]*x[cval+2];
867 v += 9;
868 }
869 xb +=3; ai++;
870 }
871
872 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
873 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
874
875 ierr = PetscLogFlops(18.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
876 PetscFunctionReturn(0);
877 }
878
MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)879 PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
880 {
881 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
882 PetscScalar *z,x1,x2,x3,x4;
883 const PetscScalar *x,*xb;
884 const MatScalar *v;
885 PetscErrorCode ierr;
886 PetscInt mbs = a->mbs,i,n,cval,j,jmin;
887 const PetscInt *aj=a->j,*ai=a->i,*ib;
888 PetscInt nonzerorow=0;
889
890 PetscFunctionBegin;
891 ierr = VecCopy(yy,zz);CHKERRQ(ierr);
892 if (!a->nz) PetscFunctionReturn(0);
893 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
894 ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
895
896 v = a->a;
897 xb = x;
898
899 for (i=0; i<mbs; i++) {
900 n = ai[1] - ai[0]; /* length of i_th block row of A */
901 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
902 ib = aj + *ai;
903 jmin = 0;
904 nonzerorow += (n>0);
905 if (n && *ib == i) { /* (diag of A)*x */
906 z[4*i] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
907 z[4*i+1] += v[4]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
908 z[4*i+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[14]*x4;
909 z[4*i+3] += v[12]*x1+ v[13]*x2+ v[14]*x3 + v[15]*x4;
910 v += 16; jmin++;
911 }
912 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
913 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
914 for (j=jmin; j<n; j++) {
915 /* (strict lower triangular part of A)*x */
916 cval = ib[j]*4;
917 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
918 z[cval+1] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
919 z[cval+2] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
920 z[cval+3] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
921 /* (strict upper triangular part of A)*x */
922 z[4*i] += v[0]*x[cval] + v[4]*x[cval+1]+ v[8]*x[cval+2] + v[12]*x[cval+3];
923 z[4*i+1] += v[1]*x[cval] + v[5]*x[cval+1]+ v[9]*x[cval+2] + v[13]*x[cval+3];
924 z[4*i+2] += v[2]*x[cval] + v[6]*x[cval+1]+ v[10]*x[cval+2]+ v[14]*x[cval+3];
925 z[4*i+3] += v[3]*x[cval] + v[7]*x[cval+1]+ v[11]*x[cval+2]+ v[15]*x[cval+3];
926 v += 16;
927 }
928 xb +=4; ai++;
929 }
930
931 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
932 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
933
934 ierr = PetscLogFlops(32.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
935 PetscFunctionReturn(0);
936 }
937
MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)938 PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
939 {
940 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
941 PetscScalar *z,x1,x2,x3,x4,x5;
942 const PetscScalar *x,*xb;
943 const MatScalar *v;
944 PetscErrorCode ierr;
945 PetscInt mbs = a->mbs,i,n,cval,j,jmin;
946 const PetscInt *aj=a->j,*ai=a->i,*ib;
947 PetscInt nonzerorow=0;
948
949 PetscFunctionBegin;
950 ierr = VecCopy(yy,zz);CHKERRQ(ierr);
951 if (!a->nz) PetscFunctionReturn(0);
952 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
953 ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
954
955 v = a->a;
956 xb = x;
957
958 for (i=0; i<mbs; i++) {
959 n = ai[1] - ai[0]; /* length of i_th block row of A */
960 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4];
961 ib = aj + *ai;
962 jmin = 0;
963 nonzerorow += (n>0);
964 if (n && *ib == i) { /* (diag of A)*x */
965 z[5*i] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4+ v[20]*x5;
966 z[5*i+1] += v[5]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4+ v[21]*x5;
967 z[5*i+2] += v[10]*x1 +v[11]*x2 + v[12]*x3 + v[17]*x4+ v[22]*x5;
968 z[5*i+3] += v[15]*x1 +v[16]*x2 + v[17]*x3 + v[18]*x4+ v[23]*x5;
969 z[5*i+4] += v[20]*x1 +v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
970 v += 25; jmin++;
971 }
972 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
973 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
974 for (j=jmin; j<n; j++) {
975 /* (strict lower triangular part of A)*x */
976 cval = ib[j]*5;
977 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
978 z[cval+1] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
979 z[cval+2] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4+ v[14]*x5;
980 z[cval+3] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4+ v[19]*x5;
981 z[cval+4] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4+ v[24]*x5;
982 /* (strict upper triangular part of A)*x */
983 z[5*i] +=v[0]*x[cval]+v[5]*x[cval+1]+v[10]*x[cval+2]+v[15]*x[cval+3]+v[20]*x[cval+4];
984 z[5*i+1] +=v[1]*x[cval]+v[6]*x[cval+1]+v[11]*x[cval+2]+v[16]*x[cval+3]+v[21]*x[cval+4];
985 z[5*i+2] +=v[2]*x[cval]+v[7]*x[cval+1]+v[12]*x[cval+2]+v[17]*x[cval+3]+v[22]*x[cval+4];
986 z[5*i+3] +=v[3]*x[cval]+v[8]*x[cval+1]+v[13]*x[cval+2]+v[18]*x[cval+3]+v[23]*x[cval+4];
987 z[5*i+4] +=v[4]*x[cval]+v[9]*x[cval+1]+v[14]*x[cval+2]+v[19]*x[cval+3]+v[24]*x[cval+4];
988 v += 25;
989 }
990 xb +=5; ai++;
991 }
992
993 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
994 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
995
996 ierr = PetscLogFlops(50.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
997 PetscFunctionReturn(0);
998 }
999
MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)1000 PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1001 {
1002 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1003 PetscScalar *z,x1,x2,x3,x4,x5,x6;
1004 const PetscScalar *x,*xb;
1005 const MatScalar *v;
1006 PetscErrorCode ierr;
1007 PetscInt mbs = a->mbs,i,n,cval,j,jmin;
1008 const PetscInt *aj=a->j,*ai=a->i,*ib;
1009 PetscInt nonzerorow=0;
1010
1011 PetscFunctionBegin;
1012 ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1013 if (!a->nz) PetscFunctionReturn(0);
1014 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1015 ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1016
1017 v = a->a;
1018 xb = x;
1019
1020 for (i=0; i<mbs; i++) {
1021 n = ai[1] - ai[0]; /* length of i_th block row of A */
1022 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5];
1023 ib = aj + *ai;
1024 jmin = 0;
1025 nonzerorow += (n>0);
1026 if (n && *ib == i) { /* (diag of A)*x */
1027 z[6*i] += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4+ v[24]*x5 + v[30]*x6;
1028 z[6*i+1] += v[6]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4+ v[25]*x5 + v[31]*x6;
1029 z[6*i+2] += v[12]*x1 +v[13]*x2 + v[14]*x3 + v[20]*x4+ v[26]*x5 + v[32]*x6;
1030 z[6*i+3] += v[18]*x1 +v[19]*x2 + v[20]*x3 + v[21]*x4+ v[27]*x5 + v[33]*x6;
1031 z[6*i+4] += v[24]*x1 +v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[34]*x6;
1032 z[6*i+5] += v[30]*x1 +v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1033 v += 36; jmin++;
1034 }
1035 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1036 PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1037 for (j=jmin; j<n; j++) {
1038 /* (strict lower triangular part of A)*x */
1039 cval = ib[j]*6;
1040 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6;
1041 z[cval+1] += v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4+ v[10]*x5 + v[11]*x6;
1042 z[cval+2] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4+ v[16]*x5 + v[17]*x6;
1043 z[cval+3] += v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4+ v[22]*x5 + v[23]*x6;
1044 z[cval+4] += v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4+ v[28]*x5 + v[29]*x6;
1045 z[cval+5] += v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4+ v[34]*x5 + v[35]*x6;
1046 /* (strict upper triangular part of A)*x */
1047 z[6*i] +=v[0]*x[cval]+v[6]*x[cval+1]+v[12]*x[cval+2]+v[18]*x[cval+3]+v[24]*x[cval+4]+v[30]*x[cval+5];
1048 z[6*i+1] +=v[1]*x[cval]+v[7]*x[cval+1]+v[13]*x[cval+2]+v[19]*x[cval+3]+v[25]*x[cval+4]+v[31]*x[cval+5];
1049 z[6*i+2] +=v[2]*x[cval]+v[8]*x[cval+1]+v[14]*x[cval+2]+v[20]*x[cval+3]+v[26]*x[cval+4]+v[32]*x[cval+5];
1050 z[6*i+3] +=v[3]*x[cval]+v[9]*x[cval+1]+v[15]*x[cval+2]+v[21]*x[cval+3]+v[27]*x[cval+4]+v[33]*x[cval+5];
1051 z[6*i+4] +=v[4]*x[cval]+v[10]*x[cval+1]+v[16]*x[cval+2]+v[22]*x[cval+3]+v[28]*x[cval+4]+v[34]*x[cval+5];
1052 z[6*i+5] +=v[5]*x[cval]+v[11]*x[cval+1]+v[17]*x[cval+2]+v[23]*x[cval+3]+v[29]*x[cval+4]+v[35]*x[cval+5];
1053 v += 36;
1054 }
1055 xb +=6; ai++;
1056 }
1057
1058 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1059 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1060
1061 ierr = PetscLogFlops(72.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1062 PetscFunctionReturn(0);
1063 }
1064
MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)1065 PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1066 {
1067 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1068 PetscScalar *z,x1,x2,x3,x4,x5,x6,x7;
1069 const PetscScalar *x,*xb;
1070 const MatScalar *v;
1071 PetscErrorCode ierr;
1072 PetscInt mbs = a->mbs,i,n,cval,j,jmin;
1073 const PetscInt *aj=a->j,*ai=a->i,*ib;
1074 PetscInt nonzerorow=0;
1075
1076 PetscFunctionBegin;
1077 ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1078 if (!a->nz) PetscFunctionReturn(0);
1079 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
1080 ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
1081
1082 v = a->a;
1083 xb = x;
1084
1085 for (i=0; i<mbs; i++) {
1086 n = ai[1] - ai[0]; /* length of i_th block row of A */
1087 x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5=xb[4]; x6=xb[5]; x7=xb[6];
1088 ib = aj + *ai;
1089 jmin = 0;
1090 nonzerorow += (n>0);
1091 if (n && *ib == i) { /* (diag of A)*x */
1092 z[7*i] += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4+ v[28]*x5 + v[35]*x6+ v[42]*x7;
1093 z[7*i+1] += v[7]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4+ v[29]*x5 + v[36]*x6+ v[43]*x7;
1094 z[7*i+2] += v[14]*x1+ v[15]*x2 +v[16]*x3 + v[23]*x4+ v[30]*x5 + v[37]*x6+ v[44]*x7;
1095 z[7*i+3] += v[21]*x1+ v[22]*x2 +v[23]*x3 + v[24]*x4+ v[31]*x5 + v[38]*x6+ v[45]*x7;
1096 z[7*i+4] += v[28]*x1+ v[29]*x2 +v[30]*x3 + v[31]*x4+ v[32]*x5 + v[39]*x6+ v[46]*x7;
1097 z[7*i+5] += v[35]*x1+ v[36]*x2 +v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[47]*x7;
1098 z[7*i+6] += v[42]*x1+ v[43]*x2 +v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1099 v += 49; jmin++;
1100 }
1101 PetscPrefetchBlock(ib+jmin+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1102 PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1103 for (j=jmin; j<n; j++) {
1104 /* (strict lower triangular part of A)*x */
1105 cval = ib[j]*7;
1106 z[cval] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4+ v[4]*x5 + v[5]*x6+ v[6]*x7;
1107 z[cval+1] += v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4+ v[11]*x5 + v[12]*x6+ v[13]*x7;
1108 z[cval+2] += v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4+ v[18]*x5 + v[19]*x6+ v[20]*x7;
1109 z[cval+3] += v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4+ v[25]*x5 + v[26]*x6+ v[27]*x7;
1110 z[cval+4] += v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4+ v[32]*x5 + v[33]*x6+ v[34]*x7;
1111 z[cval+5] += v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4+ v[39]*x5 + v[40]*x6+ v[41]*x7;
1112 z[cval+6] += v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4+ v[46]*x5 + v[47]*x6+ v[48]*x7;
1113 /* (strict upper triangular part of A)*x */
1114 z[7*i] +=v[0]*x[cval]+v[7]*x[cval+1]+v[14]*x[cval+2]+v[21]*x[cval+3]+v[28]*x[cval+4]+v[35]*x[cval+5]+v[42]*x[cval+6];
1115 z[7*i+1]+=v[1]*x[cval]+v[8]*x[cval+1]+v[15]*x[cval+2]+v[22]*x[cval+3]+v[29]*x[cval+4]+v[36]*x[cval+5]+v[43]*x[cval+6];
1116 z[7*i+2]+=v[2]*x[cval]+v[9]*x[cval+1]+v[16]*x[cval+2]+v[23]*x[cval+3]+v[30]*x[cval+4]+v[37]*x[cval+5]+v[44]*x[cval+6];
1117 z[7*i+3]+=v[3]*x[cval]+v[10]*x[cval+1]+v[17]*x[cval+2]+v[24]*x[cval+3]+v[31]*x[cval+4]+v[38]*x[cval+5]+v[45]*x[cval+6];
1118 z[7*i+4]+=v[4]*x[cval]+v[11]*x[cval+1]+v[18]*x[cval+2]+v[25]*x[cval+3]+v[32]*x[cval+4]+v[39]*x[cval+5]+v[46]*x[cval+6];
1119 z[7*i+5]+=v[5]*x[cval]+v[12]*x[cval+1]+v[19]*x[cval+2]+v[26]*x[cval+3]+v[33]*x[cval+4]+v[40]*x[cval+5]+v[47]*x[cval+6];
1120 z[7*i+6]+=v[6]*x[cval]+v[13]*x[cval+1]+v[20]*x[cval+2]+v[27]*x[cval+3]+v[34]*x[cval+4]+v[41]*x[cval+5]+v[48]*x[cval+6];
1121 v += 49;
1122 }
1123 xb +=7; ai++;
1124 }
1125
1126 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1127 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1128
1129 ierr = PetscLogFlops(98.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1130 PetscFunctionReturn(0);
1131 }
1132
MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)1133 PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1134 {
1135 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1136 PetscScalar *z,*z_ptr=NULL,*zb,*work,*workt;
1137 const PetscScalar *x,*x_ptr,*xb;
1138 const MatScalar *v;
1139 PetscErrorCode ierr;
1140 PetscInt mbs = a->mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2,ncols,k;
1141 const PetscInt *idx,*aj,*ii;
1142 PetscInt nonzerorow=0;
1143
1144 PetscFunctionBegin;
1145 ierr = VecCopy(yy,zz);CHKERRQ(ierr);
1146 if (!a->nz) PetscFunctionReturn(0);
1147 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); x_ptr=x;
1148 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); z_ptr=z;
1149
1150 aj = a->j;
1151 v = a->a;
1152 ii = a->i;
1153
1154 if (!a->mult_work) {
1155 ierr = PetscMalloc1(A->rmap->n+1,&a->mult_work);CHKERRQ(ierr);
1156 }
1157 work = a->mult_work;
1158
1159
1160 for (i=0; i<mbs; i++) {
1161 n = ii[1] - ii[0]; ncols = n*bs;
1162 workt = work; idx=aj+ii[0];
1163 nonzerorow += (n>0);
1164
1165 /* upper triangular part */
1166 for (j=0; j<n; j++) {
1167 xb = x_ptr + bs*(*idx++);
1168 for (k=0; k<bs; k++) workt[k] = xb[k];
1169 workt += bs;
1170 }
1171 /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1172 PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1173
1174 /* strict lower triangular part */
1175 idx = aj+ii[0];
1176 if (n && *idx == i) {
1177 ncols -= bs; v += bs2; idx++; n--;
1178 }
1179 if (ncols > 0) {
1180 workt = work;
1181 ierr = PetscArrayzero(workt,ncols);CHKERRQ(ierr);
1182 PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,v,workt);
1183 for (j=0; j<n; j++) {
1184 zb = z_ptr + bs*(*idx++);
1185 for (k=0; k<bs; k++) zb[k] += workt[k];
1186 workt += bs;
1187 }
1188 }
1189
1190 x += bs; v += n*bs2; z += bs; ii++;
1191 }
1192
1193 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
1194 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
1195
1196 ierr = PetscLogFlops(2.0*(a->nz*2.0 - nonzerorow));CHKERRQ(ierr);
1197 PetscFunctionReturn(0);
1198 }
1199
MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)1200 PetscErrorCode MatScale_SeqSBAIJ(Mat inA,PetscScalar alpha)
1201 {
1202 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)inA->data;
1203 PetscScalar oalpha = alpha;
1204 PetscErrorCode ierr;
1205 PetscBLASInt one = 1,totalnz;
1206
1207 PetscFunctionBegin;
1208 ierr = PetscBLASIntCast(a->bs2*a->nz,&totalnz);CHKERRQ(ierr);
1209 PetscStackCallBLAS("BLASscal",BLASscal_(&totalnz,&oalpha,a->a,&one));
1210 ierr = PetscLogFlops(totalnz);CHKERRQ(ierr);
1211 PetscFunctionReturn(0);
1212 }
1213
MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal * norm)1214 PetscErrorCode MatNorm_SeqSBAIJ(Mat A,NormType type,PetscReal *norm)
1215 {
1216 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1217 const MatScalar *v = a->a;
1218 PetscReal sum_diag = 0.0, sum_off = 0.0, *sum;
1219 PetscInt i,j,k,bs = A->rmap->bs,bs2=a->bs2,k1,mbs=a->mbs,jmin,jmax,nexti,ik,*jl,*il;
1220 PetscErrorCode ierr;
1221 const PetscInt *aj=a->j,*col;
1222
1223 PetscFunctionBegin;
1224 if (!a->nz) {
1225 *norm = 0.0;
1226 PetscFunctionReturn(0);
1227 }
1228 if (type == NORM_FROBENIUS) {
1229 for (k=0; k<mbs; k++) {
1230 jmin = a->i[k];
1231 jmax = a->i[k+1];
1232 col = aj + jmin;
1233 if (jmax-jmin > 0 && *col == k) { /* diagonal block */
1234 for (i=0; i<bs2; i++) {
1235 sum_diag += PetscRealPart(PetscConj(*v)*(*v)); v++;
1236 }
1237 jmin++;
1238 }
1239 for (j=jmin; j<jmax; j++) { /* off-diagonal blocks */
1240 for (i=0; i<bs2; i++) {
1241 sum_off += PetscRealPart(PetscConj(*v)*(*v)); v++;
1242 }
1243 }
1244 }
1245 *norm = PetscSqrtReal(sum_diag + 2*sum_off);
1246 ierr = PetscLogFlops(2.0*bs2*a->nz);CHKERRQ(ierr);
1247 } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
1248 ierr = PetscMalloc3(bs,&sum,mbs,&il,mbs,&jl);CHKERRQ(ierr);
1249 for (i=0; i<mbs; i++) jl[i] = mbs;
1250 il[0] = 0;
1251
1252 *norm = 0.0;
1253 for (k=0; k<mbs; k++) { /* k_th block row */
1254 for (j=0; j<bs; j++) sum[j]=0.0;
1255 /*-- col sum --*/
1256 i = jl[k]; /* first |A(i,k)| to be added */
1257 /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1258 at step k */
1259 while (i<mbs) {
1260 nexti = jl[i]; /* next block row to be added */
1261 ik = il[i]; /* block index of A(i,k) in the array a */
1262 for (j=0; j<bs; j++) {
1263 v = a->a + ik*bs2 + j*bs;
1264 for (k1=0; k1<bs; k1++) {
1265 sum[j] += PetscAbsScalar(*v); v++;
1266 }
1267 }
1268 /* update il, jl */
1269 jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1270 jmax = a->i[i+1];
1271 if (jmin < jmax) {
1272 il[i] = jmin;
1273 j = a->j[jmin];
1274 jl[i] = jl[j]; jl[j]=i;
1275 }
1276 i = nexti;
1277 }
1278 /*-- row sum --*/
1279 jmin = a->i[k];
1280 jmax = a->i[k+1];
1281 for (i=jmin; i<jmax; i++) {
1282 for (j=0; j<bs; j++) {
1283 v = a->a + i*bs2 + j;
1284 for (k1=0; k1<bs; k1++) {
1285 sum[j] += PetscAbsScalar(*v); v += bs;
1286 }
1287 }
1288 }
1289 /* add k_th block row to il, jl */
1290 col = aj+jmin;
1291 if (jmax - jmin > 0 && *col == k) jmin++;
1292 if (jmin < jmax) {
1293 il[k] = jmin;
1294 j = a->j[jmin]; jl[k] = jl[j]; jl[j] = k;
1295 }
1296 for (j=0; j<bs; j++) {
1297 if (sum[j] > *norm) *norm = sum[j];
1298 }
1299 }
1300 ierr = PetscFree3(sum,il,jl);CHKERRQ(ierr);
1301 ierr = PetscLogFlops(PetscMax(mbs*a->nz-1,0));CHKERRQ(ierr);
1302 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
1303 PetscFunctionReturn(0);
1304 }
1305
MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg)1306 PetscErrorCode MatEqual_SeqSBAIJ(Mat A,Mat B,PetscBool * flg)
1307 {
1308 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b = (Mat_SeqSBAIJ*)B->data;
1309 PetscErrorCode ierr;
1310
1311 PetscFunctionBegin;
1312 /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
1313 if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1314 *flg = PETSC_FALSE;
1315 PetscFunctionReturn(0);
1316 }
1317
1318 /* if the a->i are the same */
1319 ierr = PetscArraycmp(a->i,b->i,a->mbs+1,flg);CHKERRQ(ierr);
1320 if (!*flg) PetscFunctionReturn(0);
1321
1322 /* if a->j are the same */
1323 ierr = PetscArraycmp(a->j,b->j,a->nz,flg);CHKERRQ(ierr);
1324 if (!*flg) PetscFunctionReturn(0);
1325
1326 /* if a->a are the same */
1327 ierr = PetscArraycmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(A->rmap->bs),flg);CHKERRQ(ierr);
1328 PetscFunctionReturn(0);
1329 }
1330
MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)1331 PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A,Vec v)
1332 {
1333 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1334 PetscErrorCode ierr;
1335 PetscInt i,j,k,row,bs,ambs,bs2;
1336 const PetscInt *ai,*aj;
1337 PetscScalar *x,zero = 0.0;
1338 const MatScalar *aa,*aa_j;
1339
1340 PetscFunctionBegin;
1341 bs = A->rmap->bs;
1342 if (A->factortype && bs>1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix with bs>1");
1343
1344 aa = a->a;
1345 ambs = a->mbs;
1346
1347 if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1348 PetscInt *diag=a->diag;
1349 aa = a->a;
1350 ambs = a->mbs;
1351 ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1352 for (i=0; i<ambs; i++) x[i] = 1.0/aa[diag[i]];
1353 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1354 PetscFunctionReturn(0);
1355 }
1356
1357 ai = a->i;
1358 aj = a->j;
1359 bs2 = a->bs2;
1360 ierr = VecSet(v,zero);CHKERRQ(ierr);
1361 if (!a->nz) PetscFunctionReturn(0);
1362 ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1363 for (i=0; i<ambs; i++) {
1364 j = ai[i];
1365 if (aj[j] == i) { /* if this is a diagonal element */
1366 row = i*bs;
1367 aa_j = aa + j*bs2;
1368 for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
1369 }
1370 }
1371 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1372 PetscFunctionReturn(0);
1373 }
1374
MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)1375 PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A,Vec ll,Vec rr)
1376 {
1377 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1378 PetscScalar x;
1379 const PetscScalar *l,*li,*ri;
1380 MatScalar *aa,*v;
1381 PetscErrorCode ierr;
1382 PetscInt i,j,k,lm,M,m,mbs,tmp,bs,bs2;
1383 const PetscInt *ai,*aj;
1384 PetscBool flg;
1385
1386 PetscFunctionBegin;
1387 if (ll != rr) {
1388 ierr = VecEqual(ll,rr,&flg);CHKERRQ(ierr);
1389 if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"For symmetric format, left and right scaling vectors must be same\n");
1390 }
1391 if (!ll) PetscFunctionReturn(0);
1392 ai = a->i;
1393 aj = a->j;
1394 aa = a->a;
1395 m = A->rmap->N;
1396 bs = A->rmap->bs;
1397 mbs = a->mbs;
1398 bs2 = a->bs2;
1399
1400 ierr = VecGetArrayRead(ll,&l);CHKERRQ(ierr);
1401 ierr = VecGetLocalSize(ll,&lm);CHKERRQ(ierr);
1402 if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
1403 for (i=0; i<mbs; i++) { /* for each block row */
1404 M = ai[i+1] - ai[i];
1405 li = l + i*bs;
1406 v = aa + bs2*ai[i];
1407 for (j=0; j<M; j++) { /* for each block */
1408 ri = l + bs*aj[ai[i]+j];
1409 for (k=0; k<bs; k++) {
1410 x = ri[k];
1411 for (tmp=0; tmp<bs; tmp++) (*v++) *= li[tmp]*x;
1412 }
1413 }
1414 }
1415 ierr = VecRestoreArrayRead(ll,&l);CHKERRQ(ierr);
1416 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1417 PetscFunctionReturn(0);
1418 }
1419
MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo * info)1420 PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A,MatInfoType flag,MatInfo *info)
1421 {
1422 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1423
1424 PetscFunctionBegin;
1425 info->block_size = a->bs2;
1426 info->nz_allocated = a->bs2*a->maxnz; /*num. of nonzeros in upper triangular part */
1427 info->nz_used = a->bs2*a->nz; /*num. of nonzeros in upper triangular part */
1428 info->nz_unneeded = info->nz_allocated - info->nz_used;
1429 info->assemblies = A->num_ass;
1430 info->mallocs = A->info.mallocs;
1431 info->memory = ((PetscObject)A)->mem;
1432 if (A->factortype) {
1433 info->fill_ratio_given = A->info.fill_ratio_given;
1434 info->fill_ratio_needed = A->info.fill_ratio_needed;
1435 info->factor_mallocs = A->info.factor_mallocs;
1436 } else {
1437 info->fill_ratio_given = 0;
1438 info->fill_ratio_needed = 0;
1439 info->factor_mallocs = 0;
1440 }
1441 PetscFunctionReturn(0);
1442 }
1443
MatZeroEntries_SeqSBAIJ(Mat A)1444 PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1445 {
1446 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1447 PetscErrorCode ierr;
1448
1449 PetscFunctionBegin;
1450 ierr = PetscArrayzero(a->a,a->bs2*a->i[a->mbs]);CHKERRQ(ierr);
1451 PetscFunctionReturn(0);
1452 }
1453
MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])1454 PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A,Vec v,PetscInt idx[])
1455 {
1456 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1457 PetscErrorCode ierr;
1458 PetscInt i,j,n,row,col,bs,mbs;
1459 const PetscInt *ai,*aj;
1460 PetscReal atmp;
1461 const MatScalar *aa;
1462 PetscScalar *x;
1463 PetscInt ncols,brow,bcol,krow,kcol;
1464
1465 PetscFunctionBegin;
1466 if (idx) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Send email to petsc-maint@mcs.anl.gov");
1467 if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1468 bs = A->rmap->bs;
1469 aa = a->a;
1470 ai = a->i;
1471 aj = a->j;
1472 mbs = a->mbs;
1473
1474 ierr = VecSet(v,0.0);CHKERRQ(ierr);
1475 ierr = VecGetArray(v,&x);CHKERRQ(ierr);
1476 ierr = VecGetLocalSize(v,&n);CHKERRQ(ierr);
1477 if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
1478 for (i=0; i<mbs; i++) {
1479 ncols = ai[1] - ai[0]; ai++;
1480 brow = bs*i;
1481 for (j=0; j<ncols; j++) {
1482 bcol = bs*(*aj);
1483 for (kcol=0; kcol<bs; kcol++) {
1484 col = bcol + kcol; /* col index */
1485 for (krow=0; krow<bs; krow++) {
1486 atmp = PetscAbsScalar(*aa); aa++;
1487 row = brow + krow; /* row index */
1488 if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1489 if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1490 }
1491 }
1492 aj++;
1493 }
1494 }
1495 ierr = VecRestoreArray(v,&x);CHKERRQ(ierr);
1496 PetscFunctionReturn(0);
1497 }
1498
MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)1499 PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A,Mat B,PetscReal fill,Mat C)
1500 {
1501 PetscErrorCode ierr;
1502
1503 PetscFunctionBegin;
1504 ierr = MatMatMultSymbolic_SeqDense_SeqDense(A,B,0.0,C);CHKERRQ(ierr);
1505 C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense;
1506 PetscFunctionReturn(0);
1507 }
1508
MatMatMult_SeqSBAIJ_1_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)1509 PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1510 {
1511 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1512 PetscScalar *z = c;
1513 const PetscScalar *xb;
1514 PetscScalar x1;
1515 const MatScalar *v = a->a,*vv;
1516 PetscInt mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1517 #if defined(PETSC_USE_COMPLEX)
1518 const int aconj = A->hermitian;
1519 #else
1520 const int aconj = 0;
1521 #endif
1522
1523 PetscFunctionBegin;
1524 for (i=0; i<mbs; i++) {
1525 n = ii[1] - ii[0]; ii++;
1526 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1527 PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1528 jj = idx;
1529 vv = v;
1530 for (k=0; k<cn; k++) {
1531 idx = jj;
1532 v = vv;
1533 for (j=0; j<n; j++) {
1534 xb = b + (*idx); x1 = xb[0+k*bm];
1535 z[0+k*cm] += v[0]*x1;
1536 if (*idx != i) c[(*idx)+k*cm] += (aconj ? PetscConj(v[0]) : v[0])*b[i+k*bm];
1537 v += 1;
1538 ++idx;
1539 }
1540 }
1541 z += 1;
1542 }
1543 PetscFunctionReturn(0);
1544 }
1545
MatMatMult_SeqSBAIJ_2_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)1546 PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1547 {
1548 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1549 PetscScalar *z = c;
1550 const PetscScalar *xb;
1551 PetscScalar x1,x2;
1552 const MatScalar *v = a->a,*vv;
1553 PetscInt mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1554
1555 PetscFunctionBegin;
1556 for (i=0; i<mbs; i++) {
1557 n = ii[1] - ii[0]; ii++;
1558 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1559 PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1560 jj = idx;
1561 vv = v;
1562 for (k=0; k<cn; k++) {
1563 idx = jj;
1564 v = vv;
1565 for (j=0; j<n; j++) {
1566 xb = b + 2*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm];
1567 z[0+k*cm] += v[0]*x1 + v[2]*x2;
1568 z[1+k*cm] += v[1]*x1 + v[3]*x2;
1569 if (*idx != i) {
1570 c[2*(*idx)+0+k*cm] += v[0]*b[2*i+k*bm] + v[1]*b[2*i+1+k*bm];
1571 c[2*(*idx)+1+k*cm] += v[2]*b[2*i+k*bm] + v[3]*b[2*i+1+k*bm];
1572 }
1573 v += 4;
1574 ++idx;
1575 }
1576 }
1577 z += 2;
1578 }
1579 PetscFunctionReturn(0);
1580 }
1581
MatMatMult_SeqSBAIJ_3_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)1582 PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1583 {
1584 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1585 PetscScalar *z = c;
1586 const PetscScalar *xb;
1587 PetscScalar x1,x2,x3;
1588 const MatScalar *v = a->a,*vv;
1589 PetscInt mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1590
1591 PetscFunctionBegin;
1592 for (i=0; i<mbs; i++) {
1593 n = ii[1] - ii[0]; ii++;
1594 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1595 PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1596 jj = idx;
1597 vv = v;
1598 for (k=0; k<cn; k++) {
1599 idx = jj;
1600 v = vv;
1601 for (j=0; j<n; j++) {
1602 xb = b + 3*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm];
1603 z[0+k*cm] += v[0]*x1 + v[3]*x2 + v[6]*x3;
1604 z[1+k*cm] += v[1]*x1 + v[4]*x2 + v[7]*x3;
1605 z[2+k*cm] += v[2]*x1 + v[5]*x2 + v[8]*x3;
1606 if (*idx != i) {
1607 c[3*(*idx)+0+k*cm] += v[0]*b[3*i+k*bm] + v[3]*b[3*i+1+k*bm] + v[6]*b[3*i+2+k*bm];
1608 c[3*(*idx)+1+k*cm] += v[1]*b[3*i+k*bm] + v[4]*b[3*i+1+k*bm] + v[7]*b[3*i+2+k*bm];
1609 c[3*(*idx)+2+k*cm] += v[2]*b[3*i+k*bm] + v[5]*b[3*i+1+k*bm] + v[8]*b[3*i+2+k*bm];
1610 }
1611 v += 9;
1612 ++idx;
1613 }
1614 }
1615 z += 3;
1616 }
1617 PetscFunctionReturn(0);
1618 }
1619
MatMatMult_SeqSBAIJ_4_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)1620 PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1621 {
1622 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1623 PetscScalar *z = c;
1624 const PetscScalar *xb;
1625 PetscScalar x1,x2,x3,x4;
1626 const MatScalar *v = a->a,*vv;
1627 PetscInt mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1628
1629 PetscFunctionBegin;
1630 for (i=0; i<mbs; i++) {
1631 n = ii[1] - ii[0]; ii++;
1632 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1633 PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1634 jj = idx;
1635 vv = v;
1636 for (k=0; k<cn; k++) {
1637 idx = jj;
1638 v = vv;
1639 for (j=0; j<n; j++) {
1640 xb = b + 4*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm];
1641 z[0+k*cm] += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1642 z[1+k*cm] += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1643 z[2+k*cm] += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1644 z[3+k*cm] += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1645 if (*idx != i) {
1646 c[4*(*idx)+0+k*cm] += v[0]*b[4*i+k*bm] + v[4]*b[4*i+1+k*bm] + v[8]*b[4*i+2+k*bm] + v[12]*b[4*i+3+k*bm];
1647 c[4*(*idx)+1+k*cm] += v[1]*b[4*i+k*bm] + v[5]*b[4*i+1+k*bm] + v[9]*b[4*i+2+k*bm] + v[13]*b[4*i+3+k*bm];
1648 c[4*(*idx)+2+k*cm] += v[2]*b[4*i+k*bm] + v[6]*b[4*i+1+k*bm] + v[10]*b[4*i+2+k*bm] + v[14]*b[4*i+3+k*bm];
1649 c[4*(*idx)+3+k*cm] += v[3]*b[4*i+k*bm] + v[7]*b[4*i+1+k*bm] + v[11]*b[4*i+2+k*bm] + v[15]*b[4*i+3+k*bm];
1650 }
1651 v += 16;
1652 ++idx;
1653 }
1654 }
1655 z += 4;
1656 }
1657 PetscFunctionReturn(0);
1658 }
1659
MatMatMult_SeqSBAIJ_5_Private(Mat A,PetscScalar * b,PetscInt bm,PetscScalar * c,PetscInt cm,PetscInt cn)1660 PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A,PetscScalar* b,PetscInt bm,PetscScalar* c,PetscInt cm,PetscInt cn)
1661 {
1662 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1663 PetscScalar *z = c;
1664 const PetscScalar *xb;
1665 PetscScalar x1,x2,x3,x4,x5;
1666 const MatScalar *v = a->a,*vv;
1667 PetscInt mbs = a->mbs,i,*idx = a->j,*ii = a->i,j,*jj,n,k;
1668
1669 PetscFunctionBegin;
1670 for (i=0; i<mbs; i++) {
1671 n = ii[1] - ii[0]; ii++;
1672 PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1673 PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1674 jj = idx;
1675 vv = v;
1676 for (k=0; k<cn; k++) {
1677 idx = jj;
1678 v = vv;
1679 for (j=0; j<n; j++) {
1680 xb = b + 5*(*idx); x1 = xb[0+k*bm]; x2 = xb[1+k*bm]; x3 = xb[2+k*bm]; x4 = xb[3+k*bm]; x5 = xb[4+k*cm];
1681 z[0+k*cm] += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1682 z[1+k*cm] += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1683 z[2+k*cm] += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1684 z[3+k*cm] += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1685 z[4+k*cm] += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1686 if (*idx != i) {
1687 c[5*(*idx)+0+k*cm] += v[0]*b[5*i+k*bm] + v[5]*b[5*i+1+k*bm] + v[10]*b[5*i+2+k*bm] + v[15]*b[5*i+3+k*bm] + v[20]*b[5*i+4+k*bm];
1688 c[5*(*idx)+1+k*cm] += v[1]*b[5*i+k*bm] + v[6]*b[5*i+1+k*bm] + v[11]*b[5*i+2+k*bm] + v[16]*b[5*i+3+k*bm] + v[21]*b[5*i+4+k*bm];
1689 c[5*(*idx)+2+k*cm] += v[2]*b[5*i+k*bm] + v[7]*b[5*i+1+k*bm] + v[12]*b[5*i+2+k*bm] + v[17]*b[5*i+3+k*bm] + v[22]*b[5*i+4+k*bm];
1690 c[5*(*idx)+3+k*cm] += v[3]*b[5*i+k*bm] + v[8]*b[5*i+1+k*bm] + v[13]*b[5*i+2+k*bm] + v[18]*b[5*i+3+k*bm] + v[23]*b[5*i+4+k*bm];
1691 c[5*(*idx)+4+k*cm] += v[4]*b[5*i+k*bm] + v[9]*b[5*i+1+k*bm] + v[14]*b[5*i+2+k*bm] + v[19]*b[5*i+3+k*bm] + v[24]*b[5*i+4+k*bm];
1692 }
1693 v += 25;
1694 ++idx;
1695 }
1696 }
1697 z += 5;
1698 }
1699 PetscFunctionReturn(0);
1700 }
1701
MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A,Mat B,Mat C)1702 PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A,Mat B,Mat C)
1703 {
1704 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data;
1705 Mat_SeqDense *bd = (Mat_SeqDense*)B->data;
1706 Mat_SeqDense *cd = (Mat_SeqDense*)C->data;
1707 PetscInt cm=cd->lda,cn=B->cmap->n,bm=bd->lda;
1708 PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1709 PetscBLASInt bbs,bcn,bbm,bcm;
1710 PetscScalar *z = NULL;
1711 PetscScalar *c,*b;
1712 const MatScalar *v;
1713 const PetscInt *idx,*ii;
1714 PetscScalar _DOne=1.0;
1715 PetscErrorCode ierr;
1716
1717 PetscFunctionBegin;
1718 if (!cm || !cn) PetscFunctionReturn(0);
1719 if (B->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in A %D not equal rows in B %D\n",A->cmap->n,B->rmap->n);
1720 if (A->rmap->n != C->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number rows in C %D not equal rows in A %D\n",C->rmap->n,A->rmap->n);
1721 if (B->cmap->n != C->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Number columns in B %D not equal columns in C %D\n",B->cmap->n,C->cmap->n);
1722 b = bd->v;
1723 ierr = MatZeroEntries(C);CHKERRQ(ierr);
1724 ierr = MatDenseGetArray(C,&c);CHKERRQ(ierr);
1725 switch (bs) {
1726 case 1:
1727 ierr = MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn);
1728 break;
1729 case 2:
1730 ierr = MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn);
1731 break;
1732 case 3:
1733 ierr = MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn);
1734 break;
1735 case 4:
1736 ierr = MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn);
1737 break;
1738 case 5:
1739 ierr = MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn);
1740 break;
1741 default: /* block sizes larger than 5 by 5 are handled by BLAS */
1742 ierr = PetscBLASIntCast(bs,&bbs);CHKERRQ(ierr);
1743 ierr = PetscBLASIntCast(cn,&bcn);CHKERRQ(ierr);
1744 ierr = PetscBLASIntCast(bm,&bbm);CHKERRQ(ierr);
1745 ierr = PetscBLASIntCast(cm,&bcm);CHKERRQ(ierr);
1746 idx = a->j;
1747 v = a->a;
1748 mbs = a->mbs;
1749 ii = a->i;
1750 z = c;
1751 for (i=0; i<mbs; i++) {
1752 n = ii[1] - ii[0]; ii++;
1753 for (j=0; j<n; j++) {
1754 if (*idx != i) PetscStackCallBLAS("BLASgemm",BLASgemm_("T","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*i,&bbm,&_DOne,c+bs*(*idx),&bcm));
1755 PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bbs,&bcn,&bbs,&_DOne,v,&bbs,b+bs*(*idx++),&bbm,&_DOne,z,&bcm));
1756 v += bs2;
1757 }
1758 z += bs;
1759 }
1760 }
1761 ierr = MatDenseRestoreArray(C,&c);CHKERRQ(ierr);
1762 ierr = PetscLogFlops((2.0*(a->nz*2.0 - a->nonzerorowcnt)*bs2 - a->nonzerorowcnt)*cn);CHKERRQ(ierr);
1763 PetscFunctionReturn(0);
1764 }
1765