1
2 /*
3 Defines basic operations for the MATSEQAIJPERM matrix class.
4 This class is derived from the MATSEQAIJ class and retains the
5 compressed row storage (aka Yale sparse matrix format) but augments
6 it with some permutation information that enables some operations
7 to be more vectorizable. A physically rearranged copy of the matrix
8 may be stored if the user desires.
9
10 Eventually a variety of permutations may be supported.
11 */
12
13 #include <../src/mat/impls/aij/seq/aij.h>
14
15 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
16 #include <immintrin.h>
17
18 #if !defined(_MM_SCALE_8)
19 #define _MM_SCALE_8 8
20 #endif
21 #if !defined(_MM_SCALE_4)
22 #define _MM_SCALE_4 4
23 #endif
24 #endif
25
26 #define NDIM 512
27 /* NDIM specifies how many rows at a time we should work with when
28 * performing the vectorized mat-vec. This depends on various factors
29 * such as vector register length, etc., and I really need to add a
30 * way for the user (or the library) to tune this. I'm setting it to
31 * 512 for now since that is what Ed D'Azevedo was using in his Fortran
32 * routines. */
33
34 typedef struct {
35 PetscObjectState nonzerostate; /* used to determine if the nonzero structure has changed and hence the permutations need updating */
36
37 PetscInt ngroup;
38 PetscInt *xgroup;
39 /* Denotes where groups of rows with same number of nonzeros
40 * begin and end, i.e., xgroup[i] gives us the position in iperm[]
41 * where the ith group begins. */
42
43 PetscInt *nzgroup; /* how many nonzeros each row that is a member of group i has. */
44 PetscInt *iperm; /* The permutation vector. */
45
46 /* Some of this stuff is for Ed's recursive triangular solve.
47 * I'm not sure what I need yet. */
48 PetscInt blocksize;
49 PetscInt nstep;
50 PetscInt *jstart_list;
51 PetscInt *jend_list;
52 PetscInt *action_list;
53 PetscInt *ngroup_list;
54 PetscInt **ipointer_list;
55 PetscInt **xgroup_list;
56 PetscInt **nzgroup_list;
57 PetscInt **iperm_list;
58 } Mat_SeqAIJPERM;
59
MatConvert_SeqAIJPERM_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat * newmat)60 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJPERM_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat)
61 {
62 /* This routine is only called to convert a MATAIJPERM to its base PETSc type, */
63 /* so we will ignore 'MatType type'. */
64 PetscErrorCode ierr;
65 Mat B = *newmat;
66 Mat_SeqAIJPERM *aijperm=(Mat_SeqAIJPERM*)A->spptr;
67
68 PetscFunctionBegin;
69 if (reuse == MAT_INITIAL_MATRIX) {
70 ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
71 aijperm=(Mat_SeqAIJPERM*)B->spptr;
72 }
73
74 /* Reset the original function pointers. */
75 B->ops->assemblyend = MatAssemblyEnd_SeqAIJ;
76 B->ops->destroy = MatDestroy_SeqAIJ;
77 B->ops->duplicate = MatDuplicate_SeqAIJ;
78 B->ops->mult = MatMult_SeqAIJ;
79 B->ops->multadd = MatMultAdd_SeqAIJ;
80
81 ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",NULL);CHKERRQ(ierr);
82
83 /* Free everything in the Mat_SeqAIJPERM data structure.*/
84 ierr = PetscFree(aijperm->xgroup);CHKERRQ(ierr);
85 ierr = PetscFree(aijperm->nzgroup);CHKERRQ(ierr);
86 ierr = PetscFree(aijperm->iperm);CHKERRQ(ierr);
87 ierr = PetscFree(B->spptr);CHKERRQ(ierr);
88
89 /* Change the type of B to MATSEQAIJ. */
90 ierr = PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);CHKERRQ(ierr);
91
92 *newmat = B;
93 PetscFunctionReturn(0);
94 }
95
MatDestroy_SeqAIJPERM(Mat A)96 PetscErrorCode MatDestroy_SeqAIJPERM(Mat A)
97 {
98 PetscErrorCode ierr;
99 Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;
100
101 PetscFunctionBegin;
102 if (aijperm) {
103 /* If MatHeaderMerge() was used then this SeqAIJPERM matrix will not have a spprt. */
104 ierr = PetscFree(aijperm->xgroup);CHKERRQ(ierr);
105 ierr = PetscFree(aijperm->nzgroup);CHKERRQ(ierr);
106 ierr = PetscFree(aijperm->iperm);CHKERRQ(ierr);
107 ierr = PetscFree(A->spptr);CHKERRQ(ierr);
108 }
109 /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
110 * to destroy everything that remains. */
111 ierr = PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);CHKERRQ(ierr);
112 /* Note that I don't call MatSetType(). I believe this is because that
113 * is only to be called when *building* a matrix. I could be wrong, but
114 * that is how things work for the SuperLU matrix class. */
115 ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
116 PetscFunctionReturn(0);
117 }
118
MatDuplicate_SeqAIJPERM(Mat A,MatDuplicateOption op,Mat * M)119 PetscErrorCode MatDuplicate_SeqAIJPERM(Mat A, MatDuplicateOption op, Mat *M)
120 {
121 PetscErrorCode ierr;
122 Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;
123 Mat_SeqAIJPERM *aijperm_dest;
124 PetscBool perm;
125
126 PetscFunctionBegin;
127 ierr = MatDuplicate_SeqAIJ(A,op,M);CHKERRQ(ierr);
128 ierr = PetscObjectTypeCompare((PetscObject)*M,MATSEQAIJPERM,&perm);CHKERRQ(ierr);
129 if (perm) {
130 aijperm_dest = (Mat_SeqAIJPERM *) (*M)->spptr;
131 ierr = PetscFree(aijperm_dest->xgroup);CHKERRQ(ierr);
132 ierr = PetscFree(aijperm_dest->nzgroup);CHKERRQ(ierr);
133 ierr = PetscFree(aijperm_dest->iperm);CHKERRQ(ierr);
134 } else {
135 ierr = PetscNewLog(*M,&aijperm_dest);CHKERRQ(ierr);
136 (*M)->spptr = (void*) aijperm_dest;
137 ierr = PetscObjectChangeTypeName((PetscObject)*M,MATSEQAIJPERM);CHKERRQ(ierr);
138 ierr = PetscObjectComposeFunction((PetscObject)*M,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ);CHKERRQ(ierr);
139 }
140 ierr = PetscArraycpy(aijperm_dest,aijperm,1);CHKERRQ(ierr);
141 /* Allocate space for, and copy the grouping and permutation info.
142 * I note that when the groups are initially determined in
143 * MatSeqAIJPERM_create_perm, xgroup and nzgroup may be sized larger than
144 * necessary. But at this point, we know how large they need to be, and
145 * allocate only the necessary amount of memory. So the duplicated matrix
146 * may actually use slightly less storage than the original! */
147 ierr = PetscMalloc1(A->rmap->n, &aijperm_dest->iperm);CHKERRQ(ierr);
148 ierr = PetscMalloc1(aijperm->ngroup+1, &aijperm_dest->xgroup);CHKERRQ(ierr);
149 ierr = PetscMalloc1(aijperm->ngroup, &aijperm_dest->nzgroup);CHKERRQ(ierr);
150 ierr = PetscArraycpy(aijperm_dest->iperm,aijperm->iperm,A->rmap->n);CHKERRQ(ierr);
151 ierr = PetscArraycpy(aijperm_dest->xgroup,aijperm->xgroup,aijperm->ngroup+1);CHKERRQ(ierr);
152 ierr = PetscArraycpy(aijperm_dest->nzgroup,aijperm->nzgroup,aijperm->ngroup);CHKERRQ(ierr);
153 PetscFunctionReturn(0);
154 }
155
MatSeqAIJPERM_create_perm(Mat A)156 PetscErrorCode MatSeqAIJPERM_create_perm(Mat A)
157 {
158 PetscErrorCode ierr;
159 Mat_SeqAIJ *a = (Mat_SeqAIJ*)(A)->data;
160 Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;
161 PetscInt m; /* Number of rows in the matrix. */
162 PetscInt *ia; /* From the CSR representation; points to the beginning of each row. */
163 PetscInt maxnz; /* Maximum number of nonzeros in any row. */
164 PetscInt *rows_in_bucket;
165 /* To construct the permutation, we sort each row into one of maxnz
166 * buckets based on how many nonzeros are in the row. */
167 PetscInt nz;
168 PetscInt *nz_in_row; /* the number of nonzero elements in row k. */
169 PetscInt *ipnz;
170 /* When constructing the iperm permutation vector,
171 * ipnz[nz] is used to point to the next place in the permutation vector
172 * that a row with nz nonzero elements should be placed.*/
173 PetscInt i, ngroup, istart, ipos;
174
175 PetscFunctionBegin;
176 if (aijperm->nonzerostate == A->nonzerostate) PetscFunctionReturn(0); /* permutation exists and matches current nonzero structure */
177 aijperm->nonzerostate = A->nonzerostate;
178 /* Free anything previously put in the Mat_SeqAIJPERM data structure. */
179 ierr = PetscFree(aijperm->xgroup);CHKERRQ(ierr);
180 ierr = PetscFree(aijperm->nzgroup);CHKERRQ(ierr);
181 ierr = PetscFree(aijperm->iperm);CHKERRQ(ierr);
182
183 m = A->rmap->n;
184 ia = a->i;
185
186 /* Allocate the arrays that will hold the permutation vector. */
187 ierr = PetscMalloc1(m, &aijperm->iperm);CHKERRQ(ierr);
188
189 /* Allocate some temporary work arrays that will be used in
190 * calculating the permuation vector and groupings. */
191 ierr = PetscMalloc1(m, &nz_in_row);CHKERRQ(ierr);
192
193 /* Now actually figure out the permutation and grouping. */
194
195 /* First pass: Determine number of nonzeros in each row, maximum
196 * number of nonzeros in any row, and how many rows fall into each
197 * "bucket" of rows with same number of nonzeros. */
198 maxnz = 0;
199 for (i=0; i<m; i++) {
200 nz_in_row[i] = ia[i+1]-ia[i];
201 if (nz_in_row[i] > maxnz) maxnz = nz_in_row[i];
202 }
203 ierr = PetscMalloc1(PetscMax(maxnz,m)+1, &rows_in_bucket);CHKERRQ(ierr);
204 ierr = PetscMalloc1(PetscMax(maxnz,m)+1, &ipnz);CHKERRQ(ierr);
205
206 for (i=0; i<=maxnz; i++) {
207 rows_in_bucket[i] = 0;
208 }
209 for (i=0; i<m; i++) {
210 nz = nz_in_row[i];
211 rows_in_bucket[nz]++;
212 }
213
214 /* Allocate space for the grouping info. There will be at most (maxnz + 1)
215 * groups. (It is maxnz + 1 instead of simply maxnz because there may be
216 * rows with no nonzero elements.) If there are (maxnz + 1) groups,
217 * then xgroup[] must consist of (maxnz + 2) elements, since the last
218 * element of xgroup will tell us where the (maxnz + 1)th group ends.
219 * We allocate space for the maximum number of groups;
220 * that is potentially a little wasteful, but not too much so.
221 * Perhaps I should fix it later. */
222 ierr = PetscMalloc1(maxnz+2, &aijperm->xgroup);CHKERRQ(ierr);
223 ierr = PetscMalloc1(maxnz+1, &aijperm->nzgroup);CHKERRQ(ierr);
224
225 /* Second pass. Look at what is in the buckets and create the groupings.
226 * Note that it is OK to have a group of rows with no non-zero values. */
227 ngroup = 0;
228 istart = 0;
229 for (i=0; i<=maxnz; i++) {
230 if (rows_in_bucket[i] > 0) {
231 aijperm->nzgroup[ngroup] = i;
232 aijperm->xgroup[ngroup] = istart;
233 ngroup++;
234 istart += rows_in_bucket[i];
235 }
236 }
237
238 aijperm->xgroup[ngroup] = istart;
239 aijperm->ngroup = ngroup;
240
241 /* Now fill in the permutation vector iperm. */
242 ipnz[0] = 0;
243 for (i=0; i<maxnz; i++) {
244 ipnz[i+1] = ipnz[i] + rows_in_bucket[i];
245 }
246
247 for (i=0; i<m; i++) {
248 nz = nz_in_row[i];
249 ipos = ipnz[nz];
250 aijperm->iperm[ipos] = i;
251 ipnz[nz]++;
252 }
253
254 /* Clean up temporary work arrays. */
255 ierr = PetscFree(rows_in_bucket);CHKERRQ(ierr);
256 ierr = PetscFree(ipnz);CHKERRQ(ierr);
257 ierr = PetscFree(nz_in_row);CHKERRQ(ierr);
258 PetscFunctionReturn(0);
259 }
260
261
MatAssemblyEnd_SeqAIJPERM(Mat A,MatAssemblyType mode)262 PetscErrorCode MatAssemblyEnd_SeqAIJPERM(Mat A, MatAssemblyType mode)
263 {
264 PetscErrorCode ierr;
265 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
266
267 PetscFunctionBegin;
268 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
269
270 /* Since a MATSEQAIJPERM matrix is really just a MATSEQAIJ with some
271 * extra information, call the AssemblyEnd routine for a MATSEQAIJ.
272 * I'm not sure if this is the best way to do this, but it avoids
273 * a lot of code duplication.
274 * I also note that currently MATSEQAIJPERM doesn't know anything about
275 * the Mat_CompressedRow data structure that SeqAIJ now uses when there
276 * are many zero rows. If the SeqAIJ assembly end routine decides to use
277 * this, this may break things. (Don't know... haven't looked at it.) */
278 a->inode.use = PETSC_FALSE;
279 ierr = MatAssemblyEnd_SeqAIJ(A, mode);CHKERRQ(ierr);
280
281 /* Now calculate the permutation and grouping information. */
282 ierr = MatSeqAIJPERM_create_perm(A);CHKERRQ(ierr);
283 PetscFunctionReturn(0);
284 }
285
MatMult_SeqAIJPERM(Mat A,Vec xx,Vec yy)286 PetscErrorCode MatMult_SeqAIJPERM(Mat A,Vec xx,Vec yy)
287 {
288 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
289 const PetscScalar *x;
290 PetscScalar *y;
291 const MatScalar *aa;
292 PetscErrorCode ierr;
293 const PetscInt *aj,*ai;
294 #if !(defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking))
295 PetscInt i,j;
296 #endif
297 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
298 __m512d vec_x,vec_y,vec_vals;
299 __m256i vec_idx,vec_ipos,vec_j;
300 __mmask8 mask;
301 #endif
302
303 /* Variables that don't appear in MatMult_SeqAIJ. */
304 Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;
305 PetscInt *iperm; /* Points to the permutation vector. */
306 PetscInt *xgroup;
307 /* Denotes where groups of rows with same number of nonzeros
308 * begin and end in iperm. */
309 PetscInt *nzgroup;
310 PetscInt ngroup;
311 PetscInt igroup;
312 PetscInt jstart,jend;
313 /* jstart is used in loops to denote the position in iperm where a
314 * group starts; jend denotes the position where it ends.
315 * (jend + 1 is where the next group starts.) */
316 PetscInt iold,nz;
317 PetscInt istart,iend,isize;
318 PetscInt ipos;
319 PetscScalar yp[NDIM];
320 PetscInt ip[NDIM]; /* yp[] and ip[] are treated as vector "registers" for performing the mat-vec. */
321
322 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
323 #pragma disjoint(*x,*y,*aa)
324 #endif
325
326 PetscFunctionBegin;
327 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
328 ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
329 aj = a->j; /* aj[k] gives column index for element aa[k]. */
330 aa = a->a; /* Nonzero elements stored row-by-row. */
331 ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */
332
333 /* Get the info we need about the permutations and groupings. */
334 iperm = aijperm->iperm;
335 ngroup = aijperm->ngroup;
336 xgroup = aijperm->xgroup;
337 nzgroup = aijperm->nzgroup;
338
339 #if defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking)
340 fortranmultaijperm_(&m,x,ii,aj,aa,y);
341 #else
342
343 for (igroup=0; igroup<ngroup; igroup++) {
344 jstart = xgroup[igroup];
345 jend = xgroup[igroup+1] - 1;
346 nz = nzgroup[igroup];
347
348 /* Handle the special cases where the number of nonzeros per row
349 * in the group is either 0 or 1. */
350 if (nz == 0) {
351 for (i=jstart; i<=jend; i++) {
352 y[iperm[i]] = 0.0;
353 }
354 } else if (nz == 1) {
355 for (i=jstart; i<=jend; i++) {
356 iold = iperm[i];
357 ipos = ai[iold];
358 y[iold] = aa[ipos] * x[aj[ipos]];
359 }
360 } else {
361
362 /* We work our way through the current group in chunks of NDIM rows
363 * at a time. */
364
365 for (istart=jstart; istart<=jend; istart+=NDIM) {
366 /* Figure out where the chunk of 'isize' rows ends in iperm.
367 * 'isize may of course be less than NDIM for the last chunk. */
368 iend = istart + (NDIM - 1);
369
370 if (iend > jend) iend = jend;
371
372 isize = iend - istart + 1;
373
374 /* Initialize the yp[] array that will be used to hold part of
375 * the permuted results vector, and figure out where in aa each
376 * row of the chunk will begin. */
377 for (i=0; i<isize; i++) {
378 iold = iperm[istart + i];
379 /* iold is a row number from the matrix A *before* reordering. */
380 ip[i] = ai[iold];
381 /* ip[i] tells us where the ith row of the chunk begins in aa. */
382 yp[i] = (PetscScalar) 0.0;
383 }
384
385 /* If the number of zeros per row exceeds the number of rows in
386 * the chunk, we should vectorize along nz, that is, perform the
387 * mat-vec one row at a time as in the usual CSR case. */
388 if (nz > isize) {
389 #if defined(PETSC_HAVE_CRAY_VECTOR)
390 #pragma _CRI preferstream
391 #endif
392 for (i=0; i<isize; i++) {
393 #if defined(PETSC_HAVE_CRAY_VECTOR)
394 #pragma _CRI prefervector
395 #endif
396
397 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
398 vec_y = _mm512_setzero_pd();
399 ipos = ip[i];
400 for (j=0; j<(nz>>3); j++) {
401 vec_idx = _mm256_loadu_si256((__m256i const*)&aj[ipos]);
402 vec_vals = _mm512_loadu_pd(&aa[ipos]);
403 vec_x = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8);
404 vec_y = _mm512_fmadd_pd(vec_x,vec_vals,vec_y);
405 ipos += 8;
406 }
407 if ((nz&0x07)>2) {
408 mask = (__mmask8)(0xff >> (8-(nz&0x07)));
409 vec_idx = _mm256_loadu_si256((__m256i const*)&aj[ipos]);
410 vec_vals = _mm512_loadu_pd(&aa[ipos]);
411 vec_x = _mm512_mask_i32gather_pd(vec_x,mask,vec_idx,x,_MM_SCALE_8);
412 vec_y = _mm512_mask3_fmadd_pd(vec_x,vec_vals,vec_y,mask);
413 } else if ((nz&0x07)==2) {
414 yp[i] += aa[ipos]*x[aj[ipos]];
415 yp[i] += aa[ipos+1]*x[aj[ipos+1]];
416 } else if ((nz&0x07)==1) {
417 yp[i] += aa[ipos]*x[aj[ipos]];
418 }
419 yp[i] += _mm512_reduce_add_pd(vec_y);
420 #else
421 for (j=0; j<nz; j++) {
422 ipos = ip[i] + j;
423 yp[i] += aa[ipos] * x[aj[ipos]];
424 }
425 #endif
426 }
427 } else {
428 /* Otherwise, there are enough rows in the chunk to make it
429 * worthwhile to vectorize across the rows, that is, to do the
430 * matvec by operating with "columns" of the chunk. */
431 for (j=0; j<nz; j++) {
432 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
433 vec_j = _mm256_set1_epi32(j);
434 for (i=0; i<((isize>>3)<<3); i+=8) {
435 vec_y = _mm512_loadu_pd(&yp[i]);
436 vec_ipos = _mm256_loadu_si256((__m256i const*)&ip[i]);
437 vec_ipos = _mm256_add_epi32(vec_ipos,vec_j);
438 vec_idx = _mm256_i32gather_epi32(aj,vec_ipos,_MM_SCALE_4);
439 vec_vals = _mm512_i32gather_pd(vec_ipos,aa,_MM_SCALE_8);
440 vec_x = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8);
441 vec_y = _mm512_fmadd_pd(vec_x,vec_vals,vec_y);
442 _mm512_storeu_pd(&yp[i],vec_y);
443 }
444 for (i=isize-(isize&0x07); i<isize; i++) {
445 ipos = ip[i]+j;
446 yp[i] += aa[ipos]*x[aj[ipos]];
447 }
448 #else
449 for (i=0; i<isize; i++) {
450 ipos = ip[i] + j;
451 yp[i] += aa[ipos] * x[aj[ipos]];
452 }
453 #endif
454 }
455 }
456
457 #if defined(PETSC_HAVE_CRAY_VECTOR)
458 #pragma _CRI ivdep
459 #endif
460 /* Put results from yp[] into non-permuted result vector y. */
461 for (i=0; i<isize; i++) {
462 y[iperm[istart+i]] = yp[i];
463 }
464 } /* End processing chunk of isize rows of a group. */
465 } /* End handling matvec for chunk with nz > 1. */
466 } /* End loop over igroup. */
467 #endif
468 ierr = PetscLogFlops(PetscMax(2.0*a->nz - A->rmap->n,0));CHKERRQ(ierr);
469 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
470 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
471 PetscFunctionReturn(0);
472 }
473
474
475 /* MatMultAdd_SeqAIJPERM() calculates yy = ww + A * xx.
476 * Note that the names I used to designate the vectors differs from that
477 * used in MatMultAdd_SeqAIJ(). I did this to keep my notation consistent
478 * with the MatMult_SeqAIJPERM() routine, which is very similar to this one. */
479 /*
480 I hate having virtually identical code for the mult and the multadd!!!
481 */
MatMultAdd_SeqAIJPERM(Mat A,Vec xx,Vec ww,Vec yy)482 PetscErrorCode MatMultAdd_SeqAIJPERM(Mat A,Vec xx,Vec ww,Vec yy)
483 {
484 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
485 const PetscScalar *x;
486 PetscScalar *y,*w;
487 const MatScalar *aa;
488 PetscErrorCode ierr;
489 const PetscInt *aj,*ai;
490 #if !defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
491 PetscInt i,j;
492 #endif
493
494 /* Variables that don't appear in MatMultAdd_SeqAIJ. */
495 Mat_SeqAIJPERM * aijperm;
496 PetscInt *iperm; /* Points to the permutation vector. */
497 PetscInt *xgroup;
498 /* Denotes where groups of rows with same number of nonzeros
499 * begin and end in iperm. */
500 PetscInt *nzgroup;
501 PetscInt ngroup;
502 PetscInt igroup;
503 PetscInt jstart,jend;
504 /* jstart is used in loops to denote the position in iperm where a
505 * group starts; jend denotes the position where it ends.
506 * (jend + 1 is where the next group starts.) */
507 PetscInt iold,nz;
508 PetscInt istart,iend,isize;
509 PetscInt ipos;
510 PetscScalar yp[NDIM];
511 PetscInt ip[NDIM];
512 /* yp[] and ip[] are treated as vector "registers" for performing
513 * the mat-vec. */
514
515 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
516 #pragma disjoint(*x,*y,*aa)
517 #endif
518
519 PetscFunctionBegin;
520 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
521 ierr = VecGetArrayPair(yy,ww,&y,&w);CHKERRQ(ierr);
522
523 aj = a->j; /* aj[k] gives column index for element aa[k]. */
524 aa = a->a; /* Nonzero elements stored row-by-row. */
525 ai = a->i; /* ai[k] is the position in aa and aj where row k starts. */
526
527 /* Get the info we need about the permutations and groupings. */
528 aijperm = (Mat_SeqAIJPERM*) A->spptr;
529 iperm = aijperm->iperm;
530 ngroup = aijperm->ngroup;
531 xgroup = aijperm->xgroup;
532 nzgroup = aijperm->nzgroup;
533
534 #if defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
535 fortranmultaddaijperm_(&m,x,ii,aj,aa,y,w);
536 #else
537
538 for (igroup=0; igroup<ngroup; igroup++) {
539 jstart = xgroup[igroup];
540 jend = xgroup[igroup+1] - 1;
541
542 nz = nzgroup[igroup];
543
544 /* Handle the special cases where the number of nonzeros per row
545 * in the group is either 0 or 1. */
546 if (nz == 0) {
547 for (i=jstart; i<=jend; i++) {
548 iold = iperm[i];
549 y[iold] = w[iold];
550 }
551 }
552 else if (nz == 1) {
553 for (i=jstart; i<=jend; i++) {
554 iold = iperm[i];
555 ipos = ai[iold];
556 y[iold] = w[iold] + aa[ipos] * x[aj[ipos]];
557 }
558 }
559 /* For the general case: */
560 else {
561
562 /* We work our way through the current group in chunks of NDIM rows
563 * at a time. */
564
565 for (istart=jstart; istart<=jend; istart+=NDIM) {
566 /* Figure out where the chunk of 'isize' rows ends in iperm.
567 * 'isize may of course be less than NDIM for the last chunk. */
568 iend = istart + (NDIM - 1);
569 if (iend > jend) iend = jend;
570 isize = iend - istart + 1;
571
572 /* Initialize the yp[] array that will be used to hold part of
573 * the permuted results vector, and figure out where in aa each
574 * row of the chunk will begin. */
575 for (i=0; i<isize; i++) {
576 iold = iperm[istart + i];
577 /* iold is a row number from the matrix A *before* reordering. */
578 ip[i] = ai[iold];
579 /* ip[i] tells us where the ith row of the chunk begins in aa. */
580 yp[i] = w[iold];
581 }
582
583 /* If the number of zeros per row exceeds the number of rows in
584 * the chunk, we should vectorize along nz, that is, perform the
585 * mat-vec one row at a time as in the usual CSR case. */
586 if (nz > isize) {
587 #if defined(PETSC_HAVE_CRAY_VECTOR)
588 #pragma _CRI preferstream
589 #endif
590 for (i=0; i<isize; i++) {
591 #if defined(PETSC_HAVE_CRAY_VECTOR)
592 #pragma _CRI prefervector
593 #endif
594 for (j=0; j<nz; j++) {
595 ipos = ip[i] + j;
596 yp[i] += aa[ipos] * x[aj[ipos]];
597 }
598 }
599 }
600 /* Otherwise, there are enough rows in the chunk to make it
601 * worthwhile to vectorize across the rows, that is, to do the
602 * matvec by operating with "columns" of the chunk. */
603 else {
604 for (j=0; j<nz; j++) {
605 for (i=0; i<isize; i++) {
606 ipos = ip[i] + j;
607 yp[i] += aa[ipos] * x[aj[ipos]];
608 }
609 }
610 }
611
612 #if defined(PETSC_HAVE_CRAY_VECTOR)
613 #pragma _CRI ivdep
614 #endif
615 /* Put results from yp[] into non-permuted result vector y. */
616 for (i=0; i<isize; i++) {
617 y[iperm[istart+i]] = yp[i];
618 }
619 } /* End processing chunk of isize rows of a group. */
620
621 } /* End handling matvec for chunk with nz > 1. */
622 } /* End loop over igroup. */
623
624 #endif
625 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
626 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
627 ierr = VecRestoreArrayPair(yy,ww,&y,&w);CHKERRQ(ierr);
628 PetscFunctionReturn(0);
629 }
630
631 /* MatConvert_SeqAIJ_SeqAIJPERM converts a SeqAIJ matrix into a
632 * SeqAIJPERM matrix. This routine is called by the MatCreate_SeqAIJPERM()
633 * routine, but can also be used to convert an assembled SeqAIJ matrix
634 * into a SeqAIJPERM one. */
MatConvert_SeqAIJ_SeqAIJPERM(Mat A,MatType type,MatReuse reuse,Mat * newmat)635 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJPERM(Mat A,MatType type,MatReuse reuse,Mat *newmat)
636 {
637 PetscErrorCode ierr;
638 Mat B = *newmat;
639 Mat_SeqAIJPERM *aijperm;
640 PetscBool sametype;
641
642 PetscFunctionBegin;
643 if (reuse == MAT_INITIAL_MATRIX) {
644 ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
645 }
646 ierr = PetscObjectTypeCompare((PetscObject)A,type,&sametype);CHKERRQ(ierr);
647 if (sametype) PetscFunctionReturn(0);
648
649 ierr = PetscNewLog(B,&aijperm);CHKERRQ(ierr);
650 B->spptr = (void*) aijperm;
651
652 /* Set function pointers for methods that we inherit from AIJ but override. */
653 B->ops->duplicate = MatDuplicate_SeqAIJPERM;
654 B->ops->assemblyend = MatAssemblyEnd_SeqAIJPERM;
655 B->ops->destroy = MatDestroy_SeqAIJPERM;
656 B->ops->mult = MatMult_SeqAIJPERM;
657 B->ops->multadd = MatMultAdd_SeqAIJPERM;
658
659 aijperm->nonzerostate = -1; /* this will trigger the generation of the permutation information the first time through MatAssembly()*/
660 /* If A has already been assembled, compute the permutation. */
661 if (A->assembled) {
662 ierr = MatSeqAIJPERM_create_perm(B);CHKERRQ(ierr);
663 }
664
665 ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ);CHKERRQ(ierr);
666
667 ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJPERM);CHKERRQ(ierr);
668 *newmat = B;
669 PetscFunctionReturn(0);
670 }
671
672 /*@C
673 MatCreateSeqAIJPERM - Creates a sparse matrix of type SEQAIJPERM.
674 This type inherits from AIJ, but calculates some additional permutation
675 information that is used to allow better vectorization of some
676 operations. At the cost of increased storage, the AIJ formatted
677 matrix can be copied to a format in which pieces of the matrix are
678 stored in ELLPACK format, allowing the vectorized matrix multiply
679 routine to use stride-1 memory accesses. As with the AIJ type, it is
680 important to preallocate matrix storage in order to get good assembly
681 performance.
682
683 Collective
684
685 Input Parameters:
686 + comm - MPI communicator, set to PETSC_COMM_SELF
687 . m - number of rows
688 . n - number of columns
689 . nz - number of nonzeros per row (same for all rows)
690 - nnz - array containing the number of nonzeros in the various rows
691 (possibly different for each row) or NULL
692
693 Output Parameter:
694 . A - the matrix
695
696 Notes:
697 If nnz is given then nz is ignored
698
699 Level: intermediate
700
701 .seealso: MatCreate(), MatCreateMPIAIJPERM(), MatSetValues()
702 @*/
MatCreateSeqAIJPERM(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat * A)703 PetscErrorCode MatCreateSeqAIJPERM(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
704 {
705 PetscErrorCode ierr;
706
707 PetscFunctionBegin;
708 ierr = MatCreate(comm,A);CHKERRQ(ierr);
709 ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
710 ierr = MatSetType(*A,MATSEQAIJPERM);CHKERRQ(ierr);
711 ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz);CHKERRQ(ierr);
712 PetscFunctionReturn(0);
713 }
714
MatCreate_SeqAIJPERM(Mat A)715 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJPERM(Mat A)
716 {
717 PetscErrorCode ierr;
718
719 PetscFunctionBegin;
720 ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr);
721 ierr = MatConvert_SeqAIJ_SeqAIJPERM(A,MATSEQAIJPERM,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
722 PetscFunctionReturn(0);
723 }
724
725