1 
2 /*
3   Defines basic operations for the MATSEQAIJPERM matrix class.
4   This class is derived from the MATSEQAIJ class and retains the
5   compressed row storage (aka Yale sparse matrix format) but augments
6   it with some permutation information that enables some operations
7   to be more vectorizable.  A physically rearranged copy of the matrix
8   may be stored if the user desires.
9 
10   Eventually a variety of permutations may be supported.
11 */
12 
13 #include <../src/mat/impls/aij/seq/aij.h>
14 
15 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
16 #include <immintrin.h>
17 
18 #if !defined(_MM_SCALE_8)
19 #define _MM_SCALE_8    8
20 #endif
21 #if !defined(_MM_SCALE_4)
22 #define _MM_SCALE_4    4
23 #endif
24 #endif
25 
26 #define NDIM 512
27 /* NDIM specifies how many rows at a time we should work with when
28  * performing the vectorized mat-vec.  This depends on various factors
29  * such as vector register length, etc., and I really need to add a
30  * way for the user (or the library) to tune this.  I'm setting it to
31  * 512 for now since that is what Ed D'Azevedo was using in his Fortran
32  * routines. */
33 
34 typedef struct {
35   PetscObjectState nonzerostate; /* used to determine if the nonzero structure has changed and hence the permutations need updating */
36 
37   PetscInt         ngroup;
38   PetscInt         *xgroup;
39   /* Denotes where groups of rows with same number of nonzeros
40    * begin and end, i.e., xgroup[i] gives us the position in iperm[]
41    * where the ith group begins. */
42 
43   PetscInt         *nzgroup; /*  how many nonzeros each row that is a member of group i has. */
44   PetscInt         *iperm;  /* The permutation vector. */
45 
46   /* Some of this stuff is for Ed's recursive triangular solve.
47    * I'm not sure what I need yet. */
48   PetscInt         blocksize;
49   PetscInt         nstep;
50   PetscInt         *jstart_list;
51   PetscInt         *jend_list;
52   PetscInt         *action_list;
53   PetscInt         *ngroup_list;
54   PetscInt         **ipointer_list;
55   PetscInt         **xgroup_list;
56   PetscInt         **nzgroup_list;
57   PetscInt         **iperm_list;
58 } Mat_SeqAIJPERM;
59 
MatConvert_SeqAIJPERM_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat * newmat)60 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJPERM_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat)
61 {
62   /* This routine is only called to convert a MATAIJPERM to its base PETSc type, */
63   /* so we will ignore 'MatType type'. */
64   PetscErrorCode ierr;
65   Mat            B       = *newmat;
66   Mat_SeqAIJPERM *aijperm=(Mat_SeqAIJPERM*)A->spptr;
67 
68   PetscFunctionBegin;
69   if (reuse == MAT_INITIAL_MATRIX) {
70     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
71     aijperm=(Mat_SeqAIJPERM*)B->spptr;
72   }
73 
74   /* Reset the original function pointers. */
75   B->ops->assemblyend = MatAssemblyEnd_SeqAIJ;
76   B->ops->destroy     = MatDestroy_SeqAIJ;
77   B->ops->duplicate   = MatDuplicate_SeqAIJ;
78   B->ops->mult        = MatMult_SeqAIJ;
79   B->ops->multadd     = MatMultAdd_SeqAIJ;
80 
81   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",NULL);CHKERRQ(ierr);
82 
83   /* Free everything in the Mat_SeqAIJPERM data structure.*/
84   ierr = PetscFree(aijperm->xgroup);CHKERRQ(ierr);
85   ierr = PetscFree(aijperm->nzgroup);CHKERRQ(ierr);
86   ierr = PetscFree(aijperm->iperm);CHKERRQ(ierr);
87   ierr = PetscFree(B->spptr);CHKERRQ(ierr);
88 
89   /* Change the type of B to MATSEQAIJ. */
90   ierr = PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJ);CHKERRQ(ierr);
91 
92   *newmat = B;
93   PetscFunctionReturn(0);
94 }
95 
MatDestroy_SeqAIJPERM(Mat A)96 PetscErrorCode MatDestroy_SeqAIJPERM(Mat A)
97 {
98   PetscErrorCode ierr;
99   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;
100 
101   PetscFunctionBegin;
102   if (aijperm) {
103     /* If MatHeaderMerge() was used then this SeqAIJPERM matrix will not have a spprt. */
104     ierr = PetscFree(aijperm->xgroup);CHKERRQ(ierr);
105     ierr = PetscFree(aijperm->nzgroup);CHKERRQ(ierr);
106     ierr = PetscFree(aijperm->iperm);CHKERRQ(ierr);
107     ierr = PetscFree(A->spptr);CHKERRQ(ierr);
108   }
109   /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ()
110    * to destroy everything that remains. */
111   ierr = PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJ);CHKERRQ(ierr);
112   /* Note that I don't call MatSetType().  I believe this is because that
113    * is only to be called when *building* a matrix.  I could be wrong, but
114    * that is how things work for the SuperLU matrix class. */
115   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
116   PetscFunctionReturn(0);
117 }
118 
MatDuplicate_SeqAIJPERM(Mat A,MatDuplicateOption op,Mat * M)119 PetscErrorCode MatDuplicate_SeqAIJPERM(Mat A, MatDuplicateOption op, Mat *M)
120 {
121   PetscErrorCode ierr;
122   Mat_SeqAIJPERM *aijperm      = (Mat_SeqAIJPERM*) A->spptr;
123   Mat_SeqAIJPERM *aijperm_dest;
124   PetscBool      perm;
125 
126   PetscFunctionBegin;
127   ierr        = MatDuplicate_SeqAIJ(A,op,M);CHKERRQ(ierr);
128   ierr        = PetscObjectTypeCompare((PetscObject)*M,MATSEQAIJPERM,&perm);CHKERRQ(ierr);
129   if (perm) {
130     aijperm_dest = (Mat_SeqAIJPERM *) (*M)->spptr;
131     ierr = PetscFree(aijperm_dest->xgroup);CHKERRQ(ierr);
132     ierr = PetscFree(aijperm_dest->nzgroup);CHKERRQ(ierr);
133     ierr = PetscFree(aijperm_dest->iperm);CHKERRQ(ierr);
134   } else {
135     ierr        = PetscNewLog(*M,&aijperm_dest);CHKERRQ(ierr);
136     (*M)->spptr = (void*) aijperm_dest;
137     ierr        = PetscObjectChangeTypeName((PetscObject)*M,MATSEQAIJPERM);CHKERRQ(ierr);
138     ierr        = PetscObjectComposeFunction((PetscObject)*M,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ);CHKERRQ(ierr);
139   }
140   ierr        = PetscArraycpy(aijperm_dest,aijperm,1);CHKERRQ(ierr);
141   /* Allocate space for, and copy the grouping and permutation info.
142    * I note that when the groups are initially determined in
143    * MatSeqAIJPERM_create_perm, xgroup and nzgroup may be sized larger than
144    * necessary.  But at this point, we know how large they need to be, and
145    * allocate only the necessary amount of memory.  So the duplicated matrix
146    * may actually use slightly less storage than the original! */
147   ierr = PetscMalloc1(A->rmap->n, &aijperm_dest->iperm);CHKERRQ(ierr);
148   ierr = PetscMalloc1(aijperm->ngroup+1, &aijperm_dest->xgroup);CHKERRQ(ierr);
149   ierr = PetscMalloc1(aijperm->ngroup, &aijperm_dest->nzgroup);CHKERRQ(ierr);
150   ierr = PetscArraycpy(aijperm_dest->iperm,aijperm->iperm,A->rmap->n);CHKERRQ(ierr);
151   ierr = PetscArraycpy(aijperm_dest->xgroup,aijperm->xgroup,aijperm->ngroup+1);CHKERRQ(ierr);
152   ierr = PetscArraycpy(aijperm_dest->nzgroup,aijperm->nzgroup,aijperm->ngroup);CHKERRQ(ierr);
153   PetscFunctionReturn(0);
154 }
155 
MatSeqAIJPERM_create_perm(Mat A)156 PetscErrorCode MatSeqAIJPERM_create_perm(Mat A)
157 {
158   PetscErrorCode ierr;
159   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)(A)->data;
160   Mat_SeqAIJPERM *aijperm = (Mat_SeqAIJPERM*) A->spptr;
161   PetscInt       m;       /* Number of rows in the matrix. */
162   PetscInt       *ia;       /* From the CSR representation; points to the beginning  of each row. */
163   PetscInt       maxnz;      /* Maximum number of nonzeros in any row. */
164   PetscInt       *rows_in_bucket;
165   /* To construct the permutation, we sort each row into one of maxnz
166    * buckets based on how many nonzeros are in the row. */
167   PetscInt       nz;
168   PetscInt       *nz_in_row;         /* the number of nonzero elements in row k. */
169   PetscInt       *ipnz;
170   /* When constructing the iperm permutation vector,
171    * ipnz[nz] is used to point to the next place in the permutation vector
172    * that a row with nz nonzero elements should be placed.*/
173   PetscInt       i, ngroup, istart, ipos;
174 
175   PetscFunctionBegin;
176   if (aijperm->nonzerostate == A->nonzerostate) PetscFunctionReturn(0); /* permutation exists and matches current nonzero structure */
177   aijperm->nonzerostate = A->nonzerostate;
178  /* Free anything previously put in the Mat_SeqAIJPERM data structure. */
179   ierr = PetscFree(aijperm->xgroup);CHKERRQ(ierr);
180   ierr = PetscFree(aijperm->nzgroup);CHKERRQ(ierr);
181   ierr = PetscFree(aijperm->iperm);CHKERRQ(ierr);
182 
183   m  = A->rmap->n;
184   ia = a->i;
185 
186   /* Allocate the arrays that will hold the permutation vector. */
187   ierr = PetscMalloc1(m, &aijperm->iperm);CHKERRQ(ierr);
188 
189   /* Allocate some temporary work arrays that will be used in
190    * calculating the permuation vector and groupings. */
191   ierr = PetscMalloc1(m, &nz_in_row);CHKERRQ(ierr);
192 
193   /* Now actually figure out the permutation and grouping. */
194 
195   /* First pass: Determine number of nonzeros in each row, maximum
196    * number of nonzeros in any row, and how many rows fall into each
197    * "bucket" of rows with same number of nonzeros. */
198   maxnz = 0;
199   for (i=0; i<m; i++) {
200     nz_in_row[i] = ia[i+1]-ia[i];
201     if (nz_in_row[i] > maxnz) maxnz = nz_in_row[i];
202   }
203   ierr = PetscMalloc1(PetscMax(maxnz,m)+1, &rows_in_bucket);CHKERRQ(ierr);
204   ierr = PetscMalloc1(PetscMax(maxnz,m)+1, &ipnz);CHKERRQ(ierr);
205 
206   for (i=0; i<=maxnz; i++) {
207     rows_in_bucket[i] = 0;
208   }
209   for (i=0; i<m; i++) {
210     nz = nz_in_row[i];
211     rows_in_bucket[nz]++;
212   }
213 
214   /* Allocate space for the grouping info.  There will be at most (maxnz + 1)
215    * groups.  (It is maxnz + 1 instead of simply maxnz because there may be
216    * rows with no nonzero elements.)  If there are (maxnz + 1) groups,
217    * then xgroup[] must consist of (maxnz + 2) elements, since the last
218    * element of xgroup will tell us where the (maxnz + 1)th group ends.
219    * We allocate space for the maximum number of groups;
220    * that is potentially a little wasteful, but not too much so.
221    * Perhaps I should fix it later. */
222   ierr = PetscMalloc1(maxnz+2, &aijperm->xgroup);CHKERRQ(ierr);
223   ierr = PetscMalloc1(maxnz+1, &aijperm->nzgroup);CHKERRQ(ierr);
224 
225   /* Second pass.  Look at what is in the buckets and create the groupings.
226    * Note that it is OK to have a group of rows with no non-zero values. */
227   ngroup = 0;
228   istart = 0;
229   for (i=0; i<=maxnz; i++) {
230     if (rows_in_bucket[i] > 0) {
231       aijperm->nzgroup[ngroup] = i;
232       aijperm->xgroup[ngroup]  = istart;
233       ngroup++;
234       istart += rows_in_bucket[i];
235     }
236   }
237 
238   aijperm->xgroup[ngroup] = istart;
239   aijperm->ngroup         = ngroup;
240 
241   /* Now fill in the permutation vector iperm. */
242   ipnz[0] = 0;
243   for (i=0; i<maxnz; i++) {
244     ipnz[i+1] = ipnz[i] + rows_in_bucket[i];
245   }
246 
247   for (i=0; i<m; i++) {
248     nz                   = nz_in_row[i];
249     ipos                 = ipnz[nz];
250     aijperm->iperm[ipos] = i;
251     ipnz[nz]++;
252   }
253 
254   /* Clean up temporary work arrays. */
255   ierr = PetscFree(rows_in_bucket);CHKERRQ(ierr);
256   ierr = PetscFree(ipnz);CHKERRQ(ierr);
257   ierr = PetscFree(nz_in_row);CHKERRQ(ierr);
258   PetscFunctionReturn(0);
259 }
260 
261 
MatAssemblyEnd_SeqAIJPERM(Mat A,MatAssemblyType mode)262 PetscErrorCode MatAssemblyEnd_SeqAIJPERM(Mat A, MatAssemblyType mode)
263 {
264   PetscErrorCode ierr;
265   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
266 
267   PetscFunctionBegin;
268   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
269 
270   /* Since a MATSEQAIJPERM matrix is really just a MATSEQAIJ with some
271    * extra information, call the AssemblyEnd routine for a MATSEQAIJ.
272    * I'm not sure if this is the best way to do this, but it avoids
273    * a lot of code duplication.
274    * I also note that currently MATSEQAIJPERM doesn't know anything about
275    * the Mat_CompressedRow data structure that SeqAIJ now uses when there
276    * are many zero rows.  If the SeqAIJ assembly end routine decides to use
277    * this, this may break things.  (Don't know... haven't looked at it.) */
278   a->inode.use = PETSC_FALSE;
279   ierr         = MatAssemblyEnd_SeqAIJ(A, mode);CHKERRQ(ierr);
280 
281   /* Now calculate the permutation and grouping information. */
282   ierr = MatSeqAIJPERM_create_perm(A);CHKERRQ(ierr);
283   PetscFunctionReturn(0);
284 }
285 
MatMult_SeqAIJPERM(Mat A,Vec xx,Vec yy)286 PetscErrorCode MatMult_SeqAIJPERM(Mat A,Vec xx,Vec yy)
287 {
288   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
289   const PetscScalar *x;
290   PetscScalar       *y;
291   const MatScalar   *aa;
292   PetscErrorCode    ierr;
293   const PetscInt    *aj,*ai;
294 #if !(defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking))
295   PetscInt          i,j;
296 #endif
297 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
298   __m512d           vec_x,vec_y,vec_vals;
299   __m256i           vec_idx,vec_ipos,vec_j;
300   __mmask8           mask;
301 #endif
302 
303   /* Variables that don't appear in MatMult_SeqAIJ. */
304   Mat_SeqAIJPERM    *aijperm = (Mat_SeqAIJPERM*) A->spptr;
305   PetscInt          *iperm;  /* Points to the permutation vector. */
306   PetscInt          *xgroup;
307   /* Denotes where groups of rows with same number of nonzeros
308    * begin and end in iperm. */
309   PetscInt          *nzgroup;
310   PetscInt          ngroup;
311   PetscInt          igroup;
312   PetscInt          jstart,jend;
313   /* jstart is used in loops to denote the position in iperm where a
314    * group starts; jend denotes the position where it ends.
315    * (jend + 1 is where the next group starts.) */
316   PetscInt          iold,nz;
317   PetscInt          istart,iend,isize;
318   PetscInt          ipos;
319   PetscScalar       yp[NDIM];
320   PetscInt          ip[NDIM];    /* yp[] and ip[] are treated as vector "registers" for performing the mat-vec. */
321 
322 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
323 #pragma disjoint(*x,*y,*aa)
324 #endif
325 
326   PetscFunctionBegin;
327   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
328   ierr = VecGetArray(yy,&y);CHKERRQ(ierr);
329   aj   = a->j;   /* aj[k] gives column index for element aa[k]. */
330   aa   = a->a; /* Nonzero elements stored row-by-row. */
331   ai   = a->i;  /* ai[k] is the position in aa and aj where row k starts. */
332 
333   /* Get the info we need about the permutations and groupings. */
334   iperm   = aijperm->iperm;
335   ngroup  = aijperm->ngroup;
336   xgroup  = aijperm->xgroup;
337   nzgroup = aijperm->nzgroup;
338 
339 #if defined(PETSC_USE_FORTRAN_KERNEL_MULTAIJPERM) && defined(notworking)
340   fortranmultaijperm_(&m,x,ii,aj,aa,y);
341 #else
342 
343   for (igroup=0; igroup<ngroup; igroup++) {
344     jstart = xgroup[igroup];
345     jend   = xgroup[igroup+1] - 1;
346     nz     = nzgroup[igroup];
347 
348     /* Handle the special cases where the number of nonzeros per row
349      * in the group is either 0 or 1. */
350     if (nz == 0) {
351       for (i=jstart; i<=jend; i++) {
352         y[iperm[i]] = 0.0;
353       }
354     } else if (nz == 1) {
355       for (i=jstart; i<=jend; i++) {
356         iold    = iperm[i];
357         ipos    = ai[iold];
358         y[iold] = aa[ipos] * x[aj[ipos]];
359       }
360     } else {
361 
362       /* We work our way through the current group in chunks of NDIM rows
363        * at a time. */
364 
365       for (istart=jstart; istart<=jend; istart+=NDIM) {
366         /* Figure out where the chunk of 'isize' rows ends in iperm.
367          * 'isize may of course be less than NDIM for the last chunk. */
368         iend = istart + (NDIM - 1);
369 
370         if (iend > jend) iend = jend;
371 
372         isize = iend - istart + 1;
373 
374         /* Initialize the yp[] array that will be used to hold part of
375          * the permuted results vector, and figure out where in aa each
376          * row of the chunk will begin. */
377         for (i=0; i<isize; i++) {
378           iold = iperm[istart + i];
379           /* iold is a row number from the matrix A *before* reordering. */
380           ip[i] = ai[iold];
381           /* ip[i] tells us where the ith row of the chunk begins in aa. */
382           yp[i] = (PetscScalar) 0.0;
383         }
384 
385         /* If the number of zeros per row exceeds the number of rows in
386          * the chunk, we should vectorize along nz, that is, perform the
387          * mat-vec one row at a time as in the usual CSR case. */
388         if (nz > isize) {
389 #if defined(PETSC_HAVE_CRAY_VECTOR)
390 #pragma _CRI preferstream
391 #endif
392           for (i=0; i<isize; i++) {
393 #if defined(PETSC_HAVE_CRAY_VECTOR)
394 #pragma _CRI prefervector
395 #endif
396 
397 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
398             vec_y = _mm512_setzero_pd();
399             ipos = ip[i];
400             for (j=0; j<(nz>>3); j++) {
401               vec_idx  = _mm256_loadu_si256((__m256i const*)&aj[ipos]);
402               vec_vals = _mm512_loadu_pd(&aa[ipos]);
403               vec_x    = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8);
404               vec_y    = _mm512_fmadd_pd(vec_x,vec_vals,vec_y);
405               ipos += 8;
406             }
407             if ((nz&0x07)>2) {
408               mask     = (__mmask8)(0xff >> (8-(nz&0x07)));
409               vec_idx  = _mm256_loadu_si256((__m256i const*)&aj[ipos]);
410               vec_vals = _mm512_loadu_pd(&aa[ipos]);
411               vec_x    = _mm512_mask_i32gather_pd(vec_x,mask,vec_idx,x,_MM_SCALE_8);
412               vec_y    = _mm512_mask3_fmadd_pd(vec_x,vec_vals,vec_y,mask);
413             } else if ((nz&0x07)==2) {
414               yp[i] += aa[ipos]*x[aj[ipos]];
415               yp[i] += aa[ipos+1]*x[aj[ipos+1]];
416             } else if ((nz&0x07)==1) {
417               yp[i] += aa[ipos]*x[aj[ipos]];
418             }
419             yp[i] += _mm512_reduce_add_pd(vec_y);
420 #else
421             for (j=0; j<nz; j++) {
422               ipos   = ip[i] + j;
423               yp[i] += aa[ipos] * x[aj[ipos]];
424             }
425 #endif
426           }
427         } else {
428           /* Otherwise, there are enough rows in the chunk to make it
429            * worthwhile to vectorize across the rows, that is, to do the
430            * matvec by operating with "columns" of the chunk. */
431           for (j=0; j<nz; j++) {
432 #if defined(PETSC_USE_AVX512_KERNELS) && defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX512F__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
433             vec_j = _mm256_set1_epi32(j);
434             for (i=0; i<((isize>>3)<<3); i+=8) {
435               vec_y    = _mm512_loadu_pd(&yp[i]);
436               vec_ipos = _mm256_loadu_si256((__m256i const*)&ip[i]);
437               vec_ipos = _mm256_add_epi32(vec_ipos,vec_j);
438               vec_idx  = _mm256_i32gather_epi32(aj,vec_ipos,_MM_SCALE_4);
439               vec_vals = _mm512_i32gather_pd(vec_ipos,aa,_MM_SCALE_8);
440               vec_x    = _mm512_i32gather_pd(vec_idx,x,_MM_SCALE_8);
441               vec_y    = _mm512_fmadd_pd(vec_x,vec_vals,vec_y);
442               _mm512_storeu_pd(&yp[i],vec_y);
443             }
444             for (i=isize-(isize&0x07); i<isize; i++) {
445               ipos = ip[i]+j;
446               yp[i] += aa[ipos]*x[aj[ipos]];
447             }
448 #else
449             for (i=0; i<isize; i++) {
450               ipos   = ip[i] + j;
451               yp[i] += aa[ipos] * x[aj[ipos]];
452             }
453 #endif
454           }
455         }
456 
457 #if defined(PETSC_HAVE_CRAY_VECTOR)
458 #pragma _CRI ivdep
459 #endif
460         /* Put results from yp[] into non-permuted result vector y. */
461         for (i=0; i<isize; i++) {
462           y[iperm[istart+i]] = yp[i];
463         }
464       } /* End processing chunk of isize rows of a group. */
465     } /* End handling matvec for chunk with nz > 1. */
466   } /* End loop over igroup. */
467 #endif
468   ierr = PetscLogFlops(PetscMax(2.0*a->nz - A->rmap->n,0));CHKERRQ(ierr);
469   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
470   ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr);
471   PetscFunctionReturn(0);
472 }
473 
474 
475 /* MatMultAdd_SeqAIJPERM() calculates yy = ww + A * xx.
476  * Note that the names I used to designate the vectors differs from that
477  * used in MatMultAdd_SeqAIJ().  I did this to keep my notation consistent
478  * with the MatMult_SeqAIJPERM() routine, which is very similar to this one. */
479 /*
480     I hate having virtually identical code for the mult and the multadd!!!
481 */
MatMultAdd_SeqAIJPERM(Mat A,Vec xx,Vec ww,Vec yy)482 PetscErrorCode MatMultAdd_SeqAIJPERM(Mat A,Vec xx,Vec ww,Vec yy)
483 {
484   Mat_SeqAIJ        *a = (Mat_SeqAIJ*)A->data;
485   const PetscScalar *x;
486   PetscScalar       *y,*w;
487   const MatScalar   *aa;
488   PetscErrorCode    ierr;
489   const PetscInt    *aj,*ai;
490 #if !defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
491   PetscInt i,j;
492 #endif
493 
494   /* Variables that don't appear in MatMultAdd_SeqAIJ. */
495   Mat_SeqAIJPERM * aijperm;
496   PetscInt       *iperm;    /* Points to the permutation vector. */
497   PetscInt       *xgroup;
498   /* Denotes where groups of rows with same number of nonzeros
499    * begin and end in iperm. */
500   PetscInt *nzgroup;
501   PetscInt ngroup;
502   PetscInt igroup;
503   PetscInt jstart,jend;
504   /* jstart is used in loops to denote the position in iperm where a
505    * group starts; jend denotes the position where it ends.
506    * (jend + 1 is where the next group starts.) */
507   PetscInt    iold,nz;
508   PetscInt    istart,iend,isize;
509   PetscInt    ipos;
510   PetscScalar yp[NDIM];
511   PetscInt    ip[NDIM];
512   /* yp[] and ip[] are treated as vector "registers" for performing
513    * the mat-vec. */
514 
515 #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
516 #pragma disjoint(*x,*y,*aa)
517 #endif
518 
519   PetscFunctionBegin;
520   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
521   ierr = VecGetArrayPair(yy,ww,&y,&w);CHKERRQ(ierr);
522 
523   aj = a->j;   /* aj[k] gives column index for element aa[k]. */
524   aa = a->a;   /* Nonzero elements stored row-by-row. */
525   ai = a->i;   /* ai[k] is the position in aa and aj where row k starts. */
526 
527   /* Get the info we need about the permutations and groupings. */
528   aijperm = (Mat_SeqAIJPERM*) A->spptr;
529   iperm   = aijperm->iperm;
530   ngroup  = aijperm->ngroup;
531   xgroup  = aijperm->xgroup;
532   nzgroup = aijperm->nzgroup;
533 
534 #if defined(PETSC_USE_FORTRAN_KERNEL_MULTADDAIJPERM)
535   fortranmultaddaijperm_(&m,x,ii,aj,aa,y,w);
536 #else
537 
538   for (igroup=0; igroup<ngroup; igroup++) {
539     jstart = xgroup[igroup];
540     jend   = xgroup[igroup+1] - 1;
541 
542     nz = nzgroup[igroup];
543 
544     /* Handle the special cases where the number of nonzeros per row
545      * in the group is either 0 or 1. */
546     if (nz == 0) {
547       for (i=jstart; i<=jend; i++) {
548         iold    = iperm[i];
549         y[iold] = w[iold];
550       }
551     }
552     else if (nz == 1) {
553       for (i=jstart; i<=jend; i++) {
554         iold    = iperm[i];
555         ipos    = ai[iold];
556         y[iold] = w[iold] + aa[ipos] * x[aj[ipos]];
557       }
558     }
559     /* For the general case: */
560     else {
561 
562       /* We work our way through the current group in chunks of NDIM rows
563        * at a time. */
564 
565       for (istart=jstart; istart<=jend; istart+=NDIM) {
566         /* Figure out where the chunk of 'isize' rows ends in iperm.
567          * 'isize may of course be less than NDIM for the last chunk. */
568         iend = istart + (NDIM - 1);
569         if (iend > jend) iend = jend;
570         isize = iend - istart + 1;
571 
572         /* Initialize the yp[] array that will be used to hold part of
573          * the permuted results vector, and figure out where in aa each
574          * row of the chunk will begin. */
575         for (i=0; i<isize; i++) {
576           iold = iperm[istart + i];
577           /* iold is a row number from the matrix A *before* reordering. */
578           ip[i] = ai[iold];
579           /* ip[i] tells us where the ith row of the chunk begins in aa. */
580           yp[i] = w[iold];
581         }
582 
583         /* If the number of zeros per row exceeds the number of rows in
584          * the chunk, we should vectorize along nz, that is, perform the
585          * mat-vec one row at a time as in the usual CSR case. */
586         if (nz > isize) {
587 #if defined(PETSC_HAVE_CRAY_VECTOR)
588 #pragma _CRI preferstream
589 #endif
590           for (i=0; i<isize; i++) {
591 #if defined(PETSC_HAVE_CRAY_VECTOR)
592 #pragma _CRI prefervector
593 #endif
594             for (j=0; j<nz; j++) {
595               ipos   = ip[i] + j;
596               yp[i] += aa[ipos] * x[aj[ipos]];
597             }
598           }
599         }
600         /* Otherwise, there are enough rows in the chunk to make it
601          * worthwhile to vectorize across the rows, that is, to do the
602          * matvec by operating with "columns" of the chunk. */
603         else {
604           for (j=0; j<nz; j++) {
605             for (i=0; i<isize; i++) {
606               ipos   = ip[i] + j;
607               yp[i] += aa[ipos] * x[aj[ipos]];
608             }
609           }
610         }
611 
612 #if defined(PETSC_HAVE_CRAY_VECTOR)
613 #pragma _CRI ivdep
614 #endif
615         /* Put results from yp[] into non-permuted result vector y. */
616         for (i=0; i<isize; i++) {
617           y[iperm[istart+i]] = yp[i];
618         }
619       } /* End processing chunk of isize rows of a group. */
620 
621     } /* End handling matvec for chunk with nz > 1. */
622   } /* End loop over igroup. */
623 
624 #endif
625   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
626   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
627   ierr = VecRestoreArrayPair(yy,ww,&y,&w);CHKERRQ(ierr);
628   PetscFunctionReturn(0);
629 }
630 
631 /* MatConvert_SeqAIJ_SeqAIJPERM converts a SeqAIJ matrix into a
632  * SeqAIJPERM matrix.  This routine is called by the MatCreate_SeqAIJPERM()
633  * routine, but can also be used to convert an assembled SeqAIJ matrix
634  * into a SeqAIJPERM one. */
MatConvert_SeqAIJ_SeqAIJPERM(Mat A,MatType type,MatReuse reuse,Mat * newmat)635 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJPERM(Mat A,MatType type,MatReuse reuse,Mat *newmat)
636 {
637   PetscErrorCode ierr;
638   Mat            B = *newmat;
639   Mat_SeqAIJPERM *aijperm;
640   PetscBool      sametype;
641 
642   PetscFunctionBegin;
643   if (reuse == MAT_INITIAL_MATRIX) {
644     ierr = MatDuplicate(A,MAT_COPY_VALUES,&B);CHKERRQ(ierr);
645   }
646   ierr = PetscObjectTypeCompare((PetscObject)A,type,&sametype);CHKERRQ(ierr);
647   if (sametype) PetscFunctionReturn(0);
648 
649   ierr     = PetscNewLog(B,&aijperm);CHKERRQ(ierr);
650   B->spptr = (void*) aijperm;
651 
652   /* Set function pointers for methods that we inherit from AIJ but override. */
653   B->ops->duplicate   = MatDuplicate_SeqAIJPERM;
654   B->ops->assemblyend = MatAssemblyEnd_SeqAIJPERM;
655   B->ops->destroy     = MatDestroy_SeqAIJPERM;
656   B->ops->mult        = MatMult_SeqAIJPERM;
657   B->ops->multadd     = MatMultAdd_SeqAIJPERM;
658 
659   aijperm->nonzerostate = -1;  /* this will trigger the generation of the permutation information the first time through MatAssembly()*/
660   /* If A has already been assembled, compute the permutation. */
661   if (A->assembled) {
662     ierr = MatSeqAIJPERM_create_perm(B);CHKERRQ(ierr);
663   }
664 
665   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijperm_seqaij_C",MatConvert_SeqAIJPERM_SeqAIJ);CHKERRQ(ierr);
666 
667   ierr    = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJPERM);CHKERRQ(ierr);
668   *newmat = B;
669   PetscFunctionReturn(0);
670 }
671 
672 /*@C
673    MatCreateSeqAIJPERM - Creates a sparse matrix of type SEQAIJPERM.
674    This type inherits from AIJ, but calculates some additional permutation
675    information that is used to allow better vectorization of some
676    operations.  At the cost of increased storage, the AIJ formatted
677    matrix can be copied to a format in which pieces of the matrix are
678    stored in ELLPACK format, allowing the vectorized matrix multiply
679    routine to use stride-1 memory accesses.  As with the AIJ type, it is
680    important to preallocate matrix storage in order to get good assembly
681    performance.
682 
683    Collective
684 
685    Input Parameters:
686 +  comm - MPI communicator, set to PETSC_COMM_SELF
687 .  m - number of rows
688 .  n - number of columns
689 .  nz - number of nonzeros per row (same for all rows)
690 -  nnz - array containing the number of nonzeros in the various rows
691          (possibly different for each row) or NULL
692 
693    Output Parameter:
694 .  A - the matrix
695 
696    Notes:
697    If nnz is given then nz is ignored
698 
699    Level: intermediate
700 
701 .seealso: MatCreate(), MatCreateMPIAIJPERM(), MatSetValues()
702 @*/
MatCreateSeqAIJPERM(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat * A)703 PetscErrorCode  MatCreateSeqAIJPERM(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
704 {
705   PetscErrorCode ierr;
706 
707   PetscFunctionBegin;
708   ierr = MatCreate(comm,A);CHKERRQ(ierr);
709   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
710   ierr = MatSetType(*A,MATSEQAIJPERM);CHKERRQ(ierr);
711   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,nnz);CHKERRQ(ierr);
712   PetscFunctionReturn(0);
713 }
714 
MatCreate_SeqAIJPERM(Mat A)715 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJPERM(Mat A)
716 {
717   PetscErrorCode ierr;
718 
719   PetscFunctionBegin;
720   ierr = MatSetType(A,MATSEQAIJ);CHKERRQ(ierr);
721   ierr = MatConvert_SeqAIJ_SeqAIJPERM(A,MATSEQAIJPERM,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
722   PetscFunctionReturn(0);
723 }
724 
725