1 
2 /*
3     Factorization code for BAIJ format.
4 */
5 #include <../src/mat/impls/baij/seq/baij.h>
6 #include <petsc/private/kernels/blockinvert.h>
7 
8 /* ------------------------------------------------------------*/
9 /*
10       Version for when blocks are 4 by 4
11 */
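/*
   Outline shared by all of the 4x4 variants in this file:
     for each block row i of the factor
       1. zero the 4x4 (16-entry, column-major) accumulators in rtmp for the
          sparsity pattern of row i of the factor;
       2. copy block row r[i] of A into rtmp (through the permutations r[] and
          ic[] when a non-natural ordering is used);
       3. for each previously factored row `row` appearing in L(i,:), form the
          multiplier L(i,row) = L(i,row)*inv(U(row,row)) (the diagonal blocks
          are stored already inverted) and subtract multiplier*U(row,:) from
          the remaining blocks of row i;
       4. copy the finished row from rtmp into b->a and invert its diagonal
          block for use by later rows and by the triangular solves.
*/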
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C,Mat A,const MatFactorInfo *info)
13 {
14   Mat_SeqBAIJ    *a    = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
15   IS             isrow = b->row,isicol = b->icol;
16   PetscErrorCode ierr;
17   const PetscInt *r,*ic;
18   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
19   PetscInt       *ajtmpold,*ajtmp,nz,row;
20   PetscInt       *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
21   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
22   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
23   MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
24   MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
25   MatScalar      m13,m14,m15,m16;
26   MatScalar      *ba           = b->a,*aa = a->a;
27   PetscBool      pivotinblocks = b->pivotinblocks;
28   PetscReal      shift         = info->shiftamount;
29   PetscBool      allowzeropivot,zeropivotdetected=PETSC_FALSE;
30 
31   PetscFunctionBegin;
32   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
33   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
34   ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);
35   allowzeropivot = PetscNot(A->erroriffailure);
36 
37   for (i=0; i<n; i++) {
38     nz    = bi[i+1] - bi[i];
39     ajtmp = bj + bi[i];
40     for  (j=0; j<nz; j++) {
41       x     = rtmp+16*ajtmp[j];
42       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
43       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
44     }
45     /* load in initial (unfactored row) */
46     idx      = r[i];
47     nz       = ai[idx+1] - ai[idx];
48     ajtmpold = aj + ai[idx];
49     v        = aa + 16*ai[idx];
50     for (j=0; j<nz; j++) {
51       x     = rtmp+16*ic[ajtmpold[j]];
52       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
53       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
54       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
55       x[14] = v[14]; x[15] = v[15];
56       v    += 16;
57     }
58     row = *ajtmp++;
59     while (row < i) {
60       pc  = rtmp + 16*row;
61       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
62       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
63       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
64       p15 = pc[14]; p16 = pc[15];
65       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
66           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
67           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
68           || p16 != 0.0) {
69         pv    = ba + 16*diag_offset[row];
70         pj    = bj + diag_offset[row] + 1;
71         x1    = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
72         x5    = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
73         x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
74         x15   = pv[14]; x16 = pv[15];
75         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
76         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
77         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
78         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;
79 
80         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
81         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
82         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
83         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;
84 
85         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
86         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
87         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
88         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;
89 
90         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
91         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
92         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
93         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
94 
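        /* pc now holds the 4x4 multiplier L(i,row)*inv(U(row,row)), stored
           column-major: m1..m4 is column 0, m5..m8 column 1, and so on */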
95         nz  = bi[row+1] - diag_offset[row] - 1;
96         pv += 16;
97         for (j=0; j<nz; j++) {
98           x1    = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
99           x5    = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
100           x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
101           x14   = pv[13]; x15 = pv[14]; x16 = pv[15];
102           x     = rtmp + 16*pj[j];
103           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
104           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
105           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
106           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;
107 
108           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
109           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
110           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
111           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;
112 
113           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
114           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
115           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
116           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
117 
118           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
119           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
120           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
121           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;
122 
123           pv += 16;
124         }
125         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
126       }
127       row = *ajtmp++;
128     }
129     /* finished row so stick it into b->a */
130     pv = ba + 16*bi[i];
131     pj = bj + bi[i];
132     nz = bi[i+1] - bi[i];
133     for (j=0; j<nz; j++) {
134       x      = rtmp+16*pj[j];
135       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
136       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
137       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
138       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
139       pv    += 16;
140     }
141     /* invert diagonal block */
142     w = ba + 16*diag_offset[i];
143     if (pivotinblocks) {
144       ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
145       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
146     } else {
147       ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
148     }
149   }
150 
151   ierr = PetscFree(rtmp);CHKERRQ(ierr);
152   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
153   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
154 
155   C->ops->solve          = MatSolve_SeqBAIJ_4_inplace;
156   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
157   C->assembled           = PETSC_TRUE;
158 
159   ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
160   PetscFunctionReturn(0);
161 }
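
/*
   Reference (non-unrolled) form of the two block operations that the
   hand-unrolled arithmetic above implements, written for a generic block size
   bs with column-major bs-by-bs blocks.  This is a documentation sketch only:
   the helper names below are not PETSc kernels and nothing in this file calls
   them; the 4x4 routines use the explicit 16-term expressions instead.
*/
PETSC_STATIC_INLINE void BlockMultiplier_Ref(PetscInt bs,MatScalar *pc,const MatScalar *pv,MatScalar *work)
{
  PetscInt r,c,k;
  /* work = pc * pv, where pv is the already-inverted diagonal block U(row,row) */
  for (c=0; c<bs; c++) {
    for (r=0; r<bs; r++) {
      work[r+bs*c] = 0.0;
      for (k=0; k<bs; k++) work[r+bs*c] += pc[r+bs*k]*pv[k+bs*c];
    }
  }
  /* pc <- pc * pv, i.e. the multiplier L(i,row)*inv(U(row,row)) */
  for (c=0; c<bs*bs; c++) pc[c] = work[c];
}

PETSC_STATIC_INLINE void BlockUpdate_Ref(PetscInt bs,MatScalar *x,const MatScalar *m,const MatScalar *pv)
{
  PetscInt r,c,k;
  /* x -= m * pv: update one block of the working row with multiplier m */
  for (c=0; c<bs; c++) {
    for (r=0; r<bs; r++) {
      for (k=0; k<bs; k++) x[r+bs*c] -= m[r+bs*k]*pv[k+bs*c];
    }
  }
}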
162 
163 /* MatLUFactorNumeric_SeqBAIJ_4 -
164      copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
165        PetscKernel_A_gets_A_times_B()
166        PetscKernel_A_gets_A_minus_B_times_C()
167        PetscKernel_A_gets_inverse_A()
168 */
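
/*
   Storage convention assumed by MatLUFactorNumeric_SeqBAIJ_4() (and the other
   non-"inplace" variants below): for block row i of the factor,
     - the L part occupies bj[bi[i]] .. bj[bi[i+1]-1] in ascending column order,
     - the (inverted) diagonal block sits at bdiag[i],
     - the strict U part occupies bj[bdiag[i+1]+1] .. bj[bdiag[i]-1],
   i.e. bdiag[] decreases with i, which is why counts such as
   nz = bdiag[i] - bdiag[i+1] - 1 appear below.
*/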
169 
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B,Mat A,const MatFactorInfo *info)
171 {
172   Mat            C     = B;
173   Mat_SeqBAIJ    *a    = (Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
174   IS             isrow = b->row,isicol = b->icol;
175   PetscErrorCode ierr;
176   const PetscInt *r,*ic;
177   PetscInt       i,j,k,nz,nzL,row;
178   const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
179   const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
180   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
181   PetscInt       flg;
182   PetscReal      shift;
183   PetscBool      allowzeropivot,zeropivotdetected;
184 
185   PetscFunctionBegin;
186   allowzeropivot = PetscNot(A->erroriffailure);
187   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
188   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
189 
190   if (info->shifttype == MAT_SHIFT_NONE) {
191     shift = 0;
192   } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
193     shift = info->shiftamount;
194   }
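  /* With MAT_SHIFT_INBLOCKS, `shift` is handed to PetscKernel_A_gets_inverse_A_4(),
     which uses it to perturb a zero pivot encountered while inverting a diagonal
     block (see that kernel for the exact test) */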
195 
196   /* generate work space needed by the factorization */
197   ierr = PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);CHKERRQ(ierr);
198   ierr = PetscArrayzero(rtmp,bs2*n);CHKERRQ(ierr);
199 
200   for (i=0; i<n; i++) {
201     /* zero rtmp */
202     /* L part */
203     nz    = bi[i+1] - bi[i];
204     bjtmp = bj + bi[i];
205     for  (j=0; j<nz; j++) {
206       ierr = PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);CHKERRQ(ierr);
207     }
208 
209     /* U part */
210     nz    = bdiag[i]-bdiag[i+1];
211     bjtmp = bj + bdiag[i+1]+1;
212     for  (j=0; j<nz; j++) {
213       ierr = PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);CHKERRQ(ierr);
214     }
215 
216     /* load in initial (unfactored row) */
217     nz    = ai[r[i]+1] - ai[r[i]];
218     ajtmp = aj + ai[r[i]];
219     v     = aa + bs2*ai[r[i]];
220     for (j=0; j<nz; j++) {
221       ierr = PetscArraycpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2);CHKERRQ(ierr);
222     }
223 
224     /* elimination */
225     bjtmp = bj + bi[i];
226     nzL   = bi[i+1] - bi[i];
227     for (k=0; k < nzL; k++) {
228       row = bjtmp[k];
229       pc  = rtmp + bs2*row;
230       for (flg=0,j=0; j<bs2; j++) {
231         if (pc[j]!=0.0) {
232           flg = 1;
233           break;
234         }
235       }
236       if (flg) {
237         pv = b->a + bs2*bdiag[row];
238         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
239         ierr = PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);
240 
        pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
        pv = b->a + bs2*(bdiag[row+1]+1);
        nz = bdiag[row] - bdiag[row+1] - 1; /* number of entries in U(row,:), excluding the diagonal */
244         for (j=0; j<nz; j++) {
245           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
246           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
247           v    = rtmp + bs2*pj[j];
248           ierr = PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
249           pv  += bs2;
250         }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 = 128*nz + 112 */
252       }
253     }
254 
255     /* finished row so stick it into b->a */
256     /* L part */
257     pv = b->a + bs2*bi[i];
258     pj = b->j + bi[i];
259     nz = bi[i+1] - bi[i];
260     for (j=0; j<nz; j++) {
261       ierr = PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);CHKERRQ(ierr);
262     }
263 
    /* Mark diagonal and invert diagonal for simpler triangular solves */
265     pv   = b->a + bs2*bdiag[i];
266     pj   = b->j + bdiag[i];
267     ierr = PetscArraycpy(pv,rtmp+bs2*pj[0],bs2);CHKERRQ(ierr);
268     ierr = PetscKernel_A_gets_inverse_A_4(pv,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
269     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
270 
271     /* U part */
272     pv = b->a + bs2*(bdiag[i+1]+1);
273     pj = b->j + bdiag[i+1]+1;
274     nz = bdiag[i] - bdiag[i+1] - 1;
275     for (j=0; j<nz; j++) {
276       ierr = PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);CHKERRQ(ierr);
277     }
278   }
279 
280   ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
281   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
282   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
283 
284   C->ops->solve          = MatSolve_SeqBAIJ_4;
285   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
286   C->assembled           = PETSC_TRUE;
287 
288   ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
289   PetscFunctionReturn(0);
290 }
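
/*
   Illustrative call sequence (not part of this file; error checking omitted)
   that ends up dispatching to one of the numeric routines above when the
   matrix is SeqBAIJ with block size 4:

     Mat           F;
     IS            rperm,cperm;
     MatFactorInfo info;

     MatGetFactor(A,MATSOLVERPETSC,MAT_FACTOR_LU,&F);
     MatGetOrdering(A,MATORDERINGND,&rperm,&cperm);
     MatFactorInfoInitialize(&info);
     MatLUFactorSymbolic(F,A,rperm,cperm,&info);
     MatLUFactorNumeric(F,A,&info);
     MatSolve(F,rhs,sol);

   With a natural ordering the _NaturalOrdering variants are selected instead.
*/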
291 
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info)
293 {
294 /*
295     Default Version for when blocks are 4 by 4 Using natural ordering
296 */
297   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
298   PetscErrorCode ierr;
299   PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
300   PetscInt       *ajtmpold,*ajtmp,nz,row;
301   PetscInt       *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
302   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
303   MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
304   MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
305   MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
306   MatScalar      m13,m14,m15,m16;
307   MatScalar      *ba           = b->a,*aa = a->a;
308   PetscBool      pivotinblocks = b->pivotinblocks;
309   PetscReal      shift         = info->shiftamount;
310   PetscBool      allowzeropivot,zeropivotdetected=PETSC_FALSE;
311 
312   PetscFunctionBegin;
313   allowzeropivot = PetscNot(A->erroriffailure);
314   ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);
315 
316   for (i=0; i<n; i++) {
317     nz    = bi[i+1] - bi[i];
318     ajtmp = bj + bi[i];
319     for  (j=0; j<nz; j++) {
320       x     = rtmp+16*ajtmp[j];
321       x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5]  = x[6] = x[7] = x[8] = x[9] = 0.0;
322       x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
323     }
324     /* load in initial (unfactored row) */
325     nz       = ai[i+1] - ai[i];
326     ajtmpold = aj + ai[i];
327     v        = aa + 16*ai[i];
328     for (j=0; j<nz; j++) {
329       x     = rtmp+16*ajtmpold[j];
330       x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
331       x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8]  = v[8];
332       x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
333       x[14] = v[14]; x[15] = v[15];
334       v    += 16;
335     }
336     row = *ajtmp++;
337     while (row < i) {
338       pc  = rtmp + 16*row;
339       p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
340       p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9  = pc[8];
341       p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
342       p15 = pc[14]; p16 = pc[15];
343       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
344           p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
345           p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
346           || p16 != 0.0) {
347         pv    = ba + 16*diag_offset[row];
348         pj    = bj + diag_offset[row] + 1;
349         x1    = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
350         x5    = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9  = pv[8];
351         x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
352         x15   = pv[14]; x16 = pv[15];
353         pc[0] = m1 = p1*x1 + p5*x2  + p9*x3  + p13*x4;
354         pc[1] = m2 = p2*x1 + p6*x2  + p10*x3 + p14*x4;
355         pc[2] = m3 = p3*x1 + p7*x2  + p11*x3 + p15*x4;
356         pc[3] = m4 = p4*x1 + p8*x2  + p12*x3 + p16*x4;
357 
358         pc[4] = m5 = p1*x5 + p5*x6  + p9*x7  + p13*x8;
359         pc[5] = m6 = p2*x5 + p6*x6  + p10*x7 + p14*x8;
360         pc[6] = m7 = p3*x5 + p7*x6  + p11*x7 + p15*x8;
361         pc[7] = m8 = p4*x5 + p8*x6  + p12*x7 + p16*x8;
362 
363         pc[8]  = m9  = p1*x9 + p5*x10  + p9*x11  + p13*x12;
364         pc[9]  = m10 = p2*x9 + p6*x10  + p10*x11 + p14*x12;
365         pc[10] = m11 = p3*x9 + p7*x10  + p11*x11 + p15*x12;
366         pc[11] = m12 = p4*x9 + p8*x10  + p12*x11 + p16*x12;
367 
368         pc[12] = m13 = p1*x13 + p5*x14  + p9*x15  + p13*x16;
369         pc[13] = m14 = p2*x13 + p6*x14  + p10*x15 + p14*x16;
370         pc[14] = m15 = p3*x13 + p7*x14  + p11*x15 + p15*x16;
371         pc[15] = m16 = p4*x13 + p8*x14  + p12*x15 + p16*x16;
372         nz     = bi[row+1] - diag_offset[row] - 1;
373         pv    += 16;
374         for (j=0; j<nz; j++) {
375           x1    = pv[0];  x2  = pv[1];   x3 = pv[2];  x4  = pv[3];
376           x5    = pv[4];  x6  = pv[5];   x7 = pv[6];  x8  = pv[7]; x9 = pv[8];
377           x10   = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
378           x14   = pv[13]; x15 = pv[14]; x16 = pv[15];
379           x     = rtmp + 16*pj[j];
380           x[0] -= m1*x1 + m5*x2  + m9*x3  + m13*x4;
381           x[1] -= m2*x1 + m6*x2  + m10*x3 + m14*x4;
382           x[2] -= m3*x1 + m7*x2  + m11*x3 + m15*x4;
383           x[3] -= m4*x1 + m8*x2  + m12*x3 + m16*x4;
384 
385           x[4] -= m1*x5 + m5*x6  + m9*x7  + m13*x8;
386           x[5] -= m2*x5 + m6*x6  + m10*x7 + m14*x8;
387           x[6] -= m3*x5 + m7*x6  + m11*x7 + m15*x8;
388           x[7] -= m4*x5 + m8*x6  + m12*x7 + m16*x8;
389 
390           x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
391           x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
392           x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
393           x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
394 
395           x[12] -= m1*x13 + m5*x14  + m9*x15  + m13*x16;
396           x[13] -= m2*x13 + m6*x14  + m10*x15 + m14*x16;
397           x[14] -= m3*x13 + m7*x14  + m11*x15 + m15*x16;
398           x[15] -= m4*x13 + m8*x14  + m12*x15 + m16*x16;
399 
400           pv += 16;
401         }
402         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
403       }
404       row = *ajtmp++;
405     }
406     /* finished row so stick it into b->a */
407     pv = ba + 16*bi[i];
408     pj = bj + bi[i];
409     nz = bi[i+1] - bi[i];
410     for (j=0; j<nz; j++) {
411       x      = rtmp+16*pj[j];
412       pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
413       pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7]; pv[8] = x[8];
414       pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
415       pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
416       pv    += 16;
417     }
418     /* invert diagonal block */
419     w = ba + 16*diag_offset[i];
420     if (pivotinblocks) {
421       ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
422       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
423     } else {
424       ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
425     }
426   }
427 
428   ierr = PetscFree(rtmp);CHKERRQ(ierr);
429 
430   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
431   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
432   C->assembled           = PETSC_TRUE;
433 
434   ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
435   PetscFunctionReturn(0);
436 }
437 
438 /*
439   MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
440     copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
441 */
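/*
   This variant is the same as MatLUFactorNumeric_SeqBAIJ_4() except that no
   row/column permutation is applied: the ordering is natural, so A's row i and
   its column indices are used directly instead of going through r[] and ic[].
*/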
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
443 {
444   Mat            C =B;
445   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
446   PetscErrorCode ierr;
447   PetscInt       i,j,k,nz,nzL,row;
448   const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
449   const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
450   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
451   PetscInt       flg;
452   PetscReal      shift;
453   PetscBool      allowzeropivot,zeropivotdetected;
454 
455   PetscFunctionBegin;
456   allowzeropivot = PetscNot(A->erroriffailure);
457 
458   /* generate work space needed by the factorization */
459   ierr = PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);CHKERRQ(ierr);
460   ierr = PetscArrayzero(rtmp,bs2*n);CHKERRQ(ierr);
461 
462   if (info->shifttype == MAT_SHIFT_NONE) {
463     shift = 0;
464   } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
465     shift = info->shiftamount;
466   }
467 
468   for (i=0; i<n; i++) {
469     /* zero rtmp */
470     /* L part */
471     nz    = bi[i+1] - bi[i];
472     bjtmp = bj + bi[i];
473     for  (j=0; j<nz; j++) {
474       ierr = PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);CHKERRQ(ierr);
475     }
476 
477     /* U part */
478     nz    = bdiag[i] - bdiag[i+1];
479     bjtmp = bj + bdiag[i+1]+1;
480     for  (j=0; j<nz; j++) {
481       ierr = PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);CHKERRQ(ierr);
482     }
483 
484     /* load in initial (unfactored row) */
485     nz    = ai[i+1] - ai[i];
486     ajtmp = aj + ai[i];
487     v     = aa + bs2*ai[i];
488     for (j=0; j<nz; j++) {
489       ierr = PetscArraycpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2);CHKERRQ(ierr);
490     }
491 
492     /* elimination */
493     bjtmp = bj + bi[i];
494     nzL   = bi[i+1] - bi[i];
495     for (k=0; k < nzL; k++) {
496       row = bjtmp[k];
497       pc  = rtmp + bs2*row;
498       for (flg=0,j=0; j<bs2; j++) {
499         if (pc[j]!=0.0) {
500           flg = 1;
501           break;
502         }
503       }
504       if (flg) {
505         pv = b->a + bs2*bdiag[row];
506         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
507         ierr = PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);
508 
        pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
        pv = b->a + bs2*(bdiag[row+1]+1);
        nz = bdiag[row] - bdiag[row+1] - 1; /* number of entries in U(row,:), excluding the diagonal */
512         for (j=0; j<nz; j++) {
513           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
514           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
515           v    = rtmp + bs2*pj[j];
516           ierr = PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
517           pv  += bs2;
518         }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 = 128*nz + 112 */
520       }
521     }
522 
523     /* finished row so stick it into b->a */
524     /* L part */
525     pv = b->a + bs2*bi[i];
526     pj = b->j + bi[i];
527     nz = bi[i+1] - bi[i];
528     for (j=0; j<nz; j++) {
529       ierr = PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);CHKERRQ(ierr);
530     }
531 
    /* Mark diagonal and invert diagonal for simpler triangular solves */
533     pv   = b->a + bs2*bdiag[i];
534     pj   = b->j + bdiag[i];
535     ierr = PetscArraycpy(pv,rtmp+bs2*pj[0],bs2);CHKERRQ(ierr);
536     ierr = PetscKernel_A_gets_inverse_A_4(pv,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
537     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
538 
539     /* U part */
540     pv = b->a + bs2*(bdiag[i+1]+1);
541     pj = b->j + bdiag[i+1]+1;
542     nz = bdiag[i] - bdiag[i+1] - 1;
543     for (j=0; j<nz; j++) {
544       ierr = PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);CHKERRQ(ierr);
545     }
546   }
547   ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
548 
549   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
550   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
551   C->assembled           = PETSC_TRUE;
552 
553   ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
554   PetscFunctionReturn(0);
555 }
556 
557 #if defined(PETSC_HAVE_SSE)
558 
559 #include PETSC_HAVE_SSE
560 
561 /* SSE Version for when blocks are 4 by 4 Using natural ordering */
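/*
   Notes on the SSE variants: each 4x4 single-precision block is held columnwise
   in four 128-bit registers (XMM0-XMM3); blocks are moved with LOADL/LOADH and
   STOREL/STOREH pairs, the zero-block test is done by comparing against a
   zeroed register (CMPNEQ) and reducing with OR followed by MOVEMASK, and the
   arrays aa, ba and rtmp must be 16-byte aligned (checked below).
*/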
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat            C  = B; /* the factor matrix, referenced as C below */
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
565   PetscErrorCode ierr;
566   int            i,j,n = a->mbs;
567   int            *bj = b->j,*bjtmp,*pj;
568   int            row;
569   int            *ajtmpold,nz,*bi=b->i;
570   int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
571   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
572   MatScalar      *ba    = b->a,*aa = a->a;
573   int            nonzero=0;
574   PetscBool      pivotinblocks = b->pivotinblocks;
575   PetscReal      shift         = info->shiftamount;
576   PetscBool      allowzeropivot,zeropivotdetected=PETSC_FALSE;
577 
578   PetscFunctionBegin;
579   allowzeropivot = PetscNot(A->erroriffailure);
580   SSE_SCOPE_BEGIN;
581 
582   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
583   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
584   ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);
585   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
586 /*    if ((unsigned long)bj==(unsigned long)aj) { */
587 /*      colscale = 4; */
588 /*    } */
589   for (i=0; i<n; i++) {
590     nz    = bi[i+1] - bi[i];
591     bjtmp = bj + bi[i];
592     /* zero out the 4x4 block accumulators */
593     /* zero out one register */
594     XOR_PS(XMM7,XMM7);
595     for  (j=0; j<nz; j++) {
596       x = rtmp+16*bjtmp[j];
597 /*        x = rtmp+4*bjtmp[j]; */
598       SSE_INLINE_BEGIN_1(x)
599       /* Copy zero register to memory locations */
600       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
601       SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
602       SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
603       SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
604       SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
605       SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
606       SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
607       SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
608       SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
609       SSE_INLINE_END_1;
610     }
611     /* load in initial (unfactored row) */
612     nz       = ai[i+1] - ai[i];
613     ajtmpold = aj + ai[i];
614     v        = aa + 16*ai[i];
615     for (j=0; j<nz; j++) {
616       x = rtmp+16*ajtmpold[j];
617 /*        x = rtmp+colscale*ajtmpold[j]; */
618       /* Copy v block into x block */
619       SSE_INLINE_BEGIN_2(v,x)
620       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
621       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
622       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
623 
624       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
625       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
626 
627       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
628       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
629 
630       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
631       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
632 
633       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
634       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
635 
636       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
637       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
638 
639       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
640       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
641 
642       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
643       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
644       SSE_INLINE_END_2;
645 
646       v += 16;
647     }
648 /*      row = (*bjtmp++)/4; */
649     row = *bjtmp++;
650     while (row < i) {
651       pc = rtmp + 16*row;
652       SSE_INLINE_BEGIN_1(pc)
653       /* Load block from lower triangle */
654       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
655       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
656       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
657 
658       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
659       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
660 
661       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
662       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
663 
664       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
665       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
666 
667       /* Compare block to zero block */
668 
669       SSE_COPY_PS(XMM4,XMM7)
670       SSE_CMPNEQ_PS(XMM4,XMM0)
671 
672       SSE_COPY_PS(XMM5,XMM7)
673       SSE_CMPNEQ_PS(XMM5,XMM1)
674 
675       SSE_COPY_PS(XMM6,XMM7)
676       SSE_CMPNEQ_PS(XMM6,XMM2)
677 
678       SSE_CMPNEQ_PS(XMM7,XMM3)
679 
680       /* Reduce the comparisons to one SSE register */
681       SSE_OR_PS(XMM6,XMM7)
682       SSE_OR_PS(XMM5,XMM4)
683       SSE_OR_PS(XMM5,XMM6)
684       SSE_INLINE_END_1;
685 
686       /* Reduce the one SSE register to an integer register for branching */
687       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
688       MOVEMASK(nonzero,XMM5);
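      /* MOVEMASK (presumably a movmskps wrapper) extracts the four sign bits of
         XMM5, so nonzero != 0 exactly when some entry of the block is nonzero */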
689 
690       /* If block is nonzero ... */
691       if (nonzero) {
692         pv = ba + 16*diag_offset[row];
693         PREFETCH_L1(&pv[16]);
694         pj = bj + diag_offset[row] + 1;
695 
696         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
697         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
698         /* but the diagonal was inverted already */
699         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
700 
701         SSE_INLINE_BEGIN_2(pv,pc)
702         /* Column 0, product is accumulated in XMM4 */
703         SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
704         SSE_SHUFFLE(XMM4,XMM4,0x00)
705         SSE_MULT_PS(XMM4,XMM0)
706 
707         SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
708         SSE_SHUFFLE(XMM5,XMM5,0x00)
709         SSE_MULT_PS(XMM5,XMM1)
710         SSE_ADD_PS(XMM4,XMM5)
711 
712         SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
713         SSE_SHUFFLE(XMM6,XMM6,0x00)
714         SSE_MULT_PS(XMM6,XMM2)
715         SSE_ADD_PS(XMM4,XMM6)
716 
717         SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
718         SSE_SHUFFLE(XMM7,XMM7,0x00)
719         SSE_MULT_PS(XMM7,XMM3)
720         SSE_ADD_PS(XMM4,XMM7)
721 
722         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
723         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
724 
725         /* Column 1, product is accumulated in XMM5 */
726         SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
727         SSE_SHUFFLE(XMM5,XMM5,0x00)
728         SSE_MULT_PS(XMM5,XMM0)
729 
730         SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
731         SSE_SHUFFLE(XMM6,XMM6,0x00)
732         SSE_MULT_PS(XMM6,XMM1)
733         SSE_ADD_PS(XMM5,XMM6)
734 
735         SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
736         SSE_SHUFFLE(XMM7,XMM7,0x00)
737         SSE_MULT_PS(XMM7,XMM2)
738         SSE_ADD_PS(XMM5,XMM7)
739 
740         SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
741         SSE_SHUFFLE(XMM6,XMM6,0x00)
742         SSE_MULT_PS(XMM6,XMM3)
743         SSE_ADD_PS(XMM5,XMM6)
744 
745         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
746         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
747 
748         SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
749 
750         /* Column 2, product is accumulated in XMM6 */
751         SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
752         SSE_SHUFFLE(XMM6,XMM6,0x00)
753         SSE_MULT_PS(XMM6,XMM0)
754 
755         SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
756         SSE_SHUFFLE(XMM7,XMM7,0x00)
757         SSE_MULT_PS(XMM7,XMM1)
758         SSE_ADD_PS(XMM6,XMM7)
759 
760         SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
761         SSE_SHUFFLE(XMM7,XMM7,0x00)
762         SSE_MULT_PS(XMM7,XMM2)
763         SSE_ADD_PS(XMM6,XMM7)
764 
765         SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
766         SSE_SHUFFLE(XMM7,XMM7,0x00)
767         SSE_MULT_PS(XMM7,XMM3)
768         SSE_ADD_PS(XMM6,XMM7)
769 
770         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
771         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
772 
773         /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
774         /* Column 3, product is accumulated in XMM0 */
775         SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
776         SSE_SHUFFLE(XMM7,XMM7,0x00)
777         SSE_MULT_PS(XMM0,XMM7)
778 
779         SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
780         SSE_SHUFFLE(XMM7,XMM7,0x00)
781         SSE_MULT_PS(XMM1,XMM7)
782         SSE_ADD_PS(XMM0,XMM1)
783 
784         SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
785         SSE_SHUFFLE(XMM1,XMM1,0x00)
786         SSE_MULT_PS(XMM1,XMM2)
787         SSE_ADD_PS(XMM0,XMM1)
788 
789         SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
790         SSE_SHUFFLE(XMM7,XMM7,0x00)
791         SSE_MULT_PS(XMM3,XMM7)
792         SSE_ADD_PS(XMM0,XMM3)
793 
794         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
795         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
796 
797         /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
        /* This is code to be maintained and read by humans, after all. */
799         /* Copy Multiplier Col 3 into XMM3 */
800         SSE_COPY_PS(XMM3,XMM0)
801         /* Copy Multiplier Col 2 into XMM2 */
802         SSE_COPY_PS(XMM2,XMM6)
803         /* Copy Multiplier Col 1 into XMM1 */
804         SSE_COPY_PS(XMM1,XMM5)
805         /* Copy Multiplier Col 0 into XMM0 */
806         SSE_COPY_PS(XMM0,XMM4)
807         SSE_INLINE_END_2;
808 
809         /* Update the row: */
810         nz  = bi[row+1] - diag_offset[row] - 1;
811         pv += 16;
812         for (j=0; j<nz; j++) {
813           PREFETCH_L1(&pv[16]);
814           x = rtmp + 16*pj[j];
815 /*            x = rtmp + 4*pj[j]; */
816 
817           /* X:=X-M*PV, One column at a time */
818           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
819           SSE_INLINE_BEGIN_2(x,pv)
820           /* Load First Column of X*/
821           SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
822           SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
823 
824           /* Matrix-Vector Product: */
825           SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
826           SSE_SHUFFLE(XMM5,XMM5,0x00)
827           SSE_MULT_PS(XMM5,XMM0)
828           SSE_SUB_PS(XMM4,XMM5)
829 
830           SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
831           SSE_SHUFFLE(XMM6,XMM6,0x00)
832           SSE_MULT_PS(XMM6,XMM1)
833           SSE_SUB_PS(XMM4,XMM6)
834 
835           SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
836           SSE_SHUFFLE(XMM7,XMM7,0x00)
837           SSE_MULT_PS(XMM7,XMM2)
838           SSE_SUB_PS(XMM4,XMM7)
839 
840           SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
841           SSE_SHUFFLE(XMM5,XMM5,0x00)
842           SSE_MULT_PS(XMM5,XMM3)
843           SSE_SUB_PS(XMM4,XMM5)
844 
845           SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
846           SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
847 
848           /* Second Column */
849           SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
850           SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
851 
852           /* Matrix-Vector Product: */
853           SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
854           SSE_SHUFFLE(XMM6,XMM6,0x00)
855           SSE_MULT_PS(XMM6,XMM0)
856           SSE_SUB_PS(XMM5,XMM6)
857 
858           SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
859           SSE_SHUFFLE(XMM7,XMM7,0x00)
860           SSE_MULT_PS(XMM7,XMM1)
861           SSE_SUB_PS(XMM5,XMM7)
862 
863           SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
864           SSE_SHUFFLE(XMM4,XMM4,0x00)
865           SSE_MULT_PS(XMM4,XMM2)
866           SSE_SUB_PS(XMM5,XMM4)
867 
868           SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
869           SSE_SHUFFLE(XMM6,XMM6,0x00)
870           SSE_MULT_PS(XMM6,XMM3)
871           SSE_SUB_PS(XMM5,XMM6)
872 
873           SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
874           SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
875 
876           SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
877 
878           /* Third Column */
879           SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
880           SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
881 
882           /* Matrix-Vector Product: */
883           SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
884           SSE_SHUFFLE(XMM7,XMM7,0x00)
885           SSE_MULT_PS(XMM7,XMM0)
886           SSE_SUB_PS(XMM6,XMM7)
887 
888           SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
889           SSE_SHUFFLE(XMM4,XMM4,0x00)
890           SSE_MULT_PS(XMM4,XMM1)
891           SSE_SUB_PS(XMM6,XMM4)
892 
893           SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
894           SSE_SHUFFLE(XMM5,XMM5,0x00)
895           SSE_MULT_PS(XMM5,XMM2)
896           SSE_SUB_PS(XMM6,XMM5)
897 
898           SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
899           SSE_SHUFFLE(XMM7,XMM7,0x00)
900           SSE_MULT_PS(XMM7,XMM3)
901           SSE_SUB_PS(XMM6,XMM7)
902 
903           SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
904           SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
905 
906           /* Fourth Column */
907           SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
908           SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
909 
910           /* Matrix-Vector Product: */
911           SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
912           SSE_SHUFFLE(XMM5,XMM5,0x00)
913           SSE_MULT_PS(XMM5,XMM0)
914           SSE_SUB_PS(XMM4,XMM5)
915 
916           SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
917           SSE_SHUFFLE(XMM6,XMM6,0x00)
918           SSE_MULT_PS(XMM6,XMM1)
919           SSE_SUB_PS(XMM4,XMM6)
920 
921           SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
922           SSE_SHUFFLE(XMM7,XMM7,0x00)
923           SSE_MULT_PS(XMM7,XMM2)
924           SSE_SUB_PS(XMM4,XMM7)
925 
926           SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
927           SSE_SHUFFLE(XMM5,XMM5,0x00)
928           SSE_MULT_PS(XMM5,XMM3)
929           SSE_SUB_PS(XMM4,XMM5)
930 
931           SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
932           SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
933           SSE_INLINE_END_2;
934           pv += 16;
935         }
936         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
937       }
938       row = *bjtmp++;
939 /*        row = (*bjtmp++)/4; */
940     }
941     /* finished row so stick it into b->a */
942     pv = ba + 16*bi[i];
943     pj = bj + bi[i];
944     nz = bi[i+1] - bi[i];
945 
946     /* Copy x block back into pv block */
947     for (j=0; j<nz; j++) {
948       x = rtmp+16*pj[j];
949 /*        x  = rtmp+4*pj[j]; */
950 
951       SSE_INLINE_BEGIN_2(x,pv)
952       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
953       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
954       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
955 
956       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
957       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
958 
959       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
960       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
961 
962       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
963       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
964 
965       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
966       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
967 
968       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
969       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
970 
971       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
972       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
973 
974       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
975       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
976       SSE_INLINE_END_2;
977       pv += 16;
978     }
979     /* invert diagonal block */
980     w = ba + 16*diag_offset[i];
981     if (pivotinblocks) {
982       ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
983       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
984     } else {
985       ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
986     }
987 /*      ierr = PetscKernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
    /* Note: Using Cramer's rule here, the flop count below might be unfairly high or low */
989   }
990 
991   ierr = PetscFree(rtmp);CHKERRQ(ierr);
992 
993   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
994   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
995   C->assembled           = PETSC_TRUE;
996 
  ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
999   SSE_SCOPE_END;
1000   PetscFunctionReturn(0);
1001 }
1002 
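/*
   The _usj (presumably "unsigned short j") variants below reinterpret the
   column indices b->j as unsigned shorts to reduce index bandwidth; this is
   only safe when the number of block rows fits in 16 bits.  The Inplace form
   is used when the factor shares its index arrays with A (bj == aj).
*/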
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C)
1004 {
1005   Mat            A  =C;
1006   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1007   PetscErrorCode ierr;
1008   int            i,j,n = a->mbs;
1009   unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj;
1010   unsigned short *aj = (unsigned short*)(a->j),*ajtmp;
1011   unsigned int   row;
1012   int            nz,*bi=b->i;
1013   int            *diag_offset = b->diag,*ai=a->i;
1014   MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
1015   MatScalar      *ba    = b->a,*aa = a->a;
1016   int            nonzero=0;
1017   PetscBool      pivotinblocks = b->pivotinblocks;
1018   PetscReal      shift = info->shiftamount;
1019   PetscBool      allowzeropivot,zeropivotdetected=PETSC_FALSE;
1020 
1021   PetscFunctionBegin;
1022   allowzeropivot = PetscNot(A->erroriffailure);
1023   SSE_SCOPE_BEGIN;
1024 
1025   if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned.  SSE will not work.");
1026   if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.  SSE will not work.");
1027   ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);
1028   if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.  SSE will not work.");
1029 /*    if ((unsigned long)bj==(unsigned long)aj) { */
1030 /*      colscale = 4; */
1031 /*    } */
1032 
1033   for (i=0; i<n; i++) {
1034     nz    = bi[i+1] - bi[i];
1035     bjtmp = bj + bi[i];
1036     /* zero out the 4x4 block accumulators */
1037     /* zero out one register */
1038     XOR_PS(XMM7,XMM7);
1039     for  (j=0; j<nz; j++) {
1040       x = rtmp+16*((unsigned int)bjtmp[j]);
1041 /*        x = rtmp+4*bjtmp[j]; */
1042       SSE_INLINE_BEGIN_1(x)
1043       /* Copy zero register to memory locations */
1044       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1045       SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1046       SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1047       SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1048       SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1049       SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1050       SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1051       SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1052       SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1053       SSE_INLINE_END_1;
1054     }
1055     /* load in initial (unfactored row) */
1056     nz    = ai[i+1] - ai[i];
1057     ajtmp = aj + ai[i];
1058     v     = aa + 16*ai[i];
1059     for (j=0; j<nz; j++) {
1060       x = rtmp+16*((unsigned int)ajtmp[j]);
1061 /*        x = rtmp+colscale*ajtmp[j]; */
1062       /* Copy v block into x block */
1063       SSE_INLINE_BEGIN_2(v,x)
1064       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1065       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1066       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1067 
1068       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1069       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1070 
1071       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1072       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1073 
1074       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1075       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1076 
1077       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1078       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1079 
1080       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1081       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1082 
1083       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1084       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1085 
1086       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1087       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1088       SSE_INLINE_END_2;
1089 
1090       v += 16;
1091     }
1092 /*      row = (*bjtmp++)/4; */
1093     row = (unsigned int)(*bjtmp++);
1094     while (row < i) {
1095       pc = rtmp + 16*row;
1096       SSE_INLINE_BEGIN_1(pc)
1097       /* Load block from lower triangle */
1098       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1099       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1100       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1101 
1102       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1103       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1104 
1105       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1106       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1107 
1108       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1109       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1110 
1111       /* Compare block to zero block */
1112 
1113       SSE_COPY_PS(XMM4,XMM7)
1114       SSE_CMPNEQ_PS(XMM4,XMM0)
1115 
1116       SSE_COPY_PS(XMM5,XMM7)
1117       SSE_CMPNEQ_PS(XMM5,XMM1)
1118 
1119       SSE_COPY_PS(XMM6,XMM7)
1120       SSE_CMPNEQ_PS(XMM6,XMM2)
1121 
1122       SSE_CMPNEQ_PS(XMM7,XMM3)
1123 
1124       /* Reduce the comparisons to one SSE register */
1125       SSE_OR_PS(XMM6,XMM7)
1126       SSE_OR_PS(XMM5,XMM4)
1127       SSE_OR_PS(XMM5,XMM6)
1128       SSE_INLINE_END_1;
1129 
1130       /* Reduce the one SSE register to an integer register for branching */
1131       /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1132       MOVEMASK(nonzero,XMM5);
1133 
1134       /* If block is nonzero ... */
1135       if (nonzero) {
1136         pv = ba + 16*diag_offset[row];
1137         PREFETCH_L1(&pv[16]);
1138         pj = bj + diag_offset[row] + 1;
1139 
1140         /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1141         /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1142         /* but the diagonal was inverted already */
1143         /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1144 
1145         SSE_INLINE_BEGIN_2(pv,pc)
1146         /* Column 0, product is accumulated in XMM4 */
1147         SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1148         SSE_SHUFFLE(XMM4,XMM4,0x00)
1149         SSE_MULT_PS(XMM4,XMM0)
1150 
1151         SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1152         SSE_SHUFFLE(XMM5,XMM5,0x00)
1153         SSE_MULT_PS(XMM5,XMM1)
1154         SSE_ADD_PS(XMM4,XMM5)
1155 
1156         SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1157         SSE_SHUFFLE(XMM6,XMM6,0x00)
1158         SSE_MULT_PS(XMM6,XMM2)
1159         SSE_ADD_PS(XMM4,XMM6)
1160 
1161         SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1162         SSE_SHUFFLE(XMM7,XMM7,0x00)
1163         SSE_MULT_PS(XMM7,XMM3)
1164         SSE_ADD_PS(XMM4,XMM7)
1165 
1166         SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1167         SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1168 
1169         /* Column 1, product is accumulated in XMM5 */
1170         SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1171         SSE_SHUFFLE(XMM5,XMM5,0x00)
1172         SSE_MULT_PS(XMM5,XMM0)
1173 
1174         SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1175         SSE_SHUFFLE(XMM6,XMM6,0x00)
1176         SSE_MULT_PS(XMM6,XMM1)
1177         SSE_ADD_PS(XMM5,XMM6)
1178 
1179         SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1180         SSE_SHUFFLE(XMM7,XMM7,0x00)
1181         SSE_MULT_PS(XMM7,XMM2)
1182         SSE_ADD_PS(XMM5,XMM7)
1183 
1184         SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1185         SSE_SHUFFLE(XMM6,XMM6,0x00)
1186         SSE_MULT_PS(XMM6,XMM3)
1187         SSE_ADD_PS(XMM5,XMM6)
1188 
1189         SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1190         SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1191 
1192         SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1193 
1194         /* Column 2, product is accumulated in XMM6 */
1195         SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1196         SSE_SHUFFLE(XMM6,XMM6,0x00)
1197         SSE_MULT_PS(XMM6,XMM0)
1198 
1199         SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1200         SSE_SHUFFLE(XMM7,XMM7,0x00)
1201         SSE_MULT_PS(XMM7,XMM1)
1202         SSE_ADD_PS(XMM6,XMM7)
1203 
1204         SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1205         SSE_SHUFFLE(XMM7,XMM7,0x00)
1206         SSE_MULT_PS(XMM7,XMM2)
1207         SSE_ADD_PS(XMM6,XMM7)
1208 
1209         SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1210         SSE_SHUFFLE(XMM7,XMM7,0x00)
1211         SSE_MULT_PS(XMM7,XMM3)
1212         SSE_ADD_PS(XMM6,XMM7)
1213 
1214         SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1215         SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1216 
1217         /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1218         /* Column 3, product is accumulated in XMM0 */
1219         SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1220         SSE_SHUFFLE(XMM7,XMM7,0x00)
1221         SSE_MULT_PS(XMM0,XMM7)
1222 
1223         SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1224         SSE_SHUFFLE(XMM7,XMM7,0x00)
1225         SSE_MULT_PS(XMM1,XMM7)
1226         SSE_ADD_PS(XMM0,XMM1)
1227 
1228         SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1229         SSE_SHUFFLE(XMM1,XMM1,0x00)
1230         SSE_MULT_PS(XMM1,XMM2)
1231         SSE_ADD_PS(XMM0,XMM1)
1232 
1233         SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1234         SSE_SHUFFLE(XMM7,XMM7,0x00)
1235         SSE_MULT_PS(XMM3,XMM7)
1236         SSE_ADD_PS(XMM0,XMM3)
1237 
1238         SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1239         SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1240 
1241         /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
        /* This is code to be maintained and read by humans, after all. */
1243         /* Copy Multiplier Col 3 into XMM3 */
1244         SSE_COPY_PS(XMM3,XMM0)
1245         /* Copy Multiplier Col 2 into XMM2 */
1246         SSE_COPY_PS(XMM2,XMM6)
1247         /* Copy Multiplier Col 1 into XMM1 */
1248         SSE_COPY_PS(XMM1,XMM5)
1249         /* Copy Multiplier Col 0 into XMM0 */
1250         SSE_COPY_PS(XMM0,XMM4)
1251         SSE_INLINE_END_2;
1252 
1253         /* Update the row: */
1254         nz  = bi[row+1] - diag_offset[row] - 1;
1255         pv += 16;
1256         for (j=0; j<nz; j++) {
1257           PREFETCH_L1(&pv[16]);
1258           x = rtmp + 16*((unsigned int)pj[j]);
1259 /*            x = rtmp + 4*pj[j]; */
1260 
1261           /* X:=X-M*PV, One column at a time */
1262           /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1263           SSE_INLINE_BEGIN_2(x,pv)
1264           /* Load First Column of X*/
1265           SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1266           SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1267 
1268           /* Matrix-Vector Product: */
1269           SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1270           SSE_SHUFFLE(XMM5,XMM5,0x00)
1271           SSE_MULT_PS(XMM5,XMM0)
1272           SSE_SUB_PS(XMM4,XMM5)
1273 
1274           SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1275           SSE_SHUFFLE(XMM6,XMM6,0x00)
1276           SSE_MULT_PS(XMM6,XMM1)
1277           SSE_SUB_PS(XMM4,XMM6)
1278 
1279           SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1280           SSE_SHUFFLE(XMM7,XMM7,0x00)
1281           SSE_MULT_PS(XMM7,XMM2)
1282           SSE_SUB_PS(XMM4,XMM7)
1283 
1284           SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1285           SSE_SHUFFLE(XMM5,XMM5,0x00)
1286           SSE_MULT_PS(XMM5,XMM3)
1287           SSE_SUB_PS(XMM4,XMM5)
1288 
1289           SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1290           SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1291 
1292           /* Second Column */
1293           SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1294           SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1295 
1296           /* Matrix-Vector Product: */
1297           SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1298           SSE_SHUFFLE(XMM6,XMM6,0x00)
1299           SSE_MULT_PS(XMM6,XMM0)
1300           SSE_SUB_PS(XMM5,XMM6)
1301 
1302           SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1303           SSE_SHUFFLE(XMM7,XMM7,0x00)
1304           SSE_MULT_PS(XMM7,XMM1)
1305           SSE_SUB_PS(XMM5,XMM7)
1306 
1307           SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1308           SSE_SHUFFLE(XMM4,XMM4,0x00)
1309           SSE_MULT_PS(XMM4,XMM2)
1310           SSE_SUB_PS(XMM5,XMM4)
1311 
1312           SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1313           SSE_SHUFFLE(XMM6,XMM6,0x00)
1314           SSE_MULT_PS(XMM6,XMM3)
1315           SSE_SUB_PS(XMM5,XMM6)
1316 
1317           SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1318           SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1319 
1320           SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1321 
1322           /* Third Column */
1323           SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1324           SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1325 
1326           /* Matrix-Vector Product: */
1327           SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1328           SSE_SHUFFLE(XMM7,XMM7,0x00)
1329           SSE_MULT_PS(XMM7,XMM0)
1330           SSE_SUB_PS(XMM6,XMM7)
1331 
1332           SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1333           SSE_SHUFFLE(XMM4,XMM4,0x00)
1334           SSE_MULT_PS(XMM4,XMM1)
1335           SSE_SUB_PS(XMM6,XMM4)
1336 
1337           SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1338           SSE_SHUFFLE(XMM5,XMM5,0x00)
1339           SSE_MULT_PS(XMM5,XMM2)
1340           SSE_SUB_PS(XMM6,XMM5)
1341 
1342           SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1343           SSE_SHUFFLE(XMM7,XMM7,0x00)
1344           SSE_MULT_PS(XMM7,XMM3)
1345           SSE_SUB_PS(XMM6,XMM7)
1346 
1347           SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1348           SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1349 
1350           /* Fourth Column */
1351           SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1352           SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1353 
1354           /* Matrix-Vector Product: */
1355           SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1356           SSE_SHUFFLE(XMM5,XMM5,0x00)
1357           SSE_MULT_PS(XMM5,XMM0)
1358           SSE_SUB_PS(XMM4,XMM5)
1359 
1360           SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1361           SSE_SHUFFLE(XMM6,XMM6,0x00)
1362           SSE_MULT_PS(XMM6,XMM1)
1363           SSE_SUB_PS(XMM4,XMM6)
1364 
1365           SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1366           SSE_SHUFFLE(XMM7,XMM7,0x00)
1367           SSE_MULT_PS(XMM7,XMM2)
1368           SSE_SUB_PS(XMM4,XMM7)
1369 
1370           SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1371           SSE_SHUFFLE(XMM5,XMM5,0x00)
1372           SSE_MULT_PS(XMM5,XMM3)
1373           SSE_SUB_PS(XMM4,XMM5)
1374 
1375           SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1376           SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1377           SSE_INLINE_END_2;
1378           pv += 16;
1379         }
1380         ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
1381       }
1382       row = (unsigned int)(*bjtmp++);
1383 /*        row = (*bjtmp++)/4; */
1384 /*        bjtmp++; */
1385     }
1386     /* finished row so stick it into b->a */
1387     pv = ba + 16*bi[i];
1388     pj = bj + bi[i];
1389     nz = bi[i+1] - bi[i];
1390 
1391     /* Copy x block back into pv block */
1392     for (j=0; j<nz; j++) {
1393       x = rtmp+16*((unsigned int)pj[j]);
1394 /*        x  = rtmp+4*pj[j]; */
1395 
1396       SSE_INLINE_BEGIN_2(x,pv)
1397       /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1398       SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1399       SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1400 
1401       SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1402       SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1403 
1404       SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1405       SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1406 
1407       SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1408       SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1409 
1410       SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1411       SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1412 
1413       SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1414       SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1415 
1416       SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1417       SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1418 
1419       SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1420       SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1421       SSE_INLINE_END_2;
1422       pv += 16;
1423     }
1424     /* invert diagonal block */
1425     w = ba + 16*diag_offset[i];
1426     if (pivotinblocks) {
1427       ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
1428       if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1429     } else {
1430       ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
1431     }
    /* Note: Using Cramer's rule here, the flop count below might be unfairly high or low */
1433   }
1434 
1435   ierr = PetscFree(rtmp);CHKERRQ(ierr);
1436 
1437   C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1438   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1439   C->assembled           = PETSC_TRUE;
1440 
  ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
1443   SSE_SCOPE_END;
1444   PetscFunctionReturn(0);
1445 }

PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info)
{
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  int            i,j,n = a->mbs;
  unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj;
  unsigned int   row;
  int            *ajtmpold,nz,*bi=b->i;
  int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      *ba    = b->a,*aa = a->a;
  int            nonzero=0;
  PetscBool      pivotinblocks = b->pivotinblocks;
  PetscReal      shift = info->shiftamount;
  PetscBool      allowzeropivot,zeropivotdetected=PETSC_FALSE;

  PetscFunctionBegin;
  allowzeropivot = PetscNot(A->erroriffailure);
  SSE_SCOPE_BEGIN;

  if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16-byte aligned.  SSE will not work.");
  if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16-byte aligned.  SSE will not work.");
  ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);
  if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16-byte aligned.  SSE will not work.");
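  /* The SSE kernels below assume each 4x4 block (and the rtmp work array) sits on a 16-byte
     boundary, hence the explicit alignment checks above. */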
/*    if ((unsigned long)bj==(unsigned long)aj) { */
/*      colscale = 4; */
/*    } */
  if ((unsigned long)bj==(unsigned long)aj) {
    return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C));
  }
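  /* When the factor shares its column-index array with A (bj == aj), the nonzero pattern of the
     factor matches A exactly, so the specialized in-place variant above is used instead. */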

  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    /* zero out the 4x4 block accumulators */
    /* zero out one register */
    XOR_PS(XMM7,XMM7);
    for  (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)bjtmp[j]);
/*        x = rtmp+4*bjtmp[j]; */
      SSE_INLINE_BEGIN_1(x)
      /* Copy zero register to memory locations */
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
      SSE_INLINE_END_1;
    }
    /* load in initial (unfactored row) */
    nz       = ai[i+1] - ai[i];
    ajtmpold = aj + ai[i];
    v        = aa + 16*ai[i];
    for (j=0; j<nz; j++) {
      x = rtmp+16*ajtmpold[j];
/*        x = rtmp+colscale*ajtmpold[j]; */
      /* Copy v block into x block */
      SSE_INLINE_BEGIN_2(v,x)
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;

      v += 16;
    }
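    /* rtmp now holds row i of A scattered into the (wider) nonzero pattern of the factor;
       the elimination loop below updates it in place using previously factored rows. */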
/*      row = (*bjtmp++)/4; */
    row = (unsigned int)(*bjtmp++);
    while (row < i) {
      pc = rtmp + 16*row;
      SSE_INLINE_BEGIN_1(pc)
      /* Load block from lower triangle */
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

      /* Compare block to zero block */

      SSE_COPY_PS(XMM4,XMM7)
      SSE_CMPNEQ_PS(XMM4,XMM0)

      SSE_COPY_PS(XMM5,XMM7)
      SSE_CMPNEQ_PS(XMM5,XMM1)

      SSE_COPY_PS(XMM6,XMM7)
      SSE_CMPNEQ_PS(XMM6,XMM2)

      SSE_CMPNEQ_PS(XMM7,XMM3)

      /* Reduce the comparisons to one SSE register */
      SSE_OR_PS(XMM6,XMM7)
      SSE_OR_PS(XMM5,XMM4)
      SSE_OR_PS(XMM5,XMM6)
      SSE_INLINE_END_1;

      /* Reduce the one SSE register to an integer register for branching */
      /* Note: Since nonzero is an int, there is no INLINE block version of this call */
      MOVEMASK(nonzero,XMM5);

      /* If block is nonzero ... */
      if (nonzero) {
        pv = ba + 16*diag_offset[row];
        PREFETCH_L1(&pv[16]);
        pj = bj + diag_offset[row] + 1;

        /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
        /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
        /* but the diagonal was inverted already */
        /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
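        /* Scalar-equivalent sketch (reference only): with the old block of this row copied to
           p[0..15] and the already-inverted diagonal block in pv[0..15], both stored column-major,
           the new multiplier is
               pc[4*c+r] = p[r]*pv[4*c] + p[4+r]*pv[4*c+1] + p[8+r]*pv[4*c+2] + p[12+r]*pv[4*c+3]
           for r,c = 0..3; here the old block lives in XMM0-XMM3 instead of memory. */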

        SSE_INLINE_BEGIN_2(pv,pc)
        /* Column 0, product is accumulated in XMM4 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
        SSE_SHUFFLE(XMM4,XMM4,0x00)
        SSE_MULT_PS(XMM4,XMM0)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
        SSE_SHUFFLE(XMM5,XMM5,0x00)
        SSE_MULT_PS(XMM5,XMM1)
        SSE_ADD_PS(XMM4,XMM5)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM2)
        SSE_ADD_PS(XMM4,XMM6)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM3)
        SSE_ADD_PS(XMM4,XMM7)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

        /* Column 1, product is accumulated in XMM5 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
        SSE_SHUFFLE(XMM5,XMM5,0x00)
        SSE_MULT_PS(XMM5,XMM0)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM1)
        SSE_ADD_PS(XMM5,XMM6)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM2)
        SSE_ADD_PS(XMM5,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM3)
        SSE_ADD_PS(XMM5,XMM6)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

        SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

        /* Column 2, product is accumulated in XMM6 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM0)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM1)
        SSE_ADD_PS(XMM6,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM2)
        SSE_ADD_PS(XMM6,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM3)
        SSE_ADD_PS(XMM6,XMM7)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

        /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
        /* Column 3, product is accumulated in XMM0 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM0,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM1,XMM7)
        SSE_ADD_PS(XMM0,XMM1)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
        SSE_SHUFFLE(XMM1,XMM1,0x00)
        SSE_MULT_PS(XMM1,XMM2)
        SSE_ADD_PS(XMM0,XMM1)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM3,XMM7)
        SSE_ADD_PS(XMM0,XMM3)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

        /* Simplify bookkeeping -- these copies are strictly unnecessary, */
        /* but this is code to be maintained and read by humans after all. */
        /* Copy Multiplier Col 3 into XMM3 */
        SSE_COPY_PS(XMM3,XMM0)
        /* Copy Multiplier Col 2 into XMM2 */
        SSE_COPY_PS(XMM2,XMM6)
        /* Copy Multiplier Col 1 into XMM1 */
        SSE_COPY_PS(XMM1,XMM5)
        /* Copy Multiplier Col 0 into XMM0 */
        SSE_COPY_PS(XMM0,XMM4)
        SSE_INLINE_END_2;

        /* Update the row: */
        nz  = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
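        /* pv now points at the first block to the right of the diagonal in row `row` of the
           factor, and nz counts those upper-triangular blocks to be subtracted from this row. */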
        for (j=0; j<nz; j++) {
          PREFETCH_L1(&pv[16]);
          x = rtmp + 16*((unsigned int)pj[j]);
/*            x = rtmp + 4*pj[j]; */

          /* X:=X-M*PV, One column at a time */
          /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
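          /* Scalar-equivalent sketch (reference only), with M the multiplier (columns held in
             XMM0-XMM3) and both blocks stored column-major:
                 x[4*c+r] -= m[r]*pv[4*c] + m[4+r]*pv[4*c+1] + m[8+r]*pv[4*c+2] + m[12+r]*pv[4*c+3]
             for r,c = 0..3. */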
          SSE_INLINE_BEGIN_2(x,pv)
          /* Load First Column of X */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM0)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM1)
          SSE_SUB_PS(XMM4,XMM6)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_SUB_PS(XMM4,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM3)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

          /* Second Column */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM0)
          SSE_SUB_PS(XMM5,XMM6)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM1)
          SSE_SUB_PS(XMM5,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
          SSE_SHUFFLE(XMM4,XMM4,0x00)
          SSE_MULT_PS(XMM4,XMM2)
          SSE_SUB_PS(XMM5,XMM4)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM3)
          SSE_SUB_PS(XMM5,XMM6)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

          SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

          /* Third Column */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM0)
          SSE_SUB_PS(XMM6,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
          SSE_SHUFFLE(XMM4,XMM4,0x00)
          SSE_MULT_PS(XMM4,XMM1)
          SSE_SUB_PS(XMM6,XMM4)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM2)
          SSE_SUB_PS(XMM6,XMM5)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM3)
          SSE_SUB_PS(XMM6,XMM7)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)

          /* Fourth Column */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM0)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM1)
          SSE_SUB_PS(XMM4,XMM6)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_SUB_PS(XMM4,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM3)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
          SSE_INLINE_END_2;
          pv += 16;
        }
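        /* 128 flops per updated block plus 112 for forming the multiplier (see the identical
           kernel in the routine above for the breakdown) */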
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = (unsigned int)(*bjtmp++);
/*        row = (*bjtmp++)/4; */
/*        bjtmp++; */
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];

    /* Copy x block back into pv block */
    for (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)pj[j]);
/*        x  = rtmp+4*pj[j]; */

      SSE_INLINE_BEGIN_2(x,pv)
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;
      pv += 16;
    }
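    /* The diagonal block of the factor is stored already inverted, so the triangular solves in
       MatSolve can apply it with a block multiply rather than a block solve. */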
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
      if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
    } else {
      ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
/*      ierr = PetscKernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
    /* Note: since Cramer's rule is used to invert the diagonal blocks, the flop count below may be somewhat inaccurate */
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
  C->assembled           = PETSC_TRUE;

  /* Flop count from inverting the diagonal blocks (bs = 4, bs2 = 16 for 4x4 blocks) */
  ierr = PetscLogFlops(1.333333333333*4.0*16.0*b->mbs);CHKERRQ(ierr);
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

#endif