1 #include <../src/mat/impls/baij/seq/baij.h>
2 #include <petsc/private/kernels/blockinvert.h>
3
4 /* Block operations are done by accessing one column at at time */
5 /* Default MatSolve for block size 14 */
6
MatSolve_SeqBAIJ_14_NaturalOrdering(Mat A,Vec bb,Vec xx)7 PetscErrorCode MatSolve_SeqBAIJ_14_NaturalOrdering(Mat A,Vec bb,Vec xx)
8 {
9 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
10 PetscErrorCode ierr;
11 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
12 PetscInt i,k,nz,idx,idt,m;
13 const MatScalar *aa=a->a,*v;
14 PetscScalar s[14];
15 PetscScalar *x,xv;
16 const PetscScalar *b;
17
18 PetscFunctionBegin;
19 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
20 ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
21
22 /* forward solve the lower triangular */
23 for (i=0; i<n; i++) {
24 v = aa + bs2*ai[i];
25 vi = aj + ai[i];
26 nz = ai[i+1] - ai[i];
27 idt = bs*i;
28 x[idt] = b[idt]; x[1+idt] = b[1+idt]; x[2+idt] = b[2+idt]; x[3+idt] = b[3+idt]; x[4+idt] = b[4+idt];
29 x[5+idt] = b[5+idt]; x[6+idt] = b[6+idt]; x[7+idt] = b[7+idt]; x[8+idt] = b[8+idt]; x[9+idt] = b[9+idt];
30 x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt]; x[13+idt] = b[13+idt];
31 for (m=0; m<nz; m++) {
32 idx = bs*vi[m];
33 for (k=0; k<bs; k++) {
34 xv = x[k + idx];
35 x[idt] -= v[0]*xv;
36 x[1+idt] -= v[1]*xv;
37 x[2+idt] -= v[2]*xv;
38 x[3+idt] -= v[3]*xv;
39 x[4+idt] -= v[4]*xv;
40 x[5+idt] -= v[5]*xv;
41 x[6+idt] -= v[6]*xv;
42 x[7+idt] -= v[7]*xv;
43 x[8+idt] -= v[8]*xv;
44 x[9+idt] -= v[9]*xv;
45 x[10+idt] -= v[10]*xv;
46 x[11+idt] -= v[11]*xv;
47 x[12+idt] -= v[12]*xv;
48 x[13+idt] -= v[13]*xv;
49 v += 14;
50 }
51 }
52 }
53 /* backward solve the upper triangular */
54 for (i=n-1; i>=0; i--) {
55 v = aa + bs2*(adiag[i+1]+1);
56 vi = aj + adiag[i+1]+1;
57 nz = adiag[i] - adiag[i+1] - 1;
58 idt = bs*i;
59 s[0] = x[idt]; s[1] = x[1+idt]; s[2] = x[2+idt]; s[3] = x[3+idt]; s[4] = x[4+idt];
60 s[5] = x[5+idt]; s[6] = x[6+idt]; s[7] = x[7+idt]; s[8] = x[8+idt]; s[9] = x[9+idt];
61 s[10] = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt]; s[13] = x[13+idt];
62
63 for (m=0; m<nz; m++) {
64 idx = bs*vi[m];
65 for (k=0; k<bs; k++) {
66 xv = x[k + idx];
67 s[0] -= v[0]*xv;
68 s[1] -= v[1]*xv;
69 s[2] -= v[2]*xv;
70 s[3] -= v[3]*xv;
71 s[4] -= v[4]*xv;
72 s[5] -= v[5]*xv;
73 s[6] -= v[6]*xv;
74 s[7] -= v[7]*xv;
75 s[8] -= v[8]*xv;
76 s[9] -= v[9]*xv;
77 s[10] -= v[10]*xv;
78 s[11] -= v[11]*xv;
79 s[12] -= v[12]*xv;
80 s[13] -= v[13]*xv;
81 v += 14;
82 }
83 }
84 ierr = PetscArrayzero(x+idt,bs);CHKERRQ(ierr);
85 for (k=0; k<bs; k++) {
86 x[idt] += v[0]*s[k];
87 x[1+idt] += v[1]*s[k];
88 x[2+idt] += v[2]*s[k];
89 x[3+idt] += v[3]*s[k];
90 x[4+idt] += v[4]*s[k];
91 x[5+idt] += v[5]*s[k];
92 x[6+idt] += v[6]*s[k];
93 x[7+idt] += v[7]*s[k];
94 x[8+idt] += v[8]*s[k];
95 x[9+idt] += v[9]*s[k];
96 x[10+idt] += v[10]*s[k];
97 x[11+idt] += v[11]*s[k];
98 x[12+idt] += v[12]*s[k];
99 x[13+idt] += v[13]*s[k];
100 v += 14;
101 }
102 }
103 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
104 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
105 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
106 PetscFunctionReturn(0);
107 }
108
109 /* Block operations are done by accessing one column at at time */
110 /* Default MatSolve for block size 13 */
111
MatSolve_SeqBAIJ_13_NaturalOrdering(Mat A,Vec bb,Vec xx)112 PetscErrorCode MatSolve_SeqBAIJ_13_NaturalOrdering(Mat A,Vec bb,Vec xx)
113 {
114 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
115 PetscErrorCode ierr;
116 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
117 PetscInt i,k,nz,idx,idt,m;
118 const MatScalar *aa=a->a,*v;
119 PetscScalar s[13];
120 PetscScalar *x,xv;
121 const PetscScalar *b;
122
123 PetscFunctionBegin;
124 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
125 ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
126
127 /* forward solve the lower triangular */
128 for (i=0; i<n; i++) {
129 v = aa + bs2*ai[i];
130 vi = aj + ai[i];
131 nz = ai[i+1] - ai[i];
132 idt = bs*i;
133 x[idt] = b[idt]; x[1+idt] = b[1+idt]; x[2+idt] = b[2+idt]; x[3+idt] = b[3+idt]; x[4+idt] = b[4+idt];
134 x[5+idt] = b[5+idt]; x[6+idt] = b[6+idt]; x[7+idt] = b[7+idt]; x[8+idt] = b[8+idt]; x[9+idt] = b[9+idt];
135 x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt];
136 for (m=0; m<nz; m++) {
137 idx = bs*vi[m];
138 for (k=0; k<bs; k++) {
139 xv = x[k + idx];
140 x[idt] -= v[0]*xv;
141 x[1+idt] -= v[1]*xv;
142 x[2+idt] -= v[2]*xv;
143 x[3+idt] -= v[3]*xv;
144 x[4+idt] -= v[4]*xv;
145 x[5+idt] -= v[5]*xv;
146 x[6+idt] -= v[6]*xv;
147 x[7+idt] -= v[7]*xv;
148 x[8+idt] -= v[8]*xv;
149 x[9+idt] -= v[9]*xv;
150 x[10+idt] -= v[10]*xv;
151 x[11+idt] -= v[11]*xv;
152 x[12+idt] -= v[12]*xv;
153 v += 13;
154 }
155 }
156 }
157 /* backward solve the upper triangular */
158 for (i=n-1; i>=0; i--) {
159 v = aa + bs2*(adiag[i+1]+1);
160 vi = aj + adiag[i+1]+1;
161 nz = adiag[i] - adiag[i+1] - 1;
162 idt = bs*i;
163 s[0] = x[idt]; s[1] = x[1+idt]; s[2] = x[2+idt]; s[3] = x[3+idt]; s[4] = x[4+idt];
164 s[5] = x[5+idt]; s[6] = x[6+idt]; s[7] = x[7+idt]; s[8] = x[8+idt]; s[9] = x[9+idt];
165 s[10] = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt];
166
167 for (m=0; m<nz; m++) {
168 idx = bs*vi[m];
169 for (k=0; k<bs; k++) {
170 xv = x[k + idx];
171 s[0] -= v[0]*xv;
172 s[1] -= v[1]*xv;
173 s[2] -= v[2]*xv;
174 s[3] -= v[3]*xv;
175 s[4] -= v[4]*xv;
176 s[5] -= v[5]*xv;
177 s[6] -= v[6]*xv;
178 s[7] -= v[7]*xv;
179 s[8] -= v[8]*xv;
180 s[9] -= v[9]*xv;
181 s[10] -= v[10]*xv;
182 s[11] -= v[11]*xv;
183 s[12] -= v[12]*xv;
184 v += 13;
185 }
186 }
187 ierr = PetscArrayzero(x+idt,bs);CHKERRQ(ierr);
188 for (k=0; k<bs; k++) {
189 x[idt] += v[0]*s[k];
190 x[1+idt] += v[1]*s[k];
191 x[2+idt] += v[2]*s[k];
192 x[3+idt] += v[3]*s[k];
193 x[4+idt] += v[4]*s[k];
194 x[5+idt] += v[5]*s[k];
195 x[6+idt] += v[6]*s[k];
196 x[7+idt] += v[7]*s[k];
197 x[8+idt] += v[8]*s[k];
198 x[9+idt] += v[9]*s[k];
199 x[10+idt] += v[10]*s[k];
200 x[11+idt] += v[11]*s[k];
201 x[12+idt] += v[12]*s[k];
202 v += 13;
203 }
204 }
205 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
206 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
207 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
208 PetscFunctionReturn(0);
209 }
210
211 /* Block operations are done by accessing one column at at time */
212 /* Default MatSolve for block size 12 */
213
MatSolve_SeqBAIJ_12_NaturalOrdering(Mat A,Vec bb,Vec xx)214 PetscErrorCode MatSolve_SeqBAIJ_12_NaturalOrdering(Mat A,Vec bb,Vec xx)
215 {
216 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data;
217 PetscErrorCode ierr;
218 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
219 PetscInt i,k,nz,idx,idt,m;
220 const MatScalar *aa=a->a,*v;
221 PetscScalar s[12];
222 PetscScalar *x,xv;
223 const PetscScalar *b;
224
225 PetscFunctionBegin;
226 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
227 ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
228
229 /* forward solve the lower triangular */
230 for (i=0; i<n; i++) {
231 v = aa + bs2*ai[i];
232 vi = aj + ai[i];
233 nz = ai[i+1] - ai[i];
234 idt = bs*i;
235 x[idt] = b[idt]; x[1+idt] = b[1+idt]; x[2+idt] = b[2+idt]; x[3+idt] = b[3+idt]; x[4+idt] = b[4+idt];
236 x[5+idt] = b[5+idt]; x[6+idt] = b[6+idt]; x[7+idt] = b[7+idt]; x[8+idt] = b[8+idt]; x[9+idt] = b[9+idt];
237 x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt];
238 for (m=0; m<nz; m++) {
239 idx = bs*vi[m];
240 for (k=0; k<bs; k++) {
241 xv = x[k + idx];
242 x[idt] -= v[0]*xv;
243 x[1+idt] -= v[1]*xv;
244 x[2+idt] -= v[2]*xv;
245 x[3+idt] -= v[3]*xv;
246 x[4+idt] -= v[4]*xv;
247 x[5+idt] -= v[5]*xv;
248 x[6+idt] -= v[6]*xv;
249 x[7+idt] -= v[7]*xv;
250 x[8+idt] -= v[8]*xv;
251 x[9+idt] -= v[9]*xv;
252 x[10+idt] -= v[10]*xv;
253 x[11+idt] -= v[11]*xv;
254 v += 12;
255 }
256 }
257 }
258 /* backward solve the upper triangular */
259 for (i=n-1; i>=0; i--) {
260 v = aa + bs2*(adiag[i+1]+1);
261 vi = aj + adiag[i+1]+1;
262 nz = adiag[i] - adiag[i+1] - 1;
263 idt = bs*i;
264 s[0] = x[idt]; s[1] = x[1+idt]; s[2] = x[2+idt]; s[3] = x[3+idt]; s[4] = x[4+idt];
265 s[5] = x[5+idt]; s[6] = x[6+idt]; s[7] = x[7+idt]; s[8] = x[8+idt]; s[9] = x[9+idt];
266 s[10] = x[10+idt]; s[11] = x[11+idt];
267
268 for (m=0; m<nz; m++) {
269 idx = bs*vi[m];
270 for (k=0; k<bs; k++) {
271 xv = x[k + idx];
272 s[0] -= v[0]*xv;
273 s[1] -= v[1]*xv;
274 s[2] -= v[2]*xv;
275 s[3] -= v[3]*xv;
276 s[4] -= v[4]*xv;
277 s[5] -= v[5]*xv;
278 s[6] -= v[6]*xv;
279 s[7] -= v[7]*xv;
280 s[8] -= v[8]*xv;
281 s[9] -= v[9]*xv;
282 s[10] -= v[10]*xv;
283 s[11] -= v[11]*xv;
284 v += 12;
285 }
286 }
287 ierr = PetscArrayzero(x+idt,bs);CHKERRQ(ierr);
288 for (k=0; k<bs; k++) {
289 x[idt] += v[0]*s[k];
290 x[1+idt] += v[1]*s[k];
291 x[2+idt] += v[2]*s[k];
292 x[3+idt] += v[3]*s[k];
293 x[4+idt] += v[4]*s[k];
294 x[5+idt] += v[5]*s[k];
295 x[6+idt] += v[6]*s[k];
296 x[7+idt] += v[7]*s[k];
297 x[8+idt] += v[8]*s[k];
298 x[9+idt] += v[9]*s[k];
299 x[10+idt] += v[10]*s[k];
300 x[11+idt] += v[11]*s[k];
301 v += 12;
302 }
303 }
304 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
305 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
306 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
307 PetscFunctionReturn(0);
308 }
309