1
2 /*
3 Routines to compute overlapping regions of a parallel MPI matrix
4 and to extract the submatrices that are shared across processors.
5 */
6 #include <../src/mat/impls/baij/mpi/mpibaij.h>
7 #include <petscbt.h>
8
9 static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**);
10 static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
11 extern PetscErrorCode MatGetRow_MPIBAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
12 extern PetscErrorCode MatRestoreRow_MPIBAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
13
14 PetscErrorCode MatIncreaseOverlap_MPIBAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
15 {
16 PetscErrorCode ierr;
17 PetscInt i,N=C->cmap->N, bs=C->rmap->bs;
18 IS *is_new;
19
20 PetscFunctionBegin;
21 ierr = PetscMalloc1(imax,&is_new);CHKERRQ(ierr);
22 /* Convert the indices into block format */
23 ierr = ISCompressIndicesGeneral(N,C->rmap->n,bs,imax,is,is_new);CHKERRQ(ierr);
24 if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified\n");
25 for (i=0; i<ov; ++i) {
26 ierr = MatIncreaseOverlap_MPIBAIJ_Once(C,imax,is_new);CHKERRQ(ierr);
27 }
28 for (i=0; i<imax; i++) {ierr = ISDestroy(&is[i]);CHKERRQ(ierr);}
29 ierr = ISExpandIndicesGeneral(N,N,bs,imax,is_new,is);CHKERRQ(ierr);
30 for (i=0; i<imax; i++) {ierr = ISDestroy(&is_new[i]);CHKERRQ(ierr);}
31 ierr = PetscFree(is_new);CHKERRQ(ierr);
32 PetscFunctionReturn(0);
33 }
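
/*
   Illustrative usage sketch (not part of this file): the routine above is normally reached
   through the public interface MatIncreaseOverlap(), e.g. when building overlapping
   subdomains for an additive Schwarz preconditioner. The index set below is made up for
   the example; error handling follows the usual ierr/CHKERRQ pattern.

     IS       is[1];
     PetscInt rows[] = {0,1,2};                             (point rows; compressed to block indices internally)
     ierr = ISCreateGeneral(PETSC_COMM_SELF,3,rows,PETSC_COPY_VALUES,&is[0]);CHKERRQ(ierr);
     ierr = MatIncreaseOverlap(C,1,is,1);CHKERRQ(ierr);     (grow is[0] by one level of overlap)
     ierr = ISDestroy(&is[0]);CHKERRQ(ierr);
*/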
34
35 /*
36 Sample message format:
37 If a processor A wants processor B to process some elements corresponding
38 to index sets is[1], is[5]
39 mesg [0] = 2 (no of index sets in the mesg)
40 -----------
41 mesg [1] = 1 => is[1]
42 mesg [2] = size of is[1]
43 -----------
44 mesg [3] = 5 => is[5]
45 mesg [4] = size of is[5]
46 -----------
47 mesg [5]
48 mesg [n] data(is[1])
49 -----------
50 mesg [n+1]
51 mesg [m] data(is[5])
52 -----------
53
54 Notes:
55 nrqs - no of requests sent (or to be sent out)
56 nrqr - no of requests received (which have to be or which have been processed)
57 */
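/*
   Worked example (illustrative numbers only): with the format above, if is[1] = {3,7} and
   is[5] = {2,9,11} contain block rows owned by processor B, the packed message sent by A is

     mesg[] = { 2,  1, 2,  5, 3,  3, 7,  2, 9, 11 }

   i.e. mesg[0] gives the number of index sets, the pairs (1,2) and (5,3) give each IS number
   and its length, and the remaining entries are the row indices of is[1] followed by is[5].
*/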
58 PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Once(Mat C,PetscInt imax,IS is[])
59 {
60 Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
61 const PetscInt **idx,*idx_i;
62 PetscInt *n,*w3,*w4,**data,len;
63 PetscErrorCode ierr;
64 PetscMPIInt size,rank,tag1,tag2,*w2,*w1,nrqr;
65 PetscInt Mbs,i,j,k,**rbuf,row,nrqs,msz,**outdat,**ptr;
66 PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2,*d_p;
67 PetscMPIInt *onodes1,*olengths1,*onodes2,*olengths2,proc=-1;
68 PetscBT *table;
69 MPI_Comm comm,*iscomms;
70 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2;
71 MPI_Status *s_status,*recv_status;
72 char *t_p;
73
74 PetscFunctionBegin;
75 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
76 size = c->size;
77 rank = c->rank;
78 Mbs = c->Mbs;
79
80 ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
81 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
82
83 ierr = PetscMalloc2(imax+1,(PetscInt***)&idx,imax,&n);CHKERRQ(ierr);
84
85 for (i=0; i<imax; i++) {
86 ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
87 ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
88 }
89
90 /* evaluate communication - message to whom, length of message, and buffer space
91 required. Based on this, buffers are allocated, and data is copied into them */
92 ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr);
93 for (i=0; i<imax; i++) {
94 ierr = PetscArrayzero(w4,size);CHKERRQ(ierr); /* initialise work vector*/
95 idx_i = idx[i];
96 len = n[i];
97 for (j=0; j<len; j++) {
98 row = idx_i[j];
99 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
100 ierr = PetscLayoutFindOwner(C->rmap,row*C->rmap->bs,&proc);CHKERRQ(ierr);
101 w4[proc]++;
102 }
103 for (j=0; j<size; j++) {
104 if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
105 }
106 }
107
108 nrqs = 0; /* no of outgoing messages */
109 msz = 0; /* total mesg length (for all procs) */
110 w1[rank] = 0; /* no mesg sent to itself */
111 w3[rank] = 0;
112 for (i=0; i<size; i++) {
113 if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
114 }
115 /* pa - list of processors to communicate with */
116 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr);
117 for (i=0,j=0; i<size; i++) {
118 if (w1[i]) {pa[j] = i; j++;}
119 }
120
121 /* Each message would have a header = 1 + 2*(no of IS) + data */
122 for (i=0; i<nrqs; i++) {
123 j = pa[i];
124 w1[j] += w2[j] + 2*w3[j];
125 msz += w1[j];
126 }
127
128 /* Determine the number of messages to expect, their lengths, and their from-ids */
129 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
130 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
131
132 /* Now post the Irecvs corresponding to these messages */
133 ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);
134
135 /* Allocate Memory for outgoing messages */
136 ierr = PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);CHKERRQ(ierr);
137 ierr = PetscArrayzero(outdat,size);CHKERRQ(ierr);
138 ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);
139 {
140 PetscInt *iptr = tmp,ict = 0;
141 for (i=0; i<nrqs; i++) {
142 j = pa[i];
143 iptr += ict;
144 outdat[j] = iptr;
145 ict = w1[j];
146 }
147 }
148
149 /* Form the outgoing messages */
150 /*plug in the headers*/
151 for (i=0; i<nrqs; i++) {
152 j = pa[i];
153 outdat[j][0] = 0;
154 ierr = PetscArrayzero(outdat[j]+1,2*w3[j]);CHKERRQ(ierr);
155 ptr[j] = outdat[j] + 2*w3[j] + 1;
156 }
157
158 /* Memory for doing local proc's work*/
159 {
160 ierr = PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mbs*imax,&d_p, (Mbs/PETSC_BITS_PER_BYTE+1)*imax,&t_p);CHKERRQ(ierr);
161
162 for (i=0; i<imax; i++) {
163 table[i] = t_p + (Mbs/PETSC_BITS_PER_BYTE+1)*i;
164 data[i] = d_p + (Mbs)*i;
165 }
166 }
167
168 /* Parse the IS and update local tables and the outgoing buf with the data*/
169 {
170 PetscInt n_i,*data_i,isz_i,*outdat_j,ctr_j;
171 PetscBT table_i;
172
173 for (i=0; i<imax; i++) {
174 ierr = PetscArrayzero(ctr,size);CHKERRQ(ierr);
175 n_i = n[i];
176 table_i = table[i];
177 idx_i = idx[i];
178 data_i = data[i];
179 isz_i = isz[i];
180 for (j=0; j<n_i; j++) { /* parse the indices of each IS */
181 row = idx_i[j];
182 ierr = PetscLayoutFindOwner(C->rmap,row*C->rmap->bs,&proc);CHKERRQ(ierr);
183 if (proc != rank) { /* copy to the outgoing buffer */
184 ctr[proc]++;
185 *ptr[proc] = row;
186 ptr[proc]++;
187 } else { /* Update the local table */
188 if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
189 }
190 }
191 /* Update the headers for the current IS */
192 for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
193 if ((ctr_j = ctr[j])) {
194 outdat_j = outdat[j];
195 k = ++outdat_j[0];
196 outdat_j[2*k] = ctr_j;
197 outdat_j[2*k-1] = i;
198 }
199 }
200 isz[i] = isz_i;
201 }
202 }
203
204 /* Now post the sends */
205 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
206 for (i=0; i<nrqs; ++i) {
207 j = pa[i];
208 ierr = MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
209 }
210
211 /* No longer need the original indices*/
212 for (i=0; i<imax; ++i) {
213 ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
214 }
215 ierr = PetscFree2(*(PetscInt***)&idx,n);CHKERRQ(ierr);
216
217 ierr = PetscMalloc1(imax,&iscomms);CHKERRQ(ierr);
218 for (i=0; i<imax; ++i) {
219 ierr = PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);CHKERRQ(ierr);
220 ierr = ISDestroy(&is[i]);CHKERRQ(ierr);
221 }
222
223 /* Do Local work*/
224 ierr = MatIncreaseOverlap_MPIBAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);
225
226 /* Receive messages*/
227 ierr = PetscMalloc1(nrqr+1,&recv_status);CHKERRQ(ierr);
228 if (nrqr) {ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);}
229
230 ierr = PetscMalloc1(nrqs+1,&s_status);CHKERRQ(ierr);
231 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}
232
233 /* Phase 1 sends are complete - deallocate buffers */
234 ierr = PetscFree4(outdat,ptr,tmp,ctr);CHKERRQ(ierr);
235 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
236
237 ierr = PetscMalloc1(nrqr+1,&xdata);CHKERRQ(ierr);
238 ierr = PetscMalloc1(nrqr+1,&isz1);CHKERRQ(ierr);
239 ierr = MatIncreaseOverlap_MPIBAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
240 ierr = PetscFree(rbuf[0]);CHKERRQ(ierr);
241 ierr = PetscFree(rbuf);CHKERRQ(ierr);
242
243 /* Send the data back*/
244 /* Do a global reduction to know the buffer space req for incoming messages*/
245 {
246 PetscMPIInt *rw1;
247
248 ierr = PetscCalloc1(size,&rw1);CHKERRQ(ierr);
249
250 for (i=0; i<nrqr; ++i) {
251 proc = recv_status[i].MPI_SOURCE;
252 if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
253 rw1[proc] = isz1[i];
254 }
255
256 ierr = PetscFree(onodes1);CHKERRQ(ierr);
257 ierr = PetscFree(olengths1);CHKERRQ(ierr);
258
259 /* Determine the number of messages to expect, their lengths, and their from-ids */
260 ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
261 ierr = PetscFree(rw1);CHKERRQ(ierr);
262 }
263 /* Now post the Irecvs corresponding to these messages */
264 ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);
265
266 /* Now post the sends */
267 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
268 for (i=0; i<nrqr; ++i) {
269 j = recv_status[i].MPI_SOURCE;
270 ierr = MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
271 }
272
273 /* receive work done on other processors*/
274 {
275 PetscMPIInt idex;
276 PetscInt is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
277 PetscBT table_i;
278 MPI_Status *status2;
279
280 ierr = PetscMalloc1(PetscMax(nrqr,nrqs)+1,&status2);CHKERRQ(ierr);
281 for (i=0; i<nrqs; ++i) {
282 ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
283 /* Process the message*/
284 rbuf2_i = rbuf2[idex];
285 ct1 = 2*rbuf2_i[0]+1;
286 jmax = rbuf2[idex][0];
287 for (j=1; j<=jmax; j++) {
288 max = rbuf2_i[2*j];
289 is_no = rbuf2_i[2*j-1];
290 isz_i = isz[is_no];
291 data_i = data[is_no];
292 table_i = table[is_no];
293 for (k=0; k<max; k++,ct1++) {
294 row = rbuf2_i[ct1];
295 if (!PetscBTLookupSet(table_i,row)) data_i[isz_i++] = row;
296 }
297 isz[is_no] = isz_i;
298 }
299 }
300 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);}
301 ierr = PetscFree(status2);CHKERRQ(ierr);
302 }
303
304 for (i=0; i<imax; ++i) {
305 ierr = ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i);CHKERRQ(ierr);
306 ierr = PetscCommDestroy(&iscomms[i]);CHKERRQ(ierr);
307 }
308
309 ierr = PetscFree(iscomms);CHKERRQ(ierr);
310 ierr = PetscFree(onodes2);CHKERRQ(ierr);
311 ierr = PetscFree(olengths2);CHKERRQ(ierr);
312
313 ierr = PetscFree(pa);CHKERRQ(ierr);
314 ierr = PetscFree(rbuf2[0]);CHKERRQ(ierr);
315 ierr = PetscFree(rbuf2);CHKERRQ(ierr);
316 ierr = PetscFree(s_waits1);CHKERRQ(ierr);
317 ierr = PetscFree(r_waits1);CHKERRQ(ierr);
318 ierr = PetscFree(s_waits2);CHKERRQ(ierr);
319 ierr = PetscFree(r_waits2);CHKERRQ(ierr);
320 ierr = PetscFree5(table,data,isz,d_p,t_p);CHKERRQ(ierr);
321 ierr = PetscFree(s_status);CHKERRQ(ierr);
322 ierr = PetscFree(recv_status);CHKERRQ(ierr);
323 ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
324 ierr = PetscFree(xdata);CHKERRQ(ierr);
325 ierr = PetscFree(isz1);CHKERRQ(ierr);
326 PetscFunctionReturn(0);
327 }
328
329 /*
330 MatIncreaseOverlap_MPIBAIJ_Local - Called by MatIncreaseOverlap, to do
331 the work on the local processor.
332
333 Inputs:
334 C - MATMPIBAIJ matrix;
335 imax - total no of index sets processed at a time;
336 table - an array of imax bit tables (PetscBT), each of size Mbs bits.
337
338 Output:
339 isz - array containing the count of the solution elements corresponding
340 to each index set;
341 data - pointer to the solutions
342 */
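/*
   For example (illustrative): if on entry data[0] = {2,5} with isz[0] = 2, then on return
   data[0] also contains every block column index that appears in block rows 2 or 5 of the
   local diagonal (A) or off-diagonal (B) part, isz[0] is the new length, and the matching
   bits of table[0] are set so that duplicates are filtered out.
*/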
343 static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data)
344 {
345 Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
346 Mat A = c->A,B = c->B;
347 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
348 PetscInt start,end,val,max,rstart,cstart,*ai,*aj;
349 PetscInt *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
350 PetscBT table_i;
351
352 PetscFunctionBegin;
353 rstart = c->rstartbs;
354 cstart = c->cstartbs;
355 ai = a->i;
356 aj = a->j;
357 bi = b->i;
358 bj = b->j;
359 garray = c->garray;
360
361
362 for (i=0; i<imax; i++) {
363 data_i = data[i];
364 table_i = table[i];
365 isz_i = isz[i];
366 for (j=0,max=isz[i]; j<max; j++) {
367 row = data_i[j] - rstart;
368 start = ai[row];
369 end = ai[row+1];
370 for (k=start; k<end; k++) { /* Amat */
371 val = aj[k] + cstart;
372 if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
373 }
374 start = bi[row];
375 end = bi[row+1];
376 for (k=start; k<end; k++) { /* Bmat */
377 val = garray[bj[k]];
378 if (!PetscBTLookupSet(table_i,val)) data_i[isz_i++] = val;
379 }
380 }
381 isz[i] = isz_i;
382 }
383 PetscFunctionReturn(0);
384 }
385 /*
386 MatIncreaseOverlap_MPIBAIJ_Receive - Process the received messages,
387 and return the output
388
389 Input:
390 C - the matrix
391 nrqr - no of messages being processed.
392 rbuf - an array of pointers to the received requests
393
394 Output:
395 xdata - array of messages to be sent back
396 isz1 - size of each message
397
398 For better efficiency perhaps we should malloc each xdata[i] separately; then,
399 if a reallocation is required, we would only need to copy the data for that one
400 message rather than for all previous ones, as happens now where a single large
401 chunk of memory is used.
402
403 */
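/*
   Note on the layout produced below (a description of the code, not an additional API):
   all replies share the single allocation xdata[0]; xdata[i] points to the start of reply i
   within it and xdata[i+1] = xdata[i] + isz1[i]. For instance, isz1 = {4,7} means xdata[1]
   starts 4 entries after xdata[0] and the two replies occupy 11 entries in total.
*/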
404 static PetscErrorCode MatIncreaseOverlap_MPIBAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt *isz1)
405 {
406 Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
407 Mat A = c->A,B = c->B;
408 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
409 PetscErrorCode ierr;
410 PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
411 PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
412 PetscInt val,max1,max2,Mbs,no_malloc =0,*tmp,new_estimate,ctr;
413 PetscInt *rbuf_i,kmax,rbuf_0;
414 PetscBT xtable;
415
416 PetscFunctionBegin;
417 Mbs = c->Mbs;
418 rstart = c->rstartbs;
419 cstart = c->cstartbs;
420 ai = a->i;
421 aj = a->j;
422 bi = b->i;
423 bj = b->j;
424 garray = c->garray;
425
426
427 for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
428 rbuf_i = rbuf[i];
429 rbuf_0 = rbuf_i[0];
430 ct += rbuf_0;
431 for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
432 }
433
434 if (c->Mbs) max1 = ct*(a->nz +b->nz)/c->Mbs;
435 else max1 = 1;
436 mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
437 ierr = PetscMalloc1(mem_estimate,&xdata[0]);CHKERRQ(ierr);
438 ++no_malloc;
439 ierr = PetscBTCreate(Mbs,&xtable);CHKERRQ(ierr);
440 ierr = PetscArrayzero(isz1,nrqr);CHKERRQ(ierr);
441
442 ct3 = 0;
443 for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
444 rbuf_i = rbuf[i];
445 rbuf_0 = rbuf_i[0];
446 ct1 = 2*rbuf_0+1;
447 ct2 = ct1;
448 ct3 += ct1;
449 for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
450 ierr = PetscBTMemzero(Mbs,xtable);CHKERRQ(ierr);
451 oct2 = ct2;
452 kmax = rbuf_i[2*j];
453 for (k=0; k<kmax; k++,ct1++) {
454 row = rbuf_i[ct1];
455 if (!PetscBTLookupSet(xtable,row)) {
456 if (!(ct3 < mem_estimate)) {
457 new_estimate = (PetscInt)(1.5*mem_estimate)+1;
458 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
459 ierr = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
460 ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
461 xdata[0] = tmp;
462 mem_estimate = new_estimate; ++no_malloc;
463 for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
464 }
465 xdata[i][ct2++] = row;
466 ct3++;
467 }
468 }
469 for (k=oct2,max2=ct2; k<max2; k++) {
470 row = xdata[i][k] - rstart;
471 start = ai[row];
472 end = ai[row+1];
473 for (l=start; l<end; l++) {
474 val = aj[l] + cstart;
475 if (!PetscBTLookupSet(xtable,val)) {
476 if (!(ct3 < mem_estimate)) {
477 new_estimate = (PetscInt)(1.5*mem_estimate)+1;
478 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
479 ierr = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
480 ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
481 xdata[0] = tmp;
482 mem_estimate = new_estimate; ++no_malloc;
483 for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
484 }
485 xdata[i][ct2++] = val;
486 ct3++;
487 }
488 }
489 start = bi[row];
490 end = bi[row+1];
491 for (l=start; l<end; l++) {
492 val = garray[bj[l]];
493 if (!PetscBTLookupSet(xtable,val)) {
494 if (!(ct3 < mem_estimate)) {
495 new_estimate = (PetscInt)(1.5*mem_estimate)+1;
496 ierr = PetscMalloc1(new_estimate,&tmp);CHKERRQ(ierr);
497 ierr = PetscArraycpy(tmp,xdata[0],mem_estimate);CHKERRQ(ierr);
498 ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
499 xdata[0] = tmp;
500 mem_estimate = new_estimate; ++no_malloc;
501 for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
502 }
503 xdata[i][ct2++] = val;
504 ct3++;
505 }
506 }
507 }
508 /* Update the header*/
509 xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
510 xdata[i][2*j-1] = rbuf_i[2*j-1];
511 }
512 xdata[i][0] = rbuf_0;
513 xdata[i+1] = xdata[i] + ct2;
514 isz1[i] = ct2; /* size of each message */
515 }
516 ierr = PetscBTDestroy(&xtable);CHKERRQ(ierr);
517 ierr = PetscInfo3(C,"Allocated %D bytes, required %D, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);CHKERRQ(ierr);
518 PetscFunctionReturn(0);
519 }
520
521 PetscErrorCode MatCreateSubMatrices_MPIBAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
522 {
523 IS *isrow_block,*iscol_block;
524 Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
525 PetscErrorCode ierr;
526 PetscInt nmax,nstages_local,nstages,i,pos,max_no,N=C->cmap->N,bs=C->rmap->bs;
527 Mat_SeqBAIJ *subc;
528 Mat_SubSppt *smat;
529
530 PetscFunctionBegin;
531 /* The compression and expansion should be avoided. It does not flag
532 errors and might change the indices, hence it is buggy */
533 ierr = PetscMalloc2(ismax+1,&isrow_block,ismax+1,&iscol_block);CHKERRQ(ierr);
534 ierr = ISCompressIndicesGeneral(N,C->rmap->n,bs,ismax,isrow,isrow_block);CHKERRQ(ierr);
535 ierr = ISCompressIndicesGeneral(N,C->cmap->n,bs,ismax,iscol,iscol_block);CHKERRQ(ierr);
536
537 /* Determine the number of stages through which submatrices are done */
538 if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
539 else nmax = 20*1000000 / (c->Nbs * sizeof(PetscInt));
540 if (!nmax) nmax = 1;
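/* For example (illustrative sizes): with c->Nbs = 10000 block columns and 4-byte PetscInt,
   nmax = 20*1000000/(10000*4) = 500 index sets are processed per stage. */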
541
542 if (scall == MAT_INITIAL_MATRIX) {
543 nstages_local = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */
544
545 /* Make sure every processor loops through the nstages */
546 ierr = MPIU_Allreduce(&nstages_local,&nstages,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));CHKERRQ(ierr);
547
548 /* Allocate memory to hold all the submatrices and dummy submatrices */
549 ierr = PetscCalloc1(ismax+nstages,submat);CHKERRQ(ierr);
550 } else { /* MAT_REUSE_MATRIX */
551 if (ismax) {
552 subc = (Mat_SeqBAIJ*)((*submat)[0]->data);
553 smat = subc->submatis1;
554 } else { /* (*submat)[0] is a dummy matrix */
555 smat = (Mat_SubSppt*)(*submat)[0]->data;
556 }
557 if (!smat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"MatCreateSubMatrices(...,MAT_REUSE_MATRIX,...) requires submat");
558 nstages = smat->nstages;
559 }
560
561 for (i=0,pos=0; i<nstages; i++) {
562 if (pos+nmax <= ismax) max_no = nmax;
563 else if (pos == ismax) max_no = 0;
564 else max_no = ismax-pos;
565
566 ierr = MatCreateSubMatrices_MPIBAIJ_local(C,max_no,isrow_block+pos,iscol_block+pos,scall,*submat+pos);CHKERRQ(ierr);
567 if (!max_no && scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
568 smat = (Mat_SubSppt*)(*submat)[pos]->data;
569 smat->nstages = nstages;
570 }
571 pos += max_no;
572 }
573
574 if (scall == MAT_INITIAL_MATRIX && ismax) {
575 /* save nstages for reuse */
576 subc = (Mat_SeqBAIJ*)((*submat)[0]->data);
577 smat = subc->submatis1;
578 smat->nstages = nstages;
579 }
580
581 for (i=0; i<ismax; i++) {
582 ierr = ISDestroy(&isrow_block[i]);CHKERRQ(ierr);
583 ierr = ISDestroy(&iscol_block[i]);CHKERRQ(ierr);
584 }
585 ierr = PetscFree2(isrow_block,iscol_block);CHKERRQ(ierr);
586 PetscFunctionReturn(0);
587 }
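
/*
   Illustrative call pattern (not part of this file): a typical caller extracts the submatrices
   once with MAT_INITIAL_MATRIX and refreshes their numerical values in later iterations with
   MAT_REUSE_MATRIX; the reuse path relies on the Mat_SubSppt struct saved during the first call.

     Mat *submats;
     ierr = MatCreateSubMatrices(C,n,isrow,iscol,MAT_INITIAL_MATRIX,&submats);CHKERRQ(ierr);
     (... the values of C change, but not its nonzero pattern ...)
     ierr = MatCreateSubMatrices(C,n,isrow,iscol,MAT_REUSE_MATRIX,&submats);CHKERRQ(ierr);
     ierr = MatDestroySubMatrices(n,&submats);CHKERRQ(ierr);
*/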
588
589 #if defined(PETSC_USE_CTABLE)
590 PetscErrorCode PetscGetProc(const PetscInt row,const PetscMPIInt size,const PetscInt proc_gnode[],PetscMPIInt *rank)
591 {
592 PetscInt nGlobalNd = proc_gnode[size];
593 PetscMPIInt fproc;
594 PetscErrorCode ierr;
595
596 PetscFunctionBegin;
597 ierr = PetscMPIIntCast((PetscInt)(((float)row * (float)size / (float)nGlobalNd + 0.5)),&fproc);CHKERRQ(ierr);
598 if (fproc > size) fproc = size;
599 while (row < proc_gnode[fproc] || row >= proc_gnode[fproc+1]) {
600 if (row < proc_gnode[fproc]) fproc--;
601 else fproc++;
602 }
603 *rank = fproc;
604 PetscFunctionReturn(0);
605 }
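/*
   Worked example (illustrative): with size = 4 and proc_gnode = {0,25,50,75,100}, row = 60
   gives the initial guess fproc = (PetscInt)(60*4/100.0 + 0.5) = 2; since
   proc_gnode[2] = 50 <= 60 < 75 = proc_gnode[3], the while loop exits immediately and *rank = 2.
*/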
606 #endif
607
608 /* -------------------------------------------------------------------------*/
609 /* This code is used for BAIJ and SBAIJ matrices (unfortunate dependency) */
610 PetscErrorCode MatCreateSubMatrices_MPIBAIJ_local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
611 {
612 Mat_MPIBAIJ *c = (Mat_MPIBAIJ*)C->data;
613 Mat A = c->A;
614 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)c->B->data,*subc;
615 const PetscInt **icol,**irow;
616 PetscInt *nrow,*ncol,start;
617 PetscErrorCode ierr;
618 PetscMPIInt rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
619 PetscInt **sbuf1,**sbuf2,*sbuf2_i,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
620 PetscInt nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
621 PetscInt **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
622 PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
623 #if defined(PETSC_USE_CTABLE)
624 PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i;
625 #else
626 PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i;
627 #endif
628 const PetscInt *irow_i,*icol_i;
629 PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
630 MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
631 MPI_Request *r_waits4,*s_waits3,*s_waits4;
632 MPI_Status *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
633 MPI_Status *r_status3,*r_status4,*s_status4;
634 MPI_Comm comm;
635 PetscScalar **rbuf4,*rbuf4_i=NULL,**sbuf_aa,*vals,*mat_a=NULL,*imat_a=NULL,*sbuf_aa_i;
636 PetscMPIInt *onodes1,*olengths1,end;
637 PetscInt **row2proc,*row2proc_i,*imat_ilen,*imat_j,*imat_i;
638 Mat_SubSppt *smat_i;
639 PetscBool *issorted,colflag,iscsorted=PETSC_TRUE;
640 PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;
641 PetscInt bs=C->rmap->bs,bs2=c->bs2,rstart = c->rstartbs;
642 PetscBool ijonly=c->ijonly; /* private flag indicates only matrix data structures are requested */
643 PetscInt nzA,nzB,*a_i=a->i,*b_i=b->i,*a_j = a->j,*b_j = b->j,ctmp,imark,*cworkA,*cworkB;
644 PetscScalar *vworkA=NULL,*vworkB=NULL,*a_a = a->a,*b_a = b->a;
645 PetscInt cstart = c->cstartbs,*bmap = c->garray;
646 PetscBool *allrows,*allcolumns;
647
648 PetscFunctionBegin;
649 ierr = PetscObjectGetComm((PetscObject)C,&comm);CHKERRQ(ierr);
650 size = c->size;
651 rank = c->rank;
652
653 ierr = PetscMalloc5(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns,ismax,&allrows);CHKERRQ(ierr);
654 ierr = PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);CHKERRQ(ierr);
655
656 for (i=0; i<ismax; i++) {
657 ierr = ISSorted(iscol[i],&issorted[i]);CHKERRQ(ierr);
658 if (!issorted[i]) iscsorted = issorted[i]; /* columns are not sorted! */
659 ierr = ISSorted(isrow[i],&issorted[i]);CHKERRQ(ierr);
660
661 /* Check for special case: allcolumns */
662 ierr = ISIdentity(iscol[i],&colflag);CHKERRQ(ierr);
663 ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
664
665 if (colflag && ncol[i] == c->Nbs) {
666 allcolumns[i] = PETSC_TRUE;
667 icol[i] = NULL;
668 } else {
669 allcolumns[i] = PETSC_FALSE;
670 ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
671 }
672
673 /* Check for special case: allrows */
674 ierr = ISIdentity(isrow[i],&colflag);CHKERRQ(ierr);
675 ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
676 if (colflag && nrow[i] == c->Mbs) {
677 allrows[i] = PETSC_TRUE;
678 irow[i] = NULL;
679 } else {
680 allrows[i] = PETSC_FALSE;
681 ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
682 }
683 }
684
685 if (scall == MAT_REUSE_MATRIX) {
686 /* Assumes new rows are same length as the old rows */
687 for (i=0; i<ismax; i++) {
688 subc = (Mat_SeqBAIJ*)(submats[i]->data);
689 if (subc->mbs != nrow[i] || subc->nbs != ncol[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
690
691 /* Initial matrix as if empty */
692 ierr = PetscArrayzero(subc->ilen,subc->mbs);CHKERRQ(ierr);
693
694 /* Set the factor type to match the original matrix */
695 submats[i]->factortype = C->factortype;
696
697 smat_i = subc->submatis1;
698
699 nrqs = smat_i->nrqs;
700 nrqr = smat_i->nrqr;
701 rbuf1 = smat_i->rbuf1;
702 rbuf2 = smat_i->rbuf2;
703 rbuf3 = smat_i->rbuf3;
704 req_source2 = smat_i->req_source2;
705
706 sbuf1 = smat_i->sbuf1;
707 sbuf2 = smat_i->sbuf2;
708 ptr = smat_i->ptr;
709 tmp = smat_i->tmp;
710 ctr = smat_i->ctr;
711
712 pa = smat_i->pa;
713 req_size = smat_i->req_size;
714 req_source1 = smat_i->req_source1;
715
716 allcolumns[i] = smat_i->allcolumns;
717 allrows[i] = smat_i->allrows;
718 row2proc[i] = smat_i->row2proc;
719 rmap[i] = smat_i->rmap;
720 cmap[i] = smat_i->cmap;
721 }
722
723 if (!ismax){ /* Get dummy submatrices and retrieve struct submatis1 */
724 if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse");
725 smat_i = (Mat_SubSppt*)submats[0]->data;
726
727 nrqs = smat_i->nrqs;
728 nrqr = smat_i->nrqr;
729 rbuf1 = smat_i->rbuf1;
730 rbuf2 = smat_i->rbuf2;
731 rbuf3 = smat_i->rbuf3;
732 req_source2 = smat_i->req_source2;
733
734 sbuf1 = smat_i->sbuf1;
735 sbuf2 = smat_i->sbuf2;
736 ptr = smat_i->ptr;
737 tmp = smat_i->tmp;
738 ctr = smat_i->ctr;
739
740 pa = smat_i->pa;
741 req_size = smat_i->req_size;
742 req_source1 = smat_i->req_source1;
743
744 allcolumns[0] = PETSC_FALSE;
745 }
746 } else { /* scall == MAT_INITIAL_MATRIX */
747 /* Get some new tags to keep the communication clean */
748 ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
749 ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
750
751 /* evaluate communication - message to whom, length of message, and buffer space
752 required. Based on this, buffers are allocated, and data is copied into them */
753 ierr = PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);CHKERRQ(ierr); /* mesg size, initialize work vectors */
754
755 for (i=0; i<ismax; i++) {
756 jmax = nrow[i];
757 irow_i = irow[i];
758
759 ierr = PetscMalloc1(jmax,&row2proc_i);CHKERRQ(ierr);
760 row2proc[i] = row2proc_i;
761
762 if (issorted[i]) proc = 0;
763 for (j=0; j<jmax; j++) {
764 if (!issorted[i]) proc = 0;
765 if (allrows[i]) row = j;
766 else row = irow_i[j];
767
768 while (row >= c->rangebs[proc+1]) proc++;
769 w4[proc]++;
770 row2proc_i[j] = proc; /* map row index to proc */
771 }
772 for (j=0; j<size; j++) {
773 if (w4[j]) { w1[j] += w4[j]; w3[j]++; w4[j] = 0;}
774 }
775 }
776
777 nrqs = 0; /* no of outgoing messages */
778 msz = 0; /* total mesg length (for all procs) */
779 w1[rank] = 0; /* no mesg sent to self */
780 w3[rank] = 0;
781 for (i=0; i<size; i++) {
782 if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
783 }
784 ierr = PetscMalloc1(nrqs+1,&pa);CHKERRQ(ierr); /* pa - list of processors to communicate with */
785 for (i=0,j=0; i<size; i++) {
786 if (w1[i]) { pa[j] = i; j++; }
787 }
788
789 /* Each message would have a header = 1 + 2*(no of IS) + data */
790 for (i=0; i<nrqs; i++) {
791 j = pa[i];
792 w1[j] += w2[j] + 2* w3[j];
793 msz += w1[j];
794 }
795 ierr = PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);CHKERRQ(ierr);
796
797 /* Determine the number of messages to expect, their lengths, and their from-ids */
798 ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
799 ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
800
801 /* Now post the Irecvs corresponding to these messages */
802 tag0 = ((PetscObject)C)->tag;
803 ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
804
805 ierr = PetscFree(onodes1);CHKERRQ(ierr);
806 ierr = PetscFree(olengths1);CHKERRQ(ierr);
807
808 /* Allocate Memory for outgoing messages */
809 ierr = PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);CHKERRQ(ierr);
810 ierr = PetscArrayzero(sbuf1,size);CHKERRQ(ierr);
811 ierr = PetscArrayzero(ptr,size);CHKERRQ(ierr);
812
813 {
814 PetscInt *iptr = tmp;
815 k = 0;
816 for (i=0; i<nrqs; i++) {
817 j = pa[i];
818 iptr += k;
819 sbuf1[j] = iptr;
820 k = w1[j];
821 }
822 }
823
824 /* Form the outgoing messages. Initialize the header space */
825 for (i=0; i<nrqs; i++) {
826 j = pa[i];
827 sbuf1[j][0] = 0;
828 ierr = PetscArrayzero(sbuf1[j]+1,2*w3[j]);CHKERRQ(ierr);
829 ptr[j] = sbuf1[j] + 2*w3[j] + 1;
830 }
831
832 /* Parse the isrow and copy data into outbuf */
833 for (i=0; i<ismax; i++) {
834 row2proc_i = row2proc[i];
835 ierr = PetscArrayzero(ctr,size);CHKERRQ(ierr);
836 irow_i = irow[i];
837 jmax = nrow[i];
838 for (j=0; j<jmax; j++) { /* parse the indices of each IS */
839 proc = row2proc_i[j];
840 if (allrows[i]) row = j;
841 else row = irow_i[j];
842
843 if (proc != rank) { /* copy to the outgoing buf*/
844 ctr[proc]++;
845 *ptr[proc] = row;
846 ptr[proc]++;
847 }
848 }
849 /* Update the headers for the current IS */
850 for (j=0; j<size; j++) { /* Can Optimise this loop too */
851 if ((ctr_j = ctr[j])) {
852 sbuf1_j = sbuf1[j];
853 k = ++sbuf1_j[0];
854 sbuf1_j[2*k] = ctr_j;
855 sbuf1_j[2*k-1] = i;
856 }
857 }
858 }
859
860 /* Now post the sends */
861 ierr = PetscMalloc1(nrqs+1,&s_waits1);CHKERRQ(ierr);
862 for (i=0; i<nrqs; ++i) {
863 j = pa[i];
864 ierr = MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
865 }
866
867 /* Post Receives to capture the buffer size */
868 ierr = PetscMalloc1(nrqs+1,&r_waits2);CHKERRQ(ierr);
869 ierr = PetscMalloc3(nrqs+1,&req_source2,nrqs+1,&rbuf2,nrqs+1,&rbuf3);CHKERRQ(ierr);
870 rbuf2[0] = tmp + msz;
871 for (i=1; i<nrqs; ++i) {
872 rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
873 }
874 for (i=0; i<nrqs; ++i) {
875 j = pa[i];
876 ierr = MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);CHKERRQ(ierr);
877 }
878
879 /* Send to other procs the buf size they should allocate */
880 /* Receive messages*/
881 ierr = PetscMalloc1(nrqr+1,&s_waits2);CHKERRQ(ierr);
882 ierr = PetscMalloc1(nrqr+1,&r_status1);CHKERRQ(ierr);
883 ierr = PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);CHKERRQ(ierr);
884
885 ierr = MPI_Waitall(nrqr,r_waits1,r_status1);CHKERRQ(ierr);
886 for (i=0; i<nrqr; ++i) {
887 req_size[i] = 0;
888 rbuf1_i = rbuf1[i];
889 start = 2*rbuf1_i[0] + 1;
890 ierr = MPI_Get_count(r_status1+i,MPIU_INT,&end);CHKERRQ(ierr);
891 ierr = PetscMalloc1(end+1,&sbuf2[i]);CHKERRQ(ierr);
892 sbuf2_i = sbuf2[i];
893 for (j=start; j<end; j++) {
894 row = rbuf1_i[j] - rstart;
895 ncols = a_i[row+1] - a_i[row] + b_i[row+1] - b_i[row];
896 sbuf2_i[j] = ncols;
897 req_size[i] += ncols;
898 }
899 req_source1[i] = r_status1[i].MPI_SOURCE;
900 /* form the header */
901 sbuf2_i[0] = req_size[i];
902 for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
903
904 ierr = MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);CHKERRQ(ierr);
905 }
906
907 ierr = PetscFree(r_status1);CHKERRQ(ierr);
908 ierr = PetscFree(r_waits1);CHKERRQ(ierr);
909 ierr = PetscFree4(w1,w2,w3,w4);CHKERRQ(ierr);
910
911 /* Receive messages*/
912 ierr = PetscMalloc1(nrqs+1,&r_waits3);CHKERRQ(ierr);
913 ierr = PetscMalloc1(nrqs+1,&r_status2);CHKERRQ(ierr);
914
915 ierr = MPI_Waitall(nrqs,r_waits2,r_status2);CHKERRQ(ierr);
916 for (i=0; i<nrqs; ++i) {
917 ierr = PetscMalloc1(rbuf2[i][0]+1,&rbuf3[i]);CHKERRQ(ierr);
918 req_source2[i] = r_status2[i].MPI_SOURCE;
919 ierr = MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);CHKERRQ(ierr);
920 }
921 ierr = PetscFree(r_status2);CHKERRQ(ierr);
922 ierr = PetscFree(r_waits2);CHKERRQ(ierr);
923
924 /* Wait on sends1 and sends2 */
925 ierr = PetscMalloc1(nrqs+1,&s_status1);CHKERRQ(ierr);
926 ierr = PetscMalloc1(nrqr+1,&s_status2);CHKERRQ(ierr);
927
928 if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);}
929 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);}
930 ierr = PetscFree(s_status1);CHKERRQ(ierr);
931 ierr = PetscFree(s_status2);CHKERRQ(ierr);
932 ierr = PetscFree(s_waits1);CHKERRQ(ierr);
933 ierr = PetscFree(s_waits2);CHKERRQ(ierr);
934
935 /* Now allocate sending buffers for a->j, and send them off */
936 ierr = PetscMalloc1(nrqr+1,&sbuf_aj);CHKERRQ(ierr);
937 for (i=0,j=0; i<nrqr; i++) j += req_size[i];
938 ierr = PetscMalloc1(j+1,&sbuf_aj[0]);CHKERRQ(ierr);
939 for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
940
941 ierr = PetscMalloc1(nrqr+1,&s_waits3);CHKERRQ(ierr);
942 {
943
944 for (i=0; i<nrqr; i++) {
945 rbuf1_i = rbuf1[i];
946 sbuf_aj_i = sbuf_aj[i];
947 ct1 = 2*rbuf1_i[0] + 1;
948 ct2 = 0;
949 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
950 kmax = rbuf1[i][2*j];
951 for (k=0; k<kmax; k++,ct1++) {
952 row = rbuf1_i[ct1] - rstart;
953 nzA = a_i[row+1] - a_i[row]; nzB = b_i[row+1] - b_i[row];
954 ncols = nzA + nzB;
955 cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
956
957 /* load the column indices for this row into cols */
958 cols = sbuf_aj_i + ct2;
959 for (l=0; l<nzB; l++) {
960 if ((ctmp = bmap[cworkB[l]]) < cstart) cols[l] = ctmp;
961 else break;
962 }
963 imark = l;
964 for (l=0; l<nzA; l++) {cols[imark+l] = cstart + cworkA[l];}
965 for (l=imark; l<nzB; l++) cols[nzA+l] = bmap[cworkB[l]];
966 ct2 += ncols;
967 }
968 }
969 ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);CHKERRQ(ierr);
970 }
971 }
972 ierr = PetscMalloc2(nrqs+1,&r_status3,nrqr+1,&s_status3);CHKERRQ(ierr);
973
974 /* create col map: global col of C -> local col of submatrices */
975 #if defined(PETSC_USE_CTABLE)
976 for (i=0; i<ismax; i++) {
977 if (!allcolumns[i]) {
978 ierr = PetscTableCreate(ncol[i]+1,c->Nbs+1,&cmap[i]);CHKERRQ(ierr);
979
980 jmax = ncol[i];
981 icol_i = icol[i];
982 cmap_i = cmap[i];
983 for (j=0; j<jmax; j++) {
984 ierr = PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
985 }
986 } else cmap[i] = NULL;
987 }
988 #else
989 for (i=0; i<ismax; i++) {
990 if (!allcolumns[i]) {
991 ierr = PetscCalloc1(c->Nbs,&cmap[i]);CHKERRQ(ierr);
992 jmax = ncol[i];
993 icol_i = icol[i];
994 cmap_i = cmap[i];
995 for (j=0; j<jmax; j++) cmap_i[icol_i[j]] = j+1;
996 } else cmap[i] = NULL;
997 }
998 #endif
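/* Note on the maps built above (describing the code, not extra API): cmap[i] stores
   local column + 1 for each selected global block column, so that a looked-up value of 0
   means "column not in iscol[i]". For example, with iscol[i] = {4,9} the array version gives
   cmap_i[4] = 1 and cmap_i[9] = 2 (the PetscTable version stores the same values keyed by
   global column + 1); later code therefore uses tcol-1 as the local column index. */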
999
1000 /* Create lens which is required for MatCreate... */
1001 for (i=0,j=0; i<ismax; i++) j += nrow[i];
1002 ierr = PetscMalloc1(ismax,&lens);CHKERRQ(ierr);
1003
1004 if (ismax) {
1005 ierr = PetscCalloc1(j,&lens[0]);CHKERRQ(ierr);
1006 }
1007 for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
1008
1009 /* Update lens from local data */
1010 for (i=0; i<ismax; i++) {
1011 row2proc_i = row2proc[i];
1012 jmax = nrow[i];
1013 if (!allcolumns[i]) cmap_i = cmap[i];
1014 irow_i = irow[i];
1015 lens_i = lens[i];
1016 for (j=0; j<jmax; j++) {
1017 if (allrows[i]) row = j;
1018 else row = irow_i[j]; /* global blocked row of C */
1019
1020 proc = row2proc_i[j];
1021 if (proc == rank) {
1022 /* Get indices from matA and then from matB */
1023 #if defined(PETSC_USE_CTABLE)
1024 PetscInt tt;
1025 #endif
1026 row = row - rstart;
1027 nzA = a_i[row+1] - a_i[row];
1028 nzB = b_i[row+1] - b_i[row];
1029 cworkA = a_j + a_i[row];
1030 cworkB = b_j + b_i[row];
1031
1032 if (!allcolumns[i]) {
1033 #if defined(PETSC_USE_CTABLE)
1034 for (k=0; k<nzA; k++) {
1035 ierr = PetscTableFind(cmap_i,cstart+cworkA[k]+1,&tt);CHKERRQ(ierr);
1036 if (tt) lens_i[j]++;
1037 }
1038 for (k=0; k<nzB; k++) {
1039 ierr = PetscTableFind(cmap_i,bmap[cworkB[k]]+1,&tt);CHKERRQ(ierr);
1040 if (tt) lens_i[j]++;
1041 }
1042
1043 #else
1044 for (k=0; k<nzA; k++) {
1045 if (cmap_i[cstart + cworkA[k]]) lens_i[j]++;
1046 }
1047 for (k=0; k<nzB; k++) {
1048 if (cmap_i[bmap[cworkB[k]]]) lens_i[j]++;
1049 }
1050 #endif
1051 } else { /* allcolumns */
1052 lens_i[j] = nzA + nzB;
1053 }
1054 }
1055 }
1056 }
1057
1058 /* Create row map: global row of C -> local row of submatrices */
1059 for (i=0; i<ismax; i++) {
1060 if (!allrows[i]) {
1061 #if defined(PETSC_USE_CTABLE)
1062 ierr = PetscTableCreate(nrow[i]+1,c->Mbs+1,&rmap[i]);CHKERRQ(ierr);
1063 irow_i = irow[i];
1064 jmax = nrow[i];
1065 for (j=0; j<jmax; j++) {
1066 if (allrows[i]) {
1067 ierr = PetscTableAdd(rmap[i],j+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1068 } else {
1069 ierr = PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);CHKERRQ(ierr);
1070 }
1071 }
1072 #else
1073 ierr = PetscCalloc1(c->Mbs,&rmap[i]);CHKERRQ(ierr);
1074 rmap_i = rmap[i];
1075 irow_i = irow[i];
1076 jmax = nrow[i];
1077 for (j=0; j<jmax; j++) {
1078 if (allrows[i]) rmap_i[j] = j;
1079 else rmap_i[irow_i[j]] = j;
1080 }
1081 #endif
1082 } else rmap[i] = NULL;
1083 }
1084
1085 /* Update lens from offproc data */
1086 {
1087 PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
1088
1089 ierr = MPI_Waitall(nrqs,r_waits3,r_status3);CHKERRQ(ierr);
1090 for (tmp2=0; tmp2<nrqs; tmp2++) {
1091 sbuf1_i = sbuf1[pa[tmp2]];
1092 jmax = sbuf1_i[0];
1093 ct1 = 2*jmax+1;
1094 ct2 = 0;
1095 rbuf2_i = rbuf2[tmp2];
1096 rbuf3_i = rbuf3[tmp2];
1097 for (j=1; j<=jmax; j++) {
1098 is_no = sbuf1_i[2*j-1];
1099 max1 = sbuf1_i[2*j];
1100 lens_i = lens[is_no];
1101 if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1102 rmap_i = rmap[is_no];
1103 for (k=0; k<max1; k++,ct1++) {
1104 if (allrows[is_no]) {
1105 row = sbuf1_i[ct1];
1106 } else {
1107 #if defined(PETSC_USE_CTABLE)
1108 ierr = PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);CHKERRQ(ierr);
1109 row--;
1110 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1111 #else
1112 row = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
1113 #endif
1114 }
1115 max2 = rbuf2_i[ct1];
1116 for (l=0; l<max2; l++,ct2++) {
1117 if (!allcolumns[is_no]) {
1118 #if defined(PETSC_USE_CTABLE)
1119 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1120 #else
1121 tcol = cmap_i[rbuf3_i[ct2]];
1122 #endif
1123 if (tcol) lens_i[row]++;
1124 } else { /* allcolumns */
1125 lens_i[row]++; /* lens_i[row] += max2 ? */
1126 }
1127 }
1128 }
1129 }
1130 }
1131 }
1132 ierr = PetscFree(r_waits3);CHKERRQ(ierr);
1133 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);}
1134 ierr = PetscFree2(r_status3,s_status3);CHKERRQ(ierr);
1135 ierr = PetscFree(s_waits3);CHKERRQ(ierr);
1136
1137 /* Create the submatrices */
1138 for (i=0; i<ismax; i++) {
1139 PetscInt bs_tmp;
1140 if (ijonly) bs_tmp = 1;
1141 else bs_tmp = bs;
1142
1143 ierr = MatCreate(PETSC_COMM_SELF,submats+i);CHKERRQ(ierr);
1144 ierr = MatSetSizes(submats[i],nrow[i]*bs_tmp,ncol[i]*bs_tmp,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1145
1146 ierr = MatSetType(submats[i],((PetscObject)A)->type_name);CHKERRQ(ierr);
1147 ierr = MatSeqBAIJSetPreallocation(submats[i],bs_tmp,0,lens[i]);CHKERRQ(ierr);
1148 ierr = MatSeqSBAIJSetPreallocation(submats[i],bs_tmp,0,lens[i]);CHKERRQ(ierr); /* this subroutine is used by SBAIJ routines */
1149
1150 /* create struct Mat_SubSppt and attach it to submat */
1151 ierr = PetscNew(&smat_i);CHKERRQ(ierr);
1152 subc = (Mat_SeqBAIJ*)submats[i]->data;
1153 subc->submatis1 = smat_i;
1154
1155 smat_i->destroy = submats[i]->ops->destroy;
1156 submats[i]->ops->destroy = MatDestroySubMatrix_SeqBAIJ;
1157 submats[i]->factortype = C->factortype;
1158
1159 smat_i->id = i;
1160 smat_i->nrqs = nrqs;
1161 smat_i->nrqr = nrqr;
1162 smat_i->rbuf1 = rbuf1;
1163 smat_i->rbuf2 = rbuf2;
1164 smat_i->rbuf3 = rbuf3;
1165 smat_i->sbuf2 = sbuf2;
1166 smat_i->req_source2 = req_source2;
1167
1168 smat_i->sbuf1 = sbuf1;
1169 smat_i->ptr = ptr;
1170 smat_i->tmp = tmp;
1171 smat_i->ctr = ctr;
1172
1173 smat_i->pa = pa;
1174 smat_i->req_size = req_size;
1175 smat_i->req_source1 = req_source1;
1176
1177 smat_i->allcolumns = allcolumns[i];
1178 smat_i->allrows = allrows[i];
1179 smat_i->singleis = PETSC_FALSE;
1180 smat_i->row2proc = row2proc[i];
1181 smat_i->rmap = rmap[i];
1182 smat_i->cmap = cmap[i];
1183 }
1184
1185 if (!ismax) { /* Create a dummy submats[0] to carry the reuse struct Mat_SubSppt */
1186 ierr = MatCreate(PETSC_COMM_SELF,&submats[0]);CHKERRQ(ierr);
1187 ierr = MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
1188 ierr = MatSetType(submats[0],MATDUMMY);CHKERRQ(ierr);
1189
1190 /* create struct Mat_SubSppt and attach it to submat */
1191 ierr = PetscNewLog(submats[0],&smat_i);CHKERRQ(ierr);
1192 submats[0]->data = (void*)smat_i;
1193
1194 smat_i->destroy = submats[0]->ops->destroy;
1195 submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
1196 submats[0]->factortype = C->factortype;
1197
1198 smat_i->id = 0;
1199 smat_i->nrqs = nrqs;
1200 smat_i->nrqr = nrqr;
1201 smat_i->rbuf1 = rbuf1;
1202 smat_i->rbuf2 = rbuf2;
1203 smat_i->rbuf3 = rbuf3;
1204 smat_i->sbuf2 = sbuf2;
1205 smat_i->req_source2 = req_source2;
1206
1207 smat_i->sbuf1 = sbuf1;
1208 smat_i->ptr = ptr;
1209 smat_i->tmp = tmp;
1210 smat_i->ctr = ctr;
1211
1212 smat_i->pa = pa;
1213 smat_i->req_size = req_size;
1214 smat_i->req_source1 = req_source1;
1215
1216 smat_i->allcolumns = PETSC_FALSE;
1217 smat_i->singleis = PETSC_FALSE;
1218 smat_i->row2proc = NULL;
1219 smat_i->rmap = NULL;
1220 smat_i->cmap = NULL;
1221 }
1222
1223
1224 if (ismax) {ierr = PetscFree(lens[0]);CHKERRQ(ierr);}
1225 ierr = PetscFree(lens);CHKERRQ(ierr);
1226 ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
1227 ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
1228
1229 } /* end of scall == MAT_INITIAL_MATRIX */
1230
1231 /* Post recv matrix values */
1232 if (!ijonly) {
1233 ierr = PetscObjectGetNewTag((PetscObject)C,&tag4);CHKERRQ(ierr);
1234 ierr = PetscMalloc1(nrqs+1,&rbuf4);CHKERRQ(ierr);
1235 ierr = PetscMalloc1(nrqs+1,&r_waits4);CHKERRQ(ierr);
1236 ierr = PetscMalloc1(nrqs+1,&r_status4);CHKERRQ(ierr);
1237 ierr = PetscMalloc1(nrqr+1,&s_status4);CHKERRQ(ierr);
1238 for (i=0; i<nrqs; ++i) {
1239 ierr = PetscMalloc1(rbuf2[i][0]*bs2,&rbuf4[i]);CHKERRQ(ierr);
1240 ierr = MPI_Irecv(rbuf4[i],rbuf2[i][0]*bs2,MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);CHKERRQ(ierr);
1241 }
1242
1243 /* Allocate sending buffers for a->a, and send them off */
1244 ierr = PetscMalloc1(nrqr+1,&sbuf_aa);CHKERRQ(ierr);
1245 for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1246
1247 ierr = PetscMalloc1((j+1)*bs2,&sbuf_aa[0]);CHKERRQ(ierr);
1248 for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1]*bs2;
1249
1250 ierr = PetscMalloc1(nrqr+1,&s_waits4);CHKERRQ(ierr);
1251
1252 for (i=0; i<nrqr; i++) {
1253 rbuf1_i = rbuf1[i];
1254 sbuf_aa_i = sbuf_aa[i];
1255 ct1 = 2*rbuf1_i[0]+1;
1256 ct2 = 0;
1257 for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1258 kmax = rbuf1_i[2*j];
1259 for (k=0; k<kmax; k++,ct1++) {
1260 row = rbuf1_i[ct1] - rstart;
1261 nzA = a_i[row+1] - a_i[row];
1262 nzB = b_i[row+1] - b_i[row];
1263 ncols = nzA + nzB;
1264 cworkB = b_j + b_i[row];
1265 vworkA = a_a + a_i[row]*bs2;
1266 vworkB = b_a + b_i[row]*bs2;
1267
1268 /* load the column values for this row into vals*/
1269 vals = sbuf_aa_i+ct2*bs2;
1270 for (l=0; l<nzB; l++) {
1271 if ((bmap[cworkB[l]]) < cstart) {
1272 ierr = PetscArraycpy(vals+l*bs2,vworkB+l*bs2,bs2);CHKERRQ(ierr);
1273 } else break;
1274 }
1275 imark = l;
1276 for (l=0; l<nzA; l++) {
1277 ierr = PetscArraycpy(vals+(imark+l)*bs2,vworkA+l*bs2,bs2);CHKERRQ(ierr);
1278 }
1279 for (l=imark; l<nzB; l++) {
1280 ierr = PetscArraycpy(vals+(nzA+l)*bs2,vworkB+l*bs2,bs2);CHKERRQ(ierr);
1281 }
1282
1283 ct2 += ncols;
1284 }
1285 }
1286 ierr = MPI_Isend(sbuf_aa_i,req_size[i]*bs2,MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);CHKERRQ(ierr);
1287 }
1288 }
1289
1290 /* Assemble the matrices */
1291 /* First assemble the local rows */
1292 for (i=0; i<ismax; i++) {
1293 row2proc_i = row2proc[i];
1294 subc = (Mat_SeqBAIJ*)submats[i]->data;
1295 imat_ilen = subc->ilen;
1296 imat_j = subc->j;
1297 imat_i = subc->i;
1298 imat_a = subc->a;
1299
1300 if (!allcolumns[i]) cmap_i = cmap[i];
1301 rmap_i = rmap[i];
1302 irow_i = irow[i];
1303 jmax = nrow[i];
1304 for (j=0; j<jmax; j++) {
1305 if (allrows[i]) row = j;
1306 else row = irow_i[j];
1307 proc = row2proc_i[j];
1308
1309 if (proc == rank) {
1310
1311 row = row - rstart;
1312 nzA = a_i[row+1] - a_i[row];
1313 nzB = b_i[row+1] - b_i[row];
1314 cworkA = a_j + a_i[row];
1315 cworkB = b_j + b_i[row];
1316 if (!ijonly) {
1317 vworkA = a_a + a_i[row]*bs2;
1318 vworkB = b_a + b_i[row]*bs2;
1319 }
1320
1321 if (allrows[i]) {
1322 row = row+rstart;
1323 } else {
1324 #if defined(PETSC_USE_CTABLE)
1325 ierr = PetscTableFind(rmap_i,row+rstart+1,&row);CHKERRQ(ierr);
1326 row--;
1327
1328 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1329 #else
1330 row = rmap_i[row + rstart];
1331 #endif
1332 }
1333 mat_i = imat_i[row];
1334 if (!ijonly) mat_a = imat_a + mat_i*bs2;
1335 mat_j = imat_j + mat_i;
1336 ilen = imat_ilen[row];
1337
1338 /* load the column indices for this row into cols*/
1339 if (!allcolumns[i]) {
1340 for (l=0; l<nzB; l++) {
1341 if ((ctmp = bmap[cworkB[l]]) < cstart) {
1342 #if defined(PETSC_USE_CTABLE)
1343 ierr = PetscTableFind(cmap_i,ctmp+1,&tcol);CHKERRQ(ierr);
1344 if (tcol) {
1345 #else
1346 if ((tcol = cmap_i[ctmp])) {
1347 #endif
1348 *mat_j++ = tcol - 1;
1349 ierr = PetscArraycpy(mat_a,vworkB+l*bs2,bs2);CHKERRQ(ierr);
1350 mat_a += bs2;
1351 ilen++;
1352 }
1353 } else break;
1354 }
1355 imark = l;
1356 for (l=0; l<nzA; l++) {
1357 #if defined(PETSC_USE_CTABLE)
1358 ierr = PetscTableFind(cmap_i,cstart+cworkA[l]+1,&tcol);CHKERRQ(ierr);
1359 if (tcol) {
1360 #else
1361 if ((tcol = cmap_i[cstart + cworkA[l]])) {
1362 #endif
1363 *mat_j++ = tcol - 1;
1364 if (!ijonly) {
1365 ierr = PetscArraycpy(mat_a,vworkA+l*bs2,bs2);CHKERRQ(ierr);
1366 mat_a += bs2;
1367 }
1368 ilen++;
1369 }
1370 }
1371 for (l=imark; l<nzB; l++) {
1372 #if defined(PETSC_USE_CTABLE)
1373 ierr = PetscTableFind(cmap_i,bmap[cworkB[l]]+1,&tcol);CHKERRQ(ierr);
1374 if (tcol) {
1375 #else
1376 if ((tcol = cmap_i[bmap[cworkB[l]]])) {
1377 #endif
1378 *mat_j++ = tcol - 1;
1379 if (!ijonly) {
1380 ierr = PetscArraycpy(mat_a,vworkB+l*bs2,bs2);CHKERRQ(ierr);
1381 mat_a += bs2;
1382 }
1383 ilen++;
1384 }
1385 }
1386 } else { /* allcolumns */
1387 for (l=0; l<nzB; l++) {
1388 if ((ctmp = bmap[cworkB[l]]) < cstart) {
1389 *mat_j++ = ctmp;
1390 ierr = PetscArraycpy(mat_a,vworkB+l*bs2,bs2);CHKERRQ(ierr);
1391 mat_a += bs2;
1392 ilen++;
1393 } else break;
1394 }
1395 imark = l;
1396 for (l=0; l<nzA; l++) {
1397 *mat_j++ = cstart+cworkA[l];
1398 if (!ijonly) {
1399 ierr = PetscArraycpy(mat_a,vworkA+l*bs2,bs2);CHKERRQ(ierr);
1400 mat_a += bs2;
1401 }
1402 ilen++;
1403 }
1404 for (l=imark; l<nzB; l++) {
1405 *mat_j++ = bmap[cworkB[l]];
1406 if (!ijonly) {
1407 ierr = PetscArraycpy(mat_a,vworkB+l*bs2,bs2);CHKERRQ(ierr);
1408 mat_a += bs2;
1409 }
1410 ilen++;
1411 }
1412 }
1413 imat_ilen[row] = ilen;
1414 }
1415 }
1416 }
1417
1418 /* Now assemble the off proc rows */
1419 if (!ijonly) {
1420 ierr = MPI_Waitall(nrqs,r_waits4,r_status4);CHKERRQ(ierr);
1421 }
1422 for (tmp2=0; tmp2<nrqs; tmp2++) {
1423 sbuf1_i = sbuf1[pa[tmp2]];
1424 jmax = sbuf1_i[0];
1425 ct1 = 2*jmax + 1;
1426 ct2 = 0;
1427 rbuf2_i = rbuf2[tmp2];
1428 rbuf3_i = rbuf3[tmp2];
1429 if (!ijonly) rbuf4_i = rbuf4[tmp2];
1430 for (j=1; j<=jmax; j++) {
1431 is_no = sbuf1_i[2*j-1];
1432 rmap_i = rmap[is_no];
1433 if (!allcolumns[is_no]) cmap_i = cmap[is_no];
1434 subc = (Mat_SeqBAIJ*)submats[is_no]->data;
1435 imat_ilen = subc->ilen;
1436 imat_j = subc->j;
1437 imat_i = subc->i;
1438 if (!ijonly) imat_a = subc->a;
1439 max1 = sbuf1_i[2*j];
1440 for (k=0; k<max1; k++,ct1++) { /* for each recved block row */
1441 row = sbuf1_i[ct1];
1442
1443 if (allrows[is_no]) {
1444 row = sbuf1_i[ct1];
1445 } else {
1446 #if defined(PETSC_USE_CTABLE)
1447 ierr = PetscTableFind(rmap_i,row+1,&row);CHKERRQ(ierr);
1448 row--;
1449 if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1450 #else
1451 row = rmap_i[row];
1452 #endif
1453 }
1454 ilen = imat_ilen[row];
1455 mat_i = imat_i[row];
1456 if (!ijonly) mat_a = imat_a + mat_i*bs2;
1457 mat_j = imat_j + mat_i;
1458 max2 = rbuf2_i[ct1];
1459 if (!allcolumns[is_no]) {
1460 for (l=0; l<max2; l++,ct2++) {
1461 #if defined(PETSC_USE_CTABLE)
1462 ierr = PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);CHKERRQ(ierr);
1463 #else
1464 tcol = cmap_i[rbuf3_i[ct2]];
1465 #endif
1466 if (tcol) {
1467 *mat_j++ = tcol - 1;
1468 if (!ijonly) {
1469 ierr = PetscArraycpy(mat_a,rbuf4_i+ct2*bs2,bs2);CHKERRQ(ierr);
1470 mat_a += bs2;
1471 }
1472 ilen++;
1473 }
1474 }
1475 } else { /* allcolumns */
1476 for (l=0; l<max2; l++,ct2++) {
1477 *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
1478 if (!ijonly) {
1479 ierr = PetscArraycpy(mat_a,rbuf4_i+ct2*bs2,bs2);CHKERRQ(ierr);
1480 mat_a += bs2;
1481 }
1482 ilen++;
1483 }
1484 }
1485 imat_ilen[row] = ilen;
1486 }
1487 }
1488 }
1489
1490 if (!iscsorted) { /* sort column indices of the rows */
1491 MatScalar *work;
1492
1493 ierr = PetscMalloc1(bs2,&work);CHKERRQ(ierr);
1494 for (i=0; i<ismax; i++) {
1495 subc = (Mat_SeqBAIJ*)submats[i]->data;
1496 imat_ilen = subc->ilen;
1497 imat_j = subc->j;
1498 imat_i = subc->i;
1499 if (!ijonly) imat_a = subc->a;
1500 if (allcolumns[i]) continue;
1501
1502 jmax = nrow[i];
1503 for (j=0; j<jmax; j++) {
1504 mat_i = imat_i[j];
1505 mat_j = imat_j + mat_i;
1506 ilen = imat_ilen[j];
1507 if (ijonly) {
1508 ierr = PetscSortInt(ilen,mat_j);CHKERRQ(ierr);
1509 } else {
1510 mat_a = imat_a + mat_i*bs2;
1511 ierr = PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);CHKERRQ(ierr);
1512 }
1513 }
1514 }
1515 ierr = PetscFree(work);CHKERRQ(ierr);
1516 }
1517
1518 if (!ijonly) {
1519 ierr = PetscFree(r_status4);CHKERRQ(ierr);
1520 ierr = PetscFree(r_waits4);CHKERRQ(ierr);
1521 if (nrqr) {ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);}
1522 ierr = PetscFree(s_waits4);CHKERRQ(ierr);
1523 ierr = PetscFree(s_status4);CHKERRQ(ierr);
1524 }
1525
1526 /* Restore the indices */
1527 for (i=0; i<ismax; i++) {
1528 if (!allrows[i]) {
1529 ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
1530 }
1531 if (!allcolumns[i]) {
1532 ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
1533 }
1534 }
1535
1536 for (i=0; i<ismax; i++) {
1537 ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1538 ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1539 }
1540
1541 ierr = PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted);CHKERRQ(ierr);
1542 ierr = PetscFree5(row2proc,cmap,rmap,allcolumns,allrows);CHKERRQ(ierr);
1543
1544 if (!ijonly) {
1545 ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1546 ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1547
1548 for (i=0; i<nrqs; ++i) {
1549 ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
1550 }
1551 ierr = PetscFree(rbuf4);CHKERRQ(ierr);
1552 }
1553 c->ijonly = PETSC_FALSE; /* set back to the default */
1554 PetscFunctionReturn(0);
1555 }
1556