/*
   Defines parallel vector scatters.
*/
4
5 #include <petsc/private/vecscatterimpl.h>
6
7
VecScatterView_MPI(VecScatter ctx,PetscViewer viewer)8 PetscErrorCode VecScatterView_MPI(VecScatter ctx,PetscViewer viewer)
9 {
10 VecScatter_MPI_General *to =(VecScatter_MPI_General*)ctx->todata;
11 VecScatter_MPI_General *from=(VecScatter_MPI_General*)ctx->fromdata;
12 PetscErrorCode ierr;
13 PetscInt i;
14 PetscMPIInt rank;
15 PetscViewerFormat format;
16 PetscBool iascii;
17
18 PetscFunctionBegin;
19 ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);CHKERRQ(ierr);
20 if (iascii) {
21 ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)ctx),&rank);CHKERRQ(ierr);
22 ierr = PetscViewerGetFormat(viewer,&format);CHKERRQ(ierr);
23 if (format == PETSC_VIEWER_ASCII_INFO) {
24 PetscInt nsend_max,nrecv_max,lensend_max,lenrecv_max,alldata,itmp;
25
26 ierr = MPI_Reduce(&to->n,&nsend_max,1,MPIU_INT,MPI_MAX,0,PetscObjectComm((PetscObject)ctx));CHKERRQ(ierr);
27 ierr = MPI_Reduce(&from->n,&nrecv_max,1,MPIU_INT,MPI_MAX,0,PetscObjectComm((PetscObject)ctx));CHKERRQ(ierr);
28 itmp = to->starts[to->n+1];
29 ierr = MPI_Reduce(&itmp,&lensend_max,1,MPIU_INT,MPI_MAX,0,PetscObjectComm((PetscObject)ctx));CHKERRQ(ierr);
30 itmp = from->starts[from->n+1];
31 ierr = MPI_Reduce(&itmp,&lenrecv_max,1,MPIU_INT,MPI_MAX,0,PetscObjectComm((PetscObject)ctx));CHKERRQ(ierr);
32 ierr = MPI_Reduce(&itmp,&alldata,1,MPIU_INT,MPI_SUM,0,PetscObjectComm((PetscObject)ctx));CHKERRQ(ierr);
33
34 ierr = PetscViewerASCIIPrintf(viewer,"VecScatter statistics\n");CHKERRQ(ierr);
35 ierr = PetscViewerASCIIPrintf(viewer," Blocksize %D\n",to->bs);CHKERRQ(ierr);
36 ierr = PetscViewerASCIIPrintf(viewer," Maximum number sends %D\n",nsend_max);CHKERRQ(ierr);
37 ierr = PetscViewerASCIIPrintf(viewer," Maximum number receives %D\n",nrecv_max);CHKERRQ(ierr);
38 ierr = PetscViewerASCIIPrintf(viewer," Maximum data sent %D\n",(int)(lensend_max*to->bs*sizeof(PetscScalar)));CHKERRQ(ierr);
39 ierr = PetscViewerASCIIPrintf(viewer," Maximum data received %D\n",(int)(lenrecv_max*to->bs*sizeof(PetscScalar)));CHKERRQ(ierr);
40 ierr = PetscViewerASCIIPrintf(viewer," Total data sent %D\n",(int)(alldata*to->bs*sizeof(PetscScalar)));CHKERRQ(ierr);
41 } else {
42 ierr = PetscViewerASCIIPrintf(viewer," VecScatter Blocksize %D\n",to->bs);CHKERRQ(ierr);
43 ierr = PetscViewerASCIIPushSynchronized(viewer);CHKERRQ(ierr);
44 ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number sends = %D; Number to self = %D\n",rank,to->n,to->local.n);CHKERRQ(ierr);
45 if (to->n) {
46 for (i=0; i<to->n; i++) {
47 ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D length = %D to whom %d\n",rank,i,to->starts[i+1]-to->starts[i],to->procs[i]);CHKERRQ(ierr);
48 }
49 ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Now the indices for all remote sends (in order by process sent to)\n",rank);CHKERRQ(ierr);
50 for (i=0; i<to->starts[to->n]; i++) {
51 ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D \n",rank,to->indices[i]);CHKERRQ(ierr);
52 }
53 }
54
55 ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Number receives = %D; Number from self = %D\n",rank,from->n,from->local.n);CHKERRQ(ierr);
56 if (from->n) {
57 for (i=0; i<from->n; i++) {
58 ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D length %D from whom %d\n",rank,i,from->starts[i+1]-from->starts[i],from->procs[i]);CHKERRQ(ierr);
59 }
60
61 ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Now the indices for all remote receives (in order by process received from)\n",rank);CHKERRQ(ierr);
62 for (i=0; i<from->starts[from->n]; i++) {
63 ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] %D \n",rank,from->indices[i]);CHKERRQ(ierr);
64 }
65 }
66 if (to->local.n) {
67 ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Indices for local part of scatter\n",rank);CHKERRQ(ierr);
68 for (i=0; i<to->local.n; i++) { /* the to and from have the opposite meaning from what you would expect */
69 ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] From %D to %D \n",rank,to->local.vslots[i],from->local.vslots[i]);CHKERRQ(ierr);
70 }
71 }
72
73 for (i=0; i<to->msize; i++) {
74 if (to->sharedspacestarts[i+1] > to->sharedspacestarts[i]) {
75 ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Via shared memory to local memory partner %d count %d\n",rank,i,to->sharedspacestarts[i+1]-to->sharedspacestarts[i]);CHKERRQ(ierr);
76 }
77 }
78 for (i=0; i<from->msize; i++) {
79 if (from->sharedspacestarts[i+1] > from->sharedspacestarts[i]) {
80 ierr = PetscViewerASCIISynchronizedPrintf(viewer,"[%d] Via shared memory from local memory partner %d count %d\n",rank,i,from->sharedspacestarts[i+1]-from->sharedspacestarts[i]);CHKERRQ(ierr);
81 }
82 }
83
84 ierr = PetscViewerFlush(viewer);CHKERRQ(ierr);
85 ierr = PetscViewerASCIIPopSynchronized(viewer);CHKERRQ(ierr);
86
87 ierr = PetscViewerASCIIPrintf(viewer,"Method used to implement the VecScatter: ");CHKERRQ(ierr);
88 }
89 }
90 PetscFunctionReturn(0);
91 }
92
93 /* -----------------------------------------------------------------------------------*/
94 /*
95 The next routine determines what part of the local part of the scatter is an
96 exact copy of values into their current location. We check this here and
97 then know that we need not perform that portion of the scatter when the vector is
98 scattering to itself with INSERT_VALUES.
99
100 This is currently not used but would speed up, for example DMLocalToLocalBegin/End()
101
102 */
VecScatterLocalOptimize_Private(VecScatter scatter,VecScatter_Seq_General * to,VecScatter_Seq_General * from)103 PetscErrorCode VecScatterLocalOptimize_Private(VecScatter scatter,VecScatter_Seq_General *to,VecScatter_Seq_General *from)
104 {
105 PetscInt n = to->n,n_nonmatching = 0,i,*to_slots = to->vslots,*from_slots = from->vslots;
106 PetscErrorCode ierr;
107 PetscInt *nto_slots,*nfrom_slots,j = 0;
108
109 PetscFunctionBegin;
110 for (i=0; i<n; i++) {
111 if (to_slots[i] != from_slots[i]) n_nonmatching++;
112 }
113
114 if (!n_nonmatching) {
115 to->nonmatching_computed = PETSC_TRUE;
116 to->n_nonmatching = from->n_nonmatching = 0;
117 ierr = PetscInfo1(scatter,"Reduced %D to 0\n", n);CHKERRQ(ierr);
118 } else if (n_nonmatching == n) {
119 to->nonmatching_computed = PETSC_FALSE;
120 ierr = PetscInfo(scatter,"All values non-matching\n");CHKERRQ(ierr);
121 } else {
122 to->nonmatching_computed= PETSC_TRUE;
123 to->n_nonmatching = from->n_nonmatching = n_nonmatching;
124
125 ierr = PetscMalloc1(n_nonmatching,&nto_slots);CHKERRQ(ierr);
126 ierr = PetscMalloc1(n_nonmatching,&nfrom_slots);CHKERRQ(ierr);
127
128 to->slots_nonmatching = nto_slots;
129 from->slots_nonmatching = nfrom_slots;
130 for (i=0; i<n; i++) {
131 if (to_slots[i] != from_slots[i]) {
132 nto_slots[j] = to_slots[i];
133 nfrom_slots[j] = from_slots[i];
134 j++;
135 }
136 }
137 ierr = PetscInfo2(scatter,"Reduced %D to %D\n",n,n_nonmatching);CHKERRQ(ierr);
138 }
139 PetscFunctionReturn(0);
140 }
141
142 /* --------------------------------------------------------------------------------------*/
143
144 /* -------------------------------------------------------------------------------------*/
/*
   VecScatterDestroy_PtoP_MPI3 - frees all memory and MPI resources held by a
   point-to-point scatter context that uses MPI-3 shared-memory windows.

   Order matters: the persistent requests created with MPI_Send_init()/
   MPI_Recv_init() must be released with MPI_Request_free() before the request
   arrays themselves are freed, and the shared windows must be freed before the
   index arrays that describe them.
*/
PetscErrorCode VecScatterDestroy_PtoP_MPI3(VecScatter ctx)
{
  VecScatter_MPI_General *to   = (VecScatter_MPI_General*)ctx->todata;
  VecScatter_MPI_General *from = (VecScatter_MPI_General*)ctx->fromdata;
  PetscErrorCode         ierr;
  PetscInt               i;

  PetscFunctionBegin;
  /* release MPI resources obtained with MPI_Send_init() and MPI_Recv_init() */
  if (to->requests) {
    for (i=0; i<to->n; i++) {
      ierr = MPI_Request_free(to->requests + i);CHKERRQ(ierr);
    }
  }
  if (to->rev_requests) {
    for (i=0; i<to->n; i++) {
      ierr = MPI_Request_free(to->rev_requests + i);CHKERRQ(ierr);
    }
  }
  if (from->requests) {
    for (i=0; i<from->n; i++) {
      ierr = MPI_Request_free(from->requests + i);CHKERRQ(ierr);
    }
  }
  if (from->rev_requests) {
    for (i=0; i<from->n; i++) {
      ierr = MPI_Request_free(from->rev_requests + i);CHKERRQ(ierr);
    }
  }
  /* MPI-3 shared-memory windows (collective free over the node communicator) */
  if (to->sharedwin != MPI_WIN_NULL) {ierr = MPI_Win_free(&to->sharedwin);CHKERRQ(ierr);}
  if (from->sharedwin != MPI_WIN_NULL) {ierr = MPI_Win_free(&from->sharedwin);CHKERRQ(ierr);}
  ierr = PetscFree(to->sharedspaces);CHKERRQ(ierr);
  ierr = PetscFree(to->sharedspacesoffset);CHKERRQ(ierr);
  ierr = PetscFree(to->sharedspaceindices);CHKERRQ(ierr);
  ierr = PetscFree(to->sharedspacestarts);CHKERRQ(ierr);

  ierr = PetscFree(from->sharedspaceindices);CHKERRQ(ierr);
  ierr = PetscFree(from->sharedspaces);CHKERRQ(ierr);
  ierr = PetscFree(from->sharedspacesoffset);CHKERRQ(ierr);
  ierr = PetscFree(from->sharedspacestarts);CHKERRQ(ierr);

  /* local (same-process) scatter bookkeeping */
  ierr = PetscFree(to->local.vslots);CHKERRQ(ierr);
  ierr = PetscFree(from->local.vslots);CHKERRQ(ierr);
  ierr = PetscFree(to->local.slots_nonmatching);CHKERRQ(ierr);
  ierr = PetscFree(from->local.slots_nonmatching);CHKERRQ(ierr);
  ierr = PetscFree(to->rev_requests);CHKERRQ(ierr);
  ierr = PetscFree(from->rev_requests);CHKERRQ(ierr);
  ierr = PetscFree(to->requests);CHKERRQ(ierr);
  ierr = PetscFree(from->requests);CHKERRQ(ierr);
  /* values/indices/starts/procs were obtained with a single PetscMalloc4;
     sstatus/rstatus with a single PetscMalloc2 (only on the "to" side) */
  ierr = PetscFree4(to->values,to->indices,to->starts,to->procs);CHKERRQ(ierr);
  ierr = PetscFree2(to->sstatus,to->rstatus);CHKERRQ(ierr);
  ierr = PetscFree4(from->values,from->indices,from->starts,from->procs);CHKERRQ(ierr);
  ierr = VecScatterMemcpyPlanDestroy_PtoP(to,from);CHKERRQ(ierr);
  ierr = PetscFree(from);CHKERRQ(ierr);
  ierr = PetscFree(to);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
202
203 /* --------------------------------------------------------------------------------------*/
204
/*
   VecScatterCopy_PtoP_X - duplicates a point-to-point scatter context,
   including its persistent MPI requests and its MPI-3 shared-memory window.

   The "out" scatter gets freshly allocated buffers and its own window; only the
   index/offset descriptions are copied from "in".
*/
PetscErrorCode VecScatterCopy_PtoP_X(VecScatter in,VecScatter out)
{
  VecScatter_MPI_General *in_to   = (VecScatter_MPI_General*)in->todata;
  VecScatter_MPI_General *in_from = (VecScatter_MPI_General*)in->fromdata,*out_to,*out_from;
  PetscErrorCode         ierr;
  PetscInt               ny,bs = in_from->bs,jj;
  PetscShmComm           scomm;
  MPI_Comm               mscomm;
  MPI_Info               info;

  PetscFunctionBegin;
  out->ops->begin   = in->ops->begin;
  out->ops->end     = in->ops->end;
  out->ops->copy    = in->ops->copy;
  out->ops->destroy = in->ops->destroy;
  out->ops->view    = in->ops->view;

  /* allocate entire send scatter context */
  ierr = PetscNewLog(out,&out_to);CHKERRQ(ierr);
  out_to->sharedwin = MPI_WIN_NULL;
  ierr = PetscNewLog(out,&out_from);CHKERRQ(ierr);
  out_from->sharedwin = MPI_WIN_NULL;

  /* starts[] has n+1 entries, so starts[n] is the total number of send entries */
  ny                = in_to->starts[in_to->n];
  out_to->n         = in_to->n;
  out_to->format    = in_to->format;

  ierr = PetscMalloc1(out_to->n,&out_to->requests);CHKERRQ(ierr);
  ierr = PetscMalloc4(bs*ny,&out_to->values,ny,&out_to->indices,out_to->n+1,&out_to->starts,out_to->n,&out_to->procs);CHKERRQ(ierr);
  ierr = PetscMalloc2(PetscMax(in_to->n,in_from->n),&out_to->sstatus,PetscMax(in_to->n,in_from->n),&out_to->rstatus);CHKERRQ(ierr);
  ierr = PetscArraycpy(out_to->indices,in_to->indices,ny);CHKERRQ(ierr);
  ierr = PetscArraycpy(out_to->starts,in_to->starts,out_to->n+1);CHKERRQ(ierr);
  ierr = PetscArraycpy(out_to->procs,in_to->procs,out_to->n);CHKERRQ(ierr);

  out->todata                        = (void*)out_to;
  out_to->local.n                    = in_to->local.n;
  out_to->local.nonmatching_computed = PETSC_FALSE;
  out_to->local.n_nonmatching        = 0;
  out_to->local.slots_nonmatching    = NULL;
  if (in_to->local.n) {
    ierr = PetscMalloc1(in_to->local.n,&out_to->local.vslots);CHKERRQ(ierr);
    ierr = PetscMalloc1(in_from->local.n,&out_from->local.vslots);CHKERRQ(ierr);
    ierr = PetscArraycpy(out_to->local.vslots,in_to->local.vslots,in_to->local.n);CHKERRQ(ierr);
    ierr = PetscArraycpy(out_from->local.vslots,in_from->local.vslots,in_from->local.n);CHKERRQ(ierr);
  } else {
    out_to->local.vslots   = NULL;
    out_from->local.vslots = NULL;
  }

  /* allocate entire receive context */
  ierr = VecScatterMemcpyPlanCopy(&in_to->local.memcpy_plan,&out_to->local.memcpy_plan);CHKERRQ(ierr);
  ierr = VecScatterMemcpyPlanCopy(&in_from->local.memcpy_plan,&out_from->local.memcpy_plan);CHKERRQ(ierr);
  ierr = VecScatterMemcpyPlanCopy(&in_to->memcpy_plan,&out_to->memcpy_plan);CHKERRQ(ierr);
  ierr = VecScatterMemcpyPlanCopy(&in_from->memcpy_plan,&out_from->memcpy_plan);CHKERRQ(ierr);

  out_from->format = in_from->format;
  ny               = in_from->starts[in_from->n];
  out_from->n      = in_from->n;

  ierr = PetscMalloc1(out_from->n,&out_from->requests);CHKERRQ(ierr);
  ierr = PetscMalloc4(ny*bs,&out_from->values,ny,&out_from->indices,out_from->n+1,&out_from->starts,out_from->n,&out_from->procs);CHKERRQ(ierr);
  ierr = PetscArraycpy(out_from->indices,in_from->indices,ny);CHKERRQ(ierr);
  ierr = PetscArraycpy(out_from->starts,in_from->starts,out_from->n+1);CHKERRQ(ierr);
  ierr = PetscArraycpy(out_from->procs,in_from->procs,out_from->n);CHKERRQ(ierr);

  out->fromdata                        = (void*)out_from;
  out_from->local.n                    = in_from->local.n;
  out_from->local.nonmatching_computed = PETSC_FALSE;
  out_from->local.n_nonmatching        = 0;
  out_from->local.slots_nonmatching    = NULL;

  /*
      set up the request arrays for use with isend_init() and irecv_init()
  */
  {
    PetscMPIInt tag;
    MPI_Comm    comm;
    PetscInt    *sstarts = out_to->starts,  *rstarts = out_from->starts;
    PetscMPIInt *sprocs  = out_to->procs,   *rprocs  = out_from->procs;
    PetscInt    i;
    MPI_Request *swaits  = out_to->requests,*rwaits  = out_from->requests;
    MPI_Request *rev_swaits,*rev_rwaits;
    PetscScalar *Ssvalues = out_to->values, *Srvalues = out_from->values;

    ierr = PetscMalloc1(in_to->n,&out_to->rev_requests);CHKERRQ(ierr);
    ierr = PetscMalloc1(in_from->n,&out_from->rev_requests);CHKERRQ(ierr);

    rev_rwaits = out_to->rev_requests;
    rev_swaits = out_from->rev_requests;

    out_from->bs = out_to->bs = bs;
    tag          = ((PetscObject)out)->tag;
    ierr         = PetscObjectGetComm((PetscObject)out,&comm);CHKERRQ(ierr);

    /* Register the receives that you will use later (sends for scatter reverse) */
    for (i=0; i<out_from->n; i++) {
      ierr = MPI_Recv_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);CHKERRQ(ierr);
      ierr = MPI_Send_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rev_swaits+i);CHKERRQ(ierr);
    }

    /* forward-scatter sends */
    for (i=0; i<out_to->n; i++) {
      ierr = MPI_Send_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);CHKERRQ(ierr);
    }

    /* Register receives for scatter reverse */
    for (i=0; i<out_to->n; i++) {
      ierr = MPI_Recv_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,rev_rwaits+i);CHKERRQ(ierr);
    }
  }

  /* since the to and from data structures are not symmetric for shared memory copies we insure they always listed in "standard" form */
  /* NOTE(review): this tests the window handle with !, while the destroy routine
     compares against MPI_WIN_NULL; assumes MPI_WIN_NULL == 0 — TODO confirm on
     all MPI implementations */
  if (!in_to->sharedwin) {
    VecScatter_MPI_General *totmp = in_to,*out_totmp = out_to;
    in_to    = in_from;
    in_from  = totmp;
    out_to   = out_from;
    out_from = out_totmp;
  }

  /* copy the to parts for the shared memory copies between processes */
  out_to->sharedcnt = in_to->sharedcnt;
  out_to->msize     = in_to->msize;
  ierr = PetscMalloc1(out_to->msize+1,&out_to->sharedspacestarts);CHKERRQ(ierr);
  ierr = PetscArraycpy(out_to->sharedspacestarts,in_to->sharedspacestarts,out_to->msize+1);CHKERRQ(ierr);
  ierr = PetscMalloc1(out_to->sharedcnt,&out_to->sharedspaceindices);CHKERRQ(ierr);
  ierr = PetscArraycpy(out_to->sharedspaceindices,in_to->sharedspaceindices,out_to->sharedcnt);CHKERRQ(ierr);
  /* NOTE(review): out_to->sharedspacesoffset is never allocated/copied here,
     unlike the from side below — verify the "to" side never reads it */

  ierr = PetscShmCommGet(PetscObjectComm((PetscObject)in),&scomm);CHKERRQ(ierr);
  ierr = PetscShmCommGetMpiShmComm(scomm,&mscomm);CHKERRQ(ierr);
  ierr = MPI_Info_create(&info);CHKERRQ(ierr);
  ierr = MPI_Info_set(info, "alloc_shared_noncontig", "true");CHKERRQ(ierr);
  /* collective over the node communicator: every rank allocates its slice */
  ierr = MPIU_Win_allocate_shared(bs*out_to->sharedcnt*sizeof(PetscScalar),sizeof(PetscScalar),info,mscomm,&out_to->sharedspace,&out_to->sharedwin);CHKERRQ(ierr);
  ierr = MPI_Info_free(&info);CHKERRQ(ierr);

  /* copy the from parts for the shared memory copies between processes */
  out_from->sharedcnt = in_from->sharedcnt;
  out_from->msize     = in_from->msize;
  ierr = PetscMalloc1(out_from->msize+1,&out_from->sharedspacestarts);CHKERRQ(ierr);
  ierr = PetscArraycpy(out_from->sharedspacestarts,in_from->sharedspacestarts,out_from->msize+1);CHKERRQ(ierr);
  ierr = PetscMalloc1(out_from->sharedcnt,&out_from->sharedspaceindices);CHKERRQ(ierr);
  ierr = PetscArraycpy(out_from->sharedspaceindices,in_from->sharedspaceindices,out_from->sharedcnt);CHKERRQ(ierr);
  ierr = PetscMalloc1(out_from->msize,&out_from->sharedspacesoffset);CHKERRQ(ierr);
  ierr = PetscArraycpy(out_from->sharedspacesoffset,in_from->sharedspacesoffset,out_from->msize);CHKERRQ(ierr);
  ierr = PetscCalloc1(out_from->msize,&out_from->sharedspaces);CHKERRQ(ierr);
  /* look up each node partner's slice of the new window */
  for (jj=0; jj<out_from->msize; jj++) {
    MPI_Aint    isize;
    PetscMPIInt disp_unit;
    ierr = MPIU_Win_shared_query(out_to->sharedwin,jj,&isize,&disp_unit,&out_from->sharedspaces[jj]);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
356
/*
   VecScatterCopy_PtoP_AllToAll - duplicates a point-to-point scatter context for
   the all-to-all variant. Unlike VecScatterCopy_PtoP_X no persistent requests
   or shared-memory windows are set up, only the buffers and index descriptions.
*/
PetscErrorCode VecScatterCopy_PtoP_AllToAll(VecScatter in,VecScatter out)
{
  VecScatter_MPI_General *in_to   = (VecScatter_MPI_General*)in->todata;
  VecScatter_MPI_General *in_from = (VecScatter_MPI_General*)in->fromdata,*out_to,*out_from;
  PetscErrorCode         ierr;
  PetscInt               ny,bs = in_from->bs;
  PetscMPIInt            size;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)in),&size);CHKERRQ(ierr);

  out->ops->begin   = in->ops->begin;
  out->ops->end     = in->ops->end;
  out->ops->copy    = in->ops->copy;
  out->ops->destroy = in->ops->destroy;
  out->ops->view    = in->ops->view;

  /* allocate entire send scatter context */
  ierr = PetscNewLog(out,&out_to);CHKERRQ(ierr);
  out_to->sharedwin = MPI_WIN_NULL;
  ierr = PetscNewLog(out,&out_from);CHKERRQ(ierr);
  out_from->sharedwin = MPI_WIN_NULL;

  /* starts[] has n+1 entries; starts[n] is the total number of send entries */
  ny                = in_to->starts[in_to->n];
  out_to->n         = in_to->n;
  out_to->format    = in_to->format;

  ierr = PetscMalloc1(out_to->n,&out_to->requests);CHKERRQ(ierr);
  ierr = PetscMalloc4(bs*ny,&out_to->values,ny,&out_to->indices,out_to->n+1,&out_to->starts,out_to->n,&out_to->procs);CHKERRQ(ierr);
  ierr = PetscMalloc2(PetscMax(in_to->n,in_from->n),&out_to->sstatus,PetscMax(in_to->n,in_from->n),&out_to->rstatus);CHKERRQ(ierr);
  ierr = PetscArraycpy(out_to->indices,in_to->indices,ny);CHKERRQ(ierr);
  ierr = PetscArraycpy(out_to->starts,in_to->starts,out_to->n+1);CHKERRQ(ierr);
  ierr = PetscArraycpy(out_to->procs,in_to->procs,out_to->n);CHKERRQ(ierr);

  out->todata                        = (void*)out_to;
  out_to->local.n                    = in_to->local.n;
  out_to->local.nonmatching_computed = PETSC_FALSE;
  out_to->local.n_nonmatching        = 0;
  out_to->local.slots_nonmatching    = NULL;
  if (in_to->local.n) {
    ierr = PetscMalloc1(in_to->local.n,&out_to->local.vslots);CHKERRQ(ierr);
    ierr = PetscMalloc1(in_from->local.n,&out_from->local.vslots);CHKERRQ(ierr);
    ierr = PetscArraycpy(out_to->local.vslots,in_to->local.vslots,in_to->local.n);CHKERRQ(ierr);
    ierr = PetscArraycpy(out_from->local.vslots,in_from->local.vslots,in_from->local.n);CHKERRQ(ierr);
  } else {
    out_to->local.vslots   = NULL;
    out_from->local.vslots = NULL;
  }

  /* allocate entire receive context */
  ierr = VecScatterMemcpyPlanCopy_PtoP(in_to,in_from,out_to,out_from);CHKERRQ(ierr);

  out_from->format = in_from->format;
  ny               = in_from->starts[in_from->n];
  out_from->n      = in_from->n;

  ierr = PetscMalloc1(out_from->n,&out_from->requests);CHKERRQ(ierr);
  ierr = PetscMalloc4(ny*bs,&out_from->values,ny,&out_from->indices,out_from->n+1,&out_from->starts,out_from->n,&out_from->procs);CHKERRQ(ierr);
  ierr = PetscArraycpy(out_from->indices,in_from->indices,ny);CHKERRQ(ierr);
  ierr = PetscArraycpy(out_from->starts,in_from->starts,out_from->n+1);CHKERRQ(ierr);
  ierr = PetscArraycpy(out_from->procs,in_from->procs,out_from->n);CHKERRQ(ierr);

  out->fromdata                        = (void*)out_from;
  out_from->local.n                    = in_from->local.n;
  out_from->local.nonmatching_computed = PETSC_FALSE;
  out_from->local.n_nonmatching        = 0;
  out_from->local.slots_nonmatching    = NULL;

  PetscFunctionReturn(0);
}
427 /* --------------------------------------------------------------------------------------------------
428 Packs and unpacks the message data into send or from receive buffers.
429
430 These could be generated automatically.
431
432 Fortran kernels etc. could be used.
433 */
Pack_1(PetscInt n,const PetscInt * indicesx,const PetscScalar * x,PetscScalar * y,PetscInt bs)434 PETSC_STATIC_INLINE void Pack_1(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
435 {
436 PetscInt i;
437 for (i=0; i<n; i++) y[i] = x[indicesx[i]];
438 }
439
/* UnPack_1 - scatters the contiguous receive buffer x (n entries, block size 1)
   into y at positions indicesy, combining according to addv. */
PETSC_STATIC_INLINE PetscErrorCode UnPack_1(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
{
  PetscInt i;

  PetscFunctionBegin;
  switch (addv) {
  case INSERT_VALUES:
  case INSERT_ALL_VALUES:
    for (i=0; i<n; i++) y[indicesy[i]] = x[i];
    break;
  case ADD_VALUES:
  case ADD_ALL_VALUES:
    for (i=0; i<n; i++) y[indicesy[i]] += x[i];
    break;
#if !defined(PETSC_USE_COMPLEX)
  case MAX_VALUES:
    for (i=0; i<n; i++) y[indicesy[i]] = PetscMax(y[indicesy[i]],x[i]);
    /* fall through to the shared break below */
#else
  case MAX_VALUES: /* no max for complex scalars; silently a no-op */
#endif
  case NOT_SET_VALUES:
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
  }
  PetscFunctionReturn(0);
}
467
/* Scatter_1 - copies n entries (block size 1) from x at indicesx directly into
   y at indicesy, combining according to addv; used for the local part. */
PETSC_STATIC_INLINE PetscErrorCode Scatter_1(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
{
  PetscInt i;

  PetscFunctionBegin;
  switch (addv) {
  case INSERT_VALUES:
  case INSERT_ALL_VALUES:
    for (i=0; i<n; i++) y[indicesy[i]] = x[indicesx[i]];
    break;
  case ADD_VALUES:
  case ADD_ALL_VALUES:
    for (i=0; i<n; i++) y[indicesy[i]] += x[indicesx[i]];
    break;
#if !defined(PETSC_USE_COMPLEX)
  case MAX_VALUES:
    for (i=0; i<n; i++) y[indicesy[i]] = PetscMax(y[indicesy[i]],x[indicesx[i]]);
    /* fall through to the shared break below */
#else
  case MAX_VALUES: /* no max for complex scalars; silently a no-op */
#endif
  case NOT_SET_VALUES:
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
  }
  PetscFunctionReturn(0);
}
495
496 /* ----------------------------------------------------------------------------------------------- */
Pack_2(PetscInt n,const PetscInt * indicesx,const PetscScalar * x,PetscScalar * y,PetscInt bs)497 PETSC_STATIC_INLINE void Pack_2(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
498 {
499 PetscInt i,idx;
500
501 for (i=0; i<n; i++) {
502 idx = *indicesx++;
503 y[0] = x[idx];
504 y[1] = x[idx+1];
505 y += 2;
506 }
507 }
508
/* UnPack_2 - scatters n blocks of 2 scalars from the contiguous buffer x into
   y at block starts indicesy, combining according to addv. */
PETSC_STATIC_INLINE PetscErrorCode UnPack_2(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
{
  PetscInt i,idy;

  PetscFunctionBegin;
  switch (addv) {
  case INSERT_VALUES:
  case INSERT_ALL_VALUES:
    for (i=0; i<n; i++) {
      idy      = *indicesy++;
      y[idy]   = x[0];
      y[idy+1] = x[1];
      x       += 2;
    }
    break;
  case ADD_VALUES:
  case ADD_ALL_VALUES:
    for (i=0; i<n; i++) {
      idy       = *indicesy++;
      y[idy]   += x[0];
      y[idy+1] += x[1];
      x        += 2;
    }
    break;
#if !defined(PETSC_USE_COMPLEX)
  case MAX_VALUES:
    for (i=0; i<n; i++) {
      idy      = *indicesy++;
      y[idy]   = PetscMax(y[idy],x[0]);
      y[idy+1] = PetscMax(y[idy+1],x[1]);
      x       += 2;
    }
    /* fall through to the shared break below */
#else
  case MAX_VALUES: /* no max for complex scalars; silently a no-op */
#endif
  case NOT_SET_VALUES:
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
  }
  PetscFunctionReturn(0);
}
551
/* Scatter_2 - copies n blocks of 2 scalars from x (block starts indicesx) into
   y (block starts indicesy), combining according to addv. */
PETSC_STATIC_INLINE PetscErrorCode Scatter_2(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
{
  PetscInt i,idx,idy;

  PetscFunctionBegin;
  switch (addv) {
  case INSERT_VALUES:
  case INSERT_ALL_VALUES:
    for (i=0; i<n; i++) {
      idx      = *indicesx++;
      idy      = *indicesy++;
      y[idy]   = x[idx];
      y[idy+1] = x[idx+1];
    }
    break;
  case ADD_VALUES:
  case ADD_ALL_VALUES:
    for (i=0; i<n; i++) {
      idx       = *indicesx++;
      idy       = *indicesy++;
      y[idy]   += x[idx];
      y[idy+1] += x[idx+1];
    }
    break;
#if !defined(PETSC_USE_COMPLEX)
  case MAX_VALUES:
    for (i=0; i<n; i++) {
      idx      = *indicesx++;
      idy      = *indicesy++;
      y[idy]   = PetscMax(y[idy],x[idx]);
      y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
    }
    /* fall through to the shared break below */
#else
  case MAX_VALUES: /* no max for complex scalars; silently a no-op */
#endif
  case NOT_SET_VALUES:
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
  }
  PetscFunctionReturn(0);
}
594 /* ----------------------------------------------------------------------------------------------- */
Pack_3(PetscInt n,const PetscInt * indicesx,const PetscScalar * x,PetscScalar * y,PetscInt bs)595 PETSC_STATIC_INLINE void Pack_3(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
596 {
597 PetscInt i,idx;
598
599 for (i=0; i<n; i++) {
600 idx = *indicesx++;
601 y[0] = x[idx];
602 y[1] = x[idx+1];
603 y[2] = x[idx+2];
604 y += 3;
605 }
606 }
/* UnPack_3 - scatters n blocks of 3 scalars from the contiguous buffer x into
   y at block starts indicesy, combining according to addv. */
PETSC_STATIC_INLINE PetscErrorCode UnPack_3(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
{
  PetscInt i,idy;

  PetscFunctionBegin;
  switch (addv) {
  case INSERT_VALUES:
  case INSERT_ALL_VALUES:
    for (i=0; i<n; i++) {
      idy      = *indicesy++;
      y[idy]   = x[0];
      y[idy+1] = x[1];
      y[idy+2] = x[2];
      x       += 3;
    }
    break;
  case ADD_VALUES:
  case ADD_ALL_VALUES:
    for (i=0; i<n; i++) {
      idy       = *indicesy++;
      y[idy]   += x[0];
      y[idy+1] += x[1];
      y[idy+2] += x[2];
      x        += 3;
    }
    break;
#if !defined(PETSC_USE_COMPLEX)
  case MAX_VALUES:
    for (i=0; i<n; i++) {
      idy      = *indicesy++;
      y[idy]   = PetscMax(y[idy],x[0]);
      y[idy+1] = PetscMax(y[idy+1],x[1]);
      y[idy+2] = PetscMax(y[idy+2],x[2]);
      x       += 3;
    }
    /* fall through to the shared break below */
#else
  case MAX_VALUES: /* no max for complex scalars; silently a no-op */
#endif
  case NOT_SET_VALUES:
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
  }
  PetscFunctionReturn(0);
}
652
/* Scatter_3 - copies n blocks of 3 scalars from x (block starts indicesx) into
   y (block starts indicesy), combining according to addv. */
PETSC_STATIC_INLINE PetscErrorCode Scatter_3(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
{
  PetscInt i,idx,idy;

  PetscFunctionBegin;
  switch (addv) {
  case INSERT_VALUES:
  case INSERT_ALL_VALUES:
    for (i=0; i<n; i++) {
      idx      = *indicesx++;
      idy      = *indicesy++;
      y[idy]   = x[idx];
      y[idy+1] = x[idx+1];
      y[idy+2] = x[idx+2];
    }
    break;
  case ADD_VALUES:
  case ADD_ALL_VALUES:
    for (i=0; i<n; i++) {
      idx       = *indicesx++;
      idy       = *indicesy++;
      y[idy]   += x[idx];
      y[idy+1] += x[idx+1];
      y[idy+2] += x[idx+2];
    }
    break;
#if !defined(PETSC_USE_COMPLEX)
  case MAX_VALUES:
    for (i=0; i<n; i++) {
      idx      = *indicesx++;
      idy      = *indicesy++;
      y[idy]   = PetscMax(y[idy],x[idx]);
      y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
      y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
    }
    /* fall through to the shared break below */
#else
  case MAX_VALUES: /* no max for complex scalars; silently a no-op */
#endif
  case NOT_SET_VALUES:
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
  }
  PetscFunctionReturn(0);
}
698 /* ----------------------------------------------------------------------------------------------- */
Pack_4(PetscInt n,const PetscInt * indicesx,const PetscScalar * x,PetscScalar * y,PetscInt bs)699 PETSC_STATIC_INLINE void Pack_4(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
700 {
701 PetscInt i,idx;
702
703 for (i=0; i<n; i++) {
704 idx = *indicesx++;
705 y[0] = x[idx];
706 y[1] = x[idx+1];
707 y[2] = x[idx+2];
708 y[3] = x[idx+3];
709 y += 4;
710 }
711 }
/* UnPack_4 - scatters n blocks of 4 scalars from the contiguous buffer x into
   y at block starts indicesy, combining according to addv. */
PETSC_STATIC_INLINE PetscErrorCode UnPack_4(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
{
  PetscInt i,idy;

  PetscFunctionBegin;
  switch (addv) {
  case INSERT_VALUES:
  case INSERT_ALL_VALUES:
    for (i=0; i<n; i++) {
      idy      = *indicesy++;
      y[idy]   = x[0];
      y[idy+1] = x[1];
      y[idy+2] = x[2];
      y[idy+3] = x[3];
      x       += 4;
    }
    break;
  case ADD_VALUES:
  case ADD_ALL_VALUES:
    for (i=0; i<n; i++) {
      idy       = *indicesy++;
      y[idy]   += x[0];
      y[idy+1] += x[1];
      y[idy+2] += x[2];
      y[idy+3] += x[3];
      x        += 4;
    }
    break;
#if !defined(PETSC_USE_COMPLEX)
  case MAX_VALUES:
    for (i=0; i<n; i++) {
      idy      = *indicesy++;
      y[idy]   = PetscMax(y[idy],x[0]);
      y[idy+1] = PetscMax(y[idy+1],x[1]);
      y[idy+2] = PetscMax(y[idy+2],x[2]);
      y[idy+3] = PetscMax(y[idy+3],x[3]);
      x       += 4;
    }
    /* fall through to the shared break below */
#else
  case MAX_VALUES: /* no max for complex scalars; silently a no-op */
#endif
  case NOT_SET_VALUES:
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
  }
  PetscFunctionReturn(0);
}
760
/* Scatter_4 - copies n blocks of 4 scalars from x (block starts indicesx) into
   y (block starts indicesy), combining according to addv. */
PETSC_STATIC_INLINE PetscErrorCode Scatter_4(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
{
  PetscInt i,idx,idy;

  PetscFunctionBegin;
  switch (addv) {
  case INSERT_VALUES:
  case INSERT_ALL_VALUES:
    for (i=0; i<n; i++) {
      idx      = *indicesx++;
      idy      = *indicesy++;
      y[idy]   = x[idx];
      y[idy+1] = x[idx+1];
      y[idy+2] = x[idx+2];
      y[idy+3] = x[idx+3];
    }
    break;
  case ADD_VALUES:
  case ADD_ALL_VALUES:
    for (i=0; i<n; i++) {
      idx       = *indicesx++;
      idy       = *indicesy++;
      y[idy]   += x[idx];
      y[idy+1] += x[idx+1];
      y[idy+2] += x[idx+2];
      y[idy+3] += x[idx+3];
    }
    break;
#if !defined(PETSC_USE_COMPLEX)
  case MAX_VALUES:
    for (i=0; i<n; i++) {
      idx      = *indicesx++;
      idy      = *indicesy++;
      y[idy]   = PetscMax(y[idy],x[idx]);
      y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
      y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
      y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
    }
    /* fall through to the shared break below */
#else
  case MAX_VALUES: /* no max for complex scalars; silently a no-op */
#endif
  case NOT_SET_VALUES:
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
  }
  PetscFunctionReturn(0);
}
809 /* ----------------------------------------------------------------------------------------------- */
Pack_5(PetscInt n,const PetscInt * indicesx,const PetscScalar * x,PetscScalar * y,PetscInt bs)810 PETSC_STATIC_INLINE void Pack_5(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
811 {
812 PetscInt i,idx;
813
814 for (i=0; i<n; i++) {
815 idx = *indicesx++;
816 y[0] = x[idx];
817 y[1] = x[idx+1];
818 y[2] = x[idx+2];
819 y[3] = x[idx+3];
820 y[4] = x[idx+4];
821 y += 5;
822 }
823 }
824
/*
   UnPack_5 - Unpacks the contiguous message buffer x into y with block size 5:
   the i-th group of 5 scalars in x is combined into y starting at indicesy[i],
   according to addv.  bs is unused: this variant is hard-wired for block size 5.
*/
PETSC_STATIC_INLINE PetscErrorCode UnPack_5(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
{
  PetscInt i,idy;

  PetscFunctionBegin;
  switch (addv) {
  case INSERT_VALUES:
  case INSERT_ALL_VALUES:
    for (i=0; i<n; i++) {
      idy = *indicesy++;
      y[idy] = x[0];
      y[idy+1] = x[1];
      y[idy+2] = x[2];
      y[idy+3] = x[3];
      y[idy+4] = x[4];
      x += 5;
    }
    break;
  case ADD_VALUES:
  case ADD_ALL_VALUES:
    for (i=0; i<n; i++) {
      idy = *indicesy++;
      y[idy] += x[0];
      y[idy+1] += x[1];
      y[idy+2] += x[2];
      y[idy+3] += x[3];
      y[idy+4] += x[4];
      x += 5;
    }
    break;
#if !defined(PETSC_USE_COMPLEX)
  case MAX_VALUES:
    for (i=0; i<n; i++) {
      idy = *indicesy++;
      y[idy] = PetscMax(y[idy],x[0]);
      y[idy+1] = PetscMax(y[idy+1],x[1]);
      y[idy+2] = PetscMax(y[idy+2],x[2]);
      y[idy+3] = PetscMax(y[idy+3],x[3]);
      y[idy+4] = PetscMax(y[idy+4],x[4]);
      x += 5;
    }
    /* fall through */
#else
  case MAX_VALUES: /* MAX is not defined for complex scalars; falls through and does nothing */
#endif
  case NOT_SET_VALUES:
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
  }
  PetscFunctionReturn(0);
}
876
/*
   Scatter_5 - Copies blocks of 5 consecutive scalars from x (block i starts at
   indicesx[i]) into y (block i starts at indicesy[i]), combining according to
   addv.  bs is unused: this variant is hard-wired for block size 5.
*/
PETSC_STATIC_INLINE PetscErrorCode Scatter_5(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
{
  PetscInt i,idx,idy;

  PetscFunctionBegin;
  switch (addv) {
  case INSERT_VALUES:
  case INSERT_ALL_VALUES:
    for (i=0; i<n; i++) {
      idx = *indicesx++;
      idy = *indicesy++;
      y[idy] = x[idx];
      y[idy+1] = x[idx+1];
      y[idy+2] = x[idx+2];
      y[idy+3] = x[idx+3];
      y[idy+4] = x[idx+4];
    }
    break;
  case ADD_VALUES:
  case ADD_ALL_VALUES:
    for (i=0; i<n; i++) {
      idx = *indicesx++;
      idy = *indicesy++;
      y[idy] += x[idx];
      y[idy+1] += x[idx+1];
      y[idy+2] += x[idx+2];
      y[idy+3] += x[idx+3];
      y[idy+4] += x[idx+4];
    }
    break;
#if !defined(PETSC_USE_COMPLEX)
  case MAX_VALUES:
    for (i=0; i<n; i++) {
      idx = *indicesx++;
      idy = *indicesy++;
      y[idy] = PetscMax(y[idy],x[idx]);
      y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
      y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
      y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
      y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
    }
    /* fall through */
#else
  case MAX_VALUES: /* MAX is not defined for complex scalars; falls through and does nothing */
#endif
  case NOT_SET_VALUES:
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
  }
  PetscFunctionReturn(0);
}
928 /* ----------------------------------------------------------------------------------------------- */
Pack_6(PetscInt n,const PetscInt * indicesx,const PetscScalar * x,PetscScalar * y,PetscInt bs)929 PETSC_STATIC_INLINE void Pack_6(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
930 {
931 PetscInt i,idx;
932
933 for (i=0; i<n; i++) {
934 idx = *indicesx++;
935 y[0] = x[idx];
936 y[1] = x[idx+1];
937 y[2] = x[idx+2];
938 y[3] = x[idx+3];
939 y[4] = x[idx+4];
940 y[5] = x[idx+5];
941 y += 6;
942 }
943 }
944
/*
   UnPack_6 - Unpacks the contiguous message buffer x into y with block size 6:
   the i-th group of 6 scalars in x is combined into y starting at indicesy[i],
   according to addv.  bs is unused: this variant is hard-wired for block size 6.
*/
PETSC_STATIC_INLINE PetscErrorCode UnPack_6(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
{
  PetscInt i,idy;

  PetscFunctionBegin;
  switch (addv) {
  case INSERT_VALUES:
  case INSERT_ALL_VALUES:
    for (i=0; i<n; i++) {
      idy = *indicesy++;
      y[idy] = x[0];
      y[idy+1] = x[1];
      y[idy+2] = x[2];
      y[idy+3] = x[3];
      y[idy+4] = x[4];
      y[idy+5] = x[5];
      x += 6;
    }
    break;
  case ADD_VALUES:
  case ADD_ALL_VALUES:
    for (i=0; i<n; i++) {
      idy = *indicesy++;
      y[idy] += x[0];
      y[idy+1] += x[1];
      y[idy+2] += x[2];
      y[idy+3] += x[3];
      y[idy+4] += x[4];
      y[idy+5] += x[5];
      x += 6;
    }
    break;
#if !defined(PETSC_USE_COMPLEX)
  case MAX_VALUES:
    for (i=0; i<n; i++) {
      idy = *indicesy++;
      y[idy] = PetscMax(y[idy],x[0]);
      y[idy+1] = PetscMax(y[idy+1],x[1]);
      y[idy+2] = PetscMax(y[idy+2],x[2]);
      y[idy+3] = PetscMax(y[idy+3],x[3]);
      y[idy+4] = PetscMax(y[idy+4],x[4]);
      y[idy+5] = PetscMax(y[idy+5],x[5]);
      x += 6;
    }
    /* fall through */
#else
  case MAX_VALUES: /* MAX is not defined for complex scalars; falls through and does nothing */
#endif
  case NOT_SET_VALUES:
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
  }
  PetscFunctionReturn(0);
}
999
/*
   Scatter_6 - Copies blocks of 6 consecutive scalars from x (block i starts at
   indicesx[i]) into y (block i starts at indicesy[i]), combining according to
   addv.  bs is unused: this variant is hard-wired for block size 6.
*/
PETSC_STATIC_INLINE PetscErrorCode Scatter_6(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
{
  PetscInt i,idx,idy;

  PetscFunctionBegin;
  switch (addv) {
  case INSERT_VALUES:
  case INSERT_ALL_VALUES:
    for (i=0; i<n; i++) {
      idx = *indicesx++;
      idy = *indicesy++;
      y[idy] = x[idx];
      y[idy+1] = x[idx+1];
      y[idy+2] = x[idx+2];
      y[idy+3] = x[idx+3];
      y[idy+4] = x[idx+4];
      y[idy+5] = x[idx+5];
    }
    break;
  case ADD_VALUES:
  case ADD_ALL_VALUES:
    for (i=0; i<n; i++) {
      idx = *indicesx++;
      idy = *indicesy++;
      y[idy] += x[idx];
      y[idy+1] += x[idx+1];
      y[idy+2] += x[idx+2];
      y[idy+3] += x[idx+3];
      y[idy+4] += x[idx+4];
      y[idy+5] += x[idx+5];
    }
    break;
#if !defined(PETSC_USE_COMPLEX)
  case MAX_VALUES:
    for (i=0; i<n; i++) {
      idx = *indicesx++;
      idy = *indicesy++;
      y[idy] = PetscMax(y[idy],x[idx]);
      y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
      y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
      y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
      y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
      y[idy+5] = PetscMax(y[idy+5],x[idx+5]);
    }
    /* fall through */
#else
  case MAX_VALUES: /* MAX is not defined for complex scalars; falls through and does nothing */
#endif
  case NOT_SET_VALUES:
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
  }
  PetscFunctionReturn(0);
}
1054 /* ----------------------------------------------------------------------------------------------- */
Pack_7(PetscInt n,const PetscInt * indicesx,const PetscScalar * x,PetscScalar * y,PetscInt bs)1055 PETSC_STATIC_INLINE void Pack_7(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
1056 {
1057 PetscInt i,idx;
1058
1059 for (i=0; i<n; i++) {
1060 idx = *indicesx++;
1061 y[0] = x[idx];
1062 y[1] = x[idx+1];
1063 y[2] = x[idx+2];
1064 y[3] = x[idx+3];
1065 y[4] = x[idx+4];
1066 y[5] = x[idx+5];
1067 y[6] = x[idx+6];
1068 y += 7;
1069 }
1070 }
1071
/*
   UnPack_7 - Unpacks the contiguous message buffer x into y with block size 7:
   the i-th group of 7 scalars in x is combined into y starting at indicesy[i],
   according to addv.  bs is unused: this variant is hard-wired for block size 7.
*/
PETSC_STATIC_INLINE PetscErrorCode UnPack_7(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
{
  PetscInt i,idy;

  PetscFunctionBegin;
  switch (addv) {
  case INSERT_VALUES:
  case INSERT_ALL_VALUES:
    for (i=0; i<n; i++) {
      idy = *indicesy++;
      y[idy] = x[0];
      y[idy+1] = x[1];
      y[idy+2] = x[2];
      y[idy+3] = x[3];
      y[idy+4] = x[4];
      y[idy+5] = x[5];
      y[idy+6] = x[6];
      x += 7;
    }
    break;
  case ADD_VALUES:
  case ADD_ALL_VALUES:
    for (i=0; i<n; i++) {
      idy = *indicesy++;
      y[idy] += x[0];
      y[idy+1] += x[1];
      y[idy+2] += x[2];
      y[idy+3] += x[3];
      y[idy+4] += x[4];
      y[idy+5] += x[5];
      y[idy+6] += x[6];
      x += 7;
    }
    break;
#if !defined(PETSC_USE_COMPLEX)
  case MAX_VALUES:
    for (i=0; i<n; i++) {
      idy = *indicesy++;
      y[idy] = PetscMax(y[idy],x[0]);
      y[idy+1] = PetscMax(y[idy+1],x[1]);
      y[idy+2] = PetscMax(y[idy+2],x[2]);
      y[idy+3] = PetscMax(y[idy+3],x[3]);
      y[idy+4] = PetscMax(y[idy+4],x[4]);
      y[idy+5] = PetscMax(y[idy+5],x[5]);
      y[idy+6] = PetscMax(y[idy+6],x[6]);
      x += 7;
    }
    /* fall through */
#else
  case MAX_VALUES: /* MAX is not defined for complex scalars; falls through and does nothing */
#endif
  case NOT_SET_VALUES:
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
  }
  PetscFunctionReturn(0);
}
1129
/*
   Scatter_7 - Copies blocks of 7 consecutive scalars from x (block i starts at
   indicesx[i]) into y (block i starts at indicesy[i]), combining according to
   addv.  bs is unused: this variant is hard-wired for block size 7.
*/
PETSC_STATIC_INLINE PetscErrorCode Scatter_7(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
{
  PetscInt i,idx,idy;

  PetscFunctionBegin;
  switch (addv) {
  case INSERT_VALUES:
  case INSERT_ALL_VALUES:
    for (i=0; i<n; i++) {
      idx = *indicesx++;
      idy = *indicesy++;
      y[idy] = x[idx];
      y[idy+1] = x[idx+1];
      y[idy+2] = x[idx+2];
      y[idy+3] = x[idx+3];
      y[idy+4] = x[idx+4];
      y[idy+5] = x[idx+5];
      y[idy+6] = x[idx+6];
    }
    break;
  case ADD_VALUES:
  case ADD_ALL_VALUES:
    for (i=0; i<n; i++) {
      idx = *indicesx++;
      idy = *indicesy++;
      y[idy] += x[idx];
      y[idy+1] += x[idx+1];
      y[idy+2] += x[idx+2];
      y[idy+3] += x[idx+3];
      y[idy+4] += x[idx+4];
      y[idy+5] += x[idx+5];
      y[idy+6] += x[idx+6];
    }
    break;
#if !defined(PETSC_USE_COMPLEX)
  case MAX_VALUES:
    for (i=0; i<n; i++) {
      idx = *indicesx++;
      idy = *indicesy++;
      y[idy] = PetscMax(y[idy],x[idx]);
      y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
      y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
      y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
      y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
      y[idy+5] = PetscMax(y[idy+5],x[idx+5]);
      y[idy+6] = PetscMax(y[idy+6],x[idx+6]);
    }
    /* fall through */
#else
  case MAX_VALUES: /* MAX is not defined for complex scalars; falls through and does nothing */
#endif
  case NOT_SET_VALUES:
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
  }
  PetscFunctionReturn(0);
}
1187 /* ----------------------------------------------------------------------------------------------- */
Pack_8(PetscInt n,const PetscInt * indicesx,const PetscScalar * x,PetscScalar * y,PetscInt bs)1188 PETSC_STATIC_INLINE void Pack_8(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
1189 {
1190 PetscInt i,idx;
1191
1192 for (i=0; i<n; i++) {
1193 idx = *indicesx++;
1194 y[0] = x[idx];
1195 y[1] = x[idx+1];
1196 y[2] = x[idx+2];
1197 y[3] = x[idx+3];
1198 y[4] = x[idx+4];
1199 y[5] = x[idx+5];
1200 y[6] = x[idx+6];
1201 y[7] = x[idx+7];
1202 y += 8;
1203 }
1204 }
1205
/*
   UnPack_8 - Unpacks the contiguous message buffer x into y with block size 8:
   the i-th group of 8 scalars in x is combined into y starting at indicesy[i],
   according to addv.  bs is unused: this variant is hard-wired for block size 8.
*/
PETSC_STATIC_INLINE PetscErrorCode UnPack_8(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
{
  PetscInt i,idy;

  PetscFunctionBegin;
  switch (addv) {
  case INSERT_VALUES:
  case INSERT_ALL_VALUES:
    for (i=0; i<n; i++) {
      idy = *indicesy++;
      y[idy] = x[0];
      y[idy+1] = x[1];
      y[idy+2] = x[2];
      y[idy+3] = x[3];
      y[idy+4] = x[4];
      y[idy+5] = x[5];
      y[idy+6] = x[6];
      y[idy+7] = x[7];
      x += 8;
    }
    break;
  case ADD_VALUES:
  case ADD_ALL_VALUES:
    for (i=0; i<n; i++) {
      idy = *indicesy++;
      y[idy] += x[0];
      y[idy+1] += x[1];
      y[idy+2] += x[2];
      y[idy+3] += x[3];
      y[idy+4] += x[4];
      y[idy+5] += x[5];
      y[idy+6] += x[6];
      y[idy+7] += x[7];
      x += 8;
    }
    break;
#if !defined(PETSC_USE_COMPLEX)
  case MAX_VALUES:
    for (i=0; i<n; i++) {
      idy = *indicesy++;
      y[idy] = PetscMax(y[idy],x[0]);
      y[idy+1] = PetscMax(y[idy+1],x[1]);
      y[idy+2] = PetscMax(y[idy+2],x[2]);
      y[idy+3] = PetscMax(y[idy+3],x[3]);
      y[idy+4] = PetscMax(y[idy+4],x[4]);
      y[idy+5] = PetscMax(y[idy+5],x[5]);
      y[idy+6] = PetscMax(y[idy+6],x[6]);
      y[idy+7] = PetscMax(y[idy+7],x[7]);
      x += 8;
    }
    /* fall through */
#else
  case MAX_VALUES: /* MAX is not defined for complex scalars; falls through and does nothing */
#endif
  case NOT_SET_VALUES:
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
  }
  PetscFunctionReturn(0);
}
1266
/*
   Scatter_8 - Copies blocks of 8 consecutive scalars from x (block i starts at
   indicesx[i]) into y (block i starts at indicesy[i]), combining according to
   addv.  bs is unused: this variant is hard-wired for block size 8.
*/
PETSC_STATIC_INLINE PetscErrorCode Scatter_8(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
{
  PetscInt i,idx,idy;

  PetscFunctionBegin;
  switch (addv) {
  case INSERT_VALUES:
  case INSERT_ALL_VALUES:
    for (i=0; i<n; i++) {
      idx = *indicesx++;
      idy = *indicesy++;
      y[idy] = x[idx];
      y[idy+1] = x[idx+1];
      y[idy+2] = x[idx+2];
      y[idy+3] = x[idx+3];
      y[idy+4] = x[idx+4];
      y[idy+5] = x[idx+5];
      y[idy+6] = x[idx+6];
      y[idy+7] = x[idx+7];
    }
    break;
  case ADD_VALUES:
  case ADD_ALL_VALUES:
    for (i=0; i<n; i++) {
      idx = *indicesx++;
      idy = *indicesy++;
      y[idy] += x[idx];
      y[idy+1] += x[idx+1];
      y[idy+2] += x[idx+2];
      y[idy+3] += x[idx+3];
      y[idy+4] += x[idx+4];
      y[idy+5] += x[idx+5];
      y[idy+6] += x[idx+6];
      y[idy+7] += x[idx+7];
    }
    break;
#if !defined(PETSC_USE_COMPLEX)
  case MAX_VALUES:
    for (i=0; i<n; i++) {
      idx = *indicesx++;
      idy = *indicesy++;
      y[idy] = PetscMax(y[idy],x[idx]);
      y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
      y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
      y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
      y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
      y[idy+5] = PetscMax(y[idy+5],x[idx+5]);
      y[idy+6] = PetscMax(y[idy+6],x[idx+6]);
      y[idy+7] = PetscMax(y[idy+7],x[idx+7]);
    }
    /* fall through */
#else
  case MAX_VALUES: /* MAX is not defined for complex scalars; falls through and does nothing */
#endif
  case NOT_SET_VALUES:
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
  }
  PetscFunctionReturn(0);
}
1327
Pack_9(PetscInt n,const PetscInt * indicesx,const PetscScalar * x,PetscScalar * y,PetscInt bs)1328 PETSC_STATIC_INLINE void Pack_9(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
1329 {
1330 PetscInt i,idx;
1331
1332 for (i=0; i<n; i++) {
1333 idx = *indicesx++;
1334 y[0] = x[idx];
1335 y[1] = x[idx+1];
1336 y[2] = x[idx+2];
1337 y[3] = x[idx+3];
1338 y[4] = x[idx+4];
1339 y[5] = x[idx+5];
1340 y[6] = x[idx+6];
1341 y[7] = x[idx+7];
1342 y[8] = x[idx+8];
1343 y += 9;
1344 }
1345 }
1346
/*
   UnPack_9 - Unpacks the contiguous message buffer x into y with block size 9:
   the i-th group of 9 scalars in x is combined into y starting at indicesy[i],
   according to addv.  bs is unused: this variant is hard-wired for block size 9.
*/
PETSC_STATIC_INLINE PetscErrorCode UnPack_9(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
{
  PetscInt i,idy;

  PetscFunctionBegin;
  switch (addv) {
  case INSERT_VALUES:
  case INSERT_ALL_VALUES:
    for (i=0; i<n; i++) {
      idy = *indicesy++;
      y[idy] = x[0];
      y[idy+1] = x[1];
      y[idy+2] = x[2];
      y[idy+3] = x[3];
      y[idy+4] = x[4];
      y[idy+5] = x[5];
      y[idy+6] = x[6];
      y[idy+7] = x[7];
      y[idy+8] = x[8];
      x += 9;
    }
    break;
  case ADD_VALUES:
  case ADD_ALL_VALUES:
    for (i=0; i<n; i++) {
      idy = *indicesy++;
      y[idy] += x[0];
      y[idy+1] += x[1];
      y[idy+2] += x[2];
      y[idy+3] += x[3];
      y[idy+4] += x[4];
      y[idy+5] += x[5];
      y[idy+6] += x[6];
      y[idy+7] += x[7];
      y[idy+8] += x[8];
      x += 9;
    }
    break;
#if !defined(PETSC_USE_COMPLEX)
  case MAX_VALUES:
    for (i=0; i<n; i++) {
      idy = *indicesy++;
      y[idy] = PetscMax(y[idy],x[0]);
      y[idy+1] = PetscMax(y[idy+1],x[1]);
      y[idy+2] = PetscMax(y[idy+2],x[2]);
      y[idy+3] = PetscMax(y[idy+3],x[3]);
      y[idy+4] = PetscMax(y[idy+4],x[4]);
      y[idy+5] = PetscMax(y[idy+5],x[5]);
      y[idy+6] = PetscMax(y[idy+6],x[6]);
      y[idy+7] = PetscMax(y[idy+7],x[7]);
      y[idy+8] = PetscMax(y[idy+8],x[8]);
      x += 9;
    }
    /* fall through */
#else
  case MAX_VALUES: /* MAX is not defined for complex scalars; falls through and does nothing */
#endif
  case NOT_SET_VALUES:
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
  }
  PetscFunctionReturn(0);
}
1410
/*
   Scatter_9 - Copies blocks of 9 consecutive scalars from x (block i starts at
   indicesx[i]) into y (block i starts at indicesy[i]), combining according to
   addv.  bs is unused: this variant is hard-wired for block size 9.
*/
PETSC_STATIC_INLINE PetscErrorCode Scatter_9(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
{
  PetscInt i,idx,idy;

  PetscFunctionBegin;
  switch (addv) {
  case INSERT_VALUES:
  case INSERT_ALL_VALUES:
    for (i=0; i<n; i++) {
      idx = *indicesx++;
      idy = *indicesy++;
      y[idy] = x[idx];
      y[idy+1] = x[idx+1];
      y[idy+2] = x[idx+2];
      y[idy+3] = x[idx+3];
      y[idy+4] = x[idx+4];
      y[idy+5] = x[idx+5];
      y[idy+6] = x[idx+6];
      y[idy+7] = x[idx+7];
      y[idy+8] = x[idx+8];
    }
    break;
  case ADD_VALUES:
  case ADD_ALL_VALUES:
    for (i=0; i<n; i++) {
      idx = *indicesx++;
      idy = *indicesy++;
      y[idy] += x[idx];
      y[idy+1] += x[idx+1];
      y[idy+2] += x[idx+2];
      y[idy+3] += x[idx+3];
      y[idy+4] += x[idx+4];
      y[idy+5] += x[idx+5];
      y[idy+6] += x[idx+6];
      y[idy+7] += x[idx+7];
      y[idy+8] += x[idx+8];
    }
    break;
#if !defined(PETSC_USE_COMPLEX)
  case MAX_VALUES:
    for (i=0; i<n; i++) {
      idx = *indicesx++;
      idy = *indicesy++;
      y[idy] = PetscMax(y[idy],x[idx]);
      y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
      y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
      y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
      y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
      y[idy+5] = PetscMax(y[idy+5],x[idx+5]);
      y[idy+6] = PetscMax(y[idy+6],x[idx+6]);
      y[idy+7] = PetscMax(y[idy+7],x[idx+7]);
      y[idy+8] = PetscMax(y[idy+8],x[idx+8]);
    }
    /* fall through */
#else
  case MAX_VALUES: /* MAX is not defined for complex scalars; falls through and does nothing */
#endif
  case NOT_SET_VALUES:
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
  }
  PetscFunctionReturn(0);
}
1474
Pack_10(PetscInt n,const PetscInt * indicesx,const PetscScalar * x,PetscScalar * y,PetscInt bs)1475 PETSC_STATIC_INLINE void Pack_10(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
1476 {
1477 PetscInt i,idx;
1478
1479 for (i=0; i<n; i++) {
1480 idx = *indicesx++;
1481 y[0] = x[idx];
1482 y[1] = x[idx+1];
1483 y[2] = x[idx+2];
1484 y[3] = x[idx+3];
1485 y[4] = x[idx+4];
1486 y[5] = x[idx+5];
1487 y[6] = x[idx+6];
1488 y[7] = x[idx+7];
1489 y[8] = x[idx+8];
1490 y[9] = x[idx+9];
1491 y += 10;
1492 }
1493 }
1494
/*
   UnPack_10 - Unpacks the contiguous message buffer x into y with block size 10:
   the i-th group of 10 scalars in x is combined into y starting at indicesy[i],
   according to addv.  bs is unused: this variant is hard-wired for block size 10.
*/
PETSC_STATIC_INLINE PetscErrorCode UnPack_10(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
{
  PetscInt i,idy;

  PetscFunctionBegin;
  switch (addv) {
  case INSERT_VALUES:
  case INSERT_ALL_VALUES:
    for (i=0; i<n; i++) {
      idy = *indicesy++;
      y[idy] = x[0];
      y[idy+1] = x[1];
      y[idy+2] = x[2];
      y[idy+3] = x[3];
      y[idy+4] = x[4];
      y[idy+5] = x[5];
      y[idy+6] = x[6];
      y[idy+7] = x[7];
      y[idy+8] = x[8];
      y[idy+9] = x[9];
      x += 10;
    }
    break;
  case ADD_VALUES:
  case ADD_ALL_VALUES:
    for (i=0; i<n; i++) {
      idy = *indicesy++;
      y[idy] += x[0];
      y[idy+1] += x[1];
      y[idy+2] += x[2];
      y[idy+3] += x[3];
      y[idy+4] += x[4];
      y[idy+5] += x[5];
      y[idy+6] += x[6];
      y[idy+7] += x[7];
      y[idy+8] += x[8];
      y[idy+9] += x[9];
      x += 10;
    }
    break;
#if !defined(PETSC_USE_COMPLEX)
  case MAX_VALUES:
    for (i=0; i<n; i++) {
      idy = *indicesy++;
      y[idy] = PetscMax(y[idy],x[0]);
      y[idy+1] = PetscMax(y[idy+1],x[1]);
      y[idy+2] = PetscMax(y[idy+2],x[2]);
      y[idy+3] = PetscMax(y[idy+3],x[3]);
      y[idy+4] = PetscMax(y[idy+4],x[4]);
      y[idy+5] = PetscMax(y[idy+5],x[5]);
      y[idy+6] = PetscMax(y[idy+6],x[6]);
      y[idy+7] = PetscMax(y[idy+7],x[7]);
      y[idy+8] = PetscMax(y[idy+8],x[8]);
      y[idy+9] = PetscMax(y[idy+9],x[9]);
      x += 10;
    }
    /* fall through */
#else
  case MAX_VALUES: /* MAX is not defined for complex scalars; falls through and does nothing */
#endif
  case NOT_SET_VALUES:
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
  }
  PetscFunctionReturn(0);
}
1561
/*
   Scatter_10 - Copies blocks of 10 consecutive scalars from x (block i starts
   at indicesx[i]) into y (block i starts at indicesy[i]), combining according
   to addv.  bs is unused: this variant is hard-wired for block size 10.
*/
PETSC_STATIC_INLINE PetscErrorCode Scatter_10(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
{
  PetscInt i,idx,idy;

  PetscFunctionBegin;
  switch (addv) {
  case INSERT_VALUES:
  case INSERT_ALL_VALUES:
    for (i=0; i<n; i++) {
      idx = *indicesx++;
      idy = *indicesy++;
      y[idy] = x[idx];
      y[idy+1] = x[idx+1];
      y[idy+2] = x[idx+2];
      y[idy+3] = x[idx+3];
      y[idy+4] = x[idx+4];
      y[idy+5] = x[idx+5];
      y[idy+6] = x[idx+6];
      y[idy+7] = x[idx+7];
      y[idy+8] = x[idx+8];
      y[idy+9] = x[idx+9];
    }
    break;
  case ADD_VALUES:
  case ADD_ALL_VALUES:
    for (i=0; i<n; i++) {
      idx = *indicesx++;
      idy = *indicesy++;
      y[idy] += x[idx];
      y[idy+1] += x[idx+1];
      y[idy+2] += x[idx+2];
      y[idy+3] += x[idx+3];
      y[idy+4] += x[idx+4];
      y[idy+5] += x[idx+5];
      y[idy+6] += x[idx+6];
      y[idy+7] += x[idx+7];
      y[idy+8] += x[idx+8];
      y[idy+9] += x[idx+9];
    }
    break;
#if !defined(PETSC_USE_COMPLEX)
  case MAX_VALUES:
    for (i=0; i<n; i++) {
      idx = *indicesx++;
      idy = *indicesy++;
      y[idy] = PetscMax(y[idy],x[idx]);
      y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
      y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
      y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
      y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
      y[idy+5] = PetscMax(y[idy+5],x[idx+5]);
      y[idy+6] = PetscMax(y[idy+6],x[idx+6]);
      y[idy+7] = PetscMax(y[idy+7],x[idx+7]);
      y[idy+8] = PetscMax(y[idy+8],x[idx+8]);
      y[idy+9] = PetscMax(y[idy+9],x[idx+9]);
    }
    /* fall through */
#else
  case MAX_VALUES: /* MAX is not defined for complex scalars; falls through and does nothing */
#endif
  case NOT_SET_VALUES:
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
  }
  PetscFunctionReturn(0);
}
1628
Pack_11(PetscInt n,const PetscInt * indicesx,const PetscScalar * x,PetscScalar * y,PetscInt bs)1629 PETSC_STATIC_INLINE void Pack_11(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
1630 {
1631 PetscInt i,idx;
1632
1633 for (i=0; i<n; i++) {
1634 idx = *indicesx++;
1635 y[0] = x[idx];
1636 y[1] = x[idx+1];
1637 y[2] = x[idx+2];
1638 y[3] = x[idx+3];
1639 y[4] = x[idx+4];
1640 y[5] = x[idx+5];
1641 y[6] = x[idx+6];
1642 y[7] = x[idx+7];
1643 y[8] = x[idx+8];
1644 y[9] = x[idx+9];
1645 y[10] = x[idx+10];
1646 y += 11;
1647 }
1648 }
1649
/*
   UnPack_11 - Unpacks the contiguous message buffer x into y with block size 11:
   the i-th group of 11 scalars in x is combined into y starting at indicesy[i],
   according to addv.  bs is unused: this variant is hard-wired for block size 11.
*/
PETSC_STATIC_INLINE PetscErrorCode UnPack_11(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
{
  PetscInt i,idy;

  PetscFunctionBegin;
  switch (addv) {
  case INSERT_VALUES:
  case INSERT_ALL_VALUES:
    for (i=0; i<n; i++) {
      idy = *indicesy++;
      y[idy] = x[0];
      y[idy+1] = x[1];
      y[idy+2] = x[2];
      y[idy+3] = x[3];
      y[idy+4] = x[4];
      y[idy+5] = x[5];
      y[idy+6] = x[6];
      y[idy+7] = x[7];
      y[idy+8] = x[8];
      y[idy+9] = x[9];
      y[idy+10] = x[10];
      x += 11;
    }
    break;
  case ADD_VALUES:
  case ADD_ALL_VALUES:
    for (i=0; i<n; i++) {
      idy = *indicesy++;
      y[idy] += x[0];
      y[idy+1] += x[1];
      y[idy+2] += x[2];
      y[idy+3] += x[3];
      y[idy+4] += x[4];
      y[idy+5] += x[5];
      y[idy+6] += x[6];
      y[idy+7] += x[7];
      y[idy+8] += x[8];
      y[idy+9] += x[9];
      y[idy+10] += x[10];
      x += 11;
    }
    break;
#if !defined(PETSC_USE_COMPLEX)
  case MAX_VALUES:
    for (i=0; i<n; i++) {
      idy = *indicesy++;
      y[idy] = PetscMax(y[idy],x[0]);
      y[idy+1] = PetscMax(y[idy+1],x[1]);
      y[idy+2] = PetscMax(y[idy+2],x[2]);
      y[idy+3] = PetscMax(y[idy+3],x[3]);
      y[idy+4] = PetscMax(y[idy+4],x[4]);
      y[idy+5] = PetscMax(y[idy+5],x[5]);
      y[idy+6] = PetscMax(y[idy+6],x[6]);
      y[idy+7] = PetscMax(y[idy+7],x[7]);
      y[idy+8] = PetscMax(y[idy+8],x[8]);
      y[idy+9] = PetscMax(y[idy+9],x[9]);
      y[idy+10] = PetscMax(y[idy+10],x[10]);
      x += 11;
    }
    /* fall through */
#else
  case MAX_VALUES: /* MAX is not defined for complex scalars; falls through and does nothing */
#endif
  case NOT_SET_VALUES:
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
  }
  PetscFunctionReturn(0);
}
1719
/*
   Scatter_11 - Copies blocks of 11 consecutive scalars from x (block i starts
   at indicesx[i]) into y (block i starts at indicesy[i]), combining according
   to addv.  bs is unused: this variant is hard-wired for block size 11.
*/
PETSC_STATIC_INLINE PetscErrorCode Scatter_11(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
{
  PetscInt i,idx,idy;

  PetscFunctionBegin;
  switch (addv) {
  case INSERT_VALUES:
  case INSERT_ALL_VALUES:
    for (i=0; i<n; i++) {
      idx = *indicesx++;
      idy = *indicesy++;
      y[idy] = x[idx];
      y[idy+1] = x[idx+1];
      y[idy+2] = x[idx+2];
      y[idy+3] = x[idx+3];
      y[idy+4] = x[idx+4];
      y[idy+5] = x[idx+5];
      y[idy+6] = x[idx+6];
      y[idy+7] = x[idx+7];
      y[idy+8] = x[idx+8];
      y[idy+9] = x[idx+9];
      y[idy+10] = x[idx+10];
    }
    break;
  case ADD_VALUES:
  case ADD_ALL_VALUES:
    for (i=0; i<n; i++) {
      idx = *indicesx++;
      idy = *indicesy++;
      y[idy] += x[idx];
      y[idy+1] += x[idx+1];
      y[idy+2] += x[idx+2];
      y[idy+3] += x[idx+3];
      y[idy+4] += x[idx+4];
      y[idy+5] += x[idx+5];
      y[idy+6] += x[idx+6];
      y[idy+7] += x[idx+7];
      y[idy+8] += x[idx+8];
      y[idy+9] += x[idx+9];
      y[idy+10] += x[idx+10];
    }
    break;
#if !defined(PETSC_USE_COMPLEX)
  case MAX_VALUES:
    for (i=0; i<n; i++) {
      idx = *indicesx++;
      idy = *indicesy++;
      y[idy] = PetscMax(y[idy],x[idx]);
      y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
      y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
      y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
      y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
      y[idy+5] = PetscMax(y[idy+5],x[idx+5]);
      y[idy+6] = PetscMax(y[idy+6],x[idx+6]);
      y[idy+7] = PetscMax(y[idy+7],x[idx+7]);
      y[idy+8] = PetscMax(y[idy+8],x[idx+8]);
      y[idy+9] = PetscMax(y[idy+9],x[idx+9]);
      y[idy+10] = PetscMax(y[idy+10],x[idx+10]);
    }
    /* fall through */
#else
  case MAX_VALUES: /* MAX is not defined for complex scalars; falls through and does nothing */
#endif
  case NOT_SET_VALUES:
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
  }
  PetscFunctionReturn(0);
}
1789
1790 /* ----------------------------------------------------------------------------------------------- */
Pack_12(PetscInt n,const PetscInt * indicesx,const PetscScalar * x,PetscScalar * y,PetscInt bs)1791 PETSC_STATIC_INLINE void Pack_12(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
1792 {
1793 PetscInt i,idx;
1794
1795 for (i=0; i<n; i++) {
1796 idx = *indicesx++;
1797 y[0] = x[idx];
1798 y[1] = x[idx+1];
1799 y[2] = x[idx+2];
1800 y[3] = x[idx+3];
1801 y[4] = x[idx+4];
1802 y[5] = x[idx+5];
1803 y[6] = x[idx+6];
1804 y[7] = x[idx+7];
1805 y[8] = x[idx+8];
1806 y[9] = x[idx+9];
1807 y[10] = x[idx+10];
1808 y[11] = x[idx+11];
1809 y += 12;
1810 }
1811 }
1812
/*
   UnPack_12 - Unpacks the contiguous message buffer x into y with block size 12:
   the i-th group of 12 scalars in x is combined into y starting at indicesy[i],
   according to addv.  bs is unused: this variant is hard-wired for block size 12.
*/
PETSC_STATIC_INLINE PetscErrorCode UnPack_12(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
{
  PetscInt i,idy;

  PetscFunctionBegin;
  switch (addv) {
  case INSERT_VALUES:
  case INSERT_ALL_VALUES:
    for (i=0; i<n; i++) {
      idy = *indicesy++;
      y[idy] = x[0];
      y[idy+1] = x[1];
      y[idy+2] = x[2];
      y[idy+3] = x[3];
      y[idy+4] = x[4];
      y[idy+5] = x[5];
      y[idy+6] = x[6];
      y[idy+7] = x[7];
      y[idy+8] = x[8];
      y[idy+9] = x[9];
      y[idy+10] = x[10];
      y[idy+11] = x[11];
      x += 12;
    }
    break;
  case ADD_VALUES:
  case ADD_ALL_VALUES:
    for (i=0; i<n; i++) {
      idy = *indicesy++;
      y[idy] += x[0];
      y[idy+1] += x[1];
      y[idy+2] += x[2];
      y[idy+3] += x[3];
      y[idy+4] += x[4];
      y[idy+5] += x[5];
      y[idy+6] += x[6];
      y[idy+7] += x[7];
      y[idy+8] += x[8];
      y[idy+9] += x[9];
      y[idy+10] += x[10];
      y[idy+11] += x[11];
      x += 12;
    }
    break;
#if !defined(PETSC_USE_COMPLEX)
  case MAX_VALUES:
    for (i=0; i<n; i++) {
      idy = *indicesy++;
      y[idy] = PetscMax(y[idy],x[0]);
      y[idy+1] = PetscMax(y[idy+1],x[1]);
      y[idy+2] = PetscMax(y[idy+2],x[2]);
      y[idy+3] = PetscMax(y[idy+3],x[3]);
      y[idy+4] = PetscMax(y[idy+4],x[4]);
      y[idy+5] = PetscMax(y[idy+5],x[5]);
      y[idy+6] = PetscMax(y[idy+6],x[6]);
      y[idy+7] = PetscMax(y[idy+7],x[7]);
      y[idy+8] = PetscMax(y[idy+8],x[8]);
      y[idy+9] = PetscMax(y[idy+9],x[9]);
      y[idy+10] = PetscMax(y[idy+10],x[10]);
      y[idy+11] = PetscMax(y[idy+11],x[11]);
      x += 12;
    }
    /* fall through */
#else
  case MAX_VALUES: /* MAX is not defined for complex scalars; falls through and does nothing */
#endif
  case NOT_SET_VALUES:
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
  }
  PetscFunctionReturn(0);
}
1885
/*
   Scatter_12 - Copies blocks of 12 consecutive scalars from x (block i starts
   at indicesx[i]) into y (block i starts at indicesy[i]), combining according
   to addv.  bs is unused: this variant is hard-wired for block size 12.
*/
PETSC_STATIC_INLINE PetscErrorCode Scatter_12(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
{
  PetscInt i,idx,idy;

  PetscFunctionBegin;
  switch (addv) {
  case INSERT_VALUES:
  case INSERT_ALL_VALUES:
    for (i=0; i<n; i++) {
      idx = *indicesx++;
      idy = *indicesy++;
      y[idy] = x[idx];
      y[idy+1] = x[idx+1];
      y[idy+2] = x[idx+2];
      y[idy+3] = x[idx+3];
      y[idy+4] = x[idx+4];
      y[idy+5] = x[idx+5];
      y[idy+6] = x[idx+6];
      y[idy+7] = x[idx+7];
      y[idy+8] = x[idx+8];
      y[idy+9] = x[idx+9];
      y[idy+10] = x[idx+10];
      y[idy+11] = x[idx+11];
    }
    break;
  case ADD_VALUES:
  case ADD_ALL_VALUES:
    for (i=0; i<n; i++) {
      idx = *indicesx++;
      idy = *indicesy++;
      y[idy] += x[idx];
      y[idy+1] += x[idx+1];
      y[idy+2] += x[idx+2];
      y[idy+3] += x[idx+3];
      y[idy+4] += x[idx+4];
      y[idy+5] += x[idx+5];
      y[idy+6] += x[idx+6];
      y[idy+7] += x[idx+7];
      y[idy+8] += x[idx+8];
      y[idy+9] += x[idx+9];
      y[idy+10] += x[idx+10];
      y[idy+11] += x[idx+11];
    }
    break;
#if !defined(PETSC_USE_COMPLEX)
  case MAX_VALUES:
    for (i=0; i<n; i++) {
      idx = *indicesx++;
      idy = *indicesy++;
      y[idy] = PetscMax(y[idy],x[idx]);
      y[idy+1] = PetscMax(y[idy+1],x[idx+1]);
      y[idy+2] = PetscMax(y[idy+2],x[idx+2]);
      y[idy+3] = PetscMax(y[idy+3],x[idx+3]);
      y[idy+4] = PetscMax(y[idy+4],x[idx+4]);
      y[idy+5] = PetscMax(y[idy+5],x[idx+5]);
      y[idy+6] = PetscMax(y[idy+6],x[idx+6]);
      y[idy+7] = PetscMax(y[idy+7],x[idx+7]);
      y[idy+8] = PetscMax(y[idy+8],x[idx+8]);
      y[idy+9] = PetscMax(y[idy+9],x[idx+9]);
      y[idy+10] = PetscMax(y[idy+10],x[idx+10]);
      y[idy+11] = PetscMax(y[idy+11],x[idx+11]);
    }
    /* fall through */
#else
  case MAX_VALUES: /* MAX is not defined for complex scalars; falls through and does nothing */
#endif
  case NOT_SET_VALUES:
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
  }
  PetscFunctionReturn(0);
}
1958
1959 /* ----------------------------------------------------------------------------------------------- */
Pack_bs(PetscInt n,const PetscInt * indicesx,const PetscScalar * x,PetscScalar * y,PetscInt bs)1960 PETSC_STATIC_INLINE void Pack_bs(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,PetscScalar *y,PetscInt bs)
1961 {
1962 PetscInt i,idx;
1963 PetscErrorCode ierr;
1964
1965 for (i=0; i<n; i++) {
1966 idx = *indicesx++;
1967 ierr = PetscArraycpy(y,x + idx,bs);CHKERRV(ierr);
1968 y += bs;
1969 }
1970 }
1971
UnPack_bs(PetscInt n,const PetscScalar * x,const PetscInt * indicesy,PetscScalar * y,InsertMode addv,PetscInt bs)1972 PETSC_STATIC_INLINE PetscErrorCode UnPack_bs(PetscInt n,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
1973 {
1974 PetscInt i,idy,j;
1975 PetscErrorCode ierr;
1976
1977 PetscFunctionBegin;
1978 switch (addv) {
1979 case INSERT_VALUES:
1980 case INSERT_ALL_VALUES:
1981 for (i=0; i<n; i++) {
1982 idy = *indicesy++;
1983 ierr = PetscArraycpy(y + idy,x,bs);CHKERRQ(ierr);
1984 x += bs;
1985 }
1986 break;
1987 case ADD_VALUES:
1988 case ADD_ALL_VALUES:
1989 for (i=0; i<n; i++) {
1990 idy = *indicesy++;
1991 for (j=0; j<bs; j++) y[idy+j] += x[j];
1992 x += bs;
1993 }
1994 break;
1995 #if !defined(PETSC_USE_COMPLEX)
1996 case MAX_VALUES:
1997 for (i=0; i<n; i++) {
1998 idy = *indicesy++;
1999 for (j=0; j<bs; j++) y[idy+j] = PetscMax(y[idy+j],x[j]);
2000 x += bs;
2001 }
2002 #else
2003 case MAX_VALUES:
2004 #endif
2005 case NOT_SET_VALUES:
2006 break;
2007 default:
2008 SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
2009 }
2010 PetscFunctionReturn(0);
2011 }
2012
Scatter_bs(PetscInt n,const PetscInt * indicesx,const PetscScalar * x,const PetscInt * indicesy,PetscScalar * y,InsertMode addv,PetscInt bs)2013 PETSC_STATIC_INLINE PetscErrorCode Scatter_bs(PetscInt n,const PetscInt *indicesx,const PetscScalar *x,const PetscInt *indicesy,PetscScalar *y,InsertMode addv,PetscInt bs)
2014 {
2015 PetscInt i,idx,idy,j;
2016 PetscErrorCode ierr;
2017
2018 PetscFunctionBegin;
2019 switch (addv) {
2020 case INSERT_VALUES:
2021 case INSERT_ALL_VALUES:
2022 for (i=0; i<n; i++) {
2023 idx = *indicesx++;
2024 idy = *indicesy++;
2025 ierr = PetscArraycpy(y + idy, x + idx,bs);CHKERRQ(ierr);
2026 }
2027 break;
2028 case ADD_VALUES:
2029 case ADD_ALL_VALUES:
2030 for (i=0; i<n; i++) {
2031 idx = *indicesx++;
2032 idy = *indicesy++;
2033 for (j=0; j<bs; j++) y[idy+j] += x[idx+j];
2034 }
2035 break;
2036 #if !defined(PETSC_USE_COMPLEX)
2037 case MAX_VALUES:
2038 for (i=0; i<n; i++) {
2039 idx = *indicesx++;
2040 idy = *indicesy++;
2041 for (j=0; j<bs; j++) y[idy+j] = PetscMax(y[idy+j],x[idx+j]);
2042 }
2043 #else
2044 case MAX_VALUES:
2045 #endif
2046 case NOT_SET_VALUES:
2047 break;
2048 default:
2049 SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot handle insert mode %d", addv);
2050 }
2051 PetscFunctionReturn(0);
2052 }
2053
2054 /* Create the VecScatterBegin/End_P for our chosen block sizes */
2055 #define BS 1
2056 #include <../src/vec/vscat/impls/mpi3/vpscat.h>
2057 #define BS 2
2058 #include <../src/vec/vscat/impls/mpi3/vpscat.h>
2059 #define BS 3
2060 #include <../src/vec/vscat/impls/mpi3/vpscat.h>
2061 #define BS 4
2062 #include <../src/vec/vscat/impls/mpi3/vpscat.h>
2063 #define BS 5
2064 #include <../src/vec/vscat/impls/mpi3/vpscat.h>
2065 #define BS 6
2066 #include <../src/vec/vscat/impls/mpi3/vpscat.h>
2067 #define BS 7
2068 #include <../src/vec/vscat/impls/mpi3/vpscat.h>
2069 #define BS 8
2070 #include <../src/vec/vscat/impls/mpi3/vpscat.h>
2071 #define BS 9
2072 #include <../src/vec/vscat/impls/mpi3/vpscat.h>
2073 #define BS 10
2074 #include <../src/vec/vscat/impls/mpi3/vpscat.h>
2075 #define BS 11
2076 #include <../src/vec/vscat/impls/mpi3/vpscat.h>
2077 #define BS 12
2078 #include <../src/vec/vscat/impls/mpi3/vpscat.h>
2079 #define BS bs
2080 #include <../src/vec/vscat/impls/mpi3/vpscat.h>
2081
2082 /* ==========================================================================================*/
2083 /* create parallel to sequential scatter context */
2084
2085 extern PetscErrorCode VecScatterCreateCommon_PtoS_MPI3(VecScatter_MPI_General*,VecScatter_MPI_General*,VecScatter);
2086
2087 /*
2088 bs indicates how many elements there are in each block. Normally this would be 1.
2089
2090 contains check that PetscMPIInt can handle the sizes needed
2091 */
2092 #include <petscbt.h>
VecScatterCreateLocal_PtoS_MPI3(PetscInt nx,const PetscInt * inidx,PetscInt ny,const PetscInt * inidy,Vec xin,Vec yin,PetscInt bs,VecScatter ctx)2093 PetscErrorCode VecScatterCreateLocal_PtoS_MPI3(PetscInt nx,const PetscInt *inidx,PetscInt ny,const PetscInt *inidy,Vec xin,Vec yin,PetscInt bs,VecScatter ctx)
2094 {
2095 VecScatter_MPI_General *from,*to;
2096 PetscMPIInt size,rank,mrank,imdex,tag,n;
2097 PetscInt *source = NULL,*owners = NULL,nxr;
2098 PetscInt *lowner = NULL,*start = NULL,*lsharedowner = NULL,*sharedstart = NULL,lengthy,lengthx;
2099 PetscMPIInt *nprocs = NULL,nrecvs,nrecvshared;
2100 PetscInt i,j,idx = 0,nsends;
2101 PetscMPIInt *owner = NULL;
2102 PetscInt *starts = NULL,count,slen,slenshared;
2103 PetscInt *rvalues,*svalues,base,*values,nprocslocal,recvtotal,*rsvalues;
2104 PetscMPIInt *onodes1,*olengths1;
2105 MPI_Comm comm;
2106 MPI_Request *send_waits = NULL,*recv_waits = NULL;
2107 MPI_Status recv_status,*send_status;
2108 PetscErrorCode ierr;
2109 PetscShmComm scomm;
2110 PetscMPIInt jj;
2111 MPI_Info info;
2112 MPI_Comm mscomm;
2113 MPI_Win sharedoffsetwin; /* Window that owns sharedspaceoffset */
2114 PetscInt *sharedspaceoffset;
2115 VecScatterType type;
2116 PetscBool mpi3node;
2117 PetscFunctionBegin;
2118 ierr = PetscObjectGetNewTag((PetscObject)ctx,&tag);CHKERRQ(ierr);
2119 ierr = PetscObjectGetComm((PetscObject)xin,&comm);CHKERRQ(ierr);
2120 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
2121 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
2122
2123 ierr = VecScatterGetType(ctx,&type);CHKERRQ(ierr);
2124 ierr = PetscStrcmp(type,"mpi3node",&mpi3node);CHKERRQ(ierr);
2125
2126 ierr = PetscShmCommGet(comm,&scomm);CHKERRQ(ierr);
2127 ierr = PetscShmCommGetMpiShmComm(scomm,&mscomm);CHKERRQ(ierr);
2128
2129 ierr = MPI_Info_create(&info);CHKERRQ(ierr);
2130 ierr = MPI_Info_set(info, "alloc_shared_noncontig", "true");CHKERRQ(ierr);
2131
2132 if (mpi3node) {
2133 /* Check if (parallel) inidx has duplicate indices.
2134 Current VecScatterEndMPI3Node() (case StoP) writes the sequential vector to the parallel vector.
2135 Writing to the same shared location without variable locking
2136 leads to incorrect scattering. See src/vec/vscat/examples/runex2_5 and runex3_5 */
2137
2138 PetscInt *mem,**optr;
2139 MPI_Win swin;
2140 PetscMPIInt msize;
2141
2142 ierr = MPI_Comm_size(mscomm,&msize);CHKERRQ(ierr);
2143 ierr = MPI_Comm_rank(mscomm,&mrank);CHKERRQ(ierr);
2144
2145 ierr = PetscMalloc1(msize,&optr);CHKERRQ(ierr);
2146 ierr = MPIU_Win_allocate_shared((nx+1)*sizeof(PetscInt),sizeof(PetscInt),MPI_INFO_NULL,mscomm,&mem,&swin);CHKERRQ(ierr);
2147
2148 /* Write local nx and inidx into mem */
2149 mem[0] = nx;
2150 for (i=1; i<=nx; i++) mem[i] = inidx[i-1];
2151 ierr = MPI_Barrier(mscomm);CHKERRQ(ierr); /* sync shared memory */
2152
2153 if (!mrank) {
2154 PetscBT table;
2155 /* sz and dsp_unit are not used. Replacing them with NULL would cause MPI_Win_shared_query() crash */
2156 MPI_Aint sz;
2157 PetscMPIInt dsp_unit;
2158 PetscInt N = xin->map->N;
2159
2160 ierr = PetscBTCreate(N,&table);CHKERRQ(ierr); /* may not be scalable */
2161 ierr = PetscBTMemzero(N,table);CHKERRQ(ierr);
2162
2163 jj = 0;
2164 while (jj<msize && !ctx->is_duplicate) {
2165 if (jj == mrank) {jj++; continue;}
2166 ierr = MPIU_Win_shared_query(swin,jj,&sz,&dsp_unit,&optr[jj]);CHKERRQ(ierr);
2167 for (j=1; j<=optr[jj][0]; j++) { /* optr[jj][0] = nx */
2168 if (PetscBTLookupSet(table,optr[jj][j])) {
2169 ctx->is_duplicate = PETSC_TRUE; /* only mrank=0 has this info., will be checked in VecScatterEndMPI3Node() */
2170 break;
2171 }
2172 }
2173 jj++;
2174 }
2175 ierr = PetscBTDestroy(&table);CHKERRQ(ierr);
2176 }
2177
2178 if (swin != MPI_WIN_NULL) {ierr = MPI_Win_free(&swin);CHKERRQ(ierr);}
2179 ierr = PetscFree(optr);CHKERRQ(ierr);
2180 }
2181
2182 owners = xin->map->range;
2183 ierr = VecGetSize(yin,&lengthy);CHKERRQ(ierr);
2184 ierr = VecGetSize(xin,&lengthx);CHKERRQ(ierr);
2185
2186 /* first count number of contributors to each processor */
2187 ierr = PetscMalloc2(size,&nprocs,nx,&owner);CHKERRQ(ierr);
2188 ierr = PetscArrayzero(nprocs,size);CHKERRQ(ierr);
2189
2190 j = 0;
2191 nsends = 0;
2192 for (i=0; i<nx; i++) {
2193 ierr = PetscIntMultError(bs,inidx[i],&idx);CHKERRQ(ierr);
2194 if (idx < owners[j]) j = 0;
2195 for (; j<size; j++) {
2196 if (idx < owners[j+1]) {
2197 if (!nprocs[j]++) nsends++;
2198 owner[i] = j;
2199 break;
2200 }
2201 }
2202 if (j == size) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_PLIB,"ith %D block entry %D not owned by any process, upper bound %D",i,idx,owners[size]);
2203 }
2204
2205 nprocslocal = nprocs[rank];
2206 nprocs[rank] = 0;
2207 if (nprocslocal) nsends--;
2208 /* inform other processors of number of messages and max length*/
2209 ierr = PetscGatherNumberOfMessages(comm,NULL,nprocs,&nrecvs);CHKERRQ(ierr);
2210 ierr = PetscGatherMessageLengths(comm,nsends,nrecvs,nprocs,&onodes1,&olengths1);CHKERRQ(ierr);
2211 ierr = PetscSortMPIIntWithArray(nrecvs,onodes1,olengths1);CHKERRQ(ierr);
2212 recvtotal = 0;
2213 for (i=0; i<nrecvs; i++) {
2214 ierr = PetscIntSumError(recvtotal,olengths1[i],&recvtotal);CHKERRQ(ierr);
2215 }
2216
2217 /* post receives: */
2218 ierr = PetscMalloc3(recvtotal,&rvalues,nrecvs,&source,nrecvs,&recv_waits);CHKERRQ(ierr);
2219 count = 0;
2220 for (i=0; i<nrecvs; i++) {
2221 ierr = MPI_Irecv((rvalues+count),olengths1[i],MPIU_INT,onodes1[i],tag,comm,recv_waits+i);CHKERRQ(ierr);
2222 count += olengths1[i];
2223 }
2224
2225 /* do sends:
2226 1) starts[i] gives the starting index in svalues for stuff going to
2227 the ith processor
2228 */
2229 nxr = 0;
2230 for (i=0; i<nx; i++) {
2231 if (owner[i] != rank) nxr++;
2232 }
2233 ierr = PetscMalloc3(nxr,&svalues,nsends,&send_waits,size+1,&starts);CHKERRQ(ierr);
2234
2235 starts[0] = 0;
2236 for (i=1; i<size; i++) starts[i] = starts[i-1] + nprocs[i-1];
2237 for (i=0; i<nx; i++) {
2238 if (owner[i] != rank) svalues[starts[owner[i]]++] = bs*inidx[i];
2239 }
2240 starts[0] = 0;
2241 for (i=1; i<size+1; i++) starts[i] = starts[i-1] + nprocs[i-1];
2242 count = 0;
2243 for (i=0; i<size; i++) {
2244 if (nprocs[i]) {
2245 ierr = MPI_Isend(svalues+starts[i],nprocs[i],MPIU_INT,i,tag,comm,send_waits+count++);CHKERRQ(ierr);
2246 }
2247 }
2248
2249 /* wait on receives; this is everyone who we need to deliver data to */
2250 count = nrecvs;
2251 slen = 0;
2252 nrecvshared = 0;
2253 slenshared = 0;
2254 while (count) {
2255 ierr = MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);CHKERRQ(ierr);
2256 /* unpack receives into our local space */
2257 ierr = MPI_Get_count(&recv_status,MPIU_INT,&n);CHKERRQ(ierr);
2258 ierr = PetscShmCommGlobalToLocal(scomm,onodes1[imdex],&jj);CHKERRQ(ierr);
2259 if (jj > -1) {
2260 ierr = PetscInfo3(NULL,"[%d] Sending values to shared memory partner %d,global rank %d\n",rank,jj,onodes1[imdex]);CHKERRQ(ierr);
2261 nrecvshared++;
2262 slenshared += n;
2263 }
2264 slen += n;
2265 count--;
2266 }
2267
2268 if (slen != recvtotal) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Total message lengths %D not expected %D",slen,recvtotal);
2269
2270 /* allocate entire send scatter context */
2271 ierr = PetscNewLog(ctx,&to);CHKERRQ(ierr);
2272 to->sharedwin = MPI_WIN_NULL;
2273 to->n = nrecvs-nrecvshared;
2274
2275 ierr = PetscMalloc1(nrecvs-nrecvshared,&to->requests);CHKERRQ(ierr);
2276 ierr = PetscMalloc4(bs*(slen-slenshared),&to->values,slen-slenshared,&to->indices,nrecvs-nrecvshared+1,&to->starts,nrecvs-nrecvshared,&to->procs);CHKERRQ(ierr);
2277 ierr = PetscMalloc2(PetscMax(to->n,nsends),&to->sstatus,PetscMax(to->n,nsends),&to->rstatus);CHKERRQ(ierr);
2278
2279 ierr = MPI_Comm_size(mscomm,&to->msize);CHKERRQ(ierr);
2280 ierr = PetscMalloc1(slenshared,&to->sharedspaceindices);CHKERRQ(ierr);
2281 ierr = PetscCalloc1(to->msize+1,&to->sharedspacestarts);CHKERRQ(ierr);
2282
2283 ctx->todata = (void*)to;
2284 to->starts[0] = 0;
2285 to->sharedspacestarts[0] = 0;
2286
2287 /* Allocate shared memory space for shared memory partner communication */
2288 ierr = MPIU_Win_allocate_shared(to->msize*sizeof(PetscInt),sizeof(PetscInt),info,mscomm,&sharedspaceoffset,&sharedoffsetwin);CHKERRQ(ierr);
2289 for (i=0; i<to->msize; i++) sharedspaceoffset[i] = -1; /* mark with -1 for later error checking */
2290 if (nrecvs) {
2291 /* move the data into the send scatter */
2292 base = owners[rank];
2293 rsvalues = rvalues;
2294 to->n = 0;
2295 for (i=0; i<nrecvs; i++) {
2296 values = rsvalues;
2297 ierr = PetscShmCommGlobalToLocal(scomm,(PetscMPIInt)onodes1[i],&jj);CHKERRQ(ierr);
2298 if (jj > -1) {
2299 to->sharedspacestarts[jj] = to->sharedcnt;
2300 to->sharedspacestarts[jj+1] = to->sharedcnt + olengths1[i];
2301 for (j=0; j<olengths1[i]; j++) to->sharedspaceindices[to->sharedcnt + j] = values[j] - base;
2302 sharedspaceoffset[jj] = to->sharedcnt;
2303 to->sharedcnt += olengths1[i];
2304 ierr = PetscInfo4(NULL,"[%d] Sending %d values to shared memory partner %d,global rank %d\n",rank,olengths1[i],jj,onodes1[i]);CHKERRQ(ierr);
2305 } else {
2306 to->starts[to->n+1] = to->starts[to->n] + olengths1[i];
2307 to->procs[to->n] = onodes1[i];
2308 for (j=0; j<olengths1[i]; j++) to->indices[to->starts[to->n] + j] = values[j] - base;
2309 to->n++;
2310 }
2311 rsvalues += olengths1[i];
2312 }
2313 }
2314 ierr = PetscFree(olengths1);CHKERRQ(ierr);
2315 ierr = PetscFree(onodes1);CHKERRQ(ierr);
2316 ierr = PetscFree3(rvalues,source,recv_waits);CHKERRQ(ierr);
2317
2318 if (mpi3node) {
2319 /* to->sharedspace is used only as a flag */
2320 i = 0;
2321 if (to->sharedcnt) i = 1;
2322 ierr = MPIU_Win_allocate_shared(i*sizeof(PetscScalar),sizeof(PetscScalar),info,mscomm,&to->sharedspace,&to->sharedwin);CHKERRQ(ierr);
2323 } else { /* mpi3 */
2324 ierr = MPIU_Win_allocate_shared(bs*to->sharedcnt*sizeof(PetscScalar),sizeof(PetscScalar),info,mscomm,&to->sharedspace,&to->sharedwin);CHKERRQ(ierr);
2325 }
2326 if (to->sharedwin == MPI_WIN_NULL) SETERRQ(PETSC_COMM_SELF,100,"what the");
2327 ierr = MPI_Info_free(&info);CHKERRQ(ierr);
2328
2329 /* allocate entire receive scatter context */
2330 ierr = PetscNewLog(ctx,&from);CHKERRQ(ierr);
2331 from->sharedwin = MPI_WIN_NULL;
2332 from->msize = to->msize;
2333 /* we don't actually know the correct value for from->n at this point so we use the upper bound */
2334 from->n = nsends;
2335
2336 ierr = PetscMalloc1(nsends,&from->requests);CHKERRQ(ierr);
2337 ierr = PetscMalloc4((ny-nprocslocal)*bs,&from->values,ny-nprocslocal,&from->indices,nsends+1,&from->starts,from->n,&from->procs);CHKERRQ(ierr);
2338 ctx->fromdata = (void*)from;
2339
2340 ierr = PetscCalloc1(to->msize+1,&from->sharedspacestarts);CHKERRQ(ierr);
2341 /* move data into receive scatter */
2342 ierr = PetscMalloc2(size,&lowner,nsends+1,&start);CHKERRQ(ierr);
2343 ierr = PetscMalloc2(size,&lsharedowner,to->msize+1,&sharedstart);CHKERRQ(ierr);
2344 count = 0; from->starts[0] = start[0] = 0;
2345 from->sharedcnt = 0;
2346 for (i=0; i<size; i++) {
2347 lsharedowner[i] = -1;
2348 lowner[i] = -1;
2349 if (nprocs[i]) {
2350 ierr = PetscShmCommGlobalToLocal(scomm,i,&jj);CHKERRQ(ierr);
2351 if (jj > -1) {
2352 from->sharedspacestarts[jj] = sharedstart[jj] = from->sharedcnt;
2353 from->sharedspacestarts[jj+1] = sharedstart[jj+1] = from->sharedspacestarts[jj] + nprocs[i];
2354 from->sharedcnt += nprocs[i];
2355 lsharedowner[i] = jj;
2356 } else {
2357 lowner[i] = count++;
2358 from->procs[count-1] = i;
2359 from->starts[count] = start[count] = start[count-1] + nprocs[i];
2360 }
2361 }
2362 }
2363 from->n = count;
2364
2365 for (i=0; i<nx; i++) {
2366 if (owner[i] != rank && lowner[owner[i]] > -1) {
2367 from->indices[start[lowner[owner[i]]]++] = bs*inidy[i];
2368 if (bs*inidy[i] >= lengthy) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Scattering past end of TO vector");
2369 }
2370 }
2371
2372 /* copy over appropriate parts of inidy[] into from->sharedspaceindices */
2373 if (mpi3node) {
2374 /* in addition, copy over appropriate parts of inidx[] into 2nd part of from->sharedspaceindices */
2375 PetscInt *sharedspaceindicesx;
2376 ierr = PetscMalloc1(2*from->sharedcnt,&from->sharedspaceindices);CHKERRQ(ierr);
2377 sharedspaceindicesx = from->sharedspaceindices + from->sharedcnt;
2378 for (i=0; i<nx; i++) {
2379 if (owner[i] != rank && lsharedowner[owner[i]] > -1) {
2380 if (sharedstart[lsharedowner[owner[i]]] > from->sharedcnt) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_MEM,"won");
2381
2382 from->sharedspaceindices[sharedstart[lsharedowner[owner[i]]]] = bs*inidy[i];
2383 sharedspaceindicesx[sharedstart[lsharedowner[owner[i]]]] = bs*inidx[i]-owners[owner[i]]; /* local inidx */
2384 sharedstart[lsharedowner[owner[i]]]++;
2385 }
2386 }
2387 } else { /* mpi3 */
2388 ierr = PetscMalloc1(from->sharedcnt,&from->sharedspaceindices);CHKERRQ(ierr);
2389 for (i=0; i<nx; i++) {
2390 if (owner[i] != rank && lsharedowner[owner[i]] > -1) {
2391 if (sharedstart[lsharedowner[owner[i]]] > from->sharedcnt) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_MEM,"won");
2392 from->sharedspaceindices[sharedstart[lsharedowner[owner[i]]]++] = bs*inidy[i];
2393 }
2394 }
2395 }
2396
2397 ierr = PetscFree2(lowner,start);CHKERRQ(ierr);
2398 ierr = PetscFree2(lsharedowner,sharedstart);CHKERRQ(ierr);
2399 ierr = PetscFree2(nprocs,owner);CHKERRQ(ierr);
2400
2401 /* wait on sends */
2402 if (nsends) {
2403 ierr = PetscMalloc1(nsends,&send_status);CHKERRQ(ierr);
2404 ierr = MPI_Waitall(nsends,send_waits,send_status);CHKERRQ(ierr);
2405 ierr = PetscFree(send_status);CHKERRQ(ierr);
2406 }
2407 ierr = PetscFree3(svalues,send_waits,starts);CHKERRQ(ierr);
2408
2409 if (nprocslocal) {
2410 PetscInt nt = from->local.n = to->local.n = nprocslocal;
2411 /* we have a scatter to ourselves */
2412 ierr = PetscMalloc1(nt,&to->local.vslots);CHKERRQ(ierr);
2413 ierr = PetscMalloc1(nt,&from->local.vslots);CHKERRQ(ierr);
2414 nt = 0;
2415 for (i=0; i<nx; i++) {
2416 idx = bs*inidx[i];
2417 if (idx >= owners[rank] && idx < owners[rank+1]) {
2418 to->local.vslots[nt] = idx - owners[rank];
2419 from->local.vslots[nt++] = bs*inidy[i];
2420 if (bs*inidy[i] >= lengthy) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Scattering past end of TO vector");
2421 }
2422 }
2423 ierr = PetscLogObjectMemory((PetscObject)ctx,2*nt*sizeof(PetscInt));CHKERRQ(ierr);
2424 } else {
2425 from->local.n = 0;
2426 from->local.vslots = NULL;
2427 to->local.n = 0;
2428 to->local.vslots = NULL;
2429 }
2430
2431 /* Get the shared memory address for all processes we will be copying data from */
2432 ierr = PetscCalloc1(to->msize,&from->sharedspaces);CHKERRQ(ierr);
2433 ierr = PetscCalloc1(to->msize,&from->sharedspacesoffset);CHKERRQ(ierr);
2434 ierr = MPI_Comm_rank(mscomm,&mrank);CHKERRQ(ierr);
2435 for (jj=0; jj<to->msize; jj++) {
2436 MPI_Aint isize;
2437 PetscMPIInt disp_unit;
2438 PetscInt *ptr;
2439 ierr = MPIU_Win_shared_query(to->sharedwin,jj,&isize,&disp_unit,&from->sharedspaces[jj]);CHKERRQ(ierr);
2440 ierr = MPIU_Win_shared_query(sharedoffsetwin,jj,&isize,&disp_unit,&ptr);CHKERRQ(ierr);
2441 from->sharedspacesoffset[jj] = ptr[mrank];
2442 }
2443 ierr = MPI_Win_free(&sharedoffsetwin);CHKERRQ(ierr);
2444
2445 if (mpi3node && !from->sharedspace) {
2446 /* comput from->notdone to be used by VecScatterEndMPI3Node() */
2447 PetscInt notdone = 0;
2448 for (i=0; i<from->msize; i++) {
2449 if (from->sharedspacesoffset && from->sharedspacesoffset[i] > -1) {
2450 notdone++;
2451 }
2452 }
2453 from->notdone = notdone;
2454 }
2455
2456 from->local.nonmatching_computed = PETSC_FALSE;
2457 from->local.n_nonmatching = 0;
2458 from->local.slots_nonmatching = NULL;
2459 to->local.nonmatching_computed = PETSC_FALSE;
2460 to->local.n_nonmatching = 0;
2461 to->local.slots_nonmatching = NULL;
2462
2463 from->format = VEC_SCATTER_MPI_GENERAL;
2464 to->format = VEC_SCATTER_MPI_GENERAL;
2465 from->bs = bs;
2466 to->bs = bs;
2467
2468 ierr = VecScatterCreateCommon_PtoS_MPI3(from,to,ctx);CHKERRQ(ierr);
2469 PetscFunctionReturn(0);
2470 }
2471
2472 /*
2473 bs indicates how many elements there are in each block. Normally this would be 1.
2474 */
VecScatterCreateCommon_PtoS_MPI3(VecScatter_MPI_General * from,VecScatter_MPI_General * to,VecScatter ctx)2475 PetscErrorCode VecScatterCreateCommon_PtoS_MPI3(VecScatter_MPI_General *from,VecScatter_MPI_General *to,VecScatter ctx)
2476 {
2477 MPI_Comm comm;
2478 PetscMPIInt tag = ((PetscObject)ctx)->tag, tagr;
2479 PetscInt bs = to->bs;
2480 PetscMPIInt size;
2481 PetscInt i, n;
2482 PetscErrorCode ierr;
2483 VecScatterType type;
2484 PetscBool mpi3node;
2485
2486 PetscFunctionBegin;
2487 ierr = PetscObjectGetComm((PetscObject)ctx,&comm);CHKERRQ(ierr);
2488 ierr = PetscObjectGetNewTag((PetscObject)ctx,&tagr);CHKERRQ(ierr);
2489 ctx->ops->destroy = VecScatterDestroy_PtoP_MPI3;
2490
2491 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
2492 /* check if the receives are ALL going into contiguous locations; if so can skip indexing */
2493 to->contiq = PETSC_FALSE;
2494 n = from->starts[from->n];
2495 from->contiq = PETSC_TRUE;
2496 for (i=1; i<n; i++) {
2497 if (from->indices[i] != from->indices[i-1] + bs) {
2498 from->contiq = PETSC_FALSE;
2499 break;
2500 }
2501 }
2502
2503 ctx->ops->copy = VecScatterCopy_PtoP_AllToAll;
2504 {
2505 PetscInt *sstarts = to->starts, *rstarts = from->starts;
2506 PetscMPIInt *sprocs = to->procs, *rprocs = from->procs;
2507 MPI_Request *swaits = to->requests,*rwaits = from->requests;
2508 MPI_Request *rev_swaits,*rev_rwaits;
2509 PetscScalar *Ssvalues = to->values, *Srvalues = from->values;
2510
2511 /* allocate additional wait variables for the "reverse" scatter */
2512 ierr = PetscMalloc1(to->n,&rev_rwaits);CHKERRQ(ierr);
2513 ierr = PetscMalloc1(from->n,&rev_swaits);CHKERRQ(ierr);
2514 to->rev_requests = rev_rwaits;
2515 from->rev_requests = rev_swaits;
2516
2517 for (i=0; i<from->n; i++) {
2518 ierr = MPI_Send_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tagr,comm,rev_swaits+i);CHKERRQ(ierr);
2519 }
2520
2521 for (i=0; i<to->n; i++) {
2522 ierr = MPI_Send_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tag,comm,swaits+i);CHKERRQ(ierr);
2523 }
2524 /* Register receives for scatter and reverse */
2525 for (i=0; i<from->n; i++) {
2526 ierr = MPI_Recv_init(Srvalues+bs*rstarts[i],bs*rstarts[i+1]-bs*rstarts[i],MPIU_SCALAR,rprocs[i],tag,comm,rwaits+i);CHKERRQ(ierr);
2527 }
2528 for (i=0; i<to->n; i++) {
2529 ierr = MPI_Recv_init(Ssvalues+bs*sstarts[i],bs*sstarts[i+1]-bs*sstarts[i],MPIU_SCALAR,sprocs[i],tagr,comm,rev_rwaits+i);CHKERRQ(ierr);
2530 }
2531 ctx->ops->copy = VecScatterCopy_PtoP_X;
2532 }
2533 ierr = PetscInfo1(ctx,"Using blocksize %D scatter\n",bs);CHKERRQ(ierr);
2534
2535 if (PetscDefined(USE_DEBUG)) {
2536 ierr = MPIU_Allreduce(&bs,&i,1,MPIU_INT,MPI_MIN,PetscObjectComm((PetscObject)ctx));CHKERRQ(ierr);
2537 ierr = MPIU_Allreduce(&bs,&n,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)ctx));CHKERRQ(ierr);
2538 if (bs!=i || bs!=n) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Blocks size %D != %D or %D",bs,i,n);
2539 }
2540
2541 ierr = VecScatterGetType(ctx,&type);CHKERRQ(ierr);
2542 ierr = PetscStrcmp(type,"mpi3node",&mpi3node);CHKERRQ(ierr);
2543
2544 if (mpi3node) {
2545 switch (bs) {
2546 case 12:
2547 ctx->ops->begin = VecScatterBeginMPI3Node_12;
2548 ctx->ops->end = VecScatterEndMPI3Node_12;
2549 break;
2550 case 11:
2551 ctx->ops->begin = VecScatterBeginMPI3Node_11;
2552 ctx->ops->end = VecScatterEndMPI3Node_11;
2553 break;
2554 case 10:
2555 ctx->ops->begin = VecScatterBeginMPI3Node_10;
2556 ctx->ops->end = VecScatterEndMPI3Node_10;
2557 break;
2558 case 9:
2559 ctx->ops->begin = VecScatterBeginMPI3Node_9;
2560 ctx->ops->end = VecScatterEndMPI3Node_9;
2561 break;
2562 case 8:
2563 ctx->ops->begin = VecScatterBeginMPI3Node_8;
2564 ctx->ops->end = VecScatterEndMPI3Node_8;
2565 break;
2566 case 7:
2567 ctx->ops->begin = VecScatterBeginMPI3Node_7;
2568 ctx->ops->end = VecScatterEndMPI3Node_7;
2569 break;
2570 case 6:
2571 ctx->ops->begin = VecScatterBeginMPI3Node_6;
2572 ctx->ops->end = VecScatterEndMPI3Node_6;
2573 break;
2574 case 5:
2575 ctx->ops->begin = VecScatterBeginMPI3Node_5;
2576 ctx->ops->end = VecScatterEndMPI3Node_5;
2577 break;
2578 case 4:
2579 ctx->ops->begin = VecScatterBeginMPI3Node_4;
2580 ctx->ops->end = VecScatterEndMPI3Node_4;
2581 break;
2582 case 3:
2583 ctx->ops->begin = VecScatterBeginMPI3Node_3;
2584 ctx->ops->end = VecScatterEndMPI3Node_3;
2585 break;
2586 case 2:
2587 ctx->ops->begin = VecScatterBeginMPI3Node_2;
2588 ctx->ops->end = VecScatterEndMPI3Node_2;
2589 break;
2590 case 1:
2591 ctx->ops->begin = VecScatterBeginMPI3Node_1;
2592 ctx->ops->end = VecScatterEndMPI3Node_1;
2593 break;
2594 default:
2595 ctx->ops->begin = VecScatterBegin_bs;
2596 ctx->ops->end = VecScatterEnd_bs;
2597 }
2598 } else { /* !mpi3node */
2599
2600 switch (bs) {
2601 case 12:
2602 ctx->ops->begin = VecScatterBegin_12;
2603 ctx->ops->end = VecScatterEnd_12;
2604 break;
2605 case 11:
2606 ctx->ops->begin = VecScatterBegin_11;
2607 ctx->ops->end = VecScatterEnd_11;
2608 break;
2609 case 10:
2610 ctx->ops->begin = VecScatterBegin_10;
2611 ctx->ops->end = VecScatterEnd_10;
2612 break;
2613 case 9:
2614 ctx->ops->begin = VecScatterBegin_9;
2615 ctx->ops->end = VecScatterEnd_9;
2616 break;
2617 case 8:
2618 ctx->ops->begin = VecScatterBegin_8;
2619 ctx->ops->end = VecScatterEnd_8;
2620 break;
2621 case 7:
2622 ctx->ops->begin = VecScatterBegin_7;
2623 ctx->ops->end = VecScatterEnd_7;
2624 break;
2625 case 6:
2626 ctx->ops->begin = VecScatterBegin_6;
2627 ctx->ops->end = VecScatterEnd_6;
2628 break;
2629 case 5:
2630 ctx->ops->begin = VecScatterBegin_5;
2631 ctx->ops->end = VecScatterEnd_5;
2632 break;
2633 case 4:
2634 ctx->ops->begin = VecScatterBegin_4;
2635 ctx->ops->end = VecScatterEnd_4;
2636 break;
2637 case 3:
2638 ctx->ops->begin = VecScatterBegin_3;
2639 ctx->ops->end = VecScatterEnd_3;
2640 break;
2641 case 2:
2642 ctx->ops->begin = VecScatterBegin_2;
2643 ctx->ops->end = VecScatterEnd_2;
2644 break;
2645 case 1:
2646 if (mpi3node) {
2647 ctx->ops->begin = VecScatterBeginMPI3Node_1;
2648 ctx->ops->end = VecScatterEndMPI3Node_1;
2649 } else {
2650 ctx->ops->begin = VecScatterBegin_1;
2651 ctx->ops->end = VecScatterEnd_1;
2652 }
2653 break;
2654 default:
2655 ctx->ops->begin = VecScatterBegin_bs;
2656 ctx->ops->end = VecScatterEnd_bs;
2657 }
2658 }
2659
2660 ctx->ops->view = VecScatterView_MPI;
2661
2662 /* try to optimize PtoP vecscatter with memcpy's */
2663 ierr = VecScatterMemcpyPlanCreate_PtoP(to,from);CHKERRQ(ierr);
2664 PetscFunctionReturn(0);
2665 }
2666
2667 /* ------------------------------------------------------------------------------------*/
2668 /*
2669 Scatter from local Seq vectors to a parallel vector.
2670 Reverses the order of the arguments, calls VecScatterCreateLocal_PtoS() then
2671 reverses the result.
2672 */
VecScatterCreateLocal_StoP_MPI3(PetscInt nx,const PetscInt * inidx,PetscInt ny,const PetscInt * inidy,Vec xin,Vec yin,PetscInt bs,VecScatter ctx)2673 PetscErrorCode VecScatterCreateLocal_StoP_MPI3(PetscInt nx,const PetscInt *inidx,PetscInt ny,const PetscInt *inidy,Vec xin,Vec yin,PetscInt bs,VecScatter ctx)
2674 {
2675 PetscErrorCode ierr;
2676 MPI_Request *waits;
2677 VecScatter_MPI_General *to,*from;
2678
2679 PetscFunctionBegin;
2680 ierr = VecScatterCreateLocal_PtoS_MPI3(ny,inidy,nx,inidx,yin,xin,bs,ctx);CHKERRQ(ierr);
2681 to = (VecScatter_MPI_General*)ctx->fromdata;
2682 from = (VecScatter_MPI_General*)ctx->todata;
2683 ctx->todata = (void*)to;
2684 ctx->fromdata = (void*)from;
2685 /* these two are special, they are ALWAYS stored in to struct */
2686 to->sstatus = from->sstatus;
2687 to->rstatus = from->rstatus;
2688
2689 from->sstatus = NULL;
2690 from->rstatus = NULL;
2691 waits = from->rev_requests;
2692 from->rev_requests = from->requests;
2693 from->requests = waits;
2694 waits = to->rev_requests;
2695 to->rev_requests = to->requests;
2696 to->requests = waits;
2697 PetscFunctionReturn(0);
2698 }
2699
2700 /* ---------------------------------------------------------------------------------*/
VecScatterCreateLocal_PtoP_MPI3(PetscInt nx,const PetscInt * inidx,PetscInt ny,const PetscInt * inidy,Vec xin,Vec yin,PetscInt bs,VecScatter ctx)2701 PetscErrorCode VecScatterCreateLocal_PtoP_MPI3(PetscInt nx,const PetscInt *inidx,PetscInt ny,const PetscInt *inidy,Vec xin,Vec yin,PetscInt bs,VecScatter ctx)
2702 {
2703 PetscErrorCode ierr;
2704 PetscMPIInt size,rank,tag,imdex,n;
2705 PetscInt *owners = xin->map->range;
2706 PetscMPIInt *nprocs = NULL;
2707 PetscInt i,j,idx,nsends,*local_inidx = NULL,*local_inidy = NULL;
2708 PetscMPIInt *owner = NULL;
2709 PetscInt *starts = NULL,count,slen;
2710 PetscInt *rvalues = NULL,*svalues = NULL,base,*values = NULL,*rsvalues,recvtotal,lastidx;
2711 PetscMPIInt *onodes1,*olengths1,nrecvs;
2712 MPI_Comm comm;
2713 MPI_Request *send_waits = NULL,*recv_waits = NULL;
2714 MPI_Status recv_status,*send_status = NULL;
2715 PetscBool duplicate = PETSC_FALSE;
2716 PetscBool found = PETSC_FALSE;
2717
2718 PetscFunctionBegin;
2719 ierr = PetscObjectGetNewTag((PetscObject)ctx,&tag);CHKERRQ(ierr);
2720 ierr = PetscObjectGetComm((PetscObject)xin,&comm);CHKERRQ(ierr);
2721 ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
2722 ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
2723 if (size == 1) {
2724 ierr = VecScatterCreateLocal_StoP_MPI3(nx,inidx,ny,inidy,xin,yin,bs,ctx);CHKERRQ(ierr);
2725 PetscFunctionReturn(0);
2726 }
2727
2728 /*
2729 Each processor ships off its inidx[j] and inidy[j] to the appropriate processor
2730 They then call the StoPScatterCreate()
2731 */
2732 /* first count number of contributors to each processor */
2733 ierr = PetscMalloc3(size,&nprocs,nx,&owner,(size+1),&starts);CHKERRQ(ierr);
2734 ierr = PetscArrayzero(nprocs,size);CHKERRQ(ierr);
2735
2736 lastidx = -1;
2737 j = 0;
2738 for (i=0; i<nx; i++) {
2739 /* if indices are NOT locally sorted, need to start search at the beginning */
2740 if (lastidx > (idx = bs*inidx[i])) j = 0;
2741 lastidx = idx;
2742 for (; j<size; j++) {
2743 if (idx >= owners[j] && idx < owners[j+1]) {
2744 nprocs[j]++;
2745 owner[i] = j;
2746 found = PETSC_TRUE;
2747 break;
2748 }
2749 }
2750 if (PetscUnlikelyDebug(!found)) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index %D out of range",idx);
2751 found = PETSC_FALSE;
2752 }
2753 nsends = 0;
2754 for (i=0; i<size; i++) nsends += (nprocs[i] > 0);
2755
2756 /* inform other processors of number of messages and max length*/
2757 ierr = PetscGatherNumberOfMessages(comm,NULL,nprocs,&nrecvs);CHKERRQ(ierr);
2758 ierr = PetscGatherMessageLengths(comm,nsends,nrecvs,nprocs,&onodes1,&olengths1);CHKERRQ(ierr);
2759 ierr = PetscSortMPIIntWithArray(nrecvs,onodes1,olengths1);CHKERRQ(ierr);
2760 recvtotal = 0;
2761 for (i=0; i<nrecvs; i++) {
2762 ierr = PetscIntSumError(recvtotal,olengths1[i],&recvtotal);CHKERRQ(ierr);
2763 }
2764
2765 /* post receives: */
2766 ierr = PetscMalloc5(2*recvtotal,&rvalues,2*nx,&svalues,nrecvs,&recv_waits,nsends,&send_waits,nsends,&send_status);CHKERRQ(ierr);
2767
2768 count = 0;
2769 for (i=0; i<nrecvs; i++) {
2770 ierr = MPI_Irecv((rvalues+2*count),2*olengths1[i],MPIU_INT,onodes1[i],tag,comm,recv_waits+i);CHKERRQ(ierr);
2771 ierr = PetscIntSumError(count,olengths1[i],&count);CHKERRQ(ierr);
2772 }
2773 ierr = PetscFree(onodes1);CHKERRQ(ierr);
2774
2775 /* do sends:
2776 1) starts[i] gives the starting index in svalues for stuff going to
2777 the ith processor
2778 */
2779 starts[0]= 0;
2780 for (i=1; i<size; i++) starts[i] = starts[i-1] + nprocs[i-1];
2781 for (i=0; i<nx; i++) {
2782 svalues[2*starts[owner[i]]] = bs*inidx[i];
2783 svalues[1 + 2*starts[owner[i]]++] = bs*inidy[i];
2784 }
2785
2786 starts[0] = 0;
2787 for (i=1; i<size+1; i++) starts[i] = starts[i-1] + nprocs[i-1];
2788 count = 0;
2789 for (i=0; i<size; i++) {
2790 if (nprocs[i]) {
2791 ierr = MPI_Isend(svalues+2*starts[i],2*nprocs[i],MPIU_INT,i,tag,comm,send_waits+count);CHKERRQ(ierr);
2792 count++;
2793 }
2794 }
2795 ierr = PetscFree3(nprocs,owner,starts);CHKERRQ(ierr);
2796
2797 /* wait on receives */
2798 count = nrecvs;
2799 slen = 0;
2800 while (count) {
2801 ierr = MPI_Waitany(nrecvs,recv_waits,&imdex,&recv_status);CHKERRQ(ierr);
2802 /* unpack receives into our local space */
2803 ierr = MPI_Get_count(&recv_status,MPIU_INT,&n);CHKERRQ(ierr);
2804 slen += n/2;
2805 count--;
2806 }
2807 if (slen != recvtotal) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Total message lengths %D not as expected %D",slen,recvtotal);
2808
2809 ierr = PetscMalloc2(slen,&local_inidx,slen,&local_inidy);CHKERRQ(ierr);
2810 base = owners[rank];
2811 count = 0;
2812 rsvalues = rvalues;
2813 for (i=0; i<nrecvs; i++) {
2814 values = rsvalues;
2815 rsvalues += 2*olengths1[i];
2816 for (j=0; j<olengths1[i]; j++) {
2817 local_inidx[count] = values[2*j] - base;
2818 local_inidy[count++] = values[2*j+1];
2819 }
2820 }
2821 ierr = PetscFree(olengths1);CHKERRQ(ierr);
2822
2823 /* wait on sends */
2824 if (nsends) {ierr = MPI_Waitall(nsends,send_waits,send_status);CHKERRQ(ierr);}
2825 ierr = PetscFree5(rvalues,svalues,recv_waits,send_waits,send_status);CHKERRQ(ierr);
2826
2827 /*
2828 should sort and remove duplicates from local_inidx,local_inidy
2829 */
2830 #if defined(do_it_slow)
2831 /* sort on the from index */
2832 ierr = PetscSortIntWithArray(slen,local_inidx,local_inidy);CHKERRQ(ierr);
2833 start = 0;
2834 while (start < slen) {
2835 count = start+1;
2836 last = local_inidx[start];
2837 while (count < slen && last == local_inidx[count]) count++;
2838 if (count > start + 1) { /* found 2 or more same local_inidx[] in a row */
2839 /* sort on to index */
2840 ierr = PetscSortInt(count-start,local_inidy+start);CHKERRQ(ierr);
2841 }
2842 /* remove duplicates; not most efficient way, but probably good enough */
2843 i = start;
2844 while (i < count-1) {
2845 if (local_inidy[i] != local_inidy[i+1]) i++;
2846 else { /* found a duplicate */
2847 duplicate = PETSC_TRUE;
2848 for (j=i; j<slen-1; j++) {
2849 local_inidx[j] = local_inidx[j+1];
2850 local_inidy[j] = local_inidy[j+1];
2851 }
2852 slen--;
2853 count--;
2854 }
2855 }
2856 start = count;
2857 }
2858 #endif
2859 if (duplicate) {
2860 ierr = PetscInfo(ctx,"Duplicate from to indices passed in VecScatterCreate(), they are ignored\n");CHKERRQ(ierr);
2861 }
2862 ierr = VecScatterCreateLocal_StoP_MPI3(slen,local_inidx,slen,local_inidy,xin,yin,bs,ctx);CHKERRQ(ierr);
2863 ierr = PetscFree2(local_inidx,local_inidy);CHKERRQ(ierr);
2864 PetscFunctionReturn(0);
2865 }
2866
VecScatterSetUp_MPI3(VecScatter ctx)2867 PetscErrorCode VecScatterSetUp_MPI3(VecScatter ctx)
2868 {
2869 PetscErrorCode ierr;
2870
2871 PetscFunctionBegin;
2872 ierr = VecScatterSetUp_vectype_private(ctx,VecScatterCreateLocal_PtoS_MPI3,VecScatterCreateLocal_StoP_MPI3,VecScatterCreateLocal_PtoP_MPI3);CHKERRQ(ierr);
2873 PetscFunctionReturn(0);
2874 }
2875
VecScatterCreate_MPI3(VecScatter ctx)2876 PetscErrorCode VecScatterCreate_MPI3(VecScatter ctx)
2877 {
2878 PetscErrorCode ierr;
2879
2880 PetscFunctionBegin;
2881 ctx->ops->setup = VecScatterSetUp_MPI3;
2882 ierr = PetscObjectChangeTypeName((PetscObject)ctx,VECSCATTERMPI3);CHKERRQ(ierr);
2883 ierr = PetscInfo(ctx,"Using MPI3 for vector scatter\n");CHKERRQ(ierr);
2884 PetscFunctionReturn(0);
2885 }
2886
VecScatterCreate_MPI3Node(VecScatter ctx)2887 PetscErrorCode VecScatterCreate_MPI3Node(VecScatter ctx)
2888 {
2889 PetscErrorCode ierr;
2890
2891 PetscFunctionBegin;
2892 ctx->ops->setup = VecScatterSetUp_MPI3;
2893 ierr = PetscObjectChangeTypeName((PetscObject)ctx,VECSCATTERMPI3NODE);CHKERRQ(ierr);
2894 ierr = PetscInfo(ctx,"Using MPI3NODE for vector scatter\n");CHKERRQ(ierr);
2895 PetscFunctionReturn(0);
2896 }
2897