1 /*
2 * Copyright (C) by Argonne National Laboratory
3 * See COPYRIGHT in top-level directory
4 */
5
6 #include "mpidrma.h"
7
8 extern MPIR_T_pvar_timer_t PVAR_TIMER_rma_rmaqueue_set ATTRIBUTE((unused));
9
10 /*
11 === BEGIN_MPI_T_CVAR_INFO_BLOCK ===
12
13 cvars:
14 - name : MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE
15 category : CH3
16 type : int
17 default : 65536
18 class : none
19 verbosity : MPI_T_VERBOSITY_USER_BASIC
20 scope : MPI_T_SCOPE_ALL_EQ
21 description : >-
22 Specify the threshold of data size of a RMA operation
23 which can be piggybacked with a LOCK message. It is
24 always a positive value and should not be smaller
25 than MPIDI_RMA_IMMED_BYTES.
26 If user sets it as a small value, for middle and large
27 data size, we will lose performance because of always
28 waiting for round-trip of LOCK synchronization; if
29 user sets it as a large value, we need to consume
30 more memory on target side to buffer this lock request
31 when lock is not satisfied.
32
33 === END_MPI_T_CVAR_INFO_BLOCK ===
34 */
35
/*
 * MPIDI_CH3I_Put - CH3 implementation of a one-sided PUT on a window.
 *
 * origin_addr / origin_count / origin_datatype describe the local source
 * buffer; target_rank / target_disp / target_count / target_datatype describe
 * the destination inside the window win_ptr.  ureq, when non-NULL, is a user
 * request attached to the operation.
 *
 * The call fails with MPI_ERR_RMA_SYNC when the window's access state is
 * MPIDI_RMA_NONE, and returns right away when the origin data size is zero.
 * When the target is the calling rank, the window flavor is
 * MPI_WIN_FLAVOR_SHARED, or shared memory is allocated on the window and both
 * VCs report the same node id, the data is moved by MPIDI_CH3I_Shm_put_op()
 * and ureq (if any) is completed immediately.  Otherwise an RMA operation
 * holding a PUT packet is filled in, enqueued on the window, and progress is
 * made towards the target.
 *
 * Returns MPI_SUCCESS or an MPI error code.
 */
int MPIDI_CH3I_Put(const void *origin_addr, int origin_count, MPI_Datatype
                   origin_datatype, int target_rank, MPI_Aint target_disp,
                   int target_count, MPI_Datatype target_datatype, MPIR_Win * win_ptr,
                   MPIR_Request * ureq)
{
    int mpi_errno = MPI_SUCCESS;
    int my_rank;
    int is_local;
    int dt_contig ATTRIBUTE((unused));
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
    MPIR_Datatype *dtp;
    intptr_t origin_bytes;
    MPIDI_VC_t *self_vc = NULL, *peer_vc = NULL;
    int progressed = 0;
    /* State used only when the operation has to be queued for a remote target */
    MPIDI_RMA_Op_t *rma_op = NULL;
    MPIDI_CH3_Pkt_put_t *pkt = NULL;
    int origin_predef, target_predef;
    int origin_is_contig, target_is_contig;
    int send_immed = FALSE;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3I_PUT);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH3I_PUT);

    /* An RMA call is only accepted while the window has an access state */
    MPIR_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, origin_bytes, dtp,
                            dt_true_lb);

    /* Nothing to move */
    if (origin_bytes == 0) {
        goto fn_exit;
    }

    my_rank = win_ptr->comm_ptr->rank;

    /* Self-targeted operations and MPI_WIN_FLAVOR_SHARED windows are always
     * served locally. */
    is_local = (target_rank == my_rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED);

    if (!is_local && win_ptr->shm_allocated == TRUE) {
        /* Shared memory is allocated on the window: the operation can be
         * performed directly on the shared region if the target lives on the
         * same node as the origin. */

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and
         * target are on the same node. However, in ch3:sock, even if origin and target are on
         * the same node, they are not within the same SHM region. Here we filter out ch3:sock
         * by checking the shm_allocated flag first, which is only set to TRUE when the SHM
         * region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same
         * "SHM comm".
         */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, my_rank, &self_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &peer_vc);
        is_local = (self_vc->node_id == peer_vc->node_id);
    }

    if (is_local) {
        /* The put is a local operation: do it here */
        mpi_errno = MPIDI_CH3I_Shm_put_op(origin_addr, origin_count, origin_datatype, target_rank,
                                          target_disp, target_count, target_datatype, win_ptr);
        MPIR_ERR_CHECK(mpi_errno);

        if (ureq) {
            /* Complete user request and release the ch3 ref */
            mpi_errno = MPID_Request_complete(ureq);
            MPIR_ERR_CHECK(mpi_errno);
        }

        goto fn_exit;
    }

    /* ---------------- Remote target: queue the operation ---------------- */

    mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &rma_op);
    MPIR_ERR_CHECK(mpi_errno);

    MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

    /******************** Setting operation struct areas ***********************/

    /* FIXME: For contig and very short operations, use a streamlined op */
    rma_op->origin_addr = (void *) origin_addr;
    rma_op->origin_count = origin_count;
    rma_op->origin_datatype = origin_datatype;
    rma_op->target_rank = target_rank;

    /* Remember user request */
    rma_op->ureq = ureq;

    origin_predef = MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) ? TRUE : FALSE;
    target_predef = MPIR_DATATYPE_IS_PREDEFINED(target_datatype) ? TRUE : FALSE;

    /* Derived datatypes get one extra reference each for the queued op */
    if (!origin_predef) {
        MPIR_Datatype_get_ptr(origin_datatype, dtp);
        MPIR_Datatype_ptr_add_ref(dtp);
    }
    if (!target_predef) {
        MPIR_Datatype_get_ptr(target_datatype, dtp);
        MPIR_Datatype_ptr_add_ref(dtp);
    }

    MPIR_Datatype_is_contig(origin_datatype, &origin_is_contig);
    MPIR_Datatype_is_contig(target_datatype, &target_is_contig);

    if (origin_predef && target_predef) {
        /* IMMED data packet: predefined + contiguous on both sides and the
         * payload is at most MPIDI_RMA_IMMED_BYTES */
        if (origin_is_contig && target_is_contig && origin_bytes <= MPIDI_RMA_IMMED_BYTES) {
            send_immed = TRUE;
        }

        /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
         * for both origin and target data. We should extend this optimization to derived
         * datatypes as well. */
        if (origin_bytes <= MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE) {
            rma_op->piggyback_lock_candidate = 1;
        }
    }

    /************** Setting packet struct areas in operation ****************/

    pkt = &(rma_op->pkt.put);

    if (send_immed) {
        MPIDI_Pkt_init(pkt, MPIDI_CH3_PKT_PUT_IMMED);
    } else {
        MPIDI_Pkt_init(pkt, MPIDI_CH3_PKT_PUT);
    }

    pkt->source_win_handle = win_ptr->handle;
    pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
    pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
        win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
    pkt->datatype = target_datatype;
    pkt->count = target_count;
    pkt->pkt_flags = MPIDI_CH3_PKT_FLAG_NONE;
    pkt->info.flattened_type_size = 0;

    if (send_immed) {
        /* NOTE(review): info.data presumably shares storage with
         * info.flattened_type_size, so this copy must stay after the store
         * above -- confirm against the packet definition. */
        mpi_errno = immed_copy((void *) origin_addr, (void *) &(pkt->info.data), origin_bytes);
        MPIR_ERR_CHECK(mpi_errno);
    }

    MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

    mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, rma_op);
    MPIR_ERR_CHECK(mpi_errno);

    mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &progressed);
    MPIR_ERR_CHECK(mpi_errno);

    /* With a non-negative threshold, keep driving the progress engine until
     * the number of active requests falls below it. */
    if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0) {
        while (MPIDI_CH3I_RMA_Active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            mpi_errno = wait_progress_engine();
            MPIR_ERR_CHECK(mpi_errno);
        }
    }

  fn_exit:
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH3I_PUT);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
196
/*
 * MPIDI_CH3I_Get - CH3 implementation of a one-sided GET on a window.
 *
 * origin_addr / origin_count / origin_datatype describe the local buffer
 * that receives the data; target_rank / target_disp / target_count /
 * target_datatype describe the source inside the window win_ptr.  ureq, when
 * non-NULL, is a user request attached to the operation.
 *
 * The call fails with MPI_ERR_RMA_SYNC when the window's access state is
 * MPIDI_RMA_NONE, and returns right away when the origin data size is zero.
 * When the target is the calling rank, the window flavor is
 * MPI_WIN_FLAVOR_SHARED, or shared memory is allocated on the window and both
 * VCs report the same node id, the data is fetched by MPIDI_CH3I_Shm_get_op()
 * and ureq (if any) is completed immediately.  Otherwise an RMA operation
 * holding a GET packet is filled in, enqueued on the window, and progress is
 * made towards the target.
 *
 * Returns MPI_SUCCESS or an MPI error code.
 */
int MPIDI_CH3I_Get(void *origin_addr, int origin_count, MPI_Datatype
                   origin_datatype, int target_rank, MPI_Aint target_disp,
                   int target_count, MPI_Datatype target_datatype, MPIR_Win * win_ptr,
                   MPIR_Request * ureq)
{
    int mpi_errno = MPI_SUCCESS;
    int my_rank;
    int is_local;
    int dt_contig ATTRIBUTE((unused));
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
    MPIR_Datatype *dtp;
    intptr_t origin_bytes, target_bytes;
    MPIDI_VC_t *self_vc = NULL, *peer_vc = NULL;
    int progressed = 0;
    /* State used only when the operation has to be queued for a remote target */
    MPIDI_RMA_Op_t *rma_op = NULL;
    MPIDI_CH3_Pkt_get_t *pkt = NULL;
    MPI_Aint target_type_size;
    int origin_predef, target_predef;
    int origin_is_contig, target_is_contig;
    int want_immed_resp = FALSE;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3I_GET);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH3I_GET);

    /* An RMA call is only accepted while the window has an access state */
    MPIR_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, origin_bytes, dtp,
                            dt_true_lb);

    /* Nothing to fetch */
    if (origin_bytes == 0) {
        goto fn_exit;
    }

    my_rank = win_ptr->comm_ptr->rank;

    /* Self-targeted operations and MPI_WIN_FLAVOR_SHARED windows are always
     * served locally. */
    is_local = (target_rank == my_rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED);

    if (!is_local && win_ptr->shm_allocated == TRUE) {
        /* Shared memory is allocated on the window: the operation can be
         * performed directly on the shared region if the target lives on the
         * same node as the origin. */

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and
         * target are on the same node. However, in ch3:sock, even if origin and target are on
         * the same node, they are not within the same SHM region. Here we filter out ch3:sock
         * by checking the shm_allocated flag first, which is only set to TRUE when the SHM
         * region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same
         * "SHM comm".
         */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, my_rank, &self_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &peer_vc);
        is_local = (self_vc->node_id == peer_vc->node_id);
    }

    if (is_local) {
        /* The get is a local operation: do it here */
        mpi_errno = MPIDI_CH3I_Shm_get_op(origin_addr, origin_count, origin_datatype, target_rank,
                                          target_disp, target_count, target_datatype, win_ptr);
        MPIR_ERR_CHECK(mpi_errno);

        if (ureq) {
            /* Complete user request and release the ch3 ref */
            mpi_errno = MPID_Request_complete(ureq);
            MPIR_ERR_CHECK(mpi_errno);
        }

        goto fn_exit;
    }

    /* ---------------- Remote target: queue the operation ---------------- */

    mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &rma_op);
    MPIR_ERR_CHECK(mpi_errno);

    MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

    /******************** Setting operation struct areas ***********************/

    /* FIXME: For contig and very short operations, use a streamlined op */
    rma_op->origin_addr = origin_addr;
    rma_op->origin_count = origin_count;
    rma_op->origin_datatype = origin_datatype;
    rma_op->target_rank = target_rank;

    /* Remember user request */
    rma_op->ureq = ureq;

    origin_predef = MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) ? TRUE : FALSE;
    target_predef = MPIR_DATATYPE_IS_PREDEFINED(target_datatype) ? TRUE : FALSE;

    /* Derived datatypes get one extra reference each for the queued op */
    if (!origin_predef) {
        MPIR_Datatype_get_ptr(origin_datatype, dtp);
        MPIR_Datatype_ptr_add_ref(dtp);
    }
    if (!target_predef) {
        MPIR_Datatype_get_ptr(target_datatype, dtp);
        MPIR_Datatype_ptr_add_ref(dtp);
    }

    MPIR_Datatype_is_contig(origin_datatype, &origin_is_contig);
    MPIR_Datatype_is_contig(target_datatype, &target_is_contig);

    /* Size of the data as laid out on the target side */
    MPIR_Datatype_get_size_macro(target_datatype, target_type_size);
    MPIR_Assign_trunc(target_bytes, target_count * target_type_size, intptr_t);

    if (origin_predef && target_predef) {
        /* IMMED data response packet: predefined + contiguous on both sides
         * and the target data is at most MPIDI_RMA_IMMED_BYTES */
        if (origin_is_contig && target_is_contig && target_bytes <= MPIDI_RMA_IMMED_BYTES) {
            want_immed_resp = TRUE;
        }

        /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
         * for both origin and target data. We should extend this optimization to derived
         * datatypes as well. */
        rma_op->piggyback_lock_candidate = 1;
    }

    /************** Setting packet struct areas in operation ****************/

    pkt = &(rma_op->pkt.get);
    MPIDI_Pkt_init(pkt, MPIDI_CH3_PKT_GET);

    pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
    pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
        win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
    pkt->datatype = target_datatype;
    pkt->count = target_count;
    pkt->info.flattened_type_size = 0;
    pkt->pkt_flags = MPIDI_CH3_PKT_FLAG_NONE;
    if (want_immed_resp) {
        pkt->pkt_flags |= MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP;
    }

    MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

    mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, rma_op);
    MPIR_ERR_CHECK(mpi_errno);

    mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &progressed);
    MPIR_ERR_CHECK(mpi_errno);

    /* With a non-negative threshold, keep driving the progress engine until
     * the number of active requests falls below it. */
    if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0) {
        while (MPIDI_CH3I_RMA_Active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            mpi_errno = wait_progress_engine();
            MPIR_ERR_CHECK(mpi_errno);
        }
    }

  fn_exit:
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH3I_GET);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
350
351
/*
 * MPIDI_CH3I_Accumulate - CH3 implementation of a one-sided ACCUMULATE.
 *
 * origin_addr / origin_count / origin_datatype describe the local operand
 * buffer; target_rank / target_disp / target_count / target_datatype describe
 * the destination inside the window win_ptr; op is the reduction operation
 * stored in the packet (or handed to the shared-memory routine).  ureq, when
 * non-NULL, is a user request attached to the operation.
 *
 * The call fails with MPI_ERR_RMA_SYNC when the window's access state is
 * MPIDI_RMA_NONE, and returns right away when the origin data size is zero.
 * When the target is the calling rank, the window flavor is
 * MPI_WIN_FLAVOR_SHARED, or shared memory is allocated on the window and both
 * VCs report the same node id, the work is done by MPIDI_CH3I_Shm_acc_op()
 * and ureq (if any) is completed immediately.  Otherwise an RMA operation
 * holding an ACCUMULATE packet is filled in, enqueued on the window, and
 * progress is made towards the target.
 *
 * Returns MPI_SUCCESS or an MPI error code.
 */
int MPIDI_CH3I_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype
                          origin_datatype, int target_rank, MPI_Aint target_disp,
                          int target_count, MPI_Datatype target_datatype, MPI_Op op,
                          MPIR_Win * win_ptr, MPIR_Request * ureq)
{
    int mpi_errno = MPI_SUCCESS;
    intptr_t data_sz;
    int dt_contig ATTRIBUTE((unused)), rank;
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
    MPIR_Datatype *dtp;
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
    int made_progress = 0;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3I_ACCUMULATE);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH3I_ACCUMULATE);

    /* An RMA call is only accepted while the window has an access state */
    MPIR_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);

    /* Nothing to accumulate */
    if (data_sz == 0) {
        goto fn_exit;
    }

    rank = win_ptr->comm_ptr->rank;

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        /* check if target is local and shared memory is allocated on window,
         * if so, we directly perform this operation on shared memory region. */

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and
         * target are on the same node. However, in ch3:sock, even if origin and target are on
         * the same node, they are not within the same SHM region. Here we filter out ch3:sock
         * by checking the shm_allocated flag first, which is only set to TRUE when the SHM
         * region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same
         * "SHM comm".
         */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

    /* If the accumulate is a local operation, do it here.  The VC pointers
     * are only dereferenced when they were looked up above: the first two
     * conditions short-circuit the cases in which the lookup was skipped. */
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_acc_op(origin_addr, origin_count, origin_datatype,
                                          target_rank, target_disp, target_count, target_datatype,
                                          op, win_ptr);
        MPIR_ERR_CHECK(mpi_errno);

        if (ureq) {
            /* Complete user request and release the ch3 ref */
            mpi_errno = MPID_Request_complete(ureq);
            MPIR_ERR_CHECK(mpi_errno);
        }
    }
    else {
        MPIDI_RMA_Op_t *op_ptr = NULL;
        MPIDI_CH3_Pkt_accum_t *accum_pkt = NULL;
        int use_immed_pkt = FALSE;
        int is_origin_contig, is_target_contig;
        MPI_Aint stream_elem_count, stream_unit_count;
        MPI_Aint predefined_dtp_size, predefined_dtp_count, predefined_dtp_extent;
        MPIR_Datatype *origin_dtp = NULL, *target_dtp = NULL;
        MPI_Aint i;             /* same width as stream_unit_count */

        /* queue it up */
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &op_ptr);
        MPIR_ERR_CHECK(mpi_errno);

        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

        /******************** Setting operation struct areas ***********************/

        op_ptr->origin_addr = (void *) origin_addr;
        op_ptr->origin_count = origin_count;
        op_ptr->origin_datatype = origin_datatype;
        op_ptr->target_rank = target_rank;

        /* Remember user request */
        op_ptr->ureq = ureq;

        /* Look up the datatype objects of derived datatypes; their reference
         * counts are incremented further below, once per stream unit. */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPIR_Datatype_get_ptr(origin_datatype, origin_dtp);
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPIR_Datatype_get_ptr(target_datatype, target_dtp);
        }

        /* Get size and count for predefined datatype elements */
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPIR_Datatype_get_size_macro(origin_datatype, predefined_dtp_size);
            predefined_dtp_count = origin_count;
            MPIR_Datatype_get_extent_macro(origin_datatype, predefined_dtp_extent);
        }
        else {
            MPIR_Assert(origin_dtp->basic_type != MPI_DATATYPE_NULL);
            MPIR_Datatype_get_size_macro(origin_dtp->basic_type, predefined_dtp_size);
            /* Validate the divisor before it is used */
            MPIR_Assert(predefined_dtp_size > 0);
            predefined_dtp_count = data_sz / predefined_dtp_size;
            MPIR_Datatype_get_extent_macro(origin_dtp->basic_type, predefined_dtp_extent);
        }
        MPIR_Assert(predefined_dtp_count > 0 && predefined_dtp_size > 0 &&
                    predefined_dtp_extent > 0);

        /* Calculate number of predefined elements in each stream unit, and
         * total number of stream units.  Each value is checked before it is
         * used as a divisor. */
        stream_elem_count = MPIDI_CH3U_Acc_stream_size / predefined_dtp_extent;
        MPIR_Assert(stream_elem_count > 0);
        stream_unit_count = (predefined_dtp_count - 1) / stream_elem_count + 1;
        MPIR_Assert(stream_unit_count > 0);

        /* Derived datatypes take one reference per stream unit */
        for (i = 0; i < stream_unit_count; i++) {
            if (origin_dtp != NULL) {
                MPIR_Datatype_ptr_add_ref(origin_dtp);
            }
            if (target_dtp != NULL) {
                MPIR_Datatype_ptr_add_ref(target_dtp);
            }
        }

        MPIR_Datatype_is_contig(origin_datatype, &is_origin_contig);
        MPIR_Datatype_is_contig(target_datatype, &is_target_contig);

        /* Judge if we can use IMMED data packet */
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype) && is_origin_contig && is_target_contig) {
            if (data_sz <= MPIDI_RMA_IMMED_BYTES)
                use_immed_pkt = TRUE;
        }

        /* Judge if this operation is a piggyback candidate. */
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
             * for both origin and target data. We should extend this optimization to derived
             * datatypes as well. */
            if (data_sz <= MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE)
                op_ptr->piggyback_lock_candidate = 1;
        }

        /************** Setting packet struct areas in operation ****************/

        accum_pkt = &(op_ptr->pkt.accum);

        if (use_immed_pkt) {
            MPIDI_Pkt_init(accum_pkt, MPIDI_CH3_PKT_ACCUMULATE_IMMED);
        }
        else {
            MPIDI_Pkt_init(accum_pkt, MPIDI_CH3_PKT_ACCUMULATE);
        }

        accum_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
        accum_pkt->count = target_count;
        accum_pkt->datatype = target_datatype;
        accum_pkt->info.flattened_type_size = 0;
        accum_pkt->op = op;
        accum_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
        accum_pkt->source_win_handle = win_ptr->handle;
        accum_pkt->pkt_flags = MPIDI_CH3_PKT_FLAG_NONE;
        if (use_immed_pkt) {
            /* Small payload travels inside the packet itself */
            void *src = (void *) origin_addr, *dest = (void *) &(accum_pkt->info.data);
            mpi_errno = immed_copy(src, dest, data_sz);
            MPIR_ERR_CHECK(mpi_errno);
        }

        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, op_ptr);
        MPIR_ERR_CHECK(mpi_errno);

        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
        MPIR_ERR_CHECK(mpi_errno);

        /* With a non-negative threshold, keep driving the progress engine
         * until the number of active requests falls below it. */
        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            MPIDI_CH3I_RMA_Active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (MPIDI_CH3I_RMA_Active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                mpi_errno = wait_progress_engine();
                MPIR_ERR_CHECK(mpi_errno);
            }
        }
    }

  fn_exit:
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH3I_ACCUMULATE);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
545
546
/*
 * MPIDI_CH3I_Get_accumulate - CH3 implementation of a one-sided
 * GET_ACCUMULATE.
 *
 * origin_addr / origin_count / origin_datatype describe the local operand
 * buffer; result_addr / result_count / result_datatype describe the local
 * buffer recorded as the result buffer of the operation; target_rank /
 * target_disp / target_count / target_datatype describe the location inside
 * the window win_ptr; op is the reduction operation.  ureq, when non-NULL, is
 * a user request attached to the operation.
 *
 * The call fails with MPI_ERR_RMA_SYNC when the window's access state is
 * MPIDI_RMA_NONE, and returns right away when the target data size is zero.
 * When the target is the calling rank, the window flavor is
 * MPI_WIN_FLAVOR_SHARED, or shared memory is allocated on the window and both
 * VCs report the same node id, the work is done by
 * MPIDI_CH3I_Shm_get_acc_op() and ureq (if any) is completed immediately.
 * Otherwise an RMA operation holding a GET_ACCUM packet is filled in,
 * enqueued on the window, and progress is made towards the target.
 *
 * With op == MPI_NO_OP the origin buffer is treated as empty: its datatype is
 * not inspected, its size is taken as 0 and it is considered contiguous.
 *
 * Returns MPI_SUCCESS or an MPI error code.
 */
int MPIDI_CH3I_Get_accumulate(const void *origin_addr, int origin_count,
                              MPI_Datatype origin_datatype, void *result_addr, int result_count,
                              MPI_Datatype result_datatype, int target_rank, MPI_Aint target_disp,
                              int target_count, MPI_Datatype target_datatype, MPI_Op op,
                              MPIR_Win * win_ptr, MPIR_Request * ureq)
{
    int mpi_errno = MPI_SUCCESS;
    intptr_t orig_data_sz, target_data_sz;
    int rank;
    int dt_contig ATTRIBUTE((unused));
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
    MPIR_Datatype *dtp;
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
    int made_progress = 0;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3I_GET_ACCUMULATE);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH3I_GET_ACCUMULATE);

    /* An RMA call is only accepted while the window has an access state */
    MPIR_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

    MPIDI_Datatype_get_info(target_count, target_datatype, dt_contig, target_data_sz, dtp,
                            dt_true_lb);

    /* Nothing to do on an empty target region */
    if (target_data_sz == 0) {
        goto fn_exit;
    }

    rank = win_ptr->comm_ptr->rank;

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        /* check if target is local and shared memory is allocated on window,
         * if so, we directly perform this operation on shared memory region. */

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and
         * target are on the same node. However, in ch3:sock, even if origin and target are on
         * the same node, they are not within the same SHM region. Here we filter out ch3:sock
         * by checking the shm_allocated flag first, which is only set to TRUE when the SHM
         * region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same
         * "SHM comm".
         */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

    /* If the get-accumulate is a local operation, do it here.  The VC
     * pointers are only dereferenced when they were looked up above: the
     * first two conditions short-circuit the cases in which the lookup was
     * skipped. */
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_get_acc_op(origin_addr, origin_count, origin_datatype,
                                              result_addr, result_count, result_datatype,
                                              target_rank, target_disp, target_count,
                                              target_datatype, op, win_ptr);
        MPIR_ERR_CHECK(mpi_errno);

        if (ureq) {
            /* Complete user request and release the ch3 ref */
            mpi_errno = MPID_Request_complete(ureq);
            MPIR_ERR_CHECK(mpi_errno);
        }
    }
    else {
        MPIDI_RMA_Op_t *op_ptr = NULL;
        MPIDI_CH3_Pkt_get_accum_t *get_accum_pkt;
        MPI_Aint origin_type_size;
        MPI_Aint target_type_size;
        int use_immed_pkt = FALSE;
        MPI_Aint i;             /* same width as stream_unit_count */
        int is_origin_contig, is_target_contig, is_result_contig;
        MPI_Aint stream_elem_count, stream_unit_count;
        MPI_Aint predefined_dtp_size, predefined_dtp_count, predefined_dtp_extent;
        MPIR_Datatype *origin_dtp = NULL, *target_dtp = NULL, *result_dtp = NULL;
        int is_empty_origin = FALSE;

        /* Judge if origin buffer is empty */
        if (op == MPI_NO_OP)
            is_empty_origin = TRUE;

        /* Append the operation to the window's RMA ops queue */
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &op_ptr);
        MPIR_ERR_CHECK(mpi_errno);

        /* TODO: Can we use the MPIDI_RMA_ACC_CONTIG optimization? */

        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

        /******************** Setting operation struct areas ***********************/

        op_ptr->origin_addr = (void *) origin_addr;
        op_ptr->origin_count = origin_count;
        op_ptr->origin_datatype = origin_datatype;
        op_ptr->result_addr = result_addr;
        op_ptr->result_count = result_count;
        op_ptr->result_datatype = result_datatype;
        op_ptr->target_rank = target_rank;

        /* Remember user request */
        op_ptr->ureq = ureq;

        /* Look up the datatype objects of derived datatypes; their reference
         * counts are incremented further below, once per stream unit.  The
         * origin datatype is skipped when the origin buffer is empty. */
        if (is_empty_origin == FALSE && !MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPIR_Datatype_get_ptr(origin_datatype, origin_dtp);
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(result_datatype)) {
            MPIR_Datatype_get_ptr(result_datatype, result_dtp);
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPIR_Datatype_get_ptr(target_datatype, target_dtp);
        }

        if (is_empty_origin == FALSE) {
            MPIR_Datatype_get_size_macro(origin_datatype, origin_type_size);
            MPIR_Assign_trunc(orig_data_sz, origin_count * origin_type_size, intptr_t);
        }
        else {
            /* If origin buffer is empty, set origin data size to 0 */
            orig_data_sz = 0;
        }

        MPIR_Datatype_get_size_macro(target_datatype, target_type_size);

        /* Get size and count for predefined datatype elements */
        if (MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            predefined_dtp_size = target_type_size;
            predefined_dtp_count = target_count;
            MPIR_Datatype_get_extent_macro(target_datatype, predefined_dtp_extent);
        }
        else {
            MPIR_Assert(target_dtp->basic_type != MPI_DATATYPE_NULL);
            MPIR_Datatype_get_size_macro(target_dtp->basic_type, predefined_dtp_size);
            /* Validate the divisor before it is used */
            MPIR_Assert(predefined_dtp_size > 0);
            predefined_dtp_count = target_data_sz / predefined_dtp_size;
            MPIR_Datatype_get_extent_macro(target_dtp->basic_type, predefined_dtp_extent);
        }
        MPIR_Assert(predefined_dtp_count > 0 && predefined_dtp_size > 0 &&
                    predefined_dtp_extent > 0);

        /* Calculate number of predefined elements in each stream unit, and
         * total number of stream units.  Each value is checked before it is
         * used as a divisor. */
        stream_elem_count = MPIDI_CH3U_Acc_stream_size / predefined_dtp_extent;
        MPIR_Assert(stream_elem_count > 0);
        stream_unit_count = (predefined_dtp_count - 1) / stream_elem_count + 1;
        MPIR_Assert(stream_unit_count > 0);

        /* Derived datatypes take one reference per stream unit */
        for (i = 0; i < stream_unit_count; i++) {
            if (origin_dtp != NULL) {
                MPIR_Datatype_ptr_add_ref(origin_dtp);
            }
            if (target_dtp != NULL) {
                MPIR_Datatype_ptr_add_ref(target_dtp);
            }
            if (result_dtp != NULL) {
                MPIR_Datatype_ptr_add_ref(result_dtp);
            }
        }

        if (is_empty_origin == FALSE) {
            MPIR_Datatype_is_contig(origin_datatype, &is_origin_contig);
        }
        else {
            /* If origin buffer is empty, mark origin data as contig data */
            is_origin_contig = 1;
        }
        MPIR_Datatype_is_contig(target_datatype, &is_target_contig);
        MPIR_Datatype_is_contig(result_datatype, &is_result_contig);

        /* Judge if we can use IMMED data packet */
        if ((is_empty_origin == TRUE || MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) &&
            MPIR_DATATYPE_IS_PREDEFINED(result_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype) &&
            is_origin_contig && is_target_contig && is_result_contig) {
            if (target_data_sz <= MPIDI_RMA_IMMED_BYTES)
                use_immed_pkt = TRUE;
        }

        /* Judge if this operation is a piggyback candidate */
        if ((is_empty_origin == TRUE || MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) &&
            MPIR_DATATYPE_IS_PREDEFINED(result_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
             * for origin, target and result data. We should extend this optimization to derived
             * datatypes as well. */
            if (orig_data_sz <= MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE)
                op_ptr->piggyback_lock_candidate = 1;
        }

        /************** Setting packet struct areas in operation ****************/

        get_accum_pkt = &(op_ptr->pkt.get_accum);

        if (use_immed_pkt) {
            MPIDI_Pkt_init(get_accum_pkt, MPIDI_CH3_PKT_GET_ACCUM_IMMED);
        }
        else {
            MPIDI_Pkt_init(get_accum_pkt, MPIDI_CH3_PKT_GET_ACCUM);
        }

        get_accum_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
        get_accum_pkt->count = target_count;
        get_accum_pkt->datatype = target_datatype;
        get_accum_pkt->info.flattened_type_size = 0;
        get_accum_pkt->op = op;
        get_accum_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
        get_accum_pkt->pkt_flags = MPIDI_CH3_PKT_FLAG_NONE;
        if (use_immed_pkt) {
            /* Small origin payload travels inside the packet itself */
            void *src = (void *) origin_addr, *dest = (void *) &(get_accum_pkt->info.data);
            mpi_errno = immed_copy(src, dest, orig_data_sz);
            MPIR_ERR_CHECK(mpi_errno);
        }

        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, op_ptr);
        MPIR_ERR_CHECK(mpi_errno);

        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
        MPIR_ERR_CHECK(mpi_errno);

        /* With a non-negative threshold, keep driving the progress engine
         * until the number of active requests falls below it. */
        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            MPIDI_CH3I_RMA_Active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (MPIDI_CH3I_RMA_Active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                mpi_errno = wait_progress_engine();
                MPIR_ERR_CHECK(mpi_errno);
            }
        }
    }

  fn_exit:
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH3I_GET_ACCUMULATE);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
781
782
/*
 * MPID_Put - device-level entry point for a PUT on a window.
 *
 * Forwards every argument unchanged to MPIDI_CH3I_Put(), passing NULL for the
 * user-request parameter, and returns that routine's error code.
 */
int MPID_Put(const void *origin_addr, int origin_count, MPI_Datatype
             origin_datatype, int target_rank, MPI_Aint target_disp,
             int target_count, MPI_Datatype target_datatype, MPIR_Win * win_ptr)
{
    int mpi_errno;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_PUT);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPID_PUT);

    /* No user request is associated with this call */
    mpi_errno = MPIDI_CH3I_Put(origin_addr, origin_count, origin_datatype,
                               target_rank, target_disp, target_count, target_datatype,
                               win_ptr, NULL);

    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPID_PUT);
    return mpi_errno;
}
805
/*
 * MPID_Get - device-level entry point for a GET on a window.
 *
 * Forwards every argument unchanged to MPIDI_CH3I_Get(), passing NULL for the
 * user-request parameter, and returns that routine's error code.
 */
int MPID_Get(void *origin_addr, int origin_count, MPI_Datatype
             origin_datatype, int target_rank, MPI_Aint target_disp,
             int target_count, MPI_Datatype target_datatype, MPIR_Win * win_ptr)
{
    int mpi_errno;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_GET);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPID_GET);

    /* No user request is associated with this call */
    mpi_errno = MPIDI_CH3I_Get(origin_addr, origin_count, origin_datatype,
                               target_rank, target_disp, target_count, target_datatype,
                               win_ptr, NULL);

    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPID_GET);
    return mpi_errno;
}
828
/*
 * MPID_Accumulate - device entry point for an accumulate that has no user
 * request.
 *
 * Every argument is handed unchanged to MPIDI_CH3I_Accumulate; NULL is
 * supplied as that routine's last argument (presumably the user request,
 * as in MPIDI_CH3I_Put).
 *
 * Returns the error code produced by MPIDI_CH3I_Accumulate.
 */
int MPID_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype
                    origin_datatype, int target_rank, MPI_Aint target_disp,
                    int target_count, MPI_Datatype target_datatype, MPI_Op op,
                    MPIR_Win * win_ptr)
{
    int mpi_errno = MPI_SUCCESS;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_ACCUMULATE);
    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPID_ACCUMULATE);

    /* The callee's status is returned as-is, so no error jump is needed. */
    mpi_errno = MPIDI_CH3I_Accumulate(origin_addr, origin_count, origin_datatype,
                                      target_rank, target_disp,
                                      target_count, target_datatype,
                                      op, win_ptr, NULL);

    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPID_ACCUMULATE);
    return mpi_errno;
}
851
/*
 * MPID_Get_accumulate - device entry point for a get-accumulate that has no
 * user request.
 *
 * Every argument is handed unchanged to MPIDI_CH3I_Get_accumulate; NULL is
 * supplied as that routine's last argument (presumably the user request,
 * as in MPIDI_CH3I_Put).
 *
 * Returns the error code produced by MPIDI_CH3I_Get_accumulate.
 */
int MPID_Get_accumulate(const void *origin_addr, int origin_count,
                        MPI_Datatype origin_datatype, void *result_addr, int result_count,
                        MPI_Datatype result_datatype, int target_rank, MPI_Aint target_disp,
                        int target_count, MPI_Datatype target_datatype, MPI_Op op,
                        MPIR_Win * win_ptr)
{
    int mpi_errno = MPI_SUCCESS;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_GET_ACCUMULATE);
    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPID_GET_ACCUMULATE);

    /* The callee's status is returned as-is, so no error jump is needed. */
    mpi_errno = MPIDI_CH3I_Get_accumulate(origin_addr, origin_count, origin_datatype,
                                          result_addr, result_count, result_datatype,
                                          target_rank, target_disp,
                                          target_count, target_datatype,
                                          op, win_ptr, NULL);

    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPID_GET_ACCUMULATE);
    return mpi_errno;
}
877
878
/*
 * MPID_Compare_and_swap - device-level compare-and-swap on a window.
 *
 * Fails with MPI_ERR_RMA_SYNC ("**rmasync") when the window's access state
 * is MPIDI_RMA_NONE.
 *
 * The operation is carried out through MPIDI_CH3I_Shm_cas_op when the target
 * is the calling rank, when the window flavor is MPI_WIN_FLAVOR_SHARED, or
 * when the window has shared memory allocated and origin and target report
 * the same node id.  Otherwise a CAS_IMMED packet operation carrying the
 * origin and compare values is built, enqueued on the window, and progress
 * is driven towards the target.
 *
 * Returns MPI_SUCCESS or the error code of the first failing step.
 */
int MPID_Compare_and_swap(const void *origin_addr, const void *compare_addr,
                          void *result_addr, MPI_Datatype datatype, int target_rank,
                          MPI_Aint target_disp, MPIR_Win * win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    int my_rank;
    int same_node = 0;
    int progressed = 0;
    MPI_Aint elem_size;
    MPIDI_VC_t *my_vc = NULL, *peer_vc = NULL;
    MPIDI_RMA_Op_t *rma_op = NULL;
    MPIDI_CH3_Pkt_cas_t *cas = NULL;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_COMPARE_AND_SWAP);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPID_COMPARE_AND_SWAP);

    MPIR_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

    my_rank = win_ptr->comm_ptr->rank;

    /* Only look at the VCs when the answer can matter: a remote target on a
     * non-SHARED-flavor window that nevertheless has shared memory allocated.
     *
     * FIXME: Locality is decided by comparing node ids.  In ch3:sock two
     * processes on the same node are not in the same SHM region; ch3:sock is
     * filtered out here by testing shm_allocated first, which is only set to
     * TRUE when an SHM region is allocated in nemesis.  In the future we need
     * a way to check whether origin and target are in the same "SHM comm". */
    if (win_ptr->shm_allocated == TRUE && target_rank != my_rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, my_rank, &my_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &peer_vc);
        same_node = (my_vc->node_id == peer_vc->node_id);
    }

    /* The datatype must be predefined, and one of: C integer, Fortran integer,
     * Logical, Multi-language types, or Byte.  That is validated above the
     * ADI, so it is not re-checked here. */

    /* FIXME: For shared memory windows, we should provide an implementation
     * that uses a processor atomic operation. */
    if (target_rank == my_rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED || same_node) {
        /* Target memory is directly reachable: operate on it in place. */
        mpi_errno = MPIDI_CH3I_Shm_cas_op(origin_addr, compare_addr, result_addr,
                                          datatype, target_rank, target_disp, win_ptr);
        MPIR_ERR_CHECK(mpi_errno);
        goto fn_exit;
    }

    /* ---- Remote target: build an RMA operation and queue it ---- */

    mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &rma_op);
    MPIR_ERR_CHECK(mpi_errno);

    MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

    /* Operation bookkeeping: a single element of 'datatype' everywhere. */
    rma_op->origin_addr = (void *) origin_addr;
    rma_op->origin_count = 1;
    rma_op->origin_datatype = datatype;
    rma_op->result_addr = result_addr;
    rma_op->result_datatype = datatype;
    rma_op->compare_addr = (void *) compare_addr;
    rma_op->compare_datatype = datatype;
    rma_op->target_rank = target_rank;
    /* CAS is always able to piggyback LOCK */
    rma_op->piggyback_lock_candidate = 1;

    /* Packet header embedded in the operation. */
    cas = &(rma_op->pkt.cas);
    MPIDI_Pkt_init(cas, MPIDI_CH3_PKT_CAS_IMMED);
    /* Target address = target's window base + displacement scaled by the
     * target's displacement unit. */
    cas->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
        win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
    cas->datatype = datatype;
    cas->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
    cas->pkt_flags = MPIDI_CH3_PKT_FLAG_NONE;

    /* REQUIRE: All datatype arguments must be of the same, builtin
     * type and counts must be 1, so one element must fit in the packet. */
    MPIR_Datatype_get_size_macro(datatype, elem_size);
    MPIR_Assert(elem_size <= sizeof(MPIDI_CH3_CAS_Immed_u));

    /* Both the new value and the comparand travel inside the packet. */
    mpi_errno = immed_copy((void *) origin_addr, (void *) (&(cas->origin_data)), elem_size);
    MPIR_ERR_CHECK(mpi_errno);

    mpi_errno = immed_copy((void *) compare_addr, (void *) (&(cas->compare_data)), elem_size);
    MPIR_ERR_CHECK(mpi_errno);

    MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

    mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, rma_op);
    MPIR_ERR_CHECK(mpi_errno);

    mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &progressed);
    MPIR_ERR_CHECK(mpi_errno);

    /* Throttle: a non-negative threshold caps the number of active requests;
     * poke the progress engine until the count falls below it. */
    if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0) {
        while (MPIDI_CH3I_RMA_Active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            mpi_errno = wait_progress_engine();
            MPIR_ERR_CHECK(mpi_errno);
        }
    }

  fn_exit:
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPID_COMPARE_AND_SWAP);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
996
997
/*
 * MPID_Fetch_and_op - device-level fetch-and-op on a window.
 *
 * Fails with MPI_ERR_RMA_SYNC ("**rmasync") when the window's access state
 * is MPIDI_RMA_NONE.
 *
 * The operation is carried out through MPIDI_CH3I_Shm_fop_op when the target
 * is the calling rank, when the window flavor is MPI_WIN_FLAVOR_SHARED, or
 * when the window has shared memory allocated and origin and target report
 * the same node id.  Otherwise an FOP packet operation is built (the IMMED
 * variant, with the origin value copied into the packet, when the datatype
 * is contiguous and no larger than MPIDI_RMA_IMMED_BYTES), enqueued on the
 * window, and progress is driven towards the target.
 *
 * Returns MPI_SUCCESS or the error code of the first failing step.
 */
int MPID_Fetch_and_op(const void *origin_addr, void *result_addr,
                      MPI_Datatype datatype, int target_rank,
                      MPI_Aint target_disp, MPI_Op op, MPIR_Win * win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    int my_rank;
    int same_node = 0;
    int progressed = 0;
    int contiguous;
    int send_immed;
    MPI_Aint elem_size;
    MPIDI_VC_t *my_vc = NULL, *peer_vc = NULL;
    MPIDI_RMA_Op_t *rma_op = NULL;
    MPIDI_CH3_Pkt_fop_t *fop;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_FETCH_AND_OP);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPID_FETCH_AND_OP);

    MPIR_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

    my_rank = win_ptr->comm_ptr->rank;

    /* Only look at the VCs when the answer can matter: a remote target on a
     * non-SHARED-flavor window that nevertheless has shared memory allocated.
     *
     * FIXME: Locality is decided by comparing node ids.  In ch3:sock two
     * processes on the same node are not in the same SHM region; ch3:sock is
     * filtered out here by testing shm_allocated first, which is only set to
     * TRUE when an SHM region is allocated in nemesis.  In the future we need
     * a way to check whether origin and target are in the same "SHM comm". */
    if (win_ptr->shm_allocated == TRUE && target_rank != my_rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, my_rank, &my_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &peer_vc);
        same_node = (my_vc->node_id == peer_vc->node_id);
    }

    /* The datatype and op must be predefined.  That is validated above the
     * ADI, so it is not re-checked here. */

    /* FIXME: For shared memory windows, we should provide an implementation
     * that uses a processor atomic operation. */
    if (target_rank == my_rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED || same_node) {
        /* Target memory is directly reachable: operate on it in place. */
        mpi_errno = MPIDI_CH3I_Shm_fop_op(origin_addr, result_addr, datatype,
                                          target_rank, target_disp, op, win_ptr);
        MPIR_ERR_CHECK(mpi_errno);
        goto fn_exit;
    }

    /* ---- Remote target: build an RMA operation and queue it ---- */

    mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &rma_op);
    MPIR_ERR_CHECK(mpi_errno);

    MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

    /* Operation bookkeeping: a single element of 'datatype'. */
    rma_op->origin_addr = (void *) origin_addr;
    rma_op->origin_count = 1;
    rma_op->origin_datatype = datatype;
    rma_op->result_addr = result_addr;
    rma_op->result_datatype = datatype;
    rma_op->target_rank = target_rank;
    rma_op->piggyback_lock_candidate = 1;

    /* One element must fit in the packet's immediate-data union. */
    MPIR_Datatype_get_size_macro(datatype, elem_size);
    MPIR_Assert(elem_size <= sizeof(MPIDI_CH3_FOP_Immed_u));

    MPIR_Datatype_is_contig(datatype, &contiguous);

    /* The IMMED packet is usable only for contiguous data that is small
     * enough to be carried inside the packet itself. */
    send_immed = (contiguous && elem_size <= MPIDI_RMA_IMMED_BYTES);

    /* Packet header embedded in the operation. */
    fop = &(rma_op->pkt.fop);
    MPIDI_Pkt_init(fop, send_immed ? MPIDI_CH3_PKT_FOP_IMMED : MPIDI_CH3_PKT_FOP);

    /* Target address = target's window base + displacement scaled by the
     * target's displacement unit. */
    fop->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
        win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
    fop->datatype = datatype;
    fop->op = op;
    fop->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
    fop->pkt_flags = MPIDI_CH3_PKT_FLAG_NONE;

    if (send_immed) {
        /* Ship the origin value inside the packet. */
        mpi_errno = immed_copy((void *) origin_addr, (void *) &(fop->info.data), elem_size);
        MPIR_ERR_CHECK(mpi_errno);
    }

    MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

    mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, rma_op);
    MPIR_ERR_CHECK(mpi_errno);

    mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &progressed);
    MPIR_ERR_CHECK(mpi_errno);

    /* Throttle: a non-negative threshold caps the number of active requests;
     * poke the progress engine until the count falls below it. */
    if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0) {
        while (MPIDI_CH3I_RMA_Active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            mpi_errno = wait_progress_engine();
            MPIR_ERR_CHECK(mpi_errno);
        }
    }

  fn_exit:
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPID_FETCH_AND_OP);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
1124