1 /*
2  * Copyright (C) by Argonne National Laboratory
3  *     See COPYRIGHT in top-level directory
4  */
5 
6 #include "mpidrma.h"
7 
8 extern MPIR_T_pvar_timer_t PVAR_TIMER_rma_rmaqueue_set ATTRIBUTE((unused));
9 
10 /*
11 === BEGIN_MPI_T_CVAR_INFO_BLOCK ===
12 
13 cvars:
14     - name        : MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE
15       category    : CH3
16       type        : int
17       default     : 65536
18       class       : none
19       verbosity   : MPI_T_VERBOSITY_USER_BASIC
20       scope       : MPI_T_SCOPE_ALL_EQ
21       description : >-
22           Specify the threshold of data size of a RMA operation
23           which can be piggybacked with a LOCK message. It is
24           always a positive value and should not be smaller
25           than MPIDI_RMA_IMMED_BYTES.
26           If user sets it as a small value, for middle and large
27           data size, we will lose performance because of always
28           waiting for round-trip of LOCK synchronization; if
29           user sets it as a large value, we need to consume
30           more memory on target side to buffer this lock request
31           when lock is not satisfied.
32 
33 === END_MPI_T_CVAR_INFO_BLOCK ===
34 */
35 
/*
 * MPIDI_CH3I_Put - issue a PUT from an origin buffer to a target window.
 *
 * Parameters:
 *   origin_addr, origin_count, origin_datatype
 *                  - description of the origin buffer
 *   target_rank    - target rank, interpreted within win_ptr->comm_ptr
 *   target_disp    - displacement at the target; multiplied by the target's
 *                    disp_unit and added to its base address
 *   target_count, target_datatype
 *                  - description of the data layout at the target
 *   win_ptr        - window the operation is issued on
 *   ureq           - optional user request (may be NULL).  On the local
 *                    path it is completed here; otherwise it is stored in
 *                    the queued operation.
 *
 * Returns MPI_SUCCESS, or an error code.  An MPI_ERR_RMA_SYNC error is raised
 * when the window's access state is MPIDI_RMA_NONE.  A zero-byte origin
 * buffer makes the call return immediately without doing anything.
 */
int MPIDI_CH3I_Put(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype,
                   int target_rank, MPI_Aint target_disp, int target_count,
                   MPI_Datatype target_datatype, MPIR_Win * win_ptr, MPIR_Request * ureq)
{
    int mpi_errno = MPI_SUCCESS;
    int my_rank;
    int handle_locally;
    int progressed = 0;
    int origin_is_contig ATTRIBUTE((unused));
    MPI_Aint origin_true_lb ATTRIBUTE((unused));
    intptr_t nbytes;
    MPIR_Datatype *dt_ptr;
    MPIDI_VC_t *self_vc = NULL, *peer_vc = NULL;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3I_PUT);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH3I_PUT);

    /* Reject the call when the window reports no access state. */
    MPIR_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

    /* Only the byte size of the origin data is consumed from this query. */
    MPIDI_Datatype_get_info(origin_count, origin_datatype, origin_is_contig, nbytes, dt_ptr,
                            origin_true_lb);

    /* Nothing to transfer. */
    if (nbytes == 0)
        goto fn_exit;

    my_rank = win_ptr->comm_ptr->rank;

    /* Self-targeted operations and MPI_WIN_FLAVOR_SHARED windows are always
     * served directly. */
    handle_locally = (target_rank == my_rank ||
                      win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED);

    if (!handle_locally && win_ptr->shm_allocated == TRUE) {
        /* The window has a shared memory region: the operation can be applied
         * directly to it if origin and target live on the same node.
         *
         * FIXME: "same node" is used as a stand-in for "same SHM region".
         * In ch3:sock two processes on one node do not share an SHM region;
         * that case is excluded because shm_allocated is only set to TRUE
         * when nemesis allocated the region.  Eventually we need a real test
         * for origin and target being in the same "SHM comm". */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, my_rank, &self_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &peer_vc);
        handle_locally = (self_vc->node_id == peer_vc->node_id);
    }

    if (handle_locally) {
        /* Local put: perform it right away. */
        mpi_errno = MPIDI_CH3I_Shm_put_op(origin_addr, origin_count, origin_datatype, target_rank,
                                          target_disp, target_count, target_datatype, win_ptr);
        MPIR_ERR_CHECK(mpi_errno);

        if (ureq != NULL) {
            /* Complete user request and release the ch3 ref */
            mpi_errno = MPID_Request_complete(ureq);
            MPIR_ERR_CHECK(mpi_errno);
        }
    }
    else {
        MPIDI_RMA_Op_t *rma_op = NULL;
        MPIDI_CH3_Pkt_put_t *pkt = NULL;
        int origin_contig, target_contig;
        int both_predefined;
        int send_immed = FALSE;

        /* Obtain an operation element to queue. */
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &rma_op);
        MPIR_ERR_CHECK(mpi_errno);

        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

        /* ---------------- fill in the operation element ---------------- */

        /* FIXME: For contig and very short operations, use a streamlined op */
        rma_op->origin_addr = (void *) origin_addr;
        rma_op->origin_count = origin_count;
        rma_op->origin_datatype = origin_datatype;
        rma_op->target_rank = target_rank;

        /* Keep the user request with the operation. */
        rma_op->ureq = ureq;

        /* Derived datatypes get an extra reference while the op is queued. */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPIR_Datatype_get_ptr(origin_datatype, dt_ptr);
            MPIR_Datatype_ptr_add_ref(dt_ptr);
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPIR_Datatype_get_ptr(target_datatype, dt_ptr);
            MPIR_Datatype_ptr_add_ref(dt_ptr);
        }

        MPIR_Datatype_is_contig(origin_datatype, &origin_contig);
        MPIR_Datatype_is_contig(target_datatype, &target_contig);

        both_predefined = MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype);

        /* The IMMED packet is usable only for predefined, contiguous data
         * that is no larger than MPIDI_RMA_IMMED_BYTES. */
        if (both_predefined && origin_contig && target_contig &&
            nbytes <= MPIDI_RMA_IMMED_BYTES) {
            send_immed = TRUE;
        }

        /* Lock piggybacking.
         * FIXME: currently the LOCK flag is only piggybacked on operations
         * whose origin and target datatypes are both predefined.  This
         * optimization should be extended to derived datatypes as well. */
        if (both_predefined && nbytes <= MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE) {
            rma_op->piggyback_lock_candidate = 1;
        }

        /* ------------- fill in the packet stored in the op ------------- */

        pkt = &(rma_op->pkt.put);

        if (send_immed) {
            MPIDI_Pkt_init(pkt, MPIDI_CH3_PKT_PUT_IMMED);
        }
        else {
            MPIDI_Pkt_init(pkt, MPIDI_CH3_PKT_PUT);
        }

        pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
        pkt->count = target_count;
        pkt->datatype = target_datatype;
        pkt->info.flattened_type_size = 0;
        pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
        pkt->source_win_handle = win_ptr->handle;
        pkt->pkt_flags = MPIDI_CH3_PKT_FLAG_NONE;

        if (send_immed) {
            /* Payload travels inside the packet header itself. */
            void *payload_src = (void *) origin_addr;
            void *payload_dst = (void *) &(pkt->info.data);

            mpi_errno = immed_copy(payload_src, payload_dst, nbytes);
            MPIR_ERR_CHECK(mpi_errno);
        }

        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, rma_op);
        MPIR_ERR_CHECK(mpi_errno);

        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &progressed);
        MPIR_ERR_CHECK(mpi_errno);

        /* With a non-negative threshold configured, poll the progress engine
         * until the active request count is below it. */
        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0) {
            while (MPIDI_CH3I_RMA_Active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                mpi_errno = wait_progress_engine();
                MPIR_ERR_CHECK(mpi_errno);
            }
        }
    }

  fn_exit:
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH3I_PUT);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
196 
/*
 * MPIDI_CH3I_Get - issue a GET from a target window into an origin buffer.
 *
 * Parameters:
 *   origin_addr, origin_count, origin_datatype
 *                  - description of the origin (receiving) buffer
 *   target_rank    - target rank, interpreted within win_ptr->comm_ptr
 *   target_disp    - displacement at the target; multiplied by the target's
 *                    disp_unit and added to its base address
 *   target_count, target_datatype
 *                  - description of the data layout at the target
 *   win_ptr        - window the operation is issued on
 *   ureq           - optional user request (may be NULL).  On the local
 *                    path it is completed here; otherwise it is stored in
 *                    the queued operation.
 *
 * Returns MPI_SUCCESS, or an error code.  An MPI_ERR_RMA_SYNC error is raised
 * when the window's access state is MPIDI_RMA_NONE.  A zero-byte origin
 * buffer makes the call return immediately without doing anything.
 */
int MPIDI_CH3I_Get(void *origin_addr, int origin_count, MPI_Datatype origin_datatype,
                   int target_rank, MPI_Aint target_disp, int target_count,
                   MPI_Datatype target_datatype, MPIR_Win * win_ptr, MPIR_Request * ureq)
{
    int mpi_errno = MPI_SUCCESS;
    int my_rank;
    int handle_locally;
    int progressed = 0;
    int origin_is_contig ATTRIBUTE((unused));
    MPI_Aint origin_true_lb ATTRIBUTE((unused));
    intptr_t origin_bytes;
    MPIR_Datatype *dt_ptr;
    MPIDI_VC_t *self_vc = NULL, *peer_vc = NULL;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3I_GET);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH3I_GET);

    /* Reject the call when the window reports no access state. */
    MPIR_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

    /* Only the byte size of the origin data is consumed from this query. */
    MPIDI_Datatype_get_info(origin_count, origin_datatype, origin_is_contig, origin_bytes, dt_ptr,
                            origin_true_lb);

    /* Nothing to transfer. */
    if (origin_bytes == 0)
        goto fn_exit;

    my_rank = win_ptr->comm_ptr->rank;

    /* Self-targeted operations and MPI_WIN_FLAVOR_SHARED windows are always
     * served directly. */
    handle_locally = (target_rank == my_rank ||
                      win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED);

    if (!handle_locally && win_ptr->shm_allocated == TRUE) {
        /* The window has a shared memory region: the operation can be applied
         * directly to it if origin and target live on the same node.
         *
         * FIXME: "same node" is used as a stand-in for "same SHM region".
         * In ch3:sock two processes on one node do not share an SHM region;
         * that case is excluded because shm_allocated is only set to TRUE
         * when nemesis allocated the region.  Eventually we need a real test
         * for origin and target being in the same "SHM comm". */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, my_rank, &self_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &peer_vc);
        handle_locally = (self_vc->node_id == peer_vc->node_id);
    }

    if (handle_locally) {
        /* Local get: perform it right away. */
        mpi_errno = MPIDI_CH3I_Shm_get_op(origin_addr, origin_count, origin_datatype, target_rank,
                                          target_disp, target_count, target_datatype, win_ptr);
        MPIR_ERR_CHECK(mpi_errno);

        if (ureq != NULL) {
            /* Complete user request and release the ch3 ref */
            mpi_errno = MPID_Request_complete(ureq);
            MPIR_ERR_CHECK(mpi_errno);
        }
    }
    else {
        MPIDI_RMA_Op_t *rma_op = NULL;
        MPIDI_CH3_Pkt_get_t *pkt = NULL;
        MPI_Aint target_elem_size;
        intptr_t target_bytes;
        int origin_contig, target_contig;
        int both_predefined;
        int resp_immed = FALSE;

        /* Obtain an operation element to queue. */
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &rma_op);
        MPIR_ERR_CHECK(mpi_errno);

        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

        /* ---------------- fill in the operation element ---------------- */

        /* FIXME: For contig and very short operations, use a streamlined op */
        rma_op->origin_addr = origin_addr;
        rma_op->origin_count = origin_count;
        rma_op->origin_datatype = origin_datatype;
        rma_op->target_rank = target_rank;

        /* Keep the user request with the operation. */
        rma_op->ureq = ureq;

        /* Derived datatypes get an extra reference while the op is queued. */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPIR_Datatype_get_ptr(origin_datatype, dt_ptr);
            MPIR_Datatype_ptr_add_ref(dt_ptr);
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPIR_Datatype_get_ptr(target_datatype, dt_ptr);
            MPIR_Datatype_ptr_add_ref(dt_ptr);
        }

        MPIR_Datatype_is_contig(origin_datatype, &origin_contig);
        MPIR_Datatype_is_contig(target_datatype, &target_contig);

        /* Byte size of the data described at the target. */
        MPIR_Datatype_get_size_macro(target_datatype, target_elem_size);
        MPIR_Assign_trunc(target_bytes, target_count * target_elem_size, intptr_t);

        both_predefined = MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype);

        /* An IMMED response is requested only for predefined, contiguous
         * data that is no larger than MPIDI_RMA_IMMED_BYTES. */
        if (both_predefined && origin_contig && target_contig &&
            target_bytes <= MPIDI_RMA_IMMED_BYTES) {
            resp_immed = TRUE;
        }

        /* Lock piggybacking.
         * FIXME: currently the LOCK flag is only piggybacked on operations
         * whose origin and target datatypes are both predefined.  This
         * optimization should be extended to derived datatypes as well. */
        if (both_predefined) {
            rma_op->piggyback_lock_candidate = 1;
        }

        /* ------------- fill in the packet stored in the op ------------- */

        pkt = &(rma_op->pkt.get);
        MPIDI_Pkt_init(pkt, MPIDI_CH3_PKT_GET);
        pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
        pkt->count = target_count;
        pkt->datatype = target_datatype;
        pkt->info.flattened_type_size = 0;
        pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
        pkt->pkt_flags = MPIDI_CH3_PKT_FLAG_NONE;
        if (resp_immed) {
            pkt->pkt_flags |= MPIDI_CH3_PKT_FLAG_RMA_IMMED_RESP;
        }

        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, rma_op);
        MPIR_ERR_CHECK(mpi_errno);

        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &progressed);
        MPIR_ERR_CHECK(mpi_errno);

        /* With a non-negative threshold configured, poll the progress engine
         * until the active request count is below it. */
        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0) {
            while (MPIDI_CH3I_RMA_Active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                mpi_errno = wait_progress_engine();
                MPIR_ERR_CHECK(mpi_errno);
            }
        }
    }

  fn_exit:
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH3I_GET);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
350 
351 
/*
 * MPIDI_CH3I_Accumulate - issue an ACCUMULATE of origin data onto a target
 * window using reduction operation 'op'.
 *
 * Parameters:
 *   origin_addr, origin_count, origin_datatype
 *                  - description of the origin buffer
 *   target_rank    - target rank, interpreted within win_ptr->comm_ptr
 *   target_disp    - displacement at the target; multiplied by the target's
 *                    disp_unit and added to its base address
 *   target_count, target_datatype
 *                  - description of the data layout at the target
 *   op             - reduction operation, forwarded in the packet
 *   win_ptr        - window the operation is issued on
 *   ureq           - optional user request (may be NULL).  On the local
 *                    path it is completed here; otherwise it is stored in
 *                    the queued operation.
 *
 * Returns MPI_SUCCESS, or an error code.  An MPI_ERR_RMA_SYNC error is raised
 * when the window's access state is MPIDI_RMA_NONE.  A zero-byte origin
 * buffer makes the call return immediately without doing anything.
 */
int MPIDI_CH3I_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype
                          origin_datatype, int target_rank, MPI_Aint target_disp,
                          int target_count, MPI_Datatype target_datatype, MPI_Op op,
                          MPIR_Win * win_ptr, MPIR_Request * ureq)
{
    int mpi_errno = MPI_SUCCESS;
    intptr_t data_sz;
    int dt_contig ATTRIBUTE((unused)), rank;
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
    MPIR_Datatype *dtp;
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
    int made_progress = 0;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3I_ACCUMULATE);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH3I_ACCUMULATE);

    /* Reject the call when the window reports no access state. */
    MPIR_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

    /* Only the byte size of the origin data is consumed from this query. */
    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);

    if (data_sz == 0) {
        goto fn_exit;
    }

    rank = win_ptr->comm_ptr->rank;

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        /* check if target is local and shared memory is allocated on window,
         * if so, we directly perform this operation on shared memory region. */

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they are
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

    /* Local case first: the target is this process, the window is of the
     * shared flavor, or (with an SHM region allocated) origin and target are
     * on the same node.  The VC pointers are only dereferenced when the block
     * above has filled them in, thanks to short-circuit evaluation. */
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_acc_op(origin_addr, origin_count, origin_datatype,
                                          target_rank, target_disp, target_count, target_datatype,
                                          op, win_ptr);
        MPIR_ERR_CHECK(mpi_errno);

        if (ureq) {
            /* Complete user request and release the ch3 ref */
            mpi_errno = MPID_Request_complete(ureq);
            MPIR_ERR_CHECK(mpi_errno);
        }
    }
    else {
        MPIDI_RMA_Op_t *op_ptr = NULL;
        MPIDI_CH3_Pkt_accum_t *accum_pkt = NULL;
        int use_immed_pkt = FALSE;
        int is_origin_contig, is_target_contig;
        MPI_Aint stream_elem_count, stream_unit_count;
        MPI_Aint predefined_dtp_size, predefined_dtp_count, predefined_dtp_extent;
        MPIR_Datatype *origin_dtp = NULL, *target_dtp = NULL;
        /* Same type as stream_unit_count so the loop bound comparison cannot
         * overflow a narrower counter. */
        MPI_Aint i;

        /* queue it up */
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &op_ptr);
        MPIR_ERR_CHECK(mpi_errno);

        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

        /******************** Setting operation struct areas ***********************/

        op_ptr->origin_addr = (void *) origin_addr;
        op_ptr->origin_count = origin_count;
        op_ptr->origin_datatype = origin_datatype;
        op_ptr->target_rank = target_rank;

        /* Remember user request */
        op_ptr->ureq = ureq;

        /* Look up the datatype objects of derived datatypes; their reference
         * counts are incremented below, once per stream unit. */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPIR_Datatype_get_ptr(origin_datatype, origin_dtp);
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPIR_Datatype_get_ptr(target_datatype, target_dtp);
        }

        /* Get size and count for predefined datatype elements */
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPIR_Datatype_get_size_macro(origin_datatype, predefined_dtp_size);
            predefined_dtp_count = origin_count;
            MPIR_Datatype_get_extent_macro(origin_datatype, predefined_dtp_extent);
        }
        else {
            MPIR_Assert(origin_dtp->basic_type != MPI_DATATYPE_NULL);
            MPIR_Datatype_get_size_macro(origin_dtp->basic_type, predefined_dtp_size);
            /* Validate the divisor before it is used. */
            MPIR_Assert(predefined_dtp_size > 0);
            predefined_dtp_count = data_sz / predefined_dtp_size;
            MPIR_Datatype_get_extent_macro(origin_dtp->basic_type, predefined_dtp_extent);
        }
        MPIR_Assert(predefined_dtp_count > 0 && predefined_dtp_size > 0 &&
                    predefined_dtp_extent > 0);

        /* Calculate number of predefined elements in each stream unit, and
         * total number of stream units. */
        stream_elem_count = MPIDI_CH3U_Acc_stream_size / predefined_dtp_extent;
        /* stream_elem_count is the divisor on the next line, so it has to be
         * checked before, not after, the division. */
        MPIR_Assert(stream_elem_count > 0);
        stream_unit_count = (predefined_dtp_count - 1) / stream_elem_count + 1;
        MPIR_Assert(stream_unit_count > 0);

        /* Take one reference on each derived datatype per stream unit.
         * NOTE(review): presumably each stream unit releases one reference
         * when it completes -- confirm against the issuing code. */
        for (i = 0; i < stream_unit_count; i++) {
            if (origin_dtp != NULL) {
                MPIR_Datatype_ptr_add_ref(origin_dtp);
            }
            if (target_dtp != NULL) {
                MPIR_Datatype_ptr_add_ref(target_dtp);
            }
        }

        MPIR_Datatype_is_contig(origin_datatype, &is_origin_contig);
        MPIR_Datatype_is_contig(target_datatype, &is_target_contig);

        /* Judge if we can use IMMED data packet */
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype) && is_origin_contig && is_target_contig) {
            if (data_sz <= MPIDI_RMA_IMMED_BYTES)
                use_immed_pkt = TRUE;
        }

        /* Judge if this operation is a piggyback candidate. */
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
             * for both origin and target data. We should extend this optimization to derived
             * datatypes as well. */
            if (data_sz <= MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE)
                op_ptr->piggyback_lock_candidate = 1;
        }

        /************** Setting packet struct areas in operation ****************/

        accum_pkt = &(op_ptr->pkt.accum);

        if (use_immed_pkt) {
            MPIDI_Pkt_init(accum_pkt, MPIDI_CH3_PKT_ACCUMULATE_IMMED);
        }
        else {
            MPIDI_Pkt_init(accum_pkt, MPIDI_CH3_PKT_ACCUMULATE);
        }

        accum_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
        accum_pkt->count = target_count;
        accum_pkt->datatype = target_datatype;
        accum_pkt->info.flattened_type_size = 0;
        accum_pkt->op = op;
        accum_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
        accum_pkt->source_win_handle = win_ptr->handle;
        accum_pkt->pkt_flags = MPIDI_CH3_PKT_FLAG_NONE;
        if (use_immed_pkt) {
            /* Payload travels inside the packet header itself. */
            void *src = (void *) origin_addr, *dest = (void *) &(accum_pkt->info.data);
            mpi_errno = immed_copy(src, dest, data_sz);
            MPIR_ERR_CHECK(mpi_errno);
        }

        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, op_ptr);
        MPIR_ERR_CHECK(mpi_errno);

        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
        MPIR_ERR_CHECK(mpi_errno);

        /* With a non-negative threshold configured, poll the progress engine
         * until the active request count is below it. */
        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            MPIDI_CH3I_RMA_Active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (MPIDI_CH3I_RMA_Active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                mpi_errno = wait_progress_engine();
                MPIR_ERR_CHECK(mpi_errno);
            }
        }
    }

  fn_exit:
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH3I_ACCUMULATE);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
545 
546 
MPIDI_CH3I_Get_accumulate(const void * origin_addr,int origin_count,MPI_Datatype origin_datatype,void * result_addr,int result_count,MPI_Datatype result_datatype,int target_rank,MPI_Aint target_disp,int target_count,MPI_Datatype target_datatype,MPI_Op op,MPIR_Win * win_ptr,MPIR_Request * ureq)547 int MPIDI_CH3I_Get_accumulate(const void *origin_addr, int origin_count,
548                               MPI_Datatype origin_datatype, void *result_addr, int result_count,
549                               MPI_Datatype result_datatype, int target_rank, MPI_Aint target_disp,
550                               int target_count, MPI_Datatype target_datatype, MPI_Op op,
551                               MPIR_Win * win_ptr, MPIR_Request * ureq)
552 {
553     int mpi_errno = MPI_SUCCESS;
554     intptr_t orig_data_sz, target_data_sz;
555     int rank;
556     int dt_contig ATTRIBUTE((unused));
557     MPI_Aint dt_true_lb ATTRIBUTE((unused));
558     MPIR_Datatype*dtp;
559     MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
560     int made_progress = 0;
561     MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3I_GET_ACCUMULATE);
562 
563     MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH3I_GET_ACCUMULATE);
564 
565     MPIR_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
566                         mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");
567 
568     MPIDI_Datatype_get_info(target_count, target_datatype, dt_contig, target_data_sz, dtp,
569                             dt_true_lb);
570 
571     if (target_data_sz == 0) {
572         goto fn_exit;
573     }
574 
575     rank = win_ptr->comm_ptr->rank;
576 
577     if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
578         win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
579         /* check if target is local and shared memory is allocated on window,
580          * if so, we directly perform this operation on shared memory region. */
581 
582         /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
583          * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
584          * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
585          * which is only set to TRUE when SHM region is allocated in nemesis.
586          * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
587          */
588         MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
589         MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
590     }
591 
592     /* Do =! rank first (most likely branch?) */
593     if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
594         (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
595         mpi_errno = MPIDI_CH3I_Shm_get_acc_op(origin_addr, origin_count, origin_datatype,
596                                               result_addr, result_count, result_datatype,
597                                               target_rank, target_disp, target_count,
598                                               target_datatype, op, win_ptr);
599         MPIR_ERR_CHECK(mpi_errno);
600 
601         if (ureq) {
602             /* Complete user request and release the ch3 ref */
603             mpi_errno = MPID_Request_complete(ureq);
604             MPIR_ERR_CHECK(mpi_errno);
605         }
606     }
607     else {
608         MPIDI_RMA_Op_t *op_ptr = NULL;
609         MPIDI_CH3_Pkt_get_accum_t *get_accum_pkt;
610         MPI_Aint origin_type_size;
611         MPI_Aint target_type_size;
612         int use_immed_pkt = FALSE, i;
613         int is_origin_contig, is_target_contig, is_result_contig;
614         MPI_Aint stream_elem_count, stream_unit_count;
615         MPI_Aint predefined_dtp_size, predefined_dtp_count, predefined_dtp_extent;
616         MPIR_Datatype*origin_dtp = NULL, *target_dtp = NULL, *result_dtp = NULL;
617         int is_empty_origin = FALSE;
618 
619         /* Judge if origin buffer is empty */
620         if (op == MPI_NO_OP)
621             is_empty_origin = TRUE;
622 
623         /* Append the operation to the window's RMA ops queue */
624         mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &op_ptr);
625         MPIR_ERR_CHECK(mpi_errno);
626 
627         /* TODO: Can we use the MPIDI_RMA_ACC_CONTIG optimization? */
628 
629         MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);
630 
631         /******************** Setting operation struct areas ***********************/
632 
633         op_ptr->origin_addr = (void *) origin_addr;
634         op_ptr->origin_count = origin_count;
635         op_ptr->origin_datatype = origin_datatype;
636         op_ptr->result_addr = result_addr;
637         op_ptr->result_count = result_count;
638         op_ptr->result_datatype = result_datatype;
639         op_ptr->target_rank = target_rank;
640 
641         /* Remember user request */
642         op_ptr->ureq = ureq;
643 
644         /* if source or target datatypes are derived, increment their
645          * reference counts */
646         if (is_empty_origin == FALSE && !MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
647             MPIR_Datatype_get_ptr(origin_datatype, origin_dtp);
648         }
649         if (!MPIR_DATATYPE_IS_PREDEFINED(result_datatype)) {
650             MPIR_Datatype_get_ptr(result_datatype, result_dtp);
651         }
652         if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
653             MPIR_Datatype_get_ptr(target_datatype, target_dtp);
654         }
655 
656         if (is_empty_origin == FALSE) {
657             MPIR_Datatype_get_size_macro(origin_datatype, origin_type_size);
658             MPIR_Assign_trunc(orig_data_sz, origin_count * origin_type_size, intptr_t);
659         }
660         else {
661             /* If origin buffer is empty, set origin data size to 0 */
662             orig_data_sz = 0;
663         }
664 
665         MPIR_Datatype_get_size_macro(target_datatype, target_type_size);
666 
667         /* Get size and count for predefined datatype elements */
668         if (MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
669             predefined_dtp_size = target_type_size;
670             predefined_dtp_count = target_count;
671             MPIR_Datatype_get_extent_macro(target_datatype, predefined_dtp_extent);
672         }
673         else {
674             MPIR_Assert(target_dtp->basic_type != MPI_DATATYPE_NULL);
675             MPIR_Datatype_get_size_macro(target_dtp->basic_type, predefined_dtp_size);
676             predefined_dtp_count = target_data_sz / predefined_dtp_size;
677             MPIR_Datatype_get_extent_macro(target_dtp->basic_type, predefined_dtp_extent);
678         }
679         MPIR_Assert(predefined_dtp_count > 0 && predefined_dtp_size > 0 &&
680                     predefined_dtp_extent > 0);
681 
682         /* Calculate number of predefined elements in each stream unit, and
683          * total number of stream units. */
684         stream_elem_count = MPIDI_CH3U_Acc_stream_size / predefined_dtp_extent;
685         stream_unit_count = (predefined_dtp_count - 1) / stream_elem_count + 1;
686         MPIR_Assert(stream_elem_count > 0 && stream_unit_count > 0);
687 
688         for (i = 0; i < stream_unit_count; i++) {
689             if (origin_dtp != NULL) {
690                 MPIR_Datatype_ptr_add_ref(origin_dtp);
691             }
692             if (target_dtp != NULL) {
693                 MPIR_Datatype_ptr_add_ref(target_dtp);
694             }
695             if (result_dtp != NULL) {
696                 MPIR_Datatype_ptr_add_ref(result_dtp);
697             }
698         }
699 
700         if (is_empty_origin == FALSE) {
701             MPIR_Datatype_is_contig(origin_datatype, &is_origin_contig);
702         }
703         else {
704             /* If origin buffer is empty, mark origin data as contig data */
705             is_origin_contig = 1;
706         }
707         MPIR_Datatype_is_contig(target_datatype, &is_target_contig);
708         MPIR_Datatype_is_contig(result_datatype, &is_result_contig);
709 
710         /* Judge if we can use IMMED data packet */
711         if ((is_empty_origin == TRUE || MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) &&
712             MPIR_DATATYPE_IS_PREDEFINED(result_datatype) &&
713             MPIR_DATATYPE_IS_PREDEFINED(target_datatype) &&
714             is_origin_contig && is_target_contig && is_result_contig) {
715             if (target_data_sz <= MPIDI_RMA_IMMED_BYTES)
716                 use_immed_pkt = TRUE;
717         }
718 
719         /* Judge if this operation is a piggyback candidate */
720         if ((is_empty_origin == TRUE || MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) &&
721             MPIR_DATATYPE_IS_PREDEFINED(result_datatype) &&
722             MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
723             /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
724              * for origin, target and result data. We should extend this optimization to derived
725              * datatypes as well. */
726             if (orig_data_sz <= MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE)
727                 op_ptr->piggyback_lock_candidate = 1;
728         }
729 
730         /************** Setting packet struct areas in operation ****************/
731 
732         get_accum_pkt = &(op_ptr->pkt.get_accum);
733 
734         if (use_immed_pkt) {
735             MPIDI_Pkt_init(get_accum_pkt, MPIDI_CH3_PKT_GET_ACCUM_IMMED);
736         }
737         else {
738             MPIDI_Pkt_init(get_accum_pkt, MPIDI_CH3_PKT_GET_ACCUM);
739         }
740 
741         get_accum_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
742             win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
743         get_accum_pkt->count = target_count;
744         get_accum_pkt->datatype = target_datatype;
745         get_accum_pkt->info.flattened_type_size = 0;
746         get_accum_pkt->op = op;
747         get_accum_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
748         get_accum_pkt->pkt_flags = MPIDI_CH3_PKT_FLAG_NONE;
749         if (use_immed_pkt) {
750             void *src = (void *) origin_addr, *dest = (void *) &(get_accum_pkt->info.data);
751             mpi_errno = immed_copy(src, dest, orig_data_sz);
752             MPIR_ERR_CHECK(mpi_errno);
753         }
754 
755         MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);
756 
757         mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, op_ptr);
758         MPIR_ERR_CHECK(mpi_errno);
759 
760         mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
761         MPIR_ERR_CHECK(mpi_errno);
762 
763         if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
764             MPIDI_CH3I_RMA_Active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
765             while (MPIDI_CH3I_RMA_Active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
766                 mpi_errno = wait_progress_engine();
767                 MPIR_ERR_CHECK(mpi_errno);
768             }
769         }
770     }
771 
772   fn_exit:
773     MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH3I_GET_ACCUMULATE);
774     return mpi_errno;
775 
776     /* --BEGIN ERROR HANDLING-- */
777   fn_fail:
778     goto fn_exit;
779     /* --END ERROR HANDLING-- */
780 }
781 
782 
/*
 * MPID_Put - put entry point that carries no user request.
 *
 * Hands every argument through to MPIDI_CH3I_Put unchanged and supplies NULL
 * for that routine's trailing user-request parameter.
 *
 * Returns the error code produced by MPIDI_CH3I_Put.
 */
int MPID_Put(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype,
             int target_rank, MPI_Aint target_disp, int target_count,
             MPI_Datatype target_datatype, MPIR_Win * win_ptr)
{
    int mpi_errno = MPI_SUCCESS;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_PUT);
    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPID_PUT);

    /* Nothing here can jump to an error label, so the exit path is linear. */
    mpi_errno = MPIDI_CH3I_Put(origin_addr, origin_count, origin_datatype,
                               target_rank, target_disp,
                               target_count, target_datatype, win_ptr, NULL);

    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPID_PUT);
    return mpi_errno;
}
805 
/*
 * MPID_Get - get entry point that carries no user request.
 *
 * Hands every argument through to MPIDI_CH3I_Get unchanged and supplies NULL
 * as the final argument (presumably the user-request slot, mirroring
 * MPIDI_CH3I_Put -- confirm against the MPIDI_CH3I_Get prototype).
 *
 * Returns the error code produced by MPIDI_CH3I_Get.
 */
int MPID_Get(void *origin_addr, int origin_count, MPI_Datatype origin_datatype,
             int target_rank, MPI_Aint target_disp, int target_count,
             MPI_Datatype target_datatype, MPIR_Win * win_ptr)
{
    int mpi_errno = MPI_SUCCESS;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_GET);
    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPID_GET);

    /* Nothing here can jump to an error label, so the exit path is linear. */
    mpi_errno = MPIDI_CH3I_Get(origin_addr, origin_count, origin_datatype,
                               target_rank, target_disp,
                               target_count, target_datatype, win_ptr, NULL);

    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPID_GET);
    return mpi_errno;
}
828 
/*
 * MPID_Accumulate - accumulate entry point that carries no user request.
 *
 * Hands every argument (including the reduction op) through to
 * MPIDI_CH3I_Accumulate unchanged and supplies NULL as the final argument
 * (presumably the user-request slot, mirroring MPIDI_CH3I_Put -- confirm
 * against the MPIDI_CH3I_Accumulate prototype).
 *
 * Returns the error code produced by MPIDI_CH3I_Accumulate.
 */
int MPID_Accumulate(const void *origin_addr, int origin_count,
                    MPI_Datatype origin_datatype, int target_rank,
                    MPI_Aint target_disp, int target_count,
                    MPI_Datatype target_datatype, MPI_Op op, MPIR_Win * win_ptr)
{
    int mpi_errno = MPI_SUCCESS;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_ACCUMULATE);
    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPID_ACCUMULATE);

    /* Nothing here can jump to an error label, so the exit path is linear. */
    mpi_errno = MPIDI_CH3I_Accumulate(origin_addr, origin_count, origin_datatype,
                                      target_rank, target_disp,
                                      target_count, target_datatype,
                                      op, win_ptr, NULL);

    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPID_ACCUMULATE);
    return mpi_errno;
}
851 
/*
 * MPID_Get_accumulate - get-accumulate entry point that carries no user
 * request.
 *
 * Hands the origin, result and target descriptions plus the reduction op
 * through to MPIDI_CH3I_Get_accumulate unchanged and supplies NULL for the
 * trailing user-request argument.
 *
 * Returns the error code produced by MPIDI_CH3I_Get_accumulate.
 */
int MPID_Get_accumulate(const void *origin_addr, int origin_count,
                        MPI_Datatype origin_datatype,
                        void *result_addr, int result_count,
                        MPI_Datatype result_datatype,
                        int target_rank, MPI_Aint target_disp,
                        int target_count, MPI_Datatype target_datatype,
                        MPI_Op op, MPIR_Win * win_ptr)
{
    int mpi_errno = MPI_SUCCESS;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_GET_ACCUMULATE);
    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPID_GET_ACCUMULATE);

    /* Nothing here can jump to an error label, so the exit path is linear. */
    mpi_errno = MPIDI_CH3I_Get_accumulate(origin_addr, origin_count, origin_datatype,
                                          result_addr, result_count, result_datatype,
                                          target_rank, target_disp,
                                          target_count, target_datatype,
                                          op, win_ptr, NULL);

    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPID_GET_ACCUMULATE);
    return mpi_errno;
}
877 
878 
/*
 * MPID_Compare_and_swap - issue a compare-and-swap on one element of a window.
 *
 * The operation is carried out through MPIDI_CH3I_Shm_cas_op when the target
 * is the calling rank, when the window flavor is MPI_WIN_FLAVOR_SHARED, or
 * when the window has shm_allocated set and origin and target VCs report the
 * same node_id.  In every other case an RMA operation is filled in with a
 * MPIDI_CH3_PKT_CAS_IMMED packet (origin and compare values copied into the
 * packet), enqueued on the window, and progress toward the target is poked.
 *
 * origin_addr   - value to store at the target
 * compare_addr  - value compared against the target contents
 * result_addr   - recorded in the operation as the result buffer
 * datatype      - element type; validated above the ADI, not re-checked here
 * target_rank   - rank of the target within the window's communicator
 * target_disp   - displacement, scaled by the target's disp_unit
 * win_ptr       - window object
 *
 * Returns MPI_SUCCESS or an MPI error code; fails with MPI_ERR_RMA_SYNC
 * ("**rmasync") when the window's access_state is MPIDI_RMA_NONE.
 */
int MPID_Compare_and_swap(const void *origin_addr, const void *compare_addr,
                          void *result_addr, MPI_Datatype datatype, int target_rank,
                          MPI_Aint target_disp, MPIR_Win * win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    int my_rank;
    int same_node = 0;
    int progress_flag = 0;
    MPIDI_VC_t *self_vc = NULL, *peer_vc = NULL;
    MPIDI_RMA_Op_t *rma_op = NULL;
    MPIDI_CH3_Pkt_cas_t *pkt = NULL;
    MPI_Aint elem_size;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_COMPARE_AND_SWAP);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPID_COMPARE_AND_SWAP);

    MPIR_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

    my_rank = win_ptr->comm_ptr->rank;

    /* Only look up the VCs when the node comparison can actually decide the
     * path: a remote target on a non-SHARED window that has a SHM region. */
    if (target_rank != my_rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED && win_ptr->shm_allocated == TRUE) {
        /* FIXME: the SHM decision is based on origin and target living on the
         * same node.  Under ch3:sock two processes on one node still do not
         * share a SHM region; ch3:sock is excluded here only because
         * shm_allocated is set to TRUE solely when nemesis allocated a SHM
         * region.  Eventually this should test membership in the same
         * "SHM comm" instead. */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, my_rank, &self_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &peer_vc);
        same_node = (self_vc->node_id == peer_vc->node_id);
    }

    /* The datatype is required to be predefined and one of: C integer,
     * Fortran integer, Logical, Multi-language types, or Byte.  That check
     * happens above the ADI and is not repeated here. */

    /* FIXME: shared memory windows deserve an implementation built on a
     * processor atomic operation. */
    if (target_rank == my_rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED || same_node) {
        mpi_errno = MPIDI_CH3I_Shm_cas_op(origin_addr, compare_addr, result_addr,
                                          datatype, target_rank, target_disp, win_ptr);
        MPIR_ERR_CHECK(mpi_errno);
        goto fn_exit;
    }

    /* Remote path: obtain an operation object for the RMA ops queue. */
    mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &rma_op);
    MPIR_ERR_CHECK(mpi_errno);

    MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

    /* ---- operation descriptor ---- */
    rma_op->origin_addr = (void *) origin_addr;
    rma_op->origin_count = 1;
    rma_op->origin_datatype = datatype;
    rma_op->result_addr = result_addr;
    rma_op->result_datatype = datatype;
    rma_op->compare_addr = (void *) compare_addr;
    rma_op->compare_datatype = datatype;
    rma_op->target_rank = target_rank;
    /* A CAS can always piggyback the LOCK. */
    rma_op->piggyback_lock_candidate = 1;

    /* ---- packet embedded in the operation ---- */
    pkt = &(rma_op->pkt.cas);
    MPIDI_Pkt_init(pkt, MPIDI_CH3_PKT_CAS_IMMED);
    pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
        win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
    pkt->datatype = datatype;
    pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
    pkt->pkt_flags = MPIDI_CH3_PKT_FLAG_NONE;

    /* REQUIRE: every datatype argument is the same builtin type and all
     * counts are 1, so one element must fit in the immediate union. */
    MPIR_Datatype_get_size_macro(datatype, elem_size);
    MPIR_Assert(elem_size <= sizeof(MPIDI_CH3_CAS_Immed_u));

    /* NOTE(review): a failure from either copy jumps to fn_fail with the
     * operation not yet enqueued and the timer still running -- confirm
     * whether that path needs cleanup. */
    mpi_errno = immed_copy((void *) origin_addr, (void *) (&(pkt->origin_data)), elem_size);
    MPIR_ERR_CHECK(mpi_errno);

    mpi_errno = immed_copy((void *) compare_addr, (void *) (&(pkt->compare_data)), elem_size);
    MPIR_ERR_CHECK(mpi_errno);

    MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

    mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, rma_op);
    MPIR_ERR_CHECK(mpi_errno);

    mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &progress_flag);
    MPIR_ERR_CHECK(mpi_errno);

    /* With a non-negative threshold configured, keep waiting on the progress
     * engine for as long as the active request count is at or above it. */
    if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0) {
        while (MPIDI_CH3I_RMA_Active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            mpi_errno = wait_progress_engine();
            MPIR_ERR_CHECK(mpi_errno);
        }
    }

  fn_exit:
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPID_COMPARE_AND_SWAP);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
996 
997 
/*
 * MPID_Fetch_and_op - issue a fetch-and-op on one element of a window.
 *
 * The operation is carried out through MPIDI_CH3I_Shm_fop_op when the target
 * is the calling rank, when the window flavor is MPI_WIN_FLAVOR_SHARED, or
 * when the window has shm_allocated set and origin and target VCs report the
 * same node_id.  In every other case an RMA operation is filled in with a
 * FOP packet, enqueued on the window, and progress toward the target is
 * poked.  The packet type is MPIDI_CH3_PKT_FOP_IMMED (origin value copied
 * into the packet) when the datatype is contiguous and its size does not
 * exceed MPIDI_RMA_IMMED_BYTES; otherwise it is MPIDI_CH3_PKT_FOP.
 *
 * origin_addr   - operand supplied by the origin
 * result_addr   - recorded in the operation as the result buffer
 * datatype      - element type; validated above the ADI, not re-checked here
 * target_rank   - rank of the target within the window's communicator
 * target_disp   - displacement, scaled by the target's disp_unit
 * op            - operation; validated above the ADI, not re-checked here
 * win_ptr       - window object
 *
 * Returns MPI_SUCCESS or an MPI error code; fails with MPI_ERR_RMA_SYNC
 * ("**rmasync") when the window's access_state is MPIDI_RMA_NONE.
 */
int MPID_Fetch_and_op(const void *origin_addr, void *result_addr,
                      MPI_Datatype datatype, int target_rank,
                      MPI_Aint target_disp, MPI_Op op, MPIR_Win * win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    int my_rank;
    int same_node = 0;
    int progress_flag = 0;
    int immed_ok;
    int contig;
    MPIDI_VC_t *self_vc = NULL, *peer_vc = NULL;
    MPIDI_RMA_Op_t *rma_op = NULL;
    MPIDI_CH3_Pkt_fop_t *pkt;
    MPI_Aint elem_size;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_FETCH_AND_OP);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPID_FETCH_AND_OP);

    MPIR_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

    my_rank = win_ptr->comm_ptr->rank;

    /* Only look up the VCs when the node comparison can actually decide the
     * path: a remote target on a non-SHARED window that has a SHM region. */
    if (target_rank != my_rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED && win_ptr->shm_allocated == TRUE) {
        /* FIXME: the SHM decision is based on origin and target living on the
         * same node.  Under ch3:sock two processes on one node still do not
         * share a SHM region; ch3:sock is excluded here only because
         * shm_allocated is set to TRUE solely when nemesis allocated a SHM
         * region.  Eventually this should test membership in the same
         * "SHM comm" instead. */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, my_rank, &self_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &peer_vc);
        same_node = (self_vc->node_id == peer_vc->node_id);
    }

    /* Both the datatype and the op are required to be predefined.  That
     * check happens above the ADI and is not repeated here. */

    /* FIXME: shared memory windows deserve an implementation built on a
     * processor atomic operation. */
    if (target_rank == my_rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED || same_node) {
        mpi_errno = MPIDI_CH3I_Shm_fop_op(origin_addr, result_addr, datatype,
                                          target_rank, target_disp, op, win_ptr);
        MPIR_ERR_CHECK(mpi_errno);
        goto fn_exit;
    }

    /* Remote path: obtain an operation object for the RMA ops queue. */
    mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &rma_op);
    MPIR_ERR_CHECK(mpi_errno);

    MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

    /* ---- operation descriptor ---- */
    rma_op->origin_addr = (void *) origin_addr;
    rma_op->origin_count = 1;
    rma_op->origin_datatype = datatype;
    rma_op->result_addr = result_addr;
    rma_op->result_datatype = datatype;
    rma_op->target_rank = target_rank;
    rma_op->piggyback_lock_candidate = 1;

    /* ---- packet embedded in the operation ---- */
    MPIR_Datatype_get_size_macro(datatype, elem_size);
    MPIR_Assert(elem_size <= sizeof(MPIDI_CH3_FOP_Immed_u));

    MPIR_Datatype_is_contig(datatype, &contig);

    /* The IMMED packet is usable only for contiguous data that fits in the
     * immediate byte budget. */
    immed_ok = (contig && elem_size <= MPIDI_RMA_IMMED_BYTES);

    pkt = &(rma_op->pkt.fop);
    MPIDI_Pkt_init(pkt, (immed_ok ? MPIDI_CH3_PKT_FOP_IMMED : MPIDI_CH3_PKT_FOP));

    pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
        win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
    pkt->datatype = datatype;
    pkt->op = op;
    pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
    pkt->pkt_flags = MPIDI_CH3_PKT_FLAG_NONE;

    if (immed_ok) {
        /* NOTE(review): a failure here jumps to fn_fail with the operation
         * not yet enqueued and the timer still running -- confirm whether
         * that path needs cleanup. */
        mpi_errno = immed_copy((void *) origin_addr, (void *) &(pkt->info.data), elem_size);
        MPIR_ERR_CHECK(mpi_errno);
    }

    MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

    mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, rma_op);
    MPIR_ERR_CHECK(mpi_errno);

    mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &progress_flag);
    MPIR_ERR_CHECK(mpi_errno);

    /* With a non-negative threshold configured, keep waiting on the progress
     * engine for as long as the active request count is at or above it. */
    if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0) {
        while (MPIDI_CH3I_RMA_Active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            mpi_errno = wait_progress_engine();
            MPIR_ERR_CHECK(mpi_errno);
        }
    }

  fn_exit:
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPID_FETCH_AND_OP);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
1124