1 /* One-sided MPI implementation of Libcaf
2 *
3 * Copyright (c) 2012-2018, Sourcery, Inc.
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
8 *     * Redistributions of source code must retain the above copyright
9 *       notice, this list of conditions and the following disclaimer.
10 *     * Redistributions in binary form must reproduce the above copyright
11 *       notice, this list of conditions and the following disclaimer in the
12 *       documentation and/or other materials provided with the distribution.
13 *     * Neither the name of the Sourcery, Inc., nor the
14 *       names of its contributors may be used to endorse or promote products
15 *       derived from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL SOURCERY, INC., BE LIABLE FOR ANY
21 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
27 
28 /****l* mpi/mpi_caf.c
29  * NAME
30  *   mpi_caf
31  * SYNOPSIS
32  *   This program implements the LIBCAF_MPI transport layer.
33 ******
34 */
35 
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <string.h>     /* For memcpy. */
39 #include <stdarg.h>     /* For variadic arguments. */
40 #include <float.h>      /* For type conversion of floating point numbers. */
41 #ifndef ALLOCA_MISSING
42 #include <alloca.h>     /* Assume functionality provided elsewhere if missing */
43 #endif
44 #include <unistd.h>
45 #include <stdint.h>     /* For int32_t. */
46 #include <mpi.h>
47 #include <pthread.h>
48 #include <signal.h>     /* For raise */
49 
50 #ifdef HAVE_MPI_EXT_H
51 #include <mpi-ext.h>
52 #endif
53 #ifdef USE_FAILED_IMAGES
54   #define WITH_FAILED_IMAGES 1
55 #endif
56 
57 #include "libcaf.h"
58 
59 /* Define GFC_CAF_CHECK to enable run-time checking. */
60 /* #define GFC_CAF_CHECK  1 */
61 
62 /* Debug array referencing  */
63 static char* caf_array_ref_str[] = {
64   "CAF_ARR_REF_NONE",
65   "CAF_ARR_REF_VECTOR",
66   "CAF_ARR_REF_FULL",
67   "CAF_ARR_REF_RANGE",
68   "CAF_ARR_REF_SINGLE",
69   "CAF_ARR_REF_OPEN_END",
70   "CAF_ARR_REF_OPEN_START"
71 };
72 
73 static char* caf_ref_type_str[] = {
74   "CAF_REF_COMPONENT",
75   "CAF_REF_ARRAY",
76   "CAF_REF_STATIC_ARRAY",
77 };
78 
79 #ifndef EXTRA_DEBUG_OUTPUT
80 #define dprint(...)
81 #define chk_err(...)
82 #else
83 #define dprint(format, ...)                     \
84 fprintf(stderr, "%d/%d: %s(%d) " format,        \
85         caf_this_image, caf_num_images,         \
86         __FUNCTION__, __LINE__, ## __VA_ARGS__)
87 #define chk_err(ierr)                               \
88 do                                                  \
89 {                                                   \
90   if (ierr != MPI_SUCCESS)                          \
91   {                                                 \
92     int err_class, err_len;                         \
93     char err_str[MPI_MAX_ERROR_STRING];             \
94     MPI_Error_class(ierr, &err_class);              \
95     MPI_Error_string(ierr, err_str, &err_len);      \
96     dprint("MPI-error: err_class=%d ierr=%d [%s]",  \
97            err_class, ierr, err_str);               \
98   }                                                 \
99 } while (0)
100 #endif
101 
102 #ifdef GCC_GE_7
103 /* The caf-token of the mpi-library.
104  * Objects of this data structure are owned by the library and are treated as a
105  * black box by the compiler.  In the coarray-program the tokens are opaque
106  * pointers, i.e. black boxes.
107  *
108  * For each coarray (allocatable|save|pointer) (scalar|array|event|lock) a
109  * token needs to be present. */
110 typedef struct mpi_caf_token_t
111 {
112   /* The pointer to memory associated to this token's data on the local image.
113    * The compiler uses the address for direct access to the memory of the object
114    * this token is assocated to, i.e., the memory pointed to be local_memptr is
115    * the scalar or array.
116    * When the library is responsible for deleting the memory, then this is the
117    * one to free. */
118   void *memptr;
119   /* The MPI window to associated to the object's data.
120    * The window is used to access the data on other images. In pre GCC_GE_7
121    * installations this was the token. */
122   MPI_Win memptr_win;
123   /* The pointer to the primary array, i.e., to coarrays that are arrays and
124    * not a derived type. */
125   gfc_descriptor_t *desc;
126 } mpi_caf_token_t;
127 
128 /* For components of derived type coarrays a slave_token is needed when the
129  * component has the allocatable or pointer attribute. The token is reduced in
130  * size, because the other data is already accessible and has been read from
131  * the remote to fullfill the request.
132  *
133  *   TYPE t
134  *   +------------------+
135  *   | comp *           |
136  *   | comp_token *     |
137  *   +------------------+
138  *
139  *   TYPE(t) : o                struct T // the mpi_caf_token to t
140  *                              +----------------+
141  *                              | ...            |
142  *                              +----------------+
143  *
144  *   o[2]%.comp                 // using T to get o of [2]
145  *
146  *   +-o-on-image-2----+  "copy" of the requierd parts of o[2] on current image
147  *   | 0x4711          |  comp * in global_dynamic_window
148  *   | 0x2424          |  comp_token * of type slave_token
149  *   +-----------------+
150  *   now all required data is present on the current image to access the remote
151  *   components. This nests without limit. */
152 typedef struct mpi_caf_slave_token_t
153 {
154   /* The pointer to the memory associated to this slave token's data on the
155    * local image.  When the library is responsible for deleting the memory,
156    * then this is the one to free.  And this is the only reason why its stored
157    * here. */
158   void *memptr;
159   /* The pointer to the descriptor or NULL for scalars.
160    * When referencing a remote component array, then the extensions of the array
161    * are needed. Usually the data pointer is at offset zero of the descriptor_t
162    * structure, but we don't rely on it. So store the pointer to the base
163    * address of the descriptor. The descriptor always is in the window of the
164    * master data or the allocated component and is never stored at an address
165    * not accessible by a window. */
166   gfc_descriptor_t *desc;
167 } mpi_caf_slave_token_t;
168 
169 #define TOKEN(X) &(((mpi_caf_token_t *) (X))->memptr_win)
170 #else
171 typedef MPI_Win *mpi_caf_token_t;
172 #define TOKEN(X) ((mpi_caf_token_t) (X))
173 #endif
174 
175 /* Forward declaration of prototype. */
176 
177 static void terminate_internal (int stat_code, int exit_code)
178             __attribute__((noreturn));
179 static void sync_images_internal (int count, int images[], int *stat,
180                                   char *errmsg, size_t errmsg_len,
181                                   bool internal);
182 static void error_stop_str (const char *string, size_t len, bool quiet)
183             __attribute__((noreturn));
184 
185 /* Global variables. */
186 static int caf_this_image;
187 static int caf_num_images = 0;
188 static int caf_is_finalized = 0;
189 static MPI_Win global_dynamic_win;
190 
191 #if MPI_VERSION >= 3
192   MPI_Info mpi_info_same_size;
193 #endif // MPI_VERSION
194 
195 /* The size of pointer on this plattform. */
196 static const size_t stdptr_size = sizeof(void *);
197 
198 /* Variables needed for syncing images. */
199 
200 static int *images_full;
201 MPI_Request *sync_handles;
202 static int *arrived;
203 static const int MPI_TAG_CAF_SYNC_IMAGES = 424242;
204 
205 /* Pending puts */
206 #if defined(NONBLOCKING_PUT) && !defined(CAF_MPI_LOCK_UNLOCK)
207 typedef struct win_sync {
208   MPI_Win *win;
209   int img;
210   struct win_sync *next;
211 } win_sync;
212 
213 static win_sync *last_elem = NULL;
214 static win_sync *pending_puts = NULL;
215 #endif
216 
217 /* Linked list of static coarrays registered.  Do not expose to public in the
218  * header, because it is implementation specific. */
219 struct caf_allocated_tokens_t
220 {
221   caf_token_t token;
222   struct caf_allocated_tokens_t *prev;
223 } *caf_allocated_tokens = NULL;
224 
225 #ifdef GCC_GE_7
226 /* Linked list of slave coarrays registered. */
227 struct caf_allocated_slave_tokens_t
228 {
229   mpi_caf_slave_token_t *token;
230   struct caf_allocated_slave_tokens_t *prev;
231 } *caf_allocated_slave_tokens = NULL;
232 #endif
233 
234 /* Image status variable */
235 static int img_status = 0;
236 static MPI_Win *stat_tok;
237 
238 /* Active messages variables */
239 char **buff_am;
240 MPI_Status *s_am;
241 MPI_Request *req_am;
242 MPI_Datatype *dts;
243 char *msgbody;
244 pthread_mutex_t lock_am;
245 int done_am = 0;
246 
247 char err_buffer[MPI_MAX_ERROR_STRING];
248 
249 /* All CAF runtime calls should use this comm instead of MPI_COMM_WORLD for
250  * interoperability purposes. */
251 MPI_Comm CAF_COMM_WORLD;
252 
253 static caf_teams_list *teams_list = NULL;
254 static caf_used_teams_list *used_teams = NULL;
255 
256 /* Emitted when a theorectically unreachable part is reached. */
257 const char unreachable[] = "Fatal error: unreachable alternative found.\n";
258 
259 #ifdef WITH_FAILED_IMAGES
260 /* The stati of the other images.  image_stati is an array of size
261  * caf_num_images at the beginning the status of each image is noted here where
262  * the index is the image number minus one. */
263 int *image_stati;
264 
265 /* This gives the number of all images that are known to have failed. */
266 int num_images_failed = 0;
267 
268 /* This is the number of all images that are known to have stopped. */
269 int num_images_stopped = 0;
270 
271 /* The async. request-handle to all participating images. */
272 MPI_Request alive_request;
273 
274 /* This dummy is used for the alive request.  Its content is arbitrary and
275  * never read.  Its just a memory location where one could put something,
276  * which is never done. */
277 int alive_dummy;
278 
279 /* The mpi error-handler object associate to CAF_COMM_WORLD. */
280 MPI_Errhandler failed_CAF_COMM_mpi_err_handler;
281 
282 /* The monitor comm for detecting failed images. We can not attach the monitor
283  * to CAF_COMM_WORLD or the messages send by sync images would be caught by the
284  * monitor. */
285 MPI_Comm alive_comm;
286 
287 /* Set when entering a sync_images_internal, to prevent the error handler from
288  * eating our messages. */
289 bool no_stopped_images_check_in_errhandler = 0;
290 #endif
291 
292 /* For MPI interoperability, allow external initialization
293  * (and thus finalization) of MPI. */
294 bool caf_owns_mpi = false;
295 
296 /* Foo function pointers for coreduce.
297  * The handles when arguments are passed by reference. */
298 int (*int8_t_by_reference)(void *, void *);
299 int (*int16_t_by_reference)(void *, void *);
300 int (*int32_t_by_reference)(void *, void *);
301 int (*int64_t_by_reference)(void *, void *);
302 float (*float_by_reference)(void *, void *);
303 double (*double_by_reference)(void *, void *);
304 /* Strings are always passed by reference. */
305 void (*char_by_reference)(void *, int, void *, void *, int, int);
306 /* The handles when arguments are passed by value. */
307 int8_t (*int8_t_by_value)(int8_t, int8_t);
308 int16_t (*int16_t_by_value)(int16_t, int16_t);
309 int (*int32_t_by_value)(int32_t, int32_t);
310 int64_t (*int64_t_by_value)(int64_t, int64_t);
311 float (*float_by_value)(float, float);
312 double (*double_by_value)(double, double);
313 
314 /* Define shortcuts for Win_lock and _unlock depending on whether the primitives
315  * are available in the MPI implementation.  When they are not available the
316  * shortcut is expanded to nothing by the preprocessor else to the API call.
317  * This prevents having #ifdef #else #endif constructs strewn all over the code
318  * reducing its readability. */
319 #ifdef CAF_MPI_LOCK_UNLOCK
320 #define CAF_Win_lock(type, img, win) MPI_Win_lock (type, img, 0, win)
321 #define CAF_Win_unlock(img, win) MPI_Win_unlock (img, win)
322 #define CAF_Win_lock_all(win)
323 #define CAF_Win_unlock_all(win)
324 #else // CAF_MPI_LOCK_UNLOCK
325 #define CAF_Win_lock(type, img, win)
326 #define CAF_Win_unlock(img, win) MPI_Win_flush (img, win)
327 #if MPI_VERSION >= 3
328 #define CAF_Win_lock_all(win) MPI_Win_lock_all (MPI_MODE_NOCHECK, win)
329 #else
330 #define CAF_Win_lock_all(win)
331 #endif
332 #define CAF_Win_unlock_all(win) MPI_Win_unlock_all (win)
333 #endif // CAF_MPI_LOCK_UNLOCK
334 
335 #define MIN(X, Y) (((X) < (Y)) ? (X) : (Y))
336 
337 #if defined(NONBLOCKING_PUT) && !defined(CAF_MPI_LOCK_UNLOCK)
explicit_flush()338 void explicit_flush()
339 {
340   win_sync *w = pending_puts, *t;
341   MPI_Win *p;
342   int ierr;
343   while (w != NULL)
344   {
345     p = w->win;
346     ierr = MPI_Win_flush(w->img,*p); chk_err(ierr);
347     t = w;
348     w = w->next;
349     free(t);
350   }
351   last_elem = NULL;
352   pending_puts = NULL;
353 }
354 #endif
355 
356 #ifdef HELPER
helperFunction()357 void helperFunction()
358 {
359   int i = 0, flag = 0, msgid = 0, ierr;
360   int ndim = 0, position = 0;
361 
362   s_am = calloc(caf_num_images, sizeof(MPI_Status));
363   req_am = calloc(caf_num_images, sizeof(MPI_Request));
364   dts = calloc(caf_num_images, sizeof(MPI_Datatype));
365 
366   for (i = 0; i < caf_num_images; i++)
367   {
368     ierr = MPI_Irecv(buff_am[i], 1000, MPI_PACKED, i, 1, CAF_COMM_WORLD,
369                      &req_am[i]); chk_err(ierr);
370   }
371 
372   while (1)
373   {
374     pthread_mutex_lock(&lock_am);
375     for (i = 0; i < caf_num_images; i++)
376     {
377       if (!caf_is_finalized)
378       {
379         ierr = MPI_Test(&req_am[i], &flag, &s_am[i]); chk_err(ierr);
380         if (flag == 1)
381         {
382           position = 0;
383           ierr = MPI_Unpack(buff_am[i], 1000, &position, &msgid, 1, MPI_INT,
384                             CAF_COMM_WORLD); chk_err(ierr);
385           /* msgid=2 was initially assigned to strided transfers,
386            * it can be reused
387            * Strided transfers Msgid=2
388            * You can add you own function */
389 
390           if (msgid == 2)
391           {
392             msgid = 0; position = 0;
393           }
394           ierr = MPI_Irecv(buff_am[i], 1000, MPI_PACKED, i, 1, CAF_COMM_WORLD,
395                            &req_am[i]); chk_err(ierr);
396           flag = 0;
397         }
398       }
399       else
400       {
401         done_am = 1;
402         pthread_mutex_unlock(&lock_am);
403         return;
404       }
405     }
406     pthread_mutex_unlock(&lock_am);
407   }
408 }
409 #endif
410 
411 
412 /* Keep in sync with single.c. */
413 
414 static void
caf_runtime_error(const char * message,...)415 caf_runtime_error (const char *message, ...)
416 {
417   va_list ap;
418   fprintf(stderr, "Fortran runtime error on image %d: ", caf_this_image);
419   va_start(ap, message);
420   vfprintf(stderr, message, ap);
421   va_end(ap);
422   fprintf(stderr, "\n");
423 
424   /* FIXME: Shutdown the Fortran RTL to flush the buffer.  PR 43849.
425    * FIXME: Do some more effort than just to abort. */
426   //  MPI_Finalize();
427 
428   /* Should be unreachable, but to make sure also call exit. */
429   exit(EXIT_FAILURE);
430 }
431 
432 /* Forward declaration of the feature unsupported message for failed images
433  * functions. */
434 static void
435 unsupported_fail_images_message(const char * functionname);
436 
437 /* Forward declaration of the feature unimplemented message for allocatable
438  * components. */
439 static void
440 unimplemented_alloc_comps_message(const char * functionname);
441 
442 static void
locking_atomic_op(MPI_Win win,int * value,int newval,int compare,int image_index,size_t index)443 locking_atomic_op(MPI_Win win, int *value, int newval,
444                   int compare, int image_index, size_t index)
445 {
446   CAF_Win_lock(MPI_LOCK_EXCLUSIVE, image_index - 1, win);
447   int ierr = MPI_Compare_and_swap(&newval, &compare,value, MPI_INT,
448                                   image_index - 1, index * sizeof(int), win);
449   chk_err(ierr);
450   CAF_Win_unlock(image_index - 1, win);
451 }
452 
453 
454 /* Define a helper to check whether the image at the given index is healthy,
455  * i.e., it hasn't failed. */
456 #ifdef WITH_FAILED_IMAGES
457 #define check_image_health(image_index, stat)                   \
458 if (image_stati[image_index - 1] == STAT_FAILED_IMAGE)          \
459 {                                                               \
460   if (stat == NULL) terminate_internal (STAT_FAILED_IMAGE, 0);  \
461   *stat = STAT_FAILED_IMAGE;                                    \
462   return;                                                       \
463 }
464 #else
465 #define check_image_health(image_index, stat)
466 #endif
467 
468 #ifdef WITH_FAILED_IMAGES
469 /* Handle failed image's errors and try to recover the remaining process to
470  * allow the user to detect an image fail and exit gracefully. */
471 static void
failed_stopped_errorhandler_function(MPI_Comm * pcomm,int * perr,...)472 failed_stopped_errorhandler_function (MPI_Comm* pcomm, int* perr, ...)
473 {
474   MPI_Comm comm, shrunk, newcomm;
475   int num_failed_in_group, i, err, ierr;
476   MPI_Group comm_world_group, failed_group;
477   int *ranks_of_failed_in_comm_world, *ranks_failed;
478   int ns, srank, crank, rc, flag, drank, newrank;
479   bool stopped = false;
480 
481   comm = *pcomm;
482 
483   MPI_Error_class(*perr, &err);
484   if (err != MPIX_ERR_PROC_FAILED && err != MPIX_ERR_REVOKED)
485   {
486     /* We can handle PROC_FAILED and REVOKED ones only. */
487     char errstr[MPI_MAX_ERROR_STRING];
488     int errlen;
489     MPI_Error_string(err, errstr, &errlen);
490     /* We can't use caf_runtime_error here, because that would exit, which
491      * means only the one process will stop, but we need to stop MPI
492      * completely, which can be done by calling MPI_Abort(). */
493     fprintf(stderr,
494             "Fortran runtime error on image #%d:\nMPI error: '%s'.\n",
495             caf_this_image, errstr);
496     MPI_Abort(*pcomm, err);
497   }
498 
499   dprint("(error = %d)\n", err);
500 
501   ierr = MPIX_Comm_failure_ack(comm); chk_err(ierr);
502   ierr = MPIX_Comm_failure_get_acked(comm, &failed_group); chk_err(ierr);
503   ierr = MPI_Group_size(failed_group, &num_failed_in_group); chk_err(ierr);
504 
505   dprint("%d images failed.\n", num_failed_in_group);
506   if (num_failed_in_group <= 0)
507   {
508     *perr = MPI_SUCCESS;
509     return;
510   }
511   if (num_failed_in_group > caf_num_images)
512   {
513     *perr = MPI_SUCCESS;
514     return;
515   }
516 
517   ierr = MPI_Comm_group(comm, &comm_world_group); chk_err(ierr);
518   ranks_of_failed_in_comm_world =
519     (int *) alloca(sizeof(int) * num_failed_in_group);
520   ranks_failed = (int *) alloca(sizeof(int) * num_failed_in_group);
521   for (i = 0; i < num_failed_in_group; ++i)
522   {
523     ranks_failed[i] = i;
524   }
525   /* Now translate the ranks of the failed images into communicator world. */
526   ierr = MPI_Group_translate_ranks(failed_group, num_failed_in_group,
527                                    ranks_failed, comm_world_group,
528                                    ranks_of_failed_in_comm_world);
529   chk_err(ierr);
530 
531   num_images_failed += num_failed_in_group;
532 
533   if (!no_stopped_images_check_in_errhandler)
534   {
535     int buffer, flag;
536     MPI_Request req;
537     MPI_Status request_status;
538     dprint("Checking for stopped images.\n");
539     ierr = MPI_Irecv(&buffer, 1, MPI_INT, MPI_ANY_SOURCE,
540                      MPI_TAG_CAF_SYNC_IMAGES, CAF_COMM_WORLD, &req);
541     chk_err(ierr);
542     if (ierr == MPI_SUCCESS)
543     {
544       ierr = MPI_Test(&req, &flag, &request_status); chk_err(ierr);
545       if (flag)
546       {
547         // Received a result
548         if (buffer == STAT_STOPPED_IMAGE)
549         {
550           dprint("Image #%d found stopped.\n", request_status.MPI_SOURCE);
551           stopped = true;
552           if (image_stati[request_status.MPI_SOURCE] == 0)
553             ++num_images_stopped;
554           image_stati[request_status.MPI_SOURCE] = STAT_STOPPED_IMAGE;
555         }
556       }
557       else
558       {
559         dprint("No stopped images found.\n");
560         ierr = MPI_Cancel(&req); chk_err(ierr);
561       }
562     }
563     else
564     {
565       int err;
566       MPI_Error_class(ierr, &err);
567       dprint("Error on checking for stopped images %d.\n", err);
568     }
569   }
570 
571   /* TODO: Consider whether removing the failed image from images_full will be
572    * necessary. This is more or less politics. */
573   for (i = 0; i < num_failed_in_group; ++i)
574   {
575     if (ranks_of_failed_in_comm_world[i] >= 0
576         && ranks_of_failed_in_comm_world[i] < caf_num_images)
577     {
578       if (image_stati[ranks_of_failed_in_comm_world[i]] == 0)
579         image_stati[ranks_of_failed_in_comm_world[i]] = STAT_FAILED_IMAGE;
580     }
581     else
582     {
583       dprint("Rank of failed image %d out of range of images 0..%d.\n",
584              ranks_of_failed_in_comm_world[i], caf_num_images);
585     }
586   }
587 
588 redo:
589   dprint("Before shrink. \n");
590   ierr = MPIX_Comm_shrink(*pcomm, &shrunk);
591   dprint("After shrink, rc = %d.\n", ierr);
592   ierr = MPI_Comm_set_errhandler(shrunk, failed_CAF_COMM_mpi_err_handler);
593   chk_err(ierr);
594   ierr = MPI_Comm_size(shrunk, &ns); chk_err(ierr);
595   ierr = MPI_Comm_rank(shrunk, &srank); chk_err(ierr);
596   ierr = MPI_Comm_rank(*pcomm, &crank); chk_err(ierr);
597 
598   dprint("After getting ranks, ns = %d, srank = %d, crank = %d.\n",
599          ns, srank, crank);
600 
601   /* Split does the magic: removing spare processes and reordering ranks
602    * so that all surviving processes remain at their former place */
603   rc = MPI_Comm_split(shrunk, (crank < 0) ? MPI_UNDEFINED : 1, crank, &newcomm);
604   ierr = MPI_Comm_rank(newcomm, &newrank); chk_err(ierr);
605   dprint("After split, rc = %d, rank = %d.\n", rc, newrank);
606   flag = (rc == MPI_SUCCESS);
607   /* Split or some of the communications above may have failed if
608    * new failures have disrupted the process: we need to
609    * make sure we succeeded at all ranks, or retry until it works. */
610   flag = MPIX_Comm_agree(newcomm, &flag);
611   dprint("After agree, flag = %d.\n", flag);
612 
613   ierr = MPI_Comm_rank(newcomm, &drank); chk_err(ierr);
614   dprint("After rank, drank = %d.\n", drank);
615 
616   ierr = MPI_Comm_free(&shrunk); chk_err(ierr);
617   if (MPI_SUCCESS != flag)
618   {
619     if (MPI_SUCCESS == rc)
620     {
621       ierr = MPI_Comm_free(&newcomm); chk_err(ierr);
622     }
623     goto redo;
624   }
625 
626   {
627     int cmpres;
628     ierr = MPI_Comm_compare(*pcomm, CAF_COMM_WORLD, &cmpres); chk_err(ierr);
629     dprint("Comm_compare(*comm, CAF_COMM_WORLD, res = %d) = %d.\n",
630            cmpres, ierr);
631     ierr = MPI_Comm_compare(*pcomm, alive_comm, &cmpres); chk_err(ierr);
632     dprint("Comm_compare(*comm, alive_comm, res = %d) = %d.\n",
633            cmpres, ierr);
634     if (cmpres == MPI_CONGRUENT)
635     {
636       ierr = MPI_Win_detach(*stat_tok, &img_status); chk_err(ierr);
637       dprint("detached win img_status.\n");
638       ierr = MPI_Win_free(stat_tok); chk_err(ierr);
639       dprint("freed win img_status.\n");
640       ierr = MPI_Win_create(&img_status, sizeof(int), 1, mpi_info_same_size,
641                             newcomm, stat_tok); chk_err(ierr);
642       dprint("(re-)created win img_status.\n");
643       CAF_Win_lock_all(*stat_tok);
644       dprint("Win_lock_all on img_status.\n");
645     }
646   }
647   /* Also free the old communicator before replacing it. */
648   ierr = MPI_Comm_free(pcomm); chk_err(ierr);
649   *pcomm = newcomm;
650 
651   *perr = stopped ? STAT_STOPPED_IMAGE : STAT_FAILED_IMAGE;
652 }
653 #endif
654 
mutex_lock(MPI_Win win,int image_index,size_t index,int * stat,int * acquired_lock,char * errmsg,size_t errmsg_len)655 void mutex_lock(MPI_Win win, int image_index, size_t index, int *stat,
656                 int *acquired_lock, char *errmsg, size_t errmsg_len)
657 {
658   const char msg[] = "Already locked";
659 #if MPI_VERSION >= 3
660   int value = 0, compare = 0, newval = caf_this_image, ierr = 0, i = 0;
661 #ifdef WITH_FAILED_IMAGES
662   int flag, check_failure = 100, zero = 0;
663 #endif
664 
665   if (stat != NULL)
666     *stat = 0;
667 
668 #ifdef WITH_FAILED_IMAGES
669   ierr = MPI_Test(&alive_request, &flag, MPI_STATUS_IGNORE); chk_err(ierr);
670 #endif
671 
672   locking_atomic_op(win, &value, newval, compare, image_index, index);
673 
674   if (value == caf_this_image && image_index == caf_this_image)
675     goto stat_error;
676 
677   if (acquired_lock != NULL)
678   {
679     if (value == 0)
680       *acquired_lock = 1;
681     else
682       *acquired_lock = 0;
683     return;
684   }
685 
686   while (value != 0)
687   {
688     ++i;
689 #ifdef WITH_FAILED_IMAGES
690     if (i == check_failure)
691     {
692       i = 1;
693       ierr = MPI_Test(&alive_request, &flag, MPI_STATUS_IGNORE); chk_err(ierr);
694     }
695 #endif
696 
697     locking_atomic_op(win, &value, newval, compare, image_index, index);
698 #ifdef WITH_FAILED_IMAGES
699     if (image_stati[value] == STAT_FAILED_IMAGE)
700     {
701       CAF_Win_lock(MPI_LOCK_EXCLUSIVE, image_index - 1, win);
702       /* MPI_Fetch_and_op(&zero, &newval, MPI_INT, image_index - 1,
703        * index * sizeof(int), MPI_REPLACE, win); */
704       ierr = MPI_Compare_and_swap(&zero, &value, &newval, MPI_INT,
705                                   image_index - 1, index * sizeof(int), win);
706       chk_err(ierr);
707       CAF_Win_unlock(image_index - 1, win);
708       break;
709     }
710 #else
711     usleep(caf_this_image * i);
712 #endif
713   }
714 
715   if (stat)
716     *stat = ierr;
717   else if (ierr == STAT_FAILED_IMAGE)
718     terminate_internal(ierr, 0);
719 
720   return;
721 
722 stat_error:
723   if (errmsg != NULL)
724   {
725     memset(errmsg,' ',errmsg_len);
726     memcpy(errmsg, msg, MIN(errmsg_len,strlen(msg)));
727   }
728 
729   if (stat != NULL)
730     *stat = 99;
731   else
732     terminate_internal(99, 1);
733 #else // MPI_VERSION
734 #warning Locking for MPI-2 is not implemented
735   printf("Locking for MPI-2 is not supported, "
736          "please update your MPI implementation\n");
737 #endif // MPI_VERSION
738 }
739 
mutex_unlock(MPI_Win win,int image_index,size_t index,int * stat,char * errmsg,size_t errmsg_len)740 void mutex_unlock(MPI_Win win, int image_index, size_t index, int *stat,
741                   char* errmsg, size_t errmsg_len)
742 {
743   const char msg[] = "Variable is not locked";
744   if (stat != NULL)
745     *stat = 0;
746 #if MPI_VERSION >= 3
747   int value = 1, ierr = 0, newval = 0, flag;
748 #ifdef WITH_FAILED_IMAGES
749   ierr = MPI_Test(&alive_request, &flag, MPI_STATUS_IGNORE); chk_err(ierr);
750 #endif
751 
752   CAF_Win_lock(MPI_LOCK_EXCLUSIVE, image_index - 1, win);
753   ierr = MPI_Fetch_and_op(&newval, &value, MPI_INT, image_index - 1,
754                           index * sizeof(int), MPI_REPLACE, win); chk_err(ierr);
755   ierr = CAF_Win_unlock(image_index - 1, win); chk_err(ierr);
756 
757   /* Temporarily commented */
758   /* if (value == 0)
759    *   goto stat_error; */
760 
761   if (stat)
762     *stat = ierr;
763   else if (ierr == STAT_FAILED_IMAGE)
764     terminate_internal(ierr, 0);
765 
766   return;
767 
768 stat_error:
769   if (errmsg != NULL)
770   {
771     memset(errmsg,' ',errmsg_len);
772     memcpy(errmsg, msg, MIN(errmsg_len,strlen(msg)));
773   }
774   if (stat != NULL)
775     *stat = 99;
776   else
777     terminate_internal(99, 1);
778 #else // MPI_VERSION
779 #warning Locking for MPI-2 is not implemented
780   printf("Locking for MPI-2 is not supported, "
781          "please update your MPI implementation\n");
782 #endif // MPI_VERSION
783 }
784 
785 /* Initialize coarray program.  This routine assumes that no other
786  * MPI initialization happened before. */
787 
788 void
PREFIX(init)789 PREFIX(init) (int *argc, char ***argv)
790 {
791   int flag;
792   if (caf_num_images == 0)
793   {
794     int ierr = 0, i = 0, j = 0, rc, prov_lev = 0;
795     int is_init = 0, prior_thread_level = MPI_THREAD_FUNNELED;
796     ierr = MPI_Initialized(&is_init); chk_err(ierr);
797 
798     if (is_init)
799     {
800       ierr = MPI_Query_thread(&prior_thread_level); chk_err(ierr);
801     }
802 #ifdef HELPER
803     if (is_init)
804     {
805         prov_lev = prior_thread_level;
806         caf_owns_mpi = false;
807     }
808     else
809     {
810         ierr = MPI_Init_thread(argc, argv, MPI_THREAD_MULTIPLE, &prov_lev);
811         chk_err(ierr);
812         caf_owns_mpi = true;
813     }
814 
815     if (caf_this_image == 0 && MPI_THREAD_MULTIPLE != prov_lev)
816       caf_runtime_error("MPI_THREAD_MULTIPLE is not supported: %d", prov_lev);
817 #else
818     if (is_init)
819       caf_owns_mpi = false;
820     else
821     {
822       ierr = MPI_Init_thread(argc, argv, prior_thread_level, &prov_lev);
823       chk_err(ierr);
824       caf_owns_mpi = true;
825       if (caf_this_image == 0 && MPI_THREAD_FUNNELED > prov_lev)
826         caf_runtime_error("MPI_THREAD_FUNNELED is not supported: %d %d", MPI_THREAD_FUNNELED, prov_lev);
827     }
828 #endif
829     if (unlikely ((ierr != MPI_SUCCESS)))
830       caf_runtime_error("Failure when initializing MPI: %d", ierr);
831 
832     /* Duplicate MPI_COMM_WORLD so that no CAF internal functions use it.
833      * This is critical for MPI-interoperability. */
834     rc = MPI_Comm_dup(MPI_COMM_WORLD, &CAF_COMM_WORLD);
835 #ifdef WITH_FAILED_IMAGES
836     flag = (MPI_SUCCESS == rc);
837     rc = MPIX_Comm_agree(MPI_COMM_WORLD, &flag);
838     if (rc != MPI_SUCCESS)
839     {
840       dprint("MPIX_Comm_agree(flag = %d) = %d.\n", flag, rc);
841       fflush(stderr);
842       MPI_Abort(MPI_COMM_WORLD, 10000);
843     }
844     MPI_Barrier(MPI_COMM_WORLD);
845 #endif
846 
847     ierr = MPI_Comm_size(CAF_COMM_WORLD, &caf_num_images); chk_err(ierr);
848     ierr = MPI_Comm_rank(CAF_COMM_WORLD, &caf_this_image); chk_err(ierr);
849 
850     ++caf_this_image;
851     caf_is_finalized = 0;
852 
853     /* BEGIN SYNC IMAGE preparation
854      * Prepare memory for syncing images. */
855     images_full = (int *) calloc(caf_num_images - 1, sizeof(int));
856     for (i = 1, j = 0; i <= caf_num_images; ++i)
857     {
858       if (i != caf_this_image)
859         images_full[j++] = i;
860     }
861 
862     arrived = calloc(caf_num_images, sizeof(int));
863     sync_handles = malloc(caf_num_images * sizeof(MPI_Request));
864     /* END SYNC IMAGE preparation. */
865 
866     stat_tok = malloc(sizeof(MPI_Win));
867 
868     teams_list = (caf_teams_list *)calloc(1, sizeof(caf_teams_list));
869     teams_list->team_id = -1;
870     MPI_Comm *tmp_comm = (MPI_Comm *)calloc(1, sizeof(MPI_Comm));
871     *tmp_comm = CAF_COMM_WORLD;
872     teams_list->team = tmp_comm;
873     teams_list->prev = NULL;
874     used_teams = (caf_used_teams_list *)calloc(1, sizeof(caf_used_teams_list));
875     used_teams->team_list_elem = teams_list;
876     used_teams->prev = NULL;
877 
878 #ifdef WITH_FAILED_IMAGES
879     MPI_Comm_dup(MPI_COMM_WORLD, &alive_comm);
880     /* Handling of failed/stopped images is done by setting an error handler
881      * on a asynchronous request to each other image.  For a failing image
882      * the request will trigger the call of the error handler thus allowing
883      * each other image to handle the failed/stopped image. */
884     ierr = MPI_Comm_create_errhandler(failed_stopped_errorhandler_function,
885                                       &failed_CAF_COMM_mpi_err_handler);
886     chk_err(ierr);
887     ierr = MPI_Comm_set_errhandler(CAF_COMM_WORLD,
888                                    failed_CAF_COMM_mpi_err_handler);
889     chk_err(ierr);
890     ierr = MPI_Comm_set_errhandler(alive_comm,
891                                    failed_CAF_COMM_mpi_err_handler);
892     chk_err(ierr);
893     ierr = MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);
894     chk_err(ierr);
895 
896     ierr = MPI_Irecv(&alive_dummy, 1, MPI_INT, MPI_ANY_SOURCE, MPI_ANY_TAG,
897                      alive_comm, &alive_request); chk_err(ierr);
898 
899     image_stati = (int *) calloc(caf_num_images, sizeof(int));
900 #endif
901 
902 #if MPI_VERSION >= 3
903     ierr = MPI_Info_create(&mpi_info_same_size); chk_err(ierr);
904     ierr = MPI_Info_set(mpi_info_same_size, "same_size", "true"); chk_err(ierr);
905 
906     /* Setting img_status */
907     ierr = MPI_Win_create(&img_status, sizeof(int), 1, mpi_info_same_size,
908                           CAF_COMM_WORLD, stat_tok); chk_err(ierr);
909     CAF_Win_lock_all(*stat_tok);
910 #else
911     ierr = MPI_Win_create(&img_status, sizeof(int), 1, MPI_INFO_NULL,
912                           CAF_COMM_WORLD, stat_tok); chk_err(ierr);
913 #endif // MPI_VERSION
914 
915     /* Create the dynamic window to allow images to asyncronously attach
916      * memory. */
917     ierr = MPI_Win_create_dynamic(MPI_INFO_NULL, CAF_COMM_WORLD,
918                                   &global_dynamic_win); chk_err(ierr);
919     CAF_Win_lock_all(global_dynamic_win);
920   }
921 }
922 
923 
924 /* Internal finalize of coarray program. */
925 
926 void
finalize_internal(int status_code)927 finalize_internal(int status_code)
928 {
929   int ierr;
930   dprint("(status_code = %d)\n", status_code);
931 
932 #ifdef WITH_FAILED_IMAGES
933   no_stopped_images_check_in_errhandler = true;
934   ierr = MPI_Win_flush_all(*stat_tok); chk_err(ierr);
935 #endif
936   /* For future security enclose setting img_status in a lock. */
937   CAF_Win_lock(MPI_LOCK_EXCLUSIVE, caf_this_image - 1, *stat_tok);
938   if (status_code == 0)
939   {
940     img_status = STAT_STOPPED_IMAGE;
941 #ifdef WITH_FAILED_IMAGES
942     image_stati[caf_this_image - 1] = STAT_STOPPED_IMAGE;
943 #endif
944   }
945   else
946   {
947     img_status = status_code;
948 #ifdef WITH_FAILED_IMAGES
949     image_stati[caf_this_image - 1] = status_code;
950 #endif
951   }
952   CAF_Win_unlock(caf_this_image - 1, *stat_tok);
953 
954   /* Announce to all other images, that this one has changed its execution
955    * status. */
956   for (int i = 0; i < caf_num_images - 1; ++i)
957   {
958     ierr = MPI_Send(&img_status, 1, MPI_INT, images_full[i] - 1,
959                     MPI_TAG_CAF_SYNC_IMAGES, CAF_COMM_WORLD); chk_err(ierr);
960   }
961 
962 #ifdef WITH_FAILED_IMAGES
963   /* Terminate the async request before revoking the comm, or we will get
964    * triggered by the errorhandler, which we don't want here anymore. */
965   ierr = MPI_Cancel(&alive_request); chk_err(ierr);
966 
967   if (status_code == 0)
968   {
969     /* In finalization do not report stopped or failed images any more. */
970     ierr = MPI_Errhandler_set(CAF_COMM_WORLD, MPI_ERRORS_RETURN); chk_err(ierr);
971     ierr = MPI_Errhandler_set(alive_comm, MPI_ERRORS_RETURN); chk_err(ierr);
972     /* Only add a conventional barrier to prevent images rom quitting too
973      * early, when this images is not failing. */
974     dprint("Before MPI_Barrier(CAF_COMM_WORLD)\n");
975     ierr = MPI_Barrier(CAF_COMM_WORLD); chk_err(ierr);
976     dprint("After MPI_Barrier(CAF_COMM_WORLD) = %d\n", ierr);
977   }
978   else
979     return;
980 #else
981   /* Add a conventional barrier to prevent images from quitting too early. */
982   if (status_code == 0)
983   {
984     ierr = MPI_Barrier(CAF_COMM_WORLD); chk_err(ierr);
985   }
986   else
987     /* Without failed images support, but a given status_code, we need to
988      * return to the caller, or we will hang in the following instead of
989      * terminating the program. */
990     return;
991 #endif
992 
993 #ifdef GCC_GE_7
994   struct caf_allocated_slave_tokens_t
995     *cur_stok = caf_allocated_slave_tokens,
996     *prev_stok = NULL;
997   CAF_Win_unlock_all(global_dynamic_win);
998   while (cur_stok)
999   {
1000     prev_stok = cur_stok->prev;
1001     ierr = MPI_Win_detach(global_dynamic_win, cur_stok); chk_err(ierr);
1002     if (cur_stok->token->memptr)
1003     {
1004       ierr = MPI_Win_detach(global_dynamic_win, cur_stok->token->memptr);
1005       chk_err(ierr);
1006       free(cur_stok->token->memptr);
1007     }
1008     free(cur_stok->token);
1009     free(cur_stok);
1010     cur_stok = prev_stok;
1011   }
1012 #else
1013   CAF_Win_unlock_all(global_dynamic_win);
1014 #endif
1015 
1016   dprint("Freed all slave tokens.\n");
1017   struct caf_allocated_tokens_t
1018     *cur_tok = caf_allocated_tokens,
1019     *prev = caf_allocated_tokens;
1020   MPI_Win *p;
1021 
1022   while (cur_tok)
1023   {
1024     prev = cur_tok->prev;
1025     p = TOKEN(cur_tok->token);
1026     if (p != NULL)
1027       CAF_Win_unlock_all(*p);
1028 #ifdef GCC_GE_7
1029     /* Unregister the window to the descriptors when freeing the token. */
1030     dprint("MPI_Win_free(p);\n");
1031     ierr = MPI_Win_free(p); chk_err(ierr);
1032     free(cur_tok->token);
1033 #else // GCC_GE_7
1034     ierr = MPI_Win_free(p); chk_err(ierr);
1035 #endif // GCC_GE_7
1036     free(cur_tok);
1037     cur_tok = prev;
1038   }
1039 #if MPI_VERSION >= 3
1040   ierr = MPI_Info_free(&mpi_info_same_size); chk_err(ierr);
1041 #endif // MPI_VERSION
1042 
1043   /* Free the global dynamic window. */
1044   ierr = MPI_Win_free(&global_dynamic_win); chk_err(ierr);
1045 #ifdef WITH_FAILED_IMAGES
1046   if (status_code == 0)
1047   {
1048     dprint("before Win_unlock_all.\n");
1049     CAF_Win_unlock_all(*stat_tok);
1050     dprint("before Win_free(stat_tok)\n");
1051     ierr = MPI_Win_free(stat_tok); chk_err(ierr);
1052     dprint("before Comm_free(CAF_COMM_WORLD)\n");
1053     ierr = MPI_Comm_free(&CAF_COMM_WORLD); chk_err(ierr);
1054     ierr = MPI_Comm_free(&alive_comm); chk_err(ierr);
1055     dprint("after Comm_free(CAF_COMM_WORLD)\n");
1056   }
1057 
1058   ierr = MPI_Errhandler_free(&failed_CAF_COMM_mpi_err_handler); chk_err(ierr);
1059 
1060   /* Only call Finalize if CAF runtime Initialized MPI. */
1061   if (caf_owns_mpi)
1062   {
1063     ierr = MPI_Finalize(); chk_err(ierr);
1064   }
1065 #else
1066   ierr = MPI_Comm_free(&CAF_COMM_WORLD); chk_err(ierr);
1067 
1068   CAF_Win_unlock_all(*stat_tok);
1069   ierr = MPI_Win_free(stat_tok); chk_err(ierr);
1070 
1071   /* Only call Finalize if CAF runtime Initialized MPI. */
1072   if (caf_owns_mpi)
1073   {
1074     ierr = MPI_Finalize(); chk_err(ierr);
1075   }
1076 #endif
1077 
1078   pthread_mutex_lock(&lock_am);
1079   caf_is_finalized = 1;
1080   pthread_mutex_unlock(&lock_am);
1081   free(sync_handles);
1082   dprint("Finalisation done!!!\n");
1083 }
1084 
1085 
1086 /* Finalize coarray program. */
1087 void
PREFIX(finalize)1088 PREFIX(finalize) (void)
1089 {
1090   finalize_internal(0);
1091 }
1092 
1093 /* TODO: This is interface is violating the F2015 standard, but not the gfortran
1094  * API. Fix it (the fortran API). */
1095 int
PREFIX(this_image)1096 PREFIX(this_image) (int distance __attribute__((unused)))
1097 {
1098   return caf_this_image;
1099 }
1100 
1101 /* TODO: This is interface is violating the F2015 standard, but not the gfortran
1102  * API. Fix it (the fortran API). */
1103 int
PREFIX(num_images)1104 PREFIX(num_images) (int distance __attribute__((unused)),
1105                     int failed __attribute__((unused)))
1106 {
1107   return caf_num_images;
1108 }
1109 
1110 #ifdef GCC_GE_7
1111 /* Register an object with the coarray library creating a token where
1112  * necessary/requested.
1113  * See the ABI-documentation of gfortran for the expected behavior.
1114  * Contrary to this expected behavior is this routine not registering memory
1115  * in the descriptor, that is already present.  I.e., when the compiler
1116  * expects the library to allocate the memory for an object in desc, then
1117  * its data_ptr is NULL. This is still missing here.  At the moment the
1118  * compiler also does not make use of it, but it is contrary to the
1119  * documentation. */
1120 void
PREFIX(register)1121 PREFIX(register) (size_t size, caf_register_t type, caf_token_t *token,
1122                   gfc_descriptor_t *desc, int *stat, char *errmsg,
1123                   charlen_t errmsg_len)
1124 {
1125   void *mem = NULL;
1126   size_t actual_size;
1127   int l_var = 0, *init_array = NULL, ierr;
1128 
1129   if (unlikely(caf_is_finalized))
1130     goto error;
1131 
1132   /* Start GASNET if not already started. */
1133   if (caf_num_images == 0)
1134     PREFIX(init) (NULL, NULL);
1135 
1136   if (type == CAF_REGTYPE_LOCK_STATIC || type == CAF_REGTYPE_LOCK_ALLOC ||
1137       type == CAF_REGTYPE_CRITICAL || type == CAF_REGTYPE_EVENT_STATIC ||
1138       type == CAF_REGTYPE_EVENT_ALLOC)
1139   {
1140     actual_size = size * sizeof(int);
1141     l_var = 1;
1142   }
1143   else
1144     actual_size = size;
1145 
1146   switch (type)
1147   {
1148     case CAF_REGTYPE_COARRAY_ALLOC_REGISTER_ONLY:
1149     case CAF_REGTYPE_COARRAY_ALLOC_ALLOCATE_ONLY:
1150       {
1151         /* Create or allocate a slave token. */
1152         mpi_caf_slave_token_t *slave_token;
1153 #ifdef EXTRA_DEBUG_OUTPUT
1154         MPI_Aint mpi_address;
1155 #endif
1156         CAF_Win_unlock_all(global_dynamic_win);
1157         if (type == CAF_REGTYPE_COARRAY_ALLOC_REGISTER_ONLY)
1158         {
1159           *token = calloc(1, sizeof(mpi_caf_slave_token_t));
1160           slave_token = (mpi_caf_slave_token_t *)(*token);
1161           ierr = MPI_Win_attach(global_dynamic_win, *token,
1162                                 sizeof(mpi_caf_slave_token_t)); chk_err(ierr);
1163 #ifdef EXTRA_DEBUG_OUTPUT
1164           ierr = MPI_Get_address(*token, &mpi_address); chk_err(ierr);
1165 #endif
1166           dprint("Attach slave token %p (mpi-address: %zd) to "
1167                  "global_dynamic_window = %d\n",
1168                  slave_token, mpi_address, global_dynamic_win);
1169 
1170           /* Register the memory for auto freeing. */
1171           struct caf_allocated_slave_tokens_t *tmp =
1172             malloc(sizeof(struct caf_allocated_slave_tokens_t));
1173           tmp->prev  = caf_allocated_slave_tokens;
1174           tmp->token = *token;
1175           caf_allocated_slave_tokens = tmp;
1176         }
1177         else // (type == CAF_REGTYPE_COARRAY_ALLOC_ALLOCATE_ONLY)
1178         {
1179           int ierr;
1180           slave_token = (mpi_caf_slave_token_t *)(*token);
1181           mem = malloc(actual_size);
1182           slave_token->memptr = mem;
1183           ierr = MPI_Win_attach(global_dynamic_win, mem, actual_size);
1184           chk_err(ierr);
1185 #ifdef EXTRA_DEBUG_OUTPUT
1186           ierr = MPI_Get_address(mem, &mpi_address); chk_err(ierr);
1187 #endif
1188           dprint("Attach mem %p (mpi-address: %zd) to global_dynamic_window = "
1189                  "%d on slave_token %p, size %zd, ierr: %d\n",
1190                  mem, mpi_address, global_dynamic_win, slave_token,
1191                  actual_size, ierr);
1192           if (desc != NULL && GFC_DESCRIPTOR_RANK(desc) != 0)
1193           {
1194             slave_token->desc = desc;
1195 #ifdef EXTRA_DEBUG_OUTPUT
1196             ierr = MPI_Get_address(desc, &mpi_address); chk_err(ierr);
1197 #endif
1198             dprint("Attached descriptor %p (mpi-address: %zd) to "
1199                    "global_dynamic_window %d at address %p, ierr = %d.\n",
1200                    desc, mpi_address, global_dynamic_win, &slave_token->desc,
1201                    ierr);
1202           }
1203         }
1204         CAF_Win_lock_all(global_dynamic_win);
1205         dprint("Slave token %p on exit: mpi_caf_slave_token_t { desc: %p }\n",
1206                slave_token, slave_token->desc);
1207       }
1208       break;
1209     default:
1210       {
1211         mpi_caf_token_t *mpi_token;
1212         MPI_Win *p;
1213 
1214         *token = calloc(1, sizeof(mpi_caf_token_t));
1215         mpi_token = (mpi_caf_token_t *) (*token);
1216         p = TOKEN(mpi_token);
1217 
1218 #if MPI_VERSION >= 3
1219         ierr = MPI_Win_allocate(actual_size, 1, MPI_INFO_NULL, CAF_COMM_WORLD,
1220                                 &mem, p); chk_err(ierr);
1221         CAF_Win_lock_all(*p);
1222 #else // MPI_VERSION
1223         ierr = MPI_Alloc_mem(actual_size, MPI_INFO_NULL, &mem); chk_err(ierr);
1224         ierr = MPI_Win_create(mem, actual_size, 1, MPI_INFO_NULL,
1225                               CAF_COMM_WORLD, p); chk_err(ierr);
1226 #endif // MPI_VERSION
1227 
1228 #ifndef GCC_GE_8
1229         if (GFC_DESCRIPTOR_RANK(desc) != 0)
1230 #endif
1231           mpi_token->desc = desc;
1232 
1233         if (l_var)
1234         {
1235           init_array = (int *)calloc(size, sizeof(int));
1236           CAF_Win_lock(MPI_LOCK_EXCLUSIVE, caf_this_image - 1, *p);
1237           ierr = MPI_Put(init_array, size, MPI_INT, caf_this_image - 1, 0, size,
1238                          MPI_INT, *p); chk_err(ierr);
1239           CAF_Win_unlock(caf_this_image - 1, *p);
1240           free(init_array);
1241         }
1242 
1243         struct caf_allocated_tokens_t *tmp =
1244             malloc(sizeof(struct caf_allocated_tokens_t));
1245         tmp->prev  = caf_allocated_tokens;
1246         tmp->token = *token;
1247         caf_allocated_tokens = tmp;
1248 
1249         if (stat)
1250           *stat = 0;
1251 
1252         /* The descriptor will be initialized only after the call to
1253          * register. */
1254         mpi_token->memptr = mem;
1255         dprint("Token %p on exit: mpi_caf_token_t "
1256                "{ (local_)memptr: %p, memptr_win: %d }\n",
1257                mpi_token, mpi_token->memptr, mpi_token->memptr_win);
1258       } // default:
1259       break;
1260   } // switch
1261 
1262   desc->base_addr = mem;
1263   return;
1264 
1265 error:
1266   {
1267     char msg[80];
1268     strcpy(msg, "Failed to allocate coarray");
1269     if (caf_is_finalized)
1270       strcat(msg, " - there are stopped images");
1271 
1272     if (stat)
1273     {
1274       *stat = caf_is_finalized ? STAT_STOPPED_IMAGE : 1;
1275       if (errmsg_len > 0)
1276       {
1277         size_t len = (strlen(msg) > (size_t) errmsg_len) ?
1278                      (size_t) errmsg_len : strlen (msg);
1279         memcpy(errmsg, msg, len);
1280         if ((size_t) errmsg_len > len)
1281           memset(&errmsg[len], ' ', errmsg_len - len);
1282       }
1283     }
1284     else
1285       caf_runtime_error(msg);
1286   }
1287 }
1288 #else // GCC_LT_7
1289 void *
PREFIX(register)1290 PREFIX(register) (size_t size, caf_register_t type, caf_token_t *token,
1291                   int *stat, char *errmsg, charlen_t errmsg_len)
1292 {
1293   void *mem;
1294   size_t actual_size;
1295   int l_var = 0, *init_array = NULL, ierr;
1296 
1297   if (unlikely(caf_is_finalized))
1298     goto error;
1299 
1300   /* Start GASNET if not already started. */
1301   if (caf_num_images == 0)
1302 #ifdef COMPILER_SUPPORTS_CAF_INTRINSICS
1303     _gfortran_caf_init(NULL, NULL);
1304 #else
1305     PREFIX(init) (NULL, NULL);
1306 #endif
1307 
1308   /* Token contains only a list of pointers. */
1309   *token = malloc(sizeof(MPI_Win));
1310   MPI_Win *p = *token;
1311 
1312   if (type == CAF_REGTYPE_LOCK_STATIC || type == CAF_REGTYPE_LOCK_ALLOC ||
1313       type == CAF_REGTYPE_CRITICAL || type == CAF_REGTYPE_EVENT_STATIC ||
1314       type == CAF_REGTYPE_EVENT_ALLOC)
1315   {
1316     actual_size = size * sizeof(int);
1317     l_var = 1;
1318   }
1319   else
1320     actual_size = size;
1321 
1322 #if MPI_VERSION >= 3
1323   ierr = MPI_Win_allocate(actual_size, 1, mpi_info_same_size, CAF_COMM_WORLD,
1324                           &mem, p); chk_err(ierr);
1325   CAF_Win_lock_all(*p);
1326 #else // MPI_VERSION
1327   ierr = MPI_Alloc_mem(actual_size, MPI_INFO_NULL, &mem); chk_err(ierr);
1328   ierr = MPI_Win_create(mem, actual_size, 1, MPI_INFO_NULL,
1329                         CAF_COMM_WORLD, p); chk_err(ierr);
1330 #endif // MPI_VERSION
1331 
1332   if (l_var)
1333   {
1334     init_array = (int *)calloc(size, sizeof(int));
1335     CAF_Win_lock(MPI_LOCK_EXCLUSIVE, caf_this_image - 1, *p);
1336     ierr = MPI_Put(init_array, size, MPI_INT, caf_this_image - 1, 0, size,
1337                    MPI_INT, *p); chk_err(ierr);
1338     CAF_Win_unlock(caf_this_image - 1, *p);
1339     free(init_array);
1340   }
1341 
1342   PREFIX(sync_all) (NULL, NULL, 0);
1343 
1344   struct caf_allocated_tokens_t *tmp =
1345          malloc(sizeof(struct caf_allocated_tokens_t));
1346   tmp->prev  = caf_allocated_tokens;
1347   tmp->token = *token;
1348   caf_allocated_tokens = tmp;
1349 
1350   if (stat)
1351     *stat = 0;
1352   return mem;
1353 
1354 error:
1355   {
1356     char msg[80];
1357     strcpy(msg, "Failed to allocate coarray");
1358     if (caf_is_finalized)
1359       strcat(msg, " - there are stopped images");
1360 
1361     if (stat)
1362     {
1363       *stat = caf_is_finalized ? STAT_STOPPED_IMAGE : 1;
1364       if (errmsg_len > 0)
1365       {
1366         size_t len = (strlen(msg) > (size_t) errmsg_len) ?
1367                      (size_t) errmsg_len : strlen (msg);
1368         memcpy(errmsg, msg, len);
1369         if (errmsg_len > len)
1370           memset(&errmsg[len], ' ', errmsg_len - len);
1371       }
1372     }
1373     else
1374       caf_runtime_error(msg);
1375   }
1376   return NULL;
1377 }
1378 #endif // GCC_GE_7
1379 
1380 
1381 #ifdef GCC_GE_7
1382 void
PREFIX(deregister)1383 PREFIX(deregister) (caf_token_t *token, int type, int *stat, char *errmsg,
1384                     charlen_t errmsg_len)
1385 #else
1386 void
1387 PREFIX(deregister) (caf_token_t *token, int *stat, char *errmsg,
1388                     charlen_t errmsg_len)
1389 #endif
1390 {
1391   dprint("deregister(%p)\n", *token);
1392   int ierr;
1393 
1394   if (unlikely(caf_is_finalized))
1395   {
1396     const char msg[] =
1397       "Failed to deallocate coarray - there are stopped images";
1398     if (stat)
1399     {
1400       *stat = STAT_STOPPED_IMAGE;
1401 
1402       if (errmsg_len > 0)
1403       {
1404         size_t len = (sizeof(msg) - 1 > (size_t) errmsg_len) ?
1405           (size_t) errmsg_len : sizeof (msg) - 1;
1406         memcpy(errmsg, msg, len);
1407         if (errmsg_len > len)
1408           memset(&errmsg[len], ' ', errmsg_len - len);
1409       }
1410       return;
1411     }
1412     caf_runtime_error(msg);
1413   }
1414 
1415   if (stat)
1416     *stat = 0;
1417 
1418 #ifdef GCC_GE_7
1419   if (type != CAF_DEREGTYPE_COARRAY_DEALLOCATE_ONLY)
1420   {
1421     /* Sync all images only, when deregistering the token. Just freeing the
1422      * memory needs no sync. */
1423 #ifdef WITH_FAILED_IMAGES
1424     ierr = MPI_Barrier(CAF_COMM_WORLD); chk_err(ierr);
1425 #else
1426     PREFIX(sync_all) (NULL, NULL, 0);
1427 #endif
1428   }
1429 #endif // GCC_GE_7
1430   {
1431     struct caf_allocated_tokens_t
1432       *cur = caf_allocated_tokens,
1433       *next = caf_allocated_tokens,
1434       *prev;
1435     MPI_Win *p;
1436 
1437     while (cur)
1438     {
1439       prev = cur->prev;
1440 
1441       if (cur->token == *token)
1442       {
1443         p = TOKEN(*token);
1444 #ifdef GCC_GE_7
1445         dprint("Found regular token %p for memptr_win: %d.\n",
1446                *token, ((mpi_caf_token_t *)*token)->memptr_win);
1447 #endif
1448         CAF_Win_unlock_all(*p);
1449         ierr = MPI_Win_free(p); chk_err(ierr);
1450 
1451         next->prev = prev ? prev->prev:  NULL;
1452 
1453         if (cur == caf_allocated_tokens)
1454           caf_allocated_tokens = prev;
1455 
1456         free(cur);
1457         free(*token);
1458         return;
1459       }
1460       next = cur;
1461       cur = prev;
1462     }
1463   }
1464 
1465 #ifdef GCC_GE_7
1466   /* Feel through: Has to be a component token. */
1467   {
1468     struct caf_allocated_slave_tokens_t
1469       *cur_stok = caf_allocated_slave_tokens,
1470       *next_stok = caf_allocated_slave_tokens,
1471       *prev_stok;
1472 
1473     while (cur_stok)
1474     {
1475       prev_stok = cur_stok->prev;
1476 
1477       if (cur_stok->token == *token)
1478       {
1479         dprint("Found sub token %p.\n", *token);
1480 
1481         mpi_caf_slave_token_t *slave_token = *(mpi_caf_slave_token_t **)token;
1482         CAF_Win_unlock_all(global_dynamic_win);
1483 
1484         if (slave_token->memptr)
1485         {
1486           ierr = MPI_Win_detach(global_dynamic_win, slave_token->memptr);
1487           chk_err(ierr);
1488           free(slave_token->memptr);
1489           slave_token->memptr = NULL;
1490           if (type == CAF_DEREGTYPE_COARRAY_DEALLOCATE_ONLY)
1491           {
1492             CAF_Win_lock_all(global_dynamic_win);
1493             return; // All done.
1494           }
1495         }
1496         ierr = MPI_Win_detach(global_dynamic_win, slave_token); chk_err(ierr);
1497         CAF_Win_lock_all(global_dynamic_win);
1498 
1499         next_stok->prev = prev_stok ? prev_stok->prev: NULL;
1500 
1501         if (cur_stok == caf_allocated_slave_tokens)
1502           caf_allocated_slave_tokens = prev_stok;
1503 
1504         free(cur_stok);
1505         free(*token);
1506         return;
1507       }
1508 
1509       next_stok = cur_stok;
1510       cur_stok = prev_stok;
1511     }
1512   }
1513 #endif // GCC_GE_7
1514 #ifdef EXTRA_DEBUG_OUTPUT
1515   fprintf(stderr,
1516           "Fortran runtime warning on image %d: "
1517           "Could not find token to free %p", caf_this_image, *token);
1518 #endif
1519 }
1520 
1521 void
PREFIX(sync_memory)1522 PREFIX(sync_memory) (int *stat __attribute__((unused)),
1523                      char *errmsg __attribute__((unused)),
1524                      charlen_t errmsg_len __attribute__((unused)))
1525 {
1526 #if defined(NONBLOCKING_PUT) && !defined(CAF_MPI_LOCK_UNLOCK)
1527   explicit_flush();
1528 #endif
1529 }
1530 
1531 
1532 void
PREFIX(sync_all)1533 PREFIX(sync_all) (int *stat, char *errmsg, charlen_t errmsg_len)
1534 {
1535   int err = 0, ierr;
1536 
1537   dprint("Entering sync all.\n");
1538   if (unlikely(caf_is_finalized))
1539   {
1540     err = STAT_STOPPED_IMAGE;
1541   }
1542   else
1543   {
1544 #if defined(NONBLOCKING_PUT) && !defined(CAF_MPI_LOCK_UNLOCK)
1545     explicit_flush();
1546 #endif
1547 
1548 #ifdef WITH_FAILED_IMAGES
1549     ierr = MPI_Barrier(alive_comm); chk_err(ierr);
1550 #else
1551     ierr = MPI_Barrier(CAF_COMM_WORLD); chk_err(ierr);
1552 #endif
1553     dprint("MPI_Barrier = %d.\n", err);
1554     if (ierr == STAT_FAILED_IMAGE)
1555       err = STAT_FAILED_IMAGE;
1556     else if (ierr != 0)
1557       MPI_Error_class(ierr, &err);
1558   }
1559 
1560   if (stat != NULL)
1561     *stat = err;
1562 #ifdef WITH_FAILED_IMAGES
1563   else if (err == STAT_FAILED_IMAGE)
1564     /* F2015 requests stat to be set for FAILED IMAGES, else error out. */
1565     terminate_internal(err, 0);
1566 #endif
1567 
1568   if (err != 0 && err != STAT_FAILED_IMAGE)
1569   {
1570     char msg[80];
1571     strcpy(msg, "SYNC ALL failed");
1572     if (caf_is_finalized)
1573       strcat(msg, " - there are stopped images");
1574 
1575     if (errmsg_len > 0)
1576     {
1577       size_t len = (strlen(msg) > (size_t) errmsg_len) ?
1578                    (size_t) errmsg_len : strlen (msg);
1579       memcpy(errmsg, msg, len);
1580       if (errmsg_len > len)
1581         memset(&errmsg[len], ' ', errmsg_len - len);
1582     }
1583     else if (stat == NULL)
1584       caf_runtime_error(msg);
1585   }
1586   dprint("Leaving sync all.\n");
1587 }
1588 
1589 /* Convert kind 4 characters into kind 1 one.
1590  * Copied from the gcc:libgfortran/caf/single.c. */
1591 static void
assign_char4_from_char1(size_t dst_size,size_t src_size,uint32_t * dst,unsigned char * src)1592 assign_char4_from_char1(size_t dst_size, size_t src_size, uint32_t *dst,
1593                         unsigned char *src)
1594 {
1595   size_t i, n;
1596   n = (dst_size > src_size) ? src_size : dst_size;
1597   for (i = 0; i < n; ++i)
1598   {
1599     dst[i] = (int32_t) src[i];
1600   }
1601   for (; i < dst_size; ++i)
1602   {
1603     dst[i] = (int32_t) ' ';
1604   }
1605 }
1606 
1607 
1608 /* Convert kind 1 characters into kind 4 one.
1609  * Copied from the gcc:libgfortran/caf/single.c. */
1610 static void
assign_char1_from_char4(size_t dst_size,size_t src_size,unsigned char * dst,uint32_t * src)1611 assign_char1_from_char4(size_t dst_size, size_t src_size, unsigned char *dst,
1612                         uint32_t *src)
1613 {
1614   size_t i, n;
1615   n = (dst_size > src_size) ? src_size : dst_size;
1616   for (i = 0; i < n; ++i)
1617   {
1618     dst[i] = src[i] > UINT8_MAX ? (unsigned char) '?' : (unsigned char) src[i];
1619   }
1620   if (dst_size > n)
1621     memset(&dst[n], ' ', dst_size - n);
1622 }
1623 
1624 /* Convert convertable types.
1625  * Copied from the gcc:libgfortran/caf/single.c. Can't say much about it. */
1626 static void
convert_type(void * dst,int dst_type,int dst_kind,void * src,int src_type,int src_kind,int * stat)1627 convert_type(void *dst, int dst_type, int dst_kind, void *src, int src_type,
1628              int src_kind, int *stat)
1629 {
1630 #ifdef HAVE_GFC_INTEGER_16
1631   typedef __int128 int128t;
1632 #else
1633   typedef int64_t int128t;
1634 #endif
1635 
1636 #if defined(GFC_REAL_16_IS_LONG_DOUBLE)
1637   typedef long double real128t;
1638   typedef _Complex long double complex128t;
1639 #elif defined(HAVE_GFC_REAL_16)
1640   typedef _Complex float __attribute__((mode(TC))) __complex128;
1641   typedef __float128 real128t;
1642   typedef __complex128 complex128t;
1643 #elif defined(HAVE_GFC_REAL_10)
1644   typedef long double real128t;
1645   typedef long double complex128t;
1646 #else
1647   typedef double real128t;
1648   typedef _Complex double complex128t;
1649 #endif
1650 
1651   int128t int_val = 0;
1652   real128t real_val = 0;
1653   complex128t cmpx_val = 0;
1654 
1655   switch (src_type)
1656   {
1657     case BT_INTEGER:
1658       if (src_kind == 1)
1659         int_val = *(int8_t*) src;
1660       else if (src_kind == 2)
1661         int_val = *(int16_t*) src;
1662       else if (src_kind == 4)
1663         int_val = *(int32_t*) src;
1664       else if (src_kind == 8)
1665         int_val = *(int64_t*) src;
1666 #ifdef HAVE_GFC_INTEGER_16
1667       else if (src_kind == 16)
1668         int_val = *(int128t*) src;
1669 #endif
1670       else
1671         goto error;
1672       break;
1673     case BT_REAL:
1674       if (src_kind == 4)
1675         real_val = *(float*) src;
1676       else if (src_kind == 8)
1677         real_val = *(double*) src;
1678 #ifdef HAVE_GFC_REAL_10
1679       else if (src_kind == 10)
1680         real_val = *(long double*) src;
1681 #endif
1682 #ifdef HAVE_GFC_REAL_16
1683       else if (src_kind == 16)
1684         real_val = *(real128t*) src;
1685 #endif
1686       else
1687         goto error;
1688       break;
1689     case BT_COMPLEX:
1690       if (src_kind == 4)
1691         cmpx_val = *(_Complex float*) src;
1692       else if (src_kind == 8)
1693         cmpx_val = *(_Complex double*) src;
1694 #ifdef HAVE_GFC_REAL_10
1695       else if (src_kind == 10)
1696         cmpx_val = *(_Complex long double*) src;
1697 #endif
1698 #ifdef HAVE_GFC_REAL_16
1699       else if (src_kind == 16)
1700         cmpx_val = *(complex128t*) src;
1701 #endif
1702       else
1703         goto error;
1704       break;
1705     default:
1706       goto error;
1707   }
1708 
1709   switch (dst_type)
1710   {
1711     case BT_INTEGER:
1712       if (src_type == BT_INTEGER)
1713       {
1714         if (dst_kind == 1)
1715           *(int8_t*) dst = (int8_t) int_val;
1716         else if (dst_kind == 2)
1717           *(int16_t*) dst = (int16_t) int_val;
1718         else if (dst_kind == 4)
1719           *(int32_t*) dst = (int32_t) int_val;
1720         else if (dst_kind == 8)
1721           *(int64_t*) dst = (int64_t) int_val;
1722 #ifdef HAVE_GFC_INTEGER_16
1723         else if (dst_kind == 16)
1724           *(int128t*) dst = (int128t) int_val;
1725 #endif
1726         else
1727           goto error;
1728       }
1729       else if (src_type == BT_REAL)
1730       {
1731         if (dst_kind == 1)
1732           *(int8_t*) dst = (int8_t) real_val;
1733         else if (dst_kind == 2)
1734           *(int16_t*) dst = (int16_t) real_val;
1735         else if (dst_kind == 4)
1736           *(int32_t*) dst = (int32_t) real_val;
1737         else if (dst_kind == 8)
1738           *(int64_t*) dst = (int64_t) real_val;
1739 #ifdef HAVE_GFC_INTEGER_16
1740         else if (dst_kind == 16)
1741           *(int128t*) dst = (int128t) real_val;
1742 #endif
1743         else
1744           goto error;
1745       }
1746       else if (src_type == BT_COMPLEX)
1747       {
1748         if (dst_kind == 1)
1749           *(int8_t*) dst = (int8_t) cmpx_val;
1750         else if (dst_kind == 2)
1751           *(int16_t*) dst = (int16_t) cmpx_val;
1752         else if (dst_kind == 4)
1753           *(int32_t*) dst = (int32_t) cmpx_val;
1754         else if (dst_kind == 8)
1755           *(int64_t*) dst = (int64_t) cmpx_val;
1756 #ifdef HAVE_GFC_INTEGER_16
1757         else if (dst_kind == 16)
1758           *(int128t*) dst = (int128t) cmpx_val;
1759 #endif
1760         else
1761           goto error;
1762       }
1763       else
1764         goto error;
1765       return;
1766     case BT_REAL:
1767       if (src_type == BT_INTEGER)
1768       {
1769         if (dst_kind == 4)
1770           *(float*) dst = (float) int_val;
1771         else if (dst_kind == 8)
1772           *(double*) dst = (double) int_val;
1773 #ifdef HAVE_GFC_REAL_10
1774         else if (dst_kind == 10)
1775           *(long double*) dst = (long double) int_val;
1776 #endif
1777 #ifdef HAVE_GFC_REAL_16
1778         else if (dst_kind == 16)
1779           *(real128t*) dst = (real128t) int_val;
1780 #endif
1781         else
1782           goto error;
1783       }
1784       else if (src_type == BT_REAL)
1785       {
1786         if (dst_kind == 4)
1787           *(float*) dst = (float) real_val;
1788         else if (dst_kind == 8)
1789           *(double*) dst = (double) real_val;
1790 #ifdef HAVE_GFC_REAL_10
1791         else if (dst_kind == 10)
1792           *(long double*) dst = (long double) real_val;
1793 #endif
1794 #ifdef HAVE_GFC_REAL_16
1795         else if (dst_kind == 16)
1796           *(real128t*) dst = (real128t) real_val;
1797 #endif
1798         else
1799           goto error;
1800       }
1801       else if (src_type == BT_COMPLEX)
1802       {
1803         if (dst_kind == 4)
1804           *(float*) dst = (float) cmpx_val;
1805         else if (dst_kind == 8)
1806           *(double*) dst = (double) cmpx_val;
1807 #ifdef HAVE_GFC_REAL_10
1808         else if (dst_kind == 10)
1809           *(long double*) dst = (long double) cmpx_val;
1810 #endif
1811 #ifdef HAVE_GFC_REAL_16
1812         else if (dst_kind == 16)
1813           *(real128t*) dst = (real128t) cmpx_val;
1814 #endif
1815         else
1816           goto error;
1817       }
1818       return;
1819     case BT_COMPLEX:
1820       if (src_type == BT_INTEGER)
1821       {
1822         if (dst_kind == 4)
1823           *(_Complex float*) dst = (_Complex float) int_val;
1824         else if (dst_kind == 8)
1825           *(_Complex double*) dst = (_Complex double) int_val;
1826 #ifdef HAVE_GFC_REAL_10
1827         else if (dst_kind == 10)
1828           *(_Complex long double*) dst = (_Complex long double) int_val;
1829 #endif
1830 #ifdef HAVE_GFC_REAL_16
1831         else if (dst_kind == 16)
1832           *(complex128t*) dst = (complex128t) int_val;
1833 #endif
1834         else
1835           goto error;
1836       }
1837       else if (src_type == BT_REAL)
1838       {
1839         if (dst_kind == 4)
1840           *(_Complex float*) dst = (_Complex float) real_val;
1841         else if (dst_kind == 8)
1842           *(_Complex double*) dst = (_Complex double) real_val;
1843 #ifdef HAVE_GFC_REAL_10
1844         else if (dst_kind == 10)
1845           *(_Complex long double*) dst = (_Complex long double) real_val;
1846 #endif
1847 #ifdef HAVE_GFC_REAL_16
1848         else if (dst_kind == 16)
1849           *(complex128t*) dst = (complex128t) real_val;
1850 #endif
1851         else
1852           goto error;
1853       }
1854       else if (src_type == BT_COMPLEX)
1855       {
1856         if (dst_kind == 4)
1857           *(_Complex float*) dst = (_Complex float) cmpx_val;
1858         else if (dst_kind == 8)
1859           *(_Complex double*) dst = (_Complex double) cmpx_val;
1860 #ifdef HAVE_GFC_REAL_10
1861         else if (dst_kind == 10)
1862           *(_Complex long double*) dst = (_Complex long double) cmpx_val;
1863 #endif
1864 #ifdef HAVE_GFC_REAL_16
1865         else if (dst_kind == 16)
1866           *(complex128t*) dst = (complex128t) cmpx_val;
1867 #endif
1868         else
1869           goto error;
1870       }
1871       else
1872         goto error;
1873       return;
1874     default:
1875       goto error;
1876   }
1877 
1878 error:
1879   fprintf(stderr,
1880           "libcaf_mpi RUNTIME ERROR: Cannot convert type %d kind %d "
1881           "to type %d kind %d\n", src_type, src_kind, dst_type, dst_kind);
1882   if (stat)
1883     *stat = 1;
1884   else
1885     abort();
1886 }
1887 
1888 static void
convert_with_strides(void * dst,int dst_type,int dst_kind,ptrdiff_t byte_dst_stride,void * src,int src_type,int src_kind,ptrdiff_t byte_src_stride,size_t num,int * stat)1889 convert_with_strides(void *dst, int dst_type, int dst_kind,
1890                      ptrdiff_t byte_dst_stride,
1891                      void *src, int src_type, int src_kind,
1892                      ptrdiff_t byte_src_stride, size_t num, int *stat)
1893 {
1894   /* Compute the step from one item to convert to the next in bytes. The stride
1895    * is expected to be the one or similar to the array.stride, i.e. *_stride is
1896    * expected to be >= 1 to progress from one item to the next. */
1897   for (size_t i = 0; i < num; ++i)
1898   {
1899     convert_type(dst, dst_type, dst_kind, src, src_type, src_kind, stat);
1900     dst += byte_dst_stride;
1901     src += byte_src_stride;
1902   }
1903 }
1904 
1905 static void
copy_char_to_self(void * src,int src_type,int src_size,int src_kind,void * dst,int dst_type,int dst_size,int dst_kind,size_t size,bool src_is_scalar)1906 copy_char_to_self(void *src, int src_type, int src_size, int src_kind,
1907                   void *dst, int dst_type, int dst_size, int dst_kind,
1908                   size_t size, bool src_is_scalar)
1909 {
1910 #ifdef GFC_CAF_CHECK
1911   if (dst_type != BT_CHARACTER || src_type != BT_CHARACTER)
1912     caf_runtime_error("internal error: copy_char_to_self() "
1913                       "for non-char types called.");
1914 #endif
1915   const size_t
1916       dst_len = dst_size / dst_kind,
1917       src_len = src_size / src_kind;
1918   const size_t min_len = (src_len < dst_len) ? src_len : dst_len;
1919   /* The address of dest passed by the compiler points on the right  memory
1920    * location. No offset summation is needed. */
1921   if (dst_kind == src_kind)
1922   {
1923     for (size_t c = 0; c < size; ++c)
1924     {
1925       memmove(dst, src, min_len * dst_kind);
1926       /* Fill dest when source is too short. */
1927       if (dst_len > src_len)
1928       {
1929         int32_t * dest_addr = (int32_t *)(dst + dst_kind * src_len);
1930         const size_t pad_num = dst_len - src_len;
1931         if (dst_kind == 1)
1932           memset(dest_addr, ' ', pad_num);
1933         else if (dst_kind == 4)
1934         {
1935           const void * end_addr = &(dest_addr[pad_num]);
1936           while (dest_addr != end_addr)
1937             *(dest_addr++) = (int32_t)' ';
1938         }
1939         else
1940           caf_runtime_error(unreachable);
1941       }
1942       dst = (void *)((ptrdiff_t)(dst) + dst_size);
1943       if (!src_is_scalar)
1944         src = (void *)((ptrdiff_t)(src) + src_size);
1945     }
1946   }
1947   else
1948   {
1949     /* Assign using kind-conversion. */
1950     if (dst_kind == 1 && src_kind == 4)
1951       for (size_t c = 0; c < size; ++c)
1952       {
1953         assign_char1_from_char4(dst_len, src_len, dst, src);
1954         dst = (void *)((ptrdiff_t)(dst) + dst_size);
1955         if (!src_is_scalar)
1956           src = (void *)((ptrdiff_t)(src) + src_size);
1957       }
1958     else if (dst_kind == 4 && src_kind == 1)
1959       for (size_t c = 0; c < size; ++c)
1960       {
1961         assign_char4_from_char1(dst_len, src_len, dst, src);
1962         dst = (void *)((ptrdiff_t)(dst) + dst_size);
1963         if (!src_is_scalar)
1964           src = (void *)((ptrdiff_t)(src) + src_size);
1965       }
1966     else
1967       caf_runtime_error("_caf_send(): Unsupported char kinds in same image "
1968                         "assignment (kind(lhs)= %d, kind(rhs) = %d)",
1969                         dst_kind, src_kind);
1970   }
1971 }
1972 
1973 static void
copy_to_self(gfc_descriptor_t * src,int src_kind,gfc_descriptor_t * dest,int dst_kind,size_t size,int * stat)1974 copy_to_self(gfc_descriptor_t *src, int src_kind,
1975               gfc_descriptor_t *dest, int dst_kind, size_t size, int *stat)
1976 {
1977 #ifdef GFC_CAF_CHECK
1978   if (GFC_DESCRIPTOR_TYPE(dest) == BT_CHARACTER
1979       || GFC_DESCRIPTOR_TYPE(src) == BT_CHARACTER)
1980     caf_runtime_error("internal error: copy_to_self() for char types called.");
1981 #endif
1982   /* The address of dest passed by the compiler points on the right
1983    * memory location. No offset summation is needed. */
1984   if (dst_kind == src_kind)
1985     memmove(dest->base_addr, src->base_addr, size * GFC_DESCRIPTOR_SIZE(dest));
1986   else
1987     /* When the rank is 0 then a scalar is copied to a vector and the stride
1988      * is zero. */
1989     convert_with_strides(dest->base_addr, GFC_DESCRIPTOR_TYPE(dest), dst_kind,
1990                          GFC_DESCRIPTOR_SIZE(dest), src->base_addr,
1991                          GFC_DESCRIPTOR_TYPE(src), src_kind,
1992                          (GFC_DESCRIPTOR_RANK(src) > 0)
1993                          ? GFC_DESCRIPTOR_SIZE(src) : 0, size, stat);
1994 }
1995 
1996 /* token: The token of the array to be written to.
1997  * offset: Difference between the coarray base address and the actual data,
1998  * used for caf(3)[2] = 8 or caf[4]%a(4)%b = 7.
1999  * image_index: Index of the coarray (typically remote,
2000  * though it can also be on this_image).
2001  * data: Pointer to the to-be-transferred data.
2002  * size: The number of bytes to be transferred.
2003  * asynchronous: Return before the data transfer has been complete */
2004 
selectType(int size,MPI_Datatype * dt)2005 void selectType(int size, MPI_Datatype *dt)
2006 {
2007   int t_s;
2008 
2009 #define SELTYPE(type) MPI_Type_size(type, &t_s);  \
2010 if (t_s == size)                                  \
2011 {                                                 \
2012   *dt = type;                                     \
2013   return;                                         \
2014 }
2015 
2016   SELTYPE(MPI_BYTE)
2017   SELTYPE(MPI_SHORT)
2018   SELTYPE(MPI_INT)
2019   SELTYPE(MPI_DOUBLE)
2020   SELTYPE(MPI_COMPLEX)
2021   SELTYPE(MPI_DOUBLE_COMPLEX)
2022 
2023 #undef SELTYPE
2024 }
2025 
2026 void
PREFIX(sendget)2027 PREFIX(sendget) (caf_token_t token_s, size_t offset_s, int image_index_s,
2028                  gfc_descriptor_t *dest, caf_vector_t *dst_vector,
2029                  caf_token_t token_g, size_t offset_g, int image_index_g,
2030                  gfc_descriptor_t *src , caf_vector_t *src_vector,
2031                  int dst_kind, int src_kind, bool mrt, int *pstat)
2032 {
2033   int j, ierr = 0;
2034   size_t i, size;
2035   ptrdiff_t dimextent;
2036   const int
2037     src_rank = GFC_DESCRIPTOR_RANK(src),
2038     dst_rank = GFC_DESCRIPTOR_RANK(dest);
2039   const size_t
2040     src_size = GFC_DESCRIPTOR_SIZE(src),
2041     dst_size = GFC_DESCRIPTOR_SIZE(dest);
2042   const int
2043     src_type = GFC_DESCRIPTOR_TYPE(src),
2044     dst_type = GFC_DESCRIPTOR_TYPE(dest);
2045   const bool
2046     src_contiguous = PREFIX(is_contiguous) (src),
2047     dst_contiguous = PREFIX(is_contiguous) (dest);
2048   const bool
2049     src_same_image = caf_this_image == image_index_g,
2050     dst_same_image = caf_this_image == image_index_s,
2051     same_type_and_kind = dst_type == src_type && dst_kind == src_kind;
2052 
2053   MPI_Win *p = TOKEN(token_g);
2054   ptrdiff_t src_offset = 0, dst_offset = 0;
2055   void *pad_str = NULL;
2056   bool free_pad_str = false;
2057   void *src_t_buff = NULL, *dst_t_buff = NULL;
2058   bool free_src_t_buff = false, free_dst_t_buff = false;
2059   const bool
2060     dest_char_array_is_longer = dst_type == BT_CHARACTER && dst_size > src_size;
2061   int
2062     src_remote_image = image_index_g - 1,
2063     dst_remote_image = image_index_s - 1;
2064 
2065   if (!src_same_image)
2066   {
2067     MPI_Group current_team_group, win_group;
2068     ierr = MPI_Comm_group(CAF_COMM_WORLD, &current_team_group); chk_err(ierr);
2069     ierr = MPI_Win_get_group(*p, &win_group); chk_err(ierr);
2070     ierr = MPI_Group_translate_ranks(current_team_group, 1,
2071                                      (int[]){src_remote_image}, win_group,
2072                                      &src_remote_image); chk_err(ierr);
2073     ierr = MPI_Group_free(&current_team_group); chk_err(ierr);
2074     ierr = MPI_Group_free(&win_group); chk_err(ierr);
2075   }
2076   if (!dst_same_image)
2077   {
2078     MPI_Group current_team_group, win_group;
2079     ierr = MPI_Comm_group(CAF_COMM_WORLD, &current_team_group); chk_err(ierr);
2080     ierr = MPI_Win_get_group(*p, &win_group); chk_err(ierr);
2081     ierr = MPI_Group_translate_ranks(current_team_group, 1,
2082                                      (int[]){dst_remote_image}, win_group,
2083                                      &dst_remote_image); chk_err(ierr);
2084     ierr = MPI_Group_free(&current_team_group); chk_err(ierr);
2085     ierr = MPI_Group_free(&win_group); chk_err(ierr);
2086   }
2087 
2088   /* Ensure stat is always set. */
2089 #ifdef GCC_GE_7
2090   int * stat = pstat;
2091   if (stat)
2092     *stat = 0;
2093 #else
2094   /* Gcc prior to 7.0 does not have stat here. */
2095   int * stat = NULL;
2096 #endif
2097 
2098   size = 1;
2099   for (j = 0; j < dst_rank; ++j)
2100   {
2101     dimextent = dest->dim[j]._ubound - dest->dim[j].lower_bound + 1;
2102     if (dimextent < 0)
2103       dimextent = 0;
2104     size *= dimextent;
2105   }
2106 
2107   if (size == 0)
2108     return;
2109 
2110   dprint("src_vector = %p, dst_vector = %p, src_image_index = %d, "
2111          "dst_image_index = %d, offset_src = %zd, offset_dst = %zd.\n",
2112          src_vector, dst_vector, image_index_g, image_index_s,
2113          offset_g, offset_s);
2114   check_image_health(image_index_g, stat);
2115   check_image_health(image_index_s, stat);
2116 
2117   /* For char arrays: create the padding array, when dst is longer than src. */
2118   if (dest_char_array_is_longer)
2119   {
2120     const size_t pad_num = (dst_size / dst_kind) - (src_size / src_kind);
2121     const size_t pad_sz = pad_num * dst_kind;
2122     /* For big arrays alloca() may not be able to get the memory on the stack.
2123      * Use a regular malloc then. */
2124     if ((free_pad_str = ((pad_str = alloca(pad_sz)) == NULL)))
2125     {
2126       pad_str = malloc(pad_sz);
2127       if (src_t_buff == NULL)
2128         caf_runtime_error("Unable to allocate memory "
2129                           "for internal buffer in sendget().");
2130     }
2131     if (dst_kind == 1)
2132     {
2133       memset(pad_str, ' ', pad_num);
2134     }
2135     else /* dst_kind == 4. */
2136     {
2137       for (int32_t *it = (int32_t *) pad_str,
2138            *itEnd = ((int32_t *) pad_str) + pad_num; it < itEnd; ++it)
2139       {
2140         *it = (int32_t) ' ';
2141       }
2142     }
2143   }
2144 
2145   if (src_contiguous && src_vector == NULL)
2146   {
2147     if (src_same_image)
2148     {
2149       dprint("in caf_this == image_index, size = %zd, "
2150              "dst_kind = %d, src_kind = %d, dst_size = %zd, src_size = %zd\n",
2151              size, dst_kind, src_kind, dst_size, src_size);
2152       src_t_buff = src->base_addr;
2153       if (same_type_and_kind && !dest_char_array_is_longer)
2154       {
2155         dst_t_buff = src_t_buff;
2156       }
2157       else
2158       {
2159         dprint("allocating %zd bytes for dst_t_buff.\n", dst_size * size);
2160         if ((free_dst_t_buff = ((dst_t_buff = alloca(dst_size * size)) == NULL)))
2161         {
2162           dst_t_buff = malloc(dst_size * size);
2163           if (dst_t_buff == NULL)
2164             caf_runtime_error("Unable to allocate memory "
2165                               "for internal buffer in sendget().");
2166         }
2167         if (dst_type == BT_CHARACTER)
2168         {
2169           /* The size is encoded in the descriptor's type for char arrays. */
2170           copy_char_to_self(src->base_addr, src_type, src_size, src_kind,
2171                             dst_t_buff, dst_type, dst_size, dst_kind,
2172                             size, src_rank == 0);
2173         }
2174         else
2175         {
2176           convert_with_strides(dst_t_buff, dst_type, dst_kind, dst_size,
2177                                src->base_addr, src_type, src_kind,
2178                                (src_rank > 0) ? src_size : 0, size, stat);
2179         }
2180       }
2181     }
2182     else
2183     {
2184       /* When replication is needed, only access the scalar on the remote. */
2185       const size_t src_real_size = src_rank > 0 ?
2186         (src_size * size) : src_size;
2187       if ((free_dst_t_buff = ((dst_t_buff = alloca(dst_size * size)) == NULL)))
2188       {
2189         dst_t_buff = malloc(dst_size * size);
2190         if (dst_t_buff == NULL)
2191           caf_runtime_error("Unable to allocate memory "
2192                             "for internal buffer in sendget().");
2193       }
2194 
2195       if (dst_kind != src_kind || src_rank == 0 || dest_char_array_is_longer)
2196       {
2197         if ((free_src_t_buff = ((src_t_buff = alloca(src_size * size)) == NULL)))
2198         {
2199           src_t_buff = malloc(src_size * size);
2200           if (src_t_buff == NULL)
2201             caf_runtime_error("Unable to allocate memory "
2202                               "for internal buffer in sendget().");
2203         }
2204       }
2205       else
2206         src_t_buff = dst_t_buff;
2207 
2208       CAF_Win_lock(MPI_LOCK_SHARED, src_remote_image, *p);
2209       if ((same_type_and_kind && dst_rank == src_rank)
2210           || dst_type == BT_CHARACTER)
2211       {
2212         if (!dest_char_array_is_longer
2213             && (dst_kind == src_kind || dst_type != BT_CHARACTER))
2214         {
2215           const size_t trans_size =
2216             ((dst_size > src_size) ? src_size : dst_size) * size;
2217           ierr = MPI_Get(dst_t_buff, trans_size, MPI_BYTE, src_remote_image,
2218                          offset_g, trans_size, MPI_BYTE, *p); chk_err(ierr);
2219         }
2220         else
2221         {
2222           ierr = MPI_Get(src_t_buff, src_real_size, MPI_BYTE, src_remote_image,
2223                          offset_g, src_real_size, MPI_BYTE, *p); chk_err(ierr);
2224           dprint("copy_char_to_self(src_size = %zd, src_kind = %d, "
2225                  "dst_size = %zd, dst_kind = %d, size = %zd)\n",
2226                  src_size, src_kind, dst_size, dst_kind, size);
2227           copy_char_to_self(src_t_buff, src_type, src_size, src_kind,
2228                             dst_t_buff, dst_type, dst_size, dst_kind,
2229                             size, src_rank == 0);
2230           dprint("|%s|\n", (char *)dst_t_buff);
2231         }
2232       }
2233       else
2234       {
2235         ierr = MPI_Get(src_t_buff, src_real_size, MPI_BYTE, src_remote_image,
2236                        offset_g, src_real_size, MPI_BYTE, *p); chk_err(ierr);
2237         convert_with_strides(dst_t_buff, dst_type, dst_kind, dst_size,
2238                              src_t_buff, src_type, src_kind,
2239                              (src_rank > 0) ? src_size: 0, size, stat);
2240       }
2241       CAF_Win_unlock(src_remote_image, *p);
2242     }
2243   }
2244 #ifdef STRIDED
2245   else if (!src_same_image && same_type_and_kind && dst_type != BT_CHARACTER)
2246   {
2247     /* For strided copy, no type and kind conversion, copy to self or
2248      * character arrays are supported. */
2249     MPI_Datatype dt_s, dt_d, base_type_src, base_type_dst;
2250     int *arr_bl;
2251     int *arr_dsp_s;
2252 
2253     if ((free_dst_t_buff = ((dst_t_buff = alloca(dst_size * size)) == NULL)))
2254     {
2255       dst_t_buff = malloc(dst_size * size);
2256       if (dst_t_buff == NULL)
2257         caf_runtime_error("Unable to allocate memory "
2258                           "for internal buffer in sendget().");
2259     }
2260 
2261     selectType(src_size, &base_type_src);
2262     selectType(dst_size, &base_type_dst);
2263 
2264     if (src_rank == 1)
2265     {
2266       if (src_vector == NULL)
2267       {
2268         ierr = MPI_Type_vector(size, 1, src->dim[0]._stride, base_type_src,
2269                                &dt_s); chk_err(ierr);
2270       }
2271       else
2272       {
2273         arr_bl = calloc(size, sizeof(int));
2274         arr_dsp_s = calloc(size, sizeof(int));
2275 
2276         dprint("Setting up strided vector index.\n");
2277 #define KINDCASE(kind, type)                                            \
2278 case kind:                                                              \
2279   for (i = 0; i < size; ++i)                                            \
2280   {                                                                     \
2281     arr_dsp_s[i] = ((ptrdiff_t)                                         \
2282       ((type *) src_vector->u.v.vector)[i] - src->dim[0].lower_bound);  \
2283     arr_bl[i] = 1;                                                      \
2284   }                                                                     \
2285   break
2286 
2287         switch (src_vector->u.v.kind)
2288         {
2289           KINDCASE(1, int8_t);
2290           KINDCASE(2, int16_t);
2291           KINDCASE(4, int32_t);
2292           KINDCASE(8, int64_t);
2293 #ifdef HAVE_GFC_INTEGER_16
2294           KINDCASE(16, __int128);
2295 #endif
2296         default:
2297           caf_runtime_error(unreachable);
2298           return;
2299         }
2300 #undef KINDCASE
2301         ierr = MPI_Type_indexed(size, arr_bl, arr_dsp_s, base_type_src, &dt_s);
2302         chk_err(ierr);
2303         free(arr_bl);
2304         free(arr_dsp_s);
2305       }
2306       ierr = MPI_Type_vector(size, 1, 1, base_type_dst, &dt_d); chk_err(ierr);
2307     }
2308     else
2309     {
2310       arr_bl = calloc(size, sizeof(int));
2311       arr_dsp_s = calloc(size, sizeof(int));
2312 
2313       for (i = 0; i < size; ++i)
2314       {
2315         arr_bl[i] = 1;
2316       }
2317 
2318       for (i = 0; i < size; ++i)
2319       {
2320         ptrdiff_t array_offset_sr = 0, extent = 1, tot_ext = 1;
2321         if (src_vector == NULL)
2322         {
2323           for (j = 0; j < src_rank - 1; ++j)
2324           {
2325             extent = src->dim[j]._ubound - src->dim[j].lower_bound + 1;
2326             array_offset_sr += ((i / tot_ext) % extent) * src->dim[j]._stride;
2327             tot_ext *= extent;
2328           }
2329 
2330           array_offset_sr += (i / tot_ext) * src->dim[src_rank - 1]._stride;
2331         }
2332         else
2333         {
2334 #define KINDCASE(kind, type)                                          \
2335 case kind:                                                            \
2336   array_offset_sr = ((ptrdiff_t)                                      \
2337     ((type *) src_vector->u.v.vector)[i] - src->dim[0].lower_bound);  \
2338   break
2339           switch (src_vector->u.v.kind)
2340           {
2341             KINDCASE(1, int8_t);
2342             KINDCASE(2, int16_t);
2343             KINDCASE(4, int32_t);
2344             KINDCASE(8, int64_t);
2345 #ifdef HAVE_GFC_INTEGER_16
2346             KINDCASE(16, __int128);
2347 #endif
2348             default:
2349               caf_runtime_error(unreachable);
2350               return;
2351           }
2352 #undef KINDCASE
2353         }
2354         arr_dsp_s[i] = array_offset_sr;
2355       }
2356 
2357       ierr = MPI_Type_indexed(size, arr_bl, arr_dsp_s, base_type_src, &dt_s);
2358       chk_err(ierr);
2359       ierr = MPI_Type_vector(size, 1, 1, base_type_dst, &dt_d); chk_err(ierr);
2360 
2361       free(arr_bl);
2362       free(arr_dsp_s);
2363     }
2364 
2365     ierr = MPI_Type_commit(&dt_s); chk_err(ierr);
2366     ierr = MPI_Type_commit(&dt_d); chk_err(ierr);
2367 
2368     CAF_Win_lock(MPI_LOCK_SHARED, src_remote_image, *p);
2369     ierr = MPI_Get(dst_t_buff, 1, dt_d, src_remote_image, offset_g, 1,
2370                    dt_s, *p); chk_err(ierr);
2371     CAF_Win_unlock(src_remote_image, *p);
2372 
2373 #ifdef WITH_FAILED_IMAGES
2374     check_image_health(image_index_g, stat);
2375 
2376     if (!stat && ierr == STAT_FAILED_IMAGE)
2377       terminate_internal(ierr, 1);
2378 
2379     if (stat)
2380       *stat = ierr;
2381 #else
2382     if (ierr != 0)
2383     {
2384       terminate_internal(ierr, 1);
2385       return;
2386     }
2387 #endif
2388     ierr = MPI_Type_free(&dt_s); chk_err(ierr);
2389     ierr = MPI_Type_free(&dt_d); chk_err(ierr);
2390   }
2391 #endif // STRIDED
2392   else
2393   {
2394     if ((free_dst_t_buff = ((dst_t_buff = alloca(dst_size * size)) == NULL)))
2395     {
2396       dst_t_buff = malloc(dst_size * size);
2397       if (dst_t_buff == NULL)
2398         caf_runtime_error("Unable to allocate memory "
2399                           "for internal buffer in sendget().");
2400     }
2401 
2402     if (src_same_image)
2403       src_t_buff = src->base_addr;
2404     else if (!same_type_and_kind)
2405     {
2406       if ((free_src_t_buff = (((src_t_buff = alloca(src_size))) == NULL)))
2407       {
2408         src_t_buff = malloc(src_size);
2409         if (src_t_buff == NULL)
2410           caf_runtime_error("Unable to allocate memory "
2411                             "for internal buffer in sendget().");
2412       }
2413     }
2414 
2415     if (!src_same_image)
2416       CAF_Win_lock(MPI_LOCK_SHARED, src_remote_image, *p);
2417     for (i = 0; i < size; ++i)
2418     {
2419       ptrdiff_t array_offset_sr = 0, extent = 1, tot_ext = 1;
2420       if (src_vector == NULL)
2421       {
2422         for (j = 0; j < src_rank - 1; ++j)
2423         {
2424           extent = src->dim[j]._ubound - src->dim[j].lower_bound + 1;
2425           array_offset_sr += ((i / tot_ext) % extent) * src->dim[j]._stride;
2426           tot_ext *= extent;
2427         }
2428 
2429         array_offset_sr += (i / tot_ext) * src->dim[src_rank - 1]._stride;
2430       }
2431       else
2432       {
2433 #define KINDCASE(kind, type)                                        \
2434 case kind:                                                          \
2435   array_offset_sr = ((ptrdiff_t)                                    \
2436     ((type *)src_vector->u.v.vector)[i] - src->dim[0].lower_bound); \
2437   break
2438         switch (src_vector->u.v.kind)
2439         {
2440           KINDCASE(1, int8_t);
2441           KINDCASE(2, int16_t);
2442           KINDCASE(4, int32_t);
2443           KINDCASE(8, int64_t);
2444 #ifdef HAVE_GFC_INTEGER_16
2445           KINDCASE(16, __int128);
2446 #endif
2447           default:
2448             caf_runtime_error(unreachable);
2449             return;
2450         }
2451       }
2452 #undef KINDCASE
2453 
2454       src_offset = array_offset_sr * src_size;
2455       void *dst = (void *)((char *) dst_t_buff + i * dst_size);
2456 
2457       if (!src_same_image)
2458       {
2459         // Do the more likely first.
2460         dprint("kind(dst) = %d, el_sz(dst) = %zd, "
2461                "kind(src) = %d, el_sz(src) = %zd, lb(dst) = %zd.\n",
2462                dst_kind, dst_size, src_kind, src_size, src->dim[0].lower_bound);
2463         if (same_type_and_kind)
2464         {
2465           const size_t trans_size = (src_size < dst_size) ? src_size : dst_size;
2466           ierr = MPI_Get(dst, trans_size, MPI_BYTE, src_remote_image,
2467                          offset_g + src_offset, trans_size, MPI_BYTE, *p);
2468           chk_err(ierr);
2469           if (pad_str)
2470             memcpy((void *)((char *)dst + src_size), pad_str,
2471                    dst_size - src_size);
2472         }
2473         else if (dst_type == BT_CHARACTER)
2474         {
2475           ierr = MPI_Get(src_t_buff, src_size, MPI_BYTE, src_remote_image,
2476                          offset_g + src_offset, src_size, MPI_BYTE, *p);
2477           chk_err(ierr);
2478           copy_char_to_self(src_t_buff, src_type, src_size, src_kind,
2479                             dst, dst_type, dst_size, dst_kind, 1, true);
2480         }
2481         else
2482         {
2483           ierr = MPI_Get(src_t_buff, src_size, MPI_BYTE, src_remote_image,
2484                          offset_g + src_offset, src_size, MPI_BYTE, *p);
2485           chk_err(ierr);
2486           convert_type(dst, dst_type, dst_kind, src_t_buff, src_type,
2487                        src_kind, stat);
2488         }
2489       }
2490       else
2491       {
2492         if (!mrt)
2493         {
2494           dprint("strided same_image, no temp,  for i = %zd, "
2495                  "src_offset = %zd, offset = %zd.\n",
2496                  i, src_offset, offset_g);
2497           if (same_type_and_kind)
2498             memmove(dst, src->base_addr + src_offset, src_size);
2499           else
2500             convert_type(dst, dst_type, dst_kind,
2501                          src->base_addr + src_offset, src_type, src_kind, stat);
2502         }
2503         else
2504         {
2505           dprint("strided same_image, *WITH* temp, for i = %zd.\n", i);
2506           if (same_type_and_kind)
2507             memmove(dst, src->base_addr + src_offset, src_size);
2508           else
2509             convert_type(dst, dst_type, dst_kind,
2510                          src->base_addr + src_offset, src_type, src_kind, stat);
2511         }
2512       }
2513 
2514 #ifndef WITH_FAILED_IMAGES
2515       if (ierr != 0)
2516       {
2517         caf_runtime_error("MPI Error: %d", ierr);
2518         return;
2519       }
2520 #endif
2521     }
2522     if (!src_same_image)
2523       CAF_Win_unlock(src_remote_image, *p);
2524   }
2525 
2526   p = TOKEN(token_s);
2527   /* Now transfer data to the remote dest. */
2528   if (dst_contiguous && dst_vector == NULL)
2529   {
2530     if (dst_same_image)
2531       memmove(dest->base_addr, dst_t_buff, dst_size * size);
2532     else
2533     {
2534       CAF_Win_lock(MPI_LOCK_EXCLUSIVE, dst_remote_image, *p);
2535       const size_t trans_size = size * dst_size;
2536       ierr = MPI_Put(dst_t_buff, trans_size, MPI_BYTE, dst_remote_image,
2537                      offset_s, trans_size, MPI_BYTE, *p); chk_err(ierr);
2538 #ifdef CAF_MPI_LOCK_UNLOCK
2539       MPI_Win_unlock(dst_remote_image, *p);
2540 #elif NONBLOCKING_PUT
2541       /* Pending puts init */
2542       if (pending_puts == NULL)
2543       {
2544         pending_puts = calloc(1,sizeof(win_sync));
2545         pending_puts->next = NULL;
2546         pending_puts->win = token_s;
2547         pending_puts->img = dst_remote_image;
2548         last_elem = pending_puts;
2549         last_elem->next = NULL;
2550       }
2551       else
2552       {
2553         last_elem->next = calloc(1,sizeof(win_sync));
2554         last_elem = last_elem->next;
2555         last_elem->win = token_s;
2556         last_elem->img = dst_remote_image;
2557         last_elem->next = NULL;
2558       }
2559 #else
2560       ierr = MPI_Win_flush(dst_remote_image, *p); chk_err(ierr);
2561 #endif // CAF_MPI_LOCK_UNLOCK
2562     }
2563   }
2564 #ifdef STRIDED
2565   else if (!dst_same_image && same_type_and_kind && dst_type != BT_CHARACTER)
2566   {
2567     /* For strided copy, no type and kind conversion, copy to self or
2568      * character arrays are supported. */
2569     MPI_Datatype dt_s, dt_d, base_type_dst;
2570     int *arr_bl, *arr_dsp_d;
2571 
2572     selectType(dst_size, &base_type_dst);
2573 
2574     if (dst_rank == 1)
2575     {
2576       if (dst_vector == NULL)
2577       {
2578         ierr = MPI_Type_vector(size, 1, dest->dim[0]._stride, base_type_dst,
2579                                &dt_d); chk_err(ierr);
2580       }
2581       else
2582       {
2583         arr_bl = calloc(size, sizeof(int));
2584         arr_dsp_d = calloc(size, sizeof(int));
2585 
2586         dprint("Setting up strided vector index.\n");
2587 #define KINDCASE(kind, type)                                            \
2588 case kind:                                                              \
2589   for (i = 0; i < size; ++i)                                            \
2590   {                                                                     \
2591     arr_dsp_d[i] = ((ptrdiff_t)                                         \
2592       ((type *) dst_vector->u.v.vector)[i] - dest->dim[0].lower_bound); \
2593     arr_bl[i] = 1;                                                      \
2594   }                                                                     \
2595   break
2596         switch (dst_vector->u.v.kind)
2597         {
2598           KINDCASE(1, int8_t);
2599           KINDCASE(2, int16_t);
2600           KINDCASE(4, int32_t);
2601           KINDCASE(8, int64_t);
2602 #ifdef HAVE_GFC_INTEGER_16
2603           KINDCASE(16, __int128);
2604 #endif
2605         default:
2606           caf_runtime_error(unreachable);
2607           return;
2608         }
2609 #undef KINDCASE
2610         ierr = MPI_Type_indexed(size, arr_bl, arr_dsp_d, base_type_dst, &dt_d);
2611         chk_err(ierr);
2612         free(arr_bl);
2613         free(arr_dsp_d);
2614       }
2615       ierr = MPI_Type_vector(size, 1, 1, base_type_dst, &dt_s); chk_err(ierr);
2616     }
2617     else
2618     {
2619       arr_bl = calloc(size, sizeof(int));
2620       arr_dsp_d = calloc(size, sizeof(int));
2621 
2622       for (i = 0; i < size; ++i)
2623       {
2624         arr_bl[i] = 1;
2625       }
2626 
2627       for (i = 0; i < size; ++i)
2628       {
2629         ptrdiff_t array_offset_dst = 0, extent = 1, tot_ext = 1;
2630         if (dst_vector == NULL)
2631         {
2632           for (j = 0; j < dst_rank - 1; ++j)
2633           {
2634             extent = dest->dim[j]._ubound - dest->dim[j].lower_bound + 1;
2635             array_offset_dst += ((i / tot_ext) % extent) * dest->dim[j]._stride;
2636             tot_ext *= extent;
2637           }
2638 
2639           array_offset_dst += (i / tot_ext) * dest->dim[dst_rank - 1]._stride;
2640         }
2641         else
2642         {
2643 #define KINDCASE(kind, type)                                            \
2644 case kind:                                                              \
2645   array_offset_dst = ((ptrdiff_t)                                       \
2646     ((type *) dst_vector->u.v.vector)[i] - dest->dim[0].lower_bound);   \
2647   break
2648           switch (dst_vector->u.v.kind)
2649           {
2650             KINDCASE(1, int8_t);
2651             KINDCASE(2, int16_t);
2652             KINDCASE(4, int32_t);
2653             KINDCASE(8, int64_t);
2654 #ifdef HAVE_GFC_INTEGER_16
2655             KINDCASE(16, __int128);
2656 #endif
2657             default:
2658               caf_runtime_error(unreachable);
2659               return;
2660           }
2661 #undef KINDCASE
2662         }
2663         arr_dsp_d[i] = array_offset_dst;
2664       }
2665 
2666       ierr = MPI_Type_vector(size, 1, 1, base_type_dst, &dt_s); chk_err(ierr);
2667       ierr = MPI_Type_indexed(size, arr_bl, arr_dsp_d, base_type_dst, &dt_d);
2668       chk_err(ierr);
2669 
2670       free(arr_bl);
2671       free(arr_dsp_d);
2672     }
2673 
2674     ierr = MPI_Type_commit(&dt_s); chk_err(ierr);
2675     ierr = MPI_Type_commit(&dt_d); chk_err(ierr);
2676 
2677     CAF_Win_lock(MPI_LOCK_EXCLUSIVE, dst_remote_image, *p);
2678     ierr = MPI_Put(dst_t_buff, 1, dt_s, dst_remote_image, offset_s, 1,
2679                    dt_d, *p); chk_err(ierr);
2680     CAF_Win_unlock(dst_remote_image, *p);
2681 
2682 #ifdef WITH_FAILED_IMAGES
2683     check_image_health(image_index_s, stat);
2684 
2685     if (!stat && ierr == STAT_FAILED_IMAGE)
2686       terminate_internal(ierr, 1);
2687 
2688     if (stat)
2689       *stat = ierr;
2690 #else
2691     if (ierr != 0)
2692     {
2693       terminate_internal(ierr, 1);
2694       return;
2695     }
2696 #endif
2697     ierr = MPI_Type_free(&dt_s); chk_err(ierr);
2698     ierr = MPI_Type_free(&dt_d); chk_err(ierr);
2699   }
2700 #endif // STRIDED
2701   else
2702   {
2703     if (!dst_same_image)
2704       CAF_Win_lock(MPI_LOCK_EXCLUSIVE, dst_remote_image, *p);
2705     for (i = 0; i < size; ++i)
2706     {
2707       ptrdiff_t array_offset_dst = 0, extent = 1, tot_ext = 1;
2708       if (dst_vector == NULL)
2709       {
2710         for (j = 0; j < dst_rank - 1; ++j)
2711         {
2712           extent = dest->dim[j]._ubound - dest->dim[j].lower_bound + 1;
2713           array_offset_dst += ((i / tot_ext) % extent) * dest->dim[j]._stride;
2714           tot_ext *= extent;
2715         }
2716 
2717         array_offset_dst += (i / tot_ext) * dest->dim[dst_rank - 1]._stride;
2718       }
2719       else
2720       {
2721 #define KINDCASE(kind, type)                                         \
2722 case kind:                                                           \
2723   array_offset_dst = ((ptrdiff_t)                                    \
2724     ((type *)dst_vector->u.v.vector)[i] - dest->dim[0].lower_bound); \
2725   break
2726         switch (dst_vector->u.v.kind)
2727         {
2728           KINDCASE(1, int8_t);
2729           KINDCASE(2, int16_t);
2730           KINDCASE(4, int32_t);
2731           KINDCASE(8, int64_t);
2732 #ifdef HAVE_GFC_INTEGER_16
2733           KINDCASE(16, __int128);
2734 #endif
2735           default:
2736             caf_runtime_error(unreachable);
2737             return;
2738         }
2739       }
2740 #undef KINDCASE
2741       dst_offset = array_offset_dst * dst_size;
2742 
2743       void *sr = (void *)((char *)dst_t_buff + i * dst_size);
2744 
2745       if (!dst_same_image)
2746       {
2747         // Do the more likely first.
2748         ierr = MPI_Put(sr, dst_size, MPI_BYTE, dst_remote_image,
2749                        offset_s + dst_offset, dst_size, MPI_BYTE, *p);
2750          chk_err(ierr);
2751       }
2752       else
2753         memmove(dest->base_addr + dst_offset, sr, dst_size);
2754 
2755 #ifndef WITH_FAILED_IMAGES
2756       if (ierr != 0)
2757       {
2758         caf_runtime_error("MPI Error: %d", ierr);
2759         return;
2760       }
2761 #endif
2762     } /* for */
2763     if (!dst_same_image)
2764       CAF_Win_unlock(dst_remote_image, *p);
2765   }
2766 
2767   /* Free memory, when not allocated on stack. */
2768   if (free_src_t_buff)
2769     free(src_t_buff);
2770   if (free_dst_t_buff)
2771     free(dst_t_buff);
2772   if (free_pad_str)
2773     free(pad_str);
2774 
2775 #ifdef WITH_FAILED_IMAGES
2776   /* Catch failed images, when failed image support is active. */
2777   check_image_health(image_index_g , stat);
2778   check_image_health(image_index_s , stat);
2779 #endif
2780 
2781   if (ierr != MPI_SUCCESS)
2782   {
2783     int mpi_error;
2784     MPI_Error_class(ierr, &mpi_error);
2785     if (stat)
2786       *stat = mpi_error;
2787     else
2788     {
2789       int error_len = 2048;
2790       char error_str[error_len];
2791       strcpy(error_str, "MPI-error: ");
2792       MPI_Error_string(mpi_error, &error_str[11], &error_len);
2793       error_stop_str(error_str, error_len + 11, false);
2794     }
2795   }
2796 }
2797 
2798 
2799 /* Send array data from src to dest on a remote image.
2800  * The argument mrt means may_require_temporary */
2801 
2802 void
PREFIX(send)2803 PREFIX(send) (caf_token_t token, size_t offset, int image_index,
2804               gfc_descriptor_t *dest, caf_vector_t *dst_vector,
2805               gfc_descriptor_t *src, int dst_kind, int src_kind,
2806               bool mrt, int *pstat)
2807 {
2808   int j, ierr = 0;
2809   size_t i, size;
2810   ptrdiff_t dimextent;
2811   const int
2812     src_rank = GFC_DESCRIPTOR_RANK(src),
2813     dst_rank = GFC_DESCRIPTOR_RANK(dest);
2814   const size_t
2815     src_size = GFC_DESCRIPTOR_SIZE(src),
2816     dst_size = GFC_DESCRIPTOR_SIZE(dest);
2817   const int
2818     src_type = GFC_DESCRIPTOR_TYPE(src),
2819     dst_type = GFC_DESCRIPTOR_TYPE(dest);
2820   const bool
2821     src_contiguous = PREFIX(is_contiguous) (src),
2822     dst_contiguous = PREFIX(is_contiguous) (dest);
2823   const bool
2824     same_image = caf_this_image == image_index,
2825     same_type_and_kind = dst_type == src_type && dst_kind == src_kind;
2826 
2827   MPI_Win *p = TOKEN(token);
2828   ptrdiff_t dst_offset = 0;
2829   void *pad_str = NULL, *t_buff = NULL;
2830   bool free_pad_str = false, free_t_buff = false;
2831   const bool dest_char_array_is_longer
2832       = dst_type == BT_CHARACTER && dst_size > src_size && !same_image;
2833   int remote_image = image_index - 1;
2834   if (!same_image)
2835   {
2836     MPI_Group current_team_group, win_group;
2837     ierr = MPI_Comm_group(CAF_COMM_WORLD, &current_team_group); chk_err(ierr);
2838     ierr = MPI_Win_get_group(*p, &win_group); chk_err(ierr);
2839     ierr = MPI_Group_translate_ranks(current_team_group, 1,
2840                                      (int[]){remote_image}, win_group,
2841                                      &remote_image); chk_err(ierr);
2842     ierr = MPI_Group_free(&current_team_group); chk_err(ierr);
2843     ierr = MPI_Group_free(&win_group); chk_err(ierr);
2844   }
2845 
2846   /* Ensure stat is always set. */
2847 #ifdef GCC_GE_7
2848   int * stat = pstat;
2849   if (stat)
2850     *stat = 0;
2851 #else
2852   /* Gcc prior to 7.0 does not have stat here. */
2853   int * stat = NULL;
2854 #endif
2855 
2856   size = 1;
2857   for (j = 0; j < dst_rank; ++j)
2858   {
2859     dimextent = dest->dim[j]._ubound - dest->dim[j].lower_bound + 1;
2860     if (dimextent < 0)
2861       dimextent = 0;
2862     size *= dimextent;
2863   }
2864 
2865   if (size == 0)
2866     return;
2867 
2868   dprint("dst_vector = %p, image_index = %d, offset = %zd.\n",
2869          dst_vector, image_index, offset);
2870   check_image_health(image_index, stat);
2871 
2872   /* For char arrays: create the padding array, when dst is longer than src. */
2873   if (dest_char_array_is_longer)
2874   {
2875     const size_t pad_num = (dst_size / dst_kind) - (src_size / src_kind);
2876     const size_t pad_sz = pad_num * dst_kind;
2877     /* For big arrays alloca() may not be able to get the memory on the stack.
2878      * Use a regular malloc then. */
2879     if ((free_pad_str = ((pad_str = alloca(pad_sz)) == NULL)))
2880     {
2881       pad_str = malloc(pad_sz);
2882       if (t_buff == NULL)
2883         caf_runtime_error("Unable to allocate memory "
2884                           "for internal buffer in send().");
2885     }
2886     if (dst_kind == 1)
2887       memset(pad_str, ' ', pad_num);
2888     else /* dst_kind == 4. */
2889     {
2890       for (int32_t *it = (int32_t *) pad_str,
2891            *itEnd = ((int32_t *) pad_str) + pad_num; it < itEnd; ++it)
2892       {
2893         *it = (int32_t) ' ';
2894       }
2895     }
2896   }
2897 
2898   if (src_contiguous && dst_contiguous && dst_vector == NULL)
2899   {
2900     if (same_image)
2901     {
2902       dprint("in caf_this == image_index, size = %zd, dst_kind = %d, "
2903              "src_kind = %d\n", size, dst_kind, src_kind);
2904       if (dst_type == BT_CHARACTER)
2905         /* The size is encoded in the descriptor's type for char arrays. */
2906         copy_char_to_self(src->base_addr, src_type, src_size, src_kind,
2907                           dest->base_addr, dst_type, dst_size, dst_kind,
2908                           size, src_rank == 0);
2909       else
2910         copy_to_self(src, src_kind, dest, dst_kind, size, stat);
2911       return;
2912     }
2913     else
2914     {
2915       CAF_Win_lock(MPI_LOCK_EXCLUSIVE, remote_image, *p);
2916       if (dst_kind != src_kind || dest_char_array_is_longer || src_rank == 0)
2917       {
2918         if ((free_t_buff = ((t_buff = alloca(dst_size * size)) == NULL)))
2919         {
2920           t_buff = malloc(dst_size * size);
2921           if (t_buff == NULL)
2922             caf_runtime_error("Unable to allocate memory "
2923                               "for internal buffer in send().");
2924         }
2925       }
2926 
2927       if ((same_type_and_kind && dst_rank == src_rank)
2928           || dst_type == BT_CHARACTER)
2929         {
2930           if (dest_char_array_is_longer
2931               || (dst_kind != src_kind && dst_type == BT_CHARACTER))
2932           {
2933             copy_char_to_self(src->base_addr, src_type, src_size,
2934                               src_kind, t_buff, dst_type, dst_size,
2935                               dst_kind, size, src_rank == 0);
2936             ierr = MPI_Put(t_buff, dst_size, MPI_BYTE, remote_image,
2937                            offset, dst_size, MPI_BYTE, *p); chk_err(ierr);
2938           }
2939           else
2940           {
2941             const size_t trans_size =
2942               ((dst_size > src_size) ? src_size : dst_size) * size;
2943             ierr = MPI_Put(src->base_addr, trans_size, MPI_BYTE, remote_image,
2944                            offset, trans_size, MPI_BYTE, *p); chk_err(ierr);
2945           }
2946         }
2947       else
2948       {
2949         convert_with_strides(t_buff, dst_type, dst_kind, dst_size,
2950                              src->base_addr, src_type, src_kind,
2951                              (src_rank > 0) ? src_size: 0, size, stat);
2952         ierr = MPI_Put(t_buff, dst_size * size, MPI_BYTE, remote_image,
2953                        offset, dst_size * size, MPI_BYTE, *p); chk_err(ierr);
2954       }
2955 #ifdef CAF_MPI_LOCK_UNLOCK
2956       ierr = MPI_Win_unlock(remote_image, *p); chk_err(ierr);
2957 #elif NONBLOCKING_PUT
2958       /* Pending puts init */
2959       if (pending_puts == NULL)
2960       {
2961         pending_puts = calloc(1,sizeof(win_sync));
2962         pending_puts->next = NULL;
2963         pending_puts->win = token;
2964         pending_puts->img = remote_image;
2965         last_elem = pending_puts;
2966         last_elem->next = NULL;
2967       }
2968       else
2969       {
2970         last_elem->next = calloc(1,sizeof(win_sync));
2971         last_elem = last_elem->next;
2972         last_elem->win = token;
2973         last_elem->img = remote_image;
2974         last_elem->next = NULL;
2975       }
2976 #else
2977       ierr = MPI_Win_flush(remote_image, *p); chk_err(ierr);
2978 #endif // CAF_MPI_LOCK_UNLOCK
2979     }
2980   }
2981 
2982 #ifdef STRIDED
2983   else if (!same_image && same_type_and_kind && dst_type != BT_CHARACTER)
2984   {
2985     /* For strided copy, no type and kind conversion, copy to self or
2986      * character arrays are supported. */
2987     MPI_Datatype dt_s, dt_d, base_type_src, base_type_dst;
2988     int *arr_bl, *arr_dsp_s, *arr_dsp_d;
2989 
2990     selectType(src_size, &base_type_src);
2991     selectType(dst_size, &base_type_dst);
2992 
2993     if (dst_rank == 1)
2994     {
2995       if (dst_vector == NULL)
2996       {
2997         dprint("Setting up mpi datatype vector with "
2998                "stride %d, size %d and offset %d.\n",
2999                dest->dim[0]._stride, size, offset);
3000         ierr = MPI_Type_vector(size, 1, dest->dim[0]._stride, base_type_dst,
3001                                &dt_d); chk_err(ierr);
3002       }
3003       else
3004       {
3005         arr_bl = calloc(size, sizeof(int));
3006         arr_dsp_d = calloc(size, sizeof(int));
3007 
3008         dprint("Setting up strided vector index.\n");
3009 #define KINDCASE(kind, type)                                            \
3010 case kind:                                                              \
3011   for (i = 0; i < size; ++i)                                            \
3012   {                                                                     \
3013     arr_dsp_d[i] = ((ptrdiff_t)                                         \
3014       ((type *) dst_vector->u.v.vector)[i] - dest->dim[0].lower_bound); \
3015     arr_bl[i] = 1;                                                      \
3016   }                                                                     \
3017   break
3018         switch (dst_vector->u.v.kind)
3019         {
3020           KINDCASE(1, int8_t);
3021           KINDCASE(2, int16_t);
3022           KINDCASE(4, int32_t);
3023           KINDCASE(8, int64_t);
3024 #ifdef HAVE_GFC_INTEGER_16
3025           KINDCASE(16, __int128);
3026 #endif
3027         default:
3028           caf_runtime_error(unreachable);
3029           return;
3030         }
3031 #undef KINDCASE
3032         ierr = MPI_Type_indexed(size, arr_bl, arr_dsp_d, base_type_dst, &dt_d);
3033         chk_err(ierr);
3034         free(arr_bl);
3035         free(arr_dsp_d);
3036       }
3037       MPI_Type_vector(size, 1, src->dim[0]._stride, base_type_src, &dt_s);
3038     }
3039     else
3040     {
3041       arr_bl = calloc(size, sizeof(int));
3042       arr_dsp_s = calloc(size, sizeof(int));
3043       arr_dsp_d = calloc(size, sizeof(int));
3044 
3045       for (i = 0; i < size; ++i)
3046       {
3047         arr_bl[i] = 1;
3048       }
3049 
3050       for (i = 0; i < size; ++i)
3051       {
3052         ptrdiff_t array_offset_dst = 0, extent = 1, tot_ext = 1;
3053         if (dst_vector == NULL)
3054         {
3055           for (j = 0; j < dst_rank - 1; ++j)
3056           {
3057             extent = dest->dim[j]._ubound - dest->dim[j].lower_bound + 1;
3058             array_offset_dst += ((i / tot_ext) % extent) * dest->dim[j]._stride;
3059             tot_ext *= extent;
3060           }
3061 
3062           array_offset_dst += (i / tot_ext) * dest->dim[dst_rank - 1]._stride;
3063         }
3064         else
3065         {
3066 #define KINDCASE(kind, type)                                          \
3067 case kind:                                                            \
3068   array_offset_dst = ((ptrdiff_t)                                     \
3069     ((type *) dst_vector->u.v.vector)[i] - dest->dim[0].lower_bound); \
3070   break
3071           switch (dst_vector->u.v.kind)
3072           {
3073             KINDCASE(1, int8_t);
3074             KINDCASE(2, int16_t);
3075             KINDCASE(4, int32_t);
3076             KINDCASE(8, int64_t);
3077 #ifdef HAVE_GFC_INTEGER_16
3078             KINDCASE(16, __int128);
3079 #endif
3080             default:
3081               caf_runtime_error(unreachable);
3082               return;
3083           }
3084 #undef KINDCASE
3085         }
3086         arr_dsp_d[i] = array_offset_dst;
3087 
3088         if (src_rank != 0)
3089         {
3090           ptrdiff_t array_offset_sr = 0;
3091           extent = 1;
3092           tot_ext = 1;
3093           for (j = 0; j < src_rank - 1; ++j)
3094           {
3095             extent = src->dim[j]._ubound - src->dim[j].lower_bound + 1;
3096             array_offset_sr += ((i / tot_ext) % extent) * src->dim[j]._stride;
3097             tot_ext *= extent;
3098           }
3099 
3100           array_offset_sr += (i / tot_ext) * src->dim[src_rank - 1]._stride;
3101           arr_dsp_s[i] = array_offset_sr;
3102         }
3103         else
3104           arr_dsp_s[i] = 0;
3105       }
3106 
3107       ierr = MPI_Type_indexed(size, arr_bl, arr_dsp_s, base_type_src, &dt_s);
3108       chk_err(ierr);
3109       ierr = MPI_Type_indexed(size, arr_bl, arr_dsp_d, base_type_dst, &dt_d);
3110       chk_err(ierr);
3111 
3112       free(arr_bl);
3113       free(arr_dsp_s);
3114       free(arr_dsp_d);
3115     }
3116 
3117     ierr = MPI_Type_commit(&dt_s); chk_err(ierr);
3118     ierr = MPI_Type_commit(&dt_d); chk_err(ierr);
3119 
3120     CAF_Win_lock(MPI_LOCK_EXCLUSIVE, remote_image, *p);
3121     ierr = MPI_Put(src->base_addr, 1, dt_s, remote_image, offset, 1, dt_d, *p);
3122     chk_err(ierr);
3123     CAF_Win_unlock(remote_image, *p);
3124 
3125 #ifdef WITH_FAILED_IMAGES
3126     check_image_health(image_index, stat);
3127 
3128     if (!stat && ierr == STAT_FAILED_IMAGE)
3129       terminate_internal(ierr, 1);
3130 
3131     if (stat)
3132       *stat = ierr;
3133 #else
3134     if (ierr != 0)
3135     {
3136       terminate_internal(ierr, 1);
3137       return;
3138     }
3139 #endif
3140     ierr = MPI_Type_free(&dt_s); chk_err(ierr);
3141     ierr = MPI_Type_free(&dt_d); chk_err(ierr);
3142   }
3143 #endif // STRIDED
3144   else
3145   {
3146     if (same_image && mrt)
3147     {
3148       if ((free_t_buff = (((t_buff = alloca(dst_size * size))) == NULL)))
3149       {
3150         t_buff = malloc(dst_size * size);
3151         if (t_buff == NULL)
3152           caf_runtime_error("Unable to allocate memory "
3153                             "for internal buffer in send().");
3154       }
3155     }
3156     else if (!same_type_and_kind && !same_image)
3157     {
3158       if ((free_t_buff = (((t_buff = alloca(dst_size))) == NULL)))
3159       {
3160         t_buff = malloc(dst_size);
3161         if (t_buff == NULL)
3162           caf_runtime_error("Unable to allocate memory "
3163                             "for internal buffer in send().");
3164       }
3165     }
3166 
3167     if (!same_image)
3168       CAF_Win_lock(MPI_LOCK_EXCLUSIVE, remote_image, *p);
3169     for (i = 0; i < size; ++i)
3170     {
3171       ptrdiff_t array_offset_dst = 0, extent = 1, tot_ext = 1;
3172       if (!same_image || !mrt)
3173       {
3174         /* For same image and may require temp, the dst_offset is
3175          * computed on storage. */
3176         if (dst_vector == NULL)
3177         {
3178           for (j = 0; j < dst_rank - 1; ++j)
3179           {
3180             extent = dest->dim[j]._ubound - dest->dim[j].lower_bound + 1;
3181             array_offset_dst += ((i / tot_ext) % extent) * dest->dim[j]._stride;
3182             tot_ext *= extent;
3183           }
3184           array_offset_dst += (i / tot_ext) * dest->dim[dst_rank - 1]._stride;
3185         }
3186         else
3187         {
3188 #define KINDCASE(kind, type)                                          \
3189 case kind:                                                            \
3190   array_offset_dst = ((ptrdiff_t)                                     \
3191     ((type *)dst_vector->u.v.vector)[i] - dest->dim[0].lower_bound);  \
3192   break
3193           switch (dst_vector->u.v.kind)
3194           {
3195             KINDCASE(1, int8_t);
3196             KINDCASE(2, int16_t);
3197             KINDCASE(4, int32_t);
3198             KINDCASE(8, int64_t);
3199   #ifdef HAVE_GFC_INTEGER_16
3200             KINDCASE(16, __int128);
3201   #endif
3202             default:
3203               caf_runtime_error(unreachable);
3204               return;
3205           }
3206         }
3207         dst_offset = array_offset_dst * dst_size;
3208       }
3209 
3210       void *sr;
3211       if (src_rank != 0)
3212       {
3213         ptrdiff_t array_offset_sr = 0;
3214         extent = 1;
3215         tot_ext = 1;
3216         for (j = 0; j < src_rank - 1; ++j)
3217         {
3218           extent = src->dim[j]._ubound - src->dim[j].lower_bound + 1;
3219           array_offset_sr += ((i / tot_ext) % extent) * src->dim[j]._stride;
3220           tot_ext *= extent;
3221         }
3222 
3223         array_offset_sr += (i / tot_ext) * src->dim[dst_rank - 1]._stride;
3224         sr = (void *)((char *)src->base_addr + array_offset_sr * src_size);
3225       }
3226       else
3227         sr = src->base_addr;
3228 
3229       if (!same_image)
3230       {
3231         // Do the more likely first.
3232         dprint("kind(dst) = %d, el_sz(dst) = %zd, "
3233                "kind(src) = %d, el_sz(src) = %zd, lb(dst) = %zd.\n",
3234                dst_kind, dst_size, src_kind,
3235                src_size, dest->dim[0].lower_bound);
3236         if (same_type_and_kind)
3237         {
3238           const size_t trans_size = (src_size < dst_size) ? src_size : dst_size;
3239           ierr = MPI_Put(sr, trans_size, MPI_BYTE, remote_image,
3240                          offset + dst_offset, trans_size, MPI_BYTE, *p);
3241           chk_err(ierr);
3242           if (pad_str)
3243           {
3244             ierr = MPI_Put(pad_str, dst_size - src_size, MPI_BYTE,
3245                            remote_image, offset + dst_offset + src_size,
3246                            dst_size - src_size, MPI_BYTE, *p); chk_err(ierr);
3247           }
3248         }
3249         else if (dst_type == BT_CHARACTER)
3250         {
3251           copy_char_to_self(sr, src_type, src_size, src_kind,
3252                             t_buff, dst_type, dst_size, dst_kind, 1, true);
3253           ierr = MPI_Put(t_buff, dst_size, MPI_BYTE, remote_image,
3254                          offset + dst_offset, dst_size, MPI_BYTE, *p);
3255           chk_err(ierr);
3256         }
3257         else
3258         {
3259           convert_type(t_buff, dst_type, dst_kind,
3260                        sr, src_type, src_kind, stat);
3261           ierr = MPI_Put(t_buff, dst_size, MPI_BYTE, remote_image,
3262                          offset + dst_offset, dst_size, MPI_BYTE, *p);
3263           chk_err(ierr);
3264         }
3265       }
3266       else
3267       {
3268         if (!mrt)
3269         {
3270           dprint("strided same_image, no temp, for i = %zd, "
3271                  "dst_offset = %zd, offset = %zd.\n",
3272                  i, dst_offset, offset);
3273           if (same_type_and_kind)
3274             memmove(dest->base_addr + dst_offset, sr, src_size);
3275           else
3276             convert_type(dest->base_addr + dst_offset, dst_type,
3277                          dst_kind, sr, src_type, src_kind, stat);
3278         }
3279         else
3280         {
3281           dprint("strided same_image, *WITH* temp, for i = %zd.\n", i);
3282           if (same_type_and_kind)
3283             memmove(t_buff + i * dst_size, sr, src_size);
3284           else
3285             convert_type(t_buff + i * dst_size, dst_type, dst_kind,
3286                          sr, src_type, src_kind, stat);
3287         }
3288       }
3289 
3290 #ifndef WITH_FAILED_IMAGES
3291       if (ierr != 0)
3292       {
3293         caf_runtime_error("MPI Error: %d", ierr);
3294         return;
3295       }
3296 #endif
3297     }
3298     if (!same_image)
3299       CAF_Win_unlock(remote_image, *p);
3300 
3301     if (same_image && mrt)
3302     {
3303       for (i = 0; i < size; ++i)
3304       {
3305         ptrdiff_t array_offset_dst = 0, extent = 1, tot_ext = 1;
3306         if (dst_vector == NULL)
3307         {
3308           for (j = 0; j < dst_rank - 1; ++j)
3309           {
3310             extent = dest->dim[j]._ubound - dest->dim[j].lower_bound + 1;
3311             array_offset_dst += ((i / tot_ext) % extent) * dest->dim[j]._stride;
3312             tot_ext *= extent;
3313           }
3314 
3315           array_offset_dst += (i / tot_ext) * dest->dim[dst_rank - 1]._stride;
3316         }
3317         else
3318         {
3319           switch (dst_vector->u.v.kind)
3320           {
3321             // KINDCASE is defined above.
3322             KINDCASE(1, int8_t);
3323             KINDCASE(2, int16_t);
3324             KINDCASE(4, int32_t);
3325             KINDCASE(8, int64_t);
3326 #ifdef HAVE_GFC_INTEGER_16
3327             KINDCASE(16, __int128);
3328 #endif
3329             default:
3330               caf_runtime_error(unreachable);
3331               return;
3332           }
3333 #undef KINDCASE
3334         }
3335         dst_offset = array_offset_dst * dst_size;
3336         memmove(dest->base_addr + dst_offset, t_buff + i * dst_size, dst_size);
3337       }
3338     }
3339   }
3340 
3341   /* Free memory, when not allocated on stack. */
3342   if (free_t_buff)
3343     free(t_buff);
3344   if (free_pad_str)
3345     free(pad_str);
3346 
3347 #ifdef WITH_FAILED_IMAGES
3348   /* Catch failed images, when failed image support is active. */
3349   check_image_health(image_index , stat);
3350 #endif
3351 
3352   if (ierr != MPI_SUCCESS)
3353   {
3354     int mpi_error;
3355     MPI_Error_class(ierr, &mpi_error);
3356     if (stat)
3357       *stat = mpi_error;
3358     else
3359     {
3360       int error_len = 2048;
3361       char error_str[error_len];
3362       strcpy(error_str, "MPI-error: ");
3363       MPI_Error_string(mpi_error, &error_str[11], &error_len);
3364       error_stop_str(error_str, error_len + 11, false);
3365     }
3366   }
3367 }
3368 
3369 
3370 /* Get array data from a remote src to a local dest. */
3371 
3372 void
PREFIX(get)3373 PREFIX(get) (caf_token_t token, size_t offset, int image_index,
3374              gfc_descriptor_t *src, caf_vector_t *src_vector,
3375              gfc_descriptor_t *dest, int src_kind, int dst_kind,
3376              bool mrt, int *pstat)
3377 {
3378   int j, ierr = 0;
3379   size_t i, size;
3380   const int
3381     src_rank = GFC_DESCRIPTOR_RANK(src),
3382     dst_rank = GFC_DESCRIPTOR_RANK(dest);
3383   const size_t
3384     src_size = GFC_DESCRIPTOR_SIZE(src),
3385     dst_size = GFC_DESCRIPTOR_SIZE(dest);
3386   const int
3387     src_type = GFC_DESCRIPTOR_TYPE(src),
3388     dst_type = GFC_DESCRIPTOR_TYPE(dest);
3389   const bool
3390     src_contiguous = PREFIX(is_contiguous) (src),
3391     dst_contiguous = PREFIX(is_contiguous) (dest);
3392   const bool
3393     same_image = caf_this_image == image_index,
3394     same_type_and_kind = dst_type == src_type && dst_kind == src_kind;
3395 
3396   MPI_Win *p = TOKEN(token);
3397   ptrdiff_t dimextent, src_offset = 0;
3398   void *pad_str = NULL, *t_buff = NULL;
3399   bool free_pad_str = false, free_t_buff = false;
3400   const bool dest_char_array_is_longer
3401       = dst_type == BT_CHARACTER && dst_size > src_size && !same_image;
3402   int remote_image = image_index - 1;
3403   if (!same_image)
3404   {
3405     MPI_Group current_team_group, win_group;
3406     ierr = MPI_Comm_group(CAF_COMM_WORLD, &current_team_group); chk_err(ierr);
3407     ierr = MPI_Win_get_group(*p, &win_group); chk_err(ierr);
3408     ierr = MPI_Group_translate_ranks(current_team_group, 1,
3409                                      (int[]){remote_image}, win_group,
3410                                      &remote_image); chk_err(ierr);
3411     ierr = MPI_Group_free(&current_team_group); chk_err(ierr);
3412     ierr = MPI_Group_free(&win_group); chk_err(ierr);
3413   }
3414 
3415   /* Ensure stat is always set. */
3416 #ifdef GCC_GE_7
3417   int * stat = pstat;
3418   if (stat)
3419     *stat = 0;
3420 #else
3421   /* Gcc prior to 7.0 does not have stat here. */
3422   int * stat = NULL;
3423 #endif
3424 
3425   size = 1;
3426   for (j = 0; j < dst_rank; ++j)
3427   {
3428     dimextent = dest->dim[j]._ubound - dest->dim[j].lower_bound + 1;
3429     if (dimextent < 0)
3430       dimextent = 0;
3431     size *= dimextent;
3432   }
3433 
3434   if (size == 0)
3435     return;
3436 
3437   dprint("src_vector = %p, image_index = %d, offset = %zd.\n",
3438          src_vector, image_index, offset);
3439   check_image_health(image_index, stat);
3440 
3441   /* For char arrays: create the padding array, when dst is longer than src. */
3442   if (dest_char_array_is_longer)
3443   {
3444     const size_t pad_num = (dst_size / dst_kind) - (src_size / src_kind);
3445     const size_t pad_sz = pad_num * dst_kind;
3446     /* For big arrays alloca() may not be able to get the memory on the stack.
3447      * Use a regular malloc then. */
3448     if ((free_pad_str = ((pad_str = alloca(pad_sz)) == NULL)))
3449     {
3450       pad_str = malloc(pad_sz);
3451       if (t_buff == NULL)
3452         caf_runtime_error("Unable to allocate memory "
3453                           "for internal buffer in get().");
3454     }
3455     if (dst_kind == 1)
3456       memset(pad_str, ' ', pad_num);
3457     else /* dst_kind == 4. */
3458     {
3459       for (int32_t *it = (int32_t *) pad_str,
3460            *itEnd = ((int32_t *) pad_str) + pad_num; it < itEnd; ++it)
3461       {
3462         *it = (int32_t) ' ';
3463       }
3464     }
3465   }
3466 
3467   if (src_contiguous && dst_contiguous && src_vector == NULL)
3468   {
3469     if (same_image)
3470     {
3471       dprint("in caf_this == image_index, size = %zd, "
3472              "dst_kind = %d, src_kind = %d\n",
3473              size, dst_kind, src_kind);
3474       if (dst_type == BT_CHARACTER)
3475         /* The size is encoded in the descriptor's type for char arrays. */
3476         copy_char_to_self(src->base_addr, src_type, src_size, src_kind,
3477                           dest->base_addr, dst_type, dst_size, dst_kind,
3478                           size, src_rank == 0);
3479       else
3480         copy_to_self(src, src_kind, dest, dst_kind, size, stat);
3481       return;
3482     }
3483     else
3484     {
3485       CAF_Win_lock(MPI_LOCK_SHARED, remote_image, *p);
3486       if (dst_kind != src_kind || dest_char_array_is_longer || src_rank == 0)
3487       {
3488         if ((free_t_buff = ((t_buff = alloca(src_size * size)) == NULL)))
3489         {
3490           t_buff = malloc(src_size * size);
3491           if (t_buff == NULL)
3492             caf_runtime_error("Unable to allocate memory "
3493                               "for internal buffer in get().");
3494         }
3495       }
3496 
3497       if ((same_type_and_kind && dst_rank == src_rank)
3498           || dst_type == BT_CHARACTER)
3499       {
3500         if (!dest_char_array_is_longer
3501             && (dst_kind == src_kind || dst_type != BT_CHARACTER))
3502         {
3503           const size_t trans_size =
3504             ((dst_size > src_size) ? src_size : dst_size) * size;
3505           ierr = MPI_Get(dest->base_addr, trans_size, MPI_BYTE, remote_image,
3506                          offset, trans_size, MPI_BYTE, *p); chk_err(ierr);
3507         }
3508         else
3509         {
3510           ierr = MPI_Get(t_buff, src_size, MPI_BYTE, remote_image,
3511                          offset, src_size, MPI_BYTE, *p); chk_err(ierr);
3512           copy_char_to_self(t_buff, src_type, src_size, src_kind,
3513                             dest->base_addr, dst_type, dst_size,
3514                             dst_kind, size, src_rank == 0);
3515         }
3516       }
3517       else
3518       {
3519         ierr = MPI_Get(t_buff, src_size * size, MPI_BYTE, remote_image, offset,
3520                        src_size * size, MPI_BYTE, *p); chk_err(ierr);
3521         convert_with_strides(dest->base_addr, dst_type, dst_kind, dst_size,
3522                              t_buff, src_type, src_kind,
3523                              (src_rank > 0) ? src_size: 0, size, stat);
3524       }
3525       CAF_Win_unlock(remote_image, *p);
3526     }
3527   }
3528 #ifdef STRIDED
3529   else if (!same_image && same_type_and_kind && dst_type != BT_CHARACTER)
3530   {
3531     /* For strided copy, no type and kind conversion, copy to self or
3532      * character arrays are supported. */
3533     MPI_Datatype dt_s, dt_d, base_type_src, base_type_dst;
3534     int *arr_bl;
3535     int *arr_dsp_s, *arr_dsp_d;
3536 
3537     selectType(src_size, &base_type_src);
3538     selectType(dst_size, &base_type_dst);
3539 
3540     if (src_rank == 1)
3541     {
3542       if (src_vector == NULL)
3543       {
3544         dprint("Setting up mpi datatype vector with stride %d, "
3545                "size %d and offset %d.\n",
3546                src->dim[0]._stride, size, offset);
3547         ierr = MPI_Type_vector(size, 1, src->dim[0]._stride, base_type_src,
3548                                &dt_s); chk_err(ierr);
3549       }
3550       else
3551       {
3552         arr_bl = calloc(size, sizeof(int));
3553         arr_dsp_s = calloc(size, sizeof(int));
3554 
3555         dprint("Setting up strided vector index.\n",
3556                caf_this_image, caf_num_images, __FUNCTION__);
3557 #define KINDCASE(kind, type)                                            \
3558 case kind:                                                              \
3559   for (i = 0; i < size; ++i)                                            \
3560   {                                                                     \
3561     arr_dsp_s[i] = ((ptrdiff_t)                                         \
3562       ((type *) src_vector->u.v.vector)[i] - src->dim[0].lower_bound);  \
3563     arr_bl[i] = 1;                                                      \
3564   }                                                                     \
3565   break
3566         switch (src_vector->u.v.kind)
3567         {
3568           KINDCASE(1, int8_t);
3569           KINDCASE(2, int16_t);
3570           KINDCASE(4, int32_t);
3571           KINDCASE(8, int64_t);
3572 #ifdef HAVE_GFC_INTEGER_16
3573           KINDCASE(16, __int128);
3574 #endif
3575         default:
3576           caf_runtime_error(unreachable);
3577           return;
3578         }
3579 #undef KINDCASE
3580         ierr = MPI_Type_indexed(size, arr_bl, arr_dsp_s, base_type_src, &dt_s);
3581         chk_err(ierr);
3582         free(arr_bl);
3583         free(arr_dsp_s);
3584       }
3585       ierr = MPI_Type_vector(size, 1, dest->dim[0]._stride, base_type_dst,
3586                              &dt_d); chk_err(ierr);
3587     }
3588     else
3589     {
3590       arr_bl = calloc(size, sizeof(int));
3591       arr_dsp_s = calloc(size, sizeof(int));
3592       arr_dsp_d = calloc(size, sizeof(int));
3593 
3594       for (i = 0; i < size; ++i)
3595       {
3596         arr_bl[i] = 1;
3597       }
3598 
3599       for (i = 0; i < size; ++i)
3600       {
3601         ptrdiff_t array_offset_sr = 0, extent = 1, tot_ext = 1;
3602         if (src_vector == NULL)
3603         {
3604           for (j = 0; j < src_rank - 1; ++j)
3605           {
3606             extent = src->dim[j]._ubound - src->dim[j].lower_bound + 1;
3607             array_offset_sr += ((i / tot_ext) % extent) * src->dim[j]._stride;
3608             tot_ext *= extent;
3609           }
3610 
3611           array_offset_sr += (i / tot_ext) * src->dim[src_rank - 1]._stride;
3612         }
3613         else
3614         {
3615 #define KINDCASE(kind, type)                                          \
3616 case kind:                                                            \
3617   array_offset_sr = ((ptrdiff_t)                                      \
3618     ((type *) src_vector->u.v.vector)[i] - src->dim[0].lower_bound);  \
3619   break
3620           switch (src_vector->u.v.kind)
3621           {
3622             KINDCASE(1, int8_t);
3623             KINDCASE(2, int16_t);
3624             KINDCASE(4, int32_t);
3625             KINDCASE(8, int64_t);
3626 #ifdef HAVE_GFC_INTEGER_16
3627             KINDCASE(16, __int128);
3628 #endif
3629             default:
3630               caf_runtime_error(unreachable);
3631               return;
3632           }
3633 #undef KINDCASE
3634         }
3635         arr_dsp_s[i] = array_offset_sr;
3636 
3637         if (dst_rank != 0)
3638         {
3639           ptrdiff_t array_offset_dst = 0;
3640           extent = 1;
3641           tot_ext = 1;
3642           for (j = 0; j < dst_rank - 1; ++j)
3643           {
3644             extent = dest->dim[j]._ubound - dest->dim[j].lower_bound + 1;
3645             array_offset_dst += ((i / tot_ext) % extent) * dest->dim[j]._stride;
3646             tot_ext *= extent;
3647           }
3648 
3649           array_offset_dst += (i / tot_ext) * dest->dim[src_rank - 1]._stride;
3650           arr_dsp_d[i] = array_offset_dst;
3651         }
3652         else
3653           arr_dsp_d[i] = 0;
3654       }
3655 
3656       ierr = MPI_Type_indexed(size, arr_bl, arr_dsp_s, base_type_src, &dt_s);
3657       chk_err(ierr);
3658       ierr = MPI_Type_indexed(size, arr_bl, arr_dsp_d, base_type_dst, &dt_d);
3659       chk_err(ierr);
3660 
3661       free(arr_bl);
3662       free(arr_dsp_s);
3663       free(arr_dsp_d);
3664     }
3665 
3666     ierr = MPI_Type_commit(&dt_s); chk_err(ierr);
3667     ierr = MPI_Type_commit(&dt_d); chk_err(ierr);
3668 
3669     CAF_Win_lock(MPI_LOCK_SHARED, remote_image, *p);
3670     ierr = MPI_Get(dest->base_addr, 1, dt_d, remote_image, offset, 1, dt_s, *p);
3671     chk_err(ierr);
3672     CAF_Win_unlock(remote_image, *p);
3673 
3674 #ifdef WITH_FAILED_IMAGES
3675     check_image_health(image_index, stat);
3676 
3677     if (!stat && ierr == STAT_FAILED_IMAGE)
3678       terminate_internal(ierr, 1);
3679 
3680     if (stat)
3681       *stat = ierr;
3682 #else
3683     if (ierr != 0)
3684     {
3685       terminate_internal(ierr, 1);
3686       return;
3687     }
3688 #endif
3689     ierr = MPI_Type_free(&dt_s); chk_err(ierr);
3690     ierr = MPI_Type_free(&dt_d); chk_err(ierr);
3691   }
3692 #endif // STRIDED
3693   else
3694   {
3695     if (same_image && mrt)
3696     {
3697       if ((free_t_buff = (((t_buff = alloca(src_size * size))) == NULL)))
3698       {
3699         t_buff = malloc(src_size * size);
3700         if (t_buff == NULL)
3701           caf_runtime_error("Unable to allocate memory "
3702                             "for internal buffer in get().");
3703       }
3704     }
3705     else if (!same_type_and_kind && !same_image)
3706     {
3707       if ((free_t_buff = (((t_buff = alloca(src_size))) == NULL)))
3708       {
3709         t_buff = malloc(src_size);
3710         if (t_buff == NULL)
3711           caf_runtime_error("Unable to allocate memory "
3712                             "for internal buffer in get().");
3713       }
3714     }
3715 
3716     if (!same_image)
3717       CAF_Win_lock(MPI_LOCK_SHARED, remote_image, *p);
3718     for (i = 0; i < size; ++i)
3719     {
3720       ptrdiff_t array_offset_sr = 0, extent = 1, tot_ext = 1;
3721       if (src_vector == NULL)
3722       {
3723         for (j = 0; j < src_rank - 1; ++j)
3724         {
3725           extent = src->dim[j]._ubound - src->dim[j].lower_bound + 1;
3726           array_offset_sr += ((i / tot_ext) % extent) * src->dim[j]._stride;
3727           tot_ext *= extent;
3728         }
3729 
3730         array_offset_sr += (i / tot_ext) * src->dim[src_rank - 1]._stride;
3731       }
3732       else
3733       {
3734 #define KINDCASE(kind, type)                                        \
3735 case kind:                                                          \
3736   array_offset_sr = ((ptrdiff_t)                                    \
3737     ((type *)src_vector->u.v.vector)[i] - src->dim[0].lower_bound); \
3738   break
3739         switch (src_vector->u.v.kind)
3740         {
3741           KINDCASE(1, int8_t);
3742           KINDCASE(2, int16_t);
3743           KINDCASE(4, int32_t);
3744           KINDCASE(8, int64_t);
3745 #ifdef HAVE_GFC_INTEGER_16
3746           KINDCASE(16, __int128);
3747 #endif
3748           default:
3749             caf_runtime_error(unreachable);
3750             return;
3751         }
3752       }
3753       src_offset = array_offset_sr * src_size;
3754 #undef KINDCASE
3755 
3756       void *dst;
3757       if (!same_image || !mrt)
3758       {
3759         if (dst_rank != 0)
3760         {
3761           ptrdiff_t array_offset_dst = 0;
3762           extent = 1;
3763           tot_ext = 1;
3764           for (j = 0; j < dst_rank - 1; ++j)
3765           {
3766             extent = dest->dim[j]._ubound - dest->dim[j].lower_bound + 1;
3767             array_offset_dst += ((i / tot_ext) % extent) * dest->dim[j]._stride;
3768             tot_ext *= extent;
3769           }
3770 
3771           array_offset_dst += (i / tot_ext) * dest->dim[dst_rank - 1]._stride;
3772           dst = (void *)((char *)dest->base_addr + array_offset_dst * dst_size);
3773         }
3774         else
3775           dst = dest->base_addr;
3776        }
3777 
3778       if (!same_image)
3779       {
3780         // Do the more likely first.
3781         dprint("kind(dst) = %d, el_sz(dst) = %zd, "
3782                "kind(src) = %d, el_sz(src) = %zd, lb(dst) = %zd.\n",
3783                dst_kind, dst_size, src_kind, src_size, src->dim[0].lower_bound);
3784         if (same_type_and_kind)
3785         {
3786           const size_t trans_size = (src_size < dst_size) ? src_size : dst_size;
3787           ierr = MPI_Get(dst, trans_size, MPI_BYTE, remote_image,
3788                          offset + src_offset, trans_size, MPI_BYTE, *p);
3789           chk_err(ierr);
3790           if (pad_str)
3791             memcpy((void *)((char *)dst + src_size), pad_str,
3792                    dst_size - src_size);
3793         }
3794         else if (dst_type == BT_CHARACTER)
3795         {
3796           ierr = MPI_Get(t_buff, src_size, MPI_BYTE, remote_image,
3797                          offset + src_offset, src_size, MPI_BYTE, *p);
3798           chk_err(ierr);
3799           copy_char_to_self(t_buff, src_type, src_size, src_kind,
3800                             dst, dst_type, dst_size, dst_kind, 1, true);
3801         }
3802         else
3803         {
3804           ierr = MPI_Get(t_buff, src_size, MPI_BYTE, remote_image,
3805                          offset + src_offset, src_size, MPI_BYTE, *p);
3806           chk_err(ierr);
3807           convert_type(dst, dst_type, dst_kind, t_buff,
3808                        src_type, src_kind, stat);
3809         }
3810       }
3811       else
3812       {
3813         if (!mrt)
3814         {
3815           dprint("strided same_image, no temp, for i = %zd, "
3816                  "src_offset = %zd, offset = %zd.\n",
3817                  i, src_offset, offset);
3818           if (same_type_and_kind)
3819             memmove(dst, src->base_addr + src_offset, src_size);
3820           else
3821             convert_type(dst, dst_type, dst_kind,
3822                          src->base_addr + src_offset, src_type, src_kind, stat);
3823         }
3824         else
3825         {
3826           dprint("strided same_image, *WITH* temp, for i = %zd.\n", i);
3827           if (same_type_and_kind)
3828             memmove(t_buff + i * dst_size,
3829                     src->base_addr + src_offset, src_size);
3830           else
3831             convert_type(t_buff + i * dst_size, dst_type, dst_kind,
3832                          src->base_addr + src_offset, src_type, src_kind, stat);
3833         }
3834       }
3835 
3836 #ifndef WITH_FAILED_IMAGES
3837       if (ierr != 0)
3838       {
3839         caf_runtime_error("MPI Error: %d", ierr);
3840         return;
3841       }
3842 #endif
3843     }
3844     if (!same_image)
3845       CAF_Win_unlock(remote_image, *p);
3846 
3847     if (same_image && mrt)
3848     {
3849       dprint("Same image temporary move.\n");
3850       memmove(dest->base_addr, t_buff, size * dst_size);
3851     }
3852   }
3853 
3854   /* Free memory, when not allocated on stack. */
3855   if (free_t_buff)
3856     free(t_buff);
3857   if (free_pad_str)
3858     free(pad_str);
3859 
3860 #ifdef WITH_FAILED_IMAGES
3861   /* Catch failed images, when failed image support is active. */
3862   check_image_health(image_index , stat);
3863 #endif
3864 
3865   if (ierr != MPI_SUCCESS)
3866   {
3867     int mpi_error;
3868     MPI_Error_class(ierr, &mpi_error);
3869     if (stat)
3870       *stat = mpi_error;
3871     else
3872     {
3873       int error_len = 2048 - 11;
3874       char error_str[error_len + 11];
3875       strcpy(error_str, "MPI-error: ");
3876       MPI_Error_string(mpi_error, &error_str[11], &error_len);
3877       error_stop_str(error_str, error_len + 11, false);
3878     }
3879   }
3880 }
3881 
3882 
3883 #ifdef GCC_GE_7
3884 /* Get a chunk of data from one image to the current one, with type conversion.
3885  *
3886  * Copied from the gcc:libgfortran/caf/single.c. Can't say much about it. */
3887 static void
get_data(void * ds,mpi_caf_token_t * token,MPI_Aint offset,int dst_type,int src_type,int dst_kind,int src_kind,size_t dst_size,size_t src_size,size_t num,int * stat,int image_index)3888 get_data(void *ds, mpi_caf_token_t *token, MPI_Aint offset, int dst_type,
3889          int src_type, int dst_kind, int src_kind, size_t dst_size,
3890          size_t src_size, size_t num, int *stat, int image_index)
3891 {
3892   size_t k;
3893   int ierr;
3894   MPI_Win win = (token == NULL) ? global_dynamic_win : token->memptr_win;
3895 #ifdef EXTRA_DEBUG_OUTPUT
3896   if (token)
3897     dprint("%p = win(%d): %d -> offset: %zd of size %zd -> %zd, "
3898            "dst type %d(%d), src type %d(%d)\n",
3899            ds, win, image_index + 1, offset, src_size, dst_size,
3900            dst_type, dst_kind, src_type, src_kind);
3901   else
3902     dprint("%p = global_win(%d) offset: %zd (%zd) of size %zd -> %zd, "
3903            "dst type %d(%d), src type %d(%d)\n",
3904            ds, image_index + 1, offset, offset, src_size, dst_size,
3905            dst_type, dst_kind, src_type, src_kind);
3906 #endif
3907   if (dst_type == src_type && dst_kind == src_kind)
3908   {
3909     size_t sz = ((dst_size > src_size) ? src_size : dst_size) * num;
3910     ierr = MPI_Get(ds, sz, MPI_BYTE, image_index, offset, sz, MPI_BYTE, win);
3911     chk_err(ierr);
3912     if ((dst_type == BT_CHARACTER || src_type == BT_CHARACTER)
3913         && dst_size > src_size)
3914       {
3915         if (dst_kind == 1)
3916         {
3917           memset((void*)(char*) ds + src_size, ' ', dst_size - src_size);
3918         }
3919         else /* dst_kind == 4. */
3920         {
3921           for (k = src_size / 4; k < dst_size / 4; k++)
3922             ((int32_t*) ds)[k] = (int32_t) ' ';
3923         }
3924       }
3925   }
3926   else if (dst_type == BT_CHARACTER && dst_kind == 1)
3927   {
3928     /* Get the required amount of memory on the stack. */
3929     void *srh = alloca(src_size);
3930     ierr = MPI_Get(srh, src_size, MPI_BYTE, image_index, offset, src_size,
3931                    MPI_BYTE, win); chk_err(ierr);
3932     /* Get of the data needs to be finished before converting the data. */
3933     ierr = MPI_Win_flush(image_index, win); chk_err(ierr);
3934     assign_char1_from_char4(dst_size, src_size, ds, srh);
3935   }
3936   else if (dst_type == BT_CHARACTER)
3937   {
3938     /* Get the required amount of memory on the stack. */
3939     void *srh = alloca(src_size);
3940     ierr = MPI_Get(srh, src_size, MPI_BYTE, image_index, offset, src_size,
3941                    MPI_BYTE, win); chk_err(ierr);
3942     /* Get of the data needs to be finished before converting the data. */
3943     ierr = MPI_Win_flush(image_index, win); chk_err(ierr);
3944     assign_char4_from_char1(dst_size, src_size, ds, srh);
3945   }
3946   else
3947   {
3948     /* Get the required amount of memory on the stack. */
3949     void *srh = alloca(src_size * num);
3950     dprint("type/kind convert %zd items: "
3951            "type %d(%d) -> type %d(%d), local buffer: %p\n",
3952            num, src_type, src_kind, dst_type, dst_kind, srh);
3953     ierr = MPI_Get(srh, src_size * num, MPI_BYTE, image_index, offset,
3954                    src_size * num, MPI_BYTE, win); chk_err(ierr);
3955     /* Get of the data needs to be finished before converting the data. */
3956     ierr = MPI_Win_flush(image_index, win); chk_err(ierr);
3957     dprint("srh[0] = %d, ierr = %d\n", (int)((char *)srh)[0], ierr);
3958     for (k = 0; k < num; ++k)
3959     {
3960       convert_type(ds, dst_type, dst_kind, srh, src_type, src_kind, stat);
3961       ds += dst_size;
3962       srh += src_size;
3963     }
3964   }
3965 }
3966 
3967 
3968 /* Compute the number of items referenced.
3969  *
3970  * Computes the number of items between lower bound (lb) and upper bound (ub)
3971  * with respect to the stride taking corner cases into account. */
3972 #define COMPUTE_NUM_ITEMS(num, stride, lb, ub)                  \
3973 do                                                              \
3974 {                                                               \
3975   ptrdiff_t abs_stride = (stride) > 0 ? (stride) : -(stride);   \
3976   num = (stride) > 0 ? (ub) + 1 - (lb) : (lb) + 1 - (ub);       \
3977   if (num <= 0 || abs_stride < 1) return;                       \
3978   num = (abs_stride > 1) ? (1 + (num - 1) / abs_stride) : num;  \
3979 } while (0)
3980 
3981 
3982 /* Convenience macro to get the extent of a descriptor in a certain dimension
3983  *
3984  * Copied from gcc:libgfortran/libgfortran.h. */
3985 #define GFC_DESCRIPTOR_EXTENT(desc,i) \
3986 ((desc)->dim[i]._ubound + 1 - (desc)->dim[i].lower_bound)
3987 
3988 
3989 #define sizeof_desc_for_rank(rank) \
3990 (sizeof(gfc_descriptor_t) + (rank) * sizeof(descriptor_dimension))
3991 
3992 /* Define the descriptor of max rank.
3993  *
3994  *  This typedef is made to allow storing a copy of a remote descriptor on the
3995  *  stack without having to care about the rank. */
3996 typedef struct gfc_max_dim_descriptor_t
3997 {
3998   gfc_descriptor_t base;
3999   descriptor_dimension dim[GFC_MAX_DIMENSIONS];
4000 } gfc_max_dim_descriptor_t;
4001 
4002 typedef struct gfc_dim1_descriptor_t
4003 {
4004   gfc_descriptor_t base;
4005   descriptor_dimension dim[1];
4006 } gfc_dim1_descriptor_t;
4007 
4008 static void
get_for_ref(caf_reference_t * ref,size_t * i,size_t dst_index,mpi_caf_token_t * mpi_token,gfc_descriptor_t * dst,gfc_descriptor_t * src,void * ds,void * sr,ptrdiff_t sr_byte_offset,ptrdiff_t desc_byte_offset,int dst_kind,int src_kind,size_t dst_dim,size_t src_dim,size_t num,int * stat,int global_dynamic_win_rank,int memptr_win_rank,bool sr_global,bool desc_global,int src_type)4009 get_for_ref(caf_reference_t *ref, size_t *i, size_t dst_index,
4010             mpi_caf_token_t *mpi_token, gfc_descriptor_t *dst,
4011             gfc_descriptor_t *src, void *ds, void *sr,
4012             ptrdiff_t sr_byte_offset, ptrdiff_t desc_byte_offset,
4013             int dst_kind, int src_kind, size_t dst_dim, size_t src_dim,
4014             size_t num, int *stat,
4015             int global_dynamic_win_rank, int memptr_win_rank,
4016             bool sr_global, /* access sr through global_dynamic_win */
4017             bool desc_global /* access desc through global_dynamic_win */
4018 #ifdef GCC_GE_8
4019             , int src_type)
4020 {
4021 #else
4022             )
4023 {
4024   int src_type = -1;
4025 #endif
4026   ptrdiff_t extent_src = 1, array_offset_src = 0, stride_src;
4027   size_t next_dst_dim, ref_rank;
4028   gfc_max_dim_descriptor_t src_desc_data;
4029   int ierr;
4030 
4031   if (unlikely(ref == NULL))
4032   {
4033     /* May be we should issue an error here, because this case should not
4034      * occur. */
4035     return;
4036   }
4037 
4038   dprint("sr_offset = %zd, sr = %p, desc_offset = %zd, src = %p, "
4039          "sr_glb = %d, desc_glb = %d\n",
4040          sr_byte_offset, sr, desc_byte_offset, src, sr_global, desc_global);
4041 
4042   if (ref->next == NULL)
4043   {
4044     size_t dst_size = GFC_DESCRIPTOR_SIZE(dst);
4045 
4046     switch (ref->type)
4047     {
4048       case CAF_REF_COMPONENT:
4049         dprint("caf_offset = %zd\n", ref->u.c.caf_token_offset);
4050         if (ref->u.c.caf_token_offset > 0)
4051         {
4052           sr_byte_offset += ref->u.c.offset;
4053           if (sr_global)
4054           {
4055             ierr = MPI_Get(&sr, stdptr_size, MPI_BYTE, global_dynamic_win_rank,
4056                            MPI_Aint_add((MPI_Aint)sr, sr_byte_offset),
4057                            stdptr_size, MPI_BYTE, global_dynamic_win);
4058             chk_err(ierr);
4059             desc_global = true;
4060           }
4061           else
4062           {
4063             ierr = MPI_Get(&sr, stdptr_size, MPI_BYTE, memptr_win_rank,
4064                            sr_byte_offset, stdptr_size, MPI_BYTE,
4065                            mpi_token->memptr_win); chk_err(ierr);
4066             sr_global = true;
4067           }
4068           sr_byte_offset = 0;
4069         }
4070         else
4071           sr_byte_offset += ref->u.c.offset;
4072         if (sr_global)
4073         {
4074           get_data(ds, NULL, MPI_Aint_add((MPI_Aint)sr, sr_byte_offset),
4075                    GFC_DESCRIPTOR_TYPE(dst),
4076 #ifdef GCC_GE_8
4077                    (src_type != -1) ? src_type : GFC_DESCRIPTOR_TYPE (dst),
4078 #else
4079                    GFC_DESCRIPTOR_TYPE(dst),
4080 #endif
4081                    dst_kind, src_kind, dst_size, ref->item_size, 1, stat,
4082                    global_dynamic_win_rank);
4083         }
4084         else
4085         {
4086           get_data(ds, mpi_token, sr_byte_offset, GFC_DESCRIPTOR_TYPE(dst),
4087 #ifdef GCC_GE_8
4088                    src_type,
4089 #else
4090                    GFC_DESCRIPTOR_TYPE(src),
4091 #endif
4092                    dst_kind, src_kind, dst_size, ref->item_size, 1, stat,
4093                    memptr_win_rank);
4094         }
4095         ++(*i);
4096         return;
4097       case CAF_REF_STATIC_ARRAY:
4098         src_type = ref->u.a.static_array_type;
4099         /* Intentionally fall through. */
4100       case CAF_REF_ARRAY:
4101         if (ref->u.a.mode[src_dim] == CAF_ARR_REF_NONE)
4102         {
4103           if (sr_global)
4104           {
4105             get_data(ds + dst_index * dst_size, NULL,
4106                      MPI_Aint_add((MPI_Aint)sr, sr_byte_offset),
4107                      GFC_DESCRIPTOR_TYPE(dst),
4108 #ifdef GCC_GE_8
4109                      (src_type != -1) ? src_type : GFC_DESCRIPTOR_TYPE (src),
4110 #else
4111                      (src_type == -1) ? GFC_DESCRIPTOR_TYPE(src) : src_type,
4112 #endif
4113                      dst_kind, src_kind, dst_size, ref->item_size, num,
4114                      stat, global_dynamic_win_rank);
4115           }
4116           else
4117           {
4118             get_data(ds + dst_index * dst_size, mpi_token,
4119                      sr_byte_offset, GFC_DESCRIPTOR_TYPE(dst),
4120 #ifdef GCC_GE_8
4121                      (src_type != -1) ? src_type : GFC_DESCRIPTOR_TYPE (src),
4122 #else
4123                      (src_type == -1) ? GFC_DESCRIPTOR_TYPE(src) : src_type,
4124 #endif
4125                      dst_kind, src_kind, dst_size, ref->item_size, num,
4126                      stat, memptr_win_rank);
4127           }
4128           *i += num;
4129           return;
4130         }
4131         break;
4132       default:
4133         caf_runtime_error(unreachable);
4134     }
4135   }
4136 
4137   switch (ref->type)
4138   {
4139     case CAF_REF_COMPONENT:
4140       if (ref->u.c.caf_token_offset > 0)
4141       {
4142         sr_byte_offset += ref->u.c.offset;
4143         desc_byte_offset = sr_byte_offset;
4144         if (sr_global)
4145         {
4146           ierr = MPI_Get(&sr, stdptr_size, MPI_BYTE, global_dynamic_win_rank,
4147                          MPI_Aint_add((MPI_Aint)sr, sr_byte_offset),
4148                          stdptr_size, MPI_BYTE, global_dynamic_win);
4149           chk_err(ierr);
4150           desc_global = true;
4151         }
4152         else
4153         {
4154           ierr = MPI_Get(&sr, stdptr_size, MPI_BYTE, memptr_win_rank,
4155                          sr_byte_offset, stdptr_size, MPI_BYTE,
4156                          mpi_token->memptr_win); chk_err(ierr);
4157           sr_global = true;
4158         }
4159         sr_byte_offset = 0;
4160       }
4161       else
4162       {
4163         sr_byte_offset += ref->u.c.offset;
4164         desc_byte_offset += ref->u.c.offset;
4165       }
4166       get_for_ref(ref->next, i, dst_index, mpi_token, dst, NULL, ds,
4167                   sr, sr_byte_offset, desc_byte_offset, dst_kind, src_kind,
4168                   dst_dim, 0, 1, stat, global_dynamic_win_rank, memptr_win_rank,
4169                   sr_global, desc_global
4170 #ifdef GCC_GE_8
4171                   , src_type
4172 #endif
4173                   );
4174       return;
4175     case CAF_REF_ARRAY:
4176       if (ref->u.a.mode[src_dim] == CAF_ARR_REF_NONE)
4177       {
4178         get_for_ref(ref->next, i, dst_index, mpi_token, dst, src, ds, sr,
4179                     sr_byte_offset, desc_byte_offset, dst_kind, src_kind,
4180                     dst_dim, 0, 1, stat, global_dynamic_win_rank, memptr_win_rank,
4181                     sr_global, desc_global
4182 #ifdef GCC_GE_8
4183                     , src_type
4184 #endif
4185                     );
4186         return;
4187       }
4188       /* Only when on the left most index switch the data pointer to the
4189        * array's data pointer. */
4190       if (src_dim == 0)
4191       {
4192         if (sr_global)
4193         {
4194           for (ref_rank = 0; ref->u.a.mode[ref_rank] != CAF_ARR_REF_NONE;
4195                ++ref_rank) ;
4196           /* Get the remote descriptor. */
4197           if (desc_global)
4198           {
4199             ierr = MPI_Get(&src_desc_data, sizeof_desc_for_rank(ref_rank),
4200                            MPI_BYTE, global_dynamic_win_rank,
4201                            MPI_Aint_add((MPI_Aint)sr, desc_byte_offset),
4202                            sizeof_desc_for_rank(ref_rank), MPI_BYTE,
4203                            global_dynamic_win); chk_err(ierr);
4204           }
4205           else
4206           {
4207             ierr = MPI_Get(&src_desc_data,
4208                            sizeof_desc_for_rank(ref_rank), MPI_BYTE,
4209                            memptr_win_rank, desc_byte_offset,
4210                            sizeof_desc_for_rank(ref_rank),
4211                            MPI_BYTE, mpi_token->memptr_win); chk_err(ierr);
4212             desc_global = true;
4213           }
4214           src = (gfc_descriptor_t *)&src_desc_data;
4215         }
4216         else
4217           src = mpi_token->desc;
4218         sr_byte_offset = 0;
4219         desc_byte_offset = 0;
4220 #ifdef EXTRA_DEBUG_OUTPUT
4221         dprint("remote desc rank: %zd (ref_rank: %zd)\n",
4222                GFC_DESCRIPTOR_RANK(src), ref_rank);
4223         for (int r = 0; r < GFC_DESCRIPTOR_RANK(src); ++r)
4224         {
4225           dprint("remote desc dim[%d] = (lb=%zd, ub=%zd, stride=%zd)\n",
4226                  r, src->dim[r].lower_bound, src->dim[r]._ubound,
4227                  src->dim[r]._stride);
4228         }
4229 #endif
4230       }
4231       switch (ref->u.a.mode[src_dim])
4232       {
4233         case CAF_ARR_REF_VECTOR:
4234           extent_src = GFC_DESCRIPTOR_EXTENT(src, src_dim);
4235           array_offset_src = 0;
4236           for (size_t idx = 0; idx < ref->u.a.dim[src_dim].v.nvec; ++idx)
4237           {
4238 #define KINDCASE(kind, type)                                        \
4239 case kind:                                                          \
4240   array_offset_src = (((ptrdiff_t)                                  \
4241     ((type *)ref->u.a.dim[src_dim].v.vector)[idx])                  \
4242     - src->dim[src_dim].lower_bound  * src->dim[src_dim]._stride);  \
4243   break
4244 
4245             switch (ref->u.a.dim[src_dim].v.kind)
4246             {
4247               KINDCASE(1, int8_t);
4248               KINDCASE(2, int16_t);
4249               KINDCASE(4, int32_t);
4250               KINDCASE(8, int64_t);
4251 #ifdef HAVE_GFC_INTEGER_16
4252               KINDCASE(16, __int128);
4253 #endif
4254               default:
4255                 caf_runtime_error(unreachable);
4256                 return;
4257             }
4258 #undef KINDCASE
4259 
4260             dprint("vector-index computed to: %zd\n", array_offset_src);
4261             get_for_ref(ref, i, dst_index, mpi_token, dst, src, ds, sr,
4262                         sr_byte_offset + array_offset_src * ref->item_size,
4263                         desc_byte_offset + array_offset_src * ref->item_size,
4264                         dst_kind, src_kind, dst_dim + 1, src_dim + 1,
4265                         1, stat, global_dynamic_win_rank, memptr_win_rank,
4266                         sr_global, desc_global
4267 #ifdef GCC_GE_8
4268                         , src_type
4269 #endif
4270                         );
4271             dst_index += dst->dim[dst_dim]._stride;
4272           }
4273           return;
4274         case CAF_ARR_REF_FULL:
4275           COMPUTE_NUM_ITEMS(extent_src,
4276                             ref->u.a.dim[src_dim].s.stride,
4277                             src->dim[src_dim].lower_bound,
4278                             src->dim[src_dim]._ubound);
4279           stride_src =
4280             src->dim[src_dim]._stride * ref->u.a.dim[src_dim].s.stride;
4281           array_offset_src = 0;
4282           for (ptrdiff_t idx = 0; idx < extent_src;
4283                ++idx, array_offset_src += stride_src)
4284             {
4285               get_for_ref(ref, i, dst_index, mpi_token, dst, src, ds, sr,
4286                           sr_byte_offset + array_offset_src * ref->item_size,
4287                           desc_byte_offset + array_offset_src * ref->item_size,
4288                           dst_kind, src_kind, dst_dim + 1, src_dim + 1,
4289                           1, stat, global_dynamic_win_rank, memptr_win_rank,
4290                           sr_global, desc_global
4291 #ifdef GCC_GE_8
4292                           , src_type
4293 #endif
4294                           );
4295               dst_index += dst->dim[dst_dim]._stride;
4296             }
4297           return;
4298         case CAF_ARR_REF_RANGE:
4299           COMPUTE_NUM_ITEMS(extent_src,
4300                             ref->u.a.dim[src_dim].s.stride,
4301                             ref->u.a.dim[src_dim].s.start,
4302                             ref->u.a.dim[src_dim].s.end);
4303           array_offset_src =
4304             (ref->u.a.dim[src_dim].s.start - src->dim[src_dim].lower_bound)
4305             * src->dim[src_dim]._stride;
4306           stride_src =
4307             src->dim[src_dim]._stride * ref->u.a.dim[src_dim].s.stride;
4308           /* Increase the dst_dim only, when the src_extent is greater than one
4309            * or src and dst extent are both one. Don't increase when the scalar
4310            * source is not present in the dst. */
4311           next_dst_dim = (
4312             (extent_src > 1) ||
4313             (GFC_DESCRIPTOR_EXTENT(dst, dst_dim) == 1 && extent_src == 1)
4314           ) ? (dst_dim + 1) : dst_dim;
4315           for (ptrdiff_t idx = 0; idx < extent_src; ++idx)
4316           {
4317             get_for_ref(ref, i, dst_index, mpi_token, dst, src, ds, sr,
4318                         sr_byte_offset + array_offset_src * ref->item_size,
4319                         desc_byte_offset + array_offset_src * ref->item_size,
4320                         dst_kind, src_kind, next_dst_dim, src_dim + 1,
4321                         1, stat, global_dynamic_win_rank, memptr_win_rank,
4322                         sr_global, desc_global
4323 #ifdef GCC_GE_8
4324                         , src_type
4325 #endif
4326                         );
4327             dst_index += dst->dim[dst_dim]._stride;
4328             array_offset_src += stride_src;
4329           }
4330           return;
4331         case CAF_ARR_REF_SINGLE:
4332           array_offset_src =
4333             (ref->u.a.dim[src_dim].s.start - src->dim[src_dim].lower_bound)
4334             * src->dim[src_dim]._stride;
4335           get_for_ref(ref, i, dst_index, mpi_token, dst, src, ds, sr,
4336                       sr_byte_offset + array_offset_src * ref->item_size,
4337                       desc_byte_offset + array_offset_src * ref->item_size,
4338                       dst_kind, src_kind, dst_dim, src_dim + 1, 1,
4339                       stat, global_dynamic_win_rank, memptr_win_rank,
4340                       sr_global, desc_global
4341 #ifdef GCC_GE_8
4342                       , src_type
4343 #endif
4344                       );
4345           return;
4346         case CAF_ARR_REF_OPEN_END:
4347           COMPUTE_NUM_ITEMS(extent_src,
4348                             ref->u.a.dim[src_dim].s.stride,
4349                             ref->u.a.dim[src_dim].s.start,
4350                             src->dim[src_dim]._ubound);
4351           stride_src =
4352             src->dim[src_dim]._stride * ref->u.a.dim[src_dim].s.stride;
4353           array_offset_src = (ref->u.a.dim[src_dim].s.start
4354                               - src->dim[src_dim].lower_bound)
4355                              * src->dim[src_dim]._stride;
4356           for (ptrdiff_t idx = 0; idx < extent_src; ++idx)
4357           {
4358             get_for_ref(ref, i, dst_index, mpi_token, dst, src, ds, sr,
4359                         sr_byte_offset + array_offset_src * ref->item_size,
4360                         desc_byte_offset + array_offset_src * ref->item_size,
4361                         dst_kind, src_kind, dst_dim + 1, src_dim + 1,
4362                         1, stat, global_dynamic_win_rank, memptr_win_rank,
4363                         sr_global, desc_global
4364 #ifdef GCC_GE_8
4365                         , src_type
4366 #endif
4367                         );
4368             dst_index += dst->dim[dst_dim]._stride;
4369             array_offset_src += stride_src;
4370           }
4371           return;
4372         case CAF_ARR_REF_OPEN_START:
4373           COMPUTE_NUM_ITEMS(extent_src,
4374                             ref->u.a.dim[src_dim].s.stride,
4375                             src->dim[src_dim].lower_bound,
4376                             ref->u.a.dim[src_dim].s.end);
4377           stride_src =
4378             src->dim[src_dim]._stride * ref->u.a.dim[src_dim].s.stride;
4379           array_offset_src = 0;
4380           for (ptrdiff_t idx = 0; idx < extent_src; ++idx)
4381           {
4382             get_for_ref(ref, i, dst_index, mpi_token, dst, src, ds, sr,
4383                         sr_byte_offset + array_offset_src * ref->item_size,
4384                         desc_byte_offset + array_offset_src * ref->item_size,
4385                         dst_kind, src_kind, dst_dim + 1, src_dim + 1,
4386                         1, stat, global_dynamic_win_rank, memptr_win_rank,
4387                         sr_global, desc_global
4388 #ifdef GCC_GE_8
4389                         , src_type
4390 #endif
4391                         );
4392             dst_index += dst->dim[dst_dim]._stride;
4393             array_offset_src += stride_src;
4394           }
4395           return;
4396         default:
4397           caf_runtime_error(unreachable);
4398       }
4399       return;
4400     case CAF_REF_STATIC_ARRAY:
4401       if (ref->u.a.mode[src_dim] == CAF_ARR_REF_NONE)
4402       {
4403         get_for_ref(ref->next, i, dst_index, mpi_token, dst, NULL, ds, sr,
4404                     sr_byte_offset, desc_byte_offset, dst_kind, src_kind,
4405                     dst_dim, 0, 1, stat, global_dynamic_win_rank, memptr_win_rank,
4406                     sr_global, desc_global
4407 #ifdef GCC_GE_8
4408                     , src_type
4409 #endif
4410                     );
4411         return;
4412       }
4413       switch (ref->u.a.mode[src_dim])
4414       {
4415       case CAF_ARR_REF_VECTOR:
4416         array_offset_src = 0;
4417         for (size_t idx = 0; idx < ref->u.a.dim[src_dim].v.nvec; ++idx)
4418         {
4419 #define KINDCASE(kind, type)                                        \
4420 case kind:                                                          \
4421   array_offset_src = ((type *)ref->u.a.dim[src_dim].v.vector)[idx]; \
4422   break
4423 
4424           switch (ref->u.a.dim[src_dim].v.kind)
4425           {
4426             KINDCASE(1, int8_t);
4427             KINDCASE(2, int16_t);
4428             KINDCASE(4, int32_t);
4429             KINDCASE(8, int64_t);
4430 #ifdef HAVE_GFC_INTEGER_16
4431             KINDCASE(16, __int128);
4432 #endif
4433             default:
4434               caf_runtime_error(unreachable);
4435               return;
4436           }
4437 #undef KINDCASE
4438 
4439           get_for_ref(ref, i, dst_index, mpi_token, dst, NULL, ds, sr,
4440                       sr_byte_offset + array_offset_src * ref->item_size,
4441                       desc_byte_offset + array_offset_src * ref->item_size,
4442                       dst_kind, src_kind, dst_dim + 1, src_dim + 1,
4443                       1, stat, global_dynamic_win_rank, memptr_win_rank,
4444                       sr_global, desc_global
4445 #ifdef GCC_GE_8
4446                       , src_type
4447 #endif
4448                       );
4449           dst_index += dst->dim[dst_dim]._stride;
4450         }
4451         return;
4452       case CAF_ARR_REF_FULL:
4453         for (array_offset_src = 0 ;
4454              array_offset_src <= ref->u.a.dim[src_dim].s.end;
4455              array_offset_src += ref->u.a.dim[src_dim].s.stride)
4456         {
4457           get_for_ref(ref, i, dst_index, mpi_token, dst, NULL, ds, sr,
4458                       sr_byte_offset + array_offset_src * ref->item_size,
4459                       desc_byte_offset + array_offset_src * ref->item_size,
4460                       dst_kind, src_kind, dst_dim + 1, src_dim + 1,
4461                       1, stat, global_dynamic_win_rank, memptr_win_rank,
4462                       sr_global, desc_global
4463 #ifdef GCC_GE_8
4464                        , src_type
4465 #endif
4466                        );
4467           dst_index += dst->dim[dst_dim]._stride;
4468         }
4469         return;
4470       case CAF_ARR_REF_RANGE:
4471         COMPUTE_NUM_ITEMS(extent_src,
4472                           ref->u.a.dim[src_dim].s.stride,
4473                           ref->u.a.dim[src_dim].s.start,
4474                           ref->u.a.dim[src_dim].s.end);
4475         array_offset_src = ref->u.a.dim[src_dim].s.start;
4476         for (ptrdiff_t idx = 0; idx < extent_src; ++idx)
4477         {
4478           get_for_ref(ref, i, dst_index, mpi_token, dst, NULL, ds, sr,
4479                       sr_byte_offset + array_offset_src * ref->item_size,
4480                       desc_byte_offset + array_offset_src * ref->item_size,
4481                       dst_kind, src_kind, dst_dim + 1, src_dim + 1,
4482                       1, stat, global_dynamic_win_rank, memptr_win_rank,
4483                       sr_global, desc_global
4484 #ifdef GCC_GE_8
4485                       , src_type
4486 #endif
4487                       );
4488           dst_index += dst->dim[dst_dim]._stride;
4489           array_offset_src += ref->u.a.dim[src_dim].s.stride;
4490         }
4491         return;
4492       case CAF_ARR_REF_SINGLE:
4493         array_offset_src = ref->u.a.dim[src_dim].s.start;
4494         get_for_ref(ref, i, dst_index, mpi_token, dst, NULL, ds, sr,
4495                     sr_byte_offset + array_offset_src * ref->item_size,
4496                     desc_byte_offset + array_offset_src * ref->item_size,
4497                     dst_kind, src_kind, dst_dim, src_dim + 1, 1,
4498                     stat, global_dynamic_win_rank, memptr_win_rank,
4499                     sr_global, desc_global
4500 #ifdef GCC_GE_8
4501                     , src_type
4502 #endif
4503                     );
4504         return;
4505         /* The OPEN_* are mapped to a RANGE and therefore can not occur. */
4506       case CAF_ARR_REF_OPEN_END:
4507       case CAF_ARR_REF_OPEN_START:
4508       default:
4509         caf_runtime_error(unreachable);
4510       }
4511       return;
4512     default:
4513       caf_runtime_error(unreachable);
4514   }
4515 }
4516 
4517 void
4518 PREFIX(get_by_ref) (caf_token_t token, int image_index,
4519                     gfc_descriptor_t *dst, caf_reference_t *refs,
4520                     int dst_kind, int src_kind,
4521                     bool may_require_tmp __attribute__((unused)),
4522                     bool dst_reallocatable, int *stat
4523 #ifdef GCC_GE_8
4524                     , int src_type
4525 #endif
4526                     )
4527 {
4528   const char vecrefunknownkind[] =
4529     "libcaf_mpi::caf_get_by_ref(): unknown kind in vector-ref.\n";
4530   const char unknownreftype[] =
4531     "libcaf_mpi::caf_get_by_ref(): unknown reference type.\n";
4532   const char unknownarrreftype[] =
4533     "libcaf_mpi::caf_get_by_ref(): unknown array reference type.\n";
4534   const char rankoutofrange[] =
4535     "libcaf_mpi::caf_get_by_ref(): rank out of range.\n";
4536   const char extentoutofrange[] =
4537     "libcaf_mpi::caf_get_by_ref(): extent out of range.\n";
4538   const char cannotallocdst[] =
4539     "libcaf_mpi::caf_get_by_ref(): can not allocate %d bytes of memory.\n";
4540   const char nonallocextentmismatch[] =
4541     "libcaf_mpi::caf_get_by_ref(): extent of non-allocatable arrays "
4542     "mismatch (%lu != %lu).\n";
4543   const char doublearrayref[] =
4544     "libcaf_mpi::caf_get_by_ref(): two or more array part references "
4545     "are not supported.\n";
4546   size_t size, i, ref_rank, dst_index, src_size;
4547   int ierr, dst_rank = GFC_DESCRIPTOR_RANK(dst), dst_cur_dim = 0;
4548   mpi_caf_token_t *mpi_token = (mpi_caf_token_t *) token;
4549   void *remote_memptr = mpi_token->memptr, *remote_base_memptr = NULL;
4550   gfc_max_dim_descriptor_t src_desc;
4551   gfc_descriptor_t *src = (gfc_descriptor_t *)&src_desc;
4552   caf_reference_t *riter = refs;
4553   long delta;
4554   ptrdiff_t data_offset = 0, desc_offset = 0;
4555   /* Reallocation of dst.data is needed (e.g., array to small). */
4556   bool realloc_needed;
4557   /* Reallocation of dst.data is required, because data is not alloced at
4558    * all. */
4559   bool realloc_required, extent_mismatch = false;
4560   /* Set when the first non-scalar array reference is encountered. */
4561   bool in_array_ref = false, array_extent_fixed = false;
4562   /* Set when remote data is to be accessed through the
4563    * global dynamic window. */
4564   bool access_data_through_global_win = false;
4565   /* Set when the remote descriptor is to accessed through the global window. */
4566   bool access_desc_through_global_win = false;
4567   caf_array_ref_t array_ref;
4568 
4569   realloc_needed = realloc_required = dst->base_addr == NULL;
4570 
4571   if (stat)
4572     *stat = 0;
4573 
4574   MPI_Group current_team_group, win_group;
4575   int global_dynamic_win_rank, memptr_win_rank;
4576   ierr = MPI_Comm_group(CAF_COMM_WORLD, &current_team_group); chk_err(ierr);
4577   ierr = MPI_Win_get_group(global_dynamic_win, &win_group); chk_err(ierr);
4578   ierr = MPI_Group_translate_ranks(current_team_group, 1,
4579                                    (int[]){image_index - 1}, win_group,
4580                                    &global_dynamic_win_rank); chk_err(ierr);
4581   ierr = MPI_Group_free(&win_group); chk_err(ierr);
4582   ierr = MPI_Win_get_group(mpi_token->memptr_win, &win_group); chk_err(ierr);
4583   ierr = MPI_Group_translate_ranks(current_team_group, 1,
4584                                    (int[]){image_index - 1}, win_group,
4585                                    &memptr_win_rank); chk_err(ierr);
4586   ierr = MPI_Group_free(&current_team_group); chk_err(ierr);
4587   ierr = MPI_Group_free(&win_group); chk_err(ierr);
4588 
4589   check_image_health(global_dynamic_win_rank, stat);
4590 
4591   dprint("Entering get_by_ref(may_require_tmp = %d).\n", may_require_tmp);
4592 
4593   /* Compute the size of the result.  In the beginning size just counts the
4594    * number of elements. */
4595   size = 1;
4596   /* Shared lock both windows to prevent bother in the sub-routines. */
4597   CAF_Win_lock(MPI_LOCK_SHARED, global_dynamic_win_rank, global_dynamic_win);
4598   CAF_Win_lock(MPI_LOCK_SHARED, memptr_win_rank, mpi_token->memptr_win);
4599   while (riter)
4600   {
4601     dprint("offset = %zd, remote_mem = %p, access_data(global_win) = %d\n",
4602            data_offset, remote_memptr, access_data_through_global_win);
4603     switch (riter->type)
4604     {
4605       case CAF_REF_COMPONENT:
4606         if (riter->u.c.caf_token_offset > 0)
4607         {
4608           if (access_data_through_global_win)
4609           {
4610             data_offset += riter->u.c.offset;
4611             remote_base_memptr = remote_memptr;
4612             ierr = MPI_Get(&remote_memptr, stdptr_size, MPI_BYTE, global_dynamic_win_rank,
4613                            MPI_Aint_add((MPI_Aint)remote_memptr, data_offset),
4614                            stdptr_size, MPI_BYTE, global_dynamic_win);
4615             chk_err(ierr);
4616             /* On the second indirection access also the remote descriptor
4617              * using the global window. */
4618             access_desc_through_global_win = true;
4619           }
4620           else
4621           {
4622             data_offset += riter->u.c.offset;
4623             ierr = MPI_Get(&remote_memptr, stdptr_size, MPI_BYTE, memptr_win_rank,
4624                            data_offset, stdptr_size, MPI_BYTE,
4625                            mpi_token->memptr_win); chk_err(ierr);
4626             dprint("get(custom_token %d), offset = %zd, res. remote_mem = %p\n",
4627                    mpi_token->memptr_win, data_offset, remote_memptr);
4628             /* All future access is through the global dynamic window. */
4629             access_data_through_global_win = true;
4630           }
4631           desc_offset = data_offset;
4632           data_offset = 0;
4633         }
4634         else
4635         {
4636           data_offset += riter->u.c.offset;
4637           desc_offset += riter->u.c.offset;
4638         }
4639         break;
4640       case CAF_REF_ARRAY:
4641         /* When there has been no CAF_REF_COMP before hand, then the
4642          * descriptor is stored in the token and the extends are the same on
4643          * all images, which is taken care of in the else part. */
4644         if (access_data_through_global_win)
4645         {
4646           for (ref_rank = 0; riter->u.a.mode[ref_rank] != CAF_ARR_REF_NONE;
4647                ++ref_rank) ;
4648           /* Get the remote descriptor and use the stack to store it. Note,
4649            * src may be pointing to mpi_token->desc therefore it needs to be
4650            * reset here. */
4651           src = (gfc_descriptor_t *)&src_desc;
4652           if (access_desc_through_global_win)
4653           {
4654             dprint("remote desc fetch from %p, offset = %zd\n",
4655                    remote_base_memptr, desc_offset);
4656             MPI_Get(src, sizeof_desc_for_rank(ref_rank), MPI_BYTE, global_dynamic_win_rank,
4657                     MPI_Aint_add((MPI_Aint)remote_base_memptr, desc_offset),
4658                     sizeof_desc_for_rank(ref_rank), MPI_BYTE,
4659                     global_dynamic_win);
4660           }
4661           else
4662           {
4663             dprint("remote desc fetch from win %d, offset = %zd\n",
4664                    mpi_token->memptr_win, desc_offset);
4665             MPI_Get(src, sizeof_desc_for_rank(ref_rank), MPI_BYTE, memptr_win_rank,
4666                     desc_offset, sizeof_desc_for_rank(ref_rank), MPI_BYTE,
4667                     mpi_token->memptr_win);
4668             access_desc_through_global_win = true;
4669           }
4670         }
4671         else
4672           src = mpi_token->desc;
4673 
4674 #ifdef EXTRA_DEBUG_OUTPUT
4675         dprint("remote desc rank: %zd (ref_rank: %zd)\n",
4676                GFC_DESCRIPTOR_RANK(src), ref_rank);
4677         for (i = 0; i < GFC_DESCRIPTOR_RANK(src); ++i)
4678         {
4679           dprint("remote desc dim[%zd] = (lb=%zd, ub=%zd, stride=%zd)\n",
4680                  i, src->dim[i].lower_bound, src->dim[i]._ubound,
4681                  src->dim[i]._stride);
4682         }
4683 #endif
4684         for (i = 0; riter->u.a.mode[i] != CAF_ARR_REF_NONE; ++i)
4685         {
4686           array_ref = riter->u.a.mode[i];
4687           switch (array_ref)
4688           {
4689             case CAF_ARR_REF_VECTOR:
4690               delta = riter->u.a.dim[i].v.nvec;
4691 #define KINDCASE(kind, type)                                            \
4692 case kind:                                                              \
4693   remote_memptr += (((ptrdiff_t)                                        \
4694     ((type *)riter->u.a.dim[i].v.vector)[0]) - src->dim[i].lower_bound) \
4695     * src->dim[i]._stride * riter->item_size;                           \
4696   break
4697 
4698               switch (riter->u.a.dim[i].v.kind)
4699               {
4700                 KINDCASE(1, int8_t);
4701                 KINDCASE(2, int16_t);
4702                 KINDCASE(4, int32_t);
4703                 KINDCASE(8, int64_t);
4704 #ifdef HAVE_GFC_INTEGER_16
4705                 KINDCASE(16, __int128);
4706 #endif
4707                 default:
4708                   caf_runtime_error(vecrefunknownkind, stat, NULL, 0);
4709                   return;
4710               }
4711 #undef KINDCASE
4712               break;
4713             case CAF_ARR_REF_FULL:
4714               COMPUTE_NUM_ITEMS(delta,
4715                                 riter->u.a.dim[i].s.stride,
4716                                 src->dim[i].lower_bound,
4717                                 src->dim[i]._ubound);
4718               /* The memptr stays unchanged when ref'ing the first element
4719                * in a dimension. */
4720               break;
4721             case CAF_ARR_REF_RANGE:
4722               COMPUTE_NUM_ITEMS(delta,
4723                                 riter->u.a.dim[i].s.stride,
4724                                 riter->u.a.dim[i].s.start,
4725                                 riter->u.a.dim[i].s.end);
4726               remote_memptr +=
4727                 (riter->u.a.dim[i].s.start - src->dim[i].lower_bound)
4728                 * src->dim[i]._stride * riter->item_size;
4729               break;
4730             case CAF_ARR_REF_SINGLE:
4731               delta = 1;
4732               remote_memptr +=
4733                 (riter->u.a.dim[i].s.start - src->dim[i].lower_bound)
4734                 * src->dim[i]._stride * riter->item_size;
4735               break;
4736             case CAF_ARR_REF_OPEN_END:
4737               COMPUTE_NUM_ITEMS(delta,
4738                                 riter->u.a.dim[i].s.stride,
4739                                 riter->u.a.dim[i].s.start,
4740                                 src->dim[i]._ubound);
4741               remote_memptr +=
4742                 (riter->u.a.dim[i].s.start - src->dim[i].lower_bound)
4743                 * src->dim[i]._stride * riter->item_size;
4744               break;
4745             case CAF_ARR_REF_OPEN_START:
4746               COMPUTE_NUM_ITEMS(delta,
4747                                 riter->u.a.dim[i].s.stride,
4748                                 src->dim[i].lower_bound,
4749                                 riter->u.a.dim[i].s.end);
4750               /* The memptr stays unchanged when ref'ing the first element
4751                * in a dimension. */
4752               break;
4753             default:
4754               caf_runtime_error(unknownarrreftype, stat, NULL, 0);
4755               return;
4756           }
4757           dprint("i = %zd, array_ref = %s, delta = %ld\n", i,
4758                  caf_array_ref_str[array_ref], delta);
4759           if (delta <= 0)
4760             return;
4761           /* Check the various properties of the destination array.
4762            * Is an array expected and present? */
4763           if (delta > 1 && dst_rank == 0)
4764           {
4765             /* No, an array is required, but not provided. */
4766             caf_runtime_error(extentoutofrange, stat, NULL, 0);
4767             return;
4768           }
4769           /* When dst is an array. */
4770           if (dst_rank > 0)
4771           {
4772             /* Check that dst_cur_dim is valid for dst. Can be superceeded
4773              * only by scalar data. */
4774             if (dst_cur_dim >= dst_rank && delta != 1)
4775             {
4776               caf_runtime_error(rankoutofrange, stat, NULL, 0);
4777               return;
4778             }
4779             /* Do further checks, when the source is not scalar. */
4780             else if (delta != 1 || realloc_required)
4781             {
4782               /* Check that the extent is not scalar and we are not in an array
4783                * ref for the dst side. */
4784               if (!in_array_ref)
4785               {
4786                 /* Check that this is the non-scalar extent. */
4787                 if (!array_extent_fixed)
4788                 {
4789                   /* In an array extent now. */
4790                   in_array_ref = true;
4791                   /* Check that we haven't skipped any scalar  dimensions yet
4792                    * and that the dst is compatible. */
4793                   if (i > 0 && dst_rank == GFC_DESCRIPTOR_RANK(src))
4794                   {
4795                     if (dst_reallocatable)
4796                     {
4797                       /* Dst is reallocatable, which means that the bounds are
4798                        * not set. Set them. */
4799                       for (dst_cur_dim = 0; dst_cur_dim < (int)i; ++dst_cur_dim)
4800                       {
4801                         dst->dim[dst_cur_dim].lower_bound = 1;
4802                         dst->dim[dst_cur_dim]._ubound = 1;
4803                         dst->dim[dst_cur_dim]._stride = 1;
4804                       }
4805                     }
4806                     else
4807                       dst_cur_dim = i;
4808                   }
4809                   /* Else press thumbs, that there are enough dimensional refs
4810                    * to come. Checked below. */
4811                 }
4812                 else
4813                 {
4814                   caf_runtime_error(doublearrayref, stat, NULL, 0);
4815                   return;
4816                 }
4817               }
4818               /* When the realloc is required, then no extent may have
4819                * been set. */
4820               extent_mismatch = realloc_required ||
4821                 GFC_DESCRIPTOR_EXTENT(dst, dst_cur_dim) != delta;
4822               /* When it already known, that a realloc is needed or the extent
4823                * does not match the needed one. */
4824               if (realloc_needed || extent_mismatch)
4825               {
4826                 /* Check whether dst is reallocatable. */
4827                 if (unlikely(!dst_reallocatable))
4828                 {
4829                   caf_runtime_error(nonallocextentmismatch, stat,
4830                                     NULL, 0, delta,
4831                                     GFC_DESCRIPTOR_EXTENT(dst, dst_cur_dim));
4832                   return;
4833                 }
4834                 /* Only report an error, when the extent needs to be modified,
4835                  * which is not allowed. */
4836                 else if (!dst_reallocatable && extent_mismatch)
4837                 {
4838                   caf_runtime_error(extentoutofrange, stat, NULL, 0);
4839                   return;
4840                 }
4841                 realloc_needed = true;
4842               }
4843               /* Only change the extent when it does not match.  This is to
4844                * prevent resetting given array bounds. */
4845               if (extent_mismatch)
4846               {
4847                 dst->dim[dst_cur_dim].lower_bound = 1;
4848                 dst->dim[dst_cur_dim]._ubound = delta;
4849                 dst->dim[dst_cur_dim]._stride = size;
4850                 if (realloc_required)
4851                   dst->offset = -1;
4852               }
4853             }
4854 
4855             /* Only increase the dim counter, when in an array ref */
4856             if (in_array_ref && dst_cur_dim < dst_rank)
4857             {
4858               /* Mode != CAF_ARR_REF_SINGLE(delta == 1), and no rank
4859                * reduction */
4860               if (!(delta == 1 && dst_rank != GFC_DESCRIPTOR_RANK(src)))
4861                 ++dst_cur_dim;
4862             }
4863           }
4864           size *= (ptrdiff_t)delta;
4865         }
4866         if (in_array_ref)
4867         {
4868           array_extent_fixed = true;
4869           in_array_ref = false;
4870         }
4871         break;
4872       case CAF_REF_STATIC_ARRAY:
4873         for (i = 0; riter->u.a.mode[i] != CAF_ARR_REF_NONE; ++i)
4874         {
4875           array_ref = riter->u.a.mode[i];
4876           switch (array_ref)
4877           {
4878             case CAF_ARR_REF_VECTOR:
4879               delta = riter->u.a.dim[i].v.nvec;
4880 #define KINDCASE(kind, type)                                    \
4881 case kind:                                                      \
4882   remote_memptr +=                                              \
4883     ((type *)riter->u.a.dim[i].v.vector)[0] * riter->item_size; \
4884   break
4885 
4886               switch (riter->u.a.dim[i].v.kind)
4887               {
4888                 KINDCASE(1, int8_t);
4889                 KINDCASE(2, int16_t);
4890                 KINDCASE(4, int32_t);
4891                 KINDCASE(8, int64_t);
4892 #ifdef HAVE_GFC_INTEGER_16
4893                 KINDCASE(16, __int128);
4894 #endif
4895                 default:
4896                   caf_runtime_error(vecrefunknownkind, stat, NULL, 0);
4897                   return;
4898               }
4899 #undef KINDCASE
4900               break;
4901             case CAF_ARR_REF_FULL:
4902               delta = riter->u.a.dim[i].s.end / riter->u.a.dim[i].s.stride
4903                       + 1;
4904               /* The memptr stays unchanged when ref'ing the first element in a
4905                * dimension. */
4906               break;
4907             case CAF_ARR_REF_RANGE:
4908               COMPUTE_NUM_ITEMS(delta,
4909                                 riter->u.a.dim[i].s.stride,
4910                                 riter->u.a.dim[i].s.start,
4911                                 riter->u.a.dim[i].s.end);
4912               remote_memptr += riter->u.a.dim[i].s.start
4913                                * riter->u.a.dim[i].s.stride
4914                                * riter->item_size;
4915               break;
4916             case CAF_ARR_REF_SINGLE:
4917               delta = 1;
4918               remote_memptr += riter->u.a.dim[i].s.start
4919                                * riter->u.a.dim[i].s.stride
4920                                * riter->item_size;
4921               break;
4922             case CAF_ARR_REF_OPEN_END:
4923               /* This and OPEN_START are mapped to a RANGE and therefore can
4924                * not occur here. */
4925             case CAF_ARR_REF_OPEN_START:
4926             default:
4927               caf_runtime_error(unknownarrreftype, stat, NULL, 0);
4928               return;
4929           }
4930           dprint("i = %zd, array_ref = %s, delta = %ld\n",
4931                  i, caf_array_ref_str[array_ref], delta);
4932           if (delta <= 0)
4933             return;
4934           /* Check the various properties of the destination array.
4935            * Is an array expected and present? */
4936           if (delta > 1 && dst_rank == 0)
4937           {
4938             /* No, an array is required, but not provided. */
4939             caf_runtime_error(extentoutofrange, stat, NULL, 0);
4940             return;
4941           }
4942           /* When dst is an array. */
4943           if (dst_rank > 0)
4944           {
4945             /* Check that dst_cur_dim is valid for dst. Can be superceeded
4946              * only by scalar data. */
4947             if (dst_cur_dim >= dst_rank && delta != 1)
4948             {
4949               caf_runtime_error(rankoutofrange, stat, NULL, 0);
4950               return;
4951             }
4952             /* Do further checks, when the source is not scalar. */
4953             else if (delta != 1 || realloc_required)
4954             {
4955               /* Check that the extent is not scalar and we are not in an array
4956                * ref for the dst side. */
4957               if (!in_array_ref)
4958               {
4959                 /* Check that this is the non-scalar extent. */
4960                 if (!array_extent_fixed)
4961                 {
4962                   /* In an array extent now. */
4963                   in_array_ref = true;
4964                   /* The dst is not reallocatable, so nothing more to do,
4965                    * then correct the dim counter. */
4966                   dst_cur_dim = i;
4967                 }
4968                 else
4969                 {
4970                   caf_runtime_error(doublearrayref, stat, NULL, 0);
4971                   return;
4972                 }
4973               }
4974               /* When the realloc is required, then no extent may have
4975                * been set. */
4976               extent_mismatch = realloc_required ||
4977                 GFC_DESCRIPTOR_EXTENT(dst, dst_cur_dim) != delta;
4978               /* When it is already known, that a realloc is needed or
4979                * the extent does not match the needed one. */
4980               if (realloc_needed || extent_mismatch)
4981               {
4982                 /* Check whether dst is reallocatable. */
4983                 if (unlikely(!dst_reallocatable))
4984                 {
4985                   caf_runtime_error(nonallocextentmismatch, stat,
4986                                     NULL, 0, delta,
4987                                     GFC_DESCRIPTOR_EXTENT(dst, dst_cur_dim));
4988                   return;
4989                 }
4990                 /* Only report an error, when the extent needs to be modified,
4991                  * which is not allowed. */
4992                 else if (!dst_reallocatable && extent_mismatch)
4993                 {
4994                   caf_runtime_error(extentoutofrange, stat, NULL, 0);
4995                   return;
4996                 }
4997                 realloc_needed = true;
4998               }
4999               /* Only change the extent when it does not match.  This is to
5000                * prevent resetting given array bounds. */
5001               if (extent_mismatch)
5002               {
5003                 dst->dim[dst_cur_dim].lower_bound = 1;
5004                 dst->dim[dst_cur_dim]._ubound = delta;
5005                 dst->dim[dst_cur_dim]._stride = size;
5006                 if (realloc_required)
5007                   dst->offset = -1;
5008               }
5009             }
5010             /* Only increase the dim counter, when in an array ref */
5011             if (in_array_ref && dst_cur_dim < dst_rank)
5012             {
5013               /* Mode != CAF_ARR_REF_SINGLE(delta == 1), and no rank
5014                * reduction */
5015               if (!(delta == 1 && dst_rank != GFC_DESCRIPTOR_RANK(src)))
5016                 ++dst_cur_dim;
5017             }
5018           }
5019           size *= (ptrdiff_t)delta;
5020         }
5021         if (in_array_ref)
5022         {
5023           array_extent_fixed = true;
5024           in_array_ref = false;
5025         }
5026         break;
5027       default:
5028         caf_runtime_error(unknownreftype, stat, NULL, 0);
5029         return;
5030     }
5031     src_size = riter->item_size;
5032     riter = riter->next;
5033   }
5034   if (size == 0 || src_size == 0)
5035     return;
5036   /* Postcondition:
5037    * - size contains the number of elements to store in the destination array,
5038    * - src_size gives the size in bytes of each item in the destination array.
5039    */
5040 
5041   if (realloc_needed)
5042   {
5043     if (!array_extent_fixed)
5044     {
5045       /* This can happen only, when the result is scalar. */
5046       for (dst_cur_dim = 0; dst_cur_dim < dst_rank; ++dst_cur_dim)
5047       {
5048         dst->dim[dst_cur_dim].lower_bound = 1;
5049         dst->dim[dst_cur_dim]._ubound = 1;
5050         dst->dim[dst_cur_dim]._stride = 1;
5051       }
5052     }
5053     dst->base_addr = malloc(size * GFC_DESCRIPTOR_SIZE(dst));
5054     if (unlikely(dst->base_addr == NULL))
5055     {
5056       caf_runtime_error(cannotallocdst, stat, size * GFC_DESCRIPTOR_SIZE(dst));
5057       return;
5058     }
5059   }
5060 
5061   /* Reset the token. */
5062   mpi_token = (mpi_caf_token_t *) token;
5063   remote_memptr = mpi_token->memptr;
5064   dst_index = 0;
5065 #ifdef EXTRA_DEBUG_OUTPUT
5066   dprint("dst_rank: %zd\n", dst_rank);
5067   for (i = 0; i < dst_rank; ++i)
5068   {
5069     dprint("dst_dim[%zd] = (%zd, %zd)\n",
5070            i, dst->dim[i].lower_bound, dst->dim[i]._ubound);
5071   }
5072 #endif
5073   i = 0;
5074   dprint("get_by_ref() calling get_for_ref.\n");
5075   get_for_ref(refs, &i, dst_index, mpi_token, dst, mpi_token->desc,
5076               dst->base_addr, remote_memptr, 0, 0, dst_kind, src_kind, 0, 0,
5077               1, stat, global_dynamic_win_rank, memptr_win_rank, false, false
5078 #ifdef GCC_GE_8
5079                , src_type
5080 #endif
5081                );
5082   CAF_Win_unlock(global_dynamic_win_rank, global_dynamic_win);
5083   CAF_Win_unlock(memptr_win_rank, mpi_token->memptr_win);
5084 }
5085 
5086 static void
5087 put_data(mpi_caf_token_t *token, MPI_Aint offset, void *sr, int dst_type,
5088          int src_type, int dst_kind, int src_kind, size_t dst_size,
5089          size_t src_size, size_t num, int *stat, int image_index)
5090 {
5091   size_t k;
5092   int ierr;
5093   MPI_Win win = (token == NULL) ? global_dynamic_win : token->memptr_win;
5094 #ifdef EXTRA_DEBUG_OUTPUT
5095   if (token)
5096     dprint("(win: %d, image: %d, offset: %zd) <- %p, "
5097            "num: %zd, size %zd -> %zd, dst type %d(%d), src type %d(%d)\n",
5098            win, image_index + 1, offset, sr, num, src_size, dst_size,
5099            dst_type, dst_kind, src_type, src_kind);
5100   else
5101     dprint("(global_win: %x, image: %d, offset: %zd (%zd)) <- %p, "
5102            "num: %zd, size %zd -> %zd, dst type %d(%d), src type %d(%d)\n",
5103            win, image_index + 1, offset, offset, sr, num, src_size,
5104            dst_size, dst_type, dst_kind, src_type, src_kind);
5105 #endif
5106   if (dst_type == src_type && dst_kind == src_kind)
5107   {
5108     size_t sz = (dst_size > src_size ? src_size : dst_size) * num;
5109     ierr = MPI_Put(sr, sz, MPI_BYTE, image_index, offset, sz, MPI_BYTE, win);
5110     chk_err(ierr);
5111     dprint("sr[] = %d, num = %zd, num bytes = %zd\n",
5112            (int)((char*)sr)[0], num, sz);
5113     if ((dst_type == BT_CHARACTER || src_type == BT_CHARACTER)
5114         && dst_size > src_size)
5115     {
5116       const size_t trans_size = dst_size / dst_kind - src_size / src_kind;
5117       void *pad = alloca(trans_size * dst_kind);
5118       if (dst_kind == 1)
5119       {
5120         memset((void*)(char*) pad, ' ', trans_size);
5121       }
5122       else /* dst_kind == 4. */
5123       {
5124         for (k = 0; k < trans_size; ++k)
5125         {
5126           ((int32_t*) pad)[k] = (int32_t) ' ';
5127         }
5128       }
5129       ierr = MPI_Put(pad, trans_size * dst_kind, MPI_BYTE, image_index,
5130                      offset + (src_size / src_kind) * dst_kind,
5131                      trans_size * dst_kind, MPI_BYTE, win); chk_err(ierr);
5132     }
5133   }
5134   else if (dst_type == BT_CHARACTER && dst_kind == 1)
5135   {
5136     /* Get the required amount of memory on the stack. */
5137     void *dsh = alloca(dst_size);
5138     assign_char1_from_char4(dst_size, src_size, dsh, sr);
5139     ierr = MPI_Put(dsh, dst_size, MPI_BYTE, image_index, offset, dst_size,
5140                    MPI_BYTE, win); chk_err(ierr);
5141   }
5142   else if (dst_type == BT_CHARACTER)
5143   {
5144     /* Get the required amount of memory on the stack. */
5145     void *dsh = alloca(dst_size);
5146     assign_char4_from_char1(dst_size, src_size, dsh, sr);
5147     ierr = MPI_Put(dsh, dst_size, MPI_BYTE, image_index, offset, dst_size,
5148                    MPI_BYTE, win); chk_err(ierr);
5149   }
5150   else
5151   {
5152     /* Get the required amount of memory on the stack. */
5153     void *dsh = alloca(dst_size * num), *dsh_iter = dsh;
5154     dprint("type/kind convert %zd items: "
5155            "type %d(%d) -> type %d(%d), local buffer: %p\n",
5156            num, src_type, src_kind, dst_type, dst_kind, dsh);
5157     for (k = 0; k < num; ++k)
5158     {
5159       convert_type(dsh_iter, dst_type, dst_kind, sr, src_type, src_kind, stat);
5160       dsh_iter += dst_size;
5161       sr += src_size;
5162     }
5163     // dprint("dsh[0] = %d\n", ((int *)dsh)[0]);
5164     ierr = MPI_Put(dsh, dst_size * num, MPI_BYTE, image_index, offset,
5165                    dst_size * num, MPI_BYTE, win); chk_err(ierr);
5166   }
5167   ierr = MPI_Win_flush(image_index, win); chk_err(ierr);
5168 }
5169 
5170 
5171 static void
5172 send_for_ref(caf_reference_t *ref, size_t *i, size_t src_index,
5173              mpi_caf_token_t *mpi_token, gfc_descriptor_t *dst,
5174              gfc_descriptor_t *src, void *ds, void *sr,
5175              ptrdiff_t dst_byte_offset, ptrdiff_t desc_byte_offset,
5176              int dst_kind, int src_kind, size_t dst_dim, size_t src_dim,
5177              size_t num, int *stat, int global_dynamic_win_rank, int memptr_win_rank,
5178              bool ds_global, /* access ds through global_dynamic_win */
5179              bool desc_global /* access desc through global_dynamic_win */
5180 #ifdef GCC_GE_8
5181              , int dst_type)
5182 {
5183 #else
5184              )
5185 {
5186   int dst_type = -1;
5187 #endif
5188   ptrdiff_t extent_dst = 1, array_offset_dst = 0, dst_stride, src_stride;
5189   size_t next_dst_dim, ref_rank;
5190   gfc_max_dim_descriptor_t dst_desc_data;
5191   caf_ref_type_t ref_type = ref->type;
5192   caf_array_ref_t array_ref_src = ref->u.a.mode[src_dim];
5193   int ierr;
5194 
5195   if (unlikely(ref == NULL))
5196   {
5197     /* May be we should issue an error here, because this case should not
5198      * occur. */
5199     return;
5200   }
5201 
5202   dprint("Entering send_for_ref: [i = %zd] src_index = %zd, "
5203          "dst_offset = %zd, desc_offset = %zd, ds_glb = %d, desc_glb = %d\n",
5204          *i, src_index, dst_byte_offset, desc_byte_offset,
5205          ds_global, desc_global);
5206 
5207   if (ref->next == NULL)
5208   {
5209     size_t src_size = GFC_DESCRIPTOR_SIZE(src);
5210     dprint("[next == NULL]: src_size = %zd, ref_type = %s\n",
5211            src_size, caf_ref_type_str[ref_type]);
5212 
5213     switch (ref_type)
5214     {
5215       case CAF_REF_COMPONENT:
5216         dst_byte_offset += ref->u.c.offset;
5217         if (ref->u.c.caf_token_offset > 0)
5218         {
5219           if (ds_global)
5220           {
5221             ierr = MPI_Get(&ds, stdptr_size, MPI_BYTE, global_dynamic_win_rank,
5222                            MPI_Aint_add((MPI_Aint)ds, dst_byte_offset),
5223                            stdptr_size, MPI_BYTE, global_dynamic_win);
5224             chk_err(ierr);
5225             desc_global = true;
5226           }
5227           else
5228           {
5229             ierr = MPI_Get(&ds, stdptr_size, MPI_BYTE, memptr_win_rank,
5230                            dst_byte_offset, stdptr_size, MPI_BYTE,
5231                            mpi_token->memptr_win); chk_err(ierr);
5232             ds_global = true;
5233           }
5234           dst_byte_offset = 0;
5235         }
5236 
5237         if (ds_global)
5238         {
5239           put_data(NULL, MPI_Aint_add((MPI_Aint)ds, dst_byte_offset), sr,
5240 #ifdef GCC_GE_8
5241                    dst_type,
5242 #else
5243                    GFC_DESCRIPTOR_TYPE(src),
5244 #endif
5245                    GFC_DESCRIPTOR_TYPE(src), dst_kind, src_kind,
5246                    ref->item_size, src_size, 1, stat, global_dynamic_win_rank);
5247         }
5248         else
5249         {
5250           put_data(mpi_token, dst_byte_offset, sr,
5251 #ifdef GCC_GE_8
5252                    dst_type,
5253 #else
5254                    GFC_DESCRIPTOR_TYPE(dst),
5255 #endif
5256                    GFC_DESCRIPTOR_TYPE(src), dst_kind, src_kind,
5257                    ref->item_size, src_size, 1, stat, memptr_win_rank);
5258         }
5259         ++(*i);
5260         return;
5261       case CAF_REF_STATIC_ARRAY:
5262         dst_type = ref->u.a.static_array_type;
5263         /* Intentionally fall through. */
5264       case CAF_REF_ARRAY:
5265         if (array_ref_src == CAF_ARR_REF_NONE)
5266         {
5267           if (ds_global)
5268           {
5269             put_data(NULL, MPI_Aint_add((MPI_Aint)ds, dst_byte_offset),
5270                      sr + src_index * src_size,
5271 #ifdef GCC_GE_8
5272                      dst_type, GFC_DESCRIPTOR_TYPE(src),
5273 #else
5274                      GFC_DESCRIPTOR_TYPE(dst),
5275                      (dst_type == -1) ? GFC_DESCRIPTOR_TYPE(src) : dst_type,
5276 #endif
5277                      dst_kind, src_kind, ref->item_size, src_size, num,
5278                      stat, global_dynamic_win_rank);
5279           }
5280           else
5281           {
5282             put_data(mpi_token, dst_byte_offset, sr + src_index * src_size,
5283 #ifdef GCC_GE_8
5284                      dst_type, GFC_DESCRIPTOR_TYPE(src),
5285 #else
5286                      GFC_DESCRIPTOR_TYPE(dst),
5287                      (dst_type == -1) ? GFC_DESCRIPTOR_TYPE(src) : dst_type,
5288 #endif
5289                      dst_kind, src_kind, ref->item_size, src_size, num,
5290                      stat, memptr_win_rank);
5291           }
5292           *i += num;
5293           return;
5294         }
5295         break;
5296       default:
5297         caf_runtime_error(unreachable);
5298     }
5299   }
5300   caf_array_ref_t array_ref_dst = ref->u.a.mode[dst_dim];
5301 
5302 #if 0
5303   dprint("image_index = %d, num = %zd, src_dim = %zd, dst_dim = %zd, "
5304          "ref_type = %s, array_ref_src = %s\n",
5305          image_index, num, src_dim, dst_dim,
5306          caf_ref_type_str[ref_type],
5307          caf_array_ref_str[array_ref_src]);
5308 #endif
5309 
5310   switch (ref_type)
5311   {
5312     case CAF_REF_COMPONENT:
5313       if (ref->u.c.caf_token_offset > 0)
5314       {
5315         dst_byte_offset += ref->u.c.offset;
5316         desc_byte_offset = dst_byte_offset;
5317         if (ds_global)
5318         {
5319           ierr = MPI_Get(&ds, stdptr_size, MPI_BYTE, global_dynamic_win_rank,
5320                          MPI_Aint_add((MPI_Aint)ds, dst_byte_offset),
5321                          stdptr_size, MPI_BYTE, global_dynamic_win);
5322           chk_err(ierr);
5323           desc_global = true;
5324         }
5325         else
5326         {
5327           ierr = MPI_Get(&ds, stdptr_size, MPI_BYTE, memptr_win_rank,
5328                          dst_byte_offset, stdptr_size, MPI_BYTE,
5329                          mpi_token->memptr_win); chk_err(ierr);
5330           ds_global = true;
5331         }
5332         dst_byte_offset = 0;
5333       }
5334       else
5335       {
5336         dst_byte_offset += ref->u.c.offset;
5337         desc_byte_offset += ref->u.c.offset;
5338       }
5339       send_for_ref(ref->next, i, src_index, mpi_token, dst, src, ds,
5340                    sr, dst_byte_offset, desc_byte_offset, dst_kind, src_kind,
5341                    dst_dim, 0, 1, stat, global_dynamic_win_rank, memptr_win_rank,
5342                    ds_global, desc_global
5343 #ifdef GCC_GE_8
5344                    , dst_type
5345 #endif
5346                    );
5347       return;
5348     case CAF_REF_ARRAY:
5349       if (array_ref_src == CAF_ARR_REF_NONE)
5350       {
5351         send_for_ref(ref->next, i, src_index, mpi_token, dst, src, ds, sr,
5352                      dst_byte_offset, desc_byte_offset, dst_kind, src_kind,
5353                      dst_dim, 0, 1, stat, global_dynamic_win_rank, memptr_win_rank,
5354                      ds_global, desc_global
5355 #ifdef GCC_GE_8
5356                      , dst_type
5357 #endif
5358                     );
5359         return;
5360       }
5361       /* Only when on the left most index switch the data pointer to
5362        * the array's data pointer. */
5363       if (src_dim == 0)
5364       {
5365         if (ds_global)
5366         {
5367           for (ref_rank = 0; ref->u.a.mode[ref_rank] != CAF_ARR_REF_NONE;
5368                ++ref_rank) ;
5369           /* Get the remote descriptor. */
5370           if (desc_global)
5371           {
5372             ierr = MPI_Get(&dst_desc_data, sizeof_desc_for_rank(ref_rank),
5373                            MPI_BYTE, global_dynamic_win_rank,
5374                            MPI_Aint_add((MPI_Aint)ds, desc_byte_offset),
5375                            sizeof_desc_for_rank(ref_rank), MPI_BYTE,
5376                            global_dynamic_win); chk_err(ierr);
5377           }
5378           else
5379           {
5380             ierr = MPI_Get(&dst_desc_data, sizeof_desc_for_rank(ref_rank),
5381                            MPI_BYTE, memptr_win_rank, desc_byte_offset,
5382                            sizeof_desc_for_rank(ref_rank),
5383                            MPI_BYTE, mpi_token->memptr_win); chk_err(ierr);
5384             desc_global = true;
5385           }
5386           dst = (gfc_descriptor_t *)&dst_desc_data;
5387         }
5388         else
5389         {
5390           dst = mpi_token->desc;
5391         }
5392         dst_byte_offset = 0;
5393         desc_byte_offset = 0;
5394 #ifdef EXTRA_DEBUG_OUTPUT
5395         dprint("remote desc rank: %zd (ref_rank: %zd)\n",
5396                GFC_DESCRIPTOR_RANK(src), ref_rank);
5397         for (int r = 0; r < GFC_DESCRIPTOR_RANK(src); ++r)
5398         {
5399           dprint("remote desc dim[%d] = (lb=%zd, ub=%zd, stride=%zd)\n",
5400                  r, src->dim[r].lower_bound, src->dim[r]._ubound,
5401                  src->dim[r]._stride);
5402         }
5403 #endif
5404       }
5405       dprint("array_ref_dst[%zd] = %s := array_ref_src[%zd] = %s",
5406              dst_dim, caf_array_ref_str[array_ref_dst],
5407              src_dim, caf_array_ref_str[array_ref_src]);
5408       switch (array_ref_dst)
5409       {
5410         case CAF_ARR_REF_VECTOR:
5411           extent_dst = GFC_DESCRIPTOR_EXTENT(dst, dst_dim);
5412           array_offset_dst = 0;
5413           for (size_t idx = 0; idx < ref->u.a.dim[dst_dim].v.nvec; ++idx)
5414           {
5415 #define KINDCASE(kind, type)                        \
5416 case kind:                                          \
5417   array_offset_dst = (((ptrdiff_t)                  \
5418     ((type *)ref->u.a.dim[dst_dim].v.vector)[idx])  \
5419     - dst->dim[dst_dim].lower_bound                 \
5420     * dst->dim[dst_dim]._stride);                   \
5421   break
5422 
5423             switch (ref->u.a.dim[dst_dim].v.kind)
5424             {
5425               KINDCASE(1, int8_t);
5426               KINDCASE(2, int16_t);
5427               KINDCASE(4, int32_t);
5428               KINDCASE(8, int64_t);
5429 #ifdef HAVE_GFC_INTEGER_16
5430               KINDCASE(16, __int128);
5431 #endif
5432               default:
5433                 caf_runtime_error(unreachable);
5434                 return;
5435             }
5436 #undef KINDCASE
5437 
5438             dprint("vector-index computed to: %zd\n", array_offset_dst);
5439             send_for_ref(ref, i, src_index, mpi_token, dst, src, ds, sr,
5440                          dst_byte_offset + array_offset_dst * ref->item_size,
5441                          desc_byte_offset + array_offset_dst * ref->item_size,
5442                          dst_kind, src_kind, dst_dim + 1, src_dim + 1,
5443                          1, stat, global_dynamic_win_rank, memptr_win_rank,
5444                          ds_global, desc_global
5445 #ifdef GCC_GE_8
5446                          , dst_type
5447 #endif
5448                          );
5449             src_index += dst->dim[dst_dim]._stride;
5450           }
5451           return;
5452         case CAF_ARR_REF_FULL:
5453           COMPUTE_NUM_ITEMS(extent_dst,
5454                             ref->u.a.dim[dst_dim].s.stride,
5455                             dst->dim[dst_dim].lower_bound,
5456                             dst->dim[dst_dim]._ubound);
5457           dst_stride = dst->dim[dst_dim]._stride
5458                        * ref->u.a.dim[dst_dim].s.stride;
5459           array_offset_dst = 0;
5460           src_stride = (GFC_DESCRIPTOR_RANK(src) > 0) ?
5461             src->dim[src_dim]._stride : 0;
5462           dprint("CAF_ARR_REF_FULL: src_stride = %zd, dst_stride = %zd\n",
5463                  src_stride, dst_stride);
5464           for (ptrdiff_t idx = 0; idx < extent_dst;
5465                ++idx, array_offset_dst += dst_stride)
5466           {
5467             send_for_ref(ref, i, src_index, mpi_token, dst, src, ds, sr,
5468                          dst_byte_offset + array_offset_dst * ref->item_size,
5469                          desc_byte_offset + array_offset_dst * ref->item_size,
5470                          dst_kind, src_kind, dst_dim + 1, src_dim + 1,
5471                          1, stat, global_dynamic_win_rank, memptr_win_rank,
5472                          ds_global, desc_global
5473 #ifdef GCC_GE_8
5474                          , dst_type
5475 #endif
5476                         );
5477             src_index += src_stride;
5478           }
5479           // dprint("CAF_ARR_REF_FULL: return, i = %zd\n", *i);
5480           return;
5481 
5482         case CAF_ARR_REF_RANGE:
5483           COMPUTE_NUM_ITEMS(extent_dst,
5484                             ref->u.a.dim[dst_dim].s.stride,
5485                             ref->u.a.dim[dst_dim].s.start,
5486                             ref->u.a.dim[dst_dim].s.end);
5487           array_offset_dst =
5488             (ref->u.a.dim[dst_dim].s.start - dst->dim[dst_dim].lower_bound)
5489             * dst->dim[dst_dim]._stride;
5490           dst_stride = dst->dim[dst_dim]._stride
5491                        * ref->u.a.dim[dst_dim].s.stride;
5492           src_stride = (GFC_DESCRIPTOR_RANK(src) > 0) ?
5493             src->dim[src_dim]._stride : 0;
5494           /* Increase the dst_dim only, when the src_extent is greater than one
5495            * or src and dst extent are both one. Don't increase when the
5496            * scalar source is not present in the dst. */
5497           next_dst_dim = (
5498             (extent_dst > 1) ||
5499             (GFC_DESCRIPTOR_EXTENT(src, src_dim) == 1 && extent_dst == 1)
5500           ) ? (dst_dim + 1) : dst_dim;
5501           for (ptrdiff_t idx = 0; idx < extent_dst; ++idx)
5502           {
5503             send_for_ref(ref, i, src_index, mpi_token, dst, src, ds, sr,
5504                          dst_byte_offset + array_offset_dst * ref->item_size,
5505                          desc_byte_offset + array_offset_dst * ref->item_size,
5506                          dst_kind, src_kind, next_dst_dim, src_dim + 1,
5507                          1, stat, global_dynamic_win_rank, memptr_win_rank,
5508                          ds_global, desc_global
5509 #ifdef GCC_GE_8
5510                          , dst_type
5511 #endif
5512                          );
5513             src_index += src_stride;
5514             array_offset_dst += dst_stride;
5515           }
5516           // dprint("CAF_ARR_REF_RANGE: return, i = %zd\n", *i);
5517           return;
5518 
5519         case CAF_ARR_REF_SINGLE:
5520           array_offset_dst =
5521             (ref->u.a.dim[dst_dim].s.start - dst->dim[dst_dim].lower_bound)
5522             * dst->dim[dst_dim]._stride;
5523           // FIXME: issue #552
5524           // next_dst_dim = (
5525           //   (extent_dst > 1) ||
5526           //   (GFC_DESCRIPTOR_EXTENT(src, src_dim) == 1 && extent_dst == 1)
5527           // ) ? (dst_dim + 1) : dst_dim;
5528           next_dst_dim = dst_dim;
5529           send_for_ref(ref, i, src_index, mpi_token, dst, src, ds, sr,
5530                        dst_byte_offset + array_offset_dst * ref->item_size,
5531                        desc_byte_offset + array_offset_dst * ref->item_size,
5532                        dst_kind, src_kind, next_dst_dim, src_dim + 1,
5533                        1, stat, global_dynamic_win_rank, memptr_win_rank,
5534                        ds_global, desc_global
5535 #ifdef GCC_GE_8
5536                        , dst_type
5537 #endif
5538                        );
5539 
5540           // dprint("CAF_ARR_REF_SINGLE: return, i = %zd\n", *i);
5541           return;
5542         case CAF_ARR_REF_OPEN_END:
5543           COMPUTE_NUM_ITEMS(extent_dst,
5544                             ref->u.a.dim[dst_dim].s.stride,
5545                             ref->u.a.dim[dst_dim].s.start,
5546                             dst->dim[dst_dim]._ubound);
5547           dst_stride = dst->dim[dst_dim]._stride
5548                        * ref->u.a.dim[dst_dim].s.stride;
5549           src_stride = (GFC_DESCRIPTOR_RANK(src) > 0) ?
5550             src->dim[src_dim]._stride : 0;
5551           array_offset_dst =
5552             (ref->u.a.dim[dst_dim].s.start - dst->dim[dst_dim].lower_bound)
5553             * dst->dim[dst_dim]._stride;
5554           for (ptrdiff_t idx = 0; idx < extent_dst; ++idx)
5555           {
5556             send_for_ref(ref, i, src_index, mpi_token, dst, src, ds, sr,
5557                          dst_byte_offset + array_offset_dst * ref->item_size,
5558                          desc_byte_offset + array_offset_dst * ref->item_size,
5559                          dst_kind, src_kind, dst_dim + 1, src_dim + 1,
5560                          1, stat, global_dynamic_win_rank, memptr_win_rank,
5561                          ds_global, desc_global
5562 #ifdef GCC_GE_8
5563                          , dst_type
5564 #endif
5565                          );
5566             src_index += src_stride;
5567             array_offset_dst += dst_stride;
5568           }
5569           return;
5570         case CAF_ARR_REF_OPEN_START:
5571           COMPUTE_NUM_ITEMS(extent_dst,
5572                             ref->u.a.dim[dst_dim].s.stride,
5573                             dst->dim[dst_dim].lower_bound,
5574                             ref->u.a.dim[dst_dim].s.end);
5575           dst_stride =
5576             dst->dim[dst_dim]._stride * ref->u.a.dim[dst_dim].s.stride;
5577           src_stride = (GFC_DESCRIPTOR_RANK(src) > 0) ?
5578             src->dim[src_dim]._stride : 0;
5579           array_offset_dst = 0;
5580           for (ptrdiff_t idx = 0; idx < extent_dst; ++idx)
5581           {
5582             send_for_ref(ref, i, src_index, mpi_token, dst, src, ds, sr,
5583                          dst_byte_offset + array_offset_dst * ref->item_size,
5584                          desc_byte_offset + array_offset_dst * ref->item_size,
5585                          dst_kind, src_kind, dst_dim + 1, src_dim + 1,
5586                          1, stat, global_dynamic_win_rank, memptr_win_rank,
5587                          ds_global, desc_global
5588 #ifdef GCC_GE_8
5589                          , dst_type
5590 #endif
5591                          );
5592             src_index += src_stride;
5593             array_offset_dst += dst_stride;
5594           }
5595           return;
5596         default:
5597           caf_runtime_error(unreachable);
5598       }
5599       return;
5600     case CAF_REF_STATIC_ARRAY:
5601       if (array_ref_dst == CAF_ARR_REF_NONE)
5602       {
5603         send_for_ref(ref->next, i, src_index, mpi_token, dst, NULL, ds, sr,
5604                      dst_byte_offset, desc_byte_offset, dst_kind, src_kind,
5605                      dst_dim, 0, 1, stat, global_dynamic_win_rank, memptr_win_rank,
5606                      ds_global, desc_global
5607 #ifdef GCC_GE_8
5608                      , dst_type
5609 #endif
5610                      );
5611         return;
5612       }
5613       switch (array_ref_dst)
5614       {
5615         case CAF_ARR_REF_VECTOR:
5616           array_offset_dst = 0;
5617           for (size_t idx = 0; idx < ref->u.a.dim[dst_dim].v.nvec; ++idx)
5618           {
5619 #define KINDCASE(kind, type)                                        \
5620 case kind:                                                          \
5621   array_offset_dst = ((type *)ref->u.a.dim[dst_dim].v.vector)[idx]; \
5622   break
5623 
5624             switch (ref->u.a.dim[dst_dim].v.kind)
5625             {
5626               KINDCASE(1, int8_t);
5627               KINDCASE(2, int16_t);
5628               KINDCASE(4, int32_t);
5629               KINDCASE(8, int64_t);
5630 #ifdef HAVE_GFC_INTEGER_16
5631               KINDCASE(16, __int128);
5632 #endif
5633               default:
5634                 caf_runtime_error(unreachable);
5635                 return;
5636             }
5637 #undef KINDCASE
5638 
5639             send_for_ref(ref, i, src_index, mpi_token, dst, NULL, ds, sr,
5640                          dst_byte_offset + array_offset_dst * ref->item_size,
5641                          desc_byte_offset + array_offset_dst * ref->item_size,
5642                          dst_kind, src_kind, dst_dim + 1, src_dim + 1,
5643                          1, stat, global_dynamic_win_rank, memptr_win_rank,
5644                          ds_global, desc_global
5645 #ifdef GCC_GE_8
5646                          , dst_type
5647 #endif
5648                          );
5649             src_index += src->dim[src_dim]._stride;
5650           }
5651           return;
5652         case CAF_ARR_REF_FULL:
5653           src_stride = (GFC_DESCRIPTOR_RANK(src) > 0) ?
5654             src->dim[src_dim]._stride : 0;
5655           for (array_offset_dst = 0 ;
5656                array_offset_dst <= ref->u.a.dim[dst_dim].s.end;
5657                array_offset_dst += ref->u.a.dim[dst_dim].s.stride)
5658           {
5659             send_for_ref(ref, i, src_index, mpi_token, dst, NULL, ds, sr,
5660                          dst_byte_offset + array_offset_dst * ref->item_size,
5661                          desc_byte_offset + array_offset_dst * ref->item_size,
5662                          dst_kind, src_kind, dst_dim + 1, src_dim + 1,
5663                          1, stat, global_dynamic_win_rank, memptr_win_rank,
5664                          ds_global, desc_global
5665 #ifdef GCC_GE_8
5666                          , dst_type
5667 #endif
5668                          );
5669             src_index += src_stride;
5670           }
5671           return;
5672         case CAF_ARR_REF_RANGE:
5673           COMPUTE_NUM_ITEMS(extent_dst,
5674                             ref->u.a.dim[dst_dim].s.stride,
5675                             ref->u.a.dim[dst_dim].s.start,
5676                             ref->u.a.dim[dst_dim].s.end);
5677           src_stride = (GFC_DESCRIPTOR_RANK (src) > 0) ?
5678             src->dim[src_dim]._stride : 0;
5679           array_offset_dst = ref->u.a.dim[dst_dim].s.start;
5680           for (ptrdiff_t idx = 0; idx < extent_dst; ++idx)
5681           {
5682             send_for_ref(ref, i, src_index, mpi_token, dst, NULL, ds, sr,
5683                          dst_byte_offset + array_offset_dst * ref->item_size,
5684                          desc_byte_offset + array_offset_dst * ref->item_size,
5685                          dst_kind, src_kind, dst_dim + 1, src_dim + 1,
5686                          1, stat, global_dynamic_win_rank, memptr_win_rank,
5687                          ds_global, desc_global
5688 #ifdef GCC_GE_8
5689                          , dst_type
5690 #endif
5691                          );
5692             src_index += src_stride;
5693             array_offset_dst += ref->u.a.dim[dst_dim].s.stride;
5694           }
5695           return;
5696         case CAF_ARR_REF_SINGLE:
5697           array_offset_dst = ref->u.a.dim[dst_dim].s.start;
5698           send_for_ref(ref, i, src_index, mpi_token, dst, NULL, ds, sr,
5699                        dst_byte_offset + array_offset_dst * ref->item_size,
5700                        desc_byte_offset + array_offset_dst * ref->item_size,
5701                        dst_kind, src_kind, dst_dim, src_dim + 1,
5702                        1, stat, global_dynamic_win_rank, memptr_win_rank,
5703                        ds_global, desc_global
5704 #ifdef GCC_GE_8
5705                        , dst_type
5706 #endif
5707                        );
5708           return;
5709           /* The OPEN_* are mapped to a RANGE and therefore can not occur. */
5710         case CAF_ARR_REF_OPEN_END:
5711         case CAF_ARR_REF_OPEN_START:
5712         default:
5713           caf_runtime_error(unreachable);
5714       }
5715       return;
5716     default:
5717       caf_runtime_error(unreachable);
5718   }
5719 }
5720 
5721 
5722 void
5723 PREFIX(send_by_ref) (caf_token_t token, int image_index,
5724                      gfc_descriptor_t *src, caf_reference_t *refs,
5725                      int dst_kind, int src_kind, bool may_require_tmp,
5726                      bool dst_reallocatable, int *stat
5727 #ifdef GCC_GE_8
5728                      , int dst_type
5729 #endif
5730                      )
5731 {
5732   const char vecrefunknownkind[] =
5733     "libcaf_mpi::caf_send_by_ref(): unknown kind in vector-ref.\n";
5734   const char unknownreftype[] =
5735     "libcaf_mpi::caf_send_by_ref(): unknown reference type.\n";
5736   const char unknownarrreftype[] =
5737     "libcaf_mpi::caf_send_by_ref(): unknown array reference type.\n";
5738   const char rankoutofrange[] =
5739     "libcaf_mpi::caf_send_by_ref(): rank out of range.\n";
5740   const char extentoutofrange[] =
5741     "libcaf_mpi::caf_send_by_ref(): extent out of range.\n";
5742   const char cannotallocdst[] =
5743     "libcaf_mpi::caf_send_by_ref(): can not allocate %d bytes of memory.\n";
5744   const char unabletoallocdst[] =
5745     "libcaf_mpi::caf_send_by_ref(): "
5746     "unable to allocate memory on remote image.\n";
5747   const char nonallocextentmismatch[] =
5748     "libcaf_mpi::caf_send_by_ref(): "
5749     "extent of non-allocatable arrays mismatch (%lu != %lu).\n";
5750 
5751   size_t size, i, ref_rank = 0, src_index, dst_size;
5752   int dst_rank = -1, src_cur_dim = 0, ierr;
5753   mpi_caf_token_t *mpi_token = (mpi_caf_token_t *) token;
5754   void *remote_memptr = mpi_token->memptr, *remote_base_memptr = NULL;
5755   gfc_max_dim_descriptor_t dst_desc, temp_src;
5756   gfc_descriptor_t *dst = (gfc_descriptor_t *)&dst_desc;
5757   caf_reference_t *riter = refs;
5758   long delta;
5759   ptrdiff_t data_offset = 0, desc_offset = 0;
5760   /* Reallocation of data on remote is needed (e.g., array to small).  This is
5761    * used for error tracking only.  It is not (yet) possible to allocate memory
5762    * on the remote image. */
5763   bool realloc_dst = false, extent_mismatch = false;
5764   /* Set when the first non-scalar array reference is encountered. */
5765   bool in_array_ref = false;
5766   /* Set when remote data is to be accessed through the
5767    * global dynamic window. */
5768   bool access_data_through_global_win = false;
5769   /* Set when the remote descriptor is to accessed through the global window. */
5770   bool access_desc_through_global_win = false;
5771   bool free_temp_src = false;
5772   caf_array_ref_t array_ref;
5773 
5774   if (stat)
5775     *stat = 0;
5776 
5777   MPI_Group current_team_group, win_group;
5778   int global_dynamic_win_rank, memptr_win_rank;
5779   ierr = MPI_Comm_group(CAF_COMM_WORLD, &current_team_group); chk_err(ierr);
5780   ierr = MPI_Win_get_group(global_dynamic_win, &win_group); chk_err(ierr);
5781   ierr = MPI_Group_translate_ranks(current_team_group, 1,
5782                                    (int[]){image_index - 1}, win_group,
5783                                    &global_dynamic_win_rank); chk_err(ierr);
5784   ierr = MPI_Group_free(&win_group); chk_err(ierr);
5785   ierr = MPI_Win_get_group(mpi_token->memptr_win, &win_group); chk_err(ierr);
5786   ierr = MPI_Group_translate_ranks(current_team_group, 1,
5787                                    (int[]){image_index - 1}, win_group,
5788                                    &memptr_win_rank); chk_err(ierr);
5789   ierr = MPI_Group_free(&current_team_group); chk_err(ierr);
5790   ierr = MPI_Group_free(&win_group); chk_err(ierr);
5791 
5792   check_image_health(global_dynamic_win_rank, stat);
5793 
5794 #ifdef GCC_GE_8
5795   dprint("Entering send_by_ref(may_require_tmp = %d, dst_type = %d)\n",
5796          may_require_tmp, dst_type);
5797 #else
5798   dprint("Entering send_by_ref(may_require_tmp = %d)\n", may_require_tmp);
5799 #endif
5800 
5801   /* Compute the size of the result.  In the beginning size just counts the
5802    * number of elements. */
5803   size = 1;
5804   /* Shared lock both windows to prevent bother in the sub-routines. */
5805   CAF_Win_lock(MPI_LOCK_SHARED, global_dynamic_win_rank, global_dynamic_win);
5806   CAF_Win_lock(MPI_LOCK_SHARED, memptr_win_rank, mpi_token->memptr_win);
5807   while (riter)
5808   {
5809     dprint("remote_image = %d, offset = %zd, remote_mem = %p\n",
5810            global_dynamic_win_rank, data_offset, remote_memptr);
5811     switch (riter->type)
5812     {
5813       case CAF_REF_COMPONENT:
5814         if (riter->u.c.caf_token_offset > 0)
5815         {
5816           if (access_data_through_global_win)
5817           {
5818             data_offset += riter->u.c.offset;
5819             remote_base_memptr = remote_memptr;
5820             ierr = MPI_Get(&remote_memptr, stdptr_size, MPI_BYTE, global_dynamic_win_rank,
5821                            MPI_Aint_add((MPI_Aint)remote_memptr, data_offset),
5822                            stdptr_size, MPI_BYTE, global_dynamic_win);
5823             chk_err(ierr);
5824             /* On the second indirection access also the remote descriptor
5825              * using the global window. */
5826             access_desc_through_global_win = true;
5827           }
5828           else
5829           {
5830             data_offset += riter->u.c.offset;
5831             ierr = MPI_Get(&remote_memptr, stdptr_size, MPI_BYTE, memptr_win_rank,
5832                            data_offset, stdptr_size, MPI_BYTE,
5833                            mpi_token->memptr_win); chk_err(ierr);
5834             /* All future access is through the global dynamic window. */
5835             access_data_through_global_win = true;
5836           }
5837           desc_offset = data_offset;
5838           data_offset = 0;
5839         }
5840         else
5841         {
5842           data_offset += riter->u.c.offset;
5843           desc_offset += riter->u.c.offset;
5844         }
5845         break;
5846       case CAF_REF_ARRAY:
5847         /* When there has been no CAF_REF_COMP before hand, then the descriptor
5848          * is stored in the token and the extends are the same on all images,
5849          * which is taken care of in the else part. */
5850         if (access_data_through_global_win)
5851         {
5852           for (ref_rank = 0;
5853                riter->u.a.mode[ref_rank] != CAF_ARR_REF_NONE; ++ref_rank) ;
5854           /* Get the remote descriptor and use the stack to store it
5855            * Note, dst may be pointing to mpi_token->desc therefore it
5856            * needs to be reset here. */
5857           dst = (gfc_descriptor_t *)&dst_desc;
5858           if (access_desc_through_global_win)
5859           {
5860             dprint("remote desc fetch from %p, offset = %zd\n",
5861                    remote_base_memptr, desc_offset);
5862             ierr = MPI_Get(dst, sizeof_desc_for_rank(ref_rank), MPI_BYTE,
5863                            global_dynamic_win_rank,
5864                            MPI_Aint_add(
5865                             (MPI_Aint)remote_base_memptr, desc_offset),
5866                            sizeof_desc_for_rank(ref_rank), MPI_BYTE,
5867                            global_dynamic_win); chk_err(ierr);
5868           }
5869           else
5870           {
5871             dprint("remote desc fetch from win %d, offset = %zd\n",
5872                    mpi_token->memptr_win, desc_offset);
5873             ierr = MPI_Get(dst, sizeof_desc_for_rank(ref_rank), MPI_BYTE,
5874                            memptr_win_rank, desc_offset,
5875                            sizeof_desc_for_rank(ref_rank), MPI_BYTE,
5876                            mpi_token->memptr_win); chk_err(ierr);
5877             access_desc_through_global_win = true;
5878           }
5879         }
5880         else
5881           dst = mpi_token->desc;
5882 #ifdef EXTRA_DEBUG_OUTPUT
5883         dprint("remote desc rank: %zd (ref_rank: %zd)\n",
5884                GFC_DESCRIPTOR_RANK(dst), ref_rank);
5885         for (i = 0; i < GFC_DESCRIPTOR_RANK(dst); ++i)
5886         {
5887           dprint("remote desc dim[%zd] = (lb=%zd, ub=%zd, stride=%zd)\n",
5888                  i, dst->dim[i].lower_bound, dst->dim[i]._ubound,
5889                  dst->dim[i]._stride);
5890         }
5891 #endif
5892         for (i = 0; riter->u.a.mode[i] != CAF_ARR_REF_NONE; ++i)
5893         {
5894           array_ref = riter->u.a.mode[i];
5895           switch (array_ref)
5896           {
5897             case CAF_ARR_REF_VECTOR:
5898               delta = riter->u.a.dim[i].v.nvec;
5899 #define KINDCASE(kind, type)                                            \
5900 case kind:                                                              \
5901   remote_memptr += (((ptrdiff_t)                                        \
5902     ((type *)riter->u.a.dim[i].v.vector)[0]) - src->dim[i].lower_bound) \
5903     * src->dim[i]._stride * riter->item_size;                           \
5904   break
5905               switch (riter->u.a.dim[i].v.kind)
5906               {
5907                 KINDCASE(1, int8_t);
5908                 KINDCASE(2, int16_t);
5909                 KINDCASE(4, int32_t);
5910                 KINDCASE(8, int64_t);
5911 #ifdef HAVE_GFC_INTEGER_16
5912                 KINDCASE(16, __int128);
5913 #endif
5914                 default:
5915                   caf_runtime_error(vecrefunknownkind, stat, NULL, 0);
5916                   return;
5917               }
5918 #undef KINDCASE
5919               break;
5920             case CAF_ARR_REF_FULL:
5921               COMPUTE_NUM_ITEMS(delta,
5922                                 riter->u.a.dim[i].s.stride,
5923                                 dst->dim[i].lower_bound,
5924                                 dst->dim[i]._ubound);
5925               /* The memptr stays unchanged when ref'ing the first element in
5926                * a dimension. */
5927               break;
5928             case CAF_ARR_REF_RANGE:
5929               COMPUTE_NUM_ITEMS(delta,
5930                                 riter->u.a.dim[i].s.stride,
5931                                 riter->u.a.dim[i].s.start,
5932                                 riter->u.a.dim[i].s.end);
5933               remote_memptr +=
5934                 (riter->u.a.dim[i].s.start - dst->dim[i].lower_bound)
5935                 * dst->dim[i]._stride * riter->item_size;
5936               break;
5937             case CAF_ARR_REF_SINGLE:
5938               delta = 1;
5939               remote_memptr +=
5940                 (riter->u.a.dim[i].s.start - dst->dim[i].lower_bound)
5941                 * dst->dim[i]._stride * riter->item_size;
5942               break;
5943             case CAF_ARR_REF_OPEN_END:
5944               COMPUTE_NUM_ITEMS(delta,
5945                                 riter->u.a.dim[i].s.stride,
5946                                 riter->u.a.dim[i].s.start,
5947                                 dst->dim[i]._ubound);
5948               remote_memptr +=
5949                 (riter->u.a.dim[i].s.start - dst->dim[i].lower_bound)
5950                 * dst->dim[i]._stride * riter->item_size;
5951               break;
5952             case CAF_ARR_REF_OPEN_START:
5953               COMPUTE_NUM_ITEMS(delta,
5954                                 riter->u.a.dim[i].s.stride,
5955                                 dst->dim[i].lower_bound,
5956                                 riter->u.a.dim[i].s.end);
5957               /* The memptr stays unchanged when ref'ing the first element in
5958                * a dimension. */
5959               break;
5960             default:
5961               caf_runtime_error(unknownarrreftype, stat, NULL, 0);
5962               return;
5963           } // switch
5964           dprint("i = %zd, array_ref = %s, delta = %ld\n",
5965                  i, caf_array_ref_str[array_ref], delta);
5966           if (delta <= 0)
5967             return;
5968           if (dst != NULL)
5969             dst_rank = GFC_DESCRIPTOR_RANK(dst);
5970           /* Check the various properties of the destination array.
5971            * Is an array expected and present? */
5972           if (delta > 1 && dst_rank == 0)
5973           {
5974             /* No, an array is required, but not provided. */
5975             caf_runtime_error(extentoutofrange, stat, NULL, 0);
5976             return;
5977           }
5978           /* When dst is an array. */
5979           if (dst_rank > 0)
5980           {
5981             /* Check that src_cur_dim is valid for dst. Can be superceeded
5982              * only by scalar data. */
5983             if (src_cur_dim >= dst_rank && delta != 1)
5984             {
5985               caf_runtime_error(rankoutofrange, stat, NULL, 0);
5986               return;
5987             }
5988             /* Do further checks, when the source is not scalar. */
5989             else if (delta != 1)
5990             {
5991               in_array_ref = true;
5992               /* When the realloc is required, then no extent may have
5993                * been set. */
5994               extent_mismatch = GFC_DESCRIPTOR_EXTENT(dst, src_cur_dim) < delta;
5995               /* When it already known, that a realloc is needed or
5996                * the extent does not match the needed one. */
5997               if (realloc_dst || extent_mismatch)
5998               {
5999                 /* Check whether dst is reallocatable. */
6000                 if (unlikely(!dst_reallocatable))
6001                 {
6002                   caf_runtime_error(nonallocextentmismatch, stat,
6003                                     NULL, 0, delta,
6004                                     GFC_DESCRIPTOR_EXTENT(dst, src_cur_dim));
6005                   return;
6006                 }
6007                 /* Only report an error, when the extent needs to be
6008                  * modified, which is not allowed. */
6009                 else if (!dst_reallocatable && extent_mismatch)
6010                 {
6011                   caf_runtime_error(extentoutofrange, stat, NULL, 0);
6012                   return;
6013                 }
6014                 dprint("extent(dst, %d): %zd != delta: %ld.\n", src_cur_dim,
6015                        GFC_DESCRIPTOR_EXTENT(dst, src_cur_dim), delta);
6016                 realloc_dst = true;
6017               }
6018             }
6019 
6020             if (src_cur_dim < GFC_DESCRIPTOR_RANK(src))
6021               ++src_cur_dim;
6022           }
6023           size *= (ptrdiff_t)delta;
6024         }
6025         in_array_ref = false;
6026         break;
6027       case CAF_REF_STATIC_ARRAY:
6028         for (i = 0; riter->u.a.mode[i] != CAF_ARR_REF_NONE; ++i)
6029         {
6030           array_ref = riter->u.a.mode[i];
6031           switch (array_ref)
6032           {
6033             case CAF_ARR_REF_VECTOR:
6034               delta = riter->u.a.dim[i].v.nvec;
6035 #define KINDCASE(kind, type)                                    \
6036 case kind:                                                      \
6037   remote_memptr +=                                              \
6038     ((type *)riter->u.a.dim[i].v.vector)[0] * riter->item_size; \
6039   break
6040 
6041               switch (riter->u.a.dim[i].v.kind)
6042               {
6043                 KINDCASE(1, int8_t);
6044                 KINDCASE(2, int16_t);
6045                 KINDCASE(4, int32_t);
6046                 KINDCASE(8, int64_t);
6047 #ifdef HAVE_GFC_INTEGER_16
6048                 KINDCASE(16, __int128);
6049 #endif
6050                 default:
6051                   caf_runtime_error(vecrefunknownkind, stat, NULL, 0);
6052                   return;
6053               }
6054 #undef KINDCASE
6055               break;
6056             case CAF_ARR_REF_FULL:
6057               delta =
6058                 riter->u.a.dim[i].s.end / riter->u.a.dim[i].s.stride + 1;
6059               /* The memptr stays unchanged when ref'ing the first element
6060                * in a dimension. */
6061               break;
6062             case CAF_ARR_REF_RANGE:
6063               COMPUTE_NUM_ITEMS(delta,
6064                                 riter->u.a.dim[i].s.stride,
6065                                 riter->u.a.dim[i].s.start,
6066                                 riter->u.a.dim[i].s.end);
6067               remote_memptr += riter->u.a.dim[i].s.start
6068                 * riter->u.a.dim[i].s.stride * riter->item_size;
6069               break;
6070             case CAF_ARR_REF_SINGLE:
6071               delta = 1;
6072               remote_memptr += riter->u.a.dim[i].s.start
6073                                * riter->u.a.dim[i].s.stride
6074                                * riter->item_size;
6075               break;
6076             case CAF_ARR_REF_OPEN_END:
6077               /* This and OPEN_START are mapped to a RANGE and therefore
6078                * can not occur here. */
6079             case CAF_ARR_REF_OPEN_START:
6080             default:
6081               caf_runtime_error(unknownarrreftype, stat, NULL, 0);
6082               return;
6083           } // switch
6084           dprint("i = %zd, array_ref = %s, delta = %ld\n",
6085                  i, caf_array_ref_str[array_ref], delta);
6086           if (delta <= 0)
6087             return;
6088           if (dst != NULL)
6089             dst_rank = GFC_DESCRIPTOR_RANK(dst);
6090           /* Check the various properties of the destination array.
6091            * Is an array expected and present? */
6092           if (delta > 1 && dst_rank == 0)
6093           {
6094             /* No, an array is required, but not provided. */
6095             caf_runtime_error(extentoutofrange, stat, NULL, 0);
6096             return;
6097           }
6098           /* When dst is an array. */
6099           if (dst_rank > 0)
6100           {
6101             /* Check that src_cur_dim is valid for dst. Can be superceeded
6102              * only by scalar data. */
6103             if (src_cur_dim >= dst_rank && delta != 1)
6104             {
6105               caf_runtime_error(rankoutofrange, stat, NULL, 0);
6106               return;
6107             }
6108             /* Do further checks, when the source is not scalar. */
6109             else if (delta != 1)
6110             {
6111               in_array_ref = true;
6112               /* When the realloc is required, then no extent may have
6113                * been set. */
6114               extent_mismatch = GFC_DESCRIPTOR_EXTENT(dst, src_cur_dim) < delta;
6115               /* When it is already known, that a realloc is needed or
6116                * the extent does not match the needed one. */
6117               if (realloc_dst || extent_mismatch)
6118               {
6119                 caf_runtime_error(unabletoallocdst, stat);
6120                 return;
6121               }
6122             }
6123             if (src_cur_dim < GFC_DESCRIPTOR_RANK(src))
6124               ++src_cur_dim;
6125           }
6126           size *= (ptrdiff_t)delta;
6127         }
6128         in_array_ref = false;
6129         break;
6130       default:
6131         caf_runtime_error(unknownreftype, stat, NULL, 0);
6132         return;
6133     }
6134     dst_size = riter->item_size;
6135     riter = riter->next;
6136   }
6137   if (size == 0 || dst_size == 0)
6138     return;
6139   /* Postcondition:
6140    * - size contains the number of elements to store in the destination array,
6141    * - dst_size gives the size in bytes of each item in the destination array.
6142    */
6143 
6144   if (realloc_dst)
6145   {
6146     caf_runtime_error(unabletoallocdst, stat);
6147     return;
6148   }
6149 
6150   /* Reset the token. */
6151   mpi_token = (mpi_caf_token_t *) token;
6152   remote_memptr = mpi_token->memptr;
6153   src_index = 0;
6154 #ifdef EXTRA_DEBUG_OUTPUT
6155   dprint("src_rank: %zd\n", GFC_DESCRIPTOR_RANK(src));
6156   for (i = 0; i < GFC_DESCRIPTOR_RANK(src); ++i)
6157   {
6158     dprint("src_dim[%zd] = (%zd, %zd)\n",
6159            i, src->dim[i].lower_bound, src->dim[i]._ubound);
6160   }
6161 #endif
6162   /* When accessing myself and may_require_tmp is set, then copy the source
6163    * array. */
6164   if (caf_this_image == image_index && may_require_tmp)
6165   {
6166     dprint("preparing temporary source.\n");
6167     memcpy(&temp_src, src, sizeof_desc_for_rank(GFC_DESCRIPTOR_RANK(src)));
6168     size_t cap = 0;
6169     for (int r = 0; r < GFC_DESCRIPTOR_RANK(src); ++r)
6170     {
6171       cap += GFC_DESCRIPTOR_EXTENT(src, r);
6172     }
6173 
6174     cap *= GFC_DESCRIPTOR_SIZE(src);
6175     temp_src.base.base_addr = alloca(cap);
6176     if ((free_temp_src = (temp_src.base.base_addr == NULL)))
6177     {
6178       temp_src.base.base_addr = malloc(cap);
6179       if (temp_src.base.base_addr == NULL)
6180       {
6181         caf_runtime_error(cannotallocdst, stat, NULL, cap);
6182         return;
6183       }
6184     }
6185     memcpy(temp_src.base.base_addr, src->base_addr, cap);
6186     src = (gfc_descriptor_t *)&temp_src;
6187   }
6188 
6189   i = 0;
6190   dprint("calling send_for_ref. num elems: size = %zd, elem size in bytes: "
6191          "dst_size = %zd\n", size, dst_size);
6192   send_for_ref(refs, &i, src_index, mpi_token, mpi_token->desc, src,
6193                remote_memptr, src->base_addr, 0, 0, dst_kind, src_kind, 0, 0,
6194                1, stat, global_dynamic_win_rank, memptr_win_rank,
6195                false, false
6196 #ifdef GCC_GE_8
6197                , dst_type
6198 #endif
6199                );
6200   if (free_temp_src)
6201   {
6202     free(temp_src.base.base_addr);
6203   }
6204   CAF_Win_unlock(global_dynamic_win_rank, global_dynamic_win);
6205   CAF_Win_unlock(memptr_win_rank, mpi_token->memptr_win);
6206 }
6207 
6208 
6209 void
6210 PREFIX(sendget_by_ref) (caf_token_t dst_token, int dst_image_index,
6211                         caf_reference_t *dst_refs, caf_token_t src_token,
6212                         int src_image_index, caf_reference_t *src_refs,
6213                         int dst_kind, int src_kind,
6214                         bool may_require_tmp, int *dst_stat, int *src_stat
6215 #ifdef GCC_GE_8
6216                         , int dst_type, int src_type
6217 #endif
6218                         )
6219 {
6220   const char vecrefunknownkind[] =
6221     "libcaf_mpi::caf_sendget_by_ref(): unknown kind in vector-ref.\n";
6222   const char unknownreftype[] =
6223     "libcaf_mpi::caf_sendget_by_ref(): unknown reference type.\n";
6224   const char unknownarrreftype[] =
6225     "libcaf_mpi::caf_sendget_by_ref(): unknown array reference type.\n";
6226   const char cannotallocdst[] =
6227     "libcaf_mpi::caf_sendget_by_ref(): can not allocate %d bytes of memory.\n";
6228   size_t size, i, ref_rank, dst_index, src_index = 0, src_size;
6229   int dst_rank, ierr;
6230   mpi_caf_token_t
6231     *src_mpi_token = (mpi_caf_token_t *) src_token,
6232     *dst_mpi_token = (mpi_caf_token_t *) dst_token;
6233   void *remote_memptr = src_mpi_token->memptr, *remote_base_memptr = NULL;
6234   gfc_max_dim_descriptor_t src_desc;
6235   gfc_max_dim_descriptor_t temp_src_desc;
6236   gfc_descriptor_t *src = (gfc_descriptor_t *)&src_desc;
6237   caf_reference_t *riter = src_refs;
6238   long delta;
6239   ptrdiff_t data_offset = 0, desc_offset = 0;
6240   MPI_Group current_team_group, win_group;
6241   int global_dst_rank, global_src_rank, memptr_dst_rank, memptr_src_rank;
6242   /* Set when the first non-scalar array reference is encountered. */
6243   bool in_array_ref = false;
6244   bool array_extent_fixed = false;
6245   /* Set when remote data is to be accessed through the
6246    * global dynamic window. */
6247   bool access_data_through_global_win = false;
6248   /* Set when the remote descriptor is to accessed through the global window. */
6249   bool access_desc_through_global_win = false;
6250   caf_array_ref_t array_ref;
6251 #ifndef GCC_GE_8
6252   int dst_type = -1, src_type = -1;
6253 #endif
6254 
6255   if (src_stat)
6256     *src_stat = 0;
6257 
6258   ierr = MPI_Comm_group(CAF_COMM_WORLD, &current_team_group); chk_err(ierr);
6259 
6260   ierr = MPI_Win_get_group(global_dynamic_win, &win_group); chk_err(ierr);
6261   ierr = MPI_Group_translate_ranks(current_team_group, 1,
6262                                   (int[]){src_image_index - 1}, win_group,
6263                                   &global_src_rank); chk_err(ierr);
6264   ierr = MPI_Group_translate_ranks(current_team_group, 1,
6265                                   (int[]){dst_image_index - 1}, win_group,
6266                                   &global_dst_rank); chk_err(ierr);
6267   ierr = MPI_Group_free(&win_group); chk_err(ierr);
6268 
6269   ierr = MPI_Win_get_group(src_mpi_token->memptr_win, &win_group); chk_err(ierr);
6270   ierr = MPI_Group_translate_ranks(current_team_group, 1,
6271                                   (int[]){src_image_index - 1}, win_group,
6272                                   &memptr_src_rank); chk_err(ierr);
6273   ierr = MPI_Group_free(&win_group); chk_err(ierr);
6274   ierr = MPI_Win_get_group(dst_mpi_token->memptr_win, &win_group); chk_err(ierr);
6275   ierr = MPI_Group_translate_ranks(current_team_group, 1,
6276                                   (int[]){dst_image_index - 1}, win_group,
6277                                   &memptr_dst_rank); chk_err(ierr);
6278   ierr = MPI_Group_free(&win_group); chk_err(ierr);
6279   ierr = MPI_Group_free(&current_team_group); chk_err(ierr);
6280 
6281   check_image_health(global_src_rank, src_stat);
6282 
6283   dprint("Entering get_by_ref(may_require_tmp = %d, dst_type = %d(%d), "
6284          "src_type = %d(%d)).\n",
6285          may_require_tmp, dst_type, dst_kind, src_type, src_kind);
6286 
6287   /* Compute the size of the result.  In the beginning size just counts the
6288    * number of elements. */
6289   size = 1;
6290   /* Shared lock both windows to prevent bother in the sub-routines. */
6291   CAF_Win_lock(MPI_LOCK_SHARED, global_src_rank, global_dynamic_win);
6292   CAF_Win_lock(MPI_LOCK_SHARED, memptr_src_rank, src_mpi_token->memptr_win);
6293   while (riter)
6294   {
6295     dprint("offset = %zd, remote_mem = %p\n", data_offset, remote_memptr);
6296     switch (riter->type)
6297     {
6298       case CAF_REF_COMPONENT:
6299         if (riter->u.c.caf_token_offset > 0)
6300         {
6301           if (access_data_through_global_win)
6302           {
6303             data_offset += riter->u.c.offset;
6304             remote_base_memptr = remote_memptr;
6305             ierr = MPI_Get(&remote_memptr, stdptr_size, MPI_BYTE,
6306                            global_src_rank,
6307                            MPI_Aint_add((MPI_Aint)remote_memptr, data_offset),
6308                            stdptr_size, MPI_BYTE, global_dynamic_win);
6309             chk_err(ierr);
6310             /* On the second indirection access also the remote descriptor
6311              * using the global window. */
6312             access_desc_through_global_win = true;
6313           }
6314           else
6315           {
6316             data_offset += riter->u.c.offset;
6317             ierr = MPI_Get(&remote_memptr, stdptr_size, MPI_BYTE,
6318                            memptr_src_rank, data_offset, stdptr_size, MPI_BYTE,
6319                            src_mpi_token->memptr_win); chk_err(ierr);
6320             /* All future access is through the global dynamic window. */
6321             access_data_through_global_win = true;
6322           }
6323           desc_offset = data_offset;
6324           data_offset = 0;
6325         }
6326         else
6327         {
6328           data_offset += riter->u.c.offset;
6329           desc_offset += riter->u.c.offset;
6330         }
6331         break;
6332       case CAF_REF_ARRAY:
6333         /* When there has been no CAF_REF_COMP before hand, then the
6334          * descriptor is stored in the token and the extends are the same on all
6335          * images, which is taken care of in the else part. */
6336         if (access_data_through_global_win)
6337         {
6338           for (ref_rank = 0; riter->u.a.mode[ref_rank] != CAF_ARR_REF_NONE;
6339                ++ref_rank) ;
6340           /* Get the remote descriptor and use the stack to store it. Note,
6341            * src may be pointing to mpi_token->desc therefore it needs to be
6342            * reset here. */
6343           src = (gfc_descriptor_t *)&src_desc;
6344           if (access_desc_through_global_win)
6345           {
6346             dprint("remote desc fetch from %p, offset = %zd\n",
6347                    remote_base_memptr, desc_offset);
6348             ierr = MPI_Get(src, sizeof_desc_for_rank(ref_rank), MPI_BYTE,
6349                            global_src_rank,
6350                            MPI_Aint_add(
6351                             (MPI_Aint)remote_base_memptr, desc_offset),
6352                            sizeof_desc_for_rank(ref_rank), MPI_BYTE,
6353                            global_dynamic_win); chk_err(ierr);
6354           }
6355           else
6356           {
6357             dprint("remote desc fetch from win %d, offset = %zd\n",
6358                    src_mpi_token->memptr_win, desc_offset);
6359             ierr = MPI_Get(src, sizeof_desc_for_rank(ref_rank), MPI_BYTE,
6360                            memptr_src_rank, desc_offset,
6361                            sizeof_desc_for_rank(ref_rank),
6362                            MPI_BYTE, src_mpi_token->memptr_win); chk_err(ierr);
6363             access_desc_through_global_win = true;
6364           }
6365         }
6366         else
6367         {
6368           src = src_mpi_token->desc;
6369         }
6370 #ifdef EXTRA_DEBUG_OUTPUT
6371         dprint("remote desc rank: %zd (ref_rank: %zd)\n",
6372                GFC_DESCRIPTOR_RANK(src), ref_rank);
6373         for (i = 0; i < GFC_DESCRIPTOR_RANK(src); ++i)
6374         {
6375           dprint("remote desc dim[%zd] = (lb=%zd, ub=%zd, stride=%zd)\n",
6376                  i, src->dim[i].lower_bound, src->dim[i]._ubound,
6377                  src->dim[i]._stride);
6378         }
6379 #endif
6380         for (i = 0; riter->u.a.mode[i] != CAF_ARR_REF_NONE; ++i)
6381         {
6382           array_ref = riter->u.a.mode[i];
6383           switch (array_ref)
6384           {
6385             case CAF_ARR_REF_VECTOR:
6386               delta = riter->u.a.dim[i].v.nvec;
6387 #define KINDCASE(kind, type)                                            \
6388 case kind:                                                              \
6389   remote_memptr += (((ptrdiff_t)                                        \
6390     ((type *)riter->u.a.dim[i].v.vector)[0]) - src->dim[i].lower_bound) \
6391     * src->dim[i]._stride * riter->item_size;                           \
6392   break
6393               switch (riter->u.a.dim[i].v.kind)
6394               {
6395                 KINDCASE(1, int8_t);
6396                 KINDCASE(2, int16_t);
6397                 KINDCASE(4, int32_t);
6398                 KINDCASE(8, int64_t);
6399 #ifdef HAVE_GFC_INTEGER_16
6400                 KINDCASE(16, __int128);
6401 #endif
6402                 default:
6403                   caf_runtime_error(vecrefunknownkind, src_stat, NULL, 0);
6404                   return;
6405               }
6406 #undef KINDCASE
6407               break;
6408             case CAF_ARR_REF_FULL:
6409               COMPUTE_NUM_ITEMS(delta,
6410                                 riter->u.a.dim[i].s.stride,
6411                                 src->dim[i].lower_bound,
6412                                 src->dim[i]._ubound);
6413               /* The memptr stays unchanged when ref'ing the first element
6414                * in a dimension. */
6415               break;
6416             case CAF_ARR_REF_RANGE:
6417               COMPUTE_NUM_ITEMS(delta,
6418                                 riter->u.a.dim[i].s.stride,
6419                                 riter->u.a.dim[i].s.start,
6420                                 riter->u.a.dim[i].s.end);
6421               remote_memptr +=
6422                 (riter->u.a.dim[i].s.start - src->dim[i].lower_bound)
6423                 * src->dim[i]._stride * riter->item_size;
6424               break;
6425             case CAF_ARR_REF_SINGLE:
6426               delta = 1;
6427               remote_memptr +=
6428                 (riter->u.a.dim[i].s.start - src->dim[i].lower_bound)
6429                 * src->dim[i]._stride * riter->item_size;
6430               break;
6431             case CAF_ARR_REF_OPEN_END:
6432               COMPUTE_NUM_ITEMS(delta,
6433                                 riter->u.a.dim[i].s.stride,
6434                                 riter->u.a.dim[i].s.start,
6435                                 src->dim[i]._ubound);
6436               remote_memptr +=
6437                 (riter->u.a.dim[i].s.start - src->dim[i].lower_bound)
6438                 * src->dim[i]._stride * riter->item_size;
6439               break;
6440             case CAF_ARR_REF_OPEN_START:
6441               COMPUTE_NUM_ITEMS(delta,
6442                                 riter->u.a.dim[i].s.stride,
6443                                 src->dim[i].lower_bound,
6444                                 riter->u.a.dim[i].s.end);
6445               /* The memptr stays unchanged when ref'ing the first element
6446                * in a dimension. */
6447               break;
6448             default:
6449               caf_runtime_error(unknownarrreftype, src_stat, NULL, 0);
6450               return;
6451           } // switch
6452           dprint("i = %zd, array_ref = %s, delta = %ld\n",
6453                  i, caf_array_ref_str[array_ref], delta);
6454           if (delta <= 0)
6455             return;
6456           size *= (ptrdiff_t)delta;
6457         }
6458         if (in_array_ref)
6459         {
6460           array_extent_fixed = true;
6461           in_array_ref = false;
6462         }
6463         break;
6464       case CAF_REF_STATIC_ARRAY:
6465         for (i = 0; riter->u.a.mode[i] != CAF_ARR_REF_NONE; ++i)
6466         {
6467           array_ref = riter->u.a.mode[i];
6468           switch (array_ref)
6469           {
6470             case CAF_ARR_REF_VECTOR:
6471               delta = riter->u.a.dim[i].v.nvec;
6472 #define KINDCASE(kind, type)                                      \
6473 case kind:                                                        \
6474   remote_memptr +=                                                \
6475     ((type *)riter->u.a.dim[i].v.vector)[0] * riter->item_size;   \
6476   break
6477               switch (riter->u.a.dim[i].v.kind)
6478               {
6479                 KINDCASE(1, int8_t);
6480                 KINDCASE(2, int16_t);
6481                 KINDCASE(4, int32_t);
6482                 KINDCASE(8, int64_t);
6483 #ifdef HAVE_GFC_INTEGER_16
6484                 KINDCASE(16, __int128);
6485 #endif
6486                 default:
6487                   caf_runtime_error(vecrefunknownkind, src_stat, NULL, 0);
6488                   return;
6489               }
6490 #undef KINDCASE
6491               break;
6492             case CAF_ARR_REF_FULL:
6493               delta = riter->u.a.dim[i].s.end / riter->u.a.dim[i].s.stride + 1;
6494               /* The memptr stays unchanged when ref'ing the first element
6495                * in a dimension. */
6496               break;
6497             case CAF_ARR_REF_RANGE:
6498               COMPUTE_NUM_ITEMS(delta,
6499                                 riter->u.a.dim[i].s.stride,
6500                                 riter->u.a.dim[i].s.start,
6501                                 riter->u.a.dim[i].s.end);
6502               remote_memptr += riter->u.a.dim[i].s.start
6503                                * riter->u.a.dim[i].s.stride
6504                                * riter->item_size;
6505               break;
6506             case CAF_ARR_REF_SINGLE:
6507               delta = 1;
6508               remote_memptr += riter->u.a.dim[i].s.start
6509                                * riter->u.a.dim[i].s.stride
6510                                * riter->item_size;
6511               break;
6512             case CAF_ARR_REF_OPEN_END:
6513               /* This and OPEN_START are mapped to a RANGE and therefore
6514                * can not occur here. */
6515             case CAF_ARR_REF_OPEN_START:
6516             default:
6517               caf_runtime_error(unknownarrreftype, src_stat, NULL, 0);
6518               return;
6519           } // switch
6520           dprint("i = %zd, array_ref = %s, delta = %ld\n",
6521                  i, caf_array_ref_str[array_ref], delta);
6522           if (delta <= 0)
6523             return;
6524           size *= (ptrdiff_t)delta;
6525         }
6526         if (in_array_ref)
6527         {
6528           array_extent_fixed = true;
6529           in_array_ref = false;
6530         }
6531         break;
6532       default:
6533         caf_runtime_error(unknownreftype, src_stat, NULL, 0);
6534         return;
6535       } // switch
6536       src_size = riter->item_size;
6537       riter = riter->next;
6538     }
6539   if (size == 0 || src_size == 0)
6540     return;
6541   /* Postcondition:
6542    * - size contains the number of elements to store in the destination array,
6543    * - src_size gives the size in bytes of each item in the destination array.
6544   */
6545 
6546   dst_rank = (size > 1) ? 1 : 0;
6547   memset(&temp_src_desc, 0, sizeof(gfc_dim1_descriptor_t));
6548 #ifdef GCC_GE_8
6549   temp_src_desc.base.dtype.elem_len = (dst_type != BT_COMPLEX) ?
6550     dst_kind : (2 * dst_kind);
6551   temp_src_desc.base.dtype.rank = 1;
6552   temp_src_desc.base.dtype.type = dst_type;
6553 #else // GCC_GE_7
6554   temp_src_desc.base.dtype = GFC_DTYPE_INTEGER_4 | 1;
6555 #endif
6556   temp_src_desc.base.offset = 0;
6557   temp_src_desc.dim[0]._ubound = size - 1;
6558   temp_src_desc.dim[0]._stride = 1;
6559 
6560   temp_src_desc.base.base_addr =
6561     malloc(size * GFC_DESCRIPTOR_SIZE((gfc_descriptor_t *)&temp_src_desc));
6562   if (unlikely(temp_src_desc.base.base_addr == NULL))
6563   {
6564     caf_runtime_error(
6565       cannotallocdst, src_stat,
6566       size * GFC_DESCRIPTOR_SIZE((gfc_descriptor_t *)&temp_src_desc));
6567     return;
6568   }
6569 
6570 #ifndef GCC_GE_8
6571   static bool warning_given = false;
6572   if (!warning_given)
6573   {
6574     fprintf(stderr,
6575             "lib_caf_mpi::sendget_by_ref(): Warning !! sendget_by_ref() is "
6576             "mostly unfunctional due to a design error. Split up your "
6577             "statement with coarray refs on both sides of the assignment "
6578             "when the datatype transfered is non 4-byte-integer compatible, "
6579             "or use gcc >= 8.\n");
6580     warning_given = true;
6581   }
6582 #endif
6583   /* Reset the token. */
6584   src_mpi_token = (mpi_caf_token_t *) src_token;
6585   remote_memptr = src_mpi_token->memptr;
6586   dst_index = 0;
6587 #ifdef EXTRA_DEBUG_OUTPUT
6588   dprint("dst_rank: %d\n", dst_rank);
6589   for (i = 0; i < dst_rank; ++i)
6590   {
6591     dprint("temp_src_dim[%zd] = (%zd, %zd)\n",
6592            i, temp_src_desc.dim[i].lower_bound, temp_src_desc.dim[i]._ubound);
6593   }
6594 #endif
6595   i = 0;
6596   dprint("calling get_for_ref.\n");
6597   get_for_ref(src_refs, &i, dst_index, src_mpi_token,
6598               (gfc_descriptor_t *)&temp_src_desc, src_mpi_token->desc,
6599               temp_src_desc.base.base_addr, remote_memptr, 0, 0, dst_kind,
6600               src_kind, 0, 0, 1, src_stat, global_src_rank, memptr_src_rank,
6601               false, false
6602 #ifdef GCC_GE_8
6603               , src_type
6604 #endif
6605               );
6606   CAF_Win_unlock(global_src_rank, global_dynamic_win);
6607   CAF_Win_unlock(memptr_src_rank, src_mpi_token->memptr_win);
6608   dprint("calling send_for_ref. num elems: size = %zd, elem size in bytes: "
6609          "src_size = %zd\n", size, src_size);
6610   i = 0;
6611 
6612   CAF_Win_lock(MPI_LOCK_EXCLUSIVE, global_dst_rank, global_dynamic_win);
6613   CAF_Win_lock(MPI_LOCK_EXCLUSIVE, memptr_dst_rank, dst_mpi_token->memptr_win);
6614   send_for_ref(dst_refs, &i, src_index, dst_mpi_token, dst_mpi_token->desc,
6615                (gfc_descriptor_t *)&temp_src_desc, dst_mpi_token->memptr,
6616                temp_src_desc.base.base_addr, 0, 0, dst_kind, src_kind, 0, 0,
6617                1, dst_stat, global_dst_rank, memptr_dst_rank, false, false
6618 #ifdef GCC_GE_8
6619                , dst_type
6620 #endif
6621                );
6622   CAF_Win_unlock(global_dst_rank, global_dynamic_win);
6623   CAF_Win_unlock(memptr_dst_rank, src_mpi_token->memptr_win);
6624 }
6625 
6626 int
6627 PREFIX(is_present) (caf_token_t token, int image_index, caf_reference_t *refs)
6628 {
6629   const char unsupportedRefType[] =
6630     "Unsupported ref-type in caf_is_present().";
6631   const char unexpectedEndOfRefs[] =
6632     "Unexpected end of references in caf_is_present.";
6633   const char remotesInnerRefNA[] =
6634     "Memory referenced on the remote image is not allocated.";
6635   const int ptr_size = sizeof(void *);
6636   const int remote_image = image_index - 1;
6637   mpi_caf_token_t *mpi_token = (mpi_caf_token_t *)token;
6638   ptrdiff_t local_offset = 0;
6639   void *remote_memptr = NULL, *remote_base_memptr = NULL;
6640   bool carryOn = true, firstDesc = true;
6641   caf_reference_t *riter = refs, *prev;
6642   size_t i, ref_rank;
6643   int ierr;
6644   gfc_max_dim_descriptor_t src_desc;
6645   caf_array_ref_t array_ref;
6646 
6647   while (carryOn && riter)
6648   {
6649     switch (riter->type)
6650     {
6651       case CAF_REF_COMPONENT:
6652         if (riter->u.c.caf_token_offset)
6653         {
6654           CAF_Win_lock(MPI_LOCK_SHARED, remote_image, mpi_token->memptr_win);
6655           ierr = MPI_Get(&remote_memptr, ptr_size, MPI_BYTE, remote_image,
6656                          local_offset + riter->u.c.offset, ptr_size,
6657                          MPI_BYTE, mpi_token->memptr_win); chk_err(ierr);
6658           CAF_Win_unlock(remote_image, mpi_token->memptr_win);
6659           dprint("Got first remote address %p from offset %zd\n",
6660                  remote_memptr, local_offset);
6661           local_offset = 0;
6662           carryOn = false;
6663         }
6664         else
6665           local_offset += riter->u.c.offset;
6666         break;
6667       case CAF_REF_ARRAY:
6668         {
6669           const gfc_descriptor_t *src =
6670             (gfc_descriptor_t *)(mpi_token->memptr + local_offset);
6671           for (i = 0; riter->u.a.mode[i] != CAF_ARR_REF_NONE; ++i)
6672           {
6673             array_ref = riter->u.a.mode[i];
6674             dprint("i = %zd, array_ref = %s\n",
6675                    i, caf_array_ref_str[array_ref]);
6676             switch (array_ref)
6677             {
6678               case CAF_ARR_REF_FULL:
6679                 /* The local_offset stays unchanged when ref'ing the first
6680                  * element in a dimension. */
6681                 break;
6682               case CAF_ARR_REF_SINGLE:
6683                 local_offset +=
6684                   (riter->u.a.dim[i].s.start - src->dim[i].lower_bound)
6685                   * src->dim[i]._stride * riter->item_size;
6686                 break;
6687               case CAF_ARR_REF_VECTOR:
6688               case CAF_ARR_REF_RANGE:
6689               case CAF_ARR_REF_OPEN_END:
6690               case CAF_ARR_REF_OPEN_START:
6691                 /* Intentionally fall through, because these are not
6692                  * suported here. */
6693               default:
6694                 caf_runtime_error(unsupportedRefType);
6695                 return false;
6696             }
6697           }
6698         }
6699         break;
6700       case CAF_REF_STATIC_ARRAY:
6701         for (i = 0; riter->u.a.mode[i] != CAF_ARR_REF_NONE; ++i)
6702         {
6703           array_ref = riter->u.a.mode[i];
6704           dprint("i = %zd, array_ref = %s\n", i, caf_array_ref_str[array_ref]);
6705           switch (array_ref)
6706           {
6707             case CAF_ARR_REF_FULL:
6708               /* The local_offset stays unchanged when ref'ing the first
6709                * element in a dimension. */
6710               break;
6711             case CAF_ARR_REF_SINGLE:
6712               local_offset += riter->u.a.dim[i].s.start
6713                               * riter->u.a.dim[i].s.stride * riter->item_size;
6714               break;
6715             case CAF_ARR_REF_VECTOR:
6716             case CAF_ARR_REF_RANGE:
6717             case CAF_ARR_REF_OPEN_END:
6718             case CAF_ARR_REF_OPEN_START:
6719             default:
6720               caf_runtime_error(unsupportedRefType);
6721               return false;
6722           }
6723         }
6724         break;
6725       default:
6726         caf_runtime_error(unsupportedRefType);
6727         return false;
6728     } // switch
6729     prev = riter;
6730     riter = riter->next;
6731   }
6732 
6733   if (carryOn)
6734   {
6735     // This can only happen, when riter == NULL.
6736     caf_runtime_error(unexpectedEndOfRefs);
6737   }
6738 
6739   CAF_Win_lock(MPI_LOCK_SHARED, remote_image, global_dynamic_win);
6740   if (remote_memptr != NULL)
6741     remote_base_memptr = remote_memptr + local_offset;
6742 
6743   dprint("Remote desc address is %p from remote memptr %p and offset %zd\n",
6744          remote_base_memptr, remote_memptr, local_offset);
6745 
6746   while (riter)
6747   {
6748     switch (riter->type)
6749     {
6750       case CAF_REF_COMPONENT:
6751         /* After reffing the first allocatable/pointer component, descriptors
6752          * need to be picked up from the global_win. */
6753         firstDesc = firstDesc && riter->u.c.caf_token_offset == 0;
6754         local_offset += riter->u.c.offset;
6755         remote_base_memptr = remote_memptr + local_offset;
6756         ierr = MPI_Get(&remote_memptr, ptr_size, MPI_BYTE, remote_image,
6757                        (MPI_Aint)remote_base_memptr, ptr_size,
6758                        MPI_BYTE, global_dynamic_win); chk_err(ierr);
6759         dprint("Got remote address %p from offset %zd nd base memptr %p\n",
6760                remote_memptr, local_offset, remote_base_memptr);
6761         local_offset = 0;
6762         break;
6763       case CAF_REF_ARRAY:
6764         if (remote_base_memptr == NULL)
6765         {
6766           /* Refing an unallocated array ends in a full_ref. Check that this
6767            * is true. Error when not full-refing. */
6768           for (i = 0; riter->u.a.mode[i] != CAF_ARR_REF_NONE; ++i)
6769           {
6770             if (riter->u.a.mode[i] != CAF_ARR_REF_FULL)
6771               break;
6772           }
6773           if (riter->u.a.mode[i] != CAF_ARR_REF_NONE)
6774             caf_runtime_error(remotesInnerRefNA);
6775           break;
6776         }
6777         if (firstDesc)
6778         {
6779           /* The first descriptor is accessible by the mpi_token->memptr_win.
6780            * Count the dims to fetch. */
6781           for (ref_rank = 0; riter->u.a.mode[ref_rank] != CAF_ARR_REF_NONE;
6782                ++ref_rank)
6783 	    ;
6784           dprint("Getting remote descriptor of rank %zd from win: %d, "
6785                  "sizeof() %zd\n", ref_rank, mpi_token->memptr_win,
6786                  sizeof_desc_for_rank(ref_rank));
6787           ierr = MPI_Get(&src_desc, sizeof_desc_for_rank(ref_rank),
6788                          MPI_BYTE, remote_image, local_offset,
6789                          sizeof_desc_for_rank(ref_rank),
6790                          MPI_BYTE, mpi_token->memptr_win); chk_err(ierr);
6791           firstDesc = false;
6792         }
6793         else
6794         {
6795           /* All inner descriptors go by the dynamic window.
6796            * Count the dims to fetch. */
6797           for (ref_rank = 0; riter->u.a.mode[ref_rank] != CAF_ARR_REF_NONE;
6798                ++ref_rank)
6799 	    ;
6800           dprint("Getting remote descriptor of rank %zd from: %p, "
6801                  "sizeof() %zd\n", ref_rank, remote_base_memptr,
6802                  sizeof_desc_for_rank(ref_rank));
6803           ierr = MPI_Get(&src_desc, sizeof_desc_for_rank(ref_rank), MPI_BYTE,
6804                          remote_image, (MPI_Aint)remote_base_memptr,
6805                          sizeof_desc_for_rank(ref_rank), MPI_BYTE,
6806                          global_dynamic_win); chk_err(ierr);
6807         }
6808 #ifdef EXTRA_DEBUG_OUTPUT
6809         {
6810           gfc_descriptor_t * src = (gfc_descriptor_t *)(&src_desc);
6811           dprint("remote desc rank: %zd (ref_rank: %zd)\n",
6812                  GFC_DESCRIPTOR_RANK(src), ref_rank);
6813           for (i = 0; i < GFC_DESCRIPTOR_RANK(src); ++i)
6814           {
6815             dprint("remote desc dim[%zd] = (lb=%zd, ub=%zd, stride=%zd)\n",
6816                    i, src_desc.dim[i].lower_bound, src_desc.dim[i]._ubound,
6817                    src_desc.dim[i]._stride);
6818           }
6819         }
6820 #endif
6821 
6822         for (i = 0; riter->u.a.mode[i] != CAF_ARR_REF_NONE; ++i)
6823         {
6824           array_ref = riter->u.a.mode[i];
6825           dprint("i = %zd, array_ref = %s\n", i, caf_array_ref_str[array_ref]);
6826           switch (array_ref)
6827           {
6828             case CAF_ARR_REF_FULL:
6829               /* The local_offset stays unchanged when ref'ing the first
6830                * element in a dimension. */
6831               break;
6832             case CAF_ARR_REF_SINGLE:
6833               local_offset +=
6834                 (riter->u.a.dim[i].s.start - src_desc.dim[i].lower_bound)
6835                 * src_desc.dim[i]._stride
6836                 * riter->item_size;
6837               break;
6838             case CAF_ARR_REF_VECTOR:
6839             case CAF_ARR_REF_RANGE:
6840             case CAF_ARR_REF_OPEN_END:
6841             case CAF_ARR_REF_OPEN_START:
6842               /* Intentionally fall through, because these are not suported
6843                * here. */
6844             default:
6845               caf_runtime_error(unsupportedRefType);
6846               CAF_Win_unlock(remote_image, global_dynamic_win);
6847               return false;
6848           }
6849         }
6850         break;
6851       case CAF_REF_STATIC_ARRAY:
6852         for (i = 0; riter->u.a.mode[i] != CAF_ARR_REF_NONE; ++i)
6853         {
6854           array_ref = riter->u.a.mode[i];
6855           dprint("i = %zd, array_ref = %s\n", i, caf_array_ref_str[array_ref]);
6856           switch (array_ref)
6857           {
6858             case CAF_ARR_REF_FULL:
6859               /* The memptr stays unchanged when ref'ing the first element
6860                * in a dimension. */
6861               break;
6862             case CAF_ARR_REF_SINGLE:
6863               local_offset += riter->u.a.dim[i].s.start
6864                               * riter->u.a.dim[i].s.stride * riter->item_size;
6865               break;
6866             case CAF_ARR_REF_VECTOR:
6867             case CAF_ARR_REF_RANGE:
6868             case CAF_ARR_REF_OPEN_END:
6869             case CAF_ARR_REF_OPEN_START:
6870             default:
6871               caf_runtime_error(unsupportedRefType);
6872               CAF_Win_unlock(remote_image, global_dynamic_win);
6873               return false;
6874           }
6875         }
6876         break;
6877       default:
6878         caf_runtime_error(unsupportedRefType);
6879         CAF_Win_unlock(remote_image, global_dynamic_win);
6880         return false;
6881     } // switch
6882     riter = riter->next;
6883   }
6884   CAF_Win_unlock(remote_image, global_dynamic_win);
6885 
6886   dprint("Got remote_memptr: %p\n", remote_memptr);
6887   return remote_memptr != NULL;
6888 }
6889 #endif // GCC_GE_7
6890 
6891 
6892 /* SYNC IMAGES. Note: SYNC IMAGES(*) is passed as count == -1 while
6893  * SYNC IMAGES([]) has count == 0. Note further that SYNC IMAGES(*)
6894  * is not semantically equivalent to SYNC ALL. */
6895 
6896 void
6897 PREFIX(sync_images) (int count, int images[], int *stat, char *errmsg,
6898                      charlen_t errmsg_len)
6899 {
6900   sync_images_internal(count, images, stat, errmsg, errmsg_len, false);
6901 }
6902 
6903 static void
6904 sync_images_internal(int count, int images[], int *stat, char *errmsg,
6905                      size_t errmsg_len, bool internal)
6906 {
6907   int ierr = 0, i = 0, j = 0, int_zero = 0, done_count = 0, flag;
6908   MPI_Status s;
6909 
6910 #ifdef WITH_FAILED_IMAGES
6911   no_stopped_images_check_in_errhandler = true;
6912 #endif
6913   dprint("Entering\n");
6914   if (count == 0 || (count == 1 && images[0] == caf_this_image))
6915   {
6916     if (stat)
6917       *stat = 0;
6918 #ifdef WITH_FAILED_IMAGES
6919     no_stopped_images_check_in_errhandler = false;
6920 #endif
6921     dprint("Leaving early.\n");
6922     return;
6923   }
6924 
6925   /* halt execution if sync images contains duplicate image numbers */
6926   for (i = 0; i < count; ++i)
6927   {
6928     for (j = 0; j < i; ++j)
6929     {
6930       if (images[i] == images[j])
6931       {
6932         ierr = STAT_DUP_SYNC_IMAGES;
6933         if (stat)
6934           *stat = ierr;
6935         goto sync_images_err_chk;
6936       }
6937     }
6938   }
6939 
6940 #ifdef GFC_CAF_CHECK
6941     for (i = 0; i < count; ++i)
6942     {
6943       if (images[i] < 1 || images[i] > caf_num_images)
6944       {
6945         fprintf(stderr, "COARRAY ERROR: Invalid image index %d to SYNC IMAGES",
6946                 images[i]);
6947         terminate_internal(1, 1);
6948       }
6949     }
6950 #endif
6951 
6952   if (unlikely(caf_is_finalized))
6953   {
6954     ierr = STAT_STOPPED_IMAGE;
6955   }
6956   else
6957   {
6958     if (count == -1)
6959     {
6960       count = caf_num_images - 1;
6961       images = images_full;
6962     }
6963 
6964 #if defined(NONBLOCKING_PUT) && !defined(CAF_MPI_LOCK_UNLOCK)
6965     explicit_flush();
6966 #endif
6967 
6968 #ifdef WITH_FAILED_IMAGES
6969     /* Provoke detecting process fails. */
6970     ierr = MPI_Test(&alive_request, &flag, MPI_STATUS_IGNORE); chk_err(ierr);
6971 #endif
6972     /* A rather simple way to synchronice:
6973      * - expect all images to sync with receiving an int,
6974      * - on the other side, send all processes to sync with an int,
6975      * - when the int received is STAT_STOPPED_IMAGE the return immediately,
6976      *   else wait until all images in the current set of images have send
6977      *   some data, i.e., synced.
6978      *
6979      * This approach as best as possible implements the syncing of different
6980      * sets of images and figuring that an image has stopped.  MPI does not
6981      * provide any direct means of syncing non-coherent sets of images.
6982      * The groups/communicators of MPI always need to be consistent, i.e.,
6983      * have the same members on all images participating.  This is
6984      * contradictiory to the sync images statement, where syncing, e.g., in a
6985      * ring pattern is possible.
6986      *
6987      * This implementation guarantees, that as long as no image is stopped
6988      * an image only is allowed to continue, when all its images to sync to
6989      * also have reached a sync images statement.  This implementation makes
6990      * no assumption when the image continues or in which order synced
6991      * images continue. */
6992     for (i = 0; i < count; ++i)
6993     {
6994       /* Need to have the request handlers contigously in the handlers
6995        * array or waitany below will trip about the handler as illegal. */
6996       ierr = MPI_Irecv(&arrived[images[i] - 1], 1, MPI_INT, images[i] - 1,
6997                        MPI_TAG_CAF_SYNC_IMAGES, CAF_COMM_WORLD,
6998                        &sync_handles[i]); chk_err(ierr);
6999     }
7000     for (i = 0; i < count; ++i)
7001     {
7002       ierr = MPI_Send(&int_zero, 1, MPI_INT, images[i] - 1,
7003                       MPI_TAG_CAF_SYNC_IMAGES, CAF_COMM_WORLD); chk_err(ierr);
7004     }
7005     done_count = 0;
7006     while (done_count < count)
7007     {
7008       ierr = MPI_Waitany(count, sync_handles, &i, &s);
7009       if (ierr == MPI_SUCCESS && i != MPI_UNDEFINED)
7010       {
7011         ++done_count;
7012         if (ierr == MPI_SUCCESS && arrived[s.MPI_SOURCE] == STAT_STOPPED_IMAGE)
7013         {
7014           /* Possible future extension: Abort pending receives.  At the
7015            * moment the receives are discarded by the program
7016            * termination.  For the tested mpi-implementation this is ok. */
7017           ierr = STAT_STOPPED_IMAGE;
7018           break;
7019         }
7020       }
7021       else if (ierr != MPI_SUCCESS)
7022 #ifdef WITH_FAILED_IMAGES
7023       {
7024         int err;
7025         MPI_Error_class(ierr, &err);
7026         if (err == MPIX_ERR_PROC_FAILED)
7027         {
7028           dprint("Image failed, provoking error handling.\n");
7029           ierr = STAT_FAILED_IMAGE;
7030           /* Provoke detecting process fails. */
7031           MPI_Test(&alive_request, &flag, MPI_STATUS_IGNORE);
7032         }
7033         break;
7034       }
7035 #else
7036         break;
7037 #endif // WITH_FAILED_IMAGES
7038     }
7039   }
7040 
7041 sync_images_err_chk:
7042 #ifdef WITH_FAILED_IMAGES
7043   no_stopped_images_check_in_errhandler = false;
7044 #endif
7045   dprint("Leaving\n");
7046   if (stat)
7047     *stat = ierr;
7048 #ifdef WITH_FAILED_IMAGES
7049   else if (ierr == STAT_FAILED_IMAGE)
7050     terminate_internal(ierr, 0);
7051 #endif
7052 
7053   if (ierr != 0 && ierr != STAT_FAILED_IMAGE)
7054   {
7055     char msg[80];
7056     strcpy(msg, "SYNC IMAGES failed");
7057     if (caf_is_finalized)
7058       strcat(msg, " - there are stopped images");
7059 
7060     if (errmsg_len > 0)
7061     {
7062       size_t len = (strlen(msg) > errmsg_len) ? errmsg_len : strlen (msg);
7063       memcpy(errmsg, msg, len);
7064       if (errmsg_len > len)
7065         memset(&errmsg[len], ' ', errmsg_len-len);
7066     }
7067     else if (!internal && stat == NULL)
7068       caf_runtime_error(msg);
7069   }
7070 }
7071 
7072 
7073 #define GEN_REDUCTION(name, datatype, operator)       \
7074 static void                                           \
7075 name(datatype *invec, datatype *inoutvec, int *len,   \
7076      MPI_Datatype *datatype __attribute__((unused)))  \
7077 {                                                     \
7078   for (int i = 0; i < len; ++i)                       \
7079   {                                                   \
7080     operator;                                         \
7081   }                                                   \
7082 }
7083 
7084 #define REFERENCE_FUNC(TYPE) TYPE ## _by_reference
7085 #define VALUE_FUNC(TYPE) TYPE ## _by_value
7086 
7087 #define GEN_COREDUCE(name, dt)                                \
7088 static void                                                   \
7089 name##_by_reference_adapter(void *invec, void *inoutvec,      \
7090                             int *len, MPI_Datatype *datatype) \
7091 {                                                             \
7092   for (int i = 0; i < *len; ++i)                              \
7093   {                                                           \
7094     *((dt*)inoutvec) =                                        \
7095       (dt)(REFERENCE_FUNC(dt)((dt *)invec, (dt *)inoutvec));  \
7096     invec += sizeof(dt);                                      \
7097     inoutvec += sizeof(dt);                                   \
7098   }                                                           \
7099 }                                                             \
7100 static void                                                   \
7101 name##_by_value_adapter(void *invec, void *inoutvec,          \
7102                         int *len, MPI_Datatype *datatype)     \
7103 {                                                             \
7104   for (int i = 0; i < *len; ++i)                              \
7105   {                                                           \
7106     *((dt*)inoutvec) =                                        \
7107       (dt)(VALUE_FUNC(dt)(*(dt *)invec, *(dt *)inoutvec));    \
7108     invec += sizeof(dt);                                      \
7109     inoutvec += sizeof(dt);                                   \
7110   }                                                           \
7111 }
7112 
7113 GEN_COREDUCE(redux_int8, int8_t)
7114 GEN_COREDUCE(redux_int16, int16_t)
7115 GEN_COREDUCE(redux_int32, int32_t)
7116 GEN_COREDUCE(redux_int64, int64_t)
7117 GEN_COREDUCE(redux_real32, float)
7118 GEN_COREDUCE(redux_real64, double)
7119 
7120 static void
7121 redux_char_by_reference_adapter(void *invec, void *inoutvec, int *len,
7122                                 MPI_Datatype *datatype)
7123 {
7124   MPI_Aint lb, string_len;
7125   MPI_Type_get_extent(*datatype, &lb, &string_len);
7126   for (int i = 0; i < *len; i++)
7127   {
7128     /* The length of the result is fixed, i.e., no deferred string length is
7129      * allowed there. */
7130     REFERENCE_FUNC(char)(
7131       (char *)inoutvec, string_len, (char *)invec,
7132       (char *)inoutvec, string_len, string_len
7133     );
7134     invec += sizeof(char) * string_len;
7135     inoutvec += sizeof(char) * string_len;
7136   }
7137 }
7138 
7139 #ifndef MPI_INTEGER1
7140 GEN_REDUCTION(do_sum_int1, int8_t, inoutvec[i] += invec[i])
7141 GEN_REDUCTION(do_min_int1, int8_t,
7142               inoutvec[i] = (invec[i] >= inoutvec[i] ? inoutvec[i] : invec[i]))
7143 GEN_REDUCTION(do_max_int1, int8_t,
7144               inoutvec[i] = (invec[i] <= inoutvec[i] ? inoutvec[i] : invec[i]))
7145 #endif
7146 
7147 /*
7148 #ifndef MPI_INTEGER2
7149 GEN_REDUCTION(do_sum_int1, int16_t, inoutvec[i] += invec[i])
7150 GEN_REDUCTION(do_min_int1, int16_t,
7151               inoutvec[i] = (invec[i] >= inoutvec[i] ? inoutvec[i] : invec[i]))
7152 GEN_REDUCTION(do_max_int1, int16_t,
7153              inoutvec[i] = (invec[i] <= inoutvec[i] ? inoutvec[i] : invec[i]))
7154 #endif
7155 */
7156 
7157 #if defined(MPI_INTEGER16) && defined(GFC_INTEGER_16)
7158 GEN_REDUCTION(do_sum_int1, GFC_INTEGER_16, inoutvec[i] += invec[i])
7159 GEN_REDUCTION(do_min_int1, GFC_INTEGER_16,
7160               inoutvec[i] = (invec[i] >= inoutvec[i] ? inoutvec[i] : invec[i]))
7161 GEN_REDUCTION(do_max_int1, GFC_INTEGER_16,
7162               inoutvec[i] = (invec[i] <= inoutvec[i] ? inoutvec[i] : invec[i]))
7163 #endif
7164 
7165 #if defined(GFC_DTYPE_REAL_10) \
7166     || (!defined(GFC_DTYPE_REAL_10) && defined(GFC_DTYPE_REAL_16))
7167 GEN_REDUCTION(do_sum_real10, long double, inoutvec[i] += invec[i])
7168 GEN_REDUCTION(do_min_real10, long double,
7169               inoutvec[i] = (invec[i] >= inoutvec[i] ? inoutvec[i] : invec[i]))
7170 GEN_REDUCTION(do_max_real10, long double,
7171               inoutvec[i] = (invec[i] <= inoutvec[i] ? inoutvec[i] : invec[i]))
7172 GEN_REDUCTION(do_sum_complex10, _Complex long double, inoutvec[i] += invec[i])
7173 GEN_REDUCTION(do_min_complex10, _Complex long double,
7174               inoutvec[i] = (invec[i] >= inoutvec[i] ? inoutvec[i] : invec[i]))
7175 GEN_REDUCTION(do_max_complex10, _Complex long double,
7176               inoutvec[i] = (invec[i] <= inoutvec[i] ? inoutvec[i] : invec[i]))
7177 #endif
7178 
7179 #if defined(GFC_DTYPE_REAL_10) && defined(GFC_DTYPE_REAL_16)
7180 GEN_REDUCTION(do_sum_real10, __float128, inoutvec[i] += invec[i])
7181 GEN_REDUCTION(do_min_real10, __float128,
7182               inoutvec[i] = (invec[i] >= inoutvec[i] ? inoutvec[i] : invec[i]))
7183 GEN_REDUCTION(do_max_real10, __float128,
7184               inoutvec[i] = (invec[i] <= inoutvec[i] ? inoutvec[i] : invec[i]))
7185 GEN_REDUCTION(do_sum_complex10, _Complex __float128, inoutvec[i] += invec[i])
7186 GEN_REDUCTION(do_mincomplexl10, _Complex __float128,
7187               inoutvec[i] = (invec[i] >= inoutvec[i] ? inoutvec[i] : invec[i]))
7188 GEN_REDUCTION(do_max_complex10, _Complex __float128,
7189               inoutvec[i] = (invec[i] <= inoutvec[i] ? inoutvec[i] : invec[i]))
7190 #endif
7191 #undef GEN_REDUCTION
7192 
7193 
7194 static MPI_Datatype
7195 get_MPI_datatype(gfc_descriptor_t *desc, int char_len)
7196 {
7197   int ierr;
7198   /* FIXME: Better check whether the sizes are okay and supported;
7199    * MPI3 adds more types, e.g. MPI_INTEGER1. */
7200   switch (GFC_DTYPE_TYPE_SIZE(desc))
7201   {
7202 #ifdef MPI_INTEGER1
7203     case GFC_DTYPE_INTEGER_1:
7204       return MPI_INTEGER1;
7205 #endif
7206 #ifdef MPI_INTEGER2
7207     case GFC_DTYPE_INTEGER_2:
7208       return MPI_INTEGER2;
7209 #endif
7210     case GFC_DTYPE_INTEGER_4:
7211 #ifdef MPI_INTEGER4
7212       return MPI_INTEGER4;
7213 #else
7214       return MPI_INTEGER;
7215 #endif
7216 #ifdef MPI_INTEGER8
7217     case GFC_DTYPE_INTEGER_8:
7218       return MPI_INTEGER8;
7219 #endif
7220 #if defined(MPI_INTEGER16) && defined(GFC_DTYPE_INTEGER_16)
7221     case GFC_DTYPE_INTEGER_16:
7222       return MPI_INTEGER16;
7223 #endif
7224 
7225     case GFC_DTYPE_LOGICAL_4:
7226       return MPI_INT;
7227 
7228     case GFC_DTYPE_REAL_4:
7229 #ifdef MPI_REAL4
7230       return MPI_REAL4;
7231 #else
7232       return MPI_REAL;
7233 #endif
7234     case GFC_DTYPE_REAL_8:
7235 #ifdef MPI_REAL8
7236       return MPI_REAL8;
7237 #else
7238       return MPI_DOUBLE_PRECISION;
7239 #endif
7240 
7241 /* Note that we cannot use REAL_16 as we do not know whether it matches REAL(10)
7242  * or REAL(16), which have both the same bitsize and only make use of less
7243  * bits. */
7244     case GFC_DTYPE_COMPLEX_4:
7245       return MPI_COMPLEX;
7246     case GFC_DTYPE_COMPLEX_8:
7247       return MPI_DOUBLE_COMPLEX;
7248   }
7249 /* gfortran passes character string arguments with a
7250  * GFC_DTYPE_TYPE_SIZE == GFC_TYPE_CHARACTER + 64*strlen */
7251   if ((GFC_DTYPE_TYPE_SIZE(desc) - GFC_DTYPE_CHARACTER) % 64 == 0)
7252   {
7253     MPI_Datatype string;
7254 
7255     if (char_len == 0)
7256       char_len = GFC_DESCRIPTOR_SIZE(desc);
7257     ierr = MPI_Type_contiguous(char_len, MPI_CHARACTER, &string); chk_err(ierr);
7258     ierr = MPI_Type_commit(&string); chk_err(ierr);
7259     return string;
7260   }
7261 
7262   return MPI_BYTE;
7263   /* caf_runtime_error("Unsupported data type in collective: %zd\n", */
7264   /*                   GFC_DTYPE_TYPE_SIZE(desc)); */
7265   /* return 0; */
7266 }
7267 
7268 
7269 static void
7270 internal_co_reduce(MPI_Op op, gfc_descriptor_t *source, int result_image,
7271                    int *stat, char *errmsg, int src_len, size_t errmsg_len)
7272 {
7273   size_t i, size;
7274   int j, ierr, rank = GFC_DESCRIPTOR_RANK(source);
7275   ptrdiff_t dimextent;
7276 
7277   MPI_Datatype datatype = get_MPI_datatype(source, src_len);
7278 
7279   size = 1;
7280   for (j = 0; j < rank; ++j)
7281   {
7282     dimextent = source->dim[j]._ubound - source->dim[j].lower_bound + 1;
7283     if (dimextent < 0)
7284       dimextent = 0;
7285     size *= dimextent;
7286   }
7287 
7288   if (rank == 0 || PREFIX(is_contiguous) (source))
7289   {
7290     if (result_image == 0)
7291     {
7292       ierr = MPI_Allreduce(MPI_IN_PLACE, source->base_addr, size, datatype, op,
7293                            CAF_COMM_WORLD); chk_err(ierr);
7294     }
7295     else if (result_image == caf_this_image)
7296     {
7297       ierr = MPI_Reduce(MPI_IN_PLACE, source->base_addr, size, datatype, op,
7298                         result_image - 1, CAF_COMM_WORLD); chk_err(ierr);
7299     }
7300     else
7301     {
7302       ierr = MPI_Reduce(source->base_addr, NULL, size, datatype, op,
7303                         result_image - 1, CAF_COMM_WORLD); chk_err(ierr);
7304     }
7305     if (ierr)
7306       goto error;
7307     goto co_reduce_cleanup;
7308   }
7309 
7310   for (i = 0; i < size; ++i)
7311   {
7312     ptrdiff_t array_offset_sr = 0, tot_ext = 1, extent = 1;
7313     for (j = 0; j < rank - 1; ++j)
7314     {
7315       extent = source->dim[j]._ubound - source->dim[j].lower_bound + 1;
7316       array_offset_sr += ((i / tot_ext) % extent) * source->dim[j]._stride;
7317       tot_ext *= extent;
7318     }
7319     array_offset_sr += (i / tot_ext) * source->dim[rank - 1]._stride;
7320     void *sr = (void *)((char *)source->base_addr
7321                         + array_offset_sr * GFC_DESCRIPTOR_SIZE(source));
7322     if (result_image == 0)
7323     {
7324       ierr = MPI_Allreduce(MPI_IN_PLACE, sr, 1, datatype, op, CAF_COMM_WORLD);
7325       chk_err(ierr);
7326     }
7327     else if (result_image == caf_this_image)
7328     {
7329       ierr = MPI_Reduce(MPI_IN_PLACE, sr, 1, datatype, op, result_image - 1,
7330                         CAF_COMM_WORLD); chk_err(ierr);
7331     }
7332     else
7333     {
7334       ierr = MPI_Reduce(sr, NULL, 1, datatype, op, result_image - 1,
7335                         CAF_COMM_WORLD); chk_err(ierr);
7336     }
7337     if (ierr)
7338       goto error;
7339   }
7340 
7341 co_reduce_cleanup:
7342   if (GFC_DESCRIPTOR_TYPE(source) == BT_CHARACTER)
7343   {
7344     ierr = MPI_Type_free(&datatype); chk_err(ierr);
7345   }
7346   if (stat)
7347     *stat = 0;
7348   return;
7349 error:
7350   /* FIXME: Put this in an extra function and use it elsewhere. */
7351   if (stat)
7352   {
7353     *stat = ierr;
7354     if (!errmsg)
7355       return;
7356   }
7357 
7358   int len = sizeof(err_buffer);
7359   MPI_Error_string(ierr, err_buffer, &len);
7360   if (!stat)
7361   {
7362     err_buffer[len == sizeof(err_buffer) ? len - 1 : len] = '\0';
7363     caf_runtime_error("CO_SUM failed with %s\n", err_buffer);
7364   }
7365   memcpy(errmsg, err_buffer, (errmsg_len > len) ? len : errmsg_len);
7366   if (errmsg_len > len)
7367     memset(&errmsg[len], '\0', errmsg_len - len);
7368 }
7369 
7370 void
7371 PREFIX(co_broadcast) (gfc_descriptor_t *a, int source_image, int *stat,
7372                       char *errmsg, charlen_t errmsg_len)
7373 {
7374   size_t i, size;
7375   int j, ierr, rank = GFC_DESCRIPTOR_RANK(a);
7376   ptrdiff_t dimextent;
7377 
7378   MPI_Datatype datatype = get_MPI_datatype(a, 0);
7379 
7380   size = 1;
7381   for (j = 0; j < rank; ++j)
7382   {
7383     dimextent = a->dim[j]._ubound - a->dim[j].lower_bound + 1;
7384     if (dimextent < 0)
7385       dimextent = 0;
7386     size *= dimextent;
7387   }
7388 
7389   if (rank == 0)
7390   {
7391     if( datatype == MPI_BYTE)
7392       {
7393 	ierr = MPI_Bcast(a->base_addr, size*GFC_DESCRIPTOR_SIZE(a),
7394 			 datatype, source_image - 1,
7395 			 CAF_COMM_WORLD); chk_err(ierr);
7396       }
7397     else if (datatype != MPI_CHARACTER)
7398     {
7399       ierr = MPI_Bcast(a->base_addr, size, datatype, source_image - 1,
7400                        CAF_COMM_WORLD); chk_err(ierr);
7401     }
7402     else
7403     {
7404       int a_length;
7405       if (caf_this_image == source_image)
7406         a_length = strlen(a->base_addr);
7407       /* Broadcast the string lenth */
7408       ierr = MPI_Bcast(&a_length, 1, MPI_INT, source_image - 1, CAF_COMM_WORLD);
7409       chk_err(ierr);
7410       if (ierr)
7411         goto error;
7412       /* Broadcast the string itself */
7413       ierr = MPI_Bcast(a->base_addr, a_length, datatype, source_image - 1,
7414                        CAF_COMM_WORLD); chk_err(ierr);
7415     }
7416 
7417     if (ierr)
7418       goto error;
7419     goto co_broadcast_exit;
7420   }
7421   else if (datatype == MPI_CHARACTER) /* rank !=0 */
7422   {
7423       caf_runtime_error("Co_broadcast of character arrays are "
7424                         "not yet supported\n");
7425   }
7426 
7427   for (i = 0; i < size; ++i)
7428   {
7429     ptrdiff_t array_offset_sr = 0, tot_ext = 1, extent = 1;
7430     for (j = 0; j < rank - 1; ++j)
7431     {
7432       extent = a->dim[j]._ubound - a->dim[j].lower_bound + 1;
7433       array_offset_sr += ((i / tot_ext) % extent) * a->dim[j]._stride;
7434       tot_ext *= extent;
7435     }
7436     array_offset_sr += (i / tot_ext) * a->dim[rank - 1]._stride;
7437     void *sr = (void *)(
7438       (char *)a->base_addr + array_offset_sr * GFC_DESCRIPTOR_SIZE(a));
7439 
7440     ierr = MPI_Bcast(sr, 1, datatype, source_image - 1, CAF_COMM_WORLD);
7441     chk_err(ierr);
7442 
7443     if (ierr)
7444       goto error;
7445   }
7446 
7447 co_broadcast_exit:
7448   if (stat)
7449     *stat = 0;
7450   if (GFC_DESCRIPTOR_TYPE(a) == BT_CHARACTER)
7451   {
7452     ierr = MPI_Type_free(&datatype); chk_err(ierr);
7453   }
7454   return;
7455 
7456 error:
7457   /* FIXME: Put this in an extra function and use it elsewhere. */
7458   if (stat)
7459   {
7460     *stat = ierr;
7461     if (!errmsg)
7462       return;
7463   }
7464 
7465   int len = sizeof(err_buffer);
7466   MPI_Error_string(ierr, err_buffer, &len);
7467   if (!stat)
7468   {
7469     err_buffer[len == sizeof(err_buffer) ? len - 1 : len] = '\0';
7470     caf_runtime_error("CO_SUM failed with %s\n", err_buffer);
7471   }
7472   memcpy(errmsg, err_buffer, (errmsg_len > len) ? len : errmsg_len);
7473   if (errmsg_len > len)
7474     memset(&errmsg[len], '\0', errmsg_len - len);
7475 }
7476 
7477 /* The front-end function for co_reduce functionality.  It sets up the MPI_Op
7478  * for use in MPI_*Reduce functions. */
7479 void
7480 PREFIX(co_reduce) (gfc_descriptor_t *a, void *(*opr) (void *, void *),
7481                    int opr_flags, int result_image, int *stat, char *errmsg,
7482                    int a_len, charlen_t errmsg_len)
7483 {
7484   MPI_Op op;
7485   int type_a = GFC_DESCRIPTOR_TYPE(a), ierr;
7486   /* Integers and logicals can be treated the same. */
7487   if (type_a == BT_INTEGER || type_a == BT_LOGICAL)
7488   {
7489     /* When the ARG_VALUE opr_flag is set, then the user-function expects its
7490      * arguments to be passed by value. */
7491     if ((opr_flags & GFC_CAF_ARG_VALUE) > 0)
7492     {
7493 #define ifTypeGen(type)                                                   \
7494 if (GFC_DESCRIPTOR_SIZE(a) == sizeof(type ## _t))                         \
7495 {                                                                         \
7496   type ## _t_by_value = (typeof(VALUE_FUNC(type ## _t)))opr;              \
7497   int ierr = MPI_Op_create(redux_ ## type ## _by_value_adapter, 1, &op);  \
7498   chk_err(ierr);                                                          \
7499 }
7500       ifTypeGen(int8)
7501       else ifTypeGen(int16)
7502       else ifTypeGen(int32)
7503       else ifTypeGen(int64)
7504       else
7505       {
7506         caf_runtime_error("CO_REDUCE unsupported integer datatype");
7507       }
7508 #undef ifTypeGen
7509     }
7510     else
7511     {
7512       int32_t_by_reference = (typeof(REFERENCE_FUNC(int32_t)))opr;
7513       ierr = MPI_Op_create(redux_int32_by_reference_adapter, 1, &op);
7514       chk_err(ierr);
7515     }
7516   }
7517   /* Treat reals/doubles. */
7518   else if (type_a == BT_REAL)
7519   {
7520     /* When the ARG_VALUE opr_flag is set, then the user-function expects its
7521      * arguments to be passed by value. */
7522     if (GFC_DESCRIPTOR_SIZE(a) == sizeof(float))
7523     {
7524       if ((opr_flags & GFC_CAF_ARG_VALUE) > 0)
7525       {
7526         float_by_value = (typeof(VALUE_FUNC(float)))opr;
7527         ierr = MPI_Op_create(redux_real32_by_value_adapter, 1, &op);
7528         chk_err(ierr);
7529       }
7530       else
7531       {
7532         float_by_reference = (typeof(REFERENCE_FUNC(float)))opr;
7533         ierr = MPI_Op_create(redux_real32_by_reference_adapter, 1, &op);
7534         chk_err(ierr);
7535       }
7536     }
7537     else
7538     {
7539       /* When the ARG_VALUE opr_flag is set, then the user-function expects
7540        * its arguments to be passed by value. */
7541       if ((opr_flags & GFC_CAF_ARG_VALUE) > 0)
7542       {
7543         double_by_value = (typeof(VALUE_FUNC(double)))opr;
7544         ierr = MPI_Op_create(redux_real64_by_value_adapter, 1, &op);
7545         chk_err(ierr);
7546       }
7547       else
7548       {
7549         double_by_reference = (typeof(REFERENCE_FUNC(double)))opr;
7550         ierr = MPI_Op_create(redux_real64_by_reference_adapter, 1, &op);
7551         chk_err(ierr);
7552       }
7553     }
7554   }
7555   else if (type_a == BT_CHARACTER)
7556   {
7557     /* Char array functions always pass by reference. */
7558     char_by_reference = (typeof(REFERENCE_FUNC(char)))opr;
7559     ierr = MPI_Op_create(redux_char_by_reference_adapter, 1, &op);
7560     chk_err(ierr);
7561   }
7562   else
7563   {
7564     caf_runtime_error("Data type not yet supported for co_reduce\n");
7565   }
7566 
7567   internal_co_reduce(op, a, result_image, stat, errmsg, a_len, errmsg_len);
7568 }
7569 
7570 void
7571 PREFIX(co_sum) (gfc_descriptor_t *a, int result_image, int *stat, char *errmsg,
7572                 charlen_t errmsg_len)
7573 {
7574   internal_co_reduce(MPI_SUM, a, result_image, stat, errmsg, 0, errmsg_len);
7575 }
7576 
7577 
7578 void
7579 PREFIX(co_min) (gfc_descriptor_t *a, int result_image, int *stat, char *errmsg,
7580                 int src_len, charlen_t errmsg_len)
7581 {
7582   internal_co_reduce(MPI_MIN, a, result_image, stat, errmsg, src_len,
7583                      errmsg_len);
7584 }
7585 
7586 
7587 void
7588 PREFIX(co_max) (gfc_descriptor_t *a, int result_image, int *stat,
7589                 char *errmsg, int src_len, charlen_t errmsg_len)
7590 {
7591   internal_co_reduce(MPI_MAX, a, result_image, stat, errmsg, src_len,
7592                      errmsg_len);
7593 }
7594 
7595 
7596 /* Locking functions */
7597 
7598 void
7599 PREFIX(lock) (caf_token_t token, size_t index, int image_index,
7600               int *acquired_lock, int *stat, char *errmsg,
7601               charlen_t errmsg_len)
7602 {
7603   MPI_Win *p = TOKEN(token);
7604   mutex_lock(*p, (image_index == 0) ? caf_this_image : image_index,
7605              index, stat, acquired_lock, errmsg, errmsg_len);
7606 }
7607 
7608 
7609 void
7610 PREFIX(unlock) (caf_token_t token, size_t index, int image_index,
7611                 int *stat, char *errmsg, charlen_t errmsg_len)
7612 {
7613   MPI_Win *p = TOKEN(token);
7614   mutex_unlock(*p, (image_index == 0) ? caf_this_image : image_index,
7615                index, stat, errmsg, errmsg_len);
7616 }
7617 
7618 /* Atomics operations */
7619 
7620 void
7621 PREFIX(atomic_define) (caf_token_t token, size_t offset,
7622                        int image_index, void *value, int *stat,
7623                        int type __attribute__((unused)), int kind)
7624 {
7625   MPI_Win *p = TOKEN(token);
7626   MPI_Datatype dt;
7627   int ierr = 0,
7628       image = (image_index != 0) ? image_index - 1 : caf_this_image - 1;
7629 
7630   selectType(kind, &dt);
7631 
7632 #if MPI_VERSION >= 3
7633   CAF_Win_lock(MPI_LOCK_EXCLUSIVE, image, *p);
7634   ierr = MPI_Accumulate(value, 1, dt, image, offset, 1, dt, MPI_REPLACE, *p);
7635   chk_err(ierr);
7636   CAF_Win_unlock(image, *p);
7637 #else // MPI_VERSION
7638   CAF_Win_lock(MPI_LOCK_EXCLUSIVE, image, *p);
7639   ierr = MPI_Put(value, 1, dt, image, offset, 1, dt, *p); chk_err(ierr);
7640   CAF_Win_unlock(image, *p);
7641 #endif // MPI_VERSION
7642 
7643   if (stat)
7644     *stat = ierr;
7645   else if (ierr != 0)
7646     terminate_internal(ierr, 0);
7647 
7648   return;
7649 }
7650 
7651 void
7652 PREFIX(atomic_ref) (caf_token_t token, size_t offset,
7653                     int image_index,
7654                     void *value, int *stat,
7655                     int type __attribute__((unused)), int kind)
7656 {
7657   MPI_Win *p = TOKEN(token);
7658   MPI_Datatype dt;
7659   int ierr = 0,
7660       image = (image_index != 0) ? image_index - 1 : caf_this_image - 1;
7661 
7662   selectType(kind, &dt);
7663 
7664 #if MPI_VERSION >= 3
7665   CAF_Win_lock(MPI_LOCK_EXCLUSIVE, image, *p);
7666   ierr = MPI_Fetch_and_op(NULL, value, dt, image, offset, MPI_NO_OP, *p);
7667   chk_err(ierr);
7668   CAF_Win_unlock(image, *p);
7669 #else // MPI_VERSION
7670   CAF_Win_lock(MPI_LOCK_EXCLUSIVE, image, *p);
7671   ierr = MPI_Get(value, 1, dt, image, offset, 1, dt, *p); chk_err(ierr);
7672   CAF_Win_unlock(image, *p);
7673 #endif // MPI_VERSION
7674 
7675   if (stat)
7676     *stat = ierr;
7677   else if (ierr != 0)
7678     terminate_internal(ierr, 0);
7679 
7680   return;
7681 }
7682 
7683 void
7684 PREFIX(atomic_cas) (caf_token_t token, size_t offset, int image_index,
7685                     void *old, void *compare, void *new_val, int *stat,
7686                     int type __attribute__((unused)), int kind)
7687 {
7688   MPI_Win *p = TOKEN(token);
7689   MPI_Datatype dt;
7690   int ierr = 0,
7691       image = (image_index != 0) ? image_index - 1 : caf_this_image - 1;
7692 
7693   selectType(kind, &dt);
7694 
7695 #if MPI_VERSION >= 3
7696   CAF_Win_lock(MPI_LOCK_EXCLUSIVE, image, *p);
7697   ierr = MPI_Compare_and_swap(new_val, compare, old, dt, image, offset, *p);
7698   chk_err(ierr);
7699   CAF_Win_unlock(image, *p);
7700 #else // MPI_VERSION
7701 #warning atomic_cas for MPI-2 is not yet implemented
7702   printf("We apologize but atomic_cas for MPI-2 is not yet implemented\n");
7703   ierr = 1;
7704 #endif // MPI_VERSION
7705 
7706   if (stat)
7707     *stat = ierr;
7708   else if (ierr != 0)
7709     terminate_internal(ierr, 0);
7710 
7711   return;
7712 }
7713 
7714 void
7715 PREFIX(atomic_op) (int op, caf_token_t token, size_t offset, int image_index,
7716                    void *value, void *old, int *stat,
7717                    int type __attribute__((unused)), int kind)
7718 {
7719   int ierr = 0;
7720   MPI_Datatype dt;
7721   MPI_Win *p = TOKEN(token);
7722   int image = (image_index != 0) ? image_index - 1 : caf_this_image - 1;
7723 
7724 #if MPI_VERSION >= 3
7725   old = malloc(kind);
7726   selectType(kind, &dt);
7727 
7728   CAF_Win_lock(MPI_LOCK_EXCLUSIVE, image, *p);
7729   /* Atomic_add */
7730   switch(op) {
7731     case 1:
7732       ierr = MPI_Fetch_and_op(value, old, dt, image, offset, MPI_SUM, *p);
7733       chk_err(ierr);
7734       break;
7735     case 2:
7736       ierr = MPI_Fetch_and_op(value, old, dt, image, offset, MPI_BAND, *p);
7737       chk_err(ierr);
7738       break;
7739     case 4:
7740       ierr = MPI_Fetch_and_op(value, old, dt, image, offset, MPI_BOR, *p);
7741       chk_err(ierr);
7742       break;
7743     case 5:
7744       ierr = MPI_Fetch_and_op(value, old, dt, image, offset, MPI_BXOR, *p);
7745       chk_err(ierr);
7746       break;
7747     default:
7748       printf("We apologize but the atomic operation requested for MPI < 3 "
7749              "is not yet implemented\n");
7750       break;
7751     }
7752   CAF_Win_unlock(image, *p);
7753 
7754   free(old);
7755 #else // MPI_VERSION
7756   #warning atomic_op for MPI is not yet implemented
7757   printf("We apologize but atomic_op for MPI < 3 is not yet implemented\n");
7758 #endif // MPI_VERSION
7759   if (stat)
7760     *stat = ierr;
7761   else if (ierr != 0)
7762     terminate_internal(ierr, 0);
7763 
7764   return;
7765 }
7766 
7767 /* Events */
7768 
7769 void
7770 PREFIX(event_post) (caf_token_t token, size_t index, int image_index,
7771                     int *stat, char *errmsg, charlen_t errmsg_len)
7772 {
7773   int value = 1, ierr = 0, flag;
7774   MPI_Win *p = TOKEN(token);
7775   const char msg[] = "Error on event post";
7776   int image = (image_index == 0) ? caf_this_image - 1 : image_index - 1;
7777 
7778   if (stat != NULL)
7779     *stat = 0;
7780 
7781 #if MPI_VERSION >= 3
7782   CAF_Win_lock(MPI_LOCK_EXCLUSIVE, image, *p);
7783   ierr = MPI_Accumulate(&value, 1, MPI_INT, image, index * sizeof(int), 1,
7784                         MPI_INT, MPI_SUM, *p); chk_err(ierr);
7785   CAF_Win_unlock(image, *p);
7786 #else // MPI_VERSION
7787   #warning Events for MPI-2 are not implemented
7788   printf("Events for MPI-2 are not supported, "
7789          "please update your MPI implementation\n");
7790 #endif // MPI_VERSION
7791 
7792   check_image_health(image_index, stat);
7793 
7794   if (!stat && ierr == STAT_FAILED_IMAGE)
7795     terminate_internal(ierr, 0);
7796 
7797   if (ierr != MPI_SUCCESS)
7798   {
7799     if (stat != NULL)
7800       *stat = ierr;
7801     if (errmsg != NULL)
7802     {
7803       memset(errmsg,' ',errmsg_len);
7804       memcpy(errmsg, msg, MIN(errmsg_len,strlen(msg)));
7805     }
7806   }
7807 }
7808 
7809 void
7810 PREFIX(event_wait) (caf_token_t token, size_t index, int until_count,
7811                     int *stat, char *errmsg, charlen_t errmsg_len)
7812 {
7813   int ierr = 0, count = 0, i, image = caf_this_image - 1;
7814   int *var = NULL, flag, old = 0, newval = 0;
7815   const int spin_loop_max = 20000;
7816   MPI_Win *p = TOKEN(token);
7817   const char msg[] = "Error on event wait";
7818 
7819   if (stat != NULL)
7820     *stat = 0;
7821 
7822   ierr = MPI_Win_get_attr(*p, MPI_WIN_BASE, &var, &flag); chk_err(ierr);
7823 
7824   for (i = 0; i < spin_loop_max; ++i)
7825   {
7826     ierr = MPI_Win_sync(*p); chk_err(ierr);
7827     count = var[index];
7828     if (count >= until_count)
7829       break;
7830   }
7831 
7832   i = 1;
7833   while (count < until_count)
7834   {
7835     ierr = MPI_Win_sync(*p); chk_err(ierr);
7836     count = var[index];
7837     usleep(10 * i);
7838     ++i;
7839     /* Needed to enforce MPI progress */
7840     ierr = MPI_Win_flush(image, *p); chk_err(ierr);
7841   }
7842 
7843   newval = -until_count;
7844 
7845   CAF_Win_lock(MPI_LOCK_EXCLUSIVE, image, *p);
7846   ierr = MPI_Fetch_and_op(&newval, &old, MPI_INT, image, index * sizeof(int),
7847                           MPI_SUM, *p); chk_err(ierr);
7848   CAF_Win_unlock(image, *p);
7849   check_image_health(image, stat);
7850 
7851   if (!stat && ierr == STAT_FAILED_IMAGE)
7852     terminate_internal(ierr, 0);
7853 
7854   if (ierr != MPI_SUCCESS)
7855   {
7856     if (stat != NULL)
7857       *stat = ierr;
7858     if (errmsg != NULL)
7859     {
7860       memset(errmsg,' ',errmsg_len);
7861       memcpy(errmsg, msg, MIN(errmsg_len,strlen(msg)));
7862     }
7863   }
7864 }
7865 
7866 void
7867 PREFIX(event_query) (caf_token_t token, size_t index,
7868                      int image_index, int *count, int *stat)
7869 {
7870   MPI_Win *p = TOKEN(token);
7871   int ierr = 0,
7872       image = (image_index == 0) ? caf_this_image - 1 : image_index - 1;
7873 
7874   if (stat != NULL)
7875     *stat = 0;
7876 
7877 #if MPI_VERSION >= 3
7878   CAF_Win_lock(MPI_LOCK_EXCLUSIVE, image, *p);
7879   ierr = MPI_Fetch_and_op(NULL, count, MPI_INT, image, index * sizeof(int),
7880                           MPI_NO_OP, *p); chk_err(ierr);
7881   CAF_Win_unlock(image, *p);
7882 #else // MPI_VERSION
7883 #warning Events for MPI-2 are not implemented
7884   printf("Events for MPI-2 are not supported, "
7885          "please update your MPI implementation\n");
7886 #endif // MPI_VERSION
7887   if (ierr != MPI_SUCCESS && stat != NULL)
7888     *stat = ierr;
7889 }
7890 
7891 
7892 /* Internal function to execute the part that is common to all (error) stop
7893  * functions. */
7894 
7895 static void
7896 terminate_internal(int stat_code, int exit_code)
7897 {
7898   dprint("terminate_internal (stat_code = %d, exit_code = %d).\n",
7899          stat_code, exit_code);
7900   finalize_internal(stat_code);
7901 
7902 #ifndef WITH_FAILED_IMAGES
7903   MPI_Abort(MPI_COMM_WORLD, exit_code);
7904 #endif
7905   exit(exit_code);
7906 }
7907 
7908 
7909 #ifdef GCC_GE_8
7910 #undef QUIETARG
7911 #define QUIETARG , bool quiet
7912 #endif
7913 
7914 /* STOP function for integer arguments. */
7915 
7916 void
7917 PREFIX(stop_numeric) (int stop_code QUIETARG)
7918 {
7919 #ifndef GCC_GE_8
7920   bool quiet = false;
7921 #endif
7922   if (!quiet)
7923     fprintf(stderr, "STOP %d\n", stop_code);
7924 
7925   /* Stopping includes taking down the runtime regularly and returning the
7926    * stop_code. */
7927   terminate_internal(STAT_STOPPED_IMAGE, stop_code);
7928 }
7929 
7930 
7931 /* STOP function for string arguments. */
7932 
7933 void
7934 PREFIX(stop_str) (const char *string, charlen_t len QUIETARG)
7935 {
7936 #ifndef GCC_GE_8
7937   bool quiet = false;
7938 #endif
7939   if (!quiet)
7940   {
7941     fputs("STOP ", stderr);
7942     while (len--)
7943       fputc(*(string++), stderr);
7944     fputs("\n", stderr);
7945   }
7946   /* Stopping includes taking down the runtime regularly. */
7947   terminate_internal(STAT_STOPPED_IMAGE, 0);
7948 }
7949 
7950 
7951 /* ERROR STOP function for string arguments. */
7952 
7953 static void
7954 error_stop_str(const char *string, size_t len, bool quiet)
7955 {
7956   if (!quiet)
7957   {
7958     fputs("ERROR STOP ", stderr);
7959     while (len--)
7960       fputc(*(string++), stderr);
7961     fputs("\n", stderr);
7962   }
7963   terminate_internal(STAT_STOPPED_IMAGE, 1);
7964 }
7965 
7966 
7967 void
7968 PREFIX(error_stop_str) (const char *string, charlen_t len QUIETARG)
7969 {
7970 #ifndef GCC_GE_8
7971   bool quiet = false;
7972 #endif
7973   error_stop_str(string, len, quiet);
7974 }
7975 
7976 
7977 /* ERROR STOP function for numerical arguments. */
7978 
7979 void
7980 PREFIX(error_stop) (int error QUIETARG)
7981 {
7982 #ifndef GCC_GE_8
7983   bool quiet = false;
7984 #endif
7985   if (!quiet)
7986     fprintf(stderr, "ERROR STOP %d\n", error);
7987 
7988   terminate_internal(STAT_STOPPED_IMAGE, error);
7989 }
7990 
7991 
7992 /* FAIL IMAGE statement. */
7993 
7994 void
7995 PREFIX(fail_image) (void)
7996 {
7997   fputs("IMAGE FAILED!\n", stderr);
7998 
7999   raise(SIGKILL);
8000   /* A failing image is expected to take down the runtime regularly. */
8001   terminate_internal(STAT_FAILED_IMAGE, 0);
8002 }
8003 
8004 int
8005 PREFIX(image_status) (int image)
8006 {
8007 #ifdef GFC_CAF_CHECK
8008   if (image < 1 || image > caf_num_images)
8009   {
8010     char errmsg[60];
8011     sprintf(errmsg, "Image #%d out of bounds of images 1..%d.",
8012             image, caf_num_images);
8013     caf_runtime_error(errmsg);
8014   }
8015 #endif
8016 #ifdef WITH_FAILED_IMAGES
8017   if (image_stati[image - 1] == 0)
8018   {
8019     int status, ierr;
8020     /* Check that we are fine before doing anything.
8021      *
8022      * Do an MPI-operation to learn about failed/stopped images, that have
8023      * not been detected yet. */
8024     ierr = MPI_Test(&alive_request, &status, MPI_STATUSES_IGNORE);
8025     chk_err(ierr);
8026     MPI_Error_class(ierr, &status);
8027     if (ierr == MPI_SUCCESS)
8028     {
8029       CAF_Win_lock(MPI_LOCK_SHARED, image - 1, *stat_tok);
8030       ierr = MPI_Get(&status, 1, MPI_INT, image - 1, 0, 1, MPI_INT, *stat_tok);
8031       chk_err(ierr);
8032       dprint("Image status of image #%d is: %d\n", image, status);
8033       CAF_Win_unlock(image - 1, *stat_tok);
8034       image_stati[image - 1] = status;
8035     }
8036     else if (status == MPIX_ERR_PROC_FAILED)
8037     {
8038       image_stati[image - 1] = STAT_FAILED_IMAGE;
8039     }
8040     else
8041     {
8042       const int strcap = 200;
8043       char errmsg[strcap];
8044       int slen, supplied_len;
8045       sprintf(errmsg, "Image status for image #%d returned mpi error: ",
8046               image);
8047       slen = strlen(errmsg);
8048       supplied_len = strcap - slen;
8049       MPI_Error_string(status, &errmsg[slen], &supplied_len);
8050       caf_runtime_error(errmsg);
8051     }
8052   }
8053   return image_stati[image - 1];
8054 #else
8055   unsupported_fail_images_message("IMAGE_STATUS()");
8056 #endif // WITH_FAILED_IMAGES
8057 
8058   return 0;
8059 }
8060 
8061 void
8062 PREFIX(failed_images) (gfc_descriptor_t *array,
8063                        int team __attribute__((unused)), int * kind)
8064 {
8065   int local_kind = kind ? *kind : 4; /* GFC_DEFAULT_INTEGER_KIND = 4*/
8066 
8067 #ifdef WITH_FAILED_IMAGES
8068   void *mem = calloc(num_images_failed, local_kind);
8069   array->base_addr = mem;
8070   for (int i = 0; i < caf_num_images; ++i)
8071   {
8072     if (image_stati[i] == STAT_FAILED_IMAGE)
8073     {
8074       switch (local_kind)
8075       {
8076         case 1:
8077           *(int8_t *)mem = i + 1;
8078           break;
8079         case 2:
8080           *(int16_t *)mem = i + 1;
8081           break;
8082         case 4:
8083           *(int32_t *)mem = i + 1;
8084           break;
8085         case 8:
8086           *(int64_t *)mem = i + 1;
8087           break;
8088 #ifdef HAVE_GFC_INTEGER_16
8089         case 16:
8090           *(int128t *)mem = i + 1;
8091           break;
8092 #endif
8093         default:
8094           caf_runtime_error("Unsupported integer kind %1 "
8095                             "in caf_failed_images.", local_kind);
8096       }
8097       mem += local_kind;
8098     }
8099   }
8100   array->dim[0]._ubound = num_images_failed - 1;
8101 #else
8102   unsupported_fail_images_message("FAILED_IMAGES()");
8103   array->dim[0]._ubound = -1;
8104   array->base_addr = NULL;
8105 #endif // WITH_FAILED_IMAGES
8106 
8107 #ifdef GCC_GE_8
8108   array->dtype.type = BT_INTEGER;
8109   array->dtype.elem_len = local_kind;
8110 #else
8111   array->dtype = ((BT_INTEGER << GFC_DTYPE_TYPE_SHIFT)
8112                   | (local_kind << GFC_DTYPE_SIZE_SHIFT));
8113 #endif
8114   array->dim[0].lower_bound = 0;
8115   array->dim[0]._stride = 1;
8116   array->offset = 0;
8117 }
8118 
8119 void
8120 PREFIX(stopped_images) (gfc_descriptor_t *array,
8121                         int team __attribute__((unused)), int * kind)
8122 {
8123   int local_kind = kind ? *kind : 4; /* GFC_DEFAULT_INTEGER_KIND = 4*/
8124 
8125 #ifdef WITH_FAILED_IMAGES
8126   void *mem = calloc(num_images_stopped, local_kind);
8127   array->base_addr = mem;
8128   for (int i = 0; i < caf_num_images; ++i)
8129   {
8130     if (image_stati[i])
8131     {
8132       switch (local_kind)
8133       {
8134         case 1:
8135           *(int8_t *)mem = i + 1;
8136           break;
8137         case 2:
8138           *(int16_t *)mem = i + 1;
8139           break;
8140         case 4:
8141           *(int32_t *)mem = i + 1;
8142           break;
8143         case 8:
8144           *(int64_t *)mem = i + 1;
8145           break;
8146 #ifdef HAVE_GFC_INTEGER_16
8147         case 16:
8148           *(int128t *)mem = i + 1;
8149           break;
8150 #endif
8151         default:
8152           caf_runtime_error("Unsupported integer kind %1 "
8153                             "in caf_stopped_images.", local_kind);
8154       }
8155       mem += local_kind;
8156     }
8157   }
8158   array->dim[0]._ubound = num_images_stopped - 1;
8159 #else
8160   unsupported_fail_images_message("STOPPED_IMAGES()");
8161   array->dim[0]._ubound = -1;
8162   array->base_addr = NULL;
8163 #endif // WITH_FAILED_IMAGES
8164 
8165 #ifdef GCC_GE_8
8166   array->dtype.type = BT_INTEGER;
8167   array->dtype.elem_len = local_kind;
8168 #else
8169   array->dtype = ((BT_INTEGER << GFC_DTYPE_TYPE_SHIFT)
8170                   | (local_kind << GFC_DTYPE_SIZE_SHIFT));
8171 #endif
8172   array->dim[0].lower_bound = 0;
8173   array->dim[0]._stride = 1;
8174   array->offset = 0;
8175 }
8176 
8177 /* Give a descriptive message when failed images support is not available. */
8178 void
8179 unsupported_fail_images_message(const char * functionname)
8180 {
8181   fprintf(stderr,
8182           "*** caf_mpi-lib runtime message on image %d:\n"
8183           "*** The failed images feature '%s' "
8184           "*** of Fortran 2015 standard is not available in this build."
8185           "*** You need a compiler with failed images support activated and "
8186           "*** compile OpenCoarrays with failed images support.\n",
8187           caf_this_image, functionname);
8188 #ifdef STOP_ON_UNSUPPORTED
8189   exit(EXIT_FAILURE);
8190 #endif
8191 }
8192 
8193 /* Give a descriptive message when support for an allocatable components
8194  * feature is not available. */
8195 void
8196 unimplemented_alloc_comps_message(const char * functionname)
8197 {
8198   fprintf(stderr,
8199           "*** Message from libcaf_mpi runtime function '%s' on image %d:\n"
8200           "*** Assigning to an allocatable coarray component of a derived type"
8201           "is not yet supported with GCC 7.\n"
8202           "*** Either revert to GCC 6 or convert all "
8203           "puts (type(foo)::x; x%%y[recipient] = z) to "
8204           "gets (z = x%%y[provider]).\n",
8205           functionname, caf_this_image );
8206 #ifdef STOP_ON_UNSUPPORTED
8207   exit(EXIT_FAILURE);
8208 #endif
8209 }
8210 
8211 void PREFIX(form_team) (int team_id, caf_team_t *team,
8212                         int index __attribute__((unused)))
8213 {
8214   struct caf_teams_list *tmp;
8215   void * tmp_team;
8216   MPI_Comm *newcomm;
8217   MPI_Comm *current_comm = &CAF_COMM_WORLD;
8218   int ierr;
8219 
8220   newcomm = (MPI_Comm *)calloc(1,sizeof(MPI_Comm));
8221   ierr = MPI_Comm_split(*current_comm, team_id, caf_this_image, newcomm);
8222   chk_err(ierr);
8223 
8224   tmp = calloc(1,sizeof(struct caf_teams_list));
8225   tmp->prev = teams_list;
8226   teams_list = tmp;
8227   teams_list->team_id = team_id;
8228   teams_list->team = newcomm;
8229   *team = tmp;
8230 }
8231 
8232 void PREFIX(change_team) (caf_team_t *team,
8233                           int coselector __attribute__((unused)))
8234 {
8235   caf_used_teams_list *tmp_used = NULL;
8236   caf_teams_list * tmp_list = NULL;
8237   void *tmp_team;
8238   MPI_Comm *tmp_comm;
8239 
8240   tmp_list = (struct caf_teams_list *)*team;
8241   tmp_team = (void *)tmp_list->team;
8242   tmp_comm = (MPI_Comm *)tmp_team;
8243 
8244   tmp_used = (caf_used_teams_list *)calloc(1,sizeof(caf_used_teams_list));
8245   tmp_used->prev = used_teams;
8246 
8247   /* We need to look in the teams_list and find the appropriate element.
8248    * This is not efficient but can be easily fixed in the future.
8249    * Instead of keeping track of the communicator in the compiler
8250    * we should keep track of the caf_teams_list element associated with it. */
8251 
8252   /*
8253   tmp_list = teams_list;
8254 
8255   while (tmp_list)
8256   {
8257     if (tmp_list->team == tmp_team)
8258       break;
8259     tmp_list = tmp_list->prev;
8260   }
8261   */
8262 
8263   if (tmp_list == NULL)
8264     caf_runtime_error("CHANGE TEAM called on a non-existing team");
8265 
8266   tmp_used->team_list_elem = tmp_list;
8267   used_teams = tmp_used;
8268   tmp_team = tmp_used->team_list_elem->team;
8269   tmp_comm = (MPI_Comm *)tmp_team;
8270   CAF_COMM_WORLD = *tmp_comm;
8271   int ierr = MPI_Comm_rank(*tmp_comm,&caf_this_image); chk_err(ierr);
8272   caf_this_image++;
8273   ierr = MPI_Comm_size(*tmp_comm,&caf_num_images); chk_err(ierr);
8274   ierr = MPI_Barrier(*tmp_comm); chk_err(ierr);
8275 }
8276 
8277 MPI_Fint
8278 PREFIX(get_communicator) (caf_team_t *team)
8279 {
8280   if (team != NULL) caf_runtime_error("get_communicator does not yet support "
8281                                       "the optional team argument");
8282 
8283   MPI_Comm* comm_ptr = teams_list->team;
8284   MPI_Fint ret = MPI_Comm_c2f(*comm_ptr);
8285 
8286   return ret;
8287   // return  *(int*)comm_ptr;
8288 }
8289 
8290 int
8291 PREFIX(team_number) (caf_team_t *team)
8292 {
8293   if (team != NULL)
8294     return ((caf_teams_list *)team)->team_id;
8295   else
8296     return used_teams->team_list_elem->team_id; /* current team */
8297 }
8298 
8299 void PREFIX(end_team) (caf_team_t *team __attribute__((unused)))
8300 {
8301   caf_used_teams_list *tmp_used = NULL;
8302   void *tmp_team;
8303   MPI_Comm *tmp_comm;
8304   int ierr;
8305 
8306   ierr = MPI_Barrier(CAF_COMM_WORLD); chk_err(ierr);
8307   if (used_teams->prev == NULL)
8308     caf_runtime_error("END TEAM called on initial team");
8309 
8310   tmp_used = used_teams;
8311   used_teams = used_teams->prev;
8312   free(tmp_used);
8313   tmp_used = used_teams;
8314   tmp_team = tmp_used->team_list_elem->team;
8315   tmp_comm = (MPI_Comm *)tmp_team;
8316   CAF_COMM_WORLD = *tmp_comm;
8317   /* CAF_COMM_WORLD = (MPI_Comm)*tmp_used->team_list_elem->team; */
8318   ierr = MPI_Comm_rank(CAF_COMM_WORLD,&caf_this_image); chk_err(ierr);
8319   caf_this_image++;
8320   ierr = MPI_Comm_size(CAF_COMM_WORLD,&caf_num_images); chk_err(ierr);
8321 }
8322 
8323 void PREFIX(sync_team) (caf_team_t *team , int unused __attribute__((unused)))
8324 {
8325   caf_teams_list *tmp_list = NULL;
8326   caf_used_teams_list *tmp_used = NULL;
8327   void *tmp_team;
8328   MPI_Comm *tmp_comm;
8329 
8330   tmp_used = used_teams;
8331   tmp_list = (struct caf_teams_list *)*team;
8332   tmp_team = (void *)tmp_list->team;
8333   tmp_comm = (MPI_Comm *)tmp_team;
8334 
8335   /* if the team is not a child */
8336   if (tmp_used->team_list_elem != tmp_list->prev)
8337   /* then search backwards through the team list, first checking if it's the
8338    * current team, then if it is an ancestor team */
8339     while (tmp_used)
8340     {
8341       if (tmp_used->team_list_elem == tmp_list)
8342         break;
8343       tmp_used = tmp_used->prev;
8344     }
8345 
8346   if (tmp_used == NULL)
8347     caf_runtime_error("SYNC TEAM called on team different from current, "
8348                       "or ancestor, or child");
8349 
8350   int ierr = MPI_Barrier(*tmp_comm); chk_err(ierr);
8351 }
8352