1 /* begin_generated_IBM_copyright_prolog                             */
2 /*                                                                  */
3 /* This is an automatically generated copyright prolog.             */
4 /* After initializing,  DO NOT MODIFY OR MOVE                       */
5 /*  --------------------------------------------------------------- */
6 /* Licensed Materials - Property of IBM                             */
7 /* Blue Gene/Q 5765-PER 5765-PRP                                    */
8 /*                                                                  */
9 /* (C) Copyright IBM Corp. 2011, 2012 All Rights Reserved           */
10 /* US Government Users Restricted Rights -                          */
11 /* Use, duplication, or disclosure restricted                       */
12 /* by GSA ADP Schedule Contract with IBM Corp.                      */
13 /*                                                                  */
14 /*  --------------------------------------------------------------- */
15 /*                                                                  */
16 /* end_generated_IBM_copyright_prolog                               */
17 /*  (C)Copyright IBM Corp.  2007, 2011  */
18 /**
19  * \file include/mpidi_datatypes.h
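 * \brief Device-level (MPIDI) type definitions: process state, message headers, and the device portions of requests, communicators and windows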
21  */
22 /*
23  *  (C) 2001 by Argonne National Laboratory.
24  *      See COPYRIGHT in top-level directory.
25  */
26 
27 
28 #ifndef __include_mpidi_datatypes_h__
29 #define __include_mpidi_datatypes_h__
30 
31 #ifdef MPIDI_STATISTICS
32 #include <pami_ext_pe.h>
33 #endif
34 
35 #include "mpidi_constants.h"
36 #include "mpidi_platform.h"
37 #include "pami.h"
38 
#if (MPIU_HANDLE_ALLOCATION_METHOD == MPIU_HANDLE_ALLOCATION_THREAD_LOCAL) && defined(__BGQ__)
struct MPID_Request;

/**
 * \brief A request pointer paired with a count.
 *
 * Only compiled when the thread-local handle allocation method is selected
 * and __BGQ__ is defined.  MPIDI_Process_t embeds an array of
 * MPIDI_MAX_THREADS of these.
 */
typedef struct
{
  struct MPID_Request *head;  /**< NOTE(review): presumably the first request of a per-thread list - confirm */
  size_t               count; /**< NOTE(review): presumably the number of requests reachable from head - confirm */
} MPIDI_RequestHandle_t;
#endif
47 
48 
49 /**
50  * \brief MPI Process descriptor
51  *
52  * This structure contains global configuration flags.
53  */
54 typedef struct
55 {
56   unsigned avail_contexts;
57   unsigned short_limit;
58   unsigned eager_limit;
59   unsigned eager_limit_local;
60 #if (MPIDI_STATISTICS || MPIDI_PRINTENV)
61   unsigned mp_infolevel;
62   unsigned mp_statistics;     /* print pamid statistcs data                           */
63   unsigned mp_printenv; ;     /* print env data                                       */
64 #endif
65 #ifdef RDMA_FAILOVER
66   unsigned mp_s_use_pami_get; /* force the PAMI_Get path instead of PAMI_Rget         */
67 #endif
68 
69 #if (MPIU_HANDLE_ALLOCATION_METHOD == MPIU_HANDLE_ALLOCATION_THREAD_LOCAL) && defined(__BGQ__)
70   MPIDI_RequestHandle_t request_handles[MPIDI_MAX_THREADS];
71 #endif
72 
73   unsigned verbose;        /**< The current level of verbosity for end-of-job stats. */
74   unsigned statistics;     /**< The current level of stats collection.               */
75   unsigned rma_pending;    /**< The max num outstanding requests during an RMA op    */
76   unsigned shmem_pt2pt;    /**< Enable optimized shared memory point-to-point functions. */
77 
78   pami_geometry_t world_geometry;
79 
80   struct
81   {
82     unsigned collectives;  /**< Enable optimized collective functions. */
83     unsigned subcomms;
84     unsigned select_colls; /**< Enable collective selection */
85   }
86   optimized;
87 
88   struct
89   {
90     volatile unsigned active;  /**< Number of contexts with active async progress */
91     unsigned          mode;    /**< 0 == 'disabled', 1 == 'locked', 2 == 'trigger' */
92   }
93   async_progress;
94 
95   struct
96   {
97     struct
98     {
99       unsigned requested;    /**< 1 == application requests context post */
100       unsigned active;       /**< 1 == context post is currently required */
101     } context_post;
102   } perobj;                  /**< This structure is only used in the 'perobj' mpich lock mode. */
103 
104 } MPIDI_Process_t;
105 
106 
/**
 * \brief Protocol identifiers.
 *
 * Values are consecutive starting at zero.  MPIDI_Protocols_COUNT is the
 * final enumerator, so it equals the number of identifiers listed before it.
 */
enum
{
  MPIDI_Protocols_Short = 0,
  MPIDI_Protocols_ShortSync,
  MPIDI_Protocols_Eager,
  MPIDI_Protocols_RVZ,
  MPIDI_Protocols_Cancel,
  MPIDI_Protocols_Control,
  MPIDI_Protocols_WinCtrl,
  MPIDI_Protocols_WinAccum,
  MPIDI_Protocols_RVZ_zerobyte,
  MPIDI_Protocols_COUNT
};
120 
121 
/**
 * \brief Type of message being sent/received by a request.
 *
 * mpid_startall() invokes the correct start based on the type of the request.
 */
typedef enum
{
  MPIDI_REQUEST_PTYPE_RECV = 0,
  MPIDI_REQUEST_PTYPE_SEND,
  MPIDI_REQUEST_PTYPE_BSEND,
  MPIDI_REQUEST_PTYPE_SSEND
} MPIDI_REQUEST_PTYPE;
133 
134 
/**
 * \brief Control message kinds.
 *
 * NOTE(review): MPIDI_MsgInfo has a 3-bit 'control' bit-field described as
 * the message type for control protocols; if it stores these values, at most
 * eight enumerators fit - confirm before adding more.
 */
typedef enum
{
  MPIDI_CONTROL_SSEND_ACKNOWLEDGE = 0,
  MPIDI_CONTROL_CANCEL_REQUEST,
  MPIDI_CONTROL_CANCEL_ACKNOWLEDGE,
  MPIDI_CONTROL_CANCEL_NOT_ACKNOWLEDGE,
  MPIDI_CONTROL_RENDEZVOUS_ACKNOWLEDGE
} MPIDI_CONTROL;
143 
144 
/**
 * \brief Request completion actions
 */
typedef enum
{
  MPIDI_CA_COMPLETE = 0,
  MPIDI_CA_UNPACK_UEBUF_AND_COMPLETE /**< Unpack uebuf, then complete. */
} MPIDI_CA;
151 
152 
/**
 * \brief Everything needed to match an MPI message.
 *
 * The sequence number member is present only when OUT_OF_ORDER_HANDLING
 * is defined.
 */
typedef struct
{
  int tag;         /**< match tag     */
  int rank;        /**< match rank    */
  int context_id;  /**< match context */
#ifdef OUT_OF_ORDER_HANDLING
  int seqno;       /**< match seqno   */
#endif
} MPIDI_Message_match;
166 
167 
168 /**
169  * \brief MPID pt2pt message header
170  */
171 typedef struct
172 {
173   MPI_Request req;         /**< peer's request handle  */
174   unsigned    MPItag;      /**< match tag              */
175   unsigned    MPIrank;     /**< match rank             */
176   uint16_t    MPIctxt;     /**< match context          */
177 
178   union {
179     uint16_t  flags;
180     struct {
181       unsigned control:3;  /**< message type for control protocols */
182       unsigned isSync:1;   /**< set for sync sends     */
183       unsigned isRzv :1;   /**< use pt2pt rendezvous   */
184     } __attribute__ ((__packed__));
185   };
186 
187 #ifdef OUT_OF_ORDER_HANDLING
188   unsigned    MPIseqno;    /**< match seqno            */
189 #endif
190 } MPIDI_MsgInfo;
191 
192 /** \brief Full Rendezvous msg info to be set as two quads of unexpected data. */
193 typedef struct
194 {
195   MPIDI_MsgInfo    msginfo;
196   pami_memregion_t memregion;
197 #ifdef RDMA_FAILOVER
198   uint32_t         memregion_used;
199 #endif
200   void           * data;
201   size_t           length;
202 } MPIDI_MsgEnvelope;
203 
204 /** \brief This defines the portion of MPID_Request that is specific to the Device */
205 struct MPIDI_Request
206 {
207   struct MPID_Request  *next;         /**< Link to next req. in queue */
208   struct MPID_Datatype *datatype_ptr; /**< Info about the datatype    */
209   pami_work_t           post_request; /**<                            */
210 
211   MPIDI_MsgEnvelope     envelope;
212 
213   void                 *userbuf;      /**< User buffer                */
214   unsigned              userbufcount; /**< Userbuf data count         */
215   MPI_Datatype          datatype;     /**< Data type of message       */
216   pami_task_t           peer_pami;    /**< The other guy's rank (in PAMI) */
217   unsigned              peer_comm;    /**< The other guy's rank (in the orig communicator) */
218   unsigned            cancel_pending:16; /**< Cancel status              */
219   unsigned            uebuf_malloc:16;   /**< does uebuf require free()  */
220 
221   unsigned              uebuflen;     /**< Length (bytes) of uebuf    */
222   void                 *uebuf;        /**< Unexpected buffer          */
223 
224   MPIDI_REQUEST_PTYPE   ptype;        /**< The persistent msg type    */
225   MPIDI_CA              ca;           /**< Completion action          */
226   pami_memregion_t      memregion;    /**< Rendezvous recv memregion  */
227 #ifdef OUT_OF_ORDER_HANDLING
228   struct MPID_Request  *prev;         /**< Link to prev req. in queue */
229   void                 *nextR;        /** < pointer to next recv for the out-of-order list, the out-of-order list is a list per source */
230   void                 *prevR;        /** < pointer to prev recv for the out-of-order list, the out-of-order list is a list per source */
231   struct MPID_Request  *oo_peer;      /** < pointer to the matched post recv request to complete in the out-of-order case */
232 #endif
233 #ifdef RDMA_FAILOVER
234   uint32_t             memregion_used:16;
235   uint32_t             shm:16;
236 #endif
237 #ifdef MPIDI_TRACE
238   int   cur_nMsgs;
239   int   partner_id;
240   int   idx;
241   int   PR_idx;
242 #endif
243 };
244 
245 
246 /** \brief This defines the portion of MPID_Comm that is specific to the Device */
247 struct MPIDI_Comm
248 {
249   pami_geometry_t geometry; /**< Geometry component for collectives      */
250   pami_geometry_t parent; /**< The parent geometry this communicator came from */
251   pami_algorithm_t *coll_algorithm[PAMI_XFER_COUNT][2];
252   pami_metadata_t *coll_metadata[PAMI_XFER_COUNT][2];
253   char coll_count[PAMI_XFER_COUNT][2];
254   pami_algorithm_t user_selected[PAMI_XFER_COUNT];
255   /* no way to tell if user_selected[] is NULL */
256   /* could probably union these two though? */
257   char user_selected_type[PAMI_XFER_COUNT];
258   pami_metadata_t user_metadata[PAMI_XFER_COUNT];
259   char last_algorithm[100];
260   char preallreduces[MPID_NUM_PREALLREDUCES];
261   /* \todo Need to figure out how to deal with algorithms above the pami level */
262   char allgathers[4];
263   char allgathervs[4];
264   char scattervs[2];
265   char optgather, optscatter;
266 
267   /* These need to be freed at geom destroy, so we need to store them
268    * inside the communicator struct until destroy time rather than
269    * allocating pointers on the stack
270    */
271   /* For create_taskrange */
272   pami_geometry_range_t *ranges;
273   /* For create_tasklist/endpoints if we ever use it */
274   pami_task_t *tasks;
275   pami_endpoint_t *endpoints;
276    /* There are some protocols where the optimized protocol always works and
277     * is the best performance */
278    /* Assume we have small vs large cutoffs vs medium for some protocols */
279    pami_algorithm_t opt_protocol[PAMI_XFER_COUNT][2];
280    int must_query[PAMI_XFER_COUNT][2];
281    pami_metadata_t opt_protocol_md[PAMI_XFER_COUNT][2];
282    int cutoff_size[PAMI_XFER_COUNT][2];
283    /* Our best allreduce double protocol only works on
284     * doubles and sum/min/max. Since that is a common
285     * occurance let's cache that protocol and call
286     * it without checking */
287    pami_algorithm_t cached_allred_dsmm; /*dsmm = double, sum/min/max */
288    pami_metadata_t cached_allred_dsmm_md;
289    int query_allred_dsmm;
290 
291    /* We have some integer optimized protocols that only work on
292     * sum/min/max but also have datasize/ppn <= 8k limitations */
293    /* Using Amith's protocol, these work on int/min/max/sum of SMALL messages */
294    pami_algorithm_t cached_allred_ismm;
295    pami_metadata_t cached_allred_ismm_md;
296    /* Because this only works at select message sizes, this will have to be
297     * nonzero */
298    int query_allred_ismm;
299 
300   union tasks_descrip_t {
301     /* For create_taskrange */
302     pami_geometry_range_t *ranges;
303     /* For create_tasklist/endpoints if we ever use it */
304     pami_task_t *tasks;
305     pami_endpoint_t *endpoints;
306   } tasks_descriptor;
307 };
308 
309 
310 typedef struct
311 {
312   pami_work_t state;
313   pami_xfer_t *coll_struct;
314 } MPIDI_Post_coll_t;
315 
316 
/*
 * Forward declarations of the MPID_Comm, MPID_Win and MPID_Group
 * structures, so they can be referred to through pointers before their
 * full definitions are visible.
 */
struct MPID_Comm;
struct MPID_Win;
struct MPID_Group;
323 
324 
/**
 * \brief Singly-linked node holding a rank and a type.
 *
 * NOTE(review): presumably one queued window lock request, with 'type'
 * being the MPI lock type - confirm against the lock code.
 */
struct MPIDI_Win_lock
{
  struct MPIDI_Win_lock *next;  /**< Next node, or NULL-terminated by convention - TODO confirm */
  unsigned               rank;
  int                    type;
};
/**
 * \brief Head and tail pointers for a list of MPIDI_Win_lock nodes.
 */
struct MPIDI_Win_queue
{
  struct MPIDI_Win_lock *head;  /**< First node of the list */
  struct MPIDI_Win_lock *tail;  /**< Last node of the list  */
};
336 /**
337  * \brief Collective information related to a window
338  *
339  * This structure is used to share information about a local window with
340  * all nodes in the window communicator. Part of that information includes
341  * statistics about RMA operations during access/exposure epochs.
342  *
343  * The structure is allocated as an array sized for the window communicator.
344  * Each entry in the array corresponds directly to the node of the same rank.
345  */
346 struct MPIDI_Win_info
347 {
348   void             * base_addr;     /**< Node's exposure window base address                  */
349   struct MPID_Win  * win;
350   uint32_t           disp_unit;     /**< Node's exposure window displacement units            */
351   pami_memregion_t   memregion;     /**< Memory region descriptor for each node               */
352 #ifdef RDMA_FAILOVER
353   uint32_t           memregion_used;
354 #endif
355 };
356 /**
357  * \brief Structure of PAMI extensions to MPID_Win structure
358  */
359 struct MPIDI_Win
360 {
361   struct MPIDI_Win_info * info;    /**< allocated array of collective info             */
362   struct MPIDI_Win_sync
363   {
364 #if 0
365     /** \todo optimize some of the synchronization assertion */
366     uint32_t assert; /**< MPI_MODE_* bits asserted at epoch start              */
367 #endif
368 
369     /* These fields are reset by the sync functions */
370     uint32_t          total;    /**< The number of PAMI requests that we know about (updated only by calling thread) */
371     volatile uint32_t started;  /**< The number of PAMI requests made (updated only in the context_post callback) */
372     volatile uint32_t complete; /**< The number of completed PAMI requests (only updated by the done callbacks) */
373 
374     struct MPIDI_Win_sync_pscw
375     {
376       struct MPID_Group * group;
377       volatile unsigned   count;
378     } sc, pw;
379     struct MPIDI_Win_sync_lock
380     {
381       struct
382       {
383         volatile unsigned locked;
384       } remote;
385       struct
386       {
387         struct MPIDI_Win_queue requested;
388         int                    type;
389         unsigned               count;
390       } local;
391     } lock;
392   } sync;
393 };
394 
395 
396 #endif
397