/*
 * Copyright (c) 2013-2018 Intel, Inc. All rights reserved
 *
 * Copyright (c) 2015 Cisco Systems, Inc.  All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
11
12 #ifndef MTL_OFI_TYPES_H_HAS_BEEN_INCLUDED
13 #define MTL_OFI_TYPES_H_HAS_BEEN_INCLUDED
14
15 #include "mtl_ofi.h"
16
17 BEGIN_C_DECLS
18
19 /**
20 * MTL Module Interface
21 */
22
23 typedef struct mca_mtl_ofi_context_t {
24 /* Transmit and receive contexts */
25 struct fid_ep *tx_ep;
26 struct fid_ep *rx_ep;
27
28 /* Completion queue */
29 struct fid_cq *cq;
30
31 /* Thread locking */
32 opal_mutex_t context_lock;
33 } mca_mtl_ofi_context_t;
34
35 typedef struct mca_mtl_ofi_module_t {
36 mca_mtl_base_module_t base;
37
38 /** Fabric Domain handle */
39 struct fid_fabric *fabric;
40
41 /** Access Domain handle */
42 struct fid_domain *domain;
43
44 /** Address vector handle */
45 struct fid_av *av;
46
47 /* Multi-threaded Application flag */
48 bool mpi_thread_multiple;
49
50 /* Scalable Endpoint attributes */
51 struct fid_ep *sep; /* Endpoint object */
52 mca_mtl_ofi_context_t *ofi_ctxt; /* OFI contexts */
53 int threshold_comm_context_id; /* Set threshold communicator ID */
54 int *comm_to_context; /* Map communicator ID to context */
55 int rx_ctx_bits; /* Bits used for RX context */
56 int total_ctxts_used; /* Total number of contexts used */
57 int enable_sep; /* MCA to enable/disable SEP feature */
58 int thread_grouping; /* MCA for thread grouping feature */
59 int num_ofi_contexts; /* MCA for number of contexts to use */
60
61 /** Endpoint name length */
62 size_t epnamelen;
63
64 /** "Any source" address */
65 fi_addr_t any_addr;
66
67 /** OFI provider name */
68 char *provider_name;
69
70 /** Maximum inject size */
71 size_t max_inject_size;
72
73 /** Largest message that can be sent in a single send. */
74 size_t max_msg_size;
75
76 /** Maximum number of CQ events to read in OFI Progress */
77 int ofi_progress_event_count;
78
79 /** Use FI_REMOTE_CQ_DATA*/
80 bool fi_cq_data;
81
82 /** Info used to create the OFI tag **/
83 unsigned long long source_rank_tag_mask;
84 int num_bits_source_rank;
85 unsigned long long source_rank_mask;
86 unsigned long long mpi_tag_mask;
87 int num_bits_mpi_tag;
88 int num_peers;
89
90 /** Synchronous protocol tag bits */
91 unsigned long long sync_send;
92 unsigned long long sync_send_ack;
93 unsigned long long sync_proto_mask;
94
95 /** Optimized function Symbol Tables **/
96 struct ompi_mtl_ofi_symtable sym_table;
97
98 } mca_mtl_ofi_module_t;
99
100 extern mca_mtl_ofi_module_t ompi_mtl_ofi;
101
102 typedef struct mca_mtl_ofi_component_t {
103 /** Base MTL component */
104 mca_mtl_base_component_2_0_0_t super;
105 } mca_mtl_ofi_component_t;
106
/** Kind of OFI endpoint: regular or scalable. */
typedef enum {
    OFI_REGULAR_EP = 0,
    OFI_SCALABLE_EP = 1,
} mca_mtl_ofi_ep_type;
111
/*
 * Upper limit for the number of events read from a CQ.
 * Set to 100, which was deemed optimal from empirical data.
 * To read fewer events from the CQ, use the MCA variable.
 */
#define MTL_OFI_MAX_PROG_EVENT_COUNT    100
119
/* OFI TAG:
 * Define 3 different OFI tag distributions:
 * 1) Support FI_REMOTE_CQ_DATA: No need for source rank in the tag
 * 2) ofi_tag_1: fallback when no FI_REMOTE_CQ_DATA is supported
 * 3) ofi_tag_2: Alternative tag when no FI_REMOTE_CQ_DATA is supported
 *               with more bits for the communicator ID.
 * More details of the tags are in the README file (mtl_ofi_tag_mode).
 */

#define MTL_OFI_MINIMUM_CID_BITS        (8)
130
/* Support FI_REMOTE_CQ_DATA, send the source rank in the CQ data (4 Bytes is the minimum)
 *
 *   01234567 01234567 01234567 012345 | 67 | 01234567 01234567 01234567 01234567
 *               context_id            |prot|            message tag
 *
 * 30 bits of context id, 2 protocol bits, 32 bits of message tag.
 */
#define MTL_OFI_PROTO_BIT_COUNT         (2)
#define MTL_OFI_HIGHEST_TAG_BIT         (0x8000000000000000ULL)

#define MTL_OFI_CID_MASK_DATA           (0xFFFFFFFC00000000ULL)
#define MTL_OFI_CID_BIT_COUNT_DATA      (30)
#define MTL_OFI_TAG_MASK_DATA           (0x00000000FFFFFFFFULL)
#define MTL_OFI_TAG_BIT_COUNT_DATA      (32)
#define MTL_OFI_PROTO_MASK_DATA         (0x0000000300000000ULL)
#define MTL_OFI_SYNC_SEND_DATA          (0x0000000100000000ULL)
#define MTL_OFI_SYNC_SEND_ACK_DATA      (0x0000000200000000ULL)
146
147 /* Send tag with CQ_DATA */
148 __opal_attribute_always_inline__ static inline uint64_t
mtl_ofi_create_send_tag_CQD(int comm_id,int tag)149 mtl_ofi_create_send_tag_CQD(int comm_id, int tag)
150 {
151 uint64_t match_bits = comm_id;
152 match_bits = (match_bits << (MTL_OFI_TAG_BIT_COUNT_DATA
153 + MTL_OFI_PROTO_BIT_COUNT));
154 match_bits |= (tag & MTL_OFI_TAG_MASK_DATA);
155 return match_bits;
156 }
157
158 /* Receive tag with CQ_DATA */
159 __opal_attribute_always_inline__ static inline void
mtl_ofi_create_recv_tag_CQD(uint64_t * match_bits,uint64_t * mask_bits,int comm_id,int tag)160 mtl_ofi_create_recv_tag_CQD(uint64_t *match_bits, uint64_t *mask_bits,
161 int comm_id, int tag)
162 {
163 *mask_bits = ompi_mtl_ofi.sync_proto_mask;
164 *match_bits = (uint64_t) comm_id;
165 *match_bits = (*match_bits << (MTL_OFI_PROTO_BIT_COUNT
166 + MTL_OFI_TAG_BIT_COUNT_DATA));
167 if (MPI_ANY_TAG == tag) {
168 /* Special negative tags are used for collective operations.
169 * MPI_ANY_TAG should not match these special tags.
170 * See ompi/mca/coll/base/coll_tags.h
171 */
172 *mask_bits |= (ompi_mtl_ofi.mpi_tag_mask>>1);
173 } else {
174 *match_bits |= (ompi_mtl_ofi.mpi_tag_mask & tag);
175 }
176 }
177
/*
 * ofi_tag_1: fallback when no FI_REMOTE_CQ_DATA is supported
 *
 *   01234567 0123 | 4567 01234567 012345 | 67 | 01234567 01234567 01234567 01234567
 *      Comm id    |        source        |prot|            message tag
 *
 * 12 bits of communicator id, 18 bits of source rank, 2 protocol bits,
 * 32 bits of message tag.
 */

#define MTL_OFI_CID_BIT_COUNT_1         (12)
#define MTL_OFI_SOURCE_TAG_MASK_1       (0x000FFFFC00000000ULL)
#define MTL_OFI_SOURCE_BIT_COUNT_1      (18)
#define MTL_OFI_SOURCE_MASK_1           (0x000000000003FFFFULL)
#define MTL_OFI_TAG_MASK_1              (0x00000000FFFFFFFFULL)
#define MTL_OFI_TAG_BIT_COUNT_1         (32)
#define MTL_OFI_PROTO_MASK_1            (0x0000000300000000ULL)
#define MTL_OFI_SYNC_SEND_1             (0x0000000100000000ULL)
#define MTL_OFI_SYNC_SEND_ACK_1         (0x0000000200000000ULL)
195
/*
 * ofi_tag_2: Alternative tag when no FI_REMOTE_CQ_DATA is supported
 *
 *   01234567 01234567 01234567 | 01234567 01234567 01 | 23 | 4567 01234567 01234567
 *             Comm id          |        source        |prot|      message tag
 *
 * 24 bits of communicator id, 18 bits of source rank, 2 protocol bits,
 * 20 bits of message tag.
 */

#define MTL_OFI_CID_BIT_COUNT_2         (24)
#define MTL_OFI_SOURCE_TAG_MASK_2       (0x000000FFFFC00000ULL)
#define MTL_OFI_SOURCE_BIT_COUNT_2      (18)
#define MTL_OFI_SOURCE_MASK_2           (0x000000000003FFFFULL)
#define MTL_OFI_TAG_MASK_2              (0x00000000000FFFFFULL)
#define MTL_OFI_TAG_BIT_COUNT_2         (20)
#define MTL_OFI_PROTO_MASK_2            (0x0000000000300000ULL)
#define MTL_OFI_SYNC_SEND_2             (0x0000000000100000ULL)
#define MTL_OFI_SYNC_SEND_ACK_2         (0x0000000000200000ULL)
213
214 /* Send tag */
215 __opal_attribute_always_inline__ static inline uint64_t
mtl_ofi_create_send_tag(int comm_id,int source,int tag)216 mtl_ofi_create_send_tag(int comm_id, int source, int tag)
217 {
218 uint64_t match_bits = comm_id;
219 match_bits = (match_bits << ompi_mtl_ofi.num_bits_source_rank);
220 match_bits |= (uint64_t)(source & ompi_mtl_ofi.source_rank_mask);
221 match_bits = (match_bits << (ompi_mtl_ofi.num_bits_mpi_tag
222 + MTL_OFI_PROTO_BIT_COUNT));
223 match_bits |= (tag & ompi_mtl_ofi.mpi_tag_mask);
224 return match_bits;
225 }
226
227 /* Receive tag*/
228 __opal_attribute_always_inline__ static inline void
mtl_ofi_create_recv_tag(uint64_t * match_bits,uint64_t * mask_bits,int comm_id,int source,int tag)229 mtl_ofi_create_recv_tag(uint64_t *match_bits, uint64_t *mask_bits,
230 int comm_id, int source, int tag)
231 {
232 *mask_bits = ompi_mtl_ofi.sync_proto_mask;
233 *match_bits = comm_id;
234 *match_bits = (*match_bits << ompi_mtl_ofi.num_bits_source_rank);
235
236 if (MPI_ANY_SOURCE == source) {
237 *match_bits = (*match_bits << (ompi_mtl_ofi.num_bits_mpi_tag
238 + MTL_OFI_PROTO_BIT_COUNT));
239 *mask_bits |= ompi_mtl_ofi.source_rank_tag_mask;
240 } else {
241 *match_bits |= (uint64_t)(source & ompi_mtl_ofi.source_rank_mask);
242 *match_bits = (*match_bits << (ompi_mtl_ofi.num_bits_mpi_tag
243 + MTL_OFI_PROTO_BIT_COUNT));
244 }
245
246 if (MPI_ANY_TAG == tag) {
247 /* Special negative tags are used for collective operations.
248 * MPI_ANY_TAG should not match these special tags.
249 * See ompi/mca/coll/base/coll_tags.h
250 */
251 *mask_bits |= (ompi_mtl_ofi.mpi_tag_mask>>1);
252 } else {
253 *match_bits |= (ompi_mtl_ofi.mpi_tag_mask & tag);
254 }
255 }
256
/*
 * Helpers for the protocol bits and the MPI tag field of OFI match bits.
 * The masks are read from ompi_mtl_ofi at the point of use.
 *
 * Every macro argument is parenthesized so that a compound expression such
 * as "cid_bits | tag_bits" is treated as a single value; without that, the
 * "&" in the expansion would bind to only part of the argument.
 */

/* OR the synchronous-send bit into match_bits (must be an lvalue). */
#define MTL_OFI_SET_SYNC_SEND(match_bits)          \
    ((match_bits) |= ompi_mtl_ofi.sync_send)

/* True when the protocol bits of match_bits equal the sync-send value. */
#define MTL_OFI_IS_SYNC_SEND(match_bits)           \
    (ompi_mtl_ofi.sync_send == (ompi_mtl_ofi.sync_proto_mask & (match_bits)))

/* True when the protocol bits of match_bits equal the sync-send-ack value. */
#define MTL_OFI_IS_SYNC_SEND_ACK(match_bits)       \
    (ompi_mtl_ofi.sync_send_ack == (ompi_mtl_ofi.sync_proto_mask & (match_bits)))

/* Extract the MPI tag field of match_bits as an int. */
#define MTL_OFI_GET_TAG(match_bits)                \
    ((int)((match_bits) & ompi_mtl_ofi.mpi_tag_mask))
268
269 __opal_attribute_always_inline__ static inline int
mtl_ofi_get_source(struct fi_cq_tagged_entry * wc)270 mtl_ofi_get_source(struct fi_cq_tagged_entry *wc)
271 {
272 int src;
273 if (ompi_mtl_ofi.fi_cq_data) {
274 src = (int) wc->data;
275 }
276 else {
277 src = (int)((wc->tag >> (MTL_OFI_PROTO_BIT_COUNT +
278 ompi_mtl_ofi.num_bits_mpi_tag)) & ompi_mtl_ofi.source_rank_mask);
279 }
280
281 return src;
282 }
283 END_C_DECLS
284
285 #endif /* MTL_OFI_TYPES_H_HAS_BEEN_INCLUDED */
286