1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
2 /*
3 * Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
4 * University Research and Technology
5 * Corporation. All rights reserved.
6 * Copyright (c) 2004-2011 The University of Tennessee and The University
7 * of Tennessee Research Foundation. All rights
8 * reserved.
9 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
10 * University of Stuttgart. All rights reserved.
11 * Copyright (c) 2004-2005 The Regents of the University of California.
12 * All rights reserved.
13 * Copyright (c) 2006 Sandia National Laboratories. All rights
14 * reserved.
15 * Copyright (c) 2011-2016 Cisco Systems, Inc. All rights reserved.
16 * Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
17 * reserved.
18 * $COPYRIGHT$
19 *
20 * Additional copyrights may follow
21 *
22 * $HEADER$
23 */
24 /**
25 * @file
26 */
27 #ifndef OPAL_BTL_USNIC_H
28 #define OPAL_BTL_USNIC_H
29
30 #include "opal_config.h"
31 #include <sys/types.h>
32
33 #include "opal_stdint.h"
34 #include "opal/util/alfg.h"
35 #include "opal/class/opal_hash_table.h"
36 #include "opal/class/opal_hash_table.h"
37 #include "opal/mca/event/event.h"
38
39 #if BTL_IN_OPAL
40 #include "opal/mca/btl/btl.h"
41 #include "opal/mca/btl/base/btl_base_error.h"
42 #include "opal/mca/btl/base/base.h"
43 #include "opal/mca/rcache/rcache.h"
44
45 #include "btl_usnic_compat.h"
46
47 #if RCACHE_VERSION < 30
48 #include "opal/mca/mpool/grdma/mpool_grdma.h"
49 #endif
50 #else
51 #include "ompi/mca/btl/btl.h"
52 #include "ompi/mca/btl/base/btl_base_error.h"
53 #include "ompi/mca/btl/base/base.h"
54 #include "ompi/mca/mpool/grdma/mpool_grdma.h"
55 #endif
56
57 BEGIN_C_DECLS
58
59 /*
60 * We're simulating a clock as best we can without resorting to the
61 * system. The clock is used to defer ACKs, and ticks will be incremented
62 * when progression gets called. It could be incremented by different amounts
63 * at other times as needed or as tuning dictates.
64 */
65 extern uint64_t opal_btl_usnic_ticks;
66
67 /* Lock for MPU_THREAD_MULTIPLE support */
68 extern opal_recursive_mutex_t btl_usnic_lock;
69
70 static inline uint64_t
get_ticks(void)71 get_ticks(void)
72 {
73 return opal_btl_usnic_ticks;
74 }
75
76 /* RNG buffer declaration */
77 extern opal_rng_buff_t opal_btl_usnic_rand_buff;
78
/*
 * container_of(ptr, type, member): given a pointer "ptr" to the field
 * "member" embedded in a structure of type "type", yield a pointer to
 * the enclosing structure.  The result is computed by stepping back
 * offsetof(type, member) bytes from ptr, so ptr must really point at
 * that member of a "type" object.  Only defined here if nothing else
 * has already provided a container_of macro.
 */
#ifndef container_of
#define container_of(ptr, type, member) ( \
        (type *)( ((char *)(ptr)) - offsetof(type,member) ))
#endif
83
/*
 * max(a, b): the larger of the two arguments (b when they compare
 * equal).  This is a plain macro, so the selected argument is
 * evaluated twice -- do not pass expressions with side effects.
 * Only defined here if no max macro already exists.
 */
#ifndef max
#define max(a, b) (((a) > (b)) ? (a) : (b))
#endif
87
/* MSGDEBUG1 prints more info about arguments and internal functions */
#define MSGDEBUG1 0
/* MSGDEBUG2 prints 1 line at each BTL entry point; it is also turned
   on whenever MSGDEBUG1 is on */
#define MSGDEBUG2 (MSGDEBUG1||0)

/* output macros to declutter source: each one forwards its arguments
   to opal_output(0, ...) when its debug level is enabled, and expands
   to an empty statement (arguments not evaluated) otherwise */
#if MSGDEBUG1
#define MSGDEBUG1_OUT(...) opal_output(0, __VA_ARGS__)
#else
#define MSGDEBUG1_OUT(...) do {} while (0)
#endif
#if MSGDEBUG2
#define MSGDEBUG2_OUT(...) opal_output(0, __VA_ARGS__)
#else
#define MSGDEBUG2_OUT(...) do {} while (0)
#endif
104
/* Set to >0 to randomly drop received frags.  The higher the number,
   the more frequent the drops. */
#define WANT_RECV_DROPS 0
/* Set to >0 to randomly fail to send an ACK, mimicking a lost ACK.
   The higher the number, the more frequent the failed-to-send-ACK. */
#define WANT_FAIL_TO_SEND_ACK 0
/* Set to >0 to randomly fail to resend a frag (causing it to be
   requeued to be sent later).  The higher the number, the more
   frequent the failed-to-resend-frag. */
#define WANT_FAIL_TO_RESEND_FRAG 0

/* Each FAKE_* macro below is the constant 0 when the corresponding
   WANT_* knob is 0; otherwise it draws a value from
   opal_btl_usnic_rand_buff via opal_rand() and compares it against
   the knob. */
#if WANT_RECV_DROPS > 0
#define FAKE_RECV_DROP (opal_rand(&opal_btl_usnic_rand_buff) < WANT_RECV_DROPS)
#else
#define FAKE_RECV_DROP 0
#endif

#if WANT_FAIL_TO_SEND_ACK > 0
#define FAKE_FAIL_TO_SEND_ACK (opal_rand(&opal_btl_usnic_rand_buff) < WANT_FAIL_TO_SEND_ACK)
#else
#define FAKE_FAIL_TO_SEND_ACK 0
#endif

#if WANT_FAIL_TO_RESEND_FRAG > 0
#define FAKE_FAIL_TO_RESEND_FRAG (opal_rand(&opal_btl_usnic_rand_buff) < WANT_FAIL_TO_RESEND_FRAG)
#else
#define FAKE_FAIL_TO_RESEND_FRAG 0
#endif
133
134
135 /**
136 * usnic BTL component
137 */
138 typedef struct opal_btl_usnic_component_t {
139 /** base BTL component */
140 mca_btl_base_component_2_0_0_t super;
141
142 /* in the v1.6 series, sizeof(super) is 256, leading to good alignment for
143 * subsequent fastpath fields */
144
145 /** Maximum number of BTL modules */
146 int max_modules;
147 /** Number of available/initialized BTL modules */
148 int num_modules;
149
150 /* Cached hashed version of my RTE proc name (to stuff in
151 protocol headers) */
152 uint64_t my_hashed_rte_name;
153
154 /** array of possible BTLs (>= num_modules elements) */
155 struct opal_btl_usnic_module_t* usnic_all_modules;
156 /** array of pointers to active BTLs (num_modules elements) */
157 struct opal_btl_usnic_module_t** usnic_active_modules;
158
159 /** convertor packing threshold */
160 int pack_lazy_threshold;
161
162 /* vvvvvvvvvv non-fastpath fields go below vvvvvvvvvv */
163
164 /** list of usnic proc structures */
165 opal_list_t usnic_procs;
166
167 #if RCACHE_VERSION == 30
168 /** memory pool hints */
169 char* usnic_mpool_hints;
170
171 /** registration cache name */
172 char *usnic_rcache_name;
173 #else
174 /** name of memory pool */
175 char* usnic_mpool_name;
176 #endif
177
178 char *if_include;
179 char *if_exclude;
180
181 /** Want stats? */
182 bool stats_enabled;
183 bool stats_relative;
184 int stats_frequency;
185
186 /** Whether we want to use NUMA distances to choose which usNIC
187 devices to use for short messages */
188 bool want_numa_device_assignment;
189
190 /** max send descriptors to post per module */
191 int32_t sd_num;
192
193 /** max receive descriptors per module */
194 int32_t rd_num;
195
196 /** max send/receive desriptors for priority channel */
197 int32_t prio_sd_num;
198 int32_t prio_rd_num;
199
200 /** max completion queue entries per module */
201 int32_t cq_num;
202
203 /** max number of entries in AV EQ */
204 int32_t av_eq_num;
205
206 /** retrans characteristics */
207 int retrans_timeout;
208
209 /** max number of messages re-sent during a single progress
210 iteration */
211 int max_resends_per_iteration;
212
213 /** minimum number of times through component progress before
214 checking to see if standalone ACKs need to be sent */
215 int ack_iteration_delay;
216
217 /** transport header length for all usNIC devices on this server
218 (it is guaranteed that all usNIC devices on a single server
219 will have the same underlying transport, and therefore the
220 same transport header length) */
221 int transport_header_len;
222 uint32_t transport_protocol;
223
224 /* what UDP port do we want to use? If 0, the system will pick.
225 If nonzero, it is used as the base -- the final number will be
226 (base+my_local_rank). */
227 int udp_port_base;
228
229 /** disable the "cannot find route" warnings (for network setups
230 where this is known/acceptable) */
231 bool show_route_failures;
232
233 /** connectivity verification: ACK timeout, number of retries
234 before issue an error/abort the job */
235 bool connectivity_enabled;
236 int connectivity_ack_timeout;
237 int connectivity_num_retries;
238
239 /** how many short packets have to be received before outputting
240 the "received short packets" warning? */
241 uint32_t max_short_packets;
242
243 /* Prefix for the connectivity map filename (map will be output if
244 the prefix is non-NULL) */
245 char *connectivity_map_prefix;
246
247 /** Offset into the send buffer where the payload will go. For
248 libfabric v1.0.0 / API v1.0, this is 0. For libfabric >=v1.1
249 / API >=v1.1, this is the endpoint.msg_prefix_size (i.e.,
250 component.transport_header_len). */
251 uint32_t prefix_send_offset;
252
253 /* OPAL async progress event base */
254 opal_event_base_t *opal_evbase;
255 } opal_btl_usnic_component_t;
256
257 OPAL_MODULE_DECLSPEC extern opal_btl_usnic_component_t mca_btl_usnic_component;
258
259 typedef mca_btl_base_recv_reg_t opal_btl_usnic_recv_reg_t;
260
/**
 * Size for sequence numbers (just to ensure we use the same size
 * everywhere)
 */
typedef uint16_t opal_btl_usnic_seq_t;
/* printf conversion specifier matching opal_btl_usnic_seq_t */
#define UDSEQ PRIu16

/* sequence number comparison macros that allow for rollover.
 * The difference is reduced to a signed 16-bit value, so this relies
 * on the fact that the sequence numbers being compared are relatively
 * close together as compared to (1<<15).
 */
#define SEQ_DIFF(A,B) ((int16_t)((A)-(B)))
#define SEQ_LT(A,B) (SEQ_DIFF(A,B) < 0)
#define SEQ_LE(A,B) (SEQ_DIFF(A,B) <= 0)
#define SEQ_GT(A,B) (SEQ_DIFF(A,B) > 0)
#define SEQ_GE(A,B) (SEQ_DIFF(A,B) >= 0)
277
/**
 * Register the usnic BTL MCA params
 *
 * @return int status (presumably an OPAL status code -- confirm
 *         against the implementation)
 */
int opal_btl_usnic_component_register(void);

/**
 * Routine which can be called from a debugger to print module, endpoint,
 * fragment, and segment state to standard output.
 */
void opal_btl_usnic_component_debug(void);

/**
 * Called to output the connectivity map
 */
void opal_btl_usnic_connectivity_map(void);
292
293 END_C_DECLS
294 #endif
295