1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
2 /*
3  * Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
4  *                         University Research and Technology
5  *                         Corporation.  All rights reserved.
6  * Copyright (c) 2004-2011 The University of Tennessee and The University
7  *                         of Tennessee Research Foundation.  All rights
8  *                         reserved.
9  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
10  *                         University of Stuttgart.  All rights reserved.
11  * Copyright (c) 2004-2005 The Regents of the University of California.
12  *                         All rights reserved.
13  * Copyright (c) 2006      Sandia National Laboratories. All rights
14  *                         reserved.
15  * Copyright (c) 2011-2016 Cisco Systems, Inc.  All rights reserved.
16  * Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
17  *                         reserved.
18  * $COPYRIGHT$
19  *
20  * Additional copyrights may follow
21  *
22  * $HEADER$
23  */
24 /**
25  * @file
26  */
27 #ifndef OPAL_BTL_USNIC_H
28 #define OPAL_BTL_USNIC_H
29 
30 #include "opal_config.h"
31 #include <sys/types.h>
32 
33 #include "opal_stdint.h"
34 #include "opal/util/alfg.h"
35 #include "opal/class/opal_hash_table.h"
36 #include "opal/class/opal_hash_table.h"
37 #include "opal/mca/event/event.h"
38 
39 #if BTL_IN_OPAL
40 #include "opal/mca/btl/btl.h"
41 #include "opal/mca/btl/base/btl_base_error.h"
42 #include "opal/mca/btl/base/base.h"
43 #include "opal/mca/rcache/rcache.h"
44 
45 #include "btl_usnic_compat.h"
46 
47 #if RCACHE_VERSION < 30
48 #include "opal/mca/mpool/grdma/mpool_grdma.h"
49 #endif
50 #else
51 #include "ompi/mca/btl/btl.h"
52 #include "ompi/mca/btl/base/btl_base_error.h"
53 #include "ompi/mca/btl/base/base.h"
54 #include "ompi/mca/mpool/grdma/mpool_grdma.h"
55 #endif
56 
57 BEGIN_C_DECLS
58 
59 /*
60  * We're simulating a clock as best we can without resorting to the
61  * system.  The clock is used to defer ACKs, and ticks will be incremented
62  * when progression gets called.  It could be incremented by different amounts
63  * at other times as needed or as tuning dictates.
64  */
65 extern uint64_t opal_btl_usnic_ticks;
66 
/* Lock for MPI_THREAD_MULTIPLE support */
68 extern opal_recursive_mutex_t btl_usnic_lock;
69 
70 static inline uint64_t
get_ticks(void)71 get_ticks(void)
72 {
73     return opal_btl_usnic_ticks;
74 }
75 
76 /* RNG buffer declaration */
77 extern opal_rng_buff_t opal_btl_usnic_rand_buff;
78 
#ifndef container_of
/* Given a pointer to `member` inside an object of type `type`, step
 * back by the member's byte offset to obtain a pointer to the
 * enclosing object.  Only defined here if nobody else provided it. */
#define container_of(ptr, type, member) \
    ((type *) ((char *) (ptr) - offsetof(type, member)))
#endif
83 
#ifndef max
/* Larger of the two arguments (b when they compare equal).  Note that
 * the selected argument is evaluated twice. */
#define max(a, b) ((b) < (a) ? (a) : (b))
#endif
87 
/*
 * Compile-time debug output switches:
 *
 *   MSGDEBUG1  prints more info about arguments and internal functions
 *   MSGDEBUG2  prints 1 line at each BTL entry point; it is also turned
 *              on whenever MSGDEBUG1 is
 */
#define MSGDEBUG1 0
#define MSGDEBUG2 (MSGDEBUG1 || 0)

/* Output macros to declutter source: each one forwards to
 * opal_output() when its switch is on, and is an empty statement
 * otherwise. */
#if !MSGDEBUG1
#define MSGDEBUG1_OUT(...) do {} while (0)
#else
#define MSGDEBUG1_OUT(...) opal_output(0, __VA_ARGS__)
#endif

#if !MSGDEBUG2
#define MSGDEBUG2_OUT(...) do {} while (0)
#else
#define MSGDEBUG2_OUT(...) opal_output(0, __VA_ARGS__)
#endif
104 
/*
 * Fault-injection knobs.  Each is off at 0; setting one to >0 makes the
 * corresponding failure happen randomly, and the higher the number,
 * the more frequent the failure:
 *
 *   WANT_RECV_DROPS           randomly drop received frags
 *   WANT_FAIL_TO_SEND_ACK     randomly fail to send an ACK, mimicking a
 *                             lost ACK
 *   WANT_FAIL_TO_RESEND_FRAG  randomly fail to resend a frag (causing it
 *                             to be requeued to be sent later)
 */
#define WANT_RECV_DROPS 0
#define WANT_FAIL_TO_SEND_ACK 0
#define WANT_FAIL_TO_RESEND_FRAG 0

/* FAKE_* is a constant 0 while its knob is off; otherwise it compares a
 * fresh opal_rand() value against the knob. */
#if WANT_RECV_DROPS <= 0
#define FAKE_RECV_DROP 0
#else
#define FAKE_RECV_DROP (opal_rand(&opal_btl_usnic_rand_buff) < WANT_RECV_DROPS)
#endif

#if WANT_FAIL_TO_SEND_ACK <= 0
#define FAKE_FAIL_TO_SEND_ACK 0
#else
#define FAKE_FAIL_TO_SEND_ACK (opal_rand(&opal_btl_usnic_rand_buff) < WANT_FAIL_TO_SEND_ACK)
#endif

#if WANT_FAIL_TO_RESEND_FRAG <= 0
#define FAKE_FAIL_TO_RESEND_FRAG 0
#else
#define FAKE_FAIL_TO_RESEND_FRAG (opal_rand(&opal_btl_usnic_rand_buff) < WANT_FAIL_TO_RESEND_FRAG)
#endif

133 
134 
135 /**
136  * usnic BTL component
137  */
138 typedef struct opal_btl_usnic_component_t {
139     /** base BTL component */
140     mca_btl_base_component_2_0_0_t super;
141 
142     /* in the v1.6 series, sizeof(super) is 256, leading to good alignment for
143      * subsequent fastpath fields */
144 
145     /** Maximum number of BTL modules */
146     int max_modules;
147     /** Number of available/initialized BTL modules */
148     int num_modules;
149 
150     /* Cached hashed version of my RTE proc name (to stuff in
151        protocol headers) */
152     uint64_t my_hashed_rte_name;
153 
154     /** array of possible BTLs (>= num_modules elements) */
155     struct opal_btl_usnic_module_t* usnic_all_modules;
156     /** array of pointers to active BTLs (num_modules elements) */
157     struct opal_btl_usnic_module_t** usnic_active_modules;
158 
159     /** convertor packing threshold */
160     int pack_lazy_threshold;
161 
162     /* vvvvvvvvvv non-fastpath fields go below vvvvvvvvvv */
163 
164     /** list of usnic proc structures */
165     opal_list_t usnic_procs;
166 
167 #if RCACHE_VERSION == 30
168     /** memory pool hints */
169     char* usnic_mpool_hints;
170 
171     /** registration cache name */
172     char *usnic_rcache_name;
173 #else
174     /** name of memory pool */
175     char* usnic_mpool_name;
176 #endif
177 
178     char *if_include;
179     char *if_exclude;
180 
181     /** Want stats? */
182     bool stats_enabled;
183     bool stats_relative;
184     int stats_frequency;
185 
186     /** Whether we want to use NUMA distances to choose which usNIC
187         devices to use for short messages */
188     bool want_numa_device_assignment;
189 
190     /** max send descriptors to post per module */
191     int32_t sd_num;
192 
193     /** max receive descriptors per module */
194     int32_t rd_num;
195 
196     /** max send/receive desriptors for priority channel */
197     int32_t prio_sd_num;
198     int32_t prio_rd_num;
199 
200     /** max completion queue entries per module */
201     int32_t cq_num;
202 
203     /** max number of entries in AV EQ */
204     int32_t av_eq_num;
205 
206     /** retrans characteristics */
207     int retrans_timeout;
208 
209     /** max number of messages re-sent during a single progress
210         iteration */
211     int max_resends_per_iteration;
212 
213     /** minimum number of times through component progress before
214         checking to see if standalone ACKs need to be sent */
215     int ack_iteration_delay;
216 
217     /** transport header length for all usNIC devices on this server
218         (it is guaranteed that all usNIC devices on a single server
219         will have the same underlying transport, and therefore the
220         same transport header length) */
221     int transport_header_len;
222     uint32_t transport_protocol;
223 
224     /* what UDP port do we want to use?  If 0, the system will pick.
225        If nonzero, it is used as the base -- the final number will be
226        (base+my_local_rank). */
227     int udp_port_base;
228 
229     /** disable the "cannot find route" warnings (for network setups
230         where this is known/acceptable) */
231     bool show_route_failures;
232 
233     /** connectivity verification: ACK timeout, number of retries
234         before issue an error/abort the job */
235     bool connectivity_enabled;
236     int connectivity_ack_timeout;
237     int connectivity_num_retries;
238 
239     /** how many short packets have to be received before outputting
240         the "received short packets" warning? */
241     uint32_t max_short_packets;
242 
243     /* Prefix for the connectivity map filename (map will be output if
244        the prefix is non-NULL) */
245     char *connectivity_map_prefix;
246 
247     /** Offset into the send buffer where the payload will go.  For
248         libfabric v1.0.0 / API v1.0, this is 0.  For libfabric >=v1.1
249         / API >=v1.1, this is the endpoint.msg_prefix_size (i.e.,
250         component.transport_header_len). */
251     uint32_t prefix_send_offset;
252 
253     /* OPAL async progress event base */
254     opal_event_base_t *opal_evbase;
255 } opal_btl_usnic_component_t;
256 
257 OPAL_MODULE_DECLSPEC extern opal_btl_usnic_component_t mca_btl_usnic_component;
258 
259 typedef mca_btl_base_recv_reg_t opal_btl_usnic_recv_reg_t;
260 
261 /**
262  * Size for sequence numbers (just to ensure we use the same size
263  * everywhere)
264  */
265 typedef uint16_t opal_btl_usnic_seq_t;
266 #define UDSEQ PRIu16
267 
/* sequence number comparison macros that allow for rollover.
 * Relies on the fact that sequence numbers should be relatively close
 * together as compared to (1<<15), i.e. half the range of the 16-bit
 * sequence type, because SEQ_DIFF interprets the difference as an
 * int16_t.
 */
/* Difference A-B narrowed to a signed 16-bit value; its sign is what
 * the ordering predicates below test. */
#define SEQ_DIFF(A, B) ((int16_t) ((A) - (B)))
#define SEQ_LT(A, B) (0 > SEQ_DIFF(A, B))
#define SEQ_LE(A, B) (0 >= SEQ_DIFF(A, B))
#define SEQ_GT(A, B) (0 < SEQ_DIFF(A, B))
#define SEQ_GE(A, B) (0 <= SEQ_DIFF(A, B))
277 
/**
 * Register the usnic BTL MCA params
 *
 * @return an int status; the specific values are not visible in this
 *         header (presumably OPAL_SUCCESS or an OPAL error code --
 *         TODO confirm against the implementation)
 */
int opal_btl_usnic_component_register(void);

/**
 * Routine which can be called from a debugger to print module, endpoint,
 * fragment, and segment state to standard output.
 *
 * Takes no arguments and returns nothing.
 */
void opal_btl_usnic_component_debug(void);

/**
 * Called to output the connectivity map
 *
 * Takes no arguments and returns nothing.
 * NOTE(review): presumably the output location is derived from the
 * component's connectivity_map_prefix field -- confirm against the
 * implementation.
 */
void opal_btl_usnic_connectivity_map(void);
292 
293 END_C_DECLS
294 #endif
295