1 /**
2 * Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED.
3 *
4 * See file LICENSE for terms.
5 */
6
7 #ifndef UCT_IB_DEVICE_H
8 #define UCT_IB_DEVICE_H
9
10 #include "ib_verbs.h"
11
12 #include <uct/api/uct.h>
13 #include <uct/base/uct_iface.h>
14 #include <ucs/stats/stats.h>
15 #include <ucs/debug/assert.h>
16 #include <ucs/datastruct/khash.h>
17 #include <ucs/type/spinlock.h>
18 #include <ucs/sys/sock.h>
19 #include <ucs/sys/topo.h>
20
21 #include <endian.h>
22 #include <linux/ip.h>
23
24
/* IB wire-protocol constants */
#define UCT_IB_QPN_ORDER                  24  /* Number of bits in an IB QP number */
#define UCT_IB_LRH_LEN                    8   /* IB local routing header */
#define UCT_IB_GRH_LEN                    40  /* IB global routing header */
#define UCT_IB_BTH_LEN                    12  /* IB base transport header */
#define UCT_IB_ROCE_LEN                   14  /* Ethernet header:
                                                 6B destination MAC +
                                                 6B source MAC + 2B type (RoCE) */
#define UCT_IB_DETH_LEN                   8   /* IB datagram header */
#define UCT_IB_RETH_LEN                   16  /* IB RDMA header */
#define UCT_IB_ATOMIC_ETH_LEN             28  /* IB atomic header */
#define UCT_IB_AETH_LEN                   4   /* IB ack header */
#define UCT_IB_PAYLOAD_ALIGN              4   /* IB payload padding */
#define UCT_IB_ICRC_LEN                   4   /* IB invariant CRC footer */
#define UCT_IB_VCRC_LEN                   2   /* IB variant CRC footer */
#define UCT_IB_DELIM_LEN                  2   /* IB wire delimiter */
#define UCT_IB_FDR_PACKET_GAP             64  /* Minimal FDR packet gap */
#define UCT_IB_MAX_MESSAGE_SIZE           (2UL << 30) /* Maximal IB message size (2^31) */
#define UCT_IB_PKEY_PARTITION_MASK        0x7fff /* IB partition number mask */
#define UCT_IB_PKEY_MEMBERSHIP_MASK       0x8000 /* Full/send-only member */
#define UCT_IB_DEV_MAX_PORTS              2   /* Size of the cached port attributes
                                                 array in uct_ib_device_t */
#define UCT_IB_FABRIC_TIME_MAX            32
#define UCT_IB_INVALID_RKEY               0xffffffffu
#define UCT_IB_KEY                        0x1ee7a330

/* GID prefixes and mask, see IBTA 4.1.1 */
#define UCT_IB_LINK_LOCAL_PREFIX          be64toh(0xfe80000000000000ul) /* IBTA 4.1.1 12a */
#define UCT_IB_SITE_LOCAL_PREFIX          be64toh(0xfec0000000000000ul) /* IBTA 4.1.1 12b */
#define UCT_IB_SITE_LOCAL_MASK            be64toh(0xffffffffffff0000ul) /* IBTA 4.1.1 12b */

/* Default DSCP for RoCE v2.
 * NOTE(review): 106 does not fit in a 6-bit DSCP field; presumably this is the
 * complete traffic class byte - confirm against the code that uses it. */
#define UCT_IB_DEFAULT_ROCEV2_DSCP        106
#define UCT_IB_ROCE_UDP_SRC_PORT_BASE     0xC000
/*
 * printf-style format strings for sysfs paths of an IB device. Each format
 * builds on the previous prefix:
 *  - PFX:          %s
 *  - FMT:          %s, %s
 *  - GID_ATTR_PFX: %s, %d
 *  - GID_TYPE_FMT: %s, %d, %d
 *  - GID_NDEV_FMT: %s, %d, %d
 */
#define UCT_IB_DEVICE_SYSFS_PFX           "/sys/class/infiniband/%s"
#define UCT_IB_DEVICE_SYSFS_FMT           UCT_IB_DEVICE_SYSFS_PFX "/device/%s"
#define UCT_IB_DEVICE_SYSFS_GID_ATTR_PFX  UCT_IB_DEVICE_SYSFS_PFX "/ports/%d/gid_attrs"
#define UCT_IB_DEVICE_SYSFS_GID_TYPE_FMT  UCT_IB_DEVICE_SYSFS_GID_ATTR_PFX "/types/%d"
#define UCT_IB_DEVICE_SYSFS_GID_NDEV_FMT  UCT_IB_DEVICE_SYSFS_GID_ATTR_PFX "/ndevs/%d"
58
59
/**
 * IB device statistics identifiers.
 */
enum {
    /* NOTE(review): presumably counts handled async events - confirm */
    UCT_IB_DEVICE_STAT_ASYNC_EVENT,
    /* Number of identifiers defined above; must stay last */
    UCT_IB_DEVICE_STAT_LAST
};
64
65
/**
 * RoCE protocol versions. The enumerators take consecutive values starting
 * from 0, in the order listed.
 */
typedef enum uct_ib_roce_version {
    UCT_IB_DEVICE_ROCE_V1,
    UCT_IB_DEVICE_ROCE_V1_5,
    UCT_IB_DEVICE_ROCE_V2,
    UCT_IB_DEVICE_ROCE_ANY
} uct_ib_roce_version_t;
72
73
74 enum {
75 UCT_IB_DEVICE_FLAG_MLX4_PRM = UCS_BIT(1), /* Device supports mlx4 PRM */
76 UCT_IB_DEVICE_FLAG_MLX5_PRM = UCS_BIT(2), /* Device supports mlx5 PRM */
77 UCT_IB_DEVICE_FLAG_MELLANOX = UCS_BIT(3), /* Mellanox device */
78 UCT_IB_DEVICE_FLAG_LINK_IB = UCS_BIT(5), /* Require only IB */
79 UCT_IB_DEVICE_FLAG_DC_V1 = UCS_BIT(6), /* Device supports DC ver 1 */
80 UCT_IB_DEVICE_FLAG_DC_V2 = UCS_BIT(7), /* Device supports DC ver 2 */
81 UCT_IB_DEVICE_FLAG_AV = UCS_BIT(8), /* Device supports compact AV */
82 UCT_IB_DEVICE_FLAG_DC = UCT_IB_DEVICE_FLAG_DC_V1 |
83 UCT_IB_DEVICE_FLAG_DC_V2, /* Device supports DC */
84 UCT_IB_DEVICE_FLAG_ODP_IMPLICIT = UCS_BIT(9),
85 };
86
87
88 /**
89 * Flags which specify which address fields are present
90 */
91 enum {
92 /* GID index, used for both ETH or IB link layer. */
93 UCT_IB_ADDRESS_FLAG_GID_INDEX = UCS_BIT(0),
94 /* Defines path MTU size, used for both ETH or IB link layer. */
95 UCT_IB_ADDRESS_FLAG_PATH_MTU = UCS_BIT(1),
96 /* PKEY value, used for both ETH or IB link layer. */
97 UCT_IB_ADDRESS_FLAG_PKEY = UCS_BIT(2),
98
99 /* If set - ETH link layer, else- IB link layer. */
100 UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH = UCS_BIT(3),
101
102 /* Used for ETH link layer. */
103 UCT_IB_ADDRESS_FLAG_ROCE_IPV6 = UCS_BIT(4),
104 /* Used for ETH link layer, following bits are used to pack RoCE version. */
105 UCT_IB_ADDRESS_FLAG_ETH_LAST = UCS_BIT(5),
106
107 /* Used for IB link layer. */
108 UCT_IB_ADDRESS_FLAG_SUBNET16 = UCS_BIT(4),
109 /* Used for IB link layer. */
110 UCT_IB_ADDRESS_FLAG_SUBNET64 = UCS_BIT(5),
111 /* Used for IB link layer. */
112 UCT_IB_ADDRESS_FLAG_IF_ID = UCS_BIT(6)
113 };
114
115
116 /**
117 * IB network address
118 */
119 typedef struct uct_ib_address {
120 /* Using flags from UCT_IB_ADDRESS_FLAG_xx
121 * For ETH link layer, the 4 msb's are used to indicate the RoCE version -
122 * (by shifting the UCT_IB_DEVICE_ROCE_xx values when packing and unpacking
123 * the ib address) */
124 uint8_t flags;
125 /* Following fields appear in this order (if specified by flags).
126 * The full gid always appears last:
127 * - uint16_t lid
128 * - uint64_t if_id
129 * - uint16_t subnet16
130 * - uint64_t subnet64
131 * For RoCE:
132 * - uint8_t gid[16]
133 */
134 } UCS_S_PACKED uct_ib_address_t;
135
136
/**
 * PCI identifier of a device: a (vendor, device) pair of 16-bit ids.
 */
typedef struct {
    uint16_t vendor; /* Vendor id */
    uint16_t device; /* Device id */
} uct_ib_pci_id_t;
144
145
146 /**
147 * IB device specification.
148 */
149 typedef struct uct_ib_device_spec {
150 const char *name;
151 uct_ib_pci_id_t pci_id;
152 unsigned flags;
153 uint8_t priority;
154 } uct_ib_device_spec_t;
155
156
157 KHASH_TYPE(uct_ib_ah, struct ibv_ah_attr, struct ibv_ah*);
158
159 /**
160 * IB device (corresponds to HCA)
161 */
162 typedef struct uct_ib_device {
163 struct ibv_context *ibv_context; /* Verbs context */
164 uct_ib_device_attr dev_attr; /* Cached device attributes */
165 uint8_t first_port; /* Number of first port (usually 1) */
166 uint8_t num_ports; /* Amount of physical ports */
167 ucs_sys_cpuset_t local_cpus; /* CPUs local to device */
168 int numa_node; /* NUMA node of the device */
169 int async_events; /* Whether async events are handled */
170 int max_zcopy_log_sge; /* Maximum sges log for zcopy am */
171 UCS_STATS_NODE_DECLARE(stats)
172 struct ibv_port_attr port_attr[UCT_IB_DEV_MAX_PORTS]; /* Cached port attributes */
173 uct_ib_pci_id_t pci_id;
174 unsigned flags;
175 uint8_t atomic_arg_sizes;
176 uint8_t atomic_arg_sizes_be;
177 uint8_t ext_atomic_arg_sizes;
178 uint8_t ext_atomic_arg_sizes_be;
179 uint8_t pci_fadd_arg_sizes;
180 uint8_t pci_cswap_arg_sizes;
181 uint8_t atomic_align;
182 /* AH hash */
183 khash_t(uct_ib_ah) ah_hash;
184 ucs_recursive_spinlock_t ah_lock;
185 } uct_ib_device_t;
186
187
188 /**
189 * RoCE version
190 */
191 typedef struct uct_ib_roce_version_info {
192 /** RoCE version described by the UCT_IB_DEVICE_ROCE_xx values */
193 uct_ib_roce_version_t ver;
194 /** Address family of the port */
195 sa_family_t addr_family;
196 } uct_ib_roce_version_info_t;
197
198
199 typedef struct {
200 union ibv_gid gid;
201 uint8_t gid_index; /* IB/RoCE GID index to use */
202 uct_ib_roce_version_info_t roce_info; /* For a RoCE port */
203 } uct_ib_device_gid_info_t;
204
205
206 typedef struct {
207 enum ibv_event_type event_type;
208 union {
209 uint8_t port_num;
210 uint32_t qp_num;
211 uint32_t dct_num;
212 void *cookie;
213 };
214 } uct_ib_async_event_t;
215
216
/*
 * Table of time values, defined outside this header.
 * NOTE(review): by its name these are QP RNR timer values in milliseconds;
 * confirm the indexing and length at the definition.
 */
extern const double uct_ib_qp_rnr_time_ms[];
218
219
220 /**
221 * Check if a port on a device is active and supports the given flags.
222 */
223 ucs_status_t uct_ib_device_port_check(uct_ib_device_t *dev, uint8_t port_num,
224 unsigned flags);
225
226
227 /*
228 * Helper function to list IB transport resources.
229 *
230 * @param dev IB device.
231 * @param flags Transport requirements from IB device (see UCT_IB_RESOURCE_FLAG_xx)
232 * @param devices_p Filled with a pointer to an array of devices.
233 * @param num_devices_p Filled with the number of devices.
234 */
235 ucs_status_t uct_ib_device_query_ports(uct_ib_device_t *dev, unsigned flags,
236 uct_tl_device_resource_t **devices_p,
237 unsigned *num_devices_p);
238
239 ucs_status_t uct_ib_device_query(uct_ib_device_t *dev,
240 struct ibv_device *ibv_device);
241
242 ucs_status_t uct_ib_device_init(uct_ib_device_t *dev,
243 struct ibv_device *ibv_device, int async_events
244 UCS_STATS_ARG(ucs_stats_node_t *stats_parent));
245
246 void uct_ib_device_cleanup(uct_ib_device_t *dev);
247
248
249 /**
250 * @return device specification.
251 */
252 const uct_ib_device_spec_t* uct_ib_device_spec(uct_ib_device_t *dev);
253
254
255 /**
256 * Select the best gid to use and set its information on the RoCE port -
257 * gid index, RoCE version and address family.
258 *
259 * @param [in] dev IB device.
260 * @param [in] port_num Port number.
261 * @param [out] gid_info Filled with the selected gid index and the
262 * port's RoCE version and address family.
263 */
264 ucs_status_t uct_ib_device_select_gid(uct_ib_device_t *dev,
265 uint8_t port_num,
266 uct_ib_device_gid_info_t *gid_info);
267
268
269 /**
270 * @return device name.
271 */
272 const char *uct_ib_device_name(uct_ib_device_t *dev);
273
274
275 /**
276 * For the given IB device find the associated bus information
277 *
278 * @param [in] dev IB device.
279 * @param [in] port_num Port number.
280 * @param [out] bus_id Bus information.
281 */
282 ucs_status_t uct_ib_device_bus(uct_ib_device_t *dev, int port_num,
283 ucs_sys_bus_id_t *bus_id);
284
285 /**
286 * @return whether the port is InfiniBand
287 */
288 int uct_ib_device_is_port_ib(uct_ib_device_t *dev, uint8_t port_num);
289
290
291 /**
292 * @return whether the port is RoCE
293 */
294 int uct_ib_device_is_port_roce(uct_ib_device_t *dev, uint8_t port_num);
295
296
/**
 * Test whether a raw gid is zero.
 *
 * @param gid_raw  Raw gid bytes.
 *                 NOTE(review): presumably 16 bytes, like the gid[16] field
 *                 of the IB address - confirm.
 *
 * @return 1 if the gid_raw is 0, 0 otherwise.
 */
int uct_ib_device_is_gid_raw_empty(uint8_t *gid_raw);
301
302
/**
 * Convert time-in-seconds to IB fabric QP time value
 *
 * @param time  Time, in seconds.
 *
 * @return The IB fabric QP time value.
 */
uint8_t uct_ib_to_qp_fabric_time(double time);
307
308
/**
 * Convert time-in-seconds to IB fabric RNR time value
 *
 * @param time  Time, in seconds.
 *
 * @return The IB fabric RNR time value.
 */
uint8_t uct_ib_to_rnr_fabric_time(double time);
313
314
/**
 * Translate a verbs MTU enumerator to a size.
 *
 * @param mtu  Verbs MTU enumerator.
 *
 * @return MTU in bytes.
 */
size_t uct_ib_mtu_value(enum ibv_mtu mtu);
319
320
321 /**
322 * Modify QP to a given state and check for error
323 */
324 ucs_status_t uct_ib_modify_qp(struct ibv_qp *qp, enum ibv_qp_state state);
325
326
327 /**
328 * find device mtu. This function can be used before ib
329 * interface is created.
330 */
331 ucs_status_t uct_ib_device_mtu(const char *dev_name, uct_md_h md, int *p_mtu);
332
333 ucs_status_t uct_ib_device_find_port(uct_ib_device_t *dev,
334 const char *resource_dev_name,
335 uint8_t *p_port_num);
336
337 size_t uct_ib_device_odp_max_size(uct_ib_device_t *dev);
338
339 const char *uct_ib_wc_status_str(enum ibv_wc_status wc_status);
340
341 ucs_status_t uct_ib_device_create_ah_cached(uct_ib_device_t *dev,
342 struct ibv_ah_attr *ah_attr,
343 struct ibv_pd *pd,
344 struct ibv_ah **ah_p);
345
346 void uct_ib_device_cleanup_ah_cached(uct_ib_device_t *dev);
347
348 unsigned uct_ib_device_get_roce_lag_level(uct_ib_device_t *dev,
349 uint8_t port_num);
350
351
352 static inline struct ibv_port_attr*
uct_ib_device_port_attr(uct_ib_device_t * dev,uint8_t port_num)353 uct_ib_device_port_attr(uct_ib_device_t *dev, uint8_t port_num)
354 {
355 return &dev->port_attr[port_num - dev->first_port];
356 }
357
/**
 * Check whether the device has PCI atomics for 32-bit or 64-bit arguments.
 *
 * The fetch-add and compare-swap size masks are combined and tested against
 * the bits whose values are sizeof(uint32_t) and sizeof(uint64_t).
 *
 * @param dev  IB device.
 *
 * @return 1 if either mask has one of those bits set, 0 otherwise.
 */
static inline int uct_ib_device_has_pci_atomics(uct_ib_device_t *dev)
{
    return !!((dev->pci_fadd_arg_sizes | dev->pci_cswap_arg_sizes) &
              (sizeof(uint32_t) | sizeof(uint64_t)));
}
363
364 const char *uct_ib_roce_version_str(uct_ib_roce_version_t roce_ver);
365
366 const char *uct_ib_gid_str(const union ibv_gid *gid, char *str, size_t max_size);
367
368 ucs_status_t uct_ib_device_query_gid(uct_ib_device_t *dev, uint8_t port_num,
369 unsigned gid_index, union ibv_gid *gid);
370
371 ucs_status_t uct_ib_device_query_gid_info(struct ibv_context *ctx, const char *dev_name,
372 uint8_t port_num, unsigned gid_index,
373 uct_ib_device_gid_info_t *info);
374
375 int uct_ib_device_test_roce_gid_index(uct_ib_device_t *dev, uint8_t port_num,
376 const union ibv_gid *gid,
377 uint8_t gid_index);
378
379 int uct_ib_get_cqe_size(int cqe_size_min);
380
381 const char* uct_ib_ah_attr_str(char *buf, size_t max,
382 const struct ibv_ah_attr *ah_attr);
383
uct_ib_poll_cq(struct ibv_cq * cq,unsigned * count,struct ibv_wc * wcs)384 static inline ucs_status_t uct_ib_poll_cq(struct ibv_cq *cq, unsigned *count, struct ibv_wc *wcs)
385 {
386 int ret;
387
388 ret = ibv_poll_cq(cq, *count, wcs);
389 if (ret <= 0) {
390 if (ucs_likely(ret == 0)) {
391 return UCS_ERR_NO_PROGRESS;
392 }
393 ucs_fatal("failed to poll receive CQ %d", ret);
394 }
395
396 *count = ret;
397 return UCS_OK;
398 }
399
400 void uct_ib_handle_async_event(uct_ib_device_t *dev, uct_ib_async_event_t *event);
401
402 #endif
403