1 /**
2 * Copyright (C) Mellanox Technologies Ltd. 2001-2014.  ALL RIGHTS RESERVED.
3 *
4 * See file LICENSE for terms.
5 */
6 
7 #ifndef UCT_IB_DEVICE_H
8 #define UCT_IB_DEVICE_H
9 
10 #include "ib_verbs.h"
11 
12 #include <uct/api/uct.h>
13 #include <uct/base/uct_iface.h>
14 #include <ucs/stats/stats.h>
15 #include <ucs/debug/assert.h>
16 #include <ucs/datastruct/khash.h>
17 #include <ucs/type/spinlock.h>
18 #include <ucs/sys/sock.h>
19 #include <ucs/sys/topo.h>
20 
21 #include <endian.h>
22 #include <linux/ip.h>
23 
24 
/*
 * IB / RoCE protocol constants. Header and footer lengths are in bytes.
 */
#define UCT_IB_QPN_ORDER                  24  /* How many bits can be an IB QP number */
#define UCT_IB_LRH_LEN                    8   /* IB Local routing header */
#define UCT_IB_GRH_LEN                    40  /* IB Global routing header */
#define UCT_IB_BTH_LEN                    12  /* IB base transport header */
#define UCT_IB_ROCE_LEN                   14  /* Ethernet header -
                                                 6B for Destination MAC +
                                                 6B for Source MAC + 2B Type (RoCE) */
#define UCT_IB_DETH_LEN                   8   /* IB datagram header */
#define UCT_IB_RETH_LEN                   16  /* IB RDMA header */
#define UCT_IB_ATOMIC_ETH_LEN             28  /* IB atomic header */
#define UCT_IB_AETH_LEN                   4   /* IB ack */
#define UCT_IB_PAYLOAD_ALIGN              4   /* IB payload padding */
#define UCT_IB_ICRC_LEN                   4   /* IB invariant crc footer */
#define UCT_IB_VCRC_LEN                   2   /* IB variant crc footer */
#define UCT_IB_DELIM_LEN                  2   /* IB wire delimiter */
#define UCT_IB_FDR_PACKET_GAP             64  /* Minimal FDR packet gap */
#define UCT_IB_MAX_MESSAGE_SIZE           (2UL << 30) /* Maximal IB message size (2^31 bytes) */
#define UCT_IB_PKEY_PARTITION_MASK        0x7fff /* IB partition number mask */
#define UCT_IB_PKEY_MEMBERSHIP_MASK       0x8000 /* Full/send-only member */
#define UCT_IB_DEV_MAX_PORTS              2   /* Size of the cached port attributes
                                                 array in uct_ib_device_t */
#define UCT_IB_FABRIC_TIME_MAX            32  /* NOTE(review): presumably the upper bound
                                                 of the IB fabric time encoding - confirm */
#define UCT_IB_INVALID_RKEY               0xffffffffu /* All-ones value, by its name marks
                                                         an invalid rkey */
#define UCT_IB_KEY                        0x1ee7a330  /* NOTE(review): usage is not visible
                                                         in this header */

/* GID prefixes and mask; be64toh() (from <endian.h>) is applied to the
 * big-endian constant, so the byte order of the result depends on the host */
#define UCT_IB_LINK_LOCAL_PREFIX          be64toh(0xfe80000000000000ul) /* IBTA 4.1.1 12a */
#define UCT_IB_SITE_LOCAL_PREFIX          be64toh(0xfec0000000000000ul) /* IBTA 4.1.1 12b */
#define UCT_IB_SITE_LOCAL_MASK            be64toh(0xffffffffffff0000ul) /* IBTA 4.1.1 12b */

#define UCT_IB_DEFAULT_ROCEV2_DSCP        106  /* Default DSCP for RoCE v2 */
#define UCT_IB_ROCE_UDP_SRC_PORT_BASE     0xC000 /* NOTE(review): presumably the lowest UDP
                                                    source port used for RoCE - confirm */

/* printf-style templates of sysfs paths. The conversions are presumably
 * filled with the device name (%s), then port number and GID index (%d) -
 * TODO confirm against the callers */
#define UCT_IB_DEVICE_SYSFS_PFX           "/sys/class/infiniband/%s"
#define UCT_IB_DEVICE_SYSFS_FMT           UCT_IB_DEVICE_SYSFS_PFX "/device/%s"
#define UCT_IB_DEVICE_SYSFS_GID_ATTR_PFX  UCT_IB_DEVICE_SYSFS_PFX "/ports/%d/gid_attrs"
#define UCT_IB_DEVICE_SYSFS_GID_TYPE_FMT  UCT_IB_DEVICE_SYSFS_GID_ATTR_PFX "/types/%d"
#define UCT_IB_DEVICE_SYSFS_GID_NDEV_FMT  UCT_IB_DEVICE_SYSFS_GID_ATTR_PFX "/ndevs/%d"
58 
59 
/**
 * Device statistics counter identifiers (presumably used with the stats node
 * declared in uct_ib_device_t - TODO confirm).
 */
enum {
    UCT_IB_DEVICE_STAT_ASYNC_EVENT,
    UCT_IB_DEVICE_STAT_LAST          /* Number of identifiers, must remain last */
};
64 
65 
/**
 * RoCE protocol versions. The numeric values start at 0 and are consecutive;
 * according to the uct_ib_address_t description they are shifted into the
 * address flags when an address is packed, so they should not be renumbered.
 */
typedef enum uct_ib_roce_version {
    UCT_IB_DEVICE_ROCE_V1,
    UCT_IB_DEVICE_ROCE_V1_5,
    UCT_IB_DEVICE_ROCE_V2,
    UCT_IB_DEVICE_ROCE_ANY
} uct_ib_roce_version_t;
72 
73 
74 enum {
75     UCT_IB_DEVICE_FLAG_MLX4_PRM = UCS_BIT(1),   /* Device supports mlx4 PRM */
76     UCT_IB_DEVICE_FLAG_MLX5_PRM = UCS_BIT(2),   /* Device supports mlx5 PRM */
77     UCT_IB_DEVICE_FLAG_MELLANOX = UCS_BIT(3),   /* Mellanox device */
78     UCT_IB_DEVICE_FLAG_LINK_IB  = UCS_BIT(5),   /* Require only IB */
79     UCT_IB_DEVICE_FLAG_DC_V1    = UCS_BIT(6),   /* Device supports DC ver 1 */
80     UCT_IB_DEVICE_FLAG_DC_V2    = UCS_BIT(7),   /* Device supports DC ver 2 */
81     UCT_IB_DEVICE_FLAG_AV       = UCS_BIT(8),   /* Device supports compact AV */
82     UCT_IB_DEVICE_FLAG_DC       = UCT_IB_DEVICE_FLAG_DC_V1 |
83                                   UCT_IB_DEVICE_FLAG_DC_V2, /* Device supports DC */
84     UCT_IB_DEVICE_FLAG_ODP_IMPLICIT = UCS_BIT(9),
85 };
86 
87 
88 /**
89  * Flags which specify which address fields are present
90  */
91 enum {
92     /* GID index, used for both ETH or IB link layer.  */
93     UCT_IB_ADDRESS_FLAG_GID_INDEX      = UCS_BIT(0),
94     /* Defines path MTU size, used for both ETH or IB link layer. */
95     UCT_IB_ADDRESS_FLAG_PATH_MTU       = UCS_BIT(1),
96     /* PKEY value, used for both ETH or IB link layer. */
97     UCT_IB_ADDRESS_FLAG_PKEY           = UCS_BIT(2),
98 
99     /* If set - ETH link layer, else- IB link layer. */
100     UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH = UCS_BIT(3),
101 
102     /* Used for ETH link layer. */
103     UCT_IB_ADDRESS_FLAG_ROCE_IPV6      = UCS_BIT(4),
104     /* Used for ETH link layer, following bits are used to pack RoCE version. */
105     UCT_IB_ADDRESS_FLAG_ETH_LAST       = UCS_BIT(5),
106 
107     /* Used for IB link layer. */
108     UCT_IB_ADDRESS_FLAG_SUBNET16       = UCS_BIT(4),
109     /* Used for IB link layer. */
110     UCT_IB_ADDRESS_FLAG_SUBNET64       = UCS_BIT(5),
111     /* Used for IB link layer. */
112     UCT_IB_ADDRESS_FLAG_IF_ID          = UCS_BIT(6)
113 };
114 
115 
116 /**
117  * IB network address
118  */
119 typedef struct uct_ib_address {
120     /* Using flags from UCT_IB_ADDRESS_FLAG_xx
121      * For ETH link layer, the 4 msb's are used to indicate the RoCE version -
122      * (by shifting the UCT_IB_DEVICE_ROCE_xx values when packing and unpacking
123      * the ib address) */
124     uint8_t            flags;
125     /* Following fields appear in this order (if specified by flags).
126      * The full gid always appears last:
127      * - uint16_t lid
128      * - uint64_t if_id
129      * - uint16_t subnet16
130      * - uint64_t subnet64
131      * For RoCE:
132      * - uint8_t gid[16]
133      */
134 } UCS_S_PACKED uct_ib_address_t;
135 
136 
/**
 * PCI identifier of a device
 */
typedef struct {
    uint16_t                    vendor;   /* Vendor ID */
    uint16_t                    device;   /* Device ID */
} uct_ib_pci_id_t;
144 
145 
146 /**
147  * IB device specification.
148  */
149 typedef struct uct_ib_device_spec {
150     const char                  *name;
151     uct_ib_pci_id_t             pci_id;
152     unsigned                    flags;
153     uint8_t                     priority;
154 } uct_ib_device_spec_t;
155 
156 
157 KHASH_TYPE(uct_ib_ah, struct ibv_ah_attr, struct ibv_ah*);
158 
159 /**
160  * IB device (corresponds to HCA)
161  */
162 typedef struct uct_ib_device {
163     struct ibv_context          *ibv_context;    /* Verbs context */
164     uct_ib_device_attr          dev_attr;        /* Cached device attributes */
165     uint8_t                     first_port;      /* Number of first port (usually 1) */
166     uint8_t                     num_ports;       /* Amount of physical ports */
167     ucs_sys_cpuset_t            local_cpus;      /* CPUs local to device */
168     int                         numa_node;       /* NUMA node of the device */
169     int                         async_events;    /* Whether async events are handled */
170     int                         max_zcopy_log_sge; /* Maximum sges log for zcopy am */
171     UCS_STATS_NODE_DECLARE(stats)
172     struct ibv_port_attr        port_attr[UCT_IB_DEV_MAX_PORTS]; /* Cached port attributes */
173     uct_ib_pci_id_t             pci_id;
174     unsigned                    flags;
175     uint8_t                     atomic_arg_sizes;
176     uint8_t                     atomic_arg_sizes_be;
177     uint8_t                     ext_atomic_arg_sizes;
178     uint8_t                     ext_atomic_arg_sizes_be;
179     uint8_t                     pci_fadd_arg_sizes;
180     uint8_t                     pci_cswap_arg_sizes;
181     uint8_t                     atomic_align;
182     /* AH hash */
183     khash_t(uct_ib_ah)          ah_hash;
184     ucs_recursive_spinlock_t    ah_lock;
185 } uct_ib_device_t;
186 
187 
188 /**
189  * RoCE version
190  */
191 typedef struct uct_ib_roce_version_info {
192     /** RoCE version described by the UCT_IB_DEVICE_ROCE_xx values */
193     uct_ib_roce_version_t ver;
194     /** Address family of the port */
195     sa_family_t           addr_family;
196 } uct_ib_roce_version_info_t;
197 
198 
199 typedef struct {
200     union ibv_gid              gid;
201     uint8_t                    gid_index;    /* IB/RoCE GID index to use */
202     uct_ib_roce_version_info_t roce_info;    /* For a RoCE port */
203 } uct_ib_device_gid_info_t;
204 
205 
206 typedef struct {
207     enum ibv_event_type event_type;
208     union {
209         uint8_t         port_num;
210         uint32_t        qp_num;
211         uint32_t        dct_num;
212         void            *cookie;
213     };
214 } uct_ib_async_event_t;
215 
216 
217 extern const double uct_ib_qp_rnr_time_ms[];
218 
219 
220 /**
221  * Check if a port on a device is active and supports the given flags.
222  */
223 ucs_status_t uct_ib_device_port_check(uct_ib_device_t *dev, uint8_t port_num,
224                                       unsigned flags);
225 
226 
227 /*
228  * Helper function to list IB transport resources.
229  *
230  * @param dev              IB device.
231  * @param flags            Transport requirements from IB device (see UCT_IB_RESOURCE_FLAG_xx)
232  * @param devices_p        Filled with a pointer to an array of devices.
233  * @param num_devices_p    Filled with the number of devices.
234  */
235 ucs_status_t uct_ib_device_query_ports(uct_ib_device_t *dev, unsigned flags,
236                                        uct_tl_device_resource_t **devices_p,
237                                        unsigned *num_devices_p);
238 
239 ucs_status_t uct_ib_device_query(uct_ib_device_t *dev,
240                                  struct ibv_device *ibv_device);
241 
242 ucs_status_t uct_ib_device_init(uct_ib_device_t *dev,
243                                 struct ibv_device *ibv_device, int async_events
244                                 UCS_STATS_ARG(ucs_stats_node_t *stats_parent));
245 
246 void uct_ib_device_cleanup(uct_ib_device_t *dev);
247 
248 
249 /**
250  * @return device specification.
251  */
252 const uct_ib_device_spec_t* uct_ib_device_spec(uct_ib_device_t *dev);
253 
254 
255 /**
256  * Select the best gid to use and set its information on the RoCE port -
257  * gid index, RoCE version and address family.
258  *
259  * @param [in]  dev             IB device.
260  * @param [in]  port_num        Port number.
261  * @param [out] gid_info        Filled with the selected gid index and the
262  *                              port's RoCE version and address family.
263  */
264 ucs_status_t uct_ib_device_select_gid(uct_ib_device_t *dev,
265                                       uint8_t port_num,
266                                       uct_ib_device_gid_info_t *gid_info);
267 
268 
269 /**
270  * @return device name.
271  */
272 const char *uct_ib_device_name(uct_ib_device_t *dev);
273 
274 
275 /**
276  * For the given IB device find the associated bus information
277  *
278  * @param [in]  dev             IB device.
279  * @param [in]  port_num        Port number.
280  * @param [out] bus_id          Bus information.
281  */
282 ucs_status_t uct_ib_device_bus(uct_ib_device_t *dev, int port_num,
283                                ucs_sys_bus_id_t *bus_id);
284 
285 /**
286  * @return whether the port is InfiniBand
287  */
288 int uct_ib_device_is_port_ib(uct_ib_device_t *dev, uint8_t port_num);
289 
290 
291 /**
292  * @return whether the port is RoCE
293  */
294 int uct_ib_device_is_port_roce(uct_ib_device_t *dev, uint8_t port_num);
295 
296 
297 /**
298  * @return 1 if the gid_raw is 0, 0 otherwise.
299  */
300 int uct_ib_device_is_gid_raw_empty(uint8_t *gid_raw);
301 
302 
303 /**
304  * Convert time-in-seconds to IB fabric QP time value
305  */
306 uint8_t uct_ib_to_qp_fabric_time(double time);
307 
308 
309 /**
310  * Convert time-in-seconds to IB fabric RNR time value
311  */
312 uint8_t uct_ib_to_rnr_fabric_time(double time);
313 
314 
315 /**
316  * @return MTU in bytes.
317  */
318 size_t uct_ib_mtu_value(enum ibv_mtu mtu);
319 
320 
321 /**
322  * Modify QP to a given state and check for error
323  */
324 ucs_status_t uct_ib_modify_qp(struct ibv_qp *qp, enum ibv_qp_state state);
325 
326 
327 /**
328  * find device mtu. This function can be used before ib
329  * interface is created.
330  */
331 ucs_status_t uct_ib_device_mtu(const char *dev_name, uct_md_h md, int *p_mtu);
332 
333 ucs_status_t uct_ib_device_find_port(uct_ib_device_t *dev,
334                                      const char *resource_dev_name,
335                                      uint8_t *p_port_num);
336 
337 size_t uct_ib_device_odp_max_size(uct_ib_device_t *dev);
338 
339 const char *uct_ib_wc_status_str(enum ibv_wc_status wc_status);
340 
341 ucs_status_t uct_ib_device_create_ah_cached(uct_ib_device_t *dev,
342                                             struct ibv_ah_attr *ah_attr,
343                                             struct ibv_pd *pd,
344                                             struct ibv_ah **ah_p);
345 
346 void uct_ib_device_cleanup_ah_cached(uct_ib_device_t *dev);
347 
348 unsigned uct_ib_device_get_roce_lag_level(uct_ib_device_t *dev,
349                                           uint8_t port_num);
350 
351 
352 static inline struct ibv_port_attr*
uct_ib_device_port_attr(uct_ib_device_t * dev,uint8_t port_num)353 uct_ib_device_port_attr(uct_ib_device_t *dev, uint8_t port_num)
354 {
355     return &dev->port_attr[port_num - dev->first_port];
356 }
357 
/**
 * @return 1 if the 4-byte or the 8-byte operand size bit is set in either
 *         dev->pci_fadd_arg_sizes or dev->pci_cswap_arg_sizes, 0 otherwise.
 */
static inline int uct_ib_device_has_pci_atomics(uct_ib_device_t *dev)
{
    /* The size fields are bitmasks of operand sizes in bytes, so the mask
     * is sizeof(uint32_t) | sizeof(uint64_t) */
    return !!((dev->pci_fadd_arg_sizes | dev->pci_cswap_arg_sizes) &
              (sizeof(uint32_t) | sizeof(uint64_t)));
}
363 
364 const char *uct_ib_roce_version_str(uct_ib_roce_version_t roce_ver);
365 
366 const char *uct_ib_gid_str(const union ibv_gid *gid, char *str, size_t max_size);
367 
368 ucs_status_t uct_ib_device_query_gid(uct_ib_device_t *dev, uint8_t port_num,
369                                      unsigned gid_index, union ibv_gid *gid);
370 
371 ucs_status_t uct_ib_device_query_gid_info(struct ibv_context *ctx, const char *dev_name,
372                                           uint8_t port_num, unsigned gid_index,
373                                           uct_ib_device_gid_info_t *info);
374 
375 int uct_ib_device_test_roce_gid_index(uct_ib_device_t *dev, uint8_t port_num,
376                                       const union ibv_gid *gid,
377                                       uint8_t gid_index);
378 
379 int uct_ib_get_cqe_size(int cqe_size_min);
380 
381 const char* uct_ib_ah_attr_str(char *buf, size_t max,
382                                const struct ibv_ah_attr *ah_attr);
383 
uct_ib_poll_cq(struct ibv_cq * cq,unsigned * count,struct ibv_wc * wcs)384 static inline ucs_status_t uct_ib_poll_cq(struct ibv_cq *cq, unsigned *count, struct ibv_wc *wcs)
385 {
386     int ret;
387 
388     ret = ibv_poll_cq(cq, *count, wcs);
389     if (ret <= 0) {
390         if (ucs_likely(ret == 0)) {
391             return UCS_ERR_NO_PROGRESS;
392         }
393         ucs_fatal("failed to poll receive CQ %d", ret);
394     }
395 
396     *count = ret;
397     return UCS_OK;
398 }
399 
400 void uct_ib_handle_async_event(uct_ib_device_t *dev, uct_ib_async_event_t *event);
401 
402 #endif
403