1 /**
2 * Copyright (c) UT-Battelle, LLC. 2014-2015. ALL RIGHTS RESERVED.
3 * Copyright (C) Mellanox Technologies Ltd. 2001-2015. ALL RIGHTS RESERVED.
4 * Copyright (C) ARM Ltd. 2016. ALL RIGHTS RESERVED.
5 * See file LICENSE for terms.
6 */
7
8 #ifndef UCT_MM_IFACE_H
9 #define UCT_MM_IFACE_H
10
11 #include "mm_md.h"
12
13 #include <uct/base/uct_iface.h>
14 #include <uct/sm/base/sm_iface.h>
15 #include <ucs/arch/cpu.h>
16 #include <ucs/debug/memtrack.h>
17 #include <ucs/datastruct/arbiter.h>
18 #include <ucs/sys/compiler.h>
19 #include <ucs/sys/sys.h>
20 #include <sys/shm.h>
21 #include <sys/un.h>
22
23
24 enum {
25 UCT_MM_FIFO_ELEM_FLAG_OWNER = UCS_BIT(0), /* new/old info */
26 UCT_MM_FIFO_ELEM_FLAG_INLINE = UCS_BIT(1), /* if inline or not */
27 };
28
29
/* sizeof(uct_mm_fifo_ctl_t) passed through ucs_align_up() with the cache line
 * size as the alignment */
#define UCT_MM_FIFO_CTL_SIZE \
    ucs_align_up(sizeof(uct_mm_fifo_ctl_t), UCS_SYS_CACHE_LINE_SIZE)
32
33
/*
 * Total number of bytes needed for the receive FIFO of _iface:
 *   - the FIFO control segment (UCT_MM_FIFO_CTL_SIZE),
 *   - config.fifo_size elements of config.fifo_elem_size bytes each,
 *   - (UCS_SYS_CACHE_LINE_SIZE - 1) extra bytes of slack.
 *
 * Both config fields are 'unsigned', so the element count is widened to
 * size_t before the multiplication; otherwise the product could wrap around
 * in unsigned arithmetic before being added to the size_t terms.
 */
#define UCT_MM_GET_FIFO_SIZE(_iface) \
    (UCT_MM_FIFO_CTL_SIZE + \
     ((size_t)(_iface)->config.fifo_size * (_iface)->config.fifo_elem_size) + \
     (UCS_SYS_CACHE_LINE_SIZE - 1))
38
39
/* Pointer to the FIFO element located (_index * config.fifo_elem_size) bytes
 * past _fifo, typed as uct_mm_fifo_element_t* */
#define UCT_MM_IFACE_GET_FIFO_ELEM(_iface, _fifo, _index) \
    ((uct_mm_fifo_element_t*) \
     UCS_PTR_BYTE_OFFSET(_fifo, (_index) * (_iface)->config.fifo_elem_size))
43
44
/*
 * Call the mapper operation _func through uct_mm_md_mapper_call() on the
 * memory domain of _iface (taken from (_iface)->super.super.md), forwarding
 * any additional arguments. Evaluates to the value of that call.
 *
 * The temporary uses a macro-private name, so a caller argument that happens
 * to be spelled 'md' is not captured by the local declared here.
 */
#define uct_mm_iface_mapper_call(_iface, _func, ...) \
    ({ \
        uct_mm_md_t *_mm_md = ucs_derived_of((_iface)->super.super.md, \
                                             uct_mm_md_t); \
        uct_mm_md_mapper_call(_mm_md, _func, ## __VA_ARGS__); \
    })
50
/*
 * The FIFO polling window is adjusted with an AIMD (additive increase /
 * multiplicative decrease) scheme:
 * - The window is increased when the number of RX operations completed during
 *   the current iface progress call reaches the window size AND the previous
 *   iface progress call was able to consume its whole window. The second
 *   condition protects the ping-pong pattern, where handling of more than one
 *   RX operation per call should not be expected.
 * - The window is decreased when the number of RX operations completed during
 *   the current iface progress call does not reach the window size.
 *
 * For the original AIMD algorithm, as used for congestion avoidance, see
 * https://en.wikipedia.org/wiki/Additive_increase/multiplicative_decrease
 */
#define UCT_MM_IFACE_FIFO_MIN_POLL   1  /* Smallest FIFO window size */
#define UCT_MM_IFACE_FIFO_MAX_POLL  16  /* Default for the largest FIFO
                                         * window size */
#define UCT_MM_IFACE_FIFO_AI_VALUE   1  /* Increase: window += AI value */
#define UCT_MM_IFACE_FIFO_MD_FACTOR  2  /* Decrease: window /= MD factor */
68
69
70 /**
71 * MM interface configuration
72 */
73 typedef struct uct_mm_iface_config {
74 uct_sm_iface_config_t super;
75 size_t seg_size; /* Size of the receive
76 * descriptor (for payload) */
77 unsigned fifo_size; /* Size of the receive FIFO */
78 size_t fifo_max_poll; /* Maximal RX completions to pick
79 * during RX poll */
80 double release_fifo_factor; /* Tail index update frequency */
81 ucs_ternary_value_t hugetlb_mode; /* Enable using huge pages for
82 * shared memory buffers */
83 unsigned fifo_elem_size; /* Size of the FIFO element size */
84 uct_iface_mpool_config_t mp;
85 } uct_mm_iface_config_t;
86
87
88 /**
89 * MM interface address
90 */
91 typedef struct uct_mm_iface_addr {
92 uct_mm_seg_id_t fifo_seg_id; /* Shared memory identifier of FIFO */
93 /* mapper-specific iface address follows */
94 } UCS_S_PACKED uct_mm_iface_addr_t;
95
96
97 /**
98 * MM FIFO control segment
99 */
100 typedef struct uct_mm_fifo_ctl {
101 /* 1st cacheline */
102 volatile uint64_t head; /* Where to write next */
103 socklen_t signal_addrlen; /* Address length of signaling socket */
104 struct sockaddr_un signal_sockaddr;/* Address of signaling socket */
105 UCS_CACHELINE_PADDING(uint64_t,
106 socklen_t,
107 struct sockaddr_un);
108
109 /* 2nd cacheline */
110 volatile uint64_t tail; /* How much was consumed */
111 } UCS_S_PACKED UCS_V_ALIGNED(UCS_SYS_CACHE_LINE_SIZE) uct_mm_fifo_ctl_t;
112
113
114 /**
115 * MM receive descriptor info in the shared FIFO
116 */
117 typedef struct uct_mm_desc_info {
118 uct_mm_seg_id_t seg_id; /* shared memory segment id */
119 unsigned seg_size; /* size of the shared memory segment */
120 unsigned offset; /* offset inside the shared memory
121 segment */
122 } UCS_S_PACKED uct_mm_desc_info_t;
123
124
125 /**
126 * MM FIFO element
127 */
128 typedef struct uct_mm_fifo_element {
129 uint8_t flags; /* UCT_MM_FIFO_ELEM_FLAG_xx */
130 uint8_t am_id; /* active message id */
131 uint16_t length; /* length of actual data written
132 by producer */
133 uct_mm_desc_info_t desc; /* remote receive descriptor
134 parameters for am_bcopy */
135 void *desc_data; /* pointer to receive descriptor,
136 valid only on receiver */
137
138 /* the data follows here (in case of inline messaging) */
139 } UCS_S_PACKED uct_mm_fifo_element_t;
140
141
142 /*
143 * MM receive descriptor:
144 *
145 * +--------------------+---------------+-----------+
146 * | uct_mm_recv_desc_t | user-defined | data |
147 * | (info + rdesc) | rx headroom | (payload) |
148 * +--------------------+---------------+-----------+
149 */
150 typedef struct uct_mm_recv_desc {
151 uct_mm_desc_info_t info; /* descriptor information for the
152 remote side which writes to it */
153 uct_recv_desc_t recv; /* has to be in the end */
154 } uct_mm_recv_desc_t;
155
156
157 /**
158 * MM trandport interface
159 */
160 typedef struct uct_mm_iface {
161 uct_sm_iface_t super;
162
163 /* Receive FIFO */
164 uct_allocated_memory_t recv_fifo_mem;
165
166 uct_mm_fifo_ctl_t *recv_fifo_ctl; /* pointer to the struct at the */
167 /* beginning of the receive fifo */
168 /* which holds the head and the tail. */
169 /* this struct is cache line aligned and */
170 /* doesn't necessarily start where */
171 /* shared_mem starts */
172 void *recv_fifo_elems; /* pointer to the first fifo element
173 in the receive fifo */
174 uct_mm_fifo_element_t *read_index_elem;
175 uint64_t read_index; /* actual reading location */
176
177 uint8_t fifo_shift; /* = log2(fifo_size) */
178 unsigned fifo_mask; /* = 2^fifo_shift - 1 */
179 uint64_t fifo_release_factor_mask;
180
181 unsigned fifo_poll_count; /* How much RX operations can be polled
182 * during an iface progress call */
183 int fifo_prev_wnd_cons; /* Was FIFO window size fully consumed by
184 * the previous call to iface progress */
185
186 ucs_mpool_t recv_desc_mp;
187 uct_mm_recv_desc_t *last_recv_desc; /* next receive descriptor to use */
188
189 int signal_fd; /* Unix socket for receiving remote signal */
190
191 size_t rx_headroom;
192 ucs_arbiter_t arbiter;
193 uct_recv_desc_t release_desc;
194
195 struct {
196 unsigned fifo_size;
197 unsigned fifo_elem_size;
198 unsigned seg_size; /* size of the receive descriptor (for payload)*/
199 unsigned fifo_max_poll;
200 } config;
201 } uct_mm_iface_t;
202
203
/*
 * Define a memory-mapper transport for MM. Expands to
 * UCT_MM_COMPONENT_DEFINE() for the component uct_<_name>_component, followed
 * by UCT_TL_DEFINE() for a transport on top of that component.
 *
 * @param _name          Component name token
 * @param _md_ops        Memory domain operations, of type uct_mm_md_ops_t.
 * @param _rkey_unpack   Remote key unpack function
 * @param _rkey_release  Remote key release function
 * @param _cfg_prefix    Prefix for configuration variables.
 */
#define UCT_MM_TL_DEFINE(_name, _md_ops, _rkey_unpack, _rkey_release, \
                         _cfg_prefix) \
    \
    UCT_MM_COMPONENT_DEFINE(uct_##_name##_component, _name, _md_ops, \
                            _rkey_unpack, _rkey_release, _cfg_prefix) \
    \
    UCT_TL_DEFINE(&(uct_##_name##_component).super, \
                  _name, \
                  uct_sm_base_query_tl_devices, \
                  uct_mm_iface_t, \
                  "MM_", \
                  uct_mm_iface_config_table, \
                  uct_mm_iface_config_t);
226
227
228 extern ucs_config_field_t uct_mm_iface_config_table[];
229
230
231 static UCS_F_ALWAYS_INLINE ucs_status_t
uct_mm_iface_invoke_am(uct_mm_iface_t * iface,uint8_t am_id,void * data,unsigned length,unsigned flags)232 uct_mm_iface_invoke_am(uct_mm_iface_t *iface, uint8_t am_id, void *data,
233 unsigned length, unsigned flags)
234 {
235 ucs_status_t status;
236 void *desc;
237
238 status = uct_iface_invoke_am(&iface->super.super, am_id, data, length,
239 flags);
240
241 if (status == UCS_INPROGRESS) {
242 desc = (void *)((uintptr_t)data - iface->rx_headroom);
243 /* save the release_desc for later release of this desc */
244 uct_recv_desc(desc) = &iface->release_desc;
245 }
246
247 return status;
248 }
249
250
251 /**
252 * Set aligned pointers of the FIFO according to the beginning of the allocated
253 * memory.
254 * @param [in] fifo_mem Pointer to the beginning of the allocated memory.
255 * @param [out] fifo_ctl_p Pointer to the FIFO control structure.
256 * @param [out] fifo_elems Pointer to the array of FIFO elements.
257 */
258 void uct_mm_iface_set_fifo_ptrs(void *fifo_mem, uct_mm_fifo_ctl_t **fifo_ctl_p,
259 void **fifo_elems_p);
260
261
262 UCS_CLASS_DECLARE_NEW_FUNC(uct_mm_iface_t, uct_iface_t, uct_md_h, uct_worker_h,
263 const uct_iface_params_t*, const uct_iface_config_t*);
264
265
266 void uct_mm_iface_release_desc(uct_recv_desc_t *self, void *desc);
267
268
269 ucs_status_t uct_mm_flush();
270
271
272 #endif
273