1 /**
2  * Copyright (c) UT-Battelle, LLC. 2014-2015. ALL RIGHTS RESERVED.
3  * Copyright (C) Mellanox Technologies Ltd. 2001-2015.  ALL RIGHTS RESERVED.
4  * Copyright (C) ARM Ltd. 2016.  ALL RIGHTS RESERVED.
5  * See file LICENSE for terms.
6  */
7 
8 #ifndef UCT_MM_IFACE_H
9 #define UCT_MM_IFACE_H
10 
11 #include "mm_md.h"
12 
13 #include <uct/base/uct_iface.h>
14 #include <uct/sm/base/sm_iface.h>
15 #include <ucs/arch/cpu.h>
16 #include <ucs/debug/memtrack.h>
17 #include <ucs/datastruct/arbiter.h>
18 #include <ucs/sys/compiler.h>
19 #include <ucs/sys/sys.h>
20 #include <sys/shm.h>
21 #include <sys/un.h>
22 
23 
24 enum {
25     UCT_MM_FIFO_ELEM_FLAG_OWNER  = UCS_BIT(0), /* new/old info */
26     UCT_MM_FIFO_ELEM_FLAG_INLINE = UCS_BIT(1), /* if inline or not */
27 };
28 
29 
/* Size of the FIFO control structure, aligned up to UCS_SYS_CACHE_LINE_SIZE
 * with ucs_align_up(). */
#define UCT_MM_FIFO_CTL_SIZE \
    ucs_align_up(sizeof(uct_mm_fifo_ctl_t), UCS_SYS_CACHE_LINE_SIZE)
32 
33 
/* Total number of bytes for the receive FIFO of _iface: the control area,
 * config.fifo_size elements of config.fifo_elem_size bytes each, and
 * (UCS_SYS_CACHE_LINE_SIZE - 1) extra bytes.
 * NOTE(review): the extra bytes presumably leave room to align the control
 * structure to a cache line inside the allocation - confirm against
 * uct_mm_iface_set_fifo_ptrs().
 * The element count is widened to size_t before multiplying so the product of
 * the two `unsigned` config fields cannot wrap at 32 bits. */
#define UCT_MM_GET_FIFO_SIZE(_iface) \
    (UCT_MM_FIFO_CTL_SIZE + \
     ((size_t)(_iface)->config.fifo_size * (_iface)->config.fifo_elem_size) + \
     (UCS_SYS_CACHE_LINE_SIZE - 1))
38 
39 
/* Pointer to FIFO element number _index, where _fifo is the address of the
 * first element and consecutive elements are (_iface)->config.fifo_elem_size
 * bytes apart. No bounds check or wrap-around is applied to _index here. */
#define UCT_MM_IFACE_GET_FIFO_ELEM(_iface, _fifo, _index) \
    ((uct_mm_fifo_element_t*) \
     UCS_PTR_BYTE_OFFSET(_fifo, (_index) * (_iface)->config.fifo_elem_size))
43 
44 
/* Invoke mapper operation _func on the memory domain that _iface belongs to,
 * forwarding any extra arguments. Evaluates to the value of the
 * uct_mm_md_mapper_call() expression (GCC statement expression).
 * The temporary uses an unusual name so that an argument expression which
 * happens to mention a caller variable called `md` is not captured by it. */
#define uct_mm_iface_mapper_call(_iface, _func, ...) \
    ({ \
        uct_mm_md_t *_mapper_md = ucs_derived_of((_iface)->super.super.md, \
                                                 uct_mm_md_t); \
        uct_mm_md_mapper_call(_mapper_md, _func, ## __VA_ARGS__); \
    })
50 
/* The FIFO polling window is adjusted with an AIMD (additive
 * increase/multiplicative decrease) scheme:
 * - The window is increased if the number of RX operations completed during
 *   the current iface progress call reaches the window size AND the previous
 *   iface progress call also consumed its whole window. The second condition
 *   protects a ping-pong pattern, where handling more than one RX operation
 *   per call should not be expected.
 * - The window is decreased if the number of RX operations completed during
 *   the current iface progress call does not reach the window size.
 * See https://en.wikipedia.org/wiki/Additive_increase/multiplicative_decrease
 * for the original AIMD algorithm, which is used for congestion avoidance. */
#define UCT_MM_IFACE_FIFO_MIN_POLL              1 /* Minimal FIFO window size */
#define UCT_MM_IFACE_FIFO_MAX_POLL             16 /* Default value for FIFO maximal
                                                   * window size */
#define UCT_MM_IFACE_FIFO_AI_VALUE              1 /* FIFO window += AI value */
#define UCT_MM_IFACE_FIFO_MD_FACTOR             2 /* FIFO window /= MD factor */
68 
69 
70 /**
71  * MM interface configuration
72  */
73 typedef struct uct_mm_iface_config {
74     uct_sm_iface_config_t    super;
75     size_t                   seg_size;            /* Size of the receive
76                                                    * descriptor (for payload) */
77     unsigned                 fifo_size;           /* Size of the receive FIFO */
78     size_t                   fifo_max_poll;       /* Maximal RX completions to pick
79                                                    * during RX poll */
80     double                   release_fifo_factor; /* Tail index update frequency */
81     ucs_ternary_value_t      hugetlb_mode;        /* Enable using huge pages for
82                                                    * shared memory buffers */
83     unsigned                 fifo_elem_size;      /* Size of the FIFO element size */
84     uct_iface_mpool_config_t mp;
85 } uct_mm_iface_config_t;
86 
87 
88 /**
89  * MM interface address
90  */
91 typedef struct uct_mm_iface_addr {
92     uct_mm_seg_id_t          fifo_seg_id;     /* Shared memory identifier of FIFO */
93     /* mapper-specific iface address follows */
94 } UCS_S_PACKED uct_mm_iface_addr_t;
95 
96 
97 /**
98  * MM FIFO control segment
99  */
100 typedef struct uct_mm_fifo_ctl {
101     /* 1st cacheline */
102     volatile uint64_t         head;           /* Where to write next */
103     socklen_t                 signal_addrlen; /* Address length of signaling socket */
104     struct sockaddr_un        signal_sockaddr;/* Address of signaling socket */
105     UCS_CACHELINE_PADDING(uint64_t,
106                           socklen_t,
107                           struct sockaddr_un);
108 
109     /* 2nd cacheline */
110     volatile uint64_t         tail;           /* How much was consumed */
111 } UCS_S_PACKED UCS_V_ALIGNED(UCS_SYS_CACHE_LINE_SIZE) uct_mm_fifo_ctl_t;
112 
113 
114 /**
115  * MM receive descriptor info in the shared FIFO
116  */
117 typedef struct uct_mm_desc_info {
118     uct_mm_seg_id_t         seg_id;           /* shared memory segment id */
119     unsigned                seg_size;         /* size of the shared memory segment */
120     unsigned                offset;           /* offset inside the shared memory
121                                                  segment */
122 } UCS_S_PACKED uct_mm_desc_info_t;
123 
124 
125 /**
126  * MM FIFO element
127  */
128 typedef struct uct_mm_fifo_element {
129     uint8_t                   flags;          /* UCT_MM_FIFO_ELEM_FLAG_xx */
130     uint8_t                   am_id;          /* active message id */
131     uint16_t                  length;         /* length of actual data written
132                                                  by producer */
133     uct_mm_desc_info_t        desc;           /* remote receive descriptor
134                                                  parameters for am_bcopy */
135     void                      *desc_data;     /* pointer to receive descriptor,
136                                                  valid only on receiver */
137 
138     /* the data follows here (in case of inline messaging) */
139 } UCS_S_PACKED uct_mm_fifo_element_t;
140 
141 
142 /*
143  * MM receive descriptor:
144  *
145  * +--------------------+---------------+-----------+
146  * | uct_mm_recv_desc_t | user-defined  | data      |
147  * | (info + rdesc)     | rx headroom   | (payload) |
148  * +--------------------+---------------+-----------+
149  */
150 typedef struct uct_mm_recv_desc {
151     uct_mm_desc_info_t        info;           /* descriptor information for the
152                                                  remote side which writes to it */
153     uct_recv_desc_t           recv;           /* has to be in the end */
154 } uct_mm_recv_desc_t;
155 
156 
157 /**
158  * MM trandport interface
159  */
160 typedef struct uct_mm_iface {
161     uct_sm_iface_t          super;
162 
163     /* Receive FIFO */
164     uct_allocated_memory_t  recv_fifo_mem;
165 
166     uct_mm_fifo_ctl_t       *recv_fifo_ctl;   /* pointer to the struct at the */
167                                               /* beginning of the receive fifo */
168                                               /* which holds the head and the tail. */
169                                               /* this struct is cache line aligned and */
170                                               /* doesn't necessarily start where */
171                                               /* shared_mem starts */
172     void                    *recv_fifo_elems; /* pointer to the first fifo element
173                                                  in the receive fifo */
174     uct_mm_fifo_element_t   *read_index_elem;
175     uint64_t                read_index;       /* actual reading location */
176 
177     uint8_t                 fifo_shift;       /* = log2(fifo_size) */
178     unsigned                fifo_mask;        /* = 2^fifo_shift - 1 */
179     uint64_t                fifo_release_factor_mask;
180 
181     unsigned                fifo_poll_count;     /* How much RX operations can be polled
182                                                   * during an iface progress call */
183     int                     fifo_prev_wnd_cons;  /* Was FIFO window size fully consumed by
184                                                   * the previous call to iface progress */
185 
186     ucs_mpool_t             recv_desc_mp;
187     uct_mm_recv_desc_t      *last_recv_desc;  /* next receive descriptor to use */
188 
189     int                     signal_fd;        /* Unix socket for receiving remote signal */
190 
191     size_t                  rx_headroom;
192     ucs_arbiter_t           arbiter;
193     uct_recv_desc_t         release_desc;
194 
195     struct {
196         unsigned            fifo_size;
197         unsigned            fifo_elem_size;
198         unsigned            seg_size;         /* size of the receive descriptor (for payload)*/
199         unsigned            fifo_max_poll;
200     } config;
201 } uct_mm_iface_t;
202 
203 
/*
 * Define a memory-mapper transport for MM.
 *
 * Expands to an MM component definition named uct_<_name>_component followed
 * by a transport definition registered on that component. The transport uses
 * the fixed "MM_" configuration prefix and the common MM iface configuration
 * table.
 *
 * @param _name         Component name token
 * @param _md_ops       Memory domain operations, of type uct_mm_md_ops_t.
 * @param _rkey_unpack  Remote key unpack function
 * @param _rkey_release Remote key release function
 * @param _cfg_prefix   Prefix for configuration variables.
 */
#define UCT_MM_TL_DEFINE(_name, _md_ops, _rkey_unpack, _rkey_release, \
                         _cfg_prefix) \
    \
    UCT_MM_COMPONENT_DEFINE(uct_##_name##_component, _name, _md_ops, \
                            _rkey_unpack, _rkey_release, _cfg_prefix) \
    \
    UCT_TL_DEFINE(&(uct_##_name##_component).super, \
                  _name, \
                  uct_sm_base_query_tl_devices, \
                  uct_mm_iface_t, \
                  "MM_", \
                  uct_mm_iface_config_table, \
                  uct_mm_iface_config_t);
226 
227 
228 extern ucs_config_field_t uct_mm_iface_config_table[];
229 
230 
231 static UCS_F_ALWAYS_INLINE ucs_status_t
uct_mm_iface_invoke_am(uct_mm_iface_t * iface,uint8_t am_id,void * data,unsigned length,unsigned flags)232 uct_mm_iface_invoke_am(uct_mm_iface_t *iface, uint8_t am_id, void *data,
233                        unsigned length, unsigned flags)
234 {
235     ucs_status_t status;
236     void         *desc;
237 
238     status = uct_iface_invoke_am(&iface->super.super, am_id, data, length,
239                                  flags);
240 
241     if (status == UCS_INPROGRESS) {
242         desc = (void *)((uintptr_t)data - iface->rx_headroom);
243         /* save the release_desc for later release of this desc */
244         uct_recv_desc(desc) = &iface->release_desc;
245     }
246 
247     return status;
248 }
249 
250 
251 /**
252  * Set aligned pointers of the FIFO according to the beginning of the allocated
253  * memory.
254  * @param [in] fifo_mem      Pointer to the beginning of the allocated memory.
255  * @param [out] fifo_ctl_p   Pointer to the FIFO control structure.
256  * @param [out] fifo_elems   Pointer to the array of FIFO elements.
257  */
258 void uct_mm_iface_set_fifo_ptrs(void *fifo_mem, uct_mm_fifo_ctl_t **fifo_ctl_p,
259                                 void **fifo_elems_p);
260 
261 
262 UCS_CLASS_DECLARE_NEW_FUNC(uct_mm_iface_t, uct_iface_t, uct_md_h, uct_worker_h,
263                            const uct_iface_params_t*, const uct_iface_config_t*);
264 
265 
266 void uct_mm_iface_release_desc(uct_recv_desc_t *self, void *desc);
267 
268 
269 ucs_status_t uct_mm_flush();
270 
271 
272 #endif
273