1 /*
2 * Copyright (C) 1999-2001 The Regents of the University of California
3 * (through E.O. Lawrence Berkeley National Laboratory), subject to
4 * approval by the U.S. Department of Energy.
5 *
6 * Use of this software is under license. The license agreement is included
7 * in the file MVICH_LICENSE.TXT.
8 *
9 * Developed at Berkeley Lab as part of MVICH.
10 *
11 * Authors: Bill Saphir <wcsaphir@lbl.gov>
12 * Michael Welcome <mlwelcome@lbl.gov>
13 */
14
15 /* Copyright (c) 2002-2008, The Ohio State University. All rights
16 * reserved.
17 *
18 * This file is part of the MVAPICH software package developed by the
19 * team members of The Ohio State University's Network-Based Computing
20 * Laboratory (NBCL), headed by Professor Dhabaleswar K. (DK) Panda.
21 *
22 * For detailed copyright and licensing information, please refer to the
23 * copyright file COPYRIGHT_MVAPICH in the top level MPICH directory.
24 *
25 */
26
#define _XOPEN_SOURCE 600

#include "cbuf.h"

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
31
32
33 /*
34 * cbufs
35 *
36 * cbufs provide system buffers for VMPICH. They are analogous to mbufs
37 * in BSD networking.
 * The primary motivation for cbufs is that implementing MPI on VIA
 * seems to require pre-posting a number of fixed-sized buffers.
40 * These buffers must be registered (pinned). Life is easier if
41 * they are all registered at once so there is only one memory
42 * handle. We manage a fixed-size pool of cbufs that are
 * allocated and pinned when a program starts up. We manage
44 * the free cbuf list as a singly linked list.
45 *
46 * Two different ways to manage the free list as a singly-linked list.
47 * 1. head and tail pointers. Add to tail, remove from head.
48 * 2. only head pointer, treat as a stack.
49 *
50 * #1 Eliminates contention between adding to list and removing from list
51 * Lock-free possible?
52 *
53 * #2 Has slightly less overhead when there is no contention, and is more
54 * likely to produce a cbuf that is already in cache.
55 *
56 * Currently anticipate that most access near-term will be single-threaded,
57 * so go with head only. (#2)
58 */
59
60 /* head of list of allocated cbuf regions */
61 static cbuf_region *cbuf_region_head = NULL;
62
63 /*
64 * free_cbuf_head is the head of the free list
65 */
66
67 static cbuf *free_cbuf_head = NULL;
68
69 static int cbuf_n_allocated = 0;
70 static long num_free_cbuf = 0;
71 static long num_cbuf_get = 0;
72 static long num_cbuf_free = 0;
73
74 static pthread_spinlock_t cbuf_lock;
75 int viadev_cbuf_max = -1;
76 int viadev_cbuf_total_size = (2 * 1024);
77 int viadev_cbuf_secondary_pool_size = 128;
78
init_cbuf_lock()79 void init_cbuf_lock()
80 {
81 pthread_spin_init(&cbuf_lock, 0);
82 }
83
lock_cbuf()84 static void lock_cbuf()
85 {
86 pthread_spin_lock(&cbuf_lock);
87 return;
88 }
89
unlock_cbuf()90 static void unlock_cbuf()
91 {
92 pthread_spin_unlock(&cbuf_lock);
93 return;
94 }
95
96
dump_cbuf_region(cbuf_region * r)97 void dump_cbuf_region(cbuf_region * r)
98 {
99 }
100
dump_cbuf_regions()101 void dump_cbuf_regions()
102 {
103 cbuf_region *r = cbuf_region_head;
104
105 while (r) {
106 dump_cbuf_region(r);
107 r = r->next;
108 }
109 }
deallocate_cbufs()110 void deallocate_cbufs()
111 {
112 cbuf_region *r = cbuf_region_head;
113
114 lock_cbuf();
115
116 while (r) {
117 if (r->mem_handle != NULL) {
118 /* free cbufs add it later */
119 }
120 r = r->next;
121 }
122
123 unlock_cbuf();
124 }
125
allocate_cbuf_region(int ncbufs)126 static void allocate_cbuf_region(int ncbufs)
127 {
128 struct cbuf_region *reg;
129 void *mem;
130 void *cbuf_dma_buffer;
131
132 int i;
133 cbuf *cur;
134 int alignment_cbuf = 64;
135 int alignment_dma;
136
137 alignment_dma = getpagesize();
138
139 if (free_cbuf_head != NULL) {
140 }
141
142 if (ncbufs <= 0) {
143 }
144
145 /* are we limiting cbuf allocation? If so, make sure
146 * we dont alloc more than allowed
147 */
148
149 reg = (struct cbuf_region *) malloc(sizeof(struct cbuf_region));
150 if (NULL == reg) {
151 }
152
153 if(posix_memalign((void **) &mem, alignment_cbuf, ncbufs * sizeof(cbuf))) {
154 }
155
156 /* ALLOCATE THE DMA BUFFER */
157
158 if(posix_memalign((void **) &cbuf_dma_buffer, alignment_dma,
159 ncbufs * viadev_cbuf_total_size)) {
160 }
161
162 memset(mem, 0, ncbufs * sizeof(cbuf));
163 memset(cbuf_dma_buffer, 0, ncbufs * viadev_cbuf_total_size);
164
165 cbuf_n_allocated += ncbufs;
166 num_free_cbuf += ncbufs;
167 reg->malloc_start = mem;
168
169 reg->malloc_buf_start = cbuf_dma_buffer;
170 reg->malloc_end = (void *) ((char *) mem + ncbufs * sizeof(cbuf));
171 reg->malloc_buf_end = (void *) ((char *) cbuf_dma_buffer +
172 ncbufs * viadev_cbuf_total_size);
173
174 reg->count = ncbufs;
175
176 free_cbuf_head = (cbuf *) ((aint_t) mem);
177
178 reg->cbuf_head = free_cbuf_head;
179
180
181 reg->mem_handle = armci_register_memory(cbuf_dma_buffer,
182 ncbufs * viadev_cbuf_total_size);
183
184 if (reg->mem_handle == NULL) {
185 }
186
187 /* init the free list */
188 for (i = 0; i < ncbufs - 1; i++) {
189 cur = free_cbuf_head + i;
190
191 cur->desc.next = free_cbuf_head + i + 1;
192 cur->region = reg;
193
194 #ifdef ADAPTIVE_RDMA_FAST_PATH
195 #else
196 cur->buffer = (unsigned char *) ((char *)(cbuf_dma_buffer) +
197 (i * viadev_cbuf_total_size));
198 #endif
199 }
200 /* last one needs to be set to NULL */
201 cur = free_cbuf_head + ncbufs - 1;
202
203 cur->desc.next = NULL;
204
205 cur->region = reg;
206
207 #ifdef ADAPTIVE_RDMA_FAST_PATH
208 #else
209 cur->buffer = (unsigned char *) ((char *)cbuf_dma_buffer +
210 ((ncbufs - 1) * viadev_cbuf_total_size));
211
212 #endif
213
214 /* thread region list */
215 reg->next = cbuf_region_head;
216 cbuf_region_head = reg;
217
218 }
allocate_cbufs(int ncbufs)219 void allocate_cbufs(int ncbufs)
220 {
221 /* this function is only called by the init routines.
222 * cache the nic handle and ptag for later cbuf_region allocations
223 */
224 /* now allocate the first cbuf region */
225 allocate_cbuf_region(ncbufs);
226 }
227
228
229 /*
230 * Get a cbuf off the free list
231 */
232
get_cbuf(void)233 cbuf *get_cbuf(void)
234 {
235 cbuf *v;
236
237 lock_cbuf();
238
239 /*
240 * It will often be possible for higher layers to recover
241 * when no cbuf is available, but waiting for more descriptors
242 * to complete. For now, just abort.
243 */
244 if (NULL == free_cbuf_head) {
245 allocate_cbuf_region(viadev_cbuf_secondary_pool_size);
246 if (NULL == free_cbuf_head) {
247 }
248 }
249 v = free_cbuf_head;
250 num_free_cbuf--;
251 num_cbuf_get++;
252
253 /* this correctly handles removing from single entry free list */
254 free_cbuf_head = free_cbuf_head->desc.next;
255 #ifdef ADAPTIVE_RDMA_FAST_PATH
256 /* need to change this to RPUT_CBUF_FLAG or RGET_CBUF_FLAG later
257 * if we are doing rput */
258 v->padding = NORMAL_CBUF_FLAG;
259 #endif
260
261 /* this is probably not the right place to initialize shandle to NULL.
262 * Do it here for now because it will make sure it is always initialized.
263 * Otherwise we would need to very carefully add the initialization in
264 * a dozen other places, and probably miss one.
265 */
266 v->shandle = NULL;
267
268 v->ref_count = 0;
269 v->len = 0;
270
271 v->grank = -1; /* Make sure it is not inadvertantly used anywhere */
272
273 unlock_cbuf();
274
275 return (v);
276 }
277
278 /*
279 * Put a cbuf back on the free list
280 */
281
release_cbuf(cbuf * v)282 void release_cbuf(cbuf * v)
283 {
284
285 lock_cbuf();
286
287 /* note this correctly handles appending to empty free list */
288
289
290 assert(v != free_cbuf_head);
291
292 v->desc.next = free_cbuf_head;
293
294 #ifdef ADAPTIVE_RDMA_FAST_PATH
295 #endif
296
297
298 free_cbuf_head = v;
299 num_free_cbuf++;
300 num_cbuf_free++;
301
302 unlock_cbuf();
303 }
304
305
306 /*
307 * fill in cbuf descriptor with all necessary info
308 */
309
310
311
cbuf_init_send(cbuf * v,unsigned long len)312 void cbuf_init_send(cbuf * v, unsigned long len)
313 {
314 v->desc.u.sr.next = NULL;
315 v->desc.u.sr.send_flags = IBV_SEND_SIGNALED;
316 v->desc.u.sr.opcode = IBV_WR_SEND;
317 v->desc.u.sr.wr_id = (aint_t) v;
318 v->desc.u.sr.num_sge = 1;
319 v->desc.u.sr.sg_list = &(v->desc.sg_entry);
320
321 v->desc.sg_entry.addr = (uintptr_t) v->buffer;
322 v->desc.sg_entry.length = len;
323 v->desc.sg_entry.lkey = v->region->mem_handle->lkey;
324 }
325
cbuf_init_recv(cbuf * v,unsigned long len)326 void cbuf_init_recv(cbuf * v, unsigned long len)
327 {
328 v->desc.u.rr.next = NULL;
329 v->desc.u.rr.wr_id = (aint_t) v;
330 v->desc.u.rr.num_sge = 1;
331 v->desc.u.rr.sg_list = &(v->desc.sg_entry);
332
333 v->desc.sg_entry.addr = (uintptr_t) v->buffer;
334 v->desc.sg_entry.length = len;
335 v->desc.sg_entry.lkey = v->region->mem_handle->lkey;
336
337 #ifdef ADAPTIVE_RDMA_FAST_PATH
338 v->padding = NORMAL_CBUF_FLAG;
339 #endif
340 }
cbuf_init_sendrecv(cbuf * v,unsigned long len)341 void cbuf_init_sendrecv(cbuf * v, unsigned long len)
342 {
343 }
344
cbuf_init_rput(cbuf * v,void * local_address,uint32_t lkey,void * remote_address,uint32_t rkey,int len)345 void cbuf_init_rput(cbuf * v, void *local_address,
346 uint32_t lkey, void *remote_address,
347 uint32_t rkey, int len)
348 {
349 v->desc.u.sr.next = NULL;
350 v->desc.u.sr.send_flags = IBV_SEND_SIGNALED;
351 v->desc.u.sr.opcode = IBV_WR_RDMA_WRITE;
352 v->desc.u.sr.wr_id = (aint_t) v;
353
354 v->desc.u.sr.num_sge = 1;
355 v->desc.u.sr.sg_list = &(v->desc.sg_entry);
356
357 v->desc.sg_entry.length = len;
358 v->desc.sg_entry.lkey = lkey;
359 v->desc.sg_entry.addr = (uintptr_t) local_address;
360
361 v->desc.u.sr.wr.rdma.remote_addr = (uintptr_t) remote_address;
362 v->desc.u.sr.wr.rdma.rkey = rkey;
363
364 #ifdef ADAPTIVE_RDMA_FAST_PATH
365 v->padding = RPUT_CBUF_FLAG;
366 #endif
367
368 }
369
370
371
cbuf_init_rget(cbuf * v,void * local_address,uint32_t lkey,void * remote_address,uint32_t rkey,int len)372 void cbuf_init_rget(cbuf * v,
373 void *local_address,
374 uint32_t lkey,
375 void *remote_address,
376 uint32_t rkey, int len)
377 {
378 v->desc.u.sr.next = NULL;
379 v->desc.u.sr.send_flags = IBV_SEND_SIGNALED;
380 v->desc.u.sr.opcode = IBV_WR_RDMA_READ;
381 v->desc.u.sr.wr_id = (aint_t) v;
382
383 v->desc.u.sr.num_sge = 1;
384 v->desc.u.sr.sg_list = &(v->desc.sg_entry);
385
386 v->desc.sg_entry.length = len;
387 v->desc.sg_entry.lkey = lkey;
388 v->desc.sg_entry.addr = (uintptr_t) local_address;
389
390 v->desc.u.sr.wr.rdma.remote_addr = (uintptr_t) remote_address;
391 v->desc.u.sr.wr.rdma.rkey = rkey;
392
393 #ifdef ADAPTIVE_RDMA_FAST_PATH
394 v->padding = RGET_CBUF_FLAG;
395 #endif
396
397 }
398
399 /*
400 * print out cbuf contents for debugging
401 */
402
dump_cbuf(char * msg,cbuf * v)403 void dump_cbuf(char *msg, cbuf * v)
404 {
405 }
406
407 #ifdef ADAPTIVE_RDMA_FAST_PATH
408 #endif
409