/*
 * Copyright (C) 1999-2001 The Regents of the University of California
 * (through E.O. Lawrence Berkeley National Laboratory), subject to
 * approval by the U.S. Department of Energy.
 *
 * Use of this software is under license. The license agreement is included
 * in the file MVICH_LICENSE.TXT.
 *
 * Developed at Berkeley Lab as part of MVICH.
 *
 * Authors: Bill Saphir      <wcsaphir@lbl.gov>
 *          Michael Welcome  <mlwelcome@lbl.gov>
 */

/* Copyright (c) 2002-2008, The Ohio State University. All rights
 * reserved.
 *
 * This file is part of the MVAPICH software package developed by the
 * team members of The Ohio State University's Network-Based Computing
 * Laboratory (NBCL), headed by Professor Dhabaleswar K. (DK) Panda.
 *
 * For detailed copyright and licensing information, please refer to the
 * copyright file COPYRIGHT_MVAPICH in the top level MPICH directory.
 *
 */

#define _XOPEN_SOURCE 600

#include "cbuf.h"
#include <assert.h>
#include <pthread.h>            /* spinlock protecting the free list */
#include <stdio.h>              /* fprintf() on error paths */
#include <stdlib.h>             /* malloc(), posix_memalign(), abort() */
#include <string.h>             /* memset() */
#include <unistd.h>             /* getpagesize() */

/*
 * cbufs
 *
 * cbufs provide system buffers for MVICH. They are analogous to mbufs
 * in BSD networking.
 * The primary motivation for cbufs is that implementing MPI on VIA
 * seems to require pre-posting a number of fixed-size buffers.
 * These buffers must be registered (pinned). Life is easier if
 * they are all registered at once so there is only one memory
 * handle. We manage a fixed-size pool of cbufs that are
 * allocated and pinned when a program starts up. We manage
 * the free cbuf list as a singly linked list.
 *
 * There are two ways to manage the free list as a singly linked list:
 *  1. Head and tail pointers: add at the tail, remove from the head.
 *  2. Head pointer only: treat the list as a stack.
 *
 * #1 eliminates contention between adding to and removing from the
 *  list. Lock-free possible?
 *
 * #2 has slightly less overhead when there is no contention, and is
 *  more likely to produce a cbuf that is already in cache.
 *
 * We currently anticipate that most near-term access will be
 * single-threaded, so we go with the head-only stack (#2).
 */
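
/*
 * To make #2 concrete: get and release reduce to a pointer pop and push
 * on free_cbuf_head (this is exactly what get_cbuf() and release_cbuf()
 * below wrap with locking and statistics):
 *
 *     v = free_cbuf_head;                 // get: pop
 *     free_cbuf_head = v->desc.next;
 *
 *     v->desc.next = free_cbuf_head;      // release: push
 *     free_cbuf_head = v;
 */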

/* head of list of allocated cbuf regions */
static cbuf_region *cbuf_region_head = NULL;

/*
 * free_cbuf_head is the head of the free list
 */

static cbuf *free_cbuf_head = NULL;

static int cbuf_n_allocated = 0;
static long num_free_cbuf = 0;
static long num_cbuf_get = 0;
static long num_cbuf_free = 0;

static pthread_spinlock_t cbuf_lock;
int viadev_cbuf_max = -1;
int viadev_cbuf_total_size = (2 * 1024);
int viadev_cbuf_secondary_pool_size = 128;

void init_cbuf_lock()
{
    pthread_spin_init(&cbuf_lock, PTHREAD_PROCESS_PRIVATE);
}

static void lock_cbuf()
{
    pthread_spin_lock(&cbuf_lock);
    return;
}

static void unlock_cbuf()
{
    pthread_spin_unlock(&cbuf_lock);
    return;
}


void dump_cbuf_region(cbuf_region * r)
{
    /* debugging hook; currently a no-op */
}

void dump_cbuf_regions()
{
    cbuf_region *r = cbuf_region_head;

    while (r) {
        dump_cbuf_region(r);
        r = r->next;
    }
}

void deallocate_cbufs()
{
    cbuf_region *r = cbuf_region_head;

    lock_cbuf();

    while (r) {
        if (r->mem_handle != NULL) {
            /* TODO: deregister the memory handle and free the cbufs */
        }
        r = r->next;
    }

    unlock_cbuf();
}

static void allocate_cbuf_region(int ncbufs)
{
    struct cbuf_region *reg;
    void *mem;
    void *cbuf_dma_buffer;

    int i;
    cbuf *cur;
    int alignment_cbuf = 64;
    int alignment_dma;

    alignment_dma = getpagesize();

    /* a new region is allocated only at startup or when the free list
     * is empty; the original abort-on-error calls are elided in this
     * listing, so fall back on assert() here and abort() below */
    assert(free_cbuf_head == NULL);
    assert(ncbufs > 0);

    /* are we limiting cbuf allocation?  If so, make sure
     * we don't alloc more than allowed
     */
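    /* A minimal sketch of the check described above (an assumption, not
     * the original code): viadev_cbuf_max < 0 means "no limit"; clamp the
     * request, and fail the assert if the pool is already at the limit. */
    if (viadev_cbuf_max >= 0
            && cbuf_n_allocated + ncbufs > viadev_cbuf_max) {
        ncbufs = viadev_cbuf_max - cbuf_n_allocated;
        assert(ncbufs > 0);
    }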

    reg = (struct cbuf_region *) malloc(sizeof(struct cbuf_region));
    if (NULL == reg) {
        fprintf(stderr, "cbuf: failed to allocate the region descriptor\n");
        abort();
    }

    if (posix_memalign((void **) &mem, alignment_cbuf,
                ncbufs * sizeof(cbuf))) {
        fprintf(stderr, "cbuf: failed to allocate cbuf descriptors\n");
        abort();
    }

    /* allocate the DMA buffer */

    if (posix_memalign((void **) &cbuf_dma_buffer, alignment_dma,
                ncbufs * viadev_cbuf_total_size)) {
        fprintf(stderr, "cbuf: failed to allocate the DMA buffer\n");
        abort();
    }

    memset(mem, 0, ncbufs * sizeof(cbuf));
    memset(cbuf_dma_buffer, 0, ncbufs * viadev_cbuf_total_size);

    cbuf_n_allocated += ncbufs;
    num_free_cbuf += ncbufs;
    reg->malloc_start = mem;

    reg->malloc_buf_start = cbuf_dma_buffer;
    reg->malloc_end = (void *) ((char *) mem + ncbufs * sizeof(cbuf));
    reg->malloc_buf_end = (void *) ((char *) cbuf_dma_buffer +
            ncbufs * viadev_cbuf_total_size);

    reg->count = ncbufs;

    free_cbuf_head = (cbuf *) ((aint_t) mem);

    reg->cbuf_head = free_cbuf_head;

    reg->mem_handle = armci_register_memory(cbuf_dma_buffer,
            ncbufs * viadev_cbuf_total_size);

    if (reg->mem_handle == NULL) {
        fprintf(stderr, "cbuf: memory registration failed\n");
        abort();
    }

    /* init the free list */
    for (i = 0; i < ncbufs - 1; i++) {
        cur = free_cbuf_head + i;

        cur->desc.next = free_cbuf_head + i + 1;
        cur->region = reg;

#ifdef ADAPTIVE_RDMA_FAST_PATH
        /* fast-path buffer setup not shown in this listing */
#else
        cur->buffer = (unsigned char *) ((char *) cbuf_dma_buffer +
                (i * viadev_cbuf_total_size));
#endif
    }
    /* the last cbuf needs its next pointer set to NULL */
    cur = free_cbuf_head + ncbufs - 1;

    cur->desc.next = NULL;

    cur->region = reg;

#ifdef ADAPTIVE_RDMA_FAST_PATH
    /* fast-path buffer setup not shown in this listing */
#else
    cur->buffer = (unsigned char *) ((char *) cbuf_dma_buffer +
            ((ncbufs - 1) * viadev_cbuf_total_size));
#endif

    /* thread the new region onto the region list */
    reg->next = cbuf_region_head;
    cbuf_region_head = reg;
}

void allocate_cbufs(int ncbufs)
{
    /* this function is only called by the init routines;
     * cache the nic handle and ptag for later cbuf_region allocations */
    /* now allocate the first cbuf region */
    allocate_cbuf_region(ncbufs);
}
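
/*
 * Typical startup sequence (a sketch; "viadev_num_cbufs" is a placeholder
 * for whatever initial pool size the init code chooses, not a variable
 * defined in this file):
 *
 *     init_cbuf_lock();
 *     allocate_cbufs(viadev_num_cbufs);
 */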

/*
 * Get a cbuf off the free list
 */

cbuf *get_cbuf(void)
{
    cbuf *v;

    lock_cbuf();

    /*
     * It will often be possible for higher layers to recover when no
     * cbuf is available by waiting for more descriptors to complete.
     * For now, just grow the pool, and abort if that fails.
     */
    if (NULL == free_cbuf_head) {
        allocate_cbuf_region(viadev_cbuf_secondary_pool_size);
        if (NULL == free_cbuf_head) {
            /* the original error-reporting call is elided here */
            fprintf(stderr, "cbuf: failed to grow the cbuf pool\n");
            abort();
        }
    }
    v = free_cbuf_head;
    num_free_cbuf--;
    num_cbuf_get++;

    /* this correctly handles removing from a single-entry free list */
    free_cbuf_head = free_cbuf_head->desc.next;
#ifdef ADAPTIVE_RDMA_FAST_PATH
    /* need to change this to RPUT_CBUF_FLAG or RGET_CBUF_FLAG later
     * if we are doing rput */
    v->padding = NORMAL_CBUF_FLAG;
#endif

    /* This is probably not the right place to initialize shandle to NULL,
     * but doing it here makes sure it is always initialized; otherwise we
     * would need to add the initialization very carefully in a dozen
     * other places, and would probably miss one.
     */
    v->shandle = NULL;

    v->ref_count = 0;
    v->len = 0;

    v->grank = -1;  /* make sure it is not inadvertently used anywhere */

    unlock_cbuf();

    return (v);
}

/*
 * Put a cbuf back on the free list
 */

void release_cbuf(cbuf * v)
{
    lock_cbuf();

    /* note this correctly handles appending to an empty free list */

    /* catch an immediate double release */
    assert(v != free_cbuf_head);

    v->desc.next = free_cbuf_head;

#ifdef ADAPTIVE_RDMA_FAST_PATH
#endif

    free_cbuf_head = v;
    num_free_cbuf++;
    num_cbuf_free++;

    unlock_cbuf();
}


/*
 * fill in cbuf descriptor with all necessary info
 */

void cbuf_init_send(cbuf * v, unsigned long len)
{
    v->desc.u.sr.next = NULL;
    v->desc.u.sr.send_flags = IBV_SEND_SIGNALED;
    v->desc.u.sr.opcode = IBV_WR_SEND;
    v->desc.u.sr.wr_id = (aint_t) v;
    v->desc.u.sr.num_sge = 1;
    v->desc.u.sr.sg_list = &(v->desc.sg_entry);

    v->desc.sg_entry.addr = (uintptr_t) v->buffer;
    v->desc.sg_entry.length = len;
    v->desc.sg_entry.lkey = v->region->mem_handle->lkey;
}
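
/*
 * Typical send-side usage (a sketch: the post and the completion handling
 * live in the caller, and "qp", "bad_wr", and the payload are assumptions,
 * not names defined in this file):
 *
 *     cbuf *v = get_cbuf();
 *     memcpy(v->buffer, payload, payload_len);     // stage the data
 *     cbuf_init_send(v, payload_len);              // fill in the descriptor
 *     ibv_post_send(qp, &v->desc.u.sr, &bad_wr);   // hand off to the HCA
 *     ... on the send completion (matched via wr_id) ...
 *     release_cbuf(v);
 */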

void cbuf_init_recv(cbuf * v, unsigned long len)
{
    v->desc.u.rr.next = NULL;
    v->desc.u.rr.wr_id = (aint_t) v;
    v->desc.u.rr.num_sge = 1;
    v->desc.u.rr.sg_list = &(v->desc.sg_entry);

    v->desc.sg_entry.addr = (uintptr_t) v->buffer;
    v->desc.sg_entry.length = len;
    v->desc.sg_entry.lkey = v->region->mem_handle->lkey;

#ifdef ADAPTIVE_RDMA_FAST_PATH
    v->padding = NORMAL_CBUF_FLAG;
#endif
}

void cbuf_init_sendrecv(cbuf * v, unsigned long len)
{
    /* body not shown in this listing */
}

void cbuf_init_rput(cbuf * v, void *local_address,
                    uint32_t lkey, void *remote_address,
                    uint32_t rkey, int len)
{
    v->desc.u.sr.next = NULL;
    v->desc.u.sr.send_flags = IBV_SEND_SIGNALED;
    v->desc.u.sr.opcode = IBV_WR_RDMA_WRITE;
    v->desc.u.sr.wr_id = (aint_t) v;

    v->desc.u.sr.num_sge = 1;
    v->desc.u.sr.sg_list = &(v->desc.sg_entry);

    v->desc.sg_entry.length = len;
    v->desc.sg_entry.lkey = lkey;
    v->desc.sg_entry.addr = (uintptr_t) local_address;

    v->desc.u.sr.wr.rdma.remote_addr = (uintptr_t) remote_address;
    v->desc.u.sr.wr.rdma.rkey = rkey;

#ifdef ADAPTIVE_RDMA_FAST_PATH
    v->padding = RPUT_CBUF_FLAG;
#endif
}


void cbuf_init_rget(cbuf * v,
                    void *local_address,
                    uint32_t lkey,
                    void *remote_address,
                    uint32_t rkey, int len)
{
    v->desc.u.sr.next = NULL;
    v->desc.u.sr.send_flags = IBV_SEND_SIGNALED;
    v->desc.u.sr.opcode = IBV_WR_RDMA_READ;
    v->desc.u.sr.wr_id = (aint_t) v;

    v->desc.u.sr.num_sge = 1;
    v->desc.u.sr.sg_list = &(v->desc.sg_entry);

    v->desc.sg_entry.length = len;
    v->desc.sg_entry.lkey = lkey;
    v->desc.sg_entry.addr = (uintptr_t) local_address;

    v->desc.u.sr.wr.rdma.remote_addr = (uintptr_t) remote_address;
    v->desc.u.sr.wr.rdma.rkey = rkey;

#ifdef ADAPTIVE_RDMA_FAST_PATH
    v->padding = RGET_CBUF_FLAG;
#endif
}
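
/*
 * The rput/rget descriptors are posted the same way as sends; only the
 * opcode and the remote address/rkey differ. A sketch, with qp, buffers,
 * and keys supplied by the caller:
 *
 *     cbuf *v = get_cbuf();
 *     cbuf_init_rget(v, local_buf, local_lkey,
 *                    remote_buf, remote_rkey, nbytes);
 *     ibv_post_send(qp, &v->desc.u.sr, &bad_wr);   // posts the RDMA read
 *     release_cbuf(v);                             // after completion
 */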

/*
 * print out cbuf contents for debugging
 */

void dump_cbuf(char *msg, cbuf * v)
{
    /* debugging hook; currently a no-op */
}

#ifdef ADAPTIVE_RDMA_FAST_PATH
/* fast-path helpers are not shown in this listing */
#endif