// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
/*
 * Copyright(c) 2016 - 2018 Intel Corporation.
 */

#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <rdma/uverbs_ioctl.h>
#include "cq.h"
#include "vt.h"
#include "trace.h"

static struct workqueue_struct *comp_vector_wq;

/**
 * rvt_cq_enter - add a new entry to the completion queue
 * @cq: completion queue
 * @entry: work completion entry to add
 * @solicited: true if @entry is solicited
 *
 * This may be called with qp->s_lock held.
 *
 * Return: true on success, false if the cq is full.
 */
bool rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited)
{
	struct ib_uverbs_wc *uqueue = NULL;
	struct ib_wc *kqueue = NULL;
	struct rvt_cq_wc *u_wc = NULL;
	struct rvt_k_cq_wc *k_wc = NULL;
	unsigned long flags;
	u32 head;
	u32 next;
	u32 tail;

	spin_lock_irqsave(&cq->lock, flags);

	if (cq->ip) {
		u_wc = cq->queue;
		uqueue = &u_wc->uqueue[0];
		head = RDMA_READ_UAPI_ATOMIC(u_wc->head);
		tail = RDMA_READ_UAPI_ATOMIC(u_wc->tail);
	} else {
		k_wc = cq->kqueue;
		kqueue = &k_wc->kqueue[0];
		head = k_wc->head;
		tail = k_wc->tail;
	}

	/*
	 * Note that the head pointer might be writable by
	 * user processes. Take care to verify it is a sane value.
	 */
	if (head >= (unsigned)cq->ibcq.cqe) {
		head = cq->ibcq.cqe;
		next = 0;
	} else {
		next = head + 1;
	}

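	/* The ring holds ibcq.cqe + 1 slots; it is full when head would advance onto tail. */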
	if (unlikely(next == tail || cq->cq_full)) {
		struct rvt_dev_info *rdi = cq->rdi;

		if (!cq->cq_full)
			rvt_pr_err_ratelimited(rdi, "CQ is full!\n");
		cq->cq_full = true;
		spin_unlock_irqrestore(&cq->lock, flags);
		if (cq->ibcq.event_handler) {
			struct ib_event ev;

			ev.device = cq->ibcq.device;
			ev.element.cq = &cq->ibcq;
			ev.event = IB_EVENT_CQ_ERR;
			cq->ibcq.event_handler(&ev, cq->ibcq.cq_context);
		}
		return false;
	}
	trace_rvt_cq_enter(cq, entry, head);
	if (uqueue) {
		uqueue[head].wr_id = entry->wr_id;
		uqueue[head].status = entry->status;
		uqueue[head].opcode = entry->opcode;
		uqueue[head].vendor_err = entry->vendor_err;
		uqueue[head].byte_len = entry->byte_len;
		uqueue[head].ex.imm_data = entry->ex.imm_data;
		uqueue[head].qp_num = entry->qp->qp_num;
		uqueue[head].src_qp = entry->src_qp;
		uqueue[head].wc_flags = entry->wc_flags;
		uqueue[head].pkey_index = entry->pkey_index;
		uqueue[head].slid = ib_lid_cpu16(entry->slid);
		uqueue[head].sl = entry->sl;
		uqueue[head].dlid_path_bits = entry->dlid_path_bits;
		uqueue[head].port_num = entry->port_num;
		/* Make sure entry is written before the head index. */
		RDMA_WRITE_UAPI_ATOMIC(u_wc->head, next);
	} else {
		kqueue[head] = *entry;
		k_wc->head = next;
	}

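	/*
	 * Schedule the completion handler if the CQ is armed for all
	 * completions, or armed for solicited completions and this entry
	 * is either solicited or completed in error.
	 */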
	if (cq->notify == IB_CQ_NEXT_COMP ||
	    (cq->notify == IB_CQ_SOLICITED &&
	     (solicited || entry->status != IB_WC_SUCCESS))) {
		/*
		 * This will cause send_complete() to be called in
		 * another thread.
		 */
		cq->notify = RVT_CQ_NONE;
		cq->triggered++;
		queue_work_on(cq->comp_vector_cpu, comp_vector_wq,
			      &cq->comptask);
	}

	spin_unlock_irqrestore(&cq->lock, flags);
	return true;
}
EXPORT_SYMBOL(rvt_cq_enter);

static void send_complete(struct work_struct *work)
{
	struct rvt_cq *cq = container_of(work, struct rvt_cq, comptask);

	/*
	 * The completion handler will most likely rearm the notification
	 * and poll for all pending entries. If a new completion entry
	 * is added while we are in this routine, queue_work()
	 * won't call us again until we return so we check triggered to
	 * see if we need to call the handler again.
	 */
	for (;;) {
		u8 triggered = cq->triggered;

		/*
		 * IPoIB connected mode assumes the callback is from a
		 * soft IRQ. We simulate this by blocking "bottom halves".
		 * See the implementation for ipoib_cm_handle_tx_wc(),
		 * netif_tx_lock_bh() and netif_tx_lock().
		 */
		local_bh_disable();
		cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
		local_bh_enable();

		if (cq->triggered == triggered)
			return;
	}
}

/**
 * rvt_create_cq - create a completion queue
 * @ibcq: Allocated CQ
 * @attr: creation attributes
 * @attrs: uverbs bundle
 *
 * Called by ib_create_cq() in the generic verbs code.
 *
 * Return: 0 on success
 */
int rvt_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
		  struct uverbs_attr_bundle *attrs)
{
	struct ib_udata *udata = &attrs->driver_udata;
	struct ib_device *ibdev = ibcq->device;
	struct rvt_dev_info *rdi = ib_to_rvt(ibdev);
	struct rvt_cq *cq = ibcq_to_rvtcq(ibcq);
	struct rvt_cq_wc *u_wc = NULL;
	struct rvt_k_cq_wc *k_wc = NULL;
	u32 sz;
	unsigned int entries = attr->cqe;
	int comp_vector = attr->comp_vector;
	int err;

	if (attr->flags)
		return -EOPNOTSUPP;

	if (entries < 1 || entries > rdi->dparms.props.max_cqe)
		return -EINVAL;

	if (comp_vector < 0)
		comp_vector = 0;

	comp_vector = comp_vector % rdi->ibdev.num_comp_vectors;

	/*
	 * Allocate the completion queue entries and head/tail pointers.
	 * This is allocated separately so that it can be resized and
	 * also mapped into user space.
	 * We need to use vmalloc() in order to support mmap and large
	 * numbers of entries.
	 */
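	/* udata with room for a __u64 offset indicates a user CQ that will be mmapped. */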
	if (udata && udata->outlen >= sizeof(__u64)) {
		sz = sizeof(struct ib_uverbs_wc) * (entries + 1);
		sz += sizeof(*u_wc);
		u_wc = vmalloc_user(sz);
		if (!u_wc)
			return -ENOMEM;
	} else {
		sz = sizeof(struct ib_wc) * (entries + 1);
		sz += sizeof(*k_wc);
		k_wc = vzalloc_node(sz, rdi->dparms.node);
		if (!k_wc)
			return -ENOMEM;
	}

	/*
	 * Return the address of the WC as the offset to mmap.
	 * See rvt_mmap() for details.
	 */
	if (udata && udata->outlen >= sizeof(__u64)) {
		cq->ip = rvt_create_mmap_info(rdi, sz, udata, u_wc);
		if (IS_ERR(cq->ip)) {
			err = PTR_ERR(cq->ip);
			goto bail_wc;
		}

		err = ib_copy_to_udata(udata, &cq->ip->offset,
				       sizeof(cq->ip->offset));
		if (err)
			goto bail_ip;
	}

	spin_lock_irq(&rdi->n_cqs_lock);
	if (rdi->n_cqs_allocated == rdi->dparms.props.max_cq) {
		spin_unlock_irq(&rdi->n_cqs_lock);
		err = -ENOMEM;
		goto bail_ip;
	}

	rdi->n_cqs_allocated++;
	spin_unlock_irq(&rdi->n_cqs_lock);

	if (cq->ip) {
		spin_lock_irq(&rdi->pending_lock);
		list_add(&cq->ip->pending_mmaps, &rdi->pending_mmaps);
		spin_unlock_irq(&rdi->pending_lock);
	}

	/*
	 * ib_create_cq() will initialize cq->ibcq except for cq->ibcq.cqe.
	 * The number of entries should be >= the number requested or return
	 * an error.
	 */
	cq->rdi = rdi;
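	/*
	 * Pick the CPU for completion work: use the driver's comp_vect
	 * mapping if it provides one, otherwise the first CPU on the
	 * device's NUMA node.
	 */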
	if (rdi->driver_f.comp_vect_cpu_lookup)
		cq->comp_vector_cpu =
			rdi->driver_f.comp_vect_cpu_lookup(rdi, comp_vector);
	else
		cq->comp_vector_cpu =
			cpumask_first(cpumask_of_node(rdi->dparms.node));

	cq->ibcq.cqe = entries;
	cq->notify = RVT_CQ_NONE;
	spin_lock_init(&cq->lock);
	INIT_WORK(&cq->comptask, send_complete);
	if (u_wc)
		cq->queue = u_wc;
	else
		cq->kqueue = k_wc;

	trace_rvt_create_cq(cq, attr);
	return 0;

bail_ip:
	kfree(cq->ip);
bail_wc:
	vfree(u_wc);
	vfree(k_wc);
	return err;
}

/**
 * rvt_destroy_cq - destroy a completion queue
 * @ibcq: the completion queue to destroy.
 * @udata: user data or NULL for kernel object
 *
 * Called by ib_destroy_cq() in the generic verbs code.
 */
int rvt_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
{
	struct rvt_cq *cq = ibcq_to_rvtcq(ibcq);
	struct rvt_dev_info *rdi = cq->rdi;

	flush_work(&cq->comptask);
	spin_lock_irq(&rdi->n_cqs_lock);
	rdi->n_cqs_allocated--;
	spin_unlock_irq(&rdi->n_cqs_lock);
	if (cq->ip)
		kref_put(&cq->ip->ref, rvt_release_mmap_info);
	else
		vfree(cq->kqueue);
	return 0;
}

/**
 * rvt_req_notify_cq - change the notification type for a completion queue
 * @ibcq: the completion queue
 * @notify_flags: the type of notification to request
 *
 * This may be called from interrupt context. Also called by
 * ib_req_notify_cq() in the generic verbs code.
 *
 * Return: 0 for success.
 */
int rvt_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags)
{
	struct rvt_cq *cq = ibcq_to_rvtcq(ibcq);
	unsigned long flags;
	int ret = 0;

	spin_lock_irqsave(&cq->lock, flags);
	/*
	 * Don't change IB_CQ_NEXT_COMP to IB_CQ_SOLICITED but allow
	 * any other transitions (see C11-31 and C11-32 in ch. 11.4.2.2).
	 */
	if (cq->notify != IB_CQ_NEXT_COMP)
		cq->notify = notify_flags & IB_CQ_SOLICITED_MASK;

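	/* Report a missed event if completions are still queued. */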
	if (notify_flags & IB_CQ_REPORT_MISSED_EVENTS) {
		if (cq->queue) {
			if (RDMA_READ_UAPI_ATOMIC(cq->queue->head) !=
			    RDMA_READ_UAPI_ATOMIC(cq->queue->tail))
				ret = 1;
		} else {
			if (cq->kqueue->head != cq->kqueue->tail)
				ret = 1;
		}
	}

	spin_unlock_irqrestore(&cq->lock, flags);

	return ret;
}

/**
 * rvt_resize_cq - change the size of the CQ
 * @ibcq: the completion queue
 * @cqe: the new number of entries to support
 * @udata: user data or NULL for kernel object
 *
 * Return: 0 for success.
 */
int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
{
	struct rvt_cq *cq = ibcq_to_rvtcq(ibcq);
	u32 head, tail, n;
	int ret;
	u32 sz;
	struct rvt_dev_info *rdi = cq->rdi;
	struct rvt_cq_wc *u_wc = NULL;
	struct rvt_cq_wc *old_u_wc = NULL;
	struct rvt_k_cq_wc *k_wc = NULL;
	struct rvt_k_cq_wc *old_k_wc = NULL;

	if (cqe < 1 || cqe > rdi->dparms.props.max_cqe)
		return -EINVAL;

	/*
	 * Need to use vmalloc() if we want to support large #s of entries.
	 */
	if (udata && udata->outlen >= sizeof(__u64)) {
		sz = sizeof(struct ib_uverbs_wc) * (cqe + 1);
		sz += sizeof(*u_wc);
		u_wc = vmalloc_user(sz);
		if (!u_wc)
			return -ENOMEM;
	} else {
		sz = sizeof(struct ib_wc) * (cqe + 1);
		sz += sizeof(*k_wc);
		k_wc = vzalloc_node(sz, rdi->dparms.node);
		if (!k_wc)
			return -ENOMEM;
	}
	/* Check that we can write the offset to mmap. */
	if (udata && udata->outlen >= sizeof(__u64)) {
		__u64 offset = 0;

		ret = ib_copy_to_udata(udata, &offset, sizeof(offset));
		if (ret)
			goto bail_free;
	}

	spin_lock_irq(&cq->lock);
	/*
	 * Make sure head and tail are sane since they
	 * might be user writable.
	 */
	if (u_wc) {
		old_u_wc = cq->queue;
		head = RDMA_READ_UAPI_ATOMIC(old_u_wc->head);
		tail = RDMA_READ_UAPI_ATOMIC(old_u_wc->tail);
	} else {
		old_k_wc = cq->kqueue;
		head = old_k_wc->head;
		tail = old_k_wc->tail;
	}

	if (head > (u32)cq->ibcq.cqe)
		head = (u32)cq->ibcq.cqe;
	if (tail > (u32)cq->ibcq.cqe)
		tail = (u32)cq->ibcq.cqe;
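	/* n = number of entries currently occupied in the old ring. */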
	if (head < tail)
		n = cq->ibcq.cqe + 1 + head - tail;
	else
		n = head - tail;
	if (unlikely((u32)cqe < n)) {
		ret = -EINVAL;
		goto bail_unlock;
	}
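	/* Copy the unread entries, in order, to the start of the new ring. */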
	for (n = 0; tail != head; n++) {
		if (u_wc)
			u_wc->uqueue[n] = old_u_wc->uqueue[tail];
		else
			k_wc->kqueue[n] = old_k_wc->kqueue[tail];
		if (tail == (u32)cq->ibcq.cqe)
			tail = 0;
		else
			tail++;
	}
	cq->ibcq.cqe = cqe;
	if (u_wc) {
		RDMA_WRITE_UAPI_ATOMIC(u_wc->head, n);
		RDMA_WRITE_UAPI_ATOMIC(u_wc->tail, 0);
		cq->queue = u_wc;
	} else {
		k_wc->head = n;
		k_wc->tail = 0;
		cq->kqueue = k_wc;
	}
	spin_unlock_irq(&cq->lock);

	if (u_wc)
		vfree(old_u_wc);
	else
		vfree(old_k_wc);

	if (cq->ip) {
		struct rvt_mmap_info *ip = cq->ip;

		rvt_update_mmap_info(rdi, ip, sz, u_wc);

		/*
		 * Return the offset to mmap.
		 * See rvt_mmap() for details.
		 */
		if (udata && udata->outlen >= sizeof(__u64)) {
			ret = ib_copy_to_udata(udata, &ip->offset,
					       sizeof(ip->offset));
			if (ret)
				return ret;
		}

		spin_lock_irq(&rdi->pending_lock);
		if (list_empty(&ip->pending_mmaps))
			list_add(&ip->pending_mmaps, &rdi->pending_mmaps);
		spin_unlock_irq(&rdi->pending_lock);
	}

	return 0;

bail_unlock:
	spin_unlock_irq(&cq->lock);
bail_free:
	vfree(u_wc);
	vfree(k_wc);

	return ret;
}

/**
 * rvt_poll_cq - poll for work completion entries
 * @ibcq: the completion queue to poll
 * @num_entries: the maximum number of entries to return
 * @entry: pointer to array where work completions are placed
 *
 * This may be called from interrupt context. Also called by ib_poll_cq()
 * in the generic verbs code.
 *
 * Return: the number of completion entries polled.
 */
int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
{
	struct rvt_cq *cq = ibcq_to_rvtcq(ibcq);
	struct rvt_k_cq_wc *wc;
	unsigned long flags;
	int npolled;
	u32 tail;

	/* The kernel can only poll a kernel completion queue */
	if (cq->ip)
		return -EINVAL;

	spin_lock_irqsave(&cq->lock, flags);

	wc = cq->kqueue;
	tail = wc->tail;
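	/* Defensively clamp tail to the ring bounds before walking it. */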
	if (tail > (u32)cq->ibcq.cqe)
		tail = (u32)cq->ibcq.cqe;
	for (npolled = 0; npolled < num_entries; ++npolled, ++entry) {
		if (tail == wc->head)
			break;
		/* The kernel doesn't need a RMB since it has the lock. */
		trace_rvt_cq_poll(cq, &wc->kqueue[tail], npolled);
		*entry = wc->kqueue[tail];
		if (tail >= cq->ibcq.cqe)
			tail = 0;
		else
			tail++;
	}
	wc->tail = tail;

	spin_unlock_irqrestore(&cq->lock, flags);

	return npolled;
}

/**
 * rvt_driver_cq_init - Init cq resources on behalf of driver
 *
 * Return: 0 on success
 */
int rvt_driver_cq_init(void)
{
	comp_vector_wq = alloc_workqueue("%s", WQ_HIGHPRI | WQ_CPU_INTENSIVE,
					 0, "rdmavt_cq");
	if (!comp_vector_wq)
		return -ENOMEM;

	return 0;
}

/**
 * rvt_cq_exit - tear down cq resources
 */
void rvt_cq_exit(void)
{
	destroy_workqueue(comp_vector_wq);
	comp_vector_wq = NULL;
}