1 /* 2 * Copyright (c) 2018, Mellanox Technologies. All rights reserved. 3 * 4 * This software is available to you under a choice of one of two 5 * licenses. You may choose to be licensed under the terms of the GNU 6 * General Public License (GPL) Version 2, available from the file 7 * COPYING in the main directory of this source tree, or the 8 * OpenIB.org BSD license below: 9 * 10 * Redistribution and use in source and binary forms, with or 11 * without modification, are permitted provided that the following 12 * conditions are met: 13 * 14 * - Redistributions of source code must retain the above 15 * copyright notice, this list of conditions and the following 16 * disclaimer. 17 * 18 * - Redistributions in binary form must reproduce the above 19 * copyright notice, this list of conditions and the following 20 * disclaimer in the documentation and/or other materials 21 * provided with the distribution. 22 * 23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30 * SOFTWARE. 31 */ 32 33 #include <linux/bpf_trace.h> 34 #include <net/xdp_sock.h> 35 #include "en/xdp.h" 36 #include "en/params.h" 37 38 int mlx5e_xdp_max_mtu(struct mlx5e_params *params, struct mlx5e_xsk_param *xsk) 39 { 40 int hr = mlx5e_get_linear_rq_headroom(params, xsk); 41 42 /* Let S := SKB_DATA_ALIGN(sizeof(struct skb_shared_info)). 43 * The condition checked in mlx5e_rx_is_linear_skb is: 44 * SKB_DATA_ALIGN(sw_mtu + hard_mtu + hr) + S <= PAGE_SIZE (1) 45 * (Note that hw_mtu == sw_mtu + hard_mtu.) 46 * What is returned from this function is: 47 * max_mtu = PAGE_SIZE - S - hr - hard_mtu (2) 48 * After assigning sw_mtu := max_mtu, the left side of (1) turns to 49 * SKB_DATA_ALIGN(PAGE_SIZE - S) + S, which is equal to PAGE_SIZE, 50 * because both PAGE_SIZE and S are already aligned. Any number greater 51 * than max_mtu would make the left side of (1) greater than PAGE_SIZE, 52 * so max_mtu is the maximum MTU allowed. 53 */ 54 55 return MLX5E_HW2SW_MTU(params, SKB_MAX_HEAD(hr)); 56 } 57 58 static inline bool 59 mlx5e_xmit_xdp_buff(struct mlx5e_xdpsq *sq, struct mlx5e_rq *rq, 60 struct mlx5e_dma_info *di, struct xdp_buff *xdp) 61 { 62 struct mlx5e_xdp_xmit_data xdptxd; 63 struct mlx5e_xdp_info xdpi; 64 struct xdp_frame *xdpf; 65 dma_addr_t dma_addr; 66 67 xdpf = convert_to_xdp_frame(xdp); 68 if (unlikely(!xdpf)) 69 return false; 70 71 xdptxd.data = xdpf->data; 72 xdptxd.len = xdpf->len; 73 74 if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) { 75 /* The xdp_buff was in the UMEM and was copied into a newly 76 * allocated page. The UMEM page was returned via the ZCA, and 77 * this new page has to be mapped at this point and has to be 78 * unmapped and returned via xdp_return_frame on completion. 79 */ 80 81 /* Prevent double recycling of the UMEM page. Even in case this 82 * function returns false, the xdp_buff shouldn't be recycled, 83 * as it was already done in xdp_convert_zc_to_xdp_frame. 84 */ 85 __set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags); /* non-atomic */ 86 87 xdpi.mode = MLX5E_XDP_XMIT_MODE_FRAME; 88 89 dma_addr = dma_map_single(sq->pdev, xdptxd.data, xdptxd.len, 90 DMA_TO_DEVICE); 91 if (dma_mapping_error(sq->pdev, dma_addr)) { 92 xdp_return_frame(xdpf); 93 return false; 94 } 95 96 xdptxd.dma_addr = dma_addr; 97 xdpi.frame.xdpf = xdpf; 98 xdpi.frame.dma_addr = dma_addr; 99 } else { 100 /* Driver assumes that convert_to_xdp_frame returns an xdp_frame 101 * that points to the same memory region as the original 102 * xdp_buff. It allows to map the memory only once and to use 103 * the DMA_BIDIRECTIONAL mode. 104 */ 105 106 xdpi.mode = MLX5E_XDP_XMIT_MODE_PAGE; 107 108 dma_addr = di->addr + (xdpf->data - (void *)xdpf); 109 dma_sync_single_for_device(sq->pdev, dma_addr, xdptxd.len, 110 DMA_TO_DEVICE); 111 112 xdptxd.dma_addr = dma_addr; 113 xdpi.page.rq = rq; 114 xdpi.page.di = *di; 115 } 116 117 return sq->xmit_xdp_frame(sq, &xdptxd, &xdpi, 0); 118 } 119 120 /* returns true if packet was consumed by xdp */ 121 bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di, 122 void *va, u16 *rx_headroom, u32 *len, bool xsk) 123 { 124 struct bpf_prog *prog = READ_ONCE(rq->xdp_prog); 125 struct xdp_umem *umem = rq->umem; 126 struct xdp_buff xdp; 127 u32 act; 128 int err; 129 130 if (!prog) 131 return false; 132 133 xdp.data = va + *rx_headroom; 134 xdp_set_data_meta_invalid(&xdp); 135 xdp.data_end = xdp.data + *len; 136 xdp.data_hard_start = va; 137 if (xsk) 138 xdp.handle = di->xsk.handle; 139 xdp.rxq = &rq->xdp_rxq; 140 141 act = bpf_prog_run_xdp(prog, &xdp); 142 if (xsk) { 143 u64 off = xdp.data - xdp.data_hard_start; 144 145 xdp.handle = xsk_umem_adjust_offset(umem, xdp.handle, off); 146 } 147 switch (act) { 148 case XDP_PASS: 149 *rx_headroom = xdp.data - xdp.data_hard_start; 150 *len = xdp.data_end - xdp.data; 151 return false; 152 case XDP_TX: 153 if (unlikely(!mlx5e_xmit_xdp_buff(rq->xdpsq, rq, di, &xdp))) 154 goto xdp_abort; 155 __set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags); /* non-atomic */ 156 return true; 157 case XDP_REDIRECT: 158 /* When XDP enabled then page-refcnt==1 here */ 159 err = xdp_do_redirect(rq->netdev, &xdp, prog); 160 if (unlikely(err)) 161 goto xdp_abort; 162 __set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags); 163 __set_bit(MLX5E_RQ_FLAG_XDP_REDIRECT, rq->flags); 164 if (!xsk) 165 mlx5e_page_dma_unmap(rq, di); 166 rq->stats->xdp_redirect++; 167 return true; 168 default: 169 bpf_warn_invalid_xdp_action(act); 170 /* fall through */ 171 case XDP_ABORTED: 172 xdp_abort: 173 trace_xdp_exception(rq->netdev, prog, act); 174 /* fall through */ 175 case XDP_DROP: 176 rq->stats->xdp_drop++; 177 return true; 178 } 179 } 180 181 static void mlx5e_xdp_mpwqe_session_start(struct mlx5e_xdpsq *sq) 182 { 183 struct mlx5e_xdp_mpwqe *session = &sq->mpwqe; 184 struct mlx5e_xdpsq_stats *stats = sq->stats; 185 struct mlx5_wq_cyc *wq = &sq->wq; 186 u16 pi, contig_wqebbs; 187 188 pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc); 189 contig_wqebbs = mlx5_wq_cyc_get_contig_wqebbs(wq, pi); 190 191 if (unlikely(contig_wqebbs < MLX5_SEND_WQE_MAX_WQEBBS)) 192 mlx5e_fill_xdpsq_frag_edge(sq, wq, pi, contig_wqebbs); 193 194 session->wqe = mlx5e_xdpsq_fetch_wqe(sq, &pi); 195 196 prefetchw(session->wqe->data); 197 session->ds_count = MLX5E_XDP_TX_EMPTY_DS_COUNT; 198 session->pkt_count = 0; 199 200 mlx5e_xdp_update_inline_state(sq); 201 202 stats->mpwqe++; 203 } 204 205 void mlx5e_xdp_mpwqe_complete(struct mlx5e_xdpsq *sq) 206 { 207 struct mlx5_wq_cyc *wq = &sq->wq; 208 struct mlx5e_xdp_mpwqe *session = &sq->mpwqe; 209 struct mlx5_wqe_ctrl_seg *cseg = &session->wqe->ctrl; 210 u16 ds_count = session->ds_count; 211 u16 pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc); 212 struct mlx5e_xdp_wqe_info *wi = &sq->db.wqe_info[pi]; 213 214 cseg->opmod_idx_opcode = 215 cpu_to_be32((sq->pc << 8) | MLX5_OPCODE_ENHANCED_MPSW); 216 cseg->qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_count); 217 218 wi->num_wqebbs = DIV_ROUND_UP(ds_count, MLX5_SEND_WQEBB_NUM_DS); 219 wi->num_pkts = session->pkt_count; 220 221 sq->pc += wi->num_wqebbs; 222 223 sq->doorbell_cseg = cseg; 224 225 session->wqe = NULL; /* Close session */ 226 } 227 228 enum { 229 MLX5E_XDP_CHECK_OK = 1, 230 MLX5E_XDP_CHECK_START_MPWQE = 2, 231 }; 232 233 static int mlx5e_xmit_xdp_frame_check_mpwqe(struct mlx5e_xdpsq *sq) 234 { 235 if (unlikely(!sq->mpwqe.wqe)) { 236 if (unlikely(!mlx5e_wqc_has_room_for(&sq->wq, sq->cc, sq->pc, 237 MLX5E_XDPSQ_STOP_ROOM))) { 238 /* SQ is full, ring doorbell */ 239 mlx5e_xmit_xdp_doorbell(sq); 240 sq->stats->full++; 241 return -EBUSY; 242 } 243 244 return MLX5E_XDP_CHECK_START_MPWQE; 245 } 246 247 return MLX5E_XDP_CHECK_OK; 248 } 249 250 static bool mlx5e_xmit_xdp_frame_mpwqe(struct mlx5e_xdpsq *sq, 251 struct mlx5e_xdp_xmit_data *xdptxd, 252 struct mlx5e_xdp_info *xdpi, 253 int check_result) 254 { 255 struct mlx5e_xdp_mpwqe *session = &sq->mpwqe; 256 struct mlx5e_xdpsq_stats *stats = sq->stats; 257 258 if (unlikely(xdptxd->len > sq->hw_mtu)) { 259 stats->err++; 260 return false; 261 } 262 263 if (!check_result) 264 check_result = mlx5e_xmit_xdp_frame_check_mpwqe(sq); 265 if (unlikely(check_result < 0)) 266 return false; 267 268 if (check_result == MLX5E_XDP_CHECK_START_MPWQE) { 269 /* Start the session when nothing can fail, so it's guaranteed 270 * that if there is an active session, it has at least one dseg, 271 * and it's safe to complete it at any time. 272 */ 273 mlx5e_xdp_mpwqe_session_start(sq); 274 } 275 276 mlx5e_xdp_mpwqe_add_dseg(sq, xdptxd, stats); 277 278 if (unlikely(mlx5e_xdp_no_room_for_inline_pkt(session) || 279 session->ds_count == MLX5E_XDP_MPW_MAX_NUM_DS)) 280 mlx5e_xdp_mpwqe_complete(sq); 281 282 mlx5e_xdpi_fifo_push(&sq->db.xdpi_fifo, xdpi); 283 stats->xmit++; 284 return true; 285 } 286 287 static int mlx5e_xmit_xdp_frame_check(struct mlx5e_xdpsq *sq) 288 { 289 if (unlikely(!mlx5e_wqc_has_room_for(&sq->wq, sq->cc, sq->pc, 1))) { 290 /* SQ is full, ring doorbell */ 291 mlx5e_xmit_xdp_doorbell(sq); 292 sq->stats->full++; 293 return -EBUSY; 294 } 295 296 return MLX5E_XDP_CHECK_OK; 297 } 298 299 static bool mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq, 300 struct mlx5e_xdp_xmit_data *xdptxd, 301 struct mlx5e_xdp_info *xdpi, 302 int check_result) 303 { 304 struct mlx5_wq_cyc *wq = &sq->wq; 305 u16 pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc); 306 struct mlx5e_tx_wqe *wqe = mlx5_wq_cyc_get_wqe(wq, pi); 307 308 struct mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl; 309 struct mlx5_wqe_eth_seg *eseg = &wqe->eth; 310 struct mlx5_wqe_data_seg *dseg = wqe->data; 311 312 dma_addr_t dma_addr = xdptxd->dma_addr; 313 u32 dma_len = xdptxd->len; 314 315 struct mlx5e_xdpsq_stats *stats = sq->stats; 316 317 prefetchw(wqe); 318 319 if (unlikely(dma_len < MLX5E_XDP_MIN_INLINE || sq->hw_mtu < dma_len)) { 320 stats->err++; 321 return false; 322 } 323 324 if (!check_result) 325 check_result = mlx5e_xmit_xdp_frame_check(sq); 326 if (unlikely(check_result < 0)) 327 return false; 328 329 cseg->fm_ce_se = 0; 330 331 /* copy the inline part if required */ 332 if (sq->min_inline_mode != MLX5_INLINE_MODE_NONE) { 333 memcpy(eseg->inline_hdr.start, xdptxd->data, MLX5E_XDP_MIN_INLINE); 334 eseg->inline_hdr.sz = cpu_to_be16(MLX5E_XDP_MIN_INLINE); 335 dma_len -= MLX5E_XDP_MIN_INLINE; 336 dma_addr += MLX5E_XDP_MIN_INLINE; 337 dseg++; 338 } 339 340 /* write the dma part */ 341 dseg->addr = cpu_to_be64(dma_addr); 342 dseg->byte_count = cpu_to_be32(dma_len); 343 344 cseg->opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | MLX5_OPCODE_SEND); 345 346 sq->pc++; 347 348 sq->doorbell_cseg = cseg; 349 350 mlx5e_xdpi_fifo_push(&sq->db.xdpi_fifo, xdpi); 351 stats->xmit++; 352 return true; 353 } 354 355 static void mlx5e_free_xdpsq_desc(struct mlx5e_xdpsq *sq, 356 struct mlx5e_xdp_wqe_info *wi, 357 u32 *xsk_frames, 358 bool recycle) 359 { 360 struct mlx5e_xdp_info_fifo *xdpi_fifo = &sq->db.xdpi_fifo; 361 u16 i; 362 363 for (i = 0; i < wi->num_pkts; i++) { 364 struct mlx5e_xdp_info xdpi = mlx5e_xdpi_fifo_pop(xdpi_fifo); 365 366 switch (xdpi.mode) { 367 case MLX5E_XDP_XMIT_MODE_FRAME: 368 /* XDP_TX from the XSK RQ and XDP_REDIRECT */ 369 dma_unmap_single(sq->pdev, xdpi.frame.dma_addr, 370 xdpi.frame.xdpf->len, DMA_TO_DEVICE); 371 xdp_return_frame(xdpi.frame.xdpf); 372 break; 373 case MLX5E_XDP_XMIT_MODE_PAGE: 374 /* XDP_TX from the regular RQ */ 375 mlx5e_page_release_dynamic(xdpi.page.rq, &xdpi.page.di, recycle); 376 break; 377 case MLX5E_XDP_XMIT_MODE_XSK: 378 /* AF_XDP send */ 379 (*xsk_frames)++; 380 break; 381 default: 382 WARN_ON_ONCE(true); 383 } 384 } 385 } 386 387 bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq) 388 { 389 struct mlx5e_xdpsq *sq; 390 struct mlx5_cqe64 *cqe; 391 u32 xsk_frames = 0; 392 u16 sqcc; 393 int i; 394 395 sq = container_of(cq, struct mlx5e_xdpsq, cq); 396 397 if (unlikely(!test_bit(MLX5E_SQ_STATE_ENABLED, &sq->state))) 398 return false; 399 400 cqe = mlx5_cqwq_get_cqe(&cq->wq); 401 if (!cqe) 402 return false; 403 404 /* sq->cc must be updated only after mlx5_cqwq_update_db_record(), 405 * otherwise a cq overrun may occur 406 */ 407 sqcc = sq->cc; 408 409 i = 0; 410 do { 411 u16 wqe_counter; 412 bool last_wqe; 413 414 mlx5_cqwq_pop(&cq->wq); 415 416 wqe_counter = be16_to_cpu(cqe->wqe_counter); 417 418 if (unlikely(get_cqe_opcode(cqe) != MLX5_CQE_REQ)) 419 netdev_WARN_ONCE(sq->channel->netdev, 420 "Bad OP in XDPSQ CQE: 0x%x\n", 421 get_cqe_opcode(cqe)); 422 423 do { 424 struct mlx5e_xdp_wqe_info *wi; 425 u16 ci; 426 427 last_wqe = (sqcc == wqe_counter); 428 ci = mlx5_wq_cyc_ctr2ix(&sq->wq, sqcc); 429 wi = &sq->db.wqe_info[ci]; 430 431 sqcc += wi->num_wqebbs; 432 433 mlx5e_free_xdpsq_desc(sq, wi, &xsk_frames, true); 434 } while (!last_wqe); 435 } while ((++i < MLX5E_TX_CQ_POLL_BUDGET) && (cqe = mlx5_cqwq_get_cqe(&cq->wq))); 436 437 if (xsk_frames) 438 xsk_umem_complete_tx(sq->umem, xsk_frames); 439 440 sq->stats->cqes += i; 441 442 mlx5_cqwq_update_db_record(&cq->wq); 443 444 /* ensure cq space is freed before enabling more cqes */ 445 wmb(); 446 447 sq->cc = sqcc; 448 return (i == MLX5E_TX_CQ_POLL_BUDGET); 449 } 450 451 void mlx5e_free_xdpsq_descs(struct mlx5e_xdpsq *sq) 452 { 453 u32 xsk_frames = 0; 454 455 while (sq->cc != sq->pc) { 456 struct mlx5e_xdp_wqe_info *wi; 457 u16 ci; 458 459 ci = mlx5_wq_cyc_ctr2ix(&sq->wq, sq->cc); 460 wi = &sq->db.wqe_info[ci]; 461 462 sq->cc += wi->num_wqebbs; 463 464 mlx5e_free_xdpsq_desc(sq, wi, &xsk_frames, false); 465 } 466 467 if (xsk_frames) 468 xsk_umem_complete_tx(sq->umem, xsk_frames); 469 } 470 471 int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, 472 u32 flags) 473 { 474 struct mlx5e_priv *priv = netdev_priv(dev); 475 struct mlx5e_xdpsq *sq; 476 int drops = 0; 477 int sq_num; 478 int i; 479 480 /* this flag is sufficient, no need to test internal sq state */ 481 if (unlikely(!mlx5e_xdp_tx_is_enabled(priv))) 482 return -ENETDOWN; 483 484 if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) 485 return -EINVAL; 486 487 sq_num = smp_processor_id(); 488 489 if (unlikely(sq_num >= priv->channels.num)) 490 return -ENXIO; 491 492 sq = &priv->channels.c[sq_num]->xdpsq; 493 494 for (i = 0; i < n; i++) { 495 struct xdp_frame *xdpf = frames[i]; 496 struct mlx5e_xdp_xmit_data xdptxd; 497 struct mlx5e_xdp_info xdpi; 498 499 xdptxd.data = xdpf->data; 500 xdptxd.len = xdpf->len; 501 xdptxd.dma_addr = dma_map_single(sq->pdev, xdptxd.data, 502 xdptxd.len, DMA_TO_DEVICE); 503 504 if (unlikely(dma_mapping_error(sq->pdev, xdptxd.dma_addr))) { 505 xdp_return_frame_rx_napi(xdpf); 506 drops++; 507 continue; 508 } 509 510 xdpi.mode = MLX5E_XDP_XMIT_MODE_FRAME; 511 xdpi.frame.xdpf = xdpf; 512 xdpi.frame.dma_addr = xdptxd.dma_addr; 513 514 if (unlikely(!sq->xmit_xdp_frame(sq, &xdptxd, &xdpi, 0))) { 515 dma_unmap_single(sq->pdev, xdptxd.dma_addr, 516 xdptxd.len, DMA_TO_DEVICE); 517 xdp_return_frame_rx_napi(xdpf); 518 drops++; 519 } 520 } 521 522 if (flags & XDP_XMIT_FLUSH) { 523 if (sq->mpwqe.wqe) 524 mlx5e_xdp_mpwqe_complete(sq); 525 mlx5e_xmit_xdp_doorbell(sq); 526 } 527 528 return n - drops; 529 } 530 531 void mlx5e_xdp_rx_poll_complete(struct mlx5e_rq *rq) 532 { 533 struct mlx5e_xdpsq *xdpsq = rq->xdpsq; 534 535 if (xdpsq->mpwqe.wqe) 536 mlx5e_xdp_mpwqe_complete(xdpsq); 537 538 mlx5e_xmit_xdp_doorbell(xdpsq); 539 540 if (test_bit(MLX5E_RQ_FLAG_XDP_REDIRECT, rq->flags)) { 541 xdp_do_flush_map(); 542 __clear_bit(MLX5E_RQ_FLAG_XDP_REDIRECT, rq->flags); 543 } 544 } 545 546 void mlx5e_set_xmit_fp(struct mlx5e_xdpsq *sq, bool is_mpw) 547 { 548 sq->xmit_xdp_frame_check = is_mpw ? 549 mlx5e_xmit_xdp_frame_check_mpwqe : mlx5e_xmit_xdp_frame_check; 550 sq->xmit_xdp_frame = is_mpw ? 551 mlx5e_xmit_xdp_frame_mpwqe : mlx5e_xmit_xdp_frame; 552 } 553 554