1 // SPDX-License-Identifier: GPL-2.0 2 // Copyright (c) 2019 Mellanox Technologies. 3 4 #include "health.h" 5 #include "params.h" 6 7 static int mlx5e_query_rq_state(struct mlx5_core_dev *dev, u32 rqn, u8 *state) 8 { 9 int outlen = MLX5_ST_SZ_BYTES(query_rq_out); 10 void *out; 11 void *rqc; 12 int err; 13 14 out = kvzalloc(outlen, GFP_KERNEL); 15 if (!out) 16 return -ENOMEM; 17 18 err = mlx5_core_query_rq(dev, rqn, out); 19 if (err) 20 goto out; 21 22 rqc = MLX5_ADDR_OF(query_rq_out, out, rq_context); 23 *state = MLX5_GET(rqc, rqc, state); 24 25 out: 26 kvfree(out); 27 return err; 28 } 29 30 static int mlx5e_wait_for_icosq_flush(struct mlx5e_icosq *icosq) 31 { 32 unsigned long exp_time = jiffies + msecs_to_jiffies(2000); 33 34 while (time_before(jiffies, exp_time)) { 35 if (icosq->cc == icosq->pc) 36 return 0; 37 38 msleep(20); 39 } 40 41 netdev_err(icosq->channel->netdev, 42 "Wait for ICOSQ 0x%x flush timeout (cc = 0x%x, pc = 0x%x)\n", 43 icosq->sqn, icosq->cc, icosq->pc); 44 45 return -ETIMEDOUT; 46 } 47 48 static void mlx5e_reset_icosq_cc_pc(struct mlx5e_icosq *icosq) 49 { 50 WARN_ONCE(icosq->cc != icosq->pc, "ICOSQ 0x%x: cc (0x%x) != pc (0x%x)\n", 51 icosq->sqn, icosq->cc, icosq->pc); 52 icosq->cc = 0; 53 icosq->pc = 0; 54 } 55 56 static int mlx5e_rx_reporter_err_icosq_cqe_recover(void *ctx) 57 { 58 struct mlx5_core_dev *mdev; 59 struct mlx5e_icosq *icosq; 60 struct net_device *dev; 61 struct mlx5e_rq *rq; 62 u8 state; 63 int err; 64 65 icosq = ctx; 66 rq = &icosq->channel->rq; 67 mdev = icosq->channel->mdev; 68 dev = icosq->channel->netdev; 69 err = mlx5_core_query_sq_state(mdev, icosq->sqn, &state); 70 if (err) { 71 netdev_err(dev, "Failed to query ICOSQ 0x%x state. err = %d\n", 72 icosq->sqn, err); 73 goto out; 74 } 75 76 if (state != MLX5_SQC_STATE_ERR) 77 goto out; 78 79 mlx5e_deactivate_rq(rq); 80 err = mlx5e_wait_for_icosq_flush(icosq); 81 if (err) 82 goto out; 83 84 mlx5e_deactivate_icosq(icosq); 85 86 /* At this point, both the rq and the icosq are disabled */ 87 88 err = mlx5e_health_sq_to_ready(icosq->channel, icosq->sqn); 89 if (err) 90 goto out; 91 92 mlx5e_reset_icosq_cc_pc(icosq); 93 mlx5e_free_rx_descs(rq); 94 clear_bit(MLX5E_SQ_STATE_RECOVERING, &icosq->state); 95 mlx5e_activate_icosq(icosq); 96 mlx5e_activate_rq(rq); 97 98 rq->stats->recover++; 99 return 0; 100 out: 101 clear_bit(MLX5E_SQ_STATE_RECOVERING, &icosq->state); 102 return err; 103 } 104 105 void mlx5e_reporter_icosq_cqe_err(struct mlx5e_icosq *icosq) 106 { 107 struct mlx5e_priv *priv = icosq->channel->priv; 108 char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN]; 109 struct mlx5e_err_ctx err_ctx = {}; 110 111 err_ctx.ctx = icosq; 112 err_ctx.recover = mlx5e_rx_reporter_err_icosq_cqe_recover; 113 sprintf(err_str, "ERR CQE on ICOSQ: 0x%x", icosq->sqn); 114 115 mlx5e_health_report(priv, priv->rx_reporter, err_str, &err_ctx); 116 } 117 118 static int mlx5e_rq_to_ready(struct mlx5e_rq *rq, int curr_state) 119 { 120 struct net_device *dev = rq->netdev; 121 int err; 122 123 err = mlx5e_modify_rq_state(rq, curr_state, MLX5_RQC_STATE_RST); 124 if (err) { 125 netdev_err(dev, "Failed to move rq 0x%x to reset\n", rq->rqn); 126 return err; 127 } 128 err = mlx5e_modify_rq_state(rq, MLX5_RQC_STATE_RST, MLX5_RQC_STATE_RDY); 129 if (err) { 130 netdev_err(dev, "Failed to move rq 0x%x to ready\n", rq->rqn); 131 return err; 132 } 133 134 return 0; 135 } 136 137 static int mlx5e_rx_reporter_err_rq_cqe_recover(void *ctx) 138 { 139 struct mlx5_core_dev *mdev; 140 struct net_device *dev; 141 struct mlx5e_rq *rq; 142 u8 state; 143 int err; 144 145 rq = ctx; 146 mdev = rq->mdev; 147 dev = rq->netdev; 148 err = mlx5e_query_rq_state(mdev, rq->rqn, &state); 149 if (err) { 150 netdev_err(dev, "Failed to query RQ 0x%x state. err = %d\n", 151 rq->rqn, err); 152 goto out; 153 } 154 155 if (state != MLX5_RQC_STATE_ERR) 156 goto out; 157 158 mlx5e_deactivate_rq(rq); 159 mlx5e_free_rx_descs(rq); 160 161 err = mlx5e_rq_to_ready(rq, MLX5_RQC_STATE_ERR); 162 if (err) 163 goto out; 164 165 clear_bit(MLX5E_RQ_STATE_RECOVERING, &rq->state); 166 mlx5e_activate_rq(rq); 167 rq->stats->recover++; 168 return 0; 169 out: 170 clear_bit(MLX5E_RQ_STATE_RECOVERING, &rq->state); 171 return err; 172 } 173 174 void mlx5e_reporter_rq_cqe_err(struct mlx5e_rq *rq) 175 { 176 struct mlx5e_priv *priv = rq->channel->priv; 177 char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN]; 178 struct mlx5e_err_ctx err_ctx = {}; 179 180 err_ctx.ctx = rq; 181 err_ctx.recover = mlx5e_rx_reporter_err_rq_cqe_recover; 182 sprintf(err_str, "ERR CQE on RQ: 0x%x", rq->rqn); 183 184 mlx5e_health_report(priv, priv->rx_reporter, err_str, &err_ctx); 185 } 186 187 static int mlx5e_rx_reporter_timeout_recover(void *ctx) 188 { 189 struct mlx5e_icosq *icosq; 190 struct mlx5_eq_comp *eq; 191 struct mlx5e_rq *rq; 192 int err; 193 194 rq = ctx; 195 icosq = &rq->channel->icosq; 196 eq = rq->cq.mcq.eq; 197 err = mlx5e_health_channel_eq_recover(eq, rq->channel); 198 if (err) 199 clear_bit(MLX5E_SQ_STATE_ENABLED, &icosq->state); 200 201 return err; 202 } 203 204 void mlx5e_reporter_rx_timeout(struct mlx5e_rq *rq) 205 { 206 struct mlx5e_icosq *icosq = &rq->channel->icosq; 207 struct mlx5e_priv *priv = rq->channel->priv; 208 char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN]; 209 struct mlx5e_err_ctx err_ctx = {}; 210 211 err_ctx.ctx = rq; 212 err_ctx.recover = mlx5e_rx_reporter_timeout_recover; 213 sprintf(err_str, "RX timeout on channel: %d, ICOSQ: 0x%x RQ: 0x%x, CQ: 0x%x\n", 214 icosq->channel->ix, icosq->sqn, rq->rqn, rq->cq.mcq.cqn); 215 216 mlx5e_health_report(priv, priv->rx_reporter, err_str, &err_ctx); 217 } 218 219 static int mlx5e_rx_reporter_recover_from_ctx(struct mlx5e_err_ctx *err_ctx) 220 { 221 return err_ctx->recover(err_ctx->ctx); 222 } 223 224 static int mlx5e_rx_reporter_recover(struct devlink_health_reporter *reporter, 225 void *context) 226 { 227 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); 228 struct mlx5e_err_ctx *err_ctx = context; 229 230 return err_ctx ? mlx5e_rx_reporter_recover_from_ctx(err_ctx) : 231 mlx5e_health_recover_channels(priv); 232 } 233 234 static int mlx5e_rx_reporter_build_diagnose_output(struct mlx5e_rq *rq, 235 struct devlink_fmsg *fmsg) 236 { 237 struct mlx5e_priv *priv = rq->channel->priv; 238 struct mlx5e_params *params; 239 struct mlx5e_icosq *icosq; 240 u8 icosq_hw_state; 241 int wqes_sz; 242 u8 hw_state; 243 u16 wq_head; 244 int err; 245 246 params = &priv->channels.params; 247 icosq = &rq->channel->icosq; 248 err = mlx5e_query_rq_state(priv->mdev, rq->rqn, &hw_state); 249 if (err) 250 return err; 251 252 err = mlx5_core_query_sq_state(priv->mdev, icosq->sqn, &icosq_hw_state); 253 if (err) 254 return err; 255 256 wqes_sz = mlx5e_rqwq_get_cur_sz(rq); 257 wq_head = params->rq_wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ ? 258 rq->mpwqe.wq.head : mlx5_wq_cyc_get_head(&rq->wqe.wq); 259 260 err = devlink_fmsg_obj_nest_start(fmsg); 261 if (err) 262 return err; 263 264 err = devlink_fmsg_u32_pair_put(fmsg, "channel ix", rq->channel->ix); 265 if (err) 266 return err; 267 268 err = devlink_fmsg_u32_pair_put(fmsg, "rqn", rq->rqn); 269 if (err) 270 return err; 271 272 err = devlink_fmsg_u8_pair_put(fmsg, "HW state", hw_state); 273 if (err) 274 return err; 275 276 err = devlink_fmsg_u8_pair_put(fmsg, "SW state", rq->state); 277 if (err) 278 return err; 279 280 err = devlink_fmsg_u32_pair_put(fmsg, "posted WQEs", wqes_sz); 281 if (err) 282 return err; 283 284 err = devlink_fmsg_u32_pair_put(fmsg, "cc", wq_head); 285 if (err) 286 return err; 287 288 err = devlink_fmsg_u8_pair_put(fmsg, "ICOSQ HW state", icosq_hw_state); 289 if (err) 290 return err; 291 292 err = mlx5e_reporter_cq_diagnose(&rq->cq, fmsg); 293 if (err) 294 return err; 295 296 err = devlink_fmsg_obj_nest_end(fmsg); 297 if (err) 298 return err; 299 300 return 0; 301 } 302 303 static int mlx5e_rx_reporter_diagnose(struct devlink_health_reporter *reporter, 304 struct devlink_fmsg *fmsg) 305 { 306 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); 307 struct mlx5e_params *params = &priv->channels.params; 308 struct mlx5e_rq *generic_rq; 309 u32 rq_stride, rq_sz; 310 int i, err = 0; 311 312 mutex_lock(&priv->state_lock); 313 314 if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) 315 goto unlock; 316 317 generic_rq = &priv->channels.c[0]->rq; 318 rq_sz = mlx5e_rqwq_get_size(generic_rq); 319 rq_stride = BIT(mlx5e_mpwqe_get_log_stride_size(priv->mdev, params, NULL)); 320 321 err = mlx5e_reporter_named_obj_nest_start(fmsg, "Common config"); 322 if (err) 323 goto unlock; 324 325 err = mlx5e_reporter_named_obj_nest_start(fmsg, "RQ"); 326 if (err) 327 goto unlock; 328 329 err = devlink_fmsg_u8_pair_put(fmsg, "type", params->rq_wq_type); 330 if (err) 331 goto unlock; 332 333 err = devlink_fmsg_u64_pair_put(fmsg, "stride size", rq_stride); 334 if (err) 335 goto unlock; 336 337 err = devlink_fmsg_u32_pair_put(fmsg, "size", rq_sz); 338 if (err) 339 goto unlock; 340 341 err = mlx5e_reporter_named_obj_nest_end(fmsg); 342 if (err) 343 goto unlock; 344 345 err = mlx5e_reporter_cq_common_diagnose(&generic_rq->cq, fmsg); 346 if (err) 347 goto unlock; 348 349 err = mlx5e_reporter_named_obj_nest_end(fmsg); 350 if (err) 351 goto unlock; 352 353 err = devlink_fmsg_arr_pair_nest_start(fmsg, "RQs"); 354 if (err) 355 goto unlock; 356 357 for (i = 0; i < priv->channels.num; i++) { 358 struct mlx5e_rq *rq = &priv->channels.c[i]->rq; 359 360 err = mlx5e_rx_reporter_build_diagnose_output(rq, fmsg); 361 if (err) 362 goto unlock; 363 } 364 err = devlink_fmsg_arr_pair_nest_end(fmsg); 365 if (err) 366 goto unlock; 367 unlock: 368 mutex_unlock(&priv->state_lock); 369 return err; 370 } 371 372 static const struct devlink_health_reporter_ops mlx5_rx_reporter_ops = { 373 .name = "rx", 374 .recover = mlx5e_rx_reporter_recover, 375 .diagnose = mlx5e_rx_reporter_diagnose, 376 }; 377 378 #define MLX5E_REPORTER_RX_GRACEFUL_PERIOD 500 379 380 int mlx5e_reporter_rx_create(struct mlx5e_priv *priv) 381 { 382 struct devlink *devlink = priv_to_devlink(priv->mdev); 383 struct devlink_health_reporter *reporter; 384 385 reporter = devlink_health_reporter_create(devlink, 386 &mlx5_rx_reporter_ops, 387 MLX5E_REPORTER_RX_GRACEFUL_PERIOD, 388 true, priv); 389 if (IS_ERR(reporter)) { 390 netdev_warn(priv->netdev, "Failed to create rx reporter, err = %ld\n", 391 PTR_ERR(reporter)); 392 return PTR_ERR(reporter); 393 } 394 priv->rx_reporter = reporter; 395 return 0; 396 } 397 398 void mlx5e_reporter_rx_destroy(struct mlx5e_priv *priv) 399 { 400 if (!priv->rx_reporter) 401 return; 402 403 devlink_health_reporter_destroy(priv->rx_reporter); 404 } 405