1 /* SPDX-License-Identifier: GPL-2.0 */ 2 /* Copyright (c) 2019 Mellanox Technologies. */ 3 4 #include "health.h" 5 6 static int mlx5e_wait_for_sq_flush(struct mlx5e_txqsq *sq) 7 { 8 unsigned long exp_time = jiffies + msecs_to_jiffies(2000); 9 10 while (time_before(jiffies, exp_time)) { 11 if (sq->cc == sq->pc) 12 return 0; 13 14 msleep(20); 15 } 16 17 netdev_err(sq->channel->netdev, 18 "Wait for SQ 0x%x flush timeout (sq cc = 0x%x, sq pc = 0x%x)\n", 19 sq->sqn, sq->cc, sq->pc); 20 21 return -ETIMEDOUT; 22 } 23 24 static void mlx5e_reset_txqsq_cc_pc(struct mlx5e_txqsq *sq) 25 { 26 WARN_ONCE(sq->cc != sq->pc, 27 "SQ 0x%x: cc (0x%x) != pc (0x%x)\n", 28 sq->sqn, sq->cc, sq->pc); 29 sq->cc = 0; 30 sq->dma_fifo_cc = 0; 31 sq->pc = 0; 32 } 33 34 static int mlx5e_tx_reporter_err_cqe_recover(void *ctx) 35 { 36 struct mlx5_core_dev *mdev; 37 struct net_device *dev; 38 struct mlx5e_txqsq *sq; 39 u8 state; 40 int err; 41 42 sq = ctx; 43 mdev = sq->channel->mdev; 44 dev = sq->channel->netdev; 45 46 if (!test_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state)) 47 return 0; 48 49 err = mlx5_core_query_sq_state(mdev, sq->sqn, &state); 50 if (err) { 51 netdev_err(dev, "Failed to query SQ 0x%x state. err = %d\n", 52 sq->sqn, err); 53 goto out; 54 } 55 56 if (state != MLX5_SQC_STATE_ERR) 57 goto out; 58 59 mlx5e_tx_disable_queue(sq->txq); 60 61 err = mlx5e_wait_for_sq_flush(sq); 62 if (err) 63 goto out; 64 65 /* At this point, no new packets will arrive from the stack as TXQ is 66 * marked with QUEUE_STATE_DRV_XOFF. In addition, NAPI cleared all 67 * pending WQEs. SQ can safely reset the SQ. 68 */ 69 70 err = mlx5e_health_sq_to_ready(sq->channel, sq->sqn); 71 if (err) 72 goto out; 73 74 mlx5e_reset_txqsq_cc_pc(sq); 75 sq->stats->recover++; 76 clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state); 77 mlx5e_activate_txqsq(sq); 78 79 return 0; 80 out: 81 clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state); 82 return err; 83 } 84 85 void mlx5e_reporter_tx_err_cqe(struct mlx5e_txqsq *sq) 86 { 87 struct mlx5e_priv *priv = sq->channel->priv; 88 char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN]; 89 struct mlx5e_err_ctx err_ctx = {0}; 90 91 err_ctx.ctx = sq; 92 err_ctx.recover = mlx5e_tx_reporter_err_cqe_recover; 93 sprintf(err_str, "ERR CQE on SQ: 0x%x", sq->sqn); 94 95 mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx); 96 } 97 98 static int mlx5e_tx_reporter_timeout_recover(void *ctx) 99 { 100 struct mlx5_eq_comp *eq; 101 struct mlx5e_txqsq *sq; 102 int err; 103 104 sq = ctx; 105 eq = sq->cq.mcq.eq; 106 err = mlx5e_health_channel_eq_recover(eq, sq->channel); 107 if (err) 108 clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state); 109 110 return err; 111 } 112 113 int mlx5e_reporter_tx_timeout(struct mlx5e_txqsq *sq) 114 { 115 struct mlx5e_priv *priv = sq->channel->priv; 116 char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN]; 117 struct mlx5e_err_ctx err_ctx; 118 119 err_ctx.ctx = sq; 120 err_ctx.recover = mlx5e_tx_reporter_timeout_recover; 121 sprintf(err_str, 122 "TX timeout on queue: %d, SQ: 0x%x, CQ: 0x%x, SQ Cons: 0x%x SQ Prod: 0x%x, usecs since last trans: %u\n", 123 sq->channel->ix, sq->sqn, sq->cq.mcq.cqn, sq->cc, sq->pc, 124 jiffies_to_usecs(jiffies - sq->txq->trans_start)); 125 126 return mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx); 127 } 128 129 /* state lock cannot be grabbed within this function. 130 * It can cause a dead lock or a read-after-free. 131 */ 132 static int mlx5e_tx_reporter_recover_from_ctx(struct mlx5e_err_ctx *err_ctx) 133 { 134 return err_ctx->recover(err_ctx->ctx); 135 } 136 137 static int mlx5e_tx_reporter_recover(struct devlink_health_reporter *reporter, 138 void *context, 139 struct netlink_ext_ack *extack) 140 { 141 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); 142 struct mlx5e_err_ctx *err_ctx = context; 143 144 return err_ctx ? mlx5e_tx_reporter_recover_from_ctx(err_ctx) : 145 mlx5e_health_recover_channels(priv); 146 } 147 148 static int 149 mlx5e_tx_reporter_build_diagnose_output(struct devlink_fmsg *fmsg, 150 struct mlx5e_txqsq *sq, int tc) 151 { 152 struct mlx5e_priv *priv = sq->channel->priv; 153 bool stopped = netif_xmit_stopped(sq->txq); 154 u8 state; 155 int err; 156 157 err = mlx5_core_query_sq_state(priv->mdev, sq->sqn, &state); 158 if (err) 159 return err; 160 161 err = devlink_fmsg_obj_nest_start(fmsg); 162 if (err) 163 return err; 164 165 err = devlink_fmsg_u32_pair_put(fmsg, "channel ix", sq->ch_ix); 166 if (err) 167 return err; 168 169 err = devlink_fmsg_u32_pair_put(fmsg, "tc", tc); 170 if (err) 171 return err; 172 173 err = devlink_fmsg_u32_pair_put(fmsg, "txq ix", sq->txq_ix); 174 if (err) 175 return err; 176 177 err = devlink_fmsg_u32_pair_put(fmsg, "sqn", sq->sqn); 178 if (err) 179 return err; 180 181 err = devlink_fmsg_u8_pair_put(fmsg, "HW state", state); 182 if (err) 183 return err; 184 185 err = devlink_fmsg_bool_pair_put(fmsg, "stopped", stopped); 186 if (err) 187 return err; 188 189 err = devlink_fmsg_u32_pair_put(fmsg, "cc", sq->cc); 190 if (err) 191 return err; 192 193 err = devlink_fmsg_u32_pair_put(fmsg, "pc", sq->pc); 194 if (err) 195 return err; 196 197 err = mlx5e_reporter_cq_diagnose(&sq->cq, fmsg); 198 if (err) 199 return err; 200 201 err = devlink_fmsg_obj_nest_end(fmsg); 202 if (err) 203 return err; 204 205 return 0; 206 } 207 208 static int mlx5e_tx_reporter_diagnose(struct devlink_health_reporter *reporter, 209 struct devlink_fmsg *fmsg, 210 struct netlink_ext_ack *extack) 211 { 212 struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); 213 struct mlx5e_txqsq *generic_sq = priv->txq2sq[0]; 214 u32 sq_stride, sq_sz; 215 216 int i, tc, err = 0; 217 218 mutex_lock(&priv->state_lock); 219 220 if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) 221 goto unlock; 222 223 sq_sz = mlx5_wq_cyc_get_size(&generic_sq->wq); 224 sq_stride = MLX5_SEND_WQE_BB; 225 226 err = mlx5e_reporter_named_obj_nest_start(fmsg, "Common Config"); 227 if (err) 228 goto unlock; 229 230 err = mlx5e_reporter_named_obj_nest_start(fmsg, "SQ"); 231 if (err) 232 goto unlock; 233 234 err = devlink_fmsg_u64_pair_put(fmsg, "stride size", sq_stride); 235 if (err) 236 goto unlock; 237 238 err = devlink_fmsg_u32_pair_put(fmsg, "size", sq_sz); 239 if (err) 240 goto unlock; 241 242 err = mlx5e_reporter_cq_common_diagnose(&generic_sq->cq, fmsg); 243 if (err) 244 goto unlock; 245 246 err = mlx5e_reporter_named_obj_nest_end(fmsg); 247 if (err) 248 goto unlock; 249 250 err = mlx5e_reporter_named_obj_nest_end(fmsg); 251 if (err) 252 goto unlock; 253 254 err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs"); 255 if (err) 256 goto unlock; 257 258 for (i = 0; i < priv->channels.num; i++) { 259 struct mlx5e_channel *c = priv->channels.c[i]; 260 261 for (tc = 0; tc < priv->channels.params.num_tc; tc++) { 262 struct mlx5e_txqsq *sq = &c->sq[tc]; 263 264 err = mlx5e_tx_reporter_build_diagnose_output(fmsg, sq, tc); 265 if (err) 266 goto unlock; 267 } 268 } 269 err = devlink_fmsg_arr_pair_nest_end(fmsg); 270 if (err) 271 goto unlock; 272 273 unlock: 274 mutex_unlock(&priv->state_lock); 275 return err; 276 } 277 278 static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = { 279 .name = "tx", 280 .recover = mlx5e_tx_reporter_recover, 281 .diagnose = mlx5e_tx_reporter_diagnose, 282 }; 283 284 #define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500 285 286 int mlx5e_reporter_tx_create(struct mlx5e_priv *priv) 287 { 288 struct devlink_health_reporter *reporter; 289 struct mlx5_core_dev *mdev = priv->mdev; 290 struct devlink *devlink; 291 292 devlink = priv_to_devlink(mdev); 293 reporter = 294 devlink_health_reporter_create(devlink, &mlx5_tx_reporter_ops, 295 MLX5_REPORTER_TX_GRACEFUL_PERIOD, 296 true, priv); 297 if (IS_ERR(reporter)) { 298 netdev_warn(priv->netdev, 299 "Failed to create tx reporter, err = %ld\n", 300 PTR_ERR(reporter)); 301 return PTR_ERR(reporter); 302 } 303 priv->tx_reporter = reporter; 304 return 0; 305 } 306 307 void mlx5e_reporter_tx_destroy(struct mlx5e_priv *priv) 308 { 309 if (!priv->tx_reporter) 310 return; 311 312 devlink_health_reporter_destroy(priv->tx_reporter); 313 } 314