xref: /freebsd/sys/dev/mlx5/mlx5_core/mlx5_health.c (revision ee9d634b)
1 /*-
2  * Copyright (c) 2013-2019, Mellanox Technologies, Ltd.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  *
25  * $FreeBSD$
26  */
27 
28 #include "opt_rss.h"
29 #include "opt_ratelimit.h"
30 
31 #include <linux/kernel.h>
32 #include <linux/module.h>
33 #include <linux/random.h>
34 #include <linux/vmalloc.h>
35 #include <linux/hardirq.h>
36 #include <linux/delay.h>
37 #include <dev/mlx5/driver.h>
38 #include <dev/mlx5/mlx5_ifc.h>
39 #include "mlx5_core.h"
40 
/* Health poll period; failure is declared after MAX_MISSES silent polls. */
#define	MLX5_HEALTH_POLL_INTERVAL	(2 * HZ)
#define	MAX_MISSES			3

/* Bit positions in mlx5_core_health->flags; once set, the corresponding
 * work item may no longer be queued (used during teardown/drain). */
enum {
	MLX5_DROP_NEW_HEALTH_WORK,
	MLX5_DROP_NEW_RECOVERY_WORK,
	MLX5_DROP_NEW_WATCHDOG_WORK,
};
49 
/* Fatal-error sensor identifiers, probed most-severe-first by
 * check_fatal_sensors(); the first tripped sensor wins. */
enum  {
	MLX5_SENSOR_NO_ERR		= 0,
	MLX5_SENSOR_PCI_COMM_ERR	= 1,
	MLX5_SENSOR_PCI_ERR		= 2,
	MLX5_SENSOR_NIC_DISABLED	= 3,
	MLX5_SENSOR_NIC_SW_RESET	= 4,
	MLX5_SENSOR_FW_SYND_RFR		= 5,
};
58 
/* hw.mlx5.fw_reset_enable: gate for reset_fw_if_needed(). */
static int mlx5_fw_reset_enable = 1;
SYSCTL_INT(_hw_mlx5, OID_AUTO, fw_reset_enable, CTLFLAG_RWTUN,
    &mlx5_fw_reset_enable, 0,
    "Enable firmware reset");

/* hw.mlx5.sw_reset_timeout: rate limit (seconds) enforced by
 * mlx5_health_allow_reset() between two firmware resets. */
static unsigned int sw_reset_to = 1200;
SYSCTL_UINT(_hw_mlx5, OID_AUTO, sw_reset_timeout, CTLFLAG_RWTUN,
    &sw_reset_to, 0,
    "Minimum timeout in seconds between two firmware resets");
68 
69 
70 static int lock_sem_sw_reset(struct mlx5_core_dev *dev)
71 {
72 	int ret;
73 
74 	/* Lock GW access */
75 	ret = -mlx5_vsc_lock(dev);
76 	if (ret) {
77 		mlx5_core_warn(dev, "Timed out locking gateway %d\n", ret);
78 		return ret;
79 	}
80 
81 	ret = -mlx5_vsc_lock_addr_space(dev, MLX5_SEMAPHORE_SW_RESET);
82 	if (ret) {
83 		if (ret == -EBUSY)
84 			mlx5_core_dbg(dev,
85 			    "SW reset FW semaphore already locked, another function will handle the reset\n");
86 		else
87 			mlx5_core_warn(dev,
88 			    "SW reset semaphore lock return %d\n", ret);
89 	}
90 
91 	/* Unlock GW access */
92 	mlx5_vsc_unlock(dev);
93 
94 	return ret;
95 }
96 
97 static int unlock_sem_sw_reset(struct mlx5_core_dev *dev)
98 {
99 	int ret;
100 
101 	/* Lock GW access */
102 	ret = -mlx5_vsc_lock(dev);
103 	if (ret) {
104 		mlx5_core_warn(dev, "Timed out locking gateway %d\n", ret);
105 		return ret;
106 	}
107 
108 	ret = -mlx5_vsc_unlock_addr_space(dev, MLX5_SEMAPHORE_SW_RESET);
109 
110 	/* Unlock GW access */
111 	mlx5_vsc_unlock(dev);
112 
113 	return ret;
114 }
115 
116 u8 mlx5_get_nic_state(struct mlx5_core_dev *dev)
117 {
118 	return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 7;
119 }
120 
121 void mlx5_set_nic_state(struct mlx5_core_dev *dev, u8 state)
122 {
123 	u32 cur_cmdq_addr_l_sz;
124 
125 	cur_cmdq_addr_l_sz = ioread32be(&dev->iseg->cmdq_addr_l_sz);
126 	iowrite32be((cur_cmdq_addr_l_sz & 0xFFFFF000) |
127 		    state << MLX5_NIC_IFC_OFFSET,
128 		    &dev->iseg->cmdq_addr_l_sz);
129 }
130 
131 static bool sensor_fw_synd_rfr(struct mlx5_core_dev *dev)
132 {
133 	struct mlx5_core_health *health = &dev->priv.health;
134 	struct mlx5_health_buffer __iomem *h = health->health;
135 	u32 rfr = ioread32be(&h->rfr) >> MLX5_RFR_OFFSET;
136 	u8 synd = ioread8(&h->synd);
137 
138 	if (rfr && synd)
139 		mlx5_core_dbg(dev, "FW requests reset, synd: %d\n", synd);
140 	return rfr && synd;
141 }
142 
/*
 * Force completion of all in-flight commands with a synthetic,
 * driver-generated completion vector.  Queued on health->wq_cmd once
 * the device enters the internal-error state, so commands that will
 * never receive a real completion from FW do not block forever.
 */
static void mlx5_trigger_cmd_completions(struct work_struct *work)
{
	struct mlx5_core_dev *dev =
	    container_of(work, struct mlx5_core_dev, priv.health.work_cmd_completion);
	unsigned long flags;
	u64 vector;

	/* wait for pending handlers to complete */
	synchronize_irq(dev->priv.msix_arr[MLX5_EQ_VEC_CMD].vector);
	spin_lock_irqsave(&dev->cmd.alloc_lock, flags);
	/* Busy command slots are the zero bits of cmd.bitmask; mask to the
	 * 2^log_sz slots that actually exist. */
	vector = ~dev->cmd.bitmask & ((1ul << (1 << dev->cmd.log_sz)) - 1);
	if (!vector)
		goto no_trig;

	/* Tag the vector so the handler knows these are driver-triggered. */
	vector |= MLX5_TRIGGERED_CMD_COMP;
	spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags);

	mlx5_core_dbg(dev, "vector 0x%jx\n", (uintmax_t)vector);
	mlx5_cmd_comp_handler(dev, vector, MLX5_CMD_MODE_EVENTS);
	return;

no_trig:
	spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags);
}
167 
168 static bool sensor_pci_no_comm(struct mlx5_core_dev *dev)
169 {
170 	struct mlx5_core_health *health = &dev->priv.health;
171 	struct mlx5_health_buffer __iomem *h = health->health;
172 	bool err = ioread32be(&h->fw_ver) == 0xffffffff;
173 
174 	return err;
175 }
176 
177 static bool sensor_nic_disabled(struct mlx5_core_dev *dev)
178 {
179 	return mlx5_get_nic_state(dev) == MLX5_NIC_IFC_DISABLED;
180 }
181 
182 static bool sensor_nic_sw_reset(struct mlx5_core_dev *dev)
183 {
184 	return mlx5_get_nic_state(dev) == MLX5_NIC_IFC_SW_RESET;
185 }
186 
/*
 * Probe every fatal-error sensor and return the first one that trips,
 * or MLX5_SENSOR_NO_ERR.  The probe order is deliberate (most severe
 * first): callers such as reset_fw_if_needed() use the returned value
 * to decide whether a FW reset could still help.
 */
static u32 check_fatal_sensors(struct mlx5_core_dev *dev)
{
	if (sensor_pci_no_comm(dev))
		return MLX5_SENSOR_PCI_COMM_ERR;
	if (pci_channel_offline(dev->pdev))
		return MLX5_SENSOR_PCI_ERR;
	if (sensor_nic_disabled(dev))
		return MLX5_SENSOR_NIC_DISABLED;
	if (sensor_nic_sw_reset(dev))
		return MLX5_SENSOR_NIC_SW_RESET;
	if (sensor_fw_synd_rfr(dev))
		return MLX5_SENSOR_FW_SYND_RFR;

	return MLX5_SENSOR_NO_ERR;
}
202 
/*
 * Issue a FW-level software reset by writing MLX5_NIC_IFC_SW_RESET into
 * the NIC interface mode field.  Gated by the hw.mlx5.fw_reset_enable
 * sysctl and by the FW's advertised reset capability, and skipped when
 * another function has evidently already started (or when the failure
 * is PCI-level, where a reset cannot help).
 */
static void reset_fw_if_needed(struct mlx5_core_dev *dev)
{
	bool supported;
	u32 cmdq_addr, fatal_error;

	if (!mlx5_fw_reset_enable)
		return;
	/* FW advertises SW-reset support in the initializing register. */
	supported = (ioread32be(&dev->iseg->initializing) >>
	    MLX5_FW_RESET_SUPPORTED_OFFSET) & 1;
	if (!supported)
		return;

	/* The reset only needs to be issued by one PF. The health buffer is
	 * shared between all functions, and will be cleared during a reset.
	 * Check again to avoid a redundant 2nd reset. If the fatal error was
	 * PCI related a reset won't help.
	 */
	fatal_error = check_fatal_sensors(dev);
	if (fatal_error == MLX5_SENSOR_PCI_COMM_ERR ||
	    fatal_error == MLX5_SENSOR_NIC_DISABLED ||
	    fatal_error == MLX5_SENSOR_NIC_SW_RESET) {
		mlx5_core_warn(dev,
		    "Not issuing FW reset. Either it's already done or won't help.\n");
		return;
	}

	mlx5_core_info(dev, "Issuing FW Reset\n");
	/* Write the NIC interface field to initiate the reset, the command
	 * interface address also resides here, don't overwrite it.
	 */
	cmdq_addr = ioread32be(&dev->iseg->cmdq_addr_l_sz);
	iowrite32be((cmdq_addr & 0xFFFFF000) |
		    MLX5_NIC_IFC_SW_RESET << MLX5_NIC_IFC_OFFSET,
		    &dev->iseg->cmdq_addr_l_sz);
}
238 
/*
 * Rate-limit firmware resets: allow one only if at least sw_reset_to
 * seconds (hw.mlx5.sw_reset_timeout) elapsed since the previous
 * request.  Always records the current time as the new last-request
 * stamp, so denied attempts also restart the window.
 */
static bool
mlx5_health_allow_reset(struct mlx5_core_dev *dev)
{
	struct mlx5_core_health *health = &dev->priv.health;
	unsigned int delta;
	bool ret;

	if (health->last_reset_req != 0) {
		/* Unsigned subtraction handles ticks wraparound. */
		delta = ticks - health->last_reset_req;
		delta /= hz;
		ret = delta >= sw_reset_to;
	} else {
		/* First request ever: always allowed. */
		ret = true;
	}

	/*
	 * In principle, ticks may be 0. Setting it to off by one (-1)
	 * to prevent certain reset in next request.
	 */
	health->last_reset_req = ticks ? : -1;
	if (!ret)
		mlx5_core_warn(dev,
		    "Firmware reset elided due to auto-reset frequency threshold.\n");
	return (ret);
}
264 
#define MLX5_CRDUMP_WAIT_MS	60000
#define MLX5_FW_RESET_WAIT_MS	1000
#define MLX5_NIC_STATE_POLL_MS	5
/*
 * Move the device into MLX5_DEVICE_STATE_INTERNAL_ERROR and run the
 * error flow: flush stuck commands, optionally take a crdump and issue
 * a FW reset (PF only, rate-limited), wait for the NIC interface to go
 * disabled, and finally broadcast MLX5_DEV_EVENT_SYS_ERROR.
 *
 * @force: skip sensor-driven reset/recovery and only report the error
 *         (used when the caller already decided the device is dead).
 */
void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force)
{
	int end, delay_ms = MLX5_CRDUMP_WAIT_MS;
	u32 fatal_error;
	int lock = -EBUSY;

	fatal_error = check_fatal_sensors(dev);

	if (fatal_error || force) {
		/* xchg makes the state transition one-shot: a second caller
		 * observing INTERNAL_ERROR returns immediately. */
		if (xchg(&dev->state, MLX5_DEVICE_STATE_INTERNAL_ERROR) ==
		    MLX5_DEVICE_STATE_INTERNAL_ERROR)
			return;
		if (!force)
			mlx5_core_err(dev, "internal state error detected\n");

		/*
		 * Queue the command completion handler on the command
		 * work queue to avoid racing with the real command
		 * completion handler and then wait for it to
		 * complete:
		 */
		queue_work(dev->priv.health.wq_cmd, &dev->priv.health.work_cmd_completion);
		flush_workqueue(dev->priv.health.wq_cmd);
	}

	mutex_lock(&dev->intf_state_mutex);

	if (force)
		goto err_state_done;

	if (fatal_error == MLX5_SENSOR_FW_SYND_RFR &&
	    mlx5_health_allow_reset(dev)) {
		/* Get cr-dump and reset FW semaphore */
		if (mlx5_core_is_pf(dev))
			lock = lock_sem_sw_reset(dev);

		/* Execute cr-dump and SW reset */
		if (lock != -EBUSY) {
			(void)mlx5_fwdump(dev);
			reset_fw_if_needed(dev);
			delay_ms = MLX5_FW_RESET_WAIT_MS;
		}
	}

	/* Recover from SW reset */
	end = jiffies + msecs_to_jiffies(delay_ms);
	do {
		if (sensor_nic_disabled(dev))
			break;

		msleep(MLX5_NIC_STATE_POLL_MS);
	} while (!time_after(jiffies, end));

	if (!sensor_nic_disabled(dev)) {
		mlx5_core_err(dev, "NIC IFC still %d after %ums.\n",
			mlx5_get_nic_state(dev), delay_ms);
	}

	/* Release FW semaphore if you are the lock owner */
	if (!lock)
		unlock_sem_sw_reset(dev);

	mlx5_core_info(dev, "System error event triggered\n");

err_state_done:
	mlx5_core_event(dev, MLX5_DEV_EVENT_SYS_ERROR, 1);
	mutex_unlock(&dev->intf_state_mutex);
}
336 
337 static void mlx5_handle_bad_state(struct mlx5_core_dev *dev)
338 {
339 	u8 nic_mode = mlx5_get_nic_state(dev);
340 
341 	if (nic_mode == MLX5_NIC_IFC_SW_RESET) {
342 		/* The IFC mode field is 3 bits, so it will read 0x7 in two cases:
343 		 * 1. PCI has been disabled (ie. PCI-AER, PF driver unloaded
344 		 *    and this is a VF), this is not recoverable by SW reset.
345 		 *    Logging of this is handled elsewhere.
346 		 * 2. FW reset has been issued by another function, driver can
347 		 *    be reloaded to recover after the mode switches to
348 		 *    MLX5_NIC_IFC_DISABLED.
349 		 */
350 		if (dev->priv.health.fatal_error != MLX5_SENSOR_PCI_COMM_ERR)
351 			mlx5_core_warn(dev,
352 			    "NIC SW reset is already progress\n");
353 		else
354 			mlx5_core_warn(dev,
355 			    "Communication with FW over the PCI link is down\n");
356 	} else {
357 		mlx5_core_warn(dev, "NIC mode %d\n", nic_mode);
358 	}
359 
360 	mlx5_disable_device(dev);
361 }
362 
/* NOTE(review): these two macros repeat the definitions above
 * mlx5_enter_error_state() with identical values (legal in C, but the
 * duplication is fragile if one copy is ever changed). */
#define MLX5_FW_RESET_WAIT_MS	1000
#define MLX5_NIC_STATE_POLL_MS	5
/*
 * Delayed-work handler that attempts full device recovery after a
 * fatal error.  Aborts (without recovering) if PCI reads still fail or
 * the NIC interface never reaches the disabled mode within
 * MLX5_FW_RESET_WAIT_MS.
 */
static void health_recover(struct work_struct *work)
{
	unsigned long end = jiffies + msecs_to_jiffies(MLX5_FW_RESET_WAIT_MS);
	struct mlx5_core_health *health;
	struct delayed_work *dwork;
	struct mlx5_core_dev *dev;
	struct mlx5_priv *priv;
	bool recover = true;
	u8 nic_mode;

	dwork = container_of(work, struct delayed_work, work);
	health = container_of(dwork, struct mlx5_core_health, recover_work);
	priv = container_of(health, struct mlx5_priv, health);
	dev = container_of(priv, struct mlx5_core_dev, priv);

	/* NOTE(review): original comment flagged this lock as possibly
	 * cut-and-paste; confirm whether bus topology locking is really
	 * required around mlx5_recover_device(). */
	bus_topo_lock();

	if (sensor_pci_no_comm(dev)) {
		mlx5_core_err(dev,
		    "health recovery flow aborted, PCI reads still not working\n");
		recover = false;
	}

	/* Wait (bounded) for FW to park the NIC interface in disabled mode. */
	nic_mode = mlx5_get_nic_state(dev);
	while (nic_mode != MLX5_NIC_IFC_DISABLED &&
	       !time_after(jiffies, end)) {
		msleep(MLX5_NIC_STATE_POLL_MS);
		nic_mode = mlx5_get_nic_state(dev);
	}

	if (nic_mode != MLX5_NIC_IFC_DISABLED) {
		mlx5_core_err(dev,
		    "health recovery flow aborted, unexpected NIC IFC mode %d.\n",
		    nic_mode);
		recover = false;
	}

	if (recover) {
		mlx5_core_info(dev, "Starting health recovery flow\n");
		mlx5_recover_device(dev);
	}

	bus_topo_unlock();
}
410 
411 /* How much time to wait until health resetting the driver (in msecs) */
412 #define MLX5_RECOVERY_DELAY_MSECS 60000
413 #define MLX5_RECOVERY_NO_DELAY 0
414 static unsigned long get_recovery_delay(struct mlx5_core_dev *dev)
415 {
416 	return dev->priv.health.fatal_error == MLX5_SENSOR_PCI_ERR ||
417 		dev->priv.health.fatal_error == MLX5_SENSOR_PCI_COMM_ERR	?
418 		MLX5_RECOVERY_DELAY_MSECS : MLX5_RECOVERY_NO_DELAY;
419 }
420 
/*
 * Work handler queued by mlx5_trigger_health_work() when a fatal error
 * is detected: disables the device and schedules delayed recovery,
 * unless recovery work has been drained (MLX5_DROP_NEW_RECOVERY_WORK).
 */
static void health_care(struct work_struct *work)
{
	struct mlx5_core_health *health;
	unsigned long recover_delay;
	struct mlx5_core_dev *dev;
	struct mlx5_priv *priv;
	unsigned long flags;

	health = container_of(work, struct mlx5_core_health, work);
	priv = container_of(health, struct mlx5_priv, health);
	dev = container_of(priv, struct mlx5_core_dev, priv);

	mlx5_core_warn(dev, "handling bad device here\n");
	mlx5_handle_bad_state(dev);
	recover_delay = msecs_to_jiffies(get_recovery_delay(dev));

	/* wq_lock serializes against the drain/teardown paths that set
	 * the drop bits. */
	spin_lock_irqsave(&health->wq_lock, flags);
	if (!test_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags)) {
		mlx5_core_warn(dev,
		    "Scheduling recovery work with %lums delay\n",
		    recover_delay);
		schedule_delayed_work(&health->recover_work, recover_delay);
	} else {
		mlx5_core_err(dev,
		    "new health works are not permitted at this stage\n");
	}
	spin_unlock_irqrestore(&health->wq_lock, flags);
}
449 
450 static int get_next_poll_jiffies(void)
451 {
452 	unsigned long next;
453 
454 	get_random_bytes(&next, sizeof(next));
455 	next %= HZ;
456 	next += jiffies + MLX5_HEALTH_POLL_INTERVAL;
457 
458 	return next;
459 }
460 
461 void mlx5_trigger_health_work(struct mlx5_core_dev *dev)
462 {
463 	struct mlx5_core_health *health = &dev->priv.health;
464 	unsigned long flags;
465 
466 	spin_lock_irqsave(&health->wq_lock, flags);
467 	if (!test_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags))
468 		queue_work(health->wq, &health->work);
469 	else
470 		mlx5_core_err(dev,
471 			"new health works are not permitted at this stage\n");
472 	spin_unlock_irqrestore(&health->wq_lock, flags);
473 }
474 
475 static const char *hsynd_str(u8 synd)
476 {
477 	switch (synd) {
478 	case MLX5_HEALTH_SYNDR_FW_ERR:
479 		return "firmware internal error";
480 	case MLX5_HEALTH_SYNDR_IRISC_ERR:
481 		return "irisc not responding";
482 	case MLX5_HEALTH_SYNDR_HW_UNRECOVERABLE_ERR:
483 		return "unrecoverable hardware error";
484 	case MLX5_HEALTH_SYNDR_CRC_ERR:
485 		return "firmware CRC error";
486 	case MLX5_HEALTH_SYNDR_FETCH_PCI_ERR:
487 		return "ICM fetch PCI error";
488 	case MLX5_HEALTH_SYNDR_HW_FTL_ERR:
489 		return "HW fatal error\n";
490 	case MLX5_HEALTH_SYNDR_ASYNC_EQ_OVERRUN_ERR:
491 		return "async EQ buffer overrun";
492 	case MLX5_HEALTH_SYNDR_EQ_ERR:
493 		return "EQ error";
494 	case MLX5_HEALTH_SYNDR_EQ_INV:
495 		return "Invalid EQ referenced";
496 	case MLX5_HEALTH_SYNDR_FFSER_ERR:
497 		return "FFSER error";
498 	case MLX5_HEALTH_SYNDR_HIGH_TEMP:
499 		return "High temperature";
500 	default:
501 		return "unrecognized error";
502 	}
503 }
504 
/*
 * Dump the device health buffer to the log and return the syndrome
 * byte.  Returns 0 without reading the rest of the buffer when FW is
 * not responding (synd == 0); callers use that to print a distinct
 * "FW unresponsive" message.
 */
static u8
print_health_info(struct mlx5_core_dev *dev)
{
	struct mlx5_core_health *health = &dev->priv.health;
	struct mlx5_health_buffer __iomem *h = health->health;
	u8 synd = ioread8(&h->synd);
	char fw_str[18];
	u32 fw;
	int i;

	/*
	 * If synd is 0x0 - this indicates that FW is unable to
	 * respond to initialization segment reads and health buffer
	 * should not be read.
	 */
	if (synd == 0)
		return (0);

	for (i = 0; i < ARRAY_SIZE(h->assert_var); i++)
		mlx5_core_info(dev, "assert_var[%d] 0x%08x\n", i,
		    ioread32be(h->assert_var + i));

	mlx5_core_info(dev, "assert_exit_ptr 0x%08x\n",
	    ioread32be(&h->assert_exit_ptr));
	mlx5_core_info(dev, "assert_callra 0x%08x\n",
	    ioread32be(&h->assert_callra));
	snprintf(fw_str, sizeof(fw_str), "%d.%d.%d",
	    fw_rev_maj(dev), fw_rev_min(dev), fw_rev_sub(dev));
	mlx5_core_info(dev, "fw_ver %s\n", fw_str);
	mlx5_core_info(dev, "hw_id 0x%08x\n", ioread32be(&h->hw_id));
	mlx5_core_info(dev, "irisc_index %d\n", ioread8(&h->irisc_index));
	mlx5_core_info(dev, "synd 0x%x: %s\n",
	    ioread8(&h->synd), hsynd_str(ioread8(&h->synd)));
	mlx5_core_info(dev, "ext_synd 0x%04x\n", ioread16be(&h->ext_synd));
	fw = ioread32be(&h->fw_ver);
	mlx5_core_info(dev, "raw fw_ver 0x%08x\n", fw);

	return synd;
}
544 
/*
 * Watchdog work handler: reads the PCIe slot power status (when the
 * MCAM capability advertises it) and logs any status transition.
 * Caches the last status in dev->pwr_status so each state is reported
 * only once.
 */
static void health_watchdog(struct work_struct *work)
{
	struct mlx5_core_dev *dev;
	u16 power;
	u8 status;
	int err;

	dev = container_of(work, struct mlx5_core_dev, priv.health.work_watchdog);

	if (!MLX5_CAP_GEN(dev, mcam_reg) ||
	    !MLX5_CAP_MCAM_FEATURE(dev, pcie_status_and_power))
		return;

	err = mlx5_pci_read_power_status(dev, &power, &status);
	if (err < 0) {
		mlx5_core_warn(dev, "Failed reading power status: %d\n",
		    err);
		return;
	}

	dev->pwr_value = power;

	/* Only log when the reported status actually changed. */
	if (dev->pwr_status != status) {

		switch (status) {
		case 0:
			dev->pwr_status = status;
			mlx5_core_info(dev,
			    "PCI power is not published by the PCIe slot.\n");
			break;
		case 1:
			dev->pwr_status = status;
			mlx5_core_info(dev,
			    "PCIe slot advertised sufficient power (%uW).\n",
			    power);
			break;
		case 2:
			dev->pwr_status = status;
			mlx5_core_warn(dev,
			    "Detected insufficient power on the PCIe slot (%uW).\n",
			    power);
			break;
		default:
			/* Unknown code: reset the cache so a later valid
			 * status is logged again. */
			dev->pwr_status = 0;
			mlx5_core_warn(dev,
			    "Unknown power state detected(%d).\n",
			    status);
			break;
		}
	}
}
596 
597 void
598 mlx5_trigger_health_watchdog(struct mlx5_core_dev *dev)
599 {
600 	struct mlx5_core_health *health = &dev->priv.health;
601 	unsigned long flags;
602 
603 	spin_lock_irqsave(&health->wq_lock, flags);
604 	if (!test_bit(MLX5_DROP_NEW_WATCHDOG_WORK, &health->flags))
605 		queue_work(health->wq_watchdog, &health->work_watchdog);
606 	else
607 		mlx5_core_err(dev,
608 		    "scheduling watchdog is not permitted at this stage\n");
609 	spin_unlock_irqrestore(&health->wq_lock, flags);
610 }
611 
/*
 * Periodic health-poll timer callback.  Detects a stalled FW by
 * watching the health counter (MAX_MISSES identical reads in a row),
 * probes the fatal-error sensors, and kicks health_care() work on the
 * first fatal error seen.  Re-arms itself with jittered scheduling.
 */
static void poll_health(unsigned long data)
{
	struct mlx5_core_dev *dev = (struct mlx5_core_dev *)data;
	struct mlx5_core_health *health = &dev->priv.health;
	u32 fatal_error;
	u32 count;

	/* Stop polling (and do not re-arm) once the device leaves UP. */
	if (dev->state != MLX5_DEVICE_STATE_UP)
		return;

	count = ioread32be(health->health_counter);
	if (count == health->prev)
		++health->miss_counter;
	else
		health->miss_counter = 0;

	health->prev = count;
	/* == (not >=) so the miss report is emitted exactly once. */
	if (health->miss_counter == MAX_MISSES) {
		mlx5_core_err(dev, "device's health compromised - reached miss count\n");
		if (print_health_info(dev) == 0)
			mlx5_core_err(dev, "FW is unable to respond to initialization segment reads\n");
	}

	fatal_error = check_fatal_sensors(dev);

	/* Only the first transition into a fatal state triggers the
	 * health work; health->fatal_error latches it. */
	if (fatal_error && !health->fatal_error) {
		mlx5_core_err(dev,
		    "Fatal error %u detected\n", fatal_error);
		dev->priv.health.fatal_error = fatal_error;
		print_health_info(dev);
		mlx5_trigger_health_work(dev);
	}

	mod_timer(&health->timer, get_next_poll_jiffies());
}
647 
/*
 * Reset per-device health state, arm the periodic poll timer, and run
 * one initial watchdog pass to capture the PCI power state.
 */
void mlx5_start_health_poll(struct mlx5_core_dev *dev)
{
	struct mlx5_core_health *health = &dev->priv.health;

	/* NOTE(review): setup_timer() below also initializes the timer,
	 * so this init_timer() looks redundant — confirm against the
	 * linuxkpi implementation before removing. */
	init_timer(&health->timer);
	health->fatal_error = MLX5_SENSOR_NO_ERR;
	clear_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
	clear_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags);
	clear_bit(MLX5_DROP_NEW_WATCHDOG_WORK, &health->flags);
	health->health = &dev->iseg->health;
	health->health_counter = &dev->iseg->health_counter;

	setup_timer(&health->timer, poll_health, (unsigned long)dev);
	mod_timer(&health->timer,
		  round_jiffies(jiffies + MLX5_HEALTH_POLL_INTERVAL));

	/* do initial PCI power state readout */
	mlx5_trigger_health_watchdog(dev);
}
667 
/*
 * Stop the periodic health poll.  When @disable_health is set, also
 * raise all three drop bits first so no new health/recovery/watchdog
 * work can be queued after the timer is gone.
 */
void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health)
{
	struct mlx5_core_health *health = &dev->priv.health;
	unsigned long flags;

	if (disable_health) {
		spin_lock_irqsave(&health->wq_lock, flags);
		set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
		set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags);
		set_bit(MLX5_DROP_NEW_WATCHDOG_WORK, &health->flags);
		spin_unlock_irqrestore(&health->wq_lock, flags);
	}

	/* Wait for a running poll_health() to finish before returning. */
	del_timer_sync(&health->timer);
}
683 
/*
 * Drain all health-related work: first raise the drop bits (under
 * wq_lock) so no new work can be queued, then synchronously cancel
 * whatever is already queued or running.  The ordering prevents a
 * cancel/requeue race with the work handlers.
 */
void mlx5_drain_health_wq(struct mlx5_core_dev *dev)
{
	struct mlx5_core_health *health = &dev->priv.health;
	unsigned long flags;

	spin_lock_irqsave(&health->wq_lock, flags);
	set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
	set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags);
	set_bit(MLX5_DROP_NEW_WATCHDOG_WORK, &health->flags);
	spin_unlock_irqrestore(&health->wq_lock, flags);
	cancel_delayed_work_sync(&health->recover_work);
	cancel_work_sync(&health->work);
	cancel_work_sync(&health->work_watchdog);
}
698 
699 void mlx5_drain_health_recovery(struct mlx5_core_dev *dev)
700 {
701 	struct mlx5_core_health *health = &dev->priv.health;
702 	unsigned long flags;
703 
704 	spin_lock_irqsave(&health->wq_lock, flags);
705 	set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags);
706 	spin_unlock_irqrestore(&health->wq_lock, flags);
707 	cancel_delayed_work_sync(&dev->priv.health.recover_work);
708 }
709 
/*
 * Free the three health workqueues created by mlx5_health_init().
 * Callers must have drained all health work first (mlx5_drain_health_wq).
 */
void mlx5_health_cleanup(struct mlx5_core_dev *dev)
{
	struct mlx5_core_health *health = &dev->priv.health;

	destroy_workqueue(health->wq);
	destroy_workqueue(health->wq_watchdog);
	destroy_workqueue(health->wq_cmd);
}
718 
/*
 * Create the health subsystem's three single-threaded workqueues
 * (recovery, watchdog, command-completion) and initialize the work
 * items and lock.  Returns 0 on success or -ENOMEM, tearing down any
 * queues already created via the goto-cleanup chain.
 */
int mlx5_health_init(struct mlx5_core_dev *dev)
{
	struct mlx5_core_health *health;
	char name[64];

	health = &dev->priv.health;

	snprintf(name, sizeof(name), "%s-rec", dev_name(&dev->pdev->dev));
	health->wq = create_singlethread_workqueue(name);
	if (!health->wq)
		goto err_recovery;

	snprintf(name, sizeof(name), "%s-wdg", dev_name(&dev->pdev->dev));
	health->wq_watchdog = create_singlethread_workqueue(name);
	if (!health->wq_watchdog)
		goto err_watchdog;

	snprintf(name, sizeof(name), "%s-cmd", dev_name(&dev->pdev->dev));
	health->wq_cmd = create_singlethread_workqueue(name);
	if (!health->wq_cmd)
		goto err_cmd;

	spin_lock_init(&health->wq_lock);
	INIT_WORK(&health->work, health_care);
	INIT_WORK(&health->work_watchdog, health_watchdog);
	INIT_WORK(&health->work_cmd_completion, mlx5_trigger_cmd_completions);
	INIT_DELAYED_WORK(&health->recover_work, health_recover);

	return 0;

err_cmd:
	destroy_workqueue(health->wq_watchdog);
err_watchdog:
	destroy_workqueue(health->wq);
err_recovery:
	return -ENOMEM;
}
756