xref: /freebsd/sys/dev/mlx5/mlx5_core/mlx5_health.c (revision c9bb26ae)
1dc7e38acSHans Petter Selasky /*-
28d1eeedbSHans Petter Selasky  * Copyright (c) 2013-2019, Mellanox Technologies, Ltd.  All rights reserved.
3dc7e38acSHans Petter Selasky  *
4dc7e38acSHans Petter Selasky  * Redistribution and use in source and binary forms, with or without
5dc7e38acSHans Petter Selasky  * modification, are permitted provided that the following conditions
6dc7e38acSHans Petter Selasky  * are met:
7dc7e38acSHans Petter Selasky  * 1. Redistributions of source code must retain the above copyright
8dc7e38acSHans Petter Selasky  *    notice, this list of conditions and the following disclaimer.
9dc7e38acSHans Petter Selasky  * 2. Redistributions in binary form must reproduce the above copyright
10dc7e38acSHans Petter Selasky  *    notice, this list of conditions and the following disclaimer in the
11dc7e38acSHans Petter Selasky  *    documentation and/or other materials provided with the distribution.
12dc7e38acSHans Petter Selasky  *
13dc7e38acSHans Petter Selasky  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
14dc7e38acSHans Petter Selasky  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15dc7e38acSHans Petter Selasky  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16dc7e38acSHans Petter Selasky  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
17dc7e38acSHans Petter Selasky  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18dc7e38acSHans Petter Selasky  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19dc7e38acSHans Petter Selasky  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20dc7e38acSHans Petter Selasky  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21dc7e38acSHans Petter Selasky  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22dc7e38acSHans Petter Selasky  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23dc7e38acSHans Petter Selasky  * SUCH DAMAGE.
24dc7e38acSHans Petter Selasky  *
25dc7e38acSHans Petter Selasky  * $FreeBSD$
26dc7e38acSHans Petter Selasky  */
27dc7e38acSHans Petter Selasky 
28dc7e38acSHans Petter Selasky #include <linux/kernel.h>
29dc7e38acSHans Petter Selasky #include <linux/module.h>
30dc7e38acSHans Petter Selasky #include <linux/random.h>
31dc7e38acSHans Petter Selasky #include <linux/vmalloc.h>
32a2485fe5SHans Petter Selasky #include <linux/hardirq.h>
331900b6f8SHans Petter Selasky #include <linux/delay.h>
34dc7e38acSHans Petter Selasky #include <dev/mlx5/driver.h>
35dc7e38acSHans Petter Selasky #include <dev/mlx5/mlx5_ifc.h>
36dc7e38acSHans Petter Selasky #include "mlx5_core.h"
37dc7e38acSHans Petter Selasky 
/*
 * Fixed part of the delay until the next health poll, in jiffies (two
 * seconds); get_next_poll_jiffies() adds a random jitter on top of it.
 */
#define	MLX5_HEALTH_POLL_INTERVAL	(2 * HZ)
/*
 * NOTE(review): not referenced in the visible part of this file; presumably
 * the number of consecutive missed health-counter updates that is tolerated
 * before the device is treated as failed -- confirm against the poll routine.
 */
#define	MAX_MISSES			3
40dc7e38acSHans Petter Selasky 
/*
 * Bit numbers tested in mlx5_core_health.flags.  While a bit is set, the
 * matching kind of work is refused instead of being queued.
 */
enum {
	/* Checked by mlx5_trigger_health_work() before queueing health work. */
	MLX5_DROP_NEW_HEALTH_WORK = 0,
	/* Checked by health_care() before scheduling the recovery work. */
	MLX5_DROP_NEW_RECOVERY_WORK = 1,
	/* NOTE(review): presumably guards the watchdog work the same way. */
	MLX5_DROP_NEW_WATCHDOG_WORK = 2,
};
46ca551594SHans Petter Selasky 
/*
 * Result codes of check_fatal_sensors().  The sensors are probed in the
 * order listed here and the first one that fires determines the result.
 */
enum {
	/* No sensor reported a problem. */
	MLX5_SENSOR_NO_ERR = 0,
	/* The firmware version word of the health buffer reads 0xffffffff. */
	MLX5_SENSOR_PCI_COMM_ERR = 1,
	/* pci_channel_offline() reports the PCI channel as offline. */
	MLX5_SENSOR_PCI_ERR = 2,
	/* The NIC interface state is MLX5_NIC_IFC_DISABLED. */
	MLX5_SENSOR_NIC_DISABLED = 3,
	/* The NIC interface state is MLX5_NIC_IFC_SW_RESET. */
	MLX5_SENSOR_NIC_SW_RESET = 4,
	/* Both the RFR field and the syndrome of the health buffer are set. */
	MLX5_SENSOR_FW_SYND_RFR = 5,
};
551900b6f8SHans Petter Selasky 
5629e54451SSlava Shwartsman static int mlx5_fw_reset_enable = 1;
5729e54451SSlava Shwartsman SYSCTL_INT(_hw_mlx5, OID_AUTO, fw_reset_enable, CTLFLAG_RWTUN,
5829e54451SSlava Shwartsman     &mlx5_fw_reset_enable, 0,
5929e54451SSlava Shwartsman     "Enable firmware reset");
6029e54451SSlava Shwartsman 
615169fb81SHans Petter Selasky static unsigned int sw_reset_to = 1200;
625169fb81SHans Petter Selasky SYSCTL_UINT(_hw_mlx5, OID_AUTO, sw_reset_timeout, CTLFLAG_RWTUN,
635169fb81SHans Petter Selasky     &sw_reset_to, 0,
645169fb81SHans Petter Selasky     "Minimum timeout in seconds between two firmware resets");
655169fb81SHans Petter Selasky 
665169fb81SHans Petter Selasky 
67b575d8c8SHans Petter Selasky static int lock_sem_sw_reset(struct mlx5_core_dev *dev)
68f20b553dSHans Petter Selasky {
69b575d8c8SHans Petter Selasky 	int ret;
70f20b553dSHans Petter Selasky 
71f20b553dSHans Petter Selasky 	/* Lock GW access */
72b575d8c8SHans Petter Selasky 	ret = -mlx5_vsc_lock(dev);
73f20b553dSHans Petter Selasky 	if (ret) {
74b575d8c8SHans Petter Selasky 		mlx5_core_warn(dev, "Timed out locking gateway %d\n", ret);
75f20b553dSHans Petter Selasky 		return ret;
76f20b553dSHans Petter Selasky 	}
77f20b553dSHans Petter Selasky 
78b575d8c8SHans Petter Selasky 	ret = -mlx5_vsc_lock_addr_space(dev, MLX5_SEMAPHORE_SW_RESET);
79b575d8c8SHans Petter Selasky 	if (ret) {
80f20b553dSHans Petter Selasky 		if (ret == -EBUSY)
81f20b553dSHans Petter Selasky 			mlx5_core_dbg(dev, "SW reset FW semaphore already locked, another function will handle the reset\n");
82f20b553dSHans Petter Selasky 		else
83f20b553dSHans Petter Selasky 			mlx5_core_warn(dev, "SW reset semaphore lock return %d\n", ret);
84f20b553dSHans Petter Selasky 	}
85f20b553dSHans Petter Selasky 
86f20b553dSHans Petter Selasky 	/* Unlock GW access */
87b575d8c8SHans Petter Selasky 	mlx5_vsc_unlock(dev);
88b575d8c8SHans Petter Selasky 
89b575d8c8SHans Petter Selasky 	return ret;
90b575d8c8SHans Petter Selasky }
91b575d8c8SHans Petter Selasky 
92b575d8c8SHans Petter Selasky static int unlock_sem_sw_reset(struct mlx5_core_dev *dev)
93b575d8c8SHans Petter Selasky {
94b575d8c8SHans Petter Selasky 	int ret;
95b575d8c8SHans Petter Selasky 
96b575d8c8SHans Petter Selasky 	/* Lock GW access */
97b575d8c8SHans Petter Selasky 	ret = -mlx5_vsc_lock(dev);
98b575d8c8SHans Petter Selasky 	if (ret) {
99b575d8c8SHans Petter Selasky 		mlx5_core_warn(dev, "Timed out locking gateway %d\n", ret);
100b575d8c8SHans Petter Selasky 		return ret;
101b575d8c8SHans Petter Selasky 	}
102b575d8c8SHans Petter Selasky 
103b575d8c8SHans Petter Selasky 	ret = -mlx5_vsc_unlock_addr_space(dev, MLX5_SEMAPHORE_SW_RESET);
104b575d8c8SHans Petter Selasky 
105b575d8c8SHans Petter Selasky 	/* Unlock GW access */
106b575d8c8SHans Petter Selasky 	mlx5_vsc_unlock(dev);
107f20b553dSHans Petter Selasky 
108f20b553dSHans Petter Selasky 	return ret;
109f20b553dSHans Petter Selasky }
110f20b553dSHans Petter Selasky 
111ba11bcecSHans Petter Selasky u8 mlx5_get_nic_state(struct mlx5_core_dev *dev)
112a2485fe5SHans Petter Selasky {
1131900b6f8SHans Petter Selasky 	return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 7;
114a2485fe5SHans Petter Selasky }
115a2485fe5SHans Petter Selasky 
116ba11bcecSHans Petter Selasky void mlx5_set_nic_state(struct mlx5_core_dev *dev, u8 state)
117ba11bcecSHans Petter Selasky {
118ba11bcecSHans Petter Selasky 	u32 cur_cmdq_addr_l_sz;
119ba11bcecSHans Petter Selasky 
120ba11bcecSHans Petter Selasky 	cur_cmdq_addr_l_sz = ioread32be(&dev->iseg->cmdq_addr_l_sz);
121ba11bcecSHans Petter Selasky 	iowrite32be((cur_cmdq_addr_l_sz & 0xFFFFF000) |
122ba11bcecSHans Petter Selasky 		    state << MLX5_NIC_IFC_OFFSET,
123ba11bcecSHans Petter Selasky 		    &dev->iseg->cmdq_addr_l_sz);
124ba11bcecSHans Petter Selasky }
125ba11bcecSHans Petter Selasky 
126fe242ba7SHans Petter Selasky static bool sensor_fw_synd_rfr(struct mlx5_core_dev *dev)
127fe242ba7SHans Petter Selasky {
128fe242ba7SHans Petter Selasky 	struct mlx5_core_health *health = &dev->priv.health;
129fe242ba7SHans Petter Selasky 	struct mlx5_health_buffer __iomem *h = health->health;
130fe242ba7SHans Petter Selasky 	u32 rfr = ioread32be(&h->rfr) >> MLX5_RFR_OFFSET;
131fe242ba7SHans Petter Selasky 	u8 synd = ioread8(&h->synd);
132fe242ba7SHans Petter Selasky 
133fe242ba7SHans Petter Selasky 	if (rfr && synd)
134fe242ba7SHans Petter Selasky 		mlx5_core_dbg(dev, "FW requests reset, synd: %d\n", synd);
135fe242ba7SHans Petter Selasky 	return rfr && synd;
136fe242ba7SHans Petter Selasky }
137fe242ba7SHans Petter Selasky 
138a0a4fd77SHans Petter Selasky static void mlx5_trigger_cmd_completions(struct work_struct *work)
139a2485fe5SHans Petter Selasky {
140a0a4fd77SHans Petter Selasky 	struct mlx5_core_dev *dev =
141a0a4fd77SHans Petter Selasky 	    container_of(work, struct mlx5_core_dev, priv.health.work_cmd_completion);
142a2485fe5SHans Petter Selasky 	unsigned long flags;
143a2485fe5SHans Petter Selasky 	u64 vector;
144a2485fe5SHans Petter Selasky 
145a2485fe5SHans Petter Selasky 	/* wait for pending handlers to complete */
146a2485fe5SHans Petter Selasky 	synchronize_irq(dev->priv.msix_arr[MLX5_EQ_VEC_CMD].vector);
147a2485fe5SHans Petter Selasky 	spin_lock_irqsave(&dev->cmd.alloc_lock, flags);
148a2485fe5SHans Petter Selasky 	vector = ~dev->cmd.bitmask & ((1ul << (1 << dev->cmd.log_sz)) - 1);
149a2485fe5SHans Petter Selasky 	if (!vector)
150a2485fe5SHans Petter Selasky 		goto no_trig;
151a2485fe5SHans Petter Selasky 
152a2485fe5SHans Petter Selasky 	vector |= MLX5_TRIGGERED_CMD_COMP;
153a2485fe5SHans Petter Selasky 	spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags);
154a2485fe5SHans Petter Selasky 
1552cec1528SKonstantin Belousov 	mlx5_core_dbg(dev, "vector 0x%jx\n", (uintmax_t)vector);
156721a1a6aSSlava Shwartsman 	mlx5_cmd_comp_handler(dev, vector, MLX5_CMD_MODE_EVENTS);
157a2485fe5SHans Petter Selasky 	return;
158a2485fe5SHans Petter Selasky 
159a2485fe5SHans Petter Selasky no_trig:
160a2485fe5SHans Petter Selasky 	spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags);
161a2485fe5SHans Petter Selasky }
162a2485fe5SHans Petter Selasky 
1631900b6f8SHans Petter Selasky static bool sensor_pci_no_comm(struct mlx5_core_dev *dev)
164a2485fe5SHans Petter Selasky {
165a2485fe5SHans Petter Selasky 	struct mlx5_core_health *health = &dev->priv.health;
166a2485fe5SHans Petter Selasky 	struct mlx5_health_buffer __iomem *h = health->health;
1671900b6f8SHans Petter Selasky 	bool err = ioread32be(&h->fw_ver) == 0xffffffff;
168a2485fe5SHans Petter Selasky 
1691900b6f8SHans Petter Selasky 	return err;
1701900b6f8SHans Petter Selasky }
171a2485fe5SHans Petter Selasky 
1721900b6f8SHans Petter Selasky static bool sensor_nic_disabled(struct mlx5_core_dev *dev)
1731900b6f8SHans Petter Selasky {
174ba11bcecSHans Petter Selasky 	return mlx5_get_nic_state(dev) == MLX5_NIC_IFC_DISABLED;
1751900b6f8SHans Petter Selasky }
176a2485fe5SHans Petter Selasky 
1771900b6f8SHans Petter Selasky static bool sensor_nic_sw_reset(struct mlx5_core_dev *dev)
1781900b6f8SHans Petter Selasky {
179ba11bcecSHans Petter Selasky 	return mlx5_get_nic_state(dev) == MLX5_NIC_IFC_SW_RESET;
1801900b6f8SHans Petter Selasky }
1811900b6f8SHans Petter Selasky 
1821900b6f8SHans Petter Selasky static u32 check_fatal_sensors(struct mlx5_core_dev *dev)
1831900b6f8SHans Petter Selasky {
1841900b6f8SHans Petter Selasky 	if (sensor_pci_no_comm(dev))
1851900b6f8SHans Petter Selasky 		return MLX5_SENSOR_PCI_COMM_ERR;
1861900b6f8SHans Petter Selasky 	if (pci_channel_offline(dev->pdev))
1871900b6f8SHans Petter Selasky 		return MLX5_SENSOR_PCI_ERR;
1881900b6f8SHans Petter Selasky 	if (sensor_nic_disabled(dev))
1891900b6f8SHans Petter Selasky 		return MLX5_SENSOR_NIC_DISABLED;
1901900b6f8SHans Petter Selasky 	if (sensor_nic_sw_reset(dev))
1911900b6f8SHans Petter Selasky 		return MLX5_SENSOR_NIC_SW_RESET;
192fe242ba7SHans Petter Selasky 	if (sensor_fw_synd_rfr(dev))
193fe242ba7SHans Petter Selasky 		return MLX5_SENSOR_FW_SYND_RFR;
1941900b6f8SHans Petter Selasky 
1951900b6f8SHans Petter Selasky 	return MLX5_SENSOR_NO_ERR;
196a2485fe5SHans Petter Selasky }
197a2485fe5SHans Petter Selasky 
198fe242ba7SHans Petter Selasky static void reset_fw_if_needed(struct mlx5_core_dev *dev)
199fe242ba7SHans Petter Selasky {
20029e54451SSlava Shwartsman 	bool supported;
201fe242ba7SHans Petter Selasky 	u32 cmdq_addr, fatal_error;
202fe242ba7SHans Petter Selasky 
20329e54451SSlava Shwartsman 	if (!mlx5_fw_reset_enable)
20429e54451SSlava Shwartsman 		return;
20529e54451SSlava Shwartsman 	supported = (ioread32be(&dev->iseg->initializing) >>
20629e54451SSlava Shwartsman 	    MLX5_FW_RESET_SUPPORTED_OFFSET) & 1;
207fe242ba7SHans Petter Selasky 	if (!supported)
208fe242ba7SHans Petter Selasky 		return;
209fe242ba7SHans Petter Selasky 
210fe242ba7SHans Petter Selasky 	/* The reset only needs to be issued by one PF. The health buffer is
211fe242ba7SHans Petter Selasky 	 * shared between all functions, and will be cleared during a reset.
212fe242ba7SHans Petter Selasky 	 * Check again to avoid a redundant 2nd reset. If the fatal erros was
213fe242ba7SHans Petter Selasky 	 * PCI related a reset won't help.
214fe242ba7SHans Petter Selasky 	 */
215fe242ba7SHans Petter Selasky 	fatal_error = check_fatal_sensors(dev);
216fe242ba7SHans Petter Selasky 	if (fatal_error == MLX5_SENSOR_PCI_COMM_ERR ||
217fe242ba7SHans Petter Selasky 	    fatal_error == MLX5_SENSOR_NIC_DISABLED ||
218d28b6b55SHans Petter Selasky 	    fatal_error == MLX5_SENSOR_NIC_SW_RESET) {
2194950c6ecSHans Petter Selasky 		mlx5_core_warn(dev, "Not issuing FW reset. Either it's already done or won't help.\n");
220fe242ba7SHans Petter Selasky 		return;
221fe242ba7SHans Petter Selasky 	}
222fe242ba7SHans Petter Selasky 
223fe242ba7SHans Petter Selasky 	mlx5_core_warn(dev, "Issuing FW Reset\n");
224fe242ba7SHans Petter Selasky 	/* Write the NIC interface field to initiate the reset, the command
225fe242ba7SHans Petter Selasky 	 * interface address also resides here, don't overwrite it.
226fe242ba7SHans Petter Selasky 	 */
227fe242ba7SHans Petter Selasky 	cmdq_addr = ioread32be(&dev->iseg->cmdq_addr_l_sz);
228fe242ba7SHans Petter Selasky 	iowrite32be((cmdq_addr & 0xFFFFF000) |
229fe242ba7SHans Petter Selasky 		    MLX5_NIC_IFC_SW_RESET << MLX5_NIC_IFC_OFFSET,
230fe242ba7SHans Petter Selasky 		    &dev->iseg->cmdq_addr_l_sz);
231fe242ba7SHans Petter Selasky }
232fe242ba7SHans Petter Selasky 
2335169fb81SHans Petter Selasky static bool
2345169fb81SHans Petter Selasky mlx5_health_allow_reset(struct mlx5_core_dev *dev)
2355169fb81SHans Petter Selasky {
2365169fb81SHans Petter Selasky 	struct mlx5_core_health *health = &dev->priv.health;
2375169fb81SHans Petter Selasky 	unsigned int delta;
2385169fb81SHans Petter Selasky 	bool ret;
2395169fb81SHans Petter Selasky 
2405169fb81SHans Petter Selasky 	if (health->last_reset_req != 0) {
2415169fb81SHans Petter Selasky 		delta = ticks - health->last_reset_req;
2425169fb81SHans Petter Selasky 		delta /= hz;
2435169fb81SHans Petter Selasky 		ret = delta >= sw_reset_to;
2445169fb81SHans Petter Selasky 	} else {
2455169fb81SHans Petter Selasky 		ret = true;
2465169fb81SHans Petter Selasky 	}
2475169fb81SHans Petter Selasky 
2485169fb81SHans Petter Selasky 	/*
2495169fb81SHans Petter Selasky 	 * In principle, ticks may be 0. Setting it to off by one (-1)
2505169fb81SHans Petter Selasky 	 * to prevent certain reset in next request.
2515169fb81SHans Petter Selasky 	 */
2525169fb81SHans Petter Selasky 	health->last_reset_req = ticks ? : -1;
2535169fb81SHans Petter Selasky 	if (!ret)
2545169fb81SHans Petter Selasky 		mlx5_core_warn(dev, "Firmware reset elided due to "
2555169fb81SHans Petter Selasky 		    "auto-reset frequency threshold.\n");
2565169fb81SHans Petter Selasky 	return (ret);
2575169fb81SHans Petter Selasky }
2585169fb81SHans Petter Selasky 
259d28b6b55SHans Petter Selasky #define MLX5_CRDUMP_WAIT_MS	60000
260d28b6b55SHans Petter Selasky #define MLX5_FW_RESET_WAIT_MS	1000
261d28b6b55SHans Petter Selasky #define MLX5_NIC_STATE_POLL_MS	5
262c0902569SHans Petter Selasky void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force)
263a2485fe5SHans Petter Selasky {
264c2a1e807SHans Petter Selasky 	int end, delay_ms = MLX5_CRDUMP_WAIT_MS;
265d28b6b55SHans Petter Selasky 	u32 fatal_error;
266d28b6b55SHans Petter Selasky 	int lock = -EBUSY;
267d28b6b55SHans Petter Selasky 
268d28b6b55SHans Petter Selasky 	fatal_error = check_fatal_sensors(dev);
269d28b6b55SHans Petter Selasky 
270d28b6b55SHans Petter Selasky 	if (fatal_error || force) {
2716d54b22dSHans Petter Selasky 		if (xchg(&dev->state, MLX5_DEVICE_STATE_INTERNAL_ERROR) ==
2726d54b22dSHans Petter Selasky 		    MLX5_DEVICE_STATE_INTERNAL_ERROR)
2736d54b22dSHans Petter Selasky 			return;
2741fb6089cSHans Petter Selasky 		if (!force)
2751fb6089cSHans Petter Selasky 			mlx5_core_err(dev, "internal state error detected\n");
276a0a4fd77SHans Petter Selasky 
277a0a4fd77SHans Petter Selasky 		/*
278a0a4fd77SHans Petter Selasky 		 * Queue the command completion handler on the command
279a0a4fd77SHans Petter Selasky 		 * work queue to avoid racing with the real command
280a0a4fd77SHans Petter Selasky 		 * completion handler and then wait for it to
281a0a4fd77SHans Petter Selasky 		 * complete:
282a0a4fd77SHans Petter Selasky 		 */
2838d1eeedbSHans Petter Selasky 		queue_work(dev->priv.health.wq_cmd, &dev->priv.health.work_cmd_completion);
2848d1eeedbSHans Petter Selasky 		flush_workqueue(dev->priv.health.wq_cmd);
2857053deebSHans Petter Selasky 	}
286a2485fe5SHans Petter Selasky 
2876d54b22dSHans Petter Selasky 	mutex_lock(&dev->intf_state_mutex);
2886d54b22dSHans Petter Selasky 
289d28b6b55SHans Petter Selasky 	if (force)
290d28b6b55SHans Petter Selasky 		goto err_state_done;
291d28b6b55SHans Petter Selasky 
2925169fb81SHans Petter Selasky 	if (fatal_error == MLX5_SENSOR_FW_SYND_RFR &&
2935169fb81SHans Petter Selasky 	    mlx5_health_allow_reset(dev)) {
29492d23c82SHans Petter Selasky 		/* Get cr-dump and reset FW semaphore */
295d28b6b55SHans Petter Selasky 		if (mlx5_core_is_pf(dev))
296b575d8c8SHans Petter Selasky 			lock = lock_sem_sw_reset(dev);
297d28b6b55SHans Petter Selasky 
29892d23c82SHans Petter Selasky 		/* Execute cr-dump and SW reset */
299d28b6b55SHans Petter Selasky 		if (lock != -EBUSY) {
30092d23c82SHans Petter Selasky 			mlx5_fwdump(dev);
301d28b6b55SHans Petter Selasky 			reset_fw_if_needed(dev);
302d28b6b55SHans Petter Selasky 			delay_ms = MLX5_FW_RESET_WAIT_MS;
303d28b6b55SHans Petter Selasky 		}
304d28b6b55SHans Petter Selasky 	}
305d28b6b55SHans Petter Selasky 
306d28b6b55SHans Petter Selasky 	/* Recover from SW reset */
307d28b6b55SHans Petter Selasky 	end = jiffies + msecs_to_jiffies(delay_ms);
308d28b6b55SHans Petter Selasky 	do {
309d28b6b55SHans Petter Selasky 		if (sensor_nic_disabled(dev))
310d28b6b55SHans Petter Selasky 			break;
311d28b6b55SHans Petter Selasky 
312d28b6b55SHans Petter Selasky 		msleep(MLX5_NIC_STATE_POLL_MS);
313d28b6b55SHans Petter Selasky 	} while (!time_after(jiffies, end));
314d28b6b55SHans Petter Selasky 
315d28b6b55SHans Petter Selasky 	if (!sensor_nic_disabled(dev)) {
316c2a1e807SHans Petter Selasky 		dev_err(&dev->pdev->dev, "NIC IFC still %d after %ums.\n",
317ba11bcecSHans Petter Selasky 			mlx5_get_nic_state(dev), delay_ms);
318d28b6b55SHans Petter Selasky 	}
319d28b6b55SHans Petter Selasky 
320d28b6b55SHans Petter Selasky 	/* Release FW semaphore if you are the lock owner */
321d28b6b55SHans Petter Selasky 	if (!lock)
322b575d8c8SHans Petter Selasky 		unlock_sem_sw_reset(dev);
323d28b6b55SHans Petter Selasky 
3241fb6089cSHans Petter Selasky 	mlx5_core_err(dev, "system error event triggered\n");
3257053deebSHans Petter Selasky 
326d28b6b55SHans Petter Selasky err_state_done:
327843a89d3SSlava Shwartsman 	mlx5_core_event(dev, MLX5_DEV_EVENT_SYS_ERROR, 1);
3287053deebSHans Petter Selasky 	mutex_unlock(&dev->intf_state_mutex);
329a2485fe5SHans Petter Selasky }
330a2485fe5SHans Petter Selasky 
331a2485fe5SHans Petter Selasky static void mlx5_handle_bad_state(struct mlx5_core_dev *dev)
332a2485fe5SHans Petter Selasky {
333ba11bcecSHans Petter Selasky 	u8 nic_mode = mlx5_get_nic_state(dev);
334a2485fe5SHans Petter Selasky 
3351900b6f8SHans Petter Selasky 	if (nic_mode == MLX5_NIC_IFC_SW_RESET) {
3361900b6f8SHans Petter Selasky 		/* The IFC mode field is 3 bits, so it will read 0x7 in two cases:
3371900b6f8SHans Petter Selasky 		 * 1. PCI has been disabled (ie. PCI-AER, PF driver unloaded
3381900b6f8SHans Petter Selasky 		 *    and this is a VF), this is not recoverable by SW reset.
3391900b6f8SHans Petter Selasky 		 *    Logging of this is handled elsewhere.
3401900b6f8SHans Petter Selasky 		 * 2. FW reset has been issued by another function, driver can
3411900b6f8SHans Petter Selasky 		 *    be reloaded to recover after the mode switches to
3421900b6f8SHans Petter Selasky 		 *    MLX5_NIC_IFC_DISABLED.
3431900b6f8SHans Petter Selasky 		 */
3441900b6f8SHans Petter Selasky 		if (dev->priv.health.fatal_error != MLX5_SENSOR_PCI_COMM_ERR)
3451900b6f8SHans Petter Selasky 			mlx5_core_warn(dev, "NIC SW reset is already progress\n");
3461900b6f8SHans Petter Selasky 		else
3471900b6f8SHans Petter Selasky 			mlx5_core_warn(dev, "Communication with FW over the PCI link is down\n");
3481900b6f8SHans Petter Selasky 	} else {
3491900b6f8SHans Petter Selasky 		mlx5_core_warn(dev, "NIC mode %d\n", nic_mode);
350a2485fe5SHans Petter Selasky 	}
351a2485fe5SHans Petter Selasky 
352a2485fe5SHans Petter Selasky 	mlx5_disable_device(dev);
353a2485fe5SHans Petter Selasky }
354dc7e38acSHans Petter Selasky 
3551900b6f8SHans Petter Selasky #define MLX5_FW_RESET_WAIT_MS	1000
3561900b6f8SHans Petter Selasky #define MLX5_NIC_STATE_POLL_MS	5
3574bb7662bSHans Petter Selasky static void health_recover(struct work_struct *work)
3584bb7662bSHans Petter Selasky {
3591900b6f8SHans Petter Selasky 	unsigned long end = jiffies + msecs_to_jiffies(MLX5_FW_RESET_WAIT_MS);
3604bb7662bSHans Petter Selasky 	struct mlx5_core_health *health;
3614bb7662bSHans Petter Selasky 	struct delayed_work *dwork;
3624bb7662bSHans Petter Selasky 	struct mlx5_core_dev *dev;
3634bb7662bSHans Petter Selasky 	struct mlx5_priv *priv;
364f20b553dSHans Petter Selasky 	bool recover = true;
3651900b6f8SHans Petter Selasky 	u8 nic_mode;
3664bb7662bSHans Petter Selasky 
3674bb7662bSHans Petter Selasky 	dwork = container_of(work, struct delayed_work, work);
3684bb7662bSHans Petter Selasky 	health = container_of(dwork, struct mlx5_core_health, recover_work);
3694bb7662bSHans Petter Selasky 	priv = container_of(health, struct mlx5_priv, health);
3704bb7662bSHans Petter Selasky 	dev = container_of(priv, struct mlx5_core_dev, priv);
3714bb7662bSHans Petter Selasky 
372ca2345a0SHans Petter Selasky 	mtx_lock(&Giant);	/* XXX newbus needs this */
373ca2345a0SHans Petter Selasky 
3741900b6f8SHans Petter Selasky 	if (sensor_pci_no_comm(dev)) {
3751900b6f8SHans Petter Selasky 		dev_err(&dev->pdev->dev, "health recovery flow aborted, PCI reads still not working\n");
376f20b553dSHans Petter Selasky 		recover = false;
3771900b6f8SHans Petter Selasky 	}
3781900b6f8SHans Petter Selasky 
379ba11bcecSHans Petter Selasky 	nic_mode = mlx5_get_nic_state(dev);
3801900b6f8SHans Petter Selasky 	while (nic_mode != MLX5_NIC_IFC_DISABLED &&
3811900b6f8SHans Petter Selasky 	       !time_after(jiffies, end)) {
3821900b6f8SHans Petter Selasky 		msleep(MLX5_NIC_STATE_POLL_MS);
383ba11bcecSHans Petter Selasky 		nic_mode = mlx5_get_nic_state(dev);
3841900b6f8SHans Petter Selasky 	}
3851900b6f8SHans Petter Selasky 
3861900b6f8SHans Petter Selasky 	if (nic_mode != MLX5_NIC_IFC_DISABLED) {
3871900b6f8SHans Petter Selasky 		dev_err(&dev->pdev->dev, "health recovery flow aborted, unexpected NIC IFC mode %d.\n",
3881900b6f8SHans Petter Selasky 			nic_mode);
389f20b553dSHans Petter Selasky 		recover = false;
3904bb7662bSHans Petter Selasky 	}
3914bb7662bSHans Petter Selasky 
392f20b553dSHans Petter Selasky 	if (recover) {
3934bb7662bSHans Petter Selasky 		dev_err(&dev->pdev->dev, "starting health recovery flow\n");
3944bb7662bSHans Petter Selasky 		mlx5_recover_device(dev);
3954bb7662bSHans Petter Selasky 	}
396ca2345a0SHans Petter Selasky 
397ca2345a0SHans Petter Selasky 	mtx_unlock(&Giant);
398f20b553dSHans Petter Selasky }
3994bb7662bSHans Petter Selasky 
4004bb7662bSHans Petter Selasky /* How much time to wait until health resetting the driver (in msecs) */
4014bb7662bSHans Petter Selasky #define MLX5_RECOVERY_DELAY_MSECS 60000
4021900b6f8SHans Petter Selasky #define MLX5_RECOVERY_NO_DELAY 0
4031900b6f8SHans Petter Selasky static unsigned long get_recovery_delay(struct mlx5_core_dev *dev)
4041900b6f8SHans Petter Selasky {
4051900b6f8SHans Petter Selasky 	return dev->priv.health.fatal_error == MLX5_SENSOR_PCI_ERR ||
4061900b6f8SHans Petter Selasky 		dev->priv.health.fatal_error == MLX5_SENSOR_PCI_COMM_ERR	?
4071900b6f8SHans Petter Selasky 		MLX5_RECOVERY_DELAY_MSECS : MLX5_RECOVERY_NO_DELAY;
4081900b6f8SHans Petter Selasky }
4091900b6f8SHans Petter Selasky 
410dc7e38acSHans Petter Selasky static void health_care(struct work_struct *work)
411dc7e38acSHans Petter Selasky {
412a2485fe5SHans Petter Selasky 	struct mlx5_core_health *health;
4131900b6f8SHans Petter Selasky 	unsigned long recover_delay;
414dc7e38acSHans Petter Selasky 	struct mlx5_core_dev *dev;
415dc7e38acSHans Petter Selasky 	struct mlx5_priv *priv;
4164bb7662bSHans Petter Selasky 	unsigned long flags;
417dc7e38acSHans Petter Selasky 
418a2485fe5SHans Petter Selasky 	health = container_of(work, struct mlx5_core_health, work);
419dc7e38acSHans Petter Selasky 	priv = container_of(health, struct mlx5_priv, health);
420dc7e38acSHans Petter Selasky 	dev = container_of(priv, struct mlx5_core_dev, priv);
421f20b553dSHans Petter Selasky 
422dc7e38acSHans Petter Selasky 	mlx5_core_warn(dev, "handling bad device here\n");
423a2485fe5SHans Petter Selasky 	mlx5_handle_bad_state(dev);
4241900b6f8SHans Petter Selasky 	recover_delay = msecs_to_jiffies(get_recovery_delay(dev));
4254bb7662bSHans Petter Selasky 
4264bb7662bSHans Petter Selasky 	spin_lock_irqsave(&health->wq_lock, flags);
427fe242ba7SHans Petter Selasky 	if (!test_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags)) {
428fe242ba7SHans Petter Selasky 		mlx5_core_warn(dev, "Scheduling recovery work with %lums delay\n",
429fe242ba7SHans Petter Selasky 			       recover_delay);
4304bb7662bSHans Petter Selasky 		schedule_delayed_work(&health->recover_work, recover_delay);
431fe242ba7SHans Petter Selasky 	} else {
4324bb7662bSHans Petter Selasky 		dev_err(&dev->pdev->dev,
4334bb7662bSHans Petter Selasky 			"new health works are not permitted at this stage\n");
434fe242ba7SHans Petter Selasky 	}
4354bb7662bSHans Petter Selasky 	spin_unlock_irqrestore(&health->wq_lock, flags);
436dc7e38acSHans Petter Selasky }
437a2485fe5SHans Petter Selasky 
438a2485fe5SHans Petter Selasky static int get_next_poll_jiffies(void)
439a2485fe5SHans Petter Selasky {
440a2485fe5SHans Petter Selasky 	unsigned long next;
441a2485fe5SHans Petter Selasky 
442a2485fe5SHans Petter Selasky 	get_random_bytes(&next, sizeof(next));
443a2485fe5SHans Petter Selasky 	next %= HZ;
444a2485fe5SHans Petter Selasky 	next += jiffies + MLX5_HEALTH_POLL_INTERVAL;
445a2485fe5SHans Petter Selasky 
446a2485fe5SHans Petter Selasky 	return next;
447dc7e38acSHans Petter Selasky }
448dc7e38acSHans Petter Selasky 
4494bb7662bSHans Petter Selasky void mlx5_trigger_health_work(struct mlx5_core_dev *dev)
4504bb7662bSHans Petter Selasky {
4514bb7662bSHans Petter Selasky 	struct mlx5_core_health *health = &dev->priv.health;
4524bb7662bSHans Petter Selasky 	unsigned long flags;
4534bb7662bSHans Petter Selasky 
4544bb7662bSHans Petter Selasky 	spin_lock_irqsave(&health->wq_lock, flags);
4554bb7662bSHans Petter Selasky 	if (!test_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags))
4564bb7662bSHans Petter Selasky 		queue_work(health->wq, &health->work);
4574bb7662bSHans Petter Selasky 	else
4584bb7662bSHans Petter Selasky 		dev_err(&dev->pdev->dev,
4594bb7662bSHans Petter Selasky 			"new health works are not permitted at this stage\n");
4604bb7662bSHans Petter Selasky 	spin_unlock_irqrestore(&health->wq_lock, flags);
4614bb7662bSHans Petter Selasky }
4624bb7662bSHans Petter Selasky 
463dc7e38acSHans Petter Selasky static const char *hsynd_str(u8 synd)
464dc7e38acSHans Petter Selasky {
465dc7e38acSHans Petter Selasky 	switch (synd) {
466dc7e38acSHans Petter Selasky 	case MLX5_HEALTH_SYNDR_FW_ERR:
467dc7e38acSHans Petter Selasky 		return "firmware internal error";
468dc7e38acSHans Petter Selasky 	case MLX5_HEALTH_SYNDR_IRISC_ERR:
469dc7e38acSHans Petter Selasky 		return "irisc not responding";
470a2485fe5SHans Petter Selasky 	case MLX5_HEALTH_SYNDR_HW_UNRECOVERABLE_ERR:
471a2485fe5SHans Petter Selasky 		return "unrecoverable hardware error";
472dc7e38acSHans Petter Selasky 	case MLX5_HEALTH_SYNDR_CRC_ERR:
473dc7e38acSHans Petter Selasky 		return "firmware CRC error";
474dc7e38acSHans Petter Selasky 	case MLX5_HEALTH_SYNDR_FETCH_PCI_ERR:
475dc7e38acSHans Petter Selasky 		return "ICM fetch PCI error";
476dc7e38acSHans Petter Selasky 	case MLX5_HEALTH_SYNDR_HW_FTL_ERR:
477dc7e38acSHans Petter Selasky 		return "HW fatal error\n";
478dc7e38acSHans Petter Selasky 	case MLX5_HEALTH_SYNDR_ASYNC_EQ_OVERRUN_ERR:
479dc7e38acSHans Petter Selasky 		return "async EQ buffer overrun";
480dc7e38acSHans Petter Selasky 	case MLX5_HEALTH_SYNDR_EQ_ERR:
481dc7e38acSHans Petter Selasky 		return "EQ error";
482a2485fe5SHans Petter Selasky 	case MLX5_HEALTH_SYNDR_EQ_INV:
483a2485fe5SHans Petter Selasky 		return "Invalid EQ referenced";
484dc7e38acSHans Petter Selasky 	case MLX5_HEALTH_SYNDR_FFSER_ERR:
485dc7e38acSHans Petter Selasky 		return "FFSER error";
486a2485fe5SHans Petter Selasky 	case MLX5_HEALTH_SYNDR_HIGH_TEMP:
487a2485fe5SHans Petter Selasky 		return "High temprature";
488dc7e38acSHans Petter Selasky 	default:
489dc7e38acSHans Petter Selasky 		return "unrecognized error";
490dc7e38acSHans Petter Selasky 	}
491dc7e38acSHans Petter Selasky }
492dc7e38acSHans Petter Selasky 
493c9bb26aeSHans Petter Selasky static u8
494c9bb26aeSHans Petter Selasky print_health_info(struct mlx5_core_dev *dev)
495dc7e38acSHans Petter Selasky {
496dc7e38acSHans Petter Selasky 	struct mlx5_core_health *health = &dev->priv.health;
497dc7e38acSHans Petter Selasky 	struct mlx5_health_buffer __iomem *h = health->health;
498c9bb26aeSHans Petter Selasky 	u8 synd = ioread8(&h->synd);
499a2485fe5SHans Petter Selasky 	char fw_str[18];
500a2485fe5SHans Petter Selasky 	u32 fw;
501dc7e38acSHans Petter Selasky 	int i;
502dc7e38acSHans Petter Selasky 
503c9bb26aeSHans Petter Selasky 	/*
504c9bb26aeSHans Petter Selasky 	 * If synd is 0x0 - this indicates that FW is unable to
505c9bb26aeSHans Petter Selasky 	 * respond to initialization segment reads and health buffer
506c9bb26aeSHans Petter Selasky 	 * should not be read.
507c9bb26aeSHans Petter Selasky 	 */
508c9bb26aeSHans Petter Selasky 	if (synd == 0)
509c9bb26aeSHans Petter Selasky 		return (0);
510dc7e38acSHans Petter Selasky 
511a2485fe5SHans Petter Selasky 	for (i = 0; i < ARRAY_SIZE(h->assert_var); i++)
512a2485fe5SHans Petter Selasky 		printf("mlx5_core: INFO: ""assert_var[%d] 0x%08x\n", i, ioread32be(h->assert_var + i));
513a2485fe5SHans Petter Selasky 
514a2485fe5SHans Petter Selasky 	printf("mlx5_core: INFO: ""assert_exit_ptr 0x%08x\n", ioread32be(&h->assert_exit_ptr));
515a2485fe5SHans Petter Selasky 	printf("mlx5_core: INFO: ""assert_callra 0x%08x\n", ioread32be(&h->assert_callra));
516a2485fe5SHans Petter Selasky 	snprintf(fw_str, sizeof(fw_str), "%d.%d.%d", fw_rev_maj(dev), fw_rev_min(dev), fw_rev_sub(dev));
517a2485fe5SHans Petter Selasky 	printf("mlx5_core: INFO: ""fw_ver %s\n", fw_str);
518a2485fe5SHans Petter Selasky 	printf("mlx5_core: INFO: ""hw_id 0x%08x\n", ioread32be(&h->hw_id));
519a2485fe5SHans Petter Selasky 	printf("mlx5_core: INFO: ""irisc_index %d\n", ioread8(&h->irisc_index));
520c9bb26aeSHans Petter Selasky 	printf("mlx5_core: INFO: ""synd 0x%x: %s\n", synd, hsynd_str(synd));
521a2485fe5SHans Petter Selasky 	printf("mlx5_core: INFO: ""ext_synd 0x%04x\n", ioread16be(&h->ext_synd));
522a2485fe5SHans Petter Selasky 	fw = ioread32be(&h->fw_ver);
523a2485fe5SHans Petter Selasky 	printf("mlx5_core: INFO: ""raw fw_ver 0x%08x\n", fw);
524c9bb26aeSHans Petter Selasky 
525c9bb26aeSHans Petter Selasky 	return synd;
526dc7e38acSHans Petter Selasky }
527dc7e38acSHans Petter Selasky 
528adb6fd50SHans Petter Selasky static void health_watchdog(struct work_struct *work)
529adb6fd50SHans Petter Selasky {
530adb6fd50SHans Petter Selasky 	struct mlx5_core_dev *dev;
531adb6fd50SHans Petter Selasky 	u16 power;
532adb6fd50SHans Petter Selasky 	u8 status;
533adb6fd50SHans Petter Selasky 	int err;
534adb6fd50SHans Petter Selasky 
535adb6fd50SHans Petter Selasky 	dev = container_of(work, struct mlx5_core_dev, priv.health.work_watchdog);
536adb6fd50SHans Petter Selasky 
537adb6fd50SHans Petter Selasky 	if (!MLX5_CAP_GEN(dev, mcam_reg) ||
538adb6fd50SHans Petter Selasky 	    !MLX5_CAP_MCAM_FEATURE(dev, pcie_status_and_power))
539adb6fd50SHans Petter Selasky 		return;
540adb6fd50SHans Petter Selasky 
541adb6fd50SHans Petter Selasky 	err = mlx5_pci_read_power_status(dev, &power, &status);
542adb6fd50SHans Petter Selasky 	if (err < 0) {
543adb6fd50SHans Petter Selasky 		mlx5_core_warn(dev, "Failed reading power status: %d\n", err);
544adb6fd50SHans Petter Selasky 		return;
545adb6fd50SHans Petter Selasky 	}
546adb6fd50SHans Petter Selasky 
547adb6fd50SHans Petter Selasky 	dev->pwr_value = power;
548adb6fd50SHans Petter Selasky 
549adb6fd50SHans Petter Selasky 	if (dev->pwr_status != status) {
550adb6fd50SHans Petter Selasky 		device_t bsddev = dev->pdev->dev.bsddev;
551adb6fd50SHans Petter Selasky 
552adb6fd50SHans Petter Selasky 		switch (status) {
553adb6fd50SHans Petter Selasky 		case 0:
554adb6fd50SHans Petter Selasky 			dev->pwr_status = status;
555adb6fd50SHans Petter Selasky 			device_printf(bsddev, "PCI power is not published by the PCIe slot.\n");
556adb6fd50SHans Petter Selasky 			break;
557adb6fd50SHans Petter Selasky 		case 1:
558adb6fd50SHans Petter Selasky 			dev->pwr_status = status;
559adb6fd50SHans Petter Selasky 			device_printf(bsddev, "PCIe slot advertised sufficient power (%uW).\n", power);
560adb6fd50SHans Petter Selasky 			break;
561adb6fd50SHans Petter Selasky 		case 2:
562adb6fd50SHans Petter Selasky 			dev->pwr_status = status;
563adb6fd50SHans Petter Selasky 			device_printf(bsddev, "WARN: Detected insufficient power on the PCIe slot (%uW).\n", power);
564adb6fd50SHans Petter Selasky 			break;
565adb6fd50SHans Petter Selasky 		default:
566adb6fd50SHans Petter Selasky 			dev->pwr_status = 0;
567adb6fd50SHans Petter Selasky 			device_printf(bsddev, "WARN: Unknown power state detected(%d).\n", status);
568adb6fd50SHans Petter Selasky 			break;
569adb6fd50SHans Petter Selasky 		}
570adb6fd50SHans Petter Selasky 	}
571adb6fd50SHans Petter Selasky }
572adb6fd50SHans Petter Selasky 
573adb6fd50SHans Petter Selasky void
574adb6fd50SHans Petter Selasky mlx5_trigger_health_watchdog(struct mlx5_core_dev *dev)
575adb6fd50SHans Petter Selasky {
576adb6fd50SHans Petter Selasky 	struct mlx5_core_health *health = &dev->priv.health;
577adb6fd50SHans Petter Selasky 	unsigned long flags;
578adb6fd50SHans Petter Selasky 
579adb6fd50SHans Petter Selasky 	spin_lock_irqsave(&health->wq_lock, flags);
580adb6fd50SHans Petter Selasky 	if (!test_bit(MLX5_DROP_NEW_WATCHDOG_WORK, &health->flags))
581adb6fd50SHans Petter Selasky 		queue_work(health->wq_watchdog, &health->work_watchdog);
582adb6fd50SHans Petter Selasky 	else
583adb6fd50SHans Petter Selasky 		dev_err(&dev->pdev->dev,
584adb6fd50SHans Petter Selasky 			"scheduling watchdog is not permitted at this stage\n");
585adb6fd50SHans Petter Selasky 	spin_unlock_irqrestore(&health->wq_lock, flags);
586adb6fd50SHans Petter Selasky }
587adb6fd50SHans Petter Selasky 
58803ab395eSHans Petter Selasky static void poll_health(unsigned long data)
589dc7e38acSHans Petter Selasky {
590dc7e38acSHans Petter Selasky 	struct mlx5_core_dev *dev = (struct mlx5_core_dev *)data;
591dc7e38acSHans Petter Selasky 	struct mlx5_core_health *health = &dev->priv.health;
5921900b6f8SHans Petter Selasky 	u32 fatal_error;
593dc7e38acSHans Petter Selasky 	u32 count;
594dc7e38acSHans Petter Selasky 
59530dfc051SHans Petter Selasky 	if (dev->state != MLX5_DEVICE_STATE_UP)
59630dfc051SHans Petter Selasky 		return;
59730dfc051SHans Petter Selasky 
598dc7e38acSHans Petter Selasky 	count = ioread32be(health->health_counter);
599dc7e38acSHans Petter Selasky 	if (count == health->prev)
600dc7e38acSHans Petter Selasky 		++health->miss_counter;
601dc7e38acSHans Petter Selasky 	else
602dc7e38acSHans Petter Selasky 		health->miss_counter = 0;
603dc7e38acSHans Petter Selasky 
604dc7e38acSHans Petter Selasky 	health->prev = count;
605dc7e38acSHans Petter Selasky 	if (health->miss_counter == MAX_MISSES) {
606a2485fe5SHans Petter Selasky 		mlx5_core_err(dev, "device's health compromised - reached miss count\n");
607c9bb26aeSHans Petter Selasky 		if (print_health_info(dev) == 0)
608c9bb26aeSHans Petter Selasky 			mlx5_core_err(dev, "FW is unable to respond to initialization segment reads\n");
609a2485fe5SHans Petter Selasky 	}
610a2485fe5SHans Petter Selasky 
6111900b6f8SHans Petter Selasky 	fatal_error = check_fatal_sensors(dev);
6121900b6f8SHans Petter Selasky 
6131900b6f8SHans Petter Selasky 	if (fatal_error && !health->fatal_error) {
6141900b6f8SHans Petter Selasky 		mlx5_core_err(dev, "Fatal error %u detected\n", fatal_error);
6151900b6f8SHans Petter Selasky 		dev->priv.health.fatal_error = fatal_error;
616a2485fe5SHans Petter Selasky 		print_health_info(dev);
6174bb7662bSHans Petter Selasky 		mlx5_trigger_health_work(dev);
618dc7e38acSHans Petter Selasky 	}
6194bb7662bSHans Petter Selasky 
6204bb7662bSHans Petter Selasky 	mod_timer(&health->timer, get_next_poll_jiffies());
621dc7e38acSHans Petter Selasky }
622dc7e38acSHans Petter Selasky 
623dc7e38acSHans Petter Selasky void mlx5_start_health_poll(struct mlx5_core_dev *dev)
624dc7e38acSHans Petter Selasky {
625dc7e38acSHans Petter Selasky 	struct mlx5_core_health *health = &dev->priv.health;
626dc7e38acSHans Petter Selasky 
627dc7e38acSHans Petter Selasky 	init_timer(&health->timer);
6281900b6f8SHans Petter Selasky 	health->fatal_error = MLX5_SENSOR_NO_ERR;
629ca551594SHans Petter Selasky 	clear_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
630519774eaSHans Petter Selasky 	clear_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags);
631adb6fd50SHans Petter Selasky 	clear_bit(MLX5_DROP_NEW_WATCHDOG_WORK, &health->flags);
632dc7e38acSHans Petter Selasky 	health->health = &dev->iseg->health;
633dc7e38acSHans Petter Selasky 	health->health_counter = &dev->iseg->health_counter;
634dc7e38acSHans Petter Selasky 
63503ab395eSHans Petter Selasky 	setup_timer(&health->timer, poll_health, (unsigned long)dev);
636dc7e38acSHans Petter Selasky 	mod_timer(&health->timer,
637dc7e38acSHans Petter Selasky 		  round_jiffies(jiffies + MLX5_HEALTH_POLL_INTERVAL));
638adb6fd50SHans Petter Selasky 
639adb6fd50SHans Petter Selasky 	/* do initial PCI power state readout */
640adb6fd50SHans Petter Selasky 	mlx5_trigger_health_watchdog(dev);
641dc7e38acSHans Petter Selasky }
642dc7e38acSHans Petter Selasky 
6432119f825SSlava Shwartsman void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health)
644dc7e38acSHans Petter Selasky {
645dc7e38acSHans Petter Selasky 	struct mlx5_core_health *health = &dev->priv.health;
6462119f825SSlava Shwartsman 	unsigned long flags;
6472119f825SSlava Shwartsman 
6482119f825SSlava Shwartsman 	if (disable_health) {
6492119f825SSlava Shwartsman 		spin_lock_irqsave(&health->wq_lock, flags);
6502119f825SSlava Shwartsman 		set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
6512119f825SSlava Shwartsman 		set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags);
652adb6fd50SHans Petter Selasky 		set_bit(MLX5_DROP_NEW_WATCHDOG_WORK, &health->flags);
6532119f825SSlava Shwartsman 		spin_unlock_irqrestore(&health->wq_lock, flags);
6542119f825SSlava Shwartsman 	}
655dc7e38acSHans Petter Selasky 
656dc7e38acSHans Petter Selasky 	del_timer_sync(&health->timer);
657dc7e38acSHans Petter Selasky }
658dc7e38acSHans Petter Selasky 
659ca551594SHans Petter Selasky void mlx5_drain_health_wq(struct mlx5_core_dev *dev)
660ca551594SHans Petter Selasky {
661ca551594SHans Petter Selasky 	struct mlx5_core_health *health = &dev->priv.health;
6624bb7662bSHans Petter Selasky 	unsigned long flags;
663ca551594SHans Petter Selasky 
6644bb7662bSHans Petter Selasky 	spin_lock_irqsave(&health->wq_lock, flags);
665ca551594SHans Petter Selasky 	set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
666519774eaSHans Petter Selasky 	set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags);
667adb6fd50SHans Petter Selasky 	set_bit(MLX5_DROP_NEW_WATCHDOG_WORK, &health->flags);
6684bb7662bSHans Petter Selasky 	spin_unlock_irqrestore(&health->wq_lock, flags);
6694bb7662bSHans Petter Selasky 	cancel_delayed_work_sync(&health->recover_work);
670ca551594SHans Petter Selasky 	cancel_work_sync(&health->work);
671adb6fd50SHans Petter Selasky 	cancel_work_sync(&health->work_watchdog);
672ca551594SHans Petter Selasky }
673ca551594SHans Petter Selasky 
674519774eaSHans Petter Selasky void mlx5_drain_health_recovery(struct mlx5_core_dev *dev)
675519774eaSHans Petter Selasky {
676519774eaSHans Petter Selasky 	struct mlx5_core_health *health = &dev->priv.health;
677519774eaSHans Petter Selasky 	unsigned long flags;
678519774eaSHans Petter Selasky 
679519774eaSHans Petter Selasky 	spin_lock_irqsave(&health->wq_lock, flags);
680519774eaSHans Petter Selasky 	set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags);
681519774eaSHans Petter Selasky 	spin_unlock_irqrestore(&health->wq_lock, flags);
682519774eaSHans Petter Selasky 	cancel_delayed_work_sync(&dev->priv.health.recover_work);
683519774eaSHans Petter Selasky }
684519774eaSHans Petter Selasky 
685a2485fe5SHans Petter Selasky void mlx5_health_cleanup(struct mlx5_core_dev *dev)
686dc7e38acSHans Petter Selasky {
687a2485fe5SHans Petter Selasky 	struct mlx5_core_health *health = &dev->priv.health;
688a2485fe5SHans Petter Selasky 
689a2485fe5SHans Petter Selasky 	destroy_workqueue(health->wq);
69040218d73SHans Petter Selasky 	destroy_workqueue(health->wq_watchdog);
6918d1eeedbSHans Petter Selasky 	destroy_workqueue(health->wq_cmd);
692dc7e38acSHans Petter Selasky }
693dc7e38acSHans Petter Selasky 
694a2485fe5SHans Petter Selasky int mlx5_health_init(struct mlx5_core_dev *dev)
695dc7e38acSHans Petter Selasky {
696a2485fe5SHans Petter Selasky 	struct mlx5_core_health *health;
69740218d73SHans Petter Selasky 	char name[64];
698dc7e38acSHans Petter Selasky 
699a2485fe5SHans Petter Selasky 	health = &dev->priv.health;
700a2485fe5SHans Petter Selasky 
70140218d73SHans Petter Selasky 	snprintf(name, sizeof(name), "%s-rec", dev_name(&dev->pdev->dev));
702a2485fe5SHans Petter Selasky 	health->wq = create_singlethread_workqueue(name);
703a2485fe5SHans Petter Selasky 	if (!health->wq)
7048d1eeedbSHans Petter Selasky 		goto err_recovery;
705a2485fe5SHans Petter Selasky 
70640218d73SHans Petter Selasky 	snprintf(name, sizeof(name), "%s-wdg", dev_name(&dev->pdev->dev));
70740218d73SHans Petter Selasky 	health->wq_watchdog = create_singlethread_workqueue(name);
7088d1eeedbSHans Petter Selasky 	if (!health->wq_watchdog)
7098d1eeedbSHans Petter Selasky 		goto err_watchdog;
7108d1eeedbSHans Petter Selasky 
7118d1eeedbSHans Petter Selasky 	snprintf(name, sizeof(name), "%s-cmd", dev_name(&dev->pdev->dev));
7128d1eeedbSHans Petter Selasky 	health->wq_cmd = create_singlethread_workqueue(name);
7138d1eeedbSHans Petter Selasky 	if (!health->wq_cmd)
7148d1eeedbSHans Petter Selasky 		goto err_cmd;
71540218d73SHans Petter Selasky 
716ca551594SHans Petter Selasky 	spin_lock_init(&health->wq_lock);
717a2485fe5SHans Petter Selasky 	INIT_WORK(&health->work, health_care);
718adb6fd50SHans Petter Selasky 	INIT_WORK(&health->work_watchdog, health_watchdog);
719a0a4fd77SHans Petter Selasky 	INIT_WORK(&health->work_cmd_completion, mlx5_trigger_cmd_completions);
7204bb7662bSHans Petter Selasky 	INIT_DELAYED_WORK(&health->recover_work, health_recover);
721a2485fe5SHans Petter Selasky 
722a2485fe5SHans Petter Selasky 	return 0;
7238d1eeedbSHans Petter Selasky 
7248d1eeedbSHans Petter Selasky err_cmd:
7258d1eeedbSHans Petter Selasky 	destroy_workqueue(health->wq_watchdog);
7268d1eeedbSHans Petter Selasky err_watchdog:
7278d1eeedbSHans Petter Selasky 	destroy_workqueue(health->wq);
7288d1eeedbSHans Petter Selasky err_recovery:
7298d1eeedbSHans Petter Selasky 	return -ENOMEM;
730dc7e38acSHans Petter Selasky }
731