xref: /freebsd/sys/dev/mlx5/mlx5_core/mlx5_health.c (revision 12c56d7d)
1dc7e38acSHans Petter Selasky /*-
28d1eeedbSHans Petter Selasky  * Copyright (c) 2013-2019, Mellanox Technologies, Ltd.  All rights reserved.
3dc7e38acSHans Petter Selasky  *
4dc7e38acSHans Petter Selasky  * Redistribution and use in source and binary forms, with or without
5dc7e38acSHans Petter Selasky  * modification, are permitted provided that the following conditions
6dc7e38acSHans Petter Selasky  * are met:
7dc7e38acSHans Petter Selasky  * 1. Redistributions of source code must retain the above copyright
8dc7e38acSHans Petter Selasky  *    notice, this list of conditions and the following disclaimer.
9dc7e38acSHans Petter Selasky  * 2. Redistributions in binary form must reproduce the above copyright
10dc7e38acSHans Petter Selasky  *    notice, this list of conditions and the following disclaimer in the
11dc7e38acSHans Petter Selasky  *    documentation and/or other materials provided with the distribution.
12dc7e38acSHans Petter Selasky  *
13dc7e38acSHans Petter Selasky  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
14dc7e38acSHans Petter Selasky  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15dc7e38acSHans Petter Selasky  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16dc7e38acSHans Petter Selasky  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
17dc7e38acSHans Petter Selasky  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18dc7e38acSHans Petter Selasky  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19dc7e38acSHans Petter Selasky  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20dc7e38acSHans Petter Selasky  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21dc7e38acSHans Petter Selasky  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22dc7e38acSHans Petter Selasky  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23dc7e38acSHans Petter Selasky  * SUCH DAMAGE.
24dc7e38acSHans Petter Selasky  *
25dc7e38acSHans Petter Selasky  * $FreeBSD$
26dc7e38acSHans Petter Selasky  */
27dc7e38acSHans Petter Selasky 
28ee9d634bSKonstantin Belousov #include "opt_rss.h"
29ee9d634bSKonstantin Belousov #include "opt_ratelimit.h"
30ee9d634bSKonstantin Belousov 
31dc7e38acSHans Petter Selasky #include <linux/kernel.h>
32dc7e38acSHans Petter Selasky #include <linux/module.h>
33dc7e38acSHans Petter Selasky #include <linux/random.h>
34dc7e38acSHans Petter Selasky #include <linux/vmalloc.h>
35a2485fe5SHans Petter Selasky #include <linux/hardirq.h>
361900b6f8SHans Petter Selasky #include <linux/delay.h>
37dc7e38acSHans Petter Selasky #include <dev/mlx5/driver.h>
38dc7e38acSHans Petter Selasky #include <dev/mlx5/mlx5_ifc.h>
3912c56d7dSHans Petter Selasky #include <dev/mlx5/mlx5_core/mlx5_core.h>
40dc7e38acSHans Petter Selasky 
/*
 * Base distance, in jiffies, between two health polls.
 * get_next_poll_jiffies() adds a random jitter of less than HZ on top.
 */
#define	MLX5_HEALTH_POLL_INTERVAL	(2 * HZ)
/*
 * NOTE(review): not referenced in the visible part of this file;
 * presumably the number of missed health updates tolerated - confirm.
 */
#define	MAX_MISSES			3
43dc7e38acSHans Petter Selasky 
/*
 * Bit numbers tested in health->flags with test_bit() before new work
 * is queued.
 */
enum {
	/* When set, mlx5_trigger_health_work() refuses to queue health->work. */
	MLX5_DROP_NEW_HEALTH_WORK,
	/* When set, health_care() refuses to schedule health->recover_work. */
	MLX5_DROP_NEW_RECOVERY_WORK,
	/* NOTE(review): presumably gates the watchdog work - usage not visible here. */
	MLX5_DROP_NEW_WATCHDOG_WORK,
};
49ca551594SHans Petter Selasky 
/*
 * Result codes of check_fatal_sensors(); each non-zero value names the
 * first sensor that reported a fatal condition.
 */
enum  {
	MLX5_SENSOR_NO_ERR		= 0,	/* no sensor fired */
	MLX5_SENSOR_PCI_COMM_ERR	= 1,	/* health buffer fw_ver reads 0xffffffff */
	MLX5_SENSOR_PCI_ERR		= 2,	/* pci_channel_offline() is true */
	MLX5_SENSOR_NIC_DISABLED	= 3,	/* NIC state is MLX5_NIC_IFC_DISABLED */
	MLX5_SENSOR_NIC_SW_RESET	= 4,	/* NIC state is MLX5_NIC_IFC_SW_RESET */
	MLX5_SENSOR_FW_SYND_RFR		= 5,	/* rfr bit and a non-zero syndrome are set */
};
581900b6f8SHans Petter Selasky 
/*
 * hw.mlx5.fw_reset_enable: when zero, reset_fw_if_needed() returns
 * without touching the device.
 */
static int mlx5_fw_reset_enable = 1;
SYSCTL_INT(_hw_mlx5, OID_AUTO, fw_reset_enable, CTLFLAG_RWTUN,
    &mlx5_fw_reset_enable, 0,
    "Enable firmware reset");

/*
 * hw.mlx5.sw_reset_timeout: minimum number of seconds that
 * mlx5_health_allow_reset() requires between two reset requests.
 */
static unsigned int sw_reset_to = 1200;
SYSCTL_UINT(_hw_mlx5, OID_AUTO, sw_reset_timeout, CTLFLAG_RWTUN,
    &sw_reset_to, 0,
    "Minimum timeout in seconds between two firmware resets");
685169fb81SHans Petter Selasky 
695169fb81SHans Petter Selasky 
70b575d8c8SHans Petter Selasky static int lock_sem_sw_reset(struct mlx5_core_dev *dev)
71f20b553dSHans Petter Selasky {
72b575d8c8SHans Petter Selasky 	int ret;
73f20b553dSHans Petter Selasky 
74f20b553dSHans Petter Selasky 	/* Lock GW access */
75b575d8c8SHans Petter Selasky 	ret = -mlx5_vsc_lock(dev);
76f20b553dSHans Petter Selasky 	if (ret) {
77b575d8c8SHans Petter Selasky 		mlx5_core_warn(dev, "Timed out locking gateway %d\n", ret);
78f20b553dSHans Petter Selasky 		return ret;
79f20b553dSHans Petter Selasky 	}
80f20b553dSHans Petter Selasky 
81b575d8c8SHans Petter Selasky 	ret = -mlx5_vsc_lock_addr_space(dev, MLX5_SEMAPHORE_SW_RESET);
82b575d8c8SHans Petter Selasky 	if (ret) {
83f20b553dSHans Petter Selasky 		if (ret == -EBUSY)
84a2f4f59cSHans Petter Selasky 			mlx5_core_dbg(dev,
85a2f4f59cSHans Petter Selasky 			    "SW reset FW semaphore already locked, another function will handle the reset\n");
86f20b553dSHans Petter Selasky 		else
87a2f4f59cSHans Petter Selasky 			mlx5_core_warn(dev,
88a2f4f59cSHans Petter Selasky 			    "SW reset semaphore lock return %d\n", ret);
89f20b553dSHans Petter Selasky 	}
90f20b553dSHans Petter Selasky 
91f20b553dSHans Petter Selasky 	/* Unlock GW access */
92b575d8c8SHans Petter Selasky 	mlx5_vsc_unlock(dev);
93b575d8c8SHans Petter Selasky 
94b575d8c8SHans Petter Selasky 	return ret;
95b575d8c8SHans Petter Selasky }
96b575d8c8SHans Petter Selasky 
97b575d8c8SHans Petter Selasky static int unlock_sem_sw_reset(struct mlx5_core_dev *dev)
98b575d8c8SHans Petter Selasky {
99b575d8c8SHans Petter Selasky 	int ret;
100b575d8c8SHans Petter Selasky 
101b575d8c8SHans Petter Selasky 	/* Lock GW access */
102b575d8c8SHans Petter Selasky 	ret = -mlx5_vsc_lock(dev);
103b575d8c8SHans Petter Selasky 	if (ret) {
104b575d8c8SHans Petter Selasky 		mlx5_core_warn(dev, "Timed out locking gateway %d\n", ret);
105b575d8c8SHans Petter Selasky 		return ret;
106b575d8c8SHans Petter Selasky 	}
107b575d8c8SHans Petter Selasky 
108b575d8c8SHans Petter Selasky 	ret = -mlx5_vsc_unlock_addr_space(dev, MLX5_SEMAPHORE_SW_RESET);
109b575d8c8SHans Petter Selasky 
110b575d8c8SHans Petter Selasky 	/* Unlock GW access */
111b575d8c8SHans Petter Selasky 	mlx5_vsc_unlock(dev);
112f20b553dSHans Petter Selasky 
113f20b553dSHans Petter Selasky 	return ret;
114f20b553dSHans Petter Selasky }
115f20b553dSHans Petter Selasky 
116ba11bcecSHans Petter Selasky u8 mlx5_get_nic_state(struct mlx5_core_dev *dev)
117a2485fe5SHans Petter Selasky {
1181900b6f8SHans Petter Selasky 	return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 7;
119a2485fe5SHans Petter Selasky }
120a2485fe5SHans Petter Selasky 
121ba11bcecSHans Petter Selasky void mlx5_set_nic_state(struct mlx5_core_dev *dev, u8 state)
122ba11bcecSHans Petter Selasky {
123ba11bcecSHans Petter Selasky 	u32 cur_cmdq_addr_l_sz;
124ba11bcecSHans Petter Selasky 
125ba11bcecSHans Petter Selasky 	cur_cmdq_addr_l_sz = ioread32be(&dev->iseg->cmdq_addr_l_sz);
126ba11bcecSHans Petter Selasky 	iowrite32be((cur_cmdq_addr_l_sz & 0xFFFFF000) |
127ba11bcecSHans Petter Selasky 		    state << MLX5_NIC_IFC_OFFSET,
128ba11bcecSHans Petter Selasky 		    &dev->iseg->cmdq_addr_l_sz);
129ba11bcecSHans Petter Selasky }
130ba11bcecSHans Petter Selasky 
131fe242ba7SHans Petter Selasky static bool sensor_fw_synd_rfr(struct mlx5_core_dev *dev)
132fe242ba7SHans Petter Selasky {
133fe242ba7SHans Petter Selasky 	struct mlx5_core_health *health = &dev->priv.health;
134fe242ba7SHans Petter Selasky 	struct mlx5_health_buffer __iomem *h = health->health;
135fe242ba7SHans Petter Selasky 	u32 rfr = ioread32be(&h->rfr) >> MLX5_RFR_OFFSET;
136fe242ba7SHans Petter Selasky 	u8 synd = ioread8(&h->synd);
137fe242ba7SHans Petter Selasky 
138fe242ba7SHans Petter Selasky 	if (rfr && synd)
139fe242ba7SHans Petter Selasky 		mlx5_core_dbg(dev, "FW requests reset, synd: %d\n", synd);
140fe242ba7SHans Petter Selasky 	return rfr && synd;
141fe242ba7SHans Petter Selasky }
142fe242ba7SHans Petter Selasky 
143a0a4fd77SHans Petter Selasky static void mlx5_trigger_cmd_completions(struct work_struct *work)
144a2485fe5SHans Petter Selasky {
145a0a4fd77SHans Petter Selasky 	struct mlx5_core_dev *dev =
146a0a4fd77SHans Petter Selasky 	    container_of(work, struct mlx5_core_dev, priv.health.work_cmd_completion);
147a2485fe5SHans Petter Selasky 	unsigned long flags;
148a2485fe5SHans Petter Selasky 	u64 vector;
149a2485fe5SHans Petter Selasky 
150a2485fe5SHans Petter Selasky 	/* wait for pending handlers to complete */
151a2485fe5SHans Petter Selasky 	synchronize_irq(dev->priv.msix_arr[MLX5_EQ_VEC_CMD].vector);
152a2485fe5SHans Petter Selasky 	spin_lock_irqsave(&dev->cmd.alloc_lock, flags);
153a2485fe5SHans Petter Selasky 	vector = ~dev->cmd.bitmask & ((1ul << (1 << dev->cmd.log_sz)) - 1);
154a2485fe5SHans Petter Selasky 	if (!vector)
155a2485fe5SHans Petter Selasky 		goto no_trig;
156a2485fe5SHans Petter Selasky 
157a2485fe5SHans Petter Selasky 	vector |= MLX5_TRIGGERED_CMD_COMP;
158a2485fe5SHans Petter Selasky 	spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags);
159a2485fe5SHans Petter Selasky 
1602cec1528SKonstantin Belousov 	mlx5_core_dbg(dev, "vector 0x%jx\n", (uintmax_t)vector);
161721a1a6aSSlava Shwartsman 	mlx5_cmd_comp_handler(dev, vector, MLX5_CMD_MODE_EVENTS);
162a2485fe5SHans Petter Selasky 	return;
163a2485fe5SHans Petter Selasky 
164a2485fe5SHans Petter Selasky no_trig:
165a2485fe5SHans Petter Selasky 	spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags);
166a2485fe5SHans Petter Selasky }
167a2485fe5SHans Petter Selasky 
1681900b6f8SHans Petter Selasky static bool sensor_pci_no_comm(struct mlx5_core_dev *dev)
169a2485fe5SHans Petter Selasky {
170a2485fe5SHans Petter Selasky 	struct mlx5_core_health *health = &dev->priv.health;
171a2485fe5SHans Petter Selasky 	struct mlx5_health_buffer __iomem *h = health->health;
1721900b6f8SHans Petter Selasky 	bool err = ioread32be(&h->fw_ver) == 0xffffffff;
173a2485fe5SHans Petter Selasky 
1741900b6f8SHans Petter Selasky 	return err;
1751900b6f8SHans Petter Selasky }
176a2485fe5SHans Petter Selasky 
1771900b6f8SHans Petter Selasky static bool sensor_nic_disabled(struct mlx5_core_dev *dev)
1781900b6f8SHans Petter Selasky {
179ba11bcecSHans Petter Selasky 	return mlx5_get_nic_state(dev) == MLX5_NIC_IFC_DISABLED;
1801900b6f8SHans Petter Selasky }
181a2485fe5SHans Petter Selasky 
1821900b6f8SHans Petter Selasky static bool sensor_nic_sw_reset(struct mlx5_core_dev *dev)
1831900b6f8SHans Petter Selasky {
184ba11bcecSHans Petter Selasky 	return mlx5_get_nic_state(dev) == MLX5_NIC_IFC_SW_RESET;
1851900b6f8SHans Petter Selasky }
1861900b6f8SHans Petter Selasky 
1871900b6f8SHans Petter Selasky static u32 check_fatal_sensors(struct mlx5_core_dev *dev)
1881900b6f8SHans Petter Selasky {
1891900b6f8SHans Petter Selasky 	if (sensor_pci_no_comm(dev))
1901900b6f8SHans Petter Selasky 		return MLX5_SENSOR_PCI_COMM_ERR;
1911900b6f8SHans Petter Selasky 	if (pci_channel_offline(dev->pdev))
1921900b6f8SHans Petter Selasky 		return MLX5_SENSOR_PCI_ERR;
1931900b6f8SHans Petter Selasky 	if (sensor_nic_disabled(dev))
1941900b6f8SHans Petter Selasky 		return MLX5_SENSOR_NIC_DISABLED;
1951900b6f8SHans Petter Selasky 	if (sensor_nic_sw_reset(dev))
1961900b6f8SHans Petter Selasky 		return MLX5_SENSOR_NIC_SW_RESET;
197fe242ba7SHans Petter Selasky 	if (sensor_fw_synd_rfr(dev))
198fe242ba7SHans Petter Selasky 		return MLX5_SENSOR_FW_SYND_RFR;
1991900b6f8SHans Petter Selasky 
2001900b6f8SHans Petter Selasky 	return MLX5_SENSOR_NO_ERR;
201a2485fe5SHans Petter Selasky }
202a2485fe5SHans Petter Selasky 
203fe242ba7SHans Petter Selasky static void reset_fw_if_needed(struct mlx5_core_dev *dev)
204fe242ba7SHans Petter Selasky {
20529e54451SSlava Shwartsman 	bool supported;
206fe242ba7SHans Petter Selasky 	u32 cmdq_addr, fatal_error;
207fe242ba7SHans Petter Selasky 
20829e54451SSlava Shwartsman 	if (!mlx5_fw_reset_enable)
20929e54451SSlava Shwartsman 		return;
21029e54451SSlava Shwartsman 	supported = (ioread32be(&dev->iseg->initializing) >>
21129e54451SSlava Shwartsman 	    MLX5_FW_RESET_SUPPORTED_OFFSET) & 1;
212fe242ba7SHans Petter Selasky 	if (!supported)
213fe242ba7SHans Petter Selasky 		return;
214fe242ba7SHans Petter Selasky 
215fe242ba7SHans Petter Selasky 	/* The reset only needs to be issued by one PF. The health buffer is
216fe242ba7SHans Petter Selasky 	 * shared between all functions, and will be cleared during a reset.
217fe242ba7SHans Petter Selasky 	 * Check again to avoid a redundant 2nd reset. If the fatal erros was
218fe242ba7SHans Petter Selasky 	 * PCI related a reset won't help.
219fe242ba7SHans Petter Selasky 	 */
220fe242ba7SHans Petter Selasky 	fatal_error = check_fatal_sensors(dev);
221fe242ba7SHans Petter Selasky 	if (fatal_error == MLX5_SENSOR_PCI_COMM_ERR ||
222fe242ba7SHans Petter Selasky 	    fatal_error == MLX5_SENSOR_NIC_DISABLED ||
223d28b6b55SHans Petter Selasky 	    fatal_error == MLX5_SENSOR_NIC_SW_RESET) {
224a2f4f59cSHans Petter Selasky 		mlx5_core_warn(dev,
225a2f4f59cSHans Petter Selasky 		    "Not issuing FW reset. Either it's already done or won't help.\n");
226fe242ba7SHans Petter Selasky 		return;
227fe242ba7SHans Petter Selasky 	}
228fe242ba7SHans Petter Selasky 
229a2f4f59cSHans Petter Selasky 	mlx5_core_info(dev, "Issuing FW Reset\n");
230fe242ba7SHans Petter Selasky 	/* Write the NIC interface field to initiate the reset, the command
231fe242ba7SHans Petter Selasky 	 * interface address also resides here, don't overwrite it.
232fe242ba7SHans Petter Selasky 	 */
233fe242ba7SHans Petter Selasky 	cmdq_addr = ioread32be(&dev->iseg->cmdq_addr_l_sz);
234fe242ba7SHans Petter Selasky 	iowrite32be((cmdq_addr & 0xFFFFF000) |
235fe242ba7SHans Petter Selasky 		    MLX5_NIC_IFC_SW_RESET << MLX5_NIC_IFC_OFFSET,
236fe242ba7SHans Petter Selasky 		    &dev->iseg->cmdq_addr_l_sz);
237fe242ba7SHans Petter Selasky }
238fe242ba7SHans Petter Selasky 
2395169fb81SHans Petter Selasky static bool
2405169fb81SHans Petter Selasky mlx5_health_allow_reset(struct mlx5_core_dev *dev)
2415169fb81SHans Petter Selasky {
2425169fb81SHans Petter Selasky 	struct mlx5_core_health *health = &dev->priv.health;
2435169fb81SHans Petter Selasky 	unsigned int delta;
2445169fb81SHans Petter Selasky 	bool ret;
2455169fb81SHans Petter Selasky 
2465169fb81SHans Petter Selasky 	if (health->last_reset_req != 0) {
2475169fb81SHans Petter Selasky 		delta = ticks - health->last_reset_req;
2485169fb81SHans Petter Selasky 		delta /= hz;
2495169fb81SHans Petter Selasky 		ret = delta >= sw_reset_to;
2505169fb81SHans Petter Selasky 	} else {
2515169fb81SHans Petter Selasky 		ret = true;
2525169fb81SHans Petter Selasky 	}
2535169fb81SHans Petter Selasky 
2545169fb81SHans Petter Selasky 	/*
2555169fb81SHans Petter Selasky 	 * In principle, ticks may be 0. Setting it to off by one (-1)
2565169fb81SHans Petter Selasky 	 * to prevent certain reset in next request.
2575169fb81SHans Petter Selasky 	 */
2585169fb81SHans Petter Selasky 	health->last_reset_req = ticks ? : -1;
2595169fb81SHans Petter Selasky 	if (!ret)
260a2f4f59cSHans Petter Selasky 		mlx5_core_warn(dev,
261a2f4f59cSHans Petter Selasky 		    "Firmware reset elided due to auto-reset frequency threshold.\n");
2625169fb81SHans Petter Selasky 	return (ret);
2635169fb81SHans Petter Selasky }
2645169fb81SHans Petter Selasky 
265d28b6b55SHans Petter Selasky #define MLX5_CRDUMP_WAIT_MS	60000
266d28b6b55SHans Petter Selasky #define MLX5_FW_RESET_WAIT_MS	1000
267d28b6b55SHans Petter Selasky #define MLX5_NIC_STATE_POLL_MS	5
268c0902569SHans Petter Selasky void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force)
269a2485fe5SHans Petter Selasky {
270c2a1e807SHans Petter Selasky 	int end, delay_ms = MLX5_CRDUMP_WAIT_MS;
271d28b6b55SHans Petter Selasky 	u32 fatal_error;
272d28b6b55SHans Petter Selasky 	int lock = -EBUSY;
273d28b6b55SHans Petter Selasky 
274d28b6b55SHans Petter Selasky 	fatal_error = check_fatal_sensors(dev);
275d28b6b55SHans Petter Selasky 
276d28b6b55SHans Petter Selasky 	if (fatal_error || force) {
2776d54b22dSHans Petter Selasky 		if (xchg(&dev->state, MLX5_DEVICE_STATE_INTERNAL_ERROR) ==
2786d54b22dSHans Petter Selasky 		    MLX5_DEVICE_STATE_INTERNAL_ERROR)
2796d54b22dSHans Petter Selasky 			return;
2801fb6089cSHans Petter Selasky 		if (!force)
2811fb6089cSHans Petter Selasky 			mlx5_core_err(dev, "internal state error detected\n");
282a0a4fd77SHans Petter Selasky 
283a0a4fd77SHans Petter Selasky 		/*
284a0a4fd77SHans Petter Selasky 		 * Queue the command completion handler on the command
285a0a4fd77SHans Petter Selasky 		 * work queue to avoid racing with the real command
286a0a4fd77SHans Petter Selasky 		 * completion handler and then wait for it to
287a0a4fd77SHans Petter Selasky 		 * complete:
288a0a4fd77SHans Petter Selasky 		 */
2898d1eeedbSHans Petter Selasky 		queue_work(dev->priv.health.wq_cmd, &dev->priv.health.work_cmd_completion);
2908d1eeedbSHans Petter Selasky 		flush_workqueue(dev->priv.health.wq_cmd);
2917053deebSHans Petter Selasky 	}
292a2485fe5SHans Petter Selasky 
2936d54b22dSHans Petter Selasky 	mutex_lock(&dev->intf_state_mutex);
2946d54b22dSHans Petter Selasky 
295d28b6b55SHans Petter Selasky 	if (force)
296d28b6b55SHans Petter Selasky 		goto err_state_done;
297d28b6b55SHans Petter Selasky 
2985169fb81SHans Petter Selasky 	if (fatal_error == MLX5_SENSOR_FW_SYND_RFR &&
2995169fb81SHans Petter Selasky 	    mlx5_health_allow_reset(dev)) {
30092d23c82SHans Petter Selasky 		/* Get cr-dump and reset FW semaphore */
301d28b6b55SHans Petter Selasky 		if (mlx5_core_is_pf(dev))
302b575d8c8SHans Petter Selasky 			lock = lock_sem_sw_reset(dev);
303d28b6b55SHans Petter Selasky 
30492d23c82SHans Petter Selasky 		/* Execute cr-dump and SW reset */
305d28b6b55SHans Petter Selasky 		if (lock != -EBUSY) {
3063e40712eSHans Petter Selasky 			(void)mlx5_fwdump(dev);
307d28b6b55SHans Petter Selasky 			reset_fw_if_needed(dev);
308d28b6b55SHans Petter Selasky 			delay_ms = MLX5_FW_RESET_WAIT_MS;
309d28b6b55SHans Petter Selasky 		}
310d28b6b55SHans Petter Selasky 	}
311d28b6b55SHans Petter Selasky 
312d28b6b55SHans Petter Selasky 	/* Recover from SW reset */
313d28b6b55SHans Petter Selasky 	end = jiffies + msecs_to_jiffies(delay_ms);
314d28b6b55SHans Petter Selasky 	do {
315d28b6b55SHans Petter Selasky 		if (sensor_nic_disabled(dev))
316d28b6b55SHans Petter Selasky 			break;
317d28b6b55SHans Petter Selasky 
318d28b6b55SHans Petter Selasky 		msleep(MLX5_NIC_STATE_POLL_MS);
319d28b6b55SHans Petter Selasky 	} while (!time_after(jiffies, end));
320d28b6b55SHans Petter Selasky 
321d28b6b55SHans Petter Selasky 	if (!sensor_nic_disabled(dev)) {
322a2f4f59cSHans Petter Selasky 		mlx5_core_err(dev, "NIC IFC still %d after %ums.\n",
323ba11bcecSHans Petter Selasky 			mlx5_get_nic_state(dev), delay_ms);
324d28b6b55SHans Petter Selasky 	}
325d28b6b55SHans Petter Selasky 
326d28b6b55SHans Petter Selasky 	/* Release FW semaphore if you are the lock owner */
327d28b6b55SHans Petter Selasky 	if (!lock)
328b575d8c8SHans Petter Selasky 		unlock_sem_sw_reset(dev);
329d28b6b55SHans Petter Selasky 
330a2f4f59cSHans Petter Selasky 	mlx5_core_info(dev, "System error event triggered\n");
3317053deebSHans Petter Selasky 
332d28b6b55SHans Petter Selasky err_state_done:
333843a89d3SSlava Shwartsman 	mlx5_core_event(dev, MLX5_DEV_EVENT_SYS_ERROR, 1);
3347053deebSHans Petter Selasky 	mutex_unlock(&dev->intf_state_mutex);
335a2485fe5SHans Petter Selasky }
336a2485fe5SHans Petter Selasky 
337a2485fe5SHans Petter Selasky static void mlx5_handle_bad_state(struct mlx5_core_dev *dev)
338a2485fe5SHans Petter Selasky {
339ba11bcecSHans Petter Selasky 	u8 nic_mode = mlx5_get_nic_state(dev);
340a2485fe5SHans Petter Selasky 
3411900b6f8SHans Petter Selasky 	if (nic_mode == MLX5_NIC_IFC_SW_RESET) {
3421900b6f8SHans Petter Selasky 		/* The IFC mode field is 3 bits, so it will read 0x7 in two cases:
3431900b6f8SHans Petter Selasky 		 * 1. PCI has been disabled (ie. PCI-AER, PF driver unloaded
3441900b6f8SHans Petter Selasky 		 *    and this is a VF), this is not recoverable by SW reset.
3451900b6f8SHans Petter Selasky 		 *    Logging of this is handled elsewhere.
3461900b6f8SHans Petter Selasky 		 * 2. FW reset has been issued by another function, driver can
3471900b6f8SHans Petter Selasky 		 *    be reloaded to recover after the mode switches to
3481900b6f8SHans Petter Selasky 		 *    MLX5_NIC_IFC_DISABLED.
3491900b6f8SHans Petter Selasky 		 */
3501900b6f8SHans Petter Selasky 		if (dev->priv.health.fatal_error != MLX5_SENSOR_PCI_COMM_ERR)
351a2f4f59cSHans Petter Selasky 			mlx5_core_warn(dev,
352a2f4f59cSHans Petter Selasky 			    "NIC SW reset is already progress\n");
3531900b6f8SHans Petter Selasky 		else
354a2f4f59cSHans Petter Selasky 			mlx5_core_warn(dev,
355a2f4f59cSHans Petter Selasky 			    "Communication with FW over the PCI link is down\n");
3561900b6f8SHans Petter Selasky 	} else {
3571900b6f8SHans Petter Selasky 		mlx5_core_warn(dev, "NIC mode %d\n", nic_mode);
358a2485fe5SHans Petter Selasky 	}
359a2485fe5SHans Petter Selasky 
360a2485fe5SHans Petter Selasky 	mlx5_disable_device(dev);
361a2485fe5SHans Petter Selasky }
362dc7e38acSHans Petter Selasky 
3631900b6f8SHans Petter Selasky #define MLX5_FW_RESET_WAIT_MS	1000
3641900b6f8SHans Petter Selasky #define MLX5_NIC_STATE_POLL_MS	5
3654bb7662bSHans Petter Selasky static void health_recover(struct work_struct *work)
3664bb7662bSHans Petter Selasky {
3671900b6f8SHans Petter Selasky 	unsigned long end = jiffies + msecs_to_jiffies(MLX5_FW_RESET_WAIT_MS);
3684bb7662bSHans Petter Selasky 	struct mlx5_core_health *health;
3694bb7662bSHans Petter Selasky 	struct delayed_work *dwork;
3704bb7662bSHans Petter Selasky 	struct mlx5_core_dev *dev;
3714bb7662bSHans Petter Selasky 	struct mlx5_priv *priv;
372f20b553dSHans Petter Selasky 	bool recover = true;
3731900b6f8SHans Petter Selasky 	u8 nic_mode;
3744bb7662bSHans Petter Selasky 
3754bb7662bSHans Petter Selasky 	dwork = container_of(work, struct delayed_work, work);
3764bb7662bSHans Petter Selasky 	health = container_of(dwork, struct mlx5_core_health, recover_work);
3774bb7662bSHans Petter Selasky 	priv = container_of(health, struct mlx5_priv, health);
3784bb7662bSHans Petter Selasky 	dev = container_of(priv, struct mlx5_core_dev, priv);
3794bb7662bSHans Petter Selasky 
380c6df6f53SWarner Losh 	/* This might likely be wrong, cut and paste from elsewhere? */
381c6df6f53SWarner Losh 	bus_topo_lock();
382ca2345a0SHans Petter Selasky 
3831900b6f8SHans Petter Selasky 	if (sensor_pci_no_comm(dev)) {
384a2f4f59cSHans Petter Selasky 		mlx5_core_err(dev,
385a2f4f59cSHans Petter Selasky 		    "health recovery flow aborted, PCI reads still not working\n");
386f20b553dSHans Petter Selasky 		recover = false;
3871900b6f8SHans Petter Selasky 	}
3881900b6f8SHans Petter Selasky 
389ba11bcecSHans Petter Selasky 	nic_mode = mlx5_get_nic_state(dev);
3901900b6f8SHans Petter Selasky 	while (nic_mode != MLX5_NIC_IFC_DISABLED &&
3911900b6f8SHans Petter Selasky 	       !time_after(jiffies, end)) {
3921900b6f8SHans Petter Selasky 		msleep(MLX5_NIC_STATE_POLL_MS);
393ba11bcecSHans Petter Selasky 		nic_mode = mlx5_get_nic_state(dev);
3941900b6f8SHans Petter Selasky 	}
3951900b6f8SHans Petter Selasky 
3961900b6f8SHans Petter Selasky 	if (nic_mode != MLX5_NIC_IFC_DISABLED) {
397a2f4f59cSHans Petter Selasky 		mlx5_core_err(dev,
398a2f4f59cSHans Petter Selasky 		    "health recovery flow aborted, unexpected NIC IFC mode %d.\n",
3991900b6f8SHans Petter Selasky 		    nic_mode);
400f20b553dSHans Petter Selasky 		recover = false;
4014bb7662bSHans Petter Selasky 	}
4024bb7662bSHans Petter Selasky 
403f20b553dSHans Petter Selasky 	if (recover) {
404a2f4f59cSHans Petter Selasky 		mlx5_core_info(dev, "Starting health recovery flow\n");
4054bb7662bSHans Petter Selasky 		mlx5_recover_device(dev);
4064bb7662bSHans Petter Selasky 	}
407ca2345a0SHans Petter Selasky 
408c6df6f53SWarner Losh 	bus_topo_unlock();
409f20b553dSHans Petter Selasky }
4104bb7662bSHans Petter Selasky 
4114bb7662bSHans Petter Selasky /* How much time to wait until health resetting the driver (in msecs) */
4124bb7662bSHans Petter Selasky #define MLX5_RECOVERY_DELAY_MSECS 60000
4131900b6f8SHans Petter Selasky #define MLX5_RECOVERY_NO_DELAY 0
4141900b6f8SHans Petter Selasky static unsigned long get_recovery_delay(struct mlx5_core_dev *dev)
4151900b6f8SHans Petter Selasky {
4161900b6f8SHans Petter Selasky 	return dev->priv.health.fatal_error == MLX5_SENSOR_PCI_ERR ||
4171900b6f8SHans Petter Selasky 		dev->priv.health.fatal_error == MLX5_SENSOR_PCI_COMM_ERR	?
4181900b6f8SHans Petter Selasky 		MLX5_RECOVERY_DELAY_MSECS : MLX5_RECOVERY_NO_DELAY;
4191900b6f8SHans Petter Selasky }
4201900b6f8SHans Petter Selasky 
421dc7e38acSHans Petter Selasky static void health_care(struct work_struct *work)
422dc7e38acSHans Petter Selasky {
423a2485fe5SHans Petter Selasky 	struct mlx5_core_health *health;
4241900b6f8SHans Petter Selasky 	unsigned long recover_delay;
425dc7e38acSHans Petter Selasky 	struct mlx5_core_dev *dev;
426dc7e38acSHans Petter Selasky 	struct mlx5_priv *priv;
4274bb7662bSHans Petter Selasky 	unsigned long flags;
428dc7e38acSHans Petter Selasky 
429a2485fe5SHans Petter Selasky 	health = container_of(work, struct mlx5_core_health, work);
430dc7e38acSHans Petter Selasky 	priv = container_of(health, struct mlx5_priv, health);
431dc7e38acSHans Petter Selasky 	dev = container_of(priv, struct mlx5_core_dev, priv);
432f20b553dSHans Petter Selasky 
433dc7e38acSHans Petter Selasky 	mlx5_core_warn(dev, "handling bad device here\n");
434a2485fe5SHans Petter Selasky 	mlx5_handle_bad_state(dev);
4351900b6f8SHans Petter Selasky 	recover_delay = msecs_to_jiffies(get_recovery_delay(dev));
4364bb7662bSHans Petter Selasky 
4374bb7662bSHans Petter Selasky 	spin_lock_irqsave(&health->wq_lock, flags);
438fe242ba7SHans Petter Selasky 	if (!test_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags)) {
439a2f4f59cSHans Petter Selasky 		mlx5_core_warn(dev,
440a2f4f59cSHans Petter Selasky 		    "Scheduling recovery work with %lums delay\n",
441fe242ba7SHans Petter Selasky 		    recover_delay);
4424bb7662bSHans Petter Selasky 		schedule_delayed_work(&health->recover_work, recover_delay);
443fe242ba7SHans Petter Selasky 	} else {
444a2f4f59cSHans Petter Selasky 		mlx5_core_err(dev,
4454bb7662bSHans Petter Selasky 		    "new health works are not permitted at this stage\n");
446fe242ba7SHans Petter Selasky 	}
4474bb7662bSHans Petter Selasky 	spin_unlock_irqrestore(&health->wq_lock, flags);
448dc7e38acSHans Petter Selasky }
449a2485fe5SHans Petter Selasky 
450a2485fe5SHans Petter Selasky static int get_next_poll_jiffies(void)
451a2485fe5SHans Petter Selasky {
452a2485fe5SHans Petter Selasky 	unsigned long next;
453a2485fe5SHans Petter Selasky 
454a2485fe5SHans Petter Selasky 	get_random_bytes(&next, sizeof(next));
455a2485fe5SHans Petter Selasky 	next %= HZ;
456a2485fe5SHans Petter Selasky 	next += jiffies + MLX5_HEALTH_POLL_INTERVAL;
457a2485fe5SHans Petter Selasky 
458a2485fe5SHans Petter Selasky 	return next;
459dc7e38acSHans Petter Selasky }
460dc7e38acSHans Petter Selasky 
4614bb7662bSHans Petter Selasky void mlx5_trigger_health_work(struct mlx5_core_dev *dev)
4624bb7662bSHans Petter Selasky {
4634bb7662bSHans Petter Selasky 	struct mlx5_core_health *health = &dev->priv.health;
4644bb7662bSHans Petter Selasky 	unsigned long flags;
4654bb7662bSHans Petter Selasky 
4664bb7662bSHans Petter Selasky 	spin_lock_irqsave(&health->wq_lock, flags);
4674bb7662bSHans Petter Selasky 	if (!test_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags))
4684bb7662bSHans Petter Selasky 		queue_work(health->wq, &health->work);
4694bb7662bSHans Petter Selasky 	else
470a2f4f59cSHans Petter Selasky 		mlx5_core_err(dev,
4714bb7662bSHans Petter Selasky 			"new health works are not permitted at this stage\n");
4724bb7662bSHans Petter Selasky 	spin_unlock_irqrestore(&health->wq_lock, flags);
4734bb7662bSHans Petter Selasky }
4744bb7662bSHans Petter Selasky 
475dc7e38acSHans Petter Selasky static const char *hsynd_str(u8 synd)
476dc7e38acSHans Petter Selasky {
477dc7e38acSHans Petter Selasky 	switch (synd) {
478dc7e38acSHans Petter Selasky 	case MLX5_HEALTH_SYNDR_FW_ERR:
479dc7e38acSHans Petter Selasky 		return "firmware internal error";
480dc7e38acSHans Petter Selasky 	case MLX5_HEALTH_SYNDR_IRISC_ERR:
481dc7e38acSHans Petter Selasky 		return "irisc not responding";
482a2485fe5SHans Petter Selasky 	case MLX5_HEALTH_SYNDR_HW_UNRECOVERABLE_ERR:
483a2485fe5SHans Petter Selasky 		return "unrecoverable hardware error";
484dc7e38acSHans Petter Selasky 	case MLX5_HEALTH_SYNDR_CRC_ERR:
485dc7e38acSHans Petter Selasky 		return "firmware CRC error";
486dc7e38acSHans Petter Selasky 	case MLX5_HEALTH_SYNDR_FETCH_PCI_ERR:
487dc7e38acSHans Petter Selasky 		return "ICM fetch PCI error";
488dc7e38acSHans Petter Selasky 	case MLX5_HEALTH_SYNDR_HW_FTL_ERR:
489dc7e38acSHans Petter Selasky 		return "HW fatal error\n";
490dc7e38acSHans Petter Selasky 	case MLX5_HEALTH_SYNDR_ASYNC_EQ_OVERRUN_ERR:
491dc7e38acSHans Petter Selasky 		return "async EQ buffer overrun";
492dc7e38acSHans Petter Selasky 	case MLX5_HEALTH_SYNDR_EQ_ERR:
493dc7e38acSHans Petter Selasky 		return "EQ error";
494a2485fe5SHans Petter Selasky 	case MLX5_HEALTH_SYNDR_EQ_INV:
495a2485fe5SHans Petter Selasky 		return "Invalid EQ referenced";
496dc7e38acSHans Petter Selasky 	case MLX5_HEALTH_SYNDR_FFSER_ERR:
497dc7e38acSHans Petter Selasky 		return "FFSER error";
498a2485fe5SHans Petter Selasky 	case MLX5_HEALTH_SYNDR_HIGH_TEMP:
49987b3c8ccSHans Petter Selasky 		return "High temperature";
500dc7e38acSHans Petter Selasky 	default:
501dc7e38acSHans Petter Selasky 		return "unrecognized error";
502dc7e38acSHans Petter Selasky 	}
503dc7e38acSHans Petter Selasky }
504dc7e38acSHans Petter Selasky 
505c9bb26aeSHans Petter Selasky static u8
506c9bb26aeSHans Petter Selasky print_health_info(struct mlx5_core_dev *dev)
507dc7e38acSHans Petter Selasky {
508dc7e38acSHans Petter Selasky 	struct mlx5_core_health *health = &dev->priv.health;
509dc7e38acSHans Petter Selasky 	struct mlx5_health_buffer __iomem *h = health->health;
510c9bb26aeSHans Petter Selasky 	u8 synd = ioread8(&h->synd);
511a2485fe5SHans Petter Selasky 	char fw_str[18];
512a2485fe5SHans Petter Selasky 	u32 fw;
513dc7e38acSHans Petter Selasky 	int i;
514dc7e38acSHans Petter Selasky 
515c9bb26aeSHans Petter Selasky 	/*
516c9bb26aeSHans Petter Selasky 	 * If synd is 0x0 - this indicates that FW is unable to
517c9bb26aeSHans Petter Selasky 	 * respond to initialization segment reads and health buffer
518c9bb26aeSHans Petter Selasky 	 * should not be read.
519c9bb26aeSHans Petter Selasky 	 */
520c9bb26aeSHans Petter Selasky 	if (synd == 0)
521c9bb26aeSHans Petter Selasky 		return (0);
522dc7e38acSHans Petter Selasky 
523a2485fe5SHans Petter Selasky 	for (i = 0; i < ARRAY_SIZE(h->assert_var); i++)
524a2f4f59cSHans Petter Selasky 		mlx5_core_info(dev, "assert_var[%d] 0x%08x\n", i,
525a2f4f59cSHans Petter Selasky 		    ioread32be(h->assert_var + i));
526a2485fe5SHans Petter Selasky 
527a2f4f59cSHans Petter Selasky 	mlx5_core_info(dev, "assert_exit_ptr 0x%08x\n",
528a2f4f59cSHans Petter Selasky 	    ioread32be(&h->assert_exit_ptr));
529a2f4f59cSHans Petter Selasky 	mlx5_core_info(dev, "assert_callra 0x%08x\n",
530a2f4f59cSHans Petter Selasky 	    ioread32be(&h->assert_callra));
531a2f4f59cSHans Petter Selasky 	snprintf(fw_str, sizeof(fw_str), "%d.%d.%d",
532a2f4f59cSHans Petter Selasky 	    fw_rev_maj(dev), fw_rev_min(dev), fw_rev_sub(dev));
533a2f4f59cSHans Petter Selasky 	mlx5_core_info(dev, "fw_ver %s\n", fw_str);
534a2f4f59cSHans Petter Selasky 	mlx5_core_info(dev, "hw_id 0x%08x\n", ioread32be(&h->hw_id));
535a2f4f59cSHans Petter Selasky 	mlx5_core_info(dev, "irisc_index %d\n", ioread8(&h->irisc_index));
536a2f4f59cSHans Petter Selasky 	mlx5_core_info(dev, "synd 0x%x: %s\n",
537a2f4f59cSHans Petter Selasky 	    ioread8(&h->synd), hsynd_str(ioread8(&h->synd)));
538a2f4f59cSHans Petter Selasky 	mlx5_core_info(dev, "ext_synd 0x%04x\n", ioread16be(&h->ext_synd));
539a2485fe5SHans Petter Selasky 	fw = ioread32be(&h->fw_ver);
540a2f4f59cSHans Petter Selasky 	mlx5_core_info(dev, "raw fw_ver 0x%08x\n", fw);
541c9bb26aeSHans Petter Selasky 
542c9bb26aeSHans Petter Selasky 	return synd;
543dc7e38acSHans Petter Selasky }
544dc7e38acSHans Petter Selasky 
545adb6fd50SHans Petter Selasky static void health_watchdog(struct work_struct *work)
546adb6fd50SHans Petter Selasky {
547adb6fd50SHans Petter Selasky 	struct mlx5_core_dev *dev;
548adb6fd50SHans Petter Selasky 	u16 power;
549adb6fd50SHans Petter Selasky 	u8 status;
550adb6fd50SHans Petter Selasky 	int err;
551adb6fd50SHans Petter Selasky 
552adb6fd50SHans Petter Selasky 	dev = container_of(work, struct mlx5_core_dev, priv.health.work_watchdog);
553adb6fd50SHans Petter Selasky 
554adb6fd50SHans Petter Selasky 	if (!MLX5_CAP_GEN(dev, mcam_reg) ||
555adb6fd50SHans Petter Selasky 	    !MLX5_CAP_MCAM_FEATURE(dev, pcie_status_and_power))
556adb6fd50SHans Petter Selasky 		return;
557adb6fd50SHans Petter Selasky 
558adb6fd50SHans Petter Selasky 	err = mlx5_pci_read_power_status(dev, &power, &status);
559adb6fd50SHans Petter Selasky 	if (err < 0) {
560a2f4f59cSHans Petter Selasky 		mlx5_core_warn(dev, "Failed reading power status: %d\n",
561a2f4f59cSHans Petter Selasky 		    err);
562adb6fd50SHans Petter Selasky 		return;
563adb6fd50SHans Petter Selasky 	}
564adb6fd50SHans Petter Selasky 
565adb6fd50SHans Petter Selasky 	dev->pwr_value = power;
566adb6fd50SHans Petter Selasky 
567adb6fd50SHans Petter Selasky 	if (dev->pwr_status != status) {
568adb6fd50SHans Petter Selasky 
569adb6fd50SHans Petter Selasky 		switch (status) {
570adb6fd50SHans Petter Selasky 		case 0:
571adb6fd50SHans Petter Selasky 			dev->pwr_status = status;
572a2f4f59cSHans Petter Selasky 			mlx5_core_info(dev,
573a2f4f59cSHans Petter Selasky 			    "PCI power is not published by the PCIe slot.\n");
574adb6fd50SHans Petter Selasky 			break;
575adb6fd50SHans Petter Selasky 		case 1:
576adb6fd50SHans Petter Selasky 			dev->pwr_status = status;
577a2f4f59cSHans Petter Selasky 			mlx5_core_info(dev,
578a2f4f59cSHans Petter Selasky 			    "PCIe slot advertised sufficient power (%uW).\n",
579a2f4f59cSHans Petter Selasky 			    power);
580adb6fd50SHans Petter Selasky 			break;
581adb6fd50SHans Petter Selasky 		case 2:
582adb6fd50SHans Petter Selasky 			dev->pwr_status = status;
583a2f4f59cSHans Petter Selasky 			mlx5_core_warn(dev,
584a2f4f59cSHans Petter Selasky 			    "Detected insufficient power on the PCIe slot (%uW).\n",
585a2f4f59cSHans Petter Selasky 			    power);
586adb6fd50SHans Petter Selasky 			break;
587adb6fd50SHans Petter Selasky 		default:
588adb6fd50SHans Petter Selasky 			dev->pwr_status = 0;
589a2f4f59cSHans Petter Selasky 			mlx5_core_warn(dev,
590a2f4f59cSHans Petter Selasky 			    "Unknown power state detected(%d).\n",
591a2f4f59cSHans Petter Selasky 			    status);
592adb6fd50SHans Petter Selasky 			break;
593adb6fd50SHans Petter Selasky 		}
594adb6fd50SHans Petter Selasky 	}
595adb6fd50SHans Petter Selasky }
596adb6fd50SHans Petter Selasky 
597adb6fd50SHans Petter Selasky void
598adb6fd50SHans Petter Selasky mlx5_trigger_health_watchdog(struct mlx5_core_dev *dev)
599adb6fd50SHans Petter Selasky {
600adb6fd50SHans Petter Selasky 	struct mlx5_core_health *health = &dev->priv.health;
601adb6fd50SHans Petter Selasky 	unsigned long flags;
602adb6fd50SHans Petter Selasky 
603adb6fd50SHans Petter Selasky 	spin_lock_irqsave(&health->wq_lock, flags);
604adb6fd50SHans Petter Selasky 	if (!test_bit(MLX5_DROP_NEW_WATCHDOG_WORK, &health->flags))
605adb6fd50SHans Petter Selasky 		queue_work(health->wq_watchdog, &health->work_watchdog);
606adb6fd50SHans Petter Selasky 	else
607a2f4f59cSHans Petter Selasky 		mlx5_core_err(dev,
608adb6fd50SHans Petter Selasky 		    "scheduling watchdog is not permitted at this stage\n");
609adb6fd50SHans Petter Selasky 	spin_unlock_irqrestore(&health->wq_lock, flags);
610adb6fd50SHans Petter Selasky }
611adb6fd50SHans Petter Selasky 
61203ab395eSHans Petter Selasky static void poll_health(unsigned long data)
613dc7e38acSHans Petter Selasky {
614dc7e38acSHans Petter Selasky 	struct mlx5_core_dev *dev = (struct mlx5_core_dev *)data;
615dc7e38acSHans Petter Selasky 	struct mlx5_core_health *health = &dev->priv.health;
6161900b6f8SHans Petter Selasky 	u32 fatal_error;
617dc7e38acSHans Petter Selasky 	u32 count;
618dc7e38acSHans Petter Selasky 
61930dfc051SHans Petter Selasky 	if (dev->state != MLX5_DEVICE_STATE_UP)
62030dfc051SHans Petter Selasky 		return;
62130dfc051SHans Petter Selasky 
622dc7e38acSHans Petter Selasky 	count = ioread32be(health->health_counter);
623dc7e38acSHans Petter Selasky 	if (count == health->prev)
624dc7e38acSHans Petter Selasky 		++health->miss_counter;
625dc7e38acSHans Petter Selasky 	else
626dc7e38acSHans Petter Selasky 		health->miss_counter = 0;
627dc7e38acSHans Petter Selasky 
628dc7e38acSHans Petter Selasky 	health->prev = count;
629dc7e38acSHans Petter Selasky 	if (health->miss_counter == MAX_MISSES) {
630a2485fe5SHans Petter Selasky 		mlx5_core_err(dev, "device's health compromised - reached miss count\n");
631c9bb26aeSHans Petter Selasky 		if (print_health_info(dev) == 0)
632c9bb26aeSHans Petter Selasky 			mlx5_core_err(dev, "FW is unable to respond to initialization segment reads\n");
633a2485fe5SHans Petter Selasky 	}
634a2485fe5SHans Petter Selasky 
6351900b6f8SHans Petter Selasky 	fatal_error = check_fatal_sensors(dev);
6361900b6f8SHans Petter Selasky 
6371900b6f8SHans Petter Selasky 	if (fatal_error && !health->fatal_error) {
638a2f4f59cSHans Petter Selasky 		mlx5_core_err(dev,
639a2f4f59cSHans Petter Selasky 		    "Fatal error %u detected\n", fatal_error);
6401900b6f8SHans Petter Selasky 		dev->priv.health.fatal_error = fatal_error;
641a2485fe5SHans Petter Selasky 		print_health_info(dev);
6424bb7662bSHans Petter Selasky 		mlx5_trigger_health_work(dev);
643dc7e38acSHans Petter Selasky 	}
6444bb7662bSHans Petter Selasky 
6454bb7662bSHans Petter Selasky 	mod_timer(&health->timer, get_next_poll_jiffies());
646dc7e38acSHans Petter Selasky }
647dc7e38acSHans Petter Selasky 
648dc7e38acSHans Petter Selasky void mlx5_start_health_poll(struct mlx5_core_dev *dev)
649dc7e38acSHans Petter Selasky {
650dc7e38acSHans Petter Selasky 	struct mlx5_core_health *health = &dev->priv.health;
651dc7e38acSHans Petter Selasky 
652dc7e38acSHans Petter Selasky 	init_timer(&health->timer);
6531900b6f8SHans Petter Selasky 	health->fatal_error = MLX5_SENSOR_NO_ERR;
654ca551594SHans Petter Selasky 	clear_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
655519774eaSHans Petter Selasky 	clear_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags);
656adb6fd50SHans Petter Selasky 	clear_bit(MLX5_DROP_NEW_WATCHDOG_WORK, &health->flags);
657dc7e38acSHans Petter Selasky 	health->health = &dev->iseg->health;
658dc7e38acSHans Petter Selasky 	health->health_counter = &dev->iseg->health_counter;
659dc7e38acSHans Petter Selasky 
66003ab395eSHans Petter Selasky 	setup_timer(&health->timer, poll_health, (unsigned long)dev);
661dc7e38acSHans Petter Selasky 	mod_timer(&health->timer,
662dc7e38acSHans Petter Selasky 		  round_jiffies(jiffies + MLX5_HEALTH_POLL_INTERVAL));
663adb6fd50SHans Petter Selasky 
664adb6fd50SHans Petter Selasky 	/* do initial PCI power state readout */
665adb6fd50SHans Petter Selasky 	mlx5_trigger_health_watchdog(dev);
666dc7e38acSHans Petter Selasky }
667dc7e38acSHans Petter Selasky 
6682119f825SSlava Shwartsman void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health)
669dc7e38acSHans Petter Selasky {
670dc7e38acSHans Petter Selasky 	struct mlx5_core_health *health = &dev->priv.health;
6712119f825SSlava Shwartsman 	unsigned long flags;
6722119f825SSlava Shwartsman 
6732119f825SSlava Shwartsman 	if (disable_health) {
6742119f825SSlava Shwartsman 		spin_lock_irqsave(&health->wq_lock, flags);
6752119f825SSlava Shwartsman 		set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
6762119f825SSlava Shwartsman 		set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags);
677adb6fd50SHans Petter Selasky 		set_bit(MLX5_DROP_NEW_WATCHDOG_WORK, &health->flags);
6782119f825SSlava Shwartsman 		spin_unlock_irqrestore(&health->wq_lock, flags);
6792119f825SSlava Shwartsman 	}
680dc7e38acSHans Petter Selasky 
681dc7e38acSHans Petter Selasky 	del_timer_sync(&health->timer);
682dc7e38acSHans Petter Selasky }
683dc7e38acSHans Petter Selasky 
684ca551594SHans Petter Selasky void mlx5_drain_health_wq(struct mlx5_core_dev *dev)
685ca551594SHans Petter Selasky {
686ca551594SHans Petter Selasky 	struct mlx5_core_health *health = &dev->priv.health;
6874bb7662bSHans Petter Selasky 	unsigned long flags;
688ca551594SHans Petter Selasky 
6894bb7662bSHans Petter Selasky 	spin_lock_irqsave(&health->wq_lock, flags);
690ca551594SHans Petter Selasky 	set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
691519774eaSHans Petter Selasky 	set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags);
692adb6fd50SHans Petter Selasky 	set_bit(MLX5_DROP_NEW_WATCHDOG_WORK, &health->flags);
6934bb7662bSHans Petter Selasky 	spin_unlock_irqrestore(&health->wq_lock, flags);
6944bb7662bSHans Petter Selasky 	cancel_delayed_work_sync(&health->recover_work);
695ca551594SHans Petter Selasky 	cancel_work_sync(&health->work);
696adb6fd50SHans Petter Selasky 	cancel_work_sync(&health->work_watchdog);
697ca551594SHans Petter Selasky }
698ca551594SHans Petter Selasky 
699519774eaSHans Petter Selasky void mlx5_drain_health_recovery(struct mlx5_core_dev *dev)
700519774eaSHans Petter Selasky {
701519774eaSHans Petter Selasky 	struct mlx5_core_health *health = &dev->priv.health;
702519774eaSHans Petter Selasky 	unsigned long flags;
703519774eaSHans Petter Selasky 
704519774eaSHans Petter Selasky 	spin_lock_irqsave(&health->wq_lock, flags);
705519774eaSHans Petter Selasky 	set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags);
706519774eaSHans Petter Selasky 	spin_unlock_irqrestore(&health->wq_lock, flags);
707519774eaSHans Petter Selasky 	cancel_delayed_work_sync(&dev->priv.health.recover_work);
708519774eaSHans Petter Selasky }
709519774eaSHans Petter Selasky 
710a2485fe5SHans Petter Selasky void mlx5_health_cleanup(struct mlx5_core_dev *dev)
711dc7e38acSHans Petter Selasky {
712a2485fe5SHans Petter Selasky 	struct mlx5_core_health *health = &dev->priv.health;
713a2485fe5SHans Petter Selasky 
714a2485fe5SHans Petter Selasky 	destroy_workqueue(health->wq);
71540218d73SHans Petter Selasky 	destroy_workqueue(health->wq_watchdog);
7168d1eeedbSHans Petter Selasky 	destroy_workqueue(health->wq_cmd);
717dc7e38acSHans Petter Selasky }
718dc7e38acSHans Petter Selasky 
719a2485fe5SHans Petter Selasky int mlx5_health_init(struct mlx5_core_dev *dev)
720dc7e38acSHans Petter Selasky {
721a2485fe5SHans Petter Selasky 	struct mlx5_core_health *health;
72240218d73SHans Petter Selasky 	char name[64];
723dc7e38acSHans Petter Selasky 
724a2485fe5SHans Petter Selasky 	health = &dev->priv.health;
725a2485fe5SHans Petter Selasky 
72640218d73SHans Petter Selasky 	snprintf(name, sizeof(name), "%s-rec", dev_name(&dev->pdev->dev));
727a2485fe5SHans Petter Selasky 	health->wq = create_singlethread_workqueue(name);
728a2485fe5SHans Petter Selasky 	if (!health->wq)
7298d1eeedbSHans Petter Selasky 		goto err_recovery;
730a2485fe5SHans Petter Selasky 
73140218d73SHans Petter Selasky 	snprintf(name, sizeof(name), "%s-wdg", dev_name(&dev->pdev->dev));
73240218d73SHans Petter Selasky 	health->wq_watchdog = create_singlethread_workqueue(name);
7338d1eeedbSHans Petter Selasky 	if (!health->wq_watchdog)
7348d1eeedbSHans Petter Selasky 		goto err_watchdog;
7358d1eeedbSHans Petter Selasky 
7368d1eeedbSHans Petter Selasky 	snprintf(name, sizeof(name), "%s-cmd", dev_name(&dev->pdev->dev));
7378d1eeedbSHans Petter Selasky 	health->wq_cmd = create_singlethread_workqueue(name);
7388d1eeedbSHans Petter Selasky 	if (!health->wq_cmd)
7398d1eeedbSHans Petter Selasky 		goto err_cmd;
74040218d73SHans Petter Selasky 
741ca551594SHans Petter Selasky 	spin_lock_init(&health->wq_lock);
742a2485fe5SHans Petter Selasky 	INIT_WORK(&health->work, health_care);
743adb6fd50SHans Petter Selasky 	INIT_WORK(&health->work_watchdog, health_watchdog);
744a0a4fd77SHans Petter Selasky 	INIT_WORK(&health->work_cmd_completion, mlx5_trigger_cmd_completions);
7454bb7662bSHans Petter Selasky 	INIT_DELAYED_WORK(&health->recover_work, health_recover);
746a2485fe5SHans Petter Selasky 
747a2485fe5SHans Petter Selasky 	return 0;
7488d1eeedbSHans Petter Selasky 
7498d1eeedbSHans Petter Selasky err_cmd:
7508d1eeedbSHans Petter Selasky 	destroy_workqueue(health->wq_watchdog);
7518d1eeedbSHans Petter Selasky err_watchdog:
7528d1eeedbSHans Petter Selasky 	destroy_workqueue(health->wq);
7538d1eeedbSHans Petter Selasky err_recovery:
7548d1eeedbSHans Petter Selasky 	return -ENOMEM;
755dc7e38acSHans Petter Selasky }
756