xref: /freebsd/sys/dev/mlx5/mlx5_core/mlx5_health.c (revision 95ee2897)
/*-
 * Copyright (c) 2013-2019, Mellanox Technologies, Ltd.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
25dc7e38acSHans Petter Selasky 
26ee9d634bSKonstantin Belousov #include "opt_rss.h"
27ee9d634bSKonstantin Belousov #include "opt_ratelimit.h"
28ee9d634bSKonstantin Belousov 
29dc7e38acSHans Petter Selasky #include <linux/kernel.h>
30dc7e38acSHans Petter Selasky #include <linux/module.h>
31dc7e38acSHans Petter Selasky #include <linux/random.h>
32dc7e38acSHans Petter Selasky #include <linux/vmalloc.h>
33a2485fe5SHans Petter Selasky #include <linux/hardirq.h>
341900b6f8SHans Petter Selasky #include <linux/delay.h>
35dc7e38acSHans Petter Selasky #include <dev/mlx5/driver.h>
36dc7e38acSHans Petter Selasky #include <dev/mlx5/mlx5_ifc.h>
3712c56d7dSHans Petter Selasky #include <dev/mlx5/mlx5_core/mlx5_core.h>
38dc7e38acSHans Petter Selasky 
/* Base distance, in ticks, between two health polls (random jitter is added on top). */
#define	MLX5_HEALTH_POLL_INTERVAL	(2 * HZ)
/* Miss threshold; NOTE(review): its consumer is not visible in this part of the file. */
#define	MAX_MISSES			3
41dc7e38acSHans Petter Selasky 
/*
 * Bit numbers used with test_bit() on mlx5_core_health.flags.  The health
 * and recovery paths in this file refuse to queue new work while the
 * corresponding bit is set.
 */
enum {
	MLX5_DROP_NEW_HEALTH_WORK = 0,
	MLX5_DROP_NEW_RECOVERY_WORK = 1,
	MLX5_DROP_NEW_WATCHDOG_WORK = 2,
};
47ca551594SHans Petter Selasky 
/* Result codes of check_fatal_sensors(); zero means no fatal condition. */
enum {
	MLX5_SENSOR_NO_ERR = 0,		/* nothing fatal detected */
	MLX5_SENSOR_PCI_COMM_ERR = 1,	/* health buffer fw_ver reads 0xffffffff */
	MLX5_SENSOR_PCI_ERR = 2,	/* pci_channel_offline() is true */
	MLX5_SENSOR_NIC_DISABLED = 3,	/* NIC state is MLX5_NIC_IFC_DISABLED */
	MLX5_SENSOR_NIC_SW_RESET = 4,	/* NIC state is MLX5_NIC_IFC_SW_RESET */
	MLX5_SENSOR_FW_SYND_RFR = 5,	/* rfr and syndrome are both non-zero */
};
561900b6f8SHans Petter Selasky 
5729e54451SSlava Shwartsman static int mlx5_fw_reset_enable = 1;
5829e54451SSlava Shwartsman SYSCTL_INT(_hw_mlx5, OID_AUTO, fw_reset_enable, CTLFLAG_RWTUN,
5929e54451SSlava Shwartsman     &mlx5_fw_reset_enable, 0,
6029e54451SSlava Shwartsman     "Enable firmware reset");
6129e54451SSlava Shwartsman 
625169fb81SHans Petter Selasky static unsigned int sw_reset_to = 1200;
635169fb81SHans Petter Selasky SYSCTL_UINT(_hw_mlx5, OID_AUTO, sw_reset_timeout, CTLFLAG_RWTUN,
645169fb81SHans Petter Selasky     &sw_reset_to, 0,
655169fb81SHans Petter Selasky     "Minimum timeout in seconds between two firmware resets");
665169fb81SHans Petter Selasky 
675169fb81SHans Petter Selasky 
lock_sem_sw_reset(struct mlx5_core_dev * dev)68b575d8c8SHans Petter Selasky static int lock_sem_sw_reset(struct mlx5_core_dev *dev)
69f20b553dSHans Petter Selasky {
70b575d8c8SHans Petter Selasky 	int ret;
71f20b553dSHans Petter Selasky 
72f20b553dSHans Petter Selasky 	/* Lock GW access */
73b575d8c8SHans Petter Selasky 	ret = -mlx5_vsc_lock(dev);
74f20b553dSHans Petter Selasky 	if (ret) {
75b575d8c8SHans Petter Selasky 		mlx5_core_warn(dev, "Timed out locking gateway %d\n", ret);
76f20b553dSHans Petter Selasky 		return ret;
77f20b553dSHans Petter Selasky 	}
78f20b553dSHans Petter Selasky 
79b575d8c8SHans Petter Selasky 	ret = -mlx5_vsc_lock_addr_space(dev, MLX5_SEMAPHORE_SW_RESET);
80b575d8c8SHans Petter Selasky 	if (ret) {
81f20b553dSHans Petter Selasky 		if (ret == -EBUSY)
82a2f4f59cSHans Petter Selasky 			mlx5_core_dbg(dev,
83a2f4f59cSHans Petter Selasky 			    "SW reset FW semaphore already locked, another function will handle the reset\n");
84f20b553dSHans Petter Selasky 		else
85a2f4f59cSHans Petter Selasky 			mlx5_core_warn(dev,
86a2f4f59cSHans Petter Selasky 			    "SW reset semaphore lock return %d\n", ret);
87f20b553dSHans Petter Selasky 	}
88f20b553dSHans Petter Selasky 
89f20b553dSHans Petter Selasky 	/* Unlock GW access */
90b575d8c8SHans Petter Selasky 	mlx5_vsc_unlock(dev);
91b575d8c8SHans Petter Selasky 
92b575d8c8SHans Petter Selasky 	return ret;
93b575d8c8SHans Petter Selasky }
94b575d8c8SHans Petter Selasky 
unlock_sem_sw_reset(struct mlx5_core_dev * dev)95b575d8c8SHans Petter Selasky static int unlock_sem_sw_reset(struct mlx5_core_dev *dev)
96b575d8c8SHans Petter Selasky {
97b575d8c8SHans Petter Selasky 	int ret;
98b575d8c8SHans Petter Selasky 
99b575d8c8SHans Petter Selasky 	/* Lock GW access */
100b575d8c8SHans Petter Selasky 	ret = -mlx5_vsc_lock(dev);
101b575d8c8SHans Petter Selasky 	if (ret) {
102b575d8c8SHans Petter Selasky 		mlx5_core_warn(dev, "Timed out locking gateway %d\n", ret);
103b575d8c8SHans Petter Selasky 		return ret;
104b575d8c8SHans Petter Selasky 	}
105b575d8c8SHans Petter Selasky 
106b575d8c8SHans Petter Selasky 	ret = -mlx5_vsc_unlock_addr_space(dev, MLX5_SEMAPHORE_SW_RESET);
107b575d8c8SHans Petter Selasky 
108b575d8c8SHans Petter Selasky 	/* Unlock GW access */
109b575d8c8SHans Petter Selasky 	mlx5_vsc_unlock(dev);
110f20b553dSHans Petter Selasky 
111f20b553dSHans Petter Selasky 	return ret;
112f20b553dSHans Petter Selasky }
113f20b553dSHans Petter Selasky 
mlx5_get_nic_state(struct mlx5_core_dev * dev)114ba11bcecSHans Petter Selasky u8 mlx5_get_nic_state(struct mlx5_core_dev *dev)
115a2485fe5SHans Petter Selasky {
1161900b6f8SHans Petter Selasky 	return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 7;
117a2485fe5SHans Petter Selasky }
118a2485fe5SHans Petter Selasky 
/*
 * Write a new NIC interface state.
 *
 * The state shares a register with the command queue address, so the
 * register is read first; its upper 20 bits are written back unchanged
 * and the low 12 bits are replaced by "state" shifted to
 * MLX5_NIC_IFC_OFFSET.
 */
void mlx5_set_nic_state(struct mlx5_core_dev *dev, u8 state)
{
	u32 reg = ioread32be(&dev->iseg->cmdq_addr_l_sz);
	u32 val = (reg & 0xFFFFF000) | state << MLX5_NIC_IFC_OFFSET;

	iowrite32be(val, &dev->iseg->cmdq_addr_l_sz);
}
128ba11bcecSHans Petter Selasky 
sensor_fw_synd_rfr(struct mlx5_core_dev * dev)129fe242ba7SHans Petter Selasky static bool sensor_fw_synd_rfr(struct mlx5_core_dev *dev)
130fe242ba7SHans Petter Selasky {
131fe242ba7SHans Petter Selasky 	struct mlx5_core_health *health = &dev->priv.health;
132fe242ba7SHans Petter Selasky 	struct mlx5_health_buffer __iomem *h = health->health;
133fe242ba7SHans Petter Selasky 	u32 rfr = ioread32be(&h->rfr) >> MLX5_RFR_OFFSET;
134fe242ba7SHans Petter Selasky 	u8 synd = ioread8(&h->synd);
135fe242ba7SHans Petter Selasky 
136fe242ba7SHans Petter Selasky 	if (rfr && synd)
137fe242ba7SHans Petter Selasky 		mlx5_core_dbg(dev, "FW requests reset, synd: %d\n", synd);
138fe242ba7SHans Petter Selasky 	return rfr && synd;
139fe242ba7SHans Petter Selasky }
140fe242ba7SHans Petter Selasky 
mlx5_trigger_cmd_completions(struct work_struct * work)141a0a4fd77SHans Petter Selasky static void mlx5_trigger_cmd_completions(struct work_struct *work)
142a2485fe5SHans Petter Selasky {
143a0a4fd77SHans Petter Selasky 	struct mlx5_core_dev *dev =
144a0a4fd77SHans Petter Selasky 	    container_of(work, struct mlx5_core_dev, priv.health.work_cmd_completion);
145a2485fe5SHans Petter Selasky 	unsigned long flags;
146a2485fe5SHans Petter Selasky 	u64 vector;
147a2485fe5SHans Petter Selasky 
148a2485fe5SHans Petter Selasky 	/* wait for pending handlers to complete */
149a2485fe5SHans Petter Selasky 	synchronize_irq(dev->priv.msix_arr[MLX5_EQ_VEC_CMD].vector);
150a2485fe5SHans Petter Selasky 	spin_lock_irqsave(&dev->cmd.alloc_lock, flags);
151a2485fe5SHans Petter Selasky 	vector = ~dev->cmd.bitmask & ((1ul << (1 << dev->cmd.log_sz)) - 1);
152a2485fe5SHans Petter Selasky 	if (!vector)
153a2485fe5SHans Petter Selasky 		goto no_trig;
154a2485fe5SHans Petter Selasky 
155a2485fe5SHans Petter Selasky 	vector |= MLX5_TRIGGERED_CMD_COMP;
156a2485fe5SHans Petter Selasky 	spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags);
157a2485fe5SHans Petter Selasky 
1582cec1528SKonstantin Belousov 	mlx5_core_dbg(dev, "vector 0x%jx\n", (uintmax_t)vector);
159721a1a6aSSlava Shwartsman 	mlx5_cmd_comp_handler(dev, vector, MLX5_CMD_MODE_EVENTS);
160a2485fe5SHans Petter Selasky 	return;
161a2485fe5SHans Petter Selasky 
162a2485fe5SHans Petter Selasky no_trig:
163a2485fe5SHans Petter Selasky 	spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags);
164a2485fe5SHans Petter Selasky }
165a2485fe5SHans Petter Selasky 
sensor_pci_no_comm(struct mlx5_core_dev * dev)1661900b6f8SHans Petter Selasky static bool sensor_pci_no_comm(struct mlx5_core_dev *dev)
167a2485fe5SHans Petter Selasky {
168a2485fe5SHans Petter Selasky 	struct mlx5_core_health *health = &dev->priv.health;
169a2485fe5SHans Petter Selasky 	struct mlx5_health_buffer __iomem *h = health->health;
1701900b6f8SHans Petter Selasky 	bool err = ioread32be(&h->fw_ver) == 0xffffffff;
171a2485fe5SHans Petter Selasky 
1721900b6f8SHans Petter Selasky 	return err;
1731900b6f8SHans Petter Selasky }
174a2485fe5SHans Petter Selasky 
sensor_nic_disabled(struct mlx5_core_dev * dev)1751900b6f8SHans Petter Selasky static bool sensor_nic_disabled(struct mlx5_core_dev *dev)
1761900b6f8SHans Petter Selasky {
177ba11bcecSHans Petter Selasky 	return mlx5_get_nic_state(dev) == MLX5_NIC_IFC_DISABLED;
1781900b6f8SHans Petter Selasky }
179a2485fe5SHans Petter Selasky 
sensor_nic_sw_reset(struct mlx5_core_dev * dev)1801900b6f8SHans Petter Selasky static bool sensor_nic_sw_reset(struct mlx5_core_dev *dev)
1811900b6f8SHans Petter Selasky {
182ba11bcecSHans Petter Selasky 	return mlx5_get_nic_state(dev) == MLX5_NIC_IFC_SW_RESET;
1831900b6f8SHans Petter Selasky }
1841900b6f8SHans Petter Selasky 
check_fatal_sensors(struct mlx5_core_dev * dev)1851900b6f8SHans Petter Selasky static u32 check_fatal_sensors(struct mlx5_core_dev *dev)
1861900b6f8SHans Petter Selasky {
1871900b6f8SHans Petter Selasky 	if (sensor_pci_no_comm(dev))
1881900b6f8SHans Petter Selasky 		return MLX5_SENSOR_PCI_COMM_ERR;
1891900b6f8SHans Petter Selasky 	if (pci_channel_offline(dev->pdev))
1901900b6f8SHans Petter Selasky 		return MLX5_SENSOR_PCI_ERR;
1911900b6f8SHans Petter Selasky 	if (sensor_nic_disabled(dev))
1921900b6f8SHans Petter Selasky 		return MLX5_SENSOR_NIC_DISABLED;
1931900b6f8SHans Petter Selasky 	if (sensor_nic_sw_reset(dev))
1941900b6f8SHans Petter Selasky 		return MLX5_SENSOR_NIC_SW_RESET;
195fe242ba7SHans Petter Selasky 	if (sensor_fw_synd_rfr(dev))
196fe242ba7SHans Petter Selasky 		return MLX5_SENSOR_FW_SYND_RFR;
1971900b6f8SHans Petter Selasky 
1981900b6f8SHans Petter Selasky 	return MLX5_SENSOR_NO_ERR;
199a2485fe5SHans Petter Selasky }
200a2485fe5SHans Petter Selasky 
reset_fw_if_needed(struct mlx5_core_dev * dev)201fe242ba7SHans Petter Selasky static void reset_fw_if_needed(struct mlx5_core_dev *dev)
202fe242ba7SHans Petter Selasky {
20329e54451SSlava Shwartsman 	bool supported;
204fe242ba7SHans Petter Selasky 	u32 cmdq_addr, fatal_error;
205fe242ba7SHans Petter Selasky 
20629e54451SSlava Shwartsman 	if (!mlx5_fw_reset_enable)
20729e54451SSlava Shwartsman 		return;
20829e54451SSlava Shwartsman 	supported = (ioread32be(&dev->iseg->initializing) >>
20929e54451SSlava Shwartsman 	    MLX5_FW_RESET_SUPPORTED_OFFSET) & 1;
210fe242ba7SHans Petter Selasky 	if (!supported)
211fe242ba7SHans Petter Selasky 		return;
212fe242ba7SHans Petter Selasky 
213fe242ba7SHans Petter Selasky 	/* The reset only needs to be issued by one PF. The health buffer is
214fe242ba7SHans Petter Selasky 	 * shared between all functions, and will be cleared during a reset.
215fe242ba7SHans Petter Selasky 	 * Check again to avoid a redundant 2nd reset. If the fatal erros was
216fe242ba7SHans Petter Selasky 	 * PCI related a reset won't help.
217fe242ba7SHans Petter Selasky 	 */
218fe242ba7SHans Petter Selasky 	fatal_error = check_fatal_sensors(dev);
219fe242ba7SHans Petter Selasky 	if (fatal_error == MLX5_SENSOR_PCI_COMM_ERR ||
220fe242ba7SHans Petter Selasky 	    fatal_error == MLX5_SENSOR_NIC_DISABLED ||
221d28b6b55SHans Petter Selasky 	    fatal_error == MLX5_SENSOR_NIC_SW_RESET) {
222a2f4f59cSHans Petter Selasky 		mlx5_core_warn(dev,
223a2f4f59cSHans Petter Selasky 		    "Not issuing FW reset. Either it's already done or won't help.\n");
224fe242ba7SHans Petter Selasky 		return;
225fe242ba7SHans Petter Selasky 	}
226fe242ba7SHans Petter Selasky 
227a2f4f59cSHans Petter Selasky 	mlx5_core_info(dev, "Issuing FW Reset\n");
228fe242ba7SHans Petter Selasky 	/* Write the NIC interface field to initiate the reset, the command
229fe242ba7SHans Petter Selasky 	 * interface address also resides here, don't overwrite it.
230fe242ba7SHans Petter Selasky 	 */
231fe242ba7SHans Petter Selasky 	cmdq_addr = ioread32be(&dev->iseg->cmdq_addr_l_sz);
232fe242ba7SHans Petter Selasky 	iowrite32be((cmdq_addr & 0xFFFFF000) |
233fe242ba7SHans Petter Selasky 		    MLX5_NIC_IFC_SW_RESET << MLX5_NIC_IFC_OFFSET,
234fe242ba7SHans Petter Selasky 		    &dev->iseg->cmdq_addr_l_sz);
235fe242ba7SHans Petter Selasky }
236fe242ba7SHans Petter Selasky 
2375169fb81SHans Petter Selasky static bool
mlx5_health_allow_reset(struct mlx5_core_dev * dev)2385169fb81SHans Petter Selasky mlx5_health_allow_reset(struct mlx5_core_dev *dev)
2395169fb81SHans Petter Selasky {
2405169fb81SHans Petter Selasky 	struct mlx5_core_health *health = &dev->priv.health;
2415169fb81SHans Petter Selasky 	unsigned int delta;
2425169fb81SHans Petter Selasky 	bool ret;
2435169fb81SHans Petter Selasky 
2445169fb81SHans Petter Selasky 	if (health->last_reset_req != 0) {
2455169fb81SHans Petter Selasky 		delta = ticks - health->last_reset_req;
2465169fb81SHans Petter Selasky 		delta /= hz;
2475169fb81SHans Petter Selasky 		ret = delta >= sw_reset_to;
2485169fb81SHans Petter Selasky 	} else {
2495169fb81SHans Petter Selasky 		ret = true;
2505169fb81SHans Petter Selasky 	}
2515169fb81SHans Petter Selasky 
2525169fb81SHans Petter Selasky 	/*
2535169fb81SHans Petter Selasky 	 * In principle, ticks may be 0. Setting it to off by one (-1)
2545169fb81SHans Petter Selasky 	 * to prevent certain reset in next request.
2555169fb81SHans Petter Selasky 	 */
2565169fb81SHans Petter Selasky 	health->last_reset_req = ticks ? : -1;
2575169fb81SHans Petter Selasky 	if (!ret)
258a2f4f59cSHans Petter Selasky 		mlx5_core_warn(dev,
259a2f4f59cSHans Petter Selasky 		    "Firmware reset elided due to auto-reset frequency threshold.\n");
2605169fb81SHans Petter Selasky 	return (ret);
2615169fb81SHans Petter Selasky }
2625169fb81SHans Petter Selasky 
263d28b6b55SHans Petter Selasky #define MLX5_CRDUMP_WAIT_MS	60000
264d28b6b55SHans Petter Selasky #define MLX5_FW_RESET_WAIT_MS	1000
265d28b6b55SHans Petter Selasky #define MLX5_NIC_STATE_POLL_MS	5
mlx5_enter_error_state(struct mlx5_core_dev * dev,bool force)266c0902569SHans Petter Selasky void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force)
267a2485fe5SHans Petter Selasky {
268c2a1e807SHans Petter Selasky 	int end, delay_ms = MLX5_CRDUMP_WAIT_MS;
269d28b6b55SHans Petter Selasky 	u32 fatal_error;
270d28b6b55SHans Petter Selasky 	int lock = -EBUSY;
271d28b6b55SHans Petter Selasky 
272d28b6b55SHans Petter Selasky 	fatal_error = check_fatal_sensors(dev);
273d28b6b55SHans Petter Selasky 
274d28b6b55SHans Petter Selasky 	if (fatal_error || force) {
2756d54b22dSHans Petter Selasky 		if (xchg(&dev->state, MLX5_DEVICE_STATE_INTERNAL_ERROR) ==
2766d54b22dSHans Petter Selasky 		    MLX5_DEVICE_STATE_INTERNAL_ERROR)
2776d54b22dSHans Petter Selasky 			return;
2781fb6089cSHans Petter Selasky 		if (!force)
2791fb6089cSHans Petter Selasky 			mlx5_core_err(dev, "internal state error detected\n");
280a0a4fd77SHans Petter Selasky 
281a0a4fd77SHans Petter Selasky 		/*
282a0a4fd77SHans Petter Selasky 		 * Queue the command completion handler on the command
283a0a4fd77SHans Petter Selasky 		 * work queue to avoid racing with the real command
284a0a4fd77SHans Petter Selasky 		 * completion handler and then wait for it to
285a0a4fd77SHans Petter Selasky 		 * complete:
286a0a4fd77SHans Petter Selasky 		 */
2878d1eeedbSHans Petter Selasky 		queue_work(dev->priv.health.wq_cmd, &dev->priv.health.work_cmd_completion);
2888d1eeedbSHans Petter Selasky 		flush_workqueue(dev->priv.health.wq_cmd);
2897053deebSHans Petter Selasky 	}
290a2485fe5SHans Petter Selasky 
2916d54b22dSHans Petter Selasky 	mutex_lock(&dev->intf_state_mutex);
2926d54b22dSHans Petter Selasky 
293d28b6b55SHans Petter Selasky 	if (force)
294d28b6b55SHans Petter Selasky 		goto err_state_done;
295d28b6b55SHans Petter Selasky 
2965169fb81SHans Petter Selasky 	if (fatal_error == MLX5_SENSOR_FW_SYND_RFR &&
2975169fb81SHans Petter Selasky 	    mlx5_health_allow_reset(dev)) {
29892d23c82SHans Petter Selasky 		/* Get cr-dump and reset FW semaphore */
299d28b6b55SHans Petter Selasky 		if (mlx5_core_is_pf(dev))
300b575d8c8SHans Petter Selasky 			lock = lock_sem_sw_reset(dev);
301d28b6b55SHans Petter Selasky 
30292d23c82SHans Petter Selasky 		/* Execute cr-dump and SW reset */
303d28b6b55SHans Petter Selasky 		if (lock != -EBUSY) {
3043e40712eSHans Petter Selasky 			(void)mlx5_fwdump(dev);
305d28b6b55SHans Petter Selasky 			reset_fw_if_needed(dev);
306d28b6b55SHans Petter Selasky 			delay_ms = MLX5_FW_RESET_WAIT_MS;
307d28b6b55SHans Petter Selasky 		}
308d28b6b55SHans Petter Selasky 	}
309d28b6b55SHans Petter Selasky 
310d28b6b55SHans Petter Selasky 	/* Recover from SW reset */
311d28b6b55SHans Petter Selasky 	end = jiffies + msecs_to_jiffies(delay_ms);
312d28b6b55SHans Petter Selasky 	do {
313d28b6b55SHans Petter Selasky 		if (sensor_nic_disabled(dev))
314d28b6b55SHans Petter Selasky 			break;
315d28b6b55SHans Petter Selasky 
316d28b6b55SHans Petter Selasky 		msleep(MLX5_NIC_STATE_POLL_MS);
317d28b6b55SHans Petter Selasky 	} while (!time_after(jiffies, end));
318d28b6b55SHans Petter Selasky 
319d28b6b55SHans Petter Selasky 	if (!sensor_nic_disabled(dev)) {
320a2f4f59cSHans Petter Selasky 		mlx5_core_err(dev, "NIC IFC still %d after %ums.\n",
321ba11bcecSHans Petter Selasky 			mlx5_get_nic_state(dev), delay_ms);
322d28b6b55SHans Petter Selasky 	}
323d28b6b55SHans Petter Selasky 
324d28b6b55SHans Petter Selasky 	/* Release FW semaphore if you are the lock owner */
325d28b6b55SHans Petter Selasky 	if (!lock)
326b575d8c8SHans Petter Selasky 		unlock_sem_sw_reset(dev);
327d28b6b55SHans Petter Selasky 
328a2f4f59cSHans Petter Selasky 	mlx5_core_info(dev, "System error event triggered\n");
3297053deebSHans Petter Selasky 
330d28b6b55SHans Petter Selasky err_state_done:
331843a89d3SSlava Shwartsman 	mlx5_core_event(dev, MLX5_DEV_EVENT_SYS_ERROR, 1);
3327053deebSHans Petter Selasky 	mutex_unlock(&dev->intf_state_mutex);
333a2485fe5SHans Petter Selasky }
334a2485fe5SHans Petter Selasky 
mlx5_handle_bad_state(struct mlx5_core_dev * dev)335a2485fe5SHans Petter Selasky static void mlx5_handle_bad_state(struct mlx5_core_dev *dev)
336a2485fe5SHans Petter Selasky {
337ba11bcecSHans Petter Selasky 	u8 nic_mode = mlx5_get_nic_state(dev);
338a2485fe5SHans Petter Selasky 
3391900b6f8SHans Petter Selasky 	if (nic_mode == MLX5_NIC_IFC_SW_RESET) {
3401900b6f8SHans Petter Selasky 		/* The IFC mode field is 3 bits, so it will read 0x7 in two cases:
3411900b6f8SHans Petter Selasky 		 * 1. PCI has been disabled (ie. PCI-AER, PF driver unloaded
3421900b6f8SHans Petter Selasky 		 *    and this is a VF), this is not recoverable by SW reset.
3431900b6f8SHans Petter Selasky 		 *    Logging of this is handled elsewhere.
3441900b6f8SHans Petter Selasky 		 * 2. FW reset has been issued by another function, driver can
3451900b6f8SHans Petter Selasky 		 *    be reloaded to recover after the mode switches to
3461900b6f8SHans Petter Selasky 		 *    MLX5_NIC_IFC_DISABLED.
3471900b6f8SHans Petter Selasky 		 */
3481900b6f8SHans Petter Selasky 		if (dev->priv.health.fatal_error != MLX5_SENSOR_PCI_COMM_ERR)
349a2f4f59cSHans Petter Selasky 			mlx5_core_warn(dev,
350a2f4f59cSHans Petter Selasky 			    "NIC SW reset is already progress\n");
3511900b6f8SHans Petter Selasky 		else
352a2f4f59cSHans Petter Selasky 			mlx5_core_warn(dev,
353a2f4f59cSHans Petter Selasky 			    "Communication with FW over the PCI link is down\n");
3541900b6f8SHans Petter Selasky 	} else {
3551900b6f8SHans Petter Selasky 		mlx5_core_warn(dev, "NIC mode %d\n", nic_mode);
356a2485fe5SHans Petter Selasky 	}
357a2485fe5SHans Petter Selasky 
358a2485fe5SHans Petter Selasky 	mlx5_disable_device(dev);
359a2485fe5SHans Petter Selasky }
360dc7e38acSHans Petter Selasky 
3611900b6f8SHans Petter Selasky #define MLX5_FW_RESET_WAIT_MS	1000
3621900b6f8SHans Petter Selasky #define MLX5_NIC_STATE_POLL_MS	5
health_recover(struct work_struct * work)3634bb7662bSHans Petter Selasky static void health_recover(struct work_struct *work)
3644bb7662bSHans Petter Selasky {
3651900b6f8SHans Petter Selasky 	unsigned long end = jiffies + msecs_to_jiffies(MLX5_FW_RESET_WAIT_MS);
3664bb7662bSHans Petter Selasky 	struct mlx5_core_health *health;
3674bb7662bSHans Petter Selasky 	struct delayed_work *dwork;
3684bb7662bSHans Petter Selasky 	struct mlx5_core_dev *dev;
3694bb7662bSHans Petter Selasky 	struct mlx5_priv *priv;
370f20b553dSHans Petter Selasky 	bool recover = true;
3711900b6f8SHans Petter Selasky 	u8 nic_mode;
3724bb7662bSHans Petter Selasky 
3734bb7662bSHans Petter Selasky 	dwork = container_of(work, struct delayed_work, work);
3744bb7662bSHans Petter Selasky 	health = container_of(dwork, struct mlx5_core_health, recover_work);
3754bb7662bSHans Petter Selasky 	priv = container_of(health, struct mlx5_priv, health);
3764bb7662bSHans Petter Selasky 	dev = container_of(priv, struct mlx5_core_dev, priv);
3774bb7662bSHans Petter Selasky 
378c6df6f53SWarner Losh 	/* This might likely be wrong, cut and paste from elsewhere? */
379c6df6f53SWarner Losh 	bus_topo_lock();
380ca2345a0SHans Petter Selasky 
3811900b6f8SHans Petter Selasky 	if (sensor_pci_no_comm(dev)) {
382a2f4f59cSHans Petter Selasky 		mlx5_core_err(dev,
383a2f4f59cSHans Petter Selasky 		    "health recovery flow aborted, PCI reads still not working\n");
384f20b553dSHans Petter Selasky 		recover = false;
3851900b6f8SHans Petter Selasky 	}
3861900b6f8SHans Petter Selasky 
387ba11bcecSHans Petter Selasky 	nic_mode = mlx5_get_nic_state(dev);
3881900b6f8SHans Petter Selasky 	while (nic_mode != MLX5_NIC_IFC_DISABLED &&
3891900b6f8SHans Petter Selasky 	       !time_after(jiffies, end)) {
3901900b6f8SHans Petter Selasky 		msleep(MLX5_NIC_STATE_POLL_MS);
391ba11bcecSHans Petter Selasky 		nic_mode = mlx5_get_nic_state(dev);
3921900b6f8SHans Petter Selasky 	}
3931900b6f8SHans Petter Selasky 
3941900b6f8SHans Petter Selasky 	if (nic_mode != MLX5_NIC_IFC_DISABLED) {
395a2f4f59cSHans Petter Selasky 		mlx5_core_err(dev,
396a2f4f59cSHans Petter Selasky 		    "health recovery flow aborted, unexpected NIC IFC mode %d.\n",
3971900b6f8SHans Petter Selasky 		    nic_mode);
398f20b553dSHans Petter Selasky 		recover = false;
3994bb7662bSHans Petter Selasky 	}
4004bb7662bSHans Petter Selasky 
401f20b553dSHans Petter Selasky 	if (recover) {
402a2f4f59cSHans Petter Selasky 		mlx5_core_info(dev, "Starting health recovery flow\n");
4034bb7662bSHans Petter Selasky 		mlx5_recover_device(dev);
4044bb7662bSHans Petter Selasky 	}
405ca2345a0SHans Petter Selasky 
406c6df6f53SWarner Losh 	bus_topo_unlock();
407f20b553dSHans Petter Selasky }
4084bb7662bSHans Petter Selasky 
4094bb7662bSHans Petter Selasky /* How much time to wait until health resetting the driver (in msecs) */
4104bb7662bSHans Petter Selasky #define MLX5_RECOVERY_DELAY_MSECS 60000
4111900b6f8SHans Petter Selasky #define MLX5_RECOVERY_NO_DELAY 0
get_recovery_delay(struct mlx5_core_dev * dev)4121900b6f8SHans Petter Selasky static unsigned long get_recovery_delay(struct mlx5_core_dev *dev)
4131900b6f8SHans Petter Selasky {
4141900b6f8SHans Petter Selasky 	return dev->priv.health.fatal_error == MLX5_SENSOR_PCI_ERR ||
4151900b6f8SHans Petter Selasky 		dev->priv.health.fatal_error == MLX5_SENSOR_PCI_COMM_ERR	?
4161900b6f8SHans Petter Selasky 		MLX5_RECOVERY_DELAY_MSECS : MLX5_RECOVERY_NO_DELAY;
4171900b6f8SHans Petter Selasky }
4181900b6f8SHans Petter Selasky 
health_care(struct work_struct * work)419dc7e38acSHans Petter Selasky static void health_care(struct work_struct *work)
420dc7e38acSHans Petter Selasky {
421a2485fe5SHans Petter Selasky 	struct mlx5_core_health *health;
4221900b6f8SHans Petter Selasky 	unsigned long recover_delay;
423dc7e38acSHans Petter Selasky 	struct mlx5_core_dev *dev;
424dc7e38acSHans Petter Selasky 	struct mlx5_priv *priv;
4254bb7662bSHans Petter Selasky 	unsigned long flags;
426dc7e38acSHans Petter Selasky 
427a2485fe5SHans Petter Selasky 	health = container_of(work, struct mlx5_core_health, work);
428dc7e38acSHans Petter Selasky 	priv = container_of(health, struct mlx5_priv, health);
429dc7e38acSHans Petter Selasky 	dev = container_of(priv, struct mlx5_core_dev, priv);
430f20b553dSHans Petter Selasky 
431dc7e38acSHans Petter Selasky 	mlx5_core_warn(dev, "handling bad device here\n");
432a2485fe5SHans Petter Selasky 	mlx5_handle_bad_state(dev);
4331900b6f8SHans Petter Selasky 	recover_delay = msecs_to_jiffies(get_recovery_delay(dev));
4344bb7662bSHans Petter Selasky 
4354bb7662bSHans Petter Selasky 	spin_lock_irqsave(&health->wq_lock, flags);
436fe242ba7SHans Petter Selasky 	if (!test_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags)) {
437a2f4f59cSHans Petter Selasky 		mlx5_core_warn(dev,
438a2f4f59cSHans Petter Selasky 		    "Scheduling recovery work with %lums delay\n",
439fe242ba7SHans Petter Selasky 		    recover_delay);
4404bb7662bSHans Petter Selasky 		schedule_delayed_work(&health->recover_work, recover_delay);
441fe242ba7SHans Petter Selasky 	} else {
442a2f4f59cSHans Petter Selasky 		mlx5_core_err(dev,
4434bb7662bSHans Petter Selasky 		    "new health works are not permitted at this stage\n");
444fe242ba7SHans Petter Selasky 	}
4454bb7662bSHans Petter Selasky 	spin_unlock_irqrestore(&health->wq_lock, flags);
446dc7e38acSHans Petter Selasky }
447a2485fe5SHans Petter Selasky 
get_next_poll_jiffies(void)448a2485fe5SHans Petter Selasky static int get_next_poll_jiffies(void)
449a2485fe5SHans Petter Selasky {
450a2485fe5SHans Petter Selasky 	unsigned long next;
451a2485fe5SHans Petter Selasky 
452a2485fe5SHans Petter Selasky 	get_random_bytes(&next, sizeof(next));
453a2485fe5SHans Petter Selasky 	next %= HZ;
454a2485fe5SHans Petter Selasky 	next += jiffies + MLX5_HEALTH_POLL_INTERVAL;
455a2485fe5SHans Petter Selasky 
456a2485fe5SHans Petter Selasky 	return next;
457dc7e38acSHans Petter Selasky }
458dc7e38acSHans Petter Selasky 
mlx5_trigger_health_work(struct mlx5_core_dev * dev)4594bb7662bSHans Petter Selasky void mlx5_trigger_health_work(struct mlx5_core_dev *dev)
4604bb7662bSHans Petter Selasky {
4614bb7662bSHans Petter Selasky 	struct mlx5_core_health *health = &dev->priv.health;
4624bb7662bSHans Petter Selasky 	unsigned long flags;
4634bb7662bSHans Petter Selasky 
4644bb7662bSHans Petter Selasky 	spin_lock_irqsave(&health->wq_lock, flags);
4654bb7662bSHans Petter Selasky 	if (!test_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags))
4664bb7662bSHans Petter Selasky 		queue_work(health->wq, &health->work);
4674bb7662bSHans Petter Selasky 	else
468a2f4f59cSHans Petter Selasky 		mlx5_core_err(dev,
4694bb7662bSHans Petter Selasky 			"new health works are not permitted at this stage\n");
4704bb7662bSHans Petter Selasky 	spin_unlock_irqrestore(&health->wq_lock, flags);
4714bb7662bSHans Petter Selasky }
4724bb7662bSHans Petter Selasky 
/*
 * Map a health syndrome code to a short human-readable description.
 * Unknown codes yield "unrecognized error".  The returned strings are
 * static and carry no trailing newline; callers add their own.
 */
static const char *hsynd_str(u8 synd)
{
	switch (synd) {
	case MLX5_HEALTH_SYNDR_FW_ERR:
		return "firmware internal error";
	case MLX5_HEALTH_SYNDR_IRISC_ERR:
		return "irisc not responding";
	case MLX5_HEALTH_SYNDR_HW_UNRECOVERABLE_ERR:
		return "unrecoverable hardware error";
	case MLX5_HEALTH_SYNDR_CRC_ERR:
		return "firmware CRC error";
	case MLX5_HEALTH_SYNDR_FETCH_PCI_ERR:
		return "ICM fetch PCI error";
	case MLX5_HEALTH_SYNDR_HW_FTL_ERR:
		return "HW fatal error";
	case MLX5_HEALTH_SYNDR_ASYNC_EQ_OVERRUN_ERR:
		return "async EQ buffer overrun";
	case MLX5_HEALTH_SYNDR_EQ_ERR:
		return "EQ error";
	case MLX5_HEALTH_SYNDR_EQ_INV:
		return "Invalid EQ referenced";
	case MLX5_HEALTH_SYNDR_FFSER_ERR:
		return "FFSER error";
	case MLX5_HEALTH_SYNDR_HIGH_TEMP:
		return "High temperature";
	default:
		return "unrecognized error";
	}
}
502dc7e38acSHans Petter Selasky 
503c9bb26aeSHans Petter Selasky static u8
print_health_info(struct mlx5_core_dev * dev)504c9bb26aeSHans Petter Selasky print_health_info(struct mlx5_core_dev *dev)
505dc7e38acSHans Petter Selasky {
506dc7e38acSHans Petter Selasky 	struct mlx5_core_health *health = &dev->priv.health;
507dc7e38acSHans Petter Selasky 	struct mlx5_health_buffer __iomem *h = health->health;
508c9bb26aeSHans Petter Selasky 	u8 synd = ioread8(&h->synd);
509a2485fe5SHans Petter Selasky 	char fw_str[18];
510a2485fe5SHans Petter Selasky 	u32 fw;
511dc7e38acSHans Petter Selasky 	int i;
512dc7e38acSHans Petter Selasky 
513c9bb26aeSHans Petter Selasky 	/*
514c9bb26aeSHans Petter Selasky 	 * If synd is 0x0 - this indicates that FW is unable to
515c9bb26aeSHans Petter Selasky 	 * respond to initialization segment reads and health buffer
516c9bb26aeSHans Petter Selasky 	 * should not be read.
517c9bb26aeSHans Petter Selasky 	 */
518c9bb26aeSHans Petter Selasky 	if (synd == 0)
519c9bb26aeSHans Petter Selasky 		return (0);
520dc7e38acSHans Petter Selasky 
521a2485fe5SHans Petter Selasky 	for (i = 0; i < ARRAY_SIZE(h->assert_var); i++)
522a2f4f59cSHans Petter Selasky 		mlx5_core_info(dev, "assert_var[%d] 0x%08x\n", i,
523a2f4f59cSHans Petter Selasky 		    ioread32be(h->assert_var + i));
524a2485fe5SHans Petter Selasky 
525a2f4f59cSHans Petter Selasky 	mlx5_core_info(dev, "assert_exit_ptr 0x%08x\n",
526a2f4f59cSHans Petter Selasky 	    ioread32be(&h->assert_exit_ptr));
527a2f4f59cSHans Petter Selasky 	mlx5_core_info(dev, "assert_callra 0x%08x\n",
528a2f4f59cSHans Petter Selasky 	    ioread32be(&h->assert_callra));
529a2f4f59cSHans Petter Selasky 	snprintf(fw_str, sizeof(fw_str), "%d.%d.%d",
530a2f4f59cSHans Petter Selasky 	    fw_rev_maj(dev), fw_rev_min(dev), fw_rev_sub(dev));
531a2f4f59cSHans Petter Selasky 	mlx5_core_info(dev, "fw_ver %s\n", fw_str);
532a2f4f59cSHans Petter Selasky 	mlx5_core_info(dev, "hw_id 0x%08x\n", ioread32be(&h->hw_id));
533a2f4f59cSHans Petter Selasky 	mlx5_core_info(dev, "irisc_index %d\n", ioread8(&h->irisc_index));
534a2f4f59cSHans Petter Selasky 	mlx5_core_info(dev, "synd 0x%x: %s\n",
535a2f4f59cSHans Petter Selasky 	    ioread8(&h->synd), hsynd_str(ioread8(&h->synd)));
536a2f4f59cSHans Petter Selasky 	mlx5_core_info(dev, "ext_synd 0x%04x\n", ioread16be(&h->ext_synd));
537a2485fe5SHans Petter Selasky 	fw = ioread32be(&h->fw_ver);
538a2f4f59cSHans Petter Selasky 	mlx5_core_info(dev, "raw fw_ver 0x%08x\n", fw);
539c9bb26aeSHans Petter Selasky 
540c9bb26aeSHans Petter Selasky 	return synd;
541dc7e38acSHans Petter Selasky }
542dc7e38acSHans Petter Selasky 
health_watchdog(struct work_struct * work)543adb6fd50SHans Petter Selasky static void health_watchdog(struct work_struct *work)
544adb6fd50SHans Petter Selasky {
545adb6fd50SHans Petter Selasky 	struct mlx5_core_dev *dev;
546adb6fd50SHans Petter Selasky 	u16 power;
547adb6fd50SHans Petter Selasky 	u8 status;
548adb6fd50SHans Petter Selasky 	int err;
549adb6fd50SHans Petter Selasky 
550adb6fd50SHans Petter Selasky 	dev = container_of(work, struct mlx5_core_dev, priv.health.work_watchdog);
551adb6fd50SHans Petter Selasky 
552adb6fd50SHans Petter Selasky 	if (!MLX5_CAP_GEN(dev, mcam_reg) ||
553adb6fd50SHans Petter Selasky 	    !MLX5_CAP_MCAM_FEATURE(dev, pcie_status_and_power))
554adb6fd50SHans Petter Selasky 		return;
555adb6fd50SHans Petter Selasky 
556adb6fd50SHans Petter Selasky 	err = mlx5_pci_read_power_status(dev, &power, &status);
557adb6fd50SHans Petter Selasky 	if (err < 0) {
558a2f4f59cSHans Petter Selasky 		mlx5_core_warn(dev, "Failed reading power status: %d\n",
559a2f4f59cSHans Petter Selasky 		    err);
560adb6fd50SHans Petter Selasky 		return;
561adb6fd50SHans Petter Selasky 	}
562adb6fd50SHans Petter Selasky 
563adb6fd50SHans Petter Selasky 	dev->pwr_value = power;
564adb6fd50SHans Petter Selasky 
565adb6fd50SHans Petter Selasky 	if (dev->pwr_status != status) {
566adb6fd50SHans Petter Selasky 
567adb6fd50SHans Petter Selasky 		switch (status) {
568adb6fd50SHans Petter Selasky 		case 0:
569adb6fd50SHans Petter Selasky 			dev->pwr_status = status;
570a2f4f59cSHans Petter Selasky 			mlx5_core_info(dev,
571a2f4f59cSHans Petter Selasky 			    "PCI power is not published by the PCIe slot.\n");
572adb6fd50SHans Petter Selasky 			break;
573adb6fd50SHans Petter Selasky 		case 1:
574adb6fd50SHans Petter Selasky 			dev->pwr_status = status;
575a2f4f59cSHans Petter Selasky 			mlx5_core_info(dev,
576a2f4f59cSHans Petter Selasky 			    "PCIe slot advertised sufficient power (%uW).\n",
577a2f4f59cSHans Petter Selasky 			    power);
578adb6fd50SHans Petter Selasky 			break;
579adb6fd50SHans Petter Selasky 		case 2:
580adb6fd50SHans Petter Selasky 			dev->pwr_status = status;
581a2f4f59cSHans Petter Selasky 			mlx5_core_warn(dev,
582a2f4f59cSHans Petter Selasky 			    "Detected insufficient power on the PCIe slot (%uW).\n",
583a2f4f59cSHans Petter Selasky 			    power);
584adb6fd50SHans Petter Selasky 			break;
585adb6fd50SHans Petter Selasky 		default:
586adb6fd50SHans Petter Selasky 			dev->pwr_status = 0;
587a2f4f59cSHans Petter Selasky 			mlx5_core_warn(dev,
588a2f4f59cSHans Petter Selasky 			    "Unknown power state detected(%d).\n",
589a2f4f59cSHans Petter Selasky 			    status);
590adb6fd50SHans Petter Selasky 			break;
591adb6fd50SHans Petter Selasky 		}
592adb6fd50SHans Petter Selasky 	}
593adb6fd50SHans Petter Selasky }
594adb6fd50SHans Petter Selasky 
595adb6fd50SHans Petter Selasky void
mlx5_trigger_health_watchdog(struct mlx5_core_dev * dev)596adb6fd50SHans Petter Selasky mlx5_trigger_health_watchdog(struct mlx5_core_dev *dev)
597adb6fd50SHans Petter Selasky {
598adb6fd50SHans Petter Selasky 	struct mlx5_core_health *health = &dev->priv.health;
599adb6fd50SHans Petter Selasky 	unsigned long flags;
600adb6fd50SHans Petter Selasky 
601adb6fd50SHans Petter Selasky 	spin_lock_irqsave(&health->wq_lock, flags);
602adb6fd50SHans Petter Selasky 	if (!test_bit(MLX5_DROP_NEW_WATCHDOG_WORK, &health->flags))
603adb6fd50SHans Petter Selasky 		queue_work(health->wq_watchdog, &health->work_watchdog);
604adb6fd50SHans Petter Selasky 	else
605a2f4f59cSHans Petter Selasky 		mlx5_core_err(dev,
606adb6fd50SHans Petter Selasky 		    "scheduling watchdog is not permitted at this stage\n");
607adb6fd50SHans Petter Selasky 	spin_unlock_irqrestore(&health->wq_lock, flags);
608adb6fd50SHans Petter Selasky }
609adb6fd50SHans Petter Selasky 
poll_health(unsigned long data)61003ab395eSHans Petter Selasky static void poll_health(unsigned long data)
611dc7e38acSHans Petter Selasky {
612dc7e38acSHans Petter Selasky 	struct mlx5_core_dev *dev = (struct mlx5_core_dev *)data;
613dc7e38acSHans Petter Selasky 	struct mlx5_core_health *health = &dev->priv.health;
6141900b6f8SHans Petter Selasky 	u32 fatal_error;
615dc7e38acSHans Petter Selasky 	u32 count;
616dc7e38acSHans Petter Selasky 
61730dfc051SHans Petter Selasky 	if (dev->state != MLX5_DEVICE_STATE_UP)
61830dfc051SHans Petter Selasky 		return;
61930dfc051SHans Petter Selasky 
620dc7e38acSHans Petter Selasky 	count = ioread32be(health->health_counter);
621dc7e38acSHans Petter Selasky 	if (count == health->prev)
622dc7e38acSHans Petter Selasky 		++health->miss_counter;
623dc7e38acSHans Petter Selasky 	else
624dc7e38acSHans Petter Selasky 		health->miss_counter = 0;
625dc7e38acSHans Petter Selasky 
626dc7e38acSHans Petter Selasky 	health->prev = count;
627dc7e38acSHans Petter Selasky 	if (health->miss_counter == MAX_MISSES) {
628a2485fe5SHans Petter Selasky 		mlx5_core_err(dev, "device's health compromised - reached miss count\n");
629c9bb26aeSHans Petter Selasky 		if (print_health_info(dev) == 0)
630c9bb26aeSHans Petter Selasky 			mlx5_core_err(dev, "FW is unable to respond to initialization segment reads\n");
631a2485fe5SHans Petter Selasky 	}
632a2485fe5SHans Petter Selasky 
6331900b6f8SHans Petter Selasky 	fatal_error = check_fatal_sensors(dev);
6341900b6f8SHans Petter Selasky 
6351900b6f8SHans Petter Selasky 	if (fatal_error && !health->fatal_error) {
636a2f4f59cSHans Petter Selasky 		mlx5_core_err(dev,
637a2f4f59cSHans Petter Selasky 		    "Fatal error %u detected\n", fatal_error);
6381900b6f8SHans Petter Selasky 		dev->priv.health.fatal_error = fatal_error;
639a2485fe5SHans Petter Selasky 		print_health_info(dev);
6404bb7662bSHans Petter Selasky 		mlx5_trigger_health_work(dev);
641dc7e38acSHans Petter Selasky 	}
6424bb7662bSHans Petter Selasky 
6434bb7662bSHans Petter Selasky 	mod_timer(&health->timer, get_next_poll_jiffies());
644dc7e38acSHans Petter Selasky }
645dc7e38acSHans Petter Selasky 
mlx5_start_health_poll(struct mlx5_core_dev * dev)646dc7e38acSHans Petter Selasky void mlx5_start_health_poll(struct mlx5_core_dev *dev)
647dc7e38acSHans Petter Selasky {
648dc7e38acSHans Petter Selasky 	struct mlx5_core_health *health = &dev->priv.health;
649dc7e38acSHans Petter Selasky 
650dc7e38acSHans Petter Selasky 	init_timer(&health->timer);
6511900b6f8SHans Petter Selasky 	health->fatal_error = MLX5_SENSOR_NO_ERR;
652ca551594SHans Petter Selasky 	clear_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
653519774eaSHans Petter Selasky 	clear_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags);
654adb6fd50SHans Petter Selasky 	clear_bit(MLX5_DROP_NEW_WATCHDOG_WORK, &health->flags);
655dc7e38acSHans Petter Selasky 	health->health = &dev->iseg->health;
656dc7e38acSHans Petter Selasky 	health->health_counter = &dev->iseg->health_counter;
657dc7e38acSHans Petter Selasky 
65803ab395eSHans Petter Selasky 	setup_timer(&health->timer, poll_health, (unsigned long)dev);
659dc7e38acSHans Petter Selasky 	mod_timer(&health->timer,
660dc7e38acSHans Petter Selasky 		  round_jiffies(jiffies + MLX5_HEALTH_POLL_INTERVAL));
661adb6fd50SHans Petter Selasky 
662adb6fd50SHans Petter Selasky 	/* do initial PCI power state readout */
663adb6fd50SHans Petter Selasky 	mlx5_trigger_health_watchdog(dev);
664dc7e38acSHans Petter Selasky }
665dc7e38acSHans Petter Selasky 
/*
 * Stop the health poll timer.
 *
 * When disable_health is true, the three DROP_NEW_* work flags are set
 * first, under health->wq_lock, before the timer is deleted.
 */
void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health)
{
	struct mlx5_core_health *health = &dev->priv.health;
	unsigned long irq_flags;

	if (disable_health) {
		/* Mark health, recovery and watchdog work as not accepted. */
		spin_lock_irqsave(&health->wq_lock, irq_flags);
		set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
		set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags);
		set_bit(MLX5_DROP_NEW_WATCHDOG_WORK, &health->flags);
		spin_unlock_irqrestore(&health->wq_lock, irq_flags);
	}

	del_timer_sync(&health->timer);
}
681dc7e38acSHans Petter Selasky 
mlx5_drain_health_wq(struct mlx5_core_dev * dev)682ca551594SHans Petter Selasky void mlx5_drain_health_wq(struct mlx5_core_dev *dev)
683ca551594SHans Petter Selasky {
684ca551594SHans Petter Selasky 	struct mlx5_core_health *health = &dev->priv.health;
6854bb7662bSHans Petter Selasky 	unsigned long flags;
686ca551594SHans Petter Selasky 
6874bb7662bSHans Petter Selasky 	spin_lock_irqsave(&health->wq_lock, flags);
688ca551594SHans Petter Selasky 	set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
689519774eaSHans Petter Selasky 	set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags);
690adb6fd50SHans Petter Selasky 	set_bit(MLX5_DROP_NEW_WATCHDOG_WORK, &health->flags);
6914bb7662bSHans Petter Selasky 	spin_unlock_irqrestore(&health->wq_lock, flags);
6924bb7662bSHans Petter Selasky 	cancel_delayed_work_sync(&health->recover_work);
693ca551594SHans Petter Selasky 	cancel_work_sync(&health->work);
694adb6fd50SHans Petter Selasky 	cancel_work_sync(&health->work_watchdog);
695ca551594SHans Petter Selasky }
696ca551594SHans Petter Selasky 
mlx5_drain_health_recovery(struct mlx5_core_dev * dev)697519774eaSHans Petter Selasky void mlx5_drain_health_recovery(struct mlx5_core_dev *dev)
698519774eaSHans Petter Selasky {
699519774eaSHans Petter Selasky 	struct mlx5_core_health *health = &dev->priv.health;
700519774eaSHans Petter Selasky 	unsigned long flags;
701519774eaSHans Petter Selasky 
702519774eaSHans Petter Selasky 	spin_lock_irqsave(&health->wq_lock, flags);
703519774eaSHans Petter Selasky 	set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags);
704519774eaSHans Petter Selasky 	spin_unlock_irqrestore(&health->wq_lock, flags);
705519774eaSHans Petter Selasky 	cancel_delayed_work_sync(&dev->priv.health.recover_work);
706519774eaSHans Petter Selasky }
707519774eaSHans Petter Selasky 
mlx5_health_cleanup(struct mlx5_core_dev * dev)708a2485fe5SHans Petter Selasky void mlx5_health_cleanup(struct mlx5_core_dev *dev)
709dc7e38acSHans Petter Selasky {
710a2485fe5SHans Petter Selasky 	struct mlx5_core_health *health = &dev->priv.health;
711a2485fe5SHans Petter Selasky 
712a2485fe5SHans Petter Selasky 	destroy_workqueue(health->wq);
71340218d73SHans Petter Selasky 	destroy_workqueue(health->wq_watchdog);
7148d1eeedbSHans Petter Selasky 	destroy_workqueue(health->wq_cmd);
715dc7e38acSHans Petter Selasky }
716dc7e38acSHans Petter Selasky 
mlx5_health_init(struct mlx5_core_dev * dev)717a2485fe5SHans Petter Selasky int mlx5_health_init(struct mlx5_core_dev *dev)
718dc7e38acSHans Petter Selasky {
719a2485fe5SHans Petter Selasky 	struct mlx5_core_health *health;
72040218d73SHans Petter Selasky 	char name[64];
721dc7e38acSHans Petter Selasky 
722a2485fe5SHans Petter Selasky 	health = &dev->priv.health;
723a2485fe5SHans Petter Selasky 
72440218d73SHans Petter Selasky 	snprintf(name, sizeof(name), "%s-rec", dev_name(&dev->pdev->dev));
725a2485fe5SHans Petter Selasky 	health->wq = create_singlethread_workqueue(name);
726a2485fe5SHans Petter Selasky 	if (!health->wq)
7278d1eeedbSHans Petter Selasky 		goto err_recovery;
728a2485fe5SHans Petter Selasky 
72940218d73SHans Petter Selasky 	snprintf(name, sizeof(name), "%s-wdg", dev_name(&dev->pdev->dev));
73040218d73SHans Petter Selasky 	health->wq_watchdog = create_singlethread_workqueue(name);
7318d1eeedbSHans Petter Selasky 	if (!health->wq_watchdog)
7328d1eeedbSHans Petter Selasky 		goto err_watchdog;
7338d1eeedbSHans Petter Selasky 
7348d1eeedbSHans Petter Selasky 	snprintf(name, sizeof(name), "%s-cmd", dev_name(&dev->pdev->dev));
7358d1eeedbSHans Petter Selasky 	health->wq_cmd = create_singlethread_workqueue(name);
7368d1eeedbSHans Petter Selasky 	if (!health->wq_cmd)
7378d1eeedbSHans Petter Selasky 		goto err_cmd;
73840218d73SHans Petter Selasky 
739ca551594SHans Petter Selasky 	spin_lock_init(&health->wq_lock);
740a2485fe5SHans Petter Selasky 	INIT_WORK(&health->work, health_care);
741adb6fd50SHans Petter Selasky 	INIT_WORK(&health->work_watchdog, health_watchdog);
742a0a4fd77SHans Petter Selasky 	INIT_WORK(&health->work_cmd_completion, mlx5_trigger_cmd_completions);
7434bb7662bSHans Petter Selasky 	INIT_DELAYED_WORK(&health->recover_work, health_recover);
744a2485fe5SHans Petter Selasky 
745a2485fe5SHans Petter Selasky 	return 0;
7468d1eeedbSHans Petter Selasky 
7478d1eeedbSHans Petter Selasky err_cmd:
7488d1eeedbSHans Petter Selasky 	destroy_workqueue(health->wq_watchdog);
7498d1eeedbSHans Petter Selasky err_watchdog:
7508d1eeedbSHans Petter Selasky 	destroy_workqueue(health->wq);
7518d1eeedbSHans Petter Selasky err_recovery:
7528d1eeedbSHans Petter Selasky 	return -ENOMEM;
753dc7e38acSHans Petter Selasky }
754