1dc7e38acSHans Petter Selasky /*-
28d1eeedbSHans Petter Selasky * Copyright (c) 2013-2019, Mellanox Technologies, Ltd. All rights reserved.
3dc7e38acSHans Petter Selasky *
4dc7e38acSHans Petter Selasky * Redistribution and use in source and binary forms, with or without
5dc7e38acSHans Petter Selasky * modification, are permitted provided that the following conditions
6dc7e38acSHans Petter Selasky * are met:
7dc7e38acSHans Petter Selasky * 1. Redistributions of source code must retain the above copyright
8dc7e38acSHans Petter Selasky * notice, this list of conditions and the following disclaimer.
9dc7e38acSHans Petter Selasky * 2. Redistributions in binary form must reproduce the above copyright
10dc7e38acSHans Petter Selasky * notice, this list of conditions and the following disclaimer in the
11dc7e38acSHans Petter Selasky * documentation and/or other materials provided with the distribution.
12dc7e38acSHans Petter Selasky *
13dc7e38acSHans Petter Selasky * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
14dc7e38acSHans Petter Selasky * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15dc7e38acSHans Petter Selasky * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16dc7e38acSHans Petter Selasky * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
17dc7e38acSHans Petter Selasky * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18dc7e38acSHans Petter Selasky * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19dc7e38acSHans Petter Selasky * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20dc7e38acSHans Petter Selasky * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21dc7e38acSHans Petter Selasky * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22dc7e38acSHans Petter Selasky * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23dc7e38acSHans Petter Selasky * SUCH DAMAGE.
24dc7e38acSHans Petter Selasky */
25dc7e38acSHans Petter Selasky
26ee9d634bSKonstantin Belousov #include "opt_rss.h"
27ee9d634bSKonstantin Belousov #include "opt_ratelimit.h"
28ee9d634bSKonstantin Belousov
29dc7e38acSHans Petter Selasky #include <linux/kernel.h>
30dc7e38acSHans Petter Selasky #include <linux/module.h>
31dc7e38acSHans Petter Selasky #include <linux/random.h>
32dc7e38acSHans Petter Selasky #include <linux/vmalloc.h>
33a2485fe5SHans Petter Selasky #include <linux/hardirq.h>
341900b6f8SHans Petter Selasky #include <linux/delay.h>
35dc7e38acSHans Petter Selasky #include <dev/mlx5/driver.h>
36dc7e38acSHans Petter Selasky #include <dev/mlx5/mlx5_ifc.h>
3712c56d7dSHans Petter Selasky #include <dev/mlx5/mlx5_core/mlx5_core.h>
38dc7e38acSHans Petter Selasky
/*
 * Base distance between two health polls, in jiffies.  A random jitter
 * below HZ is added on top of it by get_next_poll_jiffies().
 */
#define	MLX5_HEALTH_POLL_INTERVAL	(2 * HZ)

/*
 * NOTE(review): presumably the number of consecutive polls without
 * progress that is tolerated before the device is reported unhealthy;
 * the consumer is outside this part of the file -- confirm.
 */
#define	MAX_MISSES			3
41dc7e38acSHans Petter Selasky
/*
 * Bit numbers used with test_bit() on mlx5_core_health.flags.  The
 * health and recovery bits are checked in this file before new work is
 * queued; when the bit is set the work is refused and an error is
 * logged.  The watchdog bit presumably gates the watchdog work the same
 * way (its user is not visible here).
 */
enum {
	MLX5_DROP_NEW_HEALTH_WORK	= 0,
	MLX5_DROP_NEW_RECOVERY_WORK	= 1,
	MLX5_DROP_NEW_WATCHDOG_WORK	= 2,
};
47ca551594SHans Petter Selasky
/*
 * Result codes of check_fatal_sensors().  The sensors are probed in the
 * numeric order below and the first one that fires is reported.
 */
enum {
	/* No sensor fired. */
	MLX5_SENSOR_NO_ERR		= 0,
	/* Health buffer fw_ver reads back as all ones. */
	MLX5_SENSOR_PCI_COMM_ERR	= 1,
	/* pci_channel_offline() reports the PCI channel as offline. */
	MLX5_SENSOR_PCI_ERR		= 2,
	/* NIC interface state is MLX5_NIC_IFC_DISABLED. */
	MLX5_SENSOR_NIC_DISABLED	= 3,
	/* NIC interface state is MLX5_NIC_IFC_SW_RESET. */
	MLX5_SENSOR_NIC_SW_RESET	= 4,
	/* Health buffer shows both a syndrome and a reset request. */
	MLX5_SENSOR_FW_SYND_RFR		= 5,
};
561900b6f8SHans Petter Selasky
5729e54451SSlava Shwartsman static int mlx5_fw_reset_enable = 1;
5829e54451SSlava Shwartsman SYSCTL_INT(_hw_mlx5, OID_AUTO, fw_reset_enable, CTLFLAG_RWTUN,
5929e54451SSlava Shwartsman &mlx5_fw_reset_enable, 0,
6029e54451SSlava Shwartsman "Enable firmware reset");
6129e54451SSlava Shwartsman
625169fb81SHans Petter Selasky static unsigned int sw_reset_to = 1200;
635169fb81SHans Petter Selasky SYSCTL_UINT(_hw_mlx5, OID_AUTO, sw_reset_timeout, CTLFLAG_RWTUN,
645169fb81SHans Petter Selasky &sw_reset_to, 0,
655169fb81SHans Petter Selasky "Minimum timeout in seconds between two firmware resets");
665169fb81SHans Petter Selasky
675169fb81SHans Petter Selasky
lock_sem_sw_reset(struct mlx5_core_dev * dev)68b575d8c8SHans Petter Selasky static int lock_sem_sw_reset(struct mlx5_core_dev *dev)
69f20b553dSHans Petter Selasky {
70b575d8c8SHans Petter Selasky int ret;
71f20b553dSHans Petter Selasky
72f20b553dSHans Petter Selasky /* Lock GW access */
73b575d8c8SHans Petter Selasky ret = -mlx5_vsc_lock(dev);
74f20b553dSHans Petter Selasky if (ret) {
75b575d8c8SHans Petter Selasky mlx5_core_warn(dev, "Timed out locking gateway %d\n", ret);
76f20b553dSHans Petter Selasky return ret;
77f20b553dSHans Petter Selasky }
78f20b553dSHans Petter Selasky
79b575d8c8SHans Petter Selasky ret = -mlx5_vsc_lock_addr_space(dev, MLX5_SEMAPHORE_SW_RESET);
80b575d8c8SHans Petter Selasky if (ret) {
81f20b553dSHans Petter Selasky if (ret == -EBUSY)
82a2f4f59cSHans Petter Selasky mlx5_core_dbg(dev,
83a2f4f59cSHans Petter Selasky "SW reset FW semaphore already locked, another function will handle the reset\n");
84f20b553dSHans Petter Selasky else
85a2f4f59cSHans Petter Selasky mlx5_core_warn(dev,
86a2f4f59cSHans Petter Selasky "SW reset semaphore lock return %d\n", ret);
87f20b553dSHans Petter Selasky }
88f20b553dSHans Petter Selasky
89f20b553dSHans Petter Selasky /* Unlock GW access */
90b575d8c8SHans Petter Selasky mlx5_vsc_unlock(dev);
91b575d8c8SHans Petter Selasky
92b575d8c8SHans Petter Selasky return ret;
93b575d8c8SHans Petter Selasky }
94b575d8c8SHans Petter Selasky
unlock_sem_sw_reset(struct mlx5_core_dev * dev)95b575d8c8SHans Petter Selasky static int unlock_sem_sw_reset(struct mlx5_core_dev *dev)
96b575d8c8SHans Petter Selasky {
97b575d8c8SHans Petter Selasky int ret;
98b575d8c8SHans Petter Selasky
99b575d8c8SHans Petter Selasky /* Lock GW access */
100b575d8c8SHans Petter Selasky ret = -mlx5_vsc_lock(dev);
101b575d8c8SHans Petter Selasky if (ret) {
102b575d8c8SHans Petter Selasky mlx5_core_warn(dev, "Timed out locking gateway %d\n", ret);
103b575d8c8SHans Petter Selasky return ret;
104b575d8c8SHans Petter Selasky }
105b575d8c8SHans Petter Selasky
106b575d8c8SHans Petter Selasky ret = -mlx5_vsc_unlock_addr_space(dev, MLX5_SEMAPHORE_SW_RESET);
107b575d8c8SHans Petter Selasky
108b575d8c8SHans Petter Selasky /* Unlock GW access */
109b575d8c8SHans Petter Selasky mlx5_vsc_unlock(dev);
110f20b553dSHans Petter Selasky
111f20b553dSHans Petter Selasky return ret;
112f20b553dSHans Petter Selasky }
113f20b553dSHans Petter Selasky
mlx5_get_nic_state(struct mlx5_core_dev * dev)114ba11bcecSHans Petter Selasky u8 mlx5_get_nic_state(struct mlx5_core_dev *dev)
115a2485fe5SHans Petter Selasky {
1161900b6f8SHans Petter Selasky return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 7;
117a2485fe5SHans Petter Selasky }
118a2485fe5SHans Petter Selasky
/*
 * Write a new NIC interface state into cmdq_addr_l_sz.
 *
 * The word is read first so that its upper 20 bits are written back
 * unchanged; the low 12 bits are replaced by the shifted state.
 */
void
mlx5_set_nic_state(struct mlx5_core_dev *dev, u8 state)
{
	u32 reg;

	reg = ioread32be(&dev->iseg->cmdq_addr_l_sz);
	reg &= 0xFFFFF000;
	reg |= state << MLX5_NIC_IFC_OFFSET;
	iowrite32be(reg, &dev->iseg->cmdq_addr_l_sz);
}
128ba11bcecSHans Petter Selasky
sensor_fw_synd_rfr(struct mlx5_core_dev * dev)129fe242ba7SHans Petter Selasky static bool sensor_fw_synd_rfr(struct mlx5_core_dev *dev)
130fe242ba7SHans Petter Selasky {
131fe242ba7SHans Petter Selasky struct mlx5_core_health *health = &dev->priv.health;
132fe242ba7SHans Petter Selasky struct mlx5_health_buffer __iomem *h = health->health;
133fe242ba7SHans Petter Selasky u32 rfr = ioread32be(&h->rfr) >> MLX5_RFR_OFFSET;
134fe242ba7SHans Petter Selasky u8 synd = ioread8(&h->synd);
135fe242ba7SHans Petter Selasky
136fe242ba7SHans Petter Selasky if (rfr && synd)
137fe242ba7SHans Petter Selasky mlx5_core_dbg(dev, "FW requests reset, synd: %d\n", synd);
138fe242ba7SHans Petter Selasky return rfr && synd;
139fe242ba7SHans Petter Selasky }
140fe242ba7SHans Petter Selasky
mlx5_trigger_cmd_completions(struct work_struct * work)141a0a4fd77SHans Petter Selasky static void mlx5_trigger_cmd_completions(struct work_struct *work)
142a2485fe5SHans Petter Selasky {
143a0a4fd77SHans Petter Selasky struct mlx5_core_dev *dev =
144a0a4fd77SHans Petter Selasky container_of(work, struct mlx5_core_dev, priv.health.work_cmd_completion);
145a2485fe5SHans Petter Selasky unsigned long flags;
146a2485fe5SHans Petter Selasky u64 vector;
147a2485fe5SHans Petter Selasky
148a2485fe5SHans Petter Selasky /* wait for pending handlers to complete */
149a2485fe5SHans Petter Selasky synchronize_irq(dev->priv.msix_arr[MLX5_EQ_VEC_CMD].vector);
150a2485fe5SHans Petter Selasky spin_lock_irqsave(&dev->cmd.alloc_lock, flags);
151a2485fe5SHans Petter Selasky vector = ~dev->cmd.bitmask & ((1ul << (1 << dev->cmd.log_sz)) - 1);
152a2485fe5SHans Petter Selasky if (!vector)
153a2485fe5SHans Petter Selasky goto no_trig;
154a2485fe5SHans Petter Selasky
155a2485fe5SHans Petter Selasky vector |= MLX5_TRIGGERED_CMD_COMP;
156a2485fe5SHans Petter Selasky spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags);
157a2485fe5SHans Petter Selasky
1582cec1528SKonstantin Belousov mlx5_core_dbg(dev, "vector 0x%jx\n", (uintmax_t)vector);
159721a1a6aSSlava Shwartsman mlx5_cmd_comp_handler(dev, vector, MLX5_CMD_MODE_EVENTS);
160a2485fe5SHans Petter Selasky return;
161a2485fe5SHans Petter Selasky
162a2485fe5SHans Petter Selasky no_trig:
163a2485fe5SHans Petter Selasky spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags);
164a2485fe5SHans Petter Selasky }
165a2485fe5SHans Petter Selasky
sensor_pci_no_comm(struct mlx5_core_dev * dev)1661900b6f8SHans Petter Selasky static bool sensor_pci_no_comm(struct mlx5_core_dev *dev)
167a2485fe5SHans Petter Selasky {
168a2485fe5SHans Petter Selasky struct mlx5_core_health *health = &dev->priv.health;
169a2485fe5SHans Petter Selasky struct mlx5_health_buffer __iomem *h = health->health;
1701900b6f8SHans Petter Selasky bool err = ioread32be(&h->fw_ver) == 0xffffffff;
171a2485fe5SHans Petter Selasky
1721900b6f8SHans Petter Selasky return err;
1731900b6f8SHans Petter Selasky }
174a2485fe5SHans Petter Selasky
sensor_nic_disabled(struct mlx5_core_dev * dev)1751900b6f8SHans Petter Selasky static bool sensor_nic_disabled(struct mlx5_core_dev *dev)
1761900b6f8SHans Petter Selasky {
177ba11bcecSHans Petter Selasky return mlx5_get_nic_state(dev) == MLX5_NIC_IFC_DISABLED;
1781900b6f8SHans Petter Selasky }
179a2485fe5SHans Petter Selasky
sensor_nic_sw_reset(struct mlx5_core_dev * dev)1801900b6f8SHans Petter Selasky static bool sensor_nic_sw_reset(struct mlx5_core_dev *dev)
1811900b6f8SHans Petter Selasky {
182ba11bcecSHans Petter Selasky return mlx5_get_nic_state(dev) == MLX5_NIC_IFC_SW_RESET;
1831900b6f8SHans Petter Selasky }
1841900b6f8SHans Petter Selasky
check_fatal_sensors(struct mlx5_core_dev * dev)1851900b6f8SHans Petter Selasky static u32 check_fatal_sensors(struct mlx5_core_dev *dev)
1861900b6f8SHans Petter Selasky {
1871900b6f8SHans Petter Selasky if (sensor_pci_no_comm(dev))
1881900b6f8SHans Petter Selasky return MLX5_SENSOR_PCI_COMM_ERR;
1891900b6f8SHans Petter Selasky if (pci_channel_offline(dev->pdev))
1901900b6f8SHans Petter Selasky return MLX5_SENSOR_PCI_ERR;
1911900b6f8SHans Petter Selasky if (sensor_nic_disabled(dev))
1921900b6f8SHans Petter Selasky return MLX5_SENSOR_NIC_DISABLED;
1931900b6f8SHans Petter Selasky if (sensor_nic_sw_reset(dev))
1941900b6f8SHans Petter Selasky return MLX5_SENSOR_NIC_SW_RESET;
195fe242ba7SHans Petter Selasky if (sensor_fw_synd_rfr(dev))
196fe242ba7SHans Petter Selasky return MLX5_SENSOR_FW_SYND_RFR;
1971900b6f8SHans Petter Selasky
1981900b6f8SHans Petter Selasky return MLX5_SENSOR_NO_ERR;
199a2485fe5SHans Petter Selasky }
200a2485fe5SHans Petter Selasky
reset_fw_if_needed(struct mlx5_core_dev * dev)201fe242ba7SHans Petter Selasky static void reset_fw_if_needed(struct mlx5_core_dev *dev)
202fe242ba7SHans Petter Selasky {
20329e54451SSlava Shwartsman bool supported;
204fe242ba7SHans Petter Selasky u32 cmdq_addr, fatal_error;
205fe242ba7SHans Petter Selasky
20629e54451SSlava Shwartsman if (!mlx5_fw_reset_enable)
20729e54451SSlava Shwartsman return;
20829e54451SSlava Shwartsman supported = (ioread32be(&dev->iseg->initializing) >>
20929e54451SSlava Shwartsman MLX5_FW_RESET_SUPPORTED_OFFSET) & 1;
210fe242ba7SHans Petter Selasky if (!supported)
211fe242ba7SHans Petter Selasky return;
212fe242ba7SHans Petter Selasky
213fe242ba7SHans Petter Selasky /* The reset only needs to be issued by one PF. The health buffer is
214fe242ba7SHans Petter Selasky * shared between all functions, and will be cleared during a reset.
215fe242ba7SHans Petter Selasky * Check again to avoid a redundant 2nd reset. If the fatal erros was
216fe242ba7SHans Petter Selasky * PCI related a reset won't help.
217fe242ba7SHans Petter Selasky */
218fe242ba7SHans Petter Selasky fatal_error = check_fatal_sensors(dev);
219fe242ba7SHans Petter Selasky if (fatal_error == MLX5_SENSOR_PCI_COMM_ERR ||
220fe242ba7SHans Petter Selasky fatal_error == MLX5_SENSOR_NIC_DISABLED ||
221d28b6b55SHans Petter Selasky fatal_error == MLX5_SENSOR_NIC_SW_RESET) {
222a2f4f59cSHans Petter Selasky mlx5_core_warn(dev,
223a2f4f59cSHans Petter Selasky "Not issuing FW reset. Either it's already done or won't help.\n");
224fe242ba7SHans Petter Selasky return;
225fe242ba7SHans Petter Selasky }
226fe242ba7SHans Petter Selasky
227a2f4f59cSHans Petter Selasky mlx5_core_info(dev, "Issuing FW Reset\n");
228fe242ba7SHans Petter Selasky /* Write the NIC interface field to initiate the reset, the command
229fe242ba7SHans Petter Selasky * interface address also resides here, don't overwrite it.
230fe242ba7SHans Petter Selasky */
231fe242ba7SHans Petter Selasky cmdq_addr = ioread32be(&dev->iseg->cmdq_addr_l_sz);
232fe242ba7SHans Petter Selasky iowrite32be((cmdq_addr & 0xFFFFF000) |
233fe242ba7SHans Petter Selasky MLX5_NIC_IFC_SW_RESET << MLX5_NIC_IFC_OFFSET,
234fe242ba7SHans Petter Selasky &dev->iseg->cmdq_addr_l_sz);
235fe242ba7SHans Petter Selasky }
236fe242ba7SHans Petter Selasky
2375169fb81SHans Petter Selasky static bool
mlx5_health_allow_reset(struct mlx5_core_dev * dev)2385169fb81SHans Petter Selasky mlx5_health_allow_reset(struct mlx5_core_dev *dev)
2395169fb81SHans Petter Selasky {
2405169fb81SHans Petter Selasky struct mlx5_core_health *health = &dev->priv.health;
2415169fb81SHans Petter Selasky unsigned int delta;
2425169fb81SHans Petter Selasky bool ret;
2435169fb81SHans Petter Selasky
2445169fb81SHans Petter Selasky if (health->last_reset_req != 0) {
2455169fb81SHans Petter Selasky delta = ticks - health->last_reset_req;
2465169fb81SHans Petter Selasky delta /= hz;
2475169fb81SHans Petter Selasky ret = delta >= sw_reset_to;
2485169fb81SHans Petter Selasky } else {
2495169fb81SHans Petter Selasky ret = true;
2505169fb81SHans Petter Selasky }
2515169fb81SHans Petter Selasky
2525169fb81SHans Petter Selasky /*
2535169fb81SHans Petter Selasky * In principle, ticks may be 0. Setting it to off by one (-1)
2545169fb81SHans Petter Selasky * to prevent certain reset in next request.
2555169fb81SHans Petter Selasky */
2565169fb81SHans Petter Selasky health->last_reset_req = ticks ? : -1;
2575169fb81SHans Petter Selasky if (!ret)
258a2f4f59cSHans Petter Selasky mlx5_core_warn(dev,
259a2f4f59cSHans Petter Selasky "Firmware reset elided due to auto-reset frequency threshold.\n");
2605169fb81SHans Petter Selasky return (ret);
2615169fb81SHans Petter Selasky }
2625169fb81SHans Petter Selasky
/*
 * Timing of the wait for MLX5_NIC_IFC_DISABLED, all in milliseconds.
 * The replacement values are kept as plain literals because the two
 * shorter names are defined a second time, identically, further down.
 */
/* Wait used by mlx5_enter_error_state() when it did not issue a reset. */
#define	MLX5_CRDUMP_WAIT_MS	60000
/* Wait used after a reset was issued, and by health_recover(). */
#define	MLX5_FW_RESET_WAIT_MS	1000
/* Sleep between two reads of the NIC interface state. */
#define	MLX5_NIC_STATE_POLL_MS	5
/*
 * Move the device into the internal-error state and notify consumers.
 *
 * dev:   device to act on.
 * force: enter the error state even if no fatal sensor fires; in that
 *        case the firmware dump/reset and the wait for the NIC to become
 *        disabled are skipped.
 *
 * When a sensor fires (or force is set) dev->state is switched to
 * MLX5_DEVICE_STATE_INTERNAL_ERROR; if it already had that value the
 * function returns at once.  Otherwise outstanding commands are
 * completed through the command work queue.  If the sensor is
 * MLX5_SENSOR_FW_SYND_RFR and the rate limiter agrees, a PF that gets
 * the SW reset semaphore takes a firmware dump and requests a firmware
 * reset.  Finally MLX5_DEV_EVENT_SYS_ERROR is delivered under
 * intf_state_mutex.
 */
void
mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force)
{
	/* Unsigned to match the "%u" it is printed with below. */
	unsigned int delay_ms = MLX5_CRDUMP_WAIT_MS;
	/* Same type as the jiffies deadline in health_recover(). */
	unsigned long end;
	u32 fatal_error;
	/* Stays -EBUSY unless this function takes the reset semaphore. */
	int lock = -EBUSY;

	fatal_error = check_fatal_sensors(dev);

	if (fatal_error || force) {
		/* Only the caller that performs the transition continues. */
		if (xchg(&dev->state, MLX5_DEVICE_STATE_INTERNAL_ERROR) ==
		    MLX5_DEVICE_STATE_INTERNAL_ERROR)
			return;
		if (!force)
			mlx5_core_err(dev, "internal state error detected\n");

		/*
		 * Queue the command completion handler on the command
		 * work queue to avoid racing with the real command
		 * completion handler and then wait for it to
		 * complete:
		 */
		queue_work(dev->priv.health.wq_cmd,
		    &dev->priv.health.work_cmd_completion);
		flush_workqueue(dev->priv.health.wq_cmd);
	}

	mutex_lock(&dev->intf_state_mutex);

	if (force)
		goto err_state_done;

	if (fatal_error == MLX5_SENSOR_FW_SYND_RFR &&
	    mlx5_health_allow_reset(dev)) {
		/* Get cr-dump and reset FW semaphore */
		if (mlx5_core_is_pf(dev))
			lock = lock_sem_sw_reset(dev);

		/* Execute cr-dump and SW reset */
		if (lock != -EBUSY) {
			(void)mlx5_fwdump(dev);
			reset_fw_if_needed(dev);
			delay_ms = MLX5_FW_RESET_WAIT_MS;
		}
	}

	/* Recover from SW reset */
	end = jiffies + msecs_to_jiffies(delay_ms);
	do {
		if (sensor_nic_disabled(dev))
			break;

		msleep(MLX5_NIC_STATE_POLL_MS);
	} while (!time_after(jiffies, end));

	if (!sensor_nic_disabled(dev)) {
		mlx5_core_err(dev, "NIC IFC still %d after %ums.\n",
		    mlx5_get_nic_state(dev), delay_ms);
	}

	/* Release FW semaphore if you are the lock owner */
	if (!lock)
		unlock_sem_sw_reset(dev);

	mlx5_core_info(dev, "System error event triggered\n");

err_state_done:
	mlx5_core_event(dev, MLX5_DEV_EVENT_SYS_ERROR, 1);
	mutex_unlock(&dev->intf_state_mutex);
}
334a2485fe5SHans Petter Selasky
mlx5_handle_bad_state(struct mlx5_core_dev * dev)335a2485fe5SHans Petter Selasky static void mlx5_handle_bad_state(struct mlx5_core_dev *dev)
336a2485fe5SHans Petter Selasky {
337ba11bcecSHans Petter Selasky u8 nic_mode = mlx5_get_nic_state(dev);
338a2485fe5SHans Petter Selasky
3391900b6f8SHans Petter Selasky if (nic_mode == MLX5_NIC_IFC_SW_RESET) {
3401900b6f8SHans Petter Selasky /* The IFC mode field is 3 bits, so it will read 0x7 in two cases:
3411900b6f8SHans Petter Selasky * 1. PCI has been disabled (ie. PCI-AER, PF driver unloaded
3421900b6f8SHans Petter Selasky * and this is a VF), this is not recoverable by SW reset.
3431900b6f8SHans Petter Selasky * Logging of this is handled elsewhere.
3441900b6f8SHans Petter Selasky * 2. FW reset has been issued by another function, driver can
3451900b6f8SHans Petter Selasky * be reloaded to recover after the mode switches to
3461900b6f8SHans Petter Selasky * MLX5_NIC_IFC_DISABLED.
3471900b6f8SHans Petter Selasky */
3481900b6f8SHans Petter Selasky if (dev->priv.health.fatal_error != MLX5_SENSOR_PCI_COMM_ERR)
349a2f4f59cSHans Petter Selasky mlx5_core_warn(dev,
350a2f4f59cSHans Petter Selasky "NIC SW reset is already progress\n");
3511900b6f8SHans Petter Selasky else
352a2f4f59cSHans Petter Selasky mlx5_core_warn(dev,
353a2f4f59cSHans Petter Selasky "Communication with FW over the PCI link is down\n");
3541900b6f8SHans Petter Selasky } else {
3551900b6f8SHans Petter Selasky mlx5_core_warn(dev, "NIC mode %d\n", nic_mode);
356a2485fe5SHans Petter Selasky }
357a2485fe5SHans Petter Selasky
358a2485fe5SHans Petter Selasky mlx5_disable_device(dev);
359a2485fe5SHans Petter Selasky }
360dc7e38acSHans Petter Selasky
/*
 * These two names are already defined earlier in this file with the
 * same values; an identical redefinition is permitted by C.  They give
 * health_recover() its deadline and its polling step, in milliseconds.
 */
#define	MLX5_FW_RESET_WAIT_MS	1000
#define	MLX5_NIC_STATE_POLL_MS	5
health_recover(struct work_struct * work)3634bb7662bSHans Petter Selasky static void health_recover(struct work_struct *work)
3644bb7662bSHans Petter Selasky {
3651900b6f8SHans Petter Selasky unsigned long end = jiffies + msecs_to_jiffies(MLX5_FW_RESET_WAIT_MS);
3664bb7662bSHans Petter Selasky struct mlx5_core_health *health;
3674bb7662bSHans Petter Selasky struct delayed_work *dwork;
3684bb7662bSHans Petter Selasky struct mlx5_core_dev *dev;
3694bb7662bSHans Petter Selasky struct mlx5_priv *priv;
370f20b553dSHans Petter Selasky bool recover = true;
3711900b6f8SHans Petter Selasky u8 nic_mode;
3724bb7662bSHans Petter Selasky
3734bb7662bSHans Petter Selasky dwork = container_of(work, struct delayed_work, work);
3744bb7662bSHans Petter Selasky health = container_of(dwork, struct mlx5_core_health, recover_work);
3754bb7662bSHans Petter Selasky priv = container_of(health, struct mlx5_priv, health);
3764bb7662bSHans Petter Selasky dev = container_of(priv, struct mlx5_core_dev, priv);
3774bb7662bSHans Petter Selasky
378c6df6f53SWarner Losh /* This might likely be wrong, cut and paste from elsewhere? */
379c6df6f53SWarner Losh bus_topo_lock();
380ca2345a0SHans Petter Selasky
3811900b6f8SHans Petter Selasky if (sensor_pci_no_comm(dev)) {
382a2f4f59cSHans Petter Selasky mlx5_core_err(dev,
383a2f4f59cSHans Petter Selasky "health recovery flow aborted, PCI reads still not working\n");
384f20b553dSHans Petter Selasky recover = false;
3851900b6f8SHans Petter Selasky }
3861900b6f8SHans Petter Selasky
387ba11bcecSHans Petter Selasky nic_mode = mlx5_get_nic_state(dev);
3881900b6f8SHans Petter Selasky while (nic_mode != MLX5_NIC_IFC_DISABLED &&
3891900b6f8SHans Petter Selasky !time_after(jiffies, end)) {
3901900b6f8SHans Petter Selasky msleep(MLX5_NIC_STATE_POLL_MS);
391ba11bcecSHans Petter Selasky nic_mode = mlx5_get_nic_state(dev);
3921900b6f8SHans Petter Selasky }
3931900b6f8SHans Petter Selasky
3941900b6f8SHans Petter Selasky if (nic_mode != MLX5_NIC_IFC_DISABLED) {
395a2f4f59cSHans Petter Selasky mlx5_core_err(dev,
396a2f4f59cSHans Petter Selasky "health recovery flow aborted, unexpected NIC IFC mode %d.\n",
3971900b6f8SHans Petter Selasky nic_mode);
398f20b553dSHans Petter Selasky recover = false;
3994bb7662bSHans Petter Selasky }
4004bb7662bSHans Petter Selasky
401f20b553dSHans Petter Selasky if (recover) {
402a2f4f59cSHans Petter Selasky mlx5_core_info(dev, "Starting health recovery flow\n");
4034bb7662bSHans Petter Selasky mlx5_recover_device(dev);
4044bb7662bSHans Petter Selasky }
405ca2345a0SHans Petter Selasky
406c6df6f53SWarner Losh bus_topo_unlock();
407f20b553dSHans Petter Selasky }
4084bb7662bSHans Petter Selasky
/*
 * Delay, in milliseconds, between detecting a bad device and running
 * the recovery work; chosen by get_recovery_delay().
 */
#define	MLX5_RECOVERY_DELAY_MSECS	60000
#define	MLX5_RECOVERY_NO_DELAY		0
get_recovery_delay(struct mlx5_core_dev * dev)4121900b6f8SHans Petter Selasky static unsigned long get_recovery_delay(struct mlx5_core_dev *dev)
4131900b6f8SHans Petter Selasky {
4141900b6f8SHans Petter Selasky return dev->priv.health.fatal_error == MLX5_SENSOR_PCI_ERR ||
4151900b6f8SHans Petter Selasky dev->priv.health.fatal_error == MLX5_SENSOR_PCI_COMM_ERR ?
4161900b6f8SHans Petter Selasky MLX5_RECOVERY_DELAY_MSECS : MLX5_RECOVERY_NO_DELAY;
4171900b6f8SHans Petter Selasky }
4181900b6f8SHans Petter Selasky
health_care(struct work_struct * work)419dc7e38acSHans Petter Selasky static void health_care(struct work_struct *work)
420dc7e38acSHans Petter Selasky {
421a2485fe5SHans Petter Selasky struct mlx5_core_health *health;
4221900b6f8SHans Petter Selasky unsigned long recover_delay;
423dc7e38acSHans Petter Selasky struct mlx5_core_dev *dev;
424dc7e38acSHans Petter Selasky struct mlx5_priv *priv;
4254bb7662bSHans Petter Selasky unsigned long flags;
426dc7e38acSHans Petter Selasky
427a2485fe5SHans Petter Selasky health = container_of(work, struct mlx5_core_health, work);
428dc7e38acSHans Petter Selasky priv = container_of(health, struct mlx5_priv, health);
429dc7e38acSHans Petter Selasky dev = container_of(priv, struct mlx5_core_dev, priv);
430f20b553dSHans Petter Selasky
431dc7e38acSHans Petter Selasky mlx5_core_warn(dev, "handling bad device here\n");
432a2485fe5SHans Petter Selasky mlx5_handle_bad_state(dev);
4331900b6f8SHans Petter Selasky recover_delay = msecs_to_jiffies(get_recovery_delay(dev));
4344bb7662bSHans Petter Selasky
4354bb7662bSHans Petter Selasky spin_lock_irqsave(&health->wq_lock, flags);
436fe242ba7SHans Petter Selasky if (!test_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags)) {
437a2f4f59cSHans Petter Selasky mlx5_core_warn(dev,
438a2f4f59cSHans Petter Selasky "Scheduling recovery work with %lums delay\n",
439fe242ba7SHans Petter Selasky recover_delay);
4404bb7662bSHans Petter Selasky schedule_delayed_work(&health->recover_work, recover_delay);
441fe242ba7SHans Petter Selasky } else {
442a2f4f59cSHans Petter Selasky mlx5_core_err(dev,
4434bb7662bSHans Petter Selasky "new health works are not permitted at this stage\n");
444fe242ba7SHans Petter Selasky }
4454bb7662bSHans Petter Selasky spin_unlock_irqrestore(&health->wq_lock, flags);
446dc7e38acSHans Petter Selasky }
447a2485fe5SHans Petter Selasky
get_next_poll_jiffies(void)448a2485fe5SHans Petter Selasky static int get_next_poll_jiffies(void)
449a2485fe5SHans Petter Selasky {
450a2485fe5SHans Petter Selasky unsigned long next;
451a2485fe5SHans Petter Selasky
452a2485fe5SHans Petter Selasky get_random_bytes(&next, sizeof(next));
453a2485fe5SHans Petter Selasky next %= HZ;
454a2485fe5SHans Petter Selasky next += jiffies + MLX5_HEALTH_POLL_INTERVAL;
455a2485fe5SHans Petter Selasky
456a2485fe5SHans Petter Selasky return next;
457dc7e38acSHans Petter Selasky }
458dc7e38acSHans Petter Selasky
mlx5_trigger_health_work(struct mlx5_core_dev * dev)4594bb7662bSHans Petter Selasky void mlx5_trigger_health_work(struct mlx5_core_dev *dev)
4604bb7662bSHans Petter Selasky {
4614bb7662bSHans Petter Selasky struct mlx5_core_health *health = &dev->priv.health;
4624bb7662bSHans Petter Selasky unsigned long flags;
4634bb7662bSHans Petter Selasky
4644bb7662bSHans Petter Selasky spin_lock_irqsave(&health->wq_lock, flags);
4654bb7662bSHans Petter Selasky if (!test_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags))
4664bb7662bSHans Petter Selasky queue_work(health->wq, &health->work);
4674bb7662bSHans Petter Selasky else
468a2f4f59cSHans Petter Selasky mlx5_core_err(dev,
4694bb7662bSHans Petter Selasky "new health works are not permitted at this stage\n");
4704bb7662bSHans Petter Selasky spin_unlock_irqrestore(&health->wq_lock, flags);
4714bb7662bSHans Petter Selasky }
4724bb7662bSHans Petter Selasky
/*
 * Translate a health syndrome code into a short description.
 *
 * The returned string is a constant without a trailing newline; the
 * caller supplies its own line ending.  Unknown codes yield
 * "unrecognized error".
 */
static const char *
hsynd_str(u8 synd)
{
	switch (synd) {
	case MLX5_HEALTH_SYNDR_FW_ERR:
		return "firmware internal error";
	case MLX5_HEALTH_SYNDR_IRISC_ERR:
		return "irisc not responding";
	case MLX5_HEALTH_SYNDR_HW_UNRECOVERABLE_ERR:
		return "unrecoverable hardware error";
	case MLX5_HEALTH_SYNDR_CRC_ERR:
		return "firmware CRC error";
	case MLX5_HEALTH_SYNDR_FETCH_PCI_ERR:
		return "ICM fetch PCI error";
	case MLX5_HEALTH_SYNDR_HW_FTL_ERR:
		return "HW fatal error";
	case MLX5_HEALTH_SYNDR_ASYNC_EQ_OVERRUN_ERR:
		return "async EQ buffer overrun";
	case MLX5_HEALTH_SYNDR_EQ_ERR:
		return "EQ error";
	case MLX5_HEALTH_SYNDR_EQ_INV:
		return "Invalid EQ referenced";
	case MLX5_HEALTH_SYNDR_FFSER_ERR:
		return "FFSER error";
	case MLX5_HEALTH_SYNDR_HIGH_TEMP:
		return "High temperature";
	default:
		return "unrecognized error";
	}
}
502dc7e38acSHans Petter Selasky
503c9bb26aeSHans Petter Selasky static u8
print_health_info(struct mlx5_core_dev * dev)504c9bb26aeSHans Petter Selasky print_health_info(struct mlx5_core_dev *dev)
505dc7e38acSHans Petter Selasky {
506dc7e38acSHans Petter Selasky struct mlx5_core_health *health = &dev->priv.health;
507dc7e38acSHans Petter Selasky struct mlx5_health_buffer __iomem *h = health->health;
508c9bb26aeSHans Petter Selasky u8 synd = ioread8(&h->synd);
509a2485fe5SHans Petter Selasky char fw_str[18];
510a2485fe5SHans Petter Selasky u32 fw;
511dc7e38acSHans Petter Selasky int i;
512dc7e38acSHans Petter Selasky
513c9bb26aeSHans Petter Selasky /*
514c9bb26aeSHans Petter Selasky * If synd is 0x0 - this indicates that FW is unable to
515c9bb26aeSHans Petter Selasky * respond to initialization segment reads and health buffer
516c9bb26aeSHans Petter Selasky * should not be read.
517c9bb26aeSHans Petter Selasky */
518c9bb26aeSHans Petter Selasky if (synd == 0)
519c9bb26aeSHans Petter Selasky return (0);
520dc7e38acSHans Petter Selasky
521a2485fe5SHans Petter Selasky for (i = 0; i < ARRAY_SIZE(h->assert_var); i++)
522a2f4f59cSHans Petter Selasky mlx5_core_info(dev, "assert_var[%d] 0x%08x\n", i,
523a2f4f59cSHans Petter Selasky ioread32be(h->assert_var + i));
524a2485fe5SHans Petter Selasky
525a2f4f59cSHans Petter Selasky mlx5_core_info(dev, "assert_exit_ptr 0x%08x\n",
526a2f4f59cSHans Petter Selasky ioread32be(&h->assert_exit_ptr));
527a2f4f59cSHans Petter Selasky mlx5_core_info(dev, "assert_callra 0x%08x\n",
528a2f4f59cSHans Petter Selasky ioread32be(&h->assert_callra));
529a2f4f59cSHans Petter Selasky snprintf(fw_str, sizeof(fw_str), "%d.%d.%d",
530a2f4f59cSHans Petter Selasky fw_rev_maj(dev), fw_rev_min(dev), fw_rev_sub(dev));
531a2f4f59cSHans Petter Selasky mlx5_core_info(dev, "fw_ver %s\n", fw_str);
532a2f4f59cSHans Petter Selasky mlx5_core_info(dev, "hw_id 0x%08x\n", ioread32be(&h->hw_id));
533a2f4f59cSHans Petter Selasky mlx5_core_info(dev, "irisc_index %d\n", ioread8(&h->irisc_index));
534a2f4f59cSHans Petter Selasky mlx5_core_info(dev, "synd 0x%x: %s\n",
535a2f4f59cSHans Petter Selasky ioread8(&h->synd), hsynd_str(ioread8(&h->synd)));
536a2f4f59cSHans Petter Selasky mlx5_core_info(dev, "ext_synd 0x%04x\n", ioread16be(&h->ext_synd));
537a2485fe5SHans Petter Selasky fw = ioread32be(&h->fw_ver);
538a2f4f59cSHans Petter Selasky mlx5_core_info(dev, "raw fw_ver 0x%08x\n", fw);
539c9bb26aeSHans Petter Selasky
540c9bb26aeSHans Petter Selasky return synd;
541dc7e38acSHans Petter Selasky }
542dc7e38acSHans Petter Selasky
health_watchdog(struct work_struct * work)543adb6fd50SHans Petter Selasky static void health_watchdog(struct work_struct *work)
544adb6fd50SHans Petter Selasky {
545adb6fd50SHans Petter Selasky struct mlx5_core_dev *dev;
546adb6fd50SHans Petter Selasky u16 power;
547adb6fd50SHans Petter Selasky u8 status;
548adb6fd50SHans Petter Selasky int err;
549adb6fd50SHans Petter Selasky
550adb6fd50SHans Petter Selasky dev = container_of(work, struct mlx5_core_dev, priv.health.work_watchdog);
551adb6fd50SHans Petter Selasky
552adb6fd50SHans Petter Selasky if (!MLX5_CAP_GEN(dev, mcam_reg) ||
553adb6fd50SHans Petter Selasky !MLX5_CAP_MCAM_FEATURE(dev, pcie_status_and_power))
554adb6fd50SHans Petter Selasky return;
555adb6fd50SHans Petter Selasky
556adb6fd50SHans Petter Selasky err = mlx5_pci_read_power_status(dev, &power, &status);
557adb6fd50SHans Petter Selasky if (err < 0) {
558a2f4f59cSHans Petter Selasky mlx5_core_warn(dev, "Failed reading power status: %d\n",
559a2f4f59cSHans Petter Selasky err);
560adb6fd50SHans Petter Selasky return;
561adb6fd50SHans Petter Selasky }
562adb6fd50SHans Petter Selasky
563adb6fd50SHans Petter Selasky dev->pwr_value = power;
564adb6fd50SHans Petter Selasky
565adb6fd50SHans Petter Selasky if (dev->pwr_status != status) {
566adb6fd50SHans Petter Selasky
567adb6fd50SHans Petter Selasky switch (status) {
568adb6fd50SHans Petter Selasky case 0:
569adb6fd50SHans Petter Selasky dev->pwr_status = status;
570a2f4f59cSHans Petter Selasky mlx5_core_info(dev,
571a2f4f59cSHans Petter Selasky "PCI power is not published by the PCIe slot.\n");
572adb6fd50SHans Petter Selasky break;
573adb6fd50SHans Petter Selasky case 1:
574adb6fd50SHans Petter Selasky dev->pwr_status = status;
575a2f4f59cSHans Petter Selasky mlx5_core_info(dev,
576a2f4f59cSHans Petter Selasky "PCIe slot advertised sufficient power (%uW).\n",
577a2f4f59cSHans Petter Selasky power);
578adb6fd50SHans Petter Selasky break;
579adb6fd50SHans Petter Selasky case 2:
580adb6fd50SHans Petter Selasky dev->pwr_status = status;
581a2f4f59cSHans Petter Selasky mlx5_core_warn(dev,
582a2f4f59cSHans Petter Selasky "Detected insufficient power on the PCIe slot (%uW).\n",
583a2f4f59cSHans Petter Selasky power);
584adb6fd50SHans Petter Selasky break;
585adb6fd50SHans Petter Selasky default:
586adb6fd50SHans Petter Selasky dev->pwr_status = 0;
587a2f4f59cSHans Petter Selasky mlx5_core_warn(dev,
588a2f4f59cSHans Petter Selasky "Unknown power state detected(%d).\n",
589a2f4f59cSHans Petter Selasky status);
590adb6fd50SHans Petter Selasky break;
591adb6fd50SHans Petter Selasky }
592adb6fd50SHans Petter Selasky }
593adb6fd50SHans Petter Selasky }
594adb6fd50SHans Petter Selasky
595adb6fd50SHans Petter Selasky void
mlx5_trigger_health_watchdog(struct mlx5_core_dev * dev)596adb6fd50SHans Petter Selasky mlx5_trigger_health_watchdog(struct mlx5_core_dev *dev)
597adb6fd50SHans Petter Selasky {
598adb6fd50SHans Petter Selasky struct mlx5_core_health *health = &dev->priv.health;
599adb6fd50SHans Petter Selasky unsigned long flags;
600adb6fd50SHans Petter Selasky
601adb6fd50SHans Petter Selasky spin_lock_irqsave(&health->wq_lock, flags);
602adb6fd50SHans Petter Selasky if (!test_bit(MLX5_DROP_NEW_WATCHDOG_WORK, &health->flags))
603adb6fd50SHans Petter Selasky queue_work(health->wq_watchdog, &health->work_watchdog);
604adb6fd50SHans Petter Selasky else
605a2f4f59cSHans Petter Selasky mlx5_core_err(dev,
606adb6fd50SHans Petter Selasky "scheduling watchdog is not permitted at this stage\n");
607adb6fd50SHans Petter Selasky spin_unlock_irqrestore(&health->wq_lock, flags);
608adb6fd50SHans Petter Selasky }
609adb6fd50SHans Petter Selasky
poll_health(unsigned long data)61003ab395eSHans Petter Selasky static void poll_health(unsigned long data)
611dc7e38acSHans Petter Selasky {
612dc7e38acSHans Petter Selasky struct mlx5_core_dev *dev = (struct mlx5_core_dev *)data;
613dc7e38acSHans Petter Selasky struct mlx5_core_health *health = &dev->priv.health;
6141900b6f8SHans Petter Selasky u32 fatal_error;
615dc7e38acSHans Petter Selasky u32 count;
616dc7e38acSHans Petter Selasky
61730dfc051SHans Petter Selasky if (dev->state != MLX5_DEVICE_STATE_UP)
61830dfc051SHans Petter Selasky return;
61930dfc051SHans Petter Selasky
620dc7e38acSHans Petter Selasky count = ioread32be(health->health_counter);
621dc7e38acSHans Petter Selasky if (count == health->prev)
622dc7e38acSHans Petter Selasky ++health->miss_counter;
623dc7e38acSHans Petter Selasky else
624dc7e38acSHans Petter Selasky health->miss_counter = 0;
625dc7e38acSHans Petter Selasky
626dc7e38acSHans Petter Selasky health->prev = count;
627dc7e38acSHans Petter Selasky if (health->miss_counter == MAX_MISSES) {
628a2485fe5SHans Petter Selasky mlx5_core_err(dev, "device's health compromised - reached miss count\n");
629c9bb26aeSHans Petter Selasky if (print_health_info(dev) == 0)
630c9bb26aeSHans Petter Selasky mlx5_core_err(dev, "FW is unable to respond to initialization segment reads\n");
631a2485fe5SHans Petter Selasky }
632a2485fe5SHans Petter Selasky
6331900b6f8SHans Petter Selasky fatal_error = check_fatal_sensors(dev);
6341900b6f8SHans Petter Selasky
6351900b6f8SHans Petter Selasky if (fatal_error && !health->fatal_error) {
636a2f4f59cSHans Petter Selasky mlx5_core_err(dev,
637a2f4f59cSHans Petter Selasky "Fatal error %u detected\n", fatal_error);
6381900b6f8SHans Petter Selasky dev->priv.health.fatal_error = fatal_error;
639a2485fe5SHans Petter Selasky print_health_info(dev);
6404bb7662bSHans Petter Selasky mlx5_trigger_health_work(dev);
641dc7e38acSHans Petter Selasky }
6424bb7662bSHans Petter Selasky
6434bb7662bSHans Petter Selasky mod_timer(&health->timer, get_next_poll_jiffies());
644dc7e38acSHans Petter Selasky }
645dc7e38acSHans Petter Selasky
mlx5_start_health_poll(struct mlx5_core_dev * dev)646dc7e38acSHans Petter Selasky void mlx5_start_health_poll(struct mlx5_core_dev *dev)
647dc7e38acSHans Petter Selasky {
648dc7e38acSHans Petter Selasky struct mlx5_core_health *health = &dev->priv.health;
649dc7e38acSHans Petter Selasky
650dc7e38acSHans Petter Selasky init_timer(&health->timer);
6511900b6f8SHans Petter Selasky health->fatal_error = MLX5_SENSOR_NO_ERR;
652ca551594SHans Petter Selasky clear_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
653519774eaSHans Petter Selasky clear_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags);
654adb6fd50SHans Petter Selasky clear_bit(MLX5_DROP_NEW_WATCHDOG_WORK, &health->flags);
655dc7e38acSHans Petter Selasky health->health = &dev->iseg->health;
656dc7e38acSHans Petter Selasky health->health_counter = &dev->iseg->health_counter;
657dc7e38acSHans Petter Selasky
65803ab395eSHans Petter Selasky setup_timer(&health->timer, poll_health, (unsigned long)dev);
659dc7e38acSHans Petter Selasky mod_timer(&health->timer,
660dc7e38acSHans Petter Selasky round_jiffies(jiffies + MLX5_HEALTH_POLL_INTERVAL));
661adb6fd50SHans Petter Selasky
662adb6fd50SHans Petter Selasky /* do initial PCI power state readout */
663adb6fd50SHans Petter Selasky mlx5_trigger_health_watchdog(dev);
664dc7e38acSHans Petter Selasky }
665dc7e38acSHans Petter Selasky
/*
 * Stop the health polling timer.
 *
 * When "disable_health" is true, the flags which make new health,
 * recovery and watchdog work get dropped are set first, under wq_lock.
 */
void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health)
{
	struct mlx5_core_health *hp = &dev->priv.health;
	unsigned long irq_flags;

	if (disable_health) {
		spin_lock_irqsave(&hp->wq_lock, irq_flags);
		set_bit(MLX5_DROP_NEW_HEALTH_WORK, &hp->flags);
		set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &hp->flags);
		set_bit(MLX5_DROP_NEW_WATCHDOG_WORK, &hp->flags);
		spin_unlock_irqrestore(&hp->wq_lock, irq_flags);
	}

	del_timer_sync(&hp->timer);
}
681dc7e38acSHans Petter Selasky
mlx5_drain_health_wq(struct mlx5_core_dev * dev)682ca551594SHans Petter Selasky void mlx5_drain_health_wq(struct mlx5_core_dev *dev)
683ca551594SHans Petter Selasky {
684ca551594SHans Petter Selasky struct mlx5_core_health *health = &dev->priv.health;
6854bb7662bSHans Petter Selasky unsigned long flags;
686ca551594SHans Petter Selasky
6874bb7662bSHans Petter Selasky spin_lock_irqsave(&health->wq_lock, flags);
688ca551594SHans Petter Selasky set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
689519774eaSHans Petter Selasky set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags);
690adb6fd50SHans Petter Selasky set_bit(MLX5_DROP_NEW_WATCHDOG_WORK, &health->flags);
6914bb7662bSHans Petter Selasky spin_unlock_irqrestore(&health->wq_lock, flags);
6924bb7662bSHans Petter Selasky cancel_delayed_work_sync(&health->recover_work);
693ca551594SHans Petter Selasky cancel_work_sync(&health->work);
694adb6fd50SHans Petter Selasky cancel_work_sync(&health->work_watchdog);
695ca551594SHans Petter Selasky }
696ca551594SHans Petter Selasky
mlx5_drain_health_recovery(struct mlx5_core_dev * dev)697519774eaSHans Petter Selasky void mlx5_drain_health_recovery(struct mlx5_core_dev *dev)
698519774eaSHans Petter Selasky {
699519774eaSHans Petter Selasky struct mlx5_core_health *health = &dev->priv.health;
700519774eaSHans Petter Selasky unsigned long flags;
701519774eaSHans Petter Selasky
702519774eaSHans Petter Selasky spin_lock_irqsave(&health->wq_lock, flags);
703519774eaSHans Petter Selasky set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags);
704519774eaSHans Petter Selasky spin_unlock_irqrestore(&health->wq_lock, flags);
705519774eaSHans Petter Selasky cancel_delayed_work_sync(&dev->priv.health.recover_work);
706519774eaSHans Petter Selasky }
707519774eaSHans Petter Selasky
mlx5_health_cleanup(struct mlx5_core_dev * dev)708a2485fe5SHans Petter Selasky void mlx5_health_cleanup(struct mlx5_core_dev *dev)
709dc7e38acSHans Petter Selasky {
710a2485fe5SHans Petter Selasky struct mlx5_core_health *health = &dev->priv.health;
711a2485fe5SHans Petter Selasky
712a2485fe5SHans Petter Selasky destroy_workqueue(health->wq);
71340218d73SHans Petter Selasky destroy_workqueue(health->wq_watchdog);
7148d1eeedbSHans Petter Selasky destroy_workqueue(health->wq_cmd);
715dc7e38acSHans Petter Selasky }
716dc7e38acSHans Petter Selasky
mlx5_health_init(struct mlx5_core_dev * dev)717a2485fe5SHans Petter Selasky int mlx5_health_init(struct mlx5_core_dev *dev)
718dc7e38acSHans Petter Selasky {
719a2485fe5SHans Petter Selasky struct mlx5_core_health *health;
72040218d73SHans Petter Selasky char name[64];
721dc7e38acSHans Petter Selasky
722a2485fe5SHans Petter Selasky health = &dev->priv.health;
723a2485fe5SHans Petter Selasky
72440218d73SHans Petter Selasky snprintf(name, sizeof(name), "%s-rec", dev_name(&dev->pdev->dev));
725a2485fe5SHans Petter Selasky health->wq = create_singlethread_workqueue(name);
726a2485fe5SHans Petter Selasky if (!health->wq)
7278d1eeedbSHans Petter Selasky goto err_recovery;
728a2485fe5SHans Petter Selasky
72940218d73SHans Petter Selasky snprintf(name, sizeof(name), "%s-wdg", dev_name(&dev->pdev->dev));
73040218d73SHans Petter Selasky health->wq_watchdog = create_singlethread_workqueue(name);
7318d1eeedbSHans Petter Selasky if (!health->wq_watchdog)
7328d1eeedbSHans Petter Selasky goto err_watchdog;
7338d1eeedbSHans Petter Selasky
7348d1eeedbSHans Petter Selasky snprintf(name, sizeof(name), "%s-cmd", dev_name(&dev->pdev->dev));
7358d1eeedbSHans Petter Selasky health->wq_cmd = create_singlethread_workqueue(name);
7368d1eeedbSHans Petter Selasky if (!health->wq_cmd)
7378d1eeedbSHans Petter Selasky goto err_cmd;
73840218d73SHans Petter Selasky
739ca551594SHans Petter Selasky spin_lock_init(&health->wq_lock);
740a2485fe5SHans Petter Selasky INIT_WORK(&health->work, health_care);
741adb6fd50SHans Petter Selasky INIT_WORK(&health->work_watchdog, health_watchdog);
742a0a4fd77SHans Petter Selasky INIT_WORK(&health->work_cmd_completion, mlx5_trigger_cmd_completions);
7434bb7662bSHans Petter Selasky INIT_DELAYED_WORK(&health->recover_work, health_recover);
744a2485fe5SHans Petter Selasky
745a2485fe5SHans Petter Selasky return 0;
7468d1eeedbSHans Petter Selasky
7478d1eeedbSHans Petter Selasky err_cmd:
7488d1eeedbSHans Petter Selasky destroy_workqueue(health->wq_watchdog);
7498d1eeedbSHans Petter Selasky err_watchdog:
7508d1eeedbSHans Petter Selasky destroy_workqueue(health->wq);
7518d1eeedbSHans Petter Selasky err_recovery:
7528d1eeedbSHans Petter Selasky return -ENOMEM;
753dc7e38acSHans Petter Selasky }
754