1dc7e38acSHans Petter Selasky /*- 28d1eeedbSHans Petter Selasky * Copyright (c) 2013-2019, Mellanox Technologies, Ltd. All rights reserved. 3dc7e38acSHans Petter Selasky * 4dc7e38acSHans Petter Selasky * Redistribution and use in source and binary forms, with or without 5dc7e38acSHans Petter Selasky * modification, are permitted provided that the following conditions 6dc7e38acSHans Petter Selasky * are met: 7dc7e38acSHans Petter Selasky * 1. Redistributions of source code must retain the above copyright 8dc7e38acSHans Petter Selasky * notice, this list of conditions and the following disclaimer. 9dc7e38acSHans Petter Selasky * 2. Redistributions in binary form must reproduce the above copyright 10dc7e38acSHans Petter Selasky * notice, this list of conditions and the following disclaimer in the 11dc7e38acSHans Petter Selasky * documentation and/or other materials provided with the distribution. 12dc7e38acSHans Petter Selasky * 13dc7e38acSHans Petter Selasky * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND 14dc7e38acSHans Petter Selasky * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15dc7e38acSHans Petter Selasky * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 16dc7e38acSHans Petter Selasky * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE 17dc7e38acSHans Petter Selasky * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 18dc7e38acSHans Petter Selasky * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 19dc7e38acSHans Petter Selasky * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 20dc7e38acSHans Petter Selasky * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 21dc7e38acSHans Petter Selasky * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 22dc7e38acSHans Petter Selasky * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 23dc7e38acSHans Petter Selasky * SUCH DAMAGE. 24dc7e38acSHans Petter Selasky * 25dc7e38acSHans Petter Selasky * $FreeBSD$ 26dc7e38acSHans Petter Selasky */ 27dc7e38acSHans Petter Selasky 28dc7e38acSHans Petter Selasky #include <linux/kernel.h> 29dc7e38acSHans Petter Selasky #include <linux/module.h> 30dc7e38acSHans Petter Selasky #include <linux/random.h> 31dc7e38acSHans Petter Selasky #include <linux/vmalloc.h> 32a2485fe5SHans Petter Selasky #include <linux/hardirq.h> 331900b6f8SHans Petter Selasky #include <linux/delay.h> 34dc7e38acSHans Petter Selasky #include <dev/mlx5/driver.h> 35dc7e38acSHans Petter Selasky #include <dev/mlx5/mlx5_ifc.h> 36dc7e38acSHans Petter Selasky #include "mlx5_core.h" 37dc7e38acSHans Petter Selasky 38dc7e38acSHans Petter Selasky #define MLX5_HEALTH_POLL_INTERVAL (2 * HZ) 39dc7e38acSHans Petter Selasky #define MAX_MISSES 3 40dc7e38acSHans Petter Selasky 41a2485fe5SHans Petter Selasky enum { 42ca551594SHans Petter Selasky MLX5_DROP_NEW_HEALTH_WORK, 43519774eaSHans Petter Selasky MLX5_DROP_NEW_RECOVERY_WORK, 44adb6fd50SHans Petter Selasky MLX5_DROP_NEW_WATCHDOG_WORK, 45ca551594SHans Petter Selasky }; 46ca551594SHans Petter Selasky 471900b6f8SHans Petter Selasky enum { 481900b6f8SHans Petter Selasky MLX5_SENSOR_NO_ERR = 0, 491900b6f8SHans Petter Selasky MLX5_SENSOR_PCI_COMM_ERR = 1, 501900b6f8SHans Petter Selasky MLX5_SENSOR_PCI_ERR = 2, 511900b6f8SHans Petter Selasky MLX5_SENSOR_NIC_DISABLED = 3, 521900b6f8SHans Petter Selasky MLX5_SENSOR_NIC_SW_RESET = 4, 53fe242ba7SHans Petter Selasky MLX5_SENSOR_FW_SYND_RFR = 5, 541900b6f8SHans Petter Selasky }; 551900b6f8SHans Petter Selasky 5629e54451SSlava Shwartsman static int mlx5_fw_reset_enable = 1; 5729e54451SSlava Shwartsman SYSCTL_INT(_hw_mlx5, OID_AUTO, fw_reset_enable, CTLFLAG_RWTUN, 5829e54451SSlava Shwartsman &mlx5_fw_reset_enable, 0, 5929e54451SSlava Shwartsman "Enable firmware reset"); 6029e54451SSlava Shwartsman 615169fb81SHans Petter Selasky static unsigned int sw_reset_to = 1200; 625169fb81SHans Petter Selasky SYSCTL_UINT(_hw_mlx5, OID_AUTO, sw_reset_timeout, CTLFLAG_RWTUN, 635169fb81SHans Petter Selasky &sw_reset_to, 0, 645169fb81SHans Petter Selasky "Minimum timeout in seconds between two firmware resets"); 655169fb81SHans Petter Selasky 665169fb81SHans Petter Selasky 67b575d8c8SHans Petter Selasky static int lock_sem_sw_reset(struct mlx5_core_dev *dev) 68f20b553dSHans Petter Selasky { 69b575d8c8SHans Petter Selasky int ret; 70f20b553dSHans Petter Selasky 71f20b553dSHans Petter Selasky /* Lock GW access */ 72b575d8c8SHans Petter Selasky ret = -mlx5_vsc_lock(dev); 73f20b553dSHans Petter Selasky if (ret) { 74b575d8c8SHans Petter Selasky mlx5_core_warn(dev, "Timed out locking gateway %d\n", ret); 75f20b553dSHans Petter Selasky return ret; 76f20b553dSHans Petter Selasky } 77f20b553dSHans Petter Selasky 78b575d8c8SHans Petter Selasky ret = -mlx5_vsc_lock_addr_space(dev, MLX5_SEMAPHORE_SW_RESET); 79b575d8c8SHans Petter Selasky if (ret) { 80f20b553dSHans Petter Selasky if (ret == -EBUSY) 81f20b553dSHans Petter Selasky mlx5_core_dbg(dev, "SW reset FW semaphore already locked, another function will handle the reset\n"); 82f20b553dSHans Petter Selasky else 83f20b553dSHans Petter Selasky mlx5_core_warn(dev, "SW reset semaphore lock return %d\n", ret); 84f20b553dSHans Petter Selasky } 85f20b553dSHans Petter Selasky 86f20b553dSHans Petter Selasky /* Unlock GW access */ 87b575d8c8SHans Petter Selasky mlx5_vsc_unlock(dev); 88b575d8c8SHans Petter Selasky 89b575d8c8SHans Petter Selasky return ret; 90b575d8c8SHans Petter Selasky } 91b575d8c8SHans Petter Selasky 92b575d8c8SHans Petter Selasky static int unlock_sem_sw_reset(struct mlx5_core_dev *dev) 93b575d8c8SHans Petter Selasky { 94b575d8c8SHans Petter Selasky int ret; 95b575d8c8SHans Petter Selasky 96b575d8c8SHans Petter Selasky /* Lock GW access */ 97b575d8c8SHans Petter Selasky ret = -mlx5_vsc_lock(dev); 98b575d8c8SHans Petter Selasky if (ret) { 99b575d8c8SHans Petter Selasky mlx5_core_warn(dev, "Timed out locking gateway %d\n", ret); 100b575d8c8SHans Petter Selasky return ret; 101b575d8c8SHans Petter Selasky } 102b575d8c8SHans Petter Selasky 103b575d8c8SHans Petter Selasky ret = -mlx5_vsc_unlock_addr_space(dev, MLX5_SEMAPHORE_SW_RESET); 104b575d8c8SHans Petter Selasky 105b575d8c8SHans Petter Selasky /* Unlock GW access */ 106b575d8c8SHans Petter Selasky mlx5_vsc_unlock(dev); 107f20b553dSHans Petter Selasky 108f20b553dSHans Petter Selasky return ret; 109f20b553dSHans Petter Selasky } 110f20b553dSHans Petter Selasky 111ba11bcecSHans Petter Selasky u8 mlx5_get_nic_state(struct mlx5_core_dev *dev) 112a2485fe5SHans Petter Selasky { 1131900b6f8SHans Petter Selasky return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 7; 114a2485fe5SHans Petter Selasky } 115a2485fe5SHans Petter Selasky 116ba11bcecSHans Petter Selasky void mlx5_set_nic_state(struct mlx5_core_dev *dev, u8 state) 117ba11bcecSHans Petter Selasky { 118ba11bcecSHans Petter Selasky u32 cur_cmdq_addr_l_sz; 119ba11bcecSHans Petter Selasky 120ba11bcecSHans Petter Selasky cur_cmdq_addr_l_sz = ioread32be(&dev->iseg->cmdq_addr_l_sz); 121ba11bcecSHans Petter Selasky iowrite32be((cur_cmdq_addr_l_sz & 0xFFFFF000) | 122ba11bcecSHans Petter Selasky state << MLX5_NIC_IFC_OFFSET, 123ba11bcecSHans Petter Selasky &dev->iseg->cmdq_addr_l_sz); 124ba11bcecSHans Petter Selasky } 125ba11bcecSHans Petter Selasky 126fe242ba7SHans Petter Selasky static bool sensor_fw_synd_rfr(struct mlx5_core_dev *dev) 127fe242ba7SHans Petter Selasky { 128fe242ba7SHans Petter Selasky struct mlx5_core_health *health = &dev->priv.health; 129fe242ba7SHans Petter Selasky struct mlx5_health_buffer __iomem *h = health->health; 130fe242ba7SHans Petter Selasky u32 rfr = ioread32be(&h->rfr) >> MLX5_RFR_OFFSET; 131fe242ba7SHans Petter Selasky u8 synd = ioread8(&h->synd); 132fe242ba7SHans Petter Selasky 133fe242ba7SHans Petter Selasky if (rfr && synd) 134fe242ba7SHans Petter Selasky mlx5_core_dbg(dev, "FW requests reset, synd: %d\n", synd); 135fe242ba7SHans Petter Selasky return rfr && synd; 136fe242ba7SHans Petter Selasky } 137fe242ba7SHans Petter Selasky 138a0a4fd77SHans Petter Selasky static void mlx5_trigger_cmd_completions(struct work_struct *work) 139a2485fe5SHans Petter Selasky { 140a0a4fd77SHans Petter Selasky struct mlx5_core_dev *dev = 141a0a4fd77SHans Petter Selasky container_of(work, struct mlx5_core_dev, priv.health.work_cmd_completion); 142a2485fe5SHans Petter Selasky unsigned long flags; 143a2485fe5SHans Petter Selasky u64 vector; 144a2485fe5SHans Petter Selasky 145a2485fe5SHans Petter Selasky /* wait for pending handlers to complete */ 146a2485fe5SHans Petter Selasky synchronize_irq(dev->priv.msix_arr[MLX5_EQ_VEC_CMD].vector); 147a2485fe5SHans Petter Selasky spin_lock_irqsave(&dev->cmd.alloc_lock, flags); 148a2485fe5SHans Petter Selasky vector = ~dev->cmd.bitmask & ((1ul << (1 << dev->cmd.log_sz)) - 1); 149a2485fe5SHans Petter Selasky if (!vector) 150a2485fe5SHans Petter Selasky goto no_trig; 151a2485fe5SHans Petter Selasky 152a2485fe5SHans Petter Selasky vector |= MLX5_TRIGGERED_CMD_COMP; 153a2485fe5SHans Petter Selasky spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags); 154a2485fe5SHans Petter Selasky 1552cec1528SKonstantin Belousov mlx5_core_dbg(dev, "vector 0x%jx\n", (uintmax_t)vector); 156721a1a6aSSlava Shwartsman mlx5_cmd_comp_handler(dev, vector, MLX5_CMD_MODE_EVENTS); 157a2485fe5SHans Petter Selasky return; 158a2485fe5SHans Petter Selasky 159a2485fe5SHans Petter Selasky no_trig: 160a2485fe5SHans Petter Selasky spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags); 161a2485fe5SHans Petter Selasky } 162a2485fe5SHans Petter Selasky 1631900b6f8SHans Petter Selasky static bool sensor_pci_no_comm(struct mlx5_core_dev *dev) 164a2485fe5SHans Petter Selasky { 165a2485fe5SHans Petter Selasky struct mlx5_core_health *health = &dev->priv.health; 166a2485fe5SHans Petter Selasky struct mlx5_health_buffer __iomem *h = health->health; 1671900b6f8SHans Petter Selasky bool err = ioread32be(&h->fw_ver) == 0xffffffff; 168a2485fe5SHans Petter Selasky 1691900b6f8SHans Petter Selasky return err; 1701900b6f8SHans Petter Selasky } 171a2485fe5SHans Petter Selasky 1721900b6f8SHans Petter Selasky static bool sensor_nic_disabled(struct mlx5_core_dev *dev) 1731900b6f8SHans Petter Selasky { 174ba11bcecSHans Petter Selasky return mlx5_get_nic_state(dev) == MLX5_NIC_IFC_DISABLED; 1751900b6f8SHans Petter Selasky } 176a2485fe5SHans Petter Selasky 1771900b6f8SHans Petter Selasky static bool sensor_nic_sw_reset(struct mlx5_core_dev *dev) 1781900b6f8SHans Petter Selasky { 179ba11bcecSHans Petter Selasky return mlx5_get_nic_state(dev) == MLX5_NIC_IFC_SW_RESET; 1801900b6f8SHans Petter Selasky } 1811900b6f8SHans Petter Selasky 1821900b6f8SHans Petter Selasky static u32 check_fatal_sensors(struct mlx5_core_dev *dev) 1831900b6f8SHans Petter Selasky { 1841900b6f8SHans Petter Selasky if (sensor_pci_no_comm(dev)) 1851900b6f8SHans Petter Selasky return MLX5_SENSOR_PCI_COMM_ERR; 1861900b6f8SHans Petter Selasky if (pci_channel_offline(dev->pdev)) 1871900b6f8SHans Petter Selasky return MLX5_SENSOR_PCI_ERR; 1881900b6f8SHans Petter Selasky if (sensor_nic_disabled(dev)) 1891900b6f8SHans Petter Selasky return MLX5_SENSOR_NIC_DISABLED; 1901900b6f8SHans Petter Selasky if (sensor_nic_sw_reset(dev)) 1911900b6f8SHans Petter Selasky return MLX5_SENSOR_NIC_SW_RESET; 192fe242ba7SHans Petter Selasky if (sensor_fw_synd_rfr(dev)) 193fe242ba7SHans Petter Selasky return MLX5_SENSOR_FW_SYND_RFR; 1941900b6f8SHans Petter Selasky 1951900b6f8SHans Petter Selasky return MLX5_SENSOR_NO_ERR; 196a2485fe5SHans Petter Selasky } 197a2485fe5SHans Petter Selasky 198fe242ba7SHans Petter Selasky static void reset_fw_if_needed(struct mlx5_core_dev *dev) 199fe242ba7SHans Petter Selasky { 20029e54451SSlava Shwartsman bool supported; 201fe242ba7SHans Petter Selasky u32 cmdq_addr, fatal_error; 202fe242ba7SHans Petter Selasky 20329e54451SSlava Shwartsman if (!mlx5_fw_reset_enable) 20429e54451SSlava Shwartsman return; 20529e54451SSlava Shwartsman supported = (ioread32be(&dev->iseg->initializing) >> 20629e54451SSlava Shwartsman MLX5_FW_RESET_SUPPORTED_OFFSET) & 1; 207fe242ba7SHans Petter Selasky if (!supported) 208fe242ba7SHans Petter Selasky return; 209fe242ba7SHans Petter Selasky 210fe242ba7SHans Petter Selasky /* The reset only needs to be issued by one PF. The health buffer is 211fe242ba7SHans Petter Selasky * shared between all functions, and will be cleared during a reset. 212fe242ba7SHans Petter Selasky * Check again to avoid a redundant 2nd reset. If the fatal erros was 213fe242ba7SHans Petter Selasky * PCI related a reset won't help. 214fe242ba7SHans Petter Selasky */ 215fe242ba7SHans Petter Selasky fatal_error = check_fatal_sensors(dev); 216fe242ba7SHans Petter Selasky if (fatal_error == MLX5_SENSOR_PCI_COMM_ERR || 217fe242ba7SHans Petter Selasky fatal_error == MLX5_SENSOR_NIC_DISABLED || 218d28b6b55SHans Petter Selasky fatal_error == MLX5_SENSOR_NIC_SW_RESET) { 2194950c6ecSHans Petter Selasky mlx5_core_warn(dev, "Not issuing FW reset. Either it's already done or won't help.\n"); 220fe242ba7SHans Petter Selasky return; 221fe242ba7SHans Petter Selasky } 222fe242ba7SHans Petter Selasky 223fe242ba7SHans Petter Selasky mlx5_core_warn(dev, "Issuing FW Reset\n"); 224fe242ba7SHans Petter Selasky /* Write the NIC interface field to initiate the reset, the command 225fe242ba7SHans Petter Selasky * interface address also resides here, don't overwrite it. 226fe242ba7SHans Petter Selasky */ 227fe242ba7SHans Petter Selasky cmdq_addr = ioread32be(&dev->iseg->cmdq_addr_l_sz); 228fe242ba7SHans Petter Selasky iowrite32be((cmdq_addr & 0xFFFFF000) | 229fe242ba7SHans Petter Selasky MLX5_NIC_IFC_SW_RESET << MLX5_NIC_IFC_OFFSET, 230fe242ba7SHans Petter Selasky &dev->iseg->cmdq_addr_l_sz); 231fe242ba7SHans Petter Selasky } 232fe242ba7SHans Petter Selasky 2335169fb81SHans Petter Selasky static bool 2345169fb81SHans Petter Selasky mlx5_health_allow_reset(struct mlx5_core_dev *dev) 2355169fb81SHans Petter Selasky { 2365169fb81SHans Petter Selasky struct mlx5_core_health *health = &dev->priv.health; 2375169fb81SHans Petter Selasky unsigned int delta; 2385169fb81SHans Petter Selasky bool ret; 2395169fb81SHans Petter Selasky 2405169fb81SHans Petter Selasky if (health->last_reset_req != 0) { 2415169fb81SHans Petter Selasky delta = ticks - health->last_reset_req; 2425169fb81SHans Petter Selasky delta /= hz; 2435169fb81SHans Petter Selasky ret = delta >= sw_reset_to; 2445169fb81SHans Petter Selasky } else { 2455169fb81SHans Petter Selasky ret = true; 2465169fb81SHans Petter Selasky } 2475169fb81SHans Petter Selasky 2485169fb81SHans Petter Selasky /* 2495169fb81SHans Petter Selasky * In principle, ticks may be 0. Setting it to off by one (-1) 2505169fb81SHans Petter Selasky * to prevent certain reset in next request. 2515169fb81SHans Petter Selasky */ 2525169fb81SHans Petter Selasky health->last_reset_req = ticks ? : -1; 2535169fb81SHans Petter Selasky if (!ret) 2545169fb81SHans Petter Selasky mlx5_core_warn(dev, "Firmware reset elided due to " 2555169fb81SHans Petter Selasky "auto-reset frequency threshold.\n"); 2565169fb81SHans Petter Selasky return (ret); 2575169fb81SHans Petter Selasky } 2585169fb81SHans Petter Selasky 259d28b6b55SHans Petter Selasky #define MLX5_CRDUMP_WAIT_MS 60000 260d28b6b55SHans Petter Selasky #define MLX5_FW_RESET_WAIT_MS 1000 261d28b6b55SHans Petter Selasky #define MLX5_NIC_STATE_POLL_MS 5 262c0902569SHans Petter Selasky void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force) 263a2485fe5SHans Petter Selasky { 264c2a1e807SHans Petter Selasky int end, delay_ms = MLX5_CRDUMP_WAIT_MS; 265d28b6b55SHans Petter Selasky u32 fatal_error; 266d28b6b55SHans Petter Selasky int lock = -EBUSY; 267d28b6b55SHans Petter Selasky 268d28b6b55SHans Petter Selasky fatal_error = check_fatal_sensors(dev); 269d28b6b55SHans Petter Selasky 270d28b6b55SHans Petter Selasky if (fatal_error || force) { 2716d54b22dSHans Petter Selasky if (xchg(&dev->state, MLX5_DEVICE_STATE_INTERNAL_ERROR) == 2726d54b22dSHans Petter Selasky MLX5_DEVICE_STATE_INTERNAL_ERROR) 2736d54b22dSHans Petter Selasky return; 2741fb6089cSHans Petter Selasky if (!force) 2751fb6089cSHans Petter Selasky mlx5_core_err(dev, "internal state error detected\n"); 276a0a4fd77SHans Petter Selasky 277a0a4fd77SHans Petter Selasky /* 278a0a4fd77SHans Petter Selasky * Queue the command completion handler on the command 279a0a4fd77SHans Petter Selasky * work queue to avoid racing with the real command 280a0a4fd77SHans Petter Selasky * completion handler and then wait for it to 281a0a4fd77SHans Petter Selasky * complete: 282a0a4fd77SHans Petter Selasky */ 2838d1eeedbSHans Petter Selasky queue_work(dev->priv.health.wq_cmd, &dev->priv.health.work_cmd_completion); 2848d1eeedbSHans Petter Selasky flush_workqueue(dev->priv.health.wq_cmd); 2857053deebSHans Petter Selasky } 286a2485fe5SHans Petter Selasky 2876d54b22dSHans Petter Selasky mutex_lock(&dev->intf_state_mutex); 2886d54b22dSHans Petter Selasky 289d28b6b55SHans Petter Selasky if (force) 290d28b6b55SHans Petter Selasky goto err_state_done; 291d28b6b55SHans Petter Selasky 2925169fb81SHans Petter Selasky if (fatal_error == MLX5_SENSOR_FW_SYND_RFR && 2935169fb81SHans Petter Selasky mlx5_health_allow_reset(dev)) { 29492d23c82SHans Petter Selasky /* Get cr-dump and reset FW semaphore */ 295d28b6b55SHans Petter Selasky if (mlx5_core_is_pf(dev)) 296b575d8c8SHans Petter Selasky lock = lock_sem_sw_reset(dev); 297d28b6b55SHans Petter Selasky 29892d23c82SHans Petter Selasky /* Execute cr-dump and SW reset */ 299d28b6b55SHans Petter Selasky if (lock != -EBUSY) { 30092d23c82SHans Petter Selasky mlx5_fwdump(dev); 301d28b6b55SHans Petter Selasky reset_fw_if_needed(dev); 302d28b6b55SHans Petter Selasky delay_ms = MLX5_FW_RESET_WAIT_MS; 303d28b6b55SHans Petter Selasky } 304d28b6b55SHans Petter Selasky } 305d28b6b55SHans Petter Selasky 306d28b6b55SHans Petter Selasky /* Recover from SW reset */ 307d28b6b55SHans Petter Selasky end = jiffies + msecs_to_jiffies(delay_ms); 308d28b6b55SHans Petter Selasky do { 309d28b6b55SHans Petter Selasky if (sensor_nic_disabled(dev)) 310d28b6b55SHans Petter Selasky break; 311d28b6b55SHans Petter Selasky 312d28b6b55SHans Petter Selasky msleep(MLX5_NIC_STATE_POLL_MS); 313d28b6b55SHans Petter Selasky } while (!time_after(jiffies, end)); 314d28b6b55SHans Petter Selasky 315d28b6b55SHans Petter Selasky if (!sensor_nic_disabled(dev)) { 316c2a1e807SHans Petter Selasky dev_err(&dev->pdev->dev, "NIC IFC still %d after %ums.\n", 317ba11bcecSHans Petter Selasky mlx5_get_nic_state(dev), delay_ms); 318d28b6b55SHans Petter Selasky } 319d28b6b55SHans Petter Selasky 320d28b6b55SHans Petter Selasky /* Release FW semaphore if you are the lock owner */ 321d28b6b55SHans Petter Selasky if (!lock) 322b575d8c8SHans Petter Selasky unlock_sem_sw_reset(dev); 323d28b6b55SHans Petter Selasky 3241fb6089cSHans Petter Selasky mlx5_core_err(dev, "system error event triggered\n"); 3257053deebSHans Petter Selasky 326d28b6b55SHans Petter Selasky err_state_done: 327843a89d3SSlava Shwartsman mlx5_core_event(dev, MLX5_DEV_EVENT_SYS_ERROR, 1); 3287053deebSHans Petter Selasky mutex_unlock(&dev->intf_state_mutex); 329a2485fe5SHans Petter Selasky } 330a2485fe5SHans Petter Selasky 331a2485fe5SHans Petter Selasky static void mlx5_handle_bad_state(struct mlx5_core_dev *dev) 332a2485fe5SHans Petter Selasky { 333ba11bcecSHans Petter Selasky u8 nic_mode = mlx5_get_nic_state(dev); 334a2485fe5SHans Petter Selasky 3351900b6f8SHans Petter Selasky if (nic_mode == MLX5_NIC_IFC_SW_RESET) { 3361900b6f8SHans Petter Selasky /* The IFC mode field is 3 bits, so it will read 0x7 in two cases: 3371900b6f8SHans Petter Selasky * 1. PCI has been disabled (ie. PCI-AER, PF driver unloaded 3381900b6f8SHans Petter Selasky * and this is a VF), this is not recoverable by SW reset. 3391900b6f8SHans Petter Selasky * Logging of this is handled elsewhere. 3401900b6f8SHans Petter Selasky * 2. FW reset has been issued by another function, driver can 3411900b6f8SHans Petter Selasky * be reloaded to recover after the mode switches to 3421900b6f8SHans Petter Selasky * MLX5_NIC_IFC_DISABLED. 3431900b6f8SHans Petter Selasky */ 3441900b6f8SHans Petter Selasky if (dev->priv.health.fatal_error != MLX5_SENSOR_PCI_COMM_ERR) 3451900b6f8SHans Petter Selasky mlx5_core_warn(dev, "NIC SW reset is already progress\n"); 3461900b6f8SHans Petter Selasky else 3471900b6f8SHans Petter Selasky mlx5_core_warn(dev, "Communication with FW over the PCI link is down\n"); 3481900b6f8SHans Petter Selasky } else { 3491900b6f8SHans Petter Selasky mlx5_core_warn(dev, "NIC mode %d\n", nic_mode); 350a2485fe5SHans Petter Selasky } 351a2485fe5SHans Petter Selasky 352a2485fe5SHans Petter Selasky mlx5_disable_device(dev); 353a2485fe5SHans Petter Selasky } 354dc7e38acSHans Petter Selasky 3551900b6f8SHans Petter Selasky #define MLX5_FW_RESET_WAIT_MS 1000 3561900b6f8SHans Petter Selasky #define MLX5_NIC_STATE_POLL_MS 5 3574bb7662bSHans Petter Selasky static void health_recover(struct work_struct *work) 3584bb7662bSHans Petter Selasky { 3591900b6f8SHans Petter Selasky unsigned long end = jiffies + msecs_to_jiffies(MLX5_FW_RESET_WAIT_MS); 3604bb7662bSHans Petter Selasky struct mlx5_core_health *health; 3614bb7662bSHans Petter Selasky struct delayed_work *dwork; 3624bb7662bSHans Petter Selasky struct mlx5_core_dev *dev; 3634bb7662bSHans Petter Selasky struct mlx5_priv *priv; 364f20b553dSHans Petter Selasky bool recover = true; 3651900b6f8SHans Petter Selasky u8 nic_mode; 3664bb7662bSHans Petter Selasky 3674bb7662bSHans Petter Selasky dwork = container_of(work, struct delayed_work, work); 3684bb7662bSHans Petter Selasky health = container_of(dwork, struct mlx5_core_health, recover_work); 3694bb7662bSHans Petter Selasky priv = container_of(health, struct mlx5_priv, health); 3704bb7662bSHans Petter Selasky dev = container_of(priv, struct mlx5_core_dev, priv); 3714bb7662bSHans Petter Selasky 372ca2345a0SHans Petter Selasky mtx_lock(&Giant); /* XXX newbus needs this */ 373ca2345a0SHans Petter Selasky 3741900b6f8SHans Petter Selasky if (sensor_pci_no_comm(dev)) { 3751900b6f8SHans Petter Selasky dev_err(&dev->pdev->dev, "health recovery flow aborted, PCI reads still not working\n"); 376f20b553dSHans Petter Selasky recover = false; 3771900b6f8SHans Petter Selasky } 3781900b6f8SHans Petter Selasky 379ba11bcecSHans Petter Selasky nic_mode = mlx5_get_nic_state(dev); 3801900b6f8SHans Petter Selasky while (nic_mode != MLX5_NIC_IFC_DISABLED && 3811900b6f8SHans Petter Selasky !time_after(jiffies, end)) { 3821900b6f8SHans Petter Selasky msleep(MLX5_NIC_STATE_POLL_MS); 383ba11bcecSHans Petter Selasky nic_mode = mlx5_get_nic_state(dev); 3841900b6f8SHans Petter Selasky } 3851900b6f8SHans Petter Selasky 3861900b6f8SHans Petter Selasky if (nic_mode != MLX5_NIC_IFC_DISABLED) { 3871900b6f8SHans Petter Selasky dev_err(&dev->pdev->dev, "health recovery flow aborted, unexpected NIC IFC mode %d.\n", 3881900b6f8SHans Petter Selasky nic_mode); 389f20b553dSHans Petter Selasky recover = false; 3904bb7662bSHans Petter Selasky } 3914bb7662bSHans Petter Selasky 392f20b553dSHans Petter Selasky if (recover) { 3934bb7662bSHans Petter Selasky dev_err(&dev->pdev->dev, "starting health recovery flow\n"); 3944bb7662bSHans Petter Selasky mlx5_recover_device(dev); 3954bb7662bSHans Petter Selasky } 396ca2345a0SHans Petter Selasky 397ca2345a0SHans Petter Selasky mtx_unlock(&Giant); 398f20b553dSHans Petter Selasky } 3994bb7662bSHans Petter Selasky 4004bb7662bSHans Petter Selasky /* How much time to wait until health resetting the driver (in msecs) */ 4014bb7662bSHans Petter Selasky #define MLX5_RECOVERY_DELAY_MSECS 60000 4021900b6f8SHans Petter Selasky #define MLX5_RECOVERY_NO_DELAY 0 4031900b6f8SHans Petter Selasky static unsigned long get_recovery_delay(struct mlx5_core_dev *dev) 4041900b6f8SHans Petter Selasky { 4051900b6f8SHans Petter Selasky return dev->priv.health.fatal_error == MLX5_SENSOR_PCI_ERR || 4061900b6f8SHans Petter Selasky dev->priv.health.fatal_error == MLX5_SENSOR_PCI_COMM_ERR ? 4071900b6f8SHans Petter Selasky MLX5_RECOVERY_DELAY_MSECS : MLX5_RECOVERY_NO_DELAY; 4081900b6f8SHans Petter Selasky } 4091900b6f8SHans Petter Selasky 410dc7e38acSHans Petter Selasky static void health_care(struct work_struct *work) 411dc7e38acSHans Petter Selasky { 412a2485fe5SHans Petter Selasky struct mlx5_core_health *health; 4131900b6f8SHans Petter Selasky unsigned long recover_delay; 414dc7e38acSHans Petter Selasky struct mlx5_core_dev *dev; 415dc7e38acSHans Petter Selasky struct mlx5_priv *priv; 4164bb7662bSHans Petter Selasky unsigned long flags; 417dc7e38acSHans Petter Selasky 418a2485fe5SHans Petter Selasky health = container_of(work, struct mlx5_core_health, work); 419dc7e38acSHans Petter Selasky priv = container_of(health, struct mlx5_priv, health); 420dc7e38acSHans Petter Selasky dev = container_of(priv, struct mlx5_core_dev, priv); 421f20b553dSHans Petter Selasky 422dc7e38acSHans Petter Selasky mlx5_core_warn(dev, "handling bad device here\n"); 423a2485fe5SHans Petter Selasky mlx5_handle_bad_state(dev); 4241900b6f8SHans Petter Selasky recover_delay = msecs_to_jiffies(get_recovery_delay(dev)); 4254bb7662bSHans Petter Selasky 4264bb7662bSHans Petter Selasky spin_lock_irqsave(&health->wq_lock, flags); 427fe242ba7SHans Petter Selasky if (!test_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags)) { 428fe242ba7SHans Petter Selasky mlx5_core_warn(dev, "Scheduling recovery work with %lums delay\n", 429fe242ba7SHans Petter Selasky recover_delay); 4304bb7662bSHans Petter Selasky schedule_delayed_work(&health->recover_work, recover_delay); 431fe242ba7SHans Petter Selasky } else { 4324bb7662bSHans Petter Selasky dev_err(&dev->pdev->dev, 4334bb7662bSHans Petter Selasky "new health works are not permitted at this stage\n"); 434fe242ba7SHans Petter Selasky } 4354bb7662bSHans Petter Selasky spin_unlock_irqrestore(&health->wq_lock, flags); 436dc7e38acSHans Petter Selasky } 437a2485fe5SHans Petter Selasky 438a2485fe5SHans Petter Selasky static int get_next_poll_jiffies(void) 439a2485fe5SHans Petter Selasky { 440a2485fe5SHans Petter Selasky unsigned long next; 441a2485fe5SHans Petter Selasky 442a2485fe5SHans Petter Selasky get_random_bytes(&next, sizeof(next)); 443a2485fe5SHans Petter Selasky next %= HZ; 444a2485fe5SHans Petter Selasky next += jiffies + MLX5_HEALTH_POLL_INTERVAL; 445a2485fe5SHans Petter Selasky 446a2485fe5SHans Petter Selasky return next; 447dc7e38acSHans Petter Selasky } 448dc7e38acSHans Petter Selasky 4494bb7662bSHans Petter Selasky void mlx5_trigger_health_work(struct mlx5_core_dev *dev) 4504bb7662bSHans Petter Selasky { 4514bb7662bSHans Petter Selasky struct mlx5_core_health *health = &dev->priv.health; 4524bb7662bSHans Petter Selasky unsigned long flags; 4534bb7662bSHans Petter Selasky 4544bb7662bSHans Petter Selasky spin_lock_irqsave(&health->wq_lock, flags); 4554bb7662bSHans Petter Selasky if (!test_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags)) 4564bb7662bSHans Petter Selasky queue_work(health->wq, &health->work); 4574bb7662bSHans Petter Selasky else 4584bb7662bSHans Petter Selasky dev_err(&dev->pdev->dev, 4594bb7662bSHans Petter Selasky "new health works are not permitted at this stage\n"); 4604bb7662bSHans Petter Selasky spin_unlock_irqrestore(&health->wq_lock, flags); 4614bb7662bSHans Petter Selasky } 4624bb7662bSHans Petter Selasky 463dc7e38acSHans Petter Selasky static const char *hsynd_str(u8 synd) 464dc7e38acSHans Petter Selasky { 465dc7e38acSHans Petter Selasky switch (synd) { 466dc7e38acSHans Petter Selasky case MLX5_HEALTH_SYNDR_FW_ERR: 467dc7e38acSHans Petter Selasky return "firmware internal error"; 468dc7e38acSHans Petter Selasky case MLX5_HEALTH_SYNDR_IRISC_ERR: 469dc7e38acSHans Petter Selasky return "irisc not responding"; 470a2485fe5SHans Petter Selasky case MLX5_HEALTH_SYNDR_HW_UNRECOVERABLE_ERR: 471a2485fe5SHans Petter Selasky return "unrecoverable hardware error"; 472dc7e38acSHans Petter Selasky case MLX5_HEALTH_SYNDR_CRC_ERR: 473dc7e38acSHans Petter Selasky return "firmware CRC error"; 474dc7e38acSHans Petter Selasky case MLX5_HEALTH_SYNDR_FETCH_PCI_ERR: 475dc7e38acSHans Petter Selasky return "ICM fetch PCI error"; 476dc7e38acSHans Petter Selasky case MLX5_HEALTH_SYNDR_HW_FTL_ERR: 477dc7e38acSHans Petter Selasky return "HW fatal error\n"; 478dc7e38acSHans Petter Selasky case MLX5_HEALTH_SYNDR_ASYNC_EQ_OVERRUN_ERR: 479dc7e38acSHans Petter Selasky return "async EQ buffer overrun"; 480dc7e38acSHans Petter Selasky case MLX5_HEALTH_SYNDR_EQ_ERR: 481dc7e38acSHans Petter Selasky return "EQ error"; 482a2485fe5SHans Petter Selasky case MLX5_HEALTH_SYNDR_EQ_INV: 483a2485fe5SHans Petter Selasky return "Invalid EQ referenced"; 484dc7e38acSHans Petter Selasky case MLX5_HEALTH_SYNDR_FFSER_ERR: 485dc7e38acSHans Petter Selasky return "FFSER error"; 486a2485fe5SHans Petter Selasky case MLX5_HEALTH_SYNDR_HIGH_TEMP: 487a2485fe5SHans Petter Selasky return "High temprature"; 488dc7e38acSHans Petter Selasky default: 489dc7e38acSHans Petter Selasky return "unrecognized error"; 490dc7e38acSHans Petter Selasky } 491dc7e38acSHans Petter Selasky } 492dc7e38acSHans Petter Selasky 493c9bb26aeSHans Petter Selasky static u8 494c9bb26aeSHans Petter Selasky print_health_info(struct mlx5_core_dev *dev) 495dc7e38acSHans Petter Selasky { 496dc7e38acSHans Petter Selasky struct mlx5_core_health *health = &dev->priv.health; 497dc7e38acSHans Petter Selasky struct mlx5_health_buffer __iomem *h = health->health; 498c9bb26aeSHans Petter Selasky u8 synd = ioread8(&h->synd); 499a2485fe5SHans Petter Selasky char fw_str[18]; 500a2485fe5SHans Petter Selasky u32 fw; 501dc7e38acSHans Petter Selasky int i; 502dc7e38acSHans Petter Selasky 503c9bb26aeSHans Petter Selasky /* 504c9bb26aeSHans Petter Selasky * If synd is 0x0 - this indicates that FW is unable to 505c9bb26aeSHans Petter Selasky * respond to initialization segment reads and health buffer 506c9bb26aeSHans Petter Selasky * should not be read. 507c9bb26aeSHans Petter Selasky */ 508c9bb26aeSHans Petter Selasky if (synd == 0) 509c9bb26aeSHans Petter Selasky return (0); 510dc7e38acSHans Petter Selasky 511a2485fe5SHans Petter Selasky for (i = 0; i < ARRAY_SIZE(h->assert_var); i++) 512a2485fe5SHans Petter Selasky printf("mlx5_core: INFO: ""assert_var[%d] 0x%08x\n", i, ioread32be(h->assert_var + i)); 513a2485fe5SHans Petter Selasky 514a2485fe5SHans Petter Selasky printf("mlx5_core: INFO: ""assert_exit_ptr 0x%08x\n", ioread32be(&h->assert_exit_ptr)); 515a2485fe5SHans Petter Selasky printf("mlx5_core: INFO: ""assert_callra 0x%08x\n", ioread32be(&h->assert_callra)); 516a2485fe5SHans Petter Selasky snprintf(fw_str, sizeof(fw_str), "%d.%d.%d", fw_rev_maj(dev), fw_rev_min(dev), fw_rev_sub(dev)); 517a2485fe5SHans Petter Selasky printf("mlx5_core: INFO: ""fw_ver %s\n", fw_str); 518a2485fe5SHans Petter Selasky printf("mlx5_core: INFO: ""hw_id 0x%08x\n", ioread32be(&h->hw_id)); 519a2485fe5SHans Petter Selasky printf("mlx5_core: INFO: ""irisc_index %d\n", ioread8(&h->irisc_index)); 520c9bb26aeSHans Petter Selasky printf("mlx5_core: INFO: ""synd 0x%x: %s\n", synd, hsynd_str(synd)); 521a2485fe5SHans Petter Selasky printf("mlx5_core: INFO: ""ext_synd 0x%04x\n", ioread16be(&h->ext_synd)); 522a2485fe5SHans Petter Selasky fw = ioread32be(&h->fw_ver); 523a2485fe5SHans Petter Selasky printf("mlx5_core: INFO: ""raw fw_ver 0x%08x\n", fw); 524c9bb26aeSHans Petter Selasky 525c9bb26aeSHans Petter Selasky return synd; 526dc7e38acSHans Petter Selasky } 527dc7e38acSHans Petter Selasky 528adb6fd50SHans Petter Selasky static void health_watchdog(struct work_struct *work) 529adb6fd50SHans Petter Selasky { 530adb6fd50SHans Petter Selasky struct mlx5_core_dev *dev; 531adb6fd50SHans Petter Selasky u16 power; 532adb6fd50SHans Petter Selasky u8 status; 533adb6fd50SHans Petter Selasky int err; 534adb6fd50SHans Petter Selasky 535adb6fd50SHans Petter Selasky dev = container_of(work, struct mlx5_core_dev, priv.health.work_watchdog); 536adb6fd50SHans Petter Selasky 537adb6fd50SHans Petter Selasky if (!MLX5_CAP_GEN(dev, mcam_reg) || 538adb6fd50SHans Petter Selasky !MLX5_CAP_MCAM_FEATURE(dev, pcie_status_and_power)) 539adb6fd50SHans Petter Selasky return; 540adb6fd50SHans Petter Selasky 541adb6fd50SHans Petter Selasky err = mlx5_pci_read_power_status(dev, &power, &status); 542adb6fd50SHans Petter Selasky if (err < 0) { 543adb6fd50SHans Petter Selasky mlx5_core_warn(dev, "Failed reading power status: %d\n", err); 544adb6fd50SHans Petter Selasky return; 545adb6fd50SHans Petter Selasky } 546adb6fd50SHans Petter Selasky 547adb6fd50SHans Petter Selasky dev->pwr_value = power; 548adb6fd50SHans Petter Selasky 549adb6fd50SHans Petter Selasky if (dev->pwr_status != status) { 550adb6fd50SHans Petter Selasky device_t bsddev = dev->pdev->dev.bsddev; 551adb6fd50SHans Petter Selasky 552adb6fd50SHans Petter Selasky switch (status) { 553adb6fd50SHans Petter Selasky case 0: 554adb6fd50SHans Petter Selasky dev->pwr_status = status; 555adb6fd50SHans Petter Selasky device_printf(bsddev, "PCI power is not published by the PCIe slot.\n"); 556adb6fd50SHans Petter Selasky break; 557adb6fd50SHans Petter Selasky case 1: 558adb6fd50SHans Petter Selasky dev->pwr_status = status; 559adb6fd50SHans Petter Selasky device_printf(bsddev, "PCIe slot advertised sufficient power (%uW).\n", power); 560adb6fd50SHans Petter Selasky break; 561adb6fd50SHans Petter Selasky case 2: 562adb6fd50SHans Petter Selasky dev->pwr_status = status; 563adb6fd50SHans Petter Selasky device_printf(bsddev, "WARN: Detected insufficient power on the PCIe slot (%uW).\n", power); 564adb6fd50SHans Petter Selasky break; 565adb6fd50SHans Petter Selasky default: 566adb6fd50SHans Petter Selasky dev->pwr_status = 0; 567adb6fd50SHans Petter Selasky device_printf(bsddev, "WARN: Unknown power state detected(%d).\n", status); 568adb6fd50SHans Petter Selasky break; 569adb6fd50SHans Petter Selasky } 570adb6fd50SHans Petter Selasky } 571adb6fd50SHans Petter Selasky } 572adb6fd50SHans Petter Selasky 573adb6fd50SHans Petter Selasky void 574adb6fd50SHans Petter Selasky mlx5_trigger_health_watchdog(struct mlx5_core_dev *dev) 575adb6fd50SHans Petter Selasky { 576adb6fd50SHans Petter Selasky struct mlx5_core_health *health = &dev->priv.health; 577adb6fd50SHans Petter Selasky unsigned long flags; 578adb6fd50SHans Petter Selasky 579adb6fd50SHans Petter Selasky spin_lock_irqsave(&health->wq_lock, flags); 580adb6fd50SHans Petter Selasky if (!test_bit(MLX5_DROP_NEW_WATCHDOG_WORK, &health->flags)) 581adb6fd50SHans Petter Selasky queue_work(health->wq_watchdog, &health->work_watchdog); 582adb6fd50SHans Petter Selasky else 583adb6fd50SHans Petter Selasky dev_err(&dev->pdev->dev, 584adb6fd50SHans Petter Selasky "scheduling watchdog is not permitted at this stage\n"); 585adb6fd50SHans Petter Selasky spin_unlock_irqrestore(&health->wq_lock, flags); 586adb6fd50SHans Petter Selasky } 587adb6fd50SHans Petter Selasky 58803ab395eSHans Petter Selasky static void poll_health(unsigned long data) 589dc7e38acSHans Petter Selasky { 590dc7e38acSHans Petter Selasky struct mlx5_core_dev *dev = (struct mlx5_core_dev *)data; 591dc7e38acSHans Petter Selasky struct mlx5_core_health *health = &dev->priv.health; 5921900b6f8SHans Petter Selasky u32 fatal_error; 593dc7e38acSHans Petter Selasky u32 count; 594dc7e38acSHans Petter Selasky 59530dfc051SHans Petter Selasky if (dev->state != MLX5_DEVICE_STATE_UP) 59630dfc051SHans Petter Selasky return; 59730dfc051SHans Petter Selasky 598dc7e38acSHans Petter Selasky count = ioread32be(health->health_counter); 599dc7e38acSHans Petter Selasky if (count == health->prev) 600dc7e38acSHans Petter Selasky ++health->miss_counter; 601dc7e38acSHans Petter Selasky else 602dc7e38acSHans Petter Selasky health->miss_counter = 0; 603dc7e38acSHans Petter Selasky 604dc7e38acSHans Petter Selasky health->prev = count; 605dc7e38acSHans Petter Selasky if (health->miss_counter == MAX_MISSES) { 606a2485fe5SHans Petter Selasky mlx5_core_err(dev, "device's health compromised - reached miss count\n"); 607c9bb26aeSHans Petter Selasky if (print_health_info(dev) == 0) 608c9bb26aeSHans Petter Selasky mlx5_core_err(dev, "FW is unable to respond to initialization segment reads\n"); 609a2485fe5SHans Petter Selasky } 610a2485fe5SHans Petter Selasky 6111900b6f8SHans Petter Selasky fatal_error = check_fatal_sensors(dev); 6121900b6f8SHans Petter Selasky 6131900b6f8SHans Petter Selasky if (fatal_error && !health->fatal_error) { 6141900b6f8SHans Petter Selasky mlx5_core_err(dev, "Fatal error %u detected\n", fatal_error); 6151900b6f8SHans Petter Selasky dev->priv.health.fatal_error = fatal_error; 616a2485fe5SHans Petter Selasky print_health_info(dev); 6174bb7662bSHans Petter Selasky mlx5_trigger_health_work(dev); 618dc7e38acSHans Petter Selasky } 6194bb7662bSHans Petter Selasky 6204bb7662bSHans Petter Selasky mod_timer(&health->timer, get_next_poll_jiffies()); 621dc7e38acSHans Petter Selasky } 622dc7e38acSHans Petter Selasky 623dc7e38acSHans Petter Selasky void mlx5_start_health_poll(struct mlx5_core_dev *dev) 624dc7e38acSHans Petter Selasky { 625dc7e38acSHans Petter Selasky struct mlx5_core_health *health = &dev->priv.health; 626dc7e38acSHans Petter Selasky 627dc7e38acSHans Petter Selasky init_timer(&health->timer); 6281900b6f8SHans Petter Selasky health->fatal_error = MLX5_SENSOR_NO_ERR; 629ca551594SHans Petter Selasky clear_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags); 630519774eaSHans Petter Selasky clear_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags); 631adb6fd50SHans Petter Selasky clear_bit(MLX5_DROP_NEW_WATCHDOG_WORK, &health->flags); 632dc7e38acSHans Petter Selasky health->health = &dev->iseg->health; 633dc7e38acSHans Petter Selasky health->health_counter = &dev->iseg->health_counter; 634dc7e38acSHans Petter Selasky 63503ab395eSHans Petter Selasky setup_timer(&health->timer, poll_health, (unsigned long)dev); 636dc7e38acSHans Petter Selasky mod_timer(&health->timer, 637dc7e38acSHans Petter Selasky round_jiffies(jiffies + MLX5_HEALTH_POLL_INTERVAL)); 638adb6fd50SHans Petter Selasky 639adb6fd50SHans Petter Selasky /* do initial PCI power state readout */ 640adb6fd50SHans Petter Selasky mlx5_trigger_health_watchdog(dev); 641dc7e38acSHans Petter Selasky } 642dc7e38acSHans Petter Selasky 6432119f825SSlava Shwartsman void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health) 644dc7e38acSHans Petter Selasky { 645dc7e38acSHans Petter Selasky struct mlx5_core_health *health = &dev->priv.health; 6462119f825SSlava Shwartsman unsigned long flags; 6472119f825SSlava Shwartsman 6482119f825SSlava Shwartsman if (disable_health) { 6492119f825SSlava Shwartsman spin_lock_irqsave(&health->wq_lock, flags); 6502119f825SSlava Shwartsman set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags); 6512119f825SSlava Shwartsman set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags); 652adb6fd50SHans Petter Selasky set_bit(MLX5_DROP_NEW_WATCHDOG_WORK, &health->flags); 6532119f825SSlava Shwartsman spin_unlock_irqrestore(&health->wq_lock, flags); 6542119f825SSlava Shwartsman } 655dc7e38acSHans Petter Selasky 656dc7e38acSHans Petter Selasky del_timer_sync(&health->timer); 657dc7e38acSHans Petter Selasky } 658dc7e38acSHans Petter Selasky 659ca551594SHans Petter Selasky void mlx5_drain_health_wq(struct mlx5_core_dev *dev) 660ca551594SHans Petter Selasky { 661ca551594SHans Petter Selasky struct mlx5_core_health *health = &dev->priv.health; 6624bb7662bSHans Petter Selasky unsigned long flags; 663ca551594SHans Petter Selasky 6644bb7662bSHans Petter Selasky spin_lock_irqsave(&health->wq_lock, flags); 665ca551594SHans Petter Selasky set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags); 666519774eaSHans Petter Selasky set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags); 667adb6fd50SHans Petter Selasky set_bit(MLX5_DROP_NEW_WATCHDOG_WORK, &health->flags); 6684bb7662bSHans Petter Selasky spin_unlock_irqrestore(&health->wq_lock, flags); 6694bb7662bSHans Petter Selasky cancel_delayed_work_sync(&health->recover_work); 670ca551594SHans Petter Selasky cancel_work_sync(&health->work); 671adb6fd50SHans Petter Selasky cancel_work_sync(&health->work_watchdog); 672ca551594SHans Petter Selasky } 673ca551594SHans Petter Selasky 674519774eaSHans Petter Selasky void mlx5_drain_health_recovery(struct mlx5_core_dev *dev) 675519774eaSHans Petter Selasky { 676519774eaSHans Petter Selasky struct mlx5_core_health *health = &dev->priv.health; 677519774eaSHans Petter Selasky unsigned long flags; 678519774eaSHans Petter Selasky 679519774eaSHans Petter Selasky spin_lock_irqsave(&health->wq_lock, flags); 680519774eaSHans Petter Selasky set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags); 681519774eaSHans Petter Selasky spin_unlock_irqrestore(&health->wq_lock, flags); 682519774eaSHans Petter Selasky cancel_delayed_work_sync(&dev->priv.health.recover_work); 683519774eaSHans Petter Selasky } 684519774eaSHans Petter Selasky 685a2485fe5SHans Petter Selasky void mlx5_health_cleanup(struct mlx5_core_dev *dev) 686dc7e38acSHans Petter Selasky { 687a2485fe5SHans Petter Selasky struct mlx5_core_health *health = &dev->priv.health; 688a2485fe5SHans Petter Selasky 689a2485fe5SHans Petter Selasky destroy_workqueue(health->wq); 69040218d73SHans Petter Selasky destroy_workqueue(health->wq_watchdog); 6918d1eeedbSHans Petter Selasky destroy_workqueue(health->wq_cmd); 692dc7e38acSHans Petter Selasky } 693dc7e38acSHans Petter Selasky 694a2485fe5SHans Petter Selasky int mlx5_health_init(struct mlx5_core_dev *dev) 695dc7e38acSHans Petter Selasky { 696a2485fe5SHans Petter Selasky struct mlx5_core_health *health; 69740218d73SHans Petter Selasky char name[64]; 698dc7e38acSHans Petter Selasky 699a2485fe5SHans Petter Selasky health = &dev->priv.health; 700a2485fe5SHans Petter Selasky 70140218d73SHans Petter Selasky snprintf(name, sizeof(name), "%s-rec", dev_name(&dev->pdev->dev)); 702a2485fe5SHans Petter Selasky health->wq = create_singlethread_workqueue(name); 703a2485fe5SHans Petter Selasky if (!health->wq) 7048d1eeedbSHans Petter Selasky goto err_recovery; 705a2485fe5SHans Petter Selasky 70640218d73SHans Petter Selasky snprintf(name, sizeof(name), "%s-wdg", dev_name(&dev->pdev->dev)); 70740218d73SHans Petter Selasky health->wq_watchdog = create_singlethread_workqueue(name); 7088d1eeedbSHans Petter Selasky if (!health->wq_watchdog) 7098d1eeedbSHans Petter Selasky goto err_watchdog; 7108d1eeedbSHans Petter Selasky 7118d1eeedbSHans Petter Selasky snprintf(name, sizeof(name), "%s-cmd", dev_name(&dev->pdev->dev)); 7128d1eeedbSHans Petter Selasky health->wq_cmd = create_singlethread_workqueue(name); 7138d1eeedbSHans Petter Selasky if (!health->wq_cmd) 7148d1eeedbSHans Petter Selasky goto err_cmd; 71540218d73SHans Petter Selasky 716ca551594SHans Petter Selasky spin_lock_init(&health->wq_lock); 717a2485fe5SHans Petter Selasky INIT_WORK(&health->work, health_care); 718adb6fd50SHans Petter Selasky INIT_WORK(&health->work_watchdog, health_watchdog); 719a0a4fd77SHans Petter Selasky INIT_WORK(&health->work_cmd_completion, mlx5_trigger_cmd_completions); 7204bb7662bSHans Petter Selasky INIT_DELAYED_WORK(&health->recover_work, health_recover); 721a2485fe5SHans Petter Selasky 722a2485fe5SHans Petter Selasky return 0; 7238d1eeedbSHans Petter Selasky 7248d1eeedbSHans Petter Selasky err_cmd: 7258d1eeedbSHans Petter Selasky destroy_workqueue(health->wq_watchdog); 7268d1eeedbSHans Petter Selasky err_watchdog: 7278d1eeedbSHans Petter Selasky destroy_workqueue(health->wq); 7288d1eeedbSHans Petter Selasky err_recovery: 7298d1eeedbSHans Petter Selasky return -ENOMEM; 730dc7e38acSHans Petter Selasky } 731